2022-07-08 07:26:42

by Peter Zijlstra

Subject: [PATCH 1/4] mmu_gather: Remove per arch tlb_{start,end}_vma()

Scattered across the archs are 3 basic forms of tlb_{start,end}_vma().
Provide two new MMU_GATHER_* knobs to enumerate them and remove the
per-arch tlb_{start,end}_vma() implementations.

- MMU_GATHER_NO_FLUSH_CACHE indicates the arch has flush_cache_range()
but does *NOT* want to call it for each VMA.

- MMU_GATHER_MERGE_VMAS indicates the arch wants to merge the
invalidate across multiple VMAs if possible.

With these it is possible to capture the three forms:

1) empty stubs;
select MMU_GATHER_NO_FLUSH_CACHE and MMU_GATHER_MERGE_VMAS

2) start: flush_cache_range(), end: empty;
select MMU_GATHER_MERGE_VMAS

3) start: flush_cache_range(), end: flush_tlb_range();
default

Obviously, if the architecture does not have flush_cache_range() then
it also doesn't need to select MMU_GATHER_NO_FLUSH_CACHE.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/Kconfig | 7 +++++++
arch/csky/include/asm/tlb.h | 13 -------------
arch/loongarch/Kconfig | 1 +
arch/loongarch/include/asm/tlb.h | 10 ----------
arch/powerpc/Kconfig | 1 +
arch/powerpc/include/asm/tlb.h | 2 --
arch/s390/Kconfig | 1 +
arch/s390/include/asm/tlb.h | 3 ---
arch/sparc/Kconfig | 2 ++
arch/sparc/include/asm/tlb_64.h | 2 --
arch/x86/Kconfig | 1 +
arch/x86/include/asm/tlb.h | 3 ---
include/asm-generic/tlb.h | 21 +++++++++++++++++++--
13 files changed, 32 insertions(+), 35 deletions(-)

--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -438,6 +438,13 @@ config MMU_GATHER_PAGE_SIZE

config MMU_GATHER_NO_RANGE
bool
+ select MMU_GATHER_MERGE_VMAS
+
+config MMU_GATHER_NO_FLUSH_CACHE
+ bool
+
+config MMU_GATHER_MERGE_VMAS
+ bool

config MMU_GATHER_NO_GATHER
bool
--- a/arch/csky/include/asm/tlb.h
+++ b/arch/csky/include/asm/tlb.h
@@ -4,19 +4,6 @@
#define __ASM_CSKY_TLB_H

#include <asm/cacheflush.h>
-
-#define tlb_start_vma(tlb, vma) \
- do { \
- if (!(tlb)->fullmm) \
- flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \
- } while (0)
-
-#define tlb_end_vma(tlb, vma) \
- do { \
- if (!(tlb)->fullmm) \
- flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \
- } while (0)
-
#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)

#include <asm-generic/tlb.h>
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -112,6 +112,7 @@ config LOONGARCH
select TRACE_IRQFLAGS_SUPPORT
select USE_PERCPU_NUMA_NODE_ID
select ZONE_DMA32
+ select MMU_GATHER_MERGE_VMAS if MMU

config 32BIT
bool
--- a/arch/loongarch/include/asm/tlb.h
+++ b/arch/loongarch/include/asm/tlb.h
@@ -137,16 +137,6 @@ static inline void invtlb_all(u32 op, u3
);
}

-/*
- * LoongArch doesn't need any special per-pte or per-vma handling, except
- * we need to flush cache for area to be unmapped.
- */
-#define tlb_start_vma(tlb, vma) \
- do { \
- if (!(tlb)->fullmm) \
- flush_cache_range(vma, vma->vm_start, vma->vm_end); \
- } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)

static void tlb_flush(struct mmu_gather *tlb);
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -256,6 +256,7 @@ config PPC
select IRQ_FORCED_THREADING
select MMU_GATHER_PAGE_SIZE
select MMU_GATHER_RCU_TABLE_FREE
+ select MMU_GATHER_MERGE_VMAS
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE
select NEED_PER_CPU_EMBED_FIRST_CHUNK if PPC64
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -19,8 +19,6 @@

#include <linux/pagemap.h>

-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
#define __tlb_remove_tlb_entry __tlb_remove_tlb_entry

#define tlb_flush tlb_flush
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -204,6 +204,7 @@ config S390
select IOMMU_SUPPORT if PCI
select MMU_GATHER_NO_GATHER
select MMU_GATHER_RCU_TABLE_FREE
+ select MMU_GATHER_MERGE_VMAS
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE if PCI
select NEED_SG_DMA_LENGTH if PCI
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -27,9 +27,6 @@ static inline void tlb_flush(struct mmu_
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
struct page *page, int page_size);

-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
-
#define tlb_flush tlb_flush
#define pte_free_tlb pte_free_tlb
#define pmd_free_tlb pmd_free_tlb
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -67,6 +67,8 @@ config SPARC64
select HAVE_KRETPROBES
select HAVE_KPROBES
select MMU_GATHER_RCU_TABLE_FREE if SMP
+ select MMU_GATHER_MERGE_VMAS
+ select MMU_GATHER_NO_FLUSH_CACHE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
--- a/arch/sparc/include/asm/tlb_64.h
+++ b/arch/sparc/include/asm/tlb_64.h
@@ -22,8 +22,6 @@ void smp_flush_tlb_mm(struct mm_struct *
void __flush_tlb_pending(unsigned long, unsigned long, unsigned long *);
void flush_tlb_pending(void);

-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
#define tlb_flush(tlb) flush_tlb_pending()

/*
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -245,6 +245,7 @@ config X86
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
+ select MMU_GATHER_MERGE_VMAS
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC || STACK_VALIDATION
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -2,9 +2,6 @@
#ifndef _ASM_X86_TLB_H
#define _ASM_X86_TLB_H

-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
-
#define tlb_flush tlb_flush
static inline void tlb_flush(struct mmu_gather *tlb);

--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -158,9 +158,24 @@
* Useful if your architecture doesn't use IPIs for remote TLB invalidates
* and therefore doesn't naturally serialize with software page-table walkers.
*
+ * MMU_GATHER_NO_FLUSH_CACHE
+ *
+ * Indicates the architecture has flush_cache_range() but it need *NOT* be called
+ * before unmapping a VMA.
+ *
+ * NOTE: strictly speaking we shouldn't have this knob and should instead
+ * rely on flush_cache_range() being a NOP, except Sparc64 seems to be
+ * different here.
+ *
+ * MMU_GATHER_MERGE_VMAS
+ *
+ * Indicates the architecture wants to merge ranges over VMAs; typical when
+ * multiple range invalidates are more expensive than a full invalidate.
+ *
* MMU_GATHER_NO_RANGE
*
- * Use this if your architecture lacks an efficient flush_tlb_range().
+ * Use this if your architecture lacks an efficient flush_tlb_range(). This
+ * option implies MMU_GATHER_MERGE_VMAS above.
*
* MMU_GATHER_NO_GATHER
*
@@ -493,14 +508,16 @@ static inline void tlb_start_vma(struct
return;

tlb_update_vma_flags(tlb, vma);
+#ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE
flush_cache_range(vma, vma->vm_start, vma->vm_end);
+#endif
}
#endif

#ifndef tlb_end_vma
static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
- if (tlb->fullmm)
+ if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
return;

/*
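
For illustration, the flow this enables on a multi-VMA unmap is roughly
the following (a simplified sketch after mm/memory.c; the VMA iterator
is illustrative, not a real helper):

    struct mmu_gather tlb;

    tlb_gather_mmu(&tlb, mm);
    for_each_vma_in_range(vma, start, end) {  /* illustrative iterator */
            tlb_start_vma(&tlb, vma);   /* cache flush, unless NO_FLUSH_CACHE */
            unmap_page_range(&tlb, vma, start, end, NULL);
            tlb_end_vma(&tlb, vma);     /* no per-VMA TLB flush with MERGE_VMAS */
    }
    tlb_finish_mmu(&tlb);   /* one invalidate covers the accumulated range */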



2022-07-08 13:30:27

by Will Deacon

Subject: Re: [PATCH 1/4] mmu_gather: Remove per arch tlb_{start,end}_vma()

On Fri, Jul 08, 2022 at 09:18:03AM +0200, Peter Zijlstra wrote:
> Scattered across the archs are 3 basic forms of tlb_{start,end}_vma().
> Provide two new MMU_GATHER_* knobs to enumerate them and remove the
> per-arch tlb_{start,end}_vma() implementations.
>
> - MMU_GATHER_NO_FLUSH_CACHE indicates the arch has flush_cache_range()
> but does *NOT* want to call it for each VMA.
>
> - MMU_GATHER_MERGE_VMAS indicates the arch wants to merge the
> invalidate across multiple VMAs if possible.
>
> With these it is possible to capture the three forms:
>
> 1) empty stubs;
> select MMU_GATHER_NO_FLUSH_CACHE and MMU_GATHER_MERGE_VMAS
>
> 2) start: flush_cache_range(), end: empty;
> select MMU_GATHER_MERGE_VMAS
>
> 3) start: flush_cache_range(), end: flush_tlb_range();
> default
>
> Obviously, if the architecture does not have flush_cache_range() then
> it also doesn't need to select MMU_GATHER_NO_FLUSH_CACHE.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/Kconfig | 7 +++++++
> arch/csky/include/asm/tlb.h | 13 -------------
> arch/loongarch/Kconfig | 1 +
> arch/loongarch/include/asm/tlb.h | 10 ----------
> arch/powerpc/Kconfig | 1 +
> arch/powerpc/include/asm/tlb.h | 2 --
> arch/s390/Kconfig | 1 +
> arch/s390/include/asm/tlb.h | 3 ---
> arch/sparc/Kconfig | 2 ++
> arch/sparc/include/asm/tlb_64.h | 2 --
> arch/x86/Kconfig | 1 +
> arch/x86/include/asm/tlb.h | 3 ---
> include/asm-generic/tlb.h | 21 +++++++++++++++++++--
> 13 files changed, 32 insertions(+), 35 deletions(-)
>
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -438,6 +438,13 @@ config MMU_GATHER_PAGE_SIZE
>
> config MMU_GATHER_NO_RANGE
> bool
> + select MMU_GATHER_MERGE_VMAS
> +
> +config MMU_GATHER_NO_FLUSH_CACHE
> + bool

If this is really a sparc-special and we don't necessarily want it to
proliferate, then maybe:

	default y
	depends on SPARC

would keep it confined?
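
i.e. something like this (a sketch of the confined variant, staying in
arch/Kconfig):

	config MMU_GATHER_NO_FLUSH_CACHE
		bool
		default y
		depends on SPARC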

But I don't mind either way and the important bits of the patch look good:

Acked-by: Will Deacon <[email protected]>

Thanks,

Will