2018-09-26 11:58:37

by Peter Zijlstra

Subject: [PATCH 12/18] arch/tlb: Clean up simple architectures

There are generally two cases:

1) either the platform has an efficient flush_tlb_range() and
asm-generic/tlb.h doesn't need any overrides at all.

2) or an architecture lacks an efficient flush_tlb_range() and
we override tlb_end_vma() and tlb_flush().

Convert all 'simple' architectures to one of these two forms.

alpha: has no range invalidate -> 2
arc: already used flush_tlb_range() -> 1
c6x: has no range invalidate -> 2
h8300: has no mmu
hexagon: has an efficient flush_tlb_range() -> 1
(flush_tlb_mm() is in fact a full range invalidate,
so no need to shoot down everything)
m68k: has inefficient flush_tlb_range() -> 2
microblaze: has no flush_tlb_range() -> 2
mips: has efficient flush_tlb_range() -> 1
(even though it currently seems to use flush_tlb_mm())
nds32: already uses flush_tlb_range() -> 1
nios2: has inefficient flush_tlb_range() -> 2
(no limit on range iteration)
openrisc: has inefficient flush_tlb_range() -> 2
(no limit on range iteration)
parisc: already uses flush_tlb_range() -> 1
sparc32: already uses flush_tlb_range() -> 1
unicore32: has inefficient flush_tlb_range() -> 2
(no limit on range iteration)
xtensa: has efficient flush_tlb_range() -> 1
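
To make the two forms concrete, a converted header reduces to roughly
the following in each case (sketch only, with made-up header guards;
see the actual hunks below for the real thing):

/* Form 1: efficient flush_tlb_range(); no overrides needed at all. */
#ifndef _ASM_FORM1_TLB_H
#define _ASM_FORM1_TLB_H

#include <asm-generic/tlb.h>

#endif

/* Form 2: no (efficient) range invalidate; override tlb_end_vma() and
 * tlb_flush() to punt to a full-mm invalidate. */
#ifndef _ASM_FORM2_TLB_H
#define _ASM_FORM2_TLB_H

#define tlb_end_vma(tlb, vma)		do { } while (0)
#define tlb_flush(tlb)			flush_tlb_mm((tlb)->mm)

#include <asm-generic/tlb.h>

#endif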

Cc: Richard Henderson <[email protected]>
Cc: Vineet Gupta <[email protected]>
Cc: Mark Salter <[email protected]>
Cc: Richard Kuo <[email protected]>
Cc: Michal Simek <[email protected]>
Cc: Paul Burton <[email protected]>
Cc: Greentime Hu <[email protected]>
Cc: Ley Foon Tan <[email protected]>
Cc: Jonas Bonn <[email protected]>
Cc: Helge Deller <[email protected]>
Cc: "David S. Miller" <[email protected]>
Cc: Guan Xuetao <[email protected]>
Cc: Max Filippov <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: "Aneesh Kumar K.V" <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Nick Piggin <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/alpha/include/asm/tlb.h | 2 --
arch/arc/include/asm/tlb.h | 23 -----------------------
arch/c6x/include/asm/tlb.h | 1 +
arch/h8300/include/asm/tlb.h | 2 --
arch/hexagon/include/asm/tlb.h | 12 ------------
arch/m68k/include/asm/tlb.h | 1 -
arch/microblaze/include/asm/tlb.h | 4 +---
arch/mips/include/asm/tlb.h | 8 --------
arch/nds32/include/asm/tlb.h | 10 ----------
arch/nios2/include/asm/tlb.h | 8 +++++---
arch/openrisc/include/asm/tlb.h | 6 ++++--
arch/parisc/include/asm/tlb.h | 13 -------------
arch/powerpc/include/asm/tlb.h | 1 -
arch/sparc/include/asm/tlb_32.h | 13 -------------
arch/unicore32/include/asm/tlb.h | 10 ++++++----
arch/xtensa/include/asm/tlb.h | 17 -----------------
16 files changed, 17 insertions(+), 114 deletions(-)

--- a/arch/alpha/include/asm/tlb.h
+++ b/arch/alpha/include/asm/tlb.h
@@ -4,8 +4,6 @@

#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, pte, addr) do { } while (0)
-
#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)

#include <asm-generic/tlb.h>
--- a/arch/arc/include/asm/tlb.h
+++ b/arch/arc/include/asm/tlb.h
@@ -9,29 +9,6 @@
#ifndef _ASM_ARC_TLB_H
#define _ASM_ARC_TLB_H

-#define tlb_flush(tlb) \
-do { \
- if (tlb->fullmm) \
- flush_tlb_mm((tlb)->mm); \
-} while (0)
-
-/*
- * This pair is called at time of munmap/exit to flush cache and TLB entries
- * for mappings being torn down.
- * 1) cache-flush part -implemented via tlb_start_vma( ) for VIPT aliasing D$
- * 2) tlb-flush part - implemted via tlb_end_vma( ) flushes the TLB range
- *
- * Note, read http://lkml.org/lkml/2004/1/15/6
- */
-
-#define tlb_end_vma(tlb, vma) \
-do { \
- if (!tlb->fullmm) \
- flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
-} while (0)
-
-#define __tlb_remove_tlb_entry(tlb, ptep, address)
-
#include <linux/pagemap.h>
#include <asm-generic/tlb.h>

--- a/arch/c6x/include/asm/tlb.h
+++ b/arch/c6x/include/asm/tlb.h
@@ -2,6 +2,7 @@
#ifndef _ASM_C6X_TLB_H
#define _ASM_C6X_TLB_H

+#define tlb_end_vma(tlb,vma) do { } while (0)
#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)

#include <asm-generic/tlb.h>
--- a/arch/h8300/include/asm/tlb.h
+++ b/arch/h8300/include/asm/tlb.h
@@ -2,8 +2,6 @@
#ifndef __H8300_TLB_H__
#define __H8300_TLB_H__

-#define tlb_flush(tlb) do { } while (0)
-
#include <asm-generic/tlb.h>

#endif
--- a/arch/hexagon/include/asm/tlb.h
+++ b/arch/hexagon/include/asm/tlb.h
@@ -22,18 +22,6 @@
#include <linux/pagemap.h>
#include <asm/tlbflush.h>

-/*
- * We don't need any special per-pte or per-vma handling...
- */
-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
-
-/*
- * .. because we flush the whole mm when it fills up
- */
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
-
#include <asm-generic/tlb.h>

#endif
--- a/arch/m68k/include/asm/tlb.h
+++ b/arch/m68k/include/asm/tlb.h
@@ -8,7 +8,6 @@
*/
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)

/*
* .. because we flush the whole mm when it
--- a/arch/microblaze/include/asm/tlb.h
+++ b/arch/microblaze/include/asm/tlb.h
@@ -11,14 +11,12 @@
#ifndef _ASM_MICROBLAZE_TLB_H
#define _ASM_MICROBLAZE_TLB_H

-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
-
#include <linux/pagemap.h>

#ifdef CONFIG_MMU
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while (0)
+#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
#endif

#include <asm-generic/tlb.h>
--- a/arch/mips/include/asm/tlb.h
+++ b/arch/mips/include/asm/tlb.h
@@ -5,14 +5,6 @@
#include <asm/cpu-features.h>
#include <asm/mipsregs.h>

-#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
-
-/*
- * .. because we flush the whole mm when it fills up.
- */
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
-
#define _UNIQUE_ENTRYHI(base, idx) \
(((base) + ((idx) << (PAGE_SHIFT + 1))) | \
(cpu_has_tlbinv ? MIPS_ENTRYHI_EHINV : 0))
--- a/arch/nds32/include/asm/tlb.h
+++ b/arch/nds32/include/asm/tlb.h
@@ -4,16 +4,6 @@
#ifndef __ASMNDS32_TLB_H
#define __ASMNDS32_TLB_H

-#define tlb_end_vma(tlb,vma) \
- do { \
- if(!tlb->fullmm) \
- flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
- } while (0)
-
-#define __tlb_remove_tlb_entry(tlb, pte, addr) do { } while (0)
-
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
-
#include <asm-generic/tlb.h>

#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte)
--- a/arch/nios2/include/asm/tlb.h
+++ b/arch/nios2/include/asm/tlb.h
@@ -11,12 +11,14 @@
#ifndef _ASM_NIOS2_TLB_H
#define _ASM_NIOS2_TLB_H

-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
-
extern void set_mmu_pid(unsigned long pid);

+/*
+ * nios2 does have flush_tlb_range(), but it lacks both a limit on the range
+ * iteration and a fallback to full mm invalidation, so use flush_tlb_mm().
+ */
#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
+#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)

#include <linux/pagemap.h>
#include <asm-generic/tlb.h>
--- a/arch/openrisc/include/asm/tlb.h
+++ b/arch/openrisc/include/asm/tlb.h
@@ -22,12 +22,14 @@
/*
* or32 doesn't need any special per-pte or
* per-vma handling..
+ *
+ * OpenRISC doesn't have an efficient flush_tlb_range() so use flush_tlb_mm()
+ * for everything.
*/
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
-
#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+
#include <linux/pagemap.h>
#include <asm-generic/tlb.h>

--- a/arch/parisc/include/asm/tlb.h
+++ b/arch/parisc/include/asm/tlb.h
@@ -2,19 +2,6 @@
#ifndef _PARISC_TLB_H
#define _PARISC_TLB_H

-#define tlb_flush(tlb) \
-do { if ((tlb)->fullmm) \
- flush_tlb_mm((tlb)->mm);\
-} while (0)
-
-#define tlb_end_vma(tlb, vma) \
-do { if (!(tlb)->fullmm) \
- flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
-} while (0)
-
-#define __tlb_remove_tlb_entry(tlb, pte, address) \
- do { } while (0)
-
#include <asm-generic/tlb.h>

#define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd)
--- a/arch/sparc/include/asm/tlb_32.h
+++ b/arch/sparc/include/asm/tlb_32.h
@@ -2,19 +2,6 @@
#ifndef _SPARC_TLB_H
#define _SPARC_TLB_H

-#define tlb_end_vma(tlb, vma) \
-do { \
- flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
-} while (0)
-
-#define __tlb_remove_tlb_entry(tlb, pte, address) \
- do { } while (0)
-
-#define tlb_flush(tlb) \
-do { \
- flush_tlb_mm((tlb)->mm); \
-} while (0)
-
#include <asm-generic/tlb.h>

#endif /* _SPARC_TLB_H */
--- a/arch/unicore32/include/asm/tlb.h
+++ b/arch/unicore32/include/asm/tlb.h
@@ -12,10 +12,12 @@
#ifndef __UNICORE_TLB_H__
#define __UNICORE_TLB_H__

-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+/*
+ * unicore32 lacks an efficient flush_tlb_range(), use flush_tlb_mm().
+ */
+#define tlb_start_vma(tlb, vma) do { } while (0)
+#define tlb_end_vma(tlb, vma) do { } while (0)
+#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)

#define __pte_free_tlb(tlb, pte, addr) \
do { \
--- a/arch/xtensa/include/asm/tlb.h
+++ b/arch/xtensa/include/asm/tlb.h
@@ -14,23 +14,6 @@
#include <asm/cache.h>
#include <asm/page.h>

-#if (DCACHE_WAY_SIZE <= PAGE_SIZE)
-
-# define tlb_end_vma(tlb,vma) do { } while (0)
-
-#else
-
-# define tlb_end_vma(tlb, vma) \
- do { \
- if (!tlb->fullmm) \
- flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
- } while(0)
-
-#endif
-
-#define __tlb_remove_tlb_entry(tlb,pte,addr) do { } while (0)
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
-
#include <asm-generic/tlb.h>

#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte)




2018-10-03 17:06:10

by Vineet Gupta

Subject: Re: [PATCH 12/18] arch/tlb: Clean up simple architectures

On 09/26/2018 04:56 AM, Peter Zijlstra wrote:
> There are generally two cases:
>
> 1) either the platform has an efficient flush_tlb_range() and
> asm-generic/tlb.h doesn't need any overrides at all.
>
> 2) or an architecture lacks an efficient flush_tlb_range() and
> we override tlb_end_vma() and tlb_flush().
>
> Convert all 'simple' architectures to one of these two forms.
>
> alpha: has no range invalidate -> 2
> arc: already used flush_tlb_range() -> 1
> c6x: has no range invalidate -> 2
> h8300: has no mmu
> hexagon: has an efficient flush_tlb_range() -> 1
> (flush_tlb_mm() is in fact a full range invalidate,
> so no need to shoot down everything)
> m68k: has inefficient flush_tlb_range() -> 2
> microblaze: has no flush_tlb_range() -> 2
> mips: has efficient flush_tlb_range() -> 1
> (even though it currently seems to use flush_tlb_mm())
> nds32: already uses flush_tlb_range() -> 1
> nios2: has inefficient flush_tlb_range() -> 2
> (no limit on range iteration)
> openrisc: has inefficient flush_tlb_range() -> 2
> (no limit on range iteration)
> parisc: already uses flush_tlb_range() -> 1
> sparc32: already uses flush_tlb_range() -> 1
> unicore32: has inefficient flush_tlb_range() -> 2
> (no limit on range iteration)
> xtensa: has efficient flush_tlb_range() -> 1
>
> Cc: Richard Henderson <[email protected]>
> Cc: Vineet Gupta <[email protected]>
> Cc: Mark Salter <[email protected]>
> Cc: Richard Kuo <[email protected]>
> Cc: Michal Simek <[email protected]>
> Cc: Paul Burton <[email protected]>
> Cc: Greentime Hu <[email protected]>
> Cc: Ley Foon Tan <[email protected]>
> Cc: Jonas Bonn <[email protected]>
> Cc: Helge Deller <[email protected]>
> Cc: "David S. Miller" <[email protected]>
> Cc: Guan Xuetao <[email protected]>
> Cc: Max Filippov <[email protected]>
> Cc: Will Deacon <[email protected]>
> Cc: "Aneesh Kumar K.V" <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Nick Piggin <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/alpha/include/asm/tlb.h | 2 --
> arch/arc/include/asm/tlb.h | 23 -----------------------
> arch/c6x/include/asm/tlb.h | 1 +
> arch/h8300/include/asm/tlb.h | 2 --
> arch/hexagon/include/asm/tlb.h | 12 ------------
> arch/m68k/include/asm/tlb.h | 1 -
> arch/microblaze/include/asm/tlb.h | 4 +---
> arch/mips/include/asm/tlb.h | 8 --------
> arch/nds32/include/asm/tlb.h | 10 ----------
> arch/nios2/include/asm/tlb.h | 8 +++++---
> arch/openrisc/include/asm/tlb.h | 6 ++++--
> arch/parisc/include/asm/tlb.h | 13 -------------
> arch/powerpc/include/asm/tlb.h | 1 -
> arch/sparc/include/asm/tlb_32.h | 13 -------------
> arch/unicore32/include/asm/tlb.h | 10 ++++++----
> arch/xtensa/include/asm/tlb.h | 17 -----------------
> 16 files changed, 17 insertions(+), 114 deletions(-)
>
> --- a/arch/alpha/include/asm/tlb.h
> +++ b/arch/alpha/include/asm/tlb.h
> @@ -4,8 +4,6 @@
>
> #define tlb_start_vma(tlb, vma) do { } while (0)
> #define tlb_end_vma(tlb, vma) do { } while (0)
> -#define __tlb_remove_tlb_entry(tlb, pte, addr) do { } while (0)
> -
> #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
>
> #include <asm-generic/tlb.h>
> --- a/arch/arc/include/asm/tlb.h
> +++ b/arch/arc/include/asm/tlb.h
> @@ -9,29 +9,6 @@
> #ifndef _ASM_ARC_TLB_H
> #define _ASM_ARC_TLB_H
>
> -#define tlb_flush(tlb) \
> -do { \
> - if (tlb->fullmm) \
> - flush_tlb_mm((tlb)->mm); \
> -} while (0)
> -
> -/*
> - * This pair is called at time of munmap/exit to flush cache and TLB entries
> - * for mappings being torn down.
> - * 1) cache-flush part -implemented via tlb_start_vma( ) for VIPT aliasing D$
> - * 2) tlb-flush part - implemted via tlb_end_vma( ) flushes the TLB range
> - *
> - * Note, read http://lkml.org/lkml/2004/1/15/6
> - */
> -
> -#define tlb_end_vma(tlb, vma) \
> -do { \
> - if (!tlb->fullmm) \
> - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
> -} while (0)
> -
> -#define __tlb_remove_tlb_entry(tlb, ptep, address)
> -
> #include <linux/pagemap.h>
> #include <asm-generic/tlb.h>

LGTM per discussion in an earlier thread. However, given that for "simpler"
arches the whole series doesn't apply, can you please beef up the changelog
so I don't go scratching my head 2 years down the line? It currently
describes the hows of things but not exactly the whys: shift_arg_pages()
missing tlb_start_vma(), move_page_tables() looking dodgy, yada yadda?

Thx,
-Vineet


2018-10-11 15:09:36

by Peter Zijlstra

Subject: Re: [PATCH 12/18] arch/tlb: Clean up simple architectures

On Wed, Oct 03, 2018 at 05:03:50PM +0000, Vineet Gupta wrote:
> On 09/26/2018 04:56 AM, Peter Zijlstra wrote:
> > There are generally two cases:
> >
> > 1) either the platform has an efficient flush_tlb_range() and
> > asm-generic/tlb.h doesn't need any overrides at all.
> >
> > 2) or an architecture lacks an efficient flush_tlb_range() and
> > we override tlb_end_vma() and tlb_flush().
> >
> > Convert all 'simple' architectures to one of these two forms.
> >

> > --- a/arch/arc/include/asm/tlb.h
> > +++ b/arch/arc/include/asm/tlb.h
> > @@ -9,29 +9,6 @@
> > #ifndef _ASM_ARC_TLB_H
> > #define _ASM_ARC_TLB_H
> >
> > -#define tlb_flush(tlb) \
> > -do { \
> > - if (tlb->fullmm) \
> > - flush_tlb_mm((tlb)->mm); \
> > -} while (0)
> > -
> > -/*
> > - * This pair is called at time of munmap/exit to flush cache and TLB entries
> > - * for mappings being torn down.
> > - * 1) cache-flush part -implemented via tlb_start_vma( ) for VIPT aliasing D$
> > - * 2) tlb-flush part - implemted via tlb_end_vma( ) flushes the TLB range
> > - *
> > - * Note, read http://lkml.org/lkml/2004/1/15/6
> > - */
> > -
> > -#define tlb_end_vma(tlb, vma) \
> > -do { \
> > - if (!tlb->fullmm) \
> > - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
> > -} while (0)
> > -
> > -#define __tlb_remove_tlb_entry(tlb, ptep, address)
> > -
> > #include <linux/pagemap.h>
> > #include <asm-generic/tlb.h>
>
> LGTM per discussion in an earlier thread. However, given that for "simpler"
> arches the whole series doesn't apply, can you please beef up the changelog
> so I don't go scratching my head 2 years down the line? It currently
> describes the hows of things but not exactly the whys: shift_arg_pages()
> missing tlb_start_vma(), move_page_tables() looking dodgy, yada yadda?

Right you are. Thanks for pointing out the somewhat sparse Changelog;
typically I end up kicking myself a few years down the line.

I think I will in fact change the implementation a little and provide a
symbol/Kconfig to switch the default implementation between
flush_tlb_range() and flush_tlb_mm().

That avoids some of the repetition. See below for a preview of the new
changelog; does that clarify things enough?

---
Subject: arch/tlb: Clean up simple architectures
From: Peter Zijlstra <[email protected]>
Date: Tue Sep 4 17:04:07 CEST 2018

The generic mmu_gather implementation is geared towards range tracking,
and provided the architecture has a fairly efficient flush_tlb_range()
implementation (or provides a custom tlb_flush() implementation), things
will work well.

The one case this doesn't cover well is where there is no (efficient)
range invalidate at all. In this case we can select
MMU_GATHER_NO_RANGE.

So this reduces to two cases:

1) either the platform has an efficient flush_tlb_range() and
asm-generic/tlb.h doesn't need any overrides at all.

2) or an architecture lacks an efficient flush_tlb_range() and
we need to select MMU_GATHER_NO_RANGE.

Convert all 'simple' architectures to one of these two forms.

alpha: has no range invalidate -> 2
arc: already used flush_tlb_range() -> 1
c6x: has no range invalidate -> 2
hexagon: has an efficient flush_tlb_range() -> 1
(flush_tlb_mm() is in fact a full range invalidate,
so no need to shoot down everything)
m68k: has inefficient flush_tlb_range() -> 2
microblaze: has no flush_tlb_range() -> 2
mips: has efficient flush_tlb_range() -> 1
(even though it currently seems to use flush_tlb_mm())
nds32: already uses flush_tlb_range() -> 1
nios2: has inefficient flush_tlb_range() -> 2
(no limit on range iteration)
openrisc: has inefficient flush_tlb_range() -> 2
(no limit on range iteration)
parisc: already uses flush_tlb_range() -> 1
sparc32: already uses flush_tlb_range() -> 1
unicore32: has inefficient flush_tlb_range() -> 2
(no limit on range iteration)
xtensa: has efficient flush_tlb_range() -> 1

Note this also fixes a bug in the existing code for a number of
platforms. Those platforms that did:

tlb_end_vma() -> if (!fullmm) flush_tlb_*()
tlb_flush() -> if (fullmm) flush_tlb_mm()

missed the case of shift_arg_pages(), which doesn't have @fullmm set
and never calls into tlb_*vma(), but still frees page-tables and thus
needs an invalidate. The new code handles this by detecting a non-empty
range, and either issuing the matching range invalidate or a full
invalidate, depending on the capabilities.
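
As a sketch of the planned mechanism (assuming the generic mmu_gather
keeps tracking ->start/->end as it does today, and using the
MMU_GATHER_NO_RANGE symbol named above), the default flush would
reduce to:

#ifdef CONFIG_MMU_GATHER_NO_RANGE
/*
 * Anything that unmapped pages or freed page-tables leaves a non-empty
 * range behind; lacking a usable range invalidate, flush the whole mm.
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
	if (tlb->end)
		flush_tlb_mm(tlb->mm);
}
#endif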

Cc: Nick Piggin <[email protected]>
Cc: "David S. Miller" <[email protected]>
Cc: Michal Simek <[email protected]>
Cc: Helge Deller <[email protected]>
Cc: Greentime Hu <[email protected]>
Cc: Richard Henderson <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "Aneesh Kumar K.V" <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Ley Foon Tan <[email protected]>
Cc: Jonas Bonn <[email protected]>
Cc: Mark Salter <[email protected]>
Cc: Richard Kuo <[email protected]>
Cc: Vineet Gupta <[email protected]>
Cc: Paul Burton <[email protected]>
Cc: Max Filippov <[email protected]>
Cc: Guan Xuetao <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

2018-10-12 19:40:50

by Vineet Gupta

Subject: Re: [PATCH 12/18] arch/tlb: Clean up simple architectures

On 10/11/2018 08:06 AM, Peter Zijlstra wrote:
> On Wed, Oct 03, 2018 at 05:03:50PM +0000, Vineet Gupta wrote:
>> On 09/26/2018 04:56 AM, Peter Zijlstra wrote:
>>> There are generally two cases:
>>>
>>> 1) either the platform has an efficient flush_tlb_range() and
>>> asm-generic/tlb.h doesn't need any overrides at all.
>>>
>>> 2) or an architecture lacks an efficient flush_tlb_range() and
>>> we override tlb_end_vma() and tlb_flush().
>>>
>>> Convert all 'simple' architectures to one of these two forms.
>>>
>>> --- a/arch/arc/include/asm/tlb.h
>>> +++ b/arch/arc/include/asm/tlb.h
>>> @@ -9,29 +9,6 @@
>>> #ifndef _ASM_ARC_TLB_H
>>> #define _ASM_ARC_TLB_H
>>>
>>> -#define tlb_flush(tlb) \
>>> -do { \
>>> - if (tlb->fullmm) \
>>> - flush_tlb_mm((tlb)->mm); \
>>> -} while (0)
>>> -
>>> -/*
>>> - * This pair is called at time of munmap/exit to flush cache and TLB entries
>>> - * for mappings being torn down.
>>> - * 1) cache-flush part -implemented via tlb_start_vma( ) for VIPT aliasing D$
>>> - * 2) tlb-flush part - implemted via tlb_end_vma( ) flushes the TLB range
>>> - *
>>> - * Note, read http://lkml.org/lkml/2004/1/15/6
>>> - */
>>> -
>>> -#define tlb_end_vma(tlb, vma) \
>>> -do { \
>>> - if (!tlb->fullmm) \
>>> - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
>>> -} while (0)
>>> -
>>> -#define __tlb_remove_tlb_entry(tlb, ptep, address)
>>> -
>>> #include <linux/pagemap.h>
>>> #include <asm-generic/tlb.h>
>> LGTM per discussion in an earlier thread. However, given that for "simpler"
>> arches the whole series doesn't apply, can you please beef up the changelog
>> so I don't go scratching my head 2 years down the line? It currently
>> describes the hows of things but not exactly the whys: shift_arg_pages()
>> missing tlb_start_vma(), move_page_tables() looking dodgy, yada yadda?
> Right you are. Thanks for pointing out the somewhat sparse Changelog;
> typically I end up kicking myself a few years down the line.
>
> I think I will in fact change the implementation a little and provide a
> symbol/Kconfig to switch the default implementation between
> flush_tlb_range() and flush_tlb_mm().
>
> That avoids some of the repetition. See below for a preview of the new
> changelog; does that clarify things enough?
>
> ---
> Subject: arch/tlb: Clean up simple architectures
> From: Peter Zijlstra <[email protected]>
> Date: Tue Sep 4 17:04:07 CEST 2018
>
> The generic mmu_gather implementation is geared towards range tracking,
> and provided the architecture has a fairly efficient flush_tlb_range()
> implementation (or provides a custom tlb_flush() implementation), things
> will work well.
>
> The one case this doesn't cover well is where there is no (efficient)
> range invalidate at all. In this case we can select
> MMU_GATHER_NO_RANGE.
>
> So this reduces to two cases:
>
> 1) either the platform has an efficient flush_tlb_range() and
> asm-generic/tlb.h doesn't need any overrides at all.
>
> 2) or an architecture lacks an efficient flush_tlb_range() and
> we need to select MMU_GATHER_NO_RANGE.
>
> Convert all 'simple' architectures to one of these two forms.
>
> alpha: has no range invalidate -> 2
> arc: already used flush_tlb_range() -> 1
> c6x: has no range invalidate -> 2
> hexagon: has an efficient flush_tlb_range() -> 1
> (flush_tlb_mm() is in fact a full range invalidate,
> so no need to shoot down everything)
> m68k: has inefficient flush_tlb_range() -> 2
> microblaze: has no flush_tlb_range() -> 2
> mips: has efficient flush_tlb_range() -> 1
> (even though it currently seems to use flush_tlb_mm())
> nds32: already uses flush_tlb_range() -> 1
> nios2: has inefficient flush_tlb_range() -> 2
> (no limit on range iteration)
> openrisc: has inefficient flush_tlb_range() -> 2
> (no limit on range iteration)
> parisc: already uses flush_tlb_range() -> 1
> sparc32: already uses flush_tlb_range() -> 1
> unicore32: has inefficient flush_tlb_range() -> 2
> (no limit on range iteration)
> xtensa: has efficient flush_tlb_range() -> 1
>
> Note this also fixes a bug in the existing code for a number of
> platforms. Those platforms that did:
>
> tlb_end_vma() -> if (!fullmm) flush_tlb_*()
> tlb_flush() -> if (fullmm) flush_tlb_mm()
>
> missed the case of shift_arg_pages(), which doesn't have @fullmm set
> and never calls into tlb_*vma(), but still frees page-tables and thus
> needs an invalidate. The new code handles this by detecting a non-empty
> range, and either issuing the matching range invalidate or a full
> invalidate, depending on the capabilities.
>
> Cc: Nick Piggin <[email protected]>
> Cc: "David S. Miller" <[email protected]>
> Cc: Michal Simek <[email protected]>
> Cc: Helge Deller <[email protected]>
> Cc: Greentime Hu <[email protected]>
> Cc: Richard Henderson <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: "Aneesh Kumar K.V" <[email protected]>
> Cc: Will Deacon <[email protected]>
> Cc: Ley Foon Tan <[email protected]>
> Cc: Jonas Bonn <[email protected]>
> Cc: Mark Salter <[email protected]>
> Cc: Richard Kuo <[email protected]>
> Cc: Vineet Gupta <[email protected]>
> Cc: Paul Burton <[email protected]>
> Cc: Max Filippov <[email protected]>
> Cc: Guan Xuetao <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

Very nice. Thx for doing this.

Once you have redone this, please point me to a branch so I can give this a spin.
I've always been interested in tracking down / optimizing the full TLB flushes,
which ARC implements by simply moving the MMU/process to a new ASID (TLB entries
are tagged with an 8-bit value, unique per process). When I started looking into
this, a simple ls (fork+execve) would increment the ASID by 13, which I'd
optimized to a reasonable 4. I haven't checked that in recent times though, so it
would be fun to revive that measurement.
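
(For reference, a rough sketch of that trick, with the SMP and
mm_users bookkeeping stripped; the real code lives in
arch/arc/mm/tlb.c:

void local_flush_tlb_mm(struct mm_struct *mm)
{
	/*
	 * Moving the mm to a fresh 8-bit ASID makes all of its old
	 * TLB entries unreachable; no per-entry shootdown needed.
	 */
	if (current->mm == mm)
		get_new_mmu_context(mm);
}

so every "full" flush costs one ASID, which is why fork+execve burning
13 of them per ls was worth optimizing.)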

-Vineet

2018-10-15 14:17:07

by Peter Zijlstra

Subject: Re: [PATCH 12/18] arch/tlb: Clean up simple architectures

On Fri, Oct 12, 2018 at 07:40:04PM +0000, Vineet Gupta wrote:
> Very nice. Thx for doing this.
>
> Once you have redone this, please point me to a branch so I can give this a spin.
> I've always been interested in tracking down / optimizing the full TLB flushes,
> which ARC implements by simply moving the MMU/process to a new ASID (TLB entries
> are tagged with an 8-bit value, unique per process). When I started looking into
> this, a simple ls (fork+execve) would increment the ASID by 13, which I'd
> optimized to a reasonable 4. I haven't checked that in recent times though, so it
> would be fun to revive that measurement.

I just pushed out the latest version to:

git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git mm/tlb

(mandatory caution: that tree is unstable / throw-away)
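
For anyone else wanting to give it a spin, plain git suffices:

  git fetch git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git mm/tlb
  git checkout -b mm-tlb FETCH_HEAD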

I'll wait a few days to see what, if anything, comes back from 0day
before posting again.