2011-03-02 18:05:53

by Peter Zijlstra

Subject: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

Might want to optimize the tlb_flush() function to do a full mm flush
when the range is 'large'; IA64 does this too.
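
Something like the below, perhaps (untested sketch only; the threshold is a
made-up placeholder, and the flush_tlb_range() call just mirrors the one in
the patch):

#define TLB_FLUSH_RANGE_LIMIT   (64 * PAGE_SIZE)        /* arbitrary cut-off */

static inline void tlb_flush(struct mmu_gather *tlb)
{
        /* fall back to a full mm flush once the gathered range gets large */
        if (tlb->fullmm || (tlb->end - tlb->start) > TLB_FLUSH_RANGE_LIMIT)
                flush_tlb_mm(tlb->mm);
        else
                flush_tlb_range(tlb->mm, tlb->start, tlb->end);
}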

Cc: Russell King <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
---
arch/arm/Kconfig | 1
arch/arm/include/asm/tlb.h | 174 ++-------------------------------------------
2 files changed, 9 insertions(+), 166 deletions(-)

Index: linux-2.6/arch/arm/Kconfig
===================================================================
--- linux-2.6.orig/arch/arm/Kconfig
+++ linux-2.6/arch/arm/Kconfig
@@ -28,6 +28,7 @@ config ARM
select HAVE_C_RECORDMCOUNT
select HAVE_GENERIC_HARDIRQS
select HAVE_SPARSE_IRQ
+ select HAVE_MMU_GATHER_RANGE if MMU
help
The ARM series is a line of low-power-consumption RISC chip designs
licensed by ARM Ltd and targeted at embedded applications and
Index: linux-2.6/arch/arm/include/asm/tlb.h
===================================================================
--- linux-2.6.orig/arch/arm/include/asm/tlb.h
+++ linux-2.6/arch/arm/include/asm/tlb.h
@@ -29,184 +29,26 @@

#else /* !CONFIG_MMU */

-#include <linux/swap.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-
-/*
- * We need to delay page freeing for SMP as other CPUs can access pages
- * which have been removed but not yet had their TLB entries invalidated.
- * Also, as ARMv7 speculative prefetch can drag new entries into the TLB,
- * we need to apply this same delaying tactic to ensure correct operation.
- */
-#if defined(CONFIG_SMP) || defined(CONFIG_CPU_32v7)
-#define tlb_fast_mode(tlb) 0
-#else
-#define tlb_fast_mode(tlb) 1
-#endif
-
-#define MMU_GATHER_BUNDLE 8
-
-/*
- * TLB handling. This allows us to remove pages from the page
- * tables, and efficiently handle the TLB issues.
- */
-struct mmu_gather {
- struct mm_struct *mm;
- unsigned int fullmm;
- struct vm_area_struct *vma;
- unsigned long range_start;
- unsigned long range_end;
- unsigned int nr;
- unsigned int max;
- struct page **pages;
- struct page *local[MMU_GATHER_BUNDLE];
-};
-
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
-
-/*
- * This is unnecessarily complex. There's three ways the TLB shootdown
- * code is used:
- * 1. Unmapping a range of vmas. See zap_page_range(), unmap_region().
- * tlb->fullmm = 0, and tlb_start_vma/tlb_end_vma will be called.
- * tlb->vma will be non-NULL.
- * 2. Unmapping all vmas. See exit_mmap().
- * tlb->fullmm = 1, and tlb_start_vma/tlb_end_vma will be called.
- * tlb->vma will be non-NULL. Additionally, page tables will be freed.
- * 3. Unmapping argument pages. See shift_arg_pages().
- * tlb->fullmm = 0, but tlb_start_vma/tlb_end_vma will not be called.
- * tlb->vma will be NULL.
- */
-static inline void tlb_flush(struct mmu_gather *tlb)
-{
- if (tlb->fullmm || !tlb->vma)
- flush_tlb_mm(tlb->mm);
- else if (tlb->range_end > 0) {
- flush_tlb_range(tlb->mm, tlb->range_start, tlb->range_end);
- tlb->range_start = TASK_SIZE;
- tlb->range_end = 0;
- }
-}
-
-static inline void tlb_add_flush(struct mmu_gather *tlb, unsigned long addr)
-{
- if (!tlb->fullmm) {
- if (addr < tlb->range_start)
- tlb->range_start = addr;
- if (addr + PAGE_SIZE > tlb->range_end)
- tlb->range_end = addr + PAGE_SIZE;
- }
-}
-
-static inline void __tlb_alloc_page(struct mmu_gather *tlb)
-{
- unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
-
- if (addr) {
- tlb->pages = (void *)addr;
- tlb->max = PAGE_SIZE / sizeof(struct page *);
- }
-}
-
-static inline void tlb_flush_mmu(struct mmu_gather *tlb)
-{
- tlb_flush(tlb);
- if (!tlb_fast_mode(tlb)) {
- free_pages_and_swap_cache(tlb->pages, tlb->nr);
- tlb->nr = 0;
- if (tlb->pages == tlb->local)
- __tlb_alloc_page(tlb);
- }
-}
+static inline void tlb_flush(struct mmu_gather *tlb);

-static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int fullmm)
-{
- tlb->mm = mm;
- tlb->fullmm = fullmm;
- tlb->vma = NULL;
- tlb->max = ARRAY_SIZE(tlb->local);
- tlb->pages = tlb->local;
- tlb->nr = 0;
- __tlb_alloc_page(tlb);
-}
+#define __tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0)

static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
-{
- tlb_flush_mmu(tlb);
+__pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr);
+#define __pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp)

- /* keep the page table cache within bounds */
- check_pgt_cache();
+#include <asm-generic/tlb.h>

- if (tlb->pages != tlb->local)
- free_pages((unsigned long)tlb->pages, 0);
-}
-
-/*
- * Memorize the range for the TLB flush.
- */
-static inline void
-tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr)
-{
- tlb_add_flush(tlb, addr);
-}
-
-/*
- * In the case of tlb vma handling, we can optimise these away in the
- * case where we're doing a full MM flush. When we're doing a munmap,
- * the vmas are adjusted to only cover the region to be torn down.
- */
-static inline void
-tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
+static inline void tlb_flush(struct mmu_gather *tlb)
{
- if (!tlb->fullmm) {
- flush_cache_range(vma, vma->vm_start, vma->vm_end);
- tlb->vma = vma;
- tlb->range_start = TASK_SIZE;
- tlb->range_end = 0;
- }
+ flush_tlb_range(tlb->mm, tlb->start, tlb->end);
}

static inline void
-tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
-{
- if (!tlb->fullmm)
- tlb_flush(tlb);
-}
-
-static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
-{
- if (tlb_fast_mode(tlb)) {
- free_page_and_swap_cache(page);
- } else {
- tlb->pages[tlb->nr++] = page;
- if (tlb->nr >= tlb->max)
- return 1;
- }
- return 0;
-}
-
-static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
-{
- if (__tlb_remove_page(tlb, page))
- tlb_flush_mmu(tlb);
-}
-
-static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
- unsigned long addr)
+__pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
{
pgtable_page_dtor(pte);
- tlb_add_flush(tlb, addr);
tlb_remove_page(tlb, pte);
}
-
-#define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr)
-#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp)
-#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp)
-
-#define tlb_migrate_finish(mm) do { } while (0)
-
#endif /* CONFIG_MMU */
#endif


2011-03-09 15:16:10

by Catalin Marinas

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

Hi Peter,

On 2 March 2011 17:59, Peter Zijlstra <[email protected]> wrote:
> --- linux-2.6.orig/arch/arm/include/asm/tlb.h
> +++ linux-2.6/arch/arm/include/asm/tlb.h
[...]
> +__pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
>  {
>        pgtable_page_dtor(pte);
> -       tlb_add_flush(tlb, addr);
>        tlb_remove_page(tlb, pte);
>  }

I think we still need a tlb_track_range() call here. On the path to
pte_free_tlb() (for example shift_arg_pages ... free_pte_range) there
doesn't seem to be any code setting the tlb->start/end range. Did I
miss anything?

Thanks.

--
Catalin

2011-03-09 15:20:18

by Peter Zijlstra

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Wed, 2011-03-09 at 15:16 +0000, Catalin Marinas wrote:
> Hi Peter,
>
> On 2 March 2011 17:59, Peter Zijlstra <[email protected]> wrote:
> > --- linux-2.6.orig/arch/arm/include/asm/tlb.h
> > +++ linux-2.6/arch/arm/include/asm/tlb.h
> [...]
> > +__pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
> > {
> > pgtable_page_dtor(pte);
> > - tlb_add_flush(tlb, addr);
> > tlb_remove_page(tlb, pte);
> > }
>
> I think we still need a tlb_track_range() call here. On the path to
> pte_free_tlb() (for example shift_arg_pages ... free_pte_range) there
> doesn't seem to be any code setting the tlb->start/end range. Did I
> miss anything?

Patch 3 included:

-#define pte_free_tlb(tlb, ptep, address) \
- do { \
- tlb->need_flush = 1; \
- __pte_free_tlb(tlb, ptep, address); \
+#define pte_free_tlb(tlb, ptep, address) \
+ do { \
+ tlb->need_flush = 1; \
+ tlb_track_range(tlb, address, pmd_addr_end(address, TASK_SIZE));\
+ __pte_free_tlb(tlb, ptep, address); \
} while (0)

Also, I posted a new version of this series here:

https://lkml.org/lkml/2011/3/7/308

2011-03-09 15:36:32

by Catalin Marinas

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Wed, 2011-03-09 at 15:19 +0000, Peter Zijlstra wrote:
> On Wed, 2011-03-09 at 15:16 +0000, Catalin Marinas wrote:
> > Hi Peter,
> >
> > On 2 March 2011 17:59, Peter Zijlstra <[email protected]> wrote:
> > > --- linux-2.6.orig/arch/arm/include/asm/tlb.h
> > > +++ linux-2.6/arch/arm/include/asm/tlb.h
> > [...]
> > > +__pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
> > > {
> > > pgtable_page_dtor(pte);
> > > - tlb_add_flush(tlb, addr);
> > > tlb_remove_page(tlb, pte);
> > > }
> >
> > I think we still need a tlb_track_range() call here. On the path to
> > pte_free_tlb() (for example shift_arg_pages ... free_pte_range) there
> > doesn't seem to be any code setting the tlb->start/end range. Did I
> > miss anything?
>
> Patch 3 included:
>
> -#define pte_free_tlb(tlb, ptep, address) \
> - do { \
> - tlb->need_flush = 1; \
> - __pte_free_tlb(tlb, ptep, address); \
> +#define pte_free_tlb(tlb, ptep, address) \
> + do { \
> + tlb->need_flush = 1; \
> + tlb_track_range(tlb, address, pmd_addr_end(address, TASK_SIZE));\
> + __pte_free_tlb(tlb, ptep, address); \
> } while (0)

OK, so the range is tracked. The only issue is that for platforms with a
folded pmd the range end would go to TASK_SIZE. In this case
pgd_addr_end() would make more sense (or something like
PTRS_PER_PTE*PAGE_SIZE).
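
For reference (quoting the generic headers from memory, so treat as
approximate): with a folded pmd, pmd_addr_end() just returns its end
argument, while the non-folded version clips to the next pmd boundary:

/* include/asm-generic/pgtable-nopmd.h (pmd folded) */
#define pmd_addr_end(addr, end)         (end)

/* include/asm-generic/pgtable.h (pmd not folded) */
#define pmd_addr_end(addr, end)                                         \
({      unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;     \
        (__boundary - 1 < (end) - 1) ? __boundary : (end);             \
})

so pmd_addr_end(address, TASK_SIZE) degenerates to TASK_SIZE whenever the
pmd level is folded.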

--
Catalin

2011-03-09 15:40:01

by Peter Zijlstra

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Wed, 2011-03-09 at 15:36 +0000, Catalin Marinas wrote:
> On Wed, 2011-03-09 at 15:19 +0000, Peter Zijlstra wrote:
> > On Wed, 2011-03-09 at 15:16 +0000, Catalin Marinas wrote:
> > > Hi Peter,
> > >
> > > On 2 March 2011 17:59, Peter Zijlstra <[email protected]> wrote:
> > > > --- linux-2.6.orig/arch/arm/include/asm/tlb.h
> > > > +++ linux-2.6/arch/arm/include/asm/tlb.h
> > > [...]
> > > > +__pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
> > > > {
> > > > pgtable_page_dtor(pte);
> > > > - tlb_add_flush(tlb, addr);
> > > > tlb_remove_page(tlb, pte);
> > > > }
> > >
> > > I think we still need a tlb_track_range() call here. On the path to
> > > pte_free_tlb() (for example shift_arg_pages ... free_pte_range) there
> > > doesn't seem to be any code setting the tlb->start/end range. Did I
> > > miss anything?
> >
> > Patch 3 included:
> >
> > -#define pte_free_tlb(tlb, ptep, address) \
> > - do { \
> > - tlb->need_flush = 1; \
> > - __pte_free_tlb(tlb, ptep, address); \
> > +#define pte_free_tlb(tlb, ptep, address) \
> > + do { \
> > + tlb->need_flush = 1; \
> > + tlb_track_range(tlb, address, pmd_addr_end(address, TASK_SIZE));\
> > + __pte_free_tlb(tlb, ptep, address); \
> > } while (0)
>
> OK, so the range is tracked. The only issue is that for platforms with a
> folded pmd the range end would go to TASK_SIZE. In this case
> pgd_addr_end() would make more sense (or something like
> PTRS_PER_PTE*PAGE_SIZE).

Urgh, so when pmds are folded, pmd_addr_end() doesn't get to be the
next-biggest thing? PTRS_PER_PTE*PAGE_SIZE-like things don't work, since
there is no guarantee addr is at the beginning of the pmd.

Ok, will try and sort that out.

2011-03-09 15:49:11

by Peter Zijlstra

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Wed, 2011-03-09 at 16:39 +0100, Peter Zijlstra wrote:
>
> Ok, will try and sort that out.

We could do something like the below and use the end passed down, which,
because it goes top-down, should be clipped to the appropriate size. It just
means touching all the p??_free_tlb() implementations ;-)

Will do on the next iteration ;-)

---

diff --git a/mm/memory.c b/mm/memory.c
index 5823698..833bd90 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -222,11 +222,11 @@ void pmd_clear_bad(pmd_t *pmd)
* has been handled earlier when unmapping all the memory regions.
*/
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
- unsigned long addr)
+ unsigned long addr, unsigned long end)
{
pgtable_t token = pmd_pgtable(*pmd);
pmd_clear(pmd);
- pte_free_tlb(tlb, token, addr);
+ pte_free_tlb(tlb, token, addr, end);
tlb->mm->nr_ptes--;
}

@@ -244,7 +244,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- free_pte_range(tlb, pmd, addr);
+ free_pte_range(tlb, pmd, addr, next);
} while (pmd++, addr = next, addr != end);

start &= PUD_MASK;
@@ -260,7 +260,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,

pmd = pmd_offset(pud, start);
pud_clear(pud);
- pmd_free_tlb(tlb, pmd, start);
+ pmd_free_tlb(tlb, pmd, start, end);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -293,7 +293,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,

pud = pud_offset(pgd, start);
pgd_clear(pgd);
- pud_free_tlb(tlb, pud, start);
+ pud_free_tlb(tlb, pud, start, end);
}

/*


2011-03-09 16:34:40

by Catalin Marinas

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Wed, 2011-03-09 at 15:48 +0000, Peter Zijlstra wrote:
> On Wed, 2011-03-09 at 16:39 +0100, Peter Zijlstra wrote:
> >
> > Ok, will try and sort that out.
>
> We could do something like the below and use the end passed down, which
> because it goes top down should be clipped at the appropriate size, just
> means touching all the p??_free_tlb() implementations ;-)

Looks fine to me (apart from the hassle to change the p??_free_tlb()
definitions).

--
Catalin

2012-05-17 03:08:45

by Paul Mundt

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Wed, Mar 02, 2011 at 06:59:32PM +0100, Peter Zijlstra wrote:
> Might want to optimize the tlb_flush() function to do a full mm flush
> when the range is 'large', IA64 does this too.
>
> Cc: Russell King <[email protected]>
> Signed-off-by: Peter Zijlstra <[email protected]>

The current version in tlb-unify blows up due to a missing
tlb_add_flush() definition. I can see in this thread tlb_track_range()
was factored in, but the __pte_free_tlb()/__pmd_free_tlb() semantics have
changed since then. Adding a dumb tlb_add_flush() that wraps into
tlb_track_range() seems to do the right thing, but someone more familiar
with LPAE and ARM's double PMDs will have to figure out whether the
tlb_track_range() in asm-generic/tlb.h's pmd/pte_free_tlb() are
sufficient to remove the tlb_add_flush() calls or not.

Here's the dumb build fix for now though:

Signed-off-by: Paul Mundt <[email protected]>

---

diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 37dbce9..1de4b21 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -38,6 +38,11 @@ __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr);

#include <asm-generic/tlb.h>

+static inline void tlb_add_flush(struct mmu_gather *tlb, unsigned long addr)
+{
+ tlb_track_range(tlb, addr, addr + PAGE_SIZE);
+}
+
static inline void
__pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
{

2012-05-17 09:30:46

by Catalin Marinas

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, May 17, 2012 at 04:05:52AM +0100, Paul Mundt wrote:
> On Wed, Mar 02, 2011 at 06:59:32PM +0100, Peter Zijlstra wrote:
> > Might want to optimize the tlb_flush() function to do a full mm flush
> > when the range is 'large', IA64 does this too.
> >
> > Cc: Russell King <[email protected]>
> > Signed-off-by: Peter Zijlstra <[email protected]>
>
> The current version in tlb-unify blows up due to a missing
> tlb_add_flush() definition. I can see in this thread tlb_track_range()
> was factored in, but the __pte_free_tlb()/__pmd_free_tlb() semantics have
> changed since then. Adding a dumb tlb_add_flush() that wraps in to
> tlb_track_range() seems to do the right thing, but someone more familiar
> with LPAE and ARM's double PMDs will have to figure out whether the
> tlb_track_range() in asm-generic/tlb.h's pmd/pte_free_tlb() are
> sufficient to remove the tlb_add_flush() calls or not.
>
> Here's the dumb build fix for now though:
>
> Signed-off-by: Paul Mundt <[email protected]>
>
> ---
>
> diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
> index 37dbce9..1de4b21 100644
> --- a/arch/arm/include/asm/tlb.h
> +++ b/arch/arm/include/asm/tlb.h
> @@ -38,6 +38,11 @@ __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr);
>
> #include <asm-generic/tlb.h>
>
> +static inline void tlb_add_flush(struct mmu_gather *tlb, unsigned long addr)
> +{
> + tlb_track_range(tlb, addr, addr + PAGE_SIZE);
> +}


I think that's still needed in case the range given to pte_free_tlb()
does not cover both pmd entries (1MB each) that the classic ARM MMU
uses. But we could call tlb_track_range() directly rather than adding a
tlb_add_flush() function (untested):


diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 37dbce9..efe2831 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -42,15 +46,14 @@ static inline void
__pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
{
pgtable_page_dtor(pte);
-
+#ifndef CONFIG_ARM_LPAE
/*
* With the classic ARM MMU, a pte page has two corresponding pmd
* entries, each covering 1MB.
*/
- addr &= PMD_MASK;
- tlb_add_flush(tlb, addr + SZ_1M - PAGE_SIZE);
- tlb_add_flush(tlb, addr + SZ_1M);
-
+ addr = (addr & PMD_MASK) + SZ_1M;
+ tlb_track_range(tlb, addr - PAGE_SIZE, addr + PAGE_SIZE);
+#endif
tlb_remove_page(tlb, pte);
}

@@ -58,7 +61,6 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
unsigned long addr)
{
#ifdef CONFIG_ARM_LPAE
- tlb_add_flush(tlb, addr);
tlb_remove_page(tlb, virt_to_page(pmdp));
#endif
}


Another minor thing is that on newer ARM processors (Cortex-A15) we
need the TLB shootdown even on UP systems, so tlb_fast_mode should
always return 0. Something like below (untested):


diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 37dbce9..8e79689 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -23,6 +23,10 @@

#include <linux/pagemap.h>

+#ifdef CONFIG_CPU_32v7
+#define tlb_fast_mode(tlb) (0)
+#endif
+
#include <asm-generic/tlb.h>

#else /* !CONFIG_MMU */
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 90a725c..9ddf7ee 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -194,6 +194,7 @@ static inline void tlb_flush(struct mmu_gather *tlb)

#endif /* CONFIG_HAVE_MMU_GATHER_RANGE */

+#ifndef tlb_fast_mode
static inline int tlb_fast_mode(struct mmu_gather *tlb)
{
#ifdef CONFIG_SMP
@@ -206,6 +207,7 @@ static inline int tlb_fast_mode(struct mmu_gather *tlb)
return 1;
#endif
}
+#endif

void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm);
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end);


--
Catalin

2012-05-17 09:39:42

by Catalin Marinas

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, May 17, 2012 at 10:30:23AM +0100, Catalin Marinas wrote:
> Another minor thing is that on newer ARM processors (Cortex-A15) we
> need the TLB shootdown even on UP systems, so tlb_fast_mode should
> always return 0. Something like below (untested):
>
>
> diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
> index 37dbce9..8e79689 100644
> --- a/arch/arm/include/asm/tlb.h
> +++ b/arch/arm/include/asm/tlb.h
> @@ -23,6 +23,10 @@
>
> #include <linux/pagemap.h>
>
> +#ifdef CONFIG_CPU_32v7
> +#define tlb_fast_mode (0)
> +#endif
> +
> #include <asm-generic/tlb.h>
>
> #else /* !CONFIG_MMU */

This hunk should have been a few lines down for the CONFIG_MMU case.

--
Catalin

2012-05-17 09:53:12

by Russell King

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, May 17, 2012 at 10:30:23AM +0100, Catalin Marinas wrote:
> Another minor thing is that on newer ARM processors (Cortex-A15) we
> need the TLB shootdown even on UP systems, so tlb_fast_mode should
> always return 0. Something like below (untested):

No Catalin, we need this for virtually all ARMv7 CPUs whether they're UP
or SMP, not just for A15, because of the speculative prefetch which can
re-load TLB entries from the page tables at _any_ time.

--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of:

2012-05-17 11:29:00

by Peter Zijlstra

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, 2012-05-17 at 10:51 +0100, Russell King wrote:
> On Thu, May 17, 2012 at 10:30:23AM +0100, Catalin Marinas wrote:
> > Another minor thing is that on newer ARM processors (Cortex-A15) we
> > need the TLB shootdown even on UP systems, so tlb_fast_mode should
> > always return 0. Something like below (untested):
>
> No Catalin, we need this for virtually all ARMv7 CPUs whether they're UP
> or SMP, not just for A15, because of the speculative prefetch which can
> re-load TLB entries from the page tables at _any_ time.

Hmm, so this is mostly because of the confusion/coupling between
tlb_remove_page() and tlb_remove_table() I guess. Since I don't see the
freeing of the actual pages being a problem with speculative TLB
reloads, just the page-tables.

Should we introduce a tlb_remove_table() regardless of
HAVE_RCU_TABLE_FREE which always queues the tables regardless of
tlb_fast_mode()?
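
Something along these lines, maybe (purely a sketch, written against the
pages[]/nr/max batching fields from the old ARM mmu_gather quoted earlier in
this thread, not against the current generic gather):

static inline void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        struct page *page = table;

        /* always queue page-table pages; never free them before the flush */
        tlb->pages[tlb->nr++] = page;
        if (tlb->nr >= tlb->max)
                tlb_flush_mmu(tlb);
}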

2012-05-17 12:15:21

by Catalin Marinas

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, May 17, 2012 at 12:28:06PM +0100, Peter Zijlstra wrote:
> On Thu, 2012-05-17 at 10:51 +0100, Russell King wrote:
> > On Thu, May 17, 2012 at 10:30:23AM +0100, Catalin Marinas wrote:
> > > Another minor thing is that on newer ARM processors (Cortex-A15) we
> > > need the TLB shootdown even on UP systems, so tlb_fast_mode should
> > > always return 0. Something like below (untested):
> >
> > No Catalin, we need this for virtually all ARMv7 CPUs whether they're UP
> > or SMP, not just for A15, because of the speculative prefetch which can
> > re-load TLB entries from the page tables at _any_ time.
>
> Hmm,. so this is mostly because of the confusion/coupling between
> tlb_remove_page() and tlb_remove_table() I guess. Since I don't see the
> freeing of the actual pages being a problem with speculative TLB
> reloads, just the page-tables.

The TLB on newer ARM cores can cache intermediate entries (e.g. pmd) as
long as they are valid, even if the full translation is not possible
(e.g. because the pte entry is 0). With fast_mode, this could lead to
the MMU reading the already freed pte page as it was pointed at by the
old pmd.

Older ARMv7 CPUs (Cortex-A8) don't do this intermediate caching, and UP
should be fine with fast_mode==1 as we already track the pte range via
tlb_remove_tlb_entry(). The MMU on ARM is treated like any other agent
that accesses the memory, so standard memory ordering issues apply. In
theory Linux can clear the pmd, free the page and it is re-used shortly
after while the MMU hasn't observed the pmd_clear() yet (we don't have a
barrier in this function).

> Should we introduce a tlb_remove_table() regardless of
> HAVE_RCU_TABLE_FREE which always queues the tables regardless of
> tlb_fast_mode()?

This would probably work as well (or we just add support for
HAVE_RCU_TABLE_FREE on ARM).

--
Catalin

2012-05-17 16:00:36

by Catalin Marinas

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, May 17, 2012 at 12:28:06PM +0100, Peter Zijlstra wrote:
> On Thu, 2012-05-17 at 10:51 +0100, Russell King wrote:
> > On Thu, May 17, 2012 at 10:30:23AM +0100, Catalin Marinas wrote:
> > > Another minor thing is that on newer ARM processors (Cortex-A15) we
> > > need the TLB shootdown even on UP systems, so tlb_fast_mode should
> > > always return 0. Something like below (untested):
> >
> > No Catalin, we need this for virtually all ARMv7 CPUs whether they're UP
> > or SMP, not just for A15, because of the speculative prefetch which can
> > re-load TLB entries from the page tables at _any_ time.
>
> Hmm,. so this is mostly because of the confusion/coupling between
> tlb_remove_page() and tlb_remove_table() I guess. Since I don't see the
> freeing of the actual pages being a problem with speculative TLB
> reloads, just the page-tables.
>
> Should we introduce a tlb_remove_table() regardless of
> HAVE_RCU_TABLE_FREE which always queues the tables regardless of
> tlb_fast_mode()?

BTW, looking at your tlb-unify branch, does tlb_remove_table() call
tlb_flush/tlb_flush_mmu before freeing the tables? I can only see
tlb_remove_page() doing this. On ARM, even UP, we need the TLB flushing
after clearing the pmd and before freeing the pte page table (and
ideally doing it less often than at every pte_free_tlb() call).

--
Catalin

2012-05-17 16:25:31

by Peter Zijlstra

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, 2012-05-17 at 17:00 +0100, Catalin Marinas wrote:

> BTW, looking at your tlb-unify branch, does tlb_remove_table() call
> tlb_flush/tlb_flush_mmu before freeing the tables? I can only see
> tlb_remove_page() doing this. On ARM, even UP, we need the TLB flushing
> after clearing the pmd and before freeing the pte page table (and
> ideally doing it less often than at every pte_free_tlb() call).

No, I don't think it does; so far the only archs using the RCU stuff are
ppc, sparc and s390, and none of those needed that (Xen might join them
soon though). But I will have to look and consider this more carefully.
I've 'lost' most of the ppc/sparc/s390 details from memory, so I can't say
this with any certainty.

2012-05-17 16:33:56

by Peter Zijlstra

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, 2012-05-17 at 18:24 +0200, Peter Zijlstra wrote:
> On Thu, 2012-05-17 at 17:00 +0100, Catalin Marinas wrote:
>
> > BTW, looking at your tlb-unify branch, does tlb_remove_table() call
> > tlb_flush/tlb_flush_mmu before freeing the tables? I can only see
> > tlb_remove_page() doing this. On ARM, even UP, we need the TLB flushing
> > after clearing the pmd and before freeing the pte page table (and
> > ideally doing it less often than at every pte_free_tlb() call).
>
> No I don't think it does, so far the only archs using the RCU stuff are
> ppc,sparc and s390 and none of those needed that (Xen might join them
> soon though). But I will have to look and consider this more carefully.
> I 'lost' most of the ppc/sparc/s390 details from memory to say this with
> any certainty.


Hmm, no, thinking about it more, that does indeed sound strange. I'll still
have to consider it more carefully, but I think you might have found a bug
there.

2012-05-17 16:44:47

by Peter Zijlstra

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, 2012-05-17 at 18:33 +0200, Peter Zijlstra wrote:
> On Thu, 2012-05-17 at 18:24 +0200, Peter Zijlstra wrote:
> > On Thu, 2012-05-17 at 17:00 +0100, Catalin Marinas wrote:
> >
> > > BTW, looking at your tlb-unify branch, does tlb_remove_table() call
> > > tlb_flush/tlb_flush_mmu before freeing the tables? I can only see
> > > tlb_remove_page() doing this. On ARM, even UP, we need the TLB flushing
> > > after clearing the pmd and before freeing the pte page table (and
> > > ideally doing it less often than at every pte_free_tlb() call).
> >
> > No I don't think it does, so far the only archs using the RCU stuff are
> > ppc,sparc and s390 and none of those needed that (Xen might join them
> > soon though). But I will have to look and consider this more carefully.
> > I 'lost' most of the ppc/sparc/s390 details from memory to say this with
> > any certainty.
>
>
> Hmm, no, thinking more that does indeed sounds strange, I'll still have
> to consider it more carefully, but I think you might have found a bug
> there.

So the RCU code came from ppc in commit
267239116987d64850ad2037d8e0f3071dc3b5ce, which has similar behaviour.
Also I suspect the mm_users < 2 test will be incorrect for ARM since
even the one user can be concurrent with your speculation engine.
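
For reference, the shortcut in question looks roughly like this (paraphrased
from memory of the generic RCU table-freeing code, so details may be off):

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        tlb->need_flush = 1;

        /*
         * When there's less than two users of this mm there cannot be a
         * concurrent page-table walk -- an assumption a speculative
         * hardware walker breaks.
         */
        if (atomic_read(&tlb->mm->mm_users) < 2) {
                __tlb_remove_table(table);      /* freed with no TLB flush */
                return;
        }

        /* ... otherwise the table is queued for an RCU-deferred free ... */
}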

2012-05-17 16:59:51

by Peter Zijlstra

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, 2012-05-17 at 18:44 +0200, Peter Zijlstra wrote:
>
> So the RCU code can from ppc in commit
> 267239116987d64850ad2037d8e0f3071dc3b5ce, which has similar behaviour.
> Also I suspect the mm_users < 2 test will be incorrect for ARM since
> even the one user can be concurrent with your speculation engine.
>
>
Right, last mail, I promise, I've confused myself enough already! :-)

OK, so ppc/sparc are special (forgot all about s390). I think by the time
they are done with unmap_page_range() their hardware hash-tables are
empty and nobody but software page-table walkers will still access the
linux page tables.

So when we do free_pgtables() to clean up the actual page-tables,
Power/Sparc need to RCU-free them to allow concurrent software
page-table walkers like gup_fast.

Thus I don't think they need to tlb flush again because their hardware
doesn't actually walk the linux page-tables; it walks hash-tables, which
by this time are empty.

Now if x86/Xen were to use this, it would indeed also need to TLB flush
when freeing the page-tables, since its hardware walkers do indeed
traverse these pages and we need to sync against them.

So my first patch in the tlb-unify tree is actually buggy.

Hmm, what to do? Adding a tlb flush in there might slow down ppc/sparc
unnecessarily... dave/ben? I guess we need more knobs :-(


Now its quite possible I've utterly confused myself and everybody
reading, apologies for that, I shall rest and purge all from memory and
start over before commenting more..

2012-05-17 17:01:56

by Catalin Marinas

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, May 17, 2012 at 05:44:13PM +0100, Peter Zijlstra wrote:
> On Thu, 2012-05-17 at 18:33 +0200, Peter Zijlstra wrote:
> > On Thu, 2012-05-17 at 18:24 +0200, Peter Zijlstra wrote:
> > > On Thu, 2012-05-17 at 17:00 +0100, Catalin Marinas wrote:
> > > > BTW, looking at your tlb-unify branch, does tlb_remove_table() call
> > > > tlb_flush/tlb_flush_mmu before freeing the tables? I can only see
> > > > tlb_remove_page() doing this. On ARM, even UP, we need the TLB flushing
> > > > after clearing the pmd and before freeing the pte page table (and
> > > > ideally doing it less often than at every pte_free_tlb() call).
> > >
> > > No I don't think it does, so far the only archs using the RCU stuff are
> > > ppc,sparc and s390 and none of those needed that (Xen might join them
> > > soon though). But I will have to look and consider this more carefully.
> > > I 'lost' most of the ppc/sparc/s390 details from memory to say this with
> > > any certainty.
> >
> > Hmm, no, thinking more that does indeed sounds strange, I'll still have
> > to consider it more carefully, but I think you might have found a bug
> > there.
>
> So the RCU code can from ppc in commit
> 267239116987d64850ad2037d8e0f3071dc3b5ce, which has similar behaviour.
> Also I suspect the mm_users < 2 test will be incorrect for ARM since
> even the one user can be concurrent with your speculation engine.

That's correct.

--
Catalin

2012-05-17 17:12:14

by Peter Zijlstra

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, 2012-05-17 at 18:01 +0100, Catalin Marinas wrote:
> > So the RCU code can from ppc in commit
> > 267239116987d64850ad2037d8e0f3071dc3b5ce, which has similar behaviour.
> > Also I suspect the mm_users < 2 test will be incorrect for ARM since
> > even the one user can be concurrent with your speculation engine.
>
> That's correct.

(I'm not sending this... really :-)

---
commit cd94154cc6a28dd9dc271042c1a59c08d26da886
Author: Martin Schwidefsky <[email protected]>
Date: Wed Apr 11 14:28:07 2012 +0200

[S390] fix tlb flushing for page table pages

Git commit 36409f6353fc2d7b6516e631415f938eadd92ffa "use generic RCU
page-table freeing code" introduced a tlb flushing bug. Partially revert
the above git commit and go back to s390 specific page table flush code.

For s390 the TLB can contain three types of entries, "normal" TLB
page-table entries, TLB combined region-and-segment-table (CRST) entries
and real-space entries. Linux does not use real-space entries which
leaves normal TLB entries and CRST entries. The CRST entries are
intermediate steps in the page-table translation called translation paths.
For example a 4K page access in a three-level page table setup will
create two CRST TLB entries and one page-table TLB entry. The advantage
of that approach is that a page access next to the previous one can reuse
the CRST entries and needs just a single read from memory to create the
page-table TLB entry. The disadvantage is that the TLB flushing rules are
more complicated, before any page-table may be freed the TLB needs to be
flushed.

In short: the generic RCU page-table freeing code is incorrect for the
CRST entries, in particular the check for mm_users < 2 is troublesome.

This is applicable to 3.0+ kernels.

Cc: <[email protected]>
Signed-off-by: Martin Schwidefsky <[email protected]>

2012-05-17 17:23:46

by Russell King

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, May 17, 2012 at 05:00:12PM +0100, Catalin Marinas wrote:
> On Thu, May 17, 2012 at 12:28:06PM +0100, Peter Zijlstra wrote:
> > On Thu, 2012-05-17 at 10:51 +0100, Russell King wrote:
> > > On Thu, May 17, 2012 at 10:30:23AM +0100, Catalin Marinas wrote:
> > > > Another minor thing is that on newer ARM processors (Cortex-A15) we
> > > > need the TLB shootdown even on UP systems, so tlb_fast_mode should
> > > > always return 0. Something like below (untested):
> > >
> > > No Catalin, we need this for virtually all ARMv7 CPUs whether they're UP
> > > or SMP, not just for A15, because of the speculative prefetch which can
> > > re-load TLB entries from the page tables at _any_ time.
> >
> > Hmm,. so this is mostly because of the confusion/coupling between
> > tlb_remove_page() and tlb_remove_table() I guess. Since I don't see the
> > freeing of the actual pages being a problem with speculative TLB
> > reloads, just the page-tables.
> >
> > Should we introduce a tlb_remove_table() regardless of
> > HAVE_RCU_TABLE_FREE which always queues the tables regardless of
> > tlb_fast_mode()?
>
> BTW, looking at your tlb-unify branch, does tlb_remove_table() call
> tlb_flush/tlb_flush_mmu before freeing the tables? I can only see
> tlb_remove_page() doing this. On ARM, even UP, we need the TLB flushing
> after clearing the pmd and before freeing the pte page table (and
> ideally doing it less often than at every pte_free_tlb() call).

Catalin,

The way TLB shootdown stuff works is that _every_ single bit of memory
which gets freed, whether it's a page or a page table, gets added to a
list of pages to be freed.

So, the sequence is:
- remove pte/pmd/pud/pgd pointers
- add pages, whether they be pages pointed to by pte entries or page tables
to be freed to a list
- when list is sufficiently full, invalidate TLBs
- free list of pages

That means the pages will not be freed, whether it be a page mapped
into userspace or a page table until such time that the TLB has been
invalidated.

For page tables, this is done via pXX_free_tlb(), which then calls out
to the arch specific __pXX_free_tlb(), which ultimately then hands the
page table over to tlb_remove_page() to add to the list of to-be-freed
pages.
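
In code terms (a sketch only, using the function names above and the batching
structure of the old ARM implementation removed by this patch):

        pmd_clear(pmd);                         /* 1. remove the pointer   */
        pte_free_tlb(tlb, token, addr);         /* 2. queue the pte page   */
        /* ... later, when the batch fills up or at tlb_finish_mmu(): */
        tlb_flush(tlb);                         /* 3. invalidate TLBs      */
        free_pages_and_swap_cache(tlb->pages, tlb->nr); /* 4. only now free */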

--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of:

2012-05-17 18:32:04

by Catalin Marinas

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, May 17, 2012 at 06:22:15PM +0100, Russell King wrote:
> On Thu, May 17, 2012 at 05:00:12PM +0100, Catalin Marinas wrote:
> > On Thu, May 17, 2012 at 12:28:06PM +0100, Peter Zijlstra wrote:
> > > On Thu, 2012-05-17 at 10:51 +0100, Russell King wrote:
> > > > On Thu, May 17, 2012 at 10:30:23AM +0100, Catalin Marinas wrote:
> > > > > Another minor thing is that on newer ARM processors (Cortex-A15) we
> > > > > need the TLB shootdown even on UP systems, so tlb_fast_mode should
> > > > > always return 0. Something like below (untested):
> > > >
> > > > No Catalin, we need this for virtually all ARMv7 CPUs whether they're UP
> > > > or SMP, not just for A15, because of the speculative prefetch which can
> > > > re-load TLB entries from the page tables at _any_ time.
> > >
> > > Hmm,. so this is mostly because of the confusion/coupling between
> > > tlb_remove_page() and tlb_remove_table() I guess. Since I don't see the
> > > freeing of the actual pages being a problem with speculative TLB
> > > reloads, just the page-tables.
> > >
> > > Should we introduce a tlb_remove_table() regardless of
> > > HAVE_RCU_TABLE_FREE which always queues the tables regardless of
> > > tlb_fast_mode()?
> >
> > BTW, looking at your tlb-unify branch, does tlb_remove_table() call
> > tlb_flush/tlb_flush_mmu before freeing the tables? I can only see
> > tlb_remove_page() doing this. On ARM, even UP, we need the TLB flushing
> > after clearing the pmd and before freeing the pte page table (and
> > ideally doing it less often than at every pte_free_tlb() call).
>
> Catalin,
>
> The way TLB shootdown stuff works is that _every_ single bit of memory
> which gets freed, whether its a page or a page table, gets added to a
> list of pages to be freed.
>
> So, the sequence is:
> - remove pte/pmd/pud/pgd pointers
> - add pages, whether they be pages pointed to by pte entries or page tables
> to be freed to a list
> - when list is sufficiently full, invalidate TLBs
> - free list of pages
>
> That means the pages will not be freed, whether it be a page mapped
> into userspace or a page table until such time that the TLB has been
> invalidated.
>
> For page tables, this is done via pXX_free_tlb(), which then calls out
> to the arch specific __pXX_free_tlb(), which ultimately then hands the
> page table over to tlb_remove_page() to add to the list of to-be-freed
> pages.

I know that already, not sure why you explained it again (but it's good
for future reference).

My point was that if we move to HAVE_RCU_TABLE_FREE, the other
architectures doing this are calling tlb_remove_table() instead of
tlb_remove_page() in __p??_free_tlb(). And tlb_remove_table() does not
do any TLB maintenance when it can no longer queue pages (batch table
overflow).
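
A sketch of what an ARM-safe variant might need (hypothetical, paraphrasing
the generic tlb_table_flush() from memory): shoot the TLB down before the
queued tables are handed off to RCU:

static void tlb_table_flush(struct mmu_gather *tlb)
{
        struct mmu_table_batch **batch = &tlb->batch;

        if (*batch) {
                tlb_flush(tlb); /* ARM: invalidate before the tables can be reused */
                call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
                *batch = NULL;
        }
}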

--
Catalin

2012-05-21 07:47:50

by Martin Schwidefsky

Subject: Re: [RFC][PATCH 4/6] arm, mm: Convert arm to generic tlb

On Thu, 17 May 2012 18:24:44 +0200
Peter Zijlstra <[email protected]> wrote:

> On Thu, 2012-05-17 at 17:00 +0100, Catalin Marinas wrote:
>
> > BTW, looking at your tlb-unify branch, does tlb_remove_table() call
> > tlb_flush/tlb_flush_mmu before freeing the tables? I can only see
> > tlb_remove_page() doing this. On ARM, even UP, we need the TLB flushing
> > after clearing the pmd and before freeing the pte page table (and
> > ideally doing it less often than at every pte_free_tlb() call).
>
> No I don't think it does, so far the only archs using the RCU stuff are
> ppc,sparc and s390 and none of those needed that (Xen might join them
> soon though). But I will have to look and consider this more carefully.
> I 'lost' most of the ppc/sparc/s390 details from memory to say this with
> any certainty.

s390 needs a TLB flush for the pgd, pud and pmd tables. See git commit
cd94154cc6a28dd9dc271042c1a59c08d26da886 for the sad details.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.