LinuxLists.cc - [RFC][PATCH 0/2] convert s390 to generic mmu

2018-09-18 12:53:09

Subject: [RFC][PATCH 0/2] convert s390 to generic mmu_gather

Hi Peter,

as an add-on for the TLB flushing changes two patches to make
s390 use the generic mmu_gather code as well. I let it run for
a few hours with several TLB stress tests, no fallout so far.
It certainly can use more testing but it looks ok to me.

Martin Schwidefsky (2):
asm-generic/tlb: introduce HAVE_MMU_GATHER_NO_GATHER
s390/tlb: convert to generic mmu_gather

arch/Kconfig | 3 +
arch/s390/Kconfig | 3 +
arch/s390/include/asm/tlb.h | 130 ++++++++++++++------------------------------
arch/s390/mm/pgalloc.c | 63 +--------------------
include/asm-generic/tlb.h | 9 ++-
mm/mmu_gather.c | 114 ++++++++++++++++++++++----------------
6 files changed, 121 insertions(+), 201 deletions(-)

--
2.16.4

2018-09-18 12:52:49

by Martin Schwidefsky

[permalink] [raw]

Subject: [PATCH 1/2] asm-generic/tlb: introduce HAVE_MMU_GATHER_NO_GATHER

Add the Kconfig option HAVE_MMU_GATHER_NO_GATHER to the generic
mmu_gather code. If the option is set the mmu_gather will not
track individual pages for delayed page free anymore. A platform
that enables the option needs to provide its own implementation
of the __tlb_remove_page_size function to free pages.

Signed-off-by: Martin Schwidefsky <[email protected]>
---
arch/Kconfig | 3 ++
include/asm-generic/tlb.h | 9 +++-
mm/mmu_gather.c | 114 +++++++++++++++++++++++++++-------------------
3 files changed, 77 insertions(+), 49 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 053c44703539..edb338b66f16 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -368,6 +368,9 @@ config HAVE_RCU_TABLE_INVALIDATE
config HAVE_MMU_GATHER_PAGE_SIZE
bool

+config HAVE_MMU_GATHER_NO_GATHER
+ bool
+
config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 21c751cd751e..9e714e1b6695 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -179,6 +179,7 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);

#endif

+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
/*
* If we can't allocate a page to make a big batch of page pointers
* to work on, then just handle a few from the on-stack structure.
@@ -203,6 +204,10 @@ struct mmu_gather_batch {
*/
#define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH)

+extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
+ int page_size);
+#endif
+
/*
* struct mmu_gather is an opaque type used by the mm code for passing around
* any data needed by arch specific code for tlb_remove_page.
@@ -249,6 +254,7 @@ struct mmu_gather {

unsigned int batch_count;

+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
struct mmu_gather_batch *active;
struct mmu_gather_batch local;
struct page *__pages[MMU_GATHER_BUNDLE];
@@ -256,6 +262,7 @@ struct mmu_gather {
#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
unsigned int page_size;
#endif
+#endif
};

void arch_tlb_gather_mmu(struct mmu_gather *tlb,
@@ -264,8 +271,6 @@ void tlb_flush_mmu(struct mmu_gather *tlb);
void arch_tlb_finish_mmu(struct mmu_gather *tlb,
unsigned long start, unsigned long end, bool force);
void tlb_flush_mmu_free(struct mmu_gather *tlb);
-extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
- int page_size);

static inline void __tlb_adjust_range(struct mmu_gather *tlb,
unsigned long address,
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 2d5e617131f6..6d5d812a040b 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -13,6 +13,8 @@

#ifdef HAVE_GENERIC_MMU_GATHER

+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+
static bool tlb_next_batch(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
@@ -41,6 +43,63 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
return true;
}

+static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+ for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
+ free_pages_and_swap_cache(batch->pages, batch->nr);
+ batch->nr = 0;
+ }
+ tlb->active = &tlb->local;
+}
+
+static void tlb_batch_list_free(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch, *next;
+
+ for (batch = tlb->local.next; batch; batch = next) {
+ next = batch->next;
+ free_pages((unsigned long)batch, 0);
+ }
+ tlb->local.next = NULL;
+}
+
+/* __tlb_remove_page
+ * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
+ * handling the additional races in SMP caused by other CPUs caching valid
+ * mappings in their TLBs. Returns the number of free page slots left.
+ * When out of page slots we must call tlb_flush_mmu().
+ *returns true if the caller should flush.
+ */
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
+{
+ struct mmu_gather_batch *batch;
+
+ VM_BUG_ON(!tlb->end);
+
+#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+ VM_WARN_ON(tlb->page_size != page_size);
+#endif
+
+ batch = tlb->active;
+ /*
+ * Add the page and check if we are full. If so
+ * force a flush.
+ */
+ batch->pages[batch->nr++] = page;
+ if (batch->nr == batch->max) {
+ if (!tlb_next_batch(tlb))
+ return true;
+ batch = tlb->active;
+ }
+ VM_BUG_ON_PAGE(batch->nr > batch->max, page);
+
+ return false;
+}
+
+#endif
+
void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
unsigned long start, unsigned long end)
{
@@ -48,12 +107,15 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,

/* Is it from 0 to ~0? */
tlb->fullmm = !(start | (end+1));
+
+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
tlb->need_flush_all = 0;
tlb->local.next = NULL;
tlb->local.nr = 0;
tlb->local.max = ARRAY_SIZE(tlb->__pages);
tlb->active = &tlb->local;
tlb->batch_count = 0;
+#endif

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb->batch = NULL;
@@ -67,16 +129,12 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,

void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
- struct mmu_gather_batch *batch;
-
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb_table_flush(tlb);
#endif
- for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
- free_pages_and_swap_cache(batch->pages, batch->nr);
- batch->nr = 0;
- }
- tlb->active = &tlb->local;
+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+ tlb_batch_pages_flush(tlb);
+#endif
}

void tlb_flush_mmu(struct mmu_gather *tlb)
@@ -92,8 +150,6 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
void arch_tlb_finish_mmu(struct mmu_gather *tlb,
unsigned long start, unsigned long end, bool force)
{
- struct mmu_gather_batch *batch, *next;
-
if (force) {
__tlb_reset_range(tlb);
__tlb_adjust_range(tlb, start, end - start);
@@ -103,45 +159,9 @@ void arch_tlb_finish_mmu(struct mmu_gather *tlb,

/* keep the page table cache within bounds */
check_pgt_cache();
-
- for (batch = tlb->local.next; batch; batch = next) {
- next = batch->next;
- free_pages((unsigned long)batch, 0);
- }
- tlb->local.next = NULL;
-}
-
-/* __tlb_remove_page
- * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
- * handling the additional races in SMP caused by other CPUs caching valid
- * mappings in their TLBs. Returns the number of free page slots left.
- * When out of page slots we must call tlb_flush_mmu().
- *returns true if the caller should flush.
- */
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
-{
- struct mmu_gather_batch *batch;
-
- VM_BUG_ON(!tlb->end);
-
-#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
- VM_WARN_ON(tlb->page_size != page_size);
+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+ tlb_batch_list_free(tlb);
#endif
-
- batch = tlb->active;
- /*
- * Add the page and check if we are full. If so
- * force a flush.
- */
- batch->pages[batch->nr++] = page;
- if (batch->nr == batch->max) {
- if (!tlb_next_batch(tlb))
- return true;
- batch = tlb->active;
- }
- VM_BUG_ON_PAGE(batch->nr > batch->max, page);
-
- return false;
}

#endif /* HAVE_GENERIC_MMU_GATHER */
--
2.16.4

2018-09-18 12:54:46

by Martin Schwidefsky

[permalink] [raw]

Subject: [PATCH 2/2] s390/tlb: convert to generic mmu_gather

Signed-off-by: Martin Schwidefsky <[email protected]>
---
arch/s390/Kconfig | 3 +
arch/s390/include/asm/tlb.h | 130 ++++++++++++++------------------------------
arch/s390/mm/pgalloc.c | 63 +--------------------
3 files changed, 44 insertions(+), 152 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 9a9c7a6fe925..b864462ac4a6 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -157,10 +157,13 @@ config S390
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MEMBLOCK_PHYS_MAP
+ select HAVE_MMU_GATHER_NO_GATHER
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NOP_MCOUNT
select HAVE_OPROFILE
select HAVE_PERF_EVENTS
+ select HAVE_RCU_TABLE_FREE
+ select HAVE_RCU_TABLE_INVALIDATE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RSEQ
select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index cf3d64313740..cb99f8310c3a 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -22,98 +22,39 @@
* Pages used for the page tables is a different story. FIXME: more
*/

-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-#include <asm/processor.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-
-struct mmu_gather {
- struct mm_struct *mm;
- struct mmu_table_batch *batch;
- unsigned int fullmm;
- unsigned long start, end;
-};
-
-struct mmu_table_batch {
- struct rcu_head rcu;
- unsigned int nr;
- void *tables[0];
-};
-
-#define MAX_TABLE_BATCH \
- ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *))
-
-extern void tlb_table_flush(struct mmu_gather *tlb);
-extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
-
-static inline void
-arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- tlb->mm = mm;
- tlb->start = start;
- tlb->end = end;
- tlb->fullmm = !(start | (end+1));
- tlb->batch = NULL;
-}
-
-static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
-{
- __tlb_flush_mm_lazy(tlb->mm);
-}
-
-static inline void tlb_flush_mmu_free(struct mmu_gather *tlb)
-{
- tlb_table_flush(tlb);
-}
-
+void __tlb_remove_table(void *_table);
+static inline void tlb_flush(struct mmu_gather *tlb);
+static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
+ struct page *page, int page_size);

-static inline void tlb_flush_mmu(struct mmu_gather *tlb)
-{
- tlb_flush_mmu_tlbonly(tlb);
- tlb_flush_mmu_free(tlb);
-}
+#define tlb_start_vma(tlb, vma) do { } while (0)
+#define tlb_end_vma(tlb, vma) do { } while (0)

-static inline void
-arch_tlb_finish_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end, bool force)
-{
- if (force) {
- tlb->start = start;
- tlb->end = end;
- }
+#define tlb_flush tlb_flush
+#define pte_free_tlb pte_free_tlb
+#define pmd_free_tlb pmd_free_tlb
+#define p4d_free_tlb p4d_free_tlb
+#define pud_free_tlb pud_free_tlb

- tlb_flush_mmu(tlb);
-}
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm-generic/tlb.h>

/*
* Release the page cache reference for a pte removed by
* tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page
* has already been freed, so just do free_page_and_swap_cache.
*/
-static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
-{
- free_page_and_swap_cache(page);
- return false; /* avoid calling tlb_flush_mmu */
-}
-
-static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
-{
- free_page_and_swap_cache(page);
-}
-
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
struct page *page, int page_size)
{
- return __tlb_remove_page(tlb, page);
+ free_page_and_swap_cache(page);
+ return false;
}

-static inline void tlb_remove_page_size(struct mmu_gather *tlb,
- struct page *page, int page_size)
+static inline void tlb_flush(struct mmu_gather *tlb)
{
- return tlb_remove_page(tlb, page);
+ __tlb_flush_mm_lazy(tlb->mm);
}

/*
@@ -121,9 +62,18 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb,
* page table from the tlb.
*/
static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
- unsigned long address)
+ unsigned long address)
{
- page_table_free_rcu(tlb, (unsigned long *) pte, address);
+ __tlb_adjust_range(tlb, address, PAGE_SIZE);
+ tlb->mm->context.flush_mm = 1;
+ tlb->freed_tables = 1;
+ tlb->cleared_ptes = 1;
+ /*
+ * page_table_free_rcu takes care of the allocation bit masks
+ * of the 2K table fragments in the 4K page table page,
+ * then calls tlb_remove_table.
+ */
+ page_table_free_rcu(tlb, (unsigned long *) pte, address);
}

/*
@@ -139,6 +89,10 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
if (tlb->mm->context.asce_limit <= _REGION3_SIZE)
return;
pgtable_pmd_page_dtor(virt_to_page(pmd));
+ __tlb_adjust_range(tlb, address, PAGE_SIZE);
+ tlb->mm->context.flush_mm = 1;
+ tlb->freed_tables = 1;
+ tlb->cleared_puds = 1;
tlb_remove_table(tlb, pmd);
}

@@ -154,6 +108,10 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
{
if (tlb->mm->context.asce_limit <= _REGION1_SIZE)
return;
+ __tlb_adjust_range(tlb, address, PAGE_SIZE);
+ tlb->mm->context.flush_mm = 1;
+ tlb->freed_tables = 1;
+ tlb->cleared_p4ds = 1;
tlb_remove_table(tlb, p4d);
}

@@ -169,19 +127,11 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
{
if (tlb->mm->context.asce_limit <= _REGION2_SIZE)
return;
+ tlb->mm->context.flush_mm = 1;
+ tlb->freed_tables = 1;
+ tlb->cleared_puds = 1;
tlb_remove_table(tlb, pud);
}

-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
-#define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0)
-#define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0)
-#define tlb_migrate_finish(mm) do { } while (0)
-#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
- tlb_remove_tlb_entry(tlb, ptep, address)
-
-static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size)
-{
-}

#endif /* _S390_TLB_H */
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 76d89ee8b428..f7656a0b3a1a 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -288,7 +288,7 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
tlb_remove_table(tlb, table);
}

-static void __tlb_remove_table(void *_table)
+void __tlb_remove_table(void *_table)
{
unsigned int mask = (unsigned long) _table & 3;
void *table = (void *)((unsigned long) _table ^ mask);
@@ -314,67 +314,6 @@ static void __tlb_remove_table(void *_table)
}
}

-static void tlb_remove_table_smp_sync(void *arg)
-{
- /* Simply deliver the interrupt */
-}
-
-static void tlb_remove_table_one(void *table)
-{
- /*
- * This isn't an RCU grace period and hence the page-tables cannot be
- * assumed to be actually RCU-freed.
- *
- * It is however sufficient for software page-table walkers that rely
- * on IRQ disabling. See the comment near struct mmu_table_batch.
- */
- smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
- __tlb_remove_table(table);
-}
-
-static void tlb_remove_table_rcu(struct rcu_head *head)
-{
- struct mmu_table_batch *batch;
- int i;
-
- batch = container_of(head, struct mmu_table_batch, rcu);
-
- for (i = 0; i < batch->nr; i++)
- __tlb_remove_table(batch->tables[i]);
-
- free_page((unsigned long)batch);
-}
-
-void tlb_table_flush(struct mmu_gather *tlb)
-{
- struct mmu_table_batch **batch = &tlb->batch;
-
- if (*batch) {
- call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
- *batch = NULL;
- }
-}
-
-void tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- struct mmu_table_batch **batch = &tlb->batch;
-
- tlb->mm->context.flush_mm = 1;
- if (*batch == NULL) {
- *batch = (struct mmu_table_batch *)
- __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
- if (*batch == NULL) {
- __tlb_flush_mm_lazy(tlb->mm);
- tlb_remove_table_one(table);
- return;
- }
- (*batch)->nr = 0;
- }
- (*batch)->tables[(*batch)->nr++] = table;
- if ((*batch)->nr == MAX_TABLE_BATCH)
- tlb_flush_mmu(tlb);
-}
-
/*
* Base infrastructure required to generate basic asces, region, segment,
* and page tables that do not make use of enhanced features like EDAT1.
--
2.16.4

2018-09-19 12:40:20

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH 2/2] s390/tlb: convert to generic mmu_gather

On Tue, Sep 18, 2018 at 02:51:51PM +0200, Martin Schwidefsky wrote:
> +#define pte_free_tlb pte_free_tlb
> +#define pmd_free_tlb pmd_free_tlb
> +#define p4d_free_tlb p4d_free_tlb
> +#define pud_free_tlb pud_free_tlb

> @@ -121,9 +62,18 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb,
> * page table from the tlb.
> */
> static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
> + unsigned long address)
> {
> + __tlb_adjust_range(tlb, address, PAGE_SIZE);
> + tlb->mm->context.flush_mm = 1;
> + tlb->freed_tables = 1;
> + tlb->cleared_ptes = 1;
> + /*
> + * page_table_free_rcu takes care of the allocation bit masks
> + * of the 2K table fragments in the 4K page table page,
> + * then calls tlb_remove_table.
> + */
> + page_table_free_rcu(tlb, (unsigned long *) pte, address);

(whitespace damage, fixed)

Also, could you perhaps explain the need for that
page_table_alloc/page_table_free code? That is, I get the comment about
using 2K page-table fragments out of 4k physical page, but why this
custom allocator instead of kmem_cache? It feels like there's a little
extra complication, but it's not immediately obvious what.

> }

We _could_ use __pte_free_tlb() here I suppose, but...

> /*
> @@ -139,6 +89,10 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
> if (tlb->mm->context.asce_limit <= _REGION3_SIZE)
> return;
> pgtable_pmd_page_dtor(virt_to_page(pmd));
> + __tlb_adjust_range(tlb, address, PAGE_SIZE);
> + tlb->mm->context.flush_mm = 1;
> + tlb->freed_tables = 1;
> + tlb->cleared_puds = 1;
> tlb_remove_table(tlb, pmd);
> }
>
> @@ -154,6 +108,10 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
> {
> if (tlb->mm->context.asce_limit <= _REGION1_SIZE)
> return;
> + __tlb_adjust_range(tlb, address, PAGE_SIZE);
> + tlb->mm->context.flush_mm = 1;
> + tlb->freed_tables = 1;
> + tlb->cleared_p4ds = 1;
> tlb_remove_table(tlb, p4d);
> }
>
> @@ -169,19 +127,11 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
> {
> if (tlb->mm->context.asce_limit <= _REGION2_SIZE)
> return;
> + tlb->mm->context.flush_mm = 1;
> + tlb->freed_tables = 1;
> + tlb->cleared_puds = 1;
> tlb_remove_table(tlb, pud);
> }

It's that ASCE limit that makes it impossible to use the generic
helpers, right?

2018-09-19 14:28:46

by Martin Schwidefsky

[permalink] [raw]

Subject: Re: [PATCH 2/2] s390/tlb: convert to generic mmu_gather

On Wed, 19 Sep 2018 14:38:49 +0200
Peter Zijlstra <[email protected]> wrote:

> On Tue, Sep 18, 2018 at 02:51:51PM +0200, Martin Schwidefsky wrote:
> > +#define pte_free_tlb pte_free_tlb
> > +#define pmd_free_tlb pmd_free_tlb
> > +#define p4d_free_tlb p4d_free_tlb
> > +#define pud_free_tlb pud_free_tlb
>
> > @@ -121,9 +62,18 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb,
> > * page table from the tlb.
> > */
> > static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
> > + unsigned long address)
> > {
> > + __tlb_adjust_range(tlb, address, PAGE_SIZE);
> > + tlb->mm->context.flush_mm = 1;
> > + tlb->freed_tables = 1;
> > + tlb->cleared_ptes = 1;
> > + /*
> > + * page_table_free_rcu takes care of the allocation bit masks
> > + * of the 2K table fragments in the 4K page table page,
> > + * then calls tlb_remove_table.
> > + */
> > + page_table_free_rcu(tlb, (unsigned long *) pte, address);
>
> (whitespace damage, fixed)
>
> Also, could you perhaps explain the need for that
> page_table_alloc/page_table_free code? That is, I get the comment about
> using 2K page-table fragments out of 4k physical page, but why this
> custom allocator instead of kmem_cache? It feels like there's a little
> extra complication, but it's not immediately obvious what.

The kmem_cache code uses the fields of struct page for its tracking.
pgtable_page_ctor uses the same fields, e.g. for the ptl. Last time
I tried to convert the page_table_alloc/page_table_free to kmem_cache
it just crashed. Plus the split of 4K pages into 2 2K fragments is
done on a per mm basis, that should help a little bit with fragmentation.

> > }
>
> We _could_ use __pte_free_tlb() here I suppose, but...
>
> > /*
> > @@ -139,6 +89,10 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
> > if (tlb->mm->context.asce_limit <= _REGION3_SIZE)
> > return;
> > pgtable_pmd_page_dtor(virt_to_page(pmd));
> > + __tlb_adjust_range(tlb, address, PAGE_SIZE);
> > + tlb->mm->context.flush_mm = 1;
> > + tlb->freed_tables = 1;
> > + tlb->cleared_puds = 1;
> > tlb_remove_table(tlb, pmd);
> > }
> >
> > @@ -154,6 +108,10 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
> > {
> > if (tlb->mm->context.asce_limit <= _REGION1_SIZE)
> > return;
> > + __tlb_adjust_range(tlb, address, PAGE_SIZE);
> > + tlb->mm->context.flush_mm = 1;
> > + tlb->freed_tables = 1;
> > + tlb->cleared_p4ds = 1;
> > tlb_remove_table(tlb, p4d);
> > }
> >
> > @@ -169,19 +127,11 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
> > {
> > if (tlb->mm->context.asce_limit <= _REGION2_SIZE)
> > return;
> > + tlb->mm->context.flush_mm = 1;
> > + tlb->freed_tables = 1;
> > + tlb->cleared_puds = 1;
> > tlb_remove_table(tlb, pud);
> > }
>
> It's that ASCE limit that makes it impossible to use the generic
> helpers, right?

There are two problems, one of them is related to the ASCE limit:

1) s390 supports 4 different page table layouts. 2-levels (2^31 bytes) for 31-bit compat,
3-levels (2^42 bytes) as the default for 64-bit, 4-levels (2^53) if 4 tera-bytes are
not enough and 5-levels (2^64) for the bragging rights.
The pxd_free_tlb() turn into nops if the number of page table levels require it.

2) The mm->context.flush_mm indication.
That goes back to this beauty in the architecture:

* "A valid table entry must not be changed while it is attached
* to any CPU and may be used for translation by that CPU except to
* (1) invalidate the entry by using INVALIDATE PAGE TABLE ENTRY,
* or INVALIDATE DAT TABLE ENTRY, (2) alter bits 56-63 of a page
* table entry, or (3) make a change by means of a COMPARE AND SWAP
* AND PURGE instruction that purges the TLB."

If one CPU is doing a mmu_gather page table operation on the only active thread
in the system the individual page table updates are done in a lazy fashion with
simple stores. If a second CPU picks up another thread for execution, the
attach_count is increased and the page table updates are done with IPTE/IDTE
from now on. But there might by TLBs of around that are not flushed yet.
We may *not* let the second CPU see these TLBs, otherwise the CPU may start an
instruction, then loose the TLB without being able to recreate it. Due to that
the CPU can end up with a half finished instruction it can not roll back nor
complete, ending in a check-stop. The simplest example is MVC with a length
of e.g. 256 bytes. The instruction has to complete with all 256 bytes moved,
or no bytes may have at all.
That is where the mm->context.flush_mm indication comes into play, if the
second CPU finds the bit set at the time it attaches a thread, it will to
an IDTE for flush all TLBs for the mm.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2018-09-19 16:47:47

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH 2/2] s390/tlb: convert to generic mmu_gather

On Wed, Sep 19, 2018 at 04:28:09PM +0200, Martin Schwidefsky wrote:
> On Wed, 19 Sep 2018 14:38:49 +0200
> Peter Zijlstra <[email protected]> wrote:
>
> > On Tue, Sep 18, 2018 at 02:51:51PM +0200, Martin Schwidefsky wrote:

> > > + page_table_free_rcu(tlb, (unsigned long *) pte, address);
> >
> > (whitespace damage, fixed)
> >
> > Also, could you perhaps explain the need for that
> > page_table_alloc/page_table_free code? That is, I get the comment about
> > using 2K page-table fragments out of 4k physical page, but why this
> > custom allocator instead of kmem_cache? It feels like there's a little
> > extra complication, but it's not immediately obvious what.
>
> The kmem_cache code uses the fields of struct page for its tracking.
> pgtable_page_ctor uses the same fields, e.g. for the ptl. Last time
> I tried to convert the page_table_alloc/page_table_free to kmem_cache
> it just crashed. Plus the split of 4K pages into 2 2K fragments is
> done on a per mm basis, that should help a little bit with fragmentation.

Fair enough, thanks for the information.

> > It's that ASCE limit that makes it impossible to use the generic
> > helpers, right?
>
> There are two problems, one of them is related to the ASCE limit:
>
> 1) s390 supports 4 different page table layouts. 2-levels (2^31 bytes) for 31-bit compat,
> 3-levels (2^42 bytes) as the default for 64-bit, 4-levels (2^53) if 4 tera-bytes are
> not enough and 5-levels (2^64) for the bragging rights.
> The pxd_free_tlb() turn into nops if the number of page table levels require it.

Shiny, I think we (x86) have to choose at boot time which paging mode we
want and have to stick to it.

> 2) The mm->context.flush_mm indication.
> That goes back to this beauty in the architecture:
>
> * "A valid table entry must not be changed while it is attached
> * to any CPU and may be used for translation by that CPU except to
> * (1) invalidate the entry by using INVALIDATE PAGE TABLE ENTRY,
> * or INVALIDATE DAT TABLE ENTRY, (2) alter bits 56-63 of a page
> * table entry, or (3) make a change by means of a COMPARE AND SWAP
> * AND PURGE instruction that purges the TLB."
>
> If one CPU is doing a mmu_gather page table operation on the only active thread
> in the system the individual page table updates are done in a lazy fashion with
> simple stores. If a second CPU picks up another thread for execution, the
> attach_count is increased and the page table updates are done with IPTE/IDTE
> from now on. But there might by TLBs of around that are not flushed yet.
> We may *not* let the second CPU see these TLBs, otherwise the CPU may start an
> instruction, then loose the TLB without being able to recreate it. Due to that
> the CPU can end up with a half finished instruction it can not roll back nor
> complete, ending in a check-stop. The simplest example is MVC with a length
> of e.g. 256 bytes. The instruction has to complete with all 256 bytes moved,
> or no bytes may have at all.
> That is where the mm->context.flush_mm indication comes into play, if the
> second CPU finds the bit set at the time it attaches a thread, it will to
> an IDTE for flush all TLBs for the mm.

Oh man.. what fun. Still, this bit could easily be set in the
__*_free_tlb() functions afaict. Still 1) above is enough.

Thanks!