This is some of the PCID prep work, but I think it's reasonable on
its own, too. I might also want it for FSGSBASE.
Andy Lutomirski (3):
sched: Add switch_mm_irqs_off and use it in the scheduler
x86/mm: Uninline switch_mm
x86/mm: Turn off IRQs in switch_mm
arch/x86/include/asm/mmu_context.h | 101 ++-------------------------------
arch/x86/mm/tlb.c | 112 +++++++++++++++++++++++++++++++++++++
include/linux/mmu_context.h | 7 +++
kernel/sched/core.c | 6 +-
4 files changed, 127 insertions(+), 99 deletions(-)
--
2.5.5
By default, switch_mm_irqs_off() is the same thing as switch_mm(). x86
will override it as an optimization.
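As a rough sketch of the intended usage (the x86 override is what a
later patch in this series adds; it is shown here only for
illustration):

    /* Generic fallback in include/linux/mmu_context.h: */
    #ifndef switch_mm_irqs_off
    #define switch_mm_irqs_off switch_mm
    #endif

    /* An architecture that wants the optimization declares its own
     * helper in its asm/mmu_context.h and overrides the fallback: */
    extern void switch_mm_irqs_off(struct mm_struct *prev,
                                   struct mm_struct *next,
                                   struct task_struct *tsk);
    #define switch_mm_irqs_off switch_mm_irqs_off

Callers that are known to run with IRQs disabled (the scheduler) call
switch_mm_irqs_off(); everyone else keeps calling switch_mm().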
Cc: Peter Zijlstra <[email protected]>
Signed-off-by: Andy Lutomirski <[email protected]>
---
include/linux/mmu_context.h | 7 +++++++
kernel/sched/core.c | 6 +++---
2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index 70fffeba7495..f9e09613c113 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -1,9 +1,16 @@
#ifndef _LINUX_MMU_CONTEXT_H
#define _LINUX_MMU_CONTEXT_H
+#include <asm/mmu_context.h>
+
struct mm_struct;
void use_mm(struct mm_struct *mm);
void unuse_mm(struct mm_struct *mm);
+/* Architectures that care about IRQ state in switch_mm can override this. */
+#ifndef switch_mm_irqs_off
+#define switch_mm_irqs_off switch_mm
+#endif
+
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8465eeab8b3..3636abcafac0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -33,7 +33,7 @@
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
-#include <asm/mmu_context.h>
+#include <linux/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
@@ -2712,7 +2712,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
- switch_mm(oldmm, mm, next);
+ switch_mm_irqs_off(oldmm, mm, next);
if (!prev->mm) {
prev->active_mm = NULL;
@@ -5202,7 +5202,7 @@ void idle_task_exit(void)
BUG_ON(cpu_online(smp_processor_id()));
if (mm != &init_mm) {
- switch_mm(mm, &init_mm, current);
+ switch_mm_irqs_off(mm, &init_mm, current);
finish_arch_post_lock_switch();
}
mmdrop(mm);
--
2.5.5
switch_mm() is fairly large and has quite a few callers. Moving it out
of line may also help untangle some headers down the road.
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/mmu_context.h | 98 +----------------------------------
arch/x86/mm/tlb.c | 102 +++++++++++++++++++++++++++++++++++++
2 files changed, 104 insertions(+), 96 deletions(-)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 84280029cafd..bb911dd7cd01 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -115,103 +115,9 @@ static inline void destroy_context(struct mm_struct *mm)
destroy_context_ldt(mm);
}
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
-{
- unsigned cpu = smp_processor_id();
+extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
- if (likely(prev != next)) {
-#ifdef CONFIG_SMP
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- this_cpu_write(cpu_tlbstate.active_mm, next);
-#endif
- cpumask_set_cpu(cpu, mm_cpumask(next));
-
- /*
- * Re-load page tables.
- *
- * This logic has an ordering constraint:
- *
- * CPU 0: Write to a PTE for 'next'
- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
- * CPU 1: set bit 1 in next's mm_cpumask
- * CPU 1: load from the PTE that CPU 0 writes (implicit)
- *
- * We need to prevent an outcome in which CPU 1 observes
- * the new PTE value and CPU 0 observes bit 1 clear in
- * mm_cpumask. (If that occurs, then the IPI will never
- * be sent, and CPU 0's TLB will contain a stale entry.)
- *
- * The bad outcome can occur if either CPU's load is
- * reordered before that CPU's store, so both CPUs must
- * execute full barriers to prevent this from happening.
- *
- * Thus, switch_mm needs a full barrier between the
- * store to mm_cpumask and any operation that could load
- * from next->pgd. TLB fills are special and can happen
- * due to instruction fetches or for no reason at all,
- * and neither LOCK nor MFENCE orders them.
- * Fortunately, load_cr3() is serializing and gives the
- * ordering guarantee we need.
- *
- */
- load_cr3(next->pgd);
-
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-
- /* Stop flush ipis for the previous mm */
- cpumask_clear_cpu(cpu, mm_cpumask(prev));
-
- /* Load per-mm CR4 state */
- load_mm_cr4(next);
-
-#ifdef CONFIG_MODIFY_LDT_SYSCALL
- /*
- * Load the LDT, if the LDT is different.
- *
- * It's possible that prev->context.ldt doesn't match
- * the LDT register. This can happen if leave_mm(prev)
- * was called and then modify_ldt changed
- * prev->context.ldt but suppressed an IPI to this CPU.
- * In this case, prev->context.ldt != NULL, because we
- * never set context.ldt to NULL while the mm still
- * exists. That means that next->context.ldt !=
- * prev->context.ldt, because mms never share an LDT.
- */
- if (unlikely(prev->context.ldt != next->context.ldt))
- load_mm_ldt(next);
-#endif
- }
-#ifdef CONFIG_SMP
- else {
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
-
- if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
- /*
- * On established mms, the mm_cpumask is only changed
- * from irq context, from ptep_clear_flush() while in
- * lazy tlb mode, and here. Irqs are blocked during
- * schedule, protecting us from simultaneous changes.
- */
- cpumask_set_cpu(cpu, mm_cpumask(next));
-
- /*
- * We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload CR3
- * to make sure to use no freed page tables.
- *
- * As above, load_cr3() is serializing and orders TLB
- * fills with respect to the mm_cpumask write.
- */
- load_cr3(next->pgd);
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- load_mm_cr4(next);
- load_mm_ldt(next);
- }
- }
-#endif
-}
#define activate_mm(prev, next) \
do { \
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8f4cc3dfac32..c826ea193279 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -57,6 +57,108 @@ void leave_mm(int cpu)
}
EXPORT_SYMBOL_GPL(leave_mm);
+#endif /* CONFIG_SMP */
+
+void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned cpu = smp_processor_id();
+
+ if (likely(prev != next)) {
+#ifdef CONFIG_SMP
+ this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ this_cpu_write(cpu_tlbstate.active_mm, next);
+#endif
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ /*
+ * Re-load page tables.
+ *
+ * This logic has an ordering constraint:
+ *
+ * CPU 0: Write to a PTE for 'next'
+ * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
+ * CPU 1: set bit 1 in next's mm_cpumask
+ * CPU 1: load from the PTE that CPU 0 writes (implicit)
+ *
+ * We need to prevent an outcome in which CPU 1 observes
+ * the new PTE value and CPU 0 observes bit 1 clear in
+ * mm_cpumask. (If that occurs, then the IPI will never
+ * be sent, and CPU 0's TLB will contain a stale entry.)
+ *
+ * The bad outcome can occur if either CPU's load is
+ * reordered before that CPU's store, so both CPUs must
+ * execute full barriers to prevent this from happening.
+ *
+ * Thus, switch_mm needs a full barrier between the
+ * store to mm_cpumask and any operation that could load
+ * from next->pgd. TLB fills are special and can happen
+ * due to instruction fetches or for no reason at all,
+ * and neither LOCK nor MFENCE orders them.
+ * Fortunately, load_cr3() is serializing and gives the
+ * ordering guarantee we need.
+ *
+ */
+ load_cr3(next->pgd);
+
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+
+ /* Stop flush ipis for the previous mm */
+ cpumask_clear_cpu(cpu, mm_cpumask(prev));
+
+ /* Load per-mm CR4 state */
+ load_mm_cr4(next);
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ /*
+ * Load the LDT, if the LDT is different.
+ *
+ * It's possible that prev->context.ldt doesn't match
+ * the LDT register. This can happen if leave_mm(prev)
+ * was called and then modify_ldt changed
+ * prev->context.ldt but suppressed an IPI to this CPU.
+ * In this case, prev->context.ldt != NULL, because we
+ * never set context.ldt to NULL while the mm still
+ * exists. That means that next->context.ldt !=
+ * prev->context.ldt, because mms never share an LDT.
+ */
+ if (unlikely(prev->context.ldt != next->context.ldt))
+ load_mm_ldt(next);
+#endif
+ }
+#ifdef CONFIG_SMP
+ else {
+ this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
+
+ if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
+ /*
+ * On established mms, the mm_cpumask is only changed
+ * from irq context, from ptep_clear_flush() while in
+ * lazy tlb mode, and here. Irqs are blocked during
+ * schedule, protecting us from simultaneous changes.
+ */
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ /*
+ * We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload CR3
+ * to make sure to use no freed page tables.
+ *
+ * As above, load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask write.
+ */
+ load_cr3(next->pgd);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ load_mm_cr4(next);
+ load_mm_ldt(next);
+ }
+ }
+#endif
+}
+
+#ifdef CONFIG_SMP
+
/*
* The flush IPI assumes that a thread switch happens in this order:
* [cpu0: the cpu that switches]
--
2.5.5
Potential races between switch_mm and TLB-flush or LDT-flush IPIs
could be very messy. AFAICT the code is currently okay, whether by
accident or by careful design, but enabling PCID will make it
considerably more complicated and will no longer be obviously safe.
Fix it with a big hammer: run switch_mm with IRQs off.
To avoid a performance hit in the scheduler, we take advantage of
our knowledge that the scheduler already has IRQs disabled when it
calls switch_mm.
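As a sketch of the two call paths once the whole series is applied
(not part of this patch; use_mm() is just one example of an existing
generic caller):

    /* Generic callers may have IRQs enabled, so they go through the
     * switch_mm() wrapper, which now disables IRQs around the body: */
    switch_mm(active_mm, mm, tsk);

    /* The scheduler's context_switch() already runs with IRQs off,
     * so it calls the body directly and skips the save/restore: */
    switch_mm_irqs_off(oldmm, mm, next);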
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/mmu_context.h | 3 +++
arch/x86/mm/tlb.c | 10 ++++++++++
2 files changed, 13 insertions(+)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index bb911dd7cd01..396348196aa7 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -118,6 +118,9 @@ static inline void destroy_context(struct mm_struct *mm)
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk);
+extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
+#define switch_mm_irqs_off switch_mm_irqs_off
#define activate_mm(prev, next) \
do { \
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c826ea193279..40acd07da324 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -62,6 +62,16 @@ EXPORT_SYMBOL_GPL(leave_mm);
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ switch_mm_irqs_off(prev, next, tsk);
+ local_irq_restore(flags);
+}
+
+void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
unsigned cpu = smp_processor_id();
if (likely(prev != next)) {
--
2.5.5
On Thu, Apr 21, 2016 at 10:57 AM, Andy Lutomirski <[email protected]> wrote:
> This is some of the PCID prep work, but I think it's reasonable on
> its own, too. I might also want it for FSGSBASE.
I rebased this wrong. I'll send a new version after the kbuild bot
has some time to test it.
>
> Andy Lutomirski (3):
> sched: Add switch_mm_irqs_off and use it in the scheduler
> x86/mm: Uninline switch_mm
> x86/mm: Turn off IRQs in switch_mm
>
> arch/x86/include/asm/mmu_context.h | 101 ++-------------------------------
> arch/x86/mm/tlb.c | 112 +++++++++++++++++++++++++++++++++++++
> include/linux/mmu_context.h | 7 +++
> kernel/sched/core.c | 6 +-
> 4 files changed, 127 insertions(+), 99 deletions(-)
>
> --
> 2.5.5
>
--
Andy Lutomirski
AMA Capital Management, LLC