2018-08-29 15:49:19

by Andy Lutomirski

Subject: [PATCH v2] x86/nmi: Fix some races in NMI uaccess

In NMI context, we might be in the middle of context switching or in
the middle of switch_mm_irqs_off(). In either case, CR3 might not
match current->mm, which could cause copy_from_user_nmi() and
friends to read the wrong memory.

Fix it by adding a new nmi_uaccess_okay() helper and checking it in
copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
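
The ordering that makes the check cheap is a small write protocol in
switch_mm_irqs_off(). As a compilable illustration of that protocol
(simplified: the per-CPU accessors and the actual CR3 write are
stubbed out, so this is a sketch of the idea, not the code below):

#include <stdbool.h>

struct mm_struct { void *pgd; };

#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
#define barrier() __asm__ __volatile__("" ::: "memory")

static struct mm_struct *loaded_mm;	/* models cpu_tlbstate.loaded_mm */
static struct mm_struct *current_mm;	/* models current->mm */

/* Writer side: the shape of switch_mm_irqs_off() with this patch. */
static void switch_mm_sketch(struct mm_struct *next)
{
	loaded_mm = LOADED_MM_SWITCHING;	/* NMIs must back off now */
	barrier();	/* publish the marker before CR3 changes */
	/* write_cr3(__pa(next->pgd)) happens here in the real code */
	barrier();	/* CR3 change completes before the final update */
	loaded_mm = next;
}

/* Reader side: the shape of nmi_uaccess_okay(). */
static bool nmi_uaccess_okay_sketch(void)
{
	/*
	 * LOADED_MM_SWITCHING never equals a real mm pointer, so a
	 * single comparison rejects both a mismatched mm and the
	 * switching window itself.
	 */
	return loaded_mm == current_mm;
}

Note that barrier() is only a compiler barrier: the window is
interrupts-off and an NMI on the same CPU observes program order, so
no stronger fence is needed.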

Cc: [email protected]
Cc: Peter Zijlstra <[email protected]>
Cc: Nadav Amit <[email protected]>
Signed-off-by: Andy Lutomirski <[email protected]>
---

Nadav, this is intended for your series. Want to add it right
before the use_temporary_mm() stuff?

Changes from v1:
- Improved comments
- Add debugging
- Fix bugs

 arch/x86/events/core.c          |  2 +-
 arch/x86/include/asm/tlbflush.h | 44 +++++++++++++++++++++++++++++++++
 arch/x86/lib/usercopy.c         |  5 ++++
 arch/x86/mm/tlb.c               |  7 ++++++
 4 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f10129..dfb2f7c0d019 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 
 	perf_callchain_store(entry, regs->ip);
 
-	if (!current->mm)
+	if (!nmi_uaccess_okay())
 		return;
 
 	if (perf_callchain_user32(regs, entry))
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 29c9da6c62fc..183d36a98b04 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -175,8 +175,16 @@ struct tlb_state {
 	 * are on. This means that it may not match current->active_mm,
 	 * which will contain the previous user mm when we're in lazy TLB
 	 * mode even if we've already switched back to swapper_pg_dir.
+	 *
+	 * During switch_mm_irqs_off(), loaded_mm will be set to
+	 * LOADED_MM_SWITCHING during the brief interrupts-off window
+	 * when CR3 and loaded_mm would otherwise be inconsistent. This
+	 * is for nmi_uaccess_okay()'s benefit.
 	 */
 	struct mm_struct *loaded_mm;
+
+#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
+
 	u16 loaded_mm_asid;
 	u16 next_asid;
 	/* last user mm's ctx id */
@@ -246,6 +254,42 @@ struct tlb_state {
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm. It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	struct mm_struct *current_mm = current->mm;
+
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON_ONCE(!loaded_mm);
+#endif
+
+	/*
+	 * The condition we want to check is
+	 * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
+	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+	 * is supposed to be reasonably fast.
+	 *
+	 * Instead, we check the almost equivalent but somewhat conservative
+	 * condition below, and we rely on the fact that switch_mm_irqs_off()
+	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+	 */
+	if (loaded_mm != current_mm)
+		return false;
+
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+#endif
+
+	return true;
+}
+
 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
 {
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c8c6ad0d58b8..3f435d7fca5e 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -7,6 +7,8 @@
 #include <linux/uaccess.h>
 #include <linux/export.h>
 
+#include <asm/tlbflush.h>
+
 /*
  * We rely on the nested NMI work to allow atomic faults from the NMI path; the
  * nested NMI paths are careful to preserve CR2.
@@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	if (__range_not_ok(from, n, TASK_SIZE))
 		return n;
 
+	if (!nmi_uaccess_okay())
+		return n;
+
 	/*
 	 * Even though this function is typically called from NMI/IRQ context
 	 * disable pagefaults so that its behaviour is consistent even when
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e721cf2d4290..707d2b2a333f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -304,6 +304,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
+		/* Let nmi_uaccess_okay() know that we're changing CR3. */
+		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+		barrier();
+
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -334,6 +338,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	if (next != &init_mm)
 		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 
+	/* Make sure we write CR3 before loaded_mm. */
+	barrier();
+
 	this_cpu_write(cpu_tlbstate.loaded_mm, next);
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 }
--
2.17.1



2018-08-29 16:08:56

by Rik van Riel

Subject: Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess

On Wed, 2018-08-29 at 08:47 -0700, Andy Lutomirski wrote:
> In NMI context, we might be in the middle of context switching or in
> the middle of switch_mm_irqs_off(). In either case, CR3 might not
> match current->mm, which could cause copy_from_user_nmi() and
> friends to read the wrong memory.
>
> Fix it by adding a new nmi_uaccess_okay() helper and checking it in
> copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
>
> Cc: [email protected]
> Cc: Peter Zijlstra <[email protected]>
> Cc: Nadav Amit <[email protected]>
> Signed-off-by: Andy Lutomirski <[email protected]>

Reviewed-by: Rik van Riel <[email protected]>

--
All Rights Reversed.



2018-08-29 18:58:17

by Nadav Amit

Subject: Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess

at 8:47 AM, Andy Lutomirski <[email protected]> wrote:

> In NMI context, we might be in the middle of context switching or in
> the middle of switch_mm_irqs_off(). In either case, CR3 might not
> match current->mm, which could cause copy_from_user_nmi() and
> friends to read the wrong memory.
>
> Fix it by adding a new nmi_uaccess_okay() helper and checking it in
> copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
>
> Cc: [email protected]
> Cc: Peter Zijlstra <[email protected]>
> Cc: Nadav Amit <[email protected]>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> Nadav, this is intended for your series. Want to add it right
> before the use_temporary_mm() stuff?

Sure. Thanks! I will apply the following small fix:

> +
> +#ifdef CONFIG_DEBUG_VM
> +	WARN_ON_ONCE(!loaded_mm);
> +#endif

Will be changed to VM_WARN_ON_ONCE() in the two instances.
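
I.e., the open-coded debug check becomes (a sketch of the cleanup,
not the exact applied hunk):

	/* v2 as posted: */
#ifdef CONFIG_DEBUG_VM
	WARN_ON_ONCE(!loaded_mm);
#endif

	/* as applied: */
	VM_WARN_ON_ONCE(!loaded_mm);

VM_WARN_ON_ONCE() already compiles down to a build-time expression
check when CONFIG_DEBUG_VM is off, so the #ifdef is unnecessary.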

Regards,
Nadav

2018-08-30 13:38:37

by Thomas Gleixner

Subject: Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess

On Wed, 29 Aug 2018, Nadav Amit wrote:
> at 8:47 AM, Andy Lutomirski <[email protected]> wrote:
>
> > In NMI context, we might be in the middle of context switching or in
> > the middle of switch_mm_irqs_off(). In either case, CR3 might not
> > match current->mm, which could cause copy_from_user_nmi() and
> > friends to read the wrong memory.
> >
> > Fix it by adding a new nmi_uaccess_okay() helper and checking it in
> > copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
> >
> > Cc: [email protected]
> > Cc: Peter Zijlstra <[email protected]>
> > Cc: Nadav Amit <[email protected]>
> > Signed-off-by: Andy Lutomirski <[email protected]>
> > ---
> >
> > Nadav, this is intended for your series. Want to add it right
> > before the use_temporary_mm() stuff?
>
> Sure. Thanks! I will apply the following small fix:
>
> > +
> > +#ifdef CONFIG_DEBUG_VM
> > +	WARN_ON_ONCE(!loaded_mm);
> > +#endif
>
> Will be changed to VM_WARN_ON_ONCE() in the two instances.

Unless I'm completely lost, this can just be applied to tip right
away. It's not depending on anything else.

Thanks,

tglx

2018-08-30 14:22:46

by Andy Lutomirski

Subject: Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess



> On Aug 30, 2018, at 6:36 AM, Thomas Gleixner <[email protected]> wrote:
>
>> On Wed, 29 Aug 2018, Nadav Amit wrote:
>> at 8:47 AM, Andy Lutomirski <[email protected]> wrote:
>>
>>> In NMI context, we might be in the middle of context switching or in
>>> the middle of switch_mm_irqs_off(). In either case, CR3 might not
>>> match current->mm, which could cause copy_from_user_nmi() and
>>> friends to read the wrong memory.
>>>
>>> Fix it by adding a new nmi_uaccess_okay() helper and checking it in
>>> copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
>>>
>>> Cc: [email protected]
>>> Cc: Peter Zijlstra <[email protected]>
>>> Cc: Nadav Amit <[email protected]>
>>> Signed-off-by: Andy Lutomirski <[email protected]>
>>> ---
>>>
>>> Nadav, this is intended for your series. Want to add it right
>>> before the use_temporary_mm() stuff?
>>
>> Sure. Thanks! I will apply the following small fix:
>>
>>> +
>>> +#ifdef CONFIG_DEBUG_VM
>>> +	WARN_ON_ONCE(!loaded_mm);
>>> +#endif
>>
>> Will be changed to VM_WARN_ON_ONCE() in the two instances.
>
> Unless I'm completely lost, this can just be applied to tip right
> away. It's not depending on anything else.
>

Fine with me. Do you want to do the VM_WARN_ON cleanup yourself or should I send a v3?

2018-08-30 14:29:12

by Thomas Gleixner

Subject: Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess

On Thu, 30 Aug 2018, Andy Lutomirski wrote:
> > On Aug 30, 2018, at 6:36 AM, Thomas Gleixner <[email protected]> wrote:
> >
> >> On Wed, 29 Aug 2018, Nadav Amit wrote:
> >> at 8:47 AM, Andy Lutomirski <[email protected]> wrote:
> >>
> >>> In NMI context, we might be in the middle of context switching or in
> >>> the middle of switch_mm_irqs_off(). In either case, CR3 might not
> >>> match current->mm, which could cause copy_from_user_nmi() and
> >>> friends to read the wrong memory.
> >>>
> >>> Fix it by adding a new nmi_uaccess_okay() helper and checking it in
> >>> copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
> >>>
> >>> Cc: [email protected]
> >>> Cc: Peter Zijlstra <[email protected]>
> >>> Cc: Nadav Amit <[email protected]>
> >>> Signed-off-by: Andy Lutomirski <[email protected]>
> >>> ---
> >>>
> >>> Nadav, this is intended for your series. Want to add it right
> >>> before the use_temporary_mm() stuff?
> >>
> >> Sure. Thanks! I will apply the following small fix:
> >>
> >>> +
> >>> +#ifdef CONFIG_DEBUG_VM
> >>> +	WARN_ON_ONCE(!loaded_mm);
> >>> +#endif
> >>
> >> Will be changed to VM_WARN_ON_ONCE() in the two instances.
> >
> > Unless I'm completely lost, this can just be applied to tip right
> > away. It's not depending on anything else.
> >
>
> Fine with me. Do you want to do the VM_WARN_ON cleanup yourself or should
> I send a v3?

I think I'll manage.

Subject: [tip:x86/urgent] x86/nmi: Fix NMI uaccess race against CR3 switching

Commit-ID: 4012e77a903d114f915fc607d6d2ed54a3d6c9b1
Gitweb: https://git.kernel.org/tip/4012e77a903d114f915fc607d6d2ed54a3d6c9b1
Author: Andy Lutomirski <[email protected]>
AuthorDate: Wed, 29 Aug 2018 08:47:18 -0700
Committer: Thomas Gleixner <[email protected]>
CommitDate: Fri, 31 Aug 2018 17:08:22 +0200

x86/nmi: Fix NMI uaccess race against CR3 switching

A NMI can hit in the middle of context switching or in the middle of
switch_mm_irqs_off(). In either case, CR3 might not match current->mm,
which could cause copy_from_user_nmi() and friends to read the wrong
memory.

Fix it by adding a new nmi_uaccess_okay() helper and checking it in
copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.

Signed-off-by: Andy Lutomirski <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Rik van Riel <[email protected]>
Cc: Nadav Amit <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Jann Horn <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: [email protected]
Link: https://lkml.kernel.org/r/dd956eba16646fd0b15c3c0741269dfd84452dac.1535557289.git.luto@kernel.org


---
 arch/x86/events/core.c          |  2 +-
 arch/x86/include/asm/tlbflush.h | 40 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/usercopy.c         |  5 +++++
 arch/x86/mm/tlb.c               |  7 +++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f10129..dfb2f7c0d019 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 
 	perf_callchain_store(entry, regs->ip);
 
-	if (!current->mm)
+	if (!nmi_uaccess_okay())
 		return;
 
 	if (perf_callchain_user32(regs, entry))
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 29c9da6c62fc..58ce5288878e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -175,8 +175,16 @@ struct tlb_state {
 	 * are on. This means that it may not match current->active_mm,
 	 * which will contain the previous user mm when we're in lazy TLB
 	 * mode even if we've already switched back to swapper_pg_dir.
+	 *
+	 * During switch_mm_irqs_off(), loaded_mm will be set to
+	 * LOADED_MM_SWITCHING during the brief interrupts-off window
+	 * when CR3 and loaded_mm would otherwise be inconsistent. This
+	 * is for nmi_uaccess_okay()'s benefit.
 	 */
 	struct mm_struct *loaded_mm;
+
+#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
+
 	u16 loaded_mm_asid;
 	u16 next_asid;
 	/* last user mm's ctx id */
@@ -246,6 +254,38 @@ struct tlb_state {
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm. It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	struct mm_struct *current_mm = current->mm;
+
+	VM_WARN_ON_ONCE(!loaded_mm);
+
+	/*
+	 * The condition we want to check is
+	 * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
+	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+	 * is supposed to be reasonably fast.
+	 *
+	 * Instead, we check the almost equivalent but somewhat conservative
+	 * condition below, and we rely on the fact that switch_mm_irqs_off()
+	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+	 */
+	if (loaded_mm != current_mm)
+		return false;
+
+	VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+	return true;
+}
+
 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
 {
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c8c6ad0d58b8..3f435d7fca5e 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -7,6 +7,8 @@
 #include <linux/uaccess.h>
 #include <linux/export.h>
 
+#include <asm/tlbflush.h>
+
 /*
  * We rely on the nested NMI work to allow atomic faults from the NMI path; the
  * nested NMI paths are careful to preserve CR2.
@@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	if (__range_not_ok(from, n, TASK_SIZE))
 		return n;
 
+	if (!nmi_uaccess_okay())
+		return n;
+
 	/*
 	 * Even though this function is typically called from NMI/IRQ context
 	 * disable pagefaults so that its behaviour is consistent even when
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 9517d1b2a281..e96b99eb800c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -305,6 +305,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
+		/* Let nmi_uaccess_okay() know that we're changing CR3. */
+		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+		barrier();
+
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -335,6 +339,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	if (next != &init_mm)
 		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 
+	/* Make sure we write CR3 before loaded_mm. */
+	barrier();
+
 	this_cpu_write(cpu_tlbstate.loaded_mm, next);
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 }