It is an abomination; and in prepration of removing the whole
ist_enter() thing, it needs to go.
Convert #MC over to using task_work_add() instead; it will run the
same code slightly later, on the return to user path of the same
exception.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/include/asm/traps.h | 2 -
arch/x86/kernel/cpu/mce/core.c | 53 +++++++++++++++++++++++------------------
arch/x86/kernel/traps.c | 37 ----------------------------
include/linux/sched.h | 6 ++++
4 files changed, 36 insertions(+), 62 deletions(-)
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -123,8 +123,6 @@ asmlinkage void smp_irq_move_cleanup_int
extern void ist_enter(struct pt_regs *regs);
extern void ist_exit(struct pt_regs *regs);
-extern void ist_begin_non_atomic(struct pt_regs *regs);
-extern void ist_end_non_atomic(void);
#ifdef CONFIG_VMAP_STACK
void __noreturn handle_stack_overflow(const char *message,
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -42,6 +42,7 @@
#include <linux/export.h>
#include <linux/jump_label.h>
#include <linux/set_memory.h>
+#include <linux/task_work.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
@@ -1084,23 +1085,6 @@ static void mce_clear_state(unsigned lon
}
}
-static int do_memory_failure(struct mce *m)
-{
- int flags = MF_ACTION_REQUIRED;
- int ret;
-
- pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
- if (!(m->mcgstatus & MCG_STATUS_RIPV))
- flags |= MF_MUST_KILL;
- ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
- if (ret)
- pr_err("Memory error not recovered");
- else
- set_mce_nospec(m->addr >> PAGE_SHIFT);
- return ret;
-}
-
-
/*
* Cases where we avoid rendezvous handler timeout:
* 1) If this CPU is offline.
@@ -1202,6 +1186,29 @@ static void __mc_scan_banks(struct mce *
*m = *final;
}
+static void mce_kill_me_now(struct callback_head *ch)
+{
+ force_sig(SIGBUS);
+}
+
+static void mce_kill_me_maybe(struct callback_head *cb)
+{
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ int flags = MF_ACTION_REQUIRED;
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
+ if (!(p->mce_status & MCG_STATUS_RIPV))
+ flags |= MF_MUST_KILL;
+
+ if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+ return;
+ }
+
+ pr_err("Memory error not recovered");
+ mce_kill_me_now(cb);
+}
+
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
@@ -1344,13 +1351,13 @@ void do_machine_check(struct pt_regs *re
/* Fault was in user mode and we need to take some action */
if ((m.cs & 3) == 3) {
- ist_begin_non_atomic(regs);
- local_irq_enable();
+ current->mce_addr = m.addr;
+ current->mce_status = m.mcgstatus;
+ current->mce_kill_me.func = mce_kill_me_maybe;
+ if (kill_it)
+ current->mce_kill_me.func = mce_kill_me_now;
- if (kill_it || do_memory_failure(&m))
- force_sig(SIGBUS);
- local_irq_disable();
- ist_end_non_atomic();
+ task_work_add(current, ¤t->mce_kill_me, true);
} else {
if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
mce_panic("Failed kernel mode recovery", &m, msg);
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -117,43 +117,6 @@ void ist_exit(struct pt_regs *regs)
rcu_nmi_exit();
}
-/**
- * ist_begin_non_atomic() - begin a non-atomic section in an IST exception
- * @regs: regs passed to the IST exception handler
- *
- * IST exception handlers normally cannot schedule. As a special
- * exception, if the exception interrupted userspace code (i.e.
- * user_mode(regs) would return true) and the exception was not
- * a double fault, it can be safe to schedule. ist_begin_non_atomic()
- * begins a non-atomic section within an ist_enter()/ist_exit() region.
- * Callers are responsible for enabling interrupts themselves inside
- * the non-atomic section, and callers must call ist_end_non_atomic()
- * before ist_exit().
- */
-void ist_begin_non_atomic(struct pt_regs *regs)
-{
- BUG_ON(!user_mode(regs));
-
- /*
- * Sanity check: we need to be on the normal thread stack. This
- * will catch asm bugs and any attempt to use ist_preempt_enable
- * from double_fault.
- */
- BUG_ON(!on_thread_stack());
-
- preempt_enable_no_resched();
-}
-
-/**
- * ist_end_non_atomic() - begin a non-atomic section in an IST exception
- *
- * Ends a non-atomic section started with ist_begin_non_atomic().
- */
-void ist_end_non_atomic(void)
-{
- preempt_disable();
-}
-
int is_valid_bugaddr(unsigned long addr)
{
unsigned short ud;
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1285,6 +1285,12 @@ struct task_struct {
unsigned long prev_lowest_stack;
#endif
+#ifdef CONFIG_X86_MCE
+ u64 mce_addr;
+ u64 mce_status;
+ struct callback_head mce_kill_me;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
On Wed, Feb 19, 2020 at 03:47:26PM +0100, Peter Zijlstra wrote:
> Subject: Re: [PATCH v3 02/22] x86,mce: Delete ist_begin_non_atomic()
x86/mce: ...
> It is an abomination; and in prepration of removing the whole
> ist_enter() thing, it needs to go.
>
> Convert #MC over to using task_work_add() instead; it will run the
> same code slightly later, on the return to user path of the same
> exception.
That's fine because the error happened in userspace.
...
> @@ -1202,6 +1186,29 @@ static void __mc_scan_banks(struct mce *
> *m = *final;
> }
>
> +static void mce_kill_me_now(struct callback_head *ch)
> +{
> + force_sig(SIGBUS);
> +}
> +
> +static void mce_kill_me_maybe(struct callback_head *cb)
You don't even need the "mce_" prefixes - those are static functions and
in mce-land.
Change looks good otherwise.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Wed, Feb 19, 2020 at 9:13 AM Borislav Petkov <[email protected]> wrote:
>
> On Wed, Feb 19, 2020 at 03:47:26PM +0100, Peter Zijlstra wrote:
> > Subject: Re: [PATCH v3 02/22] x86,mce: Delete ist_begin_non_atomic()
>
> x86/mce: ...
>
> > It is an abomination; and in prepration of removing the whole
> > ist_enter() thing, it needs to go.
> >
> > Convert #MC over to using task_work_add() instead; it will run the
> > same code slightly later, on the return to user path of the same
> > exception.
>
> That's fine because the error happened in userspace.
Unless there is a signal pending and the signal setup code is about to
hit the same failed memory. I suppose we can just treat cases like
this as "oh well, time to kill the whole system".
But we should genuinely agree that we're okay with deferring this handling.
On Wed, Feb 19, 2020 at 09:21:48AM -0800, Andy Lutomirski wrote:
> On Wed, Feb 19, 2020 at 9:13 AM Borislav Petkov <[email protected]> wrote:
> >
> > On Wed, Feb 19, 2020 at 03:47:26PM +0100, Peter Zijlstra wrote:
> > > Subject: Re: [PATCH v3 02/22] x86,mce: Delete ist_begin_non_atomic()
> >
> > x86/mce: ...
> >
> > > It is an abomination; and in prepration of removing the whole
> > > ist_enter() thing, it needs to go.
> > >
> > > Convert #MC over to using task_work_add() instead; it will run the
> > > same code slightly later, on the return to user path of the same
> > > exception.
> >
> > That's fine because the error happened in userspace.
>
> Unless there is a signal pending and the signal setup code is about to
> hit the same failed memory. I suppose we can just treat cases like
> this as "oh well, time to kill the whole system".
>
> But we should genuinely agree that we're okay with deferring this handling.
It doesn't delay much. The moment it does that local_irq_enable() it's
subject to preemption, just like it is on the return to user path.
Do you really want to create code that unwinds enough of nmi_enter() to
get you to a preemptible context? *shudder*
On Wed, Feb 19, 2020 at 09:21:48AM -0800, Andy Lutomirski wrote:
> Unless there is a signal pending and the signal setup code is about to
> hit the same failed memory. I suppose we can just treat cases like
> this as "oh well, time to kill the whole system".
>
> But we should genuinely agree that we're okay with deferring this handling.
Good catch!
static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
{
...
/* deal with pending signal delivery */
if (cached_flags & _TIF_SIGPENDING)
do_signal(regs);
if (cached_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
rseq_handle_notify_resume(NULL, regs);
}
Err, can we make task_work run before we handle signals? Or there's a
reason it is run in this order?
Comment over task_work_add() says:
* This is like the signal handler which runs in kernel mode, but it doesn't
* try to wake up the @task.
which sounds to me like this should really run before the signal
handlers...
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Wed, Feb 19, 2020 at 06:42:23PM +0100, Borislav Petkov wrote:
> On Wed, Feb 19, 2020 at 09:21:48AM -0800, Andy Lutomirski wrote:
> > Unless there is a signal pending and the signal setup code is about to
> > hit the same failed memory. I suppose we can just treat cases like
> > this as "oh well, time to kill the whole system".
> >
> > But we should genuinely agree that we're okay with deferring this handling.
>
> Good catch!
>
> static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
> {
>
> ...
>
> /* deal with pending signal delivery */
> if (cached_flags & _TIF_SIGPENDING)
> do_signal(regs);
>
> if (cached_flags & _TIF_NOTIFY_RESUME) {
> clear_thread_flag(TIF_NOTIFY_RESUME);
> tracehook_notify_resume(regs);
> rseq_handle_notify_resume(NULL, regs);
> }
>
>
> Err, can we make task_work run before we handle signals? Or there's a
> reason it is run in this order?
>
> Comment over task_work_add() says:
>
> * This is like the signal handler which runs in kernel mode, but it doesn't
> * try to wake up the @task.
>
> which sounds to me like this should really run before the signal
> handlers...
here goes...
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -155,16 +155,16 @@ static void exit_to_usermode_loop(struct
if (cached_flags & _TIF_PATCH_PENDING)
klp_update_patch_state(current);
- /* deal with pending signal delivery */
- if (cached_flags & _TIF_SIGPENDING)
- do_signal(regs);
-
if (cached_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
rseq_handle_notify_resume(NULL, regs);
}
+ /* deal with pending signal delivery */
+ if (cached_flags & _TIF_SIGPENDING)
+ do_signal(regs);
+
if (cached_flags & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
On Wed, Feb 19, 2020 at 9:34 AM Peter Zijlstra <[email protected]> wrote:
>
> On Wed, Feb 19, 2020 at 09:21:48AM -0800, Andy Lutomirski wrote:
> > On Wed, Feb 19, 2020 at 9:13 AM Borislav Petkov <[email protected]> wrote:
> > >
> > > On Wed, Feb 19, 2020 at 03:47:26PM +0100, Peter Zijlstra wrote:
> > > > Subject: Re: [PATCH v3 02/22] x86,mce: Delete ist_begin_non_atomic()
> > >
> > > x86/mce: ...
> > >
> > > > It is an abomination; and in prepration of removing the whole
> > > > ist_enter() thing, it needs to go.
> > > >
> > > > Convert #MC over to using task_work_add() instead; it will run the
> > > > same code slightly later, on the return to user path of the same
> > > > exception.
> > >
> > > That's fine because the error happened in userspace.
> >
> > Unless there is a signal pending and the signal setup code is about to
> > hit the same failed memory. I suppose we can just treat cases like
> > this as "oh well, time to kill the whole system".
> >
> > But we should genuinely agree that we're okay with deferring this handling.
>
> It doesn't delay much. The moment it does that local_irq_enable() it's
> subject to preemption, just like it is on the return to user path.
>
> Do you really want to create code that unwinds enough of nmi_enter() to
> get you to a preemptible context? *shudder*
Well, there's another way to approach this:
void notrace nonothing do_machine_check(struct pt_regs *regs)
{
if (user_mode(regs))
do_sane_machine_check(regs);
else
do_awful_machine_check(regs);
}
void do_sane_machine_check(regs)
{
nothing special here. just a regular exception, more or less.
}
void do_awful_macine_check(regs)
{
basically an NMI. No funny business, no recovery possible.
task_work_add() not allowed.
}
Or, even better, depending on how tglx's series shakes out, we could
build on this patch:
https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/idtentry&id=ebd8303dda34ea21476e4493cee671d998e83a48
and actually have two separate do_machine_check entry points. So we'd
do, roughly:
idtentry_ist ... normal_stack_entry=do_sane_machine_check
ist_stack_entry=do_awful_machine_check
and now there's no chance for confusion.
All of the above has some issues when Tony decides that he wants to
recover from specially annotated recoverable kernel memory accesses.
Then task_word_add() is a nonstarter, but the current
stack-switch-from-usermode *also* doesn't work. I floated the idea of
also doing the stack switch if we come from an IF=1 context, but
that's starting to get nasty.
One big question here: are memory failure #MC exceptions synchronous
or can they be delayed? If we get a memory failure, is it possible
that the #MC hits some random context and not the actual context where
the error occurred?
I suppose the general consideration I'm trying to get at is: is
task_work_add() actually useful at all here? For the case when a
kernel thread does memcpy_mcsafe() or similar, task work registered
using task_work_add() will never run.
--Andy
> One big question here: are memory failure #MC exceptions synchronous
> or can they be delayed? If we get a memory failure, is it possible
> that the #MC hits some random context and not the actual context where
> the error occurred?
There are a few cases:
1) SRAO (Software recoverable action optional) [Patrol scrub or L3 cache eviction]
These aren't synchronous with any core execution. Using machine check to signal
was probably a mistake - compounded by it being broadcast :-( Could pick any CPU
to handle (actually choose the first to arrive in do_machine_check()). That guy should
arrange to soft offline the affected page. Every CPU can return to what they were doing
before.
2) SRAR (Software recoverable action required)
These are synchronous. Starting with Skylake they may be signaled just to the thread
that hit the poison. Earlier generations broadcast.
2a) Hit in ring3 code ... we want to offline the page and SIGBUS the task(s)
2b) Memcpy_mcsafe() ... kernel has a recovery path. "Return" to the recovery code instead of to the original RIP.
2c) copy_from_user ... not implemented yet. We are in kernel, but would like to treat this like case 2a
3) Fatal
Always broadcast. Some bank has MCi_STATUS.PCC==1. System must be shutdown.
-Tony
> On Feb 19, 2020, at 2:33 PM, Luck, Tony <[email protected]> wrote:
>
>
>>
>> One big question here: are memory failure #MC exceptions synchronous
>> or can they be delayed? If we get a memory failure, is it possible
>> that the #MC hits some random context and not the actual context where
>> the error occurred?
>
> There are a few cases:
> 1) SRAO (Software recoverable action optional) [Patrol scrub or L3 cache eviction]
> These aren't synchronous with any core execution. Using machine check to signal
> was probably a mistake - compounded by it being broadcast :-( Could pick any CPU
> to handle (actually choose the first to arrive in do_machine_check()). That guy should
> arrange to soft offline the affected page. Every CPU can return to what they were doing
> before.
You could handle this by sending IPI-to-self and dealing with it in the interrupt handler. Or even wake a high-priority kthread or workqueue. irq_work may help. Relying on task_work or the non_atomic stuff seems silly - you can’t rely on anything about the interrupted context, and the context is more or less irrelevant anyway.
>
> 2) SRAR (Software recoverable action required)
> These are synchronous. Starting with Skylake they may be signaled just to the thread
> that hit the poison. Earlier generations broadcast.
Here’s where dealing with one that came from kernel code is just nasty, right?
I would argue that, if IF=0, killing the machine is reasonable. If IF=1, we should be okay. Actually making this work sanely is gross, and arguably the goal should be minimizing grossness.
Perhaps, if we came from kernel mode, we should IPI-to-self and use a special vector that is idtentry, not apicinterrupt. Or maybe even do this for entries from usermode just to keep everything consistent.
> 2a) Hit in ring3 code ... we want to offline the page and SIGBUS the task(s)
> 2b) Memcpy_mcsafe() ... kernel has a recovery path. "Return" to the recovery code instead of to the original RIP.
> 2c) copy_from_user ... not implemented yet. We are in kernel, but would like to treat this like case 2a
>
> 3) Fatal
> Always broadcast. Some bank has MCi_STATUS.PCC==1. System must be shutdown.
Easy :)
It would be really, really nice if NMI was masked in MCE context.
>
> -Tony
On Wed, Feb 19, 2020 at 02:12:13PM -0800, Andy Lutomirski wrote:
> On Wed, Feb 19, 2020 at 9:34 AM Peter Zijlstra <[email protected]> wrote:
> > Do you really want to create code that unwinds enough of nmi_enter() to
> > get you to a preemptible context? *shudder*
>
> Well, there's another way to approach this:
>
> void notrace nonothing do_machine_check(struct pt_regs *regs)
> {
> if (user_mode(regs))
> do_sane_machine_check(regs);
> else
> do_awful_machine_check(regs);
> }
>
> void do_sane_machine_check(regs)
> {
> nothing special here. just a regular exception, more or less.
> }
>
> void do_awful_macine_check(regs)
> {
> basically an NMI. No funny business, no recovery possible.
> task_work_add() not allowed.
> }
Right, that looks like major surgery to the current code though; I'd
much prefer someone that knows that code do that.
> I suppose the general consideration I'm trying to get at is: is
> task_work_add() actually useful at all here? For the case when a
> kernel thread does memcpy_mcsafe() or similar, task work registered
> using task_work_add() will never run.
task_work isn't at all useful when we didn't come from userspace. In
that case irq_work is the best option, but that doesn't provide a
preemptible context.