LinuxLists.cc - [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

2020-05-05 14:17:04

Subject: [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

From: Peter Zijlstra <[email protected]>

Convert #MC over to using task_work_add(); it will run the same code
slightly later, on the return to user path of the same exception.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Frederic Weisbecker <[email protected]>
---
arch/x86/kernel/cpu/mce/core.c | 56 ++++++++++++++++++++++-------------------
include/linux/sched.h | 6 ++++
2 files changed, 37 insertions(+), 25 deletions(-)

--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -42,6 +42,7 @@
#include <linux/export.h>
#include <linux/jump_label.h>
#include <linux/set_memory.h>
+#include <linux/task_work.h>

#include <asm/intel-family.h>
#include <asm/processor.h>
@@ -1086,23 +1087,6 @@ static void mce_clear_state(unsigned lon
}
}

-static int do_memory_failure(struct mce *m)
-{
- int flags = MF_ACTION_REQUIRED;
- int ret;
-
- pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
- if (!(m->mcgstatus & MCG_STATUS_RIPV))
- flags |= MF_MUST_KILL;
- ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
- if (ret)
- pr_err("Memory error not recovered");
- else
- set_mce_nospec(m->addr >> PAGE_SHIFT);
- return ret;
-}
-
-
/*
* Cases where we avoid rendezvous handler timeout:
* 1) If this CPU is offline.
@@ -1204,6 +1188,29 @@ static void __mc_scan_banks(struct mce *
*m = *final;
}

+static void kill_me_now(struct callback_head *ch)
+{
+ force_sig(SIGBUS);
+}
+
+static void kill_me_maybe(struct callback_head *cb)
+{
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ int flags = MF_ACTION_REQUIRED;
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
+ if (!(p->mce_status & MCG_STATUS_RIPV))
+ flags |= MF_MUST_KILL;
+
+ if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+ return;
+ }
+
+ pr_err("Memory error not recovered");
+ kill_me_now(cb);
+}
+
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
@@ -1222,7 +1229,7 @@ static void __mc_scan_banks(struct mce *
* backing the user stack, tracing that reads the user stack will cause
* potentially infinite recursion.
*/
-void notrace do_machine_check(struct pt_regs *regs, long error_code)
+void noinstr do_machine_check(struct pt_regs *regs, long error_code)
{
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
@@ -1354,13 +1361,13 @@ void notrace do_machine_check(struct pt_
if ((m.cs & 3) == 3) {
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- local_irq_enable();
- preempt_enable();

- if (kill_it || do_memory_failure(&m))
- force_sig(SIGBUS);
- preempt_disable();
- local_irq_disable();
+ current->mce_addr = m.addr;
+ current->mce_status = m.mcgstatus;
+ current->mce_kill_me.func = kill_me_maybe;
+ if (kill_it)
+ current->mce_kill_me.func = kill_me_now;
+ task_work_add(current, &current->mce_kill_me, true);
} else {
if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
mce_panic("Failed kernel mode recovery", &m, msg);
@@ -1370,7 +1377,6 @@ void notrace do_machine_check(struct pt_
ist_exit(regs);
}
EXPORT_SYMBOL_GPL(do_machine_check);
-NOKPROBE_SYMBOL(do_machine_check);

#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int flags)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1290,6 +1290,12 @@ struct task_struct {
unsigned long prev_lowest_stack;
#endif

+#ifdef CONFIG_X86_MCE
+ u64 mce_addr;
+ u64 mce_status;
+ struct callback_head mce_kill_me;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.

2020-05-07 18:05:51

by Andy Lutomirski

[permalink] [raw]

Subject: Re: [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

On Tue, May 5, 2020 at 7:13 AM Thomas Gleixner <[email protected]> wrote:
>
> From: Peter Zijlstra <[email protected]>
>
> Convert #MC over to using task_work_add(); it will run the same code
> slightly later, on the return to user path of the same exception.

I think this patch is correct, but I think it's only one small and not
that obviously wrong step away from being broken:

> if ((m.cs & 3) == 3) {
> /* If this triggers there is no way to recover. Die hard. */
> BUG_ON(!on_thread_stack() || !user_mode(regs));
> - local_irq_enable();
> - preempt_enable();
>
> - if (kill_it || do_memory_failure(&m))
> - force_sig(SIGBUS);
> - preempt_disable();
> - local_irq_disable();
> + current->mce_addr = m.addr;
> + current->mce_status = m.mcgstatus;
> + current->mce_kill_me.func = kill_me_maybe;
> + if (kill_it)
> + current->mce_kill_me.func = kill_me_now;
> + task_work_add(current, &current->mce_kill_me, true);

This is fine if the source was CPL3, but it's not going to work if CPL
was 0. We don't *currently* do this from CPL0, but people keep
wanting to. So perhaps there should be a comment like:

/*
* The #MC originated at CPL3, so we know that we will go execute the
* task_work before returning to the offending user code.
*/

IOW, if we want to recover from CPL0 #MC, we will need a different mechanism.

I also confess a certain amount of sadness that my beautiful
haha-not-really-atomic-here mechanism isn't being used anymore. :(

--Andy

2020-05-08 08:53:27

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

On Thu, May 07, 2020 at 11:02:09AM -0700, Andy Lutomirski wrote:
> On Tue, May 5, 2020 at 7:13 AM Thomas Gleixner <[email protected]> wrote:
> >
> > From: Peter Zijlstra <[email protected]>
> >
> > Convert #MC over to using task_work_add(); it will run the same code
> > slightly later, on the return to user path of the same exception.
>
> I think this patch is correct, but I think it's only one small and not
> that obviously wrong step away from being broken:
>
> > if ((m.cs & 3) == 3) {
> > /* If this triggers there is no way to recover. Die hard. */
> > BUG_ON(!on_thread_stack() || !user_mode(regs));
> > - local_irq_enable();
> > - preempt_enable();
> >
> > - if (kill_it || do_memory_failure(&m))
> > - force_sig(SIGBUS);
> > - preempt_disable();
> > - local_irq_disable();
> > + current->mce_addr = m.addr;
> > + current->mce_status = m.mcgstatus;
> > + current->mce_kill_me.func = kill_me_maybe;
> > + if (kill_it)
> > + current->mce_kill_me.func = kill_me_now;
> > + task_work_add(current, &current->mce_kill_me, true);
>
> This is fine if the source was CPL3, but it's not going to work if CPL
> was 0. We don't *currently* do this from CPL0, but people keep
> wanting to. So perhaps there should be a comment like:
>
> /*
> * The #MC originated at CPL3, so we know that we will go execute the
> * task_work before returning to the offending user code.
> */
>
> IOW, if we want to recover from CPL0 #MC, we will need a different mechanism.

See part4-18's IDTRENTRY_NOIST. That will get us a clear CPL3/CPL0
separation.

> I also confess a certain amount of sadness that my beautiful
> haha-not-really-atomic-here mechanism isn't being used anymore. :(

I think we have a subtly different interpretation of 'beautiful' here.

2020-05-08 21:32:28

by Andy Lutomirski

[permalink] [raw]

Subject: Re: [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

On Fri, May 8, 2020 at 1:48 AM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, May 07, 2020 at 11:02:09AM -0700, Andy Lutomirski wrote:
> > On Tue, May 5, 2020 at 7:13 AM Thomas Gleixner <[email protected]> wrote:
> > >
> > > From: Peter Zijlstra <[email protected]>
> > >
> > > Convert #MC over to using task_work_add(); it will run the same code
> > > slightly later, on the return to user path of the same exception.
> >
> > I think this patch is correct, but I think it's only one small and not
> > that obviously wrong step away from being broken:
> >
> > > if ((m.cs & 3) == 3) {
> > > /* If this triggers there is no way to recover. Die hard. */
> > > BUG_ON(!on_thread_stack() || !user_mode(regs));
> > > - local_irq_enable();
> > > - preempt_enable();
> > >
> > > - if (kill_it || do_memory_failure(&m))
> > > - force_sig(SIGBUS);
> > > - preempt_disable();
> > > - local_irq_disable();
> > > + current->mce_addr = m.addr;
> > > + current->mce_status = m.mcgstatus;
> > > + current->mce_kill_me.func = kill_me_maybe;
> > > + if (kill_it)
> > > + current->mce_kill_me.func = kill_me_now;
> > > + task_work_add(current, &current->mce_kill_me, true);
> >
> > This is fine if the source was CPL3, but it's not going to work if CPL
> > was 0. We don't *currently* do this from CPL0, but people keep
> > wanting to. So perhaps there should be a comment like:
> >
> > /*
> > * The #MC originated at CPL3, so we know that we will go execute the
> > * task_work before returning to the offending user code.
> > */
> >
> > IOW, if we want to recover from CPL0 #MC, we will need a different mechanism.
>
> See part4-18's IDTRENTRY_NOIST. That will get us a clear CPL3/CPL0
> separation.

I will hold my breath.

>
> > I also confess a certain amount of sadness that my beautiful
> > haha-not-really-atomic-here mechanism isn't being used anymore. :(
>
> I think we have a subtly different interpretation of 'beautiful' here.

Beauty is in the eye of the beholder. And sometimes in the eye of the
person who wrote the code :)

2020-05-13 23:46:11

by Mathieu Desnoyers

[permalink] [raw]

Subject: Re: [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

----- On May 5, 2020, at 9:16 AM, Thomas Gleixner [email protected] wrote:

> From: Peter Zijlstra <[email protected]>
>

Patch title: singal -> signal.

> Convert #MC over to using task_work_add(); it will run the same code
> slightly later, on the return to user path of the same exception.

So I suspect that switching the order between tracehook_notify_resume()
(which ends up calling task_work_run()) and do_signal() done by an
earlier patch in this series intends to ensure the information about the
instruction pointer causing the #MC is not overwritten by do_signal()
(but I'm just guessing).

If it's the case, I think it should be clearly stated as the intent of the
earlier patch.

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

2020-05-14 14:19:14

by Borislav Petkov

[permalink] [raw]

Subject: Re: [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

+ Tony.

On Tue, May 05, 2020 at 03:16:31PM +0200, Thomas Gleixner wrote:
> From: Peter Zijlstra <[email protected]>
>
> Convert #MC over to using task_work_add(); it will run the same code
> slightly later, on the return to user path of the same exception.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>
> Reviewed-by: Frederic Weisbecker <[email protected]>
> ---
> arch/x86/kernel/cpu/mce/core.c | 56 ++++++++++++++++++++++-------------------
> include/linux/sched.h | 6 ++++
> 2 files changed, 37 insertions(+), 25 deletions(-)

I like this:

Reviewed-by: Borislav Petkov <[email protected]>

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2020-05-14 14:20:21

by Borislav Petkov

[permalink] [raw]

Subject: Re: [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

On Thu, May 07, 2020 at 11:02:09AM -0700, Andy Lutomirski wrote:
> IOW, if we want to recover from CPL0 #MC, we will need a different mechanism.

Recovering from CPL0 #MC is mostly doomed to failure. Except this mcsafe
crap with the exception handling:

/*
* Handle an MCE which has happened in kernel space but from
* which the kernel can recover: ex_has_fault_handler() has
* already verified that the rIP at which the error happened is
* a rIP from which the kernel can recover (by jumping to
* recovery code specified in _ASM_EXTABLE_FAULT()) and the
* corresponding exception handler which would do that is the
* proper one.
*/
if (m.kflags & MCE_IN_KERNEL_RECOV) {
if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
mce_panic("Failed kernel mode recovery", &m, msg);

Other than that, we iz done.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2020-05-14 16:08:55

by Mathieu Desnoyers

[permalink] [raw]

Subject: Re: [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

----- On May 14, 2020, at 10:17 AM, Borislav Petkov [email protected] wrote:

> + Tony.
>
> On Tue, May 05, 2020 at 03:16:31PM +0200, Thomas Gleixner wrote:
>> From: Peter Zijlstra <[email protected]>
>>
>> Convert #MC over to using task_work_add(); it will run the same code
>> slightly later, on the return to user path of the same exception.
>>
>> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>> Signed-off-by: Thomas Gleixner <[email protected]>
>> Reviewed-by: Frederic Weisbecker <[email protected]>
>> ---
>> arch/x86/kernel/cpu/mce/core.c | 56 ++++++++++++++++++++++-------------------
>> include/linux/sched.h | 6 ++++
>> 2 files changed, 37 insertions(+), 25 deletions(-)
>
> I like this:
>
> Reviewed-by: Borislav Petkov <[email protected]>

What I am not fully grasping here is whether this patch preserves the instruction
pointer (and possibly other relevant information for siginfo_t) triggering the
exception in a scenario where we have:

- #MC triggered, queuing task work,
- unrelated signal happens to be delivered to task,
- exit to usermode loop handles do_signal first,
- then it runs task work.

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

2020-05-14 16:21:40

by Andy Lutomirski

[permalink] [raw]

Subject: Re: [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

> On May 14, 2020, at 9:03 AM, Mathieu Desnoyers <[email protected]> wrote:
>
> ----- On May 14, 2020, at 10:17 AM, Borislav Petkov [email protected] wrote:
>
>> + Tony.
>>
>>> On Tue, May 05, 2020 at 03:16:31PM +0200, Thomas Gleixner wrote:
>>> From: Peter Zijlstra <[email protected]>
>>>
>>> Convert #MC over to using task_work_add(); it will run the same code
>>> slightly later, on the return to user path of the same exception.
>>>
>>> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>>> Signed-off-by: Thomas Gleixner <[email protected]>
>>> Reviewed-by: Frederic Weisbecker <[email protected]>
>>> ---
>>> arch/x86/kernel/cpu/mce/core.c | 56 ++++++++++++++++++++++-------------------
>>> include/linux/sched.h | 6 ++++
>>> 2 files changed, 37 insertions(+), 25 deletions(-)
>>
>> I like this:
>>
>> Reviewed-by: Borislav Petkov <[email protected]>
>
> What I am not fully grasping here is whether this patch preserves the instruction
> pointer (and possibly other relevant information for siginfo_t) triggering the
> exception in a scenario where we have:
>
> - #MC triggered, queuing task work,
> - unrelated signal happens to be delivered to task,
> - exit to usermode loop handles do_signal first,
> - then it runs task work.

If anyone wants to ponder this, I suspect that we have lots of delightful bugs in our handling of cr2, trapnr, and error_code in signals. We should move them to the sigcontext, at least in kernel, and fix up ucontext when we deliver the signal. The current code can’t possibly be correct.

2020-05-14 16:41:40

by Borislav Petkov

[permalink] [raw]

Subject: Re: [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

On Thu, May 14, 2020 at 12:03:30PM -0400, Mathieu Desnoyers wrote:
> - #MC triggered, queuing task work,
> - unrelated signal happens to be delivered to task,
> - exit to usermode loop handles do_signal first,
> - then it runs task work.

How can that even happen?

exit_to_usermode_loop->do_signal->get_signal and that does:

if (unlikely(current->task_works))
task_work_run();

at the top.

So the task work will always run before the signal handler.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2020-05-14 17:07:14

by Mathieu Desnoyers

[permalink] [raw]

Subject: Re: [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

----- On May 14, 2020, at 12:39 PM, Borislav Petkov [email protected] wrote:

> On Thu, May 14, 2020 at 12:03:30PM -0400, Mathieu Desnoyers wrote:
>> - #MC triggered, queuing task work,
>> - unrelated signal happens to be delivered to task,
>> - exit to usermode loop handles do_signal first,
>> - then it runs task work.
>
> How can that even happen?
>
> exit_to_usermode_loop->do_signal->get_signal and that does:
>
> if (unlikely(current->task_works))
> task_work_run();
>
> at the top.
>
> So the task work will always run before the signal handler.

OK yes, never mind. I focused on its invocation from tracehook_notify_resume
and missed this invocation in do_signal. My bad.

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

2020-05-14 17:40:56

by Thomas Gleixner

[permalink] [raw]

Subject: Re: [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

Mathieu Desnoyers <[email protected]> writes:
> ----- On May 5, 2020, at 9:16 AM, Thomas Gleixner [email protected] wrote:
>
>> From: Peter Zijlstra <[email protected]>
>>
>
> Patch title: singal -> signal.
>
>> Convert #MC over to using task_work_add(); it will run the same code
>> slightly later, on the return to user path of the same exception.
>
> So I suspect that switching the order between tracehook_notify_resume()
> (which ends up calling task_work_run()) and do_signal() done by an
> earlier patch in this series intends to ensure the information about the
> instruction pointer causing the #MC is not overwritten by do_signal()
> (but I'm just guessing).

No, it does not. See the ordering discussion.

Aside from that, the signal never transported any address information. It uses
force_sig(SIGBUS).

Even if a different signal were sent first, the register frame
of the #MC is still there when the fatal signal is sent later.

But even w/o changing the ordering the taskwork check in do_signal()
runs the pending work before delivering anything.

Thanks,

tglx

2020-05-14 17:46:37

by Mathieu Desnoyers

[permalink] [raw]

Subject: Re: [patch V4 part 1 29/36] x86/mce: Send #MC singal from task work

----- On May 14, 2020, at 1:38 PM, Thomas Gleixner [email protected] wrote:

> Mathieu Desnoyers <[email protected]> writes:
>> ----- On May 5, 2020, at 9:16 AM, Thomas Gleixner [email protected] wrote:
>>
>>> From: Peter Zijlstra <[email protected]>
>>>
>>
>> Patch title: singal -> signal.
>>
>>> Convert #MC over to using task_work_add(); it will run the same code
>>> slightly later, on the return to user path of the same exception.
>>
>> So I suspect that switching the order between tracehook_notify_resume()
>> (which ends up calling task_work_run()) and do_signal() done by an
>> earlier patch in this series intends to ensure the information about the
>> instruction pointer causing the #MC is not overwritten by do_signal()
>> (but I'm just guessing).
>
> No, it does not. See the ordering discussion.
>
> Aside from that, the signal never transported any address information. It uses
> force_sig(SIGBUS).
>
> Even if a different signal were sent first, the register frame
> of the #MC is still there when the fatal signal is sent later.
>
> But even w/o changing the ordering the taskwork check in do_signal()
> runs the pending work before delivering anything.

Yep, that was the key thing I missed,

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

2020-05-19 19:54:48

by tip-bot2 for Peter Zijlstra

[permalink] [raw]

Subject: [tip: core/rcu] x86/mce: Send #MC singal from task work

The following commit has been merged into the core/rcu branch of tip:

Commit-ID: 5567d11c21a1d508a91a8cb64a819783a0835d9f
Gitweb: https://git.kernel.org/tip/5567d11c21a1d508a91a8cb64a819783a0835d9f
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 19 Feb 2020 10:22:06 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 19 May 2020 15:51:19 +02:00

x86/mce: Send #MC singal from task work

Convert #MC over to using task_work_add(); it will run the same code
slightly later, on the return to user path of the same exception.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Frederic Weisbecker <[email protected]>
Reviewed-by: Alexandre Chartre <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]

---
arch/x86/kernel/cpu/mce/core.c | 56 ++++++++++++++++++---------------
include/linux/sched.h | 6 ++++-
2 files changed, 37 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 98bf91c..2f0ef95 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -42,6 +42,7 @@
#include <linux/export.h>
#include <linux/jump_label.h>
#include <linux/set_memory.h>
+#include <linux/task_work.h>

#include <asm/intel-family.h>
#include <asm/processor.h>
@@ -1086,23 +1087,6 @@ static void mce_clear_state(unsigned long *toclear)
}
}

-static int do_memory_failure(struct mce *m)
-{
- int flags = MF_ACTION_REQUIRED;
- int ret;
-
- pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
- if (!(m->mcgstatus & MCG_STATUS_RIPV))
- flags |= MF_MUST_KILL;
- ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
- if (ret)
- pr_err("Memory error not recovered");
- else
- set_mce_nospec(m->addr >> PAGE_SHIFT);
- return ret;
-}
-
-
/*
* Cases where we avoid rendezvous handler timeout:
* 1) If this CPU is offline.
@@ -1204,6 +1188,29 @@ static void __mc_scan_banks(struct mce *m, struct mce *final,
*m = *final;
}

+static void kill_me_now(struct callback_head *ch)
+{
+ force_sig(SIGBUS);
+}
+
+static void kill_me_maybe(struct callback_head *cb)
+{
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ int flags = MF_ACTION_REQUIRED;
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
+ if (!(p->mce_status & MCG_STATUS_RIPV))
+ flags |= MF_MUST_KILL;
+
+ if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+ return;
+ }
+
+ pr_err("Memory error not recovered");
+ kill_me_now(cb);
+}
+
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
@@ -1222,7 +1229,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final,
* backing the user stack, tracing that reads the user stack will cause
* potentially infinite recursion.
*/
-void notrace do_machine_check(struct pt_regs *regs, long error_code)
+void noinstr do_machine_check(struct pt_regs *regs, long error_code)
{
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
@@ -1354,13 +1361,13 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code)
if ((m.cs & 3) == 3) {
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- local_irq_enable();
- preempt_enable();

- if (kill_it || do_memory_failure(&m))
- force_sig(SIGBUS);
- preempt_disable();
- local_irq_disable();
+ current->mce_addr = m.addr;
+ current->mce_status = m.mcgstatus;
+ current->mce_kill_me.func = kill_me_maybe;
+ if (kill_it)
+ current->mce_kill_me.func = kill_me_now;
+ task_work_add(current, &current->mce_kill_me, true);
} else {
if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
mce_panic("Failed kernel mode recovery", &m, msg);
@@ -1370,7 +1377,6 @@ out_ist:
ist_exit(regs);
}
EXPORT_SYMBOL_GPL(do_machine_check);
-NOKPROBE_SYMBOL(do_machine_check);

#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int flags)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9437b53..57d0ed0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1297,6 +1297,12 @@ struct task_struct {
unsigned long prev_lowest_stack;
#endif

+#ifdef CONFIG_X86_MCE
+ u64 mce_addr;
+ u64 mce_status;
+ struct callback_head mce_kill_me;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.