2021-01-31 21:35:28

by Andy Lutomirski

[permalink] [raw]
Subject: [PATCH 07/11] x86/fault: Split the OOPS code out from no_context()

Not all callers of no_context() want to run exception fixups.
Separate the OOPS code out from the fixup code in no_context().

Cc: Dave Hansen <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/mm/fault.c | 116 +++++++++++++++++++++++---------------------
1 file changed, 62 insertions(+), 54 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 1939e546beae..6f43d080e1e8 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -618,53 +618,20 @@ static void set_signal_archinfo(unsigned long address,
}

static noinline void
-no_context(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, int signal, int si_code)
+page_fault_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
- struct task_struct *tsk = current;
unsigned long flags;
int sig;

if (user_mode(regs)) {
/*
- * This is an implicit supervisor-mode access from user
- * mode. Bypass all the kernel-mode recovery code and just
- * OOPS.
+ * Implicit kernel access from user mode? Skip the stack
+ * overflow and EFI special cases.
*/
goto oops;
}

- /* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
- /*
- * Any interrupt that takes a fault gets the fixup. This makes
- * the below recursive fault logic only apply to a faults from
- * task context.
- */
- if (in_interrupt())
- return;
-
- /*
- * Per the above we're !in_interrupt(), aka. task context.
- *
- * In this case we need to make sure we're not recursively
- * faulting through the emulate_vsyscall() logic.
- */
- if (current->thread.sig_on_uaccess_err && signal) {
- sanitize_error_code(address, &error_code);
-
- set_signal_archinfo(address, error_code);
-
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
- }
-
- /*
- * Barring that, we can do the fixup and be happy.
- */
- return;
- }
-
#ifdef CONFIG_VMAP_STACK
/*
* Stack overflow? During boot, we can fault near the initial
@@ -672,8 +639,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
* that we're in vmalloc space to avoid this.
*/
if (is_vmalloc_addr((void *)address) &&
- (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
- address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+ (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
+ address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
/*
* We're likely to be running with very little stack space
@@ -696,20 +663,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
}
#endif

- /*
- * 32-bit:
- *
- * Valid to do another page fault here, because if this fault
- * had been triggered by is_prefetch fixup_exception would have
- * handled it.
- *
- * 64-bit:
- *
- * Hall of shame of CPU/BIOS bugs.
- */
- if (is_prefetch(regs, error_code, address))
- return;
-
/*
* Buggy firmware could access regions which might page fault, try to
* recover from such faults.
@@ -726,7 +679,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,

show_fault_oops(regs, error_code, address);

- if (task_stack_end_corrupted(tsk))
+ if (task_stack_end_corrupted(current))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

sig = SIGKILL;
@@ -739,6 +692,61 @@ no_context(struct pt_regs *regs, unsigned long error_code,
oops_end(flags, regs, sig);
}

+static noinline void
+no_context(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int signal, int si_code)
+{
+ if (user_mode(regs)) {
+ /*
+ * This is an implicit supervisor-mode access from user
+ * mode. Bypass all the kernel-mode recovery code and just
+ * OOPS.
+ */
+ goto oops;
+ }
+
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
+ /*
+ * Any interrupt that takes a fault gets the fixup. This makes
+ * the below recursive fault logic only apply to faults from
+ * task context.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
+ * Per the above we're !in_interrupt(), aka. task context.
+ *
+ * In this case we need to make sure we're not recursively
+ * faulting through the emulate_vsyscall() logic.
+ */
+ if (current->thread.sig_on_uaccess_err && signal) {
+ sanitize_error_code(address, &error_code);
+
+ set_signal_archinfo(address, error_code);
+
+ /* XXX: hwpoison faults will set the wrong code. */
+ force_sig_fault(signal, si_code, (void __user *)address);
+ }
+
+ /*
+ * Barring that, we can do the fixup and be happy.
+ */
+ return;
+ }
+
+ /*
+ * AMD erratum #91 manifests as a spurious page fault on a PREFETCH
+ * instruction.
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+oops:
+ page_fault_oops(regs, error_code, address);
+}
+
/*
* Print out info about fatal segfaults, if the show_unhandled_signals
* sysctl is set:
--
2.29.2


2021-02-03 18:58:30

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 07/11] x86/fault: Split the OOPS code out from no_context()

On Sun, Jan 31, 2021 at 09:24:38AM -0800, Andy Lutomirski wrote:
> Not all callers of no_context() want to run exception fixups.
> Separate the OOPS code out from the fixup code in no_context().
>
> Cc: Dave Hansen <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> arch/x86/mm/fault.c | 116 +++++++++++++++++++++++---------------------
> 1 file changed, 62 insertions(+), 54 deletions(-)
>
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 1939e546beae..6f43d080e1e8 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -618,53 +618,20 @@ static void set_signal_archinfo(unsigned long address,
> }
>
> static noinline void
> -no_context(struct pt_regs *regs, unsigned long error_code,
> - unsigned long address, int signal, int si_code)
> +page_fault_oops(struct pt_regs *regs, unsigned long error_code,

Not sure about this name - it still tries to recover:
efi_recover_from_page_fault().

Judging by where it is called, maybe no_context_tail() or
no_context_oops() or no_context_finish_me_already()...

Yah, I haz no better idea. :-\

...

> @@ -739,6 +692,61 @@ no_context(struct pt_regs *regs, unsigned long error_code,
> oops_end(flags, regs, sig);
> }
>
> +static noinline void
> +no_context(struct pt_regs *regs, unsigned long error_code,
> + unsigned long address, int signal, int si_code)
> +{
> + if (user_mode(regs)) {
> + /*
> + * This is an implicit supervisor-mode access from user
> + * mode. Bypass all the kernel-mode recovery code and just
> + * OOPS.
> + */
> + goto oops;

Just do

return page_fault_oops(...);

here and get rid of the label.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2021-02-03 19:34:45

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH 07/11] x86/fault: Split the OOPS code out from no_context()



> On Feb 3, 2021, at 10:56 AM, Borislav Petkov <[email protected]> wrote:
>
> On Sun, Jan 31, 2021 at 09:24:38AM -0800, Andy Lutomirski wrote:
>> Not all callers of no_context() want to run exception fixups.
>> Separate the OOPS code out from the fixup code in no_context().
>>
>> Cc: Dave Hansen <[email protected]>
>> Cc: Peter Zijlstra <[email protected]>
>> Signed-off-by: Andy Lutomirski <[email protected]>
>> ---
>> arch/x86/mm/fault.c | 116 +++++++++++++++++++++++---------------------
>> 1 file changed, 62 insertions(+), 54 deletions(-)
>>
>> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
>> index 1939e546beae..6f43d080e1e8 100644
>> --- a/arch/x86/mm/fault.c
>> +++ b/arch/x86/mm/fault.c
>> @@ -618,53 +618,20 @@ static void set_signal_archinfo(unsigned long address,
>> }
>>
>> static noinline void
>> -no_context(struct pt_regs *regs, unsigned long error_code,
>> - unsigned long address, int signal, int si_code)
>> +page_fault_oops(struct pt_regs *regs, unsigned long error_code,
>
> Not sure about this name - it still tries to recover:
> efi_recover_from_page_fault().
>

That function is a lie. It tries to keep the system alive but it doesn’t return if it “recovers”. Maybe I should add a comment?

> Judging by where it is called, maybe no_context_tail() or
> no_context_oops() or no_context_finish_me_already()...
>
> Yah, I haz no better idea. :-\
>
> ...
>
>> @@ -739,6 +692,61 @@ no_context(struct pt_regs *regs, unsigned long error_code,
>> oops_end(flags, regs, sig);
>> }
>>
>> +static noinline void
>> +no_context(struct pt_regs *regs, unsigned long error_code,
>> + unsigned long address, int signal, int si_code)
>> +{
>> + if (user_mode(regs)) {
>> + /*
>> + * This is an implicit supervisor-mode access from user
>> + * mode. Bypass all the kernel-mode recovery code and just
>> + * OOPS.
>> + */
>> + goto oops;
>
> Just do
>
> return page_fault_oops(...);
>
> here and get rid of the label.
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette

2021-02-03 19:48:39

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 07/11] x86/fault: Split the OOPS code out from no_context()

On Wed, Feb 03, 2021 at 11:29:39AM -0800, Andy Lutomirski wrote:
> That function is a lie. It tries to keep the system alive but it
> doesn’t return if it “recovers”. Maybe I should add a comment?

... or rename it?

As for the schedule() at the end of that function, I dunno if it does
anything, since we're going to oops anyway...

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2021-02-10 00:11:14

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH 07/11] x86/fault: Split the OOPS code out from no_context()

On Wed, Feb 3, 2021 at 10:56 AM Borislav Petkov <[email protected]> wrote:
>
> On Sun, Jan 31, 2021 at 09:24:38AM -0800, Andy Lutomirski wrote:
> > Not all callers of no_context() want to run exception fixups.
> > Separate the OOPS code out from the fixup code in no_context().
> >
> > Cc: Dave Hansen <[email protected]>
> > Cc: Peter Zijlstra <[email protected]>
> > Signed-off-by: Andy Lutomirski <[email protected]>
> > ---
> > arch/x86/mm/fault.c | 116 +++++++++++++++++++++++---------------------
> > 1 file changed, 62 insertions(+), 54 deletions(-)
> >
> > diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> > index 1939e546beae..6f43d080e1e8 100644
> > --- a/arch/x86/mm/fault.c
> > +++ b/arch/x86/mm/fault.c
> > @@ -618,53 +618,20 @@ static void set_signal_archinfo(unsigned long address,
> > }
> >
> > static noinline void
> > -no_context(struct pt_regs *regs, unsigned long error_code,
> > - unsigned long address, int signal, int si_code)
> > +page_fault_oops(struct pt_regs *regs, unsigned long error_code,
>
> Not sure about this name - it still tries to recover:
> efi_recover_from_page_fault().
>
> Judging by where it is called, maybe no_context_tail() or
> no_context_oops() or no_context_finish_me_already()...
>
> Yah, I haz no better idea. :-\
>
> ...
>
> > @@ -739,6 +692,61 @@ no_context(struct pt_regs *regs, unsigned long error_code,
> > oops_end(flags, regs, sig);
> > }
> >
> > +static noinline void
> > +no_context(struct pt_regs *regs, unsigned long error_code,
> > + unsigned long address, int signal, int si_code)
> > +{
> > + if (user_mode(regs)) {
> > + /*
> > + * This is an implicit supervisor-mode access from user
> > + * mode. Bypass all the kernel-mode recovery code and just
> > + * OOPS.
> > + */
> > + goto oops;
>
> Just do
>
> return page_fault_oops(...);
>
> here and get rid of the label.

I'm going to skip this one, because that code is deleted later in the
series, and fixing it here just adds more churn.