Memory accesses to non-canonical addresses are a frequent cause of #GP
exceptions. Unlike #PF, #GP doesn't come with a fault address in CR2, so
the kernel doesn't currently print the fault address for #GP.
Luckily, we already have the necessary infrastructure for decoding X86
instructions and computing the memory address that is being accessed;
hook it up to the #GP handler so that we can figure out whether the #GP
looks like it was caused by a non-canonical address, and if so, print
that address.
While it is already possible to compute the faulting address manually by
disassembling the opcode dump and evaluating the instruction against the
register dump, this should make it slightly easier to identify crashes
at a glance.
Signed-off-by: Jann Horn <[email protected]>
---
Notes:
v2:
- print different message for segment-related GP (Borislav)
- rewrite check for non-canonical address (Sean)
- make it clear we don't know for sure why the GP happened (Andy)
arch/x86/kernel/traps.c | 45 +++++++++++++++++++++++++++++++++++++++--
1 file changed, 43 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c90312146da0..12d42697a18e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -56,6 +56,8 @@
#include <asm/mpx.h>
#include <asm/vm86.h>
#include <asm/umip.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -509,6 +511,38 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
}
+/*
+ * On 64-bit, if an uncaught #GP occurs while dereferencing a non-canonical
+ * address, print that address.
+ */
+static void print_kernel_gp_address(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_64
+ u8 insn_bytes[MAX_INSN_SIZE];
+ struct insn insn;
+ unsigned long addr_ref;
+
+ if (probe_kernel_read(insn_bytes, (void *)regs->ip, MAX_INSN_SIZE))
+ return;
+
+ kernel_insn_init(&insn, insn_bytes, MAX_INSN_SIZE);
+ insn_get_modrm(&insn);
+ insn_get_sib(&insn);
+ addr_ref = (unsigned long)insn_get_addr_ref(&insn, regs);
+
+ /* Bail out if insn_get_addr_ref() failed or we got a kernel address. */
+ if (addr_ref >= ~__VIRTUAL_MASK)
+ return;
+
+ /* Bail out if the entire operand is in the canonical user half. */
+ if (addr_ref + insn.opnd_bytes - 1 <= __VIRTUAL_MASK)
+ return;
+
+ pr_alert("probably dereferencing non-canonical address 0x%016lx\n",
+ addr_ref);
+#endif
+}
+
dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
@@ -547,8 +581,15 @@ do_general_protection(struct pt_regs *regs, long error_code)
return;
if (notify_die(DIE_GPF, desc, regs, error_code,
- X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
- die(desc, regs, error_code);
+ X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
+ return;
+
+ if (error_code)
+ pr_alert("GPF is segment-related (see error code)\n");
+ else
+ print_kernel_gp_address(regs);
+
+ die(desc, regs, error_code);
return;
}
--
2.24.0.432.g9d3f5f5b63-goog
On Fri, Nov 15, 2019 at 08:17:27PM +0100, Jann Horn wrote:
> dotraplinkage void
> do_general_protection(struct pt_regs *regs, long error_code)
> {
> @@ -547,8 +581,15 @@ do_general_protection(struct pt_regs *regs, long error_code)
> return;
>
> if (notify_die(DIE_GPF, desc, regs, error_code,
> - X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
> - die(desc, regs, error_code);
> + X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
> + return;
> +
> + if (error_code)
> + pr_alert("GPF is segment-related (see error code)\n");
> + else
> + print_kernel_gp_address(regs);
> +
> + die(desc, regs, error_code);
Right, this way, those messages appear before the main "general
protection ..." message:
[ 2.434372] traps: probably dereferencing non-canonical address 0xdfff000000000001
[ 2.442492] general protection fault: 0000 [#1] PREEMPT SMP
Can we glue/merge them together? Or is this going to confuse tools too much:
[ 2.542218] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
(and that sentence could be shorter too:
"general protection fault for non-canonical address 0xdfff000000000001"
looks ok to me too.)
Here's a dirty diff together with a reproducer on top of yours:
---
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bf796f8c9998..dab702ba28a6 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -515,7 +515,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
* On 64-bit, if an uncaught #GP occurs while dereferencing a non-canonical
* address, print that address.
*/
-static void print_kernel_gp_address(struct pt_regs *regs)
+static unsigned long get_kernel_gp_address(struct pt_regs *regs)
{
#ifdef CONFIG_X86_64
u8 insn_bytes[MAX_INSN_SIZE];
@@ -523,7 +523,7 @@ static void print_kernel_gp_address(struct pt_regs *regs)
unsigned long addr_ref;
if (probe_kernel_read(insn_bytes, (void *)regs->ip, MAX_INSN_SIZE))
- return;
+ return 0;
kernel_insn_init(&insn, insn_bytes, MAX_INSN_SIZE);
insn_get_modrm(&insn);
@@ -532,22 +532,22 @@ static void print_kernel_gp_address(struct pt_regs *regs)
/* Bail out if insn_get_addr_ref() failed or we got a kernel address. */
if (addr_ref >= ~__VIRTUAL_MASK)
- return;
+ return 0;
/* Bail out if the entire operand is in the canonical user half. */
if (addr_ref + insn.opnd_bytes - 1 <= __VIRTUAL_MASK)
- return;
+ return 0;
- pr_alert("probably dereferencing non-canonical address 0x%016lx\n",
- addr_ref);
+ return addr_ref;
#endif
}
+#define GPFSTR "general protection fault"
dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
- const char *desc = "general protection fault";
struct task_struct *tsk;
+ char desc[90];
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
cond_local_irq_enable(regs);
@@ -584,12 +584,18 @@ do_general_protection(struct pt_regs *regs, long error_code)
X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
return;
- if (error_code)
- pr_alert("GPF is segment-related (see error code)\n");
- else
- print_kernel_gp_address(regs);
+ if (error_code) {
+ snprintf(desc, 90, "segment-related " GPFSTR);
+ } else {
+ unsigned long addr_ref = get_kernel_gp_address(regs);
+
+ if (addr_ref)
+ snprintf(desc, 90, GPFSTR " while derefing a non-canonical address 0x%lx", addr_ref);
+ else
+ snprintf(desc, 90, GPFSTR);
+ }
- die(desc, regs, error_code);
+ die((const char *)desc, regs, error_code);
return;
}
diff --git a/init/main.c b/init/main.c
index 91f6ebb30ef0..7acc7e660be9 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1124,6 +1124,9 @@ static int __ref kernel_init(void *unused)
rcu_end_inkernel_boot();
+ asm volatile("mov $0xdfff000000000001, %rax\n\t"
+ "jmpq *%rax\n\t");
+
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Mon, Nov 18, 2019 at 3:21 PM Borislav Petkov <[email protected]> wrote:
>
> On Fri, Nov 15, 2019 at 08:17:27PM +0100, Jann Horn wrote:
> > dotraplinkage void
> > do_general_protection(struct pt_regs *regs, long error_code)
> > {
> > @@ -547,8 +581,15 @@ do_general_protection(struct pt_regs *regs, long error_code)
> > return;
> >
> > if (notify_die(DIE_GPF, desc, regs, error_code,
> > - X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
> > - die(desc, regs, error_code);
> > + X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
> > + return;
> > +
> > + if (error_code)
> > + pr_alert("GPF is segment-related (see error code)\n");
> > + else
> > + print_kernel_gp_address(regs);
> > +
> > + die(desc, regs, error_code);
>
> Right, this way, those messages appear before the main "general
> protection ..." message:
>
> [ 2.434372] traps: probably dereferencing non-canonical address 0xdfff000000000001
> [ 2.442492] general protection fault: 0000 [#1] PREEMPT SMP
>
> Can we glue/merge them together? Or is this going to confuse tools too much:
>
> [ 2.542218] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
>
> (and that sentence could be shorter too:
>
> "general protection fault for non-canonical address 0xdfff000000000001"
>
> looks ok to me too.)
This exact form will confuse syzkaller crash parsing for Linux kernel:
https://github.com/google/syzkaller/blob/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/linux.go#L1347
It expects a "general protection fault:" line for these crashes.
A graceful way to update kernel crash messages would be to add more
tests with the new format here:
https://github.com/google/syzkaller/tree/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/testdata/linux/report
Update parsing code. Roll out new version. Update all other testing
systems that detect and parse kernel crashes. Then commit kernel
changes.
An unfortunate consequence of offloading testing to third-party systems...
> Here's a dirty diff together with a reproducer ontop of yours:
>
> ---
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index bf796f8c9998..dab702ba28a6 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -515,7 +515,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
> * On 64-bit, if an uncaught #GP occurs while dereferencing a non-canonical
> * address, print that address.
> */
> -static void print_kernel_gp_address(struct pt_regs *regs)
> +static unsigned long get_kernel_gp_address(struct pt_regs *regs)
> {
> #ifdef CONFIG_X86_64
> u8 insn_bytes[MAX_INSN_SIZE];
> @@ -523,7 +523,7 @@ static void print_kernel_gp_address(struct pt_regs *regs)
> unsigned long addr_ref;
>
> if (probe_kernel_read(insn_bytes, (void *)regs->ip, MAX_INSN_SIZE))
> - return;
> + return 0;
>
> kernel_insn_init(&insn, insn_bytes, MAX_INSN_SIZE);
> insn_get_modrm(&insn);
> @@ -532,22 +532,22 @@ static void print_kernel_gp_address(struct pt_regs *regs)
>
> /* Bail out if insn_get_addr_ref() failed or we got a kernel address. */
> if (addr_ref >= ~__VIRTUAL_MASK)
> - return;
> + return 0;
>
> /* Bail out if the entire operand is in the canonical user half. */
> if (addr_ref + insn.opnd_bytes - 1 <= __VIRTUAL_MASK)
> - return;
> + return 0;
>
> - pr_alert("probably dereferencing non-canonical address 0x%016lx\n",
> - addr_ref);
> + return addr_ref;
> #endif
> }
>
> +#define GPFSTR "general protection fault"
> dotraplinkage void
> do_general_protection(struct pt_regs *regs, long error_code)
> {
> - const char *desc = "general protection fault";
> struct task_struct *tsk;
> + char desc[90];
>
> RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> cond_local_irq_enable(regs);
> @@ -584,12 +584,18 @@ do_general_protection(struct pt_regs *regs, long error_code)
> X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
> return;
>
> - if (error_code)
> - pr_alert("GPF is segment-related (see error code)\n");
> - else
> - print_kernel_gp_address(regs);
> + if (error_code) {
> + snprintf(desc, 90, "segment-related " GPFSTR);
> + } else {
> + unsigned long addr_ref = get_kernel_gp_address(regs);
> +
> + if (addr_ref)
> + snprintf(desc, 90, GPFSTR " while derefing a non-canonical address 0x%lx", addr_ref);
> + else
> + snprintf(desc, 90, GPFSTR);
> + }
>
> - die(desc, regs, error_code);
> + die((const char *)desc, regs, error_code);
> return;
> }
>
> diff --git a/init/main.c b/init/main.c
> index 91f6ebb30ef0..7acc7e660be9 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -1124,6 +1124,9 @@ static int __ref kernel_init(void *unused)
>
> rcu_end_inkernel_boot();
>
> + asm volatile("mov $0xdfff000000000001, %rax\n\t"
> + "jmpq *%rax\n\t");
> +
> if (ramdisk_execute_command) {
> ret = run_init_process(ramdisk_execute_command);
> if (!ret)
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
>
> --
> You received this message because you are subscribed to the Google Groups "kasan-dev" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
> To view this discussion on the web visit https://groups.google.com/d/msgid/kasan-dev/20191118142144.GC6363%40zn.tnic.
On Mon, Nov 18, 2019 at 5:03 PM Dmitry Vyukov <[email protected]> wrote:
> On Mon, Nov 18, 2019 at 3:21 PM Borislav Petkov <[email protected]> wrote:
> >
> > On Fri, Nov 15, 2019 at 08:17:27PM +0100, Jann Horn wrote:
> > > dotraplinkage void
> > > do_general_protection(struct pt_regs *regs, long error_code)
> > > {
> > > @@ -547,8 +581,15 @@ do_general_protection(struct pt_regs *regs, long error_code)
> > > return;
> > >
> > > if (notify_die(DIE_GPF, desc, regs, error_code,
> > > - X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
> > > - die(desc, regs, error_code);
> > > + X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
> > > + return;
> > > +
> > > + if (error_code)
> > > + pr_alert("GPF is segment-related (see error code)\n");
> > > + else
> > > + print_kernel_gp_address(regs);
> > > +
> > > + die(desc, regs, error_code);
> >
> > Right, this way, those messages appear before the main "general
> > protection ..." message:
> >
> > [ 2.434372] traps: probably dereferencing non-canonical address 0xdfff000000000001
> > [ 2.442492] general protection fault: 0000 [#1] PREEMPT SMP
> >
> > Can we glue/merge them together? Or is this going to confuse tools too much:
> >
> > [ 2.542218] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
> >
> > (and that sentence could be shorter too:
> >
> > "general protection fault for non-canonical address 0xdfff000000000001"
> >
> > looks ok to me too.)
>
> This exact form will confuse syzkaller crash parsing for Linux kernel:
> https://github.com/google/syzkaller/blob/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/linux.go#L1347
> It expects a "general protection fault:" line for these crashes.
>
> A graceful way to update kernel crash messages would be to add more
> tests with the new format here:
> https://github.com/google/syzkaller/tree/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/testdata/linux/report
> Update parsing code. Roll out new version. Update all other testing
> systems that detect and parse kernel crashes. Then commit kernel
> changes.
So for syzkaller, it'd be fine as long as we keep the colon there?
Something like:
general protection fault: derefing non-canonical address
0xdfff000000000001: 0000 [#1] PREEMPT SMP
And it looks like the 0day test bot doesn't have any specific pattern
for #GP, it seems to just look for the panic triggered by
panic-on-oops as far as I can tell (oops=panic in lkp-exec/qemu, no
"general protection fault" in etc/dmesg-kill-pattern).
> An unfortunate consequence of offloading testing to third-party systems...
And of not having a standard way to signal "this line starts something
that should be reported as a bug"? Maybe as a longer-term idea, it'd
help to have some sort of extra prefix byte that the kernel can print
to say "here comes a bug report, first line should be the subject", or
something like that, similar to how we have loglevels...
On Mon, Nov 18, 2019 at 5:20 PM 'Jann Horn' via kasan-dev
<[email protected]> wrote:
>
> On Mon, Nov 18, 2019 at 5:03 PM Dmitry Vyukov <[email protected]> wrote:
> > On Mon, Nov 18, 2019 at 3:21 PM Borislav Petkov <[email protected]> wrote:
> > >
> > > On Fri, Nov 15, 2019 at 08:17:27PM +0100, Jann Horn wrote:
> > > > dotraplinkage void
> > > > do_general_protection(struct pt_regs *regs, long error_code)
> > > > {
> > > > @@ -547,8 +581,15 @@ do_general_protection(struct pt_regs *regs, long error_code)
> > > > return;
> > > >
> > > > if (notify_die(DIE_GPF, desc, regs, error_code,
> > > > - X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
> > > > - die(desc, regs, error_code);
> > > > + X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
> > > > + return;
> > > > +
> > > > + if (error_code)
> > > > + pr_alert("GPF is segment-related (see error code)\n");
> > > > + else
> > > > + print_kernel_gp_address(regs);
> > > > +
> > > > + die(desc, regs, error_code);
> > >
> > > Right, this way, those messages appear before the main "general
> > > protection ..." message:
> > >
> > > [ 2.434372] traps: probably dereferencing non-canonical address 0xdfff000000000001
> > > [ 2.442492] general protection fault: 0000 [#1] PREEMPT SMP
> > >
> > > Can we glue/merge them together? Or is this going to confuse tools too much:
> > >
> > > [ 2.542218] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
> > >
> > > (and that sentence could be shorter too:
> > >
> > > "general protection fault for non-canonical address 0xdfff000000000001"
> > >
> > > looks ok to me too.)
> >
> > This exact form will confuse syzkaller crash parsing for Linux kernel:
> > https://github.com/google/syzkaller/blob/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/linux.go#L1347
> > It expects a "general protection fault:" line for these crashes.
> >
> > A graceful way to update kernel crash messages would be to add more
> > tests with the new format here:
> > https://github.com/google/syzkaller/tree/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/testdata/linux/report
> > Update parsing code. Roll out new version. Update all other testing
> > systems that detect and parse kernel crashes. Then commit kernel
> > changes.
>
> So for syzkaller, it'd be fine as long as we keep the colon there?
> Something like:
>
> general protection fault: derefing non-canonical address
> 0xdfff000000000001: 0000 [#1] PREEMPT SMP
Probably. Tests help a lot to answer such questions ;) But presumably
it should not break parsing.
> And it looks like the 0day test bot doesn't have any specific pattern
> for #GP, it seems to just look for the panic triggered by
> panic-on-oops as far as I can tell (oops=panic in lkp-exec/qemu, no
> "general protection fault" in etc/dmesg-kill-pattern).
>
> > An unfortunate consequence of offloading testing to third-party systems...
>
> And of not having a standard way to signal "this line starts something
> that should be reported as a bug"? Maybe as a longer-term idea, it'd
> help to have some sort of extra prefix byte that the kernel can print
> to say "here comes a bug report, first line should be the subject", or
> something like that, similar to how we have loglevels...
This would be great.
Also a way to denote crash end.
However we have lots of special logic for subjects, not sure if kernel
could provide good subject:
https://github.com/google/syzkaller/blob/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/linux.go#L537-L1588
Probably it could, but it won't be completely trivial. E.g. if there
is a stall inside of a timer function, it should give the name of the
actual timer callback as identity ("stall in timer_subsystem_foo"). Or
for syscalls we use more disambiguation b/c "in sys_ioctl" is not much
different than saying "there is a bug in kernel" :)
On Mon, Nov 18, 2019 at 5:29 PM Dmitry Vyukov <[email protected]> wrote:
> On Mon, Nov 18, 2019 at 5:20 PM 'Jann Horn' via kasan-dev
> <[email protected]> wrote:
> > On Mon, Nov 18, 2019 at 5:03 PM Dmitry Vyukov <[email protected]> wrote:
> > > This exact form will confuse syzkaller crash parsing for Linux kernel:
> > > https://github.com/google/syzkaller/blob/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/linux.go#L1347
> > > It expects a "general protection fault:" line for these crashes.
> > >
> > > A graceful way to update kernel crash messages would be to add more
> > > tests with the new format here:
> > > https://github.com/google/syzkaller/tree/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/testdata/linux/report
> > > Update parsing code. Roll out new version. Update all other testing
> > > systems that detect and parse kernel crashes. Then commit kernel
> > > changes.
[...]
> > > An unfortunate consequence of offloading testing to third-party systems...
> >
> > And of not having a standard way to signal "this line starts something
> > that should be reported as a bug"? Maybe as a longer-term idea, it'd
> > help to have some sort of extra prefix byte that the kernel can print
> > to say "here comes a bug report, first line should be the subject", or
> > something like that, similar to how we have loglevels...
>
> This would be great.
> Also a way to denote crash end.
> However we have lots of special logic for subjects, not sure if kernel
> could provide good subject:
> https://github.com/google/syzkaller/blob/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/linux.go#L537-L1588
> Probably it could, but it won't be completely trivial. E.g. if there
> is a stall inside of a timer function, it should give the name of the
> actual timer callback as identity ("stall in timer_subsystem_foo"). Or
> for syscalls we use more disambiguation b/c "in sys_ioclt" is not much
> different than saying "there is a bug in kernel" :)
Maybe I'm overthinking things, and maybe this is too much effort
relative to the benefit it brings, but here's a crazy idea:
For the specific case of stalls, it might help if the kernel could put
markers on the stack on the first stall warning (e.g. assuming that
ORC is enabled, by walking the stack and replacing all saved
instruction pointers with a pointer to some magic trampoline that
jumps back to the original caller using some sort of shadow stack),
then wait a few seconds, and then check how far on the stack the
markers have been cleared. Then hopefully you'd know exactly in which
function you're looping.
On Mon, Nov 18, 2019 at 05:29:42PM +0100, Dmitry Vyukov wrote:
> > And of not having a standard way to signal "this line starts something
> > that should be reported as a bug"? Maybe as a longer-term idea, it'd
> > help to have some sort of extra prefix byte that the kernel can print
> > to say "here comes a bug report, first line should be the subject", or
> > something like that, similar to how we have loglevels...
>
> This would be great.
> Also a way to denote crash end.
> However we have lots of special logic for subjects, not sure if kernel
> could provide good subject:
> https://github.com/google/syzkaller/blob/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/linux.go#L537-L1588
> Probably it could, but it won't be completely trivial. E.g. if there
> is a stall inside of a timer function, it should give the name of the
> actual timer callback as identity ("stall in timer_subsystem_foo"). Or
> for syscalls we use more disambiguation b/c "in sys_ioclt" is not much
> different than saying "there is a bug in kernel" :)
While external tools are fine and cool, they can't really block kernel
development and printk strings format is not an ABI. And yeah, we have
this discussion each time someone proposes changes to those "magic"
strings but I guess it is about time to fix this in a way that any
future changes don't break tools.
And so I like the idea of marking *only* the first splat with some small
prefix char as that first splat is the special and very important one.
I.e., the one where die_counter is 0.
So I could very well imagine something like:
...
[ 2.523708] Write protecting the kernel read-only data: 16384k
[ 2.524729] Freeing unused kernel image (text/rodata gap) memory: 2040K
[ 2.525594] Freeing unused kernel image (rodata/data gap) memory: 368K
[ 2.541414] x86/mm: Checked W+X mappings: passed, no W+X pages found.
<--- important first splat starts here:
[ 2.542218] [*] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
[ 2.543343] [*] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc8+ #8
[ 2.544138] [*] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014
[ 2.545120] [*] RIP: 0010:kernel_init+0x58/0x107
[ 2.546055] [*] Code: 48 c7 c7 e0 5c e7 81 e8 eb d2 90 ff c7 05 77 d6 95 00 02 00 00 00 e8 4e 1d a2 ff e8 69 b7 91 ff 48 b8 01 00 00 00 00 00 ff df <ff> e0 48 8b 3d fe 54 d7 00 48 85 ff 74 22 e8 76 93 84 ff 85 c0 89
[ 2.550242] [*] RSP: 0018:ffffc90000013f50 EFLAGS: 00010246
[ 2.551691] [*] RAX: dfff000000000001 RBX: ffffffff817b7ac9 RCX: 0000000000000040
[ 2.553435] [*] RDX: 0000000000000030 RSI: ffff88807da2f170 RDI: 000000000002f170
[ 2.555169] [*] RBP: 0000000000000000 R08: 00000000000001a6 R09: 00000000ad55ad55
[ 2.556393] [*] R10: 0000000000000000 R11: 0000000000000002 R12: 0000000000000000
[ 2.557268] [*] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 2.558417] [*] FS: 0000000000000000(0000) GS:ffff88807da00000(0000) knlGS:0000000000000000
[ 2.559370] [*] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2.560138] [*] CR2: 0000000000000000 CR3: 0000000002009000 CR4: 00000000003406f0
[ 2.561013] [*] Call Trace:
[ 2.561506] [*] ret_from_fork+0x22/0x40
[ 2.562080] [*] Modules linked in:
[ 2.583706] [*] ---[ end trace 8ceb5a62d3ebbfa6 ]---
[ 2.584384] [*] RIP: 0010:kernel_init+0x58/0x107
[ 2.584999] [*] Code: 48 c7 c7 e0 5c e7 81 e8 eb d2 90 ff c7 05 77 d6 95 00 02 00 00 00 e8 4e 1d a2 ff e8 69 b7 91 ff 48 b8 01 00 00 00 00 00 ff df <ff> e0 48 8b 3d fe 54 d7 00 48 85 ff 74 22 e8 76 93 84 ff 85 c0 89
[ 2.591746] [*] RSP: 0018:ffffc90000013f50 EFLAGS: 00010246
[ 2.593175] [*] RAX: dfff000000000001 RBX: ffffffff817b7ac9 RCX: 0000000000000040
[ 2.594892] [*] RDX: 0000000000000030 RSI: ffff88807da2f170 RDI: 000000000002f170
[ 2.599706] [*] RBP: 0000000000000000 R08: 00000000000001a6 R09: 00000000ad55ad55
[ 2.600585] [*] R10: 0000000000000000 R11: 0000000000000002 R12: 0000000000000000
[ 2.601433] [*] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 2.602283] [*] FS: 0000000000000000(0000) GS:ffff88807da00000(0000) knlGS:0000000000000000
[ 2.603207] [*] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2.607706] [*] CR2: 0000000000000000 CR3: 0000000002009000 CR4: 00000000003406f0
[ 2.608565] [*] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
[ 2.609600] [*] Kernel Offset: disabled
[ 2.610168] [*] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]---
<--- and ends here.
to denote that first splat. And the '*' thing is just an example - it
can be any char - whatever's easier to grep for.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Mon, Nov 18, 2019 at 05:44:07PM +0100, Borislav Petkov wrote:
> [ 2.523708] Write protecting the kernel read-only data: 16384k
> [ 2.524729] Freeing unused kernel image (text/rodata gap) memory: 2040K
> [ 2.525594] Freeing unused kernel image (rodata/data gap) memory: 368K
> [ 2.541414] x86/mm: Checked W+X mappings: passed, no W+X pages found.
>
> <--- important first splat starts here:
>
> [ 2.542218] [*] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
^
Btw, tglx just suggested on IRC to simply slap the die_counter number here so
that you have
[ 2.543343] [1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc8+ #8
[ 2.544138] [1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014
...
which also tells you which splat the line belongs to.
Also useful.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
Jann Horn <[email protected]> writes:
> +
> + if (error_code)
> + pr_alert("GPF is segment-related (see error code)\n");
> + else
> + print_kernel_gp_address(regs);
Is this really correct? There are a lot of instructions that can do #GP
(it's the CPU's equivalent of EINVAL) and I'm pretty sure many of them
don't set an error code, and many don't have operands either.
You would need to make sure the instruction decoder handles these
cases correctly, and ideally that you detect it instead of printing
a bogus address.
-Andi
On Wed, Nov 20, 2019 at 5:25 AM Andi Kleen <[email protected]> wrote:
> Jann Horn <[email protected]> writes:
> > + if (error_code)
> > + pr_alert("GPF is segment-related (see error code)\n");
> > + else
> > + print_kernel_gp_address(regs);
>
> Is this really correct? There are a lot of instructions that can do #GP
> (it's the CPU's equivalent of EINVAL) and I'm pretty sure many of them
> don't set an error code, and many don't have operands either.
>
> You would need to make sure the instruction decoder handles these
> cases correctly, and ideally that you detect it instead of printing
> a bogus address.
Is there a specific concern you have about the instruction decoder? As
far as I can tell, all the paths of insn_get_addr_ref() only work if
the instruction has a mod R/M byte according to the instruction
tables, and then figures out the address based on that. While that
means that there's a wide variety of cases in which we won't be able
to figure out the address, I'm not aware of anything specific that is
likely to lead to false positives.
But Andy did suggest that we hedge a bit in the error message because
even if the address passed to the instruction is non-canonical, we
don't know for sure whether that's actually the reason why things
failed, and that's why it says "probably" in the message about the
address now.
* Borislav Petkov <[email protected]> wrote:
> On Mon, Nov 18, 2019 at 05:29:42PM +0100, Dmitry Vyukov wrote:
> > > And of not having a standard way to signal "this line starts something
> > > that should be reported as a bug"? Maybe as a longer-term idea, it'd
> > > help to have some sort of extra prefix byte that the kernel can print
> > > to say "here comes a bug report, first line should be the subject", or
> > > something like that, similar to how we have loglevels...
> >
> > This would be great.
> > Also a way to denote crash end.
> > However we have lots of special logic for subjects, not sure if kernel
> > could provide good subject:
> > https://github.com/google/syzkaller/blob/1daed50ac33511e1a107228a9c3b80e5c4aebb5c/pkg/report/linux.go#L537-L1588
> > Probably it could, but it won't be completely trivial. E.g. if there
> > is a stall inside of a timer function, it should give the name of the
> > actual timer callback as identity ("stall in timer_subsystem_foo"). Or
> > for syscalls we use more disambiguation b/c "in sys_ioclt" is not much
> > different than saying "there is a bug in kernel" :)
>
> While external tools are fine and cool, they can't really block kernel
> development and printk strings format is not an ABI. And yeah, we have
> this discussion each time someone proposes changes to those "magic"
> strings but I guess it is about time to fix this in a way that any
> future changes don't break tools.
>
> And so I like the idea of marking *only* the first splat with some small
> prefix char as that first splat is the special and very important one.
> I.e., the one where die_counter is 0.
>
> So I could very well imagine something like:
>
> ...
> [ 2.523708] Write protecting the kernel read-only data: 16384k
> [ 2.524729] Freeing unused kernel image (text/rodata gap) memory: 2040K
> [ 2.525594] Freeing unused kernel image (rodata/data gap) memory: 368K
> [ 2.541414] x86/mm: Checked W+X mappings: passed, no W+X pages found.
>
> <--- important first splat starts here:
>
> [ 2.542218] [*] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
> [ 2.543343] [*] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc8+ #8
> [ 2.544138] [*] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014
> [ 2.545120] [*] RIP: 0010:kernel_init+0x58/0x107
> [ 2.546055] [*] Code: 48 c7 c7 e0 5c e7 81 e8 eb d2 90 ff c7 05 77 d6 95 00 02 00 00 00 e8 4e 1d a2 ff e8 69 b7 91 ff 48 b8 01 00 00 00 00 00 ff df <ff> e0 48 8b 3d fe 54 d7 00 48 85 ff 74 22 e8 76 93 84 ff 85 c0 89
> <--- and ends here.
>
> to denote that first splat. And the '*' thing is just an example - it
> can be any char - whatever's easier to grep for.
Well, this would break various pieces of tooling I'm sure.
Maybe it would be nicer for tooling to embed the splat-counter in the
timestamp, like this:
> [ 2.542218-#1] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
> [ 2.543343-#1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc8+ #8
> [ 2.544138-#1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014
> [ 2.545120-#1] RIP: 0010:kernel_init+0x58/0x107
> [ 2.546055-#1] Code: 48 c7 c7 e0 5c e7 81 e8 eb d2 90 ff c7 05 77 d6 95 00 02 00 00 00 e8 4e 1d a2 ff e8 69 b7 91 ff 48 b8 01 00 00 00 00 00 ff df <ff> e0 48 8b 3d fe 54 d7 00 48 85 ff 74 22 e8 76 93 84 ff 85 c0 89
That way we'd not only know that it's the first splat, but we'd also know
for all the *other* splats where they are in the splat-rank ;-)
(Also Cc:-ed Linus.)
Thanks,
Ingo
* Borislav Petkov <[email protected]> wrote:
> On Mon, Nov 18, 2019 at 05:44:07PM +0100, Borislav Petkov wrote:
> > [ 2.523708] Write protecting the kernel read-only data: 16384k
> > [ 2.524729] Freeing unused kernel image (text/rodata gap) memory: 2040K
> > [ 2.525594] Freeing unused kernel image (rodata/data gap) memory: 368K
> > [ 2.541414] x86/mm: Checked W+X mappings: passed, no W+X pages found.
> >
> > <--- important first splat starts here:
> >
> > [ 2.542218] [*] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
> ^
>
> Btw, tglx just suggested on IRC to simply slap the die_counter number here so
> that you have
>
> [ 2.543343] [1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc8+ #8
> [ 2.544138] [1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014
> ...
>
> which also tells you which splat the line belongs to.
>
> Also useful.
Yeah - but I think it would be even better to make it part of the
timestamp - most tools will already discard the [] bit, so why not merge
the two:
> [ 2.543343-#1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc8+ #8
> [ 2.544138-#1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014
Thanks,
Ingo
On Wed, Nov 20, 2019 at 12:40:31PM +0100, Ingo Molnar wrote:
> Well, this would break various pieces of tooling I'm sure.
Well, if at all, this will break them one last time. Ironically, the
intent here is to have a markup which doesn't break them anymore, once
that markup is agreed upon by all parties.
Because each time we touch those printk formats, tools people complain
about us breaking their tools. So we should get the best of both worlds
by marking those splats in a way that tools can grep for and we won't
touch the markers anymore, once established.
Also, "[]" was only an example. It can be anything we want, as in "<>"
or "!" or whatever is a short prefix that prepends those lines.
> Maybe it would be nicer to tooling to embed the splat-counter in the
> timestamp in a way:
Or that. Whatever we agree, as long as it is a unique marker for splats.
And it should say which splat it is, as that is also very useful
information to have in each line.
> > [ 2.542218-#1] general protection fault while derefing a non-canonical address 0xdfff000000000001: 0000 [#1] PREEMPT SMP
> > [ 2.543343-#1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc8+ #8
> > [ 2.544138-#1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014
> > [ 2.545120-#1] RIP: 0010:kernel_init+0x58/0x107
> > [ 2.546055-#1] Code: 48 c7 c7 e0 5c e7 81 e8 eb d2 90 ff c7 05 77 d6 95 00 02 00 00 00 e8 4e 1d a2 ff e8 69 b7 91 ff 48 b8 01 00 00 00 00 00 ff df <ff> e0 48 8b 3d fe 54 d7 00 48 85 ff 74 22 e8 76 93 84 ff 85 c0 89
>
> That way we'd not only know that it's the first splat, but we'd know it
> from all the *other* splats as well where they are in the splat-rank ;-)
That's exactly why I'd want the number in there.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
> Is there a specific concern you have about the instruction decoder? As
> far as I can tell, all the paths of insn_get_addr_ref() only work if
> the instruction has a mod R/M byte according to the instruction
> tables, and then figures out the address based on that. While that
> means that there's a wide variety of cases in which we won't be able
> to figure out the address, I'm not aware of anything specific that is
> likely to lead to false positives.
First, there will be a lot of cases where you'll just print 0 if there
is no operand, even though 0 is canonical.
Then it might be that the address is canonical, but triggers
#GP anyways (e.g. unaligned SSE)
Or it might be the wrong address if there is an operand,
there are many complex instructions that reference something
in memory, and usually do canonical checking there.
And some other odd cases. For example when the instruction length
exceeds 15 bytes. I know there is fuzzing for the instruction
decoder, but it might be worth double checking it handles
all of that correctly. I'm not sure how good the fuzzer's coverage
is.
At a minimum you should probably check if the address is
actually non-canonical. Maybe that's simple enough and weeds out
most cases.
-Andi
On Wed, Nov 20, 2019 at 2:56 PM Andi Kleen <[email protected]> wrote:
> > Is there a specific concern you have about the instruction decoder? As
> > far as I can tell, all the paths of insn_get_addr_ref() only work if
> > the instruction has a mod R/M byte according to the instruction
> > tables, and then figures out the address based on that. While that
> > means that there's a wide variety of cases in which we won't be able
> > to figure out the address, I'm not aware of anything specific that is
> > likely to lead to false positives.
>
> First there will be a lot of cases you'll just print 0, even
> though 0 is canonical if there is no operand.
Why would I print zeroes if there is no operand? The decoder logic
returns a -1 if it can't find a mod r/m byte, which causes the #GP
handler to not print any address at all. Or are you talking about some
weird instruction that takes an operand that is actually ignored, or
something weird like that?
> Then it might be that the address is canonical, but triggers
> #GP anyways (e.g. unaligned SSE)
Which is an argument for printing the address even if it is canonical,
as Ingo suggested, I guess.
> Or it might be the wrong address if there is an operand,
> there are many complex instructions that reference something
> in memory, and usually do canonical checking there.
In which case you'd probably usually see a canonical address in the
instruction's argument, which causes the error message to not appear
(in patch v2/v3) / to be different (in my current draft for patch v4).
And as Ingo said over in the other thread, even if the argument is not
directly the faulting address at all, it might still help with
figuring out what's going on.
> And some other odd cases. For example when the instruction length
> exceeds 15 bytes.
But this is the #GP handler. Don't overlong instructions give you #UD instead?
> I know there is fuzzing for the instruction
> decoder, but it might be worth double checking it handles
> all of that correctly. I'm not sure how good the fuzzer's coverage
> is.
>
> At a minimum you should probably check if the address is
> actually non-canonical. Maybe that's simple enough and weeds out
> most cases.
The patch you're commenting on does that already; quoting the patch:
+ /* Bail out if insn_get_addr_ref() failed or we got a kernel address. */
+ if (addr_ref >= ~__VIRTUAL_MASK)
+ return;
+
+ /* Bail out if the entire operand is in the canonical user half. */
+ if (addr_ref + insn.opnd_bytes - 1 <= __VIRTUAL_MASK)
+ return;
But at Ingo's request, I'm planning to change that in the v4 patch;
see <https://lore.kernel.org/lkml/[email protected]/>
and <https://lore.kernel.org/lkml/CAG48ez0Frp4-+xHZ=UhbHh0hC_h-1VtJfwHw=kDo6NahyMv1ig@mail.gmail.com/>.
On Fri, Nov 15, 2019 at 11:17 AM Jann Horn <[email protected]> wrote:
>
> A frequent cause of #GP exceptions are memory accesses to non-canonical
> addresses. Unlike #PF, #GP doesn't come with a fault address in CR2, so
> the kernel doesn't currently print the fault address for #GP.
> Luckily, we already have the necessary infrastructure for decoding X86
> instructions and computing the memory address that is being accessed;
> hook it up to the #GP handler so that we can figure out whether the #GP
> looks like it was caused by a non-canonical address, and if so, print
> that address.
>
> While it is already possible to compute the faulting address manually by
> disassembling the opcode dump and evaluating the instruction against the
> register dump, this should make it slightly easier to identify crashes
> at a glance.
>
> Signed-off-by: Jann Horn <[email protected]>
> ---
>
> Notes:
> v2:
> - print different message for segment-related GP (Borislav)
> - rewrite check for non-canonical address (Sean)
> - make it clear we don't know for sure why the GP happened (Andy)
>
> arch/x86/kernel/traps.c | 45 +++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 43 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index c90312146da0..12d42697a18e 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -56,6 +56,8 @@
> #include <asm/mpx.h>
> #include <asm/vm86.h>
> #include <asm/umip.h>
> +#include <asm/insn.h>
> +#include <asm/insn-eval.h>
>
> #ifdef CONFIG_X86_64
> #include <asm/x86_init.h>
> @@ -509,6 +511,38 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
> do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
> }
>
> +/*
> + * On 64-bit, if an uncaught #GP occurs while dereferencing a non-canonical
> + * address, print that address.
> + */
> +static void print_kernel_gp_address(struct pt_regs *regs)
> +{
> +#ifdef CONFIG_X86_64
> + u8 insn_bytes[MAX_INSN_SIZE];
> + struct insn insn;
> + unsigned long addr_ref;
> +
> + if (probe_kernel_read(insn_bytes, (void *)regs->ip, MAX_INSN_SIZE))
> + return;
> +
> + kernel_insn_init(&insn, insn_bytes, MAX_INSN_SIZE);
> + insn_get_modrm(&insn);
> + insn_get_sib(&insn);
> + addr_ref = (unsigned long)insn_get_addr_ref(&insn, regs);
> +
> + /* Bail out if insn_get_addr_ref() failed or we got a kernel address. */
> + if (addr_ref >= ~__VIRTUAL_MASK)
> + return;
> +
> + /* Bail out if the entire operand is in the canonical user half. */
> + if (addr_ref + insn.opnd_bytes - 1 <= __VIRTUAL_MASK)
> + return;
> +
> + pr_alert("probably dereferencing non-canonical address 0x%016lx\n",
> + addr_ref);
> +#endif
> +}
Could you refactor this a little bit so that we end up with a helper
that does the computation? Something like:
int probe_insn_get_memory_ref(void **addr, size_t *len, void *insn_addr);
returns 1 if there was a memory operand and fills in addr and len,
returns 0 if there was no memory operand, and returns a negative error
on error.
I think we're going to want this for #AC handling, too :)
On Sun, Nov 24, 2019 at 12:08 AM Andy Lutomirski <[email protected]> wrote:
> On Fri, Nov 15, 2019 at 11:17 AM Jann Horn <[email protected]> wrote:
> > A frequent cause of #GP exceptions are memory accesses to non-canonical
> > addresses. Unlike #PF, #GP doesn't come with a fault address in CR2, so
> > the kernel doesn't currently print the fault address for #GP.
> > Luckily, we already have the necessary infrastructure for decoding X86
> > instructions and computing the memory address that is being accessed;
> > hook it up to the #GP handler so that we can figure out whether the #GP
> > looks like it was caused by a non-canonical address, and if so, print
> > that address.
[...]
> > +static void print_kernel_gp_address(struct pt_regs *regs)
> > +{
> > +#ifdef CONFIG_X86_64
> > + u8 insn_bytes[MAX_INSN_SIZE];
> > + struct insn insn;
> > + unsigned long addr_ref;
> > +
> > + if (probe_kernel_read(insn_bytes, (void *)regs->ip, MAX_INSN_SIZE))
> > + return;
> > +
> > + kernel_insn_init(&insn, insn_bytes, MAX_INSN_SIZE);
> > + insn_get_modrm(&insn);
> > + insn_get_sib(&insn);
> > + addr_ref = (unsigned long)insn_get_addr_ref(&insn, regs);
[...]
> > +}
>
> Could you refactor this a little bit so that we end up with a helper
> that does the computation? Something like:
>
> int probe_insn_get_memory_ref(void **addr, size_t *len, void *insn_addr);
>
> returns 1 if there was a memory operand and fills in addr and len,
> returns 0 if there was no memory operand, and returns a negative error
> on error.
>
> I think we're going to want this for #AC handling, too :)
Mmmh... the instruction decoder doesn't currently give us a reliable
access size though. (I know, I'm using it here regardless, but it
doesn't really matter here if the decoded size is too big from time to
time... whereas I imagine that that'd matter quite a bit for #AC
handling.) IIRC e.g. a MOVZX that loads 1 byte into a 4-byte register
is decoded as having .opnd_bytes==4; and if you look through
arch/x86/lib/insn.c, there isn't even anything that would ever set
->opnd_bytes to 1. You'd have to add some plumbing to get reliable
access sizes. I don't want to add a helper for this before the
underlying infrastructure actually works properly.
On Wed, Nov 27, 2019 at 12:27 PM Jann Horn <[email protected]> wrote:
>
> On Sun, Nov 24, 2019 at 12:08 AM Andy Lutomirski <[email protected]> wrote:
> > On Fri, Nov 15, 2019 at 11:17 AM Jann Horn <[email protected]> wrote:
> > > A frequent cause of #GP exceptions are memory accesses to non-canonical
> > > addresses. Unlike #PF, #GP doesn't come with a fault address in CR2, so
> > > the kernel doesn't currently print the fault address for #GP.
> > > Luckily, we already have the necessary infrastructure for decoding X86
> > > instructions and computing the memory address that is being accessed;
> > > hook it up to the #GP handler so that we can figure out whether the #GP
> > > looks like it was caused by a non-canonical address, and if so, print
> > > that address.
> [...]
> > > +static void print_kernel_gp_address(struct pt_regs *regs)
> > > +{
> > > +#ifdef CONFIG_X86_64
> > > + u8 insn_bytes[MAX_INSN_SIZE];
> > > + struct insn insn;
> > > + unsigned long addr_ref;
> > > +
> > > + if (probe_kernel_read(insn_bytes, (void *)regs->ip, MAX_INSN_SIZE))
> > > + return;
> > > +
> > > + kernel_insn_init(&insn, insn_bytes, MAX_INSN_SIZE);
> > > + insn_get_modrm(&insn);
> > > + insn_get_sib(&insn);
> > > + addr_ref = (unsigned long)insn_get_addr_ref(&insn, regs);
> [...]
> > > +}
> >
> > Could you refactor this a little bit so that we end up with a helper
> > that does the computation? Something like:
> >
> > int probe_insn_get_memory_ref(void **addr, size_t *len, void *insn_addr);
> >
> > returns 1 if there was a memory operand and fills in addr and len,
> > returns 0 if there was no memory operand, and returns a negative error
> > on error.
> >
> > I think we're going to want this for #AC handling, too :)
>
> Mmmh... the instruction decoder doesn't currently give us a reliable
> access size though. (I know, I'm using it here regardless, but it
> doesn't really matter here if the decoded size is too big from time to
> time... whereas I imagine that that'd matter quite a bit for #AC
> handling.) IIRC e.g. a MOVZX that loads 1 byte into a 4-byte register
> is decoded as having .opnd_bytes==4; and if you look through
> arch/x86/lib/insn.c, there isn't even anything that would ever set
> ->opnd_bytes to 1. You'd have to add some plumbing to get reliable
> access sizes. I don't want to add a helper for this before the
> underlying infrastructure actually works properly.
Fair enough. Although, with #AC, we know a priori that the address is
unaligned, so we could at least print "Unaligned access at 0x%lx\n".
But we can certainly leave these details to someone else.
(For context, there are patches floating around to enable a formerly
secret CPU feature to generate #AC on a LOCK instruction that spans a
cache line.)
--Andy