2018-02-19 20:30:54

by Borislav Petkov

[permalink] [raw]
Subject: [PATCH 0/5] x86/dumpstack: Cleanups and user opcode bytes Code: section

From: Borislav Petkov <[email protected]>

Hi,

so I've been thinking about doing this for a while now: be able to dump
the opcode bytes around the user rIP just like we do for kernel faults.

Why?

See patch 5's commit message. That's why I've marked it RFC.

The rest is cleanups: we're copying the opcodes byte-by-byte and that's
just wasteful.

Also, we're using probe_kernel_read() underneath and it does
__copy_from_user_inatomic() which makes copying user opcode bytes
trivial.

With that, it looks like this:

[ 696.837457] strsep[1733]: segfault at 40066b ip 00007fad558fccf8 sp 00007ffc5e662520 error 7 in libc-2.26.so[7fad55876000+1ad000]
[ 696.837538] Code: 1b 48 89 fd 48 89 df e8 77 99 f9 ff 48 01 d8 80 38 00 75 17 48 c7 45 00 00 00 00 00 48 83 c4 08 48 89 d8 5b 5d c3 0f 1f 44 00 00 <c6> 00 00 48 83 c0 01 48 89 45 00 48 83 c4 08 48 89 d8 5b 5d c3

and the code matches, as expected:

0000000000086cc0 <__strsep_g@@GLIBC_2.2.5>:
86cc0: 55 push %rbp
86cc1: 53 push %rbx
86cc2: 48 83 ec 08 sub $0x8,%rsp
86cc6: 48 8b 1f mov (%rdi),%rbx
86cc9: 48 85 db test %rbx,%rbx
86ccc: 74 1b je 86ce9 <__strsep_g@@GLIBC_2.2.5+0x29>
86cce: 48 89 fd mov %rdi,%rbp
86cd1: 48 89 df mov %rbx,%rdi
86cd4: e8 77 99 f9 ff callq 20650 <*ABS*+0x854e0@plt>
86cd9: 48 01 d8 add %rbx,%rax
86cdc: 80 38 00 cmpb $0x0,(%rax)
86cdf: 75 17 jne 86cf8 <__strsep_g@@GLIBC_2.2.5+0x38>
86ce1: 48 c7 45 00 00 00 00 movq $0x0,0x0(%rbp)
86ce8: 00
86ce9: 48 83 c4 08 add $0x8,%rsp
86ced: 48 89 d8 mov %rbx,%rax
86cf0: 5b pop %rbx
86cf1: 5d pop %rbp
86cf2: c3 retq
86cf3: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
86cf8: c6 00 00 movb $0x0,(%rax)
86cfb: 48 83 c0 01 add $0x1,%rax
86cff: 48 89 45 00 mov %rax,0x0(%rbp)
86d03: 48 83 c4 08 add $0x8,%rsp
86d07: 48 89 d8 mov %rbx,%rax
86d0a: 5b pop %rbx
86d0b: 5d pop %rbp
86d0c: c3 retq

Comments and suggestions are welcome!

Thx.

Borislav Petkov (5):
x86/dumpstack: Unify show_regs()
x86/dumpstack: Carve out Code: dumping into a function
x86/dumpstack: Improve opcodes dumping in the Code: section
x86/dumpstack: Add loglevel argument to show_opcodes()
x86/fault: Dump user opcode bytes on fatal faults

arch/x86/include/asm/stacktrace.h | 3 +-
arch/x86/kernel/dumpstack.c | 64 +++++++++++++++++++++++++++++++++++++--
arch/x86/kernel/dumpstack_32.c | 42 -------------------------
arch/x86/kernel/dumpstack_64.c | 42 -------------------------
arch/x86/mm/fault.c | 7 +++--
5 files changed, 68 insertions(+), 90 deletions(-)

--
2.13.0



2018-02-19 20:30:31

by Borislav Petkov

[permalink] [raw]
Subject: [PATCH 2/5] x86/dumpstack: Carve out Code: dumping into a function

From: Borislav Petkov <[email protected]>

No functionality change, carve it out into a separate function for later
changes.

Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/kernel/dumpstack.c | 57 ++++++++++++++++++++++++---------------------
1 file changed, 30 insertions(+), 27 deletions(-)

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 18fa9d74c182..954b1dd1c04a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -376,10 +376,38 @@ static int __init code_bytes_setup(char *s)
}
__setup("code_bytes=", code_bytes_setup);

+static void show_opcodes(u8 *rip)
+{
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
+ unsigned char c;
+ u8 *ip;
+ int i;
+
+ printk(KERN_DEFAULT "Code: ");
+
+ ip = (u8 *)rip - code_prologue;
+ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
+ /* try starting at IP */
+ ip = (u8 *)rip;
+ code_len = code_len - code_prologue + 1;
+ }
+ for (i = 0; i < code_len; i++, ip++) {
+ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
+ pr_cont(" Bad RIP value.");
+ break;
+ }
+ if (ip == (u8 *)rip)
+ pr_cont("<%02x> ", c);
+ else
+ pr_cont("%02x ", c);
+ }
+ pr_cont("\n");
+}
+
void show_regs(struct pt_regs *regs)
{
bool all = true;
- int i;

show_regs_print_info(KERN_DEFAULT);

@@ -393,32 +421,7 @@ void show_regs(struct pt_regs *regs)
* time of the fault..
*/
if (!user_mode(regs)) {
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- unsigned char c;
- u8 *ip;
-
show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
-
- printk(KERN_DEFAULT "Code: ");
-
- ip = (u8 *)regs->ip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
- /* try starting at IP */
- ip = (u8 *)regs->ip;
- code_len = code_len - code_prologue + 1;
- }
- for (i = 0; i < code_len; i++, ip++) {
- if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
- pr_cont(" Bad RIP value.");
- break;
- }
- if (ip == (u8 *)regs->ip)
- pr_cont("<%02x> ", c);
- else
- pr_cont("%02x ", c);
- }
+ show_opcodes((u8 *)regs->ip);
}
- pr_cont("\n");
}
--
2.13.0


2018-02-19 20:30:46

by Borislav Petkov

[permalink] [raw]
Subject: [PATCH 4/5] x86/dumpstack: Add loglevel argument to show_opcodes()

From: Borislav Petkov <[email protected]>

Will be used in the next patch.

Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/include/asm/stacktrace.h | 1 +
arch/x86/kernel/dumpstack.c | 6 +++---
2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 133d9425fced..0630eeb18bbc 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -111,4 +111,5 @@ static inline unsigned long caller_frame_pointer(void)
return (unsigned long)frame;
}

+void show_opcodes(u8 *rip, const char *loglvl);
#endif /* _ASM_X86_STACKTRACE_H */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index aef0d98cc282..17e9f358c42a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -388,13 +388,13 @@ static int __init code_bytes_setup(char *s)
}
__setup("code_bytes=", code_bytes_setup);

-static void show_opcodes(u8 *rip)
+void show_opcodes(u8 *rip, const char *loglvl)
{
unsigned int code_prologue = code_bytes * 43 / OPCODE_BUFSIZE;
u8 *ip;
int i;

- printk(KERN_DEFAULT "Code: ");
+ printk("%sCode: ", loglvl);

ip = (u8 *)rip - code_prologue;
if (probe_kernel_read(opcodes, ip, code_bytes)) {
@@ -432,6 +432,6 @@ void show_regs(struct pt_regs *regs)
if (regs->ip < PAGE_OFFSET)
pr_cont(" Bad RIP value.\n");
else
- show_opcodes((u8 *)regs->ip);
+ show_opcodes((u8 *)regs->ip, KERN_DEFAULT);
}
}
--
2.13.0


2018-02-19 20:31:39

by Borislav Petkov

[permalink] [raw]
Subject: [PATCH 3/5] x86/dumpstack: Improve opcodes dumping in the Code: section

From: Borislav Petkov <[email protected]>

The code used to iterate byte-by-byte over the bytes around RIP and that
is expensive: disabling pagefaults around it, copy_from_user, etc...

Make it read the whole buffer of code_bytes size in one go. By default
use a statically allocated 64 bytes buffer. If "code_bytes=" is supplied
on the cmdline a new buffer gets allocated.

Also, do the PAGE_OFFSET check outside of the function because latter
will be reused in other context.

Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/kernel/dumpstack.c | 44 +++++++++++++++++++++++++++-----------------
1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 954b1dd1c04a..aef0d98cc282 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -22,9 +22,13 @@
#include <asm/stacktrace.h>
#include <asm/unwind.h>

+#define OPCODE_BUFSIZE 64
+
int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
-static unsigned int code_bytes = 64;
+static unsigned int code_bytes = OPCODE_BUFSIZE;
+static u8 __opc[OPCODE_BUFSIZE];
+static u8 *opcodes = __opc;
static int die_counter;

bool in_task_stack(unsigned long *stack, struct task_struct *task,
@@ -358,8 +362,8 @@ void die(const char *str, struct pt_regs *regs, long err)

static int __init code_bytes_setup(char *s)
{
- ssize_t ret;
unsigned long val;
+ ssize_t ret;

if (!s)
return -EINVAL;
@@ -372,35 +376,37 @@ static int __init code_bytes_setup(char *s)
if (code_bytes > 8192)
code_bytes = 8192;

+ if (code_bytes > OPCODE_BUFSIZE) {
+ u8 *new_buf = kzalloc(code_bytes, GFP_KERNEL);
+ if (!new_buf)
+ return -EINVAL;
+
+ opcodes = new_buf;
+ }
+
return 1;
}
__setup("code_bytes=", code_bytes_setup);

static void show_opcodes(u8 *rip)
{
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- unsigned char c;
+ unsigned int code_prologue = code_bytes * 43 / OPCODE_BUFSIZE;
u8 *ip;
int i;

printk(KERN_DEFAULT "Code: ");

ip = (u8 *)rip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
- /* try starting at IP */
- ip = (u8 *)rip;
- code_len = code_len - code_prologue + 1;
+ if (probe_kernel_read(opcodes, ip, code_bytes)) {
+ pr_cont(" Bad RIP value.\n");
+ return;
}
- for (i = 0; i < code_len; i++, ip++) {
- if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
- pr_cont(" Bad RIP value.");
- break;
- }
+
+ for (i = 0; i < code_bytes; i++, ip++) {
if (ip == (u8 *)rip)
- pr_cont("<%02x> ", c);
+ pr_cont("<%02x> ", opcodes[i]);
else
- pr_cont("%02x ", c);
+ pr_cont("%02x ", opcodes[i]);
}
pr_cont("\n");
}
@@ -422,6 +428,10 @@ void show_regs(struct pt_regs *regs)
*/
if (!user_mode(regs)) {
show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
- show_opcodes((u8 *)regs->ip);
+
+ if (regs->ip < PAGE_OFFSET)
+ pr_cont(" Bad RIP value.\n");
+ else
+ show_opcodes((u8 *)regs->ip);
}
}
--
2.13.0


2018-02-19 20:31:52

by Borislav Petkov

[permalink] [raw]
Subject: [RFC PATCH 5/5] x86/fault: Dump user opcode bytes on fatal faults

From: Borislav Petkov <[email protected]>

Sometimes it is useful to see which user opcode bytes RIP points to
when a fault happens: be it to rule out RIP corruption, to dump info
early during boot, when doing core dumps is impossible due to not having
writable fs yet.

Sometimes it is useful if debugging an issue and one doesn't have access
to the executable which caused the fault in order to disassemble it.

That last aspect might have some security implications so
show_unhandled_signals could be revisited for that or a new config
option added.

Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/mm/fault.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 321b78060e93..d81ea7835737 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -851,6 +851,8 @@ static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
unsigned long address, struct task_struct *tsk)
{
+ const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
+
if (!unhandled_signal(tsk, SIGSEGV))
return;

@@ -858,13 +860,14 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
return;

printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address,
+ loglvl, tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);

print_vma_addr(KERN_CONT " in ", regs->ip);

printk(KERN_CONT "\n");
+
+ show_opcodes((u8 *)regs->ip, loglvl);
}

static void
--
2.13.0


2018-02-19 20:32:00

by Borislav Petkov

[permalink] [raw]
Subject: [PATCH 1/5] x86/dumpstack: Unify show_regs()

From: Borislav Petkov <[email protected]>

The 32-bit version uses KERN_EMERG and commit

b0f4c4b32c8e ("bugs, x86: Fix printk levels for panic, softlockups and stack dumps")

changed the 64-bit version to KERN_DEFAULT. The same justification in
that commit that those messages do not belong in the terminal, holds
true for 32-bit also, so make it so.

Make code_bytes static, while at it.

Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/include/asm/stacktrace.h | 2 --
arch/x86/kernel/dumpstack.c | 49 ++++++++++++++++++++++++++++++++++++++-
arch/x86/kernel/dumpstack_32.c | 42 ---------------------------------
arch/x86/kernel/dumpstack_64.c | 42 ---------------------------------
4 files changed, 48 insertions(+), 87 deletions(-)

diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index f73706878772..133d9425fced 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -87,8 +87,6 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, char *log_lvl);

-extern unsigned int code_bytes;
-
/* The form of the top of the frame on the stack */
struct stack_frame {
struct stack_frame *next_frame;
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index a2d8a3908670..18fa9d74c182 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -24,7 +24,7 @@

int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
-unsigned int code_bytes = 64;
+static unsigned int code_bytes = 64;
static int die_counter;

bool in_task_stack(unsigned long *stack, struct task_struct *task,
@@ -375,3 +375,50 @@ static int __init code_bytes_setup(char *s)
return 1;
}
__setup("code_bytes=", code_bytes_setup);
+
+void show_regs(struct pt_regs *regs)
+{
+ bool all = true;
+ int i;
+
+ show_regs_print_info(KERN_DEFAULT);
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ all = !user_mode(regs);
+
+ __show_regs(regs, all);
+
+ /*
+ * When in-kernel, we also print out the stack and code at the
+ * time of the fault..
+ */
+ if (!user_mode(regs)) {
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
+ unsigned char c;
+ u8 *ip;
+
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
+
+ printk(KERN_DEFAULT "Code: ");
+
+ ip = (u8 *)regs->ip - code_prologue;
+ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
+ /* try starting at IP */
+ ip = (u8 *)regs->ip;
+ code_len = code_len - code_prologue + 1;
+ }
+ for (i = 0; i < code_len; i++, ip++) {
+ if (ip < (u8 *)PAGE_OFFSET ||
+ probe_kernel_address(ip, c)) {
+ pr_cont(" Bad RIP value.");
+ break;
+ }
+ if (ip == (u8 *)regs->ip)
+ pr_cont("<%02x> ", c);
+ else
+ pr_cont("%02x ", c);
+ }
+ }
+ pr_cont("\n");
+}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 04170f63e3a1..cd53f3030e40 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -127,45 +127,3 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
info->type = STACK_TYPE_UNKNOWN;
return -EINVAL;
}
-
-void show_regs(struct pt_regs *regs)
-{
- int i;
-
- show_regs_print_info(KERN_EMERG);
- __show_regs(regs, !user_mode(regs));
-
- /*
- * When in-kernel, we also print out the stack and code at the
- * time of the fault..
- */
- if (!user_mode(regs)) {
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- unsigned char c;
- u8 *ip;
-
- show_trace_log_lvl(current, regs, NULL, KERN_EMERG);
-
- pr_emerg("Code:");
-
- ip = (u8 *)regs->ip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
- /* try starting at IP */
- ip = (u8 *)regs->ip;
- code_len = code_len - code_prologue + 1;
- }
- for (i = 0; i < code_len; i++, ip++) {
- if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
- pr_cont(" Bad EIP value.");
- break;
- }
- if (ip == (u8 *)regs->ip)
- pr_cont(" <%02x>", c);
- else
- pr_cont(" %02x", c);
- }
- }
- pr_cont("\n");
-}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 563e28d14f2c..5cdb9e84da57 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -149,45 +149,3 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
info->type = STACK_TYPE_UNKNOWN;
return -EINVAL;
}
-
-void show_regs(struct pt_regs *regs)
-{
- int i;
-
- show_regs_print_info(KERN_DEFAULT);
- __show_regs(regs, 1);
-
- /*
- * When in-kernel, we also print out the stack and code at the
- * time of the fault..
- */
- if (!user_mode(regs)) {
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- unsigned char c;
- u8 *ip;
-
- show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
-
- printk(KERN_DEFAULT "Code: ");
-
- ip = (u8 *)regs->ip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
- /* try starting at IP */
- ip = (u8 *)regs->ip;
- code_len = code_len - code_prologue + 1;
- }
- for (i = 0; i < code_len; i++, ip++) {
- if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
- pr_cont(" Bad RIP value.");
- break;
- }
- if (ip == (u8 *)regs->ip)
- pr_cont("<%02x> ", c);
- else
- pr_cont("%02x ", c);
- }
- }
- pr_cont("\n");
-}
--
2.13.0


2018-02-19 22:03:24

by Josh Poimboeuf

[permalink] [raw]
Subject: Re: [PATCH 3/5] x86/dumpstack: Improve opcodes dumping in the Code: section

On Mon, Feb 19, 2018 at 09:28:24PM +0100, Borislav Petkov wrote:
> bool in_task_stack(unsigned long *stack, struct task_struct *task,
> @@ -358,8 +362,8 @@ void die(const char *str, struct pt_regs *regs, long err)
>
> static int __init code_bytes_setup(char *s)
> {
> - ssize_t ret;
> unsigned long val;
> + ssize_t ret;
>
> if (!s)
> return -EINVAL;
> @@ -372,35 +376,37 @@ static int __init code_bytes_setup(char *s)
> if (code_bytes > 8192)
> code_bytes = 8192;
>
> + if (code_bytes > OPCODE_BUFSIZE) {
> + u8 *new_buf = kzalloc(code_bytes, GFP_KERNEL);
> + if (!new_buf)
> + return -EINVAL;

S/EINVAL/ENOMEM/ ?

Otherwise the code looks good to me, and I like the new feature.

--
Josh

2018-02-20 19:15:49

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH 0/5] x86/dumpstack: Cleanups and user opcode bytes Code: section

On Mon, Feb 19, 2018 at 8:28 PM, Borislav Petkov <[email protected]> wrote:
> From: Borislav Petkov <[email protected]>
>
> Hi,
>
> so I've been thinking about doing this for a while now: be able to dump
> the opcode bytes around the user rIP just like we do for kernel faults.
>
> Why?
>
> See patch 5's commit message. That's why I've marked it RFC.
>
> The rest is cleanups: we're copying the opcodes byte-by-byte and that's
> just wasteful.

Maybe this series already has this side-effect, but I'd really love to
see oopses show the code bytes for each kernel entry, not just the
innermode one. We already dump full regs including RIP -- adding
Code: should be easy and would be very helpful.

2018-02-20 19:31:27

by Josh Poimboeuf

[permalink] [raw]
Subject: Re: [PATCH 0/5] x86/dumpstack: Cleanups and user opcode bytes Code: section

On Tue, Feb 20, 2018 at 07:14:00PM +0000, Andy Lutomirski wrote:
> On Mon, Feb 19, 2018 at 8:28 PM, Borislav Petkov <[email protected]> wrote:
> > From: Borislav Petkov <[email protected]>
> >
> > Hi,
> >
> > so I've been thinking about doing this for a while now: be able to dump
> > the opcode bytes around the user rIP just like we do for kernel faults.
> >
> > Why?
> >
> > See patch 5's commit message. That's why I've marked it RFC.
> >
> > The rest is cleanups: we're copying the opcodes byte-by-byte and that's
> > just wasteful.
>
> Maybe this series already has this side-effect, but I'd really love to
> see oopses show the code bytes for each kernel entry, not just the
> innermode one. We already dump full regs including RIP -- adding
> Code: should be easy and would be very helpful.

Just to clarify, I think you want to show the Code: around regs->ip
every time we show the registers?

--
Josh

2018-02-20 20:46:29

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 0/5] x86/dumpstack: Cleanups and user opcode bytes Code: section

On Tue, Feb 20, 2018 at 01:29:56PM -0600, Josh Poimboeuf wrote:
> > Maybe this series already has this side-effect, but I'd really love to
> > see oopses show the code bytes for each kernel entry, not just the
> > innermode one. We already dump full regs including RIP -- adding
> > Code: should be easy and would be very helpful.
>
> Just to clarify, I think you want to show the Code: around regs->ip
> every time we show the registers?

It is an easy change to always dump Code: section when we dump RIP:.
I.e., something like this:

[ 33.192733] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 33.196510] IP: sysrq_handle_crash+0x17/0x20
[ 33.196691] PGD 78b12067 P4D 78b12067 PUD 78b7f067 PMD 0
[ 33.196691] Oops: 0002 [#1] PREEMPT SMP
[ 33.196691] Modules linked in:
[ 33.196691] CPU: 6 PID: 3686 Comm: bash Not tainted 4.16.0-rc1+ #5
[ 33.196691] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[ 33.196691] RIP: 0010:sysrq_handle_crash+0x17/0x20
[ 33.196691] RSP: 0018:ffffc9000054bdf0 EFLAGS: 00010246
[ 33.196691] Code: eb d1 e8 5d 17 b7 ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 e8 c6 24 bd ff c7 05 24 a2 19 01 01 00 00 00 0f ae f8 <c6> 04 25 00 00 00 00 01 c3 0f 1f 44 00 00 e8 86 21 c2 ff fb e9
[ 33.196691] RAX: 0000000000000000 RBX: 0000000000000063 RCX: 0000000000000000
[ 33.196691] RDX: 0000000000000000 RSI: ffffffff8110145a RDI: 0000000000000063
[ 33.196691] RBP: ffffffff82271400 R08: 0000000000000183 R09: 000000000002e771
[ 33.196691] R10: 0000000000000000 R11: 0000000000000000 R12: 000000000000000a
[ 33.196691] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 33.196691] FS: 00007ffff7fdb700(0000) GS:ffff88007ed80000(0000) knlGS:0000000000000000
[ 33.196691] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 33.196691] CR2: 0000000000000000 CR3: 000000007aa9e000 CR4: 00000000000406e0
[ 33.196691] Call Trace:
[ 33.196691] __handle_sysrq+0x9e/0x160
[ 33.196691] write_sysrq_trigger+0x2b/0x30
[ 33.196691] proc_reg_write+0x38/0x70
[ 33.196691] __vfs_write+0x36/0x160
[ 33.196691] ? __fd_install+0x69/0x110
[ 33.196691] ? preempt_count_add+0x74/0xb0
[ 33.196691] ? _raw_spin_lock+0x13/0x30
[ 33.196691] ? set_close_on_exec+0x41/0x80
[ 33.196691] ? preempt_count_sub+0xa8/0x100
[ 33.196691] vfs_write+0xc0/0x190
[ 33.196691] SyS_write+0x64/0xe0
[ 33.196691] ? trace_hardirqs_off_thunk+0x1a/0x1c
[ 33.196691] do_syscall_64+0x70/0x130
[ 33.196691] entry_SYSCALL_64_after_hwframe+0x42/0xb7
[ 33.196691] RIP: 0033:0x7ffff74b9620
[ 33.196691] RSP: 002b:00007fffffffe7a8 EFLAGS: 00000246
[ 33.196691] Code: ff 73 01 c3 48 8b 0d 68 98 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d bd f1 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ce 8f 01 00 48 89 04
[ 33.196691] ORIG_RAX: 0000000000000001
[ 33.196691] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007ffff74b9620
[ 33.196691] RDX: 0000000000000002 RSI: 0000000000705408 RDI: 0000000000000001
[ 33.196691] RBP: 0000000000705408 R08: 000000000000000a R09: 00007ffff7fdb700
[ 33.196691] R10: 00007ffff77826a0 R11: 0000000000000246 R12: 00007ffff77842a0
[ 33.196691] R13: 0000000000000002 R14: 0000000000000001 R15: 0000000000000000
[ 33.196691] Code: eb d1 e8 5d 17 b7 ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 e8 c6 24 bd ff c7 05 24 a2 19 01 01 00 00 00 0f ae f8 <c6> 04 25 00 00 00 00 01 c3 0f 1f 44 00 00 e8 86 21 c2 ff fb e9
[ 33.196691] RIP: sysrq_handle_crash+0x17/0x20 RSP: ffffc9000054bdf0
[ 33.196691] CR2: 0000000000000000
[ 33.301033] ---[ end trace b97275941de8c6f4 ]---
[ 33.302600] Kernel panic - not syncing: Fatal exception
[ 33.304529] Kernel Offset: disabled
[ 33.304973] ---[ end Kernel panic - not syncing: Fatal exception

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2018-02-21 15:57:23

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 0/5] x86/dumpstack: Cleanups and user opcode bytes Code: section


* Borislav Petkov <[email protected]> wrote:

> On Tue, Feb 20, 2018 at 01:29:56PM -0600, Josh Poimboeuf wrote:
> > > Maybe this series already has this side-effect, but I'd really love to
> > > see oopses show the code bytes for each kernel entry, not just the
> > > innermode one. We already dump full regs including RIP -- adding
> > > Code: should be easy and would be very helpful.
> >
> > Just to clarify, I think you want to show the Code: around regs->ip
> > every time we show the registers?
>
> It is an easy change to always dump Code: section when we dump RIP:.
> I.e., something like this:
>
> [ 33.192733] BUG: unable to handle kernel NULL pointer dereference at (null)
> [ 33.196510] IP: sysrq_handle_crash+0x17/0x20
> [ 33.196691] PGD 78b12067 P4D 78b12067 PUD 78b7f067 PMD 0
> [ 33.196691] Oops: 0002 [#1] PREEMPT SMP
> [ 33.196691] Modules linked in:
> [ 33.196691] CPU: 6 PID: 3686 Comm: bash Not tainted 4.16.0-rc1+ #5
> [ 33.196691] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
> [ 33.196691] RIP: 0010:sysrq_handle_crash+0x17/0x20
> [ 33.196691] RSP: 0018:ffffc9000054bdf0 EFLAGS: 00010246
> [ 33.196691] Code: eb d1 e8 5d 17 b7 ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 e8 c6 24 bd ff c7 05 24 a2 19 01 01 00 00 00 0f ae f8 <c6> 04 25 00 00 00 00 01 c3 0f 1f 44 00 00 e8 86 21 c2 ff fb e9
> [ 33.196691] RAX: 0000000000000000 RBX: 0000000000000063 RCX: 0000000000000000
> [ 33.196691] RDX: 0000000000000000 RSI: ffffffff8110145a RDI: 0000000000000063
> [ 33.196691] RBP: ffffffff82271400 R08: 0000000000000183 R09: 000000000002e771
> [ 33.196691] R10: 0000000000000000 R11: 0000000000000000 R12: 000000000000000a
> [ 33.196691] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
> [ 33.196691] FS: 00007ffff7fdb700(0000) GS:ffff88007ed80000(0000) knlGS:0000000000000000
> [ 33.196691] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 33.196691] CR2: 0000000000000000 CR3: 000000007aa9e000 CR4: 00000000000406e0
> [ 33.196691] Call Trace:
> [ 33.196691] __handle_sysrq+0x9e/0x160
> [ 33.196691] write_sysrq_trigger+0x2b/0x30
> [ 33.196691] proc_reg_write+0x38/0x70
> [ 33.196691] __vfs_write+0x36/0x160
> [ 33.196691] ? __fd_install+0x69/0x110
> [ 33.196691] ? preempt_count_add+0x74/0xb0
> [ 33.196691] ? _raw_spin_lock+0x13/0x30
> [ 33.196691] ? set_close_on_exec+0x41/0x80
> [ 33.196691] ? preempt_count_sub+0xa8/0x100
> [ 33.196691] vfs_write+0xc0/0x190
> [ 33.196691] SyS_write+0x64/0xe0
> [ 33.196691] ? trace_hardirqs_off_thunk+0x1a/0x1c
> [ 33.196691] do_syscall_64+0x70/0x130
> [ 33.196691] entry_SYSCALL_64_after_hwframe+0x42/0xb7
> [ 33.196691] RIP: 0033:0x7ffff74b9620
> [ 33.196691] RSP: 002b:00007fffffffe7a8 EFLAGS: 00000246
> [ 33.196691] Code: ff 73 01 c3 48 8b 0d 68 98 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d bd f1 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ce 8f 01 00 48 89 04
> [ 33.196691] ORIG_RAX: 0000000000000001
> [ 33.196691] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007ffff74b9620
> [ 33.196691] RDX: 0000000000000002 RSI: 0000000000705408 RDI: 0000000000000001
> [ 33.196691] RBP: 0000000000705408 R08: 000000000000000a R09: 00007ffff7fdb700
> [ 33.196691] R10: 00007ffff77826a0 R11: 0000000000000246 R12: 00007ffff77842a0
> [ 33.196691] R13: 0000000000000002 R14: 0000000000000001 R15: 0000000000000000
> [ 33.196691] Code: eb d1 e8 5d 17 b7 ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 e8 c6 24 bd ff c7 05 24 a2 19 01 01 00 00 00 0f ae f8 <c6> 04 25 00 00 00 00 01 c3 0f 1f 44 00 00 e8 86 21 c2 ff fb e9
> [ 33.196691] RIP: sysrq_handle_crash+0x17/0x20 RSP: ffffc9000054bdf0
> [ 33.196691] CR2: 0000000000000000
> [ 33.301033] ---[ end trace b97275941de8c6f4 ]---
> [ 33.302600] Kernel panic - not syncing: Fatal exception
> [ 33.304529] Kernel Offset: disabled
> [ 33.304973] ---[ end Kernel panic - not syncing: Fatal exception

That looks really useful!

Thanks,

Ingo

2018-02-21 19:12:53

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 0/5] x86/dumpstack: Cleanups and user opcode bytes Code: section

On Wed, Feb 21, 2018 at 10:15:53AM +0100, Ingo Molnar wrote:
> That looks really useful!

Ok, lemme run it by Linus too as he probably stares at this part of the
output a *lot* :-)

Combined patch at the end. I'll split it later.

@Linus: also, pls have a look at
https://lkml.kernel.org/r/[email protected]

I've added Code: section to user faults too because there might be
some usefulness in seeing the user opcode bytes when it faults. Some
arguments in the commit message there.

Anyway, here's a 64-bit splat. I'm basically dumping opcode bytes
everytime we dump RIP.

[ 18.304872] sysrq: SysRq : Trigger a crash
[ 18.306961] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 18.310702] IP: sysrq_handle_crash+0x17/0x20
[ 18.312830] PGD 7a972067 P4D 7a972067 PUD 7a351067 PMD 0
[ 18.316431] Oops: 0002 [#1] PREEMPT SMP
[ 18.317219] Modules linked in:
[ 18.317865] CPU: 6 PID: 3681 Comm: bash Not tainted 4.16.0-rc1+ #14
[ 18.319237] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[ 18.321162] RIP: 0010:sysrq_handle_crash+0x17/0x20
[ 18.322273] RSP: 0018:ffffc90000c23df0 EFLAGS: 00010246
[ 18.322274] Code: eb d1 e8 fd ca b6 ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 e8 f6 de bc ff c7 05 84 0d 1a 01 01 00 00 00 0f ae f8 <c6> 04 25 00 00 00 00 01 c3 0f 1f 44 00 00 e8 86 df c1 ff fb 66
[ 18.331362] RAX: 0000000000000000 RBX: 0000000000000063 RCX: 0000000000000000
[ 18.332660] RDX: 0000000000000000 RSI: ffffffff81103293 RDI: 0000000000000063
[ 18.333931] RBP: ffffffff82271880 R08: 00000000000001a4 R09: 000000000004a6e8
[ 18.335209] R10: 0000000000000000 R11: 0000000000000000 R12: 000000000000000a
[ 18.336874] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 18.338732] FS: 00007ffff7fdb700(0000) GS:ffff88007ed80000(0000) knlGS:0000000000000000
[ 18.344827] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 18.346395] CR2: 0000000000000000 CR3: 000000007c39e000 CR4: 00000000000406e0
[ 18.348205] Call Trace:
[ 18.348905] __handle_sysrq+0x9e/0x160
[ 18.349835] write_sysrq_trigger+0x2b/0x30
[ 18.350827] proc_reg_write+0x38/0x70
[ 18.351747] __vfs_write+0x36/0x160
[ 18.356573] ? __fd_install+0x69/0x110
[ 18.357516] ? preempt_count_add+0x74/0xb0
[ 18.358509] ? _raw_spin_lock+0x13/0x30
[ 18.359454] ? set_close_on_exec+0x41/0x80
[ 18.360468] ? preempt_count_sub+0xa8/0x100
[ 18.361476] vfs_write+0xc0/0x190
[ 18.362327] SyS_write+0x64/0xe0
[ 18.363162] ? trace_hardirqs_off_thunk+0x1a/0x1c
[ 18.364269] do_syscall_64+0x76/0x140
[ 18.368969] entry_SYSCALL_64_after_hwframe+0x42/0xb7
[ 18.370390] RIP: 0033:0x7ffff74b9620
[ 18.371479] RSP: 002b:00007fffffffe7a8 EFLAGS: 00000246
[ 18.371481] Code: ff 73 01 c3 48 8b 0d 68 98 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d bd f1 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ce 8f 01 00 48 89 04
[ 18.377743] ORIG_RAX: 0000000000000001
[ 18.381942] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007ffff74b9620
[ 18.383212] RDX: 0000000000000002 RSI: 0000000000705408 RDI: 0000000000000001
[ 18.384490] RBP: 0000000000705408 R08: 000000000000000a R09: 00007ffff7fdb700
[ 18.385759] R10: 00007ffff77826a0 R11: 0000000000000246 R12: 00007ffff77842a0
[ 18.387024] R13: 0000000000000002 R14: 0000000000000001 R15: 0000000000000000
[ 18.388437] RIP: 0010:sysrq_handle_crash+0x17/0x20
[ 18.389550] RSP: 0018:ffffc90000c23df0 EFLAGS: 00010246
[ 18.389550] Code: eb d1 e8 fd ca b6 ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 e8 f6 de bc ff c7 05 84 0d 1a 01 01 00 00 00 0f ae f8 <c6> 04 25 00 00 00 00 01 c3 0f 1f 44 00 00 e8 86 df c1 ff fb 66
[ 18.398707] CR2: 0000000000000000
[ 18.399864] ---[ end trace e17dc9a4aa5cc4d9 ]---
[ 18.401351] Kernel panic - not syncing: Fatal exception
[ 18.401678] Kernel Offset: disabled
[ 18.408342] ---[ end Kernel panic - not syncing: Fatal exception

32-bit, respectively:

[ 20.992127] sysrq: SysRq : Trigger a crash
[ 20.994364] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 20.997892] IP: sysrq_handle_crash+0x1d/0x30
[ 20.999807] *pde = 00000000
[ 21.000512] Oops: 0002 [#1] PREEMPT SMP
[ 21.007170] Modules linked in:
[ 21.008299] CPU: 4 PID: 2079 Comm: bash Not tainted 4.16.0-rc1+ #18
[ 21.022652] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[ 21.024480] EIP: sysrq_handle_crash+0x1d/0x30 SS:ESP: 0068:f3f21e8c
[ 21.026929] Code: bf ff eb d6 e8 35 eb b9 ff 90 8d 74 26 00 0f 1f 44 00 00 55 89 e5 e8 d3 dd bf ff c7 05 b4 ba c3 c1 01 00 00 00 0f ae f8 0f 1f 00 <c6> 05 00 00 00 00 01 5d c3 8d 76 00 8d bc 27 00 00 00 00 0f 1f
[ 21.037904] EAX: 00000000 EBX: 0000000a ECX: 00000000 EDX: c1505fe0
[ 21.039740] ESI: 00000063 EDI: 00000000 EBP: f3f21e8c ESP: f3f21e8c
[ 21.041575] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
[ 21.043233] CR0: 80050033 CR2: 00000000 CR3: 33e6c000 CR4: 000406d0
[ 21.049620] Call Trace:
[ 21.050286] __handle_sysrq+0x93/0x130
[ 21.051167] ? sysrq_filter+0x3c0/0x3c0
[ 21.052053] write_sysrq_trigger+0x27/0x40
[ 21.052977] proc_reg_write+0x4d/0x80
[ 21.053939] ? proc_reg_poll+0x70/0x70
[ 21.055209] __vfs_write+0x38/0x160
[ 21.056433] ? preempt_count_sub+0xa0/0x110
[ 21.057805] ? __fd_install+0x51/0xd0
[ 21.063481] ? __sb_start_write+0x4c/0xc0
[ 21.064811] ? preempt_count_sub+0xa0/0x110
[ 21.066193] vfs_write+0x98/0x180
[ 21.067371] SyS_write+0x4f/0xb0
[ 21.068592] do_fast_syscall_32+0x9e/0x210
[ 21.069946] entry_SYSENTER_32+0x53/0x86
[ 21.071257] EIP: 0xb7fccce9 SS:ESP: 007b:bfa15fc0
[ 21.072733] Code: 55 08 8b 80 64 cd ff ff 85 d2 74 02 89 02 5d c3 8b 04 24 c3 8b 0c 24 c3 8b 1c 24 c3 8b 3c 24 c3 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d 76 00 58 b8 77 00 00 00 cd 80 90 8d
[ 21.081472] EAX: ffffffda EBX: 00000001 ECX: 081e8a08 EDX: 00000002
[ 21.081474] ESI: 00000002 EDI: b7f97d80 EBP: 081e8a08 ESP: bfa15fc0
[ 21.081476] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b
[ 21.081482] EIP: sysrq_handle_crash+0x1d/0x30 SS:ESP: 0068:f3f21e8c
[ 21.081483] Code: bf ff eb d6 e8 35 eb b9 ff 90 8d 74 26 00 0f 1f 44 00 00 55 89 e5 e8 d3 dd bf ff c7 05 b4 ba c3 c1 01 00 00 00 0f ae f8 0f 1f 00 <c6> 05 00 00 00 00 01 5d c3 8d 76 00 8d bc 27 00 00 00 00 0f 1f
[ 21.104318] CR2: 0000000000000000
[ 21.105155] ---[ end trace 7a104a17e091751a ]---
[ 21.106160] Kernel panic - not syncing: Fatal exception
[ 21.106479] Kernel Offset: disabled
[ 21.108329] ---[ end Kernel panic - not syncing: Fatal exception

Combined diff ontop:

---
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 0630eeb18bbc..b6dc698f992a 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -112,4 +112,5 @@ static inline unsigned long caller_frame_pointer(void)
}

void show_opcodes(u8 *rip, const char *loglvl);
+void show_ip(struct pt_regs *regs, const char *loglvl);
#endif /* _ASM_X86_STACKTRACE_H */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 3d595e4cf280..b60232d9969b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -73,14 +73,56 @@ static void printk_stack_address(unsigned long address, int reliable,
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}

-void show_opcodes(u8 *rip, const char *loglvl);
+void show_opcodes(u8 *rip, const char *loglvl)
+{
+ unsigned int code_prologue = code_bytes * 43 / OPCODE_BUFSIZE;
+ u8 *ip;
+ int i;

-void show_iret_regs(struct pt_regs *regs)
+ printk("%sCode: ", loglvl);
+
+ ip = (u8 *)rip - code_prologue;
+ if (probe_kernel_read(opcodes, ip, code_bytes)) {
+ pr_cont(" Bad RIP value.\n");
+ return;
+ }
+
+ for (i = 0; i < code_bytes; i++, ip++) {
+ if (ip == (u8 *)rip)
+ pr_cont("<%02x> ", opcodes[i]);
+ else
+ pr_cont("%02x ", opcodes[i]);
+ }
+ pr_cont("\n");
+}
+
+void show_ip(struct pt_regs *regs, const char *loglvl)
{
- printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
- printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
+#ifdef CONFIG_X86_32
+ unsigned short ss;
+ unsigned long sp;
+
+ if (user_mode(regs)) {
+ sp = regs->sp;
+ ss = regs->ss;
+ } else {
+ sp = kernel_stack_pointer(regs);
+ savesegment(ss, ss);
+ }
+
+ printk("%sEIP: %pS SS:ESP: %04x:%08lx\n", loglvl, (void *)regs->ip, ss, sp);
+#else
+ printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip);
+ printk("%sRSP: %04x:%016lx EFLAGS: %08lx", loglvl, (int)regs->ss,
regs->sp, regs->flags);
- show_opcodes((u8 *)regs->ip, KERN_DEFAULT);
+#endif
+
+ show_opcodes((u8 *)regs->ip, loglvl);
+}
+
+void show_iret_regs(struct pt_regs *regs)
+{
+ show_ip(regs, KERN_DEFAULT);
}

static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
@@ -312,10 +354,6 @@ NOKPROBE_SYMBOL(oops_end);

int __die(const char *str, struct pt_regs *regs, long err)
{
-#ifdef CONFIG_X86_32
- unsigned short ss;
- unsigned long sp;
-#endif
printk(KERN_DEFAULT
"%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
@@ -331,20 +369,10 @@ int __die(const char *str, struct pt_regs *regs, long err)

print_modules();
show_regs(regs);
-#ifdef CONFIG_X86_32
- if (user_mode(regs)) {
- sp = regs->sp;
- ss = regs->ss;
- } else {
- sp = kernel_stack_pointer(regs);
- savesegment(ss, ss);
- }
- printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n",
- (void *)regs->ip, ss, sp);
-#else
+
/* Executive summary in case the oops scrolled away */
- printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp);
-#endif
+ show_ip(regs, KERN_EMERG);
+
return 0;
}
NOKPROBE_SYMBOL(__die);
@@ -391,29 +419,6 @@ static int __init code_bytes_setup(char *s)
}
__setup("code_bytes=", code_bytes_setup);

-void show_opcodes(u8 *rip, const char *loglvl)
-{
- unsigned int code_prologue = code_bytes * 43 / OPCODE_BUFSIZE;
- u8 *ip;
- int i;
-
- printk("%sCode: ", loglvl);
-
- ip = (u8 *)rip - code_prologue;
- if (probe_kernel_read(opcodes, ip, code_bytes)) {
- pr_cont(" Bad RIP value.\n");
- return;
- }
-
- for (i = 0; i < code_bytes; i++, ip++) {
- if (ip == (u8 *)rip)
- pr_cont("<%02x> ", opcodes[i]);
- else
- pr_cont("%02x ", opcodes[i]);
- }
- pr_cont("\n");
-}
-
void show_regs(struct pt_regs *regs)
{
bool all = true;
@@ -434,7 +439,5 @@ void show_regs(struct pt_regs *regs)

if (regs->ip < PAGE_OFFSET)
pr_cont(" Bad RIP value.\n");
- else
- show_opcodes((u8 *)regs->ip, KERN_DEFAULT);
}
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 5224c6099184..3d1f1226b972 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -76,9 +76,7 @@ void __show_regs(struct pt_regs *regs, int all)
savesegment(gs, gs);
}

- printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
- printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
- raw_smp_processor_id());
+ show_ip(regs, KERN_DEFAULT);

printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);


--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2018-02-21 21:42:06

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH 0/5] x86/dumpstack: Cleanups and user opcode bytes Code: section

On Wed, Feb 21, 2018 at 9:54 AM, Borislav Petkov <[email protected]> wrote:
>
> Ok, lemme run it by Linus too as he probably stares at this part of the
> output a *lot* :-)

Less than I used to, since there are others who are pretty good at
decoding them, but yes...

> Anyway, here's a 64-bit splat. I'm basically dumping opcode bytes
> everytime we dump RIP.

So I think that's probably a good idea, but the way it ends up being
repeated is really confusing. Particularly when you now do it for the
user code too - which might be occasionally useful, but your example
also shows how there are now _three_ code lines due to the duplication
at the end (for "maybe the first one scrolled off the screen" and the
user code).

And the "executive summary" used to be a dense one-liner (to not
scroll the non-summary away), now it's generally four lines on the
screen (you also made the RIP/RSP be now two lines, and the Code line
is usually two lines because it's so long).

So my gut feel is that yes, this is moving in the right direction, but
I also really think that it now makes the normal oops too big to fit
on a screen.

That matters, because we still end up having the "take a picture of
the oops" issue for hard hangs etc that don't survive in the logs.

Do I know what the right answer is? No. But I suspect at the very
least we would want to get rid of the RSP line from show_ip(), and
make that part of the regular regs. That would make the summary one
line less.

Hmm. The Code: line actually ends up being *three* lines on the
default 80x25 screen, so in your 64-bit example, you actually get
something like this:

? preempt_count_sub+0xa8/0x100
vfs_write+0xc0/0x190
SyS_write+0x64/0xe0
? trace_hardirqs_off_thunk+0x1a/0x1c
do_syscall_64+0x76/0x140
entry_SYSCALL_64_after_hwframe+0x42/0xb7
RIP: 0033:0x7ffff74b9620
RSP: 002b:00007fffffffe7a8 EFLAGS: 00000246
Code: ff 73 01 c3 48 8b 0d 68 98 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66
0f 1f 44 00 00 83 3d bd f1 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d
01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ce 8f 01 00 48 89 04
ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007ffff74b9620
RDX: 0000000000000002 RSI: 0000000000705408 RDI: 0000000000000001
RBP: 0000000000705408 R08: 000000000000000a R09: 00007ffff7fdb700
R10: 00007ffff77826a0 R11: 0000000000000246 R12: 00007ffff77842a0
R13: 0000000000000002 R14: 0000000000000001 R15: 0000000000000000
RIP: 0010:sysrq_handle_crash+0x17/0x20
RSP: 0018:ffffc90000c23df0 EFLAGS: 00010246
Code: eb d1 e8 fd ca b6 ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f
44 00 00 e8 f6 de bc ff c7 05 84 0d 1a 01 01 00 00 00 0f ae f8 <c6> 04
25 00 00 00 00 01 c3 0f 1f 44 00 00 e8 86 df c1 ff fb 66
CR2: 0000000000000000
---[ end trace e17dc9a4aa5cc4d9 ]---
Kernel panic - not syncing: Fatal exception

showing with a hung kernel. And most of the above is actually
completely useless. Those are the *usermode* registers it shows, not
the kernel registers at the time of the crash (the final rip/rsp/code
lines are for the actual kernel crash, but I'm talking about the
register dump above it).

So notice how most of the *useful* data has actually scrolled off the
screen and is all gone because the machine is hung. Instead, we've
added stuff that doesn't help at all, usually.

It's not just that last patch, obviously. The big hunk o fuser
register dumping is actually from Josh's trace improvements. But the
above really is a great example of how we have made oopses *harder* to
read by trying to add more data. They have gotten messier, but they
have also gotten so verbose that the *good* stuff has all scrolled
away.

So I think we should take a hard look at that "more data is better".
Look at the above 25 lines and tell me - is that actually 25 useful
lines for debugging a crash in sysrq_handle_crash?

No. The only really _useful_ lines above are

RIP: 0010:sysrq_handle_crash+0x17/0x20
RSP: 0018:ffffc90000c23df0 EFLAGS: 00010246
Code: eb d1 e8 fd ca b6 ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f
44 00 00 e8 f6 de bc ff c7 05 84 0d 1a 01 01 00 00 00 0f ae f8 <c6> 04
25 00 00 00 00 01 c3 0f 1f 44 00 00 e8 86 df c1 ff fb 66
CR2: 0000000000000000

which are actually about the crash. The rest is almost entirely useless.

Do I know what the corrent answer is? No.

Linus

2018-02-22 09:24:39

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 0/5] x86/dumpstack: Cleanups and user opcode bytes Code: section

On Wed, Feb 21, 2018 at 01:39:52PM -0800, Linus Torvalds wrote:
> showing with a hung kernel. And most of the above is actually
> completely useless. Those are the *usermode* registers it shows, not
> the kernel registers at the time of the crash (the final rip/rsp/code
> lines are for the actual kernel crash, but I'm talking about the
> register dump above it).
>
> So notice how most of the *useful* data has actually scrolled off the
> screen and is all gone because the machine is hung. Instead, we've
> added stuff that doesn't help at all, usually.
>
> It's not just that last patch, obviously. The big hunk o fuser
> register dumping is actually from Josh's trace improvements. But the
> above really is a great example of how we have made oopses *harder* to
> read by trying to add more data. They have gotten messier, but they
> have also gotten so verbose that the *good* stuff has all scrolled
> away.
>
> So I think we should take a hard look at that "more data is better".
> Look at the above 25 lines and tell me - is that actually 25 useful
> lines for debugging a crash in sysrq_handle_crash?

So being one to only use machines that have a serial line this does not
really affect me; but it would appear to me that it might make sense to
try and reverse the entire dump.

That is 'obviously' going to be rather tricky, because we'll have to
print in the direct reverse direction we discover the data and the only
way to do that is with extra buffers, which adds extra complexity to
something we want absolutely robust.

But a simply line based reverse of the output would get us the most
useful data last, just what we want.

2018-02-22 18:43:58

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH 0/5] x86/dumpstack: Cleanups and user opcode bytes Code: section

On Thu, Feb 22, 2018 at 1:23 AM, Peter Zijlstra <[email protected]> wrote:
>
> So being one to only use machines that have a serial line this does not
> really affect me; but it would appear to me that it might make sense to
> try and reverse the entire dump.

In theory yes. But while you mention the buffering problem, the
*bigger* problem is that sometimes things go south half-way through.
We've definitely had cases where we only get the first few lines of
the oops, because the oops machinery itself has issues (ie takes a
fault in the middle because of random corruption issues or the stack
tracing acting up or whatever).

So while "most important last" is good for it not scrolling off the
screen, it's really bad for the "oops itself has problems" case.

I'd rather people try very hard to make the oops messages dense and
relevant, and not have information that isn't really useful in
practice. We already got rid of the stack content dumping for that
reason - it was useful 20+ years ago, but it had become more of a
detriment than a real help.

So I don't want people to think "this _might_ be useful, let's print
it out". It really should be "this is _critically_ useful, it's worth
printing out even if we're limited to a 80x25 screen".

Honestly, I think one option is to just mark parts for "printing" and
other parts for "logging". I think we may raise the loglevel a bit
_too_ much when an oops happens in "console_verbose()".

So what we could perhaps do is:

- make console_verbose() actually reset things to at least LOGLEVEL_DEBUG

- make sure the *default* loglevel be LOGLEVEL_WARNING

- now you can use pr_debug() in the oops code to print messages to
the log, but they won't be printed to the screen.

And people who really want everything can still set a loglevel that is
much higher, because "console_verbose" would only do that "at least"
thing.

That would seem like the best of both worlds, no?

Linus

2018-02-23 15:23:23

by Josh Poimboeuf

[permalink] [raw]
Subject: Re: [PATCH 0/5] x86/dumpstack: Cleanups and user opcode bytes Code: section

On Thu, Feb 22, 2018 at 10:42:52AM -0800, Linus Torvalds wrote:
> So what we could perhaps do is:
>
> - make console_verbose() actually reset things to at least LOGLEVEL_DEBUG
>
> - make sure the *default* loglevel be LOGLEVEL_WARNING
>
> - now you can use pr_debug() in the oops code to print messages to
> the log, but they won't be printed to the screen.
>
> And people who really want everything can still set a loglevel that is
> much higher, because "console_verbose" would only do that "at least"
> thing.
>
> That would seem like the best of both worlds, no?

Maybe.

Broadly speaking, I think our goal should be, in the worst case, to try
to ensure that the essential data is captured.

But the definitions of "worst case" and "essential data" can vary a lot,
depending on both the user's setup and the nature of the bug. We're not
going to be able to get it right 100% of the time.

You're assuming the worst case of

"an 80x25 screen is the only interface to the console".

But there's another worst case of

"we had unlimited serial port logging, but didn't dump enough data".

With your proposal, the latter might instead become:

"we had unlimited serial port logging, but didn't dump enough data
because the default loglevel was too low."

I did a little analysis of panics reported on lkml via .jpg files
(either attached or in a URL). In the last two years I only found 11
such reports. (And only two of them were 25x80, the rest were at least
47 rows.)

On the other hand, I found a *ton* of panics which were copy/pasted. It
was way too many to count, but a rough guess is about one per day.

So ~1.5% of bugs are reported via cell phone camera (with only about
5-10% of *those* on a tiny 25x80 screen, with the rest having at least
47 rows).

It's not very scientific, but it gives a general idea, I think. The
cell phone camera thing has become a pretty rare way to report bugs, and
with the proliferation of virtualization and automated testing I would
expect that trend to continue.

So my worry with your proposal is that many (or most?) people won't
change their default log level to DEBUG, and then all these nice
additional bits of data we're adding won't ever get printed, making
debug harder for the ~98.5% case of sane serial port logging.

But then again, I don't have any better ideas...

--
Josh

2018-02-23 20:14:26

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/5] x86/dumpstack: Cleanups and user opcode bytes Code: section

Josh Poimboeuf <[email protected]> writes:

> On Thu, Feb 22, 2018 at 10:42:52AM -0800, Linus Torvalds wrote:
>> So what we could perhaps do is:
>>
>> - make console_verbose() actually reset things to at least LOGLEVEL_DEBUG
>>
>> - make sure the *default* loglevel be LOGLEVEL_WARNING
>>
>> - now you can use pr_debug() in the oops code to print messages to
>> the log, but they won't be printed to the screen.
>>
>> And people who really want everything can still set a loglevel that is
>> much higher, because "console_verbose" would only do that "at least"
>> thing.
>>
>> That would seem like the best of both worlds, no?
>
> Maybe.
>
> Broadly speaking, I think our goal should be, in the worst case, to try
> to ensure that the essential data is captured.
>
> But the definitions of "worst case" and "essential data" can vary a lot,
> depending on both the user's setup and the nature of the bug. We're not
> going to be able to get it right 100% of the time.
>
> You're assuming the worst case of
>
> "an 80x25 screen is the only interface to the console".
>
> But there's another worst case of
>
> "we had unlimited serial port logging, but didn't dump enough data".
>
> With your proposal, the latter might instead become:
>
> "we had unlimited serial port logging, but didn't dump enough data
> because the default loglevel was too low."
>
> I did a little analysis of panics reported on lkml via .jpg files
> (either attached or in a URL). In the last two years I only found 11
> such reports. (And only two of them were 25x80, the rest were at least
> 47 rows.)
>
> On the other hand, I found a *ton* of panics which were copy/pasted. It
> was way too many to count, but a rough guess is about one per day.
>
> So ~1.5% of bugs are reported via cell phone camera (with only about
> 5-10% of *those* on a tiny 25x80 screen, with the rest having at least
> 47 rows).
>
> It's not very scientific, but it gives a general idea, I think. The
> cell phone camera thing has become a pretty rare way to report bugs, and
> with the proliferation of virtualization and automated testing I would
> expect that trend to continue.
>
> So my worry with your proposal is that many (or most?) people won't
> change their default log level to DEBUG, and then all these nice
> additional bits of data we're adding won't ever get printed, making
> debug harder for the ~98.5% case of sane serial port logging.
>
> But then again, I don't have any better ideas...

Please also note there are serial ports and there are serials ports.
There are serial ports on virtual machines that don't have a speed.
Then there are serial ports on physical hardware some at 9600 baud,
and in all cases they are slow. So on a physical serial port tersness
is a virtue (unless the machine is completely dead).

Then we have panics and the like that are reported by kdump. Those
should be cut and pastable as well. But require that someone has done
the work to set that up so that is a reliable path.

I know that in working on kexec-on-panic what I have found is the less
code in a critical path you have to run in a b0rken kernel the higher
your chance of that code running successfully. I expect that applies
to the panic printer as much as anything else.

Eric


2018-02-25 11:38:39

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 0/5] x86/dumpstack: Cleanups and user opcode bytes Code: section

On Wed, Feb 21, 2018 at 01:39:52PM -0800, Linus Torvalds wrote:
> which are actually about the crash. The rest is almost entirely useless.
>
> Do I know what the corrent answer is? No.

Ok, I hear ya. I finally have some time to poke at this. So here's a new
splat, see below. Incremental diff at the end:

RSP is part of the registers dump now, after the GPRs.

I've added "EXEC SUMMARY" markers for now, for ease of discussing
this. Will remove them later.

My silly idea is to save the first regs when we enter __die(), i.e.,
die_counter == 0 and dump them in oops_end() as an exec summary.

I guess we can expand that executive summary into a full-fledged
function which dumps everything critical needed to debug an issue.

Lemme read the rest of the thread now.

[ 22.762334] sysrq: SysRq : Trigger a crash
[ 22.763456] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
[ 22.765416] PGD 7b64d067 P4D 7b64d067 PUD 79402067 PMD 0
[ 22.766121] Oops: 0002 [#1] PREEMPT SMP
[ 22.766121] CPU: 0 PID: 3666 Comm: bash Not tainted 4.16.0-rc2+ #20
[ 22.766121] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[ 22.766121] RIP: 0010:sysrq_handle_crash+0x17/0x20
[ 22.766121] Code: eb d1 e8 4d 19 b7 ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 e8 96 27 bd ff c7 05 14 24 19 01 01 00 00 00 0f ae f8 <c6> 04 25 00 00 00 00 01 c3 0f 1f 44 00 00 e8 86 24 c2 ff fb e9
[ 22.766121] RAX: 0000000000000000 RBX: 0000000000000063 RCX: 0000000000000000
[ 22.766121] RDX: 0000000000000000 RSI: ffffffff8110154a RDI: 0000000000000063
[ 22.766121] RBP: ffffffff82271480 R08: 0000000000000185 R09: 00000000000ba1de
[ 22.766121] R10: 0000000000000000 R11: 0000000000000000 R12: 000000000000000a
[ 22.766121] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 22.766121] RSP: 0018:ffffc90000703df0 EFLAGS: 00010246
[ 22.766121] FS: 00007ffff7fdb700(0000) GS:ffff88007ec00000(0000) knlGS:0000000000000000
[ 22.766121] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 22.766121] CR2: 0000000000000000 CR3: 000000007b711000 CR4: 00000000000406f0
[ 22.766121] Call Trace:
[ 22.766121] __handle_sysrq+0x9e/0x160
[ 22.766121] write_sysrq_trigger+0x2b/0x30
[ 22.766121] proc_reg_write+0x38/0x70
[ 22.766121] __vfs_write+0x36/0x160
[ 22.766121] ? __fd_install+0x69/0x110
[ 22.766121] ? preempt_count_add+0x74/0xb0
[ 22.766121] ? _raw_spin_lock+0x13/0x30
[ 22.766121] ? set_close_on_exec+0x41/0x80
[ 22.766121] ? preempt_count_sub+0xa8/0x100
[ 22.766121] vfs_write+0xc0/0x190
[ 22.766121] SyS_write+0x64/0xe0
[ 22.766121] ? trace_hardirqs_off_thunk+0x1a/0x1c
[ 22.766121] do_syscall_64+0x70/0x130
[ 22.766121] entry_SYSCALL_64_after_hwframe+0x42/0xb7
[ 22.766121] RIP: 0033:0x7ffff74b9620
[ 22.766121] Code: ff 73 01 c3 48 8b 0d 68 98 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d bd f1 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ce 8f 01 00 48 89 04
[ 22.766121] ORIG_RAX: 0000000000000001
[ 22.766121] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007ffff74b9620
[ 22.766121] RDX: 0000000000000002 RSI: 0000000000705408 RDI: 0000000000000001
[ 22.766121] RBP: 0000000000705408 R08: 000000000000000a R09: 00007ffff7fdb700
[ 22.766121] R10: 00007fffffffe490 R11: 0000000000000246 R12: 00007ffff77842a0
[ 22.766121] R13: 0000000000000002 R14: 0000000000000001 R15: 0000000000000000
[ 22.766121] RSP: 002b:00007fffffffe638 EFLAGS: 00000246
[ 22.766121] Modules linked in:
[ 22.766121] CR2: 0000000000000000
[ 22.817404] ---[ end trace 374137bfd9ca49cc ]---
[ 22.818727] <EXEC SUMMARY>:
[ 22.819608] RIP: 0010:sysrq_handle_crash+0x17/0x20
[ 22.820906] Code: eb d1 e8 4d 19 b7 ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 e8 96 27 bd ff c7 05 14 24 19 01 01 00 00 00 0f ae f8 <c6> 04 25 00 00 00 00 01 c3 0f 1f 44 00 00 e8 86 24 c2 ff fb e9
[ 22.824896] RAX: 0000000000000000 RBX: 0000000000000063 RCX: 0000000000000000
[ 22.826208] RDX: 0000000000000000 RSI: ffffffff8110154a RDI: 0000000000000063
[ 22.827506] RBP: ffffffff82271480 R08: 0000000000000185 R09: 00000000000ba1de
[ 22.828935] R10: 0000000000000000 R11: 0000000000000000 R12: 000000000000000a
[ 22.830257] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 22.831535] RSP: 0018:ffffc90000703df0 EFLAGS: 00010246
[ 22.831536] </EXEC SUMMARY>:
[ 22.836493] Kernel panic - not syncing: Fatal exception
[ 22.837871] Kernel Offset: disabled
[ 22.838648] ---[ end Kernel panic - not syncing: Fatal exception


---
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 0037bdc9e252..e71319194f6c 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -31,6 +31,8 @@ static u8 __opc[OPCODE_BUFSIZE];
static u8 *opcodes = __opc;
static int die_counter;

+static struct pt_regs exec_summary_regs;
+
bool in_task_stack(unsigned long *stack, struct task_struct *task,
struct stack_info *info)
{
@@ -323,6 +325,11 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
raw_local_irq_restore(flags);
oops_exit();

+ /* Executive summary in case the oops scrolled away */
+ pr_emerg("<EXEC SUMMARY>:\n");
+ __show_regs(&exec_summary_regs, false);
+ pr_emerg("</EXEC SUMMARY>:\n");
+
if (!signr)
return;
if (in_interrupt())
@@ -341,6 +348,13 @@ NOKPROBE_SYMBOL(oops_end);

int __die(const char *str, struct pt_regs *regs, long err)
{
+
+ /*
+ * Save the first regs for the executive summary.
+ */
+ if (!die_counter)
+ exec_summary_regs = *regs;
+
printk(KERN_DEFAULT
"%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
@@ -350,15 +364,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
(boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");

+ show_regs(regs);
+
if (notify_die(DIE_OOPS, str, regs, err,
current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
return 1;

print_modules();
- show_regs(regs);
-
- /* Executive summary in case the oops scrolled away */
- show_ip(regs, KERN_EMERG);

return 0;
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8a1da018f0d5..b3c19f734442 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -699,7 +699,6 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
printk(KERN_CONT "paging request");

printk(KERN_CONT " at %px\n", (void *) address);
- printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);

dump_pagetable(address);
}


--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.