Sometimes a non-interactive dump capture environment silently fails to
store the vmcore due to a hardware or software bug. It is then hard to
understand and fix the reasons for the system failure in both the production
and dump capture environments, and to prevent the failure from recurring.
It would be quite useful to see at least the log buffer contents of the
crashed kernel.
This patchset adds the ability for the dump capture kernel to extract and
print the log buffer and CPU registers from the captured vmcore. The
state of the running kernel is also printed. This action can be triggered
via the NMI button.
Signed-off-by: Vitaly Mayatskikh <[email protected]>
arch/x86/include/asm/elf.h | 46 +++++
arch/x86/include/asm/kdebug.h | 1 +
arch/x86/include/asm/nmi.h | 1 +
arch/x86/kernel/apic/nmi.c | 27 +++
arch/x86/kernel/process_32.c | 22 ++-
arch/x86/kernel/process_64.c | 20 ++-
fs/proc/vmcore.c | 365 +++++++++++++++++++++++++++++++++++++++--
include/linux/sysctl.h | 1 +
kernel/sysctl.c | 7 +
kernel/sysctl_binary.c | 1 +
10 files changed, 463 insertions(+), 28 deletions(-)
__show_regs dumps the contents of pt_regs and does other things besides
that. This patch moves the pt_regs dumping part into a separate function:
__show_main_regs.
Signed-off-by: Vitaly Mayatskikh <[email protected]>
---
arch/x86/include/asm/kdebug.h | 1 +
arch/x86/kernel/process_32.c | 22 ++++++++++++++--------
arch/x86/kernel/process_64.c | 20 ++++++++++++--------
3 files changed, 27 insertions(+), 16 deletions(-)
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index fa7c0b9..fd346c8 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -30,6 +30,7 @@ extern void show_registers(struct pt_regs *regs);
extern void show_trace(struct task_struct *t, struct pt_regs *regs,
unsigned long *sp, unsigned long bp);
extern void __show_regs(struct pt_regs *regs, int all);
+extern void __show_main_regs(struct pt_regs *regs);
extern void show_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d12878..e67cad7 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -119,10 +119,8 @@ void cpu_idle(void)
}
}
-void __show_regs(struct pt_regs *regs, int all)
+void __show_main_regs(struct pt_regs *regs)
{
- unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
- unsigned long d0, d1, d2, d3, d6, d7;
unsigned long sp;
unsigned short ss, gs;
@@ -135,12 +133,9 @@ void __show_regs(struct pt_regs *regs, int all)
savesegment(ss, ss);
savesegment(gs, gs);
}
+ printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx\n",
+ (u16)regs->cs, regs->ip, regs->flags);
- show_regs_common();
-
- printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
- (u16)regs->cs, regs->ip, regs->flags,
- smp_processor_id());
print_symbol("EIP is at %s\n", regs->ip);
printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
@@ -149,6 +144,17 @@ void __show_regs(struct pt_regs *regs, int all)
regs->si, regs->di, regs->bp, sp);
printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
(u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
+}
+
+void __show_regs(struct pt_regs *regs, int all)
+{
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
+ unsigned long d0, d1, d2, d3, d6, d7;
+
+
+ show_regs_common();
+
+ __show_main_regs(regs);
if (!all)
return;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3c2422a..00a6614 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -151,15 +151,8 @@ void cpu_idle(void)
}
}
-/* Prints also some state that isn't saved in the pt_regs */
-void __show_regs(struct pt_regs *regs, int all)
+void __show_main_regs(struct pt_regs *regs)
{
- unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
- unsigned long d0, d1, d2, d3, d6, d7;
- unsigned int fsindex, gsindex;
- unsigned int ds, cs, es;
-
- show_regs_common();
printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
printk_address(regs->ip, 1);
printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
@@ -174,7 +167,18 @@ void __show_regs(struct pt_regs *regs, int all)
regs->r10, regs->r11, regs->r12);
printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
regs->r13, regs->r14, regs->r15);
+}
+/* Prints also some state that isn't saved in the pt_regs */
+void __show_regs(struct pt_regs *regs, int all)
+{
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
+ unsigned long d0, d1, d2, d3, d6, d7;
+ unsigned int fsindex, gsindex;
+ unsigned int ds, cs, es;
+
+ show_regs_common();
+ __show_main_regs(regs);
asm("movl %%ds,%0" : "=r" (ds));
asm("movl %%cs,%0" : "=r" (cs));
asm("movl %%es,%0" : "=r" (es));
--
1.7.1
This patch adds dump_old_log(), a function that digs up and prints the
kernel log and CPU registers from the captured vmcore.
Signed-off-by: Vitaly Mayatskikh <[email protected]>
---
fs/proc/vmcore.c | 365 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 353 insertions(+), 12 deletions(-)
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 91c817f..a2fc826 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -20,6 +20,7 @@
#include <linux/list.h>
#include <asm/uaccess.h>
#include <asm/io.h>
+#include <asm/kdebug.h>
/* List representing chunks of contiguous memory areas and their offsets in
* vmcore file.
@@ -35,6 +36,14 @@ static u64 vmcore_size;
static struct proc_dir_entry *proc_vmcore = NULL;
+static int ei_class;
+
+static char *old_log_buf;
+static int old_log_len;
+static char print_log_buf[PAGE_SIZE];
+static struct pt_regs old_kernel_regs[NR_CPUS];
+static int old_kernel_nr_cpus;
+
/* Reads a page from the oldmem device from given offset. */
static ssize_t read_from_oldmem(char *buf, size_t count,
u64 *ppos, int userbuf)
@@ -69,7 +78,7 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
return read;
}
-/* Maps vmcore file offset to respective physical address in memroy. */
+/* Maps vmcore file offset to respective physical address in memory. */
static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
struct vmcore **m_ptr)
{
@@ -90,11 +99,8 @@ static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
return 0;
}
-/* Read from the ELF header and then the crash dump. On error, negative value is
- * returned otherwise number of bytes read are returned.
- */
-static ssize_t read_vmcore(struct file *file, char __user *buffer,
- size_t buflen, loff_t *fpos)
+static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
+ int user)
{
ssize_t acc = 0, tmp;
size_t tsz;
@@ -113,8 +119,12 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
tsz = elfcorebuf_sz - *fpos;
if (buflen < tsz)
tsz = buflen;
- if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
- return -EFAULT;
+ if (user) {
+ if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
+ return -EFAULT;
+ } else
+ memcpy(buffer, elfcorebuf + *fpos, tsz);
+
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
@@ -137,7 +147,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
tsz = nr_bytes;
while (buflen) {
- tmp = read_from_oldmem(buffer, tsz, &start, 1);
+ tmp = read_from_oldmem(buffer, tsz, &start, user);
if (tmp < 0)
return tmp;
buflen -= tsz;
@@ -161,6 +171,15 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
return acc;
}
+/* Read from the ELF header and then the crash dump. On error, negative value is
+ * returned otherwise number of bytes read are returned.
+ */
+static ssize_t read_vmcore(struct file *file, char __user *buffer,
+ size_t buflen, loff_t *fpos)
+{
+ return __read_vmcore(buffer, buflen, fpos, 1);
+}
+
static const struct file_operations proc_vmcore_operations = {
.read = read_vmcore,
.llseek = generic_file_llseek,
@@ -610,15 +629,15 @@ static int __init parse_crash_elf_headers(void)
" not found\n");
return -EINVAL;
}
-
- if (e_ident[EI_CLASS] == ELFCLASS64) {
+ ei_class = e_ident[EI_CLASS];
+ if (ei_class == ELFCLASS64) {
rc = parse_crash_elf64_headers();
if (rc)
return rc;
/* Determine vmcore size. */
vmcore_size = get_vmcore_size_elf64(elfcorebuf);
- } else if (e_ident[EI_CLASS] == ELFCLASS32) {
+ } else if (ei_class == ELFCLASS32) {
rc = parse_crash_elf32_headers();
if (rc)
return rc;
@@ -633,6 +652,283 @@ static int __init parse_crash_elf_headers(void)
return 0;
}
+/* Search symbol in buffer and read value. */
+static long read_symbol(char *buffer, long buffer_sz, char *symbol)
+{
+ char _symbol[64];
+ char *ptr, *end;
+ unsigned long value = 0;
+
+ snprintf(_symbol, sizeof(_symbol), "SYMBOL(%s)=", symbol);
+ ptr = strnstr(buffer, _symbol, buffer_sz);
+
+ if (ptr)
+ value = simple_strtoul(ptr + strlen(_symbol), &end, 16);
+ return value;
+}
+
+/* Find offset for given virtual address in vmcore file. */
+static loff_t map_vaddr_to_offset_elf64(u64 addr)
+{
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf;
+ Elf64_Phdr *phdr = (Elf64_Phdr *)((char*)ehdr + ehdr->e_phoff);
+ int i;
+ loff_t offset = -1;
+
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (phdr->p_type == PT_LOAD &&
+ addr >= phdr->p_vaddr &&
+ addr < phdr->p_vaddr + phdr->p_memsz) {
+ offset = phdr->p_offset + addr - phdr->p_vaddr;
+ break;
+ }
+ }
+ return offset;
+}
+
+static loff_t map_vaddr_to_offset_elf32(u32 addr)
+{
+ Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfcorebuf;
+ Elf32_Phdr *phdr = (Elf32_Phdr *)((char*)ehdr + ehdr->e_phoff);
+ int i;
+ loff_t offset = -1;
+
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (phdr->p_type == PT_LOAD &&
+ addr >= phdr->p_vaddr &&
+ addr < phdr->p_vaddr + phdr->p_memsz) {
+ offset = phdr->p_offset + addr - phdr->p_vaddr;
+ break;
+ }
+ }
+ return offset;
+}
+
+static loff_t map_vaddr_to_offset(u64 addr)
+{
+ if (ei_class == ELFCLASS64)
+ return map_vaddr_to_offset_elf64(addr);
+ else if (ei_class == ELFCLASS32)
+ return map_vaddr_to_offset_elf32(addr);
+ return -1;
+}
+
+/* Read long at given address in old memory. */
+static long read_vmcore_long(u64 addr)
+{
+ loff_t off;
+ long tmp, value = 0;
+ char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ off = map_vaddr_to_offset(addr);
+ if (off < 0)
+ goto fail;
+
+ tmp = __read_vmcore(buf, PAGE_SIZE, &off, 0);
+ if (tmp > 0) {
+ value = *(long *)buf;
+ goto out;
+ }
+fail:
+ kfree(buf);
+out:
+ return value;
+}
+
+/* Find PT_NOTE section in vmcore's elf header. */
+Elf64_Phdr *find_elf64_pt_note(void)
+{
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf;
+ Elf64_Phdr *phdr = (Elf64_Phdr *)((char*)ehdr + ehdr->e_phoff);
+ int i;
+
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (phdr->p_type == PT_NOTE)
+ return phdr;
+ }
+ return 0;
+}
+
+Elf32_Phdr *find_elf32_pt_note(void)
+{
+ Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfcorebuf;
+ Elf32_Phdr *phdr = (Elf32_Phdr *)((char*)ehdr + ehdr->e_phoff);
+ int i;
+
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (phdr->p_type == PT_NOTE)
+ return phdr;
+ }
+ return 0;
+}
+
+static long find_old_log_elf64(int *size)
+{
+ Elf64_Xword i;
+ char *buffer;
+ loff_t start;
+ long old_log = 0, sz;
+ Elf64_Nhdr *note;
+ Elf64_Phdr *phdr;
+
+ phdr = find_elf64_pt_note();
+ if (!phdr)
+ goto fail;
+
+ buffer = kmalloc(phdr->p_memsz, GFP_KERNEL);
+ if (!buffer)
+ goto fail;
+
+ note = (Elf64_Nhdr *)buffer;
+ start = phdr->p_offset;
+ sz = __read_vmcore(buffer, phdr->p_memsz, &start, 0);
+ if (sz < 0)
+ goto fail1;
+
+ for (i = 0; i < phdr->p_memsz; i += sz) {
+ char *ptr;
+
+ if (note->n_namesz == 0)
+ break;
+
+ ptr = (char *)note + sizeof(int) * 3;
+
+#ifdef ELF_CORE_EXTRACT_REGS
+ if (memcmp(ptr, KEXEC_CORE_NOTE_NAME,
+ sizeof(KEXEC_CORE_NOTE_NAME)) == 0) {
+ struct elf_prstatus *prstatus = (struct elf_prstatus *)
+ (ptr + ((note->n_namesz + 3) & ~3));
+ struct pt_regs regs;
+ ELF_CORE_EXTRACT_REGS(prstatus->pr_reg, &regs);
+ old_kernel_regs[old_kernel_nr_cpus++] = regs;
+ }
+#endif
+ if (memcmp(ptr, VMCOREINFO_NOTE_NAME,
+ sizeof(VMCOREINFO_NOTE_NAME)) == 0) {
+ unsigned long symbol;
+ symbol = read_symbol(ptr, note->n_descsz,
+ "log_buf");
+ old_log = read_vmcore_long(symbol);
+ symbol = read_symbol(ptr, note->n_descsz,
+ "logged_chars");
+ *size = (int)read_vmcore_long(symbol);
+ break;
+ }
+ sz = sizeof(Elf64_Nhdr) +
+ ((note->n_namesz + 3) & ~3) +
+ ((note->n_descsz + 3) & ~3);
+ note = (Elf64_Nhdr *)((char*)note + sz);
+ }
+fail1:
+ kfree(buffer);
+fail:
+ return old_log;
+}
+
+static long find_old_log_elf32(int *size)
+{
+ Elf32_Word i;
+ char *buffer;
+ loff_t start;
+ long old_log = 0, sz;
+ Elf32_Nhdr *note;
+ Elf32_Phdr *phdr;
+
+ phdr = find_elf32_pt_note();
+ if (!phdr)
+ goto fail;
+
+ buffer = kmalloc(phdr->p_memsz, GFP_KERNEL);
+ if (!buffer)
+ goto fail;
+
+ note = (Elf32_Nhdr *)buffer;
+ start = phdr->p_offset;
+ sz = __read_vmcore(buffer, phdr->p_memsz, &start, 0);
+ if (sz < 0)
+ goto fail1;
+
+ for (i = 0; i < phdr->p_memsz; i += sz) {
+ char *ptr;
+
+ if (note->n_namesz == 0)
+ break;
+
+ ptr = (char *)note + sizeof(int) * 3;
+
+#ifdef ELF_CORE_EXTRACT_REGS
+ if (memcmp(ptr, KEXEC_CORE_NOTE_NAME,
+ sizeof(KEXEC_CORE_NOTE_NAME)) == 0) {
+ struct elf_prstatus *prstatus = (struct elf_prstatus *)
+ (ptr + ((note->n_namesz + 3) & ~3));
+ struct pt_regs regs;
+ ELF_CORE_EXTRACT_REGS(prstatus->pr_reg, &regs);
+ old_kernel_regs[old_kernel_nr_cpus++] = regs;
+ }
+#endif
+ if (memcmp(ptr, VMCOREINFO_NOTE_NAME,
+ sizeof(VMCOREINFO_NOTE_NAME)) == 0) {
+ unsigned long symbol;
+ symbol = read_symbol(ptr, note->n_descsz,
+ "log_buf");
+ old_log = read_vmcore_long(symbol);
+ symbol = read_symbol(ptr, note->n_descsz,
+ "logged_chars");
+ *size = (int)read_vmcore_long(symbol);
+ break;
+ }
+ sz = sizeof(Elf32_Nhdr) +
+ ((note->n_namesz + 3) & ~3) +
+ ((note->n_descsz + 3) & ~3);
+ note = (Elf32_Nhdr *)((char*)note + sz);
+ }
+fail1:
+ kfree(buffer);
+fail:
+ return old_log;
+}
+
+/* Prepare old log_buf for use */
+static long find_vmcore_old_log(int *size)
+{
+ if (ei_class == ELFCLASS64)
+ return find_old_log_elf64(size);
+ else if (ei_class == ELFCLASS32)
+ return find_old_log_elf32(size);
+ return 0;
+}
+
+static char *read_vmcore_old_log(int *len)
+{
+ loff_t offset;
+ unsigned long old_log_vaddr;
+ char *ptr = 0;
+
+ old_log_vaddr = find_vmcore_old_log(len);
+
+ if (!old_log_vaddr)
+ goto out;
+
+ ptr = vmalloc(*len);
+ if (!ptr)
+ goto out;
+
+ offset = map_vaddr_to_offset(old_log_vaddr);
+ if (!offset)
+ goto fail;
+
+ if (__read_vmcore(ptr, *len, &offset, 0) < 0)
+ goto fail;
+
+ goto out;
+fail:
+ vfree(ptr);
+out:
+ return ptr;
+}
+
/* Init function for vmcore module. */
static int __init vmcore_init(void)
{
@@ -647,9 +943,54 @@ static int __init vmcore_init(void)
return rc;
}
+ old_log_buf = read_vmcore_old_log(&old_log_len);
+ if (!old_log_buf)
+ printk(KERN_WARNING "Kdump: can't read old log\n");
proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
if (proc_vmcore)
proc_vmcore->size = vmcore_size;
return 0;
}
+
+static void print_log(char *ptr, long len)
+{
+ char *p, *ptr1;
+ long len1;
+
+ while (len > 0) {
+ p = print_log_buf;
+ ptr1 = ptr; len1 = len;
+ while (len > 0 && *ptr != 0x0a
+ && p - print_log_buf < PAGE_SIZE) {
+ *p++ = *ptr++;
+ len--;
+ }
+ *p = 0; ptr++; len--;
+ printk(KERN_INFO "%s\n", print_log_buf);
+ }
+}
+
+void dump_old_log(void)
+{
+ int i;
+ console_verbose();
+ bust_spinlocks(1);
+
+#ifdef ELF_CORE_EXTRACT_REGS
+ if (old_kernel_nr_cpus) {
+ printk(KERN_INFO "--- old kernel registers begin here---\n");
+ for (i = 0; i < old_kernel_nr_cpus; i++) {
+ printk(KERN_INFO "CPU#%d:\n", i);
+ __show_main_regs(&old_kernel_regs[i]);
+ }
+ printk(KERN_INFO "--- old kernel registers end here---\n");
+ }
+#endif
+ if (old_log_buf) {
+ printk(KERN_INFO "--- old kernel log begins here ---\n");
+ print_log(old_log_buf, old_log_len);
+ printk(KERN_INFO "--- old kernel log ends here ---\n");
+ }
+}
+
module_init(vmcore_init)
--
1.7.1
Mention unknown_nmi_dump_log in sysctl documentation.
Signed-off-by: Vitaly Mayatskikh <[email protected]>
---
Documentation/sysctl/kernel.txt | 10 ++++++++++
1 files changed, 10 insertions(+), 0 deletions(-)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 3894eaa..b554838 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -65,6 +65,7 @@ show up in /proc/sys/kernel:
- tainted
- threads-max
- unknown_nmi_panic
+- unknown_nmi_dump_log
- version
==============================================================
@@ -515,6 +516,15 @@ If a system hangs up, try pressing the NMI switch.
==============================================================
+unknown_nmi_dump_log:
+
+The value in this file affects how an unknown NMI is handled. When the
+kernel is running in dump capture mode, the value is non-zero and an
+unknown NMI is trapped, the vmcore's log buffer and CPU registers are
+extracted and displayed on the console.
+
+==============================================================
+
panic_on_unrecovered_nmi:
The default Linux behaviour on an NMI of either memory or unknown is to continue
--
1.7.1
This patch introduces a new NMI callback, controlled via the sysctl variable
`unknown_nmi_dump_log'. When the kernel is in dump capture mode and the user
presses the NMI button on the chassis or sends such an event via the control
console, the callback extracts and prints the kernel log and CPU registers
from the captured vmcore, and also prints the state of the running kernel.
Signed-off-by: Vitaly Mayatskikh <[email protected]>
---
arch/x86/include/asm/nmi.h | 1 +
arch/x86/kernel/apic/nmi.c | 27 +++++++++++++++++++++++++++
include/linux/sysctl.h | 1 +
kernel/sysctl.c | 7 +++++++
kernel/sysctl_binary.c | 1 +
5 files changed, 37 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 93da9c3..5ca20cf 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -42,6 +42,7 @@ struct ctl_table;
extern int proc_nmi_enabled(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
extern int unknown_nmi_panic;
+extern int unknown_nmi_dump_log;
void arch_trigger_all_cpu_backtrace(void);
#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 1edaf15..22a2688 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -37,7 +37,10 @@
#include <asm/mach_traps.h>
+#include <linux/crash_dump.h>
+
int unknown_nmi_panic;
+int unknown_nmi_dump_log;
int nmi_watchdog_enabled;
/* For reliability, we're prepared to waste bits here. */
@@ -507,6 +510,28 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
return 0;
}
+static int __init setup_unknown_nmi_dump_log(char *str)
+{
+ unknown_nmi_dump_log = 1;
+ return 1;
+}
+__setup("unknown_nmi_dump_log", setup_unknown_nmi_dump_log);
+
+static int unknown_nmi_dump_log_callback(struct pt_regs *regs, int cpu)
+{
+ printk(KERN_WARNING "NMI received for unknown reason %02x\n",
+ get_nmi_reason());
+#ifdef CONFIG_CRASH_DUMP
+ if (is_kdump_kernel()) {
+ dump_old_log();
+ printk(KERN_INFO "--- kdump kernel state begins here---\n");
+ show_state();
+ printk(KERN_INFO "--- kdump kernel state ends here---\n");
+ }
+#endif
+ return 0;
+}
+
/*
* proc handler for /proc/sys/kernel/nmi
*/
@@ -552,6 +577,8 @@ int do_nmi_callback(struct pt_regs *regs, int cpu)
#ifdef CONFIG_SYSCTL
if (unknown_nmi_panic)
return unknown_nmi_panic_callback(regs, cpu);
+ if (unknown_nmi_dump_log)
+ return unknown_nmi_dump_log_callback(regs, cpu);
#endif
return 0;
}
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 7bb5cb6..7d2489d 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -153,6 +153,7 @@ enum
KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */
KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
+ KERN_UNKNOWN_NMI_DUMP_LOG=77, /* int: unknown nmi dump log flag */
};
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 997080f..03cc3c5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -719,6 +719,13 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "unknown_nmi_dump_log",
+ .data = &unknown_nmi_dump_log,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "nmi_watchdog",
.data = &nmi_watchdog_enabled,
.maxlen = sizeof (int),
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c57..e59d0b1 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -128,6 +128,7 @@ static const struct bin_table bin_kern_table[] = {
{ CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
/* KERN_HZ_TIMER "hz_timer" no longer used */
{ CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
+ { CTL_INT, KERN_UNKNOWN_NMI_DUMP_LOG, "unknown_nmi_dump_log" },
{ CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" },
{ CTL_INT, KERN_RANDOMIZE, "randomize_va_space" },
--
1.7.1
There is a macro, ELF_CORE_COPY_REGS_COMMON, which packs registers into
elf_gregset_t. ELF_CORE_EXTRACT_REGS does the opposite: it unpacks an
elf_gregset_t into pt_regs.
Signed-off-by: Vitaly Mayatskikh <[email protected]>
---
arch/x86/include/asm/elf.h | 46 ++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 46 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f2ad216..5e3decb 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -144,6 +144,27 @@ do { \
savesegment(gs, pr_reg[10]); \
} while (0);
+#define ELF_CORE_EXTRACT_REGS(pr_reg, regs) \
+do { \
+ (regs)->bx = (pr_reg)[0]; \
+ (regs)->cx = (pr_reg)[1]; \
+ (regs)->dx = (pr_reg)[2]; \
+ (regs)->si = (pr_reg)[3]; \
+ (regs)->di = (pr_reg)[4]; \
+ (regs)->bp = (pr_reg)[5]; \
+ (regs)->ax = (pr_reg)[6]; \
+ (regs)->ds = (pr_reg)[7]; \
+ (regs)->es = (pr_reg)[8]; \
+ (regs)->fs = (pr_reg)[9]; \
+ (regs)->gs = (pr_reg)[10]; \
+ (regs)->orig_ax = (pr_reg)[11]; \
+ (regs)->ip = (pr_reg)[12]; \
+ (regs)->cs = (pr_reg)[13]; \
+ (regs)->flags = (pr_reg)[14]; \
+ (regs)->sp = (pr_reg)[15]; \
+ (regs)->ss = (pr_reg)[16]; \
+} while (0);
+
#define ELF_PLATFORM (utsname()->machine)
#define set_personality_64bit() do { } while (0)
@@ -221,6 +242,31 @@ do { \
asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v; \
} while (0);
+#define ELF_CORE_EXTRACT_REGS(pr_reg, regs) \
+do { \
+ (regs)->r15 = (pr_reg)[0]; \
+ (regs)->r14 = (pr_reg)[1]; \
+ (regs)->r13 = (pr_reg)[2]; \
+ (regs)->r12 = (pr_reg)[3]; \
+ (regs)->bp = (pr_reg)[4]; \
+ (regs)->bx = (pr_reg)[5]; \
+ (regs)->r11 = (pr_reg)[6]; \
+ (regs)->r10 = (pr_reg)[7]; \
+ (regs)->r9 = (pr_reg)[8]; \
+ (regs)->r8 = (pr_reg)[9]; \
+ (regs)->ax = (pr_reg)[10]; \
+ (regs)->cx = (pr_reg)[11]; \
+ (regs)->dx = (pr_reg)[12]; \
+ (regs)->si = (pr_reg)[13]; \
+ (regs)->di = (pr_reg)[14]; \
+ (regs)->orig_ax = (pr_reg)[15]; \
+ (regs)->ip = (pr_reg)[16]; \
+ (regs)->cs = (pr_reg)[17]; \
+ (regs)->flags = (pr_reg)[18]; \
+ (regs)->sp = (pr_reg)[19]; \
+ (regs)->ss = (pr_reg)[20]; \
+} while (0);
+
/* I'm not sure if we can use '-' here */
#define ELF_PLATFORM ("x86_64")
extern void set_personality_64bit(void);
--
1.7.1
On Wed, Jun 2, 2010 at 10:39 AM, Vitaly Mayatskikh
<[email protected]> wrote:
> __show_regs dumps content of pt_regs and does other things beside
> it. This patch moves pt_regs dumping part into separate function:
> __show_main_regs.
>
> Signed-off-by: Vitaly Mayatskikh <[email protected]>
[snip]
> +void __show_regs(struct pt_regs *regs, int all)
> +{
> +       unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
> +       unsigned long d0, d1, d2, d3, d6, d7;
> +
> +
Nitpick: extra newline here.
> +       show_regs_common();
> +
> +       __show_main_regs(regs);
>
>        if (!all)
>                return;
At Wed, 2 Jun 2010 12:50:03 +0300, Pekka Enberg wrote:
>
> On Wed, Jun 2, 2010 at 10:39 AM, Vitaly Mayatskikh
> <[email protected]> wrote:
> > __show_regs dumps content of pt_regs and does other things beside
> > it. This patch moves pt_regs dumping part into separate function:
> > __show_main_regs.
> >
> > Signed-off-by: Vitaly Mayatskikh <[email protected]>
>
> [snip]
>
> > +void __show_regs(struct pt_regs *regs, int all)
> > +{
> > + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
> > + unsigned long d0, d1, d2, d3, d6, d7;
> > +
> > +
>
> Nitpick: extra newline here.
Thanks for catching!
--
wbr, Vitaly
On Wed, Jun 02, 2010 at 09:39:14AM +0200, Vitaly Mayatskikh wrote:
> Sometimes non-interactive dump capture environment silently fails to
> store vmcore due to hardware or software bug. It is hard then to
> understand and fix reasons of system failure in both production and
> dump capture environments to prevent this failure to reoccur. It would
> be quite useful to see log buffer contents of crashed kernel at
> least.
>
> This patchset adds possibility for dump capture mode to extract and
> print log buffer and CPU registers from captured vmcore. Also the
> state of running kernel is printed. This action can be triggered via
> NMI button.
Hi Vitaly,
I am not sure what is the problem we are trying to solve here. If we are
unable to capture the dump because second kernel did not boot due to
some driver issue etc., the above patch is not going to help either.
If kernel has booted, then one should be able to capture the dump, filter
it and look at the log buffers and cpu registers.
Most of the failures I have seen in the capture kernel are that it was unable
to boot due to either device issues or a failure in early boot. Once it has
crossed those hurdles, capturing the dump is the easy part.
How often does it happen that the second kernel is spinning in
a loop and an NMI can still get you information out?
So can you please give some more information about what kind of failures
while capturing the dump you are addressing by this patchset.
Thanks
Vivek
>
> Signed-off-by: Vitaly Mayatskikh <[email protected]>
>
> arch/x86/include/asm/elf.h | 46 +++++
> arch/x86/include/asm/kdebug.h | 1 +
> arch/x86/include/asm/nmi.h | 1 +
> arch/x86/kernel/apic/nmi.c | 27 +++
> arch/x86/kernel/process_32.c | 22 ++-
> arch/x86/kernel/process_64.c | 20 ++-
> fs/proc/vmcore.c | 365 +++++++++++++++++++++++++++++++++++++++--
> include/linux/sysctl.h | 1 +
> kernel/sysctl.c | 7 +
> kernel/sysctl_binary.c | 1 +
> 10 files changed, 463 insertions(+), 28 deletions(-)
At Wed, 2 Jun 2010 11:16:11 -0400, Vivek Goyal wrote:
> I am not sure what is the problem we are trying to solve here. If we are
> unable to capture the dump because second kernel did not boot due to
> some driver issue etc, above patch is not going to help either.
>
> If kernel has booted, then one should be able to capture the dump, filter
> it and look at the log buffers and cpu registers.
>
> Most of the failures I have seen in capture kernel is that it was unable
> to boot due to either device issues or failure in early boot. Once it has
> crossed those hurdles, after that capturing the dump is easy part.
>
> How many times does it happen in second kernel that kernel is spinning in
> a loop and NMI can still get you information out.
>
> So can you please give some more information about what kind of failures
> while capturing the dump you are addressing by this patchset.
Obviously, this change doesn't help if the 2nd kernel is not able to
boot. But there are other problems which may prevent the vmcore from being
captured. For example, a machine has RAM > HDD and can save the vmcore
only over the network. If the network fails (e.g., due to bugs in NIC drivers
or NFS, which is not so rare), and the dump capture environment is
non-interactive or doesn't have development tools like `crash',
there's no chance even to guess what has happened.
Other possibilities of failure may include broken RAID controller,
HDD, RAM. NMI button in such situations is a last chance to see old
log.
--
wbr, Vitaly
Vitaly Mayatskikh <[email protected]> writes:
>
> Obviously, this change doesn't help if 2nd kernel is not able to
> boot. But there are other problems, which may prevent vmcore to be
> captured. For example, machine has RAM > HDD and it may save vmcore
> only over network. If network fails (e.g., due to bugs in NIC drivers
> or NFS, what is not so rare), and dump capture environment is
> non-interactive, or it doesn't have development tools like `crash',
> there's no chance even to guess what has happened.
In this case you don't need NMI; sysrq or some /sys trigger
is good enough.
NMI would be only needed if the crash kernel is completely
hosed too.
> Other possibilities of failure may include broken RAID controller,
> HDD, RAM. NMI button in such situations is a last chance to see old
> log.
The big problem is that the NMI is used by more and more subsystems,
and several of them tend to eat all NMIs, so the leftovers are less and
less. Overall I would not consider it reliable.
Also NMI buttons are not actually all that common.
I'm also not sure you really need the analysis in kernel space.
Why not have a user space program that does a quick analysis
of the previous vmcore and dumps a summary only? In fact
I suspect crash can already do that.
-Andi
--
[email protected] -- Speaking for myself only.
At Thu, 03 Jun 2010 11:30:01 +0200, Andi Kleen wrote:
> > Obviously, this change doesn't help if 2nd kernel is not able to
> > boot. But there are other problems, which may prevent vmcore to be
> > captured. For example, machine has RAM > HDD and it may save vmcore
> > only over network. If network fails (e.g., due to bugs in NIC drivers
> > or NFS, what is not so rare), and dump capture environment is
> > non-interactive, or it doesn't have development tools like `crash',
> > there's no chance even to guess what has happened.
>
> In this case you don't need NMI, sysrq or some /sys trigger
> is good enough.
Yes, it can be enough if you still can login. Also NMI-part is small
and can be easily changed/removed.
> NMI would be only needed if the crash kernel is completely
> hosed too.
That's the case.
> > Other possibilities of failure may include broken RAID controller,
> > HDD, RAM. NMI button in such situations is a last chance to see old
> > log.
>
> The big problem is that the NMI is used by more and more subsystems,
> and several of them tend to eat all NMIs, so the leftovers are less and
> less. Overall I would not consider it reliable.
True. But as a last hope, when nothing else helps, it still may be
worth trying :)
> Also NMI buttons are not actually all that common.
True as well. This feature is generally not for desktop systems, but
for large servers running critical apps. Usually such servers have NMI
button facility (directly at front of chassis or as a function in
remote console software).
> I'm also not sure you really need the analysis in kernel space.
>
> Why not have a user space program that does a quick analysis
> of the previous vmcore and dumps a summary only? In fact
> I suspect crash can already do that.
I agree, that's fine and usually is enough, if it's still possible to
log in to the system and run this utility. What about the scenario where a
console session is available for only one unit in the rack at a
time, the main kernel has crashed, and the dump capture environment is stuck?
The user attaches to that machine but cannot even log in, so the kdump kernel
is probably also semi-dead. He also doesn't see the analysis dump produced
by the utility, because he attached too late to see its output.
--
wbr, Vitaly
> > NMI would be only needed if the crash kernel is completely
> > hosed too.
>
> That's the case.
You're saying it booted but is hosed after boot?
That seems like a very obscure case. Is that really common?
> True. But as a last hope, when nothing else helps, it still may be
> worth trying :)
It will simply not work.
-Andi
--
[email protected] -- Speaking for myself only.
At Thu, 3 Jun 2010 17:13:47 +0200, Andi Kleen wrote:
> > > NMI would be only needed if the crash kernel is completely
> > > hosed too.
> >
> > That's the case.
>
> You're saying it booted but is hosed after boot?
>
> That seems like a very obscure case. Is that really common?
As usual: for engineers, who have to deal with it - yes, it is common.
> > True. But as a last hope, when nothing else helps, it still may be
> > worth trying :)
>
> It will simply not work.
Why?
--
wbr, Vitaly
On Thu, Jun 03, 2010 at 05:13:47PM +0200, Andi Kleen wrote:
>> > NMI would be only needed if the crash kernel is completely
>> > hosed too.
>>
>> That's the case.
>
>You're saying it booted but is hosed after boot?
>
>That seems like a very obscure case. Is that really common?
>
Actually, I met two of this kind of cases recently.
One was stuck in early_idt_handler() while booting the second kernel,
the other one is even worse, I can't even find where it hangs.
On Fri, Jun 04, 2010 at 11:32:52AM +0200, Vitaly Mayatskikh wrote:
> At Thu, 3 Jun 2010 17:13:47 +0200, Andi Kleen wrote:
>
> > > > NMI would be only needed if the crash kernel is completely
> > > > hosed too.
> > >
> > > That's the case.
> >
> > You're saying it booted but is hosed after boot?
> >
> > That seems like a very obscure case. Is that really common?
>
> As usual: for engineers, who have to deal with it - yes, it is common.
Well it would be better then to find out why that happens and fix it.
Is this related to kexec driver problems?
>
> > > True. But as a last hope, when nothing else helps, it still may be
> > > worth trying :)
> >
> > It will simply not work.
>
> Why?
Because someone else steals all the NMIs.
-Andi
--
[email protected] -- Speaking for myself only.
On Fri, Jun 04, 2010 at 05:49:36PM +0800, Américo Wang wrote:
> On Thu, Jun 03, 2010 at 05:13:47PM +0200, Andi Kleen wrote:
> >> > NMI would be only needed if the crash kernel is completely
> >> > hosed too.
> >>
> >> That's the case.
> >
> >You're saying it booted but is hosed after boot?
> >
> >That seems like a very obscure case. Is that really common?
> >
>
> Actually, I met two of this kind of cases recently.
>
> One was stuck in early_idt_handler() while booting the second kernel,
Most likely screaming interrupt from some device.
I have some ideas how to improve this by more forcefully resetting
devices on kexec, but in general perhaps early idt needs to be hardened for
this case too.
-Andi
--
[email protected] -- Speaking for myself only.
At Fri, 4 Jun 2010 12:15:19 +0200, Andi Kleen wrote:
> > As usual: for engineers, who have to deal with it - yes, it is common.
>
> Well it would be better then to find out why that happens and fix it.
>
> Is this related to kexec driver problems?
This patchset is not fixing some particular bug.
--
wbr, Vitaly
On Thu, Jun 03, 2010 at 11:01:38AM +0200, Vitaly Mayatskikh wrote:
> At Wed, 2 Jun 2010 11:16:11 -0400, Vivek Goyal wrote:
>
> > I am not sure what is the problem we are trying to solve here. If we are
> > unable to capture the dump because second kernel did not boot due to
> > some driver issue etc, above patch is not going to help either.
> >
> > If kernel has booted, then one should be able to capture the dump, filter
> > it and look at the log buffers and cpu registers.
> >
> > Most of the failures I have seen in capture kernel is that it was unable
> > to boot due to either device issues or failure in early boot. Once it has
> > crossed those hurdles, after that capturing the dump is easy part.
> >
> > How many times does it happen in second kernel that kernel is spinning in
> > a loop and NMI can still get you information out.
> >
> > So can you please give some more information about what kind of failures
> > while capturing the dump you are addressing by this patchset.
>
> Obviously, this change doesn't help if 2nd kernel is not able to
> boot. But there are other problems, which may prevent vmcore to be
> captured. For example, machine has RAM > HDD and it may save vmcore
> only over network. If network fails (e.g., due to bugs in NIC drivers
> or NFS, which is not so rare), and dump capture environment is
> non-interactive, or it doesn't have development tools like `crash',
> there's no chance even to guess what has happened.
Vitaly, in this case it sounds like the answer is to write a user space
utility that displays the log buffers of the previous kernel, pack it into
the initrd/initramfs, and run that utility if the network is down and the
hard disk does not have enough space to store the dump.
I vaguely remember that dump filtering utility was doing something
similar.
>
> Other possibilities of failure may include broken RAID controller,
> HDD, RAM. NMI button in such situations is a last chance to see old
> log.
Again, can't we do it with the help of user space utility packed in
initrd.
IMHO, somehow NMI button does not sound like a very good option. At most we
probably can look into doing this through sysrq option but I am not too
keen on that also until and unless we have good examples. You mentioned
that one might not be able to log in, but I am wondering why one would not
be able to login.
In kdump initrd, we can create one default policy that if you can't
capture dump, then try to save only log buffers of previous kernel. If
disk is broken, then just dump the buffers on console etc. This assumes
that console are at least being logged or somebody is looking at the
console. If not, one can always login and run the utility to dump buffers
again.
The only corner case which is not covered is that one can not login into
the system and somebody plugged in console later or console was shared. I
am not sure how common that case is.
Making capture kernel print previous kernel's buffers does not sound very
convincing to me, at this point. I will
Thanks
Vivek
On Fri, Jun 04, 2010 at 05:49:36PM +0800, Américo Wang wrote:
> On Thu, Jun 03, 2010 at 05:13:47PM +0200, Andi Kleen wrote:
> >> > NMI would be only needed if the crash kernel is completely
> >> > hosed too.
> >>
> >> That's the case.
> >
> >You're saying it booted but is hosed after boot?
> >
> >That seems like a very obscure case. Is that really common?
> >
>
> Actually, I met two of this kind of cases recently.
>
> One was stuck in early_idt_handler() while booting the second kernel,
> the other one is even worse, I even can't find where it hangs.
If you can't initialize the idt, will NMI work?
I think we need to fix those cases instead of creating another NMI
handler. To handle cases like network is not up or there is not sufficient
space on disk, a user space utility in kdump initrd should work.
Thanks
Vivek