Hi,
The following set of patches is an update to kdump on i386 and x86_64. The
changes arise mainly from review comments received during the last posting of
kdump for x86_64.
Vivek Goyal:
- i386 save ss esp bug fix
- dynamic per cpu allocation of memory for saving cpu registers
- export per cpu crash notes pointer through sysfs
- save registers early (inline functions)
Murali M:
- x86_64 add memmap command line option
- x86_64 add elfcorehdr command line option
- x86_64 kexec on panic
- x86_64 save cpu registers upon crash
Rachita Kothiyal:
- read previous kernel's memory
- kexec increase max segment limit
This set of patches has been tested on i386 and em64t x86_64 machines. Crash
dumps can be captured and opened using gdb and crash-4.0-2.12.
Corresponding kexec-tools patches are being posted in a separate set
on fastboot mailing list.
Thanks
Vivek
o This patch fixes a minor bug based on Andi Kleen's suggestion. asm's can't
be broken in this particular case, hence merging them.
Signed-off-by: Vivek Goyal <[email protected]>
---
linux-2.6.15-rc1-1M-dynamic-root/arch/i386/kernel/crash.c | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)
diff -puN arch/i386/kernel/crash.c~kexec-i386-ss-esp-bug-fix arch/i386/kernel/crash.c
--- linux-2.6.15-rc1-1M-dynamic/arch/i386/kernel/crash.c~kexec-i386-ss-esp-bug-fix 2005-11-15 12:29:53.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/i386/kernel/crash.c 2005-11-15 14:01:21.000000000 +0530
@@ -108,8 +108,10 @@ static void crash_setup_regs(struct pt_r
{
memcpy(newregs, oldregs, sizeof(*newregs));
newregs->esp = (unsigned long)&(oldregs->esp);
- __asm__ __volatile__("xorl %eax, %eax;");
- __asm__ __volatile__ ("movw %%ss, %%ax;" :"=a"(newregs->xss));
+ __asm__ __volatile__(
+ "xorl %%eax, %%eax\n\t"
+ "movw %%ss, %%ax\n\t"
+ :"=a"(newregs->xss));
}
/* We may have saved_regs from where the error came from
_
o In case of system crash, current state of cpu registers is saved in memory
in elf note format. So far memory for storing elf notes was being allocated
statically for NR_CPUS.
o This patch introduces dynamic allocation of memory for storing elf notes.
It uses alloc_percpu() interface. This should lead to better memory usage.
o Introduced based on Andi Kleen's and Eric W. Biederman's suggestions.
o This patch also moves memory allocation for elf notes from architecture
dependent portion to architecture independent portion. Now crash_notes
is architecture independent. The whole idea is that size of memory to be
allocated per cpu (MAX_NOTE_BYTES) can be architecture dependent and
allocation of this memory can be architecture independent.
Signed-off-by: Vivek Goyal <[email protected]>
---
linux-2.6.15-rc1-1M-dynamic-root/arch/i386/kernel/crash.c | 5 +--
linux-2.6.15-rc1-1M-dynamic-root/arch/ppc/kernel/machine_kexec.c | 6 ---
linux-2.6.15-rc1-1M-dynamic-root/arch/ppc64/kernel/machine_kexec.c | 3 -
linux-2.6.15-rc1-1M-dynamic-root/arch/s390/kernel/crash.c | 2 -
linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/crash.c | 2 -
linux-2.6.15-rc1-1M-dynamic-root/include/asm-i386/kexec.h | 3 -
linux-2.6.15-rc1-1M-dynamic-root/include/asm-powerpc/kexec.h | 3 -
linux-2.6.15-rc1-1M-dynamic-root/include/asm-s390/kexec.h | 3 -
linux-2.6.15-rc1-1M-dynamic-root/include/asm-x86_64/kexec.h | 3 -
linux-2.6.15-rc1-1M-dynamic-root/include/linux/kexec.h | 2 +
linux-2.6.15-rc1-1M-dynamic-root/kernel/kexec.c | 16 ++++++++++
11 files changed, 21 insertions(+), 27 deletions(-)
diff -puN arch/i386/kernel/crash.c~kdump-dynamic-per-cpu-elf-note-memory-alloc arch/i386/kernel/crash.c
--- linux-2.6.15-rc1-1M-dynamic/arch/i386/kernel/crash.c~kdump-dynamic-per-cpu-elf-note-memory-alloc 2005-11-15 14:09:51.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/i386/kernel/crash.c 2005-11-15 14:09:52.000000000 +0530
@@ -25,7 +25,6 @@
#include <mach_ipi.h>
-note_buf_t crash_notes[NR_CPUS];
/* This keeps a track of which one is crashing cpu. */
static int crashing_cpu;
@@ -72,7 +71,9 @@ static void crash_save_this_cpu(struct p
* squirrelled away. ELF notes happen to provide
* all of that that no need to invent something new.
*/
- buf = &crash_notes[cpu][0];
+ buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+ if (!buf)
+ return;
memset(&prstatus, 0, sizeof(prstatus));
prstatus.pr_pid = current->pid;
elf_core_copy_regs(&prstatus.pr_reg, regs);
diff -puN arch/ppc64/kernel/machine_kexec.c~kdump-dynamic-per-cpu-elf-note-memory-alloc arch/ppc64/kernel/machine_kexec.c
--- linux-2.6.15-rc1-1M-dynamic/arch/ppc64/kernel/machine_kexec.c~kdump-dynamic-per-cpu-elf-note-memory-alloc 2005-11-15 14:09:51.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/ppc64/kernel/machine_kexec.c 2005-11-15 14:09:52.000000000 +0530
@@ -28,9 +28,6 @@
#define HASH_GROUP_SIZE 0x80 /* size of each hash group, asm/mmu.h */
-/* Have this around till we move it into crash specific file */
-note_buf_t crash_notes[NR_CPUS];
-
/* Dummy for now. Not sure if we need to have a crash shutdown in here
* and if what it will achieve. Letting it be now to compile the code
* in generic kexec environment
diff -puN arch/ppc/kernel/machine_kexec.c~kdump-dynamic-per-cpu-elf-note-memory-alloc arch/ppc/kernel/machine_kexec.c
--- linux-2.6.15-rc1-1M-dynamic/arch/ppc/kernel/machine_kexec.c~kdump-dynamic-per-cpu-elf-note-memory-alloc 2005-11-15 14:09:51.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/ppc/kernel/machine_kexec.c 2005-11-15 14:09:52.000000000 +0530
@@ -28,12 +28,6 @@ typedef NORET_TYPE void (*relocate_new_k
const extern unsigned char relocate_new_kernel[];
const extern unsigned int relocate_new_kernel_size;
-/*
- * Provide a dummy crash_notes definition while crash dump arrives to ppc.
- * This prevents breakage of crash_notes attribute in kernel/ksysfs.c.
- */
-note_buf_t crash_notes[NR_CPUS];
-
void machine_shutdown(void)
{
if (ppc_md.machine_shutdown)
diff -puN arch/s390/kernel/crash.c~kdump-dynamic-per-cpu-elf-note-memory-alloc arch/s390/kernel/crash.c
--- linux-2.6.15-rc1-1M-dynamic/arch/s390/kernel/crash.c~kdump-dynamic-per-cpu-elf-note-memory-alloc 2005-11-15 14:09:51.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/s390/kernel/crash.c 2005-11-15 14:09:52.000000000 +0530
@@ -10,8 +10,6 @@
#include <linux/threads.h>
#include <linux/kexec.h>
-note_buf_t crash_notes[NR_CPUS];
-
void machine_crash_shutdown(struct pt_regs *regs)
{
}
diff -puN arch/x86_64/kernel/crash.c~kdump-dynamic-per-cpu-elf-note-memory-alloc arch/x86_64/kernel/crash.c
--- linux-2.6.15-rc1-1M-dynamic/arch/x86_64/kernel/crash.c~kdump-dynamic-per-cpu-elf-note-memory-alloc 2005-11-15 14:09:51.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/crash.c 2005-11-15 14:09:52.000000000 +0530
@@ -19,8 +19,6 @@
#include <asm/nmi.h>
#include <asm/hw_irq.h>
-note_buf_t crash_notes[NR_CPUS];
-
void machine_crash_shutdown(struct pt_regs *regs)
{
/* This function is only called after the system
diff -puN include/asm-i386/kexec.h~kdump-dynamic-per-cpu-elf-note-memory-alloc include/asm-i386/kexec.h
--- linux-2.6.15-rc1-1M-dynamic/include/asm-i386/kexec.h~kdump-dynamic-per-cpu-elf-note-memory-alloc 2005-11-15 14:09:51.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/include/asm-i386/kexec.h 2005-11-15 14:09:52.000000000 +0530
@@ -26,8 +26,5 @@
#define KEXEC_ARCH KEXEC_ARCH_386
#define MAX_NOTE_BYTES 1024
-typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
-
-extern note_buf_t crash_notes[];
#endif /* _I386_KEXEC_H */
diff -puN include/asm-powerpc/kexec.h~kdump-dynamic-per-cpu-elf-note-memory-alloc include/asm-powerpc/kexec.h
--- linux-2.6.15-rc1-1M-dynamic/include/asm-powerpc/kexec.h~kdump-dynamic-per-cpu-elf-note-memory-alloc 2005-11-15 14:09:51.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/include/asm-powerpc/kexec.h 2005-11-15 14:09:52.000000000 +0530
@@ -33,9 +33,6 @@
#ifndef __ASSEMBLY__
#define MAX_NOTE_BYTES 1024
-typedef u32 note_buf_t[MAX_NOTE_BYTES / sizeof(u32)];
-
-extern note_buf_t crash_notes[];
#ifdef __powerpc64__
extern void kexec_smp_wait(void); /* get and clear naca physid, wait for
diff -puN include/asm-s390/kexec.h~kdump-dynamic-per-cpu-elf-note-memory-alloc include/asm-s390/kexec.h
--- linux-2.6.15-rc1-1M-dynamic/include/asm-s390/kexec.h~kdump-dynamic-per-cpu-elf-note-memory-alloc 2005-11-15 14:09:51.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/include/asm-s390/kexec.h 2005-11-15 14:09:52.000000000 +0530
@@ -35,8 +35,5 @@
#define KEXEC_ARCH KEXEC_ARCH_S390
#define MAX_NOTE_BYTES 1024
-typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
-
-extern note_buf_t crash_notes[];
#endif /*_S390_KEXEC_H */
diff -puN include/asm-x86_64/kexec.h~kdump-dynamic-per-cpu-elf-note-memory-alloc include/asm-x86_64/kexec.h
--- linux-2.6.15-rc1-1M-dynamic/include/asm-x86_64/kexec.h~kdump-dynamic-per-cpu-elf-note-memory-alloc 2005-11-15 14:09:52.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/include/asm-x86_64/kexec.h 2005-11-15 14:09:52.000000000 +0530
@@ -26,8 +26,5 @@
#define KEXEC_ARCH KEXEC_ARCH_X86_64
#define MAX_NOTE_BYTES 1024
-typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
-
-extern note_buf_t crash_notes[];
#endif /* _X86_64_KEXEC_H */
diff -puN include/linux/kexec.h~kdump-dynamic-per-cpu-elf-note-memory-alloc include/linux/kexec.h
--- linux-2.6.15-rc1-1M-dynamic/include/linux/kexec.h~kdump-dynamic-per-cpu-elf-note-memory-alloc 2005-11-15 14:09:52.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/include/linux/kexec.h 2005-11-15 14:09:52.000000000 +0530
@@ -125,6 +125,8 @@ extern struct kimage *kexec_image;
/* Location of a reserved region to hold the crash kernel.
*/
extern struct resource crashk_res;
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+extern note_buf_t *crash_notes;
#else /* !CONFIG_KEXEC */
struct pt_regs;
diff -puN kernel/kexec.c~kdump-dynamic-per-cpu-elf-note-memory-alloc kernel/kexec.c
--- linux-2.6.15-rc1-1M-dynamic/kernel/kexec.c~kdump-dynamic-per-cpu-elf-note-memory-alloc 2005-11-15 14:09:52.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/kernel/kexec.c 2005-11-15 14:09:52.000000000 +0530
@@ -26,6 +26,9 @@
#include <asm/system.h>
#include <asm/semaphore.h>
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t* crash_notes;
+
/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
.name = "Crash kernel",
@@ -1060,3 +1063,16 @@ void crash_kexec(struct pt_regs *regs)
xchg(&kexec_lock, 0);
}
}
+
+static int __init crash_notes_memory_init(void)
+{
+ /* Allocate memory for saving cpu registers. */
+ crash_notes = alloc_percpu(note_buf_t);
+ if (!crash_notes) {
+ printk("Kexec: Memory allocation for saving cpu register"
+ " states failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+module_init(crash_notes_memory_init)
_
o Kexec on panic functionality allocates memory for saving cpu registers
in case of system crash event. Address of this allocated memory needs to
be exported to user space, which is used by kexec-tools.
o Previously, a single /sys/kernel/crash_notes entry was exported because the
memory was allocated as a single contiguous array. Now that the allocation is
dynamic and per cpu, the address of each per cpu buffer is exported through
"/sys/devices/system/cpu/cpuX/crash_notes"
Signed-off-by: Vivek Goyal <[email protected]>
---
diff -puN drivers/base/cpu.c~kdump-export-crash-notes-through-sysfs drivers/base/cpu.c
--- linux-2.6.15-rc1-16M-dynamic/drivers/base/cpu.c~kdump-export-crash-notes-through-sysfs 2005-11-17 08:47:10.000000000 -0800
+++ linux-2.6.15-rc1-16M-dynamic-root/drivers/base/cpu.c 2005-11-17 08:49:33.000000000 -0800
@@ -83,6 +83,33 @@ static inline void register_cpu_control(
}
#endif /* CONFIG_HOTPLUG_CPU */
+#ifdef CONFIG_KEXEC
+#include <linux/kexec.h>
+
+static ssize_t show_crash_notes(struct sys_device *dev, char *buf)
+{
+ struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+ ssize_t rc;
+ unsigned long long addr;
+ int cpunum;
+
+ cpunum = cpu->sysdev.id;
+
+ /*
+ * Might be reading other cpu's data based on which cpu read thread
+ * has been scheduled. But cpu data (memory) is allocated once during
+ * boot up and this data does not change there after. Hence this
+ * operation should be safe. No locking required.
+ */
+ get_cpu();
+ addr = __pa(per_cpu_ptr(crash_notes, cpunum));
+ rc = sprintf(buf, "%Lx\n", addr);
+ put_cpu();
+ return rc;
+}
+static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
+#endif
+
/*
* register_cpu - Setup a driverfs device for a CPU.
* @cpu - Callers can set the cpu->no_control field to 1, to indicate not to
@@ -108,6 +135,11 @@ int __devinit register_cpu(struct cpu *c
register_cpu_control(cpu);
if (!error)
cpu_sys_devices[num] = &cpu->sysdev;
+
+#ifdef CONFIG_KEXEC
+ if (!error)
+ error = sysdev_create_file(&cpu->sysdev, &attr_crash_notes);
+#endif
return error;
}
diff -puN kernel/ksysfs.c~kdump-export-crash-notes-through-sysfs kernel/ksysfs.c
--- linux-2.6.15-rc1-16M-dynamic/kernel/ksysfs.c~kdump-export-crash-notes-through-sysfs 2005-11-17 08:47:10.000000000 -0800
+++ linux-2.6.15-rc1-16M-dynamic-root/kernel/ksysfs.c 2005-11-17 08:47:10.000000000 -0800
@@ -30,16 +30,6 @@ static ssize_t hotplug_seqnum_show(struc
KERNEL_ATTR_RO(hotplug_seqnum);
#endif
-#ifdef CONFIG_KEXEC
-#include <asm/kexec.h>
-
-static ssize_t crash_notes_show(struct subsystem *subsys, char *page)
-{
- return sprintf(page, "%p\n", (void *)crash_notes);
-}
-KERNEL_ATTR_RO(crash_notes);
-#endif
-
decl_subsys(kernel, NULL, NULL);
EXPORT_SYMBOL_GPL(kernel_subsys);
@@ -47,9 +37,6 @@ static struct attribute * kernel_attrs[]
#ifdef CONFIG_HOTPLUG
&hotplug_seqnum_attr.attr,
#endif
-#ifdef CONFIG_KEXEC
- &crash_notes_attr.attr,
-#endif
NULL
};
_
o If the system panics, cpu register states are captured through the function
crash_get_current_regs(). This is not an inline function, hence a stack
frame is pushed on to the stack and then cpu register state is captured.
Later this frame is popped and new frames are pushed (machine_kexec).
o In theory this is not quite right, as we are capturing register states for
a frame that is no longer valid. This seems to have created back
trace problems for ppc64.
o This patch fixes it up. The very first thing it does after entering
crash_kexec() is to capture the register states. Anyway we don't want the
back trace beyond crash_kexec(). crash_get_current_regs() has been made
inline
o crash_setup_regs() is the top architecture dependent function which should
be responsible for capturing the register states as well as to do some
architecture dependent tricks. For ex. fixing up ss and esp for i386.
crash_setup_regs() has also been made inline to ensure no new call frame
is pushed onto stack.
Signed-off-by: Vivek Goyal <[email protected]>
---
linux-2.6.15-rc1-1M-dynamic-root/arch/i386/kernel/crash.c | 47 --------------
linux-2.6.15-rc1-1M-dynamic-root/include/asm-i386/kexec.h | 45 +++++++++++++
linux-2.6.15-rc1-1M-dynamic-root/kernel/kexec.c | 4 -
3 files changed, 51 insertions(+), 45 deletions(-)
diff -puN arch/i386/kernel/crash.c~kdump-save-registers-early arch/i386/kernel/crash.c
--- linux-2.6.15-rc1-1M-dynamic/arch/i386/kernel/crash.c~kdump-save-registers-early 2005-11-15 14:22:35.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/i386/kernel/crash.c 2005-11-15 16:48:34.000000000 +0530
@@ -82,53 +82,12 @@ static void crash_save_this_cpu(struct p
final_note(buf);
}
-static void crash_get_current_regs(struct pt_regs *regs)
+static void crash_save_self(struct pt_regs *regs)
{
- __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs->ebx));
- __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs->ecx));
- __asm__ __volatile__("movl %%edx,%0" : "=m"(regs->edx));
- __asm__ __volatile__("movl %%esi,%0" : "=m"(regs->esi));
- __asm__ __volatile__("movl %%edi,%0" : "=m"(regs->edi));
- __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs->ebp));
- __asm__ __volatile__("movl %%eax,%0" : "=m"(regs->eax));
- __asm__ __volatile__("movl %%esp,%0" : "=m"(regs->esp));
- __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(regs->xss));
- __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(regs->xcs));
- __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(regs->xds));
- __asm__ __volatile__("movw %%es, %%ax;" :"=a"(regs->xes));
- __asm__ __volatile__("pushfl; popl %0" :"=m"(regs->eflags));
-
- regs->eip = (unsigned long)current_text_addr();
-}
-
-/* CPU does not save ss and esp on stack if execution is already
- * running in kernel mode at the time of NMI occurrence. This code
- * fixes it.
- */
-static void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs)
-{
- memcpy(newregs, oldregs, sizeof(*newregs));
- newregs->esp = (unsigned long)&(oldregs->esp);
- __asm__ __volatile__(
- "xorl %%eax, %%eax\n\t"
- "movw %%ss, %%ax\n\t"
- :"=a"(newregs->xss));
-}
-
-/* We may have saved_regs from where the error came from
- * or it is NULL if via a direct panic().
- */
-static void crash_save_self(struct pt_regs *saved_regs)
-{
- struct pt_regs regs;
int cpu;
cpu = smp_processor_id();
- if (saved_regs)
- crash_setup_regs(&regs, saved_regs);
- else
- crash_get_current_regs(&regs);
- crash_save_this_cpu(&regs, cpu);
+ crash_save_this_cpu(regs, cpu);
}
#ifdef CONFIG_SMP
@@ -147,7 +106,7 @@ static int crash_nmi_callback(struct pt_
local_irq_disable();
if (!user_mode(regs)) {
- crash_setup_regs(&fixed_regs, regs);
+ crash_fixup_ss_esp(&fixed_regs, regs);
regs = &fixed_regs;
}
crash_save_this_cpu(regs, cpu);
diff -puN kernel/kexec.c~kdump-save-registers-early kernel/kexec.c
--- linux-2.6.15-rc1-1M-dynamic/kernel/kexec.c~kdump-save-registers-early 2005-11-15 14:33:35.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/kernel/kexec.c 2005-11-15 14:40:35.000000000 +0530
@@ -1057,7 +1057,9 @@ void crash_kexec(struct pt_regs *regs)
if (!locked) {
image = xchg(&kexec_crash_image, NULL);
if (image) {
- machine_crash_shutdown(regs);
+ struct pt_regs fixed_regs;
+ crash_setup_regs(&fixed_regs, regs);
+ machine_crash_shutdown(&fixed_regs);
machine_kexec(image);
}
xchg(&kexec_lock, 0);
diff -puN include/asm-i386/kexec.h~kdump-save-registers-early include/asm-i386/kexec.h
--- linux-2.6.15-rc1-1M-dynamic/include/asm-i386/kexec.h~kdump-save-registers-early 2005-11-15 14:39:07.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/include/asm-i386/kexec.h 2005-11-15 16:51:46.000000000 +0530
@@ -2,6 +2,7 @@
#define _I386_KEXEC_H
#include <asm/fixmap.h>
+#include <asm/ptrace.h>
/*
* KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
@@ -27,4 +28,48 @@
#define MAX_NOTE_BYTES 1024
+/* CPU does not save ss and esp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
+static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
+ struct pt_regs *oldregs)
+{
+ memcpy(newregs, oldregs, sizeof(*newregs));
+ newregs->esp = (unsigned long)&(oldregs->esp);
+ __asm__ __volatile__(
+ "xorl %%eax, %%eax\n\t"
+ "movw %%ss, %%ax\n\t"
+ :"=a"(newregs->xss));
+}
+
+/*
+ * This function is responsible for capturing register states if coming
+ * via panic otherwise just fix up the ss and esp if coming via kernel
+ * mode exception.
+ */
+static inline void crash_setup_regs(struct pt_regs *newregs,
+ struct pt_regs *oldregs)
+{
+ if (oldregs)
+ crash_fixup_ss_esp(newregs, oldregs);
+ else {
+ __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
+ __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
+ __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
+ __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
+ __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
+ __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
+ __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
+ __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
+ __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->xss));
+ __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->xcs));
+ __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->xds));
+ __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->xes));
+ __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
+
+ newregs->eip = (unsigned long)current_text_addr();
+ }
+}
+
#endif /* _I386_KEXEC_H */
_
o This patch introduces the memmap option for x86_64 similar to i386.
o memmap=exactmap enables setting of an exact E820 memory map, as
specified by the user.
Changes in this version:
o Used e820_end_of_ram() to find the max_pfn as suggested by
Andi kleen.
o removed PFN_UP & PFN_DOWN macros
o Printing the user defined map also.
Signed-off-by:Murali M Chakravarthy <[email protected]>
Signed-off-by:Hariprasad Nellitheertha<[email protected]>
Signed-off-by: Vivek Goyal <[email protected]>
---
linux-2.6.15-rc1-1M-dynamic-root/Documentation/kernel-parameters.txt | 2
linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/e820.c | 21 +++++++
linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/setup.c | 27 ++++++++++
linux-2.6.15-rc1-1M-dynamic-root/include/asm-x86_64/e820.h | 1
4 files changed, 50 insertions(+), 1 deletion(-)
diff -puN arch/x86_64/kernel/e820.c~x86_64-memmap-command-line-option arch/x86_64/kernel/e820.c
--- linux-2.6.15-rc1-1M-dynamic/arch/x86_64/kernel/e820.c~x86_64-memmap-command-line-option 2005-11-17 11:10:58.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/e820.c 2005-11-17 11:10:58.000000000 +0530
@@ -560,6 +560,27 @@ void __init parse_memopt(char *p, char *
end_user_pfn >>= PAGE_SHIFT;
}
+void __init parse_memmapopt(char *p, char **from)
+{
+ unsigned long long start_at, mem_size;
+
+ mem_size = memparse(p, from);
+ p = *from;
+ if (*p == '@') {
+ start_at = memparse(p+1, from);
+ add_memory_region(start_at, mem_size, E820_RAM);
+ } else if (*p == '#') {
+ start_at = memparse(p+1, from);
+ add_memory_region(start_at, mem_size, E820_ACPI);
+ } else if (*p == '$') {
+ start_at = memparse(p+1, from);
+ add_memory_region(start_at, mem_size, E820_RESERVED);
+ } else {
+ end_user_pfn = (mem_size >> PAGE_SHIFT);
+ }
+ p = *from;
+}
+
unsigned long pci_mem_start = 0xaeedbabe;
/*
diff -puN arch/x86_64/kernel/setup.c~x86_64-memmap-command-line-option arch/x86_64/kernel/setup.c
--- linux-2.6.15-rc1-1M-dynamic/arch/x86_64/kernel/setup.c~x86_64-memmap-command-line-option 2005-11-17 11:10:58.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/setup.c 2005-11-17 12:00:35.000000000 +0530
@@ -274,6 +274,7 @@ static __init void parse_cmdline_early (
{
char c = ' ', *to = command_line, *from = COMMAND_LINE;
int len = 0;
+ int userdef = 0;
/* Save unparsed command line copy for /proc/cmdline */
memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE);
@@ -356,6 +357,28 @@ static __init void parse_cmdline_early (
if (!memcmp(from, "mem=", 4))
parse_memopt(from+4, &from);
+ if (!memcmp(from, "memmap=", 7)) {
+ /* exactmap option is for user defined memory */
+ if (!memcmp(from+7, "exactmap", 8)) {
+#ifdef CONFIG_CRASH_DUMP
+ /* If we are doing a crash dump, we
+ * still need to know the real mem
+ * size before original memory map is
+ * reset.
+ */
+ saved_max_pfn = e820_end_of_ram();
+#endif
+ from += 8+7;
+ end_pfn_map = 0;
+ e820.nr_map = 0;
+ userdef = 1;
+ }
+ else {
+ parse_memmapopt(from+7, &from);
+ userdef = 1;
+ }
+ }
+
#ifdef CONFIG_NUMA
if (!memcmp(from, "numa=", 5))
numa_setup(from+5);
@@ -402,6 +425,10 @@ static __init void parse_cmdline_early (
break;
*(to++) = c;
}
+ if (userdef) {
+ printk(KERN_INFO "user-defined physical RAM map:\n");
+ e820_print_map("user");
+ }
*to = '\0';
*cmdline_p = command_line;
}
diff -puN Documentation/kernel-parameters.txt~x86_64-memmap-command-line-option Documentation/kernel-parameters.txt
--- linux-2.6.15-rc1-1M-dynamic/Documentation/kernel-parameters.txt~x86_64-memmap-command-line-option 2005-11-17 11:10:58.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/Documentation/kernel-parameters.txt 2005-11-17 12:00:35.000000000 +0530
@@ -824,7 +824,7 @@ running once the system is up.
mem=nopentium [BUGS=IA-32] Disable usage of 4MB pages for kernel
memory.
- memmap=exactmap [KNL,IA-32] Enable setting of an exact
+ memmap=exactmap [KNL,IA-32,X86_64] Enable setting of an exact
E820 memory map, as specified by the user.
Such memmap=exactmap lines can be constructed based on
BIOS output or other requirements. See the memmap=nn@ss
diff -puN include/asm-x86_64/e820.h~x86_64-memmap-command-line-option include/asm-x86_64/e820.h
--- linux-2.6.15-rc1-1M-dynamic/include/asm-x86_64/e820.h~x86_64-memmap-command-line-option 2005-11-17 11:10:58.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/include/asm-x86_64/e820.h 2005-11-17 11:10:58.000000000 +0530
@@ -55,6 +55,7 @@ extern unsigned long e820_hole_size(unsi
unsigned long end_pfn);
extern void __init parse_memopt(char *p, char **end);
+extern void __init parse_memmapopt(char *p, char **end);
extern struct e820map e820;
#endif/*!__ASSEMBLY__*/
_
o elfcorehdr= specifies the location of elf core header stored by the
crashed kernel. This command line option will be passed by the
kexec-tools to capture kernel.
Changes in this version :
o Added more comments in kernel-parameters.txt and in code.
Signed-off-by:Murali M Chakravarthy <[email protected]>
Signed-off-by: Vivek Goyal <[email protected]>
---
linux-2.6.15-rc1-1M-dynamic-root/Documentation/kernel-parameters.txt | 7 ++++---
linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/setup.c | 9 +++++++++
2 files changed, 13 insertions(+), 3 deletions(-)
diff -puN arch/x86_64/kernel/setup.c~x86_64-elfcorehdr-command-line-option arch/x86_64/kernel/setup.c
--- linux-2.6.15-rc1-1M-dynamic/arch/x86_64/kernel/setup.c~x86_64-elfcorehdr-command-line-option 2005-11-17 11:11:07.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/setup.c 2005-11-17 11:11:07.000000000 +0530
@@ -42,6 +42,7 @@
#include <linux/edd.h>
#include <linux/mmzone.h>
#include <linux/kexec.h>
+#include <linux/crash_dump.h>
#include <asm/mtrr.h>
#include <asm/uaccess.h>
@@ -417,6 +418,14 @@ static __init void parse_cmdline_early (
}
#endif
+#ifdef CONFIG_CRASH_DUMP
+ /* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel. This option will be passed
+ * by kexec loader to the capture kernel.
+ */
+ else if(!memcmp(from, "elfcorehdr=", 11))
+ elfcorehdr_addr = memparse(from+11, &from);
+#endif
next_char:
c = *(from++);
if (!c)
diff -puN Documentation/kernel-parameters.txt~x86_64-elfcorehdr-command-line-option Documentation/kernel-parameters.txt
--- linux-2.6.15-rc1-1M-dynamic/Documentation/kernel-parameters.txt~x86_64-elfcorehdr-command-line-option 2005-11-17 11:11:07.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/Documentation/kernel-parameters.txt 2005-11-17 11:11:07.000000000 +0530
@@ -475,10 +475,11 @@ running once the system is up.
See Documentation/block/as-iosched.txt and
Documentation/block/deadline-iosched.txt for details.
- elfcorehdr= [IA-32]
+ elfcorehdr= [IA-32, X86_64]
Specifies physical address of start of kernel core
- image elf header.
- See Documentation/kdump.txt for details.
+ image elf header. Generally kexec loader will
+ pass this option to capture kernel.
+ See Documentation/kdump/kdump.txt for details.
enforcing [SELINUX] Set initial enforcing status.
Format: {"0" | "1"}
_
o Implementing the machine_crash_shutdown for x86_64 which will be
called by crash_kexec (called in case of a panic, sysrq etc.).
Here we do things similar to i386. Disable the interrupts, shootdown
the cpus and shutdown LAPIC and IOAPIC.
Changes in this version:
o As Eric's APIC initialization patches have been reverted,
reintroducing LAPIC and IOAPIC shutdown.
o Added some comments on CPU hotplug, modified code as suggested
by Andi kleen.
Signed-off-by:Murali M Chakravarthy <[email protected]>
Signed-off-by: Vivek Goyal <[email protected]>
---
linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/crash.c | 86 +++++++++++-
1 files changed, 85 insertions(+), 1 deletion(-)
diff -puN arch/x86_64/kernel/crash.c~x86_64-kexec-on-panic arch/x86_64/kernel/crash.c
--- linux-2.6.15-rc1-1M-dynamic/arch/x86_64/kernel/crash.c~x86_64-kexec-on-panic 2005-11-17 11:11:10.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/crash.c 2005-11-17 11:54:15.000000000 +0530
@@ -13,15 +13,85 @@
#include <linux/smp.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
+#include <linux/delay.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
#include <asm/nmi.h>
#include <asm/hw_irq.h>
+#include <asm/mach_apic.h>
+
+/* This keeps a track of which one is crashing cpu. */
+static int crashing_cpu;
+
+#ifdef CONFIG_SMP
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct pt_regs *regs, int cpu)
+{
+ /*
+ * Don't do anything if this handler is invoked on crashing cpu.
+ * Otherwise, system will completely hang. Crashing cpu can get
+ * an NMI if system was initially booted with nmi_watchdog parameter.
+ */
+ if (cpu == crashing_cpu)
+ return 1;
+ local_irq_disable();
+
+ disable_local_APIC();
+ atomic_dec(&waiting_for_crash_ipi);
+ /* Assume hlt works */
+ for(;;)
+ asm("hlt");
+
+ return 1;
+}
+
+static void smp_send_nmi_allbutself(void)
+{
+ send_IPI_allbutself(APIC_DM_NMI);
+}
+
+/*
+ * This code is a best effort heuristic to get the
+ * other cpus to stop executing. So races with
+ * cpu hotplug shouldn't matter.
+ */
+
+static void nmi_shootdown_cpus(void)
+{
+ unsigned long msecs;
+
+ atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+ set_nmi_callback(crash_nmi_callback);
+
+ /*
+ * Ensure the new callback function is set before sending
+ * out the NMI
+ */
+ wmb();
+
+ smp_send_nmi_allbutself();
+
+ msecs = 1000; /* Wait at most a second for the other cpus to stop */
+ while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+ mdelay(1);
+ msecs--;
+ }
+ /* Leave the nmi callback set */
+ disable_local_APIC();
+}
+#else
+static void nmi_shootdown_cpus(void)
+{
+ /* There are no cpus to shootdown */
+}
+#endif
void machine_crash_shutdown(struct pt_regs *regs)
{
- /* This function is only called after the system
+ /*
+ * This function is only called after the system
* has paniced or is otherwise in a critical state.
* The minimum amount of code to allow a kexec'd kernel
* to run successfully needs to happen here.
@@ -29,4 +99,18 @@ void machine_crash_shutdown(struct pt_re
* In practice this means shooting down the other cpus in
* an SMP system.
*/
+ /* The kernel is broken so disable interrupts */
+ local_irq_disable();
+
+ /* Make a note of crashing cpu. Will be used in NMI callback.*/
+ crashing_cpu = smp_processor_id();
+ nmi_shootdown_cpus();
+
+ if(cpu_has_apic)
+ disable_local_APIC();
+
+#if defined(CONFIG_X86_IO_APIC)
+ disable_IO_APIC();
+#endif
+
}
_
o Saving the cpu registers of all cpus before booting in to the crash kernel.
o crash_setup_regs will save the registers of the cpu on which
panic has occurred. One of the concerns ppc64 folks raised is that after
capturing the register states, one should not pop the current call frame
and push new one. Hence it has been inlined. More call frames later get
pushed on to stack (machine_crash_shutdown() and machine_kexec()), but one
will not want to backtrace those.
o Not very sure about the CFI annotations. With this patch I am
getting a decent backtrace with gdb, assuming the compiler has generated
enough debugging information for crash_kexec(). Coding crash_setup_regs()
in pure assembly makes it tricky because then it can not be inlined, and
after capturing the register states we don't want to return and pop this
call frame.
o Saving the non-panicking cpus' registers will be done in the NMI handler
while shooting them down in machine_crash_shutdown.
o Introducing CRASH_DUMP option in Kconfig for x86_64.
Signed-off-by:Murali M Chakravarthy <[email protected]>
Signed-off-by: Vivek Goyal <[email protected]>
---
linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/Kconfig | 7 +
linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/crash.c | 70 ++++++++++++
linux-2.6.15-rc1-1M-dynamic-root/include/asm-x86_64/kexec.h | 36 ++++++
3 files changed, 113 insertions(+)
diff -puN arch/x86_64/Kconfig~x86_64-save-cpu-registers-upon-crash arch/x86_64/Kconfig
--- linux-2.6.15-rc1-1M-dynamic/arch/x86_64/Kconfig~x86_64-save-cpu-registers-upon-crash 2005-11-17 11:57:08.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/Kconfig 2005-11-17 11:57:08.000000000 +0530
@@ -402,6 +402,13 @@ config KEXEC
support. As of this writing the exact hardware interface is
strongly in flux, so no good recommendation can be made.
+config CRASH_DUMP
+ bool "kernel crash dumps (EXPERIMENTAL)"
+ depends on EMBEDDED
+ depends on EXPERIMENTAL
+ help
+ Generate crash dump after being started by kexec.
+
config SECCOMP
bool "Enable seccomp to safely compute untrusted bytecode"
depends on PROC_FS
diff -puN arch/x86_64/kernel/crash.c~x86_64-save-cpu-registers-upon-crash arch/x86_64/kernel/crash.c
--- linux-2.6.15-rc1-1M-dynamic/arch/x86_64/kernel/crash.c~x86_64-save-cpu-registers-upon-crash 2005-11-17 11:57:08.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/crash.c 2005-11-17 11:57:08.000000000 +0530
@@ -11,9 +11,12 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
+#include <linux/irq.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
#include <linux/delay.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
@@ -24,6 +27,71 @@
/* This keeps a track of which one is crashing cpu. */
static int crashing_cpu;
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type,
+ void *data, size_t data_len)
+{
+ struct elf_note note;
+
+ note.n_namesz = strlen(name) + 1;
+ note.n_descsz = data_len;
+ note.n_type = type;
+ memcpy(buf, &note, sizeof(note));
+ buf += (sizeof(note) +3)/4;
+ memcpy(buf, name, note.n_namesz);
+ buf += (note.n_namesz + 3)/4;
+ memcpy(buf, data, note.n_descsz);
+ buf += (note.n_descsz + 3)/4;
+
+ return buf;
+}
+
+static void final_note(u32 *buf)
+{
+ struct elf_note note;
+
+ note.n_namesz = 0;
+ note.n_descsz = 0;
+ note.n_type = 0;
+ memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
+{
+ struct elf_prstatus prstatus;
+ u32 *buf;
+
+ if ((cpu < 0) || (cpu >= NR_CPUS))
+ return;
+
+ /* Using ELF notes here is opportunistic.
+ * I need a well defined structure format
+ * for the data I pass, and I need tags
+ * on the data to indicate what information I have
+ * squirrelled away. ELF notes happen to provide
+ * all of that that no need to invent something new.
+ */
+
+ buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+
+ if (!buf)
+ return;
+
+ memset(&prstatus, 0, sizeof(prstatus));
+ prstatus.pr_pid = current->pid;
+ elf_core_copy_regs(&prstatus.pr_reg, regs);
+ buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+ sizeof(prstatus));
+ final_note(buf);
+}
+
+static void crash_save_self(struct pt_regs *regs)
+{
+ int cpu;
+
+ cpu = smp_processor_id();
+ crash_save_this_cpu(regs, cpu);
+}
+
#ifdef CONFIG_SMP
static atomic_t waiting_for_crash_ipi;
@@ -38,6 +106,7 @@ static int crash_nmi_callback(struct pt_
return 1;
local_irq_disable();
+ crash_save_this_cpu(regs, cpu);
disable_local_APIC();
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
@@ -113,4 +182,5 @@ void machine_crash_shutdown(struct pt_re
disable_IO_APIC();
#endif
+ crash_save_self(regs);
}
diff -puN include/asm-x86_64/kexec.h~x86_64-save-cpu-registers-upon-crash include/asm-x86_64/kexec.h
--- linux-2.6.15-rc1-1M-dynamic/include/asm-x86_64/kexec.h~x86_64-save-cpu-registers-upon-crash 2005-11-17 11:57:08.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/include/asm-x86_64/kexec.h 2005-11-17 11:57:08.000000000 +0530
@@ -3,6 +3,7 @@
#include <asm/page.h>
#include <asm/proto.h>
+#include <asm/ptrace.h>
/*
* KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
@@ -27,4 +28,39 @@
#define MAX_NOTE_BYTES 1024
+/*
+ * Saving the registers of the cpu on which panic occured in
+ * crash_kexec to save a valid sp. The registers of other cpus
+ * will be saved in machine_crash_shutdown while shooting down them.
+ */
+
+static inline void crash_setup_regs(struct pt_regs *newregs,
+ struct pt_regs *oldregs)
+{
+ if (oldregs)
+ memcpy(newregs, oldregs, sizeof(*newregs));
+ else {
+ __asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx));
+ __asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx));
+ __asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx));
+ __asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi));
+ __asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi));
+ __asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp));
+ __asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax));
+ __asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp));
+ __asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8));
+ __asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9));
+ __asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10));
+ __asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11));
+ __asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12));
+ __asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13));
+ __asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14));
+ __asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15));
+ __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
+ __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
+ __asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags));
+
+ newregs->rip = (unsigned long)current_text_addr();
+ }
+}
#endif /* _X86_64_KEXEC_H */
_
o Moving the crash_dump.c file to arch dependent part as
kmap_atomic_pfn is specific to i386 and highmem may not
exist in other archs.
o Use ioremap for x86_64 to map the previous kernel memory.
o In copy_oldmem_page(), we now directly copy to the user/kernel
buffer and avoid the unnecessary copy to a kmalloc'd page.
Signed-off-by:Rachita Kothiyal <[email protected]>
Signed-off-by: Vivek Goyal <[email protected]>
---
/dev/null | 61 ----------
linux-2.6.15-rc1-1M-dynamic-root/arch/i386/kernel/Makefile | 1
linux-2.6.15-rc1-1M-dynamic-root/arch/i386/kernel/crash_dump.c | 47 +++++++
linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/Makefile | 1
linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/crash_dump.c | 47 +++++++
linux-2.6.15-rc1-1M-dynamic-root/fs/proc/vmcore.c | 3
linux-2.6.15-rc1-1M-dynamic-root/kernel/Makefile | 1
7 files changed, 99 insertions(+), 62 deletions(-)
diff -puN /dev/null arch/i386/kernel/crash_dump.c
--- /dev/null 2005-11-15 21:13:36.730096500 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/i386/kernel/crash_dump.c 2005-11-17 11:19:55.000000000 +0530
@@ -0,0 +1,47 @@
+/*
+ * kernel/crash_dump.c - Memory preserving reboot related code.
+ *
+ * Created by: Hariprasad Nellitheertha ([email protected])
+ * Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/crash_dump.h>
+
+#include <asm/uaccess.h>
+
+/**
+ * copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+ size_t csize, unsigned long offset, int userbuf)
+{
+ void *vaddr;
+
+ if (!csize)
+ return 0;
+
+ vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
+
+ if (userbuf) {
+ if (copy_to_user(buf, (vaddr + offset), csize)) {
+ kunmap_atomic(vaddr, KM_PTE0);
+ return -EFAULT;
+ }
+ } else
+ memcpy(buf, (vaddr + offset), csize);
+
+ kunmap_atomic(vaddr, KM_PTE0);
+ return csize;
+}
diff -puN arch/i386/kernel/Makefile~read-previous-kernel-memory arch/i386/kernel/Makefile
--- linux-2.6.15-rc1-1M-dynamic/arch/i386/kernel/Makefile~read-previous-kernel-memory 2005-11-17 11:19:55.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/i386/kernel/Makefile 2005-11-17 11:19:55.000000000 +0530
@@ -25,6 +25,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o n
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o
obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_X86_NUMAQ) += numaq.o
obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o
obj-$(CONFIG_KPROBES) += kprobes.o
diff -puN /dev/null arch/x86_64/kernel/crash_dump.c
--- /dev/null 2005-11-15 21:13:36.730096500 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/crash_dump.c 2005-11-17 11:19:55.000000000 +0530
@@ -0,0 +1,47 @@
+/*
+ * kernel/crash_dump.c - Memory preserving reboot related code.
+ *
+ * Created by: Hariprasad Nellitheertha ([email protected])
+ * Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/errno.h>
+#include <linux/crash_dump.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+/**
+ * copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+ size_t csize, unsigned long offset, int userbuf)
+{
+ void *vaddr;
+
+ if (!csize)
+ return 0;
+
+ vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+
+ if (userbuf) {
+ if (copy_to_user(buf, (vaddr + offset), csize)) {
+ iounmap(vaddr);
+ return -EFAULT;
+ }
+ } else
+ memcpy(buf, (vaddr + offset), csize);
+
+ iounmap(vaddr);
+ return csize;
+}
diff -puN arch/x86_64/kernel/Makefile~read-previous-kernel-memory arch/x86_64/kernel/Makefile
--- linux-2.6.15-rc1-1M-dynamic/arch/x86_64/kernel/Makefile~read-previous-kernel-memory 2005-11-17 11:19:55.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/arch/x86_64/kernel/Makefile 2005-11-17 11:19:55.000000000 +0530
@@ -21,6 +21,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
genapic.o genapic_cluster.o genapic_flat.o
obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_PM) += suspend.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
obj-$(CONFIG_CPU_FREQ) += cpufreq/
diff -puN fs/proc/vmcore.c~read-previous-kernel-memory fs/proc/vmcore.c
--- linux-2.6.15-rc1-1M-dynamic/fs/proc/vmcore.c~read-previous-kernel-memory 2005-11-17 11:19:55.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/fs/proc/vmcore.c 2005-11-17 11:19:55.000000000 +0530
@@ -35,6 +35,9 @@ static size_t elfcorebuf_sz;
/* Total size of vmcore file. */
static u64 vmcore_size;
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
struct proc_dir_entry *proc_vmcore = NULL;
/* Reads a page from the oldmem device from given offset. */
diff -L kernel/crash_dump.c -puN kernel/crash_dump.c~read-previous-kernel-memory /dev/null
--- linux-2.6.15-rc1-1M-dynamic/kernel/crash_dump.c
+++ /dev/null 2005-11-15 21:13:36.730096500 +0530
@@ -1,61 +0,0 @@
-/*
- * kernel/crash_dump.c - Memory preserving reboot related code.
- *
- * Created by: Hariprasad Nellitheertha ([email protected])
- * Copyright (C) IBM Corporation, 2004. All rights reserved
- */
-
-#include <linux/smp_lock.h>
-#include <linux/errno.h>
-#include <linux/proc_fs.h>
-#include <linux/bootmem.h>
-#include <linux/highmem.h>
-#include <linux/crash_dump.h>
-
-#include <asm/io.h>
-#include <asm/uaccess.h>
-
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- * space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- * otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
- size_t csize, unsigned long offset, int userbuf)
-{
- void *page, *vaddr;
-
- if (!csize)
- return 0;
-
- page = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!page)
- return -ENOMEM;
-
- vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
- copy_page(page, vaddr);
- kunmap_atomic(vaddr, KM_PTE0);
-
- if (userbuf) {
- if (copy_to_user(buf, (page + offset), csize)) {
- kfree(page);
- return -EFAULT;
- }
- } else {
- memcpy(buf, (page + offset), csize);
- }
-
- kfree(page);
- return csize;
-}
diff -puN kernel/Makefile~read-previous-kernel-memory kernel/Makefile
--- linux-2.6.15-rc1-1M-dynamic/kernel/Makefile~read-previous-kernel-memory 2005-11-17 11:19:55.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/kernel/Makefile 2005-11-17 11:19:55.000000000 +0530
@@ -29,7 +29,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
_
o In some cases, the number of segments, on a kexec load, exceeds
the existing cap of 8. This patch increases the KEXEC_SEGMENT_MAX
limit from 8 to 16.
Signed-off-by:Rachita Kothiyal <[email protected]>
Signed-off-by: Vivek Goyal <[email protected]>
---
linux-2.6.15-rc1-1M-dynamic-root/include/linux/kexec.h | 2 +-
1 files changed, 1 insertion(+), 1 deletion(-)
diff -puN include/linux/kexec.h~kexec-increase-max-segment-limit include/linux/kexec.h
--- linux-2.6.15-rc1-1M-dynamic/include/linux/kexec.h~kexec-increase-max-segment-limit 2005-11-17 11:11:15.000000000 +0530
+++ linux-2.6.15-rc1-1M-dynamic-root/include/linux/kexec.h 2005-11-17 11:11:15.000000000 +0530
@@ -41,7 +41,7 @@ typedef unsigned long kimage_entry_t;
#define IND_DONE 0x4
#define IND_SOURCE 0x8
-#define KEXEC_SEGMENT_MAX 8
+#define KEXEC_SEGMENT_MAX 16
struct kexec_segment {
void __user *buf;
size_t bufsz;
_
Please always generate diffs against the latest kernel! I changed the
patch to reflect the new location of ppc64's machine_kexec.c.
In that file, I notice that this comment has become more informative:
/*
* Provide a dummy crash_notes definition until crash dump is implemented.
* This prevents breakage of crash_notes attribute in kernel/ksysfs.c.
*/
note_buf_t crash_notes[NR_CPUS];
Please check that with your new implementation, the above "breakage"
(whatever it was) remains fixed.
Vivek Goyal <[email protected]> wrote:
>
> + /*
> + * Might be reading other cpu's data based on which cpu read thread
> + * has been scheduled. But cpu data (memory) is allocated once during
> + * boot up and this data does not change there after. Hence this
> + * operation should be safe. No locking required.
> + */
> + get_cpu();
> + addr = __pa(per_cpu_ptr(crash_notes, cpunum));
> + rc = sprintf(buf, "%Lx\n", addr);
> + put_cpu();
I don't think the get_cpu() and put_cpu() are needed here?
Vivek Goyal <[email protected]> wrote:
>
> +ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
> + size_t csize, unsigned long offset, int userbuf)
> +{
> + void *vaddr;
> +
> + if (!csize)
> + return 0;
> +
> + vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
> +
> + if (userbuf) {
> + if (copy_to_user(buf, (vaddr + offset), csize)) {
> + kunmap_atomic(vaddr, KM_PTE0);
> + return -EFAULT;
The copy_*_user() inside kmap_atomic() is problematic.
On some configs (eg, x86, highmem) the process is running atomically, hence
the copy_*_user() will *refuse* to fault in the user's page if it's not
present. Because pagefaulting involves doing things which sleep.
So
a) This code will generate might_sleep() warnings at runtime and
b) It'll return -EFAULT for user pages which haven't been faulted in yet.
We do all sorts of gruesome tricks in mm/filemap.c to get around all this.
I don't think your code is as performance-sensitive, so a suitable fix
might be to double-copy the data. Make sure that the same physical page is
used as a bounce page for each copy (ie: get the caller to pass it in) and
that page will be cache-hot and the performance should be acceptable.
If it really is performance-sensitive then you'll need to play filemap.c
games. It'd be better to use a sleeping kmap instead, if poss. That's
kmap().
Please send an incremental patch when it's sorted.
Andrew Morton <[email protected]> writes:
> Vivek Goyal <[email protected]> wrote:
>>
>> +ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
>> + size_t csize, unsigned long offset, int userbuf)
>> +{
>> + void *vaddr;
>> +
>> + if (!csize)
>> + return 0;
>> +
>> + vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
>> +
>> + if (userbuf) {
>> + if (copy_to_user(buf, (vaddr + offset), csize)) {
>> + kunmap_atomic(vaddr, KM_PTE0);
>> + return -EFAULT;
>
> The copy_*_user() inside kmap_atomic() is problematic.
>
> On some configs (eg, x86, highmem) the process is running atomically, hence
> the copy_*_user() will *refuse* to fault in the user's page if it's not
> present. Because pagefaulting involves doing things which sleep.
>
> So
>
> a) This code will generate might_sleep() warnings at runtime and
>
> b) It'll return -EFAULT for user pages which haven't been faulted in yet.
>
>
> We do all sorts of gruesome tricks in mm/filemap.c to get around all this.
> I don't think your code is as performance-sensitive, so a suitable fix
> might be to double-copy the data. Make sure that the same physical page is
> used as a bounce page for each copy (ie: get the caller to pass it in) and
> that page will be cache-hot and the performance should be acceptable.
>
> If it really is performance-sensitive then you'll need to play filemap.c
> games. It'd be better to use a sleeping kmap instead, if poss. That's
> kmap().
>
> Please send an incremental patch when it's sorted.
I'm going to send my standard grumble that what is really needed
is to track why we can't use /dev/mem.
We could solve this with a normal kmap except that we
quite possibly don't have a struct page for this page of memory.
/dev/mem does not allow reads in this situation because of how
problematic getting I/O reads correct is. But not having a good
interface for mapping it into user space is similar.
Could we simply make this into a file that you can mmap
but you can't read? I think that would be cleaner, and simpler.
Eric
Vivek Goyal wrote:
>o
>
>
>+static void crash_save_self(struct pt_regs *regs)
>+{
>+ int cpu;
>+
>+ cpu = smp_processor_id();
>+ crash_save_this_cpu(regs, cpu);
>+}
>+
>
>
>
>
I think we can remove crash_save_self() and call crash_save_this_cpu()
directly from machine_crash_shutdown().
On Thu, Nov 17, 2005 at 02:01:38PM -0800, Andrew Morton wrote:
>
> Please always generate diffs against the latest kernel! I changed the
> patch to reflect the new location of ppc64's machine_kexec.c.
>
Sorry. That's a mistake. I shall take care of it next time onwards.
> In that file, I notice that this comment has become more informative:
>
> /*
> * Provide a dummy crash_notes definition until crash dump is implemented.
> * This prevents breakage of crash_notes attribute in kernel/ksysfs.c.
> */
> note_buf_t crash_notes[NR_CPUS];
>
> Please check that with your new implementation, the above "breakage"
> (whatever it was) remains fixed.
>
With this patchset "crash_notes" has been moved into the architecture-independent
portion. Hence there is no need for architecture-dependent definitions.
So this change should be fine.
Thanks
Vivek
On Thu, Nov 17, 2005 at 02:01:38PM -0800, Andrew Morton wrote:
>
> Please always generate diffs against the latest kernel! I changed the
> patch to reflect the new location of ppc64's machine_kexec.c.
>
Hi Andrew, I just noticed in 2.6.15-rc1-mm2 that ppc64/machine_kexec.c has
been moved to powerpc/machine_kexec_64.c. So my and your changes have not
taken effect. I am attaching an incremental patch.
Thanks
Vivek
o The file ppc64/machine_kexec.c has now become powerpc/machine_kexec_64.c
This patch removes the crash_notes definition from machine_kexec_64.c
as crash_notes definition now has become architecture independent.
Signed-off-by: Vivek Goyal <[email protected]>
---
linux-2.6.15-rc1-mm2-1M-root/arch/powerpc/kernel/machine_kexec_64.c | 3 ---
1 files changed, 3 deletions(-)
diff -puN arch/powerpc/kernel/machine_kexec_64.c~kdump-powerpc-remove-crash-notes arch/powerpc/kernel/machine_kexec_64.c
--- linux-2.6.15-rc1-mm2-1M/arch/powerpc/kernel/machine_kexec_64.c~kdump-powerpc-remove-crash-notes 2005-11-18 17:15:19.000000000 +0530
+++ linux-2.6.15-rc1-mm2-1M-root/arch/powerpc/kernel/machine_kexec_64.c 2005-11-18 17:15:45.000000000 +0530
@@ -28,9 +28,6 @@
#define HASH_GROUP_SIZE 0x80 /* size of each hash group, asm/mmu.h */
-/* Have this around till we move it into crash specific file */
-note_buf_t crash_notes[NR_CPUS];
-
/* Dummy for now. Not sure if we need to have a crash shutdown in here
* and if what it will achieve. Letting it be now to compile the code
* in generic kexec environment
_
On Thu, Nov 17, 2005 at 02:07:23PM -0800, Andrew Morton wrote:
> Vivek Goyal <[email protected]> wrote:
> >
> > + /*
> > + * Might be reading other cpu's data based on which cpu read thread
> > + * has been scheduled. But cpu data (memory) is allocated once during
> > + * boot up and this data does not change there after. Hence this
> > + * operation should be safe. No locking required.
> > + */
> > + get_cpu();
> > + addr = __pa(per_cpu_ptr(crash_notes, cpunum));
> > + rc = sprintf(buf, "%Lx\n", addr);
> > + put_cpu();
>
> I don't think the get_cpu() and put_cpu() are needed here?
Thanks. I have done the changes. Please find attached the incremental patch.
Thanks
Vivek
o Removes the calls to get_cpu() and put_cpu() as they are not required.
Signed-off-by: Vivek Goyal <[email protected]>
---
linux-2.6.15-rc1-mm2-1M-root/drivers/base/cpu.c | 2 --
1 files changed, 2 deletions(-)
diff -puN drivers/base/cpu.c~kdump-export-crash-notes-sysfs-remove-get-cpu drivers/base/cpu.c
--- linux-2.6.15-rc1-mm2-1M/drivers/base/cpu.c~kdump-export-crash-notes-sysfs-remove-get-cpu 2005-11-18 16:08:28.000000000 +0530
+++ linux-2.6.15-rc1-mm2-1M-root/drivers/base/cpu.c 2005-11-18 16:08:28.000000000 +0530
@@ -101,10 +101,8 @@ static ssize_t show_crash_notes(struct s
* boot up and this data does not change there after. Hence this
* operation should be safe. No locking required.
*/
- get_cpu();
addr = __pa(per_cpu_ptr(crash_notes, cpunum));
rc = sprintf(buf, "%Lx\n", addr);
- put_cpu();
return rc;
}
static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
_
> diff -puN arch/x86_64/kernel/crash.c~x86_64-save-cpu-registers-upon-crash
> arch/x86_64/kernel/crash.c
> ---
> linux-2.6.15-rc1-1M-dynamic/arch/x86_64/kernel/crash.c~x86_64-save-cpu-registers-upon-crash
> #ifdef CONFIG_SMP
> static atomic_t waiting_for_crash_ipi;
>
> @@ -38,6 +106,7 @@ static int crash_nmi_callback(struct pt_
> return 1;
> local_irq_disable();
>
> + crash_save_this_cpu(regs, cpu);
> disable_local_APIC();
> atomic_dec(&waiting_for_crash_ipi);
> /* Assume hlt works */
> @@ -113,4 +182,5 @@ void machine_crash_shutdown(struct pt_re
> disable_IO_APIC();
> #endif
>
> + crash_save_self(regs);
> }
Where did this disable_local_APIC and disable_IO_APIC on x86_64 come
from? I know we had it on x86 but that was supposed to be a stop gap
until we have the real fix. Now I know it needs a little more
debugging but the real fix has been written. Putting it there on
x86_64 makes the code less reliable and it allows things to start
depending on it.
Please don't propagate this disabling of the apics to x86_64.
The rest of the patch looks fairly sane, although from a purely
paranoid perspective I can see a few things I would need to look
at more to be certain they are safe.
Eric
On Fri, Nov 18, 2005 at 02:52:33PM -0700, Eric W. Biederman wrote:
>
> > diff -puN arch/x86_64/kernel/crash.c~x86_64-save-cpu-registers-upon-crash
> > arch/x86_64/kernel/crash.c
> > ---
> > linux-2.6.15-rc1-1M-dynamic/arch/x86_64/kernel/crash.c~x86_64-save-cpu-registers-upon-crash
>
> > #ifdef CONFIG_SMP
> > static atomic_t waiting_for_crash_ipi;
> >
> > @@ -38,6 +106,7 @@ static int crash_nmi_callback(struct pt_
> > return 1;
> > local_irq_disable();
> >
> > + crash_save_this_cpu(regs, cpu);
> > disable_local_APIC();
> > atomic_dec(&waiting_for_crash_ipi);
> > /* Assume hlt works */
> > @@ -113,4 +182,5 @@ void machine_crash_shutdown(struct pt_re
> > disable_IO_APIC();
> > #endif
> >
> > + crash_save_self(regs);
> > }
>
> Where did this disable_local_APIC and disable_IO_APIC on x86_64 come
> from? I know we had it on x86 but that was supposed to be a stop gap
> until we have the real fix. Now I know it needs a little more
> debugging but the real fix has been written. Putting it there on
> x86_64 makes the code less reliable and it allows things to start
> depending on it.
>
Hi Eric,
Initially we had written the patch without disable_local_APIC and
disable_IO_APIC. But we realized later that the fix provided by you to
move apic initialization into init_IRQ has not been merged yet.
Like i386, here also this is a stop gap solution only till your patch is
merged. After that we shall drop this code.
I understand that it is less reliable but at least it provides us a base
to move forward and test kdump on x86_64 and address the other issues.
Thanks
Vivek
On Thu, Nov 17, 2005 at 02:20:23PM -0800, Andrew Morton wrote:
> Vivek Goyal <[email protected]> wrote:
> >
> > +ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
> > + size_t csize, unsigned long offset, int userbuf)
> > +{
> > + void *vaddr;
> > +
> > + if (!csize)
> > + return 0;
> > +
> > + vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
> > +
> > + if (userbuf) {
> > + if (copy_to_user(buf, (vaddr + offset), csize)) {
> > + kunmap_atomic(vaddr, KM_PTE0);
> > + return -EFAULT;
>
> The copy_*_user() inside kmap_atomic() is problematic.
>
> On some configs (eg, x86, highmem) the process is running atomically, hence
> the copy_*_user() will *refuse* to fault in the user's page if it's not
> present. Because pagefaulting involves doing things which sleep.
>
> So
>
> a) This code will generate might_sleep() warnings at runtime and
>
> b) It'll return -EFAULT for user pages which haven't been faulted in yet.
>
>
> We do all sorts of gruesome tricks in mm/filemap.c to get around all this.
> I don't think your code is as performance-sensitive, so a suitable fix
> might be to double-copy the data. Make sure that the same physical page is
> used as a bounce page for each copy (ie: get the caller to pass it in) and
> that page will be cache-hot and the performance should be acceptable.
>
> If it really is performance-sensitive then you'll need to play filemap.c
> games. It'd be better to use a sleeping kmap instead, if poss. That's
> kmap().
>
> Please send an incremental patch when it's sorted.
Hi Andrew
Sending along the incremental patch as suggested.
In this patch, a global buffer page is introduced, where the page
from the previous kernel's memory is copied, before copying it
out to a user buffer. This will take care of the issue of
copy_to_user() page faulting in an atomic context.
This patch has been generated against 2.6.15-rc2-mm1.
Kindly review.
Thanks
Rachita
o This patch allocates a page to copy the previous kernel's memory
before we copy it onto a user buffer using copy_to_user(), thereby
taking care of the scenario of a possible page fault in an atomic
context.
Signed-off-by: Rachita Kothiyal <[email protected]>
---
arch/i386/kernel/crash_dump.c | 39 +++++++++++++++++++++++++++++++++------
1 files changed, 33 insertions(+), 6 deletions(-)
diff -puN arch/i386/kernel/crash_dump.c~double_copy_read_oldmem arch/i386/kernel/crash_dump.c
--- linux-2.6.15-rc2-mm1/arch/i386/kernel/crash_dump.c~double_copy_read_oldmem 2005-11-23 17:50:07.258543864 +0530
+++ linux-2.6.15-rc2-mm1-rachita/arch/i386/kernel/crash_dump.c 2005-11-23 17:50:36.779056064 +0530
@@ -11,6 +11,8 @@
#include <asm/uaccess.h>
+static void *kdump_buf_page;
+
/**
* copy_oldmem_page - copy one page from "oldmem"
* @pfn: page frame number to be copied
@@ -23,6 +25,10 @@
*
* Copy a page from "oldmem". For this page, there is no pte mapped
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ *
+ * Calling copy_to_user() in atomic context is not desirable. Hence first
+ * copying the data to a pre-allocated kernel page and then copying to user
+ * space in non-atomic context.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
size_t csize, unsigned long offset, int userbuf)
@@ -34,14 +40,35 @@ ssize_t copy_oldmem_page(unsigned long p
vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
- if (userbuf) {
- if (copy_to_user(buf, (vaddr + offset), csize)) {
- kunmap_atomic(vaddr, KM_PTE0);
+ if (!userbuf) {
+ memcpy(buf, (vaddr + offset), csize);
+ kunmap_atomic(vaddr, KM_PTE0);
+ } else {
+ if (!kdump_buf_page) {
+ printk(KERN_WARNING "Kdump: Kdump buffer page not"
+ " allocated\n");
return -EFAULT;
}
- } else
- memcpy(buf, (vaddr + offset), csize);
+ copy_page(kdump_buf_page, vaddr);
+ kunmap_atomic(vaddr, KM_PTE0);
+ if (copy_to_user(buf, (kdump_buf_page + offset), csize))
+ return -EFAULT;
+ }
- kunmap_atomic(vaddr, KM_PTE0);
return csize;
}
+
+static int __init kdump_buf_page_init(void)
+{
+ int ret = 0;
+
+ kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!kdump_buf_page) {
+ printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
+ " page\n");
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+arch_initcall(kdump_buf_page_init);
_