Hi,
I have observed that kdump's reboot path to the second kernel is not
stack overflow safe.
On the event of a stack overflow critical data that usually resides at
the bottom of the stack is likely to be stomped and, consequently, its
use should be avoided.
In particular, in the i386 and IA64 architectures the macro
smp_processor_id ultimately makes use of the "cpu" member of struct
thread_info which resides at the bottom of the stack (see code snips
below). x86_64, on the other hand, is not affected by this problem
because it benefits from the PDA infrastructure.
****
* i386: thread_info is stomped and thus no longer valid
(include/linux/smp.h)
#define smp_processor_id() __smp_processor_id()
(include/asm-i386/smp.h)
#define __smp_processor_id() (current_thread_info()->cpu)
(include/asm-i386/thread_info.h)
static inline struct thread_info *current_thread_info(void)
{
struct thread_info *ti;
__asm__("andl %%esp,%0; ":"=r" (ti) : "" (~(THREAD_SIZE - 1)));
return ti;
}
* x86_64: thread_info is overwritten but this does not affect
smp_processor_id
(include/linux/smp.h)
#define smp_processor_id() __smp_processor_id()
(include/asm-x86_64/smp.h)
#define __smp_processor_id() read_pda(cpunumber)
(arch/x86_64/kernel/setup64.c)
struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned;
* IA64: both task_struct and thread_info are likely to be corrupted
(include/asm-ia64/smp.h)
#define smp_processor_id() (current_thread_info()->cpu)
(include/asm-ia64/thread_info.h)
#define current_thread_info() ((struct thread_info *) ((char *)
current + IA64_TASK_SIZE))
(include/asm-ia64/current.h)
#define current ((struct task_struct *) ia64_getreg(_IA64_REG_TP))
****
To circumvent this problem I suggest implementing
"safe_smp_processor_id()" (it already exists on x86_64) for i386 and
IA64 and use it as a replacement to smp_processor_id in the reboot path
to the dump capture kernel. A possible implementation for i386 is
attached below.
I would appreciante comments on this.
Regards,
Fernando
---
diff -urNp linux-2.6.15-rc2/arch/i386/kernel/crash.c
linux-2.6.15-rc2-sov/arch/i386/kernel/crash.c
--- linux-2.6.15-rc2/arch/i386/kernel/crash.c 2005-10-28
09:02:08.000000000 +0900
+++ linux-2.6.15-rc2-sov/arch/i386/kernel/crash.c 2005-11-28
21:54:19.000000000 +0900
@@ -120,7 +120,7 @@ static void crash_save_self(struct pt_re
struct pt_regs regs;
int cpu;
- cpu = smp_processor_id();
+ cpu = safe_smp_processor_id();
if (saved_regs)
crash_setup_regs(®s, saved_regs);
else
@@ -134,12 +134,14 @@ static atomic_t waiting_for_crash_ipi;
static int crash_nmi_callback(struct pt_regs *regs, int cpu)
{
struct pt_regs fixed_regs;
+ int scpu;
/* Don't do anything if this handler is invoked on crashing cpu.
* Otherwise, system will completely hang. Crashing cpu can get
* an NMI if system was initially booted with nmi_watchdog parameter.
*/
- if (cpu == crashing_cpu)
+ scpu = safe_smp_processor_id();
+ if (scpu == crashing_cpu)
return 1;
local_irq_disable();
@@ -147,7 +149,7 @@ static int crash_nmi_callback(struct pt_
crash_setup_regs(&fixed_regs, regs);
regs = &fixed_regs;
}
- crash_save_this_cpu(regs, cpu);
+ crash_save_this_cpu(regs, scpu);
disable_local_APIC();
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
@@ -211,7 +213,7 @@ void machine_crash_shutdown(struct pt_re
local_irq_disable();
/* Make a note of crashing cpu. Will be used in NMI callback.*/
- crashing_cpu = smp_processor_id();
+ crashing_cpu = safe_smp_processor_id();
nmi_shootdown_cpus();
lapic_shutdown();
#if defined(CONFIG_X86_IO_APIC)
diff -urNp linux-2.6.15-rc2/arch/i386/kernel/smp.c
linux-2.6.15-rc2-sov/arch/i386/kernel/smp.c
--- linux-2.6.15-rc2/arch/i386/kernel/smp.c 2005-10-28
09:02:08.000000000 +0900
+++ linux-2.6.15-rc2-sov/arch/i386/kernel/smp.c 2005-11-28
22:05:14.000000000 +0900
@@ -628,3 +628,26 @@ fastcall void smp_call_function_interrup
}
}
+static int convert_apicid_to_cpu(int apic_id)
+{
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (x86_cpu_to_apicid[i] == apic_id)
+ return i;
+ }
+ return -1;
+}
+
+int safe_smp_processor_id(void) {
+ int apicid, cpuid;
+
+ apicid = hard_smp_processor_id();
+ if (apicid == BAD_APICID)
+ return 0;
+
+ cpuid = convert_apicid_to_cpu(apicid);
+ printk("Test %d\n", cpuid);
+
+ return cpuid >= 0 ? cpuid : 0;
+}
diff -urNp linux-2.6.15-rc2/arch/i386/mm/fault.c
linux-2.6.15-rc2-sov/arch/i386/mm/fault.c
--- linux-2.6.15-rc2/arch/i386/mm/fault.c 2005-11-22 19:48:18.000000000
+0900
+++ linux-2.6.15-rc2-sov/arch/i386/mm/fault.c 2005-11-24
14:04:12.000000000 +0900
@@ -245,6 +245,11 @@ fastcall void __kprobes do_page_fault(st
local_irq_enable();
tsk = current;
+ /* We may have invalid '*current' due to a stack overflow. */
+ if (!virt_addr_valid(tsk)) {
+ printk("do_page_fault: Discarding invalid 'current' struct
task_struct * = 0x%p\n", tsk);
+ tsk = NULL;
+ }
si_code = SEGV_MAPERR;
@@ -271,7 +276,14 @@ fastcall void __kprobes do_page_fault(st
goto bad_area_nosemaphore;
}
- mm = tsk->mm;
+ mm = NULL;
+ /* We may have invalid 'tsk' due to a i386 stack overflow */
+ if (tsk)
+ mm = tsk->mm;
+ if (mm && !virt_addr_valid(mm)) {
+ printk("do_page_fault: Discarding invalid current->mm struct
mm_struct * = 0x%p\n", mm);
+ mm = NULL;
+ }
/*
* If we're in an interrupt, have no user context or are running in an
diff -urNp linux-2.6.15-rc2/include/asm-i386/smp.h
linux-2.6.15-rc2-sov/include/asm-i386/smp.h
--- linux-2.6.15-rc2/include/asm-i386/smp.h 2005-11-22
19:49:02.000000000 +0900
+++ linux-2.6.15-rc2-sov/include/asm-i386/smp.h 2005-11-28
21:30:26.000000000 +0900
@@ -90,12 +90,14 @@ static __inline int logical_smp_processo
#endif
+extern int safe_smp_processor_id(void);
extern int __cpu_disable(void);
extern void __cpu_die(unsigned int cpu);
#endif /* !__ASSEMBLY__ */
#else /* CONFIG_SMP */
+#define safe_smp_processor_id() 0
#define cpu_physical_id(cpu) boot_cpu_physical_apicid
#define NO_PROC_ID 0xFF /* No processor magic marker */
Fernando Luis Vazquez Cao <[email protected]> writes:
> Hi,
>
> I have observed that kdump's reboot path to the second kernel is not
> stack overflow safe.
>
> On the event of a stack overflow critical data that usually resides at
> the bottom of the stack is likely to be stomped and, consequently, its
> use should be avoided.
>
> In particular, in the i386 and IA64 architectures the macro
> smp_processor_id ultimately makes use of the "cpu" member of struct
> thread_info which resides at the bottom of the stack (see code snips
> below). x86_64, on the other hand, is not affected by this problem
> because it benefits from the PDA infrastructure.
I agree this is something that we should handle if we can.
>
> To circumvent this problem I suggest implementing
> "safe_smp_processor_id()" (it already exists on x86_64) for i386 and
> IA64 and use it as a replacement to smp_processor_id in the reboot path
> to the dump capture kernel. A possible implementation for i386 is
> attached below.
>
> I would appreciante comments on this.
The patch looks like a good one to express the idea, but it is a
bad one to push upstream.
safe_smp_processor_id has a printk in it.
mm/fault.c has related code that really should go into a separate
patch.
For crash_nmi_callback I don't feel very comfortable about
dropping the cpu parameter. I suspect you want to move
the call safe_smp_process_id to do_nmi (which is current
calling smp_processor_id). Basically the whole nmi path needs a stack
overflow audit. I believe we have a separate interrupt stack that
should help but..
Eric
On Mon, 2005-11-28 at 06:39 -0700, Eric W. Biederman wrote:
> Fernando Luis Vazquez Cao <[email protected]> writes:
>
> > Hi,
> >
> > I have observed that kdump's reboot path to the second kernel is not
> > stack overflow safe.
> >
> > On the event of a stack overflow critical data that usually resides at
> > the bottom of the stack is likely to be stomped and, consequently, its
> > use should be avoided.
> >
> > In particular, in the i386 and IA64 architectures the macro
> > smp_processor_id ultimately makes use of the "cpu" member of struct
> > thread_info which resides at the bottom of the stack (see code snips
> > below). x86_64, on the other hand, is not affected by this problem
> > because it benefits from the PDA infrastructure.
>
> I agree this is something that we should handle if we can.
> >
> > To circumvent this problem I suggest implementing
> > "safe_smp_processor_id()" (it already exists on x86_64) for i386 and
> > IA64 and use it as a replacement to smp_processor_id in the reboot path
> > to the dump capture kernel. A possible implementation for i386 is
> > attached below.
> >
> > I would appreciante comments on this.
>
> The patch looks like a good one to express the idea, but it is a
> bad one to push upstream.
>
> safe_smp_processor_id has a printk in it.
Sorry for the vestige of debugging code.
> mm/fault.c has related code that really should go into a separate
> patch.
>
> For crash_nmi_callback I don't feel very comfortable about
> dropping the cpu parameter. I suspect you want to move
> the call safe_smp_process_id to do_nmi (which is current
> calling smp_processor_id). Basically the whole nmi path needs a stack
> overflow audit.
The reason behind dropping the cpu parameter in crash_nmi_callback was
to avoid the use of the much slower safe_smp_processor_id in do_nmi,
what would have an impact on all possible nmi handlers. This is ok with
crash_nmi_callback since it obviously isn't performance critical. But,
reconsidering the problem, one could argue that an nmi handler will
hardly ever be a fast path. I think I will move safe_smp_processor_id to
do_nmi as you suggested (sorry for thinking aloud).
Regarding the stack overflow audit of the nmi path, we have the problem
that both nmi_enter and nmi_exit in do_nmi (see code below) make heavy
use of "current" indirectly (specially through the kernel preemption
code).
fastcall void do_nmi(struct pt_regs * regs, long error_code)
{
int cpu;
nmi_enter();
cpu = smp_processor_id();
#ifdef CONFIG_HOTPLUG_CPU
if (!cpu_online(cpu)) {
nmi_exit();
return;
}
#endif
++nmi_count(cpu);
if (!rcu_dereference(nmi_callback)(regs, cpu))
default_do_nmi(regs);
nmi_exit();
}
> I believe we have a separate interrupt stack that
> should help but..
Yes, when using 4K stacks we have a separate interrupt stack that should
help, but I am afraid that crash dumping is about being paranoid.
I will split the previous patch as indicated below appropriately and
resend the parts in subsequent emails.
List of patches:
* safe_smp_processor_id: stack overflow safe implementation of
smp_processor_id
* crash: replace smp_processor_id with safe_smp_processor_id in
arch/i386/kernel/crash.c
* do_nmi: replace smp_processor_id with safe_smp_processor_id
NOTE: this patch is still imcomplete. Once I audit the whole nmi path
I will prepare a new patch complementing this.
* fault: take stack overflows into account in do_page_fault
Regards,
Fernando
Fernando Luis Vazquez Cao <[email protected]> writes:
> On Mon, 2005-11-28 at 06:39 -0700, Eric W. Biederman wrote:
>> Fernando Luis Vazquez Cao <[email protected]> writes:
> Regarding the stack overflow audit of the nmi path, we have the problem
> that both nmi_enter and nmi_exit in do_nmi (see code below) make heavy
> use of "current" indirectly (specially through the kernel preemption
> code).
Ok. I wonder if it would be saner to simply replace the nmi trap
handler on the crash dump path?
>> I believe we have a separate interrupt stack that
>> should help but..
> Yes, when using 4K stacks we have a separate interrupt stack that should
> help, but I am afraid that crash dumping is about being paranoid.
Oh I agree. If we had a private 4K stack for the nmi handler we
would not need to worry about overflow in that case. (baring
nmi happening during nmis) Hmm. Is there anything to keep
us doing something bad in that case?
I guess as long as we don't clear the high bit of port 0x70 we
should be reasonably safe from the nmi firing multiple times.
Eric
On Mon, Nov 28, 2005 at 11:29:29AM -0700, Eric W. Biederman wrote:
> Fernando Luis Vazquez Cao <[email protected]> writes:
>
> > On Mon, 2005-11-28 at 06:39 -0700, Eric W. Biederman wrote:
> >> Fernando Luis Vazquez Cao <[email protected]> writes:
>
> > Regarding the stack overflow audit of the nmi path, we have the problem
> > that both nmi_enter and nmi_exit in do_nmi (see code below) make heavy
> > use of "current" indirectly (specially through the kernel preemption
> > code).
>
> Ok. I wonder if it would be saner to simply replace the nmi trap
> handler on the crash dump path?
>
Sounds interesting.
> >> I believe we have a separate interrupt stack that
> >> should help but..
> > Yes, when using 4K stacks we have a separate interrupt stack that should
> > help, but I am afraid that crash dumping is about being paranoid.
>
> Oh I agree. If we had a private 4K stack for the nmi handler we
> would not need to worry about overflow in that case.
Having private 4K stack makes sense as crash_nmi_callback() itself
requires quite some space on stack. If one has enabled CONFIG_4KSTACKS,
then we use separate interrupt stack and we are probably safe from stack
overflows but otherwise we need it.
> (baring
> nmi happening during nmis) Hmm. Is there anything to keep
> us doing something bad in that case?
>
> I guess as long as we don't clear the high bit of port 0x70 we
> should be reasonably safe from the nmi firing multiple times.
Are you referring to port 0x23 for IMCR register.
Thanks
Vivek
On Mon, 2005-11-28 at 11:29 -0700, Eric W. Biederman wrote:
> Fernando Luis Vazquez Cao <[email protected]> writes:
>
> > On Mon, 2005-11-28 at 06:39 -0700, Eric W. Biederman wrote:
> >> Fernando Luis Vazquez Cao <[email protected]> writes:
>
> > Regarding the stack overflow audit of the nmi path, we have the problem
> > that both nmi_enter and nmi_exit in do_nmi (see code below) make heavy
> > use of "current" indirectly (specially through the kernel preemption
> > code).
>
> Ok. I wonder if it would be saner to simply replace the nmi trap
> handler on the crash dump path?
That seems to be the cleanest way to solve the problem. I will write a
patch implementing that and see how it works.
> >> I believe we have a separate interrupt stack that
> >> should help but..
> > Yes, when using 4K stacks we have a separate interrupt stack that should
> > help, but I am afraid that crash dumping is about being paranoid.
>
> Oh I agree. If we had a private 4K stack for the nmi handler we
> would not need to worry about overflow in that case. (baring
> nmi happening during nmis) Hmm. Is there anything to keep
> us doing something bad in that case?
I think that is a sensible thing to do. I am just back from a day
off, but tomorrow I will take a closer look at this.
Regards,
Fernando
Vivek Goyal <[email protected]> writes:
> On Mon, Nov 28, 2005 at 11:29:29AM -0700, Eric W. Biederman wrote:
>> (baring
>> nmi happening during nmis) Hmm. Is there anything to keep
>> us doing something bad in that case?
>>
>> I guess as long as we don't clear the high bit of port 0x70 we
>> should be reasonably safe from the nmi firing multiple times.
>
> Are you referring to port 0x23 for IMCR register.
No. Port 0x70 is the cmos address register. 0x71 the cmos
data register. The high bit of 0x70 if the nmi enable.
Although with local apics generating nmis I would be surprised
if it effective in every case. :)
Eric
On Mon, Nov 28, 2005 at 11:29:29AM -0700, Eric W. Biederman wrote:
> Fernando Luis Vazquez Cao <[email protected]> writes:
>
> > On Mon, 2005-11-28 at 06:39 -0700, Eric W. Biederman wrote:
> >> Fernando Luis Vazquez Cao <[email protected]> writes:
>
> > Regarding the stack overflow audit of the nmi path, we have the problem
> > that both nmi_enter and nmi_exit in do_nmi (see code below) make heavy
> > use of "current" indirectly (specially through the kernel preemption
> > code).
>
> Ok. I wonder if it would be saner to simply replace the nmi trap
> handler on the crash dump path?
>
> >> I believe we have a separate interrupt stack that
> >> should help but..
> > Yes, when using 4K stacks we have a separate interrupt stack that should
> > help, but I am afraid that crash dumping is about being paranoid.
>
> Oh I agree. If we had a private 4K stack for the nmi handler we
> would not need to worry about overflow in that case. (baring
> nmi happening during nmis) Hmm. Is there anything to keep
> us doing something bad in that case?
>
Can a NMI happen inside a NMI? As per Intel software developer manual vol3
(section 5.7.1 Handling multiple NMIs), after occurrence of an NMI, CPU
will not accept next NMI till iret is executed. Then it should not be a
problem. I hope, I understood the problem right.
Thanks
Vivek