Hi,
I submitted a quite similar patch last December.
http://www.spinics.net/lists/linux-mm/msg13157.html
I retry it with different description of the purpose.
[Changelog]
from v1:
- Change name of sysctl parameter ,kexec_on_mce, to kexec_on_hwerr.
- Move variable declaration from <asm/mce.h> to <kernel/panic.h>.
- Remove CONFIG_X86_MCE in *.c files.
- Modify [Purpose]/[Patch Description].
[Purpose]
There are some logging features of firmware/hardware, SEL,BMC, etc, in enterprise servers.
We investigate the firmware/hardware logs first when MCE occurred and replace the broken hardware.
So, memory dump is not necessary for detecting root cause of machine check.
Also, we can reduce down-time by skipping kdump.
Of course, there are a lot of servers which don't have logging features of firmware/hardware.
So, I proposed a option controlling kexec behaviour when hardware error occurred.
[Patch Description]
This patch adds a sysctl option ,kernel.kexec_on_hwerr, controlling kexec behaviour when hardware error occurred.
- Permission
$B!!!!(B- 0644
- Value(default is "1")
- non-zero: Kexec is enabled regardless of hardware error.
- 0: Kexec is disabled when MCE occurred.
Matrix of kernel.kexec_on_hwerr value ,hardware error and kexec
--------------------------------------------------
kernel.kexec_on_hwerr| hardware error | kexec
--------------------------------------------------
non-zero | occurred | enabled
-----------------------------
| not occurred | enabled
--------------------------------------------------
0 | occurred | disabled
|----------------------------
| not occurred | enabled
--------------------------------------------------
Any comments and suggestions are welcome.
Signed-off-by: Seiji Aguchi <[email protected]>
---
Documentation/sysctl/kernel.txt | 11 +++++++++++
arch/x86/kernel/cpu/mcheck/mce.c | 2 ++
include/linux/kernel.h | 2 ++
include/linux/sysctl.h | 1 +
kernel/panic.c | 15 ++++++++++++++-
kernel/sysctl.c | 8 ++++++++
kernel/sysctl_binary.c | 1 +
mm/memory-failure.c | 2 ++
8 files changed, 41 insertions(+), 1 deletions(-)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 11d5ced..3159111 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -34,6 +34,7 @@ show up in /proc/sys/kernel:
- hotplug
- java-appletviewer [ binfmt_java, obsolete ]
- java-interpreter [ binfmt_java, obsolete ]
+- kexec_on_hwerr [ x86 only ]
- kptr_restrict
- kstack_depth_to_print [ X86 only ]
- l2cr [ PPC only ]
@@ -261,6 +262,16 @@ This flag controls the L2 cache of G3 processor boards. If 0, the cache is disabled. Enabled if nonzero.
==============================================================
+kexec_on_hwerr: (X86 only)
+
+Controls the behaviour of kexec when panic occurred due to hardware
+error.
+Default value is 1.
+
+0: Kexec is disabled.
+non-zero: Kexec is enabled.
+
+==============================================================
kptr_restrict:
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d916183..e76b47b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -944,6 +944,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
percpu_inc(mce_exception_count);
+ hwerr_flag = 1;
+
if (notify_die(DIE_NMI, "machine check", regs, error_code,
18, SIGKILL) == NOTIFY_STOP)
goto out;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2fe6e84..c2fba7c 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -242,6 +242,8 @@ extern void add_taint(unsigned flag); extern int test_taint(unsigned flag); extern unsigned long get_taint(void); extern int root_mountflags;
+extern int kexec_on_hwerr;
+extern int hwerr_flag;
extern bool early_boot_irqs_disabled;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 7bb5cb6..8ae5bfe 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -153,6 +153,7 @@ enum
KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */
KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
+ KERN_KEXEC_ON_HWERR=77, /* int: bevaviour of kexec for hardware error
+*/
};
diff --git a/kernel/panic.c b/kernel/panic.c index 991bb87..84c1d2e 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -28,6 +28,8 @@
#define PANIC_BLINK_SPD 18
int panic_on_oops;
+int kexec_on_hwerr = 1;
+int hwerr_flag;
static unsigned long tainted_mask;
static int pause_on_oops;
static int pause_on_oops_flag;
@@ -45,6 +47,16 @@ static long no_blink(int state)
return 0;
}
+static int kexec_should_skip(void)
+{
+ if (!kexec_on_hwerr && hwerr_flag) {
+ printk(KERN_WARNING "Kexec is skipped because hardware error "
+ "occurred.\n");
+ return 1;
+ }
+ return 0;
+}
+
/* Returns how long it waited in ms */
long (*panic_blink)(int state);
EXPORT_SYMBOL(panic_blink);
@@ -86,7 +98,8 @@ NORET_TYPE void panic(const char * fmt, ...)
* everything else.
* Do we want to call this before we try to display a message?
*/
- crash_kexec(NULL);
+ if (!kexec_should_skip())
+ crash_kexec(NULL);
kmsg_dump(KMSG_DUMP_PANIC);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f1bd83..f78edd8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -811,6 +811,14 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "kexec_on_hwerr",
+ .data = &kexec_on_hwerr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+
#endif
#if defined(CONFIG_MMU)
{
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b875bed..8d572ca 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
{ CTL_INT, KERN_COMPAT_LOG, "compat-log" },
{ CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
{ CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
+ { CTL_INT, KERN_KEXEC_ON_HWERR, "kexec_on_hwerr" },
{}
};
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0207c2f..0178f47 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -994,6 +994,8 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
int res;
unsigned int nr_pages;
+ hwerr_flag = 1;
+
if (!sysctl_memory_failure_recovery)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
--
1.7.1
On Wed, Feb 09, 2011 at 11:35:43AM -0500, Seiji Aguchi wrote:
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -153,6 +153,7 @@ enum
> KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */
> KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
> KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
> + KERN_KEXEC_ON_HWERR=77, /* int: bevaviour of kexec for hardware error
> +*/
Odd trailing comment on the next line.
Seiji Aguchi <[email protected]> writes:
> Hi,
>
> I submitted a quite similar patch last December.
>
> http://www.spinics.net/lists/linux-mm/msg13157.html
>
> I retry it with different description of the purpose.
>
> [Changelog]
> from v1:
> - Change name of sysctl parameter ,kexec_on_mce, to kexec_on_hwerr.
> - Move variable declaration from <asm/mce.h> to <kernel/panic.h>.
> - Remove CONFIG_X86_MCE in *.c files.
> - Modify [Purpose]/[Patch Description].
>
> [Purpose]
> There are some logging features of firmware/hardware, SEL,BMC, etc, in enterprise servers.
> We investigate the firmware/hardware logs first when MCE occurred and replace the broken hardware.
> So, memory dump is not necessary for detecting root cause of machine check.
> Also, we can reduce down-time by skipping kdump.
>
> Of course, there are a lot of servers which don't have logging features of firmware/hardware.
> So, I proposed a option controlling kexec behaviour when hardware error occurred.
Mostly this seems reasonable. If we can get the logic simple enough it
is fool proof I am for it.
> [Patch Description]
> This patch adds a sysctl option ,kernel.kexec_on_hwerr, controlling kexec behaviour when hardware error occurred.
>
> - Permission
> - 0644
> - Value(default is "1")
> - non-zero: Kexec is enabled regardless of hardware error.
> - 0: Kexec is disabled when MCE occurred.
>
>
> Matrix of kernel.kexec_on_hwerr value ,hardware error and kexec
If we do a version that is potentially arch agnostic but x86 for now,
and we call it kexec_on_logged_hwerr. Because it is important that we
expect that the hardware will log the error.
Is there any reason we can't put logic to decided if we should write
a crashdump in the crashdump userspace?
Eric
> --------------------------------------------------
> kernel.kexec_on_hwerr| hardware error | kexec
> --------------------------------------------------
> non-zero | occurred | enabled
> -----------------------------
> | not occurred | enabled
> --------------------------------------------------
> 0 | occurred | disabled
> |----------------------------
> | not occurred | enabled
> --------------------------------------------------
>
>
> Any comments and suggestions are welcome.
>
> Signed-off-by: Seiji Aguchi <[email protected]>
>
> ---
> Documentation/sysctl/kernel.txt | 11 +++++++++++
> arch/x86/kernel/cpu/mcheck/mce.c | 2 ++
> include/linux/kernel.h | 2 ++
> include/linux/sysctl.h | 1 +
> kernel/panic.c | 15 ++++++++++++++-
> kernel/sysctl.c | 8 ++++++++
> kernel/sysctl_binary.c | 1 +
> mm/memory-failure.c | 2 ++
> 8 files changed, 41 insertions(+), 1 deletions(-)
>
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 11d5ced..3159111 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -34,6 +34,7 @@ show up in /proc/sys/kernel:
> - hotplug
> - java-appletviewer [ binfmt_java, obsolete ]
> - java-interpreter [ binfmt_java, obsolete ]
> +- kexec_on_hwerr [ x86 only ]
> - kptr_restrict
> - kstack_depth_to_print [ X86 only ]
> - l2cr [ PPC only ]
> @@ -261,6 +262,16 @@ This flag controls the L2 cache of G3 processor boards. If 0, the cache is disabled. Enabled if nonzero.
>
> ==============================================================
> +kexec_on_hwerr: (X86 only)
> +
> +Controls the behaviour of kexec when panic occurred due to hardware
> +error.
> +Default value is 1.
> +
> +0: Kexec is disabled.
> +non-zero: Kexec is enabled.
> +
> +==============================================================
>
> kptr_restrict:
>
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index d916183..e76b47b 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -944,6 +944,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
>
> percpu_inc(mce_exception_count);
>
> + hwerr_flag = 1;
> +
> if (notify_die(DIE_NMI, "machine check", regs, error_code,
> 18, SIGKILL) == NOTIFY_STOP)
> goto out;
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2fe6e84..c2fba7c 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -242,6 +242,8 @@ extern void add_taint(unsigned flag); extern int test_taint(unsigned flag); extern unsigned long get_taint(void); extern int root_mountflags;
> +extern int kexec_on_hwerr;
> +extern int hwerr_flag;
>
> extern bool early_boot_irqs_disabled;
>
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 7bb5cb6..8ae5bfe 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -153,6 +153,7 @@ enum
> KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */
> KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
> KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
> + KERN_KEXEC_ON_HWERR=77, /* int: bevaviour of kexec for hardware error
> +*/
Don't change this file. You don't need a binary number.
> };
>
>
> diff --git a/kernel/panic.c b/kernel/panic.c index 991bb87..84c1d2e 100644
> --- a/kernel/panic.c
> +++ b/kernel/panic.c
> @@ -28,6 +28,8 @@
> #define PANIC_BLINK_SPD 18
>
> int panic_on_oops;
> +int kexec_on_hwerr = 1;
> +int hwerr_flag;
> static unsigned long tainted_mask;
> static int pause_on_oops;
> static int pause_on_oops_flag;
> @@ -45,6 +47,16 @@ static long no_blink(int state)
> return 0;
> }
>
> +static int kexec_should_skip(void)
> +{
> + if (!kexec_on_hwerr && hwerr_flag) {
> + printk(KERN_WARNING "Kexec is skipped because hardware error "
> + "occurred.\n");
> + return 1;
> + }
> + return 0;
> +}
> +
> /* Returns how long it waited in ms */
> long (*panic_blink)(int state);
> EXPORT_SYMBOL(panic_blink);
> @@ -86,7 +98,8 @@ NORET_TYPE void panic(const char * fmt, ...)
> * everything else.
> * Do we want to call this before we try to display a message?
> */
> - crash_kexec(NULL);
> + if (!kexec_should_skip())
> + crash_kexec(NULL);
>
> kmsg_dump(KMSG_DUMP_PANIC);
>
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f1bd83..f78edd8 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -811,6 +811,14 @@ static struct ctl_table kern_table[] = {
> .mode = 0644,
> .proc_handler = proc_dointvec,
> },
> + {
> + .procname = "kexec_on_hwerr",
> + .data = &kexec_on_hwerr,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec,
> + },
> +
> #endif
> #if defined(CONFIG_MMU)
> {
> diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b875bed..8d572ca 100644
> --- a/kernel/sysctl_binary.c
> +++ b/kernel/sysctl_binary.c
> @@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
> { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
> { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
> { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
> + { CTL_INT, KERN_KEXEC_ON_HWERR, "kexec_on_hwerr" },
> {}
Don't change this file. No one uses the binary interface.
> };
>
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0207c2f..0178f47 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -994,6 +994,8 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
> int res;
> unsigned int nr_pages;
>
> + hwerr_flag = 1;
> +
> if (!sysctl_memory_failure_recovery)
> panic("Memory failure from trap %d on page %lx", trapno,
> pfn);
I get the feeling that we should either call a function besides panic or
do something different so that we aren't controlling this trough an
implicit parameter set in a global variable. That just seems scary racy
and hard to understand by reading the code.
Eric
Seiji Aguchi <[email protected]> writes:
> Hi,
>
> I submitted a quite similar patch last December.
>
> http://www.spinics.net/lists/linux-mm/msg13157.html
>
> I retry it with different description of the purpose.
>
> [Changelog]
> from v1:
> - Change name of sysctl parameter ,kexec_on_mce, to kexec_on_hwerr.
> - Move variable declaration from <asm/mce.h> to <kernel/panic.h>.
> - Remove CONFIG_X86_MCE in *.c files.
> - Modify [Purpose]/[Patch Description].
>
> [Purpose]
> There are some logging features of firmware/hardware, SEL,BMC, etc, in enterprise servers.
> We investigate the firmware/hardware logs first when MCE occurred and replace the broken hardware.
> So, memory dump is not necessary for detecting root cause of machine check.
> Also, we can reduce down-time by skipping kdump.
>
> Of course, there are a lot of servers which don't have logging features of firmware/hardware.
> So, I proposed a option controlling kexec behaviour when hardware error occurred.
Mostly this seems reasonable. If we can get the logic simple enough it
is fool proof I am for it.
> [Patch Description]
> This patch adds a sysctl option ,kernel.kexec_on_hwerr, controlling kexec behaviour when hardware error occurred.
>
> - Permission
> - 0644
> - Value(default is "1")
> - non-zero: Kexec is enabled regardless of hardware error.
> - 0: Kexec is disabled when MCE occurred.
>
>
> Matrix of kernel.kexec_on_hwerr value ,hardware error and kexec
If we do a version that is potentially arch agnostic but x86 for now,
and we call it kexec_on_logged_hwerr. Because it is important that we
expect that the hardware will log the error.
Is there any reason we can't put logic to decided if we should write
a crashdump in the crashdump userspace?
Eric
> --------------------------------------------------
> kernel.kexec_on_hwerr| hardware error | kexec
> --------------------------------------------------
> non-zero | occurred | enabled
> -----------------------------
> | not occurred | enabled
> --------------------------------------------------
> 0 | occurred | disabled
> |----------------------------
> | not occurred | enabled
> --------------------------------------------------
>
>
> Any comments and suggestions are welcome.
>
> Signed-off-by: Seiji Aguchi <[email protected]>
>
> ---
> Documentation/sysctl/kernel.txt | 11 +++++++++++
> arch/x86/kernel/cpu/mcheck/mce.c | 2 ++
> include/linux/kernel.h | 2 ++
> include/linux/sysctl.h | 1 +
> kernel/panic.c | 15 ++++++++++++++-
> kernel/sysctl.c | 8 ++++++++
> kernel/sysctl_binary.c | 1 +
> mm/memory-failure.c | 2 ++
> 8 files changed, 41 insertions(+), 1 deletions(-)
>
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 11d5ced..3159111 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -34,6 +34,7 @@ show up in /proc/sys/kernel:
> - hotplug
> - java-appletviewer [ binfmt_java, obsolete ]
> - java-interpreter [ binfmt_java, obsolete ]
> +- kexec_on_hwerr [ x86 only ]
> - kptr_restrict
> - kstack_depth_to_print [ X86 only ]
> - l2cr [ PPC only ]
> @@ -261,6 +262,16 @@ This flag controls the L2 cache of G3 processor boards. If 0, the cache is disabled. Enabled if nonzero.
>
> ==============================================================
> +kexec_on_hwerr: (X86 only)
> +
> +Controls the behaviour of kexec when panic occurred due to hardware
> +error.
> +Default value is 1.
> +
> +0: Kexec is disabled.
> +non-zero: Kexec is enabled.
> +
> +==============================================================
>
> kptr_restrict:
>
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index d916183..e76b47b 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -944,6 +944,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
>
> percpu_inc(mce_exception_count);
>
> + hwerr_flag = 1;
> +
> if (notify_die(DIE_NMI, "machine check", regs, error_code,
> 18, SIGKILL) == NOTIFY_STOP)
> goto out;
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2fe6e84..c2fba7c 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -242,6 +242,8 @@ extern void add_taint(unsigned flag); extern int test_taint(unsigned flag); extern unsigned long get_taint(void); extern int root_mountflags;
> +extern int kexec_on_hwerr;
> +extern int hwerr_flag;
>
> extern bool early_boot_irqs_disabled;
>
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 7bb5cb6..8ae5bfe 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -153,6 +153,7 @@ enum
> KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */
> KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
> KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
> + KERN_KEXEC_ON_HWERR=77, /* int: bevaviour of kexec for hardware error
> +*/
Don't change this file. You don't need a binary number.
> };
>
>
> diff --git a/kernel/panic.c b/kernel/panic.c index 991bb87..84c1d2e 100644
> --- a/kernel/panic.c
> +++ b/kernel/panic.c
> @@ -28,6 +28,8 @@
> #define PANIC_BLINK_SPD 18
>
> int panic_on_oops;
> +int kexec_on_hwerr = 1;
> +int hwerr_flag;
> static unsigned long tainted_mask;
> static int pause_on_oops;
> static int pause_on_oops_flag;
> @@ -45,6 +47,16 @@ static long no_blink(int state)
> return 0;
> }
>
> +static int kexec_should_skip(void)
> +{
> + if (!kexec_on_hwerr && hwerr_flag) {
> + printk(KERN_WARNING "Kexec is skipped because hardware error "
> + "occurred.\n");
> + return 1;
> + }
> + return 0;
> +}
> +
> /* Returns how long it waited in ms */
> long (*panic_blink)(int state);
> EXPORT_SYMBOL(panic_blink);
> @@ -86,7 +98,8 @@ NORET_TYPE void panic(const char * fmt, ...)
> * everything else.
> * Do we want to call this before we try to display a message?
> */
> - crash_kexec(NULL);
> + if (!kexec_should_skip())
> + crash_kexec(NULL);
>
> kmsg_dump(KMSG_DUMP_PANIC);
>
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f1bd83..f78edd8 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -811,6 +811,14 @@ static struct ctl_table kern_table[] = {
> .mode = 0644,
> .proc_handler = proc_dointvec,
> },
> + {
> + .procname = "kexec_on_hwerr",
> + .data = &kexec_on_hwerr,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec,
> + },
> +
> #endif
> #if defined(CONFIG_MMU)
> {
> diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b875bed..8d572ca 100644
> --- a/kernel/sysctl_binary.c
> +++ b/kernel/sysctl_binary.c
> @@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
> { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
> { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
> { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
> + { CTL_INT, KERN_KEXEC_ON_HWERR, "kexec_on_hwerr" },
> {}
Don't change this file. No one uses the binary interface.
> };
>
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0207c2f..0178f47 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -994,6 +994,8 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
> int res;
> unsigned int nr_pages;
>
> + hwerr_flag = 1;
> +
> if (!sysctl_memory_failure_recovery)
> panic("Memory failure from trap %d on page %lx", trapno,
> pfn);
I get the feeling that we should either call a function besides panic or
do something different so that we aren't controlling this trough an
implicit parameter set in a global variable. That just seems scary racy
and hard to understand by reading the code.
Eric
于 2011年02月10日 01:07, Eric W. Biederman 写道:
>
> Is there any reason we can't put logic to decided if we should write
> a crashdump in the crashdump userspace?
>
Doesn't this already provide a choice for the user to decide if he wants
a crashdump via sysctl?
Except some minor issues pointed by you and Greg, this patch looks fine
for me.
Thanks.
(2011/02/10 1:35), Seiji Aguchi wrote:
> Hi,
>
> I submitted a quite similar patch last December.
>
> http://www.spinics.net/lists/linux-mm/msg13157.html
>
> I retry it with different description of the purpose.
>
> [Changelog]
> from v1:
> - Change name of sysctl parameter ,kexec_on_mce, to kexec_on_hwerr.
> - Move variable declaration from <asm/mce.h> to <kernel/panic.h>.
> - Remove CONFIG_X86_MCE in *.c files.
> - Modify [Purpose]/[Patch Description].
>
> [Purpose]
> There are some logging features of firmware/hardware, SEL,BMC, etc, in enterprise servers.
> We investigate the firmware/hardware logs first when MCE occurred and replace the broken hardware.
> So, memory dump is not necessary for detecting root cause of machine check.
> Also, we can reduce down-time by skipping kdump.
>
> Of course, there are a lot of servers which don't have logging features of firmware/hardware.
> So, I proposed a option controlling kexec behaviour when hardware error occurred.
>
> [Patch Description]
> This patch adds a sysctl option ,kernel.kexec_on_hwerr, controlling kexec behaviour when hardware error occurred.
>
> - Permission
> $B!!!!(B- 0644
> - Value(default is "1")
> - non-zero: Kexec is enabled regardless of hardware error.
> - 0: Kexec is disabled when MCE occurred.
>
>
> Matrix of kernel.kexec_on_hwerr value ,hardware error and kexec
>
> --------------------------------------------------
> kernel.kexec_on_hwerr| hardware error | kexec
> --------------------------------------------------
> non-zero | occurred | enabled
> -----------------------------
> | not occurred | enabled
> --------------------------------------------------
> 0 | occurred | disabled
> |----------------------------
> | not occurred | enabled
> --------------------------------------------------
>
>
> Any comments and suggestions are welcome.
>
> Signed-off-by: Seiji Aguchi <[email protected]>
>
> ---
> Documentation/sysctl/kernel.txt | 11 +++++++++++
> arch/x86/kernel/cpu/mcheck/mce.c | 2 ++
> include/linux/kernel.h | 2 ++
> include/linux/sysctl.h | 1 +
> kernel/panic.c | 15 ++++++++++++++-
> kernel/sysctl.c | 8 ++++++++
> kernel/sysctl_binary.c | 1 +
> mm/memory-failure.c | 2 ++
> 8 files changed, 41 insertions(+), 1 deletions(-)
>
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 11d5ced..3159111 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -34,6 +34,7 @@ show up in /proc/sys/kernel:
> - hotplug
> - java-appletviewer [ binfmt_java, obsolete ]
> - java-interpreter [ binfmt_java, obsolete ]
> +- kexec_on_hwerr [ x86 only ]
> - kptr_restrict
> - kstack_depth_to_print [ X86 only ]
> - l2cr [ PPC only ]
> @@ -261,6 +262,16 @@ This flag controls the L2 cache of G3 processor boards. If 0, the cache is disabled. Enabled if nonzero.
>
> ==============================================================
> +kexec_on_hwerr: (X86 only)
> +
> +Controls the behaviour of kexec when panic occurred due to hardware
> +error.
> +Default value is 1.
> +
> +0: Kexec is disabled.
> +non-zero: Kexec is enabled.
> +
> +==============================================================
>
> kptr_restrict:
>
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index d916183..e76b47b 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -944,6 +944,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
>
> percpu_inc(mce_exception_count);
>
> + hwerr_flag = 1;
> +
> if (notify_die(DIE_NMI, "machine check", regs, error_code,
> 18, SIGKILL) == NOTIFY_STOP)
> goto out;
Now x86 supports some recoverable machine check, so setting
flag here will prevent running kexec on systems that have
encountered such recoverable machine check and recovered.
I think mce_panic() is proper place to set this flag "hwerr_flag".
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2fe6e84..c2fba7c 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -242,6 +242,8 @@ extern void add_taint(unsigned flag); extern int test_taint(unsigned flag); extern unsigned long get_taint(void); extern int root_mountflags;
> +extern int kexec_on_hwerr;
> +extern int hwerr_flag;
>
> extern bool early_boot_irqs_disabled;
>
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 7bb5cb6..8ae5bfe 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -153,6 +153,7 @@ enum
> KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */
> KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
> KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
> + KERN_KEXEC_ON_HWERR=77, /* int: bevaviour of kexec for hardware error
> +*/
> };
>
>
> diff --git a/kernel/panic.c b/kernel/panic.c index 991bb87..84c1d2e 100644
> --- a/kernel/panic.c
> +++ b/kernel/panic.c
> @@ -28,6 +28,8 @@
> #define PANIC_BLINK_SPD 18
>
> int panic_on_oops;
> +int kexec_on_hwerr = 1;
> +int hwerr_flag;
> static unsigned long tainted_mask;
> static int pause_on_oops;
> static int pause_on_oops_flag;
> @@ -45,6 +47,16 @@ static long no_blink(int state)
> return 0;
> }
>
> +static int kexec_should_skip(void)
> +{
> + if (!kexec_on_hwerr && hwerr_flag) {
> + printk(KERN_WARNING "Kexec is skipped because hardware error "
> + "occurred.\n");
> + return 1;
> + }
> + return 0;
> +}
> +
> /* Returns how long it waited in ms */
> long (*panic_blink)(int state);
> EXPORT_SYMBOL(panic_blink);
> @@ -86,7 +98,8 @@ NORET_TYPE void panic(const char * fmt, ...)
> * everything else.
> * Do we want to call this before we try to display a message?
> */
> - crash_kexec(NULL);
> + if (!kexec_should_skip())
> + crash_kexec(NULL);
>
> kmsg_dump(KMSG_DUMP_PANIC);
>
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f1bd83..f78edd8 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -811,6 +811,14 @@ static struct ctl_table kern_table[] = {
> .mode = 0644,
> .proc_handler = proc_dointvec,
> },
> + {
> + .procname = "kexec_on_hwerr",
> + .data = &kexec_on_hwerr,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec,
> + },
> +
> #endif
> #if defined(CONFIG_MMU)
> {
> diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b875bed..8d572ca 100644
> --- a/kernel/sysctl_binary.c
> +++ b/kernel/sysctl_binary.c
> @@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
> { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
> { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
> { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
> + { CTL_INT, KERN_KEXEC_ON_HWERR, "kexec_on_hwerr" },
> {}
> };
>
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0207c2f..0178f47 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -994,6 +994,8 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
> int res;
> unsigned int nr_pages;
>
> + hwerr_flag = 1;
> +
> if (!sysctl_memory_failure_recovery)
> panic("Memory failure from trap %d on page %lx", trapno, pfn);
>
For similar reason, setting flag here is not good for
systems working after isolating some poisoned memory page.
Why not:
if (!sysctl_memory_failure_recovery) {
hwerr_flag = 1;
panic("Memory failure from trap %d on page %lx", trapno, pfn);
}
Thanks,
H.Seto
On Thu, Feb 10, 2011 at 05:36:58PM +0900, Hidetoshi Seto wrote:
> (2011/02/10 1:35), Seiji Aguchi wrote:
[..]
> > diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> > index d916183..e76b47b 100644
> > --- a/arch/x86/kernel/cpu/mcheck/mce.c
> > +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> > @@ -944,6 +944,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
> >
> > percpu_inc(mce_exception_count);
> >
> > + hwerr_flag = 1;
> > +
> > if (notify_die(DIE_NMI, "machine check", regs, error_code,
> > 18, SIGKILL) == NOTIFY_STOP)
> > goto out;
>
> Now x86 supports some recoverable machine check, so setting
> flag here will prevent running kexec on systems that have
> encountered such recoverable machine check and recovered.
>
> I think mce_panic() is proper place to set this flag "hwerr_flag".
I agree, in that case it is unsafe to run kexec only after the error
cannot be recovered by software.
Also, hwerr_flag is really a bad naming choice, how about
"hwerr_unrecoverable" or "hw_compromised" or "recovery_futile" or
"hw_incurable" or simply say what happened: "pcc" = processor context
corrupt (and a reliable restarting might not be possible). This could be
used by others too, besides kexec.
[..]
> > diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0207c2f..0178f47 100644
> > --- a/mm/memory-failure.c
> > +++ b/mm/memory-failure.c
> > @@ -994,6 +994,8 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
> > int res;
> > unsigned int nr_pages;
> >
> > + hwerr_flag = 1;
> > +
> > if (!sysctl_memory_failure_recovery)
> > panic("Memory failure from trap %d on page %lx", trapno, pfn);
> >
>
> For similar reason, setting flag here is not good for
> systems working after isolating some poisoned memory page.
>
> Why not:
> if (!sysctl_memory_failure_recovery) {
> hwerr_flag = 1;
> panic("Memory failure from trap %d on page %lx", trapno, pfn);
> }
Why do we need that in memory-failure.c at all? I mean, when we consume
the UC, we'll end up in mce_panic() anyway.
--
Regards/Gruss,
Boris.
(2011/02/10 18:14), Borislav Petkov wrote:
> On Thu, Feb 10, 2011 at 05:36:58PM +0900, Hidetoshi Seto wrote:
>> (2011/02/10 1:35), Seiji Aguchi wrote:
>
> [..]
>
>>> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
>>> index d916183..e76b47b 100644
>>> --- a/arch/x86/kernel/cpu/mcheck/mce.c
>>> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
>>> @@ -944,6 +944,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
>>>
>>> percpu_inc(mce_exception_count);
>>>
>>> + hwerr_flag = 1;
>>> +
>>> if (notify_die(DIE_NMI, "machine check", regs, error_code,
>>> 18, SIGKILL) == NOTIFY_STOP)
>>> goto out;
>>
>> Now x86 supports some recoverable machine check, so setting
>> flag here will prevent running kexec on systems that have
>> encountered such recoverable machine check and recovered.
>>
>> I think mce_panic() is proper place to set this flag "hwerr_flag".
>
> I agree, in that case it is unsafe to run kexec only after the error
> cannot be recovered by software.
>
> Also, hwerr_flag is really a bad naming choice, how about
> "hwerr_unrecoverable" or "hw_compromised" or "recovery_futile" or
> "hw_incurable" or simply say what happened: "pcc" = processor context
> corrupt (and a reliable restarting might not be possible). This could be
> used by others too, besides kexec.
Or how about something like hwerr_panic() to clear that the panic is
requested due to hardware error.
Anyway, Aguchi-san, please note that we should not turn off kexec before
encountering fatal hardware error and before printing/transmitting
enough hardware error log to out of this system.
>
> [..]
>
>>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0207c2f..0178f47 100644
>>> --- a/mm/memory-failure.c
>>> +++ b/mm/memory-failure.c
>>> @@ -994,6 +994,8 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
>>> int res;
>>> unsigned int nr_pages;
>>>
>>> + hwerr_flag = 1;
>>> +
>>> if (!sysctl_memory_failure_recovery)
>>> panic("Memory failure from trap %d on page %lx", trapno, pfn);
>>>
>>
>> For similar reason, setting flag here is not good for
>> systems working after isolating some poisoned memory page.
>>
>> Why not:
>> if (!sysctl_memory_failure_recovery) {
>> hwerr_flag = 1;
>> panic("Memory failure from trap %d on page %lx", trapno, pfn);
>> }
>
> Why do we need that in memory-failure.c at all? I mean, when we consume
> the UC, we'll end up in mce_panic() anyway.
One possible answer is that memory-failure.c is not x86 specific.
Thanks,
H.Seto