>
> ===================
>
> Borislav and Konrad,
>
> An approach which basically same as you suggested but w/ slightly
> update, is 1). at xen/mcelog.c, do
> misc_register(&xen_mce_chrdev_device) at xen_late_init_mcelog, define
> it as device_initcall(xen_late_init_mcelog) --> now linux dd ready,
> so xen mcelog divice would register successfully; 2). at native
> mce.c, change 1 line from device_initcall(mcheck_init_device) to
> device_initcall_sync(mcheck_init_device) --> so
> misc_register(&mce_chrdev_device) would be blocked by xen mcelog
> device;
>
> I have draft test it and works fine.
> Thought?
>
=====================
RFC patch attached:
=====================
>From d06e667632507d7ed8e18f952b0eb7cec3cfc55c Mon Sep 17 00:00:00 2001
From: Liu, Jinsong <[email protected]>
Date: Tue, 29 May 2012 06:13:19 +0800
Subject: [PATCH 1/3] xen/mce: Add mcelog support for Xen platform
When MCA error occurs, it would be handled by Xen hypervisor first,
and then the error information would be sent to initial domain for logging.
This patch gets error information from Xen hypervisor and convert
Xen format error into Linux format mcelog. This logic is basically
self-contained, not touching other kernel components.
By using tools like mcelog tool users could read specific error information,
like what they did under native Linux.
To test follow directions outlined in Documentation/acpi/apei/einj.txt
Signed-off-by: Ke, Liping <[email protected]>
Signed-off-by: Jiang, Yunhong <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Signed-off-by: Liu, Jinsong <[email protected]>
---
arch/x86/include/asm/xen/hypercall.h | 8 +
arch/x86/kernel/cpu/mcheck/mce.c | 4 +-
arch/x86/xen/enlighten.c | 5 +-
drivers/xen/Kconfig | 8 +
drivers/xen/Makefile | 1 +
drivers/xen/mcelog.c | 392 ++++++++++++++++++++++++++++++++++
include/linux/miscdevice.h | 1 +
include/xen/interface/xen-mca.h | 385 +++++++++++++++++++++++++++++++++
8 files changed, 798 insertions(+), 6 deletions(-)
create mode 100644 drivers/xen/mcelog.c
create mode 100644 include/xen/interface/xen-mca.h
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 5728852..59c226d 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -48,6 +48,7 @@
#include <xen/interface/sched.h>
#include <xen/interface/physdev.h>
#include <xen/interface/platform.h>
+#include <xen/interface/xen-mca.h>
/*
* The hypercall asms have to meet several constraints:
@@ -302,6 +303,13 @@ HYPERVISOR_set_timer_op(u64 timeout)
}
static inline int
+HYPERVISOR_mca(struct xen_mc *mc_op)
+{
+ mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
+ return _hypercall1(int, mca, mc_op);
+}
+
+static inline int
HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
{
platform_op->interface_version = XENPF_INTERFACE_VERSION;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d086a09..8a6913b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -57,8 +57,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
int mce_disabled __read_mostly;
-#define MISC_MCELOG_MINOR 227
-
#define SPINUNIT 100 /* 100ns */
atomic_t mce_entry;
@@ -2282,7 +2280,7 @@ static __init int mcheck_init_device(void)
return err;
}
-device_initcall(mcheck_init_device);
+device_initcall_sync(mcheck_init_device);
/*
* Old style boot options parsing. Only for compatibility.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 4f51beb..b2638b1 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -38,6 +38,7 @@
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/memory.h>
+#include <xen/interface/xen-mca.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvm.h>
@@ -329,9 +330,7 @@ static void __init xen_init_cpuid_mask(void)
unsigned int xsave_mask;
cpuid_leaf1_edx_mask =
- ~((1 << X86_FEATURE_MCE) | /* disable MCE */
- (1 << X86_FEATURE_MCA) | /* disable MCA */
- (1 << X86_FEATURE_MTRR) | /* disable MTRR */
+ ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
if (!xen_initial_domain())
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 9424313..0f54241 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -194,4 +194,12 @@ config XEN_ACPI_PROCESSOR
module will be called xen_acpi_processor If you do not know what to choose,
select M here. If the CPUFREQ drivers are built in, select Y here.
+config XEN_MCE_LOG
+ bool "Xen platform mcelog"
+ depends on XEN_DOM0 && X86_64 && X86_MCE
+ default n
+ help
+ Allow kernel fetching MCE error from Xen platform and
+ converting it into Linux mcelog format for mcelog tools
+
endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 9adc5be..1d3e763 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o
obj-$(CONFIG_XENFS) += xenfs/
obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
+obj-$(CONFIG_XEN_MCE_LOG) += mcelog.o
obj-$(CONFIG_XEN_PVHVM) += platform-pci.o
obj-$(CONFIG_XEN_TMEM) += tmem.o
obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c
new file mode 100644
index 0000000..72e87d2
--- /dev/null
+++ b/drivers/xen/mcelog.c
@@ -0,0 +1,392 @@
+/******************************************************************************
+ * mcelog.c
+ * Driver for receiving and transferring machine check error infomation
+ *
+ * Copyright (c) 2012 Intel Corporation
+ * Author: Liu, Jinsong <[email protected]>
+ * Author: Jiang, Yunhong <[email protected]>
+ * Author: Ke, Liping <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/uaccess.h>
+#include <linux/capability.h>
+
+#include <xen/interface/xen.h>
+#include <xen/events.h>
+#include <xen/interface/vcpu.h>
+#include <xen/xen.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#define XEN_MCELOG "xen_mcelog: "
+
+static struct mc_info g_mi;
+static struct mcinfo_logical_cpu *g_physinfo;
+static uint32_t ncpus;
+
+static DEFINE_SPINLOCK(mcelog_lock);
+
+static struct xen_mce_log xen_mcelog = {
+ .signature = XEN_MCE_LOG_SIGNATURE,
+ .len = XEN_MCE_LOG_LEN,
+ .recordlen = sizeof(struct xen_mce),
+};
+
+static DEFINE_SPINLOCK(xen_mce_chrdev_state_lock);
+static int xen_mce_chrdev_open_count; /* #times opened */
+static int xen_mce_chrdev_open_exclu; /* already open exclusive? */
+
+static int xen_mce_chrdev_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&xen_mce_chrdev_state_lock);
+
+ if (xen_mce_chrdev_open_exclu ||
+ (xen_mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&xen_mce_chrdev_state_lock);
+
+ return -EBUSY;
+ }
+
+ if (file->f_flags & O_EXCL)
+ xen_mce_chrdev_open_exclu = 1;
+ xen_mce_chrdev_open_count++;
+
+ spin_unlock(&xen_mce_chrdev_state_lock);
+
+ return nonseekable_open(inode, file);
+}
+
+static int xen_mce_chrdev_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&xen_mce_chrdev_state_lock);
+
+ xen_mce_chrdev_open_count--;
+ xen_mce_chrdev_open_exclu = 0;
+
+ spin_unlock(&xen_mce_chrdev_state_lock);
+
+ return 0;
+}
+
+static ssize_t xen_mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ char __user *buf = ubuf;
+ unsigned num;
+ int i, err;
+
+ spin_lock(&mcelog_lock);
+
+ num = xen_mcelog.next;
+
+ /* Only supports full reads right now */
+ err = -EINVAL;
+ if (*off != 0 || usize < XEN_MCE_LOG_LEN*sizeof(struct xen_mce))
+ goto out;
+
+ err = 0;
+ for (i = 0; i < num; i++) {
+ struct xen_mce *m = &xen_mcelog.entry[i];
+
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
+ }
+
+ memset(xen_mcelog.entry, 0, num * sizeof(struct xen_mce));
+ xen_mcelog.next = 0;
+
+ if (err)
+ err = -EFAULT;
+
+out:
+ spin_unlock(&mcelog_lock);
+
+ return err ? err : buf - ubuf;
+}
+
+static long xen_mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
+{
+ int __user *p = (int __user *)arg;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case MCE_GET_RECORD_LEN:
+ return put_user(sizeof(struct xen_mce), p);
+ case MCE_GET_LOG_LEN:
+ return put_user(XEN_MCE_LOG_LEN, p);
+ case MCE_GETCLEAR_FLAGS: {
+ unsigned flags;
+
+ do {
+ flags = xen_mcelog.flags;
+ } while (cmpxchg(&xen_mcelog.flags, flags, 0) != flags);
+
+ return put_user(flags, p);
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+static const struct file_operations xen_mce_chrdev_ops = {
+ .open = xen_mce_chrdev_open,
+ .release = xen_mce_chrdev_release,
+ .read = xen_mce_chrdev_read,
+ .unlocked_ioctl = xen_mce_chrdev_ioctl,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice xen_mce_chrdev_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &xen_mce_chrdev_ops,
+};
+
+/*
+ * Caller should hold the mcelog_lock
+ */
+static void xen_mce_log(struct xen_mce *mce)
+{
+ unsigned entry;
+
+ entry = xen_mcelog.next;
+
+ /*
+ * When the buffer fills up discard new entries.
+ * Assume that the earlier errors are the more
+ * interesting ones:
+ */
+ if (entry >= XEN_MCE_LOG_LEN) {
+ set_bit(XEN_MCE_OVERFLOW,
+ (unsigned long *)&xen_mcelog.flags);
+ return;
+ }
+
+ memcpy(xen_mcelog.entry + entry, mce, sizeof(struct xen_mce));
+
+ xen_mcelog.next++;
+}
+
+static int convert_log(struct mc_info *mi)
+{
+ struct mcinfo_common *mic;
+ struct mcinfo_global *mc_global;
+ struct mcinfo_bank *mc_bank;
+ struct xen_mce m;
+ uint32_t i;
+
+ mic = NULL;
+ x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL);
+ if (unlikely(!mic)) {
+ pr_warning(XEN_MCELOG "Failed to find global error info\n");
+ return -ENODEV;
+ }
+
+ memset(&m, 0, sizeof(struct xen_mce));
+
+ mc_global = (struct mcinfo_global *)mic;
+ m.mcgstatus = mc_global->mc_gstatus;
+ m.apicid = mc_global->mc_apicid;
+
+ for (i = 0; i < ncpus; i++)
+ if (g_physinfo[i].mc_apicid == m.apicid)
+ break;
+ if (unlikely(i == ncpus)) {
+ pr_warning(XEN_MCELOG "Failed to match cpu with apicid %d\n",
+ m.apicid);
+ return -ENODEV;
+ }
+
+ m.socketid = g_physinfo[i].mc_chipid;
+ m.cpu = m.extcpu = g_physinfo[i].mc_cpunr;
+ m.cpuvendor = (__u8)g_physinfo[i].mc_vendor;
+ m.mcgcap = g_physinfo[i].mc_msrvalues[__MC_MSR_MCGCAP].value;
+
+ mic = NULL;
+ x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK);
+ if (unlikely(!mic)) {
+ pr_warning(XEN_MCELOG "Fail to find bank error info\n");
+ return -ENODEV;
+ }
+
+ do {
+ if ((!mic) || (mic->size == 0) ||
+ (mic->type != MC_TYPE_GLOBAL &&
+ mic->type != MC_TYPE_BANK &&
+ mic->type != MC_TYPE_EXTENDED &&
+ mic->type != MC_TYPE_RECOVERY))
+ break;
+
+ if (mic->type == MC_TYPE_BANK) {
+ mc_bank = (struct mcinfo_bank *)mic;
+ m.misc = mc_bank->mc_misc;
+ m.status = mc_bank->mc_status;
+ m.addr = mc_bank->mc_addr;
+ m.tsc = mc_bank->mc_tsc;
+ m.bank = mc_bank->mc_bank;
+ m.finished = 1;
+ /*log this record*/
+ xen_mce_log(&m);
+ }
+ mic = x86_mcinfo_next(mic);
+ } while (1);
+
+ return 0;
+}
+
+static int mc_queue_handle(uint32_t flags)
+{
+ struct xen_mc mc_op;
+ int ret = 0;
+
+ mc_op.cmd = XEN_MC_fetch;
+ mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
+ set_xen_guest_handle(mc_op.u.mc_fetch.data, &g_mi);
+ do {
+ mc_op.u.mc_fetch.flags = flags;
+ ret = HYPERVISOR_mca(&mc_op);
+ if (ret) {
+ pr_err(XEN_MCELOG "Failed to fetch %s error log\n",
+ (flags == XEN_MC_URGENT) ?
+ "urgnet" : "nonurgent");
+ break;
+ }
+
+ if (mc_op.u.mc_fetch.flags & XEN_MC_NODATA ||
+ mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED)
+ break;
+ else {
+ ret = convert_log(&g_mi);
+ if (ret)
+ pr_warning(XEN_MCELOG
+ "Failed to convert this error log, "
+ "continue acking it anyway\n");
+
+ mc_op.u.mc_fetch.flags = flags | XEN_MC_ACK;
+ ret = HYPERVISOR_mca(&mc_op);
+ if (ret) {
+ pr_err(XEN_MCELOG
+ "Failed to ack previous error log\n");
+ break;
+ }
+ }
+ } while (1);
+
+ return ret;
+}
+
+/* virq handler for machine check error info*/
+static irqreturn_t xen_mce_interrupt(int irq, void *dev_id)
+{
+ int err;
+ unsigned long tmp;
+
+ spin_lock_irqsave(&mcelog_lock, tmp);
+
+ /* urgent mc_info */
+ err = mc_queue_handle(XEN_MC_URGENT);
+ if (err)
+ pr_err(XEN_MCELOG
+ "Failed to handle urgent mc_info queue, "
+ "continue handling nonurgent mc_info queue anyway.\n");
+
+ /* nonurgent mc_info */
+ err = mc_queue_handle(XEN_MC_NONURGENT);
+ if (err)
+ pr_err(XEN_MCELOG
+ "Failed to handle nonurgent mc_info queue.\n");
+
+ spin_unlock_irqrestore(&mcelog_lock, tmp);
+
+ return IRQ_HANDLED;
+}
+
+static int bind_virq_for_mce(void)
+{
+ int ret;
+ struct xen_mc mc_op;
+
+ memset(&mc_op, 0, sizeof(struct xen_mc));
+
+ /* Fetch physical CPU Numbers */
+ mc_op.cmd = XEN_MC_physcpuinfo;
+ mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
+ set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
+ ret = HYPERVISOR_mca(&mc_op);
+ if (ret) {
+ pr_err(XEN_MCELOG "Failed to get CPU numbers\n");
+ return ret;
+ }
+
+ /* Fetch each CPU Physical Info for later reference*/
+ ncpus = mc_op.u.mc_physcpuinfo.ncpus;
+ g_physinfo = kcalloc(ncpus, sizeof(struct mcinfo_logical_cpu),
+ GFP_KERNEL);
+ if (!g_physinfo)
+ return -ENOMEM;
+ set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
+ ret = HYPERVISOR_mca(&mc_op);
+ if (ret) {
+ pr_err(XEN_MCELOG "Failed to get CPU info\n");
+ kfree(g_physinfo);
+ return ret;
+ }
+
+ ret = bind_virq_to_irqhandler(VIRQ_MCA, 0,
+ xen_mce_interrupt, 0, "mce", NULL);
+ if (ret < 0) {
+ pr_err(XEN_MCELOG "Failed to bind virq\n");
+ kfree(g_physinfo);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __init xen_late_init_mcelog(void)
+{
+ /* Only DOM0 is responsible for MCE logging */
+ if (xen_initial_domain()) {
+ /* register character device /dev/mcelog for xen mcelog */
+ if (misc_register(&xen_mce_chrdev_device))
+ return -ENODEV;
+ return bind_virq_for_mce();
+ }
+
+ return -ENODEV;
+}
+device_initcall(xen_late_init_mcelog);
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 0549d21..e0deeb2 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -35,6 +35,7 @@
#define MPT_MINOR 220
#define MPT2SAS_MINOR 221
#define UINPUT_MINOR 223
+#define MISC_MCELOG_MINOR 227
#define HPET_MINOR 228
#define FUSE_MINOR 229
#define KVM_MINOR 232
diff --git a/include/xen/interface/xen-mca.h b/include/xen/interface/xen-mca.h
new file mode 100644
index 0000000..73a4ea7
--- /dev/null
+++ b/include/xen/interface/xen-mca.h
@@ -0,0 +1,385 @@
+/******************************************************************************
+ * arch-x86/mca.h
+ * Guest OS machine check interface to x86 Xen.
+ *
+ * Contributed by Advanced Micro Devices, Inc.
+ * Author: Christoph Egger <[email protected]>
+ *
+ * Updated by Intel Corporation
+ * Author: Liu, Jinsong <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__
+#define __XEN_PUBLIC_ARCH_X86_MCA_H__
+
+/* Hypercall */
+#define __HYPERVISOR_mca __HYPERVISOR_arch_0
+
+#define XEN_MCA_INTERFACE_VERSION 0x01ecc003
+
+/* IN: Dom0 calls hypercall to retrieve nonurgent error log entry */
+#define XEN_MC_NONURGENT 0x1
+/* IN: Dom0 calls hypercall to retrieve urgent error log entry */
+#define XEN_MC_URGENT 0x2
+/* IN: Dom0 acknowledges previosly-fetched error log entry */
+#define XEN_MC_ACK 0x4
+
+/* OUT: All is ok */
+#define XEN_MC_OK 0x0
+/* OUT: Domain could not fetch data. */
+#define XEN_MC_FETCHFAILED 0x1
+/* OUT: There was no machine check data to fetch. */
+#define XEN_MC_NODATA 0x2
+
+#ifndef __ASSEMBLY__
+/* vIRQ injected to Dom0 */
+#define VIRQ_MCA VIRQ_ARCH_0
+
+/*
+ * mc_info entry types
+ * mca machine check info are recorded in mc_info entries.
+ * when fetch mca info, it can use MC_TYPE_... to distinguish
+ * different mca info.
+ */
+#define MC_TYPE_GLOBAL 0
+#define MC_TYPE_BANK 1
+#define MC_TYPE_EXTENDED 2
+#define MC_TYPE_RECOVERY 3
+
+struct mcinfo_common {
+ uint16_t type; /* structure type */
+ uint16_t size; /* size of this struct in bytes */
+};
+
+#define MC_FLAG_CORRECTABLE (1 << 0)
+#define MC_FLAG_UNCORRECTABLE (1 << 1)
+#define MC_FLAG_RECOVERABLE (1 << 2)
+#define MC_FLAG_POLLED (1 << 3)
+#define MC_FLAG_RESET (1 << 4)
+#define MC_FLAG_CMCI (1 << 5)
+#define MC_FLAG_MCE (1 << 6)
+
+/* contains x86 global mc information */
+struct mcinfo_global {
+ struct mcinfo_common common;
+
+ uint16_t mc_domid; /* running domain at the time in error */
+ uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
+ uint32_t mc_socketid; /* physical socket of the physical core */
+ uint16_t mc_coreid; /* physical impacted core */
+ uint16_t mc_core_threadid; /* core thread of physical core */
+ uint32_t mc_apicid;
+ uint32_t mc_flags;
+ uint64_t mc_gstatus; /* global status */
+};
+
+/* contains x86 bank mc information */
+struct mcinfo_bank {
+ struct mcinfo_common common;
+
+ uint16_t mc_bank; /* bank nr */
+ uint16_t mc_domid; /* domain referenced by mc_addr if valid */
+ uint64_t mc_status; /* bank status */
+ uint64_t mc_addr; /* bank address */
+ uint64_t mc_misc;
+ uint64_t mc_ctrl2;
+ uint64_t mc_tsc;
+};
+
+struct mcinfo_msr {
+ uint64_t reg; /* MSR */
+ uint64_t value; /* MSR value */
+};
+
+/* contains mc information from other or additional mc MSRs */
+struct mcinfo_extended {
+ struct mcinfo_common common;
+ uint32_t mc_msrs; /* Number of msr with valid values. */
+ /*
+ * Currently Intel extended MSR (32/64) include all gp registers
+ * and E(R)FLAGS, E(R)IP, E(R)MISC, up to 11/19 of them might be
+ * useful at present. So expand this array to 16/32 to leave room.
+ */
+ struct mcinfo_msr mc_msr[sizeof(void *) * 4];
+};
+
+/* Recovery Action flags. Giving recovery result information to DOM0 */
+
+/* Xen takes successful recovery action, the error is recovered */
+#define REC_ACTION_RECOVERED (0x1 << 0)
+/* No action is performed by XEN */
+#define REC_ACTION_NONE (0x1 << 1)
+/* It's possible DOM0 might take action ownership in some case */
+#define REC_ACTION_NEED_RESET (0x1 << 2)
+
+/*
+ * Different Recovery Action types, if the action is performed successfully,
+ * REC_ACTION_RECOVERED flag will be returned.
+ */
+
+/* Page Offline Action */
+#define MC_ACTION_PAGE_OFFLINE (0x1 << 0)
+/* CPU offline Action */
+#define MC_ACTION_CPU_OFFLINE (0x1 << 1)
+/* L3 cache disable Action */
+#define MC_ACTION_CACHE_SHRINK (0x1 << 2)
+
+/*
+ * Below interface used between XEN/DOM0 for passing XEN's recovery action
+ * information to DOM0.
+ */
+struct page_offline_action {
+ /* Params for passing the offlined page number to DOM0 */
+ uint64_t mfn;
+ uint64_t status;
+};
+
+struct cpu_offline_action {
+ /* Params for passing the identity of the offlined CPU to DOM0 */
+ uint32_t mc_socketid;
+ uint16_t mc_coreid;
+ uint16_t mc_core_threadid;
+};
+
+#define MAX_UNION_SIZE 16
+struct mcinfo_recovery {
+ struct mcinfo_common common;
+ uint16_t mc_bank; /* bank nr */
+ uint8_t action_flags;
+ uint8_t action_types;
+ union {
+ struct page_offline_action page_retire;
+ struct cpu_offline_action cpu_offline;
+ uint8_t pad[MAX_UNION_SIZE];
+ } action_info;
+};
+
+
+#define MCINFO_MAXSIZE 768
+struct mc_info {
+ /* Number of mcinfo_* entries in mi_data */
+ uint32_t mi_nentries;
+ uint32_t flags;
+ uint64_t mi_data[(MCINFO_MAXSIZE - 1) / 8];
+};
+DEFINE_GUEST_HANDLE_STRUCT(mc_info);
+
+#define __MC_MSR_ARRAYSIZE 8
+#define __MC_MSR_MCGCAP 0
+#define __MC_NMSRS 1
+#define MC_NCAPS 7
+struct mcinfo_logical_cpu {
+ uint32_t mc_cpunr;
+ uint32_t mc_chipid;
+ uint16_t mc_coreid;
+ uint16_t mc_threadid;
+ uint32_t mc_apicid;
+ uint32_t mc_clusterid;
+ uint32_t mc_ncores;
+ uint32_t mc_ncores_active;
+ uint32_t mc_nthreads;
+ uint32_t mc_cpuid_level;
+ uint32_t mc_family;
+ uint32_t mc_vendor;
+ uint32_t mc_model;
+ uint32_t mc_step;
+ char mc_vendorid[16];
+ char mc_brandid[64];
+ uint32_t mc_cpu_caps[MC_NCAPS];
+ uint32_t mc_cache_size;
+ uint32_t mc_cache_alignment;
+ uint32_t mc_nmsrvals;
+ struct mcinfo_msr mc_msrvalues[__MC_MSR_ARRAYSIZE];
+};
+DEFINE_GUEST_HANDLE_STRUCT(mcinfo_logical_cpu);
+
+/*
+ * Prototype:
+ * uint32_t x86_mcinfo_nentries(struct mc_info *mi);
+ */
+#define x86_mcinfo_nentries(_mi) \
+ ((_mi)->mi_nentries)
+/*
+ * Prototype:
+ * struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi);
+ */
+#define x86_mcinfo_first(_mi) \
+ ((struct mcinfo_common *)(_mi)->mi_data)
+/*
+ * Prototype:
+ * struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
+ */
+#define x86_mcinfo_next(_mic) \
+ ((struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size))
+
+/*
+ * Prototype:
+ * void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type);
+ */
+static inline void x86_mcinfo_lookup(struct mcinfo_common **ret,
+ struct mc_info *mi, uint16_t type)
+{
+ uint32_t i;
+ struct mcinfo_common *mic;
+ bool found = 0;
+
+ if (!ret || !mi)
+ return;
+
+ mic = x86_mcinfo_first(mi);
+ for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
+ if (mic->type == type) {
+ found = 1;
+ break;
+ }
+ mic = x86_mcinfo_next(mic);
+ }
+
+ *ret = found ? mic : NULL;
+}
+
+/*
+ * Fetch machine check data from hypervisor.
+ */
+#define XEN_MC_fetch 1
+struct xen_mc_fetch {
+ /*
+ * IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
+ * XEN_MC_ACK if ack'king an earlier fetch
+ * OUT: XEN_MC_OK, XEN_MC_FETCHAILED, XEN_MC_NODATA
+ */
+ uint32_t flags;
+ uint32_t _pad0;
+ /* OUT: id for ack, IN: id we are ack'ing */
+ uint64_t fetch_id;
+
+ /* OUT variables. */
+ GUEST_HANDLE(mc_info) data;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_mc_fetch);
+
+
+/*
+ * This tells the hypervisor to notify a DomU about the machine check error
+ */
+#define XEN_MC_notifydomain 2
+struct xen_mc_notifydomain {
+ /* IN variables */
+ uint16_t mc_domid; /* The unprivileged domain to notify */
+ uint16_t mc_vcpuid; /* The vcpu in mc_domid to notify */
+
+ /* IN/OUT variables */
+ uint32_t flags;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_mc_notifydomain);
+
+#define XEN_MC_physcpuinfo 3
+struct xen_mc_physcpuinfo {
+ /* IN/OUT */
+ uint32_t ncpus;
+ uint32_t _pad0;
+ /* OUT */
+ GUEST_HANDLE(mcinfo_logical_cpu) info;
+};
+
+#define XEN_MC_msrinject 4
+#define MC_MSRINJ_MAXMSRS 8
+struct xen_mc_msrinject {
+ /* IN */
+ uint32_t mcinj_cpunr; /* target processor id */
+ uint32_t mcinj_flags; /* see MC_MSRINJ_F_* below */
+ uint32_t mcinj_count; /* 0 .. count-1 in array are valid */
+ uint32_t _pad0;
+ struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS];
+};
+
+/* Flags for mcinj_flags above; bits 16-31 are reserved */
+#define MC_MSRINJ_F_INTERPOSE 0x1
+
+#define XEN_MC_mceinject 5
+struct xen_mc_mceinject {
+ unsigned int mceinj_cpunr; /* target processor id */
+};
+
+struct xen_mc {
+ uint32_t cmd;
+ uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
+ union {
+ struct xen_mc_fetch mc_fetch;
+ struct xen_mc_notifydomain mc_notifydomain;
+ struct xen_mc_physcpuinfo mc_physcpuinfo;
+ struct xen_mc_msrinject mc_msrinject;
+ struct xen_mc_mceinject mc_mceinject;
+ } u;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_mc);
+
+/* Fields are zero when not available */
+struct xen_mce {
+ __u64 status;
+ __u64 misc;
+ __u64 addr;
+ __u64 mcgstatus;
+ __u64 ip;
+ __u64 tsc; /* cpu time stamp counter */
+ __u64 time; /* wall time_t when error was detected */
+ __u8 cpuvendor; /* cpu vendor as encoded in system.h */
+ __u8 inject_flags; /* software inject flags */
+ __u16 pad;
+ __u32 cpuid; /* CPUID 1 EAX */
+ __u8 cs; /* code segment */
+ __u8 bank; /* machine check bank */
+ __u8 cpu; /* cpu number; obsolete; use extcpu now */
+ __u8 finished; /* entry is valid */
+ __u32 extcpu; /* linux cpu number that detected the error */
+ __u32 socketid; /* CPU socket ID */
+ __u32 apicid; /* CPU initial apic ID */
+ __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
+};
+
+/*
+ * This structure contains all data related to the MCE log. Also
+ * carries a signature to make it easier to find from external
+ * debugging tools. Each entry is only valid when its finished flag
+ * is set.
+ */
+
+#define XEN_MCE_LOG_LEN 32
+
+struct xen_mce_log {
+ char signature[12]; /* "MACHINECHECK" */
+ unsigned len; /* = XEN_MCE_LOG_LEN */
+ unsigned next;
+ unsigned flags;
+ unsigned recordlen; /* length of struct xen_mce */
+ struct xen_mce entry[XEN_MCE_LOG_LEN];
+};
+
+#define XEN_MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
+
+#define XEN_MCE_LOG_SIGNATURE "MACHINECHECK"
+
+#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
+#define MCE_GET_LOG_LEN _IOR('M', 2, int)
+#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int)
+
+#endif /* __ASSEMBLY__ */
+#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */
--
1.7.1
On Mon, May 28, 2012 at 02:48:06PM +0000, Liu, Jinsong wrote:
> > An approach which basically same as you suggested but w/ slightly
> > update, is 1). at xen/mcelog.c, do
> > misc_register(&xen_mce_chrdev_device) at xen_late_init_mcelog, define
> > it as device_initcall(xen_late_init_mcelog) --> now linux dd ready,
> > so xen mcelog divice would register successfully; 2). at native
> > mce.c, change 1 line from device_initcall(mcheck_init_device) to
> > device_initcall_sync(mcheck_init_device) --> so
> > misc_register(&mce_chrdev_device) would be blocked by xen mcelog
> > device;
> >
> > I have draft test it and works fine.
> > Thought?
> >
>
> =====================
> RFC patch attached:
> =====================
>
>
> From d06e667632507d7ed8e18f952b0eb7cec3cfc55c Mon Sep 17 00:00:00 2001
> From: Liu, Jinsong <[email protected]>
> Date: Tue, 29 May 2012 06:13:19 +0800
> Subject: [PATCH 1/3] xen/mce: Add mcelog support for Xen platform
>
> When MCA error occurs, it would be handled by Xen hypervisor first,
> and then the error information would be sent to initial domain for logging.
>
> This patch gets error information from Xen hypervisor and convert
> Xen format error into Linux format mcelog. This logic is basically
> self-contained, not touching other kernel components.
>
> By using tools like mcelog tool users could read specific error information,
> like what they did under native Linux.
>
> To test follow directions outlined in Documentation/acpi/apei/einj.txt
>
> Signed-off-by: Ke, Liping <[email protected]>
> Signed-off-by: Jiang, Yunhong <[email protected]>
> Signed-off-by: Jeremy Fitzhardinge <[email protected]>
> Signed-off-by: Liu, Jinsong <[email protected]>
Still no go, this is current linus with your patch applied. I'll look into it
later when there's time.
[ 3.644961] initlevel:6=device, 250 registered initcalls
[ 3.652666] BUG: unable to handle kernel NULL pointer dereference at 0000000000000048
[ 3.661186] IP: [<ffffffff811ced67>] kobject_get+0x11/0x34
[ 3.667018] PGD 0
[ 3.669409] Oops: 0000 [#1] SMP
[ 3.672988] CPU 21
[ 3.675436] Modules linked in:
[ 3.678839]
[ 3.680710] Pid: 1, comm: swapper/0 Tainted: G W 3.4.0+ #1 AMD
[ 3.689103] RIP: 0010:[<ffffffff811ced67>] [<ffffffff811ced67>] kobject_get+0x11/0x34
[ 3.697665] RSP: 0000:ffff880425c67cd0 EFLAGS: 00010202
[ 3.703322] RAX: ffff880425ff40b0 RBX: 0000000000000010 RCX: ffff880425c67c50
[ 3.710801] RDX: ffff880425ff4000 RSI: ffff8808259c5380 RDI: 0000000000000010
[ 3.718302] RBP: ffff880425c67ce0 R08: 00000000fffffffe R09: 00000000ffffffff
[ 3.725780] R10: ffff8804a5c67e5f R11: 0000000000000000 R12: 0000000000000010
[ 3.733258] R13: 00000000fffffffe R14: 000000000000cbf8 R15: 0000000000011ec0
[ 3.740738] FS: 0000000000000000(0000) GS:ffff880c27cc0000(0000) knlGS:0000000000000000
[ 3.749472] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 3.755564] CR2: 0000000000000048 CR3: 0000000001a0b000 CR4: 00000000000007e0
[ 3.763044] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3.770549] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 3.778026] Process swapper/0 (pid: 1, threadinfo ffff880425c66000, task ffff880425c78000)
[ 3.786934] Stack:
[ 3.789326] ffff880425c67d20 ffff8808259c5380 ffff880425c67d40 ffffffff811cedeb
[ 3.797368] ffff880425c67d70 ffff880425c67da0 ffff8808259c5380 ffff8808259c5380
[ 3.805411] 0000000000000000 ffff8808259c5380 0000000000000010 0000000000000000
[ 3.813453] Call Trace:
[ 3.816253] [<ffffffff811cedeb>] kobject_add_internal+0x61/0x249
[ 3.822693] [<ffffffff811cf3ca>] kobject_add+0x91/0xa2
[ 3.828290] [<ffffffff811cf5a9>] kobject_create_and_add+0x37/0x68
[ 3.834821] [<ffffffff8144b91a>] threshold_create_device+0x1e5/0x342
[ 3.841633] [<ffffffff814549c5>] ? mutex_lock+0x16/0x37
[ 3.847295] [<ffffffff81031894>] ? cpu_maps_update_done+0x15/0x2d
[ 3.853824] [<ffffffff81ad0b0e>] threshold_init_device+0x1b/0x4f
[ 3.860265] [<ffffffff81ad0af3>] ? severities_debugfs_init+0x3b/0x3b
[ 3.867054] [<ffffffff810002f9>] do_one_initcall+0x7f/0x136
[ 3.873062] [<ffffffff81ac8bca>] kernel_init+0x165/0x1fd
[ 3.878807] [<ffffffff81ac8495>] ? loglevel+0x31/0x31
[ 3.884321] [<ffffffff8145e8d4>] kernel_thread_helper+0x4/0x10
[ 3.890590] [<ffffffff81456d86>] ? retint_restore_args+0xe/0xe
[ 3.896885] [<ffffffff81ac8a65>] ? start_kernel+0x2ee/0x2ee
[ 3.902893] [<ffffffff8145e8d0>] ? gs_change+0xb/0xb
[ 3.908322] Code: aa 81 31 c0 e8 ac 90 01 00 4c 89 f7 e8 c5 42 f2 ff 5b 41 5c 41 5d 41 5e c9 c3 55 48 89 e5 53 48 89 fb 48 83 ec 08 48 85 ff 74 1c <8b> 47 38 85 c0 75 11 be 29 00 00 00 48 c7 c7 16 87 79 81 e8 95
[ 3.928115] RIP [<ffffffff811ced67>] kobject_get+0x11/0x34
[ 3.934032] RSP <ffff880425c67cd0>
[ 3.937870] CR2: 0000000000000048
[ 3.941548] ---[ end trace 4eaa2a86a8e2da23 ]---
[ 3.946581] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009
[ 3.946581]
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
Borislav Petkov wrote:
> On Mon, May 28, 2012 at 02:48:06PM +0000, Liu, Jinsong wrote:
>>> An approach which basically same as you suggested but w/ slightly
>>> update, is 1). at xen/mcelog.c, do
>>> misc_register(&xen_mce_chrdev_device) at xen_late_init_mcelog,
>>> define it as device_initcall(xen_late_init_mcelog) --> now linux dd
>>> ready, so xen mcelog divice would register successfully; 2). at
>>> native mce.c, change 1 line from
>>> device_initcall(mcheck_init_device) to
>>> device_initcall_sync(mcheck_init_device) --> so
>>> misc_register(&mce_chrdev_device) would be blocked by xen mcelog
>>> device;
>>>
>>> I have draft test it and works fine.
>>> Thought?
>>>
>>
>> =====================
>> RFC patch attached:
>> =====================
>>
>>
>> From d06e667632507d7ed8e18f952b0eb7cec3cfc55c Mon Sep 17 00:00:00
>> 2001 From: Liu, Jinsong <[email protected]>
>> Date: Tue, 29 May 2012 06:13:19 +0800
>> Subject: [PATCH 1/3] xen/mce: Add mcelog support for Xen platform
>>
>> When MCA error occurs, it would be handled by Xen hypervisor first,
>> and then the error information would be sent to initial domain for
>> logging.
>>
>> This patch gets error information from Xen hypervisor and convert
>> Xen format error into Linux format mcelog. This logic is basically
>> self-contained, not touching other kernel components.
>>
>> By using tools like mcelog tool users could read specific error
>> information, like what they did under native Linux.
>>
>> To test follow directions outlined in
>> Documentation/acpi/apei/einj.txt
>>
>> Signed-off-by: Ke, Liping <[email protected]>
>> Signed-off-by: Jiang, Yunhong <[email protected]>
>> Signed-off-by: Jeremy Fitzhardinge <[email protected]>
>> Signed-off-by: Liu, Jinsong <[email protected]>
>
> Still no go, this is current linus with your patch applied. I'll look
> into it
> later when there's time.
>From calltrace seems it's related to device_initcall.
Borislav, would you please send me your .config? I can try to reproduce it and debug it.
(BTW, your kernel pull from git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git? I want to keep same baseline with you)
Attached is the .config at my environment which boot linux3.4.0-rc1+ as dom0 at Xen platform. Under this environment & config it's OK.
Thanks,
Jinsong
>
> [ 3.644961] initlevel:6=device, 250 registered initcalls
> [ 3.652666] BUG: unable to handle kernel NULL pointer dereference
> at 0000000000000048 [ 3.661186] IP: [<ffffffff811ced67>]
> kobject_get+0x11/0x34 [ 3.667018] PGD 0
> [ 3.669409] Oops: 0000 [#1] SMP
> [ 3.672988] CPU 21
> [ 3.675436] Modules linked in:
> [ 3.678839]
> [ 3.680710] Pid: 1, comm: swapper/0 Tainted: G W 3.4.0+
> #1 AMD [ 3.689103] RIP: 0010:[<ffffffff811ced67>]
> [<ffffffff811ced67>] kobject_get+0x11/0x34 [ 3.697665] RSP:
> 0000:ffff880425c67cd0 EFLAGS: 00010202 [ 3.703322] RAX:
> ffff880425ff40b0 RBX: 0000000000000010 RCX: ffff880425c67c50 [
> 3.710801] RDX: ffff880425ff4000 RSI: ffff8808259c5380 RDI:
> 0000000000000010 [ 3.718302] RBP: ffff880425c67ce0 R08:
> 00000000fffffffe R09: 00000000ffffffff [ 3.725780] R10:
> ffff8804a5c67e5f R11: 0000000000000000 R12: 0000000000000010 [
> 3.733258] R13: 00000000fffffffe R14: 000000000000cbf8 R15:
> 0000000000011ec0 [ 3.740738] FS: 0000000000000000(0000)
> GS:ffff880c27cc0000(0000) knlGS:0000000000000000 [ 3.749472] CS:
> 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 3.755564] CR2:
> 0000000000000048 CR3: 0000000001a0b000 CR4: 00000000000007e0 [
> 3.763044] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000 [ 3.770549] DR3: 0000000000000000 DR6:
> 00000000ffff0ff0 DR7: 0000000000000400 [ 3.778026] Process
> swapper/0 (pid: 1, threadinfo ffff880425c66000, task
> ffff880425c78000) [ 3.786934] Stack: [ 3.789326]
> ffff880425c67d20 ffff8808259c5380 ffff880425c67d40 ffffffff811cedeb [
> 3.797368] ffff880425c67d70 ffff880425c67da0 ffff8808259c5380
> ffff8808259c5380 [ 3.805411] 0000000000000000 ffff8808259c5380
> 0000000000000010 0000000000000000 [ 3.813453] Call Trace: [
> 3.816253] [<ffffffff811cedeb>] kobject_add_internal+0x61/0x249 [
> 3.822693] [<ffffffff811cf3ca>] kobject_add+0x91/0xa2 [ 3.828290]
> [<ffffffff811cf5a9>] kobject_create_and_add+0x37/0x68 [ 3.834821]
> [<ffffffff8144b91a>] threshold_create_device+0x1e5/0x342 [
> 3.841633] [<ffffffff814549c5>] ? mutex_lock+0x16/0x37 [ 3.847295]
> [<ffffffff81031894>] ? cpu_maps_update_done+0x15/0x2d [ 3.853824]
> [<ffffffff81ad0b0e>] threshold_init_device+0x1b/0x4f [ 3.860265]
> [<ffffffff81ad0af3>] ? severities_debugfs_init+0x3b/0x3b [
> 3.867054] [<ffffffff810002f9>] do_one_initcall+0x7f/0x136 [
> 3.873062] [<ffffffff81ac8bca>] kernel_init+0x165/0x1fd [
> 3.878807] [<ffffffff81ac8495>] ? loglevel+0x31/0x31 [ 3.884321]
> [<ffffffff8145e8d4>] kernel_thread_helper+0x4/0x10 [ 3.890590]
> [<ffffffff81456d86>] ? retint_restore_args+0xe/0xe [ 3.896885]
> [<ffffffff81ac8a65>] ? start_kernel+0x2ee/0x2ee [ 3.902893]
> [<ffffffff8145e8d0>] ? gs_change+0xb/0xb [ 3.908322] Code: aa 81
> 31 c0 e8 ac 90 01 00 4c 89 f7 e8 c5 42 f2 ff 5b 41 5c 41 5d 41 5e c9
> c3 55 48 89 e5 53 48 89 fb 48 83 ec 08 48 85 ff 74 1c <8b> 47 38 85
> c0 75 11 be 29 00 00 00 48 c7 c7 16 87 79 81 e8 95 [ 3.928115] RIP
> [<ffffffff811ced67>] kobject_get+0x11/0x34 [ 3.934032] RSP
> <ffff880425c67cd0> [ 3.937870] CR2: 0000000000000048 [
> 3.941548] ---[ end trace 4eaa2a86a8e2da23 ]--- [ 3.946581] Kernel
> panic - not syncing: Attempted to kill init! exitcode=0x00000009 [
> 3.946581]
On Tue, May 29, 2012 at 04:40:19PM +0000, Liu, Jinsong wrote:
> From calltrace seems it's related to device_initcall. Borislav, would
> you please send me your .config? I can try to reproduce it and debug
> it.
Attached. It is xen-less :).
> (BTW, your kernel pull from
> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git? I
> want to keep same baseline with you)
Actually, it is
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
but it should be the same thing (symlinked or so).
Head is v3.4-8261-ga01ee165a132.
> Attached is the .config at my environment which boot linux3.4.0-rc1+
> as dom0 at Xen platform. Under this environment & config it's OK.
Why does it say "linux3.4.0-rc1+"? This is old. Or do you mean 3.4 +
patches? See head commit id above.
And you need to test baremetal too with your patch, not only as a dom0
kernel.
Thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
Borislav Petkov wrote:
> On Tue, May 29, 2012 at 04:40:19PM +0000, Liu, Jinsong wrote:
>> From calltrace seems it's related to device_initcall. Borislav, would
>> you please send me your .config? I can try to reproduce it and debug
>> it.
>
> Attached. It is xen-less :).
>
>> (BTW, your kernel pull from
>> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git? I
>> want to keep same baseline with you)
>
> Actually, it is
>
> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
>
> but it should be the same thing (symlinked or so).
>
> Head is v3.4-8261-ga01ee165a132.
>
>> Attached is the .config at my environment which boot linux3.4.0-rc1+
>> as dom0 at Xen platform. Under this environment & config it's OK.
>
> Why does it say "linux3.4.0-rc1+"? This is old. Or do you mean 3.4 +
> patches? See head commit id above.
>
> And you need to test baremetal too with your patch, not only as a dom0
> kernel.
>
> Thanks.
I pulled latest kernel and patched my patch, but cannot reproduce the bug (it works OK under both baremetal and xen platform).
Anyway, it's not important now, I have found root cause, the patch will only affect amd mce logic, I will explain in next email.
Thanks,
Jinsong-
>
> Still no go, this is current linus with your patch applied. I'll look
> into it
> later when there's time.
The root cause is,
1). at cpu/mcheck/mce.c, device_initcall_sync(mcheck_init_device) is *after* all device_initcall();
2). at cpu/mcheck/mce_amd.c, device_initcall(threshold_init_device) will
threshold_init_device
--> threshold_create_device
--> threshold_create_bank
--> kobject_create_and_add(name, &dev->kobj);
// at this point, struct device *dev = per_cpu(mce_device, cpu), which is a NULL pointer.
// mce_device is initialized at mcheck_init_device --> mce_device_create
3). so kernel panic
So our RFC patch would affect amd mce logic.
===========================
I have a thought about symlink approach, but seems it would bring more issues, e.g.
1). it need change more native mce code, like remove /dev/mcelog which created at native mce (under xen platform), or
2). it still need to change device_initcall(mcheck_init_device) to device_initcall_sync(mcheck_init_device), if it want to implicitly block native /dev/mcelog --> but that would panic amd mce logic.
IMO currently there are 2 options:
1). use the original approach (implicitly redirect /dev/mcelog to xen_mce_chrdev_device) --> what point of this approach do you think unreasonable? It just remove a 'static' from native mce code!
2). use another /dev/xen-mcelog interface, with another misc minor '226'
Your thoughts?
Thanks,
Jinsong
>
> [ 3.644961] initlevel:6=device, 250 registered initcalls
> [ 3.652666] BUG: unable to handle kernel NULL pointer dereference
> at 0000000000000048 [ 3.661186] IP: [<ffffffff811ced67>]
> kobject_get+0x11/0x34 [ 3.667018] PGD 0
> [ 3.669409] Oops: 0000 [#1] SMP
> [ 3.672988] CPU 21
> [ 3.675436] Modules linked in:
> [ 3.678839]
> [ 3.680710] Pid: 1, comm: swapper/0 Tainted: G W 3.4.0+
> #1 AMD [ 3.689103] RIP: 0010:[<ffffffff811ced67>]
> [<ffffffff811ced67>] kobject_get+0x11/0x34 [ 3.697665] RSP:
> 0000:ffff880425c67cd0 EFLAGS: 00010202 [ 3.703322] RAX:
> ffff880425ff40b0 RBX: 0000000000000010 RCX: ffff880425c67c50 [
> 3.710801] RDX: ffff880425ff4000 RSI: ffff8808259c5380 RDI:
> 0000000000000010 [ 3.718302] RBP: ffff880425c67ce0 R08:
> 00000000fffffffe R09: 00000000ffffffff [ 3.725780] R10:
> ffff8804a5c67e5f R11: 0000000000000000 R12: 0000000000000010 [
> 3.733258] R13: 00000000fffffffe R14: 000000000000cbf8 R15:
> 0000000000011ec0 [ 3.740738] FS: 0000000000000000(0000)
> GS:ffff880c27cc0000(0000) knlGS:0000000000000000 [ 3.749472] CS:
> 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 3.755564] CR2:
> 0000000000000048 CR3: 0000000001a0b000 CR4: 00000000000007e0 [
> 3.763044] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000 [ 3.770549] DR3: 0000000000000000 DR6:
> 00000000ffff0ff0 DR7: 0000000000000400 [ 3.778026] Process
> swapper/0 (pid: 1, threadinfo ffff880425c66000, task
> ffff880425c78000) [ 3.786934] Stack: [ 3.789326]
> ffff880425c67d20 ffff8808259c5380 ffff880425c67d40 ffffffff811cedeb [
> 3.797368] ffff880425c67d70 ffff880425c67da0 ffff8808259c5380
> ffff8808259c5380 [ 3.805411] 0000000000000000 ffff8808259c5380
> 0000000000000010 0000000000000000 [ 3.813453] Call Trace: [
> 3.816253] [<ffffffff811cedeb>] kobject_add_internal+0x61/0x249 [
> 3.822693] [<ffffffff811cf3ca>] kobject_add+0x91/0xa2 [ 3.828290]
> [<ffffffff811cf5a9>] kobject_create_and_add+0x37/0x68 [ 3.834821]
> [<ffffffff8144b91a>] threshold_create_device+0x1e5/0x342 [
> 3.841633] [<ffffffff814549c5>] ? mutex_lock+0x16/0x37 [ 3.847295]
> [<ffffffff81031894>] ? cpu_maps_update_done+0x15/0x2d [ 3.853824]
> [<ffffffff81ad0b0e>] threshold_init_device+0x1b/0x4f [ 3.860265]
> [<ffffffff81ad0af3>] ? severities_debugfs_init+0x3b/0x3b [
> 3.867054] [<ffffffff810002f9>] do_one_initcall+0x7f/0x136 [
> 3.873062] [<ffffffff81ac8bca>] kernel_init+0x165/0x1fd [
> 3.878807] [<ffffffff81ac8495>] ? loglevel+0x31/0x31 [ 3.884321]
> [<ffffffff8145e8d4>] kernel_thread_helper+0x4/0x10 [ 3.890590]
> [<ffffffff81456d86>] ? retint_restore_args+0xe/0xe [ 3.896885]
> [<ffffffff81ac8a65>] ? start_kernel+0x2ee/0x2ee [ 3.902893]
> [<ffffffff8145e8d0>] ? gs_change+0xb/0xb [ 3.908322] Code: aa 81
> 31 c0 e8 ac 90 01 00 4c 89 f7 e8 c5 42 f2 ff 5b 41 5c 41 5d 41 5e c9
> c3 55 48 89 e5 53 48 89 fb 48 83 ec 08 48 85 ff 74 1c <8b> 47 38 85
> c0 75 11 be 29 00 00 00 48 c7 c7 16 87 79 81 e8 95 [ 3.928115] RIP
> [<ffffffff811ced67>] kobject_get+0x11/0x34 [ 3.934032] RSP
> <ffff880425c67cd0> [ 3.937870] CR2: 0000000000000048 [
> 3.941548] ---[ end trace 4eaa2a86a8e2da23 ]--- [ 3.946581] Kernel
> panic - not syncing: Attempted to kill init! exitcode=0x00000009 [
> 3.946581]
On Wed, May 30, 2012 at 03:09:12PM +0000, Liu, Jinsong wrote:
> >
> > Still no go, this is current linus with your patch applied. I'll look
> > into it
> > later when there's time.
>
> The root cause is,
> 1). at cpu/mcheck/mce.c, device_initcall_sync(mcheck_init_device) is *after* all device_initcall();
> 2). at cpu/mcheck/mce_amd.c, device_initcall(threshold_init_device) will
> threshold_init_device
> --> threshold_create_device
> --> threshold_create_bank
> --> kobject_create_and_add(name, &dev->kobj);
> // at this point, struct device *dev = per_cpu(mce_device, cpu), which is a NULL pointer.
> // mce_device is initialized at mcheck_init_device --> mce_device_create
> 3). so kernel panic
>
> So our RFC patch would affect amd mce logic.
>
> ===========================
>
> I have a thought about symlink approach, but seems it would bring more issues, e.g.
> 1). it need change more native mce code, like remove /dev/mcelog which created at native mce (under xen platform), or
> 2). it still need to change device_initcall(mcheck_init_device) to device_initcall_sync(mcheck_init_device), if it want to implicitly block native /dev/mcelog --> but that would panic amd mce logic.
>
> IMO currently there are 2 options:
> 1). use the original approach (implicitly redirect /dev/mcelog to xen_mce_chrdev_device) --> what point of this approach do you think unreasonable? It just remove a 'static' from native mce code!
> 2). use another /dev/xen-mcelog interface, with another misc minor '226'
The 2) is no good.
3) What about moving the corresponding other users (so threshold_init_device),
to be at late_initcall and the mce to be at late_initcall_sync
4) Or make the driver you are making start at fs_initcall ?
Konrad Rzeszutek Wilk wrote:
> On Wed, May 30, 2012 at 03:09:12PM +0000, Liu, Jinsong wrote:
>>>
>>> Still no go, this is current linus with your patch applied. I'll
>>> look into it later when there's time.
>>
>> The root cause is,
>> 1). at cpu/mcheck/mce.c, device_initcall_sync(mcheck_init_device) is
>> *after* all device_initcall(); 2). at cpu/mcheck/mce_amd.c,
>> device_initcall(threshold_init_device) will
>> threshold_init_device
>> --> threshold_create_device
>> --> threshold_create_bank
>> --> kobject_create_and_add(name, &dev->kobj);
>> // at this point, struct device *dev = per_cpu(mce_device,
>> cpu), which is a NULL pointer. // mce_device is initialized
>> at mcheck_init_device --> mce_device_create 3). so kernel panic
>>
>> So our RFC patch would affect amd mce logic.
>>
>> ===========================
>>
>> I have a thought about symlink approach, but seems it would bring
>> more issues, e.g. 1). it need change more native mce code, like
>> remove /dev/mcelog which created at native mce (under xen platform),
>> or 2). it still need to change device_initcall(mcheck_init_device)
>> to device_initcall_sync(mcheck_init_device), if it want to
>> implicitly block native /dev/mcelog --> but that would panic amd mce
>> logic.
>>
>> IMO currently there are 2 options:
>> 1). use the original approach (implicitly redirect /dev/mcelog to
>> xen_mce_chrdev_device) --> what point of this approach do you think
>> unreasonable? It just remove a 'static' from native mce code! 2).
>> use another /dev/xen-mcelog interface, with another misc minor '226'
>
> The 2) is no good.
>
> 3) What about moving the corresponding other users (so
> threshold_init_device), to be at late_initcall and the mce to be at
> late_initcall_sync
I can try, but only if Boris would not jump out.
and it can only be tested by Boris at AMD platform :(
>
> 4) Or make the driver you are making start at fs_initcall ?
It's risky, I'm not sure whether dd ready at that time point, but I can have a look at it.
Thanks,
Jinsong
On Wed, May 30, 2012 at 03:20:03PM +0000, Liu, Jinsong wrote:
> >> IMO currently there are 2 options:
> >> 1). use the original approach (implicitly redirect /dev/mcelog to
> >> xen_mce_chrdev_device) --> what point of this approach do you think
> >> unreasonable? It just remove a 'static' from native mce code! 2).
> >> use another /dev/xen-mcelog interface, with another misc minor '226'
> >
> > The 2) is no good.
> >
> > 3) What about moving the corresponding other users (so
> > threshold_init_device), to be at late_initcall and the mce to be at
> > late_initcall_sync
>
> I can try, but only if Boris would not jump out.
> and it can only be tested by Boris at AMD platform :(
Sure, I'll test it.
However, it should be a separate patch, not merged with this one.
So, I don't see anything speaking against making threshold_init_device a
late_initcall leading to the following order:
device_initcall <- xen
device_initcall_sync <- arch/x86
fs_initcall <- threshold_init_device
and the "xen" line not happening on baremetal; which shouldn't change
anything noticeably in the init-order mce-wise.
Also, add a comment at the end of mce_amd.c saying why
threshold_init_device should be last.
Thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
Borislav Petkov wrote:
> On Wed, May 30, 2012 at 03:20:03PM +0000, Liu, Jinsong wrote:
>>>> IMO currently there are 2 options:
>>>> 1). use the original approach (implicitly redirect /dev/mcelog to
>>>> xen_mce_chrdev_device) --> what point of this approach do you think
>>>> unreasonable? It just remove a 'static' from native mce code! 2).
>>>> use another /dev/xen-mcelog interface, with another misc minor
>>>> '226'
>>>
>>> The 2) is no good.
>>>
>>> 3) What about moving the corresponding other users (so
>>> threshold_init_device), to be at late_initcall and the mce to be at
>>> late_initcall_sync
>>
>> I can try, but only if Boris would not jump out.
>> and it can only be tested by Boris at AMD platform :(
>
> Sure, I'll test it.
>
> However, it should be a separate patch, not merged with this one.
>
> So, I don't see anything speaking against making
> threshold_init_device a late_initcall leading to the following order:
>
> device_initcall <- xen
> device_initcall_sync <- arch/x86
> fs_initcall <- threshold_init_device
do you mean 'late_initcall <- threshold_init_device'?
>
> and the "xen" line not happening on baremetal; which shouldn't change
> anything noticeably in the init-order mce-wise.
>
> Also, add a comment at the end of mce_amd.c saying why
> threshold_init_device should be last.
>
OK, good, we will use this approach, I will do it tomorrow morning.
Thanks Konrad and Boris's suggestion!
On Wed, May 30, 2012 at 05:21:47PM +0000, Liu, Jinsong wrote:
> do you mean 'late_initcall <- threshold_init_device'?
Yes sir, that's what I meant.
Sorry :).
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
Borislav Petkov wrote:
> On Wed, May 30, 2012 at 03:20:03PM +0000, Liu, Jinsong wrote:
>>>> IMO currently there are 2 options:
>>>> 1). use the original approach (implicitly redirect /dev/mcelog to
>>>> xen_mce_chrdev_device) --> what point of this approach do you think
>>>> unreasonable? It just remove a 'static' from native mce code! 2).
>>>> use another /dev/xen-mcelog interface, with another misc minor
>>>> '226'
>>>
>>> The 2) is no good.
>>>
>>> 3) What about moving the corresponding other users (so
>>> threshold_init_device), to be at late_initcall and the mce to be at
>>> late_initcall_sync
>>
>> I can try, but only if Boris would not jump out.
>> and it can only be tested by Boris at AMD platform :(
>
> Sure, I'll test it.
>
Boris,
I have update patches, and they have been tested under baremetal (Intel platform) and xen platform.
So you can test it at AMD platform, and thanks ahead!
Regards,
Jinsong-