2015-12-10 18:41:35

by Ashok Raj

[permalink] [raw]
Subject: [Patch V2 1/2] x86,mce: Basic support to add LMCE support to QEMU

This patch adds basic enumeration, control msr's required to support
Local Machine Check Exception Support (LMCE).

- Added Local Machine Check definitions, changed MCG_CAP
- Added support for IA32_FEATURE_CONTROL.
- When delivering MCE to guest, we deliver to just a single CPU
when guest OS has opted in to Local delivery.

Signed-off-by: Ashok Raj <[email protected]>
Tested-by: Gong Chen <[email protected]>
---
Resending with proper commit message for second patch

target-i386/cpu.c | 8 ++++++++
target-i386/cpu.h | 8 ++++++--
target-i386/kvm.c | 38 +++++++++++++++++++++++++++++++-------
3 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 11e5e39..167669a 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -2737,6 +2737,13 @@ static void mce_init(X86CPU *cpu)
}
}

+static void feature_control_init(X86CPU *cpu)
+{
+ CPUX86State *cenv = &cpu->env;
+
+ cenv->msr_ia32_feature_control = ((1<<20) | (1<<0));
+}
+
#ifndef CONFIG_USER_ONLY
static void x86_cpu_apic_create(X86CPU *cpu, Error **errp)
{
@@ -2858,6 +2865,7 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp)
#endif

mce_init(cpu);
+ feature_control_init(cpu);

#ifndef CONFIG_USER_ONLY
if (tcg_enabled()) {
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 84edfd0..a567d7a 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -282,8 +282,9 @@

#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */
#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
+#define MCG_LMCE_P (1ULL<<27) /* Local Machine Check Supported */

-#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P)
+#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P|MCG_LMCE_P)
#define MCE_BANKS_DEF 10

#define MCG_CAP_BANKS_MASK 0xff
@@ -291,6 +292,7 @@
#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
+#define MCG_STATUS_LMCE (1ULL<<3) /* Local MCE signaled */

#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
@@ -333,6 +335,7 @@
#define MSR_MCG_CAP 0x179
#define MSR_MCG_STATUS 0x17a
#define MSR_MCG_CTL 0x17b
+#define MSR_MCG_EXT_CTL 0x4d0

#define MSR_P6_EVNTSEL0 0x186

@@ -892,7 +895,6 @@ typedef struct CPUX86State {

uint64_t mcg_status;
uint64_t msr_ia32_misc_enable;
- uint64_t msr_ia32_feature_control;

uint64_t msr_fixed_ctr_ctrl;
uint64_t msr_global_ctrl;
@@ -977,8 +979,10 @@ typedef struct CPUX86State {
int64_t tsc_khz;
void *kvm_xsave_buf;

+ uint64_t msr_ia32_feature_control;
uint64_t mcg_cap;
uint64_t mcg_ctl;
+ uint64_t mcg_ext_ctl;
uint64_t mce_banks[MCE_BANKS_DEF*4];

uint64_t tsc_aux;
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 6dc9846..c61fe1f 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -72,6 +72,7 @@ static bool has_msr_tsc_aux;
static bool has_msr_tsc_adjust;
static bool has_msr_tsc_deadline;
static bool has_msr_feature_control;
+static bool has_msr_ext_mcg_ctl;
static bool has_msr_async_pf_en;
static bool has_msr_pv_eoi_en;
static bool has_msr_misc_enable;
@@ -370,18 +371,30 @@ static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
uint64_t mcg_status = MCG_STATUS_MCIP;
+ int flags = 0;
+ CPUState *cs = CPU(cpu);
+
+ /*
+ * We need to read back the value of MSR_EXT_MCG_CTL that was set by the
+ * guest kernel back into Qemu
+ */
+ cpu_synchronize_state(cs);
+
+ flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;

if (code == BUS_MCEERR_AR) {
- status |= MCI_STATUS_AR | 0x134;
- mcg_status |= MCG_STATUS_EIPV;
+ status |= MCI_STATUS_AR | 0x134;
+ mcg_status |= MCG_STATUS_EIPV;
+ if (env->mcg_ext_ctl & 0x1) {
+ mcg_status |= MCG_STATUS_LMCE;
+ flags = 0; /* No Broadcast when LMCE is opted by guest */
+ }
} else {
status |= 0xc0;
mcg_status |= MCG_STATUS_RIPV;
}
cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
- (MCM_ADDR_PHYS << 6) | 0xc,
- cpu_x86_support_mca_broadcast(env) ?
- MCE_INJECT_BROADCAST : 0);
+ (MCM_ADDR_PHYS << 6) | 0xc, flags);
}

static void hardware_memory_error(void)
@@ -808,10 +821,14 @@ int kvm_arch_init_vcpu(CPUState *cs)

c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
if (c) {
- has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
- !!(c->ecx & CPUID_EXT_SMX);
+ has_msr_feature_control = !!((c->ecx & CPUID_EXT_VMX) ||
+ !!(c->ecx & CPUID_EXT_SMX) ||
+ !!(env->mcg_cap & MCG_LMCE_P));
}

+ if (has_msr_feature_control && (env->mcg_cap & MCG_LMCE_P))
+ has_msr_ext_mcg_ctl = true;
+
c = cpuid_find_entry(&cpuid_data.cpuid, 0x80000007, 0);
if (c && (c->edx & 1<<8) && invtsc_mig_blocker == NULL) {
/* for migration */
@@ -1557,6 +1574,7 @@ static int kvm_put_msrs(X86CPU *cpu, int level)

kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
+ kvm_msr_entry_set(&msrs[n++], MSR_MCG_EXT_CTL, env->mcg_ext_ctl);
for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
}
@@ -1811,6 +1829,9 @@ static int kvm_get_msrs(X86CPU *cpu)
if (has_msr_feature_control) {
msrs[n++].index = MSR_IA32_FEATURE_CONTROL;
}
+ if (has_msr_ext_mcg_ctl) {
+ msrs[n++].index = MSR_MCG_EXT_CTL;
+ }
if (has_msr_bndcfgs) {
msrs[n++].index = MSR_IA32_BNDCFGS;
}
@@ -1981,6 +2002,9 @@ static int kvm_get_msrs(X86CPU *cpu)
case MSR_IA32_FEATURE_CONTROL:
env->msr_ia32_feature_control = msrs[i].data;
break;
+ case MSR_MCG_EXT_CTL:
+ env->mcg_ext_ctl = msrs[i].data;
+ break;
case MSR_IA32_BNDCFGS:
env->msr_bndcfgs = msrs[i].data;
break;
--
2.4.3


2015-12-10 18:41:34

by Ashok Raj

[permalink] [raw]
Subject: [Patch V2 2/2] x86, mce: Need to translate GPA to HPA to inject error in guest.

From: Gong Chen <[email protected]>

When we need to test error injection to a specific address using EINJ,
there needs to be a way to translate GPA to HPA. This will allow host EINJ
to inject error to test how guest behavior is when a bad address is consumed.
This permits guest OS to perform its own recovery.

Signed-off-by: Gong Chen <[email protected]>
---
Sorry about the spam :-(.
Resending with proper Commit Message. Previous had a bogus From. Fixed that.
before sending.

hmp-commands.hx | 14 ++++++++++++++
include/exec/memory.h | 2 ++
kvm-all.c | 24 ++++++++++++++++++++++++
memory.c | 13 +++++++++++++
monitor.c | 16 ++++++++++++++++
5 files changed, 69 insertions(+)
mode change 100644 => 100755 include/exec/memory.h
mode change 100644 => 100755 kvm-all.c
mode change 100644 => 100755 memory.c
mode change 100644 => 100755 monitor.c

diff --git a/hmp-commands.hx b/hmp-commands.hx
index bb52e4d..673c00e 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -444,6 +444,20 @@ Start gdbserver session (default @var{port}=1234)
ETEXI

{
+ .name = "x-gpa2hva",
+ .args_type = "fmt:/,addr:l",
+ .params = "/fmt addr",
+ .help = "translate guest physical 'addr' to host virtual address, only for debugging",
+ .mhandler.cmd = do_gpa2hva,
+ },
+
+STEXI
+@item x-gpa2hva @var{addr}
+@findex x-gpa2hva
+Translate guest physical @var{addr} to host virtual address, only for debugging.
+ETEXI
+
+ {
.name = "x",
.args_type = "fmt:/,addr:l",
.params = "/fmt addr",
diff --git a/include/exec/memory.h b/include/exec/memory.h
old mode 100644
new mode 100755
index 0f07159..57d7bf8
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -222,6 +222,7 @@ struct MemoryListener {
hwaddr addr, hwaddr len);
void (*coalesced_mmio_del)(MemoryListener *listener, MemoryRegionSection *section,
hwaddr addr, hwaddr len);
+ int (*translate_gpa2hva)(MemoryListener *listener, uint64_t paddr, uint64_t *vaddr);
/* Lower = earlier (during add), later (during del) */
unsigned priority;
AddressSpace *address_space_filter;
@@ -1123,6 +1124,7 @@ void memory_global_dirty_log_start(void);
void memory_global_dirty_log_stop(void);

void mtree_info(fprintf_function mon_printf, void *f);
+int memory_translate_gpa2hva(hwaddr paddr, uint64_t *vaddr);

/**
* memory_region_dispatch_read: perform a read directly to the specified
diff --git a/kvm-all.c b/kvm-all.c
old mode 100644
new mode 100755
index c648b81..cb029be
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -197,6 +197,29 @@ static KVMSlot *kvm_lookup_overlapping_slot(KVMMemoryListener *kml,
return found;
}

+
+static int kvm_translate_gpa2hva(MemoryListener *listener, uint64_t paddr, uint64_t *vaddr)
+{
+ KVMState *s = kvm_state;
+ KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
+ KVMSlot *mem = NULL;
+ int i;
+
+ for (i = 0; i < s->nr_slots; i++) {
+ mem = &kml->slots[i];
+ if (paddr >= mem->start_addr && paddr < mem->start_addr + mem->memory_size) {
+ *vaddr = (uint64_t)mem->ram + paddr - mem->start_addr;
+ break;
+ }
+ }
+
+ if (i == s->nr_slots) {
+ fprintf(stderr, "fail to find target physical addr(%ld) in KVM memory range\n", paddr);
+ return 1;
+ }
+ return 0;
+}
+
int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
hwaddr *phys_addr)
{
@@ -902,6 +925,7 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
kml->listener.log_start = kvm_log_start;
kml->listener.log_stop = kvm_log_stop;
kml->listener.log_sync = kvm_log_sync;
+ kml->listener.translate_gpa2hva = kvm_translate_gpa2hva;
kml->listener.priority = 10;

memory_listener_register(&kml->listener, as);
diff --git a/memory.c b/memory.c
old mode 100644
new mode 100755
index e193658..979dcf8
--- a/memory.c
+++ b/memory.c
@@ -2294,6 +2294,19 @@ static const TypeInfo memory_region_info = {
.instance_finalize = memory_region_finalize,
};

+int memory_translate_gpa2hva(hwaddr paddr, uint64_t *vaddr){
+ MemoryListener *ml = NULL;
+ int ret = 1;
+
+ QTAILQ_FOREACH(ml, &memory_listeners, link) {
+ if(ml->translate_gpa2hva)
+ ret = ml->translate_gpa2hva(ml, paddr, vaddr);
+ if(0 == ret)
+ break;
+ }
+ return ret;
+}
+
static void memory_register_types(void)
{
type_register_static(&memory_region_info);
diff --git a/monitor.c b/monitor.c
old mode 100644
new mode 100755
index 9a35d72..408e1fa
--- a/monitor.c
+++ b/monitor.c
@@ -76,6 +76,7 @@
#include "qapi-event.h"
#include "qmp-introspect.h"
#include "sysemu/block-backend.h"
+#include "exec/memory.h"

/* for hmp_info_irq/pic */
#if defined(TARGET_SPARC)
@@ -1681,6 +1682,21 @@ static void hmp_acl_remove(Monitor *mon, const QDict *qdict)
}
}

+static void do_gpa2hva(Monitor *mon, const QDict *qdict)
+{
+ uint64_t paddr;
+ uint64_t vaddr;
+
+ paddr = qdict_get_int(qdict, "addr");
+ if (memory_translate_gpa2hva(paddr, &vaddr)){
+ monitor_printf(mon, "fail to translate gpa(0x%lx) to hva\n", paddr);
+ return;
+ }
+
+ monitor_printf(mon, "0x%lx\n", (unsigned long)vaddr);
+ return;
+}
+
void qmp_getfd(const char *fdname, Error **errp)
{
mon_fd_t *monfd;
--
2.4.3

2015-12-12 03:22:53

by Chen, Gong

[permalink] [raw]
Subject: RE: [Patch V2 2/2] x86, mce: Need to translate GPA to HPA to inject error in guest.

Hi, Ashok

Please add " original author by Huang Ying <[email protected]>" at some place.
Thanks.

> -----Original Message-----
> From: Raj, Ashok
> Sent: Friday, December 11, 2015 3:41 AM
> To: [email protected]
> Cc: Chen, Gong; Gleb Natapov; Paolo Bonzini; [email protected];
> [email protected]; Boris Petkov; Luck, Tony; Raj, Ashok; Kleen,
> Andi
> Subject: [Patch V2 2/2] x86, mce: Need to translate GPA to HPA to inject
> error in guest.
>
> From: Gong Chen <[email protected]>
>
> When we need to test error injection to a specific address using EINJ,
> there needs to be a way to translate GPA to HPA. This will allow host EINJ
> to inject error to test how guest behavior is when a bad address is consumed.
> This permits guest OS to perform its own recovery.
>
> Signed-off-by: Gong Chen <[email protected]>
> ---
> Sorry about the spam :-(.
> Resending with proper Commit Message. Previous had a bogus From. Fixed
> that.
> before sending.
>
> hmp-commands.hx | 14 ++++++++++++++
> include/exec/memory.h | 2 ++
> kvm-all.c | 24 ++++++++++++++++++++++++
> memory.c | 13 +++++++++++++
> monitor.c | 16 ++++++++++++++++
> 5 files changed, 69 insertions(+)
> mode change 100644 => 100755 include/exec/memory.h
> mode change 100644 => 100755 kvm-all.c
> mode change 100644 => 100755 memory.c
> mode change 100644 => 100755 monitor.c
>
> diff --git a/hmp-commands.hx b/hmp-commands.hx
> index bb52e4d..673c00e 100644
> --- a/hmp-commands.hx
> +++ b/hmp-commands.hx
> @@ -444,6 +444,20 @@ Start gdbserver session (default @var{port}=1234)
> ETEXI
>
> {
> + .name = "x-gpa2hva",
> + .args_type = "fmt:/,addr:l",
> + .params = "/fmt addr",
> + .help = "translate guest physical 'addr' to host virtual address,
> only for debugging",
> + .mhandler.cmd = do_gpa2hva,
> + },
> +
> +STEXI
> +@item x-gpa2hva @var{addr}
> +@findex x-gpa2hva
> +Translate guest physical @var{addr} to host virtual address, only for
> debugging.
> +ETEXI
> +
> + {
> .name = "x",
> .args_type = "fmt:/,addr:l",
> .params = "/fmt addr",
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> old mode 100644
> new mode 100755
> index 0f07159..57d7bf8
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -222,6 +222,7 @@ struct MemoryListener {
> hwaddr addr, hwaddr len);
> void (*coalesced_mmio_del)(MemoryListener *listener,
> MemoryRegionSection *section,
> hwaddr addr, hwaddr len);
> + int (*translate_gpa2hva)(MemoryListener *listener, uint64_t paddr,
> uint64_t *vaddr);
> /* Lower = earlier (during add), later (during del) */
> unsigned priority;
> AddressSpace *address_space_filter;
> @@ -1123,6 +1124,7 @@ void memory_global_dirty_log_start(void);
> void memory_global_dirty_log_stop(void);
>
> void mtree_info(fprintf_function mon_printf, void *f);
> +int memory_translate_gpa2hva(hwaddr paddr, uint64_t *vaddr);
>
> /**
> * memory_region_dispatch_read: perform a read directly to the specified
> diff --git a/kvm-all.c b/kvm-all.c
> old mode 100644
> new mode 100755
> index c648b81..cb029be
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -197,6 +197,29 @@ static KVMSlot
> *kvm_lookup_overlapping_slot(KVMMemoryListener *kml,
> return found;
> }
>
> +
> +static int kvm_translate_gpa2hva(MemoryListener *listener, uint64_t paddr,
> uint64_t *vaddr)
> +{
> + KVMState *s = kvm_state;
> + KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
> listener);
> + KVMSlot *mem = NULL;
> + int i;
> +
> + for (i = 0; i < s->nr_slots; i++) {
> + mem = &kml->slots[i];
> + if (paddr >= mem->start_addr && paddr < mem->start_addr + mem-
> >memory_size) {
> + *vaddr = (uint64_t)mem->ram + paddr - mem->start_addr;
> + break;
> + }
> + }
> +
> + if (i == s->nr_slots) {
> + fprintf(stderr, "fail to find target physical addr(%ld) in KVM memory
> range\n", paddr);
> + return 1;
> + }
> + return 0;
> +}
> +
> int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
> hwaddr *phys_addr)
> {
> @@ -902,6 +925,7 @@ void kvm_memory_listener_register(KVMState *s,
> KVMMemoryListener *kml,
> kml->listener.log_start = kvm_log_start;
> kml->listener.log_stop = kvm_log_stop;
> kml->listener.log_sync = kvm_log_sync;
> + kml->listener.translate_gpa2hva = kvm_translate_gpa2hva;
> kml->listener.priority = 10;
>
> memory_listener_register(&kml->listener, as);
> diff --git a/memory.c b/memory.c
> old mode 100644
> new mode 100755
> index e193658..979dcf8
> --- a/memory.c
> +++ b/memory.c
> @@ -2294,6 +2294,19 @@ static const TypeInfo memory_region_info = {
> .instance_finalize = memory_region_finalize,
> };
>
> +int memory_translate_gpa2hva(hwaddr paddr, uint64_t *vaddr){
> + MemoryListener *ml = NULL;
> + int ret = 1;
> +
> + QTAILQ_FOREACH(ml, &memory_listeners, link) {
> + if(ml->translate_gpa2hva)
> + ret = ml->translate_gpa2hva(ml, paddr, vaddr);
> + if(0 == ret)
> + break;
> + }
> + return ret;
> +}
> +
> static void memory_register_types(void)
> {
> type_register_static(&memory_region_info);
> diff --git a/monitor.c b/monitor.c
> old mode 100644
> new mode 100755
> index 9a35d72..408e1fa
> --- a/monitor.c
> +++ b/monitor.c
> @@ -76,6 +76,7 @@
> #include "qapi-event.h"
> #include "qmp-introspect.h"
> #include "sysemu/block-backend.h"
> +#include "exec/memory.h"
>
> /* for hmp_info_irq/pic */
> #if defined(TARGET_SPARC)
> @@ -1681,6 +1682,21 @@ static void hmp_acl_remove(Monitor *mon,
> const QDict *qdict)
> }
> }
>
> +static void do_gpa2hva(Monitor *mon, const QDict *qdict)
> +{
> + uint64_t paddr;
> + uint64_t vaddr;
> +
> + paddr = qdict_get_int(qdict, "addr");
> + if (memory_translate_gpa2hva(paddr, &vaddr)){
> + monitor_printf(mon, "fail to translate gpa(0x%lx) to hva\n", paddr);
> + return;
> + }
> +
> + monitor_printf(mon, "0x%lx\n", (unsigned long)vaddr);
> + return;
> +}
> +
> void qmp_getfd(const char *fdname, Error **errp)
> {
> mon_fd_t *monfd;
> --
> 2.4.3

2015-12-14 16:29:31

by Eduardo Habkost

[permalink] [raw]
Subject: Re: [Qemu-devel] [Patch V2 1/2] x86, mce: Basic support to add LMCE support to QEMU

Hi,

Comments below:

On Thu, Dec 10, 2015 at 02:41:21PM -0500, Ashok Raj wrote:
> This patch adds basic enumeration, control msr's required to support
> Local Machine Check Exception Support (LMCE).
>
> - Added Local Machine Check definitions, changed MCG_CAP
> - Added support for IA32_FEATURE_CONTROL.
> - When delivering MCE to guest, we deliver to just a single CPU
> when guest OS has opted in to Local delivery.
>
> Signed-off-by: Ashok Raj <[email protected]>
> Tested-by: Gong Chen <[email protected]>
> ---
> Resending with proper commit message for second patch
>
> target-i386/cpu.c | 8 ++++++++
> target-i386/cpu.h | 8 ++++++--
> target-i386/kvm.c | 38 +++++++++++++++++++++++++++++++-------
> 3 files changed, 45 insertions(+), 9 deletions(-)
>
> diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> index 11e5e39..167669a 100644
> --- a/target-i386/cpu.c
> +++ b/target-i386/cpu.c
> @@ -2737,6 +2737,13 @@ static void mce_init(X86CPU *cpu)
> }
> }
>
> +static void feature_control_init(X86CPU *cpu)
> +{
> + CPUX86State *cenv = &cpu->env;
> +
> + cenv->msr_ia32_feature_control = ((1<<20) | (1<<0));

FEATURE_CONTROL_LOCKED and FEATURE_CONTROL_LMCE macros would make
the code clearer.

> +}
> +
> #ifndef CONFIG_USER_ONLY
> static void x86_cpu_apic_create(X86CPU *cpu, Error **errp)
> {
> @@ -2858,6 +2865,7 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp)
> #endif
>
> mce_init(cpu);
> + feature_control_init(cpu);
>
> #ifndef CONFIG_USER_ONLY
> if (tcg_enabled()) {
> diff --git a/target-i386/cpu.h b/target-i386/cpu.h
> index 84edfd0..a567d7a 100644
> --- a/target-i386/cpu.h
> +++ b/target-i386/cpu.h
> @@ -282,8 +282,9 @@
>
> #define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */
> #define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
> +#define MCG_LMCE_P (1ULL<<27) /* Local Machine Check Supported */

Please use spaces instead of tabs. You can detect this and other
coding style issues in this patch with checkpatch.pl.


>
> -#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P)
> +#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P|MCG_LMCE_P)

This makes mcg_cap change when upgrading QEMU.

VMs with MCG_LMCE_P enabled shouldn't be migratable to hosts
running older kernels, or the guest may try to read or write
MSR_IA32_MCG_EXT_CTL after miration and get a #GP. That means:

1) Older machine-types (pc-2.5 and older) should keep the
old (MCG_CTL_P|MCG_SER_P) default.
2) We can't make pc-2.6 enable LMCE by default, either, because
QEMU guarantees that just changing the machine-type shouldn't
introduce new host requirements (see:
http://article.gmane.org/gmane.comp.emulators.qemu/346651)

It looks like we need a new -cpu option to enable the feature,
then. At least until we raise the minimum kernel version
requirements of QEMU.

(I didn't review the kvm_mce_inject() changes as I am not
familiar with the details of how LMCE is implemented.)


> #define MCE_BANKS_DEF 10
>
> #define MCG_CAP_BANKS_MASK 0xff
> @@ -291,6 +292,7 @@
> #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
> #define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
> #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
> +#define MCG_STATUS_LMCE (1ULL<<3) /* Local MCE signaled */
>
> #define MCI_STATUS_VAL (1ULL<<63) /* valid error */
> #define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
> @@ -333,6 +335,7 @@
> #define MSR_MCG_CAP 0x179
> #define MSR_MCG_STATUS 0x17a
> #define MSR_MCG_CTL 0x17b
> +#define MSR_MCG_EXT_CTL 0x4d0
>
> #define MSR_P6_EVNTSEL0 0x186
>
> @@ -892,7 +895,6 @@ typedef struct CPUX86State {
>
> uint64_t mcg_status;
> uint64_t msr_ia32_misc_enable;
> - uint64_t msr_ia32_feature_control;
>
> uint64_t msr_fixed_ctr_ctrl;
> uint64_t msr_global_ctrl;
> @@ -977,8 +979,10 @@ typedef struct CPUX86State {
> int64_t tsc_khz;
> void *kvm_xsave_buf;
>
> + uint64_t msr_ia32_feature_control;
> uint64_t mcg_cap;
> uint64_t mcg_ctl;
> + uint64_t mcg_ext_ctl;
> uint64_t mce_banks[MCE_BANKS_DEF*4];
>
> uint64_t tsc_aux;
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 6dc9846..c61fe1f 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -72,6 +72,7 @@ static bool has_msr_tsc_aux;
> static bool has_msr_tsc_adjust;
> static bool has_msr_tsc_deadline;
> static bool has_msr_feature_control;
> +static bool has_msr_ext_mcg_ctl;
> static bool has_msr_async_pf_en;
> static bool has_msr_pv_eoi_en;
> static bool has_msr_misc_enable;
> @@ -370,18 +371,30 @@ static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
> uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
> MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
> uint64_t mcg_status = MCG_STATUS_MCIP;
> + int flags = 0;
> + CPUState *cs = CPU(cpu);
> +
> + /*
> + * We need to read back the value of MSR_EXT_MCG_CTL that was set by the
> + * guest kernel back into Qemu
> + */
> + cpu_synchronize_state(cs);
> +
> + flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;
>
> if (code == BUS_MCEERR_AR) {
> - status |= MCI_STATUS_AR | 0x134;
> - mcg_status |= MCG_STATUS_EIPV;
> + status |= MCI_STATUS_AR | 0x134;
> + mcg_status |= MCG_STATUS_EIPV;
> + if (env->mcg_ext_ctl & 0x1) {
> + mcg_status |= MCG_STATUS_LMCE;
> + flags = 0; /* No Broadcast when LMCE is opted by guest */
> + }
> } else {
> status |= 0xc0;
> mcg_status |= MCG_STATUS_RIPV;
> }
> cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
> - (MCM_ADDR_PHYS << 6) | 0xc,
> - cpu_x86_support_mca_broadcast(env) ?
> - MCE_INJECT_BROADCAST : 0);
> + (MCM_ADDR_PHYS << 6) | 0xc, flags);
> }
>
> static void hardware_memory_error(void)
> @@ -808,10 +821,14 @@ int kvm_arch_init_vcpu(CPUState *cs)
>
> c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
> if (c) {
> - has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
> - !!(c->ecx & CPUID_EXT_SMX);
> + has_msr_feature_control = !!((c->ecx & CPUID_EXT_VMX) ||
> + !!(c->ecx & CPUID_EXT_SMX) ||
> + !!(env->mcg_cap & MCG_LMCE_P));
> }
>
> + if (has_msr_feature_control && (env->mcg_cap & MCG_LMCE_P))
> + has_msr_ext_mcg_ctl = true;
> +
> c = cpuid_find_entry(&cpuid_data.cpuid, 0x80000007, 0);
> if (c && (c->edx & 1<<8) && invtsc_mig_blocker == NULL) {
> /* for migration */
> @@ -1557,6 +1574,7 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
>
> kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
> kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
> + kvm_msr_entry_set(&msrs[n++], MSR_MCG_EXT_CTL, env->mcg_ext_ctl);
> for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
> kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
> }
> @@ -1811,6 +1829,9 @@ static int kvm_get_msrs(X86CPU *cpu)
> if (has_msr_feature_control) {
> msrs[n++].index = MSR_IA32_FEATURE_CONTROL;
> }
> + if (has_msr_ext_mcg_ctl) {
> + msrs[n++].index = MSR_MCG_EXT_CTL;
> + }
> if (has_msr_bndcfgs) {
> msrs[n++].index = MSR_IA32_BNDCFGS;
> }
> @@ -1981,6 +2002,9 @@ static int kvm_get_msrs(X86CPU *cpu)
> case MSR_IA32_FEATURE_CONTROL:
> env->msr_ia32_feature_control = msrs[i].data;
> break;
> + case MSR_MCG_EXT_CTL:
> + env->mcg_ext_ctl = msrs[i].data;
> + break;
> case MSR_IA32_BNDCFGS:
> env->msr_bndcfgs = msrs[i].data;
> break;
> --
> 2.4.3
>
>

--
Eduardo

2015-12-14 16:37:46

by Borislav Petkov

[permalink] [raw]
Subject: Re: [Qemu-devel] [Patch V2 1/2] x86, mce: Basic support to add LMCE support to QEMU

On Mon, Dec 14, 2015 at 02:23:56PM -0200, Eduardo Habkost wrote:
> > -#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P)
> > +#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P|MCG_LMCE_P)
>
> This makes mcg_cap change when upgrading QEMU.
>
> VMs with MCG_LMCE_P enabled shouldn't be migratable to hosts
> running older kernels, or the guest may try to read or write
> MSR_IA32_MCG_EXT_CTL after miration and get a #GP. That means:
>
> 1) Older machine-types (pc-2.5 and older) should keep the
> old (MCG_CTL_P|MCG_SER_P) default.
> 2) We can't make pc-2.6 enable LMCE by default, either, because
> QEMU guarantees that just changing the machine-type shouldn't
> introduce new host requirements (see:
> http://article.gmane.org/gmane.comp.emulators.qemu/346651)
>
> It looks like we need a new -cpu option to enable the feature,
> then. At least until we raise the minimum kernel version
> requirements of QEMU.

... and obviously LMCE is vendor-specific so it cannot be enabled on
!Intel guests with a define like that. mce_init() in qemu should check
vendor too.

The same mistake was done with SER_P but that's much harder to change,
as we discussed previously.

--
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
--

2015-12-14 18:32:21

by Eduardo Habkost

[permalink] [raw]
Subject: Re: [Qemu-devel] [Patch V2 1/2] x86, mce: Basic support to add LMCE support to QEMU

On Mon, Dec 14, 2015 at 02:11:46PM -0500, Raj, Ashok wrote:
> On Mon, Dec 14, 2015 at 05:37:38PM +0100, Borislav Petkov wrote:
> >
> > ... and obviously LMCE is vendor-specific so it cannot be enabled on
> > !Intel guests with a define like that. mce_init() in qemu should check
> > vendor too.
> >
> > The same mistake was done with SER_P but that's much harder to change,
> > as we discussed previously.
> >
>
> This is mostly harmless.. since the MCG_CAP space is shared and has no
> conflict between vendors. Also just the CAP being set has no effect.
>
> The Guest OS needs to opt-in and the SIGBUS indicating SRAR are the only
> ones that are treated special treatment. Also Intel was the only one
> broadcasting MCE's.. so we are like the rest now :-)

If the feature won't be enabled by default, I believe it will be
OK to not make it conditional on CPUID vendor (as we would be
simply doing that the user asked for).

But if it's going to be enabled by default, I would like to get
some assurance that there won't be conflicts between vendors in
the MCG_CAP bits.

--
Eduardo

2015-12-14 18:05:45

by Ashok Raj

[permalink] [raw]
Subject: Re: [Qemu-devel] [Patch V2 1/2] x86, mce: Basic support to add LMCE support to QEMU

Hi Eduardo,

Thanks for the feedback. All the suggestions make sense..

On Mon, Dec 14, 2015 at 02:23:56PM -0200, Eduardo Habkost wrote:
> > +static void feature_control_init(X86CPU *cpu)
> > +{
> > + CPUX86State *cenv = &cpu->env;
> > +
> > + cenv->msr_ia32_feature_control = ((1<<20) | (1<<0));
>
> FEATURE_CONTROL_LOCKED and FEATURE_CONTROL_LMCE macros would make
> the code clearer.

Makes sense.. will fix it next round.

> > @@ -282,8 +282,9 @@
> >
> > #define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */
> > #define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
> > +#define MCG_LMCE_P (1ULL<<27) /* Local Machine Check Supported */
>
> Please use spaces instead of tabs. You can detect this and other
> coding style issues in this patch with checkpatch.pl.
>

Will change the ones i added.

>
> >
> > -#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P)
> > +#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P|MCG_LMCE_P)
>
> This makes mcg_cap change when upgrading QEMU.
>
> VMs with MCG_LMCE_P enabled shouldn't be migratable to hosts
> running older kernels, or the guest may try to read or write
> MSR_IA32_MCG_EXT_CTL after miration and get a #GP. That means:
>
> 1) Older machine-types (pc-2.5 and older) should keep the
> old (MCG_CTL_P|MCG_SER_P) default.
> 2) We can't make pc-2.6 enable LMCE by default, either, because
> QEMU guarantees that just changing the machine-type shouldn't
> introduce new host requirements (see:
> http://article.gmane.org/gmane.comp.emulators.qemu/346651)
>
> It looks like we need a new -cpu option to enable the feature,
> then. At least until we raise the minimum kernel version
> requirements of QEMU.

Ok.. i will look this up.
>
> (I didn't review the kvm_mce_inject() changes as I am not
> familiar with the details of how LMCE is implemented.)
>
This is quite simple.. we just don't broadcast MCE's and set LMCE in
MCG_STATUS.

I will make the suggested changes and resend.

Cheers,
Ashok

2015-12-14 18:11:57

by Ashok Raj

[permalink] [raw]
Subject: Re: [Qemu-devel] [Patch V2 1/2] x86, mce: Basic support to add LMCE support to QEMU

On Mon, Dec 14, 2015 at 05:37:38PM +0100, Borislav Petkov wrote:
>
> ... and obviously LMCE is vendor-specific so it cannot be enabled on
> !Intel guests with a define like that. mce_init() in qemu should check
> vendor too.
>
> The same mistake was done with SER_P but that's much harder to change,
> as we discussed previously.
>

This is mostly harmless.. since the MCG_CAP space is shared and has no
conflict between vendors. Also just the CAP being set has no effect.

The Guest OS needs to opt-in and the SIGBUS indicating SRAR are the only
ones that are treated special treatment. Also Intel was the only one
broadcasting MCE's.. so we are like the rest now :-)

Cheers,
Ashok

2015-12-14 22:37:30

by Borislav Petkov

[permalink] [raw]
Subject: Re: [Qemu-devel] [Patch V2 1/2] x86, mce: Basic support to add LMCE support to QEMU

On Mon, Dec 14, 2015 at 02:11:46PM -0500, Raj, Ashok wrote:
> This is mostly harmless.. since the MCG_CAP space is shared and has no
> conflict between vendors. Also just the CAP being set has no effect.

Of course it does - we check SER_P in machine_check_poll() and when
I emulate an AMD guest and inject errors into it, error handling is
obviously wrong, see:

https://lkml.kernel.org/r/[email protected]

--
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
--

2015-12-14 23:17:39

by Ashok Raj

[permalink] [raw]
Subject: Re: [Qemu-devel] [Patch V2 1/2] x86, mce: Basic support to add LMCE support to QEMU

On Mon, Dec 14, 2015 at 11:37:16PM +0100, Borislav Petkov wrote:
> On Mon, Dec 14, 2015 at 02:11:46PM -0500, Raj, Ashok wrote:
> > This is mostly harmless.. since the MCG_CAP space is shared and has no
> > conflict between vendors. Also just the CAP being set has no effect.
>
> Of course it does - we check SER_P in machine_check_poll() and when
> I emulate an AMD guest and inject errors into it, error handling is
> obviously wrong, see:
>
> https://lkml.kernel.org/r/[email protected]
>

I can see how this hurts.. since the poller isn't doing cpu model specific
stuff..?

in the LMCE case, even if you advertise MCG_LMCE_P in MCG_CAP, the guest kernel
wont call intel_init_lmce() only from mce_intel.c.. so the same problem
won't happen.

but the issue Eduardo mentioned seems like the following.

New QEMU_LMCE + New KVM_LMCE + New_GUEST_LMCE - No problem

but if you were to migrage the Guest_LMCE to a non-LMCE supported KVM host
we could run into an issue..

is this the compatibility issue that you were looking to fix Eduardo?

Cheers,
Ashok

2015-12-15 13:04:58

by Borislav Petkov

[permalink] [raw]
Subject: Re: [Qemu-devel] [Patch V2 1/2] x86, mce: Basic support to add LMCE support to QEMU

On Mon, Dec 14, 2015 at 07:17:27PM -0500, Raj, Ashok wrote:
> I can see how this hurts.. since the poller isn't doing cpu model
> specific stuff..?

The poller sees mca_cfg.ser set on an AMD guest and then the whole
handling/decoding goes wrong.

> in the LMCE case, even if you advertise MCG_LMCE_P in MCG_CAP, the
> guest kernel wont call intel_init_lmce() only from mce_intel.c.. so
> the same problem won't happen.

You shouldn't advertise MCG_LMCE_P if the guest is not Intel. Those MCG
bits should be in the CPU model descriptor X86CPUDefinition.

--
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
--

2015-12-15 15:43:04

by Eduardo Habkost

[permalink] [raw]
Subject: Re: [Qemu-devel] [Patch V2 1/2] x86, mce: Basic support to add LMCE support to QEMU

On Mon, Dec 14, 2015 at 07:17:27PM -0500, Raj, Ashok wrote:
> On Mon, Dec 14, 2015 at 11:37:16PM +0100, Borislav Petkov wrote:
> > On Mon, Dec 14, 2015 at 02:11:46PM -0500, Raj, Ashok wrote:
> > > This is mostly harmless.. since the MCG_CAP space is shared and has no
> > > conflict between vendors. Also just the CAP being set has no effect.
> >
> > Of course it does - we check SER_P in machine_check_poll() and when
> > I emulate an AMD guest and inject errors into it, error handling is
> > obviously wrong, see:
> >
> > https://lkml.kernel.org/r/[email protected]
> >
>
> I can see how this hurts.. since the poller isn't doing cpu model specific
> stuff..?
>
> in the LMCE case, even if you advertise MCG_LMCE_P in MCG_CAP, the guest kernel
> wont call intel_init_lmce() only from mce_intel.c.. so the same problem
> won't happen.
>
> but the issue Eduardo mentioned seems like the following.
>
> New QEMU_LMCE + New KVM_LMCE + New_GUEST_LMCE - No problem
>
> but if you were to migrage the Guest_LMCE to a non-LMCE supported KVM host
> we could run into an issue..
>
> is this the compatibility issue that you were looking to fix Eduardo?

If I understood you correctly, yes. Also, note that currently
kvm_arch_init_vcpu() simply warns about missing capabilities,
instead of preventing the VM from running/migrating (as it
should). We need to change that, and figure out a good way to
report "feature FOO can't be enabled in this host" errors to
management software[1]. The main problem is that we don't even
have a QMP console available anymore if machine initialization is
aborted.

CCing libvir-list so they get in the loop.

[1] This is similar to what we need for CPUID checks, but the new
MCE feature means we need something more generic (that just
reports QOM property names, probably?)

--
Eduardo