2009-04-17 07:29:24

by Huang, Ying

[permalink] [raw]
Subject: [PATCH -v2] Add MCE support to KVM

The related MSRs are emulated. MCE capability is exported via
extension KVM_CAP_MCE and ioctl KVM_X86_GET_MCE_CAP_SUPPORTED. A new
vcpu ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation
such as the mcg_cap. MCE is injected via vcpu ioctl command
KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
not simulated.


ChangeLog:

v2:

- Add MCE capability exportation support.
- Allocate MCE banks registers simulation backing memory during VCPU
initialization.


Signed-off-by: Huang Ying <[email protected]>
Acked-by: Andi Kleen <[email protected]>

---
arch/x86/include/asm/kvm_host.h | 5
arch/x86/kvm/x86.c | 220 +++++++++++++++++++++++++++++++++++-----
include/linux/kvm.h | 17 +++
3 files changed, 218 insertions(+), 24 deletions(-)

--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,6 +42,7 @@
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
+#include <asm/mce.h>

#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -55,6 +56,10 @@
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))

#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+#define KVM_MAX_MCE_BANKS 32
+#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
* - enable LME and LMA per default on 64 bit KVM
@@ -740,23 +745,43 @@ static int set_msr_mtrr(struct kvm_vcpu
return 0;
}

-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+
switch (msr) {
- case MSR_EFER:
- set_efer(vcpu, data);
- break;
- case MSR_IA32_MC0_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
- __func__, data);
- break;
case MSR_IA32_MCG_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
- __func__, data);
+ vcpu->arch.mcg_status = data;
break;
case MSR_IA32_MCG_CTL:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
- __func__, data);
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ if (data != 0 && data != ~(u64)0)
+ return -1;
+ vcpu->arch.mcg_ctl = data;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ u32 offset = msr - MSR_IA32_MC0_CTL;
+ /* only 0 or all 1s can be written to IA32_MCi_CTL */
+ if ((offset & 0x3) == 0 &&
+ data != 0 && data != ~(u64)0)
+ return -1;
+ vcpu->arch.mce_banks[offset] = data;
+ break;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ switch (msr) {
+ case MSR_EFER:
+ set_efer(vcpu, data);
break;
case MSR_IA32_DEBUGCTLMSR:
if (!data) {
@@ -812,6 +837,10 @@ int kvm_set_msr_common(struct kvm_vcpu *
kvm_request_guest_time_update(vcpu);
break;
}
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return set_msr_mce(vcpu, msr, data);
default:
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
return 1;
@@ -867,26 +896,49 @@ static int get_msr_mtrr(struct kvm_vcpu
return 0;
}

-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
u64 data;
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;

switch (msr) {
- case 0xc0010010: /* SYSCFG */
- case 0xc0010015: /* HWCR */
- case MSR_IA32_PLATFORM_ID:
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
- case MSR_IA32_MC0_CTL:
- case MSR_IA32_MCG_STATUS:
+ data = 0;
+ break;
case MSR_IA32_MCG_CAP:
+ data = vcpu->arch.mcg_cap;
+ break;
case MSR_IA32_MCG_CTL:
- case MSR_IA32_MC0_MISC:
- case MSR_IA32_MC0_MISC+4:
- case MSR_IA32_MC0_MISC+8:
- case MSR_IA32_MC0_MISC+12:
- case MSR_IA32_MC0_MISC+16:
- case MSR_IA32_MC0_MISC+20:
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ data = vcpu->arch.mcg_ctl;
+ break;
+ case MSR_IA32_MCG_STATUS:
+ data = vcpu->arch.mcg_status;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ u32 offset = msr - MSR_IA32_MC0_CTL;
+ data = vcpu->arch.mce_banks[offset];
+ break;
+ }
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data;
+
+ switch (msr) {
+ case 0xc0010010: /* SYSCFG */
+ case 0xc0010015: /* HWCR */
+ case MSR_IA32_PLATFORM_ID:
case MSR_IA32_UCODE_REV:
case MSR_IA32_EBL_CR_POWERON:
case MSR_IA32_DEBUGCTLMSR:
@@ -928,6 +980,13 @@ int kvm_get_msr_common(struct kvm_vcpu *
case MSR_KVM_SYSTEM_TIME:
data = vcpu->arch.time;
break;
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return get_msr_mce(vcpu, msr, pdata);
default:
pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -1049,6 +1108,9 @@ int kvm_dev_ioctl_check_extension(long e
case KVM_CAP_IOMMU:
r = iommu_found();
break;
+ case KVM_CAP_MCE:
+ r = KVM_MAX_MCE_BANKS;
+ break;
default:
r = 0;
break;
@@ -1109,6 +1171,16 @@ long kvm_arch_dev_ioctl(struct file *fil
r = 0;
break;
}
+ case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+ u64 mce_cap;
+
+ mce_cap = KVM_MCE_CAP_SUPPORTED;
+ r = -EFAULT;
+ if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+ goto out;
+ r = 0;
+ break;
+ }
default:
r = -EINVAL;
}
@@ -1452,6 +1524,80 @@ static int vcpu_ioctl_tpr_access_reporti
return 0;
}

+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+ u64 mcg_cap)
+{
+ int r;
+ unsigned bank_num = mcg_cap & 0xff, bank;
+
+ r = -EINVAL;
+ if (!bank_num)
+ goto out;
+ if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+ goto out;
+ r = 0;
+ vcpu->arch.mcg_cap = mcg_cap;
+ /* Init IA32_MCG_CTL to all 1s */
+ if (mcg_cap & MCG_CTL_P)
+ vcpu->arch.mcg_ctl = ~(u64)0;
+ /* Init IA32_MCi_CTL to all 1s */
+ for (bank = 0; bank < bank_num; bank++)
+ vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+ struct kvm_x86_mce *mce)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u64 *banks = vcpu->arch.mce_banks;
+
+ if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+ return -EINVAL;
+ /*
+ * if IA32_MCG_CTL is not all 1s, the uncorrected error
+ * reporting is disabled
+ */
+ if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+ vcpu->arch.mcg_ctl != ~(u64)0)
+ return 0;
+ banks += 4 * mce->bank;
+ /*
+ * if IA32_MCi_CTL is not all 1s, the uncorrected error
+ * reporting is disabled for the bank
+ */
+ if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+ return 0;
+ if (mce->status & MCI_STATUS_UC) {
+ if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+ !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+ printk(KERN_DEBUG "kvm: set_mce: "
+ "injects mce exception while "
+ "previous one is in progress!\n");
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ return 0;
+ }
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[1] = mce->status;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
+ kvm_queue_exception(vcpu, MC_VECTOR);
+ } else if (!(banks[1] & MCI_STATUS_VAL)
+ || !(banks[1] & MCI_STATUS_UC)) {
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[1] = mce->status;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ } else
+ banks[1] |= MCI_STATUS_OVER;
+ return 0;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -1585,6 +1731,24 @@ long kvm_arch_vcpu_ioctl(struct file *fi
kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
break;
}
+ case KVM_X86_SETUP_MCE: {
+ u64 mcg_cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+ break;
+ }
+ case KVM_X86_SET_MCE: {
+ struct kvm_x86_mce mce;
+
+ r = -EFAULT;
+ if (copy_from_user(&mce, argp, sizeof mce))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+ break;
+ }
default:
r = -EINVAL;
}
@@ -4330,6 +4494,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *
goto fail_mmu_destroy;
}

+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ GFP_KERNEL);
+ if (!vcpu->arch.mce_banks) {
+ r = -ENOMEM;
+ goto fail_mmu_destroy;
+ }
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
return 0;

fail_mmu_destroy:
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -374,6 +374,11 @@ struct kvm_vcpu_arch {
unsigned long dr6;
unsigned long dr7;
unsigned long eff_db[KVM_NR_DB_REGS];
+
+ u64 mcg_cap;
+ u64 mcg_status;
+ u64 mcg_ctl;
+ u64 *mce_banks;
};

struct kvm_mem_alias {
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -292,6 +292,18 @@ struct kvm_guest_debug {
struct kvm_guest_debug_arch arch;
};

+/* x86 MCE */
+struct kvm_x86_mce {
+ __u64 status;
+ __u64 addr;
+ __u64 misc;
+ __u64 mcg_status;
+ __u8 bank;
+ __u8 pad1;
+ __u16 pad2;
+ __u32 pad3;
+};
+
#define KVM_TRC_SHIFT 16
/*
* kvm trace categories
@@ -451,6 +463,7 @@ struct kvm_irq_routing {
};

#endif
+#define KVM_CAP_MCE 28

/*
* ioctls for VM fds
@@ -539,6 +552,10 @@ struct kvm_irq_routing {
#define KVM_NMI _IO(KVMIO, 0x9a)
/* Available with KVM_CAP_SET_GUEST_DEBUG */
#define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug)
+/* MCE for x86 */
+#define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64)
+#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64)
+#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce)

/*
* Deprecated interfaces


Attachments:
signature.asc (197.00 B)
This is a digitally signed message part

2009-04-18 15:54:53

by Anthony Liguori

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Huang Ying wrote:
> The related MSRs are emulated. MCE capability is exported via
> extension KVM_CAP_MCE and ioctl KVM_X86_GET_MCE_CAP_SUPPORTED. A new
> vcpu ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation
> such as the mcg_cap. MCE is injected via vcpu ioctl command
> KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
> not simulated.
>

Maybe I'm missing something, but couldn't this be implemented entirely
within userspace? There's nothing VT/SVM specific about this. If the
issue is setting these MSRs from userspace via KVM_SET_MSRS isn't
enough, perhaps we should add userspace MSR handling.

Also, if you implement the MSR logic in userspace, it's pretty simple to
make it work in the non-TCG case which will be a requirement for
upstream merging.

Regards,

Anthony LIguori

2009-04-19 08:39:42

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Anthony Liguori wrote:
> Huang Ying wrote:
>> The related MSRs are emulated. MCE capability is exported via
>> extension KVM_CAP_MCE and ioctl KVM_X86_GET_MCE_CAP_SUPPORTED. A new
>> vcpu ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation
>> such as the mcg_cap. MCE is injected via vcpu ioctl command
>> KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
>> not simulated.
>>
>
> Maybe I'm missing something, but couldn't this be implemented entirely
> within userspace? There's nothing VT/SVM specific about this. If the
> issue is setting these MSRs from userspace via KVM_SET_MSRS isn't
> enough, perhaps we should add userspace MSR handling.
>

You also need to inject the MCE.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

2009-04-20 01:20:00

by Huang, Ying

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

On Sat, 2009-04-18 at 23:54 +0800, Anthony Liguori wrote:
> Huang Ying wrote:
> > The related MSRs are emulated. MCE capability is exported via
> > extension KVM_CAP_MCE and ioctl KVM_X86_GET_MCE_CAP_SUPPORTED. A new
> > vcpu ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation
> > such as the mcg_cap. MCE is injected via vcpu ioctl command
> > KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
> > not simulated.
> >
>
> Maybe I'm missing something, but couldn't this be implemented entirely
> within userspace? There's nothing VT/SVM specific about this. If the
> issue is setting these MSRs from userspace via KVM_SET_MSRS isn't
> enough, perhaps we should add userspace MSR handling.
>
> Also, if you implement the MSR logic in userspace, it's pretty simple to
> make it work in the non-TCG case which will be a requirement for
> upstream merging.

There is more logic than just KVM_SET_MSRS, such as BANK reporting
disabling, overwriting rules, triple fault for UC MCE during MCIP.
Although these logic can be implemented in user space, I think put them
in kernel space is easy to be understood. And the code is pretty short.

Best Regards,
Huang Ying


Attachments:
signature.asc (197.00 B)
This is a digitally signed message part

2009-04-20 03:57:31

by Kyle Moffett

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

On Sun, Apr 19, 2009 at 9:19 PM, Huang Ying <[email protected]> wrote:
> On Sat, 2009-04-18 at 23:54 +0800, Anthony Liguori wrote:
>> Huang Ying wrote:
>> > The related MSRs are emulated. MCE capability is exported via
>> > extension KVM_CAP_MCE and ioctl KVM_X86_GET_MCE_CAP_SUPPORTED. A new
>> > vcpu ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation
>> > such as the mcg_cap. MCE is injected via vcpu ioctl command
>> > KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
>> > not simulated.
>> >
>>
>> Maybe I'm missing something, but couldn't this be implemented entirely
>> within userspace?  There's nothing VT/SVM specific about this.  If the
>> issue is setting these MSRs from userspace via KVM_SET_MSRS isn't
>> enough, perhaps we should add userspace MSR handling.
>>
>> Also, if you implement the MSR logic in userspace, it's pretty simple to
>> make it work in the non-TCG case which will be a requirement for
>> upstream merging.
>
> There is more logic than just KVM_SET_MSRS, such as BANK reporting
> disabling, overwriting rules, triple fault for UC MCE during MCIP.
> Although these logic can be implemented in user space, I think put them
> in kernel space is easy to be understood. And the code is pretty short.

IMO the main reason to put this in kernel-space would be to make it
possible to automatically forward some MCE errors generated by the
real hardware (RAM ECC errors for example) down into the VM. Right
now I suppose you could do that with the patches to forward RAM-based
hard MCEs to userspace using SIGSEGV and handling the SIGSEGV in
userspace, but that seems more fragile to me.

Cheers,
Kyle Moffett

2009-04-20 05:04:46

by Huang, Ying

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

On Mon, 2009-04-20 at 11:57 +0800, Kyle Moffett wrote:
> On Sun, Apr 19, 2009 at 9:19 PM, Huang Ying <[email protected]> wrote:
> > On Sat, 2009-04-18 at 23:54 +0800, Anthony Liguori wrote:
> >> Huang Ying wrote:
> >> > The related MSRs are emulated. MCE capability is exported via
> >> > extension KVM_CAP_MCE and ioctl KVM_X86_GET_MCE_CAP_SUPPORTED. A new
> >> > vcpu ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation
> >> > such as the mcg_cap. MCE is injected via vcpu ioctl command
> >> > KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
> >> > not simulated.
> >> >
> >>
> >> Maybe I'm missing something, but couldn't this be implemented entirely
> >> within userspace? There's nothing VT/SVM specific about this. If the
> >> issue is setting these MSRs from userspace via KVM_SET_MSRS isn't
> >> enough, perhaps we should add userspace MSR handling.
> >>
> >> Also, if you implement the MSR logic in userspace, it's pretty simple to
> >> make it work in the non-TCG case which will be a requirement for
> >> upstream merging.
> >
> > There is more logic than just KVM_SET_MSRS, such as BANK reporting
> > disabling, overwriting rules, triple fault for UC MCE during MCIP.
> > Although these logic can be implemented in user space, I think put them
> > in kernel space is easy to be understood. And the code is pretty short.
>
> IMO the main reason to put this in kernel-space would be to make it
> possible to automatically forward some MCE errors generated by the
> real hardware (RAM ECC errors for example) down into the VM. Right
> now I suppose you could do that with the patches to forward RAM-based
> hard MCEs to userspace using SIGSEGV and handling the SIGSEGV in
> userspace, but that seems more fragile to me.

Yes. Puting this in kernel-space would make it more reliable to forward
the MCE to KVM guest.

Best Regards,
Huang Ying


Attachments:
signature.asc (197.00 B)
This is a digitally signed message part

2009-04-20 05:27:35

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

> IMO the main reason to put this in kernel-space would be to make it
> possible to automatically forward some MCE errors generated by the
> real hardware (RAM ECC errors for example) down into the VM. Right
> now I suppose you could do that with the patches to forward RAM-based
> hard MCEs to userspace using SIGSEGV and handling the SIGSEGV in
> userspace, but that seems more fragile to me.

I think you refer to my hwpoison patches that generate SIGBUS?

The event has to go through user space anyways. The code to generate
the fake MCE from the SIGBUS is (or rather will be with current
patch) in qemu. Normally the principle to process such errors
as quickly as possible is sound though, although I'm not sure
how much difference it makes. In theory it could be put
into kernel too, but you would need a "in kernel signal handler"
then, which would need quite some more changes.

-Andi
--
[email protected] -- Speaking for myself only.

2009-04-21 16:07:09

by Anthony Liguori

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Avi Kivity wrote:
> Anthony Liguori wrote:
>> Huang Ying wrote:
>>> The related MSRs are emulated. MCE capability is exported via
>>> extension KVM_CAP_MCE and ioctl KVM_X86_GET_MCE_CAP_SUPPORTED. A new
>>> vcpu ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation
>>> such as the mcg_cap. MCE is injected via vcpu ioctl command
>>> KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
>>> not simulated.
>>>
>>
>> Maybe I'm missing something, but couldn't this be implemented
>> entirely within userspace? There's nothing VT/SVM specific about
>> this. If the issue is setting these MSRs from userspace via
>> KVM_SET_MSRS isn't enough, perhaps we should add userspace MSR handling.
>>
>
> You also need to inject the MCE.

KVM_SET_EXCEPTION, instead of adding something MCE specific?

Regards,

Anthony Liguori

2009-04-21 16:12:30

by Anthony Liguori

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Avi Kivity wrote:
> Anthony Liguori wrote:
>> Huang Ying wrote:
>>> The related MSRs are emulated. MCE capability is exported via
>>> extension KVM_CAP_MCE and ioctl KVM_X86_GET_MCE_CAP_SUPPORTED. A new
>>> vcpu ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation
>>> such as the mcg_cap. MCE is injected via vcpu ioctl command
>>> KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
>>> not simulated.
>>>
>>
>> Maybe I'm missing something, but couldn't this be implemented
>> entirely within userspace? There's nothing VT/SVM specific about
>> this. If the issue is setting these MSRs from userspace via
>> KVM_SET_MSRS isn't enough, perhaps we should add userspace MSR handling.
>>
>
> You also need to inject the MCE.

Regardless of the KVM interface for this, to go upstream to QEMU, this
needs a TCG implementation which means the logic must be duplicated in
userspace. In particular, this is because a user-visible command is
being introduced in the monitor.

So Avi, regardless of what interface is chosen for KVM, can you hold off
applying these patches until there is a TCG implementation? Otherwise,
we'll be playing catchup between kvm-userspace and upstream QEMU forever.

Regards,

Anthony Liguori

2009-04-21 16:19:53

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Anthony Liguori wrote:
>> You also need to inject the MCE.
>
> KVM_SET_EXCEPTION, instead of adding something MCE specific?

That's of dubious use... exceptions are cpu-internal things. It's also
problematic to implement - what do you do if an exception is already
pending? For MCE you know it's priority (think it overrides almost
everything).

I would really like to have a well defined boundary.

--
error compiling committee.c: too many arguments to function

2009-04-21 16:21:45

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Anthony Liguori wrote:
>
> Regardless of the KVM interface for this, to go upstream to QEMU, this
> needs a TCG implementation which means the logic must be duplicated in
> userspace. In particular, this is because a user-visible command is
> being introduced in the monitor.

Definitely.

>
> So Avi, regardless of what interface is chosen for KVM, can you hold
> off applying these patches until there is a TCG implementation?
> Otherwise, we'll be playing catchup between kvm-userspace and upstream
> QEMU forever.

I have a feeling we will anyway, but that's no reason to increase the
gap. Huang, can you generate a version for qemu/tcg?

--
error compiling committee.c: too many arguments to function

2009-04-21 19:55:26

by Anthony Liguori

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Avi Kivity wrote:
>>
>> So Avi, regardless of what interface is chosen for KVM, can you hold
>> off applying these patches until there is a TCG implementation?
>> Otherwise, we'll be playing catchup between kvm-userspace and
>> upstream QEMU forever.
>
> I have a feeling we will anyway,

Not if we avoid committing things to kvm-userspace that would require
major reworking to get into upstream QEMU. Fixing these issues is much
easier up front then down the road.

Regards,

Anthony Liguori

2009-04-21 20:00:37

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Anthony Liguori wrote:
> Avi Kivity wrote:
>>>
>>> So Avi, regardless of what interface is chosen for KVM, can you hold
>>> off applying these patches until there is a TCG implementation?
>>> Otherwise, we'll be playing catchup between kvm-userspace and
>>> upstream QEMU forever.
>>
>> I have a feeling we will anyway,
>
> Not if we avoid committing things to kvm-userspace that would require
> major reworking to get into upstream QEMU. Fixing these issues is
> much easier up front then down the road.
>

No argument. But some kernel features will require major rework in qemu
before we can support them properly. We'll still want to bring them to
users quickly so users can enjoy them (and we can enjoy users testing
them). Waiting for a qemu rework may take a while, and we'll lose
valuable feedback.

For this kind of work, we can provide kvm-userspace.git as a kind of
playground where these experiments may be made. kvm-userspace.git
exists to support kvm.git; while qemu.git has a life of its own and more
stringent barriers to acceptance.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

2009-04-21 20:08:51

by Anthony Liguori

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Avi Kivity wrote:
> No argument. But some kernel features will require major rework in
> qemu before we can support them properly. We'll still want to bring
> them to users quickly so users can enjoy them (and we can enjoy users
> testing them). Waiting for a qemu rework may take a while, and we'll
> lose valuable feedback.

Then we're failing. If a particular implementation of a feature is
acceptable for kvm-userspace users, and we don't take it in QEMU without
requiring a huge amount of different work, then it suggests something is
broken about the QEMU process.

> For this kind of work, we can provide kvm-userspace.git as a kind of
> playground where these experiments may be made. kvm-userspace.git
> exists to support kvm.git; while qemu.git has a life of its own and
> more stringent barriers to acceptance.

As long as people are using kvm-userspace.git instead of qemu.git, we're
failing IMHO. If kvm-userspace.git is basically the equivalent of the
x86 git kernel tree, linux-next, or something like that, then that's a
good thing.

Regards,

Anthony Liguori

2009-04-21 20:45:32

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Anthony Liguori wrote:
> Avi Kivity wrote:
>> No argument. But some kernel features will require major rework in
>> qemu before we can support them properly. We'll still want to bring
>> them to users quickly so users can enjoy them (and we can enjoy users
>> testing them). Waiting for a qemu rework may take a while, and we'll
>> lose valuable feedback.
>
> Then we're failing. If a particular implementation of a feature is
> acceptable for kvm-userspace users, and we don't take it in QEMU
> without requiring a huge amount of different work, then it suggests
> something is broken about the QEMU process.
>

Example: SMP.
Example: vlan API.

>> For this kind of work, we can provide kvm-userspace.git as a kind of
>> playground where these experiments may be made. kvm-userspace.git
>> exists to support kvm.git; while qemu.git has a life of its own and
>> more stringent barriers to acceptance.
>
> As long as people are using kvm-userspace.git instead of qemu.git,
> we're failing IMHO. If kvm-userspace.git is basically the equivalent
> of the x86 git kernel tree, linux-next, or something like that, then
> that's a good thing.

That's definitely a long term goal, but qemu is not yet at a point where
it is easy to implement new features efficiently. Once it reaches that
state, kvm-userspace will become a simple staging ground (or even
disappear entirely).

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

2009-04-21 21:16:24

by Anthony Liguori

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Avi Kivity wrote:
>
> Example: SMP.

There was no KVM support in QEMU at the time when SMP was introduced.
Had there been, I see no reason not to do it in upstream QEMU.

> Example: vlan API.

You'll have to be more specific. Do you mean the up coming vlan API
refactoring? That absolutely should be happening in upstream QEMU.

>>> For this kind of work, we can provide kvm-userspace.git as a kind of
>>> playground where these experiments may be made. kvm-userspace.git
>>> exists to support kvm.git; while qemu.git has a life of its own and
>>> more stringent barriers to acceptance.
>>
>> As long as people are using kvm-userspace.git instead of qemu.git,
>> we're failing IMHO. If kvm-userspace.git is basically the equivalent
>> of the x86 git kernel tree, linux-next, or something like that, then
>> that's a good thing.
>
> That's definitely a long term goal, but qemu is not yet at a point
> where it is easy to implement new features efficiently. Once it
> reaches that state, kvm-userspace will become a simple staging ground
> (or even disappear entirely).

The simple fact is that right now, Fedora ships kvm-userspace and calls
it QEMU. It builds packages for non-KVM enabled boards. There is a
very large userspace that consumes packages derived from kvm-userspace.
Like it or not, kvm-userspace is not an experimental staging ground for
QEMU.

The only reasonable things to do IMHO is for as much as humanly possible
to be deferred to QEMU or for you to comes to terms with your role as a
defacto QEMU maintainer and start pushing back more on patch sets that
don't take into consideration TCG/non-KVM environments :-)

MCE is a perfect example of something that really has no reason to go in
via kvm-userspace since we have enough KVM support in upstream QEMU.

Regards,

Anthony Liguori

2009-04-22 02:33:20

by Huang, Ying

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

On Wed, 2009-04-22 at 00:21 +0800, Avi Kivity wrote:
> Anthony Liguori wrote:
> >
> > Regardless of the KVM interface for this, to go upstream to QEMU, this
> > needs a TCG implementation which means the logic must be duplicated in
> > userspace. In particular, this is because a user-visible command is
> > being introduced in the monitor.
>
> Definitely.
>
> >
> > So Avi, regardless of what interface is chosen for KVM, can you hold
> > off applying these patches until there is a TCG implementation?
> > Otherwise, we'll be playing catchup between kvm-userspace and upstream
> > QEMU forever.
>
> I have a feeling we will anyway, but that's no reason to increase the
> gap. Huang, can you generate a version for qemu/tcg?

OK. I will implement the qemu/tcg support.

Best Regards,
Huang Ying


Attachments:
signature.asc (197.00 B)
This is a digitally signed message part

2009-04-22 02:42:28

by Gregory Haskins

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Kyle Moffett wrote:
> On Sun, Apr 19, 2009 at 9:19 PM, Huang Ying <[email protected]> wrote:
>
>> On Sat, 2009-04-18 at 23:54 +0800, Anthony Liguori wrote:
>>
>>> Huang Ying wrote:
>>>
>>>> The related MSRs are emulated. MCE capability is exported via
>>>> extension KVM_CAP_MCE and ioctl KVM_X86_GET_MCE_CAP_SUPPORTED. A new
>>>> vcpu ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation
>>>> such as the mcg_cap. MCE is injected via vcpu ioctl command
>>>> KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
>>>> not simulated.
>>>>
>>>>
>>> Maybe I'm missing something, but couldn't this be implemented entirely
>>> within userspace? There's nothing VT/SVM specific about this. If the
>>> issue is setting these MSRs from userspace via KVM_SET_MSRS isn't
>>> enough, perhaps we should add userspace MSR handling.
>>>
>>> Also, if you implement the MSR logic in userspace, it's pretty simple to
>>> make it work in the non-TCG case which will be a requirement for
>>> upstream merging.
>>>
>> There is more logic than just KVM_SET_MSRS, such as BANK reporting
>> disabling, overwriting rules, triple fault for UC MCE during MCIP.
>> Although these logic can be implemented in user space, I think put them
>> in kernel space is easy to be understood. And the code is pretty short.
>>
>
> IMO the main reason to put this in kernel-space would be to make it
> possible to automatically forward some MCE errors generated by the
> real hardware (RAM ECC errors for example) down into the VM. Right
> now I suppose you could do that with the patches to forward RAM-based
> hard MCEs to userspace using SIGSEGV and handling the SIGSEGV in
> userspace, but that seems more fragile to me.
>

FWIW: I would be looking for a way to generate MCE to report async vbus
faults via my SHM_SIGNAL_FAULT() mechanism, so I am very much in favor
of this support being in-kernel.

-Greg

> Cheers,
> Kyle Moffett
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>



Attachments:
signature.asc (266.00 B)
OpenPGP digital signature

2009-04-22 05:57:40

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Anthony Liguori wrote:
> Avi Kivity wrote:
>>
>> Example: SMP.
>
> There was no KVM support in QEMU at the time when SMP was introduced.
> Had there been, I see no reason not to do it in upstream QEMU.
>

Marcelo's been working on getting iothread (needed for smp) into qemu
for a while. It will take still more time. All this with the previous
implementation available.

In contrast, initial smp support was hacked into kvm-userspace in a few
days. Sure we didn't have an iothread (vcpu 0 did all the work) and it
was horribly ugly and buggy, but we got the feature out for testing and
got feedback, not only on the userspace bits, but also on the kernel
bits which were new.

Going through qemu would have delayed the work significantly, as well as
slowed down kernel development.

>> Example: vlan API.
>
> You'll have to be more specific. Do you mean the up coming vlan API
> refactoring? That absolutely should be happening in upstream QEMU.

The refactoring, absolutely. But if I have kernel support for zero copy
tomorrow, do I wait until qemu completes refactoring the VLAN API, or do
I hack something in so I can test it and get the benefit to users?

>>> As long as people are using kvm-userspace.git instead of qemu.git,
>>> we're failing IMHO. If kvm-userspace.git is basically the
>>> equivalent of the x86 git kernel tree, linux-next, or something like
>>> that, then that's a good thing.
>>
>> That's definitely a long term goal, but qemu is not yet at a point
>> where it is easy to implement new features efficiently. Once it
>> reaches that state, kvm-userspace will become a simple staging ground
>> (or even disappear entirely).
>
> The simple fact is that right now, Fedora ships kvm-userspace and
> calls it QEMU. It builds packages for non-KVM enabled boards. There
> is a very large userspace that consumes packages derived from
> kvm-userspace. Like it or not, kvm-userspace is not an experimental
> staging ground for QEMU.

It depends on how fast qemu development is. If everything took no time,
sure, but that is not the case.

>
> The only reasonable things to do IMHO is for as much as humanly
> possible to be deferred to QEMU or for you to comes to terms with your
> role as a defacto QEMU maintainer and start pushing back more on patch
> sets that don't take into consideration TCG/non-KVM environments :-)

I do that whenever possible -- and it most often is possible.

But neither kvm nor tcg are not going to be supersets of the other. Of
course qemu is not a subset of kvm as it has a much wider target/host
variety. But also kvm will always have features that tcg does not
have. For example AVX is easy to implement with a few lines in the
kernel and qemu, but would take a massive effort in tcg. It would have
a large performance impact for AVX-enabled apps/guests/hosts
combinations, not so much for tcg.

kvm wants features for large-scale production deployment. This means
focus on performance and managebility. qemu/tcg is more for hobbyist
use or as a developer tool for developing operating system kernels.
This means more focus on ease of use.
>
> MCE is a perfect example of something that really has no reason to go
> in via kvm-userspace since we have enough KVM support in upstream QEMU.

I agree. But the requirement that everything in kvm have a counterpart
in tcg is not realistic. The primary use of MCE for example is used to
allow a guest to survive bad hardware. I don't see this as being useful
in any way on qemu/tcg. A secondary is is to debug mce handling in
guests OSes; now this is useful with tcg, but I'd hesitate to call it a
requirement, it's more of a nice to have.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

2009-04-22 13:32:33

by Anthony Liguori

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Avi Kivity wrote:
>
> The refactoring, absolutely. But if I have kernel support for zero
> copy tomorrow, do I wait until qemu completes refactoring the VLAN
> API, or do I hack something in so I can test it and get the benefit to
> users?

Why can't we do this in upstream QEMU though? Part of the point I'm
trying to make here is that what QEMU is can be flexible. We can find
ways to include functionality that might not be ready for prime time by
making it conditionally compiled, only available when using KVM, etc.
It's all open for discussion. So I'll be quite blunt about it, what
needs to change about what QEMU takes and doesn't take in order to get
rid of kvm-userspace?

Experimentation is a good thing. It's also important to do things the
right way as early as possible though because the longer users depend on
something, the harder it is to change later. I think it's easier to
strike that balance in upstream QEMU than trying to port things from
kvm-userspace over to QEMU after the fact.

>>
>> The only reasonable things to do IMHO is for as much as humanly
>> possible to be deferred to QEMU or for you to comes to terms with
>> your role as a defacto QEMU maintainer and start pushing back more on
>> patch sets that don't take into consideration TCG/non-KVM
>> environments :-)
>
> I do that whenever possible -- and it most often is possible.
>
> But neither kvm nor tcg are not going to be supersets of the other.
> Of course qemu is not a subset of kvm as it has a much wider
> target/host variety. But also kvm will always have features that tcg
> does not have. For example AVX is easy to implement with a few lines
> in the kernel and qemu, but would take a massive effort in tcg. It
> would have a large performance impact for AVX-enabled
> apps/guests/hosts combinations, not so much for tcg.
>
> kvm wants features for large-scale production deployment. This means
> focus on performance and managebility. qemu/tcg is more for hobbyist
> use or as a developer tool for developing operating system kernels.
> This means more focus on ease of use.

We can have KVM specific features in QEMU when that makes sense. In the
case of MCE, it doesn't make any sense because it's relatively simple
and the implementation can be common given the right interfaces.

>>
>> MCE is a perfect example of something that really has no reason to go
>> in via kvm-userspace since we have enough KVM support in upstream QEMU.
>
> I agree. But the requirement that everything in kvm have a
> counterpart in tcg is not realistic. The primary use of MCE for
> example is used to allow a guest to survive bad hardware. I don't see
> this as being useful in any way on qemu/tcg. A secondary is is to
> debug mce handling in guests OSes; now this is useful with tcg, but
> I'd hesitate to call it a requirement, it's more of a nice to have.

There is no requirement that everything have a counter part in TCG.

Regards,

Anthony Liguori

2009-04-22 14:02:49

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH -v2] Add MCE support to KVM

Anthony Liguori wrote:
> Avi Kivity wrote:
>>
>> The refactoring, absolutely. But if I have kernel support for zero
>> copy tomorrow, do I wait until qemu completes refactoring the VLAN
>> API, or do I hack something in so I can test it and get the benefit
>> to users?
>
> Why can't we do this in upstream QEMU though? Part of the point I'm
> trying to make here is that what QEMU is can be flexible.

If we do it the right way, it takes time, and we serialize development
(for example, we don't find and fix bugs in the kernel).
We don't want to do it the wrong way.

> We can find ways to include functionality that might not be ready for
> prime time by making it conditionally compiled, only available when
> using KVM, etc. It's all open for discussion. So I'll be quite blunt
> about it, what needs to change about what QEMU takes and doesn't take
> in order to get rid of kvm-userspace?

#ifdefs in master, an experimental branch, and kvm-userspace (soon,
qemu-kvm) are all equivalent.

There's even an option to diff(1) to generate #ifdefs instead of the
traditional diff markers.

> Experimentation is a good thing. It's also important to do things the
> right way as early as possible though because the longer users depend
> on something, the harder it is to change later. I think it's easier
> to strike that balance in upstream QEMU than trying to port things
> from kvm-userspace over to QEMU after the fact.

Well, let's take it on a case by case basis. I certainly prefer to see
everything go to qemu.git. If we find a case where it isn't workable,
qemu-kvm.git can be a temporary home.

> We can have KVM specific features in QEMU when that makes sense. In
> the case of MCE, it doesn't make any sense because it's relatively
> simple and the implementation can be common given the right interfaces.

The kvm kernel module doesn't have common implementation for qemu/tcg
and qemu/kvm as one of its goals. It doesn't know anything about qemu.
It wants to export an interface that makes sense (is as close to the cpu
model as possible).

The few places that we inadvertently let qemuisms slip through turned
out to be mistakes.

I agree that MCE is simple so it's easy to implement in both. But the
other features may be different.

--
error compiling committee.c: too many arguments to function