2007-09-14 19:45:59

by Anthony Liguori

[permalink] [raw]
Subject: [PATCH] Refactor hypercall infrastructure

This patch refactors the current hypercall infrastructure to better support live
migration and SMP. It eliminates the hypercall page by trapping the UD
exception that would occur if you used the wrong hypercall instruction for the
underlying architecture and replacing it with the right one lazily.

It also introduces the infrastructure to probe for hypercall available via
CPUID leaves 0x40000002. CPUID leaf 0x40000003 should be filled out by
userspace.

A fall-out of this patch is that the unhandled hypercalls no longer trap to
userspace. There is very little reason though to use a hypercall to communicate
with userspace as PIO or MMIO can be used. There is no code in tree that uses
userspace hypercalls.

Signed-off-by: Anthony Liguori <[email protected]>

diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index ad08138..1cde572 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -46,6 +46,7 @@
#define KVM_MAX_CPUID_ENTRIES 40

#define DE_VECTOR 0
+#define UD_VECTOR 6
#define NM_VECTOR 7
#define DF_VECTOR 8
#define TS_VECTOR 10
@@ -317,9 +318,6 @@ struct kvm_vcpu {
unsigned long cr0;
unsigned long cr2;
unsigned long cr3;
- gpa_t para_state_gpa;
- struct page *para_state_page;
- gpa_t hypercall_gpa;
unsigned long cr4;
unsigned long cr8;
u64 pdptrs[4]; /* pae */
@@ -622,7 +620,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);

-int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu);

static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 99e4917..5211d19 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -39,6 +39,7 @@
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
+#include <linux/kvm_para.h>

#include <asm/processor.h>
#include <asm/msr.h>
@@ -1383,51 +1384,61 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);

-int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
- unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
+ unsigned long nr, a0, a1, a2, a3, ret;

kvm_x86_ops->cache_regs(vcpu);
- ret = -KVM_EINVAL;
-#ifdef CONFIG_X86_64
- if (is_long_mode(vcpu)) {
- nr = vcpu->regs[VCPU_REGS_RAX];
- a0 = vcpu->regs[VCPU_REGS_RDI];
- a1 = vcpu->regs[VCPU_REGS_RSI];
- a2 = vcpu->regs[VCPU_REGS_RDX];
- a3 = vcpu->regs[VCPU_REGS_RCX];
- a4 = vcpu->regs[VCPU_REGS_R8];
- a5 = vcpu->regs[VCPU_REGS_R9];
- } else
-#endif
- {
- nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
- a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
- a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
- a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
- a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
- a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
- a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
- }
+
+ nr = vcpu->regs[VCPU_REGS_RAX];
+ a0 = vcpu->regs[VCPU_REGS_RBX];
+ a1 = vcpu->regs[VCPU_REGS_RCX];
+ a2 = vcpu->regs[VCPU_REGS_RDX];
+ a3 = vcpu->regs[VCPU_REGS_RSI];
+
+ if (!is_long_mode(vcpu)) {
+ nr &= ~1u;
+ a0 &= ~1u;
+ a1 &= ~1u;
+ a2 &= ~1u;
+ a3 &= ~1u;
+ }
+
switch (nr) {
default:
- run->hypercall.nr = nr;
- run->hypercall.args[0] = a0;
- run->hypercall.args[1] = a1;
- run->hypercall.args[2] = a2;
- run->hypercall.args[3] = a3;
- run->hypercall.args[4] = a4;
- run->hypercall.args[5] = a5;
- run->hypercall.ret = ret;
- run->hypercall.longmode = is_long_mode(vcpu);
- kvm_x86_ops->decache_regs(vcpu);
- return 0;
+ ret = -KVM_ENOSYS;
+ break;
}
vcpu->regs[VCPU_REGS_RAX] = ret;
kvm_x86_ops->decache_regs(vcpu);
- return 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
+
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
+{
+ char instruction[3];
+ int ret = 0;
+
+ mutex_lock(&vcpu->kvm->lock);
+
+ /*
+ * Blow out the MMU to ensure that no other VCPU has an active mapping
+ * to ensure that the updated hypercall appears atomically across all
+ * VCPUs.
+ */
+ kvm_mmu_zap_all(vcpu->kvm);
+
+ kvm_x86_ops->cache_regs(vcpu);
+ kvm_x86_ops->patch_hypercall(vcpu, instruction);
+ if (emulator_write_emulated(vcpu->rip, instruction, 3, vcpu)
+ != X86EMUL_CONTINUE)
+ ret = -EFAULT;
+
+ mutex_unlock(&vcpu->kvm->lock);
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(kvm_hypercall);

static u64 mk_cr_64(u64 curr_cr, u32 new_val)
{
@@ -1495,75 +1506,6 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
}
}

-/*
- * Register the para guest with the host:
- */
-static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
-{
- struct kvm_vcpu_para_state *para_state;
- hpa_t para_state_hpa, hypercall_hpa;
- struct page *para_state_page;
- unsigned char *hypercall;
- gpa_t hypercall_gpa;
-
- printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
- printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
-
- /*
- * Needs to be page aligned:
- */
- if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
- goto err_gp;
-
- para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
- printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
- if (is_error_hpa(para_state_hpa))
- goto err_gp;
-
- mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
- para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
- para_state = kmap(para_state_page);
-
- printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version);
- printk(KERN_DEBUG ".... size: %d\n", para_state->size);
-
- para_state->host_version = KVM_PARA_API_VERSION;
- /*
- * We cannot support guests that try to register themselves
- * with a newer API version than the host supports:
- */
- if (para_state->guest_version > KVM_PARA_API_VERSION) {
- para_state->ret = -KVM_EINVAL;
- goto err_kunmap_skip;
- }
-
- hypercall_gpa = para_state->hypercall_gpa;
- hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
- printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
- if (is_error_hpa(hypercall_hpa)) {
- para_state->ret = -KVM_EINVAL;
- goto err_kunmap_skip;
- }
-
- printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
- vcpu->para_state_page = para_state_page;
- vcpu->para_state_gpa = para_state_gpa;
- vcpu->hypercall_gpa = hypercall_gpa;
-
- mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
- hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
- KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
- kvm_x86_ops->patch_hypercall(vcpu, hypercall);
- kunmap_atomic(hypercall, KM_USER1);
-
- para_state->ret = 0;
-err_kunmap_skip:
- kunmap(para_state_page);
- return 0;
-err_gp:
- return 1;
-}
-
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
u64 data;
@@ -1677,12 +1619,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_IA32_MISC_ENABLE:
vcpu->ia32_misc_enable_msr = data;
break;
- /*
- * This is the 'probe whether the host is KVM' logic:
- */
- case MSR_KVM_API_MAGIC:
- return vcpu_register_para(vcpu, data);
-
default:
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
return 1;
@@ -1721,6 +1657,18 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
vcpu->regs[VCPU_REGS_RBX] = 0;
vcpu->regs[VCPU_REGS_RCX] = 0;
vcpu->regs[VCPU_REGS_RDX] = 0;
+
+ if (function == 0x40000002) {
+ u32 signature[3];
+
+ memcpy(signature, "LinuxPVLinux", 12);
+ vcpu->regs[VCPU_REGS_RAX] = 0;
+ vcpu->regs[VCPU_REGS_RBX] = signature[0];
+ vcpu->regs[VCPU_REGS_RCX] = signature[1];
+ vcpu->regs[VCPU_REGS_RDX] = signature[2];
+ goto out;
+ }
+
best = NULL;
for (i = 0; i < vcpu->cpuid_nent; ++i) {
e = &vcpu->cpuid_entries[i];
@@ -1741,6 +1689,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
vcpu->regs[VCPU_REGS_RCX] = best->ecx;
vcpu->regs[VCPU_REGS_RDX] = best->edx;
}
+out:
kvm_x86_ops->decache_regs(vcpu);
kvm_x86_ops->skip_emulated_instruction(vcpu);
}
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index 729f1cd..d09a9f5 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -476,7 +476,8 @@ static void init_vmcb(struct vmcb *vmcb)
INTERCEPT_DR5_MASK |
INTERCEPT_DR7_MASK;

- control->intercept_exceptions = 1 << PF_VECTOR;
+ control->intercept_exceptions = (1 << PF_VECTOR) |
+ (1 << UD_VECTOR);


control->intercept = (1ULL << INTERCEPT_INTR) |
@@ -970,6 +971,17 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 0;
}

+static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ int er;
+
+ er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0);
+ if (er != EMULATE_DONE)
+ inject_ud(&svm->vcpu);
+
+ return 1;
+}
+
static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
@@ -1036,7 +1048,8 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
svm->next_rip = svm->vmcb->save.rip + 3;
skip_emulated_instruction(&svm->vcpu);
- return kvm_hypercall(&svm->vcpu, kvm_run);
+ kvm_emulate_hypercall(&svm->vcpu);
+ return 1;
}

static int invalid_op_interception(struct vcpu_svm *svm,
@@ -1232,6 +1245,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_WRITE_DR3] = emulate_on_interception,
[SVM_EXIT_WRITE_DR5] = emulate_on_interception,
[SVM_EXIT_WRITE_DR7] = emulate_on_interception,
+ [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
[SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
[SVM_EXIT_INTR] = nop_on_interception,
@@ -1664,7 +1678,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[0] = 0x0f;
hypercall[1] = 0x01;
hypercall[2] = 0xd9;
- hypercall[3] = 0xc3;
}

static void svm_check_processor_compat(void *rtn)
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 4f115a8..a71564c 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -164,6 +164,13 @@ static inline int is_no_device(u32 intr_info)
(INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
}

+static inline int is_invalid_opcode(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+}
+
static inline int is_external_interrupt(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -315,7 +322,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;

- eb = 1u << PF_VECTOR;
+ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
if (!vcpu->fpu_active)
eb |= 1u << NM_VECTOR;
if (vcpu->guest_debug.enabled)
@@ -558,6 +565,15 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
INTR_INFO_VALID_MASK);
}

+static void vmx_inject_ud(struct kvm_vcpu *vcpu)
+{
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ GP_VECTOR |
+ INTR_TYPE_EXCEPTION |
+ INTR_INFO_DELIEVER_CODE_MASK |
+ INTR_INFO_VALID_MASK);
+}
+
/*
* Swap MSR entry in host/guest MSR entry array.
*/
@@ -1770,6 +1786,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}

+ if (is_invalid_opcode(intr_info)) {
+ er = emulate_instruction(vcpu, kvm_run, 0, 0);
+ if (er != EMULATE_DONE)
+ vmx_inject_ud(vcpu);
+
+ return 1;
+ }
+
error_code = 0;
rip = vmcs_readl(GUEST_RIP);
if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
@@ -1872,7 +1896,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[0] = 0x0f;
hypercall[1] = 0x01;
hypercall[2] = 0xc1;
- hypercall[3] = 0xc3;
}

static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -2058,7 +2081,8 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
skip_emulated_instruction(vcpu);
- return kvm_hypercall(vcpu, kvm_run);
+ kvm_emulate_hypercall(vcpu);
+ return 1;
}

/*
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
index 18c2b2c..1362082 100644
--- a/drivers/kvm/x86_emulate.c
+++ b/drivers/kvm/x86_emulate.c
@@ -1301,19 +1301,37 @@ twobyte_insn:
u16 size;
unsigned long address;

- case 2: /* lgdt */
- rc = read_descriptor(ctxt, ops, src.ptr,
- &size, &address, op_bytes);
+ case 0: /* vmcall */
+ if (modrm_mod != 3 || modrm_rm != 1)
+ goto cannot_emulate;
+
+ rc = kvm_fix_hypercall(ctxt->vcpu);
if (rc)
goto done;
- realmode_lgdt(ctxt->vcpu, size, address);
+
+ kvm_emulate_hypercall(ctxt->vcpu);
break;
- case 3: /* lidt */
+ case 2: /* lgdt */
rc = read_descriptor(ctxt, ops, src.ptr,
&size, &address, op_bytes);
if (rc)
goto done;
- realmode_lidt(ctxt->vcpu, size, address);
+ realmode_lgdt(ctxt->vcpu, size, address);
+ break;
+ case 3: /* lidt/vmmcall */
+ if (modrm_mod == 3 && modrm_rm == 1) {
+ rc = kvm_fix_hypercall(ctxt->vcpu);
+ if (rc)
+ goto done;
+ kvm_emulate_hypercall(ctxt->vcpu);
+ } else {
+ rc = read_descriptor(ctxt, ops, src.ptr,
+ &size, &address,
+ op_bytes);
+ if (rc)
+ goto done;
+ realmode_lidt(ctxt->vcpu, size, address);
+ }
break;
case 4: /* smsw */
if (modrm_mod != 3)
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 3b29256..448112a 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -1,73 +1,86 @@
#ifndef __LINUX_KVM_PARA_H
#define __LINUX_KVM_PARA_H

-/*
- * Guest OS interface for KVM paravirtualization
- *
- * Note: this interface is totally experimental, and is certain to change
- * as we make progress.
- */
+#ifdef __KERNEL__
+#include <asm/processor.h>

-/*
- * Per-VCPU descriptor area shared between guest and host. Writable to
- * both guest and host. Registered with the host by the guest when
- * a guest acknowledges paravirtual mode.
- *
- * NOTE: all addresses are guest-physical addresses (gpa), to make it
- * easier for the hypervisor to map between the various addresses.
- */
-struct kvm_vcpu_para_state {
- /*
- * API version information for compatibility. If there's any support
- * mismatch (too old host trying to execute too new guest) then
- * the host will deny entry into paravirtual mode. Any other
- * combination (new host + old guest and new host + new guest)
- * is supposed to work - new host versions will support all old
- * guest API versions.
- */
- u32 guest_version;
- u32 host_version;
- u32 size;
- u32 ret;
+#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"

- /*
- * The address of the vm exit instruction (VMCALL or VMMCALL),
- * which the host will patch according to the CPU model the
- * VM runs on:
- */
- u64 hypercall_gpa;
+static inline long kvm_hypercall0(unsigned int nr)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr));
+ return ret;
+}

-} __attribute__ ((aligned(PAGE_SIZE)));
+static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1));
+ return ret;
+}

-#define KVM_PARA_API_VERSION 1
+static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
+ unsigned long p2)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2));
+ return ret;
+}

-/*
- * This is used for an RDMSR's ECX parameter to probe for a KVM host.
- * Hopefully no CPU vendor will use up this number. This is placed well
- * out of way of the typical space occupied by CPU vendors' MSR indices,
- * and we think (or at least hope) it wont be occupied in the future
- * either.
- */
-#define MSR_KVM_API_MAGIC 0x87655678
+static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3));
+ return ret;
+}

-#define KVM_EINVAL 1
+static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3,
+ unsigned long p4)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4));
+ return ret;
+}

-/*
- * Hypercall calling convention:
- *
- * Each hypercall may have 0-6 parameters.
- *
- * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1
- *
- * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention
- * order: RDI, RSI, RDX, RCX, R8, R9.
- *
- * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP.
- * (the first 3 are according to the gcc regparm calling convention)
- *
- * No registers are clobbered by the hypercall, except that the
- * return value is in RAX.
- */
-#define __NR_hypercalls 0
+static inline int kvm_para_available(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ char signature[13];
+
+ cpuid(0x40000002, &eax, &ebx, &ecx, &edx);
+ memcpy(signature + 0, &ebx, 4);
+ memcpy(signature + 4, &ecx, 4);
+ memcpy(signature + 8, &edx, 4);
+ signature[12] = 0;
+
+ if (strcmp(signature, "LinuxPVLinux") == 0)
+ return 1;
+
+ return 0;
+}
+
+static inline int kvm_para_has_feature(unsigned int feature)
+{
+ if (cpuid_eax(0x40000003) & (1UL << feature))
+ return 1;
+ return 0;
+}
+
+#endif
+
+#define KVM_ENOSYS 1000

#endif


2007-09-14 20:53:37

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH] Refactor hypercall infrastructure

Anthony Liguori wrote:
> This patch refactors the current hypercall infrastructure to better support live
> migration and SMP. It eliminates the hypercall page by trapping the UD
> exception that would occur if you used the wrong hypercall instruction for the
> underlying architecture and replacing it with the right one lazily.
>

I guess it would be pretty rude/unlikely for these opcodes to get reused
in other implementations... But couldn't you make the page trap
instead, rather than relying on an instruction fault?

> It also introduces the infrastructure to probe for hypercall available via
> CPUID leaves 0x40000002. CPUID leaf 0x40000003 should be filled out by
> userspace.
>

Is this compatible with Xen's (and other's) use of cpuid? That is,
0x40000000 returns a hypervisor-specific signature in e[bcd]x, and eax
has the max hypervisor leaf.

J

2007-09-14 21:03:16

by Anthony Liguori

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Jeremy Fitzhardinge wrote:
> Anthony Liguori wrote:
>
>> This patch refactors the current hypercall infrastructure to better support live
>> migration and SMP. It eliminates the hypercall page by trapping the UD
>> exception that would occur if you used the wrong hypercall instruction for the
>> underlying architecture and replacing it with the right one lazily.
>>
>>
>
> I guess it would be pretty rude/unlikely for these opcodes to get reused
> in other implementations... But couldn't you make the page trap
> instead, rather than relying on an instruction fault?
>

The whole point of using the instruction is to allow hypercalls to be
used in many locations. This has the nice side effect of not requiring
a central hypercall initialization routine in the guest to fetch the
hypercall page. A PV driver can be completely independent of any other
code provided that it restricts itself to it's hypercall namespace.

>> It also introduces the infrastructure to probe for hypercall available via
>> CPUID leaves 0x40000002. CPUID leaf 0x40000003 should be filled out by
>> userspace.
>>
>>
>
> Is this compatible with Xen's (and other's) use of cpuid? That is,
> 0x40000000 returns a hypervisor-specific signature in e[bcd]x, and eax
> has the max hypervisor leaf.
>

Xen is currently using 0/1/2. I had thought it was only using 0/1. The
intention was not to squash Xen's current CPUID usage so that it would
still be possible for Xen to make use of the guest code. Can we agree
that Xen won't squash leaves 3/4 or is it not worth trying to be
compatible at this point?

Regards,

Anthony Liguori

> J
>
> -------------------------------------------------------------------------
> This SF.net email is sponsored by: Microsoft
> Defy all challenges. Microsoft(R) Visual Studio 2005.
> http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
> _______________________________________________
> kvm-devel mailing list
> [email protected]
> https://lists.sourceforge.net/lists/listinfo/kvm-devel
>
>

2007-09-14 21:20:54

by Zachary Amsden

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

On Fri, 2007-09-14 at 16:02 -0500, Anthony Liguori wrote:
> Jeremy Fitzhardinge wrote:
> > Anthony Liguori wrote:
> >
> >> This patch refactors the current hypercall infrastructure to better support live
> >> migration and SMP. It eliminates the hypercall page by trapping the UD
> >> exception that would occur if you used the wrong hypercall instruction for the
> >> underlying architecture and replacing it with the right one lazily.
> >>
> >>
> >
> > I guess it would be pretty rude/unlikely for these opcodes to get reused
> > in other implementations... But couldn't you make the page trap
> > instead, rather than relying on an instruction fault?
> >
>
> The whole point of using the instruction is to allow hypercalls to be
> used in many locations. This has the nice side effect of not requiring
> a central hypercall initialization routine in the guest to fetch the
> hypercall page. A PV driver can be completely independent of any other
> code provided that it restricts itself to it's hypercall namespace.

But if the instruction is architecture dependent, and you run on the
wrong architecture, now you have to patch many locations at fault time,
introducing some nasty runtime code / data cache overlap performance
problems. Granted, they go away eventually.

I prefer the idea of a hypercall page, but not a central initialization.
Rather, a decentralized approach where PV drivers can detect using CPUID
which hypervisor is present, and a common MSR shared by all hypervisors
that provides the location of the hypercall page.

Zach

2007-09-14 21:22:51

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Anthony Liguori wrote:
> The whole point of using the instruction is to allow hypercalls to be
> used in many locations. This has the nice side effect of not
> requiring a central hypercall initialization routine in the guest to
> fetch the hypercall page. A PV driver can be completely independent
> of any other code provided that it restricts itself to it's hypercall
> namespace.

I see. So you take the fault, disassemble the instruction, see that its
another CPU's vmcall instruction, and then replace it with the current
CPU's vmcall?

> Xen is currently using 0/1/2. I had thought it was only using 0/1.
> The intention was not to squash Xen's current CPUID usage so that it
> would still be possible for Xen to make use of the guest code. Can we
> agree that Xen won't squash leaves 3/4 or is it not worth trying to be
> compatible at this point?

No, the point is that you're supposed to work out which hypervisor it is
from the signature in leaf 0, and then the hypervisor can put anything
it wants in the other leaves.

J

2007-09-14 21:45:25

by Anthony Liguori

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Zachary Amsden wrote:
> On Fri, 2007-09-14 at 16:02 -0500, Anthony Liguori wrote:
>
>> Jeremy Fitzhardinge wrote:
>>
>>> Anthony Liguori wrote:
>>>
>>>
>>>> This patch refactors the current hypercall infrastructure to better support live
>>>> migration and SMP. It eliminates the hypercall page by trapping the UD
>>>> exception that would occur if you used the wrong hypercall instruction for the
>>>> underlying architecture and replacing it with the right one lazily.
>>>>
>>>>
>>>>
>>> I guess it would be pretty rude/unlikely for these opcodes to get reused
>>> in other implementations... But couldn't you make the page trap
>>> instead, rather than relying on an instruction fault?
>>>
>>>
>> The whole point of using the instruction is to allow hypercalls to be
>> used in many locations. This has the nice side effect of not requiring
>> a central hypercall initialization routine in the guest to fetch the
>> hypercall page. A PV driver can be completely independent of any other
>> code provided that it restricts itself to it's hypercall namespace.
>>
>
> But if the instruction is architecture dependent, and you run on the
> wrong architecture, now you have to patch many locations at fault time,
> introducing some nasty runtime code / data cache overlap performance
> problems. Granted, they go away eventually.
>

We're addressing that by blowing away the shadow cache and holding the
big kvm lock to ensure SMP safety. Not a great thing to do from a
performance perspective but the whole point of patching is that the cost
is amortized.

> I prefer the idea of a hypercall page, but not a central initialization.
> Rather, a decentralized approach where PV drivers can detect using CPUID
> which hypervisor is present, and a common MSR shared by all hypervisors
> that provides the location of the hypercall page.
>

So then each module creates a hypercall page using this magic MSR and
the hypervisor has to keep track of it so that it can appropriately
change the page on migration. The page can only contain a single
instruction or else it cannot be easily changed (or you have to be able
to prevent the guest from being migrated while in the hypercall page).

We're really talking about identical models. Instead of an MSR, the #GP
is what tells the hypervisor to update the instruction. The nice thing
about this is that you don't have to keep track of all the current
hypercall page locations in the hypervisor.

Regards,

Anthony Liguori

> Zach
>
>
>

2007-09-14 21:46:40

by Anthony Liguori

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Jeremy Fitzhardinge wrote:
> Anthony Liguori wrote:
>
>> The whole point of using the instruction is to allow hypercalls to be
>> used in many locations. This has the nice side effect of not
>> requiring a central hypercall initialization routine in the guest to
>> fetch the hypercall page. A PV driver can be completely independent
>> of any other code provided that it restricts itself to it's hypercall
>> namespace.
>>
>
> I see. So you take the fault, disassemble the instruction, see that its
> another CPU's vmcall instruction, and then replace it with the current
> CPU's vmcall?
>

Yup.

>> Xen is currently using 0/1/2. I had thought it was only using 0/1.
>> The intention was not to squash Xen's current CPUID usage so that it
>> would still be possible for Xen to make use of the guest code. Can we
>> agree that Xen won't squash leaves 3/4 or is it not worth trying to be
>> compatible at this point?
>>
>
> No, the point is that you're supposed to work out which hypervisor it is
> from the signature in leaf 0, and then the hypervisor can put anything
> it wants in the other leaves.
>

Yeah, see, the initial goal was to make it possible to use the KVM
paravirtualizations on other hypervisors. However, I don't think this
is really going to be possible in general so maybe it's better to just
use leaf 0. I'll let others chime in before sending a new patch.

Regards,

Anthony Liguori

> J
>
>

2007-09-14 21:53:16

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Anthony Liguori wrote:
> Yeah, see, the initial goal was to make it possible to use the KVM
> paravirtualizations on other hypervisors. However, I don't think this
> is really going to be possible in general so maybe it's better to just
> use leaf 0. I'll let others chime in before sending a new patch.

Hm. Obviously you can just define a signature for "kvm-compatible
hypercall interface" and make it common that way, but it gets tricky if
the hypervisor supports multiple hypercall interfaces, including the kvm
one. Start the kvm leaves at 0x40001000 or something?

J

2007-09-14 22:08:36

by Anthony Liguori

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Jeremy Fitzhardinge wrote:
> Anthony Liguori wrote:
>
>> Yeah, see, the initial goal was to make it possible to use the KVM
>> paravirtualizations on other hypervisors. However, I don't think this
>> is really going to be possible in general so maybe it's better to just
>> use leaf 0. I'll let others chime in before sending a new patch.
>>
>
> Hm. Obviously you can just define a signature for "kvm-compatible
> hypercall interface" and make it common that way, but it gets tricky if
> the hypervisor supports multiple hypercall interfaces, including the kvm
> one. Start the kvm leaves at 0x40001000 or something?
>

Yeah, that works with me.

Regards,

Anthony Liguori

> J
>
>

2007-09-14 22:40:39

by Nakajima, Jun

[permalink] [raw]
Subject: RE: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Anthony Liguori wrote:
> Jeremy Fitzhardinge wrote:
> > Anthony Liguori wrote:
> >
> > > Yeah, see, the initial goal was to make it possible to use the KVM
> > > paravirtualizations on other hypervisors. However, I don't think
this
> > > is really going to be possible in general so maybe it's better to
just
> > > use leaf 0. I'll let others chime in before sending a new patch.
> > >
> >
> > Hm. Obviously you can just define a signature for "kvm-compatible
> > hypercall interface" and make it common that way, but it gets tricky
if
> > the hypervisor supports multiple hypercall interfaces, including the
kvm
> > one. Start the kvm leaves at 0x40001000 or something?
> >
>
> Yeah, that works with me.

To me this is the beginning of fragmentation. Why do we need different
and VMM-specific Linux paravirtualization for hardware-assisted
virtualization? That would not be good for Linux.

>
> Regards,
>
> Anthony Liguori
>
> > J

Jun
---
Intel Open Source Technology Center

2007-09-14 23:00:39

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Nakajima, Jun wrote:
>>> one. Start the kvm leaves at 0x40001000 or something?
>>>
>>>
>> Yeah, that works with me.
>>
>
> To me this is the beginning of fragmentation. Why do we need different
> and VMM-specific Linux paravirtualization for hardware-assisted
> virtualization? That would not be good for Linux.
>

On the contrary. Xen already has a hypercall interface, and we need to
keep supporting it. If we were to also support a vmm-independent
interface (aka "kvm interface"), then we need to be able to do that in
parallel. If we have a cpuid leaf clash, then its impossible to do so;
if we define the new interface to be disjoint from other current users
of cpuid, then we can support them concurrently.

J

2007-09-15 00:11:00

by Nakajima, Jun

[permalink] [raw]
Subject: RE: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Jeremy Fitzhardinge wrote:
> Nakajima, Jun wrote:
> > > > one. Start the kvm leaves at 0x40001000 or something?
> > > >
> > > >
> > > Yeah, that works with me.
> > >
> >
> > To me this is the beginning of fragmentation. Why do we need
different
> > and VMM-specific Linux paravirtualization for hardware-assisted
> > virtualization? That would not be good for Linux.
> >
>
> On the contrary. Xen already has a hypercall interface, and we need
to
> keep supporting it. If we were to also support a vmm-independent
> interface (aka "kvm interface"), then we need to be able to do that in
> parallel. If we have a cpuid leaf clash, then its impossible to do
so;
> if we define the new interface to be disjoint from other current users
> of cpuid, then we can support them concurrently.
>
> J

Today, 3 CPUID leaves starting from 0x4000_0000 are defined in a generic
fashion (hypervisor detection, version, and hypercall page), and those
are the ones used by Xen today. We should extend those leaves (e.g.
starting from 0x4000_0003) for the vmm-independent features as well.

If Xen needs additional Xen-specific features, we need to allocate some
leaves for those (e.g. 0x4000_1000)

Jun
---
Intel Open Source Technology Center

2007-09-15 00:28:18

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Nakajima, Jun wrote:
> Today, 3 CPUID leaves starting from 0x4000_0000 are defined in a generic
> fashion (hypervisor detection, version, and hypercall page), and those
> are the ones used by Xen today. We should extend those leaves (e.g.
> starting from 0x4000_0003) for the vmm-independent features as well.
>
> If Xen needs additional Xen-specific features, we need to allocate some
> leaves for those (e.g. 0x4000_1000)

But the signature is "XenVMMXenVMM", which isn't very generic. If we're
presenting a generic interface, it needs to have a generic signature,
otherwise guests will need to have a list of all hypervisor signatures
supporting their interface. Since 0x40000000 has already been
established as the base leaf of the hypervisor-specific interfaces, the
generic interface will have to be elsewhere.

J

2007-09-15 01:04:25

by Nakajima, Jun

[permalink] [raw]
Subject: RE: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Jeremy Fitzhardinge wrote:
> Nakajima, Jun wrote:
> > Today, 3 CPUID leaves starting from 0x4000_0000 are defined in a
generic
> > fashion (hypervisor detection, version, and hypercall page), and
those
> > are the ones used by Xen today. We should extend those leaves (e.g.
> > starting from 0x4000_0003) for the vmm-independent features as well.
> >
> > If Xen needs additional Xen-specific features, we need to allocate
some
> > leaves for those (e.g. 0x4000_1000)
>
> But the signature is "XenVMMXenVMM", which isn't very generic. If
we're
> presenting a generic interface, it needs to have a generic signature,
> otherwise guests will need to have a list of all hypervisor signatures
> supporting their interface. Since 0x40000000 has already been
> established as the base leaf of the hypervisor-specific interfaces,
the
> generic interface will have to be elsewhere.

The hypervisor detection machanism is generic, and the signature
returned is implentation specific. Having a list of all hypervisor
signatures sounds fine to me as we are detecting vendor-specific
processor(s) in the native. And I don't expect the list is large.

>
> J

Jun
---
Intel Open Source Technology Center

2007-09-15 03:05:55

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH] Refactor hypercall infrastructure

On Fri, 2007-09-14 at 13:53 -0700, Jeremy Fitzhardinge wrote:
> Anthony Liguori wrote:
> > This patch refactors the current hypercall infrastructure to better support live
> > migration and SMP. It eliminates the hypercall page by trapping the UD
> > exception that would occur if you used the wrong hypercall instruction for the
> > underlying architecture and replacing it with the right one lazily.
> >
>
> I guess it would be pretty rude/unlikely for these opcodes to get reused
> in other implementations... But couldn't you make the page trap
> instead, rather than relying on an instruction fault?

That's a pain for inline hypercalls tho. I was planning on moving
lguest to this model (which is interesting, because AFAICT this insn
will cause a #UD or #GP depending on whether VT is supported on this box
so I have to look for both).

Cheers,
Rusty.

2007-09-15 03:38:05

by Zachary Amsden

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

On Fri, 2007-09-14 at 16:44 -0500, Anthony Liguori wrote:

> So then each module creates a hypercall page using this magic MSR and
> the hypervisor has to keep track of it so that it can appropriately
> change the page on migration. The page can only contain a single
> instruction or else it cannot be easily changed (or you have to be able
> to prevent the guest from being migrated while in the hypercall page).
>
> We're really talking about identical models. Instead of an MSR, the #GP
> is what tells the hypervisor to update the instruction. The nice thing
> about this is that you don't have to keep track of all the current
> hypercall page locations in the hypervisor.

I agree, multiple hypercall pages is insane. I was thinking more of a
single hypercall page, fixed in place by the hypervisor, not the kernel.

Then each module can read an MSR saying what VA the hypercall page is
at, and the hypervisor can simply flip one page to switch architectures.

Zach

2007-09-15 04:53:51

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Nakajima, Jun wrote:
> The hypervisor detection machanism is generic, and the signature
> returned is implentation specific. Having a list of all hypervisor
> signatures sounds fine to me as we are detecting vendor-specific
> processor(s) in the native. And I don't expect the list is large.
>
>

I'm confused about what you're proposing. I was thinking that a kernel
looking for the generic hypervisor interface would check for a specific
signature at some cpuid leaf, and then go about using it from there. If
not, how does is it supposed to detect the generic hypervisor interface?

J

2007-09-15 06:11:32

by Nakajima, Jun

[permalink] [raw]
Subject: RE: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Jeremy Fitzhardinge wrote:
> Nakajima, Jun wrote:
> > The hypervisor detection machanism is generic, and the signature
> > returned is implentation specific. Having a list of all hypervisor
> > signatures sounds fine to me as we are detecting vendor-specific
> > processor(s) in the native. And I don't expect the list is large.
> >
> >
>
> I'm confused about what you're proposing. I was thinking that a
kernel
> looking for the generic hypervisor interface would check for a
specific
> signature at some cpuid leaf, and then go about using it from there.
If
> not, how does is it supposed to detect the generic hypervisor
interface?
>
> J

I'm suggesting that we use CPUID.0x4000000Y (Y: TBD, e.g. 6) for Linux
paravirtualization. The ebx, ecx and edx return the Linux
paravirtualization features available on that hypervisor. Those features
are defined architecturally (not VMM specific).

Like CPUID.0, CPUID.0x40000000 is used to detect the hypervisor with the
vendor identification string returned in ebx, edx, and ecx (as we are
doing in Xen). The eax returns the max leaf (which is 0x40000002 on Xen
today). And like CPUID.1, CPUID.0x40000001 returns the version number in
eax, and each VMM should be able to define a number of VMM-specific
features available in ebx, ecx, and edx returned (which are reserved,
i.e. not used in Xen today).

Suppose we knew (i.e. tested) Xen and KVM supported Linux
paravirtualization, the Linux code does:
1. detect Xen or KVM <the list> using CPUID.0x40000000
2. Check the version if necessary using CPUID.0x40000001
3. Check the Linux paravirtualization features available using
CPUID.0x4000000Y.

Jun
---
Intel Open Source Technology Center

2007-09-15 07:53:26

by Avi Kivity

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Anthony Liguori wrote:
> Jeremy Fitzhardinge wrote:
>
>> Anthony Liguori wrote:
>>
>>
>>> This patch refactors the current hypercall infrastructure to better support live
>>> migration and SMP. It eliminates the hypercall page by trapping the UD
>>> exception that would occur if you used the wrong hypercall instruction for the
>>> underlying architecture and replacing it with the right one lazily.
>>>
>>>
>>>
>> I guess it would be pretty rude/unlikely for these opcodes to get reused
>> in other implementations... But couldn't you make the page trap
>> instead, rather than relying on an instruction fault?
>>
>>
>
> The whole point of using the instruction is to allow hypercalls to be
> used in many locations. This has the nice side effect of not requiring
> a central hypercall initialization routine in the guest to fetch the
> hypercall page. A PV driver can be completely independent of any other
> code provided that it restricts itself to it's hypercall namespace.
>
>

It also has the benefit of not requiring an initialization protocol, and
of reducing complaints about the hypervisor injecting code into the guest.


>>> It also introduces the infrastructure to probe for hypercall available via
>>> CPUID leaves 0x40000002. CPUID leaf 0x40000003 should be filled out by
>>> userspace.
>>>
>>>
>>>
>> Is this compatible with Xen's (and other's) use of cpuid? That is,
>> 0x40000000 returns a hypervisor-specific signature in e[bcd]x, and eax
>> has the max hypervisor leaf.
>>
>>
>
> Xen is currently using 0/1/2. I had thought it was only using 0/1. The
> intention was not to squash Xen's current CPUID usage so that it would
> still be possible for Xen to make use of the guest code. Can we agree
> that Xen won't squash leaves 3/4 or is it not worth trying to be
> compatible at this point?
>

I definitely want kvm to be able to emulate the Xen hypercall interface,
but there's no need to allow both concurrently. So I'd say use
0x40000000 for detection and the rest cannot clash because detection fails.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

2007-09-15 08:01:00

by Avi Kivity

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Nakajima, Jun wrote:
> To me this is the beginning of fragmentation. Why do we need different
> and VMM-specific Linux paravirtualization for hardware-assisted
> virtualization? That would not be good for Linux.
>
>

The only way to have a single interface is if a central authority
defines and documents that interface, and all hypervisor implementors
agree not to implement extensions. Do you see that happening?

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

2007-09-15 08:09:03

by Avi Kivity

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Zachary Amsden wrote:
> On Fri, 2007-09-14 at 16:44 -0500, Anthony Liguori wrote:
>
>
>> So then each module creates a hypercall page using this magic MSR and
>> the hypervisor has to keep track of it so that it can appropriately
>> change the page on migration. The page can only contain a single
>> instruction or else it cannot be easily changed (or you have to be able
>> to prevent the guest from being migrated while in the hypercall page).
>>
>> We're really talking about identical models. Instead of an MSR, the #GP
>> is what tells the hypervisor to update the instruction. The nice thing
>> about this is that you don't have to keep track of all the current
>> hypercall page locations in the hypervisor.
>>
>
> I agree, multiple hypercall pages is insane. I was thinking more of a
> single hypercall page, fixed in place by the hypervisor, not the kernel.
>
> Then each module can read an MSR saying what VA the hypercall page is
> at, and the hypervisor can simply flip one page to switch architectures.
>

VA as in "Virtual Address"? the ppc people don't have
hypervisor-visible virtual addresses, and the hypervisor (on x86) can't
safely select a virtual address, and ...

That means you need a physical address, so you need a central
initialization routine, and drivers for unmodified OSes can no longer be
self contained.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

2007-09-15 17:33:50

by Anthony Liguori

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Zachary Amsden wrote:
> On Fri, 2007-09-14 at 16:44 -0500, Anthony Liguori wrote:
>
>
>> So then each module creates a hypercall page using this magic MSR and
>> the hypervisor has to keep track of it so that it can appropriately
>> change the page on migration. The page can only contain a single
>> instruction or else it cannot be easily changed (or you have to be able
>> to prevent the guest from being migrated while in the hypercall page).
>>
>> We're really talking about identical models. Instead of an MSR, the #GP
>> is what tells the hypervisor to update the instruction. The nice thing
>> about this is that you don't have to keep track of all the current
>> hypercall page locations in the hypervisor.
>>
>
> I agree, multiple hypercall pages is insane. I was thinking more of a
> single hypercall page, fixed in place by the hypervisor, not the kernel.
>
> Then each module can read an MSR saying what VA the hypercall page is
> at, and the hypervisor can simply flip one page to switch architectures.
>

That requires a memory hole though. In KVM, we don't have a memory hole.

Regards,

Anthony Liguori

> Zach
>
>

2007-09-15 18:24:13

by Anthony Liguori

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Nakajima, Jun wrote:
> Jeremy Fitzhardinge wrote:
>
>> Nakajima, Jun wrote:
>>
>>> The hypervisor detection machanism is generic, and the signature
>>> returned is implentation specific. Having a list of all hypervisor
>>> signatures sounds fine to me as we are detecting vendor-specific
>>> processor(s) in the native. And I don't expect the list is large.
>>>
>>>
>>>
>> I'm confused about what you're proposing. I was thinking that a
>>
> kernel
>
>> looking for the generic hypervisor interface would check for a
>>
> specific
>
>> signature at some cpuid leaf, and then go about using it from there.
>>
> If
>
>> not, how does is it supposed to detect the generic hypervisor
>>
> interface?
>
>> J
>>
>
> I'm suggesting that we use CPUID.0x4000000Y (Y: TBD, e.g. 6) for Linux
> paravirtualization. The ebx, ecx and edx return the Linux
> paravirtualization features available on that hypervisor. Those features
> are defined architecturally (not VMM specific).
>
> Like CPUID.0, CPUID.0x40000000 is used to detect the hypervisor with the
> vendor identification string returned in ebx, edx, and ecx (as we are
> doing in Xen). The eax returns the max leaf (which is 0x40000002 on Xen
> today).

I don't understand the purpose of returning the max leaf. Who is that
information useful for?

I like Jeremy's suggesting of starting with 0x40001000 for KVM. Xen has
an established hypercall interface and that isn't going to change.
However, in the future, if other Operating Systems (like the BSDs)
choose to implement the KVM paravirtualization interface, then that
leaves open the possibility for Xen to also support this interface to
get good performance for those OSes. It's necessary to be able to
support both at once if you wish to support these interfaces without
user interaction.

There's no tangible benefit to us to use 0x40000000. Therefore I'm
inclined to lean toward making things easier for others.

Regards,

Anthony Liguori

> And like CPUID.1, CPUID.0x40000001 returns the version number in
> eax, and each VMM should be able to define a number of VMM-specific
> features available in ebx, ecx, and edx returned (which are reserved,
> i.e. not used in Xen today).
>
> Suppose we knew (i.e. tested) Xen and KVM supported Linux
> paravirtualization, the Linux code does:
> 1. detect Xen or KVM <the list> using CPUID.0x40000000
> 2. Check the version if necessary using CPUID.0x40000001
> 3. Check the Linux paravirtualization features available using
> CPUID.0x4000000Y.
>
> Jun
> ---
> Intel Open Source Technology Center
>
> -------------------------------------------------------------------------
> This SF.net email is sponsored by: Microsoft
> Defy all challenges. Microsoft(R) Visual Studio 2005.
> http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
> _______________________________________________
> kvm-devel mailing list
> [email protected]
> https://lists.sourceforge.net/lists/listinfo/kvm-devel
>

2007-09-17 18:15:22

by Nakajima, Jun

[permalink] [raw]
Subject: RE: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Anthony Liguori wrote:
> Nakajima, Jun wrote:
<snip>
> >
> > I'm suggesting that we use CPUID.0x4000000Y (Y: TBD, e.g. 6) for
Linux
> > paravirtualization. The ebx, ecx and edx return the Linux
> > paravirtualization features available on that hypervisor. Those
features
> > are defined architecturally (not VMM specific).
> >
> > Like CPUID.0, CPUID.0x40000000 is used to detect the hypervisor with
the
> > vendor identification string returned in ebx, edx, and ecx (as we
are
> > doing in Xen). The eax returns the max leaf (which is 0x40000002 on
Xen
> > today).
>
> I don't understand the purpose of returning the max leaf. Who is that
> information useful for?

Well, this is the key info to the user of CPUID. It tells which leaves
are valid to use. Otherwise, the user cannot tell whether the results of
CPUID.0x4000000N are valid or not (i.e. junk). BTW, this is what we are
doing on the native (for the leaf 0, 0x80000000, for example). The fact
that Xen returns 0x40000002 means it only uses 3 leaves today.

>
> I like Jeremy's suggesting of starting with 0x40001000 for KVM. Xen
has
> an established hypercall interface and that isn't going to change.
> However, in the future, if other Operating Systems (like the BSDs)
> choose to implement the KVM paravirtualization interface, then that
> leaves open the possibility for Xen to also support this interface to
> get good performance for those OSes. It's necessary to be able to
> support both at once if you wish to support these interfaces without
> user interaction.

Using CPUID.0x4000000N (N > 2) does not prevent Xen from doing that,
either. If you use 0x40001000, 1) you need to say the leaves from
0x40000000 through 0x40001000 are all valid, OR 2) you create/fork a
new/odd leaf (with 0x1000 offset) repeating the detection redundantly.

>
> There's no tangible benefit to us to use 0x40000000. Therefore I'm
> inclined to lean toward making things easier for others.

Again, 0x40000000 is not Xen specific. If the leaf 0x40000000 is used
for any guest to detect any hypervisor, that would be compelling
benefit. For future Xen-specific features, it's safe for Xen to use
other bigger leaves (like 0x40001000) because the guest starts looking
at them after detection of Xen.

Likewise if KVM paravirtualization interface (as kind of "open source
paravirtualization interface") is detected in the generic areas (not in
vender-specific), any guest can check the features available without
knowing which hypervisor uses which CPUID for that.

>
> Regards,
>
> Anthony Liguori
>
> > And like CPUID.1, CPUID.0x40000001 returns the version number in
> > eax, and each VMM should be able to define a number of VMM-specific
> > features available in ebx, ecx, and edx returned (which are
reserved, i.e.
> > not used in Xen today).
> >
> > Suppose we knew (i.e. tested) Xen and KVM supported Linux
> > paravirtualization, the Linux code does:
> > 1. detect Xen or KVM <the list> using CPUID.0x40000000
> > 2. Check the version if necessary using CPUID.0x40000001
> > 3. Check the Linux paravirtualization features available using
> > CPUID.0x4000000Y.
> >
> > Jun
> > ---
> > Intel Open Source Technology Center

Jun
---
Intel Open Source Technology Center

2007-09-17 18:27:35

by Anthony Liguori

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Nakajima, Jun wrote:
>> I don't understand the purpose of returning the max leaf. Who is that
>> information useful for?
>>
>
> Well, this is the key info to the user of CPUID. It tells which leaves
> are valid to use. Otherwise, the user cannot tell whether the results of
> CPUID.0x4000000N are valid or not (i.e. junk). BTW, this is what we are
> doing on the native (for the leaf 0, 0x80000000, for example). The fact
> that Xen returns 0x40000002 means it only uses 3 leaves today.
>

Then it's just a version ID. You pretty much have to treat it as a
version id because if it returns 0x4000 0003 and you only know what 0002
is, then you can't actually use it.

I much prefer the current use of CPUID in KVM. If 1000 returns the KVM
signature, then 1001 *must* be valid and contain a set of feature bits.
If we wish to use additional CPUID leaves in the future, then we can
just use a feature bit. The real benefit to us is that we can use a
discontiguous set of leaves whereas the Xen approach is forced to use a
linear set (at least for the result to be meaningful).

>> I like Jeremy's suggesting of starting with 0x40001000 for KVM. Xen
>>
> has
>
>> an established hypercall interface and that isn't going to change.
>> However, in the future, if other Operating Systems (like the BSDs)
>> choose to implement the KVM paravirtualization interface, then that
>> leaves open the possibility for Xen to also support this interface to
>> get good performance for those OSes. It's necessary to be able to
>> support both at once if you wish to support these interfaces without
>> user interaction.
>>
>
> Using CPUID.0x4000000N (N > 2) does not prevent Xen from doing that,
> either. If you use 0x40001000, 1) you need to say the leaves from
> 0x40000000 through 0x40001000 are all valid, OR 2) you create/fork a
> new/odd leaf (with 0x1000 offset) repeating the detection redundantly.
>

Why do 0000-1000 have to be valid? Xen is not going to change what they
have today--they can't. However, if down the road, they decided that
since so many guests use KVM's paravirtualization interface other than
Linux, there's value in supporting it, by using 1000, they can.

>> There's no tangible benefit to us to use 0x40000000. Therefore I'm
>> inclined to lean toward making things easier for others.
>>
>
> Again, 0x40000000 is not Xen specific. If the leaf 0x40000000 is used
> for any guest to detect any hypervisor, that would be compelling
> benefit. For future Xen-specific features, it's safe for Xen to use
> other bigger leaves (like 0x40001000) because the guest starts looking
> at them after detection of Xen.
>

I'm starting to lean toward just using 0000. If for no other reason
than the hypercall space is unsharable.

Regards,

Anthony Liguori

> Likewise if KVM paravirtualization interface (as kind of "open source
> paravirtualization interface") is detected in the generic areas (not in
> vender-specific), any guest can check the features available without
> knowing which hypervisor uses which CPUID for that.
>
>
>> Regards,
>>
>> Anthony Liguori
>>
>>
>>> And like CPUID.1, CPUID.0x40000001 returns the version number in
>>> eax, and each VMM should be able to define a number of VMM-specific
>>> features available in ebx, ecx, and edx returned (which are
>>>
> reserved, i.e.
>
>>> not used in Xen today).
>>>
>>> Suppose we knew (i.e. tested) Xen and KVM supported Linux
>>> paravirtualization, the Linux code does:
>>> 1. detect Xen or KVM <the list> using CPUID.0x40000000
>>> 2. Check the version if necessary using CPUID.0x40000001
>>> 3. Check the Linux paravirtualization features available using
>>> CPUID.0x4000000Y.
>>>
>>> Jun
>>> ---
>>> Intel Open Source Technology Center
>>>
>
> Jun
> ---
> Intel Open Source Technology Center
>

2007-09-17 19:15:37

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Nakajima, Jun wrote:
> Using CPUID.0x4000000N (N > 2) does not prevent Xen from doing that,
> either. If you use 0x40001000, 1) you need to say the leaves from
> 0x40000000 through 0x40001000 are all valid, OR 2) you create/fork a
> new/odd leaf (with 0x1000 offset) repeating the detection redundantly.
>

I don't see a particular problem with that. If the whole 0x4xxxxxxx
range is reserved for hypervisor use, and existing hypervisors are
already using 0x400000xx in hypervisor-specific ways, then it makes
sense to start the generic stuff at 0x40001xxx (or some other offset).
But without a few more implementations of the "generic" interface its
all a bit moot (ie, where's your code? ;).

> Again, 0x40000000 is not Xen specific. If the leaf 0x40000000 is used
> for any guest to detect any hypervisor, that would be compelling
> benefit. For future Xen-specific features, it's safe for Xen to use
> other bigger leaves (like 0x40001000) because the guest starts looking
> at them after detection of Xen.
>
> Likewise if KVM paravirtualization interface (as kind of "open source
> paravirtualization interface") is detected in the generic areas (not in
> vender-specific), any guest can check the features available without
> knowing which hypervisor uses which CPUID for that.
>

This just seems a bit grotty. You're relying on the fact that you can
overlay Xen's current use of 0x4000000x for the generic interface by
freezing Xen's current use of 40000000-2. 0x40000000 becomes a more or
less useless hypervisor-identification signature (useless because you
need to assume that leaves 4000000x, x>2 implement the generic interface
anyway, where x=1,2 are reserved for Xen (=hypervisor-specific) uses).

In other words, what mechanism can a guest use to explicitly identify
the existence of the generic interface? There needs to be a signature
for that somewhere.

J

2007-09-17 19:15:52

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Anthony Liguori wrote:
> Nakajima, Jun wrote:
>>> I don't understand the purpose of returning the max leaf. Who is that
>>> information useful for?
>>>
>>
>> Well, this is the key info to the user of CPUID. It tells which leaves
>> are valid to use. Otherwise, the user cannot tell whether the results of
>> CPUID.0x4000000N are valid or not (i.e. junk). BTW, this is what we are
>> doing on the native (for the leaf 0, 0x80000000, for example). The fact
>> that Xen returns 0x40000002 means it only uses 3 leaves today.
>
> Then it's just a version ID. You pretty much have to treat it as a
> version id because if it returns 0x4000 0003 and you only know what
> 0002 is, then you can't actually use it.

Yeah. It's the way all the other cpuid leaf/level stuff works, so it's
reasonable to do the same thing here. The question it helps answer is
"I understand leaf 33, does the [v]CPU?".

> I much prefer the current use of CPUID in KVM. If 1000 returns the
> KVM signature, then 1001 *must* be valid and contain a set of feature
> bits. If we wish to use additional CPUID leaves in the future, then
> we can just use a feature bit. The real benefit to us is that we can
> use a discontiguous set of leaves whereas the Xen approach is forced
> to use a linear set (at least for the result to be meaningful).

Well, its also what the CPU itself does. The feature bits tend to
relate to specific CPU features rather than CPUID instruction leaves.
The features themselves may also have corresponding leaves, but that's
secondary. IOW, if feature bit X is set, it may use leaf 0x4000101f,
but that doesn't mean leaves 0x40001001-1f are necessarily defined.

> I'm starting to lean toward just using 0000. If for no other reason
> than the hypercall space is unsharable.

Well, it could be, but it would take affirmative action on the guest's
part. If there's feature bits for each supported hypercall interface,
then you could have a magic MSR to select which interface you want to
use now. That would allow a generic-interface-using guest to probe for
the generic interface at cpuid leaf 0x40001000, use 40001001 to
determine whether the hypercall interface is available, 4000100x to find
the base of the magic msrs, and write appropriate msr to set the desired
hypercall style (and all this can be done without using vmcall, so it
doesn't matter that hypercall interface is initially established).

J

2007-09-17 19:34:20

by Anthony Liguori

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Jeremy Fitzhardinge wrote:
>> I'm starting to lean toward just using 0000. If for no other reason
>> than the hypercall space is unsharable.
>>
>
> Well, it could be, but it would take affirmative action on the guest's
> part. If there's feature bits for each supported hypercall interface,
> then you could have a magic MSR to select which interface you want to
> use now. That would allow a generic-interface-using guest to probe for
> the generic interface at cpuid leaf 0x40001000, use 40001001 to
> determine whether the hypercall interface is available, 4000100x to find
> the base of the magic msrs, and write appropriate msr to set the desired
> hypercall style (and all this can be done without using vmcall, so it
> doesn't matter that hypercall interface is initially established).
>

The main thing keeping me from doing this ATM is what I perceive as lack
of interest in a generic interface. I think it's also a little
premature given that we don't have any features on the plate yet.
However, I don't think that means that we cannot turn KVM's PV into a
generic one. So here's what I propose.

Let's start building the KVM PV interface on 4000 0000. That means that
Xen cannot initially use it but that's okay. Once KVM-lite is merged
and we have some solid features (and other guests start implementing
them), we can also advertise this interface as a "generic interface" by
also supporting the signature on leave 4000 1000 and using the MSR
trickery that you propose.

As long as we all agree not to use 4000 1000 for now, it leaves open the
possibility of having a generic interface in the future.

Regards,

Anthony Liguori

> J
>
> -------------------------------------------------------------------------
> This SF.net email is sponsored by: Microsoft
> Defy all challenges. Microsoft(R) Visual Studio 2005.
> http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
> _______________________________________________
> kvm-devel mailing list
> [email protected]
> https://lists.sourceforge.net/lists/listinfo/kvm-devel
>
>

2007-09-17 20:52:38

by Nakajima, Jun

[permalink] [raw]
Subject: RE: [kvm-devel] [PATCH] Refactor hypercall infrastructure

Jeremy Fitzhardinge wrote:
> Nakajima, Jun wrote:
>
> > Again, 0x40000000 is not Xen specific. If the leaf 0x40000000 is
used
> > for any guest to detect any hypervisor, that would be compelling
> > benefit. For future Xen-specific features, it's safe for Xen to use
> > other bigger leaves (like 0x40001000) because the guest starts
looking
> > at them after detection of Xen.
> >
> > Likewise if KVM paravirtualization interface (as kind of "open
source
> > paravirtualization interface") is detected in the generic areas (not
in
> > vender-specific), any guest can check the features available without
> > knowing which hypervisor uses which CPUID for that.
> >
>
> This just seems a bit grotty. You're relying on the fact that you can
> overlay Xen's current use of 0x4000000x for the generic interface by
> freezing Xen's current use of 40000000-2. 0x40000000 becomes a more
or
> less useless hypervisor-identification signature (useless because you
> need to assume that leaves 4000000x, x>2 implement the generic
interface
> anyway, where x=1,2 are reserved for Xen (=hypervisor-specific) uses).

No, really. Xen just _implemented_ the generic interface from the
beginning, at least for 0 and 1 (version). The 0x40000002 (hypercall
page) looks specific to Xen, but it can be used for KVM as well, thus
can be generic (or a hypervisor can tell it's not supported by returning
0 pages for hypercall pages). If Xen implements the new generic feature
(defined by 0x40000003, for example), then it returns 40000003 or large
for the max leaf upon CPUID.0x40000000.

>
> In other words, what mechanism can a guest use to explicitly identify
> the existence of the generic interface? There needs to be a signature
> for that somewhere.
>
> J
So you don't need a signature for that.

As I wrote before:
1. detect Xen or KVM <the list> using CPUID.0x40000000
2. Check the version if necessary using CPUID.0x40000001
3. Check the generic features available using CPUID.0x4000000Y, if the
max leaf returned >= 0x4000000Y.

A guest wants to want to know who the hypervior is for practical
purposes (e.g. debuggging) anyway. This is equivalent to what a native
OS would do to detect a generic CPU feature.

Jun
---
Intel Open Source Technology Center