2007-02-19 10:20:38

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 0/13] kvm updates for 2.6.21

Following is a series of kvm updates. Please apply to Linux 2.6.21.

--
error compiling committee.c: too many arguments to function


2007-02-19 10:21:53

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 1/13] KVM: mmu: add missing dirty page tracking cases

We fail to mark a page dirty in three cases:

- setting the accessed bit in a pte
- setting the dirty bit in a pte
- emulating a write into a pagetable

This fix adds the missing cases.

Signed-off-by: Avi Kivity <[email protected]>

Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -128,8 +128,10 @@ static int FNAME(walk_addr)(struct guest
goto access_error;
#endif

- if (!(*ptep & PT_ACCESSED_MASK))
- *ptep |= PT_ACCESSED_MASK; /* avoid rmw */
+ if (!(*ptep & PT_ACCESSED_MASK)) {
+ mark_page_dirty(vcpu->kvm, table_gfn);
+ *ptep |= PT_ACCESSED_MASK;
+ }

if (walker->level == PT_PAGE_TABLE_LEVEL) {
walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
@@ -185,6 +187,12 @@ static void FNAME(release_walker)(struct
kunmap_atomic(walker->table, KM_USER0);
}

+static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
+ struct guest_walker *walker)
+{
+ mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
+}
+
static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte,
u64 *shadow_pte, u64 access_bits, gfn_t gfn)
{
@@ -348,12 +356,15 @@ static int FNAME(fix_write_pf)(struct kv
} else if (kvm_mmu_lookup_page(vcpu, gfn)) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, gfn);
+ mark_page_dirty(vcpu->kvm, gfn);
+ FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
*guest_ent |= PT_DIRTY_MASK;
*write_pt = 1;
return 0;
}
mark_page_dirty(vcpu->kvm, gfn);
*shadow_ent |= PT_WRITABLE_MASK;
+ FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
*guest_ent |= PT_DIRTY_MASK;
rmap_add(vcpu, shadow_ent);

2007-02-19 10:22:53

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 2/13] KVM: Move virtualization deactivation from CPU_DEAD state to CPU_DOWN_PREPARE

From: Jeremy Katz <[email protected]>

This gives it more chances of surviving suspend.

Signed-off-by: Jeremy Katz <[email protected]>
Signed-off-by: Avi Kivity <[email protected]>

Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -2080,13 +2080,17 @@ static int kvm_cpu_hotplug(struct notifi
int cpu = (long)v;

switch (val) {
- case CPU_DEAD:
+ case CPU_DOWN_PREPARE:
case CPU_UP_CANCELED:
+ printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+ cpu);
decache_vcpus_on_cpu(cpu);
smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
NULL, 0, 1);
break;
- case CPU_UP_PREPARE:
+ case CPU_ONLINE:
+ printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
+ cpu);
smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
NULL, 0, 1);
break;

2007-02-19 10:23:54

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 3/13] KVM: Cosmetics

Signed-off-by: Avi Kivity <[email protected]>

Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -126,10 +126,8 @@ static inline int valid_vcpu(int n)
return likely(n >= 0 && n < KVM_MAX_VCPUS);
}

-int kvm_read_guest(struct kvm_vcpu *vcpu,
- gva_t addr,
- unsigned long size,
- void *dest)
+int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
+ void *dest)
{
unsigned char *host_buf = dest;
unsigned long req_size = size;
@@ -161,10 +159,8 @@ int kvm_read_guest(struct kvm_vcpu *vcpu
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

-int kvm_write_guest(struct kvm_vcpu *vcpu,
- gva_t addr,
- unsigned long size,
- void *data)
+int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
+ void *data)
{
unsigned char *host_buf = data;
unsigned long req_size = size;
@@ -457,7 +453,7 @@ EXPORT_SYMBOL_GPL(set_cr4);
void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
if (is_long_mode(vcpu)) {
- if ( cr3 & CR3_L_MODE_RESEVED_BITS) {
+ if (cr3 & CR3_L_MODE_RESEVED_BITS) {
printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
inject_gp(vcpu);
return;
@@ -774,7 +770,6 @@ static int kvm_dev_ioctl_get_dirty_log(s
if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
goto out;

-
if (any) {
cleared = 0;
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
@@ -903,8 +898,9 @@ static int emulator_read_emulated(unsign
return X86EMUL_CONTINUE;
else {
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+
if (gpa == UNMAPPED_GVA)
- return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
+ return X86EMUL_PROPAGATE_FAULT;
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
vcpu->mmio_size = bytes;
@@ -1801,12 +1797,11 @@ static long kvm_dev_ioctl(struct file *f
case KVM_GET_API_VERSION:
r = KVM_API_VERSION;
break;
- case KVM_CREATE_VCPU: {
+ case KVM_CREATE_VCPU:
r = kvm_dev_ioctl_create_vcpu(kvm, arg);
if (r)
goto out;
break;
- }
case KVM_RUN: {
struct kvm_run kvm_run;

Index: linux-2.6/drivers/kvm/svm.c
===================================================================
--- linux-2.6.orig/drivers/kvm/svm.c
+++ linux-2.6/drivers/kvm/svm.c
@@ -1042,22 +1042,22 @@ static int io_interception(struct kvm_vc

addr_mask = io_adress(vcpu, _in, &kvm_run->io.address);
if (!addr_mask) {
- printk(KERN_DEBUG "%s: get io address failed\n", __FUNCTION__);
+ printk(KERN_DEBUG "%s: get io address failed\n",
+ __FUNCTION__);
return 1;
}

if (kvm_run->io.rep) {
- kvm_run->io.count = vcpu->regs[VCPU_REGS_RCX] & addr_mask;
+ kvm_run->io.count
+ = vcpu->regs[VCPU_REGS_RCX] & addr_mask;
kvm_run->io.string_down = (vcpu->svm->vmcb->save.rflags
& X86_EFLAGS_DF) != 0;
}
- } else {
+ } else
kvm_run->io.value = vcpu->svm->vmcb->save.rax;
- }
return 0;
}

-
static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
return 1;
Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c
+++ linux-2.6/drivers/kvm/vmx.c
@@ -27,7 +27,6 @@

#include "segment_descriptor.h"

-
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

@@ -418,10 +417,9 @@ static int vmx_set_msr(struct kvm_vcpu *
case MSR_IA32_SYSENTER_ESP:
vmcs_write32(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_TIME_STAMP_COUNTER: {
+ case MSR_IA32_TIME_STAMP_COUNTER:
guest_write_tsc(data);
break;
- }
default:
msr = find_msr_entry(vcpu, msr_index);
if (msr) {
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -441,9 +441,8 @@ static int FNAME(page_fault)(struct kvm_
/*
* mmio: emulate if accessible, otherwise its a guest fault.
*/
- if (is_io_pte(*shadow_pte)) {
+ if (is_io_pte(*shadow_pte))
return 1;
- }

++kvm_stat.pf_fixed;
kvm_mmu_audit(vcpu, "post page fault (fixed)");

2007-02-19 10:24:54

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 4/13] KVM: vmx: hack set_cr0_no_modeswitch() to actually do modeswitch

From: Markus Rechberger <[email protected]>
From: Joerg Roedel <[email protected]>

The whole thing is rotten, but this allows vmx to boot with the guest reboot
fix.

Signed-off-by: Markus Rechberger <[email protected]>
Signed-off-by: Joerg Roedel <[email protected]>
Signed-off-by: Avi Kivity <[email protected]>

Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c
+++ linux-2.6/drivers/kvm/vmx.c
@@ -791,6 +791,9 @@ static void vmx_set_cr0(struct kvm_vcpu
*/
static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0)
{
+ if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
+ enter_rmode(vcpu);
+
vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0);
update_exception_bitmap(vcpu);
vmcs_writel(CR0_READ_SHADOW, cr0);

2007-02-19 10:25:54

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 5/13] KVM: Use ARRAY_SIZE macro instead of manual calculation.

From: Ahmed S. Darwish <[email protected]>

Signed-off-by: Ahmed S. Darwish <[email protected]>
Signed-off-by: Dor Laor <[email protected]>
Signed-off-by: Avi Kivity <[email protected]>

Index: linux-2.6/drivers/kvm/svm.c
===================================================================
--- linux-2.6.orig/drivers/kvm/svm.c
+++ linux-2.6/drivers/kvm/svm.c
@@ -15,6 +15,7 @@
*/

#include <linux/module.h>
+#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/profile.h>
@@ -75,7 +76,7 @@ struct svm_init_data {

static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};

-#define NUM_MSR_MAPS (sizeof(msrpm_ranges) / sizeof(*msrpm_ranges))
+#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)

@@ -1297,7 +1298,7 @@ static int handle_exit(struct kvm_vcpu *
__FUNCTION__, vcpu->svm->vmcb->control.exit_int_info,
exit_code);

- if (exit_code >= sizeof(svm_exit_handlers) / sizeof(*svm_exit_handlers)
+ if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
|| svm_exit_handlers[exit_code] == 0) {
kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
printk(KERN_ERR "%s: 0x%x @ 0x%llx cr0 0x%lx rflags 0x%llx\n",
Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c
+++ linux-2.6/drivers/kvm/vmx.c
@@ -19,6 +19,7 @@
#include "vmx.h"
#include "kvm_vmx.h"
#include <linux/module.h>
+#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/profile.h>
@@ -75,7 +76,7 @@ static const u32 vmx_msr_index[] = {
#endif
MSR_EFER, MSR_K6_STAR,
};
-#define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
+#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)

static inline int is_page_fault(u32 intr_info)
{
Index: linux-2.6/drivers/kvm/kvm_svm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_svm.h
+++ linux-2.6/drivers/kvm/kvm_svm.h
@@ -1,6 +1,7 @@
#ifndef __KVM_SVM_H
#define __KVM_SVM_H

+#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/list.h>
#include <asm/msr.h>
@@ -18,7 +19,7 @@ static const u32 host_save_msrs[] = {
MSR_IA32_LASTBRANCHTOIP, MSR_IA32_LASTINTFROMIP,MSR_IA32_LASTINTTOIP,*/
};

-#define NR_HOST_SAVE_MSRS (sizeof(host_save_msrs) / sizeof(*host_save_msrs))
+#define NR_HOST_SAVE_MSRS ARRAY_SIZE(host_save_msrs)
#define NUM_DB_REGS 4

struct vcpu_svm {

2007-02-19 10:26:54

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 6/13] KVM: Use page_private()/set_page_private() apis

From: Markus Rechberger <[email protected]>

Besides using an established api, this allows using kvm in older kernels.

Signed-off-by: Markus Rechberger <[email protected]>
Signed-off-by: Avi Kivity <[email protected]>

Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -298,18 +298,18 @@ static void rmap_add(struct kvm_vcpu *vc
if (!is_rmap_pte(*spte))
return;
page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
- if (!page->private) {
+ if (!page_private(page)) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
- page->private = (unsigned long)spte;
- } else if (!(page->private & 1)) {
+ set_page_private(page,(unsigned long)spte);
+ } else if (!(page_private(page) & 1)) {
rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
desc = mmu_alloc_rmap_desc(vcpu);
- desc->shadow_ptes[0] = (u64 *)page->private;
+ desc->shadow_ptes[0] = (u64 *)page_private(page);
desc->shadow_ptes[1] = spte;
- page->private = (unsigned long)desc | 1;
+ set_page_private(page,(unsigned long)desc | 1);
} else {
rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+ desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
desc = desc->more;
if (desc->shadow_ptes[RMAP_EXT-1]) {
@@ -337,12 +337,12 @@ static void rmap_desc_remove_entry(struc
if (j != 0)
return;
if (!prev_desc && !desc->more)
- page->private = (unsigned long)desc->shadow_ptes[0];
+ set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
else
if (prev_desc)
prev_desc->more = desc->more;
else
- page->private = (unsigned long)desc->more | 1;
+ set_page_private(page,(unsigned long)desc->more | 1);
mmu_free_rmap_desc(vcpu, desc);
}

@@ -356,20 +356,20 @@ static void rmap_remove(struct kvm_vcpu
if (!is_rmap_pte(*spte))
return;
page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
- if (!page->private) {
+ if (!page_private(page)) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
- } else if (!(page->private & 1)) {
+ } else if (!(page_private(page) & 1)) {
rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
- if ((u64 *)page->private != spte) {
+ if ((u64 *)page_private(page) != spte) {
printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
spte, *spte);
BUG();
}
- page->private = 0;
+ set_page_private(page,0);
} else {
rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+ desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
prev_desc = NULL;
while (desc) {
for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
@@ -398,11 +398,11 @@ static void rmap_write_protect(struct kv
BUG_ON(!slot);
page = gfn_to_page(slot, gfn);

- while (page->private) {
- if (!(page->private & 1))
- spte = (u64 *)page->private;
+ while (page_private(page)) {
+ if (!(page_private(page) & 1))
+ spte = (u64 *)page_private(page);
else {
- desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+ desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
spte = desc->shadow_ptes[0];
}
BUG_ON(!spte);
@@ -1218,7 +1218,7 @@ static int alloc_mmu_pages(struct kvm_vc
INIT_LIST_HEAD(&page_header->link);
if ((page = alloc_page(GFP_KERNEL)) == NULL)
goto error_1;
- page->private = (unsigned long)page_header;
+ set_page_private(page, (unsigned long)page_header);
page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
list_add(&page_header->link, &vcpu->free_pages);
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -670,7 +670,7 @@ raced:
| __GFP_ZERO);
if (!new.phys_mem[i])
goto out_free;
- new.phys_mem[i]->private = 0;
+ set_page_private(new.phys_mem[i],0);
}
}

Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -523,7 +523,7 @@ static inline struct kvm_mmu_page *page_
{
struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);

- return (struct kvm_mmu_page *)page->private;
+ return (struct kvm_mmu_page *)page_private(page);
}

static inline u16 read_fs(void)

2007-02-19 10:27:54

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 7/13] KVM: add MSR based hypercall API

From: Ingo Molnar <[email protected]>

This adds a special MSR based hypercall API to KVM. This is to be
used by paravirtual kernels and virtual drivers.

Signed-off-by: Ingo Molnar <[email protected]>
Signed-off-by: Avi Kivity <[email protected]>

Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -1204,6 +1204,73 @@ void realmode_set_cr(struct kvm_vcpu *vc
}
}

+/*
+ * Register the para guest with the host:
+ */
+static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
+{
+ struct kvm_vcpu_para_state *para_state;
+ hpa_t para_state_hpa, hypercall_hpa;
+ struct page *para_state_page;
+ unsigned char *hypercall;
+ gpa_t hypercall_gpa;
+
+ printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
+ printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
+
+ /*
+ * Needs to be page aligned:
+ */
+ if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
+ goto err_gp;
+
+ para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
+ printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
+ if (is_error_hpa(para_state_hpa))
+ goto err_gp;
+
+ para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
+ para_state = kmap_atomic(para_state_page, KM_USER0);
+
+ printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version);
+ printk(KERN_DEBUG ".... size: %d\n", para_state->size);
+
+ para_state->host_version = KVM_PARA_API_VERSION;
+ /*
+ * We cannot support guests that try to register themselves
+ * with a newer API version than the host supports:
+ */
+ if (para_state->guest_version > KVM_PARA_API_VERSION) {
+ para_state->ret = -KVM_EINVAL;
+ goto err_kunmap_skip;
+ }
+
+ hypercall_gpa = para_state->hypercall_gpa;
+ hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
+ printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
+ if (is_error_hpa(hypercall_hpa)) {
+ para_state->ret = -KVM_EINVAL;
+ goto err_kunmap_skip;
+ }
+
+ printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
+ vcpu->para_state_page = para_state_page;
+ vcpu->para_state_gpa = para_state_gpa;
+ vcpu->hypercall_gpa = hypercall_gpa;
+
+ hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
+ KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
+ kvm_arch_ops->patch_hypercall(vcpu, hypercall);
+ kunmap_atomic(hypercall, KM_USER1);
+
+ para_state->ret = 0;
+err_kunmap_skip:
+ kunmap_atomic(para_state, KM_USER0);
+ return 0;
+err_gp:
+ return 1;
+}
+
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
u64 data;
@@ -1312,6 +1379,12 @@ int kvm_set_msr_common(struct kvm_vcpu *
case MSR_IA32_MISC_ENABLE:
vcpu->ia32_misc_enable_msr = data;
break;
+ /*
+ * This is the 'probe whether the host is KVM' logic:
+ */
+ case MSR_KVM_API_MAGIC:
+ return vcpu_register_para(vcpu, data);
+
default:
printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
return 1;
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -14,6 +14,7 @@

#include "vmx.h"
#include <linux/kvm.h>
+#include <linux/kvm_para.h>

#define CR0_PE_MASK (1ULL << 0)
#define CR0_TS_MASK (1ULL << 3)
@@ -237,6 +238,9 @@ struct kvm_vcpu {
unsigned long cr0;
unsigned long cr2;
unsigned long cr3;
+ gpa_t para_state_gpa;
+ struct page *para_state_page;
+ gpa_t hypercall_gpa;
unsigned long cr4;
unsigned long cr8;
u64 pdptrs[4]; /* pae */
@@ -382,6 +386,8 @@ struct kvm_arch_ops {
int (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
int (*vcpu_setup)(struct kvm_vcpu *vcpu);
void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+ void (*patch_hypercall)(struct kvm_vcpu *vcpu,
+ unsigned char *hypercall_addr);
};

extern struct kvm_stat kvm_stat;
Index: linux-2.6/include/linux/kvm_para.h
===================================================================
--- /dev/null
+++ linux-2.6/include/linux/kvm_para.h
@@ -0,0 +1,55 @@
+#ifndef __LINUX_KVM_PARA_H
+#define __LINUX_KVM_PARA_H
+
+/*
+ * Guest OS interface for KVM paravirtualization
+ *
+ * Note: this interface is totally experimental, and is certain to change
+ * as we make progress.
+ */
+
+/*
+ * Per-VCPU descriptor area shared between guest and host. Writable to
+ * both guest and host. Registered with the host by the guest when
+ * a guest acknowledges paravirtual mode.
+ *
+ * NOTE: all addresses are guest-physical addresses (gpa), to make it
+ * easier for the hypervisor to map between the various addresses.
+ */
+struct kvm_vcpu_para_state {
+ /*
+ * API version information for compatibility. If there's any support
+ * mismatch (too old host trying to execute too new guest) then
+ * the host will deny entry into paravirtual mode. Any other
+ * combination (new host + old guest and new host + new guest)
+ * is supposed to work - new host versions will support all old
+ * guest API versions.
+ */
+ u32 guest_version;
+ u32 host_version;
+ u32 size;
+ u32 ret;
+
+ /*
+ * The address of the vm exit instruction (VMCALL or VMMCALL),
+ * which the host will patch according to the CPU model the
+ * VM runs on:
+ */
+ u64 hypercall_gpa;
+
+} __attribute__ ((aligned(PAGE_SIZE)));
+
+#define KVM_PARA_API_VERSION 1
+
+/*
+ * This is used for an RDMSR's ECX parameter to probe for a KVM host.
+ * Hopefully no CPU vendor will use up this number. This is placed well
+ * out of way of the typical space occupied by CPU vendors' MSR indices,
+ * and we think (or at least hope) it wont be occupied in the future
+ * either.
+ */
+#define MSR_KVM_API_MAGIC 0x87655678
+
+#define KVM_EINVAL 1
+
+#endif
Index: linux-2.6/drivers/kvm/svm.c
===================================================================
--- linux-2.6.orig/drivers/kvm/svm.c
+++ linux-2.6/drivers/kvm/svm.c
@@ -1669,6 +1669,18 @@ static int is_disabled(void)
return 0;
}

+static void
+svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+ /*
+ * Patch in the VMMCALL instruction:
+ */
+ hypercall[0] = 0x0f;
+ hypercall[1] = 0x01;
+ hypercall[2] = 0xd9;
+ hypercall[3] = 0xc3;
+}
+
static struct kvm_arch_ops svm_arch_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -1717,6 +1729,7 @@ static struct kvm_arch_ops svm_arch_ops
.run = svm_vcpu_run,
.skip_emulated_instruction = skip_emulated_instruction,
.vcpu_setup = svm_vcpu_setup,
+ .patch_hypercall = svm_patch_hypercall,
};

static int __init svm_init(void)
Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c
+++ linux-2.6/drivers/kvm/vmx.c
@@ -1469,6 +1469,18 @@ static int handle_io(struct kvm_vcpu *vc
return 0;
}

+static void
+vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+ /*
+ * Patch in the VMCALL instruction:
+ */
+ hypercall[0] = 0x0f;
+ hypercall[1] = 0x01;
+ hypercall[2] = 0xc1;
+ hypercall[3] = 0xc3;
+}
+
static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
u64 exit_qualification;
@@ -2064,6 +2076,7 @@ static struct kvm_arch_ops vmx_arch_ops
.run = vmx_vcpu_run,
.skip_emulated_instruction = skip_emulated_instruction,
.vcpu_setup = vmx_vcpu_setup,
+ .patch_hypercall = vmx_patch_hypercall,
};

static int __init vmx_init(void)

2007-02-19 10:28:54

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 8/13] KVM: Add host hypercall support for vmx

From: Ingo Molnar <[email protected]>

Signed-off-by: Avi Kivity <[email protected]>

Index: linux-2.6/include/linux/kvm_para.h
===================================================================
--- linux-2.6.orig/include/linux/kvm_para.h
+++ linux-2.6/include/linux/kvm_para.h
@@ -52,4 +52,22 @@ struct kvm_vcpu_para_state {

#define KVM_EINVAL 1

+/*
+ * Hypercall calling convention:
+ *
+ * Each hypercall may have 0-6 parameters.
+ *
+ * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1
+ *
+ * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention
+ * order: RDI, RSI, RDX, RCX, R8, R9.
+ *
+ * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP.
+ * (the first 3 are according to the gcc regparm calling convention)
+ *
+ * No registers are clobbered by the hypercall, except that the
+ * return value is in RAX.
+ */
+#define __NR_hypercalls 0
+
#endif
Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c
+++ linux-2.6/drivers/kvm/vmx.c
@@ -1657,6 +1657,20 @@ static int handle_halt(struct kvm_vcpu *
return 0;
}

+static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ printk(KERN_DEBUG "got vmcall at RIP %08lx\n", vmcs_readl(GUEST_RIP));
+ printk(KERN_DEBUG "vmcall params: %08lx, %08lx, %08lx, %08lx\n",
+ vcpu->regs[VCPU_REGS_RAX],
+ vcpu->regs[VCPU_REGS_RCX],
+ vcpu->regs[VCPU_REGS_RDX],
+ vcpu->regs[VCPU_REGS_RBP]);
+ vcpu->regs[VCPU_REGS_RAX] = 0;
+ vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP)+3);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -1675,6 +1689,7 @@ static int (*kvm_vmx_exit_handlers[])(st
[EXIT_REASON_MSR_WRITE] = handle_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_VMCALL] = handle_vmcall,
};

static const int kvm_vmx_max_exit_handlers =

2007-02-19 10:29:54

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 9/13] KVM: Add hypercall host support for svm

Signed-off-by: Avi Kivity <[email protected]>

Index: linux-2.6/drivers/kvm/svm.c
===================================================================
--- linux-2.6.orig/drivers/kvm/svm.c
+++ linux-2.6/drivers/kvm/svm.c
@@ -1076,6 +1076,20 @@ static int halt_interception(struct kvm_
return 0;
}

+static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ printk(KERN_DEBUG "got vmmcall at RIP %08llx\n",
+ vcpu->svm->vmcb->save.rip);
+ printk(KERN_DEBUG "vmmcall params: %08llx, %08lx, %08lx, %08lx\n",
+ vcpu->svm->vmcb->save.rax,
+ vcpu->regs[VCPU_REGS_RCX],
+ vcpu->regs[VCPU_REGS_RDX],
+ vcpu->regs[VCPU_REGS_RBP]);
+ vcpu->svm->vmcb->save.rax = 0;
+ vcpu->svm->vmcb->save.rip += 3;
+ return 1;
+}
+
static int invalid_op_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
inject_ud(vcpu);
@@ -1276,7 +1290,7 @@ static int (*svm_exit_handlers[])(struct
[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
[SVM_EXIT_SHUTDOWN] = shutdown_interception,
[SVM_EXIT_VMRUN] = invalid_op_interception,
- [SVM_EXIT_VMMCALL] = invalid_op_interception,
+ [SVM_EXIT_VMMCALL] = vmmcall_interception,
[SVM_EXIT_VMLOAD] = invalid_op_interception,
[SVM_EXIT_VMSAVE] = invalid_op_interception,
[SVM_EXIT_STGI] = invalid_op_interception,

2007-02-19 10:30:55

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location

Signed-off-by: Avi Kivity <[email protected]>

Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -1138,6 +1138,42 @@ int emulate_instruction(struct kvm_vcpu
}
EXPORT_SYMBOL_GPL(emulate_instruction);

+int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+ unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
+
+ kvm_arch_ops->decache_regs(vcpu);
+ ret = -KVM_EINVAL;
+#ifdef CONFIG_X86_64
+ if (is_long_mode(vcpu)) {
+ nr = vcpu->regs[VCPU_REGS_RAX];
+ a0 = vcpu->regs[VCPU_REGS_RDI];
+ a1 = vcpu->regs[VCPU_REGS_RSI];
+ a2 = vcpu->regs[VCPU_REGS_RDX];
+ a3 = vcpu->regs[VCPU_REGS_RCX];
+ a4 = vcpu->regs[VCPU_REGS_R8];
+ a5 = vcpu->regs[VCPU_REGS_R9];
+ } else
+#endif
+ {
+ nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
+ a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
+ a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
+ a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
+ a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
+ a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
+ a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
+ }
+ switch (nr) {
+ default:
+ ;
+ }
+ vcpu->regs[VCPU_REGS_RAX] = ret;
+ kvm_arch_ops->cache_regs(vcpu);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_hypercall);
+
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
{
return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -482,6 +482,8 @@ void kvm_mmu_post_write(struct kvm_vcpu
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);

+int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
+
static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
Index: linux-2.6/drivers/kvm/svm.c
===================================================================
--- linux-2.6.orig/drivers/kvm/svm.c
+++ linux-2.6/drivers/kvm/svm.c
@@ -1078,16 +1078,8 @@ static int halt_interception(struct kvm_

static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- printk(KERN_DEBUG "got vmmcall at RIP %08llx\n",
- vcpu->svm->vmcb->save.rip);
- printk(KERN_DEBUG "vmmcall params: %08llx, %08lx, %08lx, %08lx\n",
- vcpu->svm->vmcb->save.rax,
- vcpu->regs[VCPU_REGS_RCX],
- vcpu->regs[VCPU_REGS_RDX],
- vcpu->regs[VCPU_REGS_RBP]);
- vcpu->svm->vmcb->save.rax = 0;
vcpu->svm->vmcb->save.rip += 3;
- return 1;
+ return kvm_hypercall(vcpu, kvm_run);
}

static int invalid_op_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c
+++ linux-2.6/drivers/kvm/vmx.c
@@ -1659,16 +1659,8 @@ static int handle_halt(struct kvm_vcpu *

static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- printk(KERN_DEBUG "got vmcall at RIP %08lx\n", vmcs_readl(GUEST_RIP));
- printk(KERN_DEBUG "vmcall params: %08lx, %08lx, %08lx, %08lx\n",
- vcpu->regs[VCPU_REGS_RAX],
- vcpu->regs[VCPU_REGS_RCX],
- vcpu->regs[VCPU_REGS_RDX],
- vcpu->regs[VCPU_REGS_RBP]);
- vcpu->regs[VCPU_REGS_RAX] = 0;
vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP)+3);
- return 1;
+ return kvm_hypercall(vcpu, kvm_run);
}

/*

2007-02-19 10:31:54

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 11/13] KVM: svm: init cr0 with the wp bit set

Signed-off-by: Avi Kivity <[email protected]>

Index: linux-2.6/drivers/kvm/svm.c
===================================================================
--- linux-2.6.orig/drivers/kvm/svm.c
+++ linux-2.6/drivers/kvm/svm.c
@@ -554,7 +554,7 @@ static void init_vmcb(struct vmcb *vmcb)
* cr0 val on cpu init should be 0x60000010, we enable cpu
* cache by default. the orderly way is to enable cache in bios.
*/
- save->cr0 = 0x00000010 | CR0_PG_MASK;
+ save->cr0 = 0x00000010 | CR0_PG_MASK | CR0_WP_MASK;
save->cr4 = CR4_PAE_MASK;
/* rdx = ?? */
}

2007-02-19 10:32:54

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 12/13] KVM: SVM: intercept SMI to handle it at host level

From: Joerg Roedel <[email protected]>

This patch changes the SVM code to intercept SMIs and handle it
outside the guest.

Signed-off-by: Joerg Roedel <[email protected]>
Signed-off-by: Avi Kivity <[email protected]>

Index: linux-2.6/drivers/kvm/svm.c
===================================================================
--- linux-2.6.orig/drivers/kvm/svm.c
+++ linux-2.6/drivers/kvm/svm.c
@@ -486,6 +486,7 @@ static void init_vmcb(struct vmcb *vmcb)

control->intercept = (1ULL << INTERCEPT_INTR) |
(1ULL << INTERCEPT_NMI) |
+ (1ULL << INTERCEPT_SMI) |
/*
* selective cr0 intercept bug?
* 0: 0f 22 d8 mov %eax,%cr3

2007-02-19 10:33:54

by Avi Kivity

[permalink] [raw]
Subject: [PATCH 13/13] KVM: More 0 -> NULL conversions

Signed-off-by: Avi Kivity <[email protected]>

Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -2229,13 +2229,13 @@ static void kvm_exit_debug(void)
static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
decache_vcpus_on_cpu(raw_smp_processor_id());
- on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
+ on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
return 0;
}

static int kvm_resume(struct sys_device *dev)
{
- on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1);
+ on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
return 0;
}

2007-02-22 09:55:17

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location

On Mon 2007-02-19 10:30:52, Avi Kivity wrote:
> Signed-off-by: Avi Kivity <[email protected]>

changelog?

> + switch (nr) {
> + default:
> + ;
> + }

Eh?

--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

2007-02-22 10:14:59

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location

Pavel Machek wrote:
> On Mon 2007-02-19 10:30:52, Avi Kivity wrote:
>
>> Signed-off-by: Avi Kivity <[email protected]>
>>
>
> changelog?
>
>

Well, I can't think of anything to add beyond $subject. The patch adds
calls from the arch-dependent hypercall handlers to a new arch
independent function.


>> + switch (nr) {
>> + default:
>> + ;
>> + }
>>
>
> Eh?
>
>

No hypercalls defined yet.


--
error compiling committee.c: too many arguments to function

2007-02-22 10:18:04

by Dor Laor

[permalink] [raw]
Subject: RE: [kvm-devel] [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location

>
>Pavel Machek wrote:
>> On Mon 2007-02-19 10:30:52, Avi Kivity wrote:
>>
>>> Signed-off-by: Avi Kivity <[email protected]>
>>>
>>
>> changelog?
>>
>>
>
>Well, I can't think of anything to add beyond $subject. The patch adds
>calls from the arch-dependent hypercall handlers to a new arch
>independent function.
>
>
>>> + switch (nr) {
>>> + default:
>>> + ;
>>> + }
>>>
>>
>> Eh?
>>
>>
>
>No hypercalls defined yet.
>

I have Ingo's network PV hypercalls to commit in my piplien.
Till then we can just add the test hypercall:
case __NR_hypercall_test:
printk(KERN_DEBUG "%s __NR_hypercall_test\n",
__FUNCTION__);
ret = 0x5a5a;
break;
default:
BUG();

>
>--
>error compiling committee.c: too many arguments to function
>
>
>-----------------------------------------------------------------------
--
>Take Surveys. Earn Cash. Influence the Future of IT
>Join SourceForge.net's Techsay panel and you'll get the chance to share
>your
>opinions on IT & business topics through brief surveys-and earn cash
>http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVD
EV
>_______________________________________________
>kvm-devel mailing list
>[email protected]
>https://lists.sourceforge.net/lists/listinfo/kvm-devel

2007-02-22 10:22:59

by Joerg Roedel

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location

On Thu, Feb 22, 2007 at 02:17:54AM -0800, Dor Laor wrote:
> I have Ingo's network PV hypercalls to commit in my piplien.
> Till then we can just add the test hypercall:
> case __NR_hypercall_test:
> printk(KERN_DEBUG "%s __NR_hypercall_test\n",
> __FUNCTION__);
> ret = 0x5a5a;
> break;
> default:
> BUG();

Is it a good idea to call BUG() on an undefined hypercall? I assume its
better to inform the guest that is has issued an illegal call...

Joerg

--
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG


2007-02-22 10:35:12

by Arjan van de Ven

[permalink] [raw]
Subject: RE: [kvm-devel] [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location


> >
>
> I have Ingo's network PV hypercalls to commit in my piplien.

I have 5 or so pending as well :)
can't wait for the infrastructure to be there so that it's relatively
easy to add my paravirt block driver

2007-02-22 10:38:29

by Avi Kivity

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location

Joerg Roedel wrote:
> On Thu, Feb 22, 2007 at 02:17:54AM -0800, Dor Laor wrote:
>
>> I have Ingo's network PV hypercalls to commit in my piplien.
>> Till then we can just add the test hypercall:
>> case __NR_hypercall_test:
>> printk(KERN_DEBUG "%s __NR_hypercall_test\n",
>> __FUNCTION__);
>> ret = 0x5a5a;
>> break;
>> default:
>> BUG();
>>
>
> Is it a good idea to call BUG() on an undefined hypercall? I assume its
> better to inform the guest that is has issued an illegal call...
>
>

The code returns -KVM_EINVAL if the hypercall is not handled (and BUG()
on the host as a response to something controlled by a guest is not a
good idea).


--
error compiling committee.c: too many arguments to function

2007-02-22 10:41:39

by Avi Kivity

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location

Arjan van de Ven wrote:
>> I have Ingo's network PV hypercalls to commit in my piplien.
>>
>
> I have 5 or so pending as well :)
> can't wait for the infrastructure to be there so that it's relatively
> easy to add my paravirt block driver
>

I can't wait for your pv block driver :)

What do you think is missing? My list has:

- registration of hypercall handlers from modules
- execution of hypercall handlers outside vcpu_load() (so they are
preemptible and sleepable)
- passing unhandled hypercalls to userspace for qemu-based devices

Please do post the patches to kvm-devel, even if they aren't ready, so
we can get a taste of what's coming.

--
error compiling committee.c: too many arguments to function

2007-02-22 11:01:32

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location

On Thu, 2007-02-22 at 12:40 +0200, Avi Kivity wrote:
> Arjan van de Ven wrote:
> >> I have Ingo's network PV hypercalls to commit in my piplien.
> >>
> >
> > I have 5 or so pending as well :)
> > can't wait for the infrastructure to be there so that it's relatively
> > easy to add my paravirt block driver
> >
>
> I can't wait for your pv block driver :)
>
> What do you think is missing? My list has:
>
> - registration of hypercall handlers from module

optional I think, but yeah easier for the user

> - execution of hypercall handlers outside vcpu_load() (so they are
> preemptible and sleepable)

I don't need this; most of my hypercalls are non-blocking. The ones that
are can already undo the load themselves, no big deal.

> - passing unhandled hypercalls to userspace for qemu-based devices

hm could do I suppose

One thing I'd like to see is some way to do batched hypercalls. I don't
quite know how this will work in general, but let me explain the
scenario:
The guest submits a bunch of disk IO requests into a submit queue.
The host gets a hypercall and goes to process the submit queue
While this is processing, the guest submits more IO
The guest would here do another hypercall..

.. but what could be done is have the host poll at the end of it's scan
of the queue if there's more, and while the host is scanning, just
disable the hypercall the guest would make. So that if there is a
"submit while scanning/processing" going on, no need for more
hypercalls.

(Otoh... the current situation isn't all that bad, there's one hypercall
for an entire batch of IO's, and the blocklayer isn't all that bad at
giving us nice large batches)



--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org

2007-02-22 13:05:15

by Avi Kivity

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location

Arjan van de Ven wrote:

> - execution of hypercall handlers outside vcpu_load() (so they are
> > preemptible and sleepable)
>
>
> I don't need this; most of my hypercalls are non-blocking. The ones that
> are can already undo the load themselves, no big deal.
>

I don't really like exposing the vcpu_load()/vcpu_put() interfaces.

On the other hand, sleeping in a hypercall means the guest vcpu is not
doing any processing. Maybe we should declare hypercall handlers atomic
and let them use a workqueue if they can't be atomic.

On the third hand, sleeping in a hypercall should be a rare event, so
forcing hypercall handlers to handle the sleeping case introduces a lot
of complexity for marginal gain.

>> - passing unhandled hypercalls to userspace for qemu-based devices
>>
>
> hm could do I suppose
>

One user would be the opengl guest to host tunneling driver/device
that's being developed for qemu.

> One thing I'd like to see is some way to do batched hypercalls. I don't
> quite know how this will work in general, but let me explain the
> scenario:
> The guest submits a bunch of disk IO requests into a submit queue.
> The host gets a hypercall and goes to process the submit queue
> While this is processing, the guest submits more IO
> The guest would here do another hypercall..
>
> .. but what could be done is have the host poll at the end of it's scan
> of the queue if there's more, and while the host is scanning, just
> disable the hypercall the guest would make. So that if there is a
> "submit while scanning/processing" going on, no need for more
> hypercalls.
>

Xen does exactly that. It's actually outside the scope of hypercalls,
and more related to the protocol between guest and host. Not that it
can't be made generic (and the Xen rings are).

Maybe we should just copy the code.

> (Otoh... the current situation isn't all that bad, there's one hypercall
> for an entire batch of IO's, and the blocklayer isn't all that bad at
> giving us nice large batches)
>

True. I expect it's more necessary with networking.

Somthing else that came up in a conversation with Dor: the need for a
clean way to raise a guest interrupt. The guest may be sleeping in
userspace, scheduled out, or running on another cpu (and requiring an
ipi to get it out of guest mode).

Right now I'm thinking about using the signal machinery since it appears
to do exactly the right thing.

--
error compiling committee.c: too many arguments to function

2007-02-22 13:12:15

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location


> Somthing else that came up in a conversation with Dor: the need for a
> clean way to raise a guest interrupt. The guest may be sleeping in
> userspace, scheduled out, or running on another cpu (and requiring an
> ipi to get it out of guest mode).

yeah it'd be nice if I could just call a function for it rather than
poking into kvm internals ;)

> Right now I'm thinking about using the signal machinery since it appears
> to do exactly the right thing.

signals are *expensive* though.

If you design an interrupt interface, it'd rock if you could make it
such that it is "raise <this> interrupt within <x> miliseconds from
now", rather than making it mostly synchronous. That way irq mitigation
becomes part of the interface rather than having to duplicate it all
over the virtual drivers...



--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org

2007-02-22 13:29:54

by Avi Kivity

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location

Arjan van de Ven wrote:
>> Somthing else that came up in a conversation with Dor: the need for a
>> clean way to raise a guest interrupt. The guest may be sleeping in
>> userspace, scheduled out, or running on another cpu (and requiring an
>> ipi to get it out of guest mode).
>>
>
> yeah it'd be nice if I could just call a function for it rather than
> poking into kvm internals ;)
>
>

Sure. Please report all inconveniences (they're really bugs) so we can
fix them.

Poking at kvm internals means you waste your time learning them, and
later we can't change them.


>> Right now I'm thinking about using the signal machinery since it appears
>> to do exactly the right thing.
>>
>
> signals are *expensive* though.
>
>

I think the expensive part of signals is userspace delivery. If they
are always blocked in userspace, they become just another IPC channel.

I plan to add a signal mask to KVM_RUN a la pselect() so that userspace
can dequeue signals instead of using a signal handler.


> If you design an interrupt interface, it'd rock if you could make it
> such that it is "raise <this> interrupt within <x> miliseconds from
> now", rather than making it mostly synchronous. That way irq mitigation
> becomes part of the interface rather than having to duplicate it all
> over the virtual drivers...
>

Can't it be done by a helper function using a timer and a signal (or
whatever mechanism we use to wake up vcpus)?

--
error compiling committee.c: too many arguments to function

2007-02-22 13:30:56

by Dor Laor

[permalink] [raw]
Subject: RE: [kvm-devel] [PATCH 10/13] KVM: Wire up hypercall handlers toa central arch-independent location

>
>
>> Somthing else that came up in a conversation with Dor: the need for a
>> clean way to raise a guest interrupt. The guest may be sleeping in
>> userspace, scheduled out, or running on another cpu (and requiring an
>> ipi to get it out of guest mode).
>
>yeah it'd be nice if I could just call a function for it rather than
>poking into kvm internals ;)
>
>> Right now I'm thinking about using the signal machinery since it
appears
>> to do exactly the right thing.
>
>signals are *expensive* though.
>
>If you design an interrupt interface, it'd rock if you could make it
>such that it is "raise <this> interrupt within <x> miliseconds from
>now", rather than making it mostly synchronous. That way irq mitigation
>becomes part of the interface rather than having to duplicate it all
>over the virtual drivers...

Why do you need to raise an interrupt within a timeout?
I thought on just asking for a synchronous, as-fast-as-you-can-get
interrupt. If you need an interrupt that should pop within some
milliseconds you can set a timer.

>
>
>
>--
>if you want to mail me at work (you don't), use arjan (at)
linux.intel.com
>Test the interaction between Linux and your BIOS via
>http://www.linuxfirmwarekit.org

2007-02-22 14:09:47

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location


> Can't it be done by a helper function using a timer and a signal (or
> whatever mechanism we use to wake up vcpus)?

one could do that but it's not optimal; if the process DOES get waken up
earlier, it should take the interrupt then immediately, so that it
doesn't have to wake up again when the timer fires.

(in fact it would be nice if the guest could somehow poll at the irq
mask at waking from idle, so that it wouldn't need a vmexit/entry for
every such interrupt, but just do the right thing for all pending work)

In addition, yes it'll be a helper function, but since all drivers will
want the functionality its probably best off in the generic code


--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org

2007-02-22 14:21:52

by Avi Kivity

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location

Arjan van de Ven wrote:
>> Can't it be done by a helper function using a timer and a signal (or
>> whatever mechanism we use to wake up vcpus)?
>>
>
> one could do that but it's not optimal; if the process DOES get waken up
> earlier, it should take the interrupt then immediately, so that it
> doesn't have to wake up again when the timer fires.
>
> (in fact it would be nice if the guest could somehow poll at the irq
> mask at waking from idle, so that it wouldn't need a vmexit/entry for
> every such interrupt, but just do the right thing for all pending work)
>
> In addition, yes it'll be a helper function, but since all drivers will
> want the functionality its probably best off in the generic code
>
>

That's what Xen does (I'm not sure about the timer, but they do avoid
unnecessary interrupts). As you can have memory shared between the
guest and host, it's quite simple:

0. host puts data in ring
1. host sets interrupt timer
2. guest wakes up for unrelated reasons
3. guest sees data in ring, consumes it, and bumps the consumer pointer
4. timer fires, host sees consumer == producer, no need to issue interrupt

They have that code abstracted out and reusable by multiple drivers.

--
error compiling committee.c: too many arguments to function

2007-02-23 14:55:55

by Anthony Liguori

[permalink] [raw]
Subject: Re: [kvm-devel] [PATCH 10/13] KVM: Wire up hypercall handlers to a central arch-independent location

Avi Kivity wrote:
> Arjan van de Ven wrote:
>
>>> Can't it be done by a helper function using a timer and a signal (or
>>> whatever mechanism we use to wake up vcpus)?
>>>
>>>
>> one could do that but it's not optimal; if the process DOES get waken up
>> earlier, it should take the interrupt then immediately, so that it
>> doesn't have to wake up again when the timer fires.
>>
>> (in fact it would be nice if the guest could somehow poll at the irq
>> mask at waking from idle, so that it wouldn't need a vmexit/entry for
>> every such interrupt, but just do the right thing for all pending work)
>>
>> In addition, yes it'll be a helper function, but since all drivers will
>> want the functionality its probably best off in the generic code
>>
>>
>>
>
> That's what Xen does (I'm not sure about the timer, but they do avoid
> unnecessary interrupts). As you can have memory shared between the
> guest and host, it's quite simple:
>
> 0. host puts data in ring
> 1. host sets interrupt timer
> 2. guest wakes up for unrelated reasons
> 3. guest sees data in ring, consumes it, and bumps the consumer pointer
> 4. timer fires, host sees consumer == producer, no need to issue interrupt
>

It's not quite a timer. It's an asynchronous notification mechanism.
Guest sets a bit in the appropriate domain's shared_info and when the
domain gets scheduled again, it checks it's bits.

An easy way to simulate this in KVM would be to have the guest set a bit
in the paravirt page and then on the next trip down to userspace, we
check the bits and dispatch appropriately. We know we'll get down to
userspace at least for the next EINTR.

> They have that code abstracted out and reusable by multiple drivers.
>

Why don't we see how many drivers we get first. I have my doubts that a
paravirt disk driver will significantly out perform scsi emulation.

Regards,

Anthony Liguori