The following patchset allows a host with running virtual machines to be
suspended and, on at least a subset of the machines tested, resumed.
Note that this is orthogonal to suspending and resuming an individual
guest to a file.
A side effect of implementing suspend/resume is that cpu hotplug is now
supported. This should please the owners of big iron.
Andrew, please queue for 2.6.21.
--
error compiling committee.c: too many arguments to function
KVM wants the cpu hotplug notifications, both for cpu hotplug itself and, more
commonly, for host suspend/resume.
In order to avoid extensive #ifdefs, provide stubs when CONFIG_HOTPLUG_CPU is
not defined.
In all, we have four cases:
- UP: register and unregister stubbed out
- SMP+hotplug: full register and unregister
- SMP, no hotplug, core: register as __init, unregister stubbed
  (cpus are brought up during core initialization)
- SMP, no hotplug, module: register and unregister stubbed out
  (cpus cannot be brought up during module lifetime)
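By way of illustration (not part of the patch), a minimal, hypothetical
module-side caller: with the stubs in place it builds in all four
configurations without a single #ifdef at the call site. The callback and
module names below are made up; only register_cpu_notifier(),
unregister_cpu_notifier() and the notifier_block callback signature are taken
from the kernel API.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/cpu.h>

/* hypothetical example -- names are illustrative only */
static int example_cpu_callback(struct notifier_block *nb,
				unsigned long action, void *hcpu)
{
	/* react to CPU_UP_PREPARE / CPU_DEAD / CPU_UP_CANCELED here */
	return NOTIFY_OK;
}

static struct notifier_block example_cpu_nb = {
	.notifier_call = example_cpu_callback,
};

static int __init example_init(void)
{
	/* stubbed out to "return 0" on UP and on !HOTPLUG_CPU modular builds */
	return register_cpu_notifier(&example_cpu_nb);
}

static void __exit example_exit(void)
{
	/* a no-op unless CONFIG_HOTPLUG_CPU is enabled */
	unregister_cpu_notifier(&example_cpu_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");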
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/include/linux/cpu.h
===================================================================
--- linux-2.6.orig/include/linux/cpu.h
+++ linux-2.6/include/linux/cpu.h
@@ -49,10 +49,20 @@ struct notifier_block;
#ifdef CONFIG_SMP
/* Need to know about CPUs going up/down? */
-extern int register_cpu_notifier(struct notifier_block *nb);
#ifdef CONFIG_HOTPLUG_CPU
+extern int register_cpu_notifier(struct notifier_block *nb);
extern void unregister_cpu_notifier(struct notifier_block *nb);
#else
+
+#ifndef MODULE
+extern int register_cpu_notifier(struct notifier_block *nb);
+#else
+static inline int register_cpu_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+#endif
+
static inline void unregister_cpu_notifier(struct notifier_block *nb)
{
}
This will allow us to iterate over all vcpus and see which cpus they are
running on.
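To make the intent concrete, here is a rough sketch of the kind of walk this
enables (the helper name is made up; the real consumer is
decache_vcpus_on_cpu() in the cpu hotplug patch later in the series):

/* illustrative only: does any vcpu have state cached on 'cpu'? */
static int any_vcpu_cached_on(int cpu)
{
	struct kvm *vm;
	int i;

	spin_lock(&kvm_lock);
	list_for_each_entry(vm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i)
			if (vm->vcpus[i].cpu == cpu) {
				spin_unlock(&kvm_lock);
				return 1;
			}
	spin_unlock(&kvm_lock);
	return 0;
}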
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -41,6 +41,9 @@
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
+static spinlock_t kvm_lock = SPIN_LOCK_UNLOCKED;
+static struct list_head vm_list = LIST_HEAD_INIT(vm_list);
+
struct kvm_arch_ops *kvm_arch_ops;
struct kvm_stat kvm_stat;
EXPORT_SYMBOL_GPL(kvm_stat);
@@ -230,9 +233,13 @@ static int kvm_dev_open(struct inode *in
struct kvm_vcpu *vcpu = &kvm->vcpus[i];
mutex_init(&vcpu->mutex);
+ vcpu->cpu = -1;
vcpu->kvm = kvm;
vcpu->mmu.root_hpa = INVALID_PAGE;
INIT_LIST_HEAD(&vcpu->free_pages);
+ spin_lock(&kvm_lock);
+ list_add(&kvm->vm_list, &vm_list);
+ spin_unlock(&kvm_lock);
}
filp->private_data = kvm;
return 0;
@@ -292,6 +299,9 @@ static int kvm_dev_release(struct inode
{
struct kvm *kvm = filp->private_data;
+ spin_lock(&kvm_lock);
+ list_del(&kvm->vm_list);
+ spin_unlock(&kvm_lock);
kvm_free_vcpus(kvm);
kvm_free_physmem(kvm);
kfree(kvm);
@@ -546,7 +556,6 @@ static int kvm_dev_ioctl_create_vcpu(str
FX_IMAGE_ALIGN);
vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
- vcpu->cpu = -1; /* First load will set up TR */
r = kvm_arch_ops->vcpu_create(vcpu);
if (r < 0)
goto out_free_vcpus;
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -304,6 +304,7 @@ struct kvm {
int memory_config_version;
int busy;
unsigned long rmap_overflow;
+ struct list_head vm_list;
};
struct kvm_stat {
Like the inline code it replaces, this function decaches the VMCS from the cpu
it last executed on. In addition:
- vcpu_clear() works if the last cpu is also the cpu we're running on
- it is faster on larger SMP machines by virtue of using smp_call_function_single()
Includes fix from Ingo Molnar.
Signed-off-by: Ingo Molnar <[email protected]>
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c
+++ linux-2.6/drivers/kvm/vmx.c
@@ -125,6 +125,15 @@ static void __vcpu_clear(void *arg)
per_cpu(current_vmcs, cpu) = NULL;
}
+static void vcpu_clear(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->cpu != raw_smp_processor_id() && vcpu->cpu != -1)
+ smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, 0, 1);
+ else
+ __vcpu_clear(vcpu);
+ vcpu->launched = 0;
+}
+
static unsigned long vmcs_readl(unsigned long field)
{
unsigned long value;
@@ -202,10 +211,8 @@ static struct kvm_vcpu *vmx_vcpu_load(st
cpu = get_cpu();
- if (vcpu->cpu != cpu) {
- smp_call_function(__vcpu_clear, vcpu, 0, 1);
- vcpu->launched = 0;
- }
+ if (vcpu->cpu != cpu)
+ vcpu_clear(vcpu);
if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) {
u8 error;
On hotplug, we execute the hardware extension enable sequence.
On unplug, we decache any vcpus that last ran on the exiting cpu, and
execute the hardware extension disable sequence.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -34,6 +34,7 @@
#include <linux/highmem.h>
#include <linux/file.h>
#include <asm/desc.h>
+#include <linux/cpu.h>
#include "x86_emulate.h"
#include "segment_descriptor.h"
@@ -2038,6 +2039,64 @@ static struct notifier_block kvm_reboot_
.priority = 0,
};
+/*
+ * Make sure that a cpu that is being hot-unplugged does not have any vcpus
+ * cached on it.
+ */
+static void decache_vcpus_on_cpu(int cpu)
+{
+ struct kvm *vm;
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ spin_lock(&kvm_lock);
+ list_for_each_entry(vm, &vm_list, vm_list)
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ vcpu = &vm->vcpus[i];
+ /*
+ * If the vcpu is locked, then it is running on some
+ * other cpu and therefore it is not cached on the
+ * cpu in question.
+ *
+ * If it's not locked, check the last cpu it executed
+ * on.
+ */
+ if (mutex_trylock(&vcpu->mutex)) {
+ if (vcpu->cpu == cpu) {
+ kvm_arch_ops->vcpu_decache(vcpu);
+ vcpu->cpu = -1;
+ }
+ mutex_unlock(&vcpu->mutex);
+ }
+ }
+ spin_unlock(&kvm_lock);
+}
+
+static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
+ void *v)
+{
+ int cpu = (long)v;
+
+ switch (val) {
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ decache_vcpus_on_cpu(cpu);
+ smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
+ NULL, 0, 1);
+ break;
+ case CPU_UP_PREPARE:
+ smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
+ NULL, 0, 1);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kvm_cpu_notifier = {
+ .notifier_call = kvm_cpu_hotplug,
+ .priority = 20, /* must be > scheduler priority */
+};
+
static __init void kvm_init_debug(void)
{
struct kvm_stats_debugfs_item *p;
@@ -2084,6 +2143,9 @@ int kvm_init_arch(struct kvm_arch_ops *o
return r;
on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1);
+ r = register_cpu_notifier(&kvm_cpu_notifier);
+ if (r)
+ goto out_free_1;
register_reboot_notifier(&kvm_reboot_notifier);
kvm_chardev_ops.owner = module;
@@ -2098,6 +2160,8 @@ int kvm_init_arch(struct kvm_arch_ops *o
out_free:
unregister_reboot_notifier(&kvm_reboot_notifier);
+ unregister_cpu_notifier(&kvm_cpu_notifier);
+out_free_1:
on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
kvm_arch_ops->hardware_unsetup();
return r;
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -341,6 +341,7 @@ struct kvm_arch_ops {
struct kvm_vcpu *(*vcpu_load)(struct kvm_vcpu *vcpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
+ void (*vcpu_decache)(struct kvm_vcpu *vcpu);
int (*set_guest_debug)(struct kvm_vcpu *vcpu,
struct kvm_debug_guest *dbg);
Index: linux-2.6/drivers/kvm/svm.c
===================================================================
--- linux-2.6.orig/drivers/kvm/svm.c
+++ linux-2.6/drivers/kvm/svm.c
@@ -609,6 +609,10 @@ static void svm_vcpu_put(struct kvm_vcpu
put_cpu();
}
+static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
+{
+}
+
static void svm_cache_regs(struct kvm_vcpu *vcpu)
{
vcpu->regs[VCPU_REGS_RAX] = vcpu->svm->vmcb->save.rax;
@@ -1677,6 +1681,7 @@ static struct kvm_arch_ops svm_arch_ops
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
+ .vcpu_decache = svm_vcpu_decache,
.set_guest_debug = svm_guest_debug,
.get_msr = svm_get_msr,
Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c
+++ linux-2.6/drivers/kvm/vmx.c
@@ -250,6 +250,11 @@ static void vmx_vcpu_put(struct kvm_vcpu
put_cpu();
}
+static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
+{
+ vcpu_clear(vcpu);
+}
+
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
return vmcs_readl(GUEST_RFLAGS);
@@ -509,7 +514,7 @@ static __init int vmx_disabled_by_bios(v
return (msr & 5) == 1; /* locked but not enabled */
}
-static __init void hardware_enable(void *garbage)
+static void hardware_enable(void *garbage)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2021,6 +2026,7 @@ static struct kvm_arch_ops vmx_arch_ops
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
+ .vcpu_decache = vmx_vcpu_decache,
.set_guest_debug = set_guest_debug,
.get_msr = vmx_get_msr,
Add the necessary callbacks to suspend and resume a host running kvm. This
is just a repeat of the cpu hotplug/unplug work.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -34,6 +34,7 @@
#include <linux/highmem.h>
#include <linux/file.h>
#include <asm/desc.h>
+#include <linux/sysdev.h>
#include <linux/cpu.h>
#include "x86_emulate.h"
@@ -2116,6 +2117,30 @@ static void kvm_exit_debug(void)
debugfs_remove(debugfs_dir);
}
+static int kvm_suspend(struct sys_device *dev, pm_message_t state)
+{
+ decache_vcpus_on_cpu(raw_smp_processor_id());
+ on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
+ return 0;
+}
+
+static int kvm_resume(struct sys_device *dev)
+{
+ on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1);
+ return 0;
+}
+
+static struct sysdev_class kvm_sysdev_class = {
+ set_kset_name("kvm"),
+ .suspend = kvm_suspend,
+ .resume = kvm_resume,
+};
+
+static struct sys_device kvm_sysdev = {
+ .id = 0,
+ .cls = &kvm_sysdev_class,
+};
+
hpa_t bad_page_address;
int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
@@ -2148,6 +2173,14 @@ int kvm_init_arch(struct kvm_arch_ops *o
goto out_free_1;
register_reboot_notifier(&kvm_reboot_notifier);
+ r = sysdev_class_register(&kvm_sysdev_class);
+ if (r)
+ goto out_free_2;
+
+ r = sysdev_register(&kvm_sysdev);
+ if (r)
+ goto out_free_3;
+
kvm_chardev_ops.owner = module;
r = misc_register(&kvm_dev);
@@ -2159,6 +2192,10 @@ int kvm_init_arch(struct kvm_arch_ops *o
return r;
out_free:
+ sysdev_unregister(&kvm_sysdev);
+out_free_3:
+ sysdev_class_unregister(&kvm_sysdev_class);
+out_free_2:
unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_1:
@@ -2170,8 +2207,10 @@ out_free_1:
void kvm_exit_arch(void)
{
misc_deregister(&kvm_dev);
-
+ sysdev_unregister(&kvm_sysdev);
+ sysdev_class_unregister(&kvm_sysdev_class);
unregister_reboot_notifier(&kvm_reboot_notifier);
+ unregister_cpu_notifier(&kvm_cpu_notifier);
on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
kvm_arch_ops->hardware_unsetup();
kvm_arch_ops = NULL;
Hi.
On Tue, 2007-01-30 at 14:57 +0000, Avi Kivity wrote:
> Add the necessary callbacks to suspend and resume a host running kvm. This
> is just a repeat of the cpu hotplug/unplug work.
>
> Signed-off-by: Avi Kivity <[email protected]>
Maybe it's just a lack of understanding, but I'm wondering if this patch
is necessary - we already hot-unplug secondary cpus prior to suspending
devices.
Regards,
Nigel
On Tuesday, 30 January 2007 22:20, Nigel Cunningham wrote:
> Hi.
>
> On Tue, 2007-01-30 at 14:57 +0000, Avi Kivity wrote:
> > Add the necessary callbacks to suspend and resume a host running kvm. This
> > is just a repeat of the cpu hotplug/unplug work.
> >
> > Signed-off-by: Avi Kivity <[email protected]>
>
> Maybe it's just a lack of understanding, but I'm wondering if this patch
> is necessary - we already hot-unplug secondary cpus prior to suspending
> devices.
Not with the reordering patches that are in -mm, though. With these patches,
the nonboot CPUs are disabled after "regular" devices.
Greetings,
Rafael
--
If you don't have the time to read,
you don't have the time or the tools to write.
- Stephen King
Hi.
On Tue, 2007-01-30 at 23:19 +0100, Rafael J. Wysocki wrote:
> On Tuesday, 30 January 2007 22:20, Nigel Cunningham wrote:
> > Hi.
> >
> > On Tue, 2007-01-30 at 14:57 +0000, Avi Kivity wrote:
> > > Add the necessary callbacks to suspend and resume a host running kvm. This
> > > is just a repeat of the cpu hotplug/unplug work.
> > >
> > > Signed-off-by: Avi Kivity <[email protected]>
> >
> > Maybe it's just a lack of understanding, but I'm wondering if this patch
> > is necessary - we already hot-unplug secondary cpus prior to suspending
> > devices.
>
> Not with the reoredering patches that are in -mm, though. With these patches,
> the nonboot CPUs are disabled after "regular" devices.
Ah. Good point. Thanks! :)
Nigel
On Tue, 30 Jan 2007 14:56:16 -0000
Avi Kivity <[email protected]> wrote:
> +static void decache_vcpus_on_cpu(int cpu)
> +{
> + struct kvm *vm;
> + struct kvm_vcpu *vcpu;
> + int i;
> +
> + spin_lock(&kvm_lock);
> + list_for_each_entry(vm, &vm_list, vm_list)
> + for (i = 0; i < KVM_MAX_VCPUS; ++i) {
> + vcpu = &vm->vcpus[i];
> + /*
> + * If the vcpu is locked, then it is running on some
> + * other cpu and therefore it is not cached on the
> + * cpu in question.
> + *
> + * If it's not locked, check the last cpu it executed
> + * on.
> + */
> + if (mutex_trylock(&vcpu->mutex)) {
> + if (vcpu->cpu == cpu) {
> + kvm_arch_ops->vcpu_decache(vcpu);
> + vcpu->cpu = -1;
> + }
> + mutex_unlock(&vcpu->mutex);
> + }
> + }
> + spin_unlock(&kvm_lock);
> +}
The trylock is unpleasing. Perhaps kvm_lock should be a mutex or something?
* Andrew Morton <[email protected]> wrote:
> On Tue, 30 Jan 2007 14:56:16 -0000
> Avi Kivity <[email protected]> wrote:
>
> > +static void decache_vcpus_on_cpu(int cpu)
> > +{
> > + struct kvm *vm;
> > + struct kvm_vcpu *vcpu;
> > + int i;
> > +
> > + spin_lock(&kvm_lock);
> > + list_for_each_entry(vm, &vm_list, vm_list)
> > + for (i = 0; i < KVM_MAX_VCPUS; ++i) {
> > + vcpu = &vm->vcpus[i];
> > + /*
> > + * If the vcpu is locked, then it is running on some
> > + * other cpu and therefore it is not cached on the
> > + * cpu in question.
> > + *
> > + * If it's not locked, check the last cpu it executed
> > + * on.
> > + */
> > + if (mutex_trylock(&vcpu->mutex)) {
> > + if (vcpu->cpu == cpu) {
> > + kvm_arch_ops->vcpu_decache(vcpu);
> > + vcpu->cpu = -1;
> > + }
> > + mutex_unlock(&vcpu->mutex);
> > + }
> > + }
> > + spin_unlock(&kvm_lock);
> > +}
>
> The trylock is unpleasing. Perhaps kvm_lock should be a mutex or
> something?
This is a special case. The vcpu->mutex acts as a 'this vcpu is running
right now' flag as well - hence the trylock signals whether it is running
right now or not; if it's not running, we do not have to 'decache' it. But I
agree, and I already suggested to Avi to change kvm_lock to be a mutex -
but this won't change the trylock.
Ingo
Rafael J. Wysocki wrote:
> On Tuesday, 30 January 2007 22:20, Nigel Cunningham wrote:
>
>> Hi.
>>
>> On Tue, 2007-01-30 at 14:57 +0000, Avi Kivity wrote:
>>
>>> Add the necessary callbacks to suspend and resume a host running kvm. This
>>> is just a repeat of the cpu hotplug/unplug work.
>>>
>>> Signed-off-by: Avi Kivity <[email protected]>
>>>
>> Maybe it's just a lack of understanding, but I'm wondering if this patch
>> is necessary - we already hot-unplug secondary cpus prior to suspending
>> devices.
>>
>
>
The on_each_cpu() in the patch really means on_this_cpu().
> Not with the reoredering patches that are in -mm, though. With these patches,
> the nonboot CPUs are disabled after "regular" devices.
>
After that, it means on_each_cpu() again, and the hotplug patch becomes
unnecessary (except to support hotplug, of course).
Assuming I understand things correctly, which is not at all certain.
--
error compiling committee.c: too many arguments to function
Ingo Molnar wrote:
> * Andrew Morton <[email protected]> wrote:
>
>
>> On Tue, 30 Jan 2007 14:56:16 -0000
>> Avi Kivity <[email protected]> wrote:
>>
>>
>>> +static void decache_vcpus_on_cpu(int cpu)
>>> +{
>>> + struct kvm *vm;
>>> + struct kvm_vcpu *vcpu;
>>> + int i;
>>> +
>>> + spin_lock(&kvm_lock);
>>> + list_for_each_entry(vm, &vm_list, vm_list)
>>> + for (i = 0; i < KVM_MAX_VCPUS; ++i) {
>>> + vcpu = &vm->vcpus[i];
>>> + /*
>>> + * If the vcpu is locked, then it is running on some
>>> + * other cpu and therefore it is not cached on the
>>> + * cpu in question.
>>> + *
>>> + * If it's not locked, check the last cpu it executed
>>> + * on.
>>> + */
>>> + if (mutex_trylock(&vcpu->mutex)) {
>>> + if (vcpu->cpu == cpu) {
>>> + kvm_arch_ops->vcpu_decache(vcpu);
>>> + vcpu->cpu = -1;
>>> + }
>>> + mutex_unlock(&vcpu->mutex);
>>> + }
>>> + }
>>> + spin_unlock(&kvm_lock);
>>> +}
>>>
>> The trylock is unpleasing. Perhaps kvm_lock should be a mutex or
>> something?
>>
>
> This is a special case. The vcpu->mutex acts as a 'this vcpu is running
> right now' flag as well - hence the trylock signals whether it is running
> right now or not; if it's not running, we do not have to 'decache' it. But I
> agree, and I already suggested to Avi to change kvm_lock to be a mutex -
> but this won't change the trylock.
>
To elaborate a little: replacing mutex_trylock() with mutex_lock() will
cause unbounded latency as we wait for the vcpu to be descheduled. In
this case, we're only interested in descheduled vcpus, so there's no
need to wait.
kvm is a bit funny in how it likes to pin cpus.
--
error compiling committee.c: too many arguments to function