Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 23 +++++++++++++++--------
1 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fedac9d..15d2ace 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3116,9 +3116,22 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
.notifier_call = kvmclock_cpufreq_notifier
};
+static void kvm_timer_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ tsc_khz_ref = tsc_khz;
+ cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ }
+}
+
int kvm_arch_init(void *opaque)
{
- int r, cpu;
+ int r;
struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
if (kvm_x86_ops) {
@@ -3150,13 +3163,7 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
- for_each_possible_cpu(cpu)
- per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
- tsc_khz_ref = tsc_khz;
- cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
- CPUFREQ_TRANSITION_NOTIFIER);
- }
+ kvm_timer_init();
return 0;
--
1.6.4.4
The ref_freq and tsc_khz_ref variables are globals, not clearly protected by
any ordering or locking, and vulnerable to various startup races.
Instead, for variable-TSC machines, register the cpufreq notifier and get
the TSC frequency directly from the cpufreq machinery. Not only is it
always right, it is also perfectly accurate, as no error-prone measurement
is required. On such machines, also detect the frequency when bringing
a new CPU online; it isn't clear what frequency it will start with, and
it may not correspond to the reference.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 38 ++++++++++++++++++++++++++++----------
1 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 15d2ace..35082dd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -650,6 +650,19 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+static inline void kvm_get_cpu_khz(int cpu)
+{
+ unsigned int khz = cpufreq_get(cpu);
+ if (khz == 0) {
+ static int warn = 0;
+ if (warn++ < 10)
+ printk(KERN_ERR "kvm: could not get frequency for CPU "
+ "%d. Timers may be inaccurate\n", cpu);
+ khz = tsc_khz;
+ }
+ per_cpu(cpu_tsc_khz, cpu) = khz;
+}
+
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
struct timespec ts;
@@ -3061,9 +3074,6 @@ static void bounce_off(void *info)
/* nothing */
}
-static unsigned int ref_freq;
-static unsigned long tsc_khz_ref;
-
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -3071,15 +3081,15 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
-
- if (!ref_freq)
- ref_freq = freq->old;
+ unsigned long old_khz;
if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
return 0;
if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
return 0;
- per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+ old_khz = per_cpu(cpu_tsc_khz, freq->cpu);
+ per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(old_khz, freq->old,
+ freq->new);
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3120,12 +3130,18 @@ static void kvm_timer_init(void)
{
int cpu;
- for_each_possible_cpu(cpu)
- per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
- tsc_khz_ref = tsc_khz;
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
+ for_each_online_cpu(cpu)
+ kvm_get_cpu_khz(cpu);
+ } else {
+ for_each_possible_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ }
+ for_each_possible_cpu(cpu) {
+ printk(KERN_DEBUG "kvm: cpu %d = %lu khz\n",
+ cpu, per_cpu(cpu_tsc_khz, cpu));
}
}
@@ -4698,6 +4714,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
int kvm_arch_hardware_enable(void *garbage)
{
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ kvm_get_cpu_khz(raw_smp_processor_id());
return kvm_x86_ops->hardware_enable(garbage);
}
--
1.6.4.4
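For reference, the per-CPU update in the notifier above reduces to
new_khz = old_khz * freq->new / freq->old; cpufreq_scale() computes exactly
this with overflow-safe fixed-point arithmetic. A minimal stand-alone sketch
of the same computation (illustrative only; scale_tsc_khz() is not a kernel
function):

/* Illustrative stand-in for cpufreq_scale(old_khz, freq->old, freq->new). */
static unsigned long scale_tsc_khz(unsigned long old_khz,
                                   unsigned int old_freq,
                                   unsigned int new_freq)
{
        /* 64-bit intermediate so the khz * khz product cannot overflow. */
        return (unsigned long)(((unsigned long long)old_khz * new_freq)
                               / old_freq);
}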
Both VMX and SVM require per-cpu memory allocation, which is done at module
init time, for only online cpus. When bringing a new CPU online, we must
also allocate this structure. The method chosen to implement this is to
make the CPU online notifier available via a call to the arch code. This
allows memory allocation to be done smoothly, without any need to allocate
extra structures.
Note: CPU-up notifiers may call the KVM callback before the cpufreq callbacks.
This would cause the CPU frequency not to be detected (and it is not always
clear on non-constant-TSC platforms what the bring-up TSC rate will be, so the
guess of using tsc_khz could be wrong). So, we clear the rate to zero in such
a case and add logic to query it upon entry.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 2 ++
arch/x86/kvm/svm.c | 15 +++++++++++++--
arch/x86/kvm/vmx.c | 17 +++++++++++++++++
arch/x86/kvm/x86.c | 14 +++++++++++++-
include/linux/kvm_host.h | 6 ++++++
virt/kvm/kvm_main.c | 3 ++-
6 files changed, 53 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 299cc1b..b7dd14b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -459,6 +459,7 @@ struct descriptor_table {
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
+ int (*cpu_hotadd)(int cpu);
int (*hardware_enable)(void *dummy);
void (*hardware_disable)(void *dummy);
void (*check_processor_compatibility)(void *rtn);
@@ -791,6 +792,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
_ASM_PTR " 666b, 667b \n\t" \
".popsection"
+#define KVM_ARCH_WANT_HOTPLUG_NOTIFIER
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9a4daca..8f99d0c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -330,13 +330,13 @@ static int svm_hardware_enable(void *garbage)
return -EBUSY;
if (!has_svm()) {
- printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
+ printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", me);
return -EINVAL;
}
svm_data = per_cpu(svm_data, me);
if (!svm_data) {
- printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
+ printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
me);
return -EINVAL;
}
@@ -394,6 +394,16 @@ err_1:
}
+static __cpuinit int svm_cpu_hotadd(int cpu)
+{
+ struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
+
+ if (svm_data)
+ return 0;
+
+ return svm_cpu_init(cpu);
+}
+
static void set_msr_interception(u32 *msrpm, unsigned msr,
int read, int write)
{
@@ -2858,6 +2868,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.hardware_setup = svm_hardware_setup,
.hardware_unsetup = svm_hardware_unsetup,
.check_processor_compatibility = svm_check_processor_compat,
+ .cpu_hotadd = svm_cpu_hotadd,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3fe0d42..b8a8428 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1408,6 +1408,22 @@ static __exit void hardware_unsetup(void)
free_kvm_area();
}
+static __cpuinit int vmx_cpu_hotadd(int cpu)
+{
+ struct vmcs *vmcs;
+
+ if (per_cpu(vmxarea, cpu))
+ return 0;
+
+ vmcs = alloc_vmcs_cpu(cpu);
+ if (!vmcs)
+ return -ENOMEM;
+
+ per_cpu(vmxarea, cpu) = vmcs;
+
+ return 0;
+}
+
static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -3925,6 +3941,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.hardware_setup = hardware_setup,
.hardware_unsetup = hardware_unsetup,
.check_processor_compatibility = vmx_check_processor_compat,
+ .cpu_hotadd = vmx_cpu_hotadd,
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = report_flexpriority,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 35082dd..05aea42 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1339,6 +1339,8 @@ out:
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
kvm_x86_ops->vcpu_load(vcpu, cpu);
+ if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0))
+ kvm_get_cpu_khz(cpu);
kvm_request_guest_time_update(vcpu);
}
@@ -4712,10 +4714,20 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
return kvm_x86_ops->vcpu_reset(vcpu);
}
+int kvm_arch_cpu_hotadd(int cpu)
+{
+ return kvm_x86_ops->cpu_hotadd(cpu);
+}
+
int kvm_arch_hardware_enable(void *garbage)
{
+ /*
+ * Notifier callback chain may not have called cpufreq code
+ * yet, thus we must reset TSC khz to zero and recompute it
+ * before entering.
+ */
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
- kvm_get_cpu_khz(raw_smp_processor_id());
+ per_cpu(cpu_tsc_khz, raw_smp_processor_id()) = 0;
return kvm_x86_ops->hardware_enable(garbage);
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0bf9ee9..2f075c4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -345,6 +345,12 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
+#ifdef KVM_ARCH_WANT_HOTPLUG_NOTIFIER
+int kvm_arch_cpu_hotadd(int cpu);
+#else
+#define kvm_arch_cpu_hotadd(x) (0)
+#endif
+
int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
int kvm_arch_hardware_enable(void *garbage);
void kvm_arch_hardware_disable(void *garbage);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e27b7a9..1360db4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1734,7 +1734,8 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
case CPU_ONLINE:
printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
cpu);
- smp_call_function_single(cpu, hardware_enable, NULL, 1);
+ if (!kvm_arch_cpu_hotadd(cpu))
+ smp_call_function_single(cpu, hardware_enable, NULL, 1);
break;
}
return NOTIFY_OK;
--
1.6.4.4
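The lazy re-detection described in the note above boils down to two small
pieces; here they are condensed into hypothetical helpers for illustration
(kvm_get_cpu_khz() is the real helper from the previous patch):

/* At hot-add: the cpufreq notifier may not have run yet, so mark the
 * rate unknown rather than guessing tsc_khz. */
static void kvm_mark_cpu_khz_unknown(int cpu)
{
        per_cpu(cpu_tsc_khz, cpu) = 0;
}

/* At first vcpu_load() on that CPU: query the frequency lazily, once. */
static void kvm_refresh_cpu_khz(int cpu)
{
        if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0))
                kvm_get_cpu_khz(cpu);
}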
In the process of bringing down CPUs, the SVM / VMX structures associated
with those CPUs are not freed. This may cause leaks when unloading and
reloading the KVM module, as only the structures associated with online
CPUs are cleaned up.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/svm.c | 1 +
arch/x86/kvm/vmx.c | 7 +++++++
arch/x86/kvm/x86.c | 5 +++++
include/linux/kvm_host.h | 2 ++
virt/kvm/kvm_main.c | 2 ++
6 files changed, 18 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b7dd14b..91e02d3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -460,6 +460,7 @@ struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
int (*cpu_hotadd)(int cpu);
+ void (*cpu_hotremove)(int cpu);
int (*hardware_enable)(void *dummy);
void (*hardware_disable)(void *dummy);
void (*check_processor_compatibility)(void *rtn);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8f99d0c..7cf6c98 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2869,6 +2869,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.hardware_unsetup = svm_hardware_unsetup,
.check_processor_compatibility = svm_check_processor_compat,
.cpu_hotadd = svm_cpu_hotadd,
+ .cpu_hotremove = svm_cpu_uninit,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b8a8428..1e3e9fc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1424,6 +1424,12 @@ static __cpuinit int vmx_cpu_hotadd(int cpu)
return 0;
}
+static __cpuinit void vmx_cpu_hotremove(int cpu)
+{
+ free_vmcs(per_cpu(vmxarea, cpu));
+ per_cpu(vmxarea, cpu) = NULL;
+}
+
static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -3942,6 +3948,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.hardware_unsetup = hardware_unsetup,
.check_processor_compatibility = vmx_check_processor_compat,
.cpu_hotadd = vmx_cpu_hotadd,
+ .cpu_hotremove = vmx_cpu_hotremove,
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = report_flexpriority,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 05aea42..38ba4a6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4719,6 +4719,11 @@ int kvm_arch_cpu_hotadd(int cpu)
return kvm_x86_ops->cpu_hotadd(cpu);
}
+void kvm_arch_cpu_hotremove(int cpu)
+{
+ kvm_x86_ops->cpu_hotremove(cpu);
+}
+
int kvm_arch_hardware_enable(void *garbage)
{
/*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2f075c4..2c844f0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -347,8 +347,10 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
#ifdef KVM_ARCH_WANT_HOTPLUG_NOTIFIER
int kvm_arch_cpu_hotadd(int cpu);
+void kvm_arch_cpu_hotremove(int cpu);
#else
#define kvm_arch_cpu_hotadd(x) (0)
+#define kvm_arch_cpu_hotremove(x)
#endif
int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1360db4..bd21927 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1725,11 +1725,13 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
cpu);
hardware_disable(NULL);
+ kvm_arch_cpu_hotremove(cpu);
break;
case CPU_UP_CANCELED:
printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
cpu);
smp_call_function_single(cpu, hardware_disable, NULL, 1);
+ kvm_arch_cpu_hotremove(cpu);
break;
case CPU_ONLINE:
printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
--
1.6.4.4
For the non-hotplug case, the TSC speed should already be available, so there
is no need to reset it in kvm_arch_hardware_enable(); instead, clear
cpu_tsc_khz when bringing down a CPU so it is recomputed at the next bringup.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 12 +++++-------
1 files changed, 5 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 38ba4a6..f1470ce 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4721,18 +4721,16 @@ int kvm_arch_cpu_hotadd(int cpu)
void kvm_arch_cpu_hotremove(int cpu)
{
+ /*
+ * Reset TSC khz to zero so it is recomputed on bringup
+ */
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ per_cpu(cpu_tsc_khz, cpu) = 0;
kvm_x86_ops->cpu_hotremove(cpu);
}
int kvm_arch_hardware_enable(void *garbage)
{
- /*
- * Notifier callback chain may not have called cpufreq code
- * yet, thus we must reset TSC khz to zero and recompute it
- * before entering.
- */
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
- per_cpu(cpu_tsc_khz, raw_smp_processor_id()) = 0;
return kvm_x86_ops->hardware_enable(garbage);
}
--
1.6.4.4
The CPU frequency change callback provides the new TSC frequency for us, in
the same units (kHz), so there is no reason to do any math.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 5 +----
1 files changed, 1 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1470ce..c849f8f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3083,15 +3083,12 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
- unsigned long old_khz;
if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
return 0;
if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
return 0;
- old_khz = per_cpu(cpu_tsc_khz, freq->cpu);
- per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(old_khz, freq->old,
- freq->new);
+ per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
--
1.6.4.4
On Wed, Sep 23, 2009 at 05:29:01PM -1000, Zachary Amsden wrote:
> The ref_freq and tsc_khz_ref variables are globals, not clearly protected by
> any ordering or locking, and vulnerable to various startup races.
>
> Instead, for variable-TSC machines, register the cpufreq notifier and get
> the TSC frequency directly from the cpufreq machinery. Not only is it
> always right, it is also perfectly accurate, as no error-prone measurement
> is required. On such machines, also detect the frequency when bringing
> a new CPU online; it isn't clear what frequency it will start with, and
> it may not correspond to the reference.
>
> Signed-off-by: Zachary Amsden <[email protected]>
> ---
> arch/x86/kvm/x86.c | 38 ++++++++++++++++++++++++++++----------
> 1 files changed, 28 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 15d2ace..35082dd 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -650,6 +650,19 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
>
> static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
>
> +static inline void kvm_get_cpu_khz(int cpu)
> +{
> + unsigned int khz = cpufreq_get(cpu);
cpufreq_get does down_read, while kvm_arch_hardware_enable is called
either with a spinlock held or from interrupt context?
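(The reworked series below addresses this by switching to cpufreq_quick_get(),
which returns the cached frequency without sleeping. A sketch of that pattern,
where kvm_get_cpu_khz_atomic() is a hypothetical name and the fallback assumes
cpufreq_quick_get() returns 0 when no frequency is known:)

/* Safe from atomic context: cpufreq_quick_get() does not sleep. */
static void kvm_get_cpu_khz_atomic(int cpu)
{
        unsigned int khz = cpufreq_quick_get(cpu);      /* 0 if unknown */

        if (khz == 0)
                khz = tsc_khz;  /* fall back to the measured boot value */
        per_cpu(cpu_tsc_khz, cpu) = khz;
}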
On Wed, Sep 23, 2009 at 05:29:02PM -1000, Zachary Amsden wrote:
> Both VMX and SVM require per-cpu memory allocation, which is done at module
> init time, for only online cpus. When bringing a new CPU online, we must
> also allocate this structure. The method chosen to implement this is to
> make the CPU online notifier available via a call to the arch code. This
> allows memory allocation to be done smoothly, without any need to allocate
> extra structures.
>
> Note: CPU-up notifiers may call the KVM callback before the cpufreq callbacks.
> This would cause the CPU frequency not to be detected (and it is not always
> clear on non-constant-TSC platforms what the bring-up TSC rate will be, so the
> guess of using tsc_khz could be wrong). So, we clear the rate to zero in such
> a case and add logic to query it upon entry.
>
> Signed-off-by: Zachary Amsden <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 2 ++
> arch/x86/kvm/svm.c | 15 +++++++++++++--
> arch/x86/kvm/vmx.c | 17 +++++++++++++++++
> arch/x86/kvm/x86.c | 14 +++++++++++++-
> include/linux/kvm_host.h | 6 ++++++
> virt/kvm/kvm_main.c | 3 ++-
> 6 files changed, 53 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 299cc1b..b7dd14b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -459,6 +459,7 @@ struct descriptor_table {
> struct kvm_x86_ops {
> int (*cpu_has_kvm_support)(void); /* __init */
> int (*disabled_by_bios)(void); /* __init */
> + int (*cpu_hotadd)(int cpu);
> int (*hardware_enable)(void *dummy);
> void (*hardware_disable)(void *dummy);
> void (*check_processor_compatibility)(void *rtn);
> @@ -791,6 +792,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
> _ASM_PTR " 666b, 667b \n\t" \
> ".popsection"
>
> +#define KVM_ARCH_WANT_HOTPLUG_NOTIFIER
> #define KVM_ARCH_WANT_MMU_NOTIFIER
> int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
> int kvm_age_hva(struct kvm *kvm, unsigned long hva);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 9a4daca..8f99d0c 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -330,13 +330,13 @@ static int svm_hardware_enable(void *garbage)
> return -EBUSY;
>
> if (!has_svm()) {
> - printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
> + printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", me);
> return -EINVAL;
> }
> svm_data = per_cpu(svm_data, me);
>
> if (!svm_data) {
> - printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
> + printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
> me);
> return -EINVAL;
> }
> @@ -394,6 +394,16 @@ err_1:
>
> }
>
> +static __cpuinit int svm_cpu_hotadd(int cpu)
> +{
> + struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
> +
> + if (svm_data)
> + return 0;
> +
> + return svm_cpu_init(cpu);
> +}
> +
> static void set_msr_interception(u32 *msrpm, unsigned msr,
> int read, int write)
> {
> @@ -2858,6 +2868,7 @@ static struct kvm_x86_ops svm_x86_ops = {
> .hardware_setup = svm_hardware_setup,
> .hardware_unsetup = svm_hardware_unsetup,
> .check_processor_compatibility = svm_check_processor_compat,
> + .cpu_hotadd = svm_cpu_hotadd,
> .hardware_enable = svm_hardware_enable,
> .hardware_disable = svm_hardware_disable,
> .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 3fe0d42..b8a8428 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -1408,6 +1408,22 @@ static __exit void hardware_unsetup(void)
> free_kvm_area();
> }
>
> +static __cpuinit int vmx_cpu_hotadd(int cpu)
> +{
> + struct vmcs *vmcs;
> +
> + if (per_cpu(vmxarea, cpu))
> + return 0;
> +
> + vmcs = alloc_vmcs_cpu(cpu);
> + if (!vmcs)
> + return -ENOMEM;
> +
> + per_cpu(vmxarea, cpu) = vmcs;
> +
> + return 0;
> +}
Have to free in __cpuexit?
Is it too wasteful to allocate statically with DEFINE_PER_CPU_PAGE_ALIGNED?
On 09/24/2009 05:52 AM, Marcelo Tosatti wrote:
>
>> +static __cpuinit int vmx_cpu_hotadd(int cpu)
>> +{
>> + struct vmcs *vmcs;
>> +
>> + if (per_cpu(vmxarea, cpu))
>> + return 0;
>> +
>> + vmcs = alloc_vmcs_cpu(cpu);
>> + if (!vmcs)
>> + return -ENOMEM;
>> +
>> + per_cpu(vmxarea, cpu) = vmcs;
>> +
>> + return 0;
>> +}
>>
> Have to free in __cpuexit?
>
> Is it too wasteful to allocate statically with DEFINE_PER_CPU_PAGE_ALIGNED?
>
Unfortunately, I think it is. The VMX / SVM structures are quite large,
and we can have a lot of potential CPUs.
Zach
Simplified the patch series a bit and fixed some bugs noticed by Marcelo.
Axed the hot-remove notifier (was not needed), fixed a locking bug by
using cpufreq_quick_get, fixed another bug in kvm_cpu_hotplug that was
filtering out online notifications when KVM was loaded but not in use.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 23 +++++++++++++++--------
1 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fedac9d..15d2ace 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3116,9 +3116,22 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
.notifier_call = kvmclock_cpufreq_notifier
};
+static void kvm_timer_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ tsc_khz_ref = tsc_khz;
+ cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ }
+}
+
int kvm_arch_init(void *opaque)
{
- int r, cpu;
+ int r;
struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
if (kvm_x86_ops) {
@@ -3150,13 +3163,7 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
- for_each_possible_cpu(cpu)
- per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
- tsc_khz_ref = tsc_khz;
- cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
- CPUFREQ_TRANSITION_NOTIFIER);
- }
+ kvm_timer_init();
return 0;
--
1.6.4.4
The ref_freq and tsc_khz_ref variables are globals, not clearly protected by
any ordering or locking, and vulnerable to various startup races.
Instead, for variable-TSC machines, register the cpufreq notifier and get
the TSC frequency directly from the cpufreq machinery. Not only is it
always right, it is also perfectly accurate, as no error-prone measurement
is required. On such machines, also detect the frequency when bringing
a new CPU online; it isn't clear what frequency it will start with, and
it may not correspond to the reference.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 27 +++++++++++++++++----------
1 files changed, 17 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 15d2ace..c18e2fc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3061,9 +3061,6 @@ static void bounce_off(void *info)
/* nothing */
}
-static unsigned int ref_freq;
-static unsigned long tsc_khz_ref;
-
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -3071,15 +3068,15 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
-
- if (!ref_freq)
- ref_freq = freq->old;
+ unsigned long old_khz;
if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
return 0;
if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
return 0;
- per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+ old_khz = per_cpu(cpu_tsc_khz, freq->cpu);
+ per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(old_khz, freq->old,
+ freq->new);
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3120,12 +3117,18 @@ static void kvm_timer_init(void)
{
int cpu;
- for_each_possible_cpu(cpu)
- per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
- tsc_khz_ref = tsc_khz;
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
+ for_each_online_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = cpufreq_get(cpu);
+ } else {
+ for_each_possible_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ }
+ for_each_possible_cpu(cpu) {
+ printk(KERN_DEBUG "kvm: cpu %d = %lu khz\n",
+ cpu, per_cpu(cpu_tsc_khz, cpu));
}
}
@@ -4698,6 +4701,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
int kvm_arch_hardware_enable(void *garbage)
{
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ int cpu = raw_smp_processor_id();
+ per_cpu(cpu_tsc_khz, cpu) = cpufreq_quick_get(cpu);
+ }
return kvm_x86_ops->hardware_enable(garbage);
}
--
1.6.4.4
Both VMX and SVM require per-cpu memory allocation, which is done at module
init time, for only online cpus. When bringing a new CPU online, we must
also allocate this structure. The method chosen to implement this is to
make the CPU online notifier available via a call to the arch code. This
allows memory allocation to be done smoothly, without any need to allocate
extra structures.
Note: CPU-up notifiers may call the KVM callback before the cpufreq callbacks.
This would cause the CPU frequency not to be detected (and it is not always
clear on non-constant-TSC platforms what the bring-up TSC rate will be, so the
guess of using tsc_khz could be wrong). So, we clear the rate to zero in such
a case and add logic to query it upon entry.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 2 ++
arch/x86/kvm/svm.c | 15 +++++++++++++--
arch/x86/kvm/vmx.c | 17 +++++++++++++++++
arch/x86/kvm/x86.c | 13 +++++++++----
include/linux/kvm_host.h | 6 ++++++
virt/kvm/kvm_main.c | 6 ++----
6 files changed, 49 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 299cc1b..b7dd14b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -459,6 +459,7 @@ struct descriptor_table {
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
+ int (*cpu_hotadd)(int cpu);
int (*hardware_enable)(void *dummy);
void (*hardware_disable)(void *dummy);
void (*check_processor_compatibility)(void *rtn);
@@ -791,6 +792,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
_ASM_PTR " 666b, 667b \n\t" \
".popsection"
+#define KVM_ARCH_WANT_HOTPLUG_NOTIFIER
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9a4daca..8f99d0c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -330,13 +330,13 @@ static int svm_hardware_enable(void *garbage)
return -EBUSY;
if (!has_svm()) {
- printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
+ printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", me);
return -EINVAL;
}
svm_data = per_cpu(svm_data, me);
if (!svm_data) {
- printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
+ printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
me);
return -EINVAL;
}
@@ -394,6 +394,16 @@ err_1:
}
+static __cpuinit int svm_cpu_hotadd(int cpu)
+{
+ struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
+
+ if (svm_data)
+ return 0;
+
+ return svm_cpu_init(cpu);
+}
+
static void set_msr_interception(u32 *msrpm, unsigned msr,
int read, int write)
{
@@ -2858,6 +2868,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.hardware_setup = svm_hardware_setup,
.hardware_unsetup = svm_hardware_unsetup,
.check_processor_compatibility = svm_check_processor_compat,
+ .cpu_hotadd = svm_cpu_hotadd,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3fe0d42..b8a8428 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1408,6 +1408,22 @@ static __exit void hardware_unsetup(void)
free_kvm_area();
}
+static __cpuinit int vmx_cpu_hotadd(int cpu)
+{
+ struct vmcs *vmcs;
+
+ if (per_cpu(vmxarea, cpu))
+ return 0;
+
+ vmcs = alloc_vmcs_cpu(cpu);
+ if (!vmcs)
+ return -ENOMEM;
+
+ per_cpu(vmxarea, cpu) = vmcs;
+
+ return 0;
+}
+
static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -3925,6 +3941,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.hardware_setup = hardware_setup,
.hardware_unsetup = hardware_unsetup,
.check_processor_compatibility = vmx_check_processor_compat,
+ .cpu_hotadd = vmx_cpu_hotadd,
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = report_flexpriority,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c18e2fc..66c6bb9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1326,6 +1326,8 @@ out:
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
kvm_x86_ops->vcpu_load(vcpu, cpu);
+ if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0))
+ per_cpu(cpu_tsc_khz, cpu) = cpufreq_quick_get(cpu);
kvm_request_guest_time_update(vcpu);
}
@@ -4699,12 +4701,15 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
return kvm_x86_ops->vcpu_reset(vcpu);
}
+int kvm_arch_cpu_hotadd(int cpu)
+{
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ per_cpu(cpu_tsc_khz, cpu) = 0;
+ return kvm_x86_ops->cpu_hotadd(cpu);
+}
+
int kvm_arch_hardware_enable(void *garbage)
{
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
- int cpu = raw_smp_processor_id();
- per_cpu(cpu_tsc_khz, cpu) = cpufreq_quick_get(cpu);
- }
return kvm_x86_ops->hardware_enable(garbage);
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0bf9ee9..2f075c4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -345,6 +345,12 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
+#ifdef KVM_ARCH_WANT_HOTPLUG_NOTIFIER
+int kvm_arch_cpu_hotadd(int cpu);
+#else
+#define kvm_arch_cpu_hotadd(x) (0)
+#endif
+
int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
int kvm_arch_hardware_enable(void *garbage);
void kvm_arch_hardware_disable(void *garbage);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e27b7a9..7818b51 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1716,9 +1716,6 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
{
int cpu = (long)v;
- if (!kvm_usage_count)
- return NOTIFY_OK;
-
val &= ~CPU_TASKS_FROZEN;
switch (val) {
case CPU_DYING:
@@ -1734,7 +1731,8 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
case CPU_ONLINE:
printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
cpu);
- smp_call_function_single(cpu, hardware_enable, NULL, 1);
+ if (!kvm_arch_cpu_hotadd(cpu))
+ smp_call_function_single(cpu, hardware_enable, NULL, 1);
break;
}
return NOTIFY_OK;
--
1.6.4.4
In the process of bringing down CPUs, the SVM / VMX structures associated
with those CPUs are not freed. This may cause leaks when unloading and
reloading the KVM module, as only the structures associated with online
CPUs are cleaned up. So, clean up all possible CPUs, not just online ones.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/svm.c | 2 +-
arch/x86/kvm/vmx.c | 7 +++++--
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8f99d0c..13ca268 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -525,7 +525,7 @@ static __exit void svm_hardware_unsetup(void)
{
int cpu;
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b8a8428..603bde3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1350,8 +1350,11 @@ static void free_kvm_area(void)
{
int cpu;
- for_each_online_cpu(cpu)
- free_vmcs(per_cpu(vmxarea, cpu));
+ for_each_possible_cpu(cpu)
+ if (per_cpu(vmxarea, cpu)) {
+ free_vmcs(per_cpu(vmxarea, cpu));
+ per_cpu(vmxarea, cpu) = NULL;
+ }
}
static __init int alloc_kvm_area(void)
--
1.6.4.4
The CPU frequency change callback provides the new TSC frequency for us, in
the same units (kHz), so there is no reason to do any math.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 5 +----
1 files changed, 1 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 66c6bb9..60ae2c7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3070,15 +3070,12 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
- unsigned long old_khz;
if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
return 0;
if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
return 0;
- old_khz = per_cpu(cpu_tsc_khz, freq->cpu);
- per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(old_khz, freq->old,
- freq->new);
+ per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
--
1.6.4.4
On 09/24/2009 11:32 PM, Zachary Amsden wrote:
> On 09/24/2009 05:52 AM, Marcelo Tosatti wrote:
>>
>>> +static __cpuinit int vmx_cpu_hotadd(int cpu)
>>> +{
>>> + struct vmcs *vmcs;
>>> +
>>> + if (per_cpu(vmxarea, cpu))
>>> + return 0;
>>> +
>>> + vmcs = alloc_vmcs_cpu(cpu);
>>> + if (!vmcs)
>>> + return -ENOMEM;
>>> +
>>> + per_cpu(vmxarea, cpu) = vmcs;
>>> +
>>> + return 0;
>>> +}
>> Have to free in __cpuexit?
>>
>> Is it too wasteful to allocate statically with
>> DEFINE_PER_CPU_PAGE_ALIGNED?
>
> Unfortunately, I think it is. The VMX / SVM structures are quite
> large, and we can have a lot of potential CPUs.
I think percpu is only allocated when the cpu is online (it would still
be wasteful if the modules were loaded but unused).
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
On 09/25/2009 03:47 AM, Zachary Amsden wrote:
> Both VMX and SVM require per-cpu memory allocation, which is done at module
> init time, for only online cpus. When bringing a new CPU online, we must
> also allocate this structure. The method chosen to implement this is to
> make the CPU online notifier available via a call to the arch code. This
> allows memory allocation to be done smoothly, without any need to allocate
> extra structures.
>
> Note: CPU-up notifiers may call the KVM callback before the cpufreq callbacks.
> This would cause the CPU frequency not to be detected (and it is not always
> clear on non-constant-TSC platforms what the bring-up TSC rate will be, so the
> guess of using tsc_khz could be wrong). So, we clear the rate to zero in such
> a case and add logic to query it upon entry.
>
> Signed-off-by: Zachary Amsden<[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 2 ++
> arch/x86/kvm/svm.c | 15 +++++++++++++--
> arch/x86/kvm/vmx.c | 17 +++++++++++++++++
> arch/x86/kvm/x86.c | 13 +++++++++----
> include/linux/kvm_host.h | 6 ++++++
> virt/kvm/kvm_main.c | 6 ++----
> 6 files changed, 49 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 299cc1b..b7dd14b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -459,6 +459,7 @@ struct descriptor_table {
> struct kvm_x86_ops {
> int (*cpu_has_kvm_support)(void); /* __init */
> int (*disabled_by_bios)(void); /* __init */
> + int (*cpu_hotadd)(int cpu);
> int (*hardware_enable)(void *dummy);
> void (*hardware_disable)(void *dummy);
> void (*check_processor_compatibility)(void *rtn);
> @@ -791,6 +792,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
> _ASM_PTR " 666b, 667b \n\t" \
> ".popsection"
>
> +#define KVM_ARCH_WANT_HOTPLUG_NOTIFIER
> #define KVM_ARCH_WANT_MMU_NOTIFIER
> int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
> int kvm_age_hva(struct kvm *kvm, unsigned long hva);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 9a4daca..8f99d0c 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -330,13 +330,13 @@ static int svm_hardware_enable(void *garbage)
> return -EBUSY;
>
> if (!has_svm()) {
> - printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
> + printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", me);
> return -EINVAL;
> }
> svm_data = per_cpu(svm_data, me);
>
> if (!svm_data) {
> - printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
> + printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
> me);
> return -EINVAL;
> }
> @@ -394,6 +394,16 @@ err_1:
>
> }
>
> +static __cpuinit int svm_cpu_hotadd(int cpu)
> +{
> + struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
> +
> + if (svm_data)
> + return 0;
> +
> + return svm_cpu_init(cpu);
> +}
> +
> static void set_msr_interception(u32 *msrpm, unsigned msr,
> int read, int write)
> {
> @@ -2858,6 +2868,7 @@ static struct kvm_x86_ops svm_x86_ops = {
> .hardware_setup = svm_hardware_setup,
> .hardware_unsetup = svm_hardware_unsetup,
> .check_processor_compatibility = svm_check_processor_compat,
> + .cpu_hotadd = svm_cpu_hotadd,
> .hardware_enable = svm_hardware_enable,
> .hardware_disable = svm_hardware_disable,
> .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 3fe0d42..b8a8428 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -1408,6 +1408,22 @@ static __exit void hardware_unsetup(void)
> free_kvm_area();
> }
>
> +static __cpuinit int vmx_cpu_hotadd(int cpu)
> +{
> + struct vmcs *vmcs;
> +
> + if (per_cpu(vmxarea, cpu))
> + return 0;
> +
> + vmcs = alloc_vmcs_cpu(cpu);
> + if (!vmcs)
> + return -ENOMEM;
> +
> + per_cpu(vmxarea, cpu) = vmcs;
> +
> + return 0;
> +}
> +
> static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
> {
> struct kvm_vmx_segment_field *sf =&kvm_vmx_segment_fields[seg];
> @@ -3925,6 +3941,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
> .hardware_setup = hardware_setup,
> .hardware_unsetup = hardware_unsetup,
> .check_processor_compatibility = vmx_check_processor_compat,
> + .cpu_hotadd = vmx_cpu_hotadd,
> .hardware_enable = hardware_enable,
> .hardware_disable = hardware_disable,
> .cpu_has_accelerated_tpr = report_flexpriority,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index c18e2fc..66c6bb9 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1326,6 +1326,8 @@ out:
> void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> {
> kvm_x86_ops->vcpu_load(vcpu, cpu);
> + if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0))
> + per_cpu(cpu_tsc_khz, cpu) = cpufreq_quick_get(cpu);
> kvm_request_guest_time_update(vcpu);
> }
>
> @@ -4699,12 +4701,15 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
> return kvm_x86_ops->vcpu_reset(vcpu);
> }
>
> +int kvm_arch_cpu_hotadd(int cpu)
> +{
> + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
> + per_cpu(cpu_tsc_khz, cpu) = 0;
> + return kvm_x86_ops->cpu_hotadd(cpu);
> +}
> +
> int kvm_arch_hardware_enable(void *garbage)
> {
> - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
> - int cpu = raw_smp_processor_id();
> - per_cpu(cpu_tsc_khz, cpu) = cpufreq_quick_get(cpu);
> - }
> return kvm_x86_ops->hardware_enable(garbage);
> }
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 0bf9ee9..2f075c4 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -345,6 +345,12 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
> int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
> void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
>
> +#ifdef KVM_ARCH_WANT_HOTPLUG_NOTIFIER
> +int kvm_arch_cpu_hotadd(int cpu);
> +#else
> +#define kvm_arch_cpu_hotadd(x) (0)
> +#endif
> +
> int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
> int kvm_arch_hardware_enable(void *garbage);
> void kvm_arch_hardware_disable(void *garbage);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index e27b7a9..7818b51 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1716,9 +1716,6 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
> {
> int cpu = (long)v;
>
> - if (!kvm_usage_count)
> - return NOTIFY_OK;
> -
>
Why? You'll now do hardware_enable() even if kvm is not in use.
> val&= ~CPU_TASKS_FROZEN;
> switch (val) {
> case CPU_DYING:
> @@ -1734,7 +1731,8 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
> case CPU_ONLINE:
> printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
> cpu);
> - smp_call_function_single(cpu, hardware_enable, NULL, 1);
> + if (!kvm_arch_cpu_hotadd(cpu))
> + smp_call_function_single(cpu, hardware_enable, NULL, 1);
> break;
> }
>
if (!blah) when blah is not a boolean or a pointer is confusing.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
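(One way to make that call site read unambiguously, as a sketch; it assumes
an int r declared at the top of kvm_cpu_hotplug():)

        case CPU_ONLINE:
                printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
                       cpu);
                r = kvm_arch_cpu_hotadd(cpu);   /* 0 on success, -errno */
                if (r == 0)
                        smp_call_function_single(cpu, hardware_enable,
                                                 NULL, 1);
                break;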
On 09/25/2009 03:47 AM, Zachary Amsden wrote:
> In the process of bringing down CPUs, the SVM / VMX structures associated
> with those CPUs are not freed. This may cause leaks when unloading and
> reloading the KVM module, as only the structures associated with online
> CPUs are cleaned up. So, clean up all possible CPUs, not just online ones.
>
> Signed-off-by: Zachary Amsden<[email protected]>
> ---
> arch/x86/kvm/svm.c | 2 +-
> arch/x86/kvm/vmx.c | 7 +++++--
> 2 files changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 8f99d0c..13ca268 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -525,7 +525,7 @@ static __exit void svm_hardware_unsetup(void)
> {
> int cpu;
>
> - for_each_online_cpu(cpu)
> + for_each_possible_cpu(cpu)
> svm_cpu_uninit(cpu);
>
> __free_pages(pfn_to_page(iopm_base>> PAGE_SHIFT), IOPM_ALLOC_ORDER);
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index b8a8428..603bde3 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -1350,8 +1350,11 @@ static void free_kvm_area(void)
> {
> int cpu;
>
> - for_each_online_cpu(cpu)
> - free_vmcs(per_cpu(vmxarea, cpu));
> + for_each_possible_cpu(cpu)
> + if (per_cpu(vmxarea, cpu)) {
> + free_vmcs(per_cpu(vmxarea, cpu));
> + per_cpu(vmxarea, cpu) = NULL;
> + }
> }
>
> static __init int alloc_kvm_area(void)
>
First, I'm not sure per_cpu works for possible but not actual cpus.
Second, we now eagerly allocate but lazily free, leading to lots of ifs
and buts. I think the code can be cleaner by eagerly allocating and
eagerly freeing.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
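(A sketch of the eager alloc/eager free pairing suggested above, using the
hotplug notifier values available in this kernel; the exact case placement
is illustrative:)

        /* Eager: allocate before the CPU comes online, free as soon as it
         * is gone, so the unload path never has to guess what exists. */
        case CPU_UP_PREPARE:
                if (kvm_arch_cpu_hotadd(cpu))
                        return NOTIFY_BAD;      /* veto the bring-up */
                break;
        case CPU_UP_CANCELED:
        case CPU_DEAD:
                kvm_arch_cpu_hotremove(cpu);
                break;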
On 09/26/2009 10:52 PM, Avi Kivity wrote:
> On 09/25/2009 03:47 AM, Zachary Amsden wrote:
>>
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -1716,9 +1716,6 @@ static int kvm_cpu_hotplug(struct
>> notifier_block *notifier, unsigned long val,
>> {
>> int cpu = (long)v;
>>
>> - if (!kvm_usage_count)
>> - return NOTIFY_OK;
>> -
>
> Why? You'll now do hardware_enable() even if kvm is not in use.
Because otherwise you'll never allocate and hardware_enable_all will fail:
Switch to broadcast mode on CPU1
svm_hardware_enable: svm_data is NULL on 1
kvm: enabling virtualization on CPU1 failed
qemu-system-x86[8698]: segfault at 20 ip 00000000004db22f sp
00007fff0a3b4560 error 6 in qemu-system-x86_64[400000+21f000]
Perhaps I can make this work better by putting the allocation within
hardware_enable_all.
Zach
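(A rough sketch of that idea, illustrative only: the usage-count bookkeeping
and unwinding on failure are elided, and the allocation runs in sleepable
context before any hardware is touched.)

static int hardware_enable_all(void)
{
        int cpu, r;

        /* Allocate arch per-cpu state up front; may sleep. */
        for_each_online_cpu(cpu) {
                r = kvm_arch_cpu_hotadd(cpu);   /* -ENOMEM on failure */
                if (r)
                        return r;
        }
        on_each_cpu(hardware_enable, NULL, 1);
        return 0;
}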
On 09/26/2009 10:54 PM, Avi Kivity wrote:
>
> First, I'm not sure per_cpu works for possible but not actual cpus.
> Second, we now eagerly allocate but lazily free, leading to lots of
> ifs and buts. I think the code can be cleaner by eagerly allocating
> and eagerly freeing.
Eager freeing requires a hotplug-remove notification to the arch layer.
I had done that originally, but I'm not sure it is needed.
How does per_cpu() work when defined in a module anyway? The linker
magic going on here evades a simple one-minute analysis.
Zach
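(For what it's worth: a DEFINE_PER_CPU in a module lands in the module's
percpu data section, and the module loader allocates real per-cpu storage
covering every possible CPU at load time, relocating references to it. So
per_cpu() on a possible-but-offline CPU is valid; set_cpu_khz() below is a
hypothetical example of the pattern:)

/* Valid for any cpu in cpu_possible_mask, even before it comes online. */
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);

static void set_cpu_khz(int cpu, unsigned long khz)
{
        per_cpu(cpu_tsc_khz, cpu) = khz;
}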