This makes the VMX implementation match the SVM implementation with
respect to when the TSC is read: host_tsc is now measured when
relinquishing VM operation, rather than at CPU switch time.
That may seem suboptimal for now, but it is a preparatory step for
unifying the TSC code in VMX and SVM and also for allowing TSCs
which are not based on real time.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/vmx.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b923f2a..75aa1d5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -424,7 +424,6 @@ static void __vcpu_clear(void *arg)
vmcs_clear(vmx->vmcs);
if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
- rdtscll(vmx->vcpu.arch.host_tsc);
list_del(&vmx->local_vcpus_link);
vmx->vcpu.cpu = -1;
vmx->launched = 0;
@@ -762,6 +761,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.host_tsc = native_read_tsc();
__vmx_load_host_state(to_vmx(vcpu));
}
--
1.6.5.2
Using a hotplug notifier here gives us direct control over getting the
clock rate, which might be different on power-up. This eliminates a check
from the hot path of entering hardware virtualization, and is a cleaner
way to do things. The notifier must be last, so the cpufreq code has a
chance to run first and figure out the CPU speed.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 38 ++++++++++++++++++++++++++++----------
1 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1cc51ca..faa467d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3308,10 +3308,37 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
.notifier_call = kvmclock_cpufreq_notifier
};
+static int kvm_x86_cpu_hotplug(struct notifier_block *notifier,
+ unsigned long val, void *v)
+{
+ int cpu = (long)v;
+
+ val &= ~CPU_TASKS_FROZEN;
+ switch (val) {
+ case CPU_DYING:
+ case CPU_UP_CANCELED:
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ per_cpu(cpu_tsc_khz, cpu) = 0;
+ break;
+
+ case CPU_ONLINE:
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ per_cpu(cpu_tsc_khz, cpu) = cpufreq_quick_get(cpu);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kvm_x86_cpu_notifier = {
+ .notifier_call = kvm_x86_cpu_hotplug,
+ .priority = -INT_MAX, /* we want to be called last */
+};
+
static void kvm_timer_init(void)
{
int cpu;
+ register_cpu_notifier(&kvm_x86_cpu_notifier);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
@@ -3374,6 +3401,7 @@ void kvm_arch_exit(void)
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
+ unregister_cpu_notifier(&kvm_x86_cpu_notifier);
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
}
@@ -4904,17 +4932,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
int kvm_arch_hardware_enable(void *garbage)
{
- /*
- * Since this may be called from a hotplug notifcation,
- * we can't get the CPU frequency directly.
- */
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
- int cpu = raw_smp_processor_id();
- per_cpu(cpu_tsc_khz, cpu) = 0;
- }
-
kvm_shared_msr_cpu_online();
-
return kvm_x86_ops->hardware_enable(garbage);
}
--
1.6.5.2
Add the framework for a preliminary way to cope with CPUs which have
different TSC offsets from each other. The TSC delta is measured in
both directions between the CPUs (for highest accuracy, cancelling
cache-latency effects) and stored.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 202 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index faa467d..95e43b9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -730,6 +730,196 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
}
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+static DEFINE_PER_CPU(unsigned long, cpu_tsc_multiplier);
+static DEFINE_PER_CPU(int, cpu_tsc_shift);
+static DEFINE_PER_CPU(s64, cpu_tsc_offset);
+static DEFINE_PER_CPU(u64, cpu_tsc_measure_base);
+static int tsc_base_cpu = -1;
+static unsigned long ref_tsc_khz;
+
+static inline unsigned long div_precise(unsigned long hi, unsigned long lo,
+ unsigned long divisor, unsigned long *rptr)
+{
+ unsigned long quotient, remainder;
+ __asm__ ( "div %4"
+ : "=a" (quotient), "=d" (remainder)
+ : "0" (lo), "1" (hi), "rm" (divisor));
+ *rptr = remainder;
+ return quotient;
+}
+
+/*
+ * compute the best multiplier and shift pair m,s, such that for n,
+ * n * a / b = (n * m) >> s
+ */
+static void compute_best_multiplier(unsigned long a, unsigned long b,
+ unsigned long *m, int *s)
+{
+ int shift, bit;
+ unsigned long lo, hi, remainder, mult;
+
+ /*
+ * By pre-shifting and using maximum machine width, we get the most
+ * bits of precision.
+ */
+ shift = BITS_PER_LONG + fls(b) - fls(a) - 1;
+ if (shift > BITS_PER_LONG)
+ shift = BITS_PER_LONG;
+ if (shift < 0)
+ shift = 0;
+ lo = a << shift;
+ hi = a >> (BITS_PER_LONG - shift);
+ mult = div_precise(hi, lo, b, &remainder);
+
+ /* See if it can be further simplified */
+ bit = __ffs(mult);
+ if (bit > shift)
+ bit = shift;
+ mult >>= bit;
+ shift -= bit;
+ *m = mult;
+ *s = shift;
+}
+
+static inline unsigned long mult_precise(unsigned long val, unsigned long mult,
+ int shift)
+{
+ unsigned long top, bot;
+
+ __asm__ ( "mul %3; shrd %1, %0" :
+ "=&a" (bot), "=&d" (top) :
+ "0" (mult), "rm" (val), "c" (shift));
+ return bot;
+}
+
+static inline u64 compute_ref_tsc(int cpu)
+{
+ u64 tsc = native_read_tsc() - per_cpu(cpu_tsc_measure_base, cpu);
+ tsc = mult_precise(tsc, per_cpu(cpu_tsc_multiplier, cpu),
+ per_cpu(cpu_tsc_shift, cpu));
+ return tsc + per_cpu(cpu_tsc_offset, cpu);
+}
+
+#define SYNC_TRIES 64
+
+/*
+ * sync_tsc_helper is a dual-entry coroutine meant to be run by only
+ * two CPUs at a time, one of which is a measuring CPU. Both CPUs
+ * synchronize entry and exit as well as the central recording loop
+ * using only memory barriers and atomic variables to avoid lock latency.
+ *
+ * To discount cache latency effects, this routine will be called
+ * twice, once with the measure / recording CPUs reversed. In addition,
+ * the first 4 and last 2 results will be discarded to allow branch
+ * prediction to become established (and to discount effects from
+ * a potentially early predicted loop exit).
+ *
+ * Because of this, we must be extra careful to guard the entrance
+ * and exit against the CPU switch. I chose to use atomic instructions
+ * only at the end of the measure loop and use the same routine for
+ * both CPUs, with symmetric comparisons, and a statically placed
+ * recording array, hopefully maximizing the branch prediction and
+ * cache locality. The results appear quite good; on known to be
+ * synchronized CPUs, I typically get < 10 TSC delta measured, with
+ * maximum observed error on the order of 100 cycles.
+ *
+ * This doesn't account for NUMA cache effects, and could potentially
+ * be improved there by moving the delta[] array to the stack of the
+ * measuring CPU. In fact, this modification might be worth trying
+ * for non-NUMA systems as well, but this appears to be fine for now.
+ */
+static void sync_tsc_helper(int measure_cpu, u64 *delta, atomic_t *ready)
+{
+ int tries;
+ static u64 tsc_other;
+ int junk = 0;
+ u64 tsc;
+ int cpu = raw_smp_processor_id();
+
+ if (cpu == measure_cpu) {
+ atomic_set(ready, 0);
+ while (!atomic_read(ready))
+ /* wait */;
+ } else {
+ while (atomic_read(ready))
+ /* wait */;
+ atomic_set(ready, 1);
+ }
+ for (tries = 0; tries < SYNC_TRIES; tries++) {
+ mb();
+ if (cpu == measure_cpu) {
+ atomic_set(ready, 0);
+ } else {
+ while (atomic_read(ready))
+ /* wait */;
+ }
+ native_cpuid(&junk, &junk, &junk, &junk);
+ tsc = compute_ref_tsc(cpu);
+ rdtsc_barrier();
+ if (cpu == measure_cpu) {
+ while (!atomic_read(ready))
+ /* wait */;
+ rmb();
+ delta[tries] = tsc - tsc_other;
+ } else {
+ tsc_other = tsc;
+ wmb();
+ atomic_set(ready, 1);
+ }
+ }
+ if (cpu == measure_cpu)
+ atomic_dec(ready);
+ else
+ atomic_inc(ready);
+ while (atomic_read(ready) != 1)
+ /* wait */;
+ mb();
+}
+
+static void kvm_sync_tsc(void *cpup)
+{
+ int new_cpu = *(int *)cpup;
+ unsigned long flags;
+ static s64 delta[SYNC_TRIES*2];
+ static atomic_t ready = ATOMIC_INIT(1);
+
+ BUG_ON(tsc_base_cpu == -1);
+ pr_debug("%s: IN, cpu = %d, freq = %ldkHz, tsc_base_cpu = %d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_khz, raw_smp_processor_id()) , tsc_base_cpu);
+ local_irq_save(flags);
+ if (raw_smp_processor_id() == new_cpu) {
+ per_cpu(cpu_tsc_measure_base, new_cpu) = native_read_tsc();
+ per_cpu(cpu_tsc_offset, new_cpu) = 0;
+ compute_best_multiplier(ref_tsc_khz,
+ per_cpu(cpu_tsc_khz, new_cpu),
+ &per_cpu(cpu_tsc_multiplier, new_cpu),
+ &per_cpu(cpu_tsc_shift, new_cpu));
+ }
+ sync_tsc_helper(tsc_base_cpu, delta, &ready);
+ sync_tsc_helper(new_cpu, &delta[SYNC_TRIES], &ready);
+ if (raw_smp_processor_id() == new_cpu) {
+ int i;
+ s64 accumulator = 0;
+
+ /*
+ * accumulate [SYNC_TRIES+4,-2) of tsc{base} - tsc{new}
+ * subtract [SYNC_TRIES+4,-2) of tsc{new} - tsc{base}
+ *
+ * this allows instruction cycle and cache differences to
+ * cancel each other out and drops warm up/cool down variation
+ *
+ * Note the arithmetic must be signed because of the divide
+ */
+
+ for (i = 4; i < SYNC_TRIES - 2; i++)
+ accumulator += delta[i];
+ for (i = 4; i < SYNC_TRIES - 2; i++)
+ accumulator -= delta[i+SYNC_TRIES];
+ accumulator = accumulator / (SYNC_TRIES*2-12);
+ per_cpu(cpu_tsc_offset, new_cpu) = accumulator;
+ pr_debug("%s: OUT, cpu = %d, cpu_tsc_offset = %lld, cpu_tsc_multiplier=%ld, cpu_tsc_shift=%d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_offset, new_cpu), per_cpu(cpu_tsc_multiplier, new_cpu), per_cpu(cpu_tsc_shift, new_cpu));
+ }
+ local_irq_restore(flags);
+}
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
@@ -3352,6 +3542,18 @@ static void kvm_timer_init(void)
for_each_possible_cpu(cpu)
per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
}
+ tsc_base_cpu = get_cpu();
+ ref_tsc_khz = per_cpu(cpu_tsc_khz, tsc_base_cpu);
+ per_cpu(cpu_tsc_multiplier, tsc_base_cpu) = 1;
+ per_cpu(cpu_tsc_shift, tsc_base_cpu) = 0;
+ per_cpu(cpu_tsc_offset, tsc_base_cpu) = 0;
+ for_each_online_cpu(cpu)
+ if (cpu != tsc_base_cpu) {
+ smp_call_function_single(cpu, kvm_sync_tsc,
+ (void *)&cpu, 0);
+ kvm_sync_tsc((void *)&cpu);
+ }
+ put_cpu();
}
int kvm_arch_init(void *opaque)
--
1.6.5.2
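For reference, compute_best_multiplier() above picks m and s so that n * a / b is well approximated by (n * m) >> s, with m = (a << s) / b and s chosen as large as possible without overflowing a machine word. The following is a minimal standalone sketch of that idea in userspace C, not kernel code: it uses GCC/Clang __uint128_t in place of the div and mul/shrd asm helpers, and the kHz values in main() are invented for illustration.

#include <stdio.h>
#include <stdint.h>

/* Pick m, s so that n * a / b ~= (n * m) >> s. */
static void best_mult(uint64_t a, uint64_t b, uint64_t *m, int *s)
{
	int shift = 63;

	/* Back off until (a << shift) / b fits in 64 bits. */
	while (shift > 0 && (uint64_t)(((__uint128_t)a << shift) >> 64) >= b)
		shift--;
	*m = (uint64_t)(((__uint128_t)a << shift) / b);
	*s = shift;
}

static uint64_t scale(uint64_t n, uint64_t m, int s)
{
	return (uint64_t)(((__uint128_t)n * m) >> s);
}

int main(void)
{
	uint64_t m;
	int s;

	/* rescale cycles from a 2400000 kHz TSC to a 3000000 kHz reference */
	best_mult(3000000, 2400000, &m, &s);
	printf("m=%llu s=%d 10^9 cycles -> %llu\n", (unsigned long long)m, s,
	       (unsigned long long)scale(1000000000ULL, m, s));
	return 0;
}

The kernel version additionally strips trailing zero bits from m to keep the shift small, but the approximation is the same.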
The loop is written to only work when one master and one slave
enter simultaneously, so properly protect it, and call it when
adding new CPUs.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 43 ++++++++++++++++++++++++++++++++-----------
1 files changed, 32 insertions(+), 11 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 95e43b9..a599c78 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -729,6 +729,7 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
hv_clock->tsc_to_system_mul);
}
+DEFINE_SPINLOCK(kvm_tsc_lock);
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
static DEFINE_PER_CPU(unsigned long, cpu_tsc_multiplier);
static DEFINE_PER_CPU(int, cpu_tsc_shift);
@@ -921,6 +922,21 @@ static void kvm_sync_tsc(void *cpup)
local_irq_restore(flags);
}
+static void kvm_do_sync_tsc(int cpu)
+{
+ spin_lock(&kvm_tsc_lock);
+ if (raw_smp_processor_id() != tsc_base_cpu) {
+ smp_call_function_single(tsc_base_cpu, kvm_sync_tsc,
+ (void *)&cpu, 0);
+ smp_call_function_single(cpu, kvm_sync_tsc, (void *)&cpu, 1);
+ } else {
+ smp_call_function_single(cpu, kvm_sync_tsc, (void *)&cpu, 0);
+ smp_call_function_single(tsc_base_cpu, kvm_sync_tsc,
+ (void *)&cpu, 1);
+ }
+ spin_unlock(&kvm_tsc_lock);
+}
+
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
struct timespec ts;
@@ -1634,12 +1650,7 @@ out:
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
kvm_x86_ops->vcpu_load(vcpu, cpu);
- if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
- unsigned long khz = cpufreq_quick_get(cpu);
- if (!khz)
- khz = tsc_khz;
- per_cpu(cpu_tsc_khz, cpu) = khz;
- }
+ BUG_ON(per_cpu(cpu_tsc_khz, cpu) == 0);
kvm_request_guest_time_update(vcpu);
}
@@ -3505,6 +3516,18 @@ static int kvm_x86_cpu_hotplug(struct notifier_block *notifier,
val &= ~CPU_TASKS_FROZEN;
switch (val) {
+ case CPU_DOWN_PREPARE:
+ if (cpu == tsc_base_cpu) {
+ int new_cpu;
+ spin_lock(&kvm_tsc_lock);
+ for_each_online_cpu(new_cpu)
+ if (new_cpu != tsc_base_cpu)
+ break;
+ tsc_base_cpu = new_cpu;
+ spin_unlock(&kvm_tsc_lock);
+ }
+ break;
+
case CPU_DYING:
case CPU_UP_CANCELED:
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
@@ -3514,6 +3537,7 @@ static int kvm_x86_cpu_hotplug(struct notifier_block *notifier,
case CPU_ONLINE:
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
per_cpu(cpu_tsc_khz, cpu) = cpufreq_quick_get(cpu);
+ kvm_do_sync_tsc(cpu);
break;
}
return NOTIFY_OK;
@@ -3548,11 +3572,8 @@ static void kvm_timer_init(void)
per_cpu(cpu_tsc_shift, tsc_base_cpu) = 0;
per_cpu(cpu_tsc_offset, tsc_base_cpu) = 0;
for_each_online_cpu(cpu)
- if (cpu != tsc_base_cpu) {
- smp_call_function_single(cpu, kvm_sync_tsc,
- (void *)&cpu, 0);
- kvm_sync_tsc((void *)&cpu);
- }
+ if (cpu != tsc_base_cpu)
+ kvm_do_sync_tsc(cpu);
put_cpu();
}
--
1.6.5.2
Some AMD-based machines can have TSC drift when in the C1 HLT state because
despite attempting to scale the TSC increment when dividing down the
P-state, the processor may return to full P-state to service cache
probes. This causes unpredictable TSC drift on these machines.
We implement a recommended workaround, which is disabling C1 clock
ramping.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 45 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a599c78..4c4d2e0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -43,6 +43,12 @@
#define CREATE_TRACE_POINTS
#include "trace.h"
+#ifdef CONFIG_K8_NB
+#include <linux/pci.h>
+#include <asm/k8.h>
+#include <asm/smp.h>
+#endif
+
#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
@@ -3548,10 +3554,42 @@ static struct notifier_block kvm_x86_cpu_notifier = {
.priority = -INT_MAX, /* we want to be called last */
};
+static u8 disabled_c1_ramp = 0;
+
static void kvm_timer_init(void)
{
int cpu;
+ /*
+ * AMD processors can de-synchronize TSC on halt in C1 state, because
+ * processors in lower P state will have TSC scaled properly during
+ * normal operation, but will have TSC scaled improperly while
+ * servicing cache probes. Because there is no way to determine how
+ * TSC was adjusted during cache probes, there are two solutions:
+ * resynchronize after halt, or disable C1-clock ramping.
+ *
+ * We implement solution 2.
+ */
+#ifdef CONFIG_K8_NB
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+ boot_cpu_data.x86 == 0x0f &&
+ !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ struct pci_dev *nb;
+ int i;
+ cache_k8_northbridges();
+ for (i = 0; i < num_k8_northbridges; i++) {
+ u8 byte;
+ nb = k8_northbridges[i];
+ pci_read_config_byte(nb, 0x87, &byte);
+ if (byte & 1) {
+ printk(KERN_INFO "%s: AMD C1 clock ramping detected, performing workaround\n", __func__);
+ disabled_c1_ramp = byte;
+ pci_write_config_byte(nb, 0x87, byte & 0xFC);
+
+ }
+ }
+ }
+#endif
register_cpu_notifier(&kvm_x86_cpu_notifier);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
@@ -3627,6 +3665,13 @@ void kvm_arch_exit(void)
unregister_cpu_notifier(&kvm_x86_cpu_notifier);
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
+#ifdef CONFIG_K8_NB
+ if (disabled_c1_ramp) {
+ struct pci_dev **nb;
+ for (nb = k8_northbridges; *nb; nb++)
+ pci_write_config_byte(*nb, 0x87, disabled_c1_ramp);
+ }
+#endif
}
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
--
1.6.5.2
Add the full infrastructure for dealing with dynamic TSC frequency
changes. The only way to properly deal with this is to disable
hardware virtualization during a CPU frequency event; this is
possible because the CPU frequency module will call before and
after a change. In that case, we simply resynchronize the TSCs
afterwards to ensure a monotonic, global TSC source.
This is complicated a bit by the fact that the master TSC reference
may or may not be part of the same package which is undergoing the
frequency modification; thus we must handle the case where there
is no master CPU frequency change by allowing individual CPUs to
initiate their own synchronization.
During initial setup, cpufreq callbacks can set cpu_tsc_khz
concurrently with our own initialization of it, so we fix this by
protecting the update with kvm_tsc_lock.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 163 +++++++++++++++++++++++++++++++++++++++-------------
1 files changed, 122 insertions(+), 41 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4c4d2e0..144a820 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -741,9 +741,15 @@ static DEFINE_PER_CPU(unsigned long, cpu_tsc_multiplier);
static DEFINE_PER_CPU(int, cpu_tsc_shift);
static DEFINE_PER_CPU(s64, cpu_tsc_offset);
static DEFINE_PER_CPU(u64, cpu_tsc_measure_base);
+static DEFINE_PER_CPU(atomic_t, cpu_tsc_synchronized);
static int tsc_base_cpu = -1;
static unsigned long ref_tsc_khz;
+static inline int cpu_is_tsc_synchronized(int cpu)
+{
+ return (atomic_read(&per_cpu(cpu_tsc_synchronized, cpu)) != 0);
+}
+
static inline unsigned long div_precise(unsigned long hi, unsigned long lo,
unsigned long divisor, unsigned long *rptr)
{
@@ -923,6 +929,7 @@ static void kvm_sync_tsc(void *cpup)
accumulator -= delta[i+SYNC_TRIES];
accumulator = accumulator / (SYNC_TRIES*2-12);
per_cpu(cpu_tsc_offset, new_cpu) = accumulator;
+ atomic_set(&per_cpu(cpu_tsc_synchronized, new_cpu), 1);
pr_debug("%s: OUT, cpu = %d, cpu_tsc_offset = %lld, cpu_tsc_multiplier=%ld, cpu_tsc_shift=%d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_offset, new_cpu), per_cpu(cpu_tsc_multiplier, new_cpu), per_cpu(cpu_tsc_shift, new_cpu));
}
local_irq_restore(flags);
@@ -931,6 +938,11 @@ static void kvm_sync_tsc(void *cpup)
static void kvm_do_sync_tsc(int cpu)
{
spin_lock(&kvm_tsc_lock);
+
+ /* tsc_base_cpu can change without tsc_lock, so recheck */
+ if (unlikely(cpu == tsc_base_cpu))
+ goto out_unlock;
+
if (raw_smp_processor_id() != tsc_base_cpu) {
smp_call_function_single(tsc_base_cpu, kvm_sync_tsc,
(void *)&cpu, 0);
@@ -940,6 +952,8 @@ static void kvm_do_sync_tsc(int cpu)
smp_call_function_single(tsc_base_cpu, kvm_sync_tsc,
(void *)&cpu, 1);
}
+
+out_unlock:
spin_unlock(&kvm_tsc_lock);
}
@@ -1656,7 +1670,6 @@ out:
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
kvm_x86_ops->vcpu_load(vcpu, cpu);
- BUG_ON(per_cpu(cpu_tsc_khz, cpu) == 0);
kvm_request_guest_time_update(vcpu);
}
@@ -3461,11 +3474,46 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
-static void bounce_off(void *info)
+static void evict(void *info)
{
/* nothing */
}
+static struct execute_work resync_work;
+static int work_scheduled;
+
+static void resync_user(struct work_struct *work)
+{
+ int cpu;
+
+ work_scheduled = 0;
+ for_each_online_cpu(cpu)
+ if (cpu != tsc_base_cpu)
+ kvm_do_sync_tsc(cpu);
+}
+
+static void resync(void *info)
+{
+ int cpu;
+ u64 tsc;
+
+ /* Fixup our own values to stay in sync with the reference */
+ cpu = raw_smp_processor_id();
+ tsc = compute_ref_tsc(cpu);
+ per_cpu(cpu_tsc_measure_base, cpu) = native_read_tsc();
+ per_cpu(cpu_tsc_offset, cpu) = tsc;
+ compute_best_multiplier(ref_tsc_khz, per_cpu(cpu_tsc_khz, cpu),
+ &per_cpu(cpu_tsc_multiplier, cpu),
+ &per_cpu(cpu_tsc_shift, cpu));
+ atomic_set(&per_cpu(cpu_tsc_synchronized, cpu), 1);
+
+ /* Then, get everybody else on board */
+ if (!work_scheduled) {
+ work_scheduled = 1;
+ execute_in_process_context(resync_user, &resync_work);
+ }
+}
+
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -3474,39 +3522,68 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
- if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
- return 0;
- if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
- return 0;
- per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
-
- spin_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list) {
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (vcpu->cpu != freq->cpu)
- continue;
- if (!kvm_request_guest_time_update(vcpu))
- continue;
- if (vcpu->cpu != smp_processor_id())
- send_ipi++;
+ /*
+ * There is no way to precisely know the TSC value at which time the
+ * CPU frequency actually changed, and these callbacks may happen at
+ * different times, thus there will be drift in the reference TSC
+ * clock across all CPUs. To avoid this problem, we forcibly evict
+ * any CPUs which may be running in hardware virtualization.
+ *
+ * We do this by setting cpu_tsc_synchronized to zero and polling for
+ * this value to change when entering hardware virtualization.
+ */
+ if (val == CPUFREQ_PRECHANGE) {
+ get_online_cpus();
+ atomic_set(&per_cpu(cpu_tsc_synchronized, freq->cpu), 0);
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->cpu != freq->cpu)
+ continue;
+ if (vcpu->cpu != smp_processor_id())
+ send_ipi++;
+ kvm_request_guest_time_update(vcpu);
+ }
+ }
+ spin_unlock(&kvm_lock);
+ if (send_ipi) {
+ /*
+ * In case we update the frequency for another cpu
+ * (which might be in guest context) send an interrupt
+ * to kick the cpu out of guest context. Next time
+ * guest context is entered kvmclock will be updated,
+ * so the guest will not see stale values.
+ */
+ smp_call_function_single(freq->cpu, evict, NULL, 1);
}
- }
- spin_unlock(&kvm_lock);
- if (freq->old < freq->new && send_ipi) {
/*
- * We upscale the frequency. Must make the guest
- * doesn't see old kvmclock values while running with
- * the new frequency, otherwise we risk the guest sees
- * time go backwards.
- *
- * In case we update the frequency for another cpu
- * (which might be in guest context) send an interrupt
- * to kick the cpu out of guest context. Next time
- * guest context is entered kvmclock will be updated,
- * so the guest will not see stale values.
+ * The update of the frequency can't happen while a VM
+ * is running, nor can it happen during init when we can
+ * race against the init code setting the first known freq.
+ * Just use the kvm_tsc_lock for a mutex.
*/
- smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+ spin_lock(&kvm_tsc_lock);
+ per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
+ spin_unlock(&kvm_tsc_lock);
+
+ return 0;
+ }
+
+ /*
+ * After the change, we must resynchronize affected CPUs.
+ * If the master reference changes, that means all CPUs.
+ * Note that some CPUs may be resynchronized twice, once
+ * by the master, and once by themselves, depending on the
+ * order in which this notifier is called; this is harmless.
+ */
+ if (val == CPUFREQ_POSTCHANGE) {
+ if (freq->cpu == tsc_base_cpu)
+ smp_call_function_single(freq->cpu, resync, NULL, 1);
+ else if (cpu_is_tsc_synchronized(tsc_base_cpu))
+ /* Can't do this if base is not yet updated */
+ kvm_do_sync_tsc(freq->cpu);
+ put_online_cpus();
}
return 0;
}
@@ -3536,8 +3613,7 @@ static int kvm_x86_cpu_hotplug(struct notifier_block *notifier,
case CPU_DYING:
case CPU_UP_CANCELED:
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
- per_cpu(cpu_tsc_khz, cpu) = 0;
+ atomic_set(&per_cpu(cpu_tsc_synchronized, cpu), 0);
break;
case CPU_ONLINE:
@@ -3590,6 +3666,12 @@ static void kvm_timer_init(void)
}
}
#endif
+
+ /*
+ * Register notifiers for both CPU add / remove and CPU frequency
+ * change. Must be careful to avoid subtle races, as frequency
+ * can change at any time.
+ */
register_cpu_notifier(&kvm_x86_cpu_notifier);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
@@ -3598,7 +3680,10 @@ static void kvm_timer_init(void)
unsigned long khz = cpufreq_get(cpu);
if (!khz)
khz = tsc_khz;
- per_cpu(cpu_tsc_khz, cpu) = khz;
+ spin_lock(&kvm_tsc_lock);
+ if (!per_cpu(cpu_tsc_khz, cpu))
+ per_cpu(cpu_tsc_khz, cpu) = khz;
+ spin_unlock(&kvm_tsc_lock);
}
} else {
for_each_possible_cpu(cpu)
@@ -3606,12 +3691,7 @@ static void kvm_timer_init(void)
}
tsc_base_cpu = get_cpu();
ref_tsc_khz = per_cpu(cpu_tsc_khz, tsc_base_cpu);
- per_cpu(cpu_tsc_multiplier, tsc_base_cpu) = 1;
- per_cpu(cpu_tsc_shift, tsc_base_cpu) = 0;
- per_cpu(cpu_tsc_offset, tsc_base_cpu) = 0;
- for_each_online_cpu(cpu)
- if (cpu != tsc_base_cpu)
- kvm_do_sync_tsc(cpu);
+ resync(NULL);
put_cpu();
}
@@ -4097,7 +4177,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
clear_bit(KVM_REQ_KICK, &vcpu->requests);
smp_mb__after_clear_bit();
- if (vcpu->requests || need_resched() || signal_pending(current)) {
+ if (vcpu->requests || need_resched() || signal_pending(current) ||
+ !cpu_is_tsc_synchronized(smp_processor_id())) {
set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
preempt_enable();
--
1.6.5.2
Add a dumb version of RDTSC trapping for SVM which just passes
through the hardware counter.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/emulate.c | 2 +-
arch/x86/kvm/svm.c | 15 +++++++++++++++
2 files changed, 16 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d226dff..76e180f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -226,7 +226,7 @@ static u32 twobyte_table[256] = {
ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x30 - 0x3F */
- ImplicitOps, 0, ImplicitOps, 0,
+ ImplicitOps, ImplicitOps, ImplicitOps, 0,
ImplicitOps, ImplicitOps, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x40 - 0x47 */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 34b700f..16c9048 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -571,6 +571,7 @@ static void init_vmcb(struct vcpu_svm *svm)
control->intercept = (1ULL << INTERCEPT_INTR) |
(1ULL << INTERCEPT_NMI) |
(1ULL << INTERCEPT_SMI) |
+ (1ULL << INTERCEPT_RDTSC) |
(1ULL << INTERCEPT_CPUID) |
(1ULL << INTERCEPT_INVD) |
(1ULL << INTERCEPT_HLT) |
@@ -2029,6 +2030,19 @@ static int task_switch_interception(struct vcpu_svm *svm)
return kvm_task_switch(&svm->vcpu, tss_selector, reason);
}
+static int rdtsc_interception(struct vcpu_svm *svm)
+{
+ u64 tsc;
+
+ rdtscll(tsc);
+ tsc += svm->vmcb->control.tsc_offset;
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, tsc & 0xffffffff);
+ tsc >>= 32;
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, tsc & 0xffffffff);
+ skip_emulated_instruction(&svm->vcpu);
+ return 1;
+}
+
static int cpuid_interception(struct vcpu_svm *svm)
{
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
@@ -2326,6 +2340,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_INIT] = nop_on_interception,
[SVM_EXIT_VINTR] = interrupt_window_interception,
/* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
+ [SVM_EXIT_RDTSC] = rdtsc_interception,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
--
1.6.5.2
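As a reminder of the convention rdtsc_interception() above follows, RDTSC returns the low 32 bits of the counter in EAX and the high 32 bits in EDX. A tiny standalone demonstration of that split on a made-up value (plain C, not KVM code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t tsc = 0x1122334455667788ULL;		/* made-up guest TSC */
	uint32_t eax = (uint32_t)(tsc & 0xffffffff);	/* low half, as written to RAX */
	uint32_t edx = (uint32_t)(tsc >> 32);		/* high half, as written to RDX */

	printf("EAX=%08x EDX=%08x\n", eax, edx);	/* 55667788 / 11223344 */
	return 0;
}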
KVM now maintains a reference TSC, which can be exported to the
individual backends for private use.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 25 +++++++++++++++++++++++++
1 files changed, 25 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 144a820..170fd5d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -742,6 +742,7 @@ static DEFINE_PER_CPU(int, cpu_tsc_shift);
static DEFINE_PER_CPU(s64, cpu_tsc_offset);
static DEFINE_PER_CPU(u64, cpu_tsc_measure_base);
static DEFINE_PER_CPU(atomic_t, cpu_tsc_synchronized);
+static DEFINE_PER_CPU(int, cpu_tsc_generation);
static int tsc_base_cpu = -1;
static unsigned long ref_tsc_khz;
@@ -813,6 +814,29 @@ static inline u64 compute_ref_tsc(int cpu)
return tsc + per_cpu(cpu_tsc_offset, cpu);
}
+/*
+ * kvm_get_ref_tsc returns a reference TSC value which we attempt to keep
+ * stable across all CPUs, regardless of frequency shift, CPU hotplug, or
+ * pre-existing TSC offsets.
+ */
+u64 kvm_get_ref_tsc(void)
+{
+ int cpu, gen;
+ u64 tsc;
+
+ cpu = get_cpu();
+again:
+ gen = per_cpu(cpu_tsc_generation, cpu);
+ smp_rmb();
+ tsc = compute_ref_tsc(cpu);
+ smp_rmb();
+ if (unlikely(gen != per_cpu(cpu_tsc_generation, cpu)))
+ goto again;
+ put_cpu();
+ return tsc;
+}
+EXPORT_SYMBOL_GPL(kvm_get_ref_tsc);
+
#define SYNC_TRIES 64
/*
@@ -929,6 +953,7 @@ static void kvm_sync_tsc(void *cpup)
accumulator -= delta[i+SYNC_TRIES];
accumulator = accumulator / (SYNC_TRIES*2-12);
per_cpu(cpu_tsc_offset, new_cpu) = accumulator;
+ ++per_cpu(cpu_tsc_generation, new_cpu);
atomic_set(&per_cpu(cpu_tsc_synchronized, new_cpu), 1);
pr_debug("%s: OUT, cpu = %d, cpu_tsc_offset = %lld, cpu_tsc_multiplier=%ld, cpu_tsc_shift=%d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_offset, new_cpu), per_cpu(cpu_tsc_multiplier, new_cpu), per_cpu(cpu_tsc_shift, new_cpu));
}
--
1.6.5.2
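The read side added above is essentially a hand-rolled, single-writer generation check: the writer bumps cpu_tsc_generation around an offset update, and readers retry if the generation moved underneath them. A minimal standalone sketch of that pattern, with illustrative names and GCC's __sync_synchronize() standing in for the kernel barriers:

struct clock_state {
	int generation;			/* bumped by the (single) writer */
	unsigned long long offset;	/* parameter the writer updates */
};

static unsigned long long read_clock(volatile struct clock_state *cs,
				     unsigned long long raw)
{
	int gen;
	unsigned long long val;

	do {
		gen = cs->generation;
		__sync_synchronize();		/* pairs with the writer's barrier */
		val = raw + cs->offset;
		__sync_synchronize();
	} while (gen != cs->generation);	/* an update raced us: retry */

	return val;
}

The kernel version additionally pins itself with get_cpu()/put_cpu() so the per-cpu state it reads stays its own.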
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/kvm_timer.h | 19 ++++++++++++++++
arch/x86/kvm/svm.c | 45 +++++++++++++--------------------------
arch/x86/kvm/x86.c | 2 +-
4 files changed, 36 insertions(+), 31 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 758ffc2..26acb4c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -275,6 +275,7 @@ struct kvm_mmu {
struct kvm_vcpu_arch {
u64 host_tsc;
+ u64 tsc_msr_offset;
/*
* rip and regs accesses must go through
* kvm_{register,rip}_{read,write} functions.
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 55c7524..3863161 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -16,3 +16,22 @@ struct kvm_timer_ops {
enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
+u64 kvm_get_ref_tsc(void);
+
+static inline u64 kvm_get_elapsed_tsc(struct kvm *kvm)
+{
+ return kvm_get_ref_tsc() - kvm->arch.vm_init_tsc;
+}
+
+static inline u64 kvm_get_cpu_tsc(struct kvm_vcpu *vcpu)
+{
+ return kvm_get_elapsed_tsc(vcpu->kvm) + vcpu->arch.tsc_msr_offset;
+}
+
+static inline void kvm_set_cpu_tsc(struct kvm_vcpu *vcpu, u64 data)
+{
+ u64 tsc_offset;
+
+ tsc_offset = data - kvm_get_cpu_tsc(vcpu);
+ vcpu->arch.tsc_msr_offset += tsc_offset;
+}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 16c9048..29e88e5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -766,16 +766,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
int i;
if (unlikely(cpu != vcpu->cpu)) {
- u64 delta;
-
- /*
- * Make sure that the guest sees a monotonically
- * increasing TSC.
- */
- delta = vcpu->arch.host_tsc - native_read_tsc();
- svm->vmcb->control.tsc_offset += delta;
- if (is_nested(svm))
- svm->nested.hsave->control.tsc_offset += delta;
vcpu->cpu = cpu;
kvm_migrate_timers(vcpu);
svm->asid_generation = 0;
@@ -1826,7 +1816,6 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
svm->vmcb->control.int_state = nested_vmcb->control.int_state;
- svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
@@ -2030,12 +2019,20 @@ static int task_switch_interception(struct vcpu_svm *svm)
return kvm_task_switch(&svm->vcpu, tss_selector, reason);
}
-static int rdtsc_interception(struct vcpu_svm *svm)
+static u64 get_tsc(struct vcpu_svm *svm)
{
u64 tsc;
- rdtscll(tsc);
- tsc += svm->vmcb->control.tsc_offset;
+ tsc = kvm_get_cpu_tsc(&svm->vcpu);
+ if (is_nested(svm))
+ tsc += svm->nested.hsave->control.tsc_offset;
+
+ return tsc;
+}
+
+static int rdtsc_interception(struct vcpu_svm *svm)
+{
+ u64 tsc = get_tsc(svm);
kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, tsc & 0xffffffff);
tsc >>= 32;
kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, tsc & 0xffffffff);
@@ -2095,14 +2092,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
switch (ecx) {
case MSR_IA32_TSC: {
- u64 tsc_offset;
-
- if (is_nested(svm))
- tsc_offset = svm->nested.hsave->control.tsc_offset;
- else
- tsc_offset = svm->vmcb->control.tsc_offset;
-
- *data = tsc_offset + native_read_tsc();
+ *data = get_tsc(svm);
break;
}
case MSR_K6_STAR:
@@ -2188,17 +2178,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
switch (ecx) {
case MSR_IA32_TSC: {
- u64 tsc_offset = data - native_read_tsc();
- u64 g_tsc_offset = 0;
-
if (is_nested(svm)) {
- g_tsc_offset = svm->vmcb->control.tsc_offset -
- svm->nested.hsave->control.tsc_offset;
+ u64 tsc_offset = data - kvm_get_cpu_tsc(vcpu);
svm->nested.hsave->control.tsc_offset = tsc_offset;
+ } else {
+ kvm_set_cpu_tsc(vcpu, data);
}
-
- svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
-
break;
}
case MSR_K6_STAR:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 170fd5d..cb323f7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5402,7 +5402,7 @@ struct kvm *kvm_arch_create_vm(void)
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
- rdtscll(kvm->arch.vm_init_tsc);
+ kvm->arch.vm_init_tsc = kvm_get_ref_tsc();
return kvm;
}
--
1.6.5.2
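The bookkeeping the new kvm_timer.h helpers implement is: guest TSC = (reference TSC - vm_init_tsc) + tsc_msr_offset, and a guest write to MSR_IA32_TSC adjusts tsc_msr_offset so that an immediate read returns the written value. A standalone sketch with invented numbers and illustrative names (plain C, not the KVM structures):

#include <stdio.h>
#include <stdint.h>

static uint64_t ref_tsc;	/* stand-in for kvm_get_ref_tsc() */
static uint64_t vm_init_tsc;	/* reference value sampled at VM creation */
static int64_t tsc_msr_offset;	/* per-vcpu guest-visible adjustment */

static uint64_t get_cpu_tsc(void)
{
	return (ref_tsc - vm_init_tsc) + tsc_msr_offset;
}

static void set_cpu_tsc(uint64_t data)
{
	/* choose the offset so a read right now returns 'data' */
	tsc_msr_offset = (int64_t)data - (int64_t)(ref_tsc - vm_init_tsc);
}

int main(void)
{
	vm_init_tsc = 1000;
	ref_tsc = 5000;			/* 4000 reference cycles have elapsed */
	printf("%llu\n", (unsigned long long)get_cpu_tsc());	/* 4000 */

	set_cpu_tsc(123456);		/* guest rewrites its TSC */
	printf("%llu\n", (unsigned long long)get_cpu_tsc());	/* 123456 */

	ref_tsc += 100;			/* reference advances */
	printf("%llu\n", (unsigned long long)get_cpu_tsc());	/* 123556 */
	return 0;
}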
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 4 ++++
arch/x86/kvm/svm.c | 1 +
arch/x86/kvm/x86.c | 3 +++
include/linux/kvm_host.h | 12 ++++++++++++
virt/kvm/kvm_main.c | 18 ++++++++++++++++--
5 files changed, 36 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 26acb4c..8e4d606 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -421,6 +421,9 @@ struct kvm_arch{
struct kvm_xen_hvm_config xen_hvm_config;
};
+#define KVM_ARCH_STAT \
+ u32 tsc_resync;
+
struct kvm_vm_stat {
u32 mmu_shadow_zapped;
u32 mmu_pte_write;
@@ -450,6 +453,7 @@ struct kvm_vcpu_stat {
u32 halt_wakeup;
u32 request_irq_exits;
u32 irq_exits;
+ u32 rdtsc_exits;
u32 host_state_reload;
u32 efer_reload;
u32 fpu_reload;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 29e88e5..91eb263 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2036,6 +2036,7 @@ static int rdtsc_interception(struct vcpu_svm *svm)
kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, tsc & 0xffffffff);
tsc >>= 32;
kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, tsc & 0xffffffff);
+ ++svm->vcpu.stat.rdtsc_exits;
skip_emulated_instruction(&svm->vcpu);
return 1;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cb323f7..520ea6a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -129,6 +129,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
+ { "rdtsc_exits", VCPU_STAT(rdtsc_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
{ "efer_reload", VCPU_STAT(efer_reload) },
{ "fpu_reload", VCPU_STAT(fpu_reload) },
@@ -146,6 +147,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
+ { "tsc_resync", KVM_STAT(tsc_resync) },
{ NULL }
};
@@ -977,6 +979,7 @@ static void kvm_do_sync_tsc(int cpu)
smp_call_function_single(tsc_base_cpu, kvm_sync_tsc,
(void *)&cpu, 1);
}
+ ++kvm_stats.tsc_resync;
out_unlock:
spin_unlock(&kvm_tsc_lock);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bd5a616..dc5e1d6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -503,6 +503,7 @@ static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
enum kvm_stat_kind {
KVM_STAT_VM,
KVM_STAT_VCPU,
+ KVM_STAT_GLOBAL
};
struct kvm_stats_debugfs_item {
@@ -514,6 +515,17 @@ struct kvm_stats_debugfs_item {
extern struct kvm_stats_debugfs_item debugfs_entries[];
extern struct dentry *kvm_debugfs_dir;
+#ifndef KVM_ARCH_STAT
+#define KVM_ARCH_STAT
+#endif
+
+struct kvm_stat {
+ KVM_ARCH_STAT
+};
+
+extern struct kvm_stat kvm_stats;
+#define KVM_STAT(x) offsetof(struct kvm_stat, x), KVM_STAT_GLOBAL
+
#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
{
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bd44fb4..6fc54a5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1971,9 +1971,23 @@ static int vcpu_stat_get(void *_offset, u64 *val)
DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
+struct kvm_stat kvm_stats;
+EXPORT_SYMBOL_GPL(kvm_stats);
+
+static int kvm_stat_get(void *_offset, u64 *val)
+{
+ unsigned offset = (long)_offset;
+
+ *val = *(u32 *)((void *)&kvm_stats + offset);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(kvm_stat_fops, kvm_stat_get, NULL, "%llu\n");
+
static const struct file_operations *stat_fops[] = {
- [KVM_STAT_VCPU] = &vcpu_stat_fops,
- [KVM_STAT_VM] = &vm_stat_fops,
+ [KVM_STAT_VCPU] = &vcpu_stat_fops,
+ [KVM_STAT_VM] = &vm_stat_fops,
+ [KVM_STAT_GLOBAL] = &kvm_stat_fops,
};
static void kvm_init_debug(void)
--
1.6.5.2
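The debugfs plumbing above works because KVM_STAT(x) expands to the byte offset of x inside struct kvm_stat together with the KVM_STAT_GLOBAL kind, and kvm_stat_get() adds that offset back onto &kvm_stats to locate the counter. A standalone sketch of the offsetof() trick with an illustrative struct (plain C):

#include <stdio.h>
#include <stddef.h>

struct stats {
	unsigned int tsc_resync;
	unsigned int other;
};

static struct stats g_stats = { 42, 7 };

/* Read a counter given only its byte offset, as kvm_stat_get() does. */
static unsigned int stat_read(size_t offset)
{
	return *(unsigned int *)((char *)&g_stats + offset);
}

int main(void)
{
	printf("%u\n", stat_read(offsetof(struct stats, tsc_resync)));	/* 42 */
	printf("%u\n", stat_read(offsetof(struct stats, other)));	/* 7 */
	return 0;
}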
If CPU frequency governors can change the TSC frequency, use
the highest possible frequency as the reference.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 9 ++++++++-
1 files changed, 8 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 520ea6a..3c4266f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3701,7 +3701,15 @@ static void kvm_timer_init(void)
* can change at any time.
*/
register_cpu_notifier(&kvm_x86_cpu_notifier);
+ ref_tsc_khz = tsc_khz;
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+#ifdef CONFIG_CPU_FREQ
+ struct cpufreq_policy policy;
+
+ cpufreq_get_policy(&policy, get_cpu());
+ ref_tsc_khz = policy.cpuinfo.max_freq;
+ put_cpu();
+#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
for_each_online_cpu(cpu) {
@@ -3718,7 +3726,6 @@ static void kvm_timer_init(void)
per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
}
tsc_base_cpu = get_cpu();
- ref_tsc_khz = per_cpu(cpu_tsc_khz, tsc_base_cpu);
resync(NULL);
put_cpu();
}
--
1.6.5.2
Use per-cpu delta counters and statistically eliminate outliers which
might happen due to platform anomalies such as NMI.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 100 +++++++++++++++++++++++++++++++++++++---------------
1 files changed, 71 insertions(+), 29 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c4266f..c66dede 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -840,6 +840,11 @@ again:
EXPORT_SYMBOL_GPL(kvm_get_ref_tsc);
#define SYNC_TRIES 64
+struct cpu_delta_array {
+ s64 delta[SYNC_TRIES];
+};
+static DEFINE_PER_CPU(struct cpu_delta_array, delta_array);
+
/*
* sync_tsc_helper is a dual-entry coroutine meant to be run by only
@@ -849,25 +854,16 @@ EXPORT_SYMBOL_GPL(kvm_get_ref_tsc);
*
* To discount cache latency effects, this routine will be called
* twice, once with the measure / recording CPUs reversed. In addition,
- * the first 4 and last 2 results will be discarded to allow branch
- * prediction to become established (and to discount effects from
- * a potentially early predicted loop exit).
+ * the first two and the last result will be discarded to allow branch
+ * prediction to become established (and to discount effects from a
+ * potentially early predicted loop exit).
*
- * Because of this, we must be extra careful to guard the entrance
- * and exit against the CPU switch. I chose to use atomic instructions
- * only at the end of the measure loop and use the same routine for
- * both CPUs, with symmetric comparisons, and a statically placed
- * recording array, hopefully maximizing the branch prediction and
- * cache locality. The results appear quite good; on known to be
- * synchronized CPUs, I typically get < 10 TSC delta measured, with
- * maximum observed error on the order of 100 cycles.
- *
- * This doesn't account for NUMA cache effects, and could potentially
- * be improved there by moving the delta[] array to the stack of the
- * measuring CPU. In fact, this modification might be worth trying
- * for non-NUMA systems as well, but this appears to be fine for now.
+ * We allocate the delta array as a per-cpu variable to get local cache
+ * allocation, and also take steps to statistically ignore outliers which
+ * might be caused by SMIs. It may seem like overkill, but it is very
+ * accurate, which is what we're aiming for.
*/
-static void sync_tsc_helper(int measure_cpu, u64 *delta, atomic_t *ready)
+static void sync_tsc_helper(int measure_cpu, s64 *delta, atomic_t *ready)
{
int tries;
static u64 tsc_other;
@@ -915,12 +911,59 @@ static void sync_tsc_helper(int measure_cpu, u64 *delta, atomic_t *ready)
mb();
}
+/*
+ * Average and trim the samples of any outliers; we use > 2 x sigma
+ */
+static s64 average_samples(s64 *samples, unsigned num_samples)
+{
+ unsigned i, j;
+ s64 mean;
+ u64 stddev;
+ s64 average;
+
+ /* First get the mean */
+ mean = 0;
+ for (i = 0; i < num_samples; i++)
+ mean += samples[i];
+ mean = mean / num_samples;
+
+ /* Now the deviation */
+ stddev = 0;
+ for (i = 0; i < num_samples; i++) {
+ s64 dev = samples[i] - mean;
+ stddev += dev * dev;
+ }
+ stddev = stddev / (num_samples - 1);
+ stddev = int_sqrt(stddev);
+
+ /* Throw out anything outside 2x stddev */
+ stddev <<= 1;
+ average = 0, j = 0;
+ for (i = 0; i < num_samples; i++) {
+ s64 dev = samples[i] - mean;
+ if (dev < 0)
+ dev = -dev;
+ if (dev <= stddev) {
+ average += samples[i];
+ j++;
+ }
+ }
+ if (j > 0)
+ average = average / j;
+ else
+ printk(KERN_ERR "kvm: TSC samples failed to converge!\n");
+ pr_debug("%s: mean = %lld, stddev = %llu, average = %lld\n",
+ __func__, mean, stddev, average);
+
+ return average;
+}
+
static void kvm_sync_tsc(void *cpup)
{
int new_cpu = *(int *)cpup;
unsigned long flags;
- static s64 delta[SYNC_TRIES*2];
- static atomic_t ready = ATOMIC_INIT(1);
+ s64 *delta1, *delta2;
+ static atomic_t ready ____cacheline_aligned = ATOMIC_INIT(1);
BUG_ON(tsc_base_cpu == -1);
pr_debug("%s: IN, cpu = %d, freq = %ldkHz, tsc_base_cpu = %d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_khz, raw_smp_processor_id()) , tsc_base_cpu);
@@ -933,27 +976,26 @@ static void kvm_sync_tsc(void *cpup)
&per_cpu(cpu_tsc_multiplier, new_cpu),
&per_cpu(cpu_tsc_shift, new_cpu));
}
- sync_tsc_helper(tsc_base_cpu, delta, &ready);
- sync_tsc_helper(new_cpu, &delta[SYNC_TRIES], &ready);
+ delta1 = per_cpu(delta_array, tsc_base_cpu).delta;
+ delta2 = per_cpu(delta_array, new_cpu).delta;
+ sync_tsc_helper(tsc_base_cpu, delta1, &ready);
+ sync_tsc_helper(new_cpu, delta2, &ready);
if (raw_smp_processor_id() == new_cpu) {
- int i;
s64 accumulator = 0;
/*
- * accumulate [SYNC_TRIES+4,-2) of tsc{base} - tsc{new}
- * subtract [SYNC_TRIES+4,-2) of tsc{new} - tsc{base}
+ * average [2,SYNC_TRIES-1) of tsc{base} - tsc{new}
+ * subtract the average of [2,SYNC_TRIES-1) of tsc{new} - tsc{base}
*
* this allows instruction cycle and cache differences to
* cancel each other out and drops warm up/cool down variation
*
* Note the arithmetic must be signed because of the divide
*/
+ accumulator += average_samples(&delta1[2], SYNC_TRIES-3);
+ accumulator -= average_samples(&delta2[2], SYNC_TRIES-3);
+ accumulator /= 2;
- for (i = 4; i < SYNC_TRIES - 2; i++)
- accumulator += delta[i];
- for (i = 4; i < SYNC_TRIES - 2; i++)
- accumulator -= delta[i+SYNC_TRIES];
- accumulator = accumulator / (SYNC_TRIES*2-12);
per_cpu(cpu_tsc_offset, new_cpu) = accumulator;
++per_cpu(cpu_tsc_generation, new_cpu);
atomic_set(&per_cpu(cpu_tsc_synchronized, new_cpu), 1);
--
1.6.5.2
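The filter in average_samples() above is: compute the mean, compute the sample standard deviation, then re-average keeping only values within 2 sigma of the mean. A standalone userspace sketch of the same filter, using floating point where the kernel uses int_sqrt(), with invented delta values:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

static long long trimmed_average(const long long *s, int n)
{
	long long mean = 0, avg = 0;
	double var = 0.0, limit;
	int i, kept = 0;

	for (i = 0; i < n; i++)
		mean += s[i];
	mean /= n;

	for (i = 0; i < n; i++)
		var += (double)(s[i] - mean) * (double)(s[i] - mean);
	limit = 2.0 * sqrt(var / (n - 1));	/* 2 x sample standard deviation */

	for (i = 0; i < n; i++) {
		if (llabs(s[i] - mean) <= limit) {
			avg += s[i];
			kept++;
		}
	}
	return kept ? avg / kept : mean;
}

int main(void)
{
	/* mostly consistent deltas plus one SMI-like outlier */
	long long d[] = { 12, 9, 11, 10, 13, 10, 480, 11 };

	printf("%lld\n", trimmed_average(d, 8));	/* prints 10 */
	return 0;
}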
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 14 +++++++++++---
1 files changed, 11 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c66dede..8bd362b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -914,6 +914,7 @@ static void sync_tsc_helper(int measure_cpu, s64 *delta, atomic_t *ready)
/*
* Average and trim the samples of any outliers; we use > 2 x sigma
*/
+static u64 tsc_deviation;
static s64 average_samples(s64 *samples, unsigned num_samples)
{
unsigned i, j;
@@ -936,6 +937,11 @@ static s64 average_samples(s64 *samples, unsigned num_samples)
stddev = stddev / (num_samples - 1);
stddev = int_sqrt(stddev);
+ /* Accumulate deviation measurement into a moving average */
+ tsc_deviation = tsc_deviation ?
+ ((tsc_deviation << 4) - tsc_deviation + stddev) >> 4 :
+ stddev;
+
/* Throw out anything outside 2x stddev */
stddev <<= 1;
average = 0, j = 0;
@@ -950,10 +956,12 @@ static s64 average_samples(s64 *samples, unsigned num_samples)
}
if (j > 0)
average = average / j;
- else
+ else {
printk(KERN_ERR "kvm: TSC samples failed to converge!\n");
- pr_debug("%s: mean = %lld, stddev = %llu, average = %lld\n",
- __func__, mean, stddev, average);
+ average = mean;
+ }
+ pr_debug("%s: mean=%lld, stddev=%llu, average=%lld hdev=%llu\n",
+ __func__, mean, stddev, average, tsc_deviation);
return average;
}
--
1.6.5.2
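The update rule added above, ((dev << 4) - dev + stddev) >> 4, is the integer form of an exponential moving average that weights the newest sample by 1/16: new = (15 * old + sample) / 16. A tiny standalone sketch with invented sample values:

#include <stdio.h>

/* Same update rule as the tsc_deviation accumulator above. */
static unsigned long long ewma16(unsigned long long avg, unsigned long long sample)
{
	return avg ? ((avg << 4) - avg + sample) >> 4 : sample;
}

int main(void)
{
	unsigned long long dev = 0;
	unsigned long long samples[] = { 100, 100, 100, 500, 100 };
	int i;

	for (i = 0; i < 5; i++) {
		dev = ewma16(dev, samples[i]);
		printf("sample=%llu avg=%llu\n", samples[i], dev);	/* avg ends at 123 */
	}
	return 0;
}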
There are enough of these per-CPU variables floating around to justify
using a structure to keep them in one place.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 110 ++++++++++++++++++++++++++++++----------------------
1 files changed, 64 insertions(+), 46 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8bd362b..7bc20ac 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -738,19 +738,24 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
}
DEFINE_SPINLOCK(kvm_tsc_lock);
-static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
-static DEFINE_PER_CPU(unsigned long, cpu_tsc_multiplier);
-static DEFINE_PER_CPU(int, cpu_tsc_shift);
-static DEFINE_PER_CPU(s64, cpu_tsc_offset);
-static DEFINE_PER_CPU(u64, cpu_tsc_measure_base);
-static DEFINE_PER_CPU(atomic_t, cpu_tsc_synchronized);
-static DEFINE_PER_CPU(int, cpu_tsc_generation);
+struct cpu_tsc_vars
+{
+ unsigned long tsc_khz;
+ unsigned long tsc_multiplier;
+ unsigned int tsc_shift;
+ int tsc_generation;
+ s64 tsc_offset;
+ u64 tsc_measure_base;
+ atomic_t tsc_synchronized;
+};
+static DEFINE_PER_CPU(struct cpu_tsc_vars, cpu_tsc_vars);
+
static int tsc_base_cpu = -1;
static unsigned long ref_tsc_khz;
static inline int cpu_is_tsc_synchronized(int cpu)
{
- return (atomic_read(&per_cpu(cpu_tsc_synchronized, cpu)) != 0);
+ return (atomic_read(&per_cpu(cpu_tsc_vars, cpu).tsc_synchronized) != 0);
}
static inline unsigned long div_precise(unsigned long hi, unsigned long lo,
@@ -808,12 +813,12 @@ static inline unsigned long mult_precise(unsigned long val, unsigned long mult,
return bot;
}
-static inline u64 compute_ref_tsc(int cpu)
+static inline u64 compute_ref_tsc(void)
{
- u64 tsc = native_read_tsc() - per_cpu(cpu_tsc_measure_base, cpu);
- tsc = mult_precise(tsc, per_cpu(cpu_tsc_multiplier, cpu),
- per_cpu(cpu_tsc_shift, cpu));
- return tsc + per_cpu(cpu_tsc_offset, cpu);
+ struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
+ u64 tsc = native_read_tsc() - cv->tsc_measure_base;
+ tsc = mult_precise(tsc, cv->tsc_multiplier, cv->tsc_shift);
+ return tsc + cv->tsc_offset;
}
/*
@@ -824,15 +829,17 @@ static inline u64 compute_ref_tsc(int cpu)
u64 kvm_get_ref_tsc(void)
{
int cpu, gen;
+ struct cpu_tsc_vars *cv;
u64 tsc;
cpu = get_cpu();
+ cv = &per_cpu(cpu_tsc_vars, cpu);
again:
- gen = per_cpu(cpu_tsc_generation, cpu);
+ gen = cv->tsc_generation;
smp_rmb();
- tsc = compute_ref_tsc(cpu);
+ tsc = compute_ref_tsc();
smp_rmb();
- if (unlikely(gen != per_cpu(cpu_tsc_generation, cpu)))
+ if (unlikely(gen != cv->tsc_generation))
goto again;
put_cpu();
return tsc;
@@ -889,7 +896,7 @@ static void sync_tsc_helper(int measure_cpu, s64 *delta, atomic_t *ready)
/* wait */;
}
native_cpuid(&junk, &junk, &junk, &junk);
- tsc = compute_ref_tsc(cpu);
+ tsc = compute_ref_tsc();
rdtsc_barrier();
if (cpu == measure_cpu) {
while (!atomic_read(ready))
@@ -972,17 +979,18 @@ static void kvm_sync_tsc(void *cpup)
unsigned long flags;
s64 *delta1, *delta2;
static atomic_t ready ____cacheline_aligned = ATOMIC_INIT(1);
+ struct cpu_tsc_vars *cv = &per_cpu(cpu_tsc_vars, new_cpu);
BUG_ON(tsc_base_cpu == -1);
- pr_debug("%s: IN, cpu = %d, freq = %ldkHz, tsc_base_cpu = %d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_khz, raw_smp_processor_id()) , tsc_base_cpu);
local_irq_save(flags);
if (raw_smp_processor_id() == new_cpu) {
- per_cpu(cpu_tsc_measure_base, new_cpu) = native_read_tsc();
- per_cpu(cpu_tsc_offset, new_cpu) = 0;
- compute_best_multiplier(ref_tsc_khz,
- per_cpu(cpu_tsc_khz, new_cpu),
- &per_cpu(cpu_tsc_multiplier, new_cpu),
- &per_cpu(cpu_tsc_shift, new_cpu));
+ cv->tsc_measure_base = native_read_tsc();
+ cv->tsc_offset = 0;
+ compute_best_multiplier(ref_tsc_khz, cv->tsc_khz,
+ &cv->tsc_multiplier, &cv->tsc_shift);
+ pr_debug("%s: IN, cpu = %d, freq = %ldkHz, measure_base = %lld,"
+ " tsc_base_cpu = %d\n", __func__, new_cpu, cv->tsc_khz,
+ cv->tsc_measure_base, tsc_base_cpu);
}
delta1 = per_cpu(delta_array, tsc_base_cpu).delta;
delta2 = per_cpu(delta_array, new_cpu).delta;
@@ -1004,22 +1012,32 @@ static void kvm_sync_tsc(void *cpup)
accumulator -= average_samples(&delta2[2], SYNC_TRIES-3);
accumulator /= 2;
- per_cpu(cpu_tsc_offset, new_cpu) = accumulator;
- ++per_cpu(cpu_tsc_generation, new_cpu);
- atomic_set(&per_cpu(cpu_tsc_synchronized, new_cpu), 1);
- pr_debug("%s: OUT, cpu = %d, cpu_tsc_offset = %lld, cpu_tsc_multiplier=%ld, cpu_tsc_shift=%d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_offset, new_cpu), per_cpu(cpu_tsc_multiplier, new_cpu), per_cpu(cpu_tsc_shift, new_cpu));
+ cv->tsc_offset = accumulator;
+ smp_wmb();
+ ++cv->tsc_generation;
+ atomic_set(&cv->tsc_synchronized, 1);
+ ++kvm_stats.tsc_resync;
+ pr_debug("%s: OUT, cpu = %d, delta = %lld, cpu_tsc_offset = "
+ "%lld, cpu_tsc_multiplier=%ld, cpu_tsc_shift=%d\n",
+ __func__, new_cpu, accumulator, cv->tsc_offset,
+ cv->tsc_multiplier, cv->tsc_shift);
}
local_irq_restore(flags);
}
static void kvm_do_sync_tsc(int cpu)
{
+ BUG_ON(tsc_base_cpu == -1);
spin_lock(&kvm_tsc_lock);
/* tsc_base_cpu can change without tsc_lock, so recheck */
if (unlikely(cpu == tsc_base_cpu))
goto out_unlock;
+ /*
+ * We're calling a co-routine; if we're one of the called CPUs, we
+ * must call ourselves last.
+ */
if (raw_smp_processor_id() != tsc_base_cpu) {
smp_call_function_single(tsc_base_cpu, kvm_sync_tsc,
(void *)&cpu, 0);
@@ -1046,12 +1064,12 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
if ((!vcpu->time_page))
return;
- this_tsc_khz = get_cpu_var(cpu_tsc_khz);
+ this_tsc_khz = get_cpu_var(cpu_tsc_vars).tsc_khz;
if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
vcpu->hv_clock_tsc_khz = this_tsc_khz;
}
- put_cpu_var(cpu_tsc_khz);
+ put_cpu_var(cpu_tsc_vars);
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -3572,18 +3590,16 @@ static void resync_user(struct work_struct *work)
static void resync(void *info)
{
- int cpu;
+ struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
u64 tsc;
/* Fixup our own values to stay in sync with the reference */
- cpu = raw_smp_processor_id();
- tsc = compute_ref_tsc(cpu);
- per_cpu(cpu_tsc_measure_base, cpu) = native_read_tsc();
- per_cpu(cpu_tsc_offset, cpu) = tsc;
- compute_best_multiplier(ref_tsc_khz, per_cpu(cpu_tsc_khz, cpu),
- &per_cpu(cpu_tsc_multiplier, cpu),
- &per_cpu(cpu_tsc_shift, cpu));
- atomic_set(&per_cpu(cpu_tsc_synchronized, cpu), 1);
+ tsc = compute_ref_tsc();
+ cv->tsc_measure_base = native_read_tsc();
+ cv->tsc_offset = tsc;
+ compute_best_multiplier(ref_tsc_khz, cv->tsc_khz, &cv->tsc_multiplier,
+ &cv->tsc_shift);
+ atomic_set(&cv->tsc_synchronized, 1);
/* Then, get everybody else on board */
if (!work_scheduled) {
@@ -3599,6 +3615,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
+ struct cpu_tsc_vars *cv = &per_cpu(cpu_tsc_vars, freq->cpu);
/*
* There is no way to precisely know the TSC value at which time the
@@ -3612,7 +3629,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
*/
if (val == CPUFREQ_PRECHANGE) {
get_online_cpus();
- atomic_set(&per_cpu(cpu_tsc_synchronized, freq->cpu), 0);
+ atomic_set(&cv->tsc_synchronized, 0);
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -3642,7 +3659,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
* Just use the kvm_tsc_lock for a mutex.
*/
spin_lock(&kvm_tsc_lock);
- per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
+ cv->tsc_khz = freq->new;
spin_unlock(&kvm_tsc_lock);
return 0;
@@ -3691,12 +3708,13 @@ static int kvm_x86_cpu_hotplug(struct notifier_block *notifier,
case CPU_DYING:
case CPU_UP_CANCELED:
- atomic_set(&per_cpu(cpu_tsc_synchronized, cpu), 0);
+ atomic_set(&per_cpu(cpu_tsc_vars, cpu).tsc_synchronized, 0);
break;
case CPU_ONLINE:
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
- per_cpu(cpu_tsc_khz, cpu) = cpufreq_quick_get(cpu);
+ per_cpu(cpu_tsc_vars, cpu).tsc_khz =
+ cpufreq_quick_get(cpu);
kvm_do_sync_tsc(cpu);
break;
}
@@ -3767,13 +3785,13 @@ static void kvm_timer_init(void)
if (!khz)
khz = tsc_khz;
spin_lock(&kvm_tsc_lock);
- if (!per_cpu(cpu_tsc_khz, cpu))
- per_cpu(cpu_tsc_khz, cpu) = khz;
+ if (!per_cpu(cpu_tsc_vars, cpu).tsc_khz)
+ per_cpu(cpu_tsc_vars, cpu).tsc_khz = khz;
spin_unlock(&kvm_tsc_lock);
}
} else {
for_each_possible_cpu(cpu)
- per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ per_cpu(cpu_tsc_vars, cpu).tsc_khz = tsc_khz;
}
tsc_base_cpu = get_cpu();
resync(NULL);
--
1.6.5.2
First, we could get multiple CPU frequency updates in rapid succession.
If the base CPU changes, we must resynchronize all CPUs, but the work
scheduling was poorly serialized, which could result in some CPUs
missing the resynchronize action.
Second, the work function doesn't protect against CPU hotplug, and
further, it actually can't; we would have to hold off CPU
hotplug across the IPIs.
Third, we could potentially interrupt a call to get the reference
clock on the base CPU with a resync IPI, resulting in a skewed return
value in the original call.
The problem stems from the fact that any random CPU may come along and
change the tsc_khz variable of another; at the same time, we
resynchronize through an IPI, which requires us to abstain from using
higher level mutex primitives. Combined with the fact that without
locking, we can't guarantee the tsc_base_cpu is stable, this makes most
locking schemes untenable.
So, I fix the flaw by creating these rules:
1) per_cpu tsc_khz variable is updated only on the local CPU
2) tsc_base_cpu is changed only during CPU takedown
Now, we can avoid CPU takedown by simply entering a preemption-disabled
state; CPU takedown is done as a stop_machine action, and thus can't
run concurrently with any preemption-disabled region. So, we can now
guarantee tsc_base_cpu is stable with just get_cpu().
Further, since all tsc_khz updates are local, we can avoid tsc_khz
changes by blocking interrupts. This isn't used for the general
kvm_get_ref_tsc(), which doesn't need to use such a heavy primitive,
but it is useful when resynchronizing to the master; the evict()
callout used to find how far the slave CPUs have advanced since
the master frequency change must not be interrupted by
another frequency change (such a change is likely, as the
callouts to notifier chains for the CPU frequency change can happen
in the order {PRE|BASE},{PRE|NON-BASE}; the first callout will try
to schedule resync_all as work to be done). Using the fact that the
tsc_khz update only happens from an interrupt also allows us to
simplify the generation update; no intermediate generations can ever
be observed, thus we can simplify the generation count again.
Damn, this is complicated crap. The analogous task in real life would
be keeping a band of howler monkeys, each in their own tree, singing in
unison while the lead vocalist jumps from tree to tree, and meanwhile,
an unseen conductor keeps changing the tempo the piece is played at.
Thankfully, there are no key changes, however, occasionally new trees
sprout up at random and live ones fall over.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 139 ++++++++++++++++++++++++++++++++--------------------
1 files changed, 85 insertions(+), 54 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7bc20ac..7e2ba3e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -747,6 +747,7 @@ struct cpu_tsc_vars
s64 tsc_offset;
u64 tsc_measure_base;
atomic_t tsc_synchronized;
+ u64 last_ref;
};
static DEFINE_PER_CPU(struct cpu_tsc_vars, cpu_tsc_vars);
@@ -3570,42 +3571,83 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
-static void evict(void *info)
+/*
+ * evict(struct cpufreq_freqs *data)
+ *
+ * This function is meant to be called from an IPI and runs with
+ * interrupts disabled. It records the last good value of the
+ * reference TSC before optionally updating the CPU frequency and
+ * marking the CPU unsynchronized. Since it is entered from an
+ * interrupt handler, it can be used to bring CPUs out of hardware
+ * virtualization and have them wait until their clock has been
+ * resynchronized.
+ */
+static void evict(void *data)
{
- /* nothing */
+ struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
+ WARN_ON(!irqs_disabled());
+ cv->last_ref = compute_ref_tsc();
+ if (data) {
+ struct cpufreq_freqs *freq = data;
+ cv->tsc_khz = freq->new;
+ }
+ atomic_set(&cv->tsc_synchronized, 0);
}
-static struct execute_work resync_work;
-static int work_scheduled;
-
-static void resync_user(struct work_struct *work)
+static long resync(void *unused)
{
+ struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
+ u64 tsc = 0;
int cpu;
- work_scheduled = 0;
- for_each_online_cpu(cpu)
- if (cpu != tsc_base_cpu)
- kvm_do_sync_tsc(cpu);
-}
+ /*
+ * First, make sure we are on the right CPU; between when the work got
+ * scheduled and when this runs, tsc_base_cpu could have changed. We
+ * lock out changes to tsc_base_cpu with get_cpu(): CPU takedown can't
+ * run until all preemption-disabled sections are finished, so being in
+ * one protects the base from changing. Tricky.
+ */
+ cpu = get_cpu();
+ if (cpu != tsc_base_cpu) {
+ put_cpu();
+ return -1;
+ }
-static void resync(void *info)
-{
- struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
- u64 tsc;
+ /*
+ * When the base CPU frequency changes independently of other
+ * cores, the reference TSC can fall behind. Rather than
+ * introduce a complicated and fragile locking scheme for this
+ * rare case, simply evict all CPUs from VM execution and take
+ * the highest global reference clock. This also allows us to
+ * recover from skew events which slightly advance other clocks
+ * relative to the base.
+ */
+ on_each_cpu(evict, NULL, 1);
+ for_each_online_cpu(cpu)
+ if (per_cpu(cpu_tsc_vars, cpu).last_ref > tsc)
+ tsc = per_cpu(cpu_tsc_vars, cpu).last_ref;
/* Fixup our own values to stay in sync with the reference */
- tsc = compute_ref_tsc();
cv->tsc_measure_base = native_read_tsc();
cv->tsc_offset = tsc;
+ cv->tsc_generation++; /* XXX: needed? */
compute_best_multiplier(ref_tsc_khz, cv->tsc_khz, &cv->tsc_multiplier,
&cv->tsc_shift);
atomic_set(&cv->tsc_synchronized, 1);
- /* Then, get everybody else on board */
- if (!work_scheduled) {
- work_scheduled = 1;
- execute_in_process_context(resync_user, &resync_work);
- }
+ for_each_online_cpu(cpu)
+ kvm_do_sync_tsc(cpu);
+
+ put_cpu();
+ return 0;
+}
+
+static DEFINE_MUTEX(resync_lock);
+
+static void resync_all(void)
+{
+ mutex_lock(&resync_lock);
+ while (work_on_cpu(tsc_base_cpu, resync, NULL) != 0)
+ ; /* retry until resync() actually ran on the base CPU */
+ mutex_unlock(&resync_lock);
}
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -3614,8 +3656,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
struct cpufreq_freqs *freq = data;
struct kvm *kvm;
struct kvm_vcpu *vcpu;
- int i, send_ipi = 0;
- struct cpu_tsc_vars *cv = &per_cpu(cpu_tsc_vars, freq->cpu);
+ int i;
/*
* There is no way to precisely know the TSC value at which time the
@@ -3625,43 +3666,34 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
* any CPUs which may be running in hardware virtualization.
*
* We do this by setting cpu_tsc_synchronized to zero and polling for
- * this value to change when entering hardware virtualization.
+ * this value to change when entering hardware virtualization; this
+ * will happen in the call to evict(), which also stores the new frequency.
*/
if (val == CPUFREQ_PRECHANGE) {
- get_online_cpus();
- atomic_set(&cv->tsc_synchronized, 0);
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
continue;
- if (vcpu->cpu != smp_processor_id())
- send_ipi++;
kvm_request_guest_time_update(vcpu);
}
}
spin_unlock(&kvm_lock);
- if (send_ipi) {
- /*
- * In case we update the frequency for another cpu
- * (which might be in guest context) send an interrupt
- * to kick the cpu out of guest context. Next time
- * guest context is entered kvmclock will be updated,
- * so the guest will not see stale values.
- */
- smp_call_function_single(freq->cpu, evict, NULL, 1);
- }
/*
- * The update of the frequency can't happen while a VM
- * is running, nor can it happen during init when we can
- * race against the init code setting the first known freq.
- * Just use the vm_tsc_lock for a mutex.
+ * In case we update the frequency for another cpu (which might
+ * be in guest context) we must send an interrupt to kick the
+ * cpu out of guest context. Next time guest context is
+ * entered kvmclock will be updated, so the guest will not see
+ * stale values.
+ *
+ * The update of the frequency can't happen while a VM is
+ * running, nor while a resync might be in progress. To avoid the
+ * locking problems which arise otherwise, always update the tsc_khz
+ * value on the affected processor; this is done by passing the freq
+ * data to evict().
*/
- spin_lock(&kvm_tsc_lock);
- cv->tsc_khz = freq->new;
- spin_unlock(&kvm_tsc_lock);
-
+ smp_call_function_single(freq->cpu, evict, data, 1);
return 0;
}
@@ -3673,8 +3705,9 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
* order in which this notifier is called; this is harmless.
*/
if (val == CPUFREQ_POSTCHANGE) {
+ get_online_cpus();
if (freq->cpu == tsc_base_cpu)
- smp_call_function_single(freq->cpu, resync, NULL, 1);
+ resync_all();
else if (cpu_is_tsc_synchronized(tsc_base_cpu))
/* Can't do this if base is not yet updated */
kvm_do_sync_tsc(freq->cpu);
@@ -3694,19 +3727,17 @@ static int kvm_x86_cpu_hotplug(struct notifier_block *notifier,
val &= ~CPU_TASKS_FROZEN;
switch (val) {
- case CPU_DOWN_PREPARE:
+ case CPU_DYING:
if (cpu == tsc_base_cpu) {
int new_cpu;
- spin_lock(&kvm_tsc_lock);
for_each_online_cpu(new_cpu)
if (new_cpu != tsc_base_cpu)
break;
tsc_base_cpu = new_cpu;
- spin_unlock(&kvm_tsc_lock);
}
break;
- case CPU_DYING:
+ case CPU_DOWN_PREPARE:
case CPU_UP_CANCELED:
atomic_set(&per_cpu(cpu_tsc_vars, cpu).tsc_synchronized, 0);
break;
@@ -3784,18 +3815,18 @@ static void kvm_timer_init(void)
unsigned long khz = cpufreq_get(cpu);
if (!khz)
khz = tsc_khz;
- spin_lock(&kvm_tsc_lock);
+ local_irq_disable();
if (!per_cpu(cpu_tsc_vars, cpu).tsc_khz)
per_cpu(cpu_tsc_vars, cpu).tsc_khz = khz;
- spin_unlock(&kvm_tsc_lock);
+ local_irq_enable();
}
} else {
for_each_possible_cpu(cpu)
per_cpu(cpu_tsc_vars, cpu).tsc_khz = tsc_khz;
}
tsc_base_cpu = get_cpu();
- resync(NULL);
put_cpu();
+ resync_all();
}
int kvm_arch_init(void *opaque)
--
1.6.5.2
Turns out mult_precise() wasn't very precise after all; it needs to operate on full 64-bit values.
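For reference, the quantity being computed is (val * mult) >> shift with
a full-width intermediate product. A portable sketch of the same
computation, assuming a compiler that provides __uint128_t (64-bit gcc),
would be:

/* Illustrative only: the inline assembly in the patch does this without
 * needing a 128-bit type.
 */
static inline u64 mult_precise_sketch(u64 val, unsigned long mult, int shift)
{
	return (u64)(((__uint128_t)val * mult) >> shift);
}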
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 18 +++++++++++++++---
1 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7e2ba3e..792c895 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -803,15 +803,27 @@ static void compute_best_multiplier(unsigned long a, unsigned long b,
*s = shift;
}
-static inline unsigned long mult_precise(unsigned long val, unsigned long mult,
- int shift)
+static inline u64 mult_precise(u64 val, unsigned long mult, int shift)
{
+#if (BITS_PER_LONG == 64)
unsigned long top, bot;
- __asm__ ( "mul %3; shrd %1, %0" :
+ __asm__ ( "mulq %3; shrdq %1, %0" :
"=&a" (bot), "=&d" (top) :
"0" (mult), "rm" (val), "c" (shift));
return bot;
+#else
+ /*
+ * 32-bit: form the 96-bit product from two 32x32->64 multiplies
+ * and combine the shifted halves. Assumes shift <= 32.
+ */
+ u64 p_lo = (u64)(u32)val * mult;
+ u64 p_hi = (u64)(u32)(val >> 32) * mult;
+
+ return (p_lo >> shift) + (p_hi << (32 - shift));
+#endif
}
static inline u64 compute_ref_tsc(void)
--
1.6.5.2
Resynchronize all CPUs periodically in order to measure TSC skew. Use the
measured skew to adjust the resync interval (not done yet; a heuristic is
needed, and one possibility is sketched below).
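One possible heuristic, purely as a sketch and not part of this patch:
shorten the resync period whenever the skew observed over the last
interval exceeds the allowed drift budget (tsc_drift, in TSC ticks per
second).

static unsigned long next_resync_interval(u64 skew, unsigned long jif_delta)
{
	u64 secs = jif_delta / HZ;

	if (!secs)
		secs = 1;
	/* skew grew faster than the budget: resync ten times as often */
	if (skew > tsc_drift * secs)
		return HZ * 5;
	return HZ * 50;
}

The fixed mod_timer(&resync_timer, jiffies + HZ * 50) call in resync()
could then become
mod_timer(&resync_timer, jiffies + next_resync_interval(tsc_skew, jif_delta)).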
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 90 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 792c895..3a854ec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -750,9 +750,10 @@ struct cpu_tsc_vars
u64 last_ref;
};
static DEFINE_PER_CPU(struct cpu_tsc_vars, cpu_tsc_vars);
-
static int tsc_base_cpu = -1;
static unsigned long ref_tsc_khz;
+static u64 tsc_drift;
+static struct timer_list resync_timer;
static inline int cpu_is_tsc_synchronized(int cpu)
{
@@ -935,6 +936,7 @@ static void sync_tsc_helper(int measure_cpu, s64 *delta, atomic_t *ready)
* Average and trim the samples of any outliers; we use > 2 x sigma
*/
static u64 tsc_deviation;
+static u64 tsc_skew;
static s64 average_samples(s64 *samples, unsigned num_samples)
{
unsigned i, j;
@@ -993,10 +995,24 @@ static void kvm_sync_tsc(void *cpup)
s64 *delta1, *delta2;
static atomic_t ready ____cacheline_aligned = ATOMIC_INIT(1);
struct cpu_tsc_vars *cv = &per_cpu(cpu_tsc_vars, new_cpu);
+ static u64 old_base;
+ static s64 old_offset;
+ static unsigned long old_multiplier;
+ static unsigned int old_shift;
BUG_ON(tsc_base_cpu == -1);
local_irq_save(flags);
+
+ /*
+ * First, the new CPU may be just coming up to sync or might have
+ * changed frequency, which means the measurement base must be
+ * adjusted. If not, we can use it to compute a skew estimate.
+ */
if (raw_smp_processor_id() == new_cpu) {
+ old_multiplier = cv->tsc_multiplier;
+ old_shift = cv->tsc_shift;
+ old_base = cv->tsc_measure_base;
+ old_offset = cv->tsc_offset;
cv->tsc_measure_base = native_read_tsc();
cv->tsc_offset = 0;
compute_best_multiplier(ref_tsc_khz, cv->tsc_khz,
@@ -1005,10 +1021,12 @@ static void kvm_sync_tsc(void *cpup)
" tsc_base_cpu = %d\n", __func__, new_cpu, cv->tsc_khz,
cv->tsc_measure_base, tsc_base_cpu);
}
+
delta1 = per_cpu(delta_array, tsc_base_cpu).delta;
delta2 = per_cpu(delta_array, new_cpu).delta;
sync_tsc_helper(tsc_base_cpu, delta1, &ready);
sync_tsc_helper(new_cpu, delta2, &ready);
+
if (raw_smp_processor_id() == new_cpu) {
s64 accumulator = 0;
@@ -1024,8 +1042,40 @@ static void kvm_sync_tsc(void *cpup)
accumulator += average_samples(&delta1[2], SYNC_TRIES-3);
accumulator -= average_samples(&delta2[2], SYNC_TRIES-3);
accumulator /= 2;
-
cv->tsc_offset = accumulator;
+
+ /*
+ * Skew can be computed over a constant multiplier as follows:
+ *
+ * ref_new = (tsc_new - base_new) * mult + off_new
+ * ref_old = (tsc_old - base_old) * mult + off_old
+ *
+ * skew = ref_new - (ref_old + delta_ref)
+ *
+ * skew = off_new - off_old + mult(tsc_new - tsc_old)
+ * - mult(base_new - base_old) - delta_ref
+ *
+ * The tsc_old / tsc_new values are not recoverable, but
+ * observe that mult(tsc_new - tsc_old) == delta_ref, so
+ *
+ * skew = delta(off) - mult(delta base)
+ *
+ * To avoid problems with signed computation, we multiply
+ * unsigned numbers first before switching to signed arithmetic
+ */
+ if (old_multiplier == cv->tsc_multiplier &&
+ old_shift == cv->tsc_shift) {
+ u64 sbo = old_base, sbn = cv->tsc_measure_base;
+ s64 skew;
+ sbo = mult_precise(sbo, old_multiplier, old_shift);
+ sbn = mult_precise(sbn, old_multiplier, old_shift);
+ skew = cv->tsc_offset - old_offset + (sbo - sbn);
+ if (skew < 0)
+ skew = -skew;
+ if (skew > tsc_skew)
+ tsc_skew = skew;
+ }
+
smp_wmb();
++cv->tsc_generation;
atomic_set(&cv->tsc_synchronized, 1);
@@ -3611,6 +3661,8 @@ static long resync(void *unused)
struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
u64 tsc = 0;
int cpu;
+ static unsigned long jif_old;
+ unsigned long jif_delta;
/*
* First, make sure we are on the right CPU; between when the work got
@@ -3643,17 +3695,28 @@ static long resync(void *unused)
cv->tsc_generation++; /* XXX: needed? */
compute_best_multiplier(ref_tsc_khz, cv->tsc_khz, &cv->tsc_multiplier,
&cv->tsc_shift);
+ tsc_skew = 0;
atomic_set(&cv->tsc_synchronized, 1);
+ smp_wmb();
for_each_online_cpu(cpu)
kvm_do_sync_tsc(cpu);
+ for_each_online_cpu(cpu)
+ while (!cpu_is_tsc_synchronized(cpu))
+ cpu_relax();
+
+ smp_rmb();
+ jif_delta = jiffies - jif_old;
+ pr_debug("max TSC skew now estimated at %llu over %lu jiffies\n",
+ tsc_skew, jif_delta);
+ jif_old = jiffies;
+ mod_timer(&resync_timer, jiffies + HZ * 50);
put_cpu();
return 0;
}
static DEFINE_MUTEX(resync_lock);
-
static void resync_all(void)
{
mutex_lock(&resync_lock);
@@ -3662,6 +3725,18 @@ static void resync_all(void)
mutex_unlock(&resync_lock);
}
+static struct work_struct resync_work;
+static void resync_work_fn(struct work_struct *work)
+{
+ resync_all();
+}
+
+static void resync_callout(unsigned long unused)
+{
+ INIT_WORK(&resync_work, resync_work_fn);
+ schedule_work(&resync_work);
+}
+
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -3836,6 +3911,15 @@ static void kvm_timer_init(void)
for_each_possible_cpu(cpu)
per_cpu(cpu_tsc_vars, cpu).tsc_khz = tsc_khz;
}
+
+ /*
+ * Now, pick a CPU to make the master and synchronize all other
+ * CPUs to its clock. Periodically check for drift as well.
+ * Our initial drift estimate is 1 ppm / sec.
+ */
+ tsc_drift = ref_tsc_khz / 1000;
+ init_timer(&resync_timer);
+ resync_timer.function = resync_callout;
tsc_base_cpu = get_cpu();
put_cpu();
resync_all();
@@ -3898,6 +3982,9 @@ void kvm_arch_exit(void)
pci_write_config_byte(*nb, 0x87, disabled_c1_ramp);
}
#endif
+ mutex_lock(&resync_lock);
+ del_timer(&resync_timer);
+ mutex_unlock(&resync_lock);
}
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
--
1.6.5.2
Add ioctls to get and set the TSC rate per-VM. The TSC rate is set at
creation time to the reference TSC rate.
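A minimal userspace sketch of how the new ioctls might be exercised
(assumes this series is applied and vm_fd came from KVM_CREATE_VM; error
handling is elided). Note the handler copies a 32-bit rate even though
the ioctl numbers are declared with __u64:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_guest_tsc_khz(int vm_fd, __u32 khz)
{
	__u32 cur;

	if (ioctl(vm_fd, KVM_X86_GET_TSC_RATE, &cur) < 0)
		return -1;
	printf("current guest TSC rate: %u kHz\n", cur);

	return ioctl(vm_fd, KVM_X86_SET_TSC_RATE, &khz);
}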
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 4 +++
arch/x86/kvm/kvm_timer.h | 14 +---------
arch/x86/kvm/x86.c | 49 +++++++++++++++++++++++++++++++++++++++
include/linux/kvm.h | 3 ++
4 files changed, 58 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8e4d606..313befc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -276,6 +276,7 @@ struct kvm_mmu {
struct kvm_vcpu_arch {
u64 host_tsc;
u64 tsc_msr_offset;
+
/*
* rip and regs accesses must go through
* kvm_{register,rip}_{read,write} functions.
@@ -417,6 +418,9 @@ struct kvm_arch{
unsigned long irq_sources_bitmap;
u64 vm_init_tsc;
s64 kvmclock_offset;
+ unsigned int tsc_khz;
+ unsigned long tsc_multiplier;
+ int tsc_shift;
struct kvm_xen_hvm_config xen_hvm_config;
};
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 3863161..670be31 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -23,15 +23,5 @@ static inline u64 kvm_get_elapsed_tsc(struct kvm *kvm)
return kvm_get_ref_tsc() - kvm->arch.vm_init_tsc;
}
-static inline u64 kvm_get_cpu_tsc(struct kvm_vcpu *vcpu)
-{
- return kvm_get_elapsed_tsc(vcpu->kvm) + vcpu->arch.tsc_msr_offset;
-}
-
-static inline void kvm_set_cpu_tsc(struct kvm_vcpu *vcpu, u64 data)
-{
- u64 tsc_offset;
-
- tsc_offset = data - kvm_get_cpu_tsc(vcpu);
- vcpu->arch.tsc_msr_offset = tsc_offset;
-}
+extern u64 kvm_get_cpu_tsc(struct kvm_vcpu *vcpu);
+extern void kvm_set_cpu_tsc(struct kvm_vcpu *vcpu, u64 data);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3a854ec..c1bed03 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1727,6 +1727,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
case KVM_CAP_XEN_HVM:
case KVM_CAP_ADJUST_CLOCK:
+ case KVM_CAP_SET_TSC_RATE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2680,6 +2681,34 @@ out:
return r;
}
+u64 kvm_get_cpu_tsc(struct kvm_vcpu *vcpu)
+{
+ u64 tsc;
+ struct kvm *kvm = vcpu->kvm;
+
+ tsc = kvm_get_elapsed_tsc(kvm) + vcpu->arch.tsc_msr_offset;
+ tsc = mult_precise(tsc, kvm->arch.tsc_multiplier, kvm->arch.tsc_shift);
+ return tsc;
+}
+EXPORT_SYMBOL_GPL(kvm_get_cpu_tsc);
+
+void kvm_set_cpu_tsc(struct kvm_vcpu *vcpu, u64 data)
+{
+ u64 tsc_offset;
+
+ tsc_offset = data - kvm_get_cpu_tsc(vcpu);
+ vcpu->arch.tsc_msr_offset = tsc_offset;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cpu_tsc);
+
+static int kvm_set_tsc_rate(struct kvm *kvm, u32 new_tsc_khz)
+{
+ kvm->arch.tsc_khz = new_tsc_khz;
+ compute_best_multiplier(new_tsc_khz, ref_tsc_khz,
+ &kvm->arch.tsc_multiplier, &kvm->arch.tsc_shift);
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -2986,6 +3015,25 @@ long kvm_arch_vm_ioctl(struct file *filp,
break;
}
+ case KVM_X86_GET_TSC_RATE: {
+ u32 rate = kvm->arch.tsc_khz;
+ r = -EFAULT;
+ if (copy_to_user(argp, &rate, sizeof(rate)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_X86_SET_TSC_RATE: {
+ u32 tsc_rate;
+ r = -EFAULT;
+ if (copy_from_user(&tsc_rate, argp, sizeof tsc_rate))
+ goto out;
+ r = -EINVAL;
+ if (tsc_rate == 0 || tsc_rate > (1ULL << 30))
+ goto out;
+ r = kvm_set_tsc_rate(kvm, tsc_rate);
+ break;
+ }
default:
;
@@ -5611,6 +5659,7 @@ struct kvm *kvm_arch_create_vm(void)
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
kvm->arch.vm_init_tsc = kvm_get_ref_tsc();
+ kvm_set_tsc_rate(kvm, ref_tsc_khz);
return kvm;
}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 6ed1a12..ac2f0af 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -440,6 +440,7 @@ struct kvm_ioeventfd {
#define KVM_CAP_XEN_HVM 38
#endif
#define KVM_CAP_ADJUST_CLOCK 39
+#define KVM_CAP_SET_TSC_RATE 40
#ifdef KVM_CAP_IRQ_ROUTING
@@ -619,6 +620,8 @@ struct kvm_clock_data {
#define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64)
#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64)
#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce)
+#define KVM_X86_GET_TSC_RATE _IOR(KVMIO, 0x9f, __u64)
+#define KVM_X86_SET_TSC_RATE _IOW(KVMIO, 0xa0, __u64)
/*
* Deprecated interfaces
--
1.6.5.2
Expand the previous ioctl to add a mode field specifying the
mode of TSC virtualization.
There are three modes.
1) Pure passthrough - TSC is always passed through. Hardware offset
features are used. Great for fixed frequency machines with VMs that
will not be migrated.
2) Pure intercept - TSC reads are always intercepted. Great for SMP
VMs which require perfect synchronization across CPUs.
3) Hybrid mode - TSC is passed through when possible, even if the
actual CPU frequency is lower than the VM's frequency. On exits,
the TSC offset is adjusted to keep in sync with the VM frequency.
If the hardware frequency exceeds the VM frequency, intercept mode
is used instead.
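A hedged userspace sketch of the reworked interface (again assuming this
series is applied and vm_fd already exists):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_tsc_mode_hybrid(int vm_fd, __u32 rate_khz)
{
	struct kvm_tsc_state state = {
		.rate = rate_khz,		/* guest TSC rate in kHz */
		.mode = KVM_TSC_MODE_HYBRID,	/* passthrough when possible */
	};

	return ioctl(vm_fd, KVM_X86_SET_TSC_MODE, &state);
}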
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/x86.c | 21 ++++++++++++---------
include/linux/kvm.h | 17 ++++++++++++++---
3 files changed, 27 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 313befc..f2cf4c5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -418,6 +418,7 @@ struct kvm_arch{
unsigned long irq_sources_bitmap;
u64 vm_init_tsc;
s64 kvmclock_offset;
+ unsigned int tsc_mode;
unsigned int tsc_khz;
unsigned long tsc_multiplier;
int tsc_shift;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c1bed03..f8fa978 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1727,7 +1727,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
case KVM_CAP_XEN_HVM:
case KVM_CAP_ADJUST_CLOCK:
- case KVM_CAP_SET_TSC_RATE:
+ case KVM_CAP_SET_TSC_MODE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -3015,23 +3015,26 @@ long kvm_arch_vm_ioctl(struct file *filp,
break;
}
- case KVM_X86_GET_TSC_RATE: {
- u32 rate = kvm->arch.tsc_khz;
+ case KVM_X86_GET_TSC_MODE: {
+ struct kvm_tsc_state state;
+ state.rate = kvm->arch.tsc_khz;
+ state.mode = kvm->arch.tsc_mode;
r = -EFAULT;
- if (copy_to_user(argp, &rate, sizeof(rate)))
+ if (copy_to_user(argp, &state, sizeof(state)))
goto out;
r = 0;
break;
}
- case KVM_X86_SET_TSC_RATE: {
- u32 tsc_rate;
+ case KVM_X86_SET_TSC_MODE: {
+ struct kvm_tsc_state state;
r = -EFAULT;
- if (copy_from_user(&tsc_rate, argp, sizeof tsc_rate))
+ if (copy_from_user(&state, argp, sizeof(state)))
goto out;
r = -EINVAL;
- if (tsc_rate == 0 || tsc_rate > (1ULL << 30))
+ if (state.rate == 0 || state.rate > (1ULL << 30) ||
+ state.mode > KVM_TSC_MODE_HYBRID)
goto out;
- r = kvm_set_tsc_rate(kvm, tsc_rate);
+ r = kvm_set_tsc_rate(kvm, state.rate);
break;
}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ac2f0af..85ad3e0 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -440,7 +440,7 @@ struct kvm_ioeventfd {
#define KVM_CAP_XEN_HVM 38
#endif
#define KVM_CAP_ADJUST_CLOCK 39
-#define KVM_CAP_SET_TSC_RATE 40
+#define KVM_CAP_SET_TSC_MODE 40
#ifdef KVM_CAP_IRQ_ROUTING
@@ -520,6 +520,17 @@ struct kvm_clock_data {
__u32 pad[9];
};
+struct kvm_tsc_state {
+ __u32 rate;
+ __u32 mode;
+};
+
+enum {
+ KVM_TSC_MODE_PASSTHROUGH,
+ KVM_TSC_MODE_INTERCEPT,
+ KVM_TSC_MODE_HYBRID
+};
+
/*
* ioctls for VM fds
*/
@@ -620,8 +631,8 @@ struct kvm_clock_data {
#define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64)
#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64)
#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce)
-#define KVM_X86_GET_TSC_RATE _IOR(KVMIO, 0x9f, __u64)
-#define KVM_X86_SET_TSC_RATE _IOW(KVMIO, 0xa0, __u64)
+#define KVM_X86_GET_TSC_MODE _IOR(KVMIO, 0x9f, struct kvm_tsc_state)
+#define KVM_X86_SET_TSC_MODE _IOW(KVMIO, 0xa0, struct kvm_tsc_state)
/*
* Deprecated interfaces
--
1.6.5.2
Quick and dirty hack to get passthrough TSC back in SVM.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/svm.c | 4 +++-
1 files changed, 3 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 91eb263..91107a4 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -571,7 +571,6 @@ static void init_vmcb(struct vcpu_svm *svm)
control->intercept = (1ULL << INTERCEPT_INTR) |
(1ULL << INTERCEPT_NMI) |
(1ULL << INTERCEPT_SMI) |
- (1ULL << INTERCEPT_RDTSC) |
(1ULL << INTERCEPT_CPUID) |
(1ULL << INTERCEPT_INVD) |
(1ULL << INTERCEPT_HLT) |
@@ -592,6 +591,9 @@ static void init_vmcb(struct vcpu_svm *svm)
(1ULL << INTERCEPT_MONITOR) |
(1ULL << INTERCEPT_MWAIT);
+ if (svm->vcpu.kvm->arch.tsc_mode != KVM_TSC_MODE_PASSTHROUGH)
+ control->intercept |= (1ULL << INTERCEPT_RDTSC);
+
control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __pa(svm->msrpm);
control->tsc_offset = 0;
--
1.6.5.2
Zachary Amsden <[email protected]> writes:
>
> Damn, this is complicated crap. The analogous task in real life would
> be keeping a band of howler monkeys, each in their own tree, singing in
> unison while the lead vocalist jumps from tree to tree, and meanwhile,
> an unseen conductor keeps changing the tempo the piece is played at.
> Thankfully, there are no key changes, however, occasionally new trees
> sprout up at random and live ones fall over.
On CPUs where the TSC frequency is not constant, you typically can't tell
exactly when the frequency changes. So you would always have a race window
where the frequency is unknown and wrong results occur. This can be worked
around, but it's quite complicated.
The safe way is to not use the TSC on such CPUs.
-Andi
--
[email protected] -- Speaking for myself only.
On Mon, Dec 14, 2009 at 06:08:42PM -1000, Zachary Amsden wrote:
<snip>
+ atomic_set(&per_cpu(cpu_tsc_synchronized, freq->cpu),
0);
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->cpu != freq->cpu)
+ continue;
+ if (vcpu->cpu != smp_processor_id())
+ send_ipi++;
+ kvm_request_guest_time_update(vcpu);
There is some overlap here between KVM_REQ_KVMCLOCK_UPDATE and
cpu_tsc_synchronized. It's the same information (frequency for a CPU has
changed) stored in two places.
Later you do:
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
continue;
if (vcpu->cpu != smp_processor_id())
send_ipi++;
kvm_request_guest_time_update(vcpu);
}
}
spin_unlock(&kvm_lock);
<--- a remote CPU could have updated kvmclock information
with stale cpu_tsc_khz, clearing the
KVM_REQ_KVMCLOCK_UPDATE bit.
smp_call_function(evict) (which sets cpu_tsc_synchronized
to zero)
Maybe it's worthwhile to unify them. Perhaps use the per-cpu tsc
generation, in addition to vcpu_load, to update kvmclock info (on arch
vcpu_load, update kvmclock and store the generation; update again when
the generation changes).
On 12/15/2009 08:21 AM, Marcelo Tosatti wrote:
> On Mon, Dec 14, 2009 at 06:08:42PM -1000, Zachary Amsden wrote:
>
> <snip>
>
> + atomic_set(&per_cpu(cpu_tsc_synchronized, freq->cpu),
> 0);
> + spin_lock(&kvm_lock);
> + list_for_each_entry(kvm,&vm_list, vm_list) {
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (vcpu->cpu != freq->cpu)
> + continue;
> + if (vcpu->cpu != smp_processor_id())
> + send_ipi++;
> + kvm_request_guest_time_update(vcpu);
>
> There is some overlap here between KVM_REQ_KVMCLOCK_UPDATE and
> cpu_tsc_synchronized. Its the same information (frequency for a CPU has
> changed) stored in two places.
>
> Later you do:
>
> spin_lock(&kvm_lock);
> list_for_each_entry(kvm,&vm_list, vm_list) {
> kvm_for_each_vcpu(i, vcpu, kvm) {
> if (vcpu->cpu != freq->cpu)
> continue;
> if (vcpu->cpu != smp_processor_id())
> send_ipi++;
> kvm_request_guest_time_update(vcpu);
> }
> }
> spin_unlock(&kvm_lock);
> <--- a remote CPU could have updated kvmclock information
> with stale cpu_tsc_khz, clearing the
> KVM_REQ_KVMCLOCK_UPDATE bit.
> smp_call_function(evict) (which sets cpu_tsc_synchronized
> to zero)
>
> Maybe worthwhile to unify it. Perhaps use the per cpu tsc generation in
> addition to vcpu_load to update kvmclock info (on arch vcpu_load update
> kvmclock store generation, update again on generation change).
>
Yes, that is an excellent point. The generation counter, the
tsc_synchronized variable, and the per-vcpu clock counter all carry some
redundant information.
Perhaps instead of overlapping, they should be layered?
A rule for kvmclock: can't update kvmclock info until cpu is synchronized?
Thanks,
Zach
On Tue, Dec 15, 2009 at 11:26:59AM -1000, Zachary Amsden wrote:
> On 12/15/2009 08:21 AM, Marcelo Tosatti wrote:
>> On Mon, Dec 14, 2009 at 06:08:42PM -1000, Zachary Amsden wrote:
>>
>> <snip>
>>
>> + atomic_set(&per_cpu(cpu_tsc_synchronized, freq->cpu),
>> 0);
>> + spin_lock(&kvm_lock);
>> + list_for_each_entry(kvm,&vm_list, vm_list) {
>> + kvm_for_each_vcpu(i, vcpu, kvm) {
>> + if (vcpu->cpu != freq->cpu)
>> + continue;
>> + if (vcpu->cpu != smp_processor_id())
>> + send_ipi++;
>> + kvm_request_guest_time_update(vcpu);
>>
>> There is some overlap here between KVM_REQ_KVMCLOCK_UPDATE and
>> cpu_tsc_synchronized. Its the same information (frequency for a CPU has
>> changed) stored in two places.
>>
>> Later you do:
>>
>> spin_lock(&kvm_lock);
>> list_for_each_entry(kvm,&vm_list, vm_list) {
>> kvm_for_each_vcpu(i, vcpu, kvm) {
>> if (vcpu->cpu != freq->cpu)
>> continue;
>> if (vcpu->cpu != smp_processor_id())
>> send_ipi++;
>> kvm_request_guest_time_update(vcpu);
>> }
>> }
>> spin_unlock(&kvm_lock);
>> <--- a remote CPU could have updated kvmclock information
>> with stale cpu_tsc_khz, clearing the
>> KVM_REQ_KVMCLOCK_UPDATE bit.
>> smp_call_function(evict) (which sets cpu_tsc_synchronized
>> to zero)
>>
>> Maybe worthwhile to unify it. Perhaps use the per cpu tsc generation in
>> addition to vcpu_load to update kvmclock info (on arch vcpu_load update
>> kvmclock store generation, update again on generation change).
>>
>
> Yes, that is an excellent point. The generation counter, the
> tsc_synchronized variable and the per-vcpu clock counter all have some
> amount of redundancy of information.
>
> Perhaps instead of overlapping, they should be layered?
>
> A rule for kvmclock: can't update kvmclock info until cpu is synchronized?
How about updating kvmclock on:
- cpu switch (kvm_arch_vcpu_load). Then store cpu tsc generation in
vcpu->arch.
- on vcpu_enter_guest, if tsc generation changes.
If kvm_arch_vcpu_load stored stale cpu_khz into kvmclock, the tsc
generation will have changed by the time guest entry succeeds.
Then you can kill KVM_REQ_KVMCLOCK_UPDATE and the kvm_for_each_vcpu()
loop in the cpufreq callback.
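A rough illustration of that proposal (the vcpu field and the kvmclock
refresh helper named here are hypothetical; this is not code from the
series):

/* Refresh kvmclock whenever the per-cpu TSC generation has moved on,
 * instead of relying on KVM_REQ_KVMCLOCK_UPDATE from the cpufreq path.
 * Called from kvm_arch_vcpu_load() and again from vcpu_enter_guest():
 * if vcpu_load used data computed with a stale cpu_tsc_khz, the
 * generation will have changed by the time guest entry succeeds, and
 * the second call repairs it.
 */
static void kvmclock_update_on_generation(struct kvm_vcpu *vcpu)
{
	unsigned int gen = per_cpu(cpu_tsc_vars, vcpu->cpu).tsc_generation;

	if (vcpu->arch.last_tsc_generation != gen) {	/* hypothetical field */
		vcpu->arch.last_tsc_generation = gen;
		update_vcpu_kvmclock_area(vcpu);	/* hypothetical helper */
	}
}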