In-Reply-To:
This is the remaining bulk of work I have related to TSC emulation.
In summary, I believe this fixes all known issues with TSC. A few
rather subtle issues are cleaned up, S4 suspend is fixed, and the
API for adjust_tsc_offset is expanded to take arguments in either
guest cycles or optionally host cycles. The final patch adds
software TSC emulation, with a configurable default setting.
Note that TSC trapping will only be used when other methods fail
due to unstable TSC or lack of scaling hardware. With the improved
offset matching, I was even able to get SMP guests to boot with a
TSC clock; cycle testing showed a maximum backwards leap of around
0.25 ms, which is actually fairly good. With the last patch applied,
software TSC emulation kicks in and the backwards TSC count even on
my broken hardware dropped to zero.
Some of this code (the S4 suspend compensation) has already been
ported into RHEL to fix various bugs - however upstream had diverged
a bit from the original course I planned; I had to add a few things
that had been optimized out of upstream back in (last_host_tsc).
In the course of doing this, I think the new code looks much
cleaner, with well documented and easy to understand variables.
Yes, there are a lot of tracking variables to maintain this whole
infrastructure - and yes, some of them appear to be redundant of
easily computable from others - but in actuality, the information
they provide is easy to understand, and the resulting code is much
easier to verify than a complex system where some quantities may
be undefined or computed on the fly and thus causing subtle races.
Zach
This requires some restructuring; rather than use 'virtual_tsc_khz'
to indicate whether hardware rate scaling is in effect, we consider
each VCPU to always have a virtual TSC rate. Instead, there is new
logic above the vendor-specific hardware scaling that decides whether
it is even necessary to use and updates all rate variables used by
common code. This means we can simply query the virtual rate at
any point, which is needed for software rate scaling.
There is also now a threshold added to the TSC rate scaling; minor
differences and variations of measured TSC rate can accidentally
provoke rate scaling to be used when it is not needed. Instead,
we have a tolerance variable called tsc_tolerance_ppm, which is
the maximum variation from user requested rate at which scaling
will be used. The default is 250ppm, which is the half the
threshold for NTP adjustment, allowing for some hardware variation.
In the event that hardware rate scaling is not available, we can
kludge a bit by forcing TSC catchup to turn on when a faster than
hardware speed has been requested, but there is nothing available
yet for the reverse case; this requires a trap and emulate software
implementation for RDTSC, which is still forthcoming.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 9 ++--
arch/x86/kvm/svm.c | 20 ++++++----
arch/x86/kvm/vmx.c | 16 +++++--
arch/x86/kvm/x86.c | 82 ++++++++++++++++++--------------------
include/linux/kvm.h | 1 +
5 files changed, 68 insertions(+), 60 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index da6bbee..184cd38 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -393,10 +393,11 @@ struct kvm_vcpu_arch {
u64 last_kernel_ns;
u64 last_tsc_nsec;
u64 last_tsc_write;
- u32 virtual_tsc_khz;
bool tsc_catchup;
- u32 tsc_catchup_mult;
- s8 tsc_catchup_shift;
+ bool tsc_always_catchup;
+ s8 virtual_tsc_shift;
+ u32 virtual_tsc_mult;
+ u32 virtual_tsc_khz;
bool nmi_pending;
bool nmi_injected;
@@ -604,7 +605,7 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
- void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
+ void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 475d1c9..47f557e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -905,20 +905,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
return _tsc;
}
-static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 ratio;
u64 khz;
- /* TSC scaling supported? */
- if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
+ /* Guest TSC same frequency as host TSC? */
+ if (!scale) {
+ svm->tsc_ratio = TSC_RATIO_DEFAULT;
return;
+ }
- /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
- if (user_tsc_khz == 0) {
- vcpu->arch.virtual_tsc_khz = 0;
- svm->tsc_ratio = TSC_RATIO_DEFAULT;
+ /* TSC scaling supported? */
+ if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ if (user_tsc_khz > tsc_khz) {
+ vcpu->arch.tsc_catchup = 1;
+ vcpu->arch.tsc_always_catchup = 1;
+ } else
+ WARN(1, "user requested TSC rate below hardware speed\n");
return;
}
@@ -933,7 +938,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
user_tsc_khz);
return;
}
- vcpu->arch.virtual_tsc_khz = user_tsc_khz;
svm->tsc_ratio = ratio;
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f5b49c7..bc3ecfd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1751,13 +1751,19 @@ static u64 guest_read_tsc(void)
}
/*
- * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
- * ioctl. In this case the call-back should update internal vmx state to make
- * the changes effective.
+ * Engage any workarounds for mis-matched TSC rates. Currently limited to
+ * software catchup for faster rates on slower CPUs.
*/
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
{
- /* Nothing to do here */
+ if (!scale)
+ return;
+
+ if (user_tsc_khz > tsc_khz) {
+ vcpu->arch.tsc_catchup = 1;
+ vcpu->arch.tsc_always_catchup = 1;
+ } else
+ WARN(1, "user requested TSC rate below hardware speed\n");
}
/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b803f0..776895a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -95,6 +95,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32 kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
+/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
+static u32 tsc_tolerance_ppm = 250;
+module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
@@ -991,49 +995,43 @@ static inline u64 get_kernel_ns(void)
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
unsigned long max_tsc_khz;
-static inline int kvm_tsc_changes_freq(void)
-{
- int cpu = get_cpu();
- int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
- cpufreq_quick_get(cpu) != 0;
- put_cpu();
- return ret;
-}
-
-static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.virtual_tsc_khz)
- return vcpu->arch.virtual_tsc_khz;
- else
- return __this_cpu_read(cpu_tsc_khz);
-}
-
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
- u64 ret;
-
- WARN_ON(preemptible());
- if (kvm_tsc_changes_freq())
- printk_once(KERN_WARNING
- "kvm: unreliable cycle conversion on adjustable rate TSC\n");
- ret = nsec * vcpu_tsc_khz(vcpu);
- do_div(ret, USEC_PER_SEC);
- return ret;
+ return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
}
-static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
+ u32 thresh_lo, thresh_hi;
+ int use_scaling = 0;
+
/* Compute a scale to convert nanoseconds in TSC cycles */
kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
- &vcpu->arch.tsc_catchup_shift,
- &vcpu->arch.tsc_catchup_mult);
+ &vcpu->arch.virtual_tsc_shift,
+ &vcpu->arch.virtual_tsc_mult);
+ vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+
+ /*
+ * Compute the variation in TSC rate which is acceptable
+ * within the range of tolerance and decide if the
+ * rate being applied is within that bounds of the hardware
+ * rate. If so, no scaling or compensation need be done.
+ */
+ thresh_lo = (u64)tsc_khz * (1000000 - tsc_tolerance_ppm) / 1000000;
+ thresh_hi = (u64)tsc_khz * (1000000 + tsc_tolerance_ppm) / 1000000;
+ if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
+ pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+ use_scaling = 1;
+ }
+ kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
{
u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
- vcpu->arch.tsc_catchup_mult,
- vcpu->arch.tsc_catchup_shift);
+ vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
tsc += vcpu->arch.last_tsc_write;
return tsc;
}
@@ -1100,7 +1098,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
local_irq_save(flags);
kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
kernel_ns = get_kernel_ns();
- this_tsc_khz = vcpu_tsc_khz(v);
+ this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
if (unlikely(this_tsc_khz == 0)) {
local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -2018,6 +2016,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
case KVM_CAP_GET_TSC_KHZ:
+ case KVM_CAP_SET_TSC_KHZ:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -3176,26 +3175,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
u32 user_tsc_khz;
r = -EINVAL;
- if (!kvm_has_tsc_control)
- break;
-
user_tsc_khz = (u32)arg;
if (user_tsc_khz >= kvm_max_guest_tsc_khz)
goto out;
- kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
+ if (user_tsc_khz == 0)
+ user_tsc_khz = tsc_khz;
+
+ kvm_set_tsc_khz(vcpu, user_tsc_khz);
r = 0;
goto out;
}
case KVM_GET_TSC_KHZ: {
- r = -EIO;
- if (check_tsc_unstable())
- goto out;
-
- r = vcpu_tsc_khz(vcpu);
-
+ r = vcpu->arch.virtual_tsc_khz;
goto out;
}
default:
@@ -5581,6 +5575,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
profile_hit(KVM_PROFILING, (void *)rip);
}
+ if (unlikely(vcpu->arch.tsc_always_catchup))
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
kvm_lapic_sync_from_vapic(vcpu);
@@ -6277,7 +6273,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.pio_data = page_address(page);
- kvm_init_tsc_catchup(vcpu, max_tsc_khz);
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
r = kvm_mmu_create(vcpu);
if (r < 0)
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 9c9ca7c..31adf3d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -544,6 +544,7 @@ struct kvm_ppc_pvinfo {
#define KVM_CAP_TSC_CONTROL 60
#define KVM_CAP_GET_TSC_KHZ 61
#define KVM_CAP_PPC_BOOKE_SREGS 62
+#define KVM_CAP_SET_TSC_KHZ 63
#ifdef KVM_CAP_IRQ_ROUTING
--
1.7.1
There are a few improvements that can be made to the TSC offset
matching code. First, we don't need to call the 128-bit multiply
(especially on a constant number), the code works much nicer to
do computation in nanosecond units.
Second, the way everything is setup with software TSC rate scaling,
we currently have per-cpu rates. Obviously this isn't too desirable
to use in practice, but if for some reason we do change the rate of
all VCPUs at runtime, then reset the TSCs, we will only want to
match offsets for VCPUs running at the same rate.
Finally, for the case where we have an unstable host TSC, but
rate scaling is being done in hardware, we should call the platform
code to compute the TSC offset, so the math is reorganized to recompute
the base instead, then transform the base into an offset using the
existing API.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/x86.c | 37 ++++++++++++++++++++++---------------
2 files changed, 23 insertions(+), 15 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 184cd38..865f051 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -468,6 +468,7 @@ struct kvm_arch {
u64 last_tsc_nsec;
u64 last_tsc_offset;
u64 last_tsc_write;
+ u32 last_tsc_khz;
struct kvm_xen_hvm_config xen_hvm_config;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 776895a..457bd79 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1041,33 +1041,39 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
- s64 sdiff;
+ s64 nsdiff;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
ns = get_kernel_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
- sdiff = data - kvm->arch.last_tsc_write;
- if (sdiff < 0)
- sdiff = -sdiff;
+
+ /* n.b - signed multiplication and division required */
+ nsdiff = data - kvm->arch.last_tsc_write;
+ nsdiff = (nsdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+ nsdiff -= elapsed;
+ if (nsdiff < 0)
+ nsdiff = -nsdiff;
/*
- * Special case: close write to TSC within 5 seconds of
- * another CPU is interpreted as an attempt to synchronize
- * The 5 seconds is to accommodate host load / swapping as
- * well as any reset of TSC during the boot process.
- *
- * In that case, for a reliable TSC, we can match TSC offsets,
- * or make a best guest using elapsed value.
- */
- if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
- elapsed < 5ULL * NSEC_PER_SEC) {
+ * Special case: TSC write with a small delta (1 second) of virtual
+ * cycle time against real time is interpreted as an attempt to
+ * synchronize the CPU.
+ *
+ * For a reliable TSC, we can match TSC offsets, and for an unstable
+ * TSC, we add elapsed time in this computation. We could let the
+ * compensation code attempt to catch up if we fall behind, but
+ * it's better to try to match offsets from the beginning.
+ */
+ if (nsdiff < NSEC_PER_SEC &&
+ vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
if (!check_tsc_unstable()) {
offset = kvm->arch.last_tsc_offset;
pr_debug("kvm: matched tsc offset for %llu\n", data);
} else {
u64 delta = nsec_to_cycles(vcpu, elapsed);
- offset += delta;
+ data += delta;
+ offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
ns = kvm->arch.last_tsc_nsec;
@@ -1075,6 +1081,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
kvm->arch.last_tsc_nsec = ns;
kvm->arch.last_tsc_write = data;
kvm->arch.last_tsc_offset = offset;
+ kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
kvm_x86_ops->write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
--
1.7.1
Currently, when the TSC is written by the guest, the variable
ns is updated to force the current write to appear to have taken
place at the time of the first write in this sync phase. This
leaves a cliff at the end of the match window where updates will
fall of the end. There are two scenarios where this can be a
problem in practe - first, on a system with a large number of
VCPUs, the sync period may last for an extended period of time.
The second way this can happen is if the VM reboots very rapidly
and we catch a VCPU TSC synchronization just around the edge.
We may be unaware of the reboot, and thus the first VCPU might
synchronize with an old set of the timer (at, say 0.97 seconds
ago, when first powered on). The second VCPU can come in 0.04
seconds later to try to synchronize, but it misses the window
because it is just over the threshold.
Instead, stop doing this artificial setback of the ns variable
and just update it with every write of the TSC.
It may be observed that doing so causes values computed by
compute_guest_tsc to diverge slightly across CPUs - note that
the last_tsc_ns and last_tsc_write variable are used here, and
now they last_tsc_ns will be different for each VCPU, reflecting
the actual time of the update.
However, compute_guest_tsc is used only for guests which already
have TSC stability issues, and further, note that the previous
patch has caused last_tsc_write to be incremented by the difference
in nanoseconds, converted back into guest cycles. As such, only
boundary rounding errors should be visible, which given the
resolution in nanoseconds, is going to only be a few cycles and
only visible in cross-CPU consistency tests. The problem can be
fixed by adding a new set of variables to track the start offset
and start write value for the current sync cycle.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 1 -
1 files changed, 0 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 457bd79..2176714 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1076,7 +1076,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
- ns = kvm->arch.last_tsc_nsec;
}
kvm->arch.last_tsc_nsec = ns;
kvm->arch.last_tsc_write = data;
--
1.7.1
The variable last_guest_tsc was being used as an ad-hoc indicator
that guest TSC has been initialized and recorded correctly. However,
it may not have been, it could be that guest TSC has been set to some
large value, the back to a small value (by, say, a software reboot).
This defeats the logic and causes KVM to falsely assume that the
guest TSC has gone backwards, marking the host TSC unstable, which
is undesirable behavior.
In addition, rather than try to compute an offset adjustment for the
TSC on unstable platforms, just recompute the whole offset. This
allows us to get rid of one callsite for adjust_tsc_offset, which
is problematic because the units it takes are in guest units, but
here, the computation was originally being done in host units.
Doing this, and also recording last_guest_tsc when the TSC is written
allow us to remove the tricky logic which depended on last_guest_tsc
being zero to indicate a reset of uninitialized value.
Instead, we now have the guarantee that the guest TSC offset is
always at least something which will get us last_guest_tsc.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/x86.c | 11 ++++++-----
1 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2176714..3bd2ca3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1088,6 +1088,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
vcpu->arch.hv_clock.tsc_timestamp = 0;
vcpu->arch.last_tsc_write = data;
vcpu->arch.last_tsc_nsec = ns;
+ vcpu->arch.last_guest_tsc = data;
}
EXPORT_SYMBOL_GPL(kvm_write_tsc);
@@ -1156,7 +1157,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* observed by the guest and ensure the new system time is greater.
*/
max_kernel_ns = 0;
- if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+ if (vcpu->hv_clock.tsc_timestamp) {
max_kernel_ns = vcpu->last_guest_tsc -
vcpu->hv_clock.tsc_timestamp;
max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
@@ -2157,13 +2158,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
u64 tsc;
kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
- tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
- tsc - vcpu->arch.last_guest_tsc;
-
+ tsc_delta = tsc - vcpu->arch.last_guest_tsc;
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (check_tsc_unstable()) {
- kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+ u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
+ vcpu->arch.last_guest_tsc);
+ kvm_x86_ops->write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
--
1.7.1
The variable last_host_tsc was removed from upstream code. I am adding
it back for two reasons. First, it is unnecessary to use guest TSC
computation to conclude information about the host TSC. The guest may
set the TSC backwards (this case handled by the previous patch), but
the computation of guest TSC (and fetching an MSR) is significanlty more
work and complexity than simply reading the hardware counter. In addition,
we don't actually need the guest TSC for any part of the computation,
by always recomputing the offset, we can eliminate the need to deal with
the current offset and any scaling factors that may apply.
The second reason is that later on, we are going to be using the host
TSC value to restore TSC offsets after a host S4 suspend, so we need to
be reading the host values, not the guest values here.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/x86.c | 9 +++------
2 files changed, 4 insertions(+), 6 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 865f051..3b9fdb5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -393,6 +393,7 @@ struct kvm_vcpu_arch {
u64 last_kernel_ns;
u64 last_tsc_nsec;
u64 last_tsc_write;
+ u64 last_host_tsc;
bool tsc_catchup;
bool tsc_always_catchup;
s8 virtual_tsc_shift;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3bd2ca3..8fe988a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2154,11 +2154,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_x86_ops->vcpu_load(vcpu, cpu);
if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
/* Make sure TSC doesn't go backwards */
- s64 tsc_delta;
- u64 tsc;
-
- kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
- tsc_delta = tsc - vcpu->arch.last_guest_tsc;
+ s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+ native_read_tsc() - vcpu->arch.last_host_tsc;
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (check_tsc_unstable()) {
@@ -2178,7 +2175,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
- kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+ vcpu->arch.last_host_tsc = native_read_tsc();
}
static int is_efer_nx(void)
--
1.7.1
Redefine the API to take a parameter indicating whether an
adjustment is in host or guest cycles.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 13 ++++++++++++-
arch/x86/kvm/svm.c | 6 +++++-
arch/x86/kvm/vmx.c | 2 +-
arch/x86/kvm/x86.c | 2 +-
4 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3b9fdb5..c2854da 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -599,7 +599,7 @@ struct kvm_x86_ops {
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
- void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
+ void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -630,6 +630,17 @@ struct kvm_arch_async_pf {
extern struct kvm_x86_ops *kvm_x86_ops;
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+ s64 adjustment)
+{
+ kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true);
+}
+
int kvm_mmu_module_init(void);
void kvm_mmu_module_exit(void);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 47f557e..dcab00e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -957,10 +957,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ WARN_ON(adjustment < 0);
+ if (host)
+ adjustment = svm_scale_tsc(vcpu, adjustment);
+
svm->vmcb->control.tsc_offset += adjustment;
if (is_guest_mode(vcpu))
svm->nested.hsave->control.tsc_offset += adjustment;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bc3ecfd..780fe12 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1782,7 +1782,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
to_vmx(vcpu)->nested.vmcs01_tsc_offset;
}
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
{
u64 offset = vmcs_read64(TSC_OFFSET);
vmcs_write64(TSC_OFFSET, offset + adjustment);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8fe988a..10950f7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1125,7 +1125,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
if (vcpu->tsc_catchup) {
u64 tsc = compute_guest_tsc(v, kernel_ns);
if (tsc > tsc_timestamp) {
- kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+ adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
tsc_timestamp = tsc;
}
}
--
1.7.1
During a host suspend, TSC may go backwards, which KVM interprets
as an unstable TSC. Technically, KVM should not be marking the
TSC unstable, which causes the TSC clocksource to go bad, but we
need to be adjusting the TSC offsets in such a case.
Dealing with this issue is a little tricky as the only place we
can reliably do it is before much of the timekeeping infrastructure
is up and running. On top of this, we are not in a KVM thread
context, so we may not be able to safely access VCPU fields.
Instead, we compute our best known hardware offset at power-up and
stash it to be applied to all VCPUs when they actually start running.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/x86.c | 93 ++++++++++++++++++++++++++++++++++++--
2 files changed, 89 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c2854da..226897f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -394,6 +394,7 @@ struct kvm_vcpu_arch {
u64 last_tsc_nsec;
u64 last_tsc_write;
u64 last_host_tsc;
+ u64 tsc_offset_adjustment;
bool tsc_catchup;
bool tsc_always_catchup;
s8 virtual_tsc_shift;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 10950f7..cb8876b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2152,6 +2152,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
kvm_x86_ops->vcpu_load(vcpu, cpu);
+
+ /* Apply any externally detected TSC adjustments (due to suspend) */
+ if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+ adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+ vcpu->arch.tsc_offset_adjustment = 0;
+ set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+ }
+
if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
/* Make sure TSC doesn't go backwards */
s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
@@ -6221,13 +6229,88 @@ int kvm_arch_hardware_enable(void *garbage)
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i;
+ int ret;
+ u64 local_tsc;
+ u64 max_tsc = 0;
+ bool stable, backwards_tsc = false;
kvm_shared_msr_cpu_online();
- list_for_each_entry(kvm, &vm_list, vm_list)
- kvm_for_each_vcpu(i, vcpu, kvm)
- if (vcpu->cpu == smp_processor_id())
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- return kvm_x86_ops->hardware_enable(garbage);
+ ret = kvm_x86_ops->hardware_enable(garbage);
+ if (ret != 0)
+ return ret;
+
+ local_tsc = native_read_tsc();
+ stable = !check_tsc_unstable();
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!stable && vcpu->cpu == smp_processor_id())
+ set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+ if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+ backwards_tsc = true;
+ if (vcpu->arch.last_host_tsc > max_tsc)
+ max_tsc = vcpu->arch.last_host_tsc;
+ }
+ }
+ }
+
+ /*
+ * Sometimes, even reliable TSCs go backwards. This happens on
+ * platforms that reset TSC during suspend or hibernate actions, but
+ * maintain synchronization. We must compensate. Fortunately, we can
+ * detect that condition here, which happens early in CPU bringup,
+ * before any KVM threads can be running. Unfortunately, we can't
+ * bring the TSCs fully up to date with real time, as we aren't yet far
+ * enough into CPU bringup that we know how much real time has actually
+ * elapsed; our helper function, get_kernel_ns() will be using boot
+ * variables that haven't been updated yet.
+ *
+ * So we simply find the maximum observed TSC above, then record the
+ * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
+ * the adjustment will be applied. Note that we accumulate
+ * adjustments, in case multiple suspend cycles happen before some VCPU
+ * gets a chance to run again. In the event that no KVM threads get a
+ * chance to run, we will miss the entire elapsed period, as we'll have
+ * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+ * loose cycle time. This isn't too big a deal, since the loss will be
+ * uniform across all VCPUs (not to mention the scenario is extremely
+ * unlikely). It is possible that a second hibernate recovery happens
+ * much faster than a first, causing the observed TSC here to be
+ * smaller; this would require additional padding adjustment, which is
+ * why we set last_host_tsc to the local tsc observed here.
+ *
+ * N.B. - this code below runs only on platforms with reliable TSC,
+ * as that is the only way backwards_tsc is set above. Also note
+ * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+ * have the same delta_cyc adjustment applied if backwards_tsc
+ * is detected. Note further, this adjustment is only done once,
+ * as we reset last_host_tsc on all VCPUs to stop this from being
+ * called multiple times (one for each physical CPU bringup).
+ *
+ * Platforms with unnreliable TSCs don't have to deal with this, they
+ * will be compensated by the logic in vcpu_load, which sets the TSC to
+ * catchup mode. This will catchup all VCPUs to real time, but cannot
+ * guarantee that they stay in perfect synchronization.
+ */
+ if (backwards_tsc) {
+ u64 delta_cyc = max_tsc - local_tsc;
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ vcpu->arch.tsc_offset_adjustment += delta_cyc;
+ vcpu->arch.last_host_tsc = local_tsc;
+ }
+
+ /*
+ * We have to disable TSC offset matching.. if you were
+ * booting a VM while issuing an S4 host suspend....
+ * you may have some problem. Solving this issue is
+ * left as an exercise to the reader.
+ */
+ kvm->arch.last_tsc_nsec = 0;
+ kvm->arch.last_tsc_write = 0;
+ }
+
+ }
+ return 0;
}
void kvm_arch_hardware_disable(void *garbage)
--
1.7.1
This allows us to track the original nanosecond and counter values
at each phase of TSC writing by the guest. This gets us perfect
offset matching for stable TSC systems, and perfect software
computed TSC matching for machines with unstable TSC.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 10 ++++++--
arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++-------
2 files changed, 40 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 226897f..198ce8b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -391,10 +391,11 @@ struct kvm_vcpu_arch {
struct page *time_page;
u64 last_guest_tsc;
u64 last_kernel_ns;
- u64 last_tsc_nsec;
- u64 last_tsc_write;
u64 last_host_tsc;
u64 tsc_offset_adjustment;
+ u64 this_tsc_nsec;
+ u64 this_tsc_write;
+ u8 this_tsc_generation;
bool tsc_catchup;
bool tsc_always_catchup;
s8 virtual_tsc_shift;
@@ -468,9 +469,12 @@ struct kvm_arch {
s64 kvmclock_offset;
raw_spinlock_t tsc_write_lock;
u64 last_tsc_nsec;
- u64 last_tsc_offset;
u64 last_tsc_write;
u32 last_tsc_khz;
+ u64 cur_tsc_nsec;
+ u64 cur_tsc_write;
+ u64 cur_tsc_offset;
+ u8 cur_tsc_generation;
struct kvm_xen_hvm_config xen_hvm_config;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cb8876b..09e67fb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1029,10 +1029,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
{
- u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
+ u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
vcpu->arch.virtual_tsc_mult,
vcpu->arch.virtual_tsc_shift);
- tsc += vcpu->arch.last_tsc_write;
+ tsc += vcpu->arch.this_tsc_write;
return tsc;
}
@@ -1068,7 +1068,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
if (nsdiff < NSEC_PER_SEC &&
vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
if (!check_tsc_unstable()) {
- offset = kvm->arch.last_tsc_offset;
+ offset = kvm->arch.cur_tsc_offset;
pr_debug("kvm: matched tsc offset for %llu\n", data);
} else {
u64 delta = nsec_to_cycles(vcpu, elapsed);
@@ -1076,20 +1076,45 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
+ } else {
+ /*
+ * We split periods of matched TSC writes into generations.
+ * For each generation, we track the original measured
+ * nanosecond time, offset, and write, so if TSCs are in
+ * sync, we can match exact offset, and if not, we can match
+ * exact software computaion in compute_guest_tsc()
+ *
+ * These values are tracked in kvm->arch.cur_xxx variables.
+ */
+ kvm->arch.cur_tsc_generation++;
+ kvm->arch.cur_tsc_nsec = ns;
+ kvm->arch.cur_tsc_write = data;
+ kvm->arch.cur_tsc_offset = offset;
+ pr_debug("kvm: new tsc generation %u, clock %llu\n",
+ kvm->arch.cur_tsc_generation, data);
}
+
+ /*
+ * We also track th most recent recorded KHZ, write and time to
+ * allow the matching interval to be extended at each write.
+ */
kvm->arch.last_tsc_nsec = ns;
kvm->arch.last_tsc_write = data;
- kvm->arch.last_tsc_offset = offset;
kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
- raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
/* Reset of TSC must disable overshoot protection below */
vcpu->arch.hv_clock.tsc_timestamp = 0;
- vcpu->arch.last_tsc_write = data;
- vcpu->arch.last_tsc_nsec = ns;
vcpu->arch.last_guest_tsc = data;
+
+ /* Keep track of which generation this VCPU has synchronized to */
+ vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+ vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+ vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+ kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
}
+
EXPORT_SYMBOL_GPL(kvm_write_tsc);
static int kvm_guest_time_update(struct kvm_vcpu *v)
--
1.7.1
When hardware assistance is unavailable to scale the TSC, or it is
not possible to keep in sync, add a software virtualization mode
where the TSC is trapped and thus guaranteed to always have perfect
synchronization.
Currently this behavior defaults to on; how and when the decision to
use trapping is made is likely to be a matter of debate. For now,
just make it possible.
Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/kvm/svm.c | 26 +++++++++++++++++++++++++-
arch/x86/kvm/vmx.c | 28 +++++++++++++++++++++++++++-
arch/x86/kvm/x86.c | 34 +++++++++++++++++++++++-----------
arch/x86/kvm/x86.h | 5 +++++
4 files changed, 80 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index dcab00e..fc4583d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -185,6 +185,7 @@ module_param(nested, int, S_IRUGO);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);
+static void svm_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap);
static int nested_svm_exit_handled(struct vcpu_svm *svm);
static int nested_svm_intercept(struct vcpu_svm *svm);
@@ -912,13 +913,18 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
u64 khz;
/* Guest TSC same frequency as host TSC? */
- if (!scale) {
+ if (!scale && !check_tsc_unstable()) {
svm->tsc_ratio = TSC_RATIO_DEFAULT;
return;
}
/* TSC scaling supported? */
if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ if (kvm_software_tsc) {
+ pr_debug("kvm: using TSC trapping\n");
+ svm_set_tsc_trapping(vcpu, true);
+ return;
+ }
if (user_tsc_khz > tsc_khz) {
vcpu->arch.tsc_catchup = 1;
vcpu->arch.tsc_always_catchup = 1;
@@ -1184,6 +1190,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
init_vmcb(svm);
+ kvm_set_tsc_khz(&svm->vcpu, kvm_max_tsc_khz);
kvm_write_tsc(&svm->vcpu, 0);
err = fx_init(&svm->vcpu);
@@ -1303,6 +1310,15 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
clr_intercept(svm, INTERCEPT_VINTR);
}
+static void svm_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (trap)
+ set_intercept(svm, INTERCEPT_RDTSC);
+ else
+ clr_intercept(svm, INTERCEPT_RDTSC);
+}
+
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
@@ -2732,6 +2748,13 @@ static int task_switch_interception(struct vcpu_svm *svm)
return 1;
}
+static int rdtsc_interception(struct vcpu_svm *svm)
+{
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
+ kvm_read_tsc(&svm->vcpu);
+ return 1;
+}
+
static int cpuid_interception(struct vcpu_svm *svm)
{
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
@@ -3178,6 +3201,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_SMI] = nop_on_interception,
[SVM_EXIT_INIT] = nop_on_interception,
[SVM_EXIT_VINTR] = interrupt_window_interception,
+ [SVM_EXIT_RDTSC] = rdtsc_interception,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 780fe12..65066b4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -606,6 +606,7 @@ static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
+static void vmx_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1756,9 +1757,14 @@ static u64 guest_read_tsc(void)
*/
static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
{
- if (!scale)
+ if (!scale && !check_tsc_unstable())
return;
+ if (kvm_software_tsc) {
+ pr_debug("kvm: using TSC trapping\n");
+ vmx_set_tsc_trapping(vcpu, true);
+ return;
+ }
if (user_tsc_khz > tsc_khz) {
vcpu->arch.tsc_catchup = 1;
vcpu->arch.tsc_always_catchup = 1;
@@ -3695,6 +3701,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
set_cr4_guest_host_mask(vmx);
+ kvm_set_tsc_khz(&vmx->vcpu, kvm_max_tsc_khz);
kvm_write_tsc(&vmx->vcpu, 0);
return 0;
@@ -3997,6 +4004,18 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
return 0;
}
+static void vmx_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap)
+{
+ u32 cpu_based_vm_exec_control;
+
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ if (trap)
+ cpu_based_vm_exec_control |= CPU_BASED_RDTSC_EXITING;
+ else
+ cpu_based_vm_exec_control &= ~CPU_BASED_RDTSC_EXITING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
static int handle_rmode_exception(struct kvm_vcpu *vcpu,
int vec, u32 err_code)
{
@@ -4497,6 +4516,12 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_rdtsc(struct kvm_vcpu *vcpu)
+{
+ kvm_read_tsc(vcpu);
+ return 1;
+}
+
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
@@ -5421,6 +5446,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
+ [EXIT_REASON_RDTSC] = handle_rdtsc,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_VMCLEAR] = handle_vmclear,
[EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 09e67fb..1a07796 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -99,6 +99,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
static u32 tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+int kvm_software_tsc = 1;
+module_param_named(software_tsc_emulation, kvm_software_tsc, bool, 0644);
+EXPORT_SYMBOL_GPL(kvm_software_tsc);
+
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
@@ -993,7 +997,8 @@ static inline u64 get_kernel_ns(void)
}
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
-unsigned long max_tsc_khz;
+unsigned long kvm_max_tsc_khz;
+EXPORT_SYMBOL_GPL(kvm_max_tsc_khz);
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
@@ -1001,7 +1006,7 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
vcpu->arch.virtual_tsc_shift);
}
-static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
u32 thresh_lo, thresh_hi;
int use_scaling = 0;
@@ -1026,6 +1031,7 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
}
kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
}
+EXPORT_SYMBOL_GPL(kvm_set_tsc_khz);
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
{
@@ -1117,6 +1123,18 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
EXPORT_SYMBOL_GPL(kvm_write_tsc);
+void kvm_read_tsc(struct kvm_vcpu *vcpu)
+{
+ u64 tsc;
+ s64 kernel_ns = get_kernel_ns();
+
+ tsc = compute_guest_tsc(vcpu, kernel_ns);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)tsc);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tsc >> 32);
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_read_tsc);
+
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags;
@@ -4931,7 +4949,7 @@ static void kvm_timer_init(void)
{
int cpu;
- max_tsc_khz = tsc_khz;
+ kvm_max_tsc_khz = tsc_khz;
register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
#ifdef CONFIG_CPU_FREQ
@@ -4940,13 +4958,13 @@ static void kvm_timer_init(void)
cpu = get_cpu();
cpufreq_get_policy(&policy, cpu);
if (policy.cpuinfo.max_freq)
- max_tsc_khz = policy.cpuinfo.max_freq;
+ kvm_max_tsc_khz = policy.cpuinfo.max_freq;
put_cpu();
#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
}
- pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
+ pr_debug("kvm: max_tsc_khz = %ld\n", kvm_max_tsc_khz);
for_each_online_cpu(cpu)
smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
}
@@ -6194,10 +6212,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
unsigned int id)
{
- if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
- printk_once(KERN_WARNING
- "kvm: SMP vm created on host with unstable TSC; "
- "guest TSC will not be reliable\n");
return kvm_x86_ops->vcpu_create(kvm, id);
}
@@ -6385,8 +6399,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.pio_data = page_address(page);
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
-
r = kvm_mmu_create(vcpu);
if (r < 0)
goto fail_free_pio_data;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 256da82..94780df 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -80,6 +80,10 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
+void kvm_read_tsc(struct kvm_vcpu *vcpu);
+void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz);
+extern int kvm_software_tsc;
+extern unsigned long kvm_max_tsc_khz;
int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
@@ -89,4 +93,5 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception);
+
#endif
--
1.7.1