2010-08-20 08:12:48

by Zachary Amsden

Subject: [KVM timekeeping 25/35] Add clock catchup mode

Make the clock update handler handle generic clock synchronization,
not just KVM clock. We add a catchup mode which keeps passthrough
TSC in line with absolute guest TSC.

Signed-off-by: Zachary Amsden <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/x86.c | 55 ++++++++++++++++++++++++++------------
2 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3a54cc1..ec1dc3a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -343,6 +343,7 @@ struct kvm_vcpu_arch {
u64 last_kernel_ns;
u64 last_tsc_nsec;
u64 last_tsc_write;
+ bool tsc_rebase;

bool nmi_pending;
bool nmi_injected;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ac0b2d9..a4215d7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -927,6 +927,15 @@ static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
kvm->arch.virtual_tsc_khz = this_tsc_khz;
}

+static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
+{
+ u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
+ vcpu->kvm->arch.virtual_tsc_mult,
+ vcpu->kvm->arch.virtual_tsc_shift);
+ tsc += vcpu->arch.last_tsc_write;
+ return tsc;
+}
+
void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
@@ -984,22 +993,29 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
unsigned long this_tsc_khz;
s64 kernel_ns, max_kernel_ns;
u64 tsc_timestamp;
-
- if ((!vcpu->time_page))
- return 0;
+ bool catchup = (!vcpu->time_page);

/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
kernel_ns = getnsboottime();
this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
- local_irq_restore(flags);

if (unlikely(this_tsc_khz == 0)) {
+ local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
return 1;
}

+ if (catchup) {
+ u64 tsc = compute_guest_tsc(v, kernel_ns);
+ if (tsc > tsc_timestamp)
+ kvm_x86_ops->adjust_tsc_offset(v, tsc-tsc_timestamp);
+ }
+ local_irq_restore(flags);
+ if (catchup)
+ return 0;
+
/*
* Time as measured by the TSC may go backwards when resetting the base
* tsc_timestamp. The reason for this is that the TSC resolution is
@@ -1065,14 +1081,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
return 0;
}

-static int kvm_request_guest_time_update(struct kvm_vcpu *v)
+static void kvm_request_clock_update(struct kvm_vcpu *v)
{
- struct kvm_vcpu_arch *vcpu = &v->arch;
-
- if (!vcpu->time_page)
- return 0;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
- return 1;
}

static bool msr_mtrr_valid(unsigned msr)
@@ -1398,6 +1409,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}

vcpu->arch.time = data;
+ kvm_request_clock_update(vcpu);

/* we verify if the enable bit is set... */
if (!(data & 1))
@@ -1413,8 +1425,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
kvm_release_page_clean(vcpu->arch.time_page);
vcpu->arch.time_page = NULL;
}
-
- kvm_request_guest_time_update(vcpu);
break;
}
case MSR_IA32_MCG_CTL:
@@ -1929,16 +1939,20 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}

kvm_x86_ops->vcpu_load(vcpu, cpu);
- if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+ if (unlikely(vcpu->cpu != cpu) || vcpu->arch.tsc_rebase) {
/* Make sure TSC doesn't go backwards */
s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
native_read_tsc() - vcpu->arch.last_host_tsc;
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
- if (check_tsc_unstable())
+ if (check_tsc_unstable()) {
kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
- kvm_migrate_timers(vcpu);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ if (vcpu->cpu != cpu)
+ kvm_migrate_timers(vcpu);
vcpu->cpu = cpu;
+ vcpu->arch.tsc_rebase = 0;
}
}

@@ -1947,6 +1961,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
vcpu->arch.last_host_tsc = native_read_tsc();
+
+ /* For unstable TSC, force compensation and catchup on next CPU */
+ if (check_tsc_unstable()) {
+ vcpu->arch.tsc_rebase = 1;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
}

static int is_efer_nx(void)
@@ -4307,8 +4327,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
continue;
- if (!kvm_request_guest_time_update(vcpu))
- continue;
+ kvm_request_clock_update(vcpu);
if (vcpu->cpu != smp_processor_id())
send_ipi = 1;
}
@@ -5597,7 +5616,7 @@ int kvm_arch_hardware_enable(void *garbage)
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
if (vcpu->cpu == smp_processor_id())
- kvm_request_guest_time_update(vcpu);
+ kvm_request_clock_update(vcpu);
return kvm_x86_ops->hardware_enable(garbage);
}

--
1.7.1
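
To make the catchup arithmetic concrete, here is a minimal standalone
sketch of what the patch computes (illustrative only, not the kernel
code): the nanoseconds elapsed since the last sync point are scaled
into guest TSC cycles with a pvclock-style mult/shift pair, and if the
expected guest TSC is ahead of what the hardware would deliver, the
TSC offset is bumped forward. scale_delta() below is a simplified
reimplementation of the kernel's pvclock_scale_delta() using 128-bit
arithmetic, and the sample mult/shift values are assumptions for a
1 GHz virtual TSC.

/*
 * Illustrative sketch, not kernel code. Names mirror the patch;
 * the mult/shift pair is an assumed 1 GHz virtual TSC rate.
 */
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the kernel's pvclock_scale_delta(). */
static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	/* mul_frac is a 32.32 fixed-point multiplier */
	return (uint64_t)(((__uint128_t)delta * mul_frac) >> 32);
}

/* Guest TSC the vcpu should observe "now", per the patch's logic. */
static uint64_t compute_guest_tsc(uint64_t kernel_ns, uint64_t last_tsc_nsec,
				  uint64_t last_tsc_write,
				  uint32_t mult, int shift)
{
	return last_tsc_write +
	       scale_delta(kernel_ns - last_tsc_nsec, mult, shift);
}

int main(void)
{
	/* 1 cycle/ns: mul_frac = 0x80000000 (0.5), shift = 1 (x2) */
	uint64_t want = compute_guest_tsc(5000000000ULL, 4000000000ULL,
					  7000000000ULL, 0x80000000u, 1);
	uint64_t have = 7900000000ULL;	/* what hardware would show */

	if (want > have)	/* catchup: adjust offset forward */
		printf("catch up by %llu cycles\n",
		       (unsigned long long)(want - have));
	return 0;
}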


2010-08-25 17:28:04

by Marcelo Tosatti

Subject: Re: [KVM timekeeping 25/35] Add clock catchup mode

On Thu, Aug 19, 2010 at 10:07:39PM -1000, Zachary Amsden wrote:
> Make the clock update handler handle generic clock synchronization,
> not just KVM clock. We add a catchup mode which keeps passthrough
> TSC in line with absolute guest TSC.
>
> Signed-off-by: Zachary Amsden <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 1 +
> arch/x86/kvm/x86.c | 55 ++++++++++++++++++++++++++------------
> 2 files changed, 38 insertions(+), 18 deletions(-)
>

> kvm_x86_ops->vcpu_load(vcpu, cpu);
> - if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
> + if (unlikely(vcpu->cpu != cpu) || vcpu->arch.tsc_rebase) {
> /* Make sure TSC doesn't go backwards */
> s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
> native_read_tsc() - vcpu->arch.last_host_tsc;
> if (tsc_delta < 0)
> mark_tsc_unstable("KVM discovered backwards TSC");
> - if (check_tsc_unstable())
> + if (check_tsc_unstable()) {
> kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
> - kvm_migrate_timers(vcpu);
> + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> + }
> + if (vcpu->cpu != cpu)
> + kvm_migrate_timers(vcpu);
> vcpu->cpu = cpu;
> + vcpu->arch.tsc_rebase = 0;
> }
> }
>
> @@ -1947,6 +1961,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> kvm_x86_ops->vcpu_put(vcpu);
> kvm_put_guest_fpu(vcpu);
> vcpu->arch.last_host_tsc = native_read_tsc();
> +
> + /* For unstable TSC, force compensation and catchup on next CPU */
> + if (check_tsc_unstable()) {
> + vcpu->arch.tsc_rebase = 1;
> + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> + }

The mix between catchup/trap versus stable/unstable TSC is confusing and
difficult to grasp. Can you please introduce all the infrastructure
first, then control usage of them in centralized places? Examples:

+static void kvm_update_tsc_trapping(struct kvm *kvm)
+{
+ int trap, i;
+ struct kvm_vcpu *vcpu;
+
+ trap = check_tsc_unstable() && atomic_read(&kvm->online_vcpus) > 1;
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_x86_ops->set_tsc_trap(vcpu, trap && !vcpu->arch.time_page);
+}

+ /* For unstable TSC, force compensation and catchup on next CPU */
+ if (check_tsc_unstable()) {
+ vcpu->arch.tsc_rebase = 1;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }


kvm_guest_time_update is becoming very confusing too. I understand this
is due to the many cases it's dealing with, but please make it as simple
as possible.

+ /*
+ * If we are trapping and no longer need to, use catchup to
+ * ensure passthrough TSC will not be less than trapped TSC
+ */
+ if (vcpu->tsc_mode == TSC_MODE_PASSTHROUGH && vcpu->tsc_trapping &&
+ ((this_tsc_khz <= v->kvm->arch.virtual_tsc_khz || kvmclock))) {
+ catchup = 1;

What, TSC trapping with kvmclock enabled?

For both catchup and trapping the resolution of the host clock is
important, as Glauber commented for kvmclock. Can you comment on the
problems that arise from a low res clock for both modes?

Similarly for catchup mode, the effect of exit frequency. No need for
any guarantees?

2010-08-25 20:48:33

by Zachary Amsden

Subject: Re: [KVM timekeeping 25/35] Add clock catchup mode

On 08/25/2010 07:27 AM, Marcelo Tosatti wrote:
> On Thu, Aug 19, 2010 at 10:07:39PM -1000, Zachary Amsden wrote:
>
>> Make the clock update handler handle generic clock synchronization,
>> not just KVM clock. We add a catchup mode which keeps passthrough
>> TSC in line with absolute guest TSC.
>>
>> Signed-off-by: Zachary Amsden <[email protected]>
>> ---
>> arch/x86/include/asm/kvm_host.h | 1 +
>> arch/x86/kvm/x86.c | 55 ++++++++++++++++++++++++++------------
>> 2 files changed, 38 insertions(+), 18 deletions(-)
>>
>>
>
>> kvm_x86_ops->vcpu_load(vcpu, cpu);
>> - if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
>> + if (unlikely(vcpu->cpu != cpu) || vcpu->arch.tsc_rebase) {
>> /* Make sure TSC doesn't go backwards */
>> s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
>> native_read_tsc() - vcpu->arch.last_host_tsc;
>> if (tsc_delta < 0)
>> mark_tsc_unstable("KVM discovered backwards TSC");
>> - if (check_tsc_unstable())
>> + if (check_tsc_unstable()) {
>> kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
>> - kvm_migrate_timers(vcpu);
>> + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
>> + }
>> + if (vcpu->cpu != cpu)
>> + kvm_migrate_timers(vcpu);
>> vcpu->cpu = cpu;
>> + vcpu->arch.tsc_rebase = 0;
>> }
>> }
>>
>> @@ -1947,6 +1961,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>> kvm_x86_ops->vcpu_put(vcpu);
>> kvm_put_guest_fpu(vcpu);
>> vcpu->arch.last_host_tsc = native_read_tsc();
>> +
>> + /* For unstable TSC, force compensation and catchup on next CPU */
>> + if (check_tsc_unstable()) {
>> + vcpu->arch.tsc_rebase = 1;
>> + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
>> + }
>>
> The mix between catchup/trap versus stable/unstable TSC is confusing and
> difficult to grasp. Can you please introduce all the infrastructure
> first, then control usage of them in centralized places? Examples:
>
> +static void kvm_update_tsc_trapping(struct kvm *kvm)
> +{
> + int trap, i;
> + struct kvm_vcpu *vcpu;
> +
> + trap = check_tsc_unstable() && atomic_read(&kvm->online_vcpus) > 1;
> + kvm_for_each_vcpu(i, vcpu, kvm)
> + kvm_x86_ops->set_tsc_trap(vcpu, trap && !vcpu->arch.time_page);
> +}
>
> + /* For unstable TSC, force compensation and catchup on next CPU */
> + if (check_tsc_unstable()) {
> + vcpu->arch.tsc_rebase = 1;
> + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> + }
>
>
> kvm_guest_time_update is becoming very confusing too. I understand this
> is due to the many cases it's dealing with, but please make it as simple
> as possible.
>

I tried to comment as best as I could. I think the whole
"kvm_update_tsc_trapping" thing is probably a poor design choice. It
works, but it's thoroughly unintelligible right now without spending
some days figuring out why.

I'll rework the tail series of patches to try to make them more clear.

> + /*
> + * If we are trapping and no longer need to, use catchup to
> + * ensure passthrough TSC will not be less than trapped TSC
> + */
> + if (vcpu->tsc_mode == TSC_MODE_PASSTHROUGH && vcpu->tsc_trapping &&
> + ((this_tsc_khz <= v->kvm->arch.virtual_tsc_khz || kvmclock))) {
> + catchup = 1;
>
> What, TSC trapping with kvmclock enabled?
>

Transitioning to use of kvmclock after a cold boot means we may have
been trapping and now we will not be.

> For both catchup and trapping the resolution of the host clock is
> important, as Glauber commented for kvmclock. Can you comment on the
> problems that arise from a low res clock for both modes?
>
> Similarly for catchup mode, the effect of exit frequency. No need for
> any guarantees?
>

The scheduler will do something to get an IRQ at whatever resolution it
uses for its timeslice. That guarantees an exit per timeslice, so
we'll never be behind by more than one slice while scheduling. While
not scheduling, we're dormant anyway, waiting on either an IRQ or shared
memory variable change. Local timers could end up behind when dormant.

We may need a hack to accelerate firing of timers in such a case, or
perhaps bounds on when to use catchup mode and when to not.
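
To put a rough number on that bound (the figures below are assumed for
illustration, not measurements from this thread): with at least one
forced exit per timeslice, the guest TSC can lag by at most one slice's
worth of the host/guest frequency difference.

/* Back-of-envelope worst-case catchup lag; all figures assumed. */
#include <stdio.h>

int main(void)
{
	double slice_ms  = 10.0;	/* assumed scheduler timeslice */
	double host_ghz  = 2.8;		/* host TSC frequency */
	double guest_ghz = 3.0;		/* faster virtual TSC frequency */

	/* cycles the virtual TSC falls behind before the next exit */
	double lag = slice_ms * 1e6 * (guest_ghz - host_ghz);

	printf("max lag per slice: %.0f cycles (%.3f ms of guest time)\n",
	       lag, lag / (guest_ghz * 1e9) * 1e3);
	return 0;
}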

Partly, the lack of an implementation is a deliberate choice; the logic
for setting such bounds, and the wisdom of doing so, belongs in a policy
agent in userspace, in our case qemu. In the end, that is what has full
control over whether the guest TSC rate is set and which TSC mode is
chosen.

What's lacking is the ability to force the use of a certain mode. I
think it's clear now that this needs to be a per-VM choice, not a
global one.

Zach

2010-08-25 22:18:04

by Marcelo Tosatti

Subject: Re: [KVM timekeeping 25/35] Add clock catchup mode

On Wed, Aug 25, 2010 at 10:48:20AM -1000, Zachary Amsden wrote:
> On 08/25/2010 07:27 AM, Marcelo Tosatti wrote:
> >On Thu, Aug 19, 2010 at 10:07:39PM -1000, Zachary Amsden wrote:
> >>Make the clock update handler handle generic clock synchronization,
> >>not just KVM clock. We add a catchup mode which keeps passthrough
> >>TSC in line with absolute guest TSC.
> >>
> >>Signed-off-by: Zachary Amsden <[email protected]>
> >>---
> >> arch/x86/include/asm/kvm_host.h | 1 +
> >> arch/x86/kvm/x86.c | 55 ++++++++++++++++++++++++++------------
> >> 2 files changed, 38 insertions(+), 18 deletions(-)
> >>
> >> kvm_x86_ops->vcpu_load(vcpu, cpu);
> >>- if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
> >>+ if (unlikely(vcpu->cpu != cpu) || vcpu->arch.tsc_rebase) {
> >> /* Make sure TSC doesn't go backwards */
> >> s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
> >> native_read_tsc() - vcpu->arch.last_host_tsc;
> >> if (tsc_delta < 0)
> >> mark_tsc_unstable("KVM discovered backwards TSC");
> >>- if (check_tsc_unstable())
> >>+ if (check_tsc_unstable()) {
> >> kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
> >>- kvm_migrate_timers(vcpu);
> >>+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> >>+ }
> >>+ if (vcpu->cpu != cpu)
> >>+ kvm_migrate_timers(vcpu);
> >> vcpu->cpu = cpu;
> >>+ vcpu->arch.tsc_rebase = 0;
> >> }
> >> }
> >>
> >>@@ -1947,6 +1961,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >> kvm_x86_ops->vcpu_put(vcpu);
> >> kvm_put_guest_fpu(vcpu);
> >> vcpu->arch.last_host_tsc = native_read_tsc();
> >>+
> >>+ /* For unstable TSC, force compensation and catchup on next CPU */
> >>+ if (check_tsc_unstable()) {
> >>+ vcpu->arch.tsc_rebase = 1;
> >>+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> >>+ }
> >The mix between catchup/trap versus stable/unstable TSC is confusing and
> >difficult to grasp. Can you please introduce all the infrastructure
> >first, then control usage of them in centralized places? Examples:
> >
> >+static void kvm_update_tsc_trapping(struct kvm *kvm)
> >+{
> >+ int trap, i;
> >+ struct kvm_vcpu *vcpu;
> >+
> >+ trap = check_tsc_unstable() && atomic_read(&kvm->online_vcpus) > 1;
> >+ kvm_for_each_vcpu(i, vcpu, kvm)
> >+ kvm_x86_ops->set_tsc_trap(vcpu, trap && !vcpu->arch.time_page);
> >+}
> >
> >+ /* For unstable TSC, force compensation and catchup on next CPU */
> >+ if (check_tsc_unstable()) {
> >+ vcpu->arch.tsc_rebase = 1;
> >+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> >+ }
> >
> >
> >kvm_guest_time_update is becoming very confusing too. I understand this
> >is due to the many cases it's dealing with, but please make it as simple
> >as possible.
>
> I tried to comment as best as I could. I think the whole
> "kvm_update_tsc_trapping" thing is probably a poor design choice.
> It works, but it's thoroughly unintelligible right now without
> spending some days figuring out why.
>
> I'll rework the tail series of patches to try to make them more clear.
>
> >+ /*
> >+ * If we are trapping and no longer need to, use catchup to
> >+ * ensure passthrough TSC will not be less than trapped TSC
> >+ */
> >+ if (vcpu->tsc_mode == TSC_MODE_PASSTHROUGH && vcpu->tsc_trapping &&
> >+ ((this_tsc_khz <= v->kvm->arch.virtual_tsc_khz || kvmclock))) {
> >+ catchup = 1;
> >
> >What, TSC trapping with kvmclock enabled?
>
> Transitioning to use of kvmclock after a cold boot means we may have
> been trapping and now we will not be.
>
> >For both catchup and trapping the resolution of the host clock is
> >important, as Glauber commented for kvmclock. Can you comment on the
> >problems that arise from a low res clock for both modes?
> >
> >Similarly for catchup mode, the effect of exit frequency. No need for
> >any guarantees?
>
> The scheduler will do something to get an IRQ at whatever resolution
> it uses for its timeslice. That guarantees an exit per timeslice,
> so we'll never be behind by more than one slice while scheduling.
> While not scheduling, we're dormant anyway, waiting on either an IRQ
> or shared memory variable change. Local timers could end up behind
> when dormant.
>
> We may need a hack to accelerate firing of timers in such a case, or
> perhaps bounds on when to use catchup mode and when to not.

What about emulating rdtsc with low res clock?

"The RDTSC instruction reads the time-stamp counter and is guaranteed to
return a monotonically increasing unique value whenever executed, except
for a 64-bit counter wraparound."

> Partly, the lack of an implementation is a deliberate choice; the
> logic for setting such bounds, and the wisdom of doing so, belongs
> in a policy agent in userspace, in our case qemu. In the end, that
> is what has full control over whether the guest TSC rate is set and
> which TSC mode is chosen.
>
> What's lacking is the ability to force the use of a certain mode. I
> think it's clear now that this needs to be a per-VM choice, not a
> global one.
>
> Zach

2010-08-25 23:38:19

by Glauber Costa

Subject: Re: [KVM timekeeping 25/35] Add clock catchup mode

On Wed, Aug 25, 2010 at 07:01:34PM -0300, Marcelo Tosatti wrote:
> On Wed, Aug 25, 2010 at 10:48:20AM -1000, Zachary Amsden wrote:
> > On 08/25/2010 07:27 AM, Marcelo Tosatti wrote:
> > >On Thu, Aug 19, 2010 at 10:07:39PM -1000, Zachary Amsden wrote:
> > >>Make the clock update handler handle generic clock synchronization,
> > >>not just KVM clock. We add a catchup mode which keeps passthrough
> > >>TSC in line with absolute guest TSC.
> > >>
> > >>Signed-off-by: Zachary Amsden <[email protected]>
> > >>---
> > >> arch/x86/include/asm/kvm_host.h | 1 +
> > >> arch/x86/kvm/x86.c | 55 ++++++++++++++++++++++++++------------
> > >> 2 files changed, 38 insertions(+), 18 deletions(-)
> > >>
> > >> kvm_x86_ops->vcpu_load(vcpu, cpu);
> > >>- if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
> > >>+ if (unlikely(vcpu->cpu != cpu) || vcpu->arch.tsc_rebase) {
> > >> /* Make sure TSC doesn't go backwards */
> > >> s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
> > >> native_read_tsc() - vcpu->arch.last_host_tsc;
> > >> if (tsc_delta < 0)
> > >> mark_tsc_unstable("KVM discovered backwards TSC");
> > >>- if (check_tsc_unstable())
> > >>+ if (check_tsc_unstable()) {
> > >> kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
> > >>- kvm_migrate_timers(vcpu);
> > >>+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> > >>+ }
> > >>+ if (vcpu->cpu != cpu)
> > >>+ kvm_migrate_timers(vcpu);
> > >> vcpu->cpu = cpu;
> > >>+ vcpu->arch.tsc_rebase = 0;
> > >> }
> > >> }
> > >>
> > >>@@ -1947,6 +1961,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> > >> kvm_x86_ops->vcpu_put(vcpu);
> > >> kvm_put_guest_fpu(vcpu);
> > >> vcpu->arch.last_host_tsc = native_read_tsc();
> > >>+
> > >>+ /* For unstable TSC, force compensation and catchup on next CPU */
> > >>+ if (check_tsc_unstable()) {
> > >>+ vcpu->arch.tsc_rebase = 1;
> > >>+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> > >>+ }
> > >The mix between catchup/trap versus stable/unstable TSC is confusing and
> > >difficult to grasp. Can you please introduce all the infrastructure
> > >first, then control usage of them in centralized places? Examples:
> > >
> > >+static void kvm_update_tsc_trapping(struct kvm *kvm)
> > >+{
> > >+ int trap, i;
> > >+ struct kvm_vcpu *vcpu;
> > >+
> > >+ trap = check_tsc_unstable() && atomic_read(&kvm->online_vcpus) > 1;
> > >+ kvm_for_each_vcpu(i, vcpu, kvm)
> > >+ kvm_x86_ops->set_tsc_trap(vcpu, trap && !vcpu->arch.time_page);
> > >+}
> > >
> > >+ /* For unstable TSC, force compensation and catchup on next CPU */
> > >+ if (check_tsc_unstable()) {
> > >+ vcpu->arch.tsc_rebase = 1;
> > >+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> > >+ }
> > >
> > >
> > >kvm_guest_time_update is becoming very confusing too. I understand this
> > >is due to the many cases it's dealing with, but please make it as simple
> > >as possible.
> >
> > I tried to comment as best as I could. I think the whole
> > "kvm_update_tsc_trapping" thing is probably a poor design choice.
> > It works, but it's thoroughly unintelligible right now without
> > spending some days figuring out why.
> >
> > I'll rework the tail series of patches to try to make them more clear.
> >
> > >+ /*
> > >+ * If we are trapping and no longer need to, use catchup to
> > >+ * ensure passthrough TSC will not be less than trapped TSC
> > >+ */
> > >+ if (vcpu->tsc_mode == TSC_MODE_PASSTHROUGH && vcpu->tsc_trapping &&
> > >+ ((this_tsc_khz <= v->kvm->arch.virtual_tsc_khz || kvmclock))) {
> > >+ catchup = 1;
> > >
> > >What, TSC trapping with kvmclock enabled?
> >
> > Transitioning to use of kvmclock after a cold boot means we may have
> > been trapping and now we will not be.
> >
> > >For both catchup and trapping the resolution of the host clock is
> > >important, as Glauber commented for kvmclock. Can you comment on the
> > >problems that arise from a low res clock for both modes?
> > >
> > >Similarly for catchup mode, the effect of exit frequency. No need for
> > >any guarantees?
> >
> > The scheduler will do something to get an IRQ at whatever resolution
> > it uses for its timeslice. That guarantees an exit per timeslice,
> > so we'll never be behind by more than one slice while scheduling.
> > While not scheduling, we're dormant anyway, waiting on either an IRQ
> > or shared memory variable change. Local timers could end up behind
> > when dormant.
> >
> > We may need a hack to accelerate firing of timers in such a case, or
> > perhaps bounds on when to use catchup mode and when to not.
>
> What about emulating rdtsc with low res clock?
>
> "The RDTSC instruction reads the time-stamp counter and is guaranteed to
> return a monotonically increasing unique value whenever executed, except
> for a 64-bit counter wraparound."
>
This is bad semantics, IMHO. It is totally different behaviour from
what guest users would expect.

2010-08-26 00:17:20

by Zachary Amsden

Subject: Re: [KVM timekeeping 25/35] Add clock catchup mode

On 08/25/2010 12:01 PM, Marcelo Tosatti wrote:
> On Wed, Aug 25, 2010 at 10:48:20AM -1000, Zachary Amsden wrote:
>
>> On 08/25/2010 07:27 AM, Marcelo Tosatti wrote:
>>
>>> On Thu, Aug 19, 2010 at 10:07:39PM -1000, Zachary Amsden wrote:
>>>
>>>> Make the clock update handler handle generic clock synchronization,
>>>> not just KVM clock. We add a catchup mode which keeps passthrough
>>>> TSC in line with absolute guest TSC.
>>>>
>>>> Signed-off-by: Zachary Amsden <[email protected]>
>>>> ---
>>>> arch/x86/include/asm/kvm_host.h | 1 +
>>>> arch/x86/kvm/x86.c | 55 ++++++++++++++++++++++++++------------
>>>> 2 files changed, 38 insertions(+), 18 deletions(-)
>>>>
>>>> kvm_x86_ops->vcpu_load(vcpu, cpu);
>>>> - if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
>>>> + if (unlikely(vcpu->cpu != cpu) || vcpu->arch.tsc_rebase) {
>>>> /* Make sure TSC doesn't go backwards */
>>>> s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
>>>> native_read_tsc() - vcpu->arch.last_host_tsc;
>>>> if (tsc_delta < 0)
>>>> mark_tsc_unstable("KVM discovered backwards TSC");
>>>> - if (check_tsc_unstable())
>>>> + if (check_tsc_unstable()) {
>>>> kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
>>>> - kvm_migrate_timers(vcpu);
>>>> + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
>>>> + }
>>>> + if (vcpu->cpu != cpu)
>>>> + kvm_migrate_timers(vcpu);
>>>> vcpu->cpu = cpu;
>>>> + vcpu->arch.tsc_rebase = 0;
>>>> }
>>>> }
>>>>
>>>> @@ -1947,6 +1961,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>>> kvm_x86_ops->vcpu_put(vcpu);
>>>> kvm_put_guest_fpu(vcpu);
>>>> vcpu->arch.last_host_tsc = native_read_tsc();
>>>> +
>>>> + /* For unstable TSC, force compensation and catchup on next CPU */
>>>> + if (check_tsc_unstable()) {
>>>> + vcpu->arch.tsc_rebase = 1;
>>>> + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
>>>> + }
>>>>
>>> The mix between catchup/trap versus stable/unstable TSC is confusing and
>>> difficult to grasp. Can you please introduce all the infrastructure
>>> first, then control usage of them in centralized places? Examples:
>>>
>>> +static void kvm_update_tsc_trapping(struct kvm *kvm)
>>> +{
>>> + int trap, i;
>>> + struct kvm_vcpu *vcpu;
>>> +
>>> + trap = check_tsc_unstable() && atomic_read(&kvm->online_vcpus) > 1;
>>> + kvm_for_each_vcpu(i, vcpu, kvm)
>>> + kvm_x86_ops->set_tsc_trap(vcpu, trap && !vcpu->arch.time_page);
>>> +}
>>>
>>> + /* For unstable TSC, force compensation and catchup on next CPU */
>>> + if (check_tsc_unstable()) {
>>> + vcpu->arch.tsc_rebase = 1;
>>> + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
>>> + }
>>>
>>>
>>> kvm_guest_time_update is becoming very confusing too. I understand this
>>> is due to the many cases it's dealing with, but please make it as simple
>>> as possible.
>>>
>> I tried to comment as best as I could. I think the whole
>> "kvm_update_tsc_trapping" thing is probably a poor design choice.
>> It works, but it's thoroughly unintelligible right now without
>> spending some days figuring out why.
>>
>> I'll rework the tail series of patches to try to make them more clear.
>>
>>
>>> + /*
>>> + * If we are trapping and no longer need to, use catchup to
>>> + * ensure passthrough TSC will not be less than trapped TSC
>>> + */
>>> + if (vcpu->tsc_mode == TSC_MODE_PASSTHROUGH && vcpu->tsc_trapping &&
>>> + ((this_tsc_khz <= v->kvm->arch.virtual_tsc_khz || kvmclock))) {
>>> + catchup = 1;
>>>
>>> What, TSC trapping with kvmclock enabled?
>>>
>> Transitioning to use of kvmclock after a cold boot means we may have
>> been trapping and now we will not be.
>>
>>
>>> For both catchup and trapping the resolution of the host clock is
>>> important, as Glauber commented for kvmclock. Can you comment on the
>>> problems that arise from a low res clock for both modes?
>>>
>>> Similarly for catchup mode, the effect of exit frequency. No need for
>>> any guarantees?
>>>
>> The scheduler will do something to get an IRQ at whatever resolution
>> it uses for its timeslice. That guarantees an exit per timeslice,
>> so we'll never be behind by more than one slice while scheduling.
>> While not scheduling, we're dormant anyway, waiting on either an IRQ
>> or shared memory variable change. Local timers could end up behind
>> when dormant.
>>
>> We may need a hack to accelerate firing of timers in such a case, or
>> perhaps bounds on when to use catchup mode and when to not.
>>
> What about emulating rdtsc with low res clock?
>
> "The RDTSC instruction reads the time-stamp counter and is guaranteed to
> return a monotonically increasing unique value whenever executed, except
> for a 64-bit counter wraparound."
>

Technically, that may not be quite correct.

<digression into weeds>

The RDTSC instruction will return a monotonically increasing unique
value, but the execution and retirement of the instruction are
unserialized. So technically, two simultaneous RDTSC instructions could
be issued to multiple execution units, and they may either return the
same value, or the earlier one may stall and complete after the later
one.

rdtsc                   # first read into %edx:%eax
mov %eax, %ebx          # save low word
mov %edx, %ecx          # save high word
rdtsc                   # second read into %edx:%eax
cmp %ecx, %edx          # compare high words (new vs. old)
jb fail                 # new high < old high: went backwards
ja good                 # new high > old high: strictly increased
cmp %ebx, %eax          # high words equal: compare low words
jbe fail                # new low <= old low: not unique/increasing
jmp good
fail:
int3
good:
ret

If execution of RDTSC is restricted to a single issue unit, this can
never fail. If it can be issued simultaneously in multiple units, it
can fail, because register renaming may end up reordering the
instruction stream and removing dependencies so that it can be executed
as:

UNIT 1                    UNIT 2
rdtsc                     rdtsc
mov %eax, %ebx            (store to local %edx, %eax)
mov %edx, %ecx            cmp %ebx, local %eax
                          (commit local %edx, %eax to
cmp %ecx, %edx             global register)
jb fail
jbe fail

Both failure modes could be observed if this is indeed the case. I'm
not aware that anything is specifically done to maintain serialization
internally, and since the architecture explicitly states that RDTSC is
unserialized, I doubt anything is done to prevent this situation.

</digression into weeds>

However, that's not the pertinent issue. If the clock is very low res,
we don't present a higher granularity TSC to the guest.

While there are things that can be done to ensure uniqueness (add 1 for
each read, estimate with TSC..), they have problems of their own and in
general will make things very messy.

Given the above digression, I'm not sure that any code relying on such
guarantees is actually sound.

It is plausible, however, that someone computes

count of some value / (TSC2 - TSC1)

and ends up with a divide by zero. So it may be better to bump the
counter by at least one for each call.
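
A minimal sketch of that bump-by-one idea (illustrative; the helper
names here are hypothetical, not KVM's):

#include <stdint.h>

/* Assumed helpers: a coarse clocksource read and ns->cycles scaling. */
extern uint64_t read_coarse_ns(void);
extern uint64_t ns_to_guest_cycles(uint64_t ns);

static uint64_t last_tsc;	/* per-VM state; needs locking in reality */

/*
 * Emulated RDTSC that never returns the same value twice, so a
 * guest computing x / (TSC2 - TSC1) cannot divide by zero even
 * when the backing clock is low resolution.
 */
uint64_t emulate_rdtsc(void)
{
	uint64_t tsc = ns_to_guest_cycles(read_coarse_ns());

	if (tsc <= last_tsc)
		tsc = last_tsc + 1;
	last_tsc = tsc;
	return tsc;
}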