2016-11-11 10:12:27

by Paolo Bonzini

[permalink] [raw]
Subject: [PATCH] KVM: x86: do not go through vcpu in __get_kvmclock_ns

Going through the first VCPU is wrong if you follow a KVM_SET_CLOCK with
a KVM_GET_CLOCK immediately after, without letting the VCPU run and
call kvm_guest_time_update.

This is easily fixed however, because kvm_get_time_and_clockread provides
the information we want.

Reported-by: Marcelo Tosatti <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/x86.c | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1ba08278a9a9..1c16c6d7df7a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1620,6 +1620,11 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)

return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
}
+#else
+static inline bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
+{
+ return false;
+}
#endif

/*
@@ -1724,18 +1729,15 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)

static u64 __get_kvmclock_ns(struct kvm *kvm)
{
- struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
struct kvm_arch *ka = &kvm->arch;
+ cycle_t cycle_now;
s64 ns;

- if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
- u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
- ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
- } else {
- ns = ktime_get_boot_ns() + ka->kvmclock_offset;
- }
+ if (!ka->use_master_clock ||
+ !kvm_get_time_and_clockread(&ns, &cycle_now))
+ ns = ktime_get_boot_ns();

- return ns;
+ return ns + ka->kvmclock_offset;
}

u64 get_kvmclock_ns(struct kvm *kvm)
--
1.8.3.1


2016-11-14 14:59:01

by Radim Krčmář

[permalink] [raw]
Subject: Re: [PATCH] KVM: x86: do not go through vcpu in __get_kvmclock_ns

2016-11-11 11:12+0100, Paolo Bonzini:
> Going through the first VCPU is wrong if you follow a KVM_SET_CLOCK with
> a KVM_GET_CLOCK immediately after, without letting the VCPU run and
> call kvm_guest_time_update.
>
> This is easily fixed however, because kvm_get_time_and_clockread provides
> the information we want.
>
> Reported-by: Marcelo Tosatti <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/x86.c | 18 ++++++++++--------
> 1 file changed, 10 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1ba08278a9a9..1c16c6d7df7a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1620,6 +1620,11 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
>
> return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
> }
> +#else
> +static inline bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
> +{
> + return false;
> +}
> #endif
>
> /*
> @@ -1724,18 +1729,15 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>
> static u64 __get_kvmclock_ns(struct kvm *kvm)
> {
> - struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
> struct kvm_arch *ka = &kvm->arch;
> + cycle_t cycle_now;
> s64 ns;
>
> - if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
> - u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> - ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);

This patch regresses the behavior as well, because the assumption that
kvm_get_time_and_clockread() and __pvclock_read_cycles() count the same
time doesn't hold. See the end of the message for a quick test.

kvm_get_time_and_clockread() is actually the same as ktime_get_boot_ns()
(if it works), so we'd be just obfucating the code. :)

I think that making kvmclock count as ktime_get_boot_ns() would be the
best solution, but not possible this late in 4.9 ...

As a quick hack, I think it would be better to duplicate the update that
would happen when running the VCPU before calling
__pvclock_read_cycles(), i.e. paste something like this:

if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu))
kvm_guest_time_update(vcpu);

> - } else {
> - ns = ktime_get_boot_ns() + ka->kvmclock_offset;
> - }
> + if (!ka->use_master_clock ||
> + !kvm_get_time_and_clockread(&ns, &cycle_now))
> + ns = ktime_get_boot_ns();
>
> - return ns;
> + return ns + ka->kvmclock_offset;
> }

The hunk below should return the same value in pvclock_ns and kernel_ns
if they can be used interchangeably. boot_ns is expected to be a bit
delayed, because it is read late. boot_ns shows a bounded offset from
kernel_ns, unlike the drifting pvclock_ns.

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 83990ad3710e..30d4d3d02ac7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6653,6 +6653,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}

+ if (vcpu->kvm->arch.use_master_clock) {
+ s64 kernel_ns;
+ cycle_t tsc_now, pvclock_ns, boot_ns;
+
+ kvm_get_time_and_clockread(&kernel_ns, &tsc_now);
+ pvclock_ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, kvm_read_l1_tsc(vcpu, tsc_now)) - vcpu->kvm->arch.kvmclock_offset;
+ boot_ns = ktime_get_boot_ns();
+
+ printk("ns diff: %lld %lld\n", pvclock_ns - kernel_ns, boot_ns - kernel_ns);
+ }
+
preempt_disable();

kvm_x86_ops->prepare_guest_switch(vcpu);

and a sample output:

[ 2386.172552] ns diff: -1 270
[ 2386.175695] ns diff: 14 246
[ 2386.178831] ns diff: 28 157
[ 2386.181962] ns diff: 43 176
[ 2386.185089] ns diff: 57 146
[ 2386.188228] ns diff: 71 170
[ 2386.191364] ns diff: 85 142
[ 2386.194494] ns diff: 100 138
[ 2386.197717] ns diff: 115 129
[ 2386.200935] ns diff: 128 190
[ 2386.204156] ns diff: 144 125
[ 2386.207797] ns diff: 161 145
[ 2386.211018] ns diff: 175 120
[ 2386.214239] ns diff: 189 176
[ 2386.217460] ns diff: 203 149
[ 2386.220677] ns diff: 219 110
[ 2386.223900] ns diff: 234 124
[ 2386.227125] ns diff: 248 303
[ 2386.230345] ns diff: 262 111
[ 2386.233565] ns diff: 278 107
[ 2386.236783] ns diff: 292 107
[ 2386.240001] ns diff: 306 170
[ 2386.243228] ns diff: 321 140
[ 2386.246447] ns diff: 336 101
[ 2386.249667] ns diff: 351 96
[ 2386.252787] ns diff: 365 145
[ 2386.256008] ns diff: 379 107
[ 2386.259225] ns diff: 394 99
[ 2386.262346] ns diff: 409 95
[ 2386.265467] ns diff: 423 95
[ 2386.268588] ns diff: 437 97
[ 2386.271708] ns diff: 451 165
[ 2386.274925] ns diff: 467 97
[ 2386.278047] ns diff: 480 95
[ 2386.281176] ns diff: 494 120
[ 2386.284398] ns diff: 509 408
[ 2386.287616] ns diff: 524 88
[ 2386.290736] ns diff: 538 87
[ 2386.293858] ns diff: 552 89
[...]
[ 2403.966262] ns diff: 81022 68
[ 2403.969567] ns diff: 81037 68
[ 2403.972878] ns diff: 81052 68
[ 2403.976189] ns diff: 81067 66
[ 2403.979504] ns diff: 81082 69
[ 2403.982815] ns diff: 81097 67
[ 2403.986127] ns diff: 81113 67
[ 2403.989439] ns diff: 81127 69
[ 2403.992752] ns diff: 81143 70
[ 2403.996065] ns diff: 81158 70
[...]
[ 2753.756071] ns diff: 1686562 65
[ 2753.759567] ns diff: 1686578 65
[ 2753.763073] ns diff: 1686596 65
[ 2753.766570] ns diff: 1686611 66
[ 2753.770075] ns diff: 1686627 67
[ 2753.773580] ns diff: 1686643 67
[ 2753.777086] ns diff: 1686658 66
[ 2753.780584] ns diff: 1686674 65
[ 2753.784093] ns diff: 1686690 69
[ 2753.787589] ns diff: 1686706 66

2016-11-14 16:18:20

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH] KVM: x86: do not go through vcpu in __get_kvmclock_ns



On 14/11/2016 15:52, Radim Krčmář wrote:
> The hunk below should return the same value in pvclock_ns and kernel_ns
> if they can be used interchangeably. boot_ns is expected to be a bit
> delayed, because it is read late. boot_ns shows a bounded offset from
> kernel_ns, unlike the drifting pvclock_ns.
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 83990ad3710e..30d4d3d02ac7 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6653,6 +6653,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> goto cancel_injection;
> }
>
> + if (vcpu->kvm->arch.use_master_clock) {
> + s64 kernel_ns;
> + cycle_t tsc_now, pvclock_ns, boot_ns;
> +
> + kvm_get_time_and_clockread(&kernel_ns, &tsc_now);
> + pvclock_ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, kvm_read_l1_tsc(vcpu, tsc_now)) - vcpu->kvm->arch.kvmclock_offset;
> + boot_ns = ktime_get_boot_ns();
> +
> + printk("ns diff: %lld %lld\n", pvclock_ns - kernel_ns, boot_ns - kernel_ns);
> + }
> +
> preempt_disable();
>
> kvm_x86_ops->prepare_guest_switch(vcpu);

Ok, I'll post a v2 of this patch.

Paolo

2016-11-14 17:13:46

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH] KVM: x86: do not go through vcpu in __get_kvmclock_ns

On Mon, Nov 14, 2016 at 03:52:40PM +0100, Radim Krčmář wrote:
> 2016-11-11 11:12+0100, Paolo Bonzini:
> > Going through the first VCPU is wrong if you follow a KVM_SET_CLOCK with
> > a KVM_GET_CLOCK immediately after, without letting the VCPU run and
> > call kvm_guest_time_update.
> >
> > This is easily fixed however, because kvm_get_time_and_clockread provides
> > the information we want.
> >
> > Reported-by: Marcelo Tosatti <[email protected]>
> > Signed-off-by: Paolo Bonzini <[email protected]>
> > ---
> > arch/x86/kvm/x86.c | 18 ++++++++++--------
> > 1 file changed, 10 insertions(+), 8 deletions(-)
> >
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 1ba08278a9a9..1c16c6d7df7a 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -1620,6 +1620,11 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
> >
> > return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
> > }
> > +#else
> > +static inline bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
> > +{
> > + return false;
> > +}
> > #endif
> >
> > /*
> > @@ -1724,18 +1729,15 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
> >
> > static u64 __get_kvmclock_ns(struct kvm *kvm)
> > {
> > - struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
> > struct kvm_arch *ka = &kvm->arch;
> > + cycle_t cycle_now;
> > s64 ns;
> >
> > - if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
> > - u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> > - ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
>
> This patch regresses the behavior as well, because the assumption that
> kvm_get_time_and_clockread() and __pvclock_read_cycles() count the same
> time doesn't hold. See the end of the message for a quick test.
>
> kvm_get_time_and_clockread() is actually the same as ktime_get_boot_ns()
> (if it works), so we'd be just obfucating the code. :)
>
> I think that making kvmclock count as ktime_get_boot_ns() would be the
> best solution, but not possible this late in 4.9 ...
>
> As a quick hack, I think it would be better to duplicate the update that
> would happen when running the VCPU before calling
> __pvclock_read_cycles(), i.e. paste something like this:
>
> if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu))
> kvm_guest_time_update(vcpu);
>
> > - } else {
> > - ns = ktime_get_boot_ns() + ka->kvmclock_offset;
> > - }
> > + if (!ka->use_master_clock ||
> > + !kvm_get_time_and_clockread(&ns, &cycle_now))
> > + ns = ktime_get_boot_ns();
> >
> > - return ns;
> > + return ns + ka->kvmclock_offset;
> > }
>
> The hunk below should return the same value in pvclock_ns and kernel_ns
> if they can be used interchangeably. boot_ns is expected to be a bit
> delayed, because it is read late. boot_ns shows a bounded offset from
> kernel_ns, unlike the drifting pvclock_ns.
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 83990ad3710e..30d4d3d02ac7 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6653,6 +6653,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> goto cancel_injection;
> }
>
> + if (vcpu->kvm->arch.use_master_clock) {
> + s64 kernel_ns;
> + cycle_t tsc_now, pvclock_ns, boot_ns;
> +
> + kvm_get_time_and_clockread(&kernel_ns, &tsc_now);
> + pvclock_ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, kvm_read_l1_tsc(vcpu, tsc_now)) - vcpu->kvm->arch.kvmclock_offset;
> + boot_ns = ktime_get_boot_ns();
> +
> + printk("ns diff: %lld %lld\n", pvclock_ns - kernel_ns, boot_ns - kernel_ns);
> + }
> +
> preempt_disable();
>
> kvm_x86_ops->prepare_guest_switch(vcpu);
>
> and a sample output:

KVM_GET_CLOCK should return what the guest sees at the moment
KVM_GET_CLOCK is called, which should include

if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
} else {
ns = ktime_get_boot_ns() + ka->kvmclock_offset;

>>> add (rdtsc() - tsc_timestamp),
if kvmclock is enabled
}

The addition under >>> above.