2019-08-15 04:06:43

by Wanpeng Li

[permalink] [raw]
Subject: [PATCH v2] KVM: LAPIC: Periodically revaluate to get conservative lapic_timer_advance_ns

From: Wanpeng Li <[email protected]>

Even if for realtime CPUs, cache line bounces, frequency scaling, presence
of higher-priority RT tasks, etc can still cause different response. These
interferences should be considered and periodically revaluate whether
or not the lapic_timer_advance_ns value is the best, do nothing if it is,
otherwise recaluate again. Set lapic_timer_advance_ns to the minimal
conservative value from all the estimated values.

Testing on Skylake server, cat vcpu*/lapic_timer_advance_ns, before patch:
1628
4161
4321
3236
...

Testing on Skylake server, cat vcpu*/lapic_timer_advance_ns, after patch:
1553
1499
1509
1489
...

Testing on Haswell desktop, cat vcpu*/lapic_timer_advance_ns, before patch:
4617
3641
4102
4577
...
Testing on Haswell desktop, cat vcpu*/lapic_timer_advance_ns, after patch:
2775
2892
2764
2775
...

Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
arch/x86/kvm/lapic.c | 34 ++++++++++++++++++++++++++++------
arch/x86/kvm/lapic.h | 2 ++
2 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index df5cd07..8487d9c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -69,6 +69,7 @@
#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
+#define LAPIC_TIMER_ADVANCE_RECALC_PERIOD (600 * HZ)

static inline int apic_test_vector(int vec, void *bitmap)
{
@@ -1480,10 +1481,21 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
s64 advance_expire_delta)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
- u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
+ struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
+ u32 timer_advance_ns = ktimer->timer_advance_ns;
u64 ns;

+ /* periodic revaluate */
+ if (unlikely(ktimer->timer_advance_adjust_done)) {
+ ktimer->recalc_timer_advance_ns = jiffies +
+ LAPIC_TIMER_ADVANCE_RECALC_PERIOD;
+ if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_DONE) {
+ timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
+ ktimer->timer_advance_adjust_done = false;
+ } else
+ return;
+ }
+
/* too early */
if (advance_expire_delta < 0) {
ns = -advance_expire_delta * 1000000ULL;
@@ -1499,12 +1511,18 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
}

if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
- apic->lapic_timer.timer_advance_adjust_done = true;
+ ktimer->timer_advance_adjust_done = true;
if (unlikely(timer_advance_ns > 5000)) {
timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
- apic->lapic_timer.timer_advance_adjust_done = false;
+ ktimer->timer_advance_adjust_done = false;
+ }
+ ktimer->timer_advance_ns = timer_advance_ns;
+
+ if (ktimer->timer_advance_adjust_done) {
+ if (ktimer->min_timer_advance_ns > timer_advance_ns)
+ ktimer->min_timer_advance_ns = timer_advance_ns;
+ ktimer->timer_advance_ns = ktimer->min_timer_advance_ns;
}
- apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}

static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
@@ -1523,7 +1541,8 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);

- if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
+ if (unlikely(!apic->lapic_timer.timer_advance_adjust_done) ||
+ time_before(apic->lapic_timer.recalc_timer_advance_ns, jiffies))
adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
}

@@ -2301,9 +2320,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
if (timer_advance_ns == -1) {
apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
apic->lapic_timer.timer_advance_adjust_done = false;
+ apic->lapic_timer.recalc_timer_advance_ns = jiffies;
+ apic->lapic_timer.min_timer_advance_ns = UINT_MAX;
} else {
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
apic->lapic_timer.timer_advance_adjust_done = true;
+ apic->lapic_timer.recalc_timer_advance_ns = MAX_JIFFY_OFFSET;
}


diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 50053d2..56a05eb 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,6 +31,8 @@ struct kvm_timer {
u32 timer_mode_mask;
u64 tscdeadline;
u64 expired_tscdeadline;
+ unsigned long recalc_timer_advance_ns;
+ u32 min_timer_advance_ns;
u32 timer_advance_ns;
s64 advance_expire_delta;
atomic_t pending; /* accumulated triggered timers */
--
2.7.4


2019-08-22 06:25:51

by Wanpeng Li

[permalink] [raw]
Subject: Re: [PATCH v2] KVM: LAPIC: Periodically revaluate to get conservative lapic_timer_advance_ns

ping,
On Thu, 15 Aug 2019 at 12:03, Wanpeng Li <[email protected]> wrote:
>
> From: Wanpeng Li <[email protected]>
>
> Even if for realtime CPUs, cache line bounces, frequency scaling, presence
> of higher-priority RT tasks, etc can still cause different response. These
> interferences should be considered and periodically revaluate whether
> or not the lapic_timer_advance_ns value is the best, do nothing if it is,
> otherwise recaluate again. Set lapic_timer_advance_ns to the minimal
> conservative value from all the estimated values.
>
> Testing on Skylake server, cat vcpu*/lapic_timer_advance_ns, before patch:
> 1628
> 4161
> 4321
> 3236
> ...
>
> Testing on Skylake server, cat vcpu*/lapic_timer_advance_ns, after patch:
> 1553
> 1499
> 1509
> 1489
> ...
>
> Testing on Haswell desktop, cat vcpu*/lapic_timer_advance_ns, before patch:
> 4617
> 3641
> 4102
> 4577
> ...
> Testing on Haswell desktop, cat vcpu*/lapic_timer_advance_ns, after patch:
> 2775
> 2892
> 2764
> 2775
> ...
>
> Cc: Paolo Bonzini <[email protected]>
> Cc: Radim Krčmář <[email protected]>
> Signed-off-by: Wanpeng Li <[email protected]>
> ---
> arch/x86/kvm/lapic.c | 34 ++++++++++++++++++++++++++++------
> arch/x86/kvm/lapic.h | 2 ++
> 2 files changed, 30 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index df5cd07..8487d9c 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -69,6 +69,7 @@
> #define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
> /* step-by-step approximation to mitigate fluctuation */
> #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
> +#define LAPIC_TIMER_ADVANCE_RECALC_PERIOD (600 * HZ)
>
> static inline int apic_test_vector(int vec, void *bitmap)
> {
> @@ -1480,10 +1481,21 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
> static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
> s64 advance_expire_delta)
> {
> - struct kvm_lapic *apic = vcpu->arch.apic;
> - u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
> + struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
> + u32 timer_advance_ns = ktimer->timer_advance_ns;
> u64 ns;
>
> + /* periodic revaluate */
> + if (unlikely(ktimer->timer_advance_adjust_done)) {
> + ktimer->recalc_timer_advance_ns = jiffies +
> + LAPIC_TIMER_ADVANCE_RECALC_PERIOD;
> + if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_DONE) {
> + timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
> + ktimer->timer_advance_adjust_done = false;
> + } else
> + return;
> + }
> +
> /* too early */
> if (advance_expire_delta < 0) {
> ns = -advance_expire_delta * 1000000ULL;
> @@ -1499,12 +1511,18 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
> }
>
> if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
> - apic->lapic_timer.timer_advance_adjust_done = true;
> + ktimer->timer_advance_adjust_done = true;
> if (unlikely(timer_advance_ns > 5000)) {
> timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
> - apic->lapic_timer.timer_advance_adjust_done = false;
> + ktimer->timer_advance_adjust_done = false;
> + }
> + ktimer->timer_advance_ns = timer_advance_ns;
> +
> + if (ktimer->timer_advance_adjust_done) {
> + if (ktimer->min_timer_advance_ns > timer_advance_ns)
> + ktimer->min_timer_advance_ns = timer_advance_ns;
> + ktimer->timer_advance_ns = ktimer->min_timer_advance_ns;
> }
> - apic->lapic_timer.timer_advance_ns = timer_advance_ns;
> }
>
> static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
> @@ -1523,7 +1541,8 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
> if (guest_tsc < tsc_deadline)
> __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
>
> - if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
> + if (unlikely(!apic->lapic_timer.timer_advance_adjust_done) ||
> + time_before(apic->lapic_timer.recalc_timer_advance_ns, jiffies))
> adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
> }
>
> @@ -2301,9 +2320,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
> if (timer_advance_ns == -1) {
> apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
> apic->lapic_timer.timer_advance_adjust_done = false;
> + apic->lapic_timer.recalc_timer_advance_ns = jiffies;
> + apic->lapic_timer.min_timer_advance_ns = UINT_MAX;
> } else {
> apic->lapic_timer.timer_advance_ns = timer_advance_ns;
> apic->lapic_timer.timer_advance_adjust_done = true;
> + apic->lapic_timer.recalc_timer_advance_ns = MAX_JIFFY_OFFSET;
> }
>
>
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 50053d2..56a05eb 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -31,6 +31,8 @@ struct kvm_timer {
> u32 timer_mode_mask;
> u64 tscdeadline;
> u64 expired_tscdeadline;
> + unsigned long recalc_timer_advance_ns;
> + u32 min_timer_advance_ns;
> u32 timer_advance_ns;
> s64 advance_expire_delta;
> atomic_t pending; /* accumulated triggered timers */
> --
> 2.7.4
>

2019-08-27 17:50:45

by Radim Krčmář

[permalink] [raw]
Subject: Re: [PATCH v2] KVM: LAPIC: Periodically revaluate to get conservative lapic_timer_advance_ns

2019-08-15 12:03+0800, Wanpeng Li:
> From: Wanpeng Li <[email protected]>
>
> Even if for realtime CPUs, cache line bounces, frequency scaling, presence
> of higher-priority RT tasks, etc can still cause different response. These
> interferences should be considered and periodically revaluate whether
> or not the lapic_timer_advance_ns value is the best, do nothing if it is,
> otherwise recaluate again. Set lapic_timer_advance_ns to the minimal
> conservative value from all the estimated values.

IIUC, this patch is looking for the minimal timer_advance_ns because it
provides the best throughput:
When every code path ran as fast as possible and we don't have to wait
for the timer to expire, but still arrive exactly at the point when it
would have expired.
We always arrive late late if something delayed the execution, which
means higher latencies, but RT shouldn't be using an adaptive algorithm
anyway, so that is not an issue.

The computed conservative timer_advance_ns will converge to the minimal
measured timer_advance_ns as time progresses, because it can only go
down and will do so repeatedly by small steps as even one smaller
measurement sufficiently close is enough to decrease it.

With that in mind, wouldn't the following patch (completely untested)
give you about the same numbers?

The idea is that if we have to wait, we are wasting time and therefore
decrease timer_advance_ns to eliminate the time spent waiting.

The first run is special and just sets timer_advance_ns to the latency
we measured, regardless of what it is -- it deals with the possibility
that the default was too low.

This algorithm is also likely prone to turbo boost making few runs
faster than what is then achieved during a more common workload, but
we'd need to have a sliding window or some other more sophisticated
algorithm in order to deal with that.

I also like Paolo's idea of a smoothing -- if we use a moving average
based on advance_expire_delta, we wouldn't even have to convert it into
ns unless it reached a given threshold, which could make decently fast
to be run every time.

Something like

moving_average = (apic->lapic_timer.moving_average * (weight - 1) + advance_expire_delta) / weight

if (moving_average > threshold)
recompute timer_advance_ns

apic->lapic_timer.moving_average = moving_average

where weight would be a of 2 to make the operation fast.

This kind of moving average gives less value to old inputs and the
weight allows us to control the reaction speed of the approximation.
(A small number like 4 or 8 seems about right.)

I don't have any information on the latency, though.
Do you think that the added overhead isn't worth the smoothing?

Thanks.

---8<---
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e904ff06a83d..d7f2af2eb3ce 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1491,23 +1491,20 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
if (advance_expire_delta < 0) {
ns = -advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns -= min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns -= (u32)ns;
} else {
/* too late */
+ /* This branch can only be taken on the initial calibration. */
+ if (apic->lapic_timer.timer_advance_adjust_done)
+ pr_err_once("kvm: broken expectation in lapic timer_advance_ns");
+
ns = advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns += min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns += (u32)ns;
}

- if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
- apic->lapic_timer.timer_advance_adjust_done = true;
- if (unlikely(timer_advance_ns > 5000)) {
- timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
- apic->lapic_timer.timer_advance_adjust_done = false;
- }
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+ apic->lapic_timer.timer_advance_adjust_done = true;
}

static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
@@ -1526,7 +1523,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);

- if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
+ if (unlikely(!apic->lapic_timer.timer_advance_adjust_done) || guest_tsc < tsc_deadline)
adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
}

2019-08-28 08:10:33

by Wanpeng Li

[permalink] [raw]
Subject: Re: [PATCH v2] KVM: LAPIC: Periodically revaluate to get conservative lapic_timer_advance_ns

On Wed, 28 Aug 2019 at 01:47, Radim Krčmář <[email protected]> wrote:
>
> 2019-08-15 12:03+0800, Wanpeng Li:
> > From: Wanpeng Li <[email protected]>
> >
> > Even if for realtime CPUs, cache line bounces, frequency scaling, presence
> > of higher-priority RT tasks, etc can still cause different response. These
> > interferences should be considered and periodically revaluate whether
> > or not the lapic_timer_advance_ns value is the best, do nothing if it is,
> > otherwise recaluate again. Set lapic_timer_advance_ns to the minimal
> > conservative value from all the estimated values.
>
> IIUC, this patch is looking for the minimal timer_advance_ns because it
> provides the best throughput:
> When every code path ran as fast as possible and we don't have to wait
> for the timer to expire, but still arrive exactly at the point when it
> would have expired.
> We always arrive late late if something delayed the execution, which
> means higher latencies, but RT shouldn't be using an adaptive algorithm
> anyway, so that is not an issue.
>
> The computed conservative timer_advance_ns will converge to the minimal
> measured timer_advance_ns as time progresses, because it can only go
> down and will do so repeatedly by small steps as even one smaller
> measurement sufficiently close is enough to decrease it.
>
> With that in mind, wouldn't the following patch (completely untested)
> give you about the same numbers?
>
> The idea is that if we have to wait, we are wasting time and therefore
> decrease timer_advance_ns to eliminate the time spent waiting.
>
> The first run is special and just sets timer_advance_ns to the latency
> we measured, regardless of what it is -- it deals with the possibility
> that the default was too low.
>
> This algorithm is also likely prone to turbo boost making few runs
> faster than what is then achieved during a more common workload, but
> we'd need to have a sliding window or some other more sophisticated
> algorithm in order to deal with that.
>
> I also like Paolo's idea of a smoothing -- if we use a moving average
> based on advance_expire_delta, we wouldn't even have to convert it into
> ns unless it reached a given threshold, which could make decently fast
> to be run every time.
>
> Something like
>
> moving_average = (apic->lapic_timer.moving_average * (weight - 1) + advance_expire_delta) / weight
>
> if (moving_average > threshold)
> recompute timer_advance_ns
>
> apic->lapic_timer.moving_average = moving_average
>
> where weight would be a of 2 to make the operation fast.
>
> This kind of moving average gives less value to old inputs and the
> weight allows us to control the reaction speed of the approximation.
> (A small number like 4 or 8 seems about right.)
>
> I don't have any information on the latency, though.
> Do you think that the added overhead isn't worth the smoothing?
>
> Thanks.
>
> ---8<---
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index e904ff06a83d..d7f2af2eb3ce 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1491,23 +1491,20 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
> if (advance_expire_delta < 0) {
> ns = -advance_expire_delta * 1000000ULL;
> do_div(ns, vcpu->arch.virtual_tsc_khz);
> - timer_advance_ns -= min((u32)ns,
> - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> + timer_advance_ns -= (u32)ns;
> } else {
> /* too late */
> + /* This branch can only be taken on the initial calibration. */
> + if (apic->lapic_timer.timer_advance_adjust_done)
> + pr_err_once("kvm: broken expectation in lapic timer_advance_ns");
> +
> ns = advance_expire_delta * 1000000ULL;
> do_div(ns, vcpu->arch.virtual_tsc_khz);
> - timer_advance_ns += min((u32)ns,
> - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> + timer_advance_ns += (u32)ns;
> }
>
> - if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
> - apic->lapic_timer.timer_advance_adjust_done = true;
> - if (unlikely(timer_advance_ns > 5000)) {
> - timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
> - apic->lapic_timer.timer_advance_adjust_done = false;
> - }
> apic->lapic_timer.timer_advance_ns = timer_advance_ns;
> + apic->lapic_timer.timer_advance_adjust_done = true;
> }
>
> static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
> @@ -1526,7 +1523,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
> if (guest_tsc < tsc_deadline)
> __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
>
> - if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
> + if (unlikely(!apic->lapic_timer.timer_advance_adjust_done) || guest_tsc < tsc_deadline)
> adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
> }
>

Something like below, the result is not as good as we expected. How
about v3? I use moving average to be smooth and filter out drastic
fluctuation which prevents it before.

(testing on Skylake server, the lapic_timer_advance_ns is around
1500ns for my v2 patch)
# cat vcpu*/lapic_timer_advance_ns
483
263
211
15

---8<---
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e904ff0..bdc0702 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1485,29 +1485,50 @@ static inline void
adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
{
struct kvm_lapic *apic = vcpu->arch.apic;
u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
+ s64 moving_average;
u64 ns;

+ if (abs(advance_expire_delta) > 10000)
+ /* filter out drastic flunctuations */
+ return;
+
+ if (apic->lapic_timer.timer_advance_adjust_done) {
+ moving_average = (apic->lapic_timer.moving_average *
+ (LAPIC_TIMER_ADVANCE_ADJUST_STEP - 1) +
+ advance_expire_delta) / LAPIC_TIMER_ADVANCE_ADJUST_STEP;
+
+ if (abs(moving_average) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
+ /* no update for random fluctuations */
+ return;
+
+ apic->lapic_timer.moving_average = moving_average;
+ advance_expire_delta = moving_average;
+ } else
+ apic->lapic_timer.moving_average = advance_expire_delta;
+
/* too early */
if (advance_expire_delta < 0) {
ns = -advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns -= min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns -= (u32)ns;
} else {
/* too late */
+ /* This branch can only be taken on the initial calibration. */
+ if (apic->lapic_timer.timer_advance_adjust_done)
+ pr_err_once("kvm: broken expectation in lapic timer_advance_ns");
+
ns = advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns += min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns += (u32)ns;
}

- if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
- apic->lapic_timer.timer_advance_adjust_done = true;
if (unlikely(timer_advance_ns > 5000)) {
timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
apic->lapic_timer.timer_advance_adjust_done = false;
}
+
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+ apic->lapic_timer.timer_advance_adjust_done = true;
}

static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
@@ -1526,7 +1547,8 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);

- if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
+ if (unlikely(!apic->lapic_timer.timer_advance_adjust_done) ||
+ guest_tsc < tsc_deadline)
adjust_lapic_timer_advance(vcpu,
apic->lapic_timer.advance_expire_delta);
}

diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 50053d2..2e6e499 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -33,6 +33,7 @@ struct kvm_timer {
u64 expired_tscdeadline;
u32 timer_advance_ns;
s64 advance_expire_delta;
+ s64 moving_average;
atomic_t pending; /* accumulated triggered timers */
bool hv_timer_in_use;
bool timer_advance_adjust_done;