2019-09-06 10:11:14

by Wanpeng Li

[permalink] [raw]
Subject: [PATCH RESEND v3 1/5] KVM: LAPIC: Tune lapic_timer_advance_ns smoothly

From: Wanpeng Li <[email protected]>

Using a moving average based on per-vCPU lapic_timer_advance_ns to tune
smoothly, filter out drastic fluctuation which prevents this before,
let's assume it is 10000 cycles.

Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
arch/x86/kvm/lapic.c | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e904ff0..2f4a48a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -69,6 +69,7 @@
#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
+#define LAPIC_TIMER_ADVANCE_FILTER 10000

static inline int apic_test_vector(int vec, void *bitmap)
{
@@ -1484,23 +1485,28 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
s64 advance_expire_delta)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
- u64 ns;
+ u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns, ns;
+
+ if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_FILTER)
+ /* filter out drastic fluctuations */
+ return;

/* too early */
if (advance_expire_delta < 0) {
ns = -advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns -= min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns -= ns;
} else {
/* too late */
ns = advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns += min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns += ns;
}

+ timer_advance_ns = (apic->lapic_timer.timer_advance_ns *
+ (LAPIC_TIMER_ADVANCE_ADJUST_STEP - 1) + advance_expire_delta) /
+ LAPIC_TIMER_ADVANCE_ADJUST_STEP;
+
if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
apic->lapic_timer.timer_advance_adjust_done = true;
if (unlikely(timer_advance_ns > 5000)) {
--
2.7.4


2019-09-06 10:12:09

by Wanpeng Li

[permalink] [raw]
Subject: [PATCH RESEND v3 2/5] KVM: LAPIC: Periodically revaluate to get conservative lapic_timer_advance_ns

From: Wanpeng Li <[email protected]>

Even if for realtime CPUs, cache line bounces, frequency scaling, presence
of higher-priority RT tasks, etc can still cause different response. These
interferences should be considered and periodically revaluate whether
or not the lapic_timer_advance_ns value is the best, do nothing if it is,
otherwise recaluate again. Set lapic_timer_advance_ns to the minimal
conservative value from all the estimated values.

Testing on Skylake server, cat vcpu*/lapic_timer_advance_ns, before patch:
1628
4161
4321
3236
...

Testing on Skylake server, cat vcpu*/lapic_timer_advance_ns, after patch:
1553
1499
1509
1489
...

Testing on Haswell desktop, cat vcpu*/lapic_timer_advance_ns, before patch:
4617
3641
4102
4577
...
Testing on Haswell desktop, cat vcpu*/lapic_timer_advance_ns, after patch:
2775
2892
2764
2775
...

Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
arch/x86/kvm/lapic.c | 37 ++++++++++++++++++++++++++++++-------
arch/x86/kvm/lapic.h | 2 ++
2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2f4a48a..12ade70 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -70,6 +70,7 @@
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
#define LAPIC_TIMER_ADVANCE_FILTER 10000
+#define LAPIC_TIMER_ADVANCE_RECALC_PERIOD (600 * HZ)

static inline int apic_test_vector(int vec, void *bitmap)
{
@@ -1484,13 +1485,24 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
s64 advance_expire_delta)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
- u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns, ns;
+ struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
+ u32 timer_advance_ns = ktimer->timer_advance_ns, ns;

if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_FILTER)
/* filter out drastic fluctuations */
return;

+ /* periodic revaluate */
+ if (unlikely(ktimer->timer_advance_adjust_done)) {
+ ktimer->recalc_timer_advance_ns = jiffies +
+ LAPIC_TIMER_ADVANCE_RECALC_PERIOD;
+ if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_DONE) {
+ timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
+ ktimer->timer_advance_adjust_done = false;
+ } else
+ return;
+ }
+
/* too early */
if (advance_expire_delta < 0) {
ns = -advance_expire_delta * 1000000ULL;
@@ -1503,17 +1515,24 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
timer_advance_ns += ns;
}

- timer_advance_ns = (apic->lapic_timer.timer_advance_ns *
+ timer_advance_ns = (ktimer->timer_advance_ns *
(LAPIC_TIMER_ADVANCE_ADJUST_STEP - 1) + advance_expire_delta) /
LAPIC_TIMER_ADVANCE_ADJUST_STEP;

if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
- apic->lapic_timer.timer_advance_adjust_done = true;
+ ktimer->timer_advance_adjust_done = true;
if (unlikely(timer_advance_ns > 5000)) {
timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
- apic->lapic_timer.timer_advance_adjust_done = false;
+ ktimer->timer_advance_adjust_done = false;
+ }
+
+ ktimer->timer_advance_ns = timer_advance_ns;
+
+ if (ktimer->timer_advance_adjust_done) {
+ if (ktimer->min_timer_advance_ns > timer_advance_ns)
+ ktimer->min_timer_advance_ns = timer_advance_ns;
+ ktimer->timer_advance_ns = ktimer->min_timer_advance_ns;
}
- apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}

static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
@@ -1532,7 +1551,8 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);

- if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
+ if (unlikely(!apic->lapic_timer.timer_advance_adjust_done) ||
+ time_before(apic->lapic_timer.recalc_timer_advance_ns, jiffies))
adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
}

@@ -2310,9 +2330,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
if (timer_advance_ns == -1) {
apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
apic->lapic_timer.timer_advance_adjust_done = false;
+ apic->lapic_timer.recalc_timer_advance_ns = jiffies;
+ apic->lapic_timer.min_timer_advance_ns = UINT_MAX;
} else {
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
apic->lapic_timer.timer_advance_adjust_done = true;
+ apic->lapic_timer.recalc_timer_advance_ns = MAX_JIFFY_OFFSET;
}


diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 50053d2..56a05eb 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,6 +31,8 @@ struct kvm_timer {
u32 timer_mode_mask;
u64 tscdeadline;
u64 expired_tscdeadline;
+ unsigned long recalc_timer_advance_ns;
+ u32 min_timer_advance_ns;
u32 timer_advance_ns;
s64 advance_expire_delta;
atomic_t pending; /* accumulated triggered timers */
--
2.7.4

2019-09-06 10:13:31

by Wanpeng Li

[permalink] [raw]
Subject: [PATCH RESEND 3/5] KVM: LAPIC: Micro optimize IPI latency

From: Wanpeng Li <[email protected]>

This patch optimizes the virtual IPI emulation sequence:

write ICR2 write ICR2
write ICR read ICR2
read ICR ==> send virtual IPI
read ICR2 write ICR
send virtual IPI

It can reduce kvm-unit-tests/vmexit.flat IPI testing latency(from sender
send IPI to sender receive the ACK) from 3319 cycles to 3203 cycles on
SKylake server.

Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
arch/x86/kvm/lapic.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 12ade70..34fd299 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1200,10 +1200,8 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
}
EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);

-static void apic_send_ipi(struct kvm_lapic *apic)
+static void apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
{
- u32 icr_low = kvm_lapic_get_reg(apic, APIC_ICR);
- u32 icr_high = kvm_lapic_get_reg(apic, APIC_ICR2);
struct kvm_lapic_irq irq;

irq.vector = icr_low & APIC_VECTOR_MASK;
@@ -1940,8 +1938,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
}
case APIC_ICR:
/* No delay here, so we always clear the pending bit */
- kvm_lapic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
- apic_send_ipi(apic);
+ val &= ~(1 << 12);
+ apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2));
+ kvm_lapic_set_reg(apic, APIC_ICR, val);
break;

case APIC_ICR2:
--
2.7.4

2019-09-06 11:57:14

by Wanpeng Li

[permalink] [raw]
Subject: [PATCH RESEND 4/5] KVM: VMX: Stop the preemption timer during vCPU reset

From: Wanpeng Li <[email protected]>

The hrtimer which is used to emulate lapic timer is stopped during
vcpu reset, preemption timer should do the same.

Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
arch/x86/kvm/vmx/vmx.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 570a233..f794929 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4162,6 +4162,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)

vcpu->arch.microcode_version = 0x100000000ULL;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
+ vmx->hv_deadline_tsc = -1;
kvm_set_cr8(vcpu, 0);

if (!init_event) {
--
2.7.4

2019-09-06 12:20:55

by Wanpeng Li

[permalink] [raw]
Subject: [PATCH v2 5/5] KVM: hyperv: Fix Direct Synthetic timers assert an interrupt w/o lapic_in_kernel

From: Wanpeng Li <[email protected]>

Reported by syzkaller:

kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
RIP: 0010:__apic_accept_irq+0x46/0x740 arch/x86/kvm/lapic.c:1029
Call Trace:
kvm_apic_set_irq+0xb4/0x140 arch/x86/kvm/lapic.c:558
stimer_notify_direct arch/x86/kvm/hyperv.c:648 [inline]
stimer_expiration arch/x86/kvm/hyperv.c:659 [inline]
kvm_hv_process_stimers+0x594/0x1650 arch/x86/kvm/hyperv.c:686
vcpu_enter_guest+0x2b2a/0x54b0 arch/x86/kvm/x86.c:7896
vcpu_run+0x393/0xd40 arch/x86/kvm/x86.c:8152
kvm_arch_vcpu_ioctl_run+0x636/0x900 arch/x86/kvm/x86.c:8360
kvm_vcpu_ioctl+0x6cf/0xaf0 arch/x86/kvm/../../../virt/kvm/kvm_main.c:2765

The testcase programs HV_X64_MSR_STIMERn_CONFIG/HV_X64_MSR_STIMERn_COUNT,
in addition, there is no lapic in the kernel, the counters value are small
enough in order that kvm_hv_process_stimers() inject this already-expired
timer interrupt into the guest through lapic in the kernel which triggers
the NULL deferencing. This patch fixes it by don't advertise direct mode
synthetic timers and discarding the inject when lapic is not in kernel.

Reported-by: [email protected]
Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Cc: Vitaly Kuznetsov <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
v1 -> v2:
* don't advertise direct mode synthetic timers when lapic is not in kernel

arch/x86/kvm/hyperv.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index c10a8b1..069e655 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -645,7 +645,9 @@ static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer)
.vector = stimer->config.apic_vector
};

- return !kvm_apic_set_irq(vcpu, &irq, NULL);
+ if (lapic_in_kernel(vcpu))
+ return !kvm_apic_set_irq(vcpu, &irq, NULL);
+ return 0;
}

static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
@@ -1849,7 +1851,13 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,

ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
- ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
+
+ /*
+ * Direct Synthetic timers only make sense with in-kernel
+ * LAPIC
+ */
+ if (lapic_in_kernel(vcpu))
+ ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;

break;

--
2.7.4

2019-09-11 16:29:55

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v2 5/5] KVM: hyperv: Fix Direct Synthetic timers assert an interrupt w/o lapic_in_kernel

On 06/09/19 03:30, Wanpeng Li wrote:
> From: Wanpeng Li <[email protected]>
>
> Reported by syzkaller:
>
> kasan: GPF could be caused by NULL-ptr deref or user memory access
> general protection fault: 0000 [#1] PREEMPT SMP KASAN
> RIP: 0010:__apic_accept_irq+0x46/0x740 arch/x86/kvm/lapic.c:1029
> Call Trace:
> kvm_apic_set_irq+0xb4/0x140 arch/x86/kvm/lapic.c:558
> stimer_notify_direct arch/x86/kvm/hyperv.c:648 [inline]
> stimer_expiration arch/x86/kvm/hyperv.c:659 [inline]
> kvm_hv_process_stimers+0x594/0x1650 arch/x86/kvm/hyperv.c:686
> vcpu_enter_guest+0x2b2a/0x54b0 arch/x86/kvm/x86.c:7896
> vcpu_run+0x393/0xd40 arch/x86/kvm/x86.c:8152
> kvm_arch_vcpu_ioctl_run+0x636/0x900 arch/x86/kvm/x86.c:8360
> kvm_vcpu_ioctl+0x6cf/0xaf0 arch/x86/kvm/../../../virt/kvm/kvm_main.c:2765
>
> The testcase programs HV_X64_MSR_STIMERn_CONFIG/HV_X64_MSR_STIMERn_COUNT,
> in addition, there is no lapic in the kernel, the counters value are small
> enough in order that kvm_hv_process_stimers() inject this already-expired
> timer interrupt into the guest through lapic in the kernel which triggers
> the NULL deferencing. This patch fixes it by don't advertise direct mode
> synthetic timers and discarding the inject when lapic is not in kernel.
>
> Reported-by: [email protected]
> Cc: Paolo Bonzini <[email protected]>
> Cc: Radim Krčmář <[email protected]>
> Cc: Vitaly Kuznetsov <[email protected]>
> Signed-off-by: Wanpeng Li <[email protected]>
> ---
> v1 -> v2:
> * don't advertise direct mode synthetic timers when lapic is not in kernel
>
> arch/x86/kvm/hyperv.c | 12 ++++++++++--
> 1 file changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index c10a8b1..069e655 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -645,7 +645,9 @@ static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer)
> .vector = stimer->config.apic_vector
> };
>
> - return !kvm_apic_set_irq(vcpu, &irq, NULL);
> + if (lapic_in_kernel(vcpu))
> + return !kvm_apic_set_irq(vcpu, &irq, NULL);
> + return 0;
> }
>
> static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
> @@ -1849,7 +1851,13 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
>
> ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
> ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
> - ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
> +
> + /*
> + * Direct Synthetic timers only make sense with in-kernel
> + * LAPIC
> + */
> + if (lapic_in_kernel(vcpu))
> + ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
>
> break;
>
>

See replies to the previous version of the individual patches.

Paolo