2019-06-12 09:41:30

by Wanpeng Li

Subject: [PATCH v3 1/2] KVM: LAPIC: Optimize timer latency considering world switch time

From: Wanpeng Li <[email protected]>

The lapic timer advance logic tries to hide the hypervisor overhead
between the host emulated timer firing and the guest observing that it
has fired. However, even after further optimizations,
kvm-unit-tests/tscdeadline_latency still observes ~1000 cycles of
latency, because we do not account for the time between the end of
wait_lapic_expire() and the guest observing the expired timer. There is
code between the end of wait_lapic_expire() and the world switch, and
the world switch itself has overhead. Waiting in wait_lapic_expire()
until guest_tsc merely equals the target deadline is therefore too
late: the guest still sees the latency accumulated between the end of
wait_lapic_expire() and the completion of vmentry. This patch takes
that time into account.
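
In effect, the wait in kvm_wait_lapic_expire() becomes the following (a
simplified sketch of the resulting logic; see the hunk below for the
real code):

	/*
	 * Treat the vCPU as already being vmentry_advance_cycles into
	 * the future, so the busy-wait ends early and the remaining gap
	 * is consumed by the code after the wait plus the world switch.
	 */
	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) +
		    get_vmentry_advance_cycles(vcpu);
	if (guest_tsc < tsc_deadline)
		__delay(tsc_deadline - guest_tsc);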

The vmentry+vmexit round trip measured by kvm-unit-tests/vmexit.flat is
1800 cycles on my 2.5GHz Skylake server. The vmentry_advance_ns module
parameter defaults to 300ns; it can be tuned/reworked in the future.
This patch reduces average cyclictest latency from 3us to 2us on the
Skylake server (guest w/ nohz=off, idle=poll; host w/
preemption_timer=N; in my testing, cyclictest latency is not very
sensitive to this optimization when preemption_timer=Y).
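
As a rough worked example with the numbers above (a 2.5GHz host, i.e.
virtual_tsc_khz = 2500000), the default advance converts to:

	cycles = 300 * 2500000 / 1000000 = 750 TSC cycles

i.e. a bit under half of the measured 1800-cycle round trip.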

Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Cc: Sean Christopherson <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
v2 -> v3:
* read-only module parameter
* get_vmentry_advance_cycles() not inline
v1 -> v2:
* rename get_vmentry_advance_delta to get_vmentry_advance_cycles
* cache vmentry_advance_cycles by setting param bit 0
* add param max limit

arch/x86/kvm/lapic.c | 33 ++++++++++++++++++++++++++++++---
arch/x86/kvm/lapic.h | 3 +++
arch/x86/kvm/vmx/vmx.c | 2 +-
arch/x86/kvm/x86.c | 8 ++++++++
arch/x86/kvm/x86.h | 2 ++
5 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fcf42a3..c6d76f9 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1531,6 +1531,33 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}

+u64 compute_vmentry_advance_cycles(struct kvm_vcpu *vcpu)
+{
+ u64 cycles;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ cycles = vmentry_advance_ns * vcpu->arch.virtual_tsc_khz;
+ do_div(cycles, 1000000);
+
+ apic->lapic_timer.vmentry_advance_cycles = cycles;
+
+ return cycles;
+}
+
+u64 get_vmentry_advance_cycles(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (unlikely(!vmentry_advance_ns))
+ return 0;
+
+ if (likely(apic->lapic_timer.vmentry_advance_cycles))
+ return apic->lapic_timer.vmentry_advance_cycles;
+
+ return compute_vmentry_advance_cycles(vcpu);
+}
+EXPORT_SYMBOL_GPL(get_vmentry_advance_cycles);
+
void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1544,7 +1571,7 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)

tsc_deadline = apic->lapic_timer.expired_tscdeadline;
apic->lapic_timer.expired_tscdeadline = 0;
- guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) + get_vmentry_advance_cycles(vcpu);
apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;

if (guest_tsc < tsc_deadline)
@@ -1572,7 +1599,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
local_irq_save(flags);

now = ktime_get();
- guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) + get_vmentry_advance_cycles(vcpu);

ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
@@ -2329,7 +2356,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
apic->lapic_timer.timer_advance_adjust_done = true;
}
-
+ apic->lapic_timer.vmentry_advance_cycles = 0;

/*
* APIC is created enabled. This will prevent kvm_lapic_set_base from
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f974a3d..fb32e69 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -33,6 +33,7 @@ struct kvm_timer {
u64 expired_tscdeadline;
u32 timer_advance_ns;
s64 advance_expire_delta;
+ u64 vmentry_advance_cycles;
atomic_t pending; /* accumulated triggered timers */
bool hv_timer_in_use;
bool timer_advance_adjust_done;
@@ -221,6 +222,8 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);

void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
+u64 compute_vmentry_advance_cycles(struct kvm_vcpu *vcpu);
+u64 get_vmentry_advance_cycles(struct kvm_vcpu *vcpu);

bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0861c71..0751a44 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7041,7 +7041,7 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,

vmx = to_vmx(vcpu);
tscl = rdtsc();
- guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+ guest_tscl = kvm_read_l1_tsc(vcpu, tscl) + get_vmentry_advance_cycles(vcpu);
delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
ktimer->timer_advance_ns);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 553c292..4b983bb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -145,6 +145,12 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
static int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);

+/*
+ * lapic timer vmentry advance (tscdeadline mode only) in nanoseconds.
+ */
+u32 __read_mostly vmentry_advance_ns = 300;
+module_param(vmentry_advance_ns, uint, S_IRUGO);
+
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);

@@ -1592,6 +1598,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
&vcpu->arch.virtual_tsc_shift,
&vcpu->arch.virtual_tsc_mult);
+ if (vcpu->arch.apic && user_tsc_khz != vcpu->arch.virtual_tsc_khz)
+ compute_vmentry_advance_cycles(vcpu);
vcpu->arch.virtual_tsc_khz = user_tsc_khz;

/*
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a470ff0..2174355 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -294,6 +294,8 @@ extern u64 kvm_supported_xcr0(void);

extern unsigned int min_timer_period_us;

+extern unsigned int vmentry_advance_ns;
+
extern bool enable_vmware_backdoor;

extern struct static_key kvm_no_apic_vcpu;
--
2.7.4


2019-06-12 09:41:48

by Wanpeng Li

Subject: [PATCH v3 2/2] KVM: LAPIC: remove the trailing newline used in the fmt parameter of TP_printk

From: Wanpeng Li <[email protected]>

The trailing newline in the TP_printk format string leads to extra
blank lines in the trace output, as shown below, so remove it.

qemu-system-x86-15695 [002] ...1 15774.839240: kvm_hv_timer_state: vcpu_id 0 hv_timer 1

qemu-system-x86-15695 [002] ...1 15774.839309: kvm_hv_timer_state: vcpu_id 0 hv_timer 1

Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
arch/x86/kvm/trace.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 4d47a26..b5c831e 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1365,7 +1365,7 @@ TRACE_EVENT(kvm_hv_timer_state,
__entry->vcpu_id = vcpu_id;
__entry->hv_timer_in_use = hv_timer_in_use;
),
- TP_printk("vcpu_id %x hv_timer %x\n",
+ TP_printk("vcpu_id %x hv_timer %x",
__entry->vcpu_id,
__entry->hv_timer_in_use)
);
--
2.7.4

2019-06-12 18:05:15

by Sean Christopherson

Subject: Re: [PATCH v3 1/2] KVM: LAPIC: Optimize timer latency considering world switch time

On Wed, Jun 12, 2019 at 05:40:18PM +0800, Wanpeng Li wrote:
> From: Wanpeng Li <[email protected]>
>
> The lapic timer advance logic tries to hide the hypervisor overhead
> between the host emulated timer firing and the guest observing that it
> has fired. However, even after further optimizations,
> kvm-unit-tests/tscdeadline_latency still observes ~1000 cycles of
> latency, because we do not account for the time between the end of
> wait_lapic_expire() and the guest observing the expired timer. There is
> code between the end of wait_lapic_expire() and the world switch, and
> the world switch itself has overhead. Waiting in wait_lapic_expire()
> until guest_tsc merely equals the target deadline is therefore too
> late: the guest still sees the latency accumulated between the end of
> wait_lapic_expire() and the completion of vmentry. This patch takes
> that time into account.
>
> The vmentry+vmexit round trip measured by kvm-unit-tests/vmexit.flat is
> 1800 cycles on my 2.5GHz Skylake server. The vmentry_advance_ns module
> parameter defaults to 300ns; it can be tuned/reworked in the future.
> This patch reduces average cyclictest latency from 3us to 2us on the
> Skylake server (guest w/ nohz=off, idle=poll; host w/
> preemption_timer=N; in my testing, cyclictest latency is not very
> sensitive to this optimization when preemption_timer=Y).
>
> Cc: Paolo Bonzini <[email protected]>
> Cc: Radim Krčmář <[email protected]>
> Cc: Sean Christopherson <[email protected]>
> Signed-off-by: Wanpeng Li <[email protected]>
> ---
> v2 -> v3:
> * read-only module parameter
> * get_vmentry_advance_cycles() not inline
> v1 -> v2:
> * rename get_vmentry_advance_delta to get_vmentry_advance_cycles
> * cache vmentry_advance_cycles by setting param bit 0
> * add param max limit
>
> arch/x86/kvm/lapic.c | 33 ++++++++++++++++++++++++++++++---
> arch/x86/kvm/lapic.h | 3 +++
> arch/x86/kvm/vmx/vmx.c | 2 +-
> arch/x86/kvm/x86.c | 8 ++++++++
> arch/x86/kvm/x86.h | 2 ++
> 5 files changed, 44 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index fcf42a3..c6d76f9 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1531,6 +1531,33 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
> apic->lapic_timer.timer_advance_ns = timer_advance_ns;
> }
>
> +u64 compute_vmentry_advance_cycles(struct kvm_vcpu *vcpu)
> +{
> + u64 cycles;
> + struct kvm_lapic *apic = vcpu->arch.apic;
> +
> + cycles = vmentry_advance_ns * vcpu->arch.virtual_tsc_khz;
> + do_div(cycles, 1000000);
> +
> + apic->lapic_timer.vmentry_advance_cycles = cycles;
> +
> + return cycles;
> +}
> +
> +u64 get_vmentry_advance_cycles(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_lapic *apic = vcpu->arch.apic;
> +
> + if (unlikely(!vmentry_advance_ns))
> + return 0;
> +
> + if (likely(apic->lapic_timer.vmentry_advance_cycles))
> + return apic->lapic_timer.vmentry_advance_cycles;
> +
> + return compute_vmentry_advance_cycles(vcpu);

If vmentry_advance_ns is read-only, then we don't need to be able to
compute lapic_timer.vmentry_advance_cycles on demand, e.g. it can be set
during kvm_create_lapic() and recomputed in kvm_set_tsc_khz().

Alternatively, it could be handled purely in kvm_set_tsc_khz() if the call
to kvm_create_lapic() were moved before kvm_set_tsc_khz().

> +}
> +EXPORT_SYMBOL_GPL(get_vmentry_advance_cycles);
> +
> void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
> {
> struct kvm_lapic *apic = vcpu->arch.apic;
> @@ -1544,7 +1571,7 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
>
> tsc_deadline = apic->lapic_timer.expired_tscdeadline;
> apic->lapic_timer.expired_tscdeadline = 0;
> - guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) + get_vmentry_advance_cycles(vcpu);
> apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
>
> if (guest_tsc < tsc_deadline)
> @@ -1572,7 +1599,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
> local_irq_save(flags);
>
> now = ktime_get();
> - guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) + get_vmentry_advance_cycles(vcpu);
> ns = (tscdeadline - guest_tsc) * 1000000ULL;
> do_div(ns, this_tsc_khz);
> @@ -2329,7 +2356,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
> apic->lapic_timer.timer_advance_ns = timer_advance_ns;
> apic->lapic_timer.timer_advance_adjust_done = true;
> }
> -
> + apic->lapic_timer.vmentry_advance_cycles = 0;
>
> /*
> * APIC is created enabled. This will prevent kvm_lapic_set_base from
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index f974a3d..fb32e69 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -33,6 +33,7 @@ struct kvm_timer {
> u64 expired_tscdeadline;
> u32 timer_advance_ns;
> s64 advance_expire_delta;
> + u64 vmentry_advance_cycles;
> atomic_t pending; /* accumulated triggered timers */
> bool hv_timer_in_use;
> bool timer_advance_adjust_done;
> @@ -221,6 +222,8 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
> bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
>
> void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
> +u64 compute_vmentry_advance_cycles(struct kvm_vcpu *vcpu);
> +u64 get_vmentry_advance_cycles(struct kvm_vcpu *vcpu);
>
> bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
> struct kvm_vcpu **dest_vcpu);
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 0861c71..0751a44 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -7041,7 +7041,7 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
>
> vmx = to_vmx(vcpu);
> tscl = rdtsc();
> - guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
> + guest_tscl = kvm_read_l1_tsc(vcpu, tscl) + get_vmentry_advance_cycles(vcpu);
> delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
> lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
> ktimer->timer_advance_ns);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 553c292..4b983bb 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -145,6 +145,12 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
> static int __read_mostly lapic_timer_advance_ns = -1;
> module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
>
> +/*
> + * lapic timer vmentry advance (tscdeadline mode only) in nanoseconds.
> + */
> +u32 __read_mostly vmentry_advance_ns = 300;

Enabling this by default makes me nervous, e.g. nothing guarantees that
future versions of KVM and/or CPUs will continue to have 300ns of overhead
between wait_lapic_expire() and VM-Enter.

If we want it enabled by default so that it gets tested, the default
value should be extremely conservative, e.g. set the default to a small
percentage (25%?) of the latency of VM-Enter itself on modern CPUs,
VM-Enter latency being the min between VMLAUNCH and VMLOAD+VMRUN+VMSAVE.
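
For example, with purely hypothetical numbers: if a bare VM-Enter costs
~400 cycles on a 2.5GHz part, 25% of that is ~100 cycles, i.e.:

	0.25 * 400 cycles = 100 cycles
	100 cycles / 2.5 GHz = 40 ns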

> +module_param(vmentry_advance_ns, uint, S_IRUGO);
> +
> static bool __read_mostly vector_hashing = true;
> module_param(vector_hashing, bool, S_IRUGO);
>
> @@ -1592,6 +1598,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
> kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
> &vcpu->arch.virtual_tsc_shift,
> &vcpu->arch.virtual_tsc_mult);
> + if (vcpu->arch.apic && user_tsc_khz != vcpu->arch.virtual_tsc_khz)
> + compute_vmentry_advance_cycles(vcpu);
> vcpu->arch.virtual_tsc_khz = user_tsc_khz;
>
> /*
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index a470ff0..2174355 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -294,6 +294,8 @@ extern u64 kvm_supported_xcr0(void);
>
> extern unsigned int min_timer_period_us;
>
> +extern unsigned int vmentry_advance_ns;
> +
> extern bool enable_vmware_backdoor;
>
> extern struct static_key kvm_no_apic_vcpu;
> --
> 2.7.4
>

2019-06-12 19:23:07

by Radim Krčmář

Subject: Re: [PATCH v3 1/2] KVM: LAPIC: Optimize timer latency considering world switch time

2019-06-12 08:14-0700, Sean Christopherson:
> On Wed, Jun 12, 2019 at 05:40:18PM +0800, Wanpeng Li wrote:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > @@ -145,6 +145,12 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
> > static int __read_mostly lapic_timer_advance_ns = -1;
> > module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
> >
> > +/*
> > + * lapic timer vmentry advance (tscdeadline mode only) in nanoseconds.
> > + */
> > +u32 __read_mostly vmentry_advance_ns = 300;
>
> Enabling this by default makes me nervous, e.g. nothing guarantees that
> future versions of KVM and/or CPUs will continue to have 300ns of overhead
> between wait_lapic_expire() and VM-Enter.
>
> If we want it enabled by default so that it gets tested, the default
> value should be extremely conservative, e.g. set the default to a small
> percentage (25%?) of the latency of VM-Enter itself on modern CPUs,
> VM-Enter latency being the min between VMLAUNCH and VMLOAD+VMRUN+VMSAVE.

I share the sentiment. We definitely must not enter the guest before
the deadline has expired, and with CPUs approaching 5 GHz (in turbo),
300 ns would be too much even today.

I wrote a simple testcase for rough timing and there are 267 cycles
(111 ns @ 2.4 GHz) between doing rdtsc() right after
kvm_wait_lapic_expire() [1] and doing rdtsc() in the guest as soon as
possible (see the attached kvm-unit-test).

That is on a Haswell, where vmexit.flat reports 2120 cycles for a
vmcall. Scaling that linearly (likely not a valid method in this case)
translates to 230 cycles on a machine with 1800 cycles for a vmcall,
which is less than 50 ns @ 5 GHz.
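
Working that scaling out explicitly:

	267 cycles * (1800 / 2120) ~= 227 cycles
	227 cycles / 5.0 GHz ~= 45 ns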

I wouldn't go above 25 ns for a hard-coded default.

(We could also do a similar measurement when initializing KVM and have a
dynamic default, but I'm thinking it's going to be way too much code
for the benefit.)

---
1: This is how the TSC is read in KVM.

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index da24f1858acc..a7251ac0109b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6449,6 +6449,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.apic->lapic_timer.timer_advance_ns)
kvm_wait_lapic_expire(vcpu);

+ vcpu->last_seen_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
/*
* If this vCPU has touched SPEC_CTRL, restore the guest's value if
* it's non-zero. Since vmentry is serialising on affected CPUs, there
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6200d5a51f13..5e0ce8ca31e7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7201,6 +7201,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_SEND_IPI:
ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
break;
+ case KVM_HC_LAST_SEEN_TSC:
+ ret = vcpu->last_seen_tsc;
+ break;
default:
ret = -KVM_ENOSYS;
break;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index abafddb9fe2c..7f70fe7a28b1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -323,6 +323,8 @@ struct kvm_vcpu {
bool preempted;
struct kvm_vcpu_arch arch;
struct dentry *debugfs_dentry;
+
+ u64 last_seen_tsc;
};

static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index 6c0ce49931e5..dfbc6e9ad7a1 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -28,6 +28,7 @@
#define KVM_HC_MIPS_CONSOLE_OUTPUT 8
#define KVM_HC_CLOCK_PAIRING 9
#define KVM_HC_SEND_IPI 10
+#define KVM_HC_LAST_SEEN_TSC 11

/*
* hypercalls use architecture specific

2019-06-12 19:29:31

by Radim Krčmář

Subject: Re: [PATCH v3 1/2] KVM: LAPIC: Optimize timer latency considering world switch time

2019-06-12 21:22+0200, Radim Krčmář:
> 2019-06-12 08:14-0700, Sean Christopherson:
> > On Wed, Jun 12, 2019 at 05:40:18PM +0800, Wanpeng Li wrote:
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > @@ -145,6 +145,12 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
> > > static int __read_mostly lapic_timer_advance_ns = -1;
> > > module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
> > >
> > > +/*
> > > + * lapic timer vmentry advance (tscdeadline mode only) in nanoseconds.
> > > + */
> > > +u32 __read_mostly vmentry_advance_ns = 300;
> >
> > Enabling this by default makes me nervous, e.g. nothing guarantees that
> > future versions of KVM and/or CPUs will continue to have 300ns of overhead
> > between wait_lapic_expire() and VM-Enter.
> >
> > If we want it enabled by default so that it gets tested, the default
> > value should be extremely conservative, e.g. set the default to a small
> > percentage (25%?) of the latency of VM-Enter itself on modern CPUs,
> > VM-Enter latency being the min between VMLAUNCH and VMLOAD+VMRUN+VMSAVE.
>
> I share the sentiment. We definitely must not enter the guest before
> the deadline has expired, and with CPUs approaching 5 GHz (in turbo),
> 300 ns would be too much even today.
>
> I wrote a simple testcase for rough timing and there are 267 cycles
> (111 ns @ 2.4 GHz) between doing rdtsc() right after
> kvm_wait_lapic_expire() [1] and doing rdtsc() in the guest as soon as
> possible (see the attached kvm-unit-test).

I forgot to attach it, pasting here as a patch for kvm-unit-tests.

---
diff --git a/x86/Makefile.common b/x86/Makefile.common
index e612dbe..ceed648 100644
--- a/x86/Makefile.common
+++ b/x86/Makefile.common
@@ -58,7 +58,7 @@ tests-common = $(TEST_DIR)/vmexit.flat $(TEST_DIR)/tsc.flat \
$(TEST_DIR)/init.flat $(TEST_DIR)/smap.flat \
$(TEST_DIR)/hyperv_synic.flat $(TEST_DIR)/hyperv_stimer.flat \
$(TEST_DIR)/hyperv_connections.flat \
- $(TEST_DIR)/umip.flat
+ $(TEST_DIR)/umip.flat $(TEST_DIR)/vmentry_latency.flat

ifdef API
tests-api = api/api-sample api/dirty-log api/dirty-log-perf
diff --git a/x86/vmentry_latency.c b/x86/vmentry_latency.c
new file mode 100644
index 0000000..3859f09
--- /dev/null
+++ b/x86/vmentry_latency.c
@@ -0,0 +1,45 @@
+#include "x86/vm.h"
+
+static u64 get_last_hypervisor_tsc_delta(void)
+{
+ u64 a = 0, b, c, d;
+ u64 tsc;
+
+ /*
+ * The first vmcall is there to force a vm exit just before measuring.
+ */
+ asm volatile ("vmcall" : "+a"(a), "=b"(b), "=c"(c), "=d"(d));
+
+ tsc = rdtsc();
+
+ /*
+ * The second hypercall (nr 11 = KVM_HC_LAST_SEEN_TSC) recovers the
+ * guest TSC stored by the host just before this vm entry.
+ */
+ a = 11;
+ asm volatile ("vmcall" : "+a"(a), "=b"(b), "=c"(c), "=d"(d));
+
+ return tsc - a;
+}
+
+static void vmentry_latency(void)
+{
+ unsigned i = 1000000;
+ u64 min = -1;
+
+ while (i--) {
+ u64 latency = get_last_hypervisor_tsc_delta();
+ if (latency < min)
+ min = latency;
+ }
+
+ printf("vm entry latency is %"PRIu64" TSC cycles\n", min);
+}
+
+int main(void)
+{
+ setup_vm();
+ vmentry_latency();
+
+ return 0;
+}
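
With the host-side KVM_HC_LAST_SEEN_TSC hack from the previous mail
applied, the test should be runnable directly, e.g. (invocation assumed
from the usual kvm-unit-tests layout):

	./x86/run x86/vmentry_latency.flat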

2019-06-13 16:56:15

by Wanpeng Li

Subject: Re: [PATCH v3 1/2] KVM: LAPIC: Optimize timer latency considering world switch time

On Thu, 13 Jun 2019 at 03:27, Radim Krčmář <[email protected]> wrote:
>
> 2019-06-12 21:22+0200, Radim Krčmář:
> > 2019-06-12 08:14-0700, Sean Christopherson:
> > > On Wed, Jun 12, 2019 at 05:40:18PM +0800, Wanpeng Li wrote:
> > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > @@ -145,6 +145,12 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
> > > > static int __read_mostly lapic_timer_advance_ns = -1;
> > > > module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
> > > >
> > > > +/*
> > > > + * lapic timer vmentry advance (tscdeadline mode only) in nanoseconds.
> > > > + */
> > > > +u32 __read_mostly vmentry_advance_ns = 300;
> > >
> > > Enabling this by default makes me nervous, e.g. nothing guarantees that
> > > future versions of KVM and/or CPUs will continue to have 300ns of overhead
> > > between wait_lapic_expire() and VM-Enter.
> > >
> > > If we want it enabled by default so that it gets tested, the default
> > > value should be extremely conservative, e.g. set the default to a small
> > > percentage (25%?) of the latency of VM-Enter itself on modern CPUs,
> > > VM-Enter latency being the min between VMLAUNCH and VMLOAD+VMRUN+VMSAVE.
> >
> > I share the sentiment. We definitely must not enter the guest before
> > the deadline has expired, and with CPUs approaching 5 GHz (in turbo),
> > 300 ns would be too much even today.
> >
> > I wrote a simple testcase for rough timing and there are 267 cycles
> > (111 ns @ 2.4 GHz) between doing rdtsc() right after
> > kvm_wait_lapic_expire() [1] and doing rdtsc() in the guest as soon as
> > possible (see the attached kvm-unit-test).
>
> I forgot to attach it, pasting here as a patch for kvm-unit-tests.

Thanks for this, Radim. :)

Regards,
Wanpeng Li

>
> ---
> diff --git a/x86/Makefile.common b/x86/Makefile.common
> index e612dbe..ceed648 100644
> --- a/x86/Makefile.common
> +++ b/x86/Makefile.common
> @@ -58,7 +58,7 @@ tests-common = $(TEST_DIR)/vmexit.flat $(TEST_DIR)/tsc.flat \
> $(TEST_DIR)/init.flat $(TEST_DIR)/smap.flat \
> $(TEST_DIR)/hyperv_synic.flat $(TEST_DIR)/hyperv_stimer.flat \
> $(TEST_DIR)/hyperv_connections.flat \
> - $(TEST_DIR)/umip.flat
> + $(TEST_DIR)/umip.flat $(TEST_DIR)/vmentry_latency.flat
>
> ifdef API
> tests-api = api/api-sample api/dirty-log api/dirty-log-perf
> diff --git a/x86/vmentry_latency.c b/x86/vmentry_latency.c
> new file mode 100644
> index 0000000..3859f09
> --- /dev/null
> +++ b/x86/vmentry_latency.c
> @@ -0,0 +1,45 @@
> +#include "x86/vm.h"
> +
> +static u64 get_last_hypervisor_tsc_delta(void)
> +{
> + u64 a = 0, b, c, d;
> + u64 tsc;
> +
> + /*
> + * The first vmcall is there to force a vm exit just before measuring.
> + */
> + asm volatile ("vmcall" : "+a"(a), "=b"(b), "=c"(c), "=d"(d));
> +
> + tsc = rdtsc();
> +
> + /*
> + * The second hypercall (nr 11 = KVM_HC_LAST_SEEN_TSC) recovers the
> + * guest TSC stored by the host just before this vm entry.
> + */
> + a = 11;
> + asm volatile ("vmcall" : "+a"(a), "=b"(b), "=c"(c), "=d"(d));
> +
> + return tsc - a;
> +}
> +
> +static void vmentry_latency(void)
> +{
> + unsigned i = 1000000;
> + u64 min = -1;
> +
> + while (i--) {
> + u64 latency = get_last_hypervisor_tsc_delta();
> + if (latency < min)
> + min = latency;
> + }
> +
> + printf("vm entry latency is %"PRIu64" TSC cycles\n", min);
> +}
> +
> +int main(void)
> +{
> + setup_vm();
> + vmentry_latency();
> +
> + return 0;
> +}

2019-06-13 16:56:24

by Wanpeng Li

Subject: Re: [PATCH v3 1/2] KVM: LAPIC: Optimize timer latency considering world switch time

On Wed, 12 Jun 2019 at 23:14, Sean Christopherson
<[email protected]> wrote:
>
> On Wed, Jun 12, 2019 at 05:40:18PM +0800, Wanpeng Li wrote:
> > From: Wanpeng Li <[email protected]>
> >
> > The lapic timer advance logic tries to hide the hypervisor overhead
> > between the host emulated timer firing and the guest observing that it
> > has fired. However, even after further optimizations,
> > kvm-unit-tests/tscdeadline_latency still observes ~1000 cycles of
> > latency, because we do not account for the time between the end of
> > wait_lapic_expire() and the guest observing the expired timer. There is
> > code between the end of wait_lapic_expire() and the world switch, and
> > the world switch itself has overhead. Waiting in wait_lapic_expire()
> > until guest_tsc merely equals the target deadline is therefore too
> > late: the guest still sees the latency accumulated between the end of
> > wait_lapic_expire() and the completion of vmentry. This patch takes
> > that time into account.
> >
> > The vmentry+vmexit round trip measured by kvm-unit-tests/vmexit.flat is
> > 1800 cycles on my 2.5GHz Skylake server. The vmentry_advance_ns module
> > parameter defaults to 300ns; it can be tuned/reworked in the future.
> > This patch reduces average cyclictest latency from 3us to 2us on the
> > Skylake server (guest w/ nohz=off, idle=poll; host w/
> > preemption_timer=N; in my testing, cyclictest latency is not very
> > sensitive to this optimization when preemption_timer=Y).
> >
> > Cc: Paolo Bonzini <[email protected]>
> > Cc: Radim Krčmář <[email protected]>
> > Cc: Sean Christopherson <[email protected]>
> > Signed-off-by: Wanpeng Li <[email protected]>
> > ---
> > v2 -> v3:
> > * read-only module parameter
> > * get_vmentry_advance_cycles() not inline
> > v1 -> v2:
> > * rename get_vmentry_advance_delta to get_vmentry_advance_cycles
> > * cache vmentry_advance_cycles by setting param bit 0
> > * add param max limit
> >
> > arch/x86/kvm/lapic.c | 33 ++++++++++++++++++++++++++++++---
> > arch/x86/kvm/lapic.h | 3 +++
> > arch/x86/kvm/vmx/vmx.c | 2 +-
> > arch/x86/kvm/x86.c | 8 ++++++++
> > arch/x86/kvm/x86.h | 2 ++
> > 5 files changed, 44 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index fcf42a3..c6d76f9 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -1531,6 +1531,33 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
> > apic->lapic_timer.timer_advance_ns = timer_advance_ns;
> > }
> >
> > +u64 compute_vmentry_advance_cycles(struct kvm_vcpu *vcpu)
> > +{
> > + u64 cycles;
> > + struct kvm_lapic *apic = vcpu->arch.apic;
> > +
> > + cycles = vmentry_advance_ns * vcpu->arch.virtual_tsc_khz;
> > + do_div(cycles, 1000000);
> > +
> > + apic->lapic_timer.vmentry_advance_cycles = cycles;
> > +
> > + return cycles;
> > +}
> > +
> > +u64 get_vmentry_advance_cycles(struct kvm_vcpu *vcpu)
> > +{
> > + struct kvm_lapic *apic = vcpu->arch.apic;
> > +
> > + if (unlikely(!vmentry_advance_ns))
> > + return 0;
> > +
> > + if (likely(apic->lapic_timer.vmentry_advance_cycles))
> > + return apic->lapic_timer.vmentry_advance_cycles;
> > +
> > + return compute_vmentry_advance_cycles(vcpu);
>
> If vmentry_advance_ns is read-only, then we don't need to be able to
> compute lapic_timer.vmentry_advance_cycles on demand, e.g. it can be set
> during kvm_create_lapic() and recomputed in kvm_set_tsc_khz().
>
> Alternatively, it could be handled purely in kvm_set_tsc_khz() if the call
> to kvm_create_lapic() were moved before kvm_set_tsc_khz().

Hard-coding 25 ns, how about something like this:

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fcf42a3..ed62d6b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1531,6 +1531,19 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}

+u64 compute_vmentry_advance_cycles(struct kvm_vcpu *vcpu)
+{
+ u64 cycles;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ cycles = vmentry_advance_ns * vcpu->arch.virtual_tsc_khz;
+ do_div(cycles, 1000000);
+
+ apic->lapic_timer.vmentry_advance_cycles = cycles;
+
+ return cycles;
+}
+
void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1544,7 +1557,8 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)

tsc_deadline = apic->lapic_timer.expired_tscdeadline;
apic->lapic_timer.expired_tscdeadline = 0;
- guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) +
+ apic->lapic_timer.vmentry_advance_cycles;
apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;

if (guest_tsc < tsc_deadline)
@@ -1572,7 +1586,8 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
local_irq_save(flags);

now = ktime_get();
- guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) +
+ apic->lapic_timer.vmentry_advance_cycles;

ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
@@ -2329,7 +2344,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
apic->lapic_timer.timer_advance_adjust_done = true;
}
-
+ apic->lapic_timer.vmentry_advance_cycles = 0;

/*
* APIC is created enabled. This will prevent kvm_lapic_set_base from
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f974a3d..751cb26 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -33,6 +33,7 @@ struct kvm_timer {
u64 expired_tscdeadline;
u32 timer_advance_ns;
s64 advance_expire_delta;
+ u64 vmentry_advance_cycles;
atomic_t pending; /* accumulated triggered timers */
bool hv_timer_in_use;
bool timer_advance_adjust_done;
@@ -221,6 +222,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);

void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
+u64 compute_vmentry_advance_cycles(struct kvm_vcpu *vcpu);

bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0861c71..b572359 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7041,7 +7041,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,

vmx = to_vmx(vcpu);
tscl = rdtsc();
- guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+ guest_tscl = kvm_read_l1_tsc(vcpu, tscl) +
+ vcpu->arch.apic->lapic_timer.vmentry_advance_cycles;
delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
ktimer->timer_advance_ns);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 553c292..f6e1366 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -145,6 +145,12 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
static int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);

+/*
+ * lapic timer vmentry advance (tscdeadline mode only) in nanoseconds.
+ */
+u32 __read_mostly vmentry_advance_ns = 25;
+module_param(vmentry_advance_ns, uint, S_IRUGO);
+
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);

@@ -1592,6 +1598,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
&vcpu->arch.virtual_tsc_shift,
&vcpu->arch.virtual_tsc_mult);
+ if (user_tsc_khz != vcpu->arch.virtual_tsc_khz)
+ compute_vmentry_advance_cycles(vcpu);
vcpu->arch.virtual_tsc_khz = user_tsc_khz;

/*
@@ -9127,8 +9135,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.pio_data = page_address(page);

- kvm_set_tsc_khz(vcpu, max_tsc_khz);
-
r = kvm_mmu_create(vcpu);
if (r < 0)
goto fail_free_pio_data;
@@ -9141,6 +9147,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
} else
static_key_slow_inc(&kvm_no_apic_vcpu);

+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
+
vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
GFP_KERNEL_ACCOUNT);
if (!vcpu->arch.mce_banks) {
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a470ff0..2174355 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -294,6 +294,8 @@ extern u64 kvm_supported_xcr0(void);

extern unsigned int min_timer_period_us;

+extern unsigned int vmentry_advance_ns;
+
extern bool enable_vmware_backdoor;

extern struct static_key kvm_no_apic_vcpu;