2020-02-25 09:53:14

by Wanpeng Li

[permalink] [raw]
Subject: [PATCH v2] KVM: LAPIC: Recalculate apic map in batch

From: Wanpeng Li <[email protected]>

In the vCPU reset and set APIC_BASE MSR path, the apic map will be recalculated
several times; each recalculation consumes 10+ us, as observed by ftrace in my
non-overcommit environment, due to the expensive memory allocation/mutex/RCU
etc. operations. This patch optimizes it by recalculating the apic map in batch.
I hope this can benefit the serverless scenario, which frequently creates and
destroys VMs.

Signed-off-by: Wanpeng Li <[email protected]>
---
v1 -> v2:
* add apic_map_dirty to kvm_lapic
* on the error condition in kvm_apic_set_state, do the recalculation unconditionally

arch/x86/kvm/lapic.c | 29 +++++++++++++++++++----------
arch/x86/kvm/lapic.h | 2 ++
arch/x86/kvm/x86.c | 2 ++
3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afcd30d..3476dbc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -164,7 +164,7 @@ static void kvm_apic_map_free(struct rcu_head *rcu)
kvfree(map);
}

-static void recalculate_apic_map(struct kvm *kvm)
+void kvm_recalculate_apic_map(struct kvm *kvm)
{
struct kvm_apic_map *new, *old = NULL;
struct kvm_vcpu *vcpu;
@@ -197,6 +197,7 @@ static void recalculate_apic_map(struct kvm *kvm)
if (!kvm_apic_present(vcpu))
continue;

+ apic->apic_map_dirty = false;
xapic_id = kvm_xapic_id(apic);
x2apic_id = kvm_x2apic_id(apic);

@@ -257,20 +258,20 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
else
static_key_slow_inc(&apic_sw_disabled.key);

- recalculate_apic_map(apic->vcpu->kvm);
+ apic->apic_map_dirty = true;
}
}

static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
{
kvm_lapic_set_reg(apic, APIC_ID, id << 24);
- recalculate_apic_map(apic->vcpu->kvm);
+ apic->apic_map_dirty = true;
}

static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
{
kvm_lapic_set_reg(apic, APIC_LDR, id);
- recalculate_apic_map(apic->vcpu->kvm);
+ apic->apic_map_dirty = true;
}

static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
@@ -286,7 +287,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)

kvm_lapic_set_reg(apic, APIC_ID, id);
kvm_lapic_set_reg(apic, APIC_LDR, ldr);
- recalculate_apic_map(apic->vcpu->kvm);
+ apic->apic_map_dirty = true;
}

static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
@@ -1912,7 +1913,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_DFR:
if (!apic_x2apic_mode(apic)) {
kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
- recalculate_apic_map(apic->vcpu->kvm);
+ apic->apic_map_dirty = true;
} else
ret = 1;
break;
@@ -2018,6 +2019,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
}

+ if (apic->apic_map_dirty)
+ kvm_recalculate_apic_map(apic->vcpu->kvm);
+
return ret;
}
EXPORT_SYMBOL_GPL(kvm_lapic_reg_write);
@@ -2166,7 +2170,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
static_key_slow_dec_deferred(&apic_hw_disabled);
} else {
static_key_slow_inc(&apic_hw_disabled.key);
- recalculate_apic_map(vcpu->kvm);
+ apic->apic_map_dirty = true;
}
}

@@ -2207,6 +2211,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
if (!apic)
return;

+ apic->apic_map_dirty = false;
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);

@@ -2258,6 +2263,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)

vcpu->arch.apic_arb_prio = 0;
vcpu->arch.apic_attention = 0;
+
+ if (vcpu->arch.apic->apic_map_dirty)
+ kvm_recalculate_apic_map(vcpu->kvm);
}

/*
@@ -2479,17 +2487,18 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
struct kvm_lapic *apic = vcpu->arch.apic;
int r;

-
kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
/* set SPIV separately to get count of SW disabled APICs right */
apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));

r = kvm_apic_state_fixup(vcpu, s, true);
- if (r)
+ if (r) {
+ kvm_recalculate_apic_map(vcpu->kvm);
return r;
+ }
memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));

- recalculate_apic_map(vcpu->kvm);
+ kvm_recalculate_apic_map(vcpu->kvm);
kvm_apic_set_version(vcpu);

apic_update_ppr(apic);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ec6fbfe..ba1156c 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -47,6 +47,7 @@ struct kvm_lapic {
bool sw_enabled;
bool irr_pending;
bool lvt0_in_nmi_mode;
+ bool apic_map_dirty;
/* Number of bits set in ISR. */
s16 isr_count;
/* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
@@ -78,6 +79,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_recalculate_apic_map(struct kvm *kvm);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val);
int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 79bc995..2200f99 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -350,6 +350,8 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}

kvm_lapic_set_base(vcpu, msr_info->data);
+ if (vcpu->arch.apic->apic_map_dirty)
+ kvm_recalculate_apic_map(vcpu->kvm);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
--
2.7.4


2020-02-25 14:21:31

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v2] KVM: LAPIC: Recalculate apic map in batch

On 25/02/20 10:47, Wanpeng Li wrote:
> From: Wanpeng Li <[email protected]>
>
> In the vCPU reset and set APIC_BASE MSR path, the apic map will be recalculated
> several times, each time it will consume 10+ us observed by ftrace in my
> non-overcommit environment since the expensive memory allocate/mutex/rcu etc
> operations. This patch optimizes it by recaluating apic map in batch, I hope
> this can benefit the serverless scenario which can frequently create/destroy
> VMs.
>
> Signed-off-by: Wanpeng Li <[email protected]>
> ---
> v1 -> v2:
> * add apic_map_dirty to kvm_lapic
> * error condition in kvm_apic_set_state, do recalcuate unconditionally
>
> arch/x86/kvm/lapic.c | 29 +++++++++++++++++++----------
> arch/x86/kvm/lapic.h | 2 ++
> arch/x86/kvm/x86.c | 2 ++
> 3 files changed, 23 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index afcd30d..3476dbc 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -164,7 +164,7 @@ static void kvm_apic_map_free(struct rcu_head *rcu)
> kvfree(map);
> }
>
> -static void recalculate_apic_map(struct kvm *kvm)
> +void kvm_recalculate_apic_map(struct kvm *kvm)
> {

It's better to add an "if" here rather than in every caller. It should
be like:

if (!apic->apic_map_dirty) {
/*
* Read apic->apic_map_dirty before
* kvm->arch.apic_map.
*/
smp_rmb();
return;
}

mutex_lock(&kvm->arch.apic_map_lock);
if (!apic->apic_map_dirty) {
/* Someone else has updated the map. */
mutex_unlock(&kvm->arch.apic_map_lock);
return;
}
...
out:
old = rcu_dereference_protected(kvm->arch.apic_map,
lockdep_is_held(&kvm->arch.apic_map_lock));
rcu_assign_pointer(kvm->arch.apic_map, new);
/*
* Write kvm->arch.apic_map before
* clearing apic->apic_map_dirty.
*/
smp_wmb();
apic->apic_map_dirty = false;
mutex_unlock(&kvm->arch.apic_map_lock);
...

But actually it seems to me that, given we're going through all this
pain, it's better to put the "dirty" flag in kvm->arch, next to the
mutex and the map itself. This should also reduce the number of calls
to kvm_recalculate_apic_map that recompute the map. A lot of them will
just wait on the mutex and exit.

Paolo

2020-02-25 14:27:10

by Wanpeng Li

[permalink] [raw]
Subject: Re: [PATCH v2] KVM: LAPIC: Recalculate apic map in batch

On Tue, 25 Feb 2020 at 22:20, Paolo Bonzini <[email protected]> wrote:
>
> On 25/02/20 10:47, Wanpeng Li wrote:
> > From: Wanpeng Li <[email protected]>
> >
> > In the vCPU reset and set APIC_BASE MSR path, the apic map will be recalculated
> > several times, each time it will consume 10+ us observed by ftrace in my
> > non-overcommit environment since the expensive memory allocate/mutex/rcu etc
> > operations. This patch optimizes it by recaluating apic map in batch, I hope
> > this can benefit the serverless scenario which can frequently create/destroy
> > VMs.
> >
> > Signed-off-by: Wanpeng Li <[email protected]>
> > ---
> > v1 -> v2:
> > * add apic_map_dirty to kvm_lapic
> > * error condition in kvm_apic_set_state, do recalcuate unconditionally
> >
> > arch/x86/kvm/lapic.c | 29 +++++++++++++++++++----------
> > arch/x86/kvm/lapic.h | 2 ++
> > arch/x86/kvm/x86.c | 2 ++
> > 3 files changed, 23 insertions(+), 10 deletions(-)
> >
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index afcd30d..3476dbc 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -164,7 +164,7 @@ static void kvm_apic_map_free(struct rcu_head *rcu)
> > kvfree(map);
> > }
> >
> > -static void recalculate_apic_map(struct kvm *kvm)
> > +void kvm_recalculate_apic_map(struct kvm *kvm)
> > {
>
> It's better to add an "if" here rather than in every caller. It should
> be like:
>
> if (!apic->apic_map_dirty) {
> /*
> * Read apic->apic_map_dirty before
> * kvm->arch.apic_map.
> */
> smp_rmb();
> return;
> }
>
> mutex_lock(&kvm->arch.apic_map_lock);
> if (!apic->apic_map_dirty) {
> /* Someone else has updated the map. */
> mutex_unlock(&kvm->arch.apic_map_lock);
> return;
> }
> ...
> out:
> old = rcu_dereference_protected(kvm->arch.apic_map,
> lockdep_is_held(&kvm->arch.apic_map_lock));
> rcu_assign_pointer(kvm->arch.apic_map, new);
> /*
> * Write kvm->arch.apic_map before
> * clearing apic->apic_map_dirty.
> */
> smp_wmb();
> apic->apic_map_dirty = false;
> mutex_unlock(&kvm->arch.apic_map_lock);
> ...
>
> But actually it seems to me that, given we're going through all this
> pain, it's better to put the "dirty" flag in kvm->arch, next to the
> mutex and the map itself. This should also reduce the number of calls
> to kvm_recalculate_apic_map that recompute the map. A lot of them will
> just wait on the mutex and exit.

Good point, will do in next version.

Wanpeng