2020-02-26 02:44:01

by Wanpeng Li

Subject: [PATCH v3] KVM: LAPIC: Recalculate apic map in batch

From: Wanpeng Li <[email protected]>

In the vCPU reset and set APIC_BASE MSR paths, the apic map is recalculated
several times; each recalculation takes 10+ us, as observed by ftrace in my
non-overcommit environment, because of the expensive memory allocation, mutex
and RCU operations involved. This patch optimizes it by recalculating the apic
map in batch. I hope this can benefit serverless scenarios, which frequently
create/destroy VMs.

Before patch:

kvm_lapic_reset ~27us

After patch:

kvm_lapic_reset ~14us

Observed by ftrace, this is an improvement of ~48%.

Signed-off-by: Wanpeng Li <[email protected]>
---
v2 -> v3:
* move apic_map_dirty to kvm_arch
* add the suggestions from Paolo

v1 -> v2:
* add apic_map_dirty to kvm_lapic
* error condition in kvm_apic_set_state: recalculate unconditionally

arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/lapic.c | 46 ++++++++++++++++++++++++++++++++---------
arch/x86/kvm/lapic.h | 1 +
arch/x86/kvm/x86.c | 1 +
4 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 40a0c0f..4380ed1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -920,6 +920,7 @@ struct kvm_arch {
atomic_t vapics_in_nmi_mode;
struct mutex apic_map_lock;
struct kvm_apic_map *apic_map;
+ bool apic_map_dirty;

bool apic_access_page_done;
unsigned long apicv_inhibit_reasons;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afcd30d..de832aa 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -164,14 +164,28 @@ static void kvm_apic_map_free(struct rcu_head *rcu)
kvfree(map);
}

-static void recalculate_apic_map(struct kvm *kvm)
+void kvm_recalculate_apic_map(struct kvm *kvm)
{
struct kvm_apic_map *new, *old = NULL;
struct kvm_vcpu *vcpu;
int i;
u32 max_id = 255; /* enough space for any xAPIC ID */

+ if (!kvm->arch.apic_map_dirty) {
+ /*
+ * Read kvm->arch.apic_map_dirty before
+ * kvm->arch.apic_map
+ */
+ smp_rmb();
+ return;
+ }
+
mutex_lock(&kvm->arch.apic_map_lock);
+ if (!kvm->arch.apic_map_dirty) {
+ /* Someone else has updated the map. */
+ mutex_unlock(&kvm->arch.apic_map_lock);
+ return;
+ }

kvm_for_each_vcpu(i, vcpu, kvm)
if (kvm_apic_present(vcpu))
@@ -236,6 +250,12 @@ static void recalculate_apic_map(struct kvm *kvm)
old = rcu_dereference_protected(kvm->arch.apic_map,
lockdep_is_held(&kvm->arch.apic_map_lock));
rcu_assign_pointer(kvm->arch.apic_map, new);
+ /*
+ * Write kvm->arch.apic_map before
+ * clearing kvm->arch.apic_map_dirty
+ */
+ smp_wmb();
+ kvm->arch.apic_map_dirty = false;
mutex_unlock(&kvm->arch.apic_map_lock);

if (old)
@@ -257,20 +277,20 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
else
static_key_slow_inc(&apic_sw_disabled.key);

- recalculate_apic_map(apic->vcpu->kvm);
+ apic->vcpu->kvm->arch.apic_map_dirty = true;
}
}

static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
{
kvm_lapic_set_reg(apic, APIC_ID, id << 24);
- recalculate_apic_map(apic->vcpu->kvm);
+ apic->vcpu->kvm->arch.apic_map_dirty = true;
}

static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
{
kvm_lapic_set_reg(apic, APIC_LDR, id);
- recalculate_apic_map(apic->vcpu->kvm);
+ apic->vcpu->kvm->arch.apic_map_dirty = true;
}

static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
@@ -286,7 +306,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)

kvm_lapic_set_reg(apic, APIC_ID, id);
kvm_lapic_set_reg(apic, APIC_LDR, ldr);
- recalculate_apic_map(apic->vcpu->kvm);
+ apic->vcpu->kvm->arch.apic_map_dirty = true;
}

static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
@@ -1912,7 +1932,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_DFR:
if (!apic_x2apic_mode(apic)) {
kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
- recalculate_apic_map(apic->vcpu->kvm);
+ apic->vcpu->kvm->arch.apic_map_dirty = true;
} else
ret = 1;
break;
@@ -2018,6 +2038,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
}

+ kvm_recalculate_apic_map(apic->vcpu->kvm);
+
return ret;
}
EXPORT_SYMBOL_GPL(kvm_lapic_reg_write);
@@ -2166,7 +2188,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
static_key_slow_dec_deferred(&apic_hw_disabled);
} else {
static_key_slow_inc(&apic_hw_disabled.key);
- recalculate_apic_map(vcpu->kvm);
+ vcpu->kvm->arch.apic_map_dirty = true;
}
}

@@ -2207,6 +2229,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
if (!apic)
return;

+ vcpu->kvm->arch.apic_map_dirty = false;
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);

@@ -2258,6 +2281,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)

vcpu->arch.apic_arb_prio = 0;
vcpu->arch.apic_attention = 0;
+
+ kvm_recalculate_apic_map(vcpu->kvm);
}

/*
@@ -2479,17 +2504,18 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
struct kvm_lapic *apic = vcpu->arch.apic;
int r;

-
kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
/* set SPIV separately to get count of SW disabled APICs right */
apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));

r = kvm_apic_state_fixup(vcpu, s, true);
- if (r)
+ if (r) {
+ kvm_recalculate_apic_map(vcpu->kvm);
return r;
+ }
memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));

- recalculate_apic_map(vcpu->kvm);
+ kvm_recalculate_apic_map(vcpu->kvm);
kvm_apic_set_version(vcpu);

apic_update_ppr(apic);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ec6fbfe..7581bc2 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -78,6 +78,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_recalculate_apic_map(struct kvm *kvm);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val);
int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 79bc995..d3802a2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -350,6 +350,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}

kvm_lapic_set_base(vcpu, msr_info->data);
+ kvm_recalculate_apic_map(vcpu->kvm);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
--
2.7.4
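
As a side note to the diff above, the batching contract the patch introduces can be
restated as a small user-space sketch. The types and names below (vm_state,
set_apic_field, recalculate_map) are hypothetical simplifications, not the kernel
code: setters only mark the map stale, and the expensive rebuild happens once, at
the end of the path.

#include <pthread.h>
#include <stdbool.h>

/* Hypothetical stand-in for the relevant parts of struct kvm_arch. */
struct vm_state {
	pthread_mutex_t map_lock;
	bool map_dirty;		/* set cheaply by every setter */
	void *map;		/* the expensive-to-rebuild lookup structure */
};

/* Cheap: each register/MSR setter only marks the map stale. */
static void set_apic_field(struct vm_state *vm)
{
	/* ... store the new register value ... */
	vm->map_dirty = true;
}

/* Expensive: allocation, locking and an RCU swap in the real code. */
static void recalculate_map(struct vm_state *vm)
{
	if (!vm->map_dirty)
		return;

	pthread_mutex_lock(&vm->map_lock);
	if (vm->map_dirty) {
		/* ... rebuild vm->map from the per-vCPU state ... */
		vm->map_dirty = false;
	}
	pthread_mutex_unlock(&vm->map_lock);
}

int main(void)
{
	struct vm_state vm = { .map_lock = PTHREAD_MUTEX_INITIALIZER };

	set_apic_field(&vm);	/* several cheap updates on the reset path... */
	set_apic_field(&vm);
	recalculate_map(&vm);	/* ...one rebuild at the end */
	return 0;
}

The real patch additionally pairs the flag with smp_rmb()/smp_wmb(), so that a
caller which sees the flag already clear is also guaranteed to see the newly
published map.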


2020-03-02 16:33:27

by Paolo Bonzini

Subject: Re: [PATCH v3] KVM: LAPIC: Recalculate apic map in batch

On 26/02/20 03:41, Wanpeng Li wrote:
> From: Wanpeng Li <[email protected]>

Queued, thanks.

Paolo

2020-06-19 13:05:33

by Igor Mammedov

Subject: Re: [PATCH v3] KVM: LAPIC: Recalculate apic map in batch

On Wed, 26 Feb 2020 10:41:02 +0800
Wanpeng Li <[email protected]> wrote:

> Signed-off-by: Wanpeng Li <[email protected]>
this broke CPU hotplug,

qemu-kvm -m 2G -smp 4,maxcpus=8 -monitor stdio
(qemu) device_add qemu64-x86_64-cpu,socket-id=4,core-id=0,thread-id=0

in guest fails with:

smpboot: do_boot_cpu failed(-1) to wakeup CPU#4

which makes me suspect that INIT/SIPI wasn't delivered

Is it a known issue?


2020-06-19 14:13:11

by Paolo Bonzini

Subject: Re: [PATCH v3] KVM: LAPIC: Recalculate apic map in batch

On 19/06/20 14:36, Igor Mammedov wrote:
> qemu-kvm -m 2G -smp 4,maxcpus=8 -monitor stdio
> (qemu) device_add qemu64-x86_64-cpu,socket-id=4,core-id=0,thread-id=0
>
> in guest fails with:
>
> smpboot: do_boot_cpu failed(-1) to wakeup CPU#4
>
> which makes me suspect that INIT/SIPI wasn't delivered
>
> Is it a known issue?
>

No, it isn't. I'll revert.

Paolo

2020-06-19 15:42:02

by Igor Mammedov

Subject: Re: [PATCH v3] KVM: LAPIC: Recalculate apic map in batch

On Fri, 19 Jun 2020 16:10:43 +0200
Paolo Bonzini <[email protected]> wrote:

> On 19/06/20 14:36, Igor Mammedov wrote:
> > qemu-kvm -m 2G -smp 4,maxcpus=8 -monitor stdio
> > (qemu) device_add qemu64-x86_64-cpu,socket-id=4,core-id=0,thread-id=0
> >
> > in guest fails with:
> >
> > smpboot: do_boot_cpu failed(-1) to wakeup CPU#4
> >
> > which makes me suspect that INIT/SIPI wasn't delivered
> >
> > Is it a known issue?
> >
>
> No, it isn't. I'll revert.
wait for a day or 2,

I'll try to come up with a fix over the weekend.

> Paolo
>

2020-06-21 22:29:54

by Igor Mammedov

Subject: Re: [PATCH v3] KVM: LAPIC: Recalculate apic map in batch

On Fri, 19 Jun 2020 16:10:43 +0200
Paolo Bonzini <[email protected]> wrote:

> On 19/06/20 14:36, Igor Mammedov wrote:
> > qemu-kvm -m 2G -smp 4,maxcpus=8 -monitor stdio
> > (qemu) device_add qemu64-x86_64-cpu,socket-id=4,core-id=0,thread-id=0
> >
> > in guest fails with:
> >
> > smpboot: do_boot_cpu failed(-1) to wakeup CPU#4
> >
> > which makes me suspect that INIT/SIPI wasn't delivered
> >
> > Is it a known issue?
> >
>
> No, it isn't. I'll revert.
>
> Paolo
>

The following fixes the immediate issue:

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 34a7e0533dad..6dc177da19da 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2567,6 +2567,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
}
memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));

+ apic->vcpu->kvm->arch.apic_map_dirty = true;
kvm_recalculate_apic_map(vcpu->kvm);
kvm_apic_set_version(vcpu);

The problem is that during kvm_arch_vcpu_create() the new vcpu is not yet visible to
kvm_recalculate_apic_map(), so however many times a map update was requested
during it, the apic map was not affected.

What broke hotplug is that kvm_vcpu_ioctl_set_lapic -> kvm_apic_set_state,
which is called after the new vcpu becomes visible, used to do an unconditional
update that pulled in the new vcpu. With this patch that map update is gone,
since the state hasn't actually changed, so we lost the one call to
kvm_recalculate_apic_map() that did actually matter.

It happens to work for vcpus present at boot just by luck
(the BSP updates SPIV after all vcpus have been created, which triggers
kvm_recalculate_apic_map()).

I'm not sending a formal patch yet, since I have doubts about the subject patch.

following sequence looks like a race that can cause lost map update events:

cpu1                                           cpu2

                                               apic_map_dirty = true
------------------------------------------------------------
kvm_recalculate_apic_map:
    pass check
        mutex_lock(&kvm->arch.apic_map_lock);
        if (!kvm->arch.apic_map_dirty)
    and in process of updating map
-------------------------------------------------------------
other calls to
apic_map_dirty = true                          might be too late for affected cpu
-------------------------------------------------------------
        apic_map_dirty = false
-------------------------------------------------------------
kvm_recalculate_apic_map:
bail out on
    if (!kvm->arch.apic_map_dirty)

It's safer to revert this patch for now, as you suggested earlier.

If you prefer to keep it, I'll post the above fixup as a patch.

2020-06-22 14:40:14

by Paolo Bonzini

Subject: Re: [PATCH v3] KVM: LAPIC: Recalculate apic map in batch

On 22/06/20 00:26, Igor Mammedov wrote:
>
> following sequence looks like a race that can cause lost map update events:
>
> cpu1                                           cpu2
>
>                                                apic_map_dirty = true
> ------------------------------------------------------------
> kvm_recalculate_apic_map:
>     pass check
>         mutex_lock(&kvm->arch.apic_map_lock);
>         if (!kvm->arch.apic_map_dirty)
>     and in process of updating map
> -------------------------------------------------------------
> other calls to
> apic_map_dirty = true                          might be too late for affected cpu
> -------------------------------------------------------------
>         apic_map_dirty = false
> -------------------------------------------------------------
> kvm_recalculate_apic_map:
> bail out on
>     if (!kvm->arch.apic_map_dirty)

I will post a fix for that. Thanks for the analysis!

Paolo
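
The fix Paolo refers to is not part of this thread. Purely as an illustration of
how such a lost-update window can be closed, the sketch below reuses the same
hypothetical user-space types as the earlier sketch (again, not the kernel code)
and clears the dirty flag before rebuilding instead of after: a flag set
concurrently while the map is being rebuilt then survives and forces a later
caller to redo the work.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct vm_state {
	pthread_mutex_t map_lock;
	atomic_bool map_dirty;
	void *map;
};

static void recalculate_map(struct vm_state *vm)
{
	if (!atomic_load(&vm->map_dirty))
		return;

	pthread_mutex_lock(&vm->map_lock);

	/*
	 * Test and clear the flag *before* reading the per-vCPU state.
	 * A concurrent "map_dirty = true" that lands while the map is
	 * being rebuilt is no longer overwritten afterwards; it stays set
	 * and the next caller redoes the rebuild with the fresh state.
	 */
	if (atomic_exchange(&vm->map_dirty, false)) {
		/* ... read the per-vCPU state and rebuild vm->map here ... */
	}

	pthread_mutex_unlock(&vm->map_lock);
}

int main(void)
{
	struct vm_state vm = { .map_lock = PTHREAD_MUTEX_INITIALIZER };

	atomic_store(&vm.map_dirty, true);	/* a setter marked the map stale */
	recalculate_map(&vm);			/* rebuild; the flag is cleared up front */
	return 0;
}

With this ordering the worst case is an extra rebuild, never a map left stale
while the flag reads as clean.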