These should fix, or at least help, the kernel panic reported by Longpeng
with VT-d posted interrupts.
CONFIG_DEBUG_LIST reports a double add, meaning that pi_pre_block ran twice
without pi_post_block deleting the vCPU from the blocked_on_vcpu list.
The only possibility that I could think of is that this:
if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP) ||
!kvm_vcpu_apicv_active(vcpu))
return;
was false in pi_post_block. In turn, I can only think of hot-unplug as
the cause of this imbalance, but maybe there is another way to reach it
just via repeated startup and shutdown. Gonglei reported problems with
hot-unplug offlist too, so this is a start.
In any case, patch 2 replaces it with a check on vcpu->pre_pcpu.
A similar change is done in patch 3 to vmx_vcpu_pi_load. I don't
have hardware easily accessible with VT-d PI, so these patches are
compile-tested only. I apologize for any stupid mistakes.
The first three patches are meant for stable versions too.
Paolo
Paolo Bonzini (4):
KVM: VMX: extract __pi_post_block
KVM: VMX: avoid double list add with VT-d posted interrupts
KVM: VMX: simplify and fix vmx_vcpu_pi_load
KVM: VMX: simplify cmpxchg of PI descriptor control field
arch/x86/kvm/vmx.c | 228 ++++++++++++++++++++++++++---------------------------
1 file changed, 110 insertions(+), 118 deletions(-)
--
2.13.0
Simple code movement patch, preparing for the next one.
Cc: Longpeng (Mike) <[email protected]>
Cc: Huangweidong <[email protected]>
Cc: Gonglei <[email protected]>
Cc: wangxin <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/vmx.c | 71 +++++++++++++++++++++++++++++-------------------------
1 file changed, 38 insertions(+), 33 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d9b4ad2a528a..747d16525b45 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11231,6 +11231,43 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}
+static void __pi_post_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+ unsigned long flags;
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ dest = cpu_physical_id(vcpu->cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* Allow posting non-urgent interrupts */
+ new.sn = 0;
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ if(vcpu->pre_pcpu != -1) {
+ spin_lock_irqsave(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock_irqrestore(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ vcpu->pre_pcpu = -1;
+ }
+}
+
/*
* This routine does the following things for vCPU which is going
* to be blocked if VT-d PI is enabled.
@@ -11324,44 +11361,12 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
static void pi_post_block(struct kvm_vcpu *vcpu)
{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
- unsigned long flags;
-
if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP) ||
!kvm_vcpu_apicv_active(vcpu))
return;
- do {
- old.control = new.control = pi_desc->control;
-
- dest = cpu_physical_id(vcpu->cpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* Allow posting non-urgent interrupts */
- new.sn = 0;
-
- /* set 'NV' to 'notification vector' */
- new.nv = POSTED_INTR_VECTOR;
- } while (cmpxchg(&pi_desc->control, old.control,
- new.control) != old.control);
-
- if(vcpu->pre_pcpu != -1) {
- spin_lock_irqsave(
- &per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
- list_del(&vcpu->blocked_vcpu_list);
- spin_unlock_irqrestore(
- &per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
- vcpu->pre_pcpu = -1;
- }
+ __pi_post_block(vcpu);
}
static void vmx_post_block(struct kvm_vcpu *vcpu)
--
2.13.0
Introduce new helpers __pi_set_ndst and pi_try_cmpxchg_control.
Cc: Longpeng (Mike) <[email protected]>
Cc: Huangweidong <[email protected]>
Cc: Gonglei <[email protected]>
Cc: wangxin <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/vmx.c | 83 ++++++++++++++++++++++++++----------------------------
1 file changed, 40 insertions(+), 43 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81047f373747..4f6b5fa57bbc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -560,6 +560,29 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
+/* Note that this is not an atomic operation. */
+static inline void __pi_set_ndst(struct pi_desc *pi_desc, unsigned cpu)
+{
+ unsigned dest = cpu_physical_id(cpu);
+
+ if (x2apic_enabled())
+ pi_desc->ndst = dest;
+ else
+ pi_desc->ndst = (dest << 8) & 0xFF00;
+}
+
+static inline bool pi_try_cmpxchg_control(struct pi_desc *pi_desc,
+ struct pi_desc *old,
+ struct pi_desc *new)
+{
+ if (cmpxchg(&pi_desc->control, old->control,
+ new->control) == old->control)
+ return true;
+
+ old->control = READ_ONCE(pi_desc->control);
+ return false;
+}
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
@@ -2182,7 +2205,6 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old, new;
- unsigned int dest;
/*
* In case of hot-plug or hot-unplug, we may have to undo
@@ -2209,19 +2231,12 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
}
/* The full case. */
+ old.control = pi_desc->control;
do {
- old.control = new.control = pi_desc->control;
-
- dest = cpu_physical_id(cpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
+ new.control = old.control;
+ __pi_set_ndst(&new, cpu);
new.sn = 0;
- } while (cmpxchg(&pi_desc->control, old.control,
- new.control) != old.control);
+ } while (!pi_try_cmpxchg_control(pi_desc, &old, &new));
}
static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
@@ -11240,24 +11255,16 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old, new;
- unsigned int dest;
+ old.control = pi_desc->control;
do {
- old.control = new.control = pi_desc->control;
WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
"Wakeup handler not enabled while the VCPU is blocked\n");
- dest = cpu_physical_id(vcpu->cpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* set 'NV' to 'notification vector' */
+ new.control = old.control;
+ __pi_set_ndst(&new, vcpu->cpu);
new.nv = POSTED_INTR_VECTOR;
- } while (cmpxchg(&pi_desc->control, old.control,
- new.control) != old.control);
+ } while (!pi_try_cmpxchg_control(pi_desc, &old, &new));
if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
@@ -11265,6 +11272,7 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
vcpu->pre_pcpu = -1;
}
+
}
/*
@@ -11282,7 +11290,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
*/
static int pi_pre_block(struct kvm_vcpu *vcpu)
{
- unsigned int dest;
struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -11302,32 +11309,22 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
}
+ old.control = pi_desc->control;
do {
- old.control = new.control = pi_desc->control;
-
WARN((pi_desc->sn == 1),
"Warning: SN field of posted-interrupts "
"is set before blocking\n");
/*
- * Since vCPU can be preempted during this process,
- * vcpu->cpu could be different with pre_pcpu, we
- * need to set pre_pcpu as the destination of wakeup
- * notification event, then we can find the right vCPU
- * to wakeup in wakeup handler if interrupts happen
- * when the vCPU is in blocked state.
+ * The wakeup_handler expects the VCPU to be on the
+ * blocked_vcpu_list that matches ndst. Interrupts
+ * are disabled so no preemption should happen, but
+ * err on the side of safety.
*/
- dest = cpu_physical_id(vcpu->pre_pcpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* set 'NV' to 'wakeup vector' */
+ new.control = old.control;
+ __pi_set_ndst(&new, vcpu->pre_pcpu);
new.nv = POSTED_INTR_WAKEUP_VECTOR;
- } while (cmpxchg(&pi_desc->control, old.control,
- new.control) != old.control);
+ } while (!pi_try_cmpxchg_control(pi_desc, &old, &new));
/* We should not block the vCPU if an interrupt is posted for it. */
if (pi_test_on(pi_desc) == 1)
--
2.13.0
In some cases, for example involving hot-unplug of assigned
devices, pi_post_block can forget to remove the vCPU from the
blocked_vcpu_list. When this happens, the next call to
pi_pre_block corrupts the list.
Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block
and WARN instead of adding the element twice in the list. Second,
always do the list removal in pi_post_block if vcpu->pre_pcpu is
set (not -1).
The new code keeps interrupts disabled for the whole duration of
pi_pre_block/pi_post_block. This is not strictly necessary, but
easier to follow. For the same reason, PI.ON is checked only
after the cmpxchg, and to handle it we just call the post-block
code. This removes duplication of the list removal code.
Cc: Longpeng (Mike) <[email protected]>
Cc: Huangweidong <[email protected]>
Cc: Gonglei <[email protected]>
Cc: wangxin <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/vmx.c | 62 ++++++++++++++++++++++--------------------------------
1 file changed, 25 insertions(+), 37 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 747d16525b45..0f4714fe4908 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11236,10 +11236,11 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old, new;
unsigned int dest;
- unsigned long flags;
do {
old.control = new.control = pi_desc->control;
+ WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
+ "Wakeup handler not enabled while the VCPU is blocked\n");
dest = cpu_physical_id(vcpu->cpu);
@@ -11256,14 +11257,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
} while (cmpxchg(&pi_desc->control, old.control,
new.control) != old.control);
- if(vcpu->pre_pcpu != -1) {
- spin_lock_irqsave(
- &per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
list_del(&vcpu->blocked_vcpu_list);
- spin_unlock_irqrestore(
- &per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
vcpu->pre_pcpu = -1;
}
}
@@ -11283,7 +11280,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
*/
static int pi_pre_block(struct kvm_vcpu *vcpu)
{
- unsigned long flags;
unsigned int dest;
struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -11293,34 +11289,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
!kvm_vcpu_apicv_active(vcpu))
return 0;
- vcpu->pre_pcpu = vcpu->cpu;
- spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
- list_add_tail(&vcpu->blocked_vcpu_list,
- &per_cpu(blocked_vcpu_on_cpu,
- vcpu->pre_pcpu));
- spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
+ vcpu->pre_pcpu = vcpu->cpu;
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_add_tail(&vcpu->blocked_vcpu_list,
+ &per_cpu(blocked_vcpu_on_cpu,
+ vcpu->pre_pcpu));
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ }
do {
old.control = new.control = pi_desc->control;
- /*
- * We should not block the vCPU if
- * an interrupt is posted for it.
- */
- if (pi_test_on(pi_desc) == 1) {
- spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
- list_del(&vcpu->blocked_vcpu_list);
- spin_unlock_irqrestore(
- &per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
- vcpu->pre_pcpu = -1;
-
- return 1;
- }
-
WARN((pi_desc->sn == 1),
"Warning: SN field of posted-interrupts "
"is set before blocking\n");
@@ -11345,7 +11327,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
} while (cmpxchg(&pi_desc->control, old.control,
new.control) != old.control);
- return 0;
+ /* We should not block the vCPU if an interrupt is posted for it. */
+ if (pi_test_on(pi_desc) == 1)
+ __pi_post_block(vcpu);
+
+ local_irq_enable();
+ return (vcpu->pre_pcpu == -1);
}
static int vmx_pre_block(struct kvm_vcpu *vcpu)
@@ -11361,12 +11348,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
static void pi_post_block(struct kvm_vcpu *vcpu)
{
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
+ if (vcpu->pre_pcpu == -1)
return;
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
__pi_post_block(vcpu);
+ local_irq_enable();
}
static void vmx_post_block(struct kvm_vcpu *vcpu)
--
2.13.0
The simplify part: do not touch pi_desc.nv, we can set it when the
VCPU is first created. Likewise, pi_desc.sn is only handled by
vmx_vcpu_pi_load, do not touch it in __pi_post_block.
The fix part: do not check kvm_arch_has_assigned_device, instead
check the SN bit to figure out whether vmx_vcpu_pi_put ran before.
This matches what the previous patch did in pi_post_block.
Cc: Longpeng (Mike) <[email protected]>
Cc: Huangweidong <[email protected]>
Cc: Gonglei <[email protected]>
Cc: wangxin <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/vmx.c | 68 ++++++++++++++++++++++++++++--------------------------
1 file changed, 35 insertions(+), 33 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0f4714fe4908..81047f373747 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2184,43 +2184,41 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
struct pi_desc old, new;
unsigned int dest;
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
+ /*
+ * In case of hot-plug or hot-unplug, we may have to undo
+ * vmx_vcpu_pi_put even if there is no assigned device. And we
+ * always keep PI.NDST up to date for simplicity: it makes the
+ * code easier, and CPU migration is not a fast path.
+ */
+ if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+ return;
+
+ /*
+ * First handle the simple case where no cmpxchg is necessary; just
+ * allow posting non-urgent interrupts.
+ *
+ * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+ * PI.NDST: pi_post_block will do it for us and the wakeup_handler
+ * expects the VCPU to be on the blocked_vcpu_list that matches
+ * PI.NDST.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
+ vcpu->cpu == cpu) {
+ pi_clear_sn(pi_desc);
return;
+ }
+ /* The full case. */
do {
old.control = new.control = pi_desc->control;
- /*
- * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
- * are two possible cases:
- * 1. After running 'pre_block', context switch
- * happened. For this case, 'sn' was set in
- * vmx_vcpu_put(), so we need to clear it here.
- * 2. After running 'pre_block', we were blocked,
- * and woken up by some other guy. For this case,
- * we don't need to do anything, 'pi_post_block'
- * will do everything for us. However, we cannot
- * check whether it is case #1 or case #2 here
- * (maybe, not needed), so we also clear sn here,
- * I think it is not a big deal.
- */
- if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
- if (vcpu->cpu != cpu) {
- dest = cpu_physical_id(cpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
- }
+ dest = cpu_physical_id(cpu);
- /* set 'NV' to 'notification vector' */
- new.nv = POSTED_INTR_VECTOR;
- }
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
- /* Allow posting non-urgent interrupts */
new.sn = 0;
} while (cmpxchg(&pi_desc->control, old.control,
new.control) != old.control);
@@ -9259,6 +9257,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+ /*
+ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+ * or POSTED_INTR_WAKEUP_VECTOR.
+ */
+ vmx->pi_desc.nv = POSTED_INTR_VECTOR;
+ vmx->pi_desc.sn = 1;
+
return &vmx->vcpu;
free_vmcs:
@@ -11249,9 +11254,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
else
new.ndst = (dest << 8) & 0xFF00;
- /* Allow posting non-urgent interrupts */
- new.sn = 0;
-
/* set 'NV' to 'notification vector' */
new.nv = POSTED_INTR_VECTOR;
} while (cmpxchg(&pi_desc->control, old.control,
--
2.13.0
On 2017/6/6 18:57, Paolo Bonzini wrote:
> In some cases, for example involving hot-unplug of assigned
> devices, pi_post_block can forget to remove the vCPU from the
> blocked_vcpu_list. When this happens, the next call to
> pi_pre_block corrupts the list.
>
> Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block
> and WARN instead of adding the element twice in the list. Second,
> always do the list removal in pi_post_block if vcpu->pre_pcpu is
> set (not -1).
>
> The new code keeps interrupts disabled for the whole duration of
> pi_pre_block/pi_post_block. This is not strictly necessary, but
> easier to follow. For the same reason, PI.ON is checked only
> after the cmpxchg, and to handle it we just call the post-block
> code. This removes duplication of the list removal code.
>
> Cc: Longpeng (Mike) <[email protected]>
> Cc: Huangweidong <[email protected]>
> Cc: Gonglei <[email protected]>
> Cc: wangxin <[email protected]>
> Cc: Radim Krčmář <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/vmx.c | 62 ++++++++++++++++++++++--------------------------------
> 1 file changed, 25 insertions(+), 37 deletions(-)
>
[...]
> @@ -11256,14 +11257,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> } while (cmpxchg(&pi_desc->control, old.control,
> new.control) != old.control);
>
> - if(vcpu->pre_pcpu != -1) {
> - spin_lock_irqsave(
> - &per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
> + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> list_del(&vcpu->blocked_vcpu_list);
> - spin_unlock_irqrestore(
> - &per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
Hi Paolo,
spin_lock_irqsave() will disable kernel preempt, but spin_lock() won't. is there
some potential problems ?
Regards,
Longpeng(Mike)
> vcpu->pre_pcpu = -1;
> }
> }
> @@ -11283,7 +11280,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> */
> static int pi_pre_block(struct kvm_vcpu *vcpu)
> {
> - unsigned long flags;
> unsigned int dest;
> struct pi_desc old, new;
> struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> @@ -11293,34 +11289,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
> !kvm_vcpu_apicv_active(vcpu))
> return 0;
>
> - vcpu->pre_pcpu = vcpu->cpu;
> - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> - list_add_tail(&vcpu->blocked_vcpu_list,
> - &per_cpu(blocked_vcpu_on_cpu,
> - vcpu->pre_pcpu));
> - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> + WARN_ON(irqs_disabled());
> + local_irq_disable();
> + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
> + vcpu->pre_pcpu = vcpu->cpu;
> + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> + list_add_tail(&vcpu->blocked_vcpu_list,
> + &per_cpu(blocked_vcpu_on_cpu,
> + vcpu->pre_pcpu));
> + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> + }
>
> do {
> old.control = new.control = pi_desc->control;
>
> - /*
> - * We should not block the vCPU if
> - * an interrupt is posted for it.
> - */
> - if (pi_test_on(pi_desc) == 1) {
> - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> - list_del(&vcpu->blocked_vcpu_list);
> - spin_unlock_irqrestore(
> - &per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> - vcpu->pre_pcpu = -1;
> -
> - return 1;
> - }
> -
> WARN((pi_desc->sn == 1),
> "Warning: SN field of posted-interrupts "
> "is set before blocking\n");
> @@ -11345,7 +11327,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
> } while (cmpxchg(&pi_desc->control, old.control,
> new.control) != old.control);
>
> - return 0;
> + /* We should not block the vCPU if an interrupt is posted for it. */
> + if (pi_test_on(pi_desc) == 1)
> + __pi_post_block(vcpu);
> +
> + local_irq_enable();
> + return (vcpu->pre_pcpu == -1);
> }
>
> static int vmx_pre_block(struct kvm_vcpu *vcpu)
> @@ -11361,12 +11348,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
>
> static void pi_post_block(struct kvm_vcpu *vcpu)
> {
> - if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
> - !irq_remapping_cap(IRQ_POSTING_CAP) ||
> - !kvm_vcpu_apicv_active(vcpu))
> + if (vcpu->pre_pcpu == -1)
> return;
>
> + WARN_ON(irqs_disabled());
> + local_irq_disable();
> __pi_post_block(vcpu);
> + local_irq_enable();
> }
>
> static void vmx_post_block(struct kvm_vcpu *vcpu)
--
Regards,
Longpeng(Mike)
On 06/06/2017 14:30, Longpeng (Mike) wrote:
>
>
> On 2017/6/6 18:57, Paolo Bonzini wrote:
>
>> In some cases, for example involving hot-unplug of assigned
>> devices, pi_post_block can forget to remove the vCPU from the
>> blocked_vcpu_list. When this happens, the next call to
>> pi_pre_block corrupts the list.
>>
>> Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block
>> and WARN instead of adding the element twice in the list. Second,
>> always do the list removal in pi_post_block if vcpu->pre_pcpu is
>> set (not -1).
>>
>> The new code keeps interrupts disabled for the whole duration of
>> pi_pre_block/pi_post_block. This is not strictly necessary, but
>> easier to follow. For the same reason, PI.ON is checked only
>> after the cmpxchg, and to handle it we just call the post-block
>> code. This removes duplication of the list removal code.
>>
>> Cc: Longpeng (Mike) <[email protected]>
>> Cc: Huangweidong <[email protected]>
>> Cc: Gonglei <[email protected]>
>> Cc: wangxin <[email protected]>
>> Cc: Radim Krčmář <[email protected]>
>> Signed-off-by: Paolo Bonzini <[email protected]>
>> ---
>> arch/x86/kvm/vmx.c | 62 ++++++++++++++++++++++--------------------------------
>> 1 file changed, 25 insertions(+), 37 deletions(-)
>>
>
>
> [...]
>
>
>> @@ -11256,14 +11257,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
>> } while (cmpxchg(&pi_desc->control, old.control,
>> new.control) != old.control);
>>
>> - if(vcpu->pre_pcpu != -1) {
>> - spin_lock_irqsave(
>> - &per_cpu(blocked_vcpu_on_cpu_lock,
>> - vcpu->pre_pcpu), flags);
>> + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
>> + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
>> list_del(&vcpu->blocked_vcpu_list);
>> - spin_unlock_irqrestore(
>> - &per_cpu(blocked_vcpu_on_cpu_lock,
>> - vcpu->pre_pcpu), flags);
>> + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
>
>
> Hi Paolo,
>
> spin_lock_irqsave() will disable kernel preempt, but spin_lock() won't. is there
> some potential problems ?
Hi,
This function (and pi_pre_block too's part where it takes the spin lock)
runs with interrupts disabled now.
Thanks,
Paolo
> Regards,
> Longpeng(Mike)
>
>> vcpu->pre_pcpu = -1;
>> }
>> }
>> @@ -11283,7 +11280,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
>> */
>> static int pi_pre_block(struct kvm_vcpu *vcpu)
>> {
>> - unsigned long flags;
>> unsigned int dest;
>> struct pi_desc old, new;
>> struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
>> @@ -11293,34 +11289,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
>> !kvm_vcpu_apicv_active(vcpu))
>> return 0;
>>
>> - vcpu->pre_pcpu = vcpu->cpu;
>> - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
>> - vcpu->pre_pcpu), flags);
>> - list_add_tail(&vcpu->blocked_vcpu_list,
>> - &per_cpu(blocked_vcpu_on_cpu,
>> - vcpu->pre_pcpu));
>> - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
>> - vcpu->pre_pcpu), flags);
>> + WARN_ON(irqs_disabled());
>> + local_irq_disable();
>> + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
>> + vcpu->pre_pcpu = vcpu->cpu;
>> + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
>> + list_add_tail(&vcpu->blocked_vcpu_list,
>> + &per_cpu(blocked_vcpu_on_cpu,
>> + vcpu->pre_pcpu));
>> + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
>> + }
>>
>> do {
>> old.control = new.control = pi_desc->control;
>>
>> - /*
>> - * We should not block the vCPU if
>> - * an interrupt is posted for it.
>> - */
>> - if (pi_test_on(pi_desc) == 1) {
>> - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
>> - vcpu->pre_pcpu), flags);
>> - list_del(&vcpu->blocked_vcpu_list);
>> - spin_unlock_irqrestore(
>> - &per_cpu(blocked_vcpu_on_cpu_lock,
>> - vcpu->pre_pcpu), flags);
>> - vcpu->pre_pcpu = -1;
>> -
>> - return 1;
>> - }
>> -
>> WARN((pi_desc->sn == 1),
>> "Warning: SN field of posted-interrupts "
>> "is set before blocking\n");
>> @@ -11345,7 +11327,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
>> } while (cmpxchg(&pi_desc->control, old.control,
>> new.control) != old.control);
>>
>> - return 0;
>> + /* We should not block the vCPU if an interrupt is posted for it. */
>> + if (pi_test_on(pi_desc) == 1)
>> + __pi_post_block(vcpu);
>> +
>> + local_irq_enable();
>> + return (vcpu->pre_pcpu == -1);
>> }
>>
>> static int vmx_pre_block(struct kvm_vcpu *vcpu)
>> @@ -11361,12 +11348,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
>>
>> static void pi_post_block(struct kvm_vcpu *vcpu)
>> {
>> - if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
>> - !irq_remapping_cap(IRQ_POSTING_CAP) ||
>> - !kvm_vcpu_apicv_active(vcpu))
>> + if (vcpu->pre_pcpu == -1)
>> return;
>>
>> + WARN_ON(irqs_disabled());
>> + local_irq_disable();
>> __pi_post_block(vcpu);
>> + local_irq_enable();
>> }
>>
>> static void vmx_post_block(struct kvm_vcpu *vcpu)
>
>
On 2017/6/6 20:35, Paolo Bonzini wrote:
>
>
> On 06/06/2017 14:30, Longpeng (Mike) wrote:
>>
>>
>> On 2017/6/6 18:57, Paolo Bonzini wrote:
>>
>>> In some cases, for example involving hot-unplug of assigned
>>> devices, pi_post_block can forget to remove the vCPU from the
>>> blocked_vcpu_list. When this happens, the next call to
>>> pi_pre_block corrupts the list.
>>>
>>> Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block
>>> and WARN instead of adding the element twice in the list. Second,
>>> always do the list removal in pi_post_block if vcpu->pre_pcpu is
>>> set (not -1).
>>>
>>> The new code keeps interrupts disabled for the whole duration of
>>> pi_pre_block/pi_post_block. This is not strictly necessary, but
>>> easier to follow. For the same reason, PI.ON is checked only
>>> after the cmpxchg, and to handle it we just call the post-block
>>> code. This removes duplication of the list removal code.
>>>
>>> Cc: Longpeng (Mike) <[email protected]>
>>> Cc: Huangweidong <[email protected]>
>>> Cc: Gonglei <[email protected]>
>>> Cc: wangxin <[email protected]>
>>> Cc: Radim Krčmář <[email protected]>
>>> Signed-off-by: Paolo Bonzini <[email protected]>
>>> ---
>>> arch/x86/kvm/vmx.c | 62 ++++++++++++++++++++++--------------------------------
>>> 1 file changed, 25 insertions(+), 37 deletions(-)
>>>
>>
>>
>> [...]
>>
>>
>>> @@ -11256,14 +11257,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
>>> } while (cmpxchg(&pi_desc->control, old.control,
>>> new.control) != old.control);
>>>
>>> - if(vcpu->pre_pcpu != -1) {
>>> - spin_lock_irqsave(
>>> - &per_cpu(blocked_vcpu_on_cpu_lock,
>>> - vcpu->pre_pcpu), flags);
>>> + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
>>> + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
>>> list_del(&vcpu->blocked_vcpu_list);
>>> - spin_unlock_irqrestore(
>>> - &per_cpu(blocked_vcpu_on_cpu_lock,
>>> - vcpu->pre_pcpu), flags);
>>> + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
>>
>>
>> Hi Paolo,
>>
>> spin_lock_irqsave() will disable kernel preempt, but spin_lock() won't. is there
>> some potential problems ?
>
> Hi,
>
> This function (and pi_pre_block too's part where it takes the spin lock)
> runs with interrupts disabled now.
>
Oh, yes, please forgive my foolish.
We'll continue to find why the list is corrupt when repeat poweron/shutdown
Thanks.
> Thanks,
>
> Paolo
>
>> Regards,
>> Longpeng(Mike)
>>
>>> vcpu->pre_pcpu = -1;
>>> }
>>> }
>>> @@ -11283,7 +11280,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
>>> */
>>> static int pi_pre_block(struct kvm_vcpu *vcpu)
>>> {
>>> - unsigned long flags;
>>> unsigned int dest;
>>> struct pi_desc old, new;
>>> struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
>>> @@ -11293,34 +11289,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
>>> !kvm_vcpu_apicv_active(vcpu))
>>> return 0;
>>>
>>> - vcpu->pre_pcpu = vcpu->cpu;
>>> - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
>>> - vcpu->pre_pcpu), flags);
>>> - list_add_tail(&vcpu->blocked_vcpu_list,
>>> - &per_cpu(blocked_vcpu_on_cpu,
>>> - vcpu->pre_pcpu));
>>> - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
>>> - vcpu->pre_pcpu), flags);
>>> + WARN_ON(irqs_disabled());
>>> + local_irq_disable();
>>> + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
>>> + vcpu->pre_pcpu = vcpu->cpu;
>>> + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
>>> + list_add_tail(&vcpu->blocked_vcpu_list,
>>> + &per_cpu(blocked_vcpu_on_cpu,
>>> + vcpu->pre_pcpu));
>>> + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
>>> + }
>>>
>>> do {
>>> old.control = new.control = pi_desc->control;
>>>
>>> - /*
>>> - * We should not block the vCPU if
>>> - * an interrupt is posted for it.
>>> - */
>>> - if (pi_test_on(pi_desc) == 1) {
>>> - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
>>> - vcpu->pre_pcpu), flags);
>>> - list_del(&vcpu->blocked_vcpu_list);
>>> - spin_unlock_irqrestore(
>>> - &per_cpu(blocked_vcpu_on_cpu_lock,
>>> - vcpu->pre_pcpu), flags);
>>> - vcpu->pre_pcpu = -1;
>>> -
>>> - return 1;
>>> - }
>>> -
>>> WARN((pi_desc->sn == 1),
>>> "Warning: SN field of posted-interrupts "
>>> "is set before blocking\n");
>>> @@ -11345,7 +11327,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
>>> } while (cmpxchg(&pi_desc->control, old.control,
>>> new.control) != old.control);
>>>
>>> - return 0;
>>> + /* We should not block the vCPU if an interrupt is posted for it. */
>>> + if (pi_test_on(pi_desc) == 1)
>>> + __pi_post_block(vcpu);
>>> +
>>> + local_irq_enable();
>>> + return (vcpu->pre_pcpu == -1);
>>> }
>>>
>>> static int vmx_pre_block(struct kvm_vcpu *vcpu)
>>> @@ -11361,12 +11348,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
>>>
>>> static void pi_post_block(struct kvm_vcpu *vcpu)
>>> {
>>> - if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
>>> - !irq_remapping_cap(IRQ_POSTING_CAP) ||
>>> - !kvm_vcpu_apicv_active(vcpu))
>>> + if (vcpu->pre_pcpu == -1)
>>> return;
>>>
>>> + WARN_ON(irqs_disabled());
>>> + local_irq_disable();
>>> __pi_post_block(vcpu);
>>> + local_irq_enable();
>>> }
>>>
>>> static void vmx_post_block(struct kvm_vcpu *vcpu)
>>
>>
>
> .
>
--
Regards,
Longpeng(Mike)
Hi Paolo,
[auto build test WARNING on kvm/linux-next]
[also build test WARNING on v4.12-rc4 next-20170606]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Paolo-Bonzini/VT-d-PI-fixes/20170607-042637
base: https://git.kernel.org/pub/scm/virt/kvm/kvm.git linux-next
config: i386-randconfig-x012-201723 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
All warnings (new ones prefixed by >>):
In file included from arch/x86/include/asm/atomic.h:7:0,
from include/linux/atomic.h:4,
from include/linux/mm_types_task.h:12,
from include/linux/mm_types.h:4,
from arch/x86/kvm/irq.h:25,
from arch/x86/kvm/vmx.c:19:
arch/x86/kvm/vmx.c: In function '__pi_post_block':
>> arch/x86/include/asm/cmpxchg.h:129:2: warning: '__ret' is used uninitialized in this function [-Wuninitialized]
__ret; \
^~~~~
arch/x86/include/asm/cmpxchg.h:86:21: note: '__ret' was declared here
__typeof__(*(ptr)) __ret; \
^
>> arch/x86/include/asm/cmpxchg.h:133:2: note: in expansion of macro '__raw_cmpxchg'
__raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
^~~~~~~~~~~~~
>> arch/x86/include/asm/cmpxchg.h:148:2: note: in expansion of macro '__cmpxchg'
__cmpxchg(ptr, old, new, sizeof(*(ptr)))
^~~~~~~~~
>> arch/x86/kvm/vmx.c:11242:11: note: in expansion of macro 'cmpxchg'
} while (cmpxchg(&pi_desc->control, old.control,
^~~~~~~
--
In file included from arch/x86/include/asm/atomic.h:7:0,
from include/linux/atomic.h:4,
from include/linux/mm_types_task.h:12,
from include/linux/mm_types.h:4,
from arch/x86//kvm/irq.h:25,
from arch/x86//kvm/vmx.c:19:
arch/x86//kvm/vmx.c: In function '__pi_post_block':
>> arch/x86/include/asm/cmpxchg.h:129:2: warning: '__ret' is used uninitialized in this function [-Wuninitialized]
__ret; \
^~~~~
arch/x86/include/asm/cmpxchg.h:86:21: note: '__ret' was declared here
__typeof__(*(ptr)) __ret; \
^
>> arch/x86/include/asm/cmpxchg.h:133:2: note: in expansion of macro '__raw_cmpxchg'
__raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
^~~~~~~~~~~~~
>> arch/x86/include/asm/cmpxchg.h:148:2: note: in expansion of macro '__cmpxchg'
__cmpxchg(ptr, old, new, sizeof(*(ptr)))
^~~~~~~~~
arch/x86//kvm/vmx.c:11242:11: note: in expansion of macro 'cmpxchg'
} while (cmpxchg(&pi_desc->control, old.control,
^~~~~~~
vim +/cmpxchg +11242 arch/x86/kvm/vmx.c
11226
11227 do {
11228 old.control = new.control = pi_desc->control;
11229
11230 dest = cpu_physical_id(vcpu->cpu);
11231
11232 if (x2apic_enabled())
11233 new.ndst = dest;
11234 else
11235 new.ndst = (dest << 8) & 0xFF00;
11236
11237 /* Allow posting non-urgent interrupts */
11238 new.sn = 0;
11239
11240 /* set 'NV' to 'notification vector' */
11241 new.nv = POSTED_INTR_VECTOR;
11242 } while (cmpxchg(&pi_desc->control, old.control,
11243 new.control) != old.control);
11244
11245 if(vcpu->pre_pcpu != -1) {
11246 spin_lock_irqsave(
11247 &per_cpu(blocked_vcpu_on_cpu_lock,
11248 vcpu->pre_pcpu), flags);
11249 list_del(&vcpu->blocked_vcpu_list);
11250 spin_unlock_irqrestore(
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
Hi Paolo,
[auto build test ERROR on kvm/linux-next]
[also build test ERROR on v4.12-rc4 next-20170606]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Paolo-Bonzini/VT-d-PI-fixes/20170607-042637
base: https://git.kernel.org/pub/scm/virt/kvm/kvm.git linux-next
config: i386-randconfig-x012-201723 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
All errors (new ones prefixed by >>):
In file included from arch/x86/include/asm/atomic.h:7:0,
from include/linux/atomic.h:4,
from include/linux/mm_types_task.h:12,
from include/linux/mm_types.h:4,
from arch/x86//kvm/irq.h:25,
from arch/x86//kvm/vmx.c:19:
arch/x86//kvm/vmx.c: In function '__pi_post_block':
arch/x86/include/asm/cmpxchg.h:129:2: warning: '__ret' is used uninitialized in this function [-Wuninitialized]
__ret; \
^~~~~
arch/x86/include/asm/cmpxchg.h:86:21: note: '__ret' was declared here
__typeof__(*(ptr)) __ret; \
^
arch/x86/include/asm/cmpxchg.h:133:2: note: in expansion of macro '__raw_cmpxchg'
__raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
^~~~~~~~~~~~~
arch/x86/include/asm/cmpxchg.h:148:2: note: in expansion of macro '__cmpxchg'
__cmpxchg(ptr, old, new, sizeof(*(ptr)))
^~~~~~~~~
arch/x86//kvm/vmx.c:11243:11: note: in expansion of macro 'cmpxchg'
} while (cmpxchg(&pi_desc->control, old.control,
^~~~~~~
In function '__pi_post_block',
inlined from 'pi_post_block' at arch/x86//kvm/vmx.c:11342:2,
inlined from 'vmx_post_block' at arch/x86//kvm/vmx.c:11351:2:
>> arch/x86/include/asm/cmpxchg.h:127:3: error: call to '__cmpxchg_wrong_size' declared with attribute error: Bad argument size for cmpxchg
__cmpxchg_wrong_size(); \
^~~~~~~~~~~~~~~~~~~~~~
arch/x86/include/asm/cmpxchg.h:133:2: note: in expansion of macro '__raw_cmpxchg'
__raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
^~~~~~~~~~~~~
arch/x86/include/asm/cmpxchg.h:148:2: note: in expansion of macro '__cmpxchg'
__cmpxchg(ptr, old, new, sizeof(*(ptr)))
^~~~~~~~~
arch/x86//kvm/vmx.c:11243:11: note: in expansion of macro 'cmpxchg'
} while (cmpxchg(&pi_desc->control, old.control,
^~~~~~~
--
In file included from arch/x86/include/asm/atomic.h:7:0,
from include/linux/atomic.h:4,
from include/linux/mm_types_task.h:12,
from include/linux/mm_types.h:4,
from arch/x86/kvm/irq.h:25,
from arch/x86/kvm/vmx.c:19:
arch/x86/kvm/vmx.c: In function '__pi_post_block':
arch/x86/include/asm/cmpxchg.h:129:2: warning: '__ret' is used uninitialized in this function [-Wuninitialized]
__ret; \
^~~~~
arch/x86/include/asm/cmpxchg.h:86:21: note: '__ret' was declared here
__typeof__(*(ptr)) __ret; \
^
arch/x86/include/asm/cmpxchg.h:133:2: note: in expansion of macro '__raw_cmpxchg'
__raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
^~~~~~~~~~~~~
arch/x86/include/asm/cmpxchg.h:148:2: note: in expansion of macro '__cmpxchg'
__cmpxchg(ptr, old, new, sizeof(*(ptr)))
^~~~~~~~~
arch/x86/kvm/vmx.c:11243:11: note: in expansion of macro 'cmpxchg'
} while (cmpxchg(&pi_desc->control, old.control,
^~~~~~~
In function '__pi_post_block',
inlined from 'pi_post_block' at arch/x86/kvm/vmx.c:11342:2,
inlined from 'vmx_post_block' at arch/x86/kvm/vmx.c:11351:2:
>> arch/x86/include/asm/cmpxchg.h:127:3: error: call to '__cmpxchg_wrong_size' declared with attribute error: Bad argument size for cmpxchg
__cmpxchg_wrong_size(); \
^~~~~~~~~~~~~~~~~~~~~~
arch/x86/include/asm/cmpxchg.h:133:2: note: in expansion of macro '__raw_cmpxchg'
__raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
^~~~~~~~~~~~~
arch/x86/include/asm/cmpxchg.h:148:2: note: in expansion of macro '__cmpxchg'
__cmpxchg(ptr, old, new, sizeof(*(ptr)))
^~~~~~~~~
arch/x86/kvm/vmx.c:11243:11: note: in expansion of macro 'cmpxchg'
} while (cmpxchg(&pi_desc->control, old.control,
^~~~~~~
vim +/__cmpxchg_wrong_size +127 arch/x86/include/asm/cmpxchg.h
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 121 : "=a" (__ret), "+m" (*__ptr) \
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 122 : "r" (__new), "0" (__old) \
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 123 : "memory"); \
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 124 break; \
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 125 } \
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 126 default: \
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 @127 __cmpxchg_wrong_size(); \
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 128 } \
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 129 __ret; \
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 130 })
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 131
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 132 #define __cmpxchg(ptr, old, new, size) \
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 133 __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 134
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 135 #define __sync_cmpxchg(ptr, old, new, size) \
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 136 __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 137
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 138 #define __cmpxchg_local(ptr, old, new, size) \
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 139 __raw_cmpxchg((ptr), (old), (new), (size), "")
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 140
96a388de include/asm-x86/cmpxchg.h Thomas Gleixner 2007-10-11 141 #ifdef CONFIG_X86_32
a1ce3928 arch/x86/include/asm/cmpxchg.h David Howells 2012-10-02 142 # include <asm/cmpxchg_32.h>
96a388de include/asm-x86/cmpxchg.h Thomas Gleixner 2007-10-11 143 #else
a1ce3928 arch/x86/include/asm/cmpxchg.h David Howells 2012-10-02 144 # include <asm/cmpxchg_64.h>
96a388de include/asm-x86/cmpxchg.h Thomas Gleixner 2007-10-11 145 #endif
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 146
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 147 #define cmpxchg(ptr, old, new) \
fc395b92 arch/x86/include/asm/cmpxchg.h Jan Beulich 2012-01-26 @148 __cmpxchg(ptr, old, new, sizeof(*(ptr)))
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 149
e9826380 arch/x86/include/asm/cmpxchg.h Jeremy Fitzhardinge 2011-08-18 150 #define sync_cmpxchg(ptr, old, new) \
fc395b92 arch/x86/include/asm/cmpxchg.h Jan Beulich 2012-01-26 151 __sync_cmpxchg(ptr, old, new, sizeof(*(ptr)))
:::::: The code at line 127 was first introduced by commit
:::::: e9826380d83d1bda3ee5663bf3fa4667a6fbe60a x86, cmpxchg: Unify cmpxchg into cmpxchg.h
:::::: TO: Jeremy Fitzhardinge <[email protected]>
:::::: CC: H. Peter Anvin <[email protected]>
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
> -----Original Message-----
> From: Paolo Bonzini [mailto:[email protected]] On Behalf Of Paolo
> Bonzini
> Sent: Tuesday, June 06, 2017 6:57 PM
> To: [email protected]; [email protected]
> Cc: longpeng; Huangweidong (C); Gonglei (Arei); wangxin (U); Radim Kr?m??
> Subject: [PATCH CFT 0/4] VT-d PI fixes
>
> These should fix, or at least help, the kernel panic reported by Longpeng
> with VT-d posted interrupts.
>
> CONFIG_DEBUG_LIST reports a double add, meaning that pi_pre_block ran
> twice
> without pi_post_block deleting the vCPU from the blocked_on_vcpu list.
> The only possibility that I could think of is that this:
>
> if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
> !irq_remapping_cap(IRQ_POSTING_CAP) ||
> !kvm_vcpu_apicv_active(vcpu))
> return;
>
> was false in pi_post_block. In turn, I can only think of hot-unplug as
> the cause of this imbalance, but maybe there is another way to reach it
> just via repeated startup and shutdown. Gonglei reported problems with
> hot-unplug offlist too, so this is a start.
>
> In any case, patch 2 replaces it with a check on vcpu->pre_pcpu.
> A similar change is done in patch 3 to vmx_vcpu_pi_load. I don't
> have hardware easily accessible with VT-d PI, so these patches are
> compile-tested only. I apologize for any stupid mistakes.
>
Hi Paolo,
We are testing your patch, but maybe need some time to report
the results because it's not an inevitable problem.
Meanwhile we also try to find a possible scenario of non-hotplugging to
explain the double-add warnings.
We found that some other VMs start failed before the kernel painc:
2017-06-02T12:27:49.972583Z qemu-kvm: -device vfio-pci,host=0b:10.4,id=hostdev0,bus=pci.0,addr=0x5: vfio: error getting device 0000:
0b:10.4 from group 97: No such device
Verify all devices in group 97 are bound to vfio-<bus> or pci-stub and not already in use
2017-06-02T12:27:49.975925Z qemu-kvm: -device vfio-pci,host=0b:10.4,id=hostdev0,bus=pci.0,addr=0x5: vfio: failed to get device 0000:
0b:10.4
2017-06-02T12:27:51.246385Z qemu-kvm: -device vfio-pci,host=0b:10.4,id=hostdev0,bus=pci.0,addr=0x5: Device initialization failed
2017-06-02 12:27:53.628: shutting down, reason=crashed
2017-06-02 12:30:48.723: shutting down, reason=failed
But we don't think those failure will cause the unequal of kvm->arch.assigned_device_count
between pi_pre_block and pi_post_block. Am I right?
Thanks,
-Gonglei
On 07/06/2017 11:33, Gonglei (Arei) wrote:
>
>> -----Original Message-----
>> From: Paolo Bonzini [mailto:[email protected]] On Behalf Of Paolo
>> Bonzini
>> Sent: Tuesday, June 06, 2017 6:57 PM
>> To: [email protected]; [email protected]
>> Cc: longpeng; Huangweidong (C); Gonglei (Arei); wangxin (U); Radim Kr?m??
>> Subject: [PATCH CFT 0/4] VT-d PI fixes
>>
>> These should fix, or at least help, the kernel panic reported by Longpeng
>> with VT-d posted interrupts.
>>
>> CONFIG_DEBUG_LIST reports a double add, meaning that pi_pre_block ran
>> twice
>> without pi_post_block deleting the vCPU from the blocked_on_vcpu list.
>> The only possibility that I could think of is that this:
>>
>> if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
>> !irq_remapping_cap(IRQ_POSTING_CAP) ||
>> !kvm_vcpu_apicv_active(vcpu))
>> return;
>>
>> was false in pi_post_block. In turn, I can only think of hot-unplug as
>> the cause of this imbalance, but maybe there is another way to reach it
>> just via repeated startup and shutdown. Gonglei reported problems with
>> hot-unplug offlist too, so this is a start.
>>
>> In any case, patch 2 replaces it with a check on vcpu->pre_pcpu.
>> A similar change is done in patch 3 to vmx_vcpu_pi_load. I don't
>> have hardware easily accessible with VT-d PI, so these patches are
>> compile-tested only. I apologize for any stupid mistakes.
>>
> Hi Paolo,
>
> We are testing your patch, but maybe need some time to report
> the results because it's not an inevitable problem.
Of course! I guess it should run for at least a couple days before
deeming it fixed. If you didn't find any immediate showstopper bugs,
that's already good. :)
Paolo
> Meanwhile we also try to find a possible scenario of non-hotplugging to
> explain the double-add warnings.
>
> We found that some other VMs start failed before the kernel painc:
>
> 2017-06-02T12:27:49.972583Z qemu-kvm: -device vfio-pci,host=0b:10.4,id=hostdev0,bus=pci.0,addr=0x5: vfio: error getting device 0000:
> 0b:10.4 from group 97: No such device
> Verify all devices in group 97 are bound to vfio-<bus> or pci-stub and not already in use
> 2017-06-02T12:27:49.975925Z qemu-kvm: -device vfio-pci,host=0b:10.4,id=hostdev0,bus=pci.0,addr=0x5: vfio: failed to get device 0000:
> 0b:10.4
> 2017-06-02T12:27:51.246385Z qemu-kvm: -device vfio-pci,host=0b:10.4,id=hostdev0,bus=pci.0,addr=0x5: Device initialization failed
> 2017-06-02 12:27:53.628: shutting down, reason=crashed
> 2017-06-02 12:30:48.723: shutting down, reason=failed
>
> But we don't think those failure will cause the unequal of kvm->arch.assigned_device_count
> between pi_pre_block and pi_post_block. Am I right?
>
> Thanks,
> -Gonglei
>
>
On Tue, Jun 06, 2017 at 12:57:05PM +0200, Paolo Bonzini wrote:
> In some cases, for example involving hot-unplug of assigned
> devices, pi_post_block can forget to remove the vCPU from the
> blocked_vcpu_list. When this happens, the next call to
> pi_pre_block corrupts the list.
>
> Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block
> and WARN instead of adding the element twice in the list. Second,
> always do the list removal in pi_post_block if vcpu->pre_pcpu is
> set (not -1).
>
> The new code keeps interrupts disabled for the whole duration of
> pi_pre_block/pi_post_block. This is not strictly necessary, but
> easier to follow. For the same reason, PI.ON is checked only
> after the cmpxchg, and to handle it we just call the post-block
> code. This removes duplication of the list removal code.
>
> Cc: Longpeng (Mike) <[email protected]>
> Cc: Huangweidong <[email protected]>
> Cc: Gonglei <[email protected]>
> Cc: wangxin <[email protected]>
> Cc: Radim Krčmář <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/vmx.c | 62 ++++++++++++++++++++++--------------------------------
> 1 file changed, 25 insertions(+), 37 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 747d16525b45..0f4714fe4908 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -11236,10 +11236,11 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> struct pi_desc old, new;
> unsigned int dest;
> - unsigned long flags;
>
> do {
> old.control = new.control = pi_desc->control;
> + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
> + "Wakeup handler not enabled while the VCPU is blocked\n");
>
> dest = cpu_physical_id(vcpu->cpu);
>
> @@ -11256,14 +11257,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> } while (cmpxchg(&pi_desc->control, old.control,
> new.control) != old.control);
>
> - if(vcpu->pre_pcpu != -1) {
> - spin_lock_irqsave(
> - &per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
> + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> list_del(&vcpu->blocked_vcpu_list);
> - spin_unlock_irqrestore(
> - &per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> vcpu->pre_pcpu = -1;
> }
> }
> @@ -11283,7 +11280,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> */
> static int pi_pre_block(struct kvm_vcpu *vcpu)
> {
> - unsigned long flags;
> unsigned int dest;
> struct pi_desc old, new;
> struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> @@ -11293,34 +11289,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
> !kvm_vcpu_apicv_active(vcpu))
> return 0;
>
> - vcpu->pre_pcpu = vcpu->cpu;
> - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> - list_add_tail(&vcpu->blocked_vcpu_list,
> - &per_cpu(blocked_vcpu_on_cpu,
> - vcpu->pre_pcpu));
> - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> + WARN_ON(irqs_disabled());
> + local_irq_disable();
> + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
> + vcpu->pre_pcpu = vcpu->cpu;
> + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> + list_add_tail(&vcpu->blocked_vcpu_list,
> + &per_cpu(blocked_vcpu_on_cpu,
> + vcpu->pre_pcpu));
> + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> + }
>
> do {
> old.control = new.control = pi_desc->control;
>
> - /*
> - * We should not block the vCPU if
> - * an interrupt is posted for it.
> - */
> - if (pi_test_on(pi_desc) == 1) {
> - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> - list_del(&vcpu->blocked_vcpu_list);
> - spin_unlock_irqrestore(
> - &per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> - vcpu->pre_pcpu = -1;
> -
> - return 1;
[1]
> - }
> -
> WARN((pi_desc->sn == 1),
> "Warning: SN field of posted-interrupts "
> "is set before blocking\n");
> @@ -11345,7 +11327,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
> } while (cmpxchg(&pi_desc->control, old.control,
> new.control) != old.control);
>
> - return 0;
> + /* We should not block the vCPU if an interrupt is posted for it. */
> + if (pi_test_on(pi_desc) == 1)
> + __pi_post_block(vcpu);
A question on when pi_test_on() is set:
The old code will return 1 if detected (ses [1]), while the new code
does not. Would that matter? (IIUC that decides whether the vcpu will
continue to run?)
> +
> + local_irq_enable();
> + return (vcpu->pre_pcpu == -1);
Above we have:
if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
vcpu->pre_pcpu = vcpu->cpu;
...
}
Then can here vcpu->pre_pcpu really be -1?
> }
>
> static int vmx_pre_block(struct kvm_vcpu *vcpu)
> @@ -11361,12 +11348,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
>
> static void pi_post_block(struct kvm_vcpu *vcpu)
> {
> - if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
> - !irq_remapping_cap(IRQ_POSTING_CAP) ||
> - !kvm_vcpu_apicv_active(vcpu))
> + if (vcpu->pre_pcpu == -1)
> return;
>
> + WARN_ON(irqs_disabled());
> + local_irq_disable();
> __pi_post_block(vcpu);
> + local_irq_enable();
> }
>
> static void vmx_post_block(struct kvm_vcpu *vcpu)
> --
> 2.13.0
>
>
A general question to pre_block/post_block handling for PI:
I see that we are handling PI logic mostly in four places:
vmx_vcpu_pi_{load|put}
pi_{pre_post}_block
But do we really need the pre_block/post_block handling? Here's how I
understand when vcpu blocked:
- vcpu_block
- ->pre_block
- kvm_vcpu_block [1]
- schedule()
- kvm_sched_out
- vmx_vcpu_pi_put [3]
- (another process working) ...
- kvm_sched_in
- vmx_vcpu_pi_load [4]
- ->post_block [2]
If so, [1] & [2] will definitely be paired with [3] & [4], then why we
need [3] & [4] at all?
(Though [3] & [4] will also be used when preemption happens, so they
are required)
Please kindly figure out if I missed anything important...
Thanks,
--
Peter Xu
On Thu, Jun 08, 2017 at 02:50:57PM +0800, Peter Xu wrote:
> On Tue, Jun 06, 2017 at 12:57:05PM +0200, Paolo Bonzini wrote:
> > In some cases, for example involving hot-unplug of assigned
> > devices, pi_post_block can forget to remove the vCPU from the
> > blocked_vcpu_list. When this happens, the next call to
> > pi_pre_block corrupts the list.
> >
> > Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block
> > and WARN instead of adding the element twice in the list. Second,
> > always do the list removal in pi_post_block if vcpu->pre_pcpu is
> > set (not -1).
> >
> > The new code keeps interrupts disabled for the whole duration of
> > pi_pre_block/pi_post_block. This is not strictly necessary, but
> > easier to follow. For the same reason, PI.ON is checked only
> > after the cmpxchg, and to handle it we just call the post-block
> > code. This removes duplication of the list removal code.
> >
> > Cc: Longpeng (Mike) <[email protected]>
> > Cc: Huangweidong <[email protected]>
> > Cc: Gonglei <[email protected]>
> > Cc: wangxin <[email protected]>
> > Cc: Radim Krčmář <[email protected]>
> > Signed-off-by: Paolo Bonzini <[email protected]>
> > ---
> > arch/x86/kvm/vmx.c | 62 ++++++++++++++++++++++--------------------------------
> > 1 file changed, 25 insertions(+), 37 deletions(-)
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index 747d16525b45..0f4714fe4908 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -11236,10 +11236,11 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> > struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> > struct pi_desc old, new;
> > unsigned int dest;
> > - unsigned long flags;
> >
> > do {
> > old.control = new.control = pi_desc->control;
> > + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
> > + "Wakeup handler not enabled while the VCPU is blocked\n");
> >
> > dest = cpu_physical_id(vcpu->cpu);
> >
> > @@ -11256,14 +11257,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> > } while (cmpxchg(&pi_desc->control, old.control,
> > new.control) != old.control);
> >
> > - if(vcpu->pre_pcpu != -1) {
> > - spin_lock_irqsave(
> > - &per_cpu(blocked_vcpu_on_cpu_lock,
> > - vcpu->pre_pcpu), flags);
> > + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
> > + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> > list_del(&vcpu->blocked_vcpu_list);
> > - spin_unlock_irqrestore(
> > - &per_cpu(blocked_vcpu_on_cpu_lock,
> > - vcpu->pre_pcpu), flags);
> > + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> > vcpu->pre_pcpu = -1;
> > }
> > }
> > @@ -11283,7 +11280,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> > */
> > static int pi_pre_block(struct kvm_vcpu *vcpu)
> > {
> > - unsigned long flags;
> > unsigned int dest;
> > struct pi_desc old, new;
> > struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> > @@ -11293,34 +11289,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
> > !kvm_vcpu_apicv_active(vcpu))
> > return 0;
> >
> > - vcpu->pre_pcpu = vcpu->cpu;
> > - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> > - vcpu->pre_pcpu), flags);
> > - list_add_tail(&vcpu->blocked_vcpu_list,
> > - &per_cpu(blocked_vcpu_on_cpu,
> > - vcpu->pre_pcpu));
> > - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
> > - vcpu->pre_pcpu), flags);
> > + WARN_ON(irqs_disabled());
> > + local_irq_disable();
> > + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
> > + vcpu->pre_pcpu = vcpu->cpu;
> > + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> > + list_add_tail(&vcpu->blocked_vcpu_list,
> > + &per_cpu(blocked_vcpu_on_cpu,
> > + vcpu->pre_pcpu));
> > + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> > + }
> >
> > do {
> > old.control = new.control = pi_desc->control;
> >
> > - /*
> > - * We should not block the vCPU if
> > - * an interrupt is posted for it.
> > - */
> > - if (pi_test_on(pi_desc) == 1) {
> > - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> > - vcpu->pre_pcpu), flags);
> > - list_del(&vcpu->blocked_vcpu_list);
> > - spin_unlock_irqrestore(
> > - &per_cpu(blocked_vcpu_on_cpu_lock,
> > - vcpu->pre_pcpu), flags);
> > - vcpu->pre_pcpu = -1;
> > -
> > - return 1;
>
> [1]
>
> > - }
> > -
> > WARN((pi_desc->sn == 1),
> > "Warning: SN field of posted-interrupts "
> > "is set before blocking\n");
> > @@ -11345,7 +11327,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
> > } while (cmpxchg(&pi_desc->control, old.control,
> > new.control) != old.control);
> >
> > - return 0;
> > + /* We should not block the vCPU if an interrupt is posted for it. */
> > + if (pi_test_on(pi_desc) == 1)
> > + __pi_post_block(vcpu);
>
> A question on when pi_test_on() is set:
>
> The old code will return 1 if detected (ses [1]), while the new code
> does not. Would that matter? (IIUC that decides whether the vcpu will
> continue to run?)
>
> > +
> > + local_irq_enable();
> > + return (vcpu->pre_pcpu == -1);
>
> Above we have:
>
> if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
> vcpu->pre_pcpu = vcpu->cpu;
> ...
> }
>
> Then can here vcpu->pre_pcpu really be -1?
>
> > }
> >
> > static int vmx_pre_block(struct kvm_vcpu *vcpu)
> > @@ -11361,12 +11348,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
> >
> > static void pi_post_block(struct kvm_vcpu *vcpu)
> > {
> > - if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
> > - !irq_remapping_cap(IRQ_POSTING_CAP) ||
> > - !kvm_vcpu_apicv_active(vcpu))
> > + if (vcpu->pre_pcpu == -1)
> > return;
> >
> > + WARN_ON(irqs_disabled());
> > + local_irq_disable();
> > __pi_post_block(vcpu);
> > + local_irq_enable();
> > }
> >
> > static void vmx_post_block(struct kvm_vcpu *vcpu)
> > --
> > 2.13.0
> >
> >
>
> A general question to pre_block/post_block handling for PI:
>
> I see that we are handling PI logic mostly in four places:
>
> vmx_vcpu_pi_{load|put}
> pi_{pre_post}_block
>
> But do we really need the pre_block/post_block handling? Here's how I
> understand when vcpu blocked:
>
> - vcpu_block
> - ->pre_block
> - kvm_vcpu_block [1]
> - schedule()
> - kvm_sched_out
> - vmx_vcpu_pi_put [3]
> - (another process working) ...
> - kvm_sched_in
> - vmx_vcpu_pi_load [4]
> - ->post_block [2]
>
> If so, [1] & [2] will definitely be paired with [3] & [4], then why we
> need [3] & [4] at all?
^^^^^^^^^ Here I meant [1] & [2]. Sorry.
>
> (Though [3] & [4] will also be used when preemption happens, so they
> are required)
>
> Please kindly figure out if I missed anything important...
>
> Thanks,
>
> --
> Peter Xu
--
Peter Xu
----- Original Message -----
> From: "Peter Xu" <[email protected]>
> To: "Paolo Bonzini" <[email protected]>
> Cc: [email protected], [email protected], "Longpeng" <[email protected]>, "Huangweidong"
> <[email protected]>, "Gonglei" <[email protected]>, "wangxin" <[email protected]>, "Radim
> Krčmář" <[email protected]>
> Sent: Thursday, June 8, 2017 8:50:57 AM
> Subject: Re: [PATCH 2/4] KVM: VMX: avoid double list add with VT-d posted interrupts
>
> On Tue, Jun 06, 2017 at 12:57:05PM +0200, Paolo Bonzini wrote:
> > In some cases, for example involving hot-unplug of assigned
> > devices, pi_post_block can forget to remove the vCPU from the
> > blocked_vcpu_list. When this happens, the next call to
> > pi_pre_block corrupts the list.
> >
> > Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block
> > and WARN instead of adding the element twice in the list. Second,
> > always do the list removal in pi_post_block if vcpu->pre_pcpu is
> > set (not -1).
> >
> > The new code keeps interrupts disabled for the whole duration of
> > pi_pre_block/pi_post_block. This is not strictly necessary, but
> > easier to follow. For the same reason, PI.ON is checked only
> > after the cmpxchg, and to handle it we just call the post-block
> > code. This removes duplication of the list removal code.
> >
> > Cc: Longpeng (Mike) <[email protected]>
> > Cc: Huangweidong <[email protected]>
> > Cc: Gonglei <[email protected]>
> > Cc: wangxin <[email protected]>
> > Cc: Radim Krčmář <[email protected]>
> > Signed-off-by: Paolo Bonzini <[email protected]>
> > ---
> > arch/x86/kvm/vmx.c | 62
> > ++++++++++++++++++++++--------------------------------
> > 1 file changed, 25 insertions(+), 37 deletions(-)
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index 747d16525b45..0f4714fe4908 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -11236,10 +11236,11 @@ static void __pi_post_block(struct kvm_vcpu
> > *vcpu)
> > struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> > struct pi_desc old, new;
> > unsigned int dest;
> > - unsigned long flags;
> >
> > do {
> > old.control = new.control = pi_desc->control;
> > + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
> > + "Wakeup handler not enabled while the VCPU is blocked\n");
> >
> > dest = cpu_physical_id(vcpu->cpu);
> >
> > @@ -11256,14 +11257,10 @@ static void __pi_post_block(struct kvm_vcpu
> > *vcpu)
> > } while (cmpxchg(&pi_desc->control, old.control,
> > new.control) != old.control);
> >
> > - if(vcpu->pre_pcpu != -1) {
> > - spin_lock_irqsave(
> > - &per_cpu(blocked_vcpu_on_cpu_lock,
> > - vcpu->pre_pcpu), flags);
> > + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
> > + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> > list_del(&vcpu->blocked_vcpu_list);
> > - spin_unlock_irqrestore(
> > - &per_cpu(blocked_vcpu_on_cpu_lock,
> > - vcpu->pre_pcpu), flags);
> > + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> > vcpu->pre_pcpu = -1;
> > }
> > }
> > @@ -11283,7 +11280,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> > */
> > static int pi_pre_block(struct kvm_vcpu *vcpu)
> > {
> > - unsigned long flags;
> > unsigned int dest;
> > struct pi_desc old, new;
> > struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> > @@ -11293,34 +11289,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
> > !kvm_vcpu_apicv_active(vcpu))
> > return 0;
> >
> > - vcpu->pre_pcpu = vcpu->cpu;
> > - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> > - vcpu->pre_pcpu), flags);
> > - list_add_tail(&vcpu->blocked_vcpu_list,
> > - &per_cpu(blocked_vcpu_on_cpu,
> > - vcpu->pre_pcpu));
> > - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
> > - vcpu->pre_pcpu), flags);
> > + WARN_ON(irqs_disabled());
> > + local_irq_disable();
> > + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
> > + vcpu->pre_pcpu = vcpu->cpu;
> > + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> > + list_add_tail(&vcpu->blocked_vcpu_list,
> > + &per_cpu(blocked_vcpu_on_cpu,
> > + vcpu->pre_pcpu));
> > + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> > + }
> >
> > do {
> > old.control = new.control = pi_desc->control;
> >
> > - /*
> > - * We should not block the vCPU if
> > - * an interrupt is posted for it.
> > - */
> > - if (pi_test_on(pi_desc) == 1) {
> > - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> > - vcpu->pre_pcpu), flags);
> > - list_del(&vcpu->blocked_vcpu_list);
> > - spin_unlock_irqrestore(
> > - &per_cpu(blocked_vcpu_on_cpu_lock,
> > - vcpu->pre_pcpu), flags);
> > - vcpu->pre_pcpu = -1;
> > -
> > - return 1;
>
> [1]
>
> > - }
> > -
> > WARN((pi_desc->sn == 1),
> > "Warning: SN field of posted-interrupts "
> > "is set before blocking\n");
> > @@ -11345,7 +11327,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
> > } while (cmpxchg(&pi_desc->control, old.control,
> > new.control) != old.control);
> >
> > - return 0;
> > + /* We should not block the vCPU if an interrupt is posted for it. */
> > + if (pi_test_on(pi_desc) == 1)
> > + __pi_post_block(vcpu);
>
> A question on when pi_test_on() is set:
>
> The old code will return 1 if detected (ses [1]), while the new code
> does not. Would that matter? (IIUC that decides whether the vcpu will
> continue to run?)
The new code does, because __pi_post_block resets vcpu->pre_pcpu to -1.
> > +
> > + local_irq_enable();
> > + return (vcpu->pre_pcpu == -1);
>
> Above we have:
>
> if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
> vcpu->pre_pcpu = vcpu->cpu;
> ...
> }
>
> Then can here vcpu->pre_pcpu really be -1?
See above. :)
> > }
> >
> > static int vmx_pre_block(struct kvm_vcpu *vcpu)
> > @@ -11361,12 +11348,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
> >
> > static void pi_post_block(struct kvm_vcpu *vcpu)
> > {
> > - if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
> > - !irq_remapping_cap(IRQ_POSTING_CAP) ||
> > - !kvm_vcpu_apicv_active(vcpu))
> > + if (vcpu->pre_pcpu == -1)
> > return;
> >
> > + WARN_ON(irqs_disabled());
> > + local_irq_disable();
> > __pi_post_block(vcpu);
> > + local_irq_enable();
> > }
> >
> > static void vmx_post_block(struct kvm_vcpu *vcpu)
> > --
> > 2.13.0
> >
> >
>
> A general question to pre_block/post_block handling for PI:
>
> I see that we are handling PI logic mostly in four places:
>
> vmx_vcpu_pi_{load|put}
> pi_{pre_post}_block
>
> But do we really need the pre_block/post_block handling? Here's how I
> understand when vcpu blocked:
>
> - vcpu_block
> - ->pre_block
> - kvm_vcpu_block [1]
> - schedule()
> - kvm_sched_out
> - vmx_vcpu_pi_put [3]
> - (another process working) ...
> - kvm_sched_in
> - vmx_vcpu_pi_load [4]
> - ->post_block [2]
>
> If so, [1] & [2] will definitely be paired with [3] & [4], then why we
> need [3] & [4] at all?
>
> (Though [3] & [4] will also be used when preemption happens, so they
> are required)
>
> Please kindly figure out if I missed anything important...
Oh, I see what you mean: set up the wakeup handler in vmx_vcpu_pi_put
and rely on PI.ON to wake up the sleeping process immediately. That
should be feasible, but overall I like the current pre_block/post_block
structure, and I think it's simpler. The only thing to be careful about
is leaving the IRTE unmodified when scheduling out a blocked VCPU, which
is cleaned up and simplified in patch 3.
So I understand that the state may seem a bit too complicated as
of this patch, but hopefully the next two make it clearer.
Paolo
On Thu, Jun 08, 2017 at 03:00:39AM -0400, Paolo Bonzini wrote:
>
>
> ----- Original Message -----
> > From: "Peter Xu" <[email protected]>
> > To: "Paolo Bonzini" <[email protected]>
> > Cc: [email protected], [email protected], "Longpeng" <[email protected]>, "Huangweidong"
> > <[email protected]>, "Gonglei" <[email protected]>, "wangxin" <[email protected]>, "Radim
> > Krčmář" <[email protected]>
> > Sent: Thursday, June 8, 2017 8:50:57 AM
> > Subject: Re: [PATCH 2/4] KVM: VMX: avoid double list add with VT-d posted interrupts
> >
> > On Tue, Jun 06, 2017 at 12:57:05PM +0200, Paolo Bonzini wrote:
> > > In some cases, for example involving hot-unplug of assigned
> > > devices, pi_post_block can forget to remove the vCPU from the
> > > blocked_vcpu_list. When this happens, the next call to
> > > pi_pre_block corrupts the list.
> > >
> > > Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block
> > > and WARN instead of adding the element twice in the list. Second,
> > > always do the list removal in pi_post_block if vcpu->pre_pcpu is
> > > set (not -1).
> > >
> > > The new code keeps interrupts disabled for the whole duration of
> > > pi_pre_block/pi_post_block. This is not strictly necessary, but
> > > easier to follow. For the same reason, PI.ON is checked only
> > > after the cmpxchg, and to handle it we just call the post-block
> > > code. This removes duplication of the list removal code.
> > >
> > > Cc: Longpeng (Mike) <[email protected]>
> > > Cc: Huangweidong <[email protected]>
> > > Cc: Gonglei <[email protected]>
> > > Cc: wangxin <[email protected]>
> > > Cc: Radim Krčmář <[email protected]>
> > > Signed-off-by: Paolo Bonzini <[email protected]>
> > > ---
> > > arch/x86/kvm/vmx.c | 62
> > > ++++++++++++++++++++++--------------------------------
> > > 1 file changed, 25 insertions(+), 37 deletions(-)
> > >
> > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > > index 747d16525b45..0f4714fe4908 100644
> > > --- a/arch/x86/kvm/vmx.c
> > > +++ b/arch/x86/kvm/vmx.c
> > > @@ -11236,10 +11236,11 @@ static void __pi_post_block(struct kvm_vcpu
> > > *vcpu)
> > > struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> > > struct pi_desc old, new;
> > > unsigned int dest;
> > > - unsigned long flags;
> > >
> > > do {
> > > old.control = new.control = pi_desc->control;
> > > + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
> > > + "Wakeup handler not enabled while the VCPU is blocked\n");
> > >
> > > dest = cpu_physical_id(vcpu->cpu);
> > >
> > > @@ -11256,14 +11257,10 @@ static void __pi_post_block(struct kvm_vcpu
> > > *vcpu)
> > > } while (cmpxchg(&pi_desc->control, old.control,
> > > new.control) != old.control);
> > >
> > > - if(vcpu->pre_pcpu != -1) {
> > > - spin_lock_irqsave(
> > > - &per_cpu(blocked_vcpu_on_cpu_lock,
> > > - vcpu->pre_pcpu), flags);
> > > + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
> > > + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> > > list_del(&vcpu->blocked_vcpu_list);
> > > - spin_unlock_irqrestore(
> > > - &per_cpu(blocked_vcpu_on_cpu_lock,
> > > - vcpu->pre_pcpu), flags);
> > > + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> > > vcpu->pre_pcpu = -1;
> > > }
> > > }
> > > @@ -11283,7 +11280,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> > > */
> > > static int pi_pre_block(struct kvm_vcpu *vcpu)
> > > {
> > > - unsigned long flags;
> > > unsigned int dest;
> > > struct pi_desc old, new;
> > > struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> > > @@ -11293,34 +11289,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
> > > !kvm_vcpu_apicv_active(vcpu))
> > > return 0;
> > >
> > > - vcpu->pre_pcpu = vcpu->cpu;
> > > - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> > > - vcpu->pre_pcpu), flags);
> > > - list_add_tail(&vcpu->blocked_vcpu_list,
> > > - &per_cpu(blocked_vcpu_on_cpu,
> > > - vcpu->pre_pcpu));
> > > - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
> > > - vcpu->pre_pcpu), flags);
> > > + WARN_ON(irqs_disabled());
> > > + local_irq_disable();
> > > + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
> > > + vcpu->pre_pcpu = vcpu->cpu;
> > > + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> > > + list_add_tail(&vcpu->blocked_vcpu_list,
> > > + &per_cpu(blocked_vcpu_on_cpu,
> > > + vcpu->pre_pcpu));
> > > + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> > > + }
> > >
> > > do {
> > > old.control = new.control = pi_desc->control;
> > >
> > > - /*
> > > - * We should not block the vCPU if
> > > - * an interrupt is posted for it.
> > > - */
> > > - if (pi_test_on(pi_desc) == 1) {
> > > - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> > > - vcpu->pre_pcpu), flags);
> > > - list_del(&vcpu->blocked_vcpu_list);
> > > - spin_unlock_irqrestore(
> > > - &per_cpu(blocked_vcpu_on_cpu_lock,
> > > - vcpu->pre_pcpu), flags);
> > > - vcpu->pre_pcpu = -1;
> > > -
> > > - return 1;
> >
> > [1]
> >
> > > - }
> > > -
> > > WARN((pi_desc->sn == 1),
> > > "Warning: SN field of posted-interrupts "
> > > "is set before blocking\n");
> > > @@ -11345,7 +11327,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
> > > } while (cmpxchg(&pi_desc->control, old.control,
> > > new.control) != old.control);
> > >
> > > - return 0;
> > > + /* We should not block the vCPU if an interrupt is posted for it. */
> > > + if (pi_test_on(pi_desc) == 1)
> > > + __pi_post_block(vcpu);
> >
> > A question on when pi_test_on() is set:
> >
> > The old code will return 1 if detected (ses [1]), while the new code
> > does not. Would that matter? (IIUC that decides whether the vcpu will
> > continue to run?)
>
> The new code does, because __pi_post_block resets vcpu->pre_pcpu to -1.
Sorry I overlook that. :-)
>
> > > +
> > > + local_irq_enable();
> > > + return (vcpu->pre_pcpu == -1);
> >
> > Above we have:
> >
> > if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
> > vcpu->pre_pcpu = vcpu->cpu;
> > ...
> > }
> >
> > Then can here vcpu->pre_pcpu really be -1?
>
> See above. :)
Yes. Then there's no problem.
>
> > > }
> > >
> > > static int vmx_pre_block(struct kvm_vcpu *vcpu)
> > > @@ -11361,12 +11348,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
> > >
> > > static void pi_post_block(struct kvm_vcpu *vcpu)
> > > {
> > > - if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
> > > - !irq_remapping_cap(IRQ_POSTING_CAP) ||
> > > - !kvm_vcpu_apicv_active(vcpu))
> > > + if (vcpu->pre_pcpu == -1)
> > > return;
> > >
> > > + WARN_ON(irqs_disabled());
> > > + local_irq_disable();
> > > __pi_post_block(vcpu);
> > > + local_irq_enable();
> > > }
> > >
> > > static void vmx_post_block(struct kvm_vcpu *vcpu)
> > > --
> > > 2.13.0
> > >
> > >
> >
> > A general question to pre_block/post_block handling for PI:
> >
> > I see that we are handling PI logic mostly in four places:
> >
> > vmx_vcpu_pi_{load|put}
> > pi_{pre_post}_block
> >
> > But do we really need the pre_block/post_block handling? Here's how I
> > understand when vcpu blocked:
> >
> > - vcpu_block
> > - ->pre_block
> > - kvm_vcpu_block [1]
> > - schedule()
> > - kvm_sched_out
> > - vmx_vcpu_pi_put [3]
> > - (another process working) ...
> > - kvm_sched_in
> > - vmx_vcpu_pi_load [4]
> > - ->post_block [2]
> >
> > If so, [1] & [2] will definitely be paired with [3] & [4], then why we
> > need [3] & [4] at all?
> >
> > (Though [3] & [4] will also be used when preemption happens, so they
> > are required)
> >
> > Please kindly figure out if I missed anything important...
>
> Oh, I see what you mean: set up the wakeup handler in vmx_vcpu_pi_put
> and rely on PI.ON to wake up the sleeping process immediately. That
> should be feasible, but overall I like the current pre_block/post_block
> structure, and I think it's simpler. The only thing to be careful about
> is leaving the IRTE unmodified when scheduling out a blocked VCPU, which
> is cleaned up and simplified in patch 3.
>
> So I understand that the state may seem a bit too complicated as
> of this patch, but hopefully the next two make it clearer.
After re-read the codes and patches I got the point. Indeed current
way should be clearer since pre/post_block are mostly handling NV/DST
while pi_load/put are for SN bit. Thanks!
--
Peter Xu
On 08/06/2017 11:16, Peter Xu wrote:
>> Oh, I see what you mean: set up the wakeup handler in vmx_vcpu_pi_put
>> and rely on PI.ON to wake up the sleeping process immediately. That
>> should be feasible, but overall I like the current pre_block/post_block
>> structure, and I think it's simpler. The only thing to be careful about
>> is leaving the IRTE unmodified when scheduling out a blocked VCPU, which
>> is cleaned up and simplified in patch 3.
>>
>> So I understand that the state may seem a bit too complicated as
>> of this patch, but hopefully the next two make it clearer.
> After re-read the codes and patches I got the point. Indeed current
> way should be clearer since pre/post_block are mostly handling NV/DST
> while pi_load/put are for SN bit. Thanks!
Almost: pi_load handles NDST too. However, I think with patch 3 it's
clearer how pi_load handles the nesting inside pre_block...post_block.
Thanks,
Paolo
On Thu, Jun 08, 2017 at 01:24:44PM +0200, Paolo Bonzini wrote:
>
>
> On 08/06/2017 11:16, Peter Xu wrote:
> >> Oh, I see what you mean: set up the wakeup handler in vmx_vcpu_pi_put
> >> and rely on PI.ON to wake up the sleeping process immediately. That
> >> should be feasible, but overall I like the current pre_block/post_block
> >> structure, and I think it's simpler. The only thing to be careful about
> >> is leaving the IRTE unmodified when scheduling out a blocked VCPU, which
> >> is cleaned up and simplified in patch 3.
> >>
> >> So I understand that the state may seem a bit too complicated as
> >> of this patch, but hopefully the next two make it clearer.
> > After re-read the codes and patches I got the point. Indeed current
> > way should be clearer since pre/post_block are mostly handling NV/DST
> > while pi_load/put are for SN bit. Thanks!
>
> Almost: pi_load handles NDST too. However, I think with patch 3 it's
> clearer how pi_load handles the nesting inside pre_block...post_block.
Yes. The old codes & comments for vmx_vcpu_pi_load() are not very easy
to understand for me.
For patch 3, not sure whether moving clear_sn() upper would be
clearer:
pi_load()
{
if (!pi_test_bit() && vcpu->cpu == cpu)
return;
/* Unconditionally clear SN */
pi_clear_sn();
/*
* If cpu not changed, no need to switch PDST; if WAKEUP, post_block
* will do it for us
*/
if (vcpu->cpu == cpu || nv == WAKEUP)
return;
/*
* Update PDST. Possibly the vcpu thread switched from one cpu to
* another.
*/
do {
...
} while (...)
}
Even, I'm thinking whether we can unconditionally setup PDST only in
pi_load(), then post_block() only needs to handle the NV bit.
(PS. since I'm at here... could I ask why in pi_pre_block we need to
udpate PDST as well? I guess that decides who will run the
wakeup_handler code to kick the vcpu thread, but would that really
matter?)
Thanks,
--
Peter Xu
On 09/06/2017 04:50, Peter Xu wrote:
> Even, I'm thinking whether we can unconditionally setup PDST only in
> pi_load(), then post_block() only needs to handle the NV bit.
No, you can't do that without fiddling with the blocked_vcpu lists in
pi_load.
> (PS. since I'm at here... could I ask why in pi_pre_block we need to
> udpate PDST as well? I guess that decides who will run the
> wakeup_handler code to kick the vcpu thread, but would that really
> matter?)
For this one it's a yes. :) I think it's not needed anymore indeed
after these patches; see this comment:
/*
* The wakeup_handler expects the VCPU to be on the
* blocked_vcpu_list that matches ndst. Interrupts
* are disabled so no preemption should happen, but
* err on the side of safety.
*/
So we could add a WARN.
Paolo
On Fri, Jun 09, 2017 at 09:29:45AM +0200, Paolo Bonzini wrote:
>
>
> On 09/06/2017 04:50, Peter Xu wrote:
> > Even, I'm thinking whether we can unconditionally setup PDST only in
> > pi_load(), then post_block() only needs to handle the NV bit.
>
> No, you can't do that without fiddling with the blocked_vcpu lists in
> pi_load.
Then how about we keep the blocked_vcpu list maintainance in
post_block(), but only let pi_load() handle the PDST?
(I really feel like they are two things - the blocked_vcpu list helps
for the kick when wakeup happens; while PDST makes sure the PI is
always pointing to the correct cpu)
>
> > (PS. since I'm at here... could I ask why in pi_pre_block we need to
> > udpate PDST as well? I guess that decides who will run the
> > wakeup_handler code to kick the vcpu thread, but would that really
> > matter?)
>
> For this one it's a yes. :) I think it's not needed anymore indeed
> after these patches; see this comment:
>
> /*
> * The wakeup_handler expects the VCPU to be on the
> * blocked_vcpu_list that matches ndst.
Actually I was always unclear on what this sentense means: iiuc
blocked_vcpu_list is only a list of vcpus that may need a kick, so why
it has anything to do with PDST after all?
(or say, no matter what PDST is, we just kick the vcpu thread without
doing anything else, do we?)
> Interrupts
> * are disabled so no preemption should happen, but
> * err on the side of safety.
> */
>
> So we could add a WARN.
Thanks,
--
Peter Xu
On 07/06/2017 11:33, Gonglei (Arei) wrote:
> We are testing your patch, but maybe need some time to report
> the results because it's not an inevitable problem.
>
> Meanwhile we also try to find a possible scenario of non-hotplugging to
> explain the double-add warnings.
Hi Lei,
do you have any updates? I would like to get at least the first three
patches in 4.13.
Thanks,
Paolo
> -----Original Message-----
> From: [email protected] [mailto:[email protected]] On
> Behalf Of Paolo Bonzini
> Sent: Tuesday, July 11, 2017 4:56 PM
> To: Gonglei (Arei)
> Cc: [email protected]; [email protected]; longpeng;
> Huangweidong (C); wangxin (U); Radim Kr?m??
> Subject: Re: [PATCH CFT 0/4] VT-d PI fixes
>
> On 07/06/2017 11:33, Gonglei (Arei) wrote:
> > We are testing your patch, but maybe need some time to report
> > the results because it's not an inevitable problem.
> >
> > Meanwhile we also try to find a possible scenario of non-hotplugging to
> > explain the double-add warnings.
>
> Hi Lei,
>
> do you have any updates?
Dear Paolo,
Thanks for kicking me :)
TBH, thinking about the reliability of productive project (we use kvm-4.4),
we applied the patch you used in fedora pastebin, and
the bug seems gone after one month's testing.
diff --git a/source/x86/vmx.c b/source/x86/vmx.c
index 79012cf..efc611f 100644
--- a/source/x86/vmx.c
+++ b/source/x86/vmx.c
@@ -11036,8 +11036,9 @@ static void pi_post_block(struct kvm_vcpu *vcpu)
unsigned int dest;
unsigned long flags;
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP))
+ if ((vcpu->pre_pcpu == -1) &&
+ (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP)))
return;
> I would like to get at least the first three
> patches in 4.13.
>
I think they are okay to me for upstream.
Thanks,
-Gonglei
Hi Paolo,
On 2017/6/6 18:57, Paolo Bonzini wrote:
> In some cases, for example involving hot-unplug of assigned
> devices, pi_post_block can forget to remove the vCPU from the
> blocked_vcpu_list. When this happens, the next call to
> pi_pre_block corrupts the list.
>
> Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block
> and WARN instead of adding the element twice in the list. Second,
> always do the list removal in pi_post_block if vcpu->pre_pcpu is
> set (not -1).
>
> The new code keeps interrupts disabled for the whole duration of
> pi_pre_block/pi_post_block. This is not strictly necessary, but
> easier to follow. For the same reason, PI.ON is checked only
> after the cmpxchg, and to handle it we just call the post-block
> code. This removes duplication of the list removal code.
>
> Cc: Longpeng (Mike) <[email protected]>
> Cc: Huangweidong <[email protected]>
> Cc: Gonglei <[email protected]>
> Cc: wangxin <[email protected]>
> Cc: Radim Krčmář <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/vmx.c | 62 ++++++++++++++++++++++--------------------------------
> 1 file changed, 25 insertions(+), 37 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 747d16525b45..0f4714fe4908 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -11236,10 +11236,11 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> struct pi_desc old, new;
> unsigned int dest;
> - unsigned long flags;
>
> do {
> old.control = new.control = pi_desc->control;
> + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
> + "Wakeup handler not enabled while the VCPU is blocked\n");
>
> dest = cpu_physical_id(vcpu->cpu);
>
> @@ -11256,14 +11257,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> } while (cmpxchg(&pi_desc->control, old.control,
> new.control) != old.control);
>
> - if(vcpu->pre_pcpu != -1) {
> - spin_lock_irqsave(
> - &per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
__pi_post_block is only called by pi_post_block/pi_pre_block now, it seems that
both of them would make sure "vcpu->pre_pcpu != -1" before __pi_post_block is
called, so maybe the above check is useless, right?
> + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> list_del(&vcpu->blocked_vcpu_list);
> - spin_unlock_irqrestore(
> - &per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> vcpu->pre_pcpu = -1;
> }
> }
> @@ -11283,7 +11280,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> */
> static int pi_pre_block(struct kvm_vcpu *vcpu)
> {
> - unsigned long flags;
> unsigned int dest;
> struct pi_desc old, new;
> struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> @@ -11293,34 +11289,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
> !kvm_vcpu_apicv_active(vcpu))
> return 0;
>
> - vcpu->pre_pcpu = vcpu->cpu;
> - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> - list_add_tail(&vcpu->blocked_vcpu_list,
> - &per_cpu(blocked_vcpu_on_cpu,
> - vcpu->pre_pcpu));
> - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> + WARN_ON(irqs_disabled());
> + local_irq_disable();
> + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
> + vcpu->pre_pcpu = vcpu->cpu;
> + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> + list_add_tail(&vcpu->blocked_vcpu_list,
> + &per_cpu(blocked_vcpu_on_cpu,
> + vcpu->pre_pcpu));
> + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
> + }
>
> do {
> old.control = new.control = pi_desc->control;
>
> - /*
> - * We should not block the vCPU if
> - * an interrupt is posted for it.
> - */
> - if (pi_test_on(pi_desc) == 1) {
> - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> - list_del(&vcpu->blocked_vcpu_list);
> - spin_unlock_irqrestore(
> - &per_cpu(blocked_vcpu_on_cpu_lock,
> - vcpu->pre_pcpu), flags);
> - vcpu->pre_pcpu = -1;
> -
> - return 1;
> - }
> -
> WARN((pi_desc->sn == 1),
> "Warning: SN field of posted-interrupts "
> "is set before blocking\n");
> @@ -11345,7 +11327,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
> } while (cmpxchg(&pi_desc->control, old.control,
> new.control) != old.control);
>
> - return 0;
> + /* We should not block the vCPU if an interrupt is posted for it. */
> + if (pi_test_on(pi_desc) == 1)
> + __pi_post_block(vcpu);
> +
> + local_irq_enable();
> + return (vcpu->pre_pcpu == -1);
> }
>
> static int vmx_pre_block(struct kvm_vcpu *vcpu)
> @@ -11361,12 +11348,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
>
> static void pi_post_block(struct kvm_vcpu *vcpu)
> {
> - if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
> - !irq_remapping_cap(IRQ_POSTING_CAP) ||
> - !kvm_vcpu_apicv_active(vcpu))
> + if (vcpu->pre_pcpu == -1)
> return;
>
> + WARN_ON(irqs_disabled());
> + local_irq_disable();
> __pi_post_block(vcpu);
> + local_irq_enable();
> }
>
> static void vmx_post_block(struct kvm_vcpu *vcpu)
--
Regards,
Longpeng(Mike)
On 2017/6/6 18:57, Paolo Bonzini wrote:
> The simplify part: do not touch pi_desc.nv, we can set it when the
> VCPU is first created. Likewise, pi_desc.sn is only handled by
> vmx_vcpu_pi_load, do not touch it in __pi_post_block.
>
> The fix part: do not check kvm_arch_has_assigned_device, instead
> check the SN bit to figure out whether vmx_vcpu_pi_put ran before.
> This matches what the previous patch did in pi_post_block.
>
> Cc: Longpeng (Mike) <[email protected]>
> Cc: Huangweidong <[email protected]>
> Cc: Gonglei <[email protected]>
> Cc: wangxin <[email protected]>
> Cc: Radim Krčmář <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/vmx.c | 68 ++++++++++++++++++++++++++++--------------------------
> 1 file changed, 35 insertions(+), 33 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 0f4714fe4908..81047f373747 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2184,43 +2184,41 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
> struct pi_desc old, new;
> unsigned int dest;
>
> - if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
> - !irq_remapping_cap(IRQ_POSTING_CAP) ||
> - !kvm_vcpu_apicv_active(vcpu))
> + /*
> + * In case of hot-plug or hot-unplug, we may have to undo
> + * vmx_vcpu_pi_put even if there is no assigned device. And we
> + * always keep PI.NDST up to date for simplicity: it makes the
> + * code easier, and CPU migration is not a fast path.
> + */
> + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
> + return;
Hi Paolo,
I'm confused with the following scenario:
(suppose the VM has a assigned devices)
step 1. the running vcpu is be preempted
--> vmx_vcpu_pi_put [ SET pi.sn ]
step 2. hot-unplug the assigned devices
step 3. the vcpu is scheduled in
--> vmx_vcpu_pi_load [ CLEAR pi.sn ]
step 4. the running vcpu is be preempted again
--> vmx_vcpu_pi_put [ direct return ]
step 5. the vcpu is migrated to another pcpu
step 6. the vcpu is scheduled in
--> vmx_vcpu_pi_load [ above check fails and
continue to execute the follow parts ]
I think vmx_vcpu_pi_load should return direct in step6, because
vmx_vcpu_pi_put in step4 did nothing.
So maybe the above check has a potential problem.
Please kindly figure out if I misunderstand anything important :)
--
Regards,
Longpeng(Mike)
> +
> + /*
> + * First handle the simple case where no cmpxchg is necessary; just
> + * allow posting non-urgent interrupts.
> + *
> + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
> + * PI.NDST: pi_post_block will do it for us and the wakeup_handler
> + * expects the VCPU to be on the blocked_vcpu_list that matches
> + * PI.NDST.
> + */
> + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
> + vcpu->cpu == cpu) {
> + pi_clear_sn(pi_desc);
> return;
> + }
>
> + /* The full case. */
> do {
> old.control = new.control = pi_desc->control;
>
> - /*
> - * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
> - * are two possible cases:
> - * 1. After running 'pre_block', context switch
> - * happened. For this case, 'sn' was set in
> - * vmx_vcpu_put(), so we need to clear it here.
> - * 2. After running 'pre_block', we were blocked,
> - * and woken up by some other guy. For this case,
> - * we don't need to do anything, 'pi_post_block'
> - * will do everything for us. However, we cannot
> - * check whether it is case #1 or case #2 here
> - * (maybe, not needed), so we also clear sn here,
> - * I think it is not a big deal.
> - */
> - if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
> - if (vcpu->cpu != cpu) {
> - dest = cpu_physical_id(cpu);
> -
> - if (x2apic_enabled())
> - new.ndst = dest;
> - else
> - new.ndst = (dest << 8) & 0xFF00;
> - }
> + dest = cpu_physical_id(cpu);
>
> - /* set 'NV' to 'notification vector' */
> - new.nv = POSTED_INTR_VECTOR;
> - }
> + if (x2apic_enabled())
> + new.ndst = dest;
> + else
> + new.ndst = (dest << 8) & 0xFF00;
>
> - /* Allow posting non-urgent interrupts */
> new.sn = 0;
> } while (cmpxchg(&pi_desc->control, old.control,
> new.control) != old.control);
> @@ -9259,6 +9257,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
>
> vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
>
> + /*
> + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
> + * or POSTED_INTR_WAKEUP_VECTOR.
> + */
> + vmx->pi_desc.nv = POSTED_INTR_VECTOR;
> + vmx->pi_desc.sn = 1;
> +
> return &vmx->vcpu;
>
> free_vmcs:
> @@ -11249,9 +11254,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
> else
> new.ndst = (dest << 8) & 0xFF00;
>
> - /* Allow posting non-urgent interrupts */
> - new.sn = 0;
> -
> /* set 'NV' to 'notification vector' */
> new.nv = POSTED_INTR_VECTOR;
> } while (cmpxchg(&pi_desc->control, old.control,
--
Regards,
Longpeng(Mike)
On 2017/7/28 12:22, Longpeng (Mike) wrote:
>
>
> On 2017/6/6 18:57, Paolo Bonzini wrote:
>
>> The simplify part: do not touch pi_desc.nv, we can set it when the
>> VCPU is first created. Likewise, pi_desc.sn is only handled by
>> vmx_vcpu_pi_load, do not touch it in __pi_post_block.
>>
>> The fix part: do not check kvm_arch_has_assigned_device, instead
>> check the SN bit to figure out whether vmx_vcpu_pi_put ran before.
>> This matches what the previous patch did in pi_post_block.
>>
>> Cc: Longpeng (Mike) <[email protected]>
>> Cc: Huangweidong <[email protected]>
>> Cc: Gonglei <[email protected]>
>> Cc: wangxin <[email protected]>
>> Cc: Radim Krčmář <[email protected]>
>> Signed-off-by: Paolo Bonzini <[email protected]>
>> ---
>> arch/x86/kvm/vmx.c | 68 ++++++++++++++++++++++++++++--------------------------
>> 1 file changed, 35 insertions(+), 33 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 0f4714fe4908..81047f373747 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -2184,43 +2184,41 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
>> struct pi_desc old, new;
>> unsigned int dest;
>>
>> - if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
>> - !irq_remapping_cap(IRQ_POSTING_CAP) ||
>> - !kvm_vcpu_apicv_active(vcpu))
>> + /*
>> + * In case of hot-plug or hot-unplug, we may have to undo
>> + * vmx_vcpu_pi_put even if there is no assigned device. And we
>> + * always keep PI.NDST up to date for simplicity: it makes the
>> + * code easier, and CPU migration is not a fast path.
>> + */
>> + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
>> + return;
>
>
> Hi Paolo,
>
> I'm confused with the following scenario:
>
> (suppose the VM has a assigned devices)
> step 1. the running vcpu is be preempted
> --> vmx_vcpu_pi_put [ SET pi.sn ]
> step 2. hot-unplug the assigned devices
> step 3. the vcpu is scheduled in
> --> vmx_vcpu_pi_load [ CLEAR pi.sn ]
> step 4. the running vcpu is be preempted again
> --> vmx_vcpu_pi_put [ direct return ]
> step 5. the vcpu is migrated to another pcpu
> step 6. the vcpu is scheduled in
> --> vmx_vcpu_pi_load [ above check fails and
> continue to execute the follow parts ]
>
> I think vmx_vcpu_pi_load should return direct in step6, because
> vmx_vcpu_pi_put in step4 did nothing.
> So maybe the above check has a potential problem.
Oh! Sorry, please just ignore the above. I made a mistaken.
>
> Please kindly figure out if I misunderstand anything important :)
>
> --
> Regards,
> Longpeng(Mike)
>
>> +
>> + /*
>> + * First handle the simple case where no cmpxchg is necessary; just
>> + * allow posting non-urgent interrupts.
>> + *
>> + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
>> + * PI.NDST: pi_post_block will do it for us and the wakeup_handler
>> + * expects the VCPU to be on the blocked_vcpu_list that matches
>> + * PI.NDST.
>> + */
>> + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
>> + vcpu->cpu == cpu) {
>> + pi_clear_sn(pi_desc);
>> return;
>> + }
>>
>> + /* The full case. */
>> do {
>> old.control = new.control = pi_desc->control;
>>
>> - /*
>> - * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
>> - * are two possible cases:
>> - * 1. After running 'pre_block', context switch
>> - * happened. For this case, 'sn' was set in
>> - * vmx_vcpu_put(), so we need to clear it here.
>> - * 2. After running 'pre_block', we were blocked,
>> - * and woken up by some other guy. For this case,
>> - * we don't need to do anything, 'pi_post_block'
>> - * will do everything for us. However, we cannot
>> - * check whether it is case #1 or case #2 here
>> - * (maybe, not needed), so we also clear sn here,
>> - * I think it is not a big deal.
>> - */
>> - if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
>> - if (vcpu->cpu != cpu) {
>> - dest = cpu_physical_id(cpu);
>> -
>> - if (x2apic_enabled())
>> - new.ndst = dest;
>> - else
>> - new.ndst = (dest << 8) & 0xFF00;
>> - }
>> + dest = cpu_physical_id(cpu);
>>
>> - /* set 'NV' to 'notification vector' */
>> - new.nv = POSTED_INTR_VECTOR;
>> - }
>> + if (x2apic_enabled())
>> + new.ndst = dest;
>> + else
>> + new.ndst = (dest << 8) & 0xFF00;
>>
>> - /* Allow posting non-urgent interrupts */
>> new.sn = 0;
>> } while (cmpxchg(&pi_desc->control, old.control,
>> new.control) != old.control);
>> @@ -9259,6 +9257,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
>>
>> vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
>>
>> + /*
>> + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
>> + * or POSTED_INTR_WAKEUP_VECTOR.
>> + */
>> + vmx->pi_desc.nv = POSTED_INTR_VECTOR;
>> + vmx->pi_desc.sn = 1;
>> +
>> return &vmx->vcpu;
>>
>> free_vmcs:
>> @@ -11249,9 +11254,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
>> else
>> new.ndst = (dest << 8) & 0xFF00;
>>
>> - /* Allow posting non-urgent interrupts */
>> - new.sn = 0;
>> -
>> /* set 'NV' to 'notification vector' */
>> new.nv = POSTED_INTR_VECTOR;
>> } while (cmpxchg(&pi_desc->control, old.control,
>
>
--
Regards,
Longpeng(Mike)
On 28/07/2017 04:31, Longpeng (Mike) wrote:
> Hi Paolo,
>
> On 2017/6/6 18:57, Paolo Bonzini wrote:
>
>> In some cases, for example involving hot-unplug of assigned
>> devices, pi_post_block can forget to remove the vCPU from the
>> blocked_vcpu_list. When this happens, the next call to
>> pi_pre_block corrupts the list.
>>
>> Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block
>> and WARN instead of adding the element twice in the list. Second,
>> always do the list removal in pi_post_block if vcpu->pre_pcpu is
>> set (not -1).
>>
>> The new code keeps interrupts disabled for the whole duration of
>> pi_pre_block/pi_post_block. This is not strictly necessary, but
>> easier to follow. For the same reason, PI.ON is checked only
>> after the cmpxchg, and to handle it we just call the post-block
>> code. This removes duplication of the list removal code.
>>
>> Cc: Longpeng (Mike) <[email protected]>
>> Cc: Huangweidong <[email protected]>
>> Cc: Gonglei <[email protected]>
>> Cc: wangxin <[email protected]>
>> Cc: Radim Krčmář <[email protected]>
>> Signed-off-by: Paolo Bonzini <[email protected]>
>> ---
>> arch/x86/kvm/vmx.c | 62 ++++++++++++++++++++++--------------------------------
>> 1 file changed, 25 insertions(+), 37 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 747d16525b45..0f4714fe4908 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -11236,10 +11236,11 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
>> struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
>> struct pi_desc old, new;
>> unsigned int dest;
>> - unsigned long flags;
>>
>> do {
>> old.control = new.control = pi_desc->control;
>> + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
>> + "Wakeup handler not enabled while the VCPU is blocked\n");
>>
>> dest = cpu_physical_id(vcpu->cpu);
>>
>> @@ -11256,14 +11257,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
>> } while (cmpxchg(&pi_desc->control, old.control,
>> new.control) != old.control);
>>
>> - if(vcpu->pre_pcpu != -1) {
>> - spin_lock_irqsave(
>> - &per_cpu(blocked_vcpu_on_cpu_lock,
>> - vcpu->pre_pcpu), flags);
>> + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
>
>
> __pi_post_block is only called by pi_post_block/pi_pre_block now, it seems that
> both of them would make sure "vcpu->pre_pcpu != -1" before __pi_post_block is
> called, so maybe the above check is useless, right?
It's because a WARN is better than a double-add. And even if the caller
broke the invariant you'd have to do the cmpxchg loop above to make
things not break too much.
Paolo
Hi Paolo,
We have backported the first three patches and have tested for about 20 days, it
works fine.
So could you consider to merge this series ?
--
Regards,
Longpeng(Mike)
On 2017/7/11 17:16, Gonglei (Arei) wrote:
>
>
>> -----Original Message-----
>> From: [email protected] [mailto:[email protected]] On
>> Behalf Of Paolo Bonzini
>> Sent: Tuesday, July 11, 2017 4:56 PM
>> To: Gonglei (Arei)
>> Cc: [email protected]; [email protected]; longpeng;
>> Huangweidong (C); wangxin (U); Radim Kr?m??
>> Subject: Re: [PATCH CFT 0/4] VT-d PI fixes
>>
>> On 07/06/2017 11:33, Gonglei (Arei) wrote:
>>> We are testing your patch, but maybe need some time to report
>>> the results because it's not an inevitable problem.
>>>
>>> Meanwhile we also try to find a possible scenario of non-hotplugging to
>>> explain the double-add warnings.
>>
>> Hi Lei,
>>
>> do you have any updates?
>
> Dear Paolo,
>
> Thanks for kicking me :)
>
> TBH, thinking about the reliability of productive project (we use kvm-4.4),
> we applied the patch you used in fedora pastebin, and
> the bug seems gone after one month's testing.
>
> diff --git a/source/x86/vmx.c b/source/x86/vmx.c
> index 79012cf..efc611f 100644
> --- a/source/x86/vmx.c
> +++ b/source/x86/vmx.c
> @@ -11036,8 +11036,9 @@ static void pi_post_block(struct kvm_vcpu *vcpu)
> unsigned int dest;
> unsigned long flags;
>
> - if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
> - !irq_remapping_cap(IRQ_POSTING_CAP))
> + if ((vcpu->pre_pcpu == -1) &&
> + (!kvm_arch_has_assigned_device(vcpu->kvm) ||
> + !irq_remapping_cap(IRQ_POSTING_CAP)))
> return;
>
>> I would like to get at least the first three
>> patches in 4.13.
>>
> I think they are okay to me for upstream.
>
> Thanks,
> -Gonglei
>
> .
>
--
Regards,
Longpeng(Mike)
On 21/09/2017 10:23, Longpeng (Mike) wrote:
> Hi Paolo,
>
> We have backported the first three patches and have tested for about 20 days, it
> works fine.
>
> So could you consider to merge this series ?
Yes, thanks very much!!
Paolo