Fix the nested posted interrupts bug Jim reported a while back[*], where
KVM fails to detect that a pending virtual interrupt for a halted L2 is a
valid wake event. My original analysis and the basic gits of my hack-a-
patch was correct, I just botched a few mundane details (I kept forgetting
the PIR is physically contiguous, while the ISR and IRR are not, *sigh*).
[*] https://lore.kernel.org/all/[email protected]
Sean Christopherson (6):
KVM: nVMX: Add a helper to get highest pending from Posted Interrupt
vector
KVM: nVMX: Request immediate exit iff pending nested event needs
injection
KVM: VMX: Split out the non-virtualization part of
vmx_interrupt_blocked()
KVM: nVMX: Check for pending posted interrupts when looking for nested
events
KVM: nVMX: Fold requested virtual interrupt check into
has_nested_events()
KVM: x86: WARN if a vCPU gets a valid wakeup that KVM can't yet inject
arch/x86/include/asm/kvm-x86-ops.h | 1 -
arch/x86/include/asm/kvm_host.h | 3 +-
arch/x86/kvm/vmx/main.c | 1 -
arch/x86/kvm/vmx/nested.c | 47 ++++++++++++++++++++++++++----
arch/x86/kvm/vmx/posted_intr.h | 10 +++++++
arch/x86/kvm/vmx/vmx.c | 33 ++++++---------------
arch/x86/kvm/vmx/vmx.h | 1 +
arch/x86/kvm/vmx/x86_ops.h | 1 -
arch/x86/kvm/x86.c | 19 +++++-------
9 files changed, 70 insertions(+), 46 deletions(-)
base-commit: af0903ab52ee6d6f0f63af67fa73d5eb00f79b9a
--
2.45.2.505.gda0bf45e8d-goog
Add a helper to retrieve the highest pending vector given a Posted
Interrupt descriptor. While the actual operation is straightforward, it's
surprisingly easy to mess up, e.g. if one tries to reuse lapic.c's
find_highest_vector(), which doesn't work with PID.PIR due to the APIC's
IRR and ISR component registers being physically discontiguous (they're
4-byte registers aligned at 16-byte intervals).
To make PIR handling more consistent with respect to IRR and ISR handling,
return -1 to indicate "no interrupt pending".
Cc: [email protected]
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/vmx/nested.c | 5 +++--
arch/x86/kvm/vmx/posted_intr.h | 10 ++++++++++
2 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 75b4f41d9926..0710486d42cc 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -12,6 +12,7 @@
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
+#include "posted_intr.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
@@ -3899,8 +3900,8 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
return 0;
- max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
- if (max_irr != 256) {
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0) {
vapic_page = vmx->nested.virtual_apic_map.hva;
if (!vapic_page)
goto mmio_needed;
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 6b2a0226257e..1715d2ab07be 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -1,6 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_VMX_POSTED_INTR_H
#define __KVM_X86_VMX_POSTED_INTR_H
+
+#include <linux/find.h>
#include <asm/posted_intr.h>
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
@@ -12,4 +14,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void vmx_pi_start_assignment(struct kvm *kvm);
+static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
+{
+ int vec;
+
+ vec = find_last_bit((unsigned long *)pi_desc->pir, 256);
+ return vec < 256 ? vec : -1;
+}
+
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
--
2.45.2.505.gda0bf45e8d-goog
When requesting an immediate exit from L2 in order to inject a pending
event, do so only if the pending event actually requires manual injection,
i.e. if and only if KVM actually needs to regain control in order to
deliver the event.
Avoiding the "immediate exit" isn't simply an optimization, it's necessary
to make forward progress, as the "already expired" VMX preemption timer
trick that KVM uses to force a VM-Exit has higher priority than events
that aren't directly injected.
At present time, this is a glorified nop as all events processed by
vmx_has_nested_events() require injection, but that will not hold true in
the future, e.g. if there's a pending virtual interrupt in vmcs02.RVI.
I.e. if KVM is trying to deliver a virtual interrupt to L2, the expired
VMX preemption timer will trigger VM-Exit before the virtual interrupt is
delivered, and KVM will effectively hang the vCPU in an endless loop of
forced immediate VM-Exits (because the pending virtual interrupt never
goes away).
Cc: [email protected]
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/vmx/nested.c | 2 +-
arch/x86/kvm/x86.c | 4 ++--
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5c0415899a07..473f7e1d245c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1836,7 +1836,7 @@ struct kvm_x86_nested_ops {
bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector,
u32 error_code);
int (*check_events)(struct kvm_vcpu *vcpu);
- bool (*has_events)(struct kvm_vcpu *vcpu);
+ bool (*has_events)(struct kvm_vcpu *vcpu, bool for_injection);
void (*triple_fault)(struct kvm_vcpu *vcpu);
int (*get_state)(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0710486d42cc..9099c1d0c7cb 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4032,7 +4032,7 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
to_vmx(vcpu)->nested.preemption_timer_expired;
}
-static bool vmx_has_nested_events(struct kvm_vcpu *vcpu)
+static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
{
return nested_vmx_preemption_timer_pending(vcpu) ||
to_vmx(vcpu)->nested.mtf_pending;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4157602c964e..5ec24d9cb231 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10534,7 +10534,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
if (is_guest_mode(vcpu) &&
kvm_x86_ops.nested_ops->has_events &&
- kvm_x86_ops.nested_ops->has_events(vcpu))
+ kvm_x86_ops.nested_ops->has_events(vcpu, true))
*req_immediate_exit = true;
/*
@@ -13182,7 +13182,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu) &&
kvm_x86_ops.nested_ops->has_events &&
- kvm_x86_ops.nested_ops->has_events(vcpu))
+ kvm_x86_ops.nested_ops->has_events(vcpu, false))
return true;
if (kvm_xen_has_pending_events(vcpu))
--
2.45.2.505.gda0bf45e8d-goog
Move the non-VMX chunk of the "interrupt blocked" checks to a separate
helper so that KVM can reuse the code to detect if interrupts are blocked
for L2, e.g. to determine if a virtual interrupt _for L2_ is a valid wake
event. If L1 disables HLT-exiting for L2, nested APICv is enabled, and L2
HLTs, then L2 virtual interrupts are valid wake events, but if and only if
interrupts are unblocked for L2.
Cc: [email protected]
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/vmx/vmx.c | 13 +++++++++----
arch/x86/kvm/vmx/vmx.h | 1 +
2 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0e3aaf520db2..d8d9e1f6c340 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5050,16 +5050,21 @@ int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !vmx_nmi_blocked(vcpu);
}
-bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
- return false;
-
return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}
+bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return false;
+
+ return __vmx_interrupt_blocked(vcpu);
+}
+
int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 08d7d67fe760..42498fa63abb 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -406,6 +406,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
--
2.45.2.505.gda0bf45e8d-goog
Check for pending (and notified!) posted interrupts when checking if L2
has a pending wake event, as fully posted/notified virtual interrupt is a
valid wake event for HLT.
Note that KVM must check vmx->nested.pi_pending to avoid prematurely
waking L2, e.g. even if KVM sees a non-zero PID.PIR and PID.0N=1, the
virtual interrupt won't actually be recognized until a notification IRQ is
received by the vCPU or the vCPU does (nested) VM-Enter.
Fixes: 26844fee6ade ("KVM: x86: never write to memory from kvm_vcpu_check_block()")
Cc: [email protected]
Cc: Maxim Levitsky <[email protected]>
Reported-by: Jim Mattson <[email protected]>
Closes: https://lore.kernel.org/all/[email protected]
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/vmx/nested.c | 36 ++++++++++++++++++++++++++++++++++--
1 file changed, 34 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 9099c1d0c7cb..3bac65591f20 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4034,8 +4034,40 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
{
- return nested_vmx_preemption_timer_pending(vcpu) ||
- to_vmx(vcpu)->nested.mtf_pending;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ void *vapic = vmx->nested.virtual_apic_map.hva;
+ int max_irr, vppr;
+
+ if (nested_vmx_preemption_timer_pending(vcpu) ||
+ vmx->nested.mtf_pending)
+ return true;
+
+ /*
+ * Virtual Interrupt Delivery doesn't require manual injection. Either
+ * the interrupt is already in GUEST_RVI and will be recognized by CPU
+ * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move
+ * the interrupt from the PIR to RVI prior to entering the guest.
+ */
+ if (for_injection)
+ return false;
+
+ if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+ __vmx_interrupt_blocked(vcpu))
+ return false;
+
+ if (!vapic)
+ return false;
+
+ vppr = *((u32 *)(vapic + APIC_PROCPRI));
+
+ if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
+ pi_test_on(vmx->nested.pi_desc)) {
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+ }
+
+ return false;
}
/*
--
2.45.2.505.gda0bf45e8d-goog
Check for a Requested Virtual Interrupt, i.e. a virtual interrupt that is
pending delivery, in vmx_has_nested_events() and drop the one-off
kvm_x86_ops.guest_apic_has_interrupt() hook.
In addition to dropping a superfluous hook, this fixes a bug where KVM
would incorrectly treat virtual interrupts _for L2_ as always enabled due
to kvm_arch_interrupt_allowed(), by way of vmx_interrupt_blocked(),
treating IRQs as enabled if L2 is active and vmcs12 is configured to exit
on IRQs, i.e. KVM would treat a virtual interrupt for L2 as a valid wake
event based on L1's IRQ blocking status.
Cc: [email protected]
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/include/asm/kvm-x86-ops.h | 1 -
arch/x86/include/asm/kvm_host.h | 1 -
arch/x86/kvm/vmx/main.c | 1 -
arch/x86/kvm/vmx/nested.c | 4 ++++
arch/x86/kvm/vmx/vmx.c | 20 --------------------
arch/x86/kvm/vmx/x86_ops.h | 1 -
arch/x86/kvm/x86.c | 10 +---------
7 files changed, 5 insertions(+), 33 deletions(-)
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 566d19b02483..f91d413d7de1 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -85,7 +85,6 @@ KVM_X86_OP_OPTIONAL(update_cr8_intercept)
KVM_X86_OP(refresh_apicv_exec_ctrl)
KVM_X86_OP_OPTIONAL(hwapic_irr_update)
KVM_X86_OP_OPTIONAL(hwapic_isr_update)
-KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt)
KVM_X86_OP_OPTIONAL(load_eoi_exitmap)
KVM_X86_OP_OPTIONAL(set_virtual_apic_mode)
KVM_X86_OP_OPTIONAL(set_apic_access_page_addr)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 473f7e1d245c..f2336c646088 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1728,7 +1728,6 @@ struct kvm_x86_ops {
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(int isr);
- bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index d0e1a5b5c915..7e846a842443 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -97,7 +97,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
.deliver_interrupt = vmx_deliver_interrupt,
.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3bac65591f20..2392a7ef254d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4060,6 +4060,10 @@ static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
vppr = *((u32 *)(vapic + APIC_PROCPRI));
+ max_irr = vmx_get_rvi();
+ if ((max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+
if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
pi_test_on(vmx->nested.pi_desc)) {
max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d8d9e1f6c340..c7558bcb0241 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4106,26 +4106,6 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
}
}
-bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- void *vapic_page;
- u32 vppr;
- int rvi;
-
- if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
- !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
- WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
- return false;
-
- rvi = vmx_get_rvi();
-
- vapic_page = vmx->nested.virtual_apic_map.hva;
- vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
-
- return ((rvi & 0xf0) > (vppr & 0xf0));
-}
-
void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 502704596c83..d404227c164d 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -49,7 +49,6 @@ void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
void vmx_hwapic_isr_update(int max_isr);
-bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu);
int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu);
void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
int trig_mode, int vector);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5ec24d9cb231..82442960b499 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13133,12 +13133,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_arch_free_memslot(kvm, old);
}
-static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
- return (is_guest_mode(vcpu) &&
- static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
-}
-
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
{
if (!list_empty_careful(&vcpu->async_pf.done))
@@ -13172,9 +13166,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
return true;
- if (kvm_arch_interrupt_allowed(vcpu) &&
- (kvm_cpu_has_interrupt(vcpu) ||
- kvm_guest_apic_has_interrupt(vcpu)))
+ if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
return true;
if (kvm_hv_has_stimer_pending(vcpu))
--
2.45.2.505.gda0bf45e8d-goog
WARN if a blocking vCPU is awaked by a valid wake event that KVM can't
inject, e.g. because KVM needs to completed a nested VM-enter, or needs to
re-inject an exception. For the nested VM-Enter case, KVM is supposed to
clear "nested_run_pending" if L1 puts L2 into HLT, i.e. entering HLT
"completes" the nested VM-Enter. And for already-injected exceptions, it
should be impossible for the vCPU to be in a blocking state if a VM-Exit
occurred while an exception was being vectored.
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/x86.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 82442960b499..f6ace2bd7124 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11233,7 +11233,10 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
* causes a spurious wakeup from HLT).
*/
if (is_guest_mode(vcpu)) {
- if (kvm_check_nested_events(vcpu) < 0)
+ int r = kvm_check_nested_events(vcpu);
+
+ WARN_ON_ONCE(r == -EBUSY);
+ if (r < 0)
return 0;
}
--
2.45.2.505.gda0bf45e8d-goog