This series is one of the prerequisites for supporting AMD AVIC with
in-kernel irqchip (kernel_irqchip=on).
AVIC does not support ExtINT interrupts, which are required during
the boot phase of Windows and FreeBSD VMs (e.g. PIT -> PIC -> ExtINT).
As a result, such VMs hang in the boot loader in kernel_irqchip=on mode.
This series provides a workaround by temporarily deactivating AVIC and
falling back to legacy interrupt injection (w/ vINTR and interrupt window),
then re-activating AVIC once the interrupts are handled.
Thanks,
Suravee
Suravee Suthikulpanit (6):
KVM: x86: Add callback functions for handling APIC ID, DFR and LDR
update
svm: Add AMD AVIC handlers for APIC ID, DFR and LDR update
svm: Add support for APIC_ACCESS_PAGE_PRIVATE_MEMSLOT setup/destroy
kvm: lapic: Add apicv activate/deactivate helper function
KVM: x86: Add interface for run-time activate/de-activate APIC
virtualization
svm: Temporary deactivate AVIC during ExtINT handling
arch/x86/include/asm/kvm_host.h | 11 ++++
arch/x86/kvm/lapic.c | 29 +++++++--
arch/x86/kvm/lapic.h | 1 +
arch/x86/kvm/svm.c | 106 ++++++++++++++++++++++++++++++--
arch/x86/kvm/x86.c | 48 +++++++++++++++
5 files changed, 185 insertions(+), 10 deletions(-)
--
2.17.1
Activating/deactivating AVIC requires setting/unsetting the memory region used
for APIC_ACCESS_PAGE_PRIVATE_MEMSLOT. So, refactor avic_init_access_page()
into avic_setup_access_page() and add srcu_read_lock/unlock, which are needed
to allow this function to be called at run-time.
Also, introduce avic_destroy_access_page() to unset the page when
deactivating AVIC.
Signed-off-by: Suravee Suthikulpanit <[email protected]>
---
arch/x86/kvm/svm.c | 28 ++++++++++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4cf93a729ad8..f41f34f70dde 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1666,7 +1666,7 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
* field of the VMCB. Therefore, we set up the
* APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
*/
-static int avic_init_access_page(struct kvm_vcpu *vcpu)
+static int avic_setup_access_page(struct kvm_vcpu *vcpu, bool init)
{
struct kvm *kvm = vcpu->kvm;
int ret = 0;
@@ -1675,10 +1675,14 @@ static int avic_init_access_page(struct kvm_vcpu *vcpu)
if (kvm->arch.apic_access_page_done)
goto out;
+ if (!init)
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
ret = __x86_set_memory_region(kvm,
APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
APIC_DEFAULT_PHYS_BASE,
PAGE_SIZE);
+ if (!init)
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (ret)
goto out;
@@ -1688,6 +1692,26 @@ static int avic_init_access_page(struct kvm_vcpu *vcpu)
return ret;
}
+static void avic_destroy_access_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ mutex_lock(&kvm->slots_lock);
+
+ if (!kvm->arch.apic_access_page_done)
+ goto out;
+
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ __x86_set_memory_region(kvm,
+ APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE,
+ 0);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ kvm->arch.apic_access_page_done = false;
+out:
+ mutex_unlock(&kvm->slots_lock);
+}
+
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
int ret;
@@ -1695,7 +1719,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
int id = vcpu->vcpu_id;
struct vcpu_svm *svm = to_svm(vcpu);
- ret = avic_init_access_page(vcpu);
+ ret = avic_setup_access_page(vcpu, true);
if (ret)
return ret;
--
2.17.1
Add hooks for handling the case where the guest VM updates the APIC ID, DFR
and LDR. This is needed while AMD AVIC is temporarily deactivated.
Signed-off-by: Suravee Suthikulpanit <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 3 +++
arch/x86/kvm/lapic.c | 6 ++++++
2 files changed, 9 insertions(+)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a5db4475e72d..1906e205e6a3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1077,6 +1077,9 @@ struct kvm_x86_ops {
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
+ void (*hwapic_apic_id_update)(struct kvm_vcpu *vcpu);
+ void (*hwapic_dfr_update)(struct kvm_vcpu *vcpu);
+ void (*hwapic_ldr_update)(struct kvm_vcpu *vcpu);
bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 991fdf7fc17f..95295cf81283 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -262,12 +262,16 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
{
kvm_lapic_set_reg(apic, APIC_ID, id << 24);
+ if (kvm_x86_ops->hwapic_apic_id_update)
+ kvm_x86_ops->hwapic_apic_id_update(apic->vcpu);
recalculate_apic_map(apic->vcpu->kvm);
}
static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
{
kvm_lapic_set_reg(apic, APIC_LDR, id);
+ if (kvm_x86_ops->hwapic_ldr_update)
+ kvm_x86_ops->hwapic_ldr_update(apic->vcpu);
recalculate_apic_map(apic->vcpu->kvm);
}
@@ -1836,6 +1840,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_DFR:
if (!apic_x2apic_mode(apic)) {
kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+ if (kvm_x86_ops->hwapic_dfr_update)
+ kvm_x86_ops->hwapic_dfr_update(apic->vcpu);
recalculate_apic_map(apic->vcpu->kvm);
} else
ret = 1;
--
2.17.1
When activating/deactivating AVIC at runtime, all vCPUs have to be
operating in the same mode. So, introduce a new interface to request
all vCPUs to activate/deactivate APICV.
Signed-off-by: Suravee Suthikulpanit <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 8 ++++++
arch/x86/kvm/x86.c | 48 +++++++++++++++++++++++++++++++++
2 files changed, 56 insertions(+)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1906e205e6a3..31dee26a37f2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,6 +79,10 @@
#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
#define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23)
#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24)
+#define KVM_REQ_APICV_ACTIVATE \
+ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_APICV_DEACTIVATE \
+ KVM_ARCH_REQ_FLAGS(26, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -1537,6 +1541,10 @@ bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
void kvm_make_mclock_inprogress_request(struct kvm *kvm);
void kvm_make_scan_ioapic_request(struct kvm *kvm);
+void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
+void kvm_vcpu_activate_apicv(struct kvm_vcpu *vcpu);
+void kvm_make_apicv_activate_request(struct kvm *kvm);
+void kvm_make_apicv_deactivate_request(struct kvm *kvm);
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 65e4559eef2f..1cd49c394680 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -29,6 +29,7 @@
#include "cpuid.h"
#include "pmu.h"
#include "hyperv.h"
+#include "lapic.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -7054,6 +7055,22 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
}
+void kvm_vcpu_activate_apicv(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu)) {
+ WARN_ON_ONCE(!vcpu->arch.apicv_active);
+ return;
+ }
+ if (vcpu->arch.apicv_active)
+ return;
+
+ vcpu->arch.apicv_active = true;
+ kvm_apic_update_apicv(vcpu);
+
+ kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_activate_apicv);
+
void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
{
if (!lapic_in_kernel(vcpu)) {
@@ -7064,8 +7081,11 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
return;
vcpu->arch.apicv_active = false;
+ kvm_apic_update_apicv(vcpu);
+
kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_deactivate_apicv);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
@@ -7557,6 +7577,30 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}
+void kvm_make_apicv_activate_request(struct kvm *kvm)
+{
+ int i;
+ struct kvm_vcpu *v;
+
+ kvm_for_each_vcpu(i, v, kvm)
+ kvm_clear_request(KVM_REQ_APICV_DEACTIVATE, v);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_ACTIVATE);
+}
+EXPORT_SYMBOL_GPL(kvm_make_apicv_activate_request);
+
+void kvm_make_apicv_deactivate_request(struct kvm *kvm)
+{
+ int i;
+ struct kvm_vcpu *v;
+
+ kvm_for_each_vcpu(i, v, kvm)
+ kvm_clear_request(KVM_REQ_APICV_ACTIVATE, v);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_DEACTIVATE);
+}
+EXPORT_SYMBOL_GPL(kvm_make_apicv_deactivate_request);
+
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
if (!kvm_apic_present(vcpu))
@@ -7743,6 +7787,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
kvm_hv_process_stimers(vcpu);
+ if (kvm_check_request(KVM_REQ_APICV_ACTIVATE, vcpu))
+ kvm_vcpu_activate_apicv(vcpu);
+ if (kvm_check_request(KVM_REQ_APICV_DEACTIVATE, vcpu))
+ kvm_vcpu_deactivate_apicv(vcpu);
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
--
2.17.1
AMD AVIC does not support ExtINT. Therefore, AVIC must be temporarily
deactivated, falling back to legacy interrupt injection via
vINTR and interrupt window.
Introduce svm_request_activate/deactivate_avic() helper functions,
which handle steps required to activate/deactivate AVIC.
Signed-off-by: Suravee Suthikulpanit <[email protected]>
---
arch/x86/kvm/svm.c | 57 +++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 54 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f41f34f70dde..84116e689d5f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -391,6 +391,8 @@ static u8 rsm_ins_bytes[] = "\x0f\xaa";
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
static void svm_complete_interrupts(struct vcpu_svm *svm);
+static void svm_request_activate_avic(struct kvm_vcpu *vcpu);
+static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu);
static int nested_svm_exit_handled(struct vcpu_svm *svm);
static int nested_svm_intercept(struct vcpu_svm *svm);
@@ -2109,6 +2111,9 @@ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
svm->avic_is_running = is_run;
if (is_run)
avic_vcpu_load(vcpu, vcpu->cpu);
@@ -2356,6 +2361,10 @@ static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
+ if (kvm_check_request(KVM_REQ_APICV_ACTIVATE, vcpu))
+ kvm_vcpu_activate_apicv(vcpu);
+ if (kvm_check_request(KVM_REQ_APICV_DEACTIVATE, vcpu))
+ kvm_vcpu_deactivate_apicv(vcpu);
avic_set_running(vcpu, true);
}
@@ -4505,6 +4514,15 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
{
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
svm_clear_vintr(svm);
+
+ /*
+ * For AVIC, the only reason to end up here is ExtINTs.
+ * In this case AVIC was temporarily disabled for
+ * requesting the IRQ window and we have to re-enable it.
+ */
+ if (svm_get_enable_apicv(&svm->vcpu))
+ svm_request_activate_avic(&svm->vcpu);
+
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
mark_dirty(svm->vmcb, VMCB_INTR);
++svm->vcpu.stat.irq_window_exits;
@@ -5206,6 +5224,34 @@ static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
}
+static bool is_avic_active(struct vcpu_svm *svm)
+{
+ return (svm_get_enable_apicv(&svm->vcpu) &&
+ svm->vmcb->control.int_ctl & AVIC_ENABLE_MASK);
+}
+
+static void svm_request_activate_avic(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!lapic_in_kernel(vcpu) || is_avic_active(svm))
+ return;
+
+ avic_setup_access_page(vcpu, false);
+ kvm_make_apicv_activate_request(vcpu->kvm);
+}
+
+static void svm_request_deactivate_avic(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!lapic_in_kernel(vcpu) || !is_avic_active(svm))
+ return;
+
+ avic_destroy_access_page(vcpu);
+ kvm_make_apicv_deactivate_request(vcpu->kvm);
+}
+
/* Note: Currently only used by Hyper-V. */
static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
@@ -5493,9 +5539,6 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (kvm_vcpu_apicv_active(vcpu))
- return;
-
/*
* In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
* 1, because that's a separate STGI/VMRUN intercept. The next time we
@@ -5505,6 +5548,14 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
* window under the assumption that the hardware will set the GIF.
*/
if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
+ /*
+ * IRQ window is not needed when AVIC is enabled,
+ * unless we have pending ExtINT since it cannot be injected
+ * via AVIC. In such case, we need to temporarily disable AVIC,
+ * and fallback to injecting IRQ via V_IRQ.
+ */
+ if (kvm_vcpu_apicv_active(vcpu))
+ svm_request_deactivate_avic(&svm->vcpu);
svm_set_vintr(svm);
svm_inject_irq(svm, 0x0);
}
--
2.17.1
While AVIC is temporarily deactivated, the guest could update the APIC ID,
DFR and LDR registers, which would not be trapped by
avic_unaccelerated_access_interception(). In this case, we need
to update the AVIC logical APIC ID table accordingly.
Signed-off-by: Suravee Suthikulpanit <[email protected]>
---
arch/x86/kvm/svm.c | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f4fb766e474c..4cf93a729ad8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4705,6 +4705,24 @@ static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
svm->dfr_reg = dfr;
}
+static void svm_hwapic_ldr_update(struct kvm_vcpu *vcpu)
+{
+ if (svm_get_enable_apicv(vcpu) && !kvm_vcpu_apicv_active(vcpu))
+ avic_handle_ldr_update(vcpu);
+}
+
+static void svm_hwapic_apic_id_update(struct kvm_vcpu *vcpu)
+{
+ if (svm_get_enable_apicv(vcpu) && !kvm_vcpu_apicv_active(vcpu))
+ avic_handle_apic_id_update(vcpu);
+}
+
+static void svm_hwapic_dfr_update(struct kvm_vcpu *vcpu)
+{
+ if (svm_get_enable_apicv(vcpu) && !kvm_vcpu_apicv_active(vcpu))
+ avic_handle_dfr_update(vcpu);
+}
+
static int avic_unaccel_trap_write(struct vcpu_svm *svm)
{
struct kvm_lapic *apic = svm->vcpu.arch.apic;
@@ -7222,6 +7240,9 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
+ .hwapic_apic_id_update = svm_hwapic_apic_id_update,
+ .hwapic_dfr_update = svm_hwapic_dfr_update,
+ .hwapic_ldr_update = svm_hwapic_ldr_update,
.sync_pir_to_irr = kvm_lapic_find_highest_irr,
.apicv_post_state_restore = avic_post_state_restore,
--
2.17.1
Introduce a helper function for setting lapic parameters when
activating/deactivating apicv.
Signed-off-by: Suravee Suthikulpanit <[email protected]>
---
arch/x86/kvm/lapic.c | 23 ++++++++++++++++++-----
arch/x86/kvm/lapic.h | 1 +
2 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 95295cf81283..9c5cd1a98928 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2126,6 +2126,22 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
}
+void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ bool active = vcpu->arch.apicv_active;
+
+ if (active) {
+ /* irr_pending is always true when apicv is activated. */
+ apic->irr_pending = true;
+ apic->isr_count = 1;
+ } else {
+ apic->irr_pending = !!count_vectors(apic->regs + APIC_IRR);
+ apic->isr_count = count_vectors(apic->regs + APIC_ISR);
+ }
+}
+EXPORT_SYMBOL(kvm_apic_update_apicv);
+
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -2170,8 +2186,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
- apic->irr_pending = vcpu->arch.apicv_active;
- apic->isr_count = vcpu->arch.apicv_active ? 1 : 0;
+ kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
@@ -2433,9 +2448,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);
start_apic_timer(apic);
- apic->irr_pending = true;
- apic->isr_count = vcpu->arch.apicv_active ?
- 1 : count_vectors(apic->regs + APIC_ISR);
+ kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
kvm_x86_ops->apicv_post_state_restore(vcpu);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ff6ef9c3d760..b657605cfec0 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -88,6 +88,7 @@ void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
+void kvm_apic_update_apicv(struct kvm_vcpu *vcpu);
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
--
2.17.1
2019-03-22 11:57+0000, Suthikulpanit, Suravee:
> This series is one of the prerequisites for supporting AMD AVIC with
> in-kernel irqchip (kernel_irqchip=on).
>
> Since AVIC does not support ExtINT interrupt, which is required during
> the booting phase of Windows and FreeBSD VMs (e.g. PIT -> PIC -> ExtInt).
> This results in VM hang in the boot loader with kernel_irqchip=on mode.
>
> This series provides workaround by temporary deactivate AVIC and fallback
> to use legacy interrupt injection (w/ vINTR and interrupt window).
> Then re-activate AVIC once the intrrupts are handled.
The solution looks reasonable (although it seems dangerous to me) as we
only enable the workaround if the interrupt cannot be immediately
injected.
The interesting part is that split irqchip works, yet it cannot inject
ExtInt correctly either -- does the guest OS behave differently, or does
split irqchip just ignore the ExtInt?
Thanks.
2019-03-22 11:57+0000, Suthikulpanit, Suravee:
> This series is one of the prerequisites for supporting AMD AVIC with
> in-kernel irqchip (kernel_irqchip=on).
>
> Since AVIC does not support ExtINT interrupt, which is required during
> the booting phase of Windows and FreeBSD VMs (e.g. PIT -> PIC -> ExtInt).
> This results in VM hang in the boot loader with kernel_irqchip=on mode.
>
> This series provides workaround by temporary deactivate AVIC and fallback
> to use legacy interrupt injection (w/ vINTR and interrupt window).
> Then re-activate AVIC once the intrrupts are handled.
Hm, another idea. It is possible to inject the ExtInt in APICv, but if
interrupt injection is currently disabled, we need to wait until the
interrupt window opens, which can't be done with int_* controls.
Wouldn't intercepting IRET/STGI/IF writes be enough to eventually reach
the point where we can do event_inj?
Thanks.
Hi Suravee.
I wonder, how this interacts with Hyper-V SynIC; see comments below.
On 22/03/2019 12.57, Suthikulpanit, Suravee wrote:
> AMD AVIC does not support ExtINT. Therefore, AVIC must be temporary
> deactivated and fall back to using legacy interrupt injection via
> vINTR and interrupt window.
>
> Introduce svm_request_activate/deactivate_avic() helper functions,
> which handle steps required to activate/deactivate AVIC.
>
> Signed-off-by: Suravee Suthikulpanit <[email protected]>
> ---
> arch/x86/kvm/svm.c | 57 +++++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 54 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index f41f34f70dde..84116e689d5f 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -391,6 +391,8 @@ static u8 rsm_ins_bytes[] = "\x0f\xaa";
> static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
> static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
> static void svm_complete_interrupts(struct vcpu_svm *svm);
> +static void svm_request_activate_avic(struct kvm_vcpu *vcpu);
> +static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu);
>
> static int nested_svm_exit_handled(struct vcpu_svm *svm);
> static int nested_svm_intercept(struct vcpu_svm *svm);
> @@ -2109,6 +2111,9 @@ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
>
> + if (!kvm_vcpu_apicv_active(vcpu))
> + return;
> +
> svm->avic_is_running = is_run;
> if (is_run)
> avic_vcpu_load(vcpu, vcpu->cpu);
> @@ -2356,6 +2361,10 @@ static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
>
> static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
> {
> + if (kvm_check_request(KVM_REQ_APICV_ACTIVATE, vcpu))
> + kvm_vcpu_activate_apicv(vcpu);
> + if (kvm_check_request(KVM_REQ_APICV_DEACTIVATE, vcpu))
> + kvm_vcpu_deactivate_apicv(vcpu);
> avic_set_running(vcpu, true);
> }
>
> @@ -4505,6 +4514,15 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
> {
> kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
> svm_clear_vintr(svm);
> +
> + /*
> + * For AVIC, the only reason to end up here is ExtINTs.
> + * In this case AVIC was temporarily disabled for
> + * requesting the IRQ window and we have to re-enable it.
> + */
> + if (svm_get_enable_apicv(&svm->vcpu))
> + svm_request_activate_avic(&svm->vcpu);
> +
Are we sure, we're not accidentally re-enabling AVIC, if it was disabled via
kvm_hv_activate_synic()?
> svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
> mark_dirty(svm->vmcb, VMCB_INTR);
> ++svm->vcpu.stat.irq_window_exits;
> @@ -5206,6 +5224,34 @@ static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
> {
> }
>
> +static bool is_avic_active(struct vcpu_svm *svm)
> +{
> + return (svm_get_enable_apicv(&svm->vcpu) &&
> + svm->vmcb->control.int_ctl & AVIC_ENABLE_MASK);
> +}
> +
> +static void svm_request_activate_avic(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + if (!lapic_in_kernel(vcpu) || is_avic_active(svm))
> + return;
> +
> + avic_setup_access_page(vcpu, false);
> + kvm_make_apicv_activate_request(vcpu->kvm);
> +}
> +
> +static void svm_request_deactivate_avic(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + if (!lapic_in_kernel(vcpu) || !is_avic_active(svm))
> + return;
> +
> + avic_destroy_access_page(vcpu);
Something like avic_destroy_access_page() is not called, when AVIC is
disabled via kvm_hv_activate_synic().
Is that an oversight in the other code path, is it not needed here,
or am I missing something?
> + kvm_make_apicv_deactivate_request(vcpu->kvm);
> +}
> +
> /* Note: Currently only used by Hyper-V. */
nit: This comment should probably go away, now.
Regards
Jan
> static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
> {
> @@ -5493,9 +5539,6 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
>
> - if (kvm_vcpu_apicv_active(vcpu))
> - return;
> -
> /*
> * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
> * 1, because that's a separate STGI/VMRUN intercept. The next time we
> @@ -5505,6 +5548,14 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
> * window under the assumption that the hardware will set the GIF.
> */
> if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
> + /*
> + * IRQ window is not needed when AVIC is enabled,
> + * unless we have pending ExtINT since it cannot be injected
> + * via AVIC. In such case, we need to temporarily disable AVIC,
> + * and fallback to injecting IRQ via V_IRQ.
> + */
> + if (kvm_vcpu_apicv_active(vcpu))
> + svm_request_deactivate_avic(&svm->vcpu);
> svm_set_vintr(svm);
> svm_inject_irq(svm, 0x0);
> }
>
On 22/03/2019 12.57, Suthikulpanit, Suravee wrote:
> Activate/deactivate AVIC requires setting/unsetting the memory region used
> for APIC_ACCESS_PAGE_PRIVATE_MEMSLOT. So, re-factor avic_init_access_page()
> to avic_setup_access_page() and add srcu_read_lock/unlock, which are needed
> to allow this function to be called during run-time.
>
> Also, introduce avic_destroy_access_page() to unset the page when
> deactivate AVIC.
>
> Signed-off-by: Suravee Suthikulpanit <[email protected]>
> ---
> arch/x86/kvm/svm.c | 28 ++++++++++++++++++++++++++--
> 1 file changed, 26 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 4cf93a729ad8..f41f34f70dde 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -1666,7 +1666,7 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
> * field of the VMCB. Therefore, we set up the
> * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
> */
> -static int avic_init_access_page(struct kvm_vcpu *vcpu)
> +static int avic_setup_access_page(struct kvm_vcpu *vcpu, bool init)
> {
> struct kvm *kvm = vcpu->kvm;
> int ret = 0;
> @@ -1675,10 +1675,14 @@ static int avic_init_access_page(struct kvm_vcpu *vcpu)
> if (kvm->arch.apic_access_page_done)
> goto out;
>
> + if (!init)
> + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
> ret = __x86_set_memory_region(kvm,
> APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
> APIC_DEFAULT_PHYS_BASE,
> PAGE_SIZE);
> + if (!init)
> + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> if (ret)
> goto out;
>
> @@ -1688,6 +1692,26 @@ static int avic_init_access_page(struct kvm_vcpu *vcpu)
> return ret;
> }
>
> +static void avic_destroy_access_page(struct kvm_vcpu *vcpu)
> +{
> + struct kvm *kvm = vcpu->kvm;
> +
> + mutex_lock(&kvm->slots_lock);
> +
> + if (!kvm->arch.apic_access_page_done)
> + goto out;
> +
> + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
> + __x86_set_memory_region(kvm,
> + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
> + APIC_DEFAULT_PHYS_BASE,
> + 0);
> + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
This pattern of "unlock, do something, re-lock" strikes me as odd --
here and in the setup function.
There seem to be a few assumptions for this to work:
a) SRCU read-side critical sections must not be nested.
b) We must not keep any pointer to a SRCU protected structure
across a call to this function.
Can we guarantee these assumptions? Now and in the future (given that this is already
a bit hidden in the call stack)?
(And if we can guarantee them, why are we holding the SRCU lock in the first place?)
Or is there maybe a nicer way to do this?
Regards
Jan
> + kvm->arch.apic_access_page_done = false;
> +out:
> + mutex_unlock(&kvm->slots_lock);
> +}
> +
> static int avic_init_backing_page(struct kvm_vcpu *vcpu)
> {
> int ret;
> @@ -1695,7 +1719,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
> int id = vcpu->vcpu_id;
> struct vcpu_svm *svm = to_svm(vcpu);
>
> - ret = avic_init_access_page(vcpu);
> + ret = avic_setup_access_page(vcpu, true);
> if (ret)
> return ret;
>
>
On 22/03/2019 12.57, Suthikulpanit, Suravee wrote:
> When activate / deactivate AVIC during runtime, all vcpus has to be
> operating in the same mode. So, introduce new interface to request
> all vCPUs to activate/deactivate APICV.
If we need to switch APICV on and off on all vCPUs of a VM, shouldn't
we have a variable somewhere, that tells us, whether AVIC is
currently activated/deactivated in the VM?
The logic in patch 6/6, that triggers changes of this global state based
on just local information, feels prone to race conditions otherwise.
(Consider, for example, that two vCPUs have to handle ExtINTs at the same
time. Shouldn't we prevent AVIC from getting activated when just one of
the two vCPUs is done? That is, re-enable AVIC only when no vCPU is
handling an ExtINT anymore?)
Also, now that vcpu->arch.apicv_active is dynamic, there are a
few more places, where it must be updated, I think:
a) In kvm_arch_vcpu_init() a newly created vCPU needs to be
initialized with the correct global state, so that vCPU
hotplugging does not lead to a mixture of APICV states.
b) At some point during vCPU restore, so that APICV does not end
up being enabled if there was an ExtINT pending in the VM
snapshot.
c) Probably during vCPU reset as well, in case the ExtINT is cleared.
Regards
Jan
>
> Signed-off-by: Suravee Suthikulpanit <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 8 ++++++
> arch/x86/kvm/x86.c | 48 +++++++++++++++++++++++++++++++++
> 2 files changed, 56 insertions(+)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 1906e205e6a3..31dee26a37f2 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -79,6 +79,10 @@
> #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
> #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23)
> #define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24)
> +#define KVM_REQ_APICV_ACTIVATE \
> + KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
> +#define KVM_REQ_APICV_DEACTIVATE \
> + KVM_ARCH_REQ_FLAGS(26, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>
> #define CR0_RESERVED_BITS \
> (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
> @@ -1537,6 +1541,10 @@ bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
>
> void kvm_make_mclock_inprogress_request(struct kvm *kvm);
> void kvm_make_scan_ioapic_request(struct kvm *kvm);
> +void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
> +void kvm_vcpu_activate_apicv(struct kvm_vcpu *vcpu);
> +void kvm_make_apicv_activate_request(struct kvm *kvm);
> +void kvm_make_apicv_deactivate_request(struct kvm *kvm);
>
> void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
> struct kvm_async_pf *work);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 65e4559eef2f..1cd49c394680 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -29,6 +29,7 @@
> #include "cpuid.h"
> #include "pmu.h"
> #include "hyperv.h"
> +#include "lapic.h"
>
> #include <linux/clocksource.h>
> #include <linux/interrupt.h>
> @@ -7054,6 +7055,22 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
> kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
> }
>
> +void kvm_vcpu_activate_apicv(struct kvm_vcpu *vcpu)
> +{
> + if (!lapic_in_kernel(vcpu)) {
> + WARN_ON_ONCE(!vcpu->arch.apicv_active);
> + return;
> + }
> + if (vcpu->arch.apicv_active)
> + return;
> +
> + vcpu->arch.apicv_active = true;
> + kvm_apic_update_apicv(vcpu);
> +
> + kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
> +}
> +EXPORT_SYMBOL_GPL(kvm_vcpu_activate_apicv);
> +
> void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
> {
> if (!lapic_in_kernel(vcpu)) {
> @@ -7064,8 +7081,11 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
> return;
>
> vcpu->arch.apicv_active = false;
> + kvm_apic_update_apicv(vcpu);
> +
> kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
> }
> +EXPORT_SYMBOL_GPL(kvm_vcpu_deactivate_apicv);
>
> int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> {
> @@ -7557,6 +7577,30 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
> kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
> }
>
> +void kvm_make_apicv_activate_request(struct kvm *kvm)
> +{
> + int i;
> + struct kvm_vcpu *v;
> +
> + kvm_for_each_vcpu(i, v, kvm)
> + kvm_clear_request(KVM_REQ_APICV_DEACTIVATE, v);
> +
> + kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_ACTIVATE);
> +}
> +EXPORT_SYMBOL_GPL(kvm_make_apicv_activate_request);
> +
> +void kvm_make_apicv_deactivate_request(struct kvm *kvm)
> +{
> + int i;
> + struct kvm_vcpu *v;
> +
> + kvm_for_each_vcpu(i, v, kvm)
> + kvm_clear_request(KVM_REQ_APICV_ACTIVATE, v);
> +
> + kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_DEACTIVATE);
> +}
> +EXPORT_SYMBOL_GPL(kvm_make_apicv_deactivate_request);
> +
> static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
> {
> if (!kvm_apic_present(vcpu))
> @@ -7743,6 +7787,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> */
> if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
> kvm_hv_process_stimers(vcpu);
> + if (kvm_check_request(KVM_REQ_APICV_ACTIVATE, vcpu))
> + kvm_vcpu_activate_apicv(vcpu);
> + if (kvm_check_request(KVM_REQ_APICV_DEACTIVATE, vcpu))
> + kvm_vcpu_deactivate_apicv(vcpu);
> }
>
> if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
>
On 22/03/2019 12.57, Suthikulpanit, Suravee wrote:
> Introduce a helper function for setting lapic parameters when
> activate/deactivate apicv.
>
> Signed-off-by: Suravee Suthikulpanit <[email protected]>
> ---
> arch/x86/kvm/lapic.c | 23 ++++++++++++++++++-----
> arch/x86/kvm/lapic.h | 1 +
> 2 files changed, 19 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 95295cf81283..9c5cd1a98928 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -2126,6 +2126,22 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
>
> }
>
> +void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_lapic *apic = vcpu->arch.apic;
> + bool active = vcpu->arch.apicv_active;
> +
> + if (active) {
> + /* irr_pending is always true when apicv is activated. */
> + apic->irr_pending = true;
> + apic->isr_count = 1;
> + } else {
> + apic->irr_pending = !!count_vectors(apic->regs + APIC_IRR);
What about:
apic->irr_pending = apic_search_irr(apic) != -1;
instead? (more in line with the logic in apic_clear_irr())
Related to this, I wonder if we need to make sure that
kvm_x86_ops->sync_pir_to_irr() is executed just before apicv_active
transitions from true to false, so that we don't miss an interrupt and
irr_pending is set correctly in this function (on Intel at least).
Hmm... there seems to be other stuff as well, that depends on
vcpu->arch.apicv_active, which is not updated on a transition. For
example: posted interrupts, CR8 intercept, and a potential asymmetry
via avic_vcpu_load()/avic_vcpu_put() because the bottom half
of just one of the two functions may be skipped...
Do these need to be handled in some way?
Regards
Jan
> + apic->isr_count = count_vectors(apic->regs + APIC_ISR);
> + }
> +}
> +EXPORT_SYMBOL(kvm_apic_update_apicv);
> +
> void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
> {
> struct kvm_lapic *apic = vcpu->arch.apic;
> @@ -2170,8 +2186,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
> kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
> kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
> }
> - apic->irr_pending = vcpu->arch.apicv_active;
> - apic->isr_count = vcpu->arch.apicv_active ? 1 : 0;
> + kvm_apic_update_apicv(vcpu);
> apic->highest_isr_cache = -1;
> update_divide_count(apic);
> atomic_set(&apic->lapic_timer.pending, 0);
> @@ -2433,9 +2448,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
> apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
> update_divide_count(apic);
> start_apic_timer(apic);
> - apic->irr_pending = true;
> - apic->isr_count = vcpu->arch.apicv_active ?
> - 1 : count_vectors(apic->regs + APIC_ISR);
> + kvm_apic_update_apicv(vcpu);
> apic->highest_isr_cache = -1;
> if (vcpu->arch.apicv_active) {
> kvm_x86_ops->apicv_post_state_restore(vcpu);
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index ff6ef9c3d760..b657605cfec0 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -88,6 +88,7 @@ void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
> int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
> struct dest_map *dest_map);
> int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
> +void kvm_apic_update_apicv(struct kvm_vcpu *vcpu);
>
> bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
> struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
>
Hi Jan,
On 5/8/19 12:37 PM, Jan H. Schönherr wrote:
> [CAUTION: External Email]
>
> Hi Suravee.
>
> I wonder, how this interacts with Hyper-V SynIC; see comments below.
>
> On 22/03/2019 12.57, Suthikulpanit, Suravee wrote:
>> AMD AVIC does not support ExtINT. Therefore, AVIC must be temporary
>> deactivated and fall back to using legacy interrupt injection via
>> vINTR and interrupt window.
>>
>> Introduce svm_request_activate/deactivate_avic() helper functions,
>> which handle steps required to activate/deactivate AVIC.
>>
>> Signed-off-by: Suravee Suthikulpanit <[email protected]>
>> ---
>> arch/x86/kvm/svm.c | 57 +++++++++++++++++++++++++++++++++++++++++++---
>> 1 file changed, 54 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index f41f34f70dde..84116e689d5f 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -391,6 +391,8 @@ static u8 rsm_ins_bytes[] = "\x0f\xaa";
>> static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
>> static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
>> static void svm_complete_interrupts(struct vcpu_svm *svm);
>> +static void svm_request_activate_avic(struct kvm_vcpu *vcpu);
>> +static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu);
>>
>> static int nested_svm_exit_handled(struct vcpu_svm *svm);
>> static int nested_svm_intercept(struct vcpu_svm *svm);
>> @@ -2109,6 +2111,9 @@ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
>> {
>> struct vcpu_svm *svm = to_svm(vcpu);
>>
>> + if (!kvm_vcpu_apicv_active(vcpu))
>> + return;
>> +
>> svm->avic_is_running = is_run;
>> if (is_run)
>> avic_vcpu_load(vcpu, vcpu->cpu);
>> @@ -2356,6 +2361,10 @@ static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
>>
>> static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
>> {
>> + if (kvm_check_request(KVM_REQ_APICV_ACTIVATE, vcpu))
>> + kvm_vcpu_activate_apicv(vcpu);
>> + if (kvm_check_request(KVM_REQ_APICV_DEACTIVATE, vcpu))
>> + kvm_vcpu_deactivate_apicv(vcpu);
>> avic_set_running(vcpu, true);
>> }
>>
>> @@ -4505,6 +4514,15 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
>> {
>> kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
>> svm_clear_vintr(svm);
>> +
>> + /*
>> + * For AVIC, the only reason to end up here is ExtINTs.
>> + * In this case AVIC was temporarily disabled for
>> + * requesting the IRQ window and we have to re-enable it.
>> + */
>> + if (svm_get_enable_apicv(&svm->vcpu))
>> + svm_request_activate_avic(&svm->vcpu);
>> +
>
> Are we sure, we're not accidentally re-enabling AVIC, if it was disabled via
> kvm_hv_activate_synic()?
Actually, I missed this case. Now I have a solution that I'll be sending out for review in V2.
>> svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
>> mark_dirty(svm->vmcb, VMCB_INTR);
>> ++svm->vcpu.stat.irq_window_exits;
>> @@ -5206,6 +5224,34 @@ static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
>> {
>> }
>>
>> +static bool is_avic_active(struct vcpu_svm *svm)
>> +{
>> + return (svm_get_enable_apicv(&svm->vcpu) &&
>> + svm->vmcb->control.int_ctl & AVIC_ENABLE_MASK);
>> +}
>> +
>> +static void svm_request_activate_avic(struct kvm_vcpu *vcpu)
>> +{
>> + struct vcpu_svm *svm = to_svm(vcpu);
>> +
>> + if (!lapic_in_kernel(vcpu) || is_avic_active(svm))
>> + return;
>> +
>> + avic_setup_access_page(vcpu, false);
>> + kvm_make_apicv_activate_request(vcpu->kvm);
>> +}
>> +
>> +static void svm_request_deactivate_avic(struct kvm_vcpu *vcpu)
>> +{
>> + struct vcpu_svm *svm = to_svm(vcpu);
>> +
>> + if (!lapic_in_kernel(vcpu) || !is_avic_active(svm))
>> + return;
>> +
>> + avic_destroy_access_page(vcpu);
>
> Something like avic_destroy_access_page() is not called, when AVIC is
> disabled via kvm_hv_activate_synic().
>
> Is that an oversight in the other code path, is it not needed here,
> or am I missing something?
This is an oversight. I also have a fix for this in V2.
Thanks,
Suravee
Jan,
On 5/8/2019 2:14 PM, Jan H. Schönherr wrote:
> On 22/03/2019 12.57, Suthikulpanit, Suravee wrote:
>> Activate/deactivate AVIC requires setting/unsetting the memory region used
>> for APIC_ACCESS_PAGE_PRIVATE_MEMSLOT. So, re-factor avic_init_access_page()
>> to avic_setup_access_page() and add srcu_read_lock/unlock, which are needed
>> to allow this function to be called during run-time.
>>
>> Also, introduce avic_destroy_access_page() to unset the page when
>> deactivate AVIC.
>>
>> Signed-off-by: Suravee Suthikulpanit<[email protected]>
>> ---
>> arch/x86/kvm/svm.c | 28 ++++++++++++++++++++++++++--
>> 1 file changed, 26 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index 4cf93a729ad8..f41f34f70dde 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -1666,7 +1666,7 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
>> * field of the VMCB. Therefore, we set up the
>> * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
>> */
>> -static int avic_init_access_page(struct kvm_vcpu *vcpu)
>> +static int avic_setup_access_page(struct kvm_vcpu *vcpu, bool init)
>> {
>> struct kvm *kvm = vcpu->kvm;
>> int ret = 0;
>> @@ -1675,10 +1675,14 @@ static int avic_init_access_page(struct kvm_vcpu *vcpu)
>> if (kvm->arch.apic_access_page_done)
>> goto out;
>>
>> + if (!init)
>> + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
>> ret = __x86_set_memory_region(kvm,
>> APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
>> APIC_DEFAULT_PHYS_BASE,
>> PAGE_SIZE);
>> + if (!init)
>> + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>> if (ret)
>> goto out;
>>
>> @@ -1688,6 +1692,26 @@ static int avic_init_access_page(struct kvm_vcpu *vcpu)
>> return ret;
>> }
>>
>> +static void avic_destroy_access_page(struct kvm_vcpu *vcpu)
>> +{
>> + struct kvm *kvm = vcpu->kvm;
>> +
>> + mutex_lock(&kvm->slots_lock);
>> +
>> + if (!kvm->arch.apic_access_page_done)
>> + goto out;
>> +
>> + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
>> + __x86_set_memory_region(kvm,
>> + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
>> + APIC_DEFAULT_PHYS_BASE,
>> + 0);
>> + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> This pattern of "unlock, do something, re-lock" strikes me as odd --
> here and in the setup function.
>
> There seem to be a few assumptions for this to work:
> a) SRCU read-side critical sections must not be nested.
> b) We must not keep any pointer to a SRCU protected structure
> across a call to this function.
>
> Can we guarantee these assumptions? Now and in the future (given that this is already
> a bit hidden in the call stack)?
>
> (And if we can guarantee them, why are we holding the SRCU lock in the first place?)
The reason we need to call srcu_read_unlock() here is that srcu_read_lock() is
called at the beginning of vcpu_run(), before going into vcpu_enter_guest().
Here, we need to call __x86_set_memory_region(), which updates the page table.
If we do not call srcu_read_unlock() first, the task blocks in __synchronize_srcu()
and we end up with the following call trace:
[94617.992835] INFO: task qemu-system-x86:246799 blocked for more than 120 seconds.
[94618.001635] Not tainted 5.1.0-avic+ #14
[94618.006934] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[94618.016114] qemu-system-x86 D 0 246799 246788 0x00000084
[94618.022773] Call Trace:
[94618.025919] ? __schedule+0x2f9/0x860
[94618.030416] schedule+0x32/0x70
[94618.034335] schedule_timeout+0x1d8/0x300
[94618.039234] ? __queue_work+0x12c/0x3b0
[94618.043938] wait_for_completion+0xb9/0x140
[94618.049036] ? wake_up_q+0x70/0x70
[94618.053265] __synchronize_srcu.part.17+0x8a/0xc0
[94618.058959] ? __bpf_trace_rcu_invoke_callback+0x10/0x10
[94618.065359] install_new_memslots+0x56/0x90 [kvm]
[94618.071080] __kvm_set_memory_region+0x7df/0x8c0 [kvm]
[94618.077275] __x86_set_memory_region+0xb6/0x190 [kvm]
[94618.083347] svm_pre_update_apicv_exec_ctrl+0x42/0x70 [kvm_amd]
[94618.090410] kvm_make_apicv_deactivate_request+0xa4/0xd0 [kvm]
[94618.097450] enable_irq_window+0x119/0x170 [kvm_amd]
[94618.103461] kvm_arch_vcpu_ioctl_run+0x8bf/0x1a80 [kvm]
[94618.109774] kvm_vcpu_ioctl+0x3ab/0x5d0 [kvm]
[94618.115119] do_vfs_ioctl+0xa9/0x630
[94618.119593] ksys_ioctl+0x60/0x90
[94618.123780] __x64_sys_ioctl+0x16/0x20
[94618.128459] do_syscall_64+0x55/0x110
[94618.133037] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[94618.139163] RIP: 0033:0x7f2a9d5478d7
[94618.143630] Code: Bad RIP value.
[94618.147707] RSP: 002b:00007f2a9ade4988 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[94618.156660] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f2a9d5478d7
[94618.165160] RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 0000000000000011
[94618.173649] RBP: 00007f2a9ade4a90 R08: 0000000000000000 R09: 00000000000000ff
[94618.182142] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000000
[94618.190641] R13: 0000000000801000 R14: 0000000000000000 R15: 00007f2a9ade5700
Regards,
Suravee
On 22/03/19 12:57, Suthikulpanit, Suravee wrote:
> Add hooks for handling the case when guest VM update APIC ID, DFR and LDR.
> This is needed during AMD AVIC is temporary deactivated.
>
> Signed-off-by: Suravee Suthikulpanit <[email protected]>
Why not do this later when AVIC is reactivated, in
svm_refresh_apicv_exec_ctrl?
Thanks,
Paolo
> ---
> arch/x86/include/asm/kvm_host.h | 3 +++
> arch/x86/kvm/lapic.c | 6 ++++++
> 2 files changed, 9 insertions(+)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index a5db4475e72d..1906e205e6a3 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1077,6 +1077,9 @@ struct kvm_x86_ops {
> void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
> void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
> void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
> + void (*hwapic_apic_id_update)(struct kvm_vcpu *vcpu);
> + void (*hwapic_dfr_update)(struct kvm_vcpu *vcpu);
> + void (*hwapic_ldr_update)(struct kvm_vcpu *vcpu);
> bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
> void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
> void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 991fdf7fc17f..95295cf81283 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -262,12 +262,16 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
> static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
> {
> kvm_lapic_set_reg(apic, APIC_ID, id << 24);
> + if (kvm_x86_ops->hwapic_apic_id_update)
> + kvm_x86_ops->hwapic_apic_id_update(apic->vcpu);
> recalculate_apic_map(apic->vcpu->kvm);
> }
>
> static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
> {
> kvm_lapic_set_reg(apic, APIC_LDR, id);
> + if (kvm_x86_ops->hwapic_ldr_update)
> + kvm_x86_ops->hwapic_ldr_update(apic->vcpu);
> recalculate_apic_map(apic->vcpu->kvm);
> }
>
> @@ -1836,6 +1840,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
> case APIC_DFR:
> if (!apic_x2apic_mode(apic)) {
> kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
> + if (kvm_x86_ops->hwapic_dfr_update)
> + kvm_x86_ops->hwapic_dfr_update(apic->vcpu);
> recalculate_apic_map(apic->vcpu->kvm);
> } else
> ret = 1;
>
Jan,
On 5/8/2019 5:27 PM, Jan H. Schönherr wrote:
> On 22/03/2019 12.57, Suthikulpanit, Suravee wrote:
>> Introduce a helper function for setting lapic parameters when
>> activate/deactivate apicv.
>>
>> Signed-off-by: Suravee Suthikulpanit<[email protected]>
>> ---
>> arch/x86/kvm/lapic.c | 23 ++++++++++++++++++-----
>> arch/x86/kvm/lapic.h | 1 +
>> 2 files changed, 19 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index 95295cf81283..9c5cd1a98928 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -2126,6 +2126,22 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
>>
>> }
>>
>> +void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
>> +{
>> + struct kvm_lapic *apic = vcpu->arch.apic;
>> + bool active = vcpu->arch.apicv_active;
>> +
>> + if (active) {
>> + /* irr_pending is always true when apicv is activated. */
>> + apic->irr_pending = true;
>> + apic->isr_count = 1;
>> + } else {
>> + apic->irr_pending = !!count_vectors(apic->regs + APIC_IRR);
> What about:
> apic->irr_pending = apic_search_irr(apic) != -1;
> instead? (more in line with the logic in apic_clear_irr())
Sure, that works also.
> Related to this, I wonder if we need to ensure to execute
> kvm_x86_ops->sync_pir_to_irr() just before apicv_active transitions
> from true to false, so that we don't miss an interrupt and irr_pending
> is set correctly in this function (on Intel at least).
I'm not sure about the PIR on Intel, but on AMD, since the AMD IOMMU
hardware directly updates the vAPIC backing page, the driver should
not need to worry.
> Hmm... there seems to be other stuff as well, that depends on
> vcpu->arch.apicv_active, which is not updated on a transition. For
> example: posted interrupts,
You are right. We need to also handle the posted-interrupt
activate/deactivate. I am also doing this in V2.
> CR8 intercept,
When APICv is deactivated, the update_cr8_intercept() should handle
updating of CR8 interception.
> and a potential asymmetry via avic_vcpu_load()/avic_vcpu_put()
> because the bottom half of just one of the two functions may be
> skipped
Not sure if I understand the concern that you mentioned here.
However, once we add the IOMMU activation/deactivation code for
posted-interrupt, we should not need to worry about calling
avic_update_iommu_vcpu_affinity() at the end of the functions
here.
Thanks,
Suravee
> Regards
> Jan
>
Paolo,
On 7/3/2019 4:16 PM, Paolo Bonzini wrote:
> On 22/03/19 12:57, Suthikulpanit, Suravee wrote:
>> Add hooks for handling the case when guest VM update APIC ID, DFR and LDR.
>> This is needed during AMD AVIC is temporary deactivated.
>>
>> Signed-off-by: Suravee Suthikulpanit<[email protected]>
> Why not do this later when AVIC is reactivated, in
> svm_refresh_apicv_exec_ctrl?
>
> Thanks,
>
> Paolo
>
Actually, calling avic_post_state_restore() in svm_refresh_apicv_exec_ctrl()
should also work. I'll do that then.
Thanks,
Suravee