Resend for a typo in the LKML address in the previous patch sending.
Upon receiving an external interrupt, KVM VMX reinjects it through
calling the interrupt handler in its IDT descriptor on the current
kernel stack, which essentially uses the IDT as an interrupt dispatch
table.
However the IDT is one of the lowest level critical data structures
between a x86 CPU and the Linux kernel, we should avoid using it
*directly* whenever possible, espeically in a software defined manner.
On x86, external interrupts are divided into the following groups
1) system interrupts
2) external device interrupts
With the IDT, system interrupts are dispatched through the IDT
directly, while external device interrupts are all routed to the
external interrupt dispatch function common_interrupt(), which
dispatches external device interrupts through a per-CPU external
interrupt dispatch table vector_irq.
Implement software based NMI/IRQ dispatch for VMX NMI/IRQ reinjection
to eliminate dispatching external interrupts through the IDT with adding
a system interrupt handler table for dispatching a system interrupt
to its corresponding handler directly. Thus a software based dispatch
function will be:
void external_interrupt(struct pt_regs *regs, u8 vector)
{
if (is_system_interrupt(vector))
system_interrupt_handler_table[vector_to_sysvec(vector)](regs);
else /* external device interrupt */
common_interrupt(regs, vector);
}
And the software dispatch approach nukes a bunch of assembly.
What's more, with the Intel FRED (Flexible Return and Event Delivery)
architecture, IDT, the hardware based event dispatch table, is gone,
and the Linux kernel needs to dispatch events to their handlers with
vector to handler mappings, the dispatch function external_interrupt()
is also needed.
H. Peter Anvin (Intel) (1):
x86/traps: let common_interrupt() handle IRQ_MOVE_CLEANUP_VECTOR
Xin Li (5):
x86/traps: add a system interrupt table for system interrupt dispatch
x86/traps: add install_system_interrupt_handler()
x86/traps: add external_interrupt() to dispatch external interrupts
KVM: x86/VMX: add kvm_vmx_reinject_nmi_irq() for NMI/IRQ reinjection
x86/traps: remove unused NMI entry exc_nmi_noist()
arch/x86/include/asm/idtentry.h | 15 -----
arch/x86/include/asm/traps.h | 12 ++++
arch/x86/kernel/cpu/acrn.c | 7 +-
arch/x86/kernel/cpu/mshyperv.c | 22 ++++---
arch/x86/kernel/irq.c | 4 ++
arch/x86/kernel/kvm.c | 4 +-
arch/x86/kernel/nmi.c | 10 ---
arch/x86/kernel/traps.c | 107 +++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/vmenter.S | 33 ----------
arch/x86/kvm/vmx/vmx.c | 19 ++----
drivers/xen/events/events_base.c | 5 +-
11 files changed, 156 insertions(+), 82 deletions(-)
--
2.34.1
To eliminate dispatching NMI/IRQ through the IDT, add
kvm_vmx_reinject_nmi_irq(), which calls external_interrupt()
for IRQ reinjection.
Lastly replace calling a NMI/IRQ handler in an IDT descriptor
with calling kvm_vmx_reinject_nmi_irq().
Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
Signed-off-by: Xin Li <[email protected]>
---
arch/x86/include/asm/traps.h | 2 ++
arch/x86/kernel/traps.c | 23 +++++++++++++++++++++++
arch/x86/kvm/vmx/vmenter.S | 33 ---------------------------------
arch/x86/kvm/vmx/vmx.c | 19 +++++++------------
4 files changed, 32 insertions(+), 45 deletions(-)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 89c4233e19db..4c56a8d31762 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -57,4 +57,6 @@ void __noreturn handle_stack_overflow(struct pt_regs *regs,
unsigned long vector __maybe_unused)
typedef DECLARE_SYSTEM_INTERRUPT_HANDLER((*system_interrupt_handler));
+void kvm_vmx_reinject_nmi_irq(u32 vector);
+
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c1eb3bd335ce..9abf91534b13 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -1528,6 +1528,29 @@ __visible noinstr void external_interrupt(struct pt_regs *regs,
common_interrupt(regs, vector);
}
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * KVM VMX reinjects NMI/IRQ on its current stack, it's a sync
+ * call thus the values in the pt_regs structure are not used in
+ * executing NMI/IRQ handlers, except cs.RPL and flags.IF, which
+ * are both always 0 in the VMX NMI/IRQ reinjection context. Thus
+ * we simply allocate a zeroed pt_regs structure on current stack
+ * to call external_interrupt().
+ */
+void kvm_vmx_reinject_nmi_irq(u32 vector)
+{
+ struct pt_regs irq_regs;
+
+ memset(&irq_regs, 0, sizeof(irq_regs));
+
+ if (vector == NMI_VECTOR)
+ return exc_nmi(&irq_regs);
+
+ external_interrupt(&irq_regs, vector);
+}
+EXPORT_SYMBOL_GPL(kvm_vmx_reinject_nmi_irq);
+#endif
+
void __init trap_init(void)
{
/* Init cpu_entry_area before IST entries are set up */
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 8477d8bdd69c..0c1608b329cd 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -317,36 +317,3 @@ SYM_FUNC_START(vmread_error_trampoline)
RET
SYM_FUNC_END(vmread_error_trampoline)
-
-SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
- /*
- * Unconditionally create a stack frame, getting the correct RSP on the
- * stack (for x86-64) would take two instructions anyways, and RBP can
- * be used to restore RSP to make objtool happy (see below).
- */
- push %_ASM_BP
- mov %_ASM_SP, %_ASM_BP
-
-#ifdef CONFIG_X86_64
- /*
- * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
- * creating the synthetic interrupt stack frame for the IRQ/NMI.
- */
- and $-16, %rsp
- push $__KERNEL_DS
- push %rbp
-#endif
- pushf
- push $__KERNEL_CS
- CALL_NOSPEC _ASM_ARG1
-
- /*
- * "Restore" RSP from RBP, even though IRET has already unwound RSP to
- * the correct value. objtool doesn't know the callee will IRET and,
- * without the explicit restore, thinks the stack is getting walloped.
- * Using an unwind hint is problematic due to x86-64's dynamic alignment.
- */
- mov %_ASM_BP, %_ASM_SP
- pop %_ASM_BP
- RET
-SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 63247c57c72c..b457e4888468 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -46,6 +46,7 @@
#include <asm/mshyperv.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
+#include <asm/traps.h>
#include <asm/virtext.h>
#include <asm/vmx.h>
@@ -6758,15 +6759,11 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
}
-void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
-
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
- unsigned long entry)
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 vector)
{
- bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
-
- kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
- vmx_do_interrupt_nmi_irqoff(entry);
+ kvm_before_interrupt(vcpu, vector == NMI_VECTOR ?
+ KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
+ kvm_vmx_reinject_nmi_irq(vector);
kvm_after_interrupt(vcpu);
}
@@ -6792,7 +6789,6 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
- const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
/* if exit due to PF check for async PF */
@@ -6806,20 +6802,19 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
else if (is_nmi(intr_info))
- handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
+ handle_interrupt_nmi_irqoff(&vmx->vcpu, NMI_VECTOR);
}
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
u32 intr_info = vmx_get_intr_info(vcpu);
unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
- gate_desc *desc = (gate_desc *)host_idt_base + vector;
if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+ handle_interrupt_nmi_irqoff(vcpu, vector);
vcpu->arch.at_instruction_boundary = true;
}
--
2.34.1
After the introduction of kvm_vmx_reinject_irq(), exc_nmi_noist()
is no longer needed, thus remove it.
Signed-off-by: Xin Li <[email protected]>
---
arch/x86/include/asm/idtentry.h | 15 ---------------
arch/x86/kernel/nmi.c | 10 ----------
2 files changed, 25 deletions(-)
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 72184b0b2219..da28ac17c57a 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -581,21 +581,6 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
#endif
/* NMI */
-
-#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
-/*
- * Special NOIST entry point for VMX which invokes this on the kernel
- * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI
- * 'executing' marker.
- *
- * On 32bit this just uses the regular NMI entry point because 32-bit does
- * not have ISTs.
- */
-DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist);
-#else
-#define asm_exc_nmi_noist asm_exc_nmi
-#endif
-
DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
#ifdef CONFIG_XEN_PV
DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index cec0bfa3bc04..816bb59a4ba4 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -527,16 +527,6 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
mds_user_clear_cpu_buffers();
}
-#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
-DEFINE_IDTENTRY_RAW(exc_nmi_noist)
-{
- exc_nmi(regs);
-}
-#endif
-#if IS_MODULE(CONFIG_KVM_INTEL)
-EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
-#endif
-
void stop_nmi(void)
{
ignore_nmis++;
--
2.34.1
From: "H. Peter Anvin (Intel)" <[email protected]>
IRQ_MOVE_CLEANUP_VECTOR is the only one of the system IRQ vectors that
is *below* FIRST_SYSTEM_VECTOR. It is a slow path, so just push it
into common_interrupt() just before the spurios interrupt handling.
Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
Signed-off-by: Xin Li <[email protected]>
---
arch/x86/kernel/irq.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 766ffe3ba313..7e125fff45ab 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -248,6 +248,10 @@ DEFINE_IDTENTRY_IRQ(common_interrupt)
desc = __this_cpu_read(vector_irq[vector]);
if (likely(!IS_ERR_OR_NULL(desc))) {
handle_irq(desc, regs);
+#ifdef CONFIG_SMP
+ } else if (vector == IRQ_MOVE_CLEANUP_VECTOR) {
+ sysvec_irq_move_cleanup(regs);
+#endif
} else {
ack_APIC_irq();
--
2.34.1
Upon receiving an external interrupt, KVM VMX reinjects it through
calling the interrupt handler in its IDT descriptor on the current
kernel stack, which essentially uses the IDT as an interrupt dispatch
table.
However the IDT is one of the lowest level critical data structures
between a x86 CPU and the Linux kernel, we should avoid using it
*directly* whenever possible, espeically in a software defined manner.
On x86, external interrupts are divided into the following groups
1) system interrupts
2) external device interrupts
With the IDT, system interrupts are dispatched through the IDT
directly, while external device interrupts are all routed to the
external interrupt dispatch function common_interrupt(), which
dispatches external device interrupts through a per-CPU external
interrupt dispatch table vector_irq.
To eliminate dispatching external interrupts through the IDT, add
a system interrupt handler table for dispatching a system interrupt
to its corresponding handler directly. Thus a software based dispatch
function will be:
void external_interrupt(struct pt_regs *regs, u8 vector)
{
if (is_system_interrupt(vector))
system_interrupt_handler_table[vector_to_sysvec(vector)](regs);
else /* external device interrupt */
common_interrupt(regs, vector);
}
What's more, with the Intel FRED (Flexible Return and Event Delivery)
architecture, IDT, the hardware based event dispatch table, is gone,
and the Linux kernel needs to dispatch events to their handlers with
vector to handler mappings, the dispatch function external_interrupt()
is also needed.
Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
Signed-off-by: Xin Li <[email protected]>
---
arch/x86/include/asm/traps.h | 8 ++++++
arch/x86/kernel/traps.c | 55 ++++++++++++++++++++++++++++++++++++
2 files changed, 63 insertions(+)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 47ecfff2c83d..3dc63d753bda 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -47,4 +47,12 @@ void __noreturn handle_stack_overflow(struct pt_regs *regs,
struct stack_info *info);
#endif
+/*
+ * How system interrupt handlers are called.
+ */
+#define DECLARE_SYSTEM_INTERRUPT_HANDLER(f) \
+ void f (struct pt_regs *regs __maybe_unused, \
+ unsigned long vector __maybe_unused)
+typedef DECLARE_SYSTEM_INTERRUPT_HANDLER((*system_interrupt_handler));
+
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 178015a820f0..95dd917ef9ad 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -1444,6 +1444,61 @@ DEFINE_IDTENTRY_SW(iret_error)
}
#endif
+#define SYSV(x,y) [(x) - FIRST_SYSTEM_VECTOR] = (system_interrupt_handler)y
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-function-type"
+
+/*
+ * The initializer spurious_interrupt() has two arguments of types struct
+ * pt_regs * and unsigned long, and the system interrupt handlers with
+ * prefix sysvec_ are all defined with either DEFINE_IDTENTRY_SYSVEC or
+ * DEFINE_IDTENTRY_SYSVEC_SIMPLE, both with only one argument of type
+ * struct pt_regs *. Because all handlers only declare and require a subset
+ * of the arguments provided by the full system_interrupt_handler prototype,
+ * the function type cast is safe here.
+ */
+const system_interrupt_handler system_interrupt_handler_table[NR_SYSTEM_VECTORS] = {
+ [0 ... NR_SYSTEM_VECTORS-1] = spurious_interrupt,
+#ifdef CONFIG_SMP
+ SYSV(RESCHEDULE_VECTOR, sysvec_reschedule_ipi),
+ SYSV(CALL_FUNCTION_VECTOR, sysvec_call_function),
+ SYSV(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single),
+ SYSV(REBOOT_VECTOR, sysvec_reboot),
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ SYSV(THERMAL_APIC_VECTOR, sysvec_thermal),
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ SYSV(THRESHOLD_APIC_VECTOR, sysvec_threshold),
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ SYSV(DEFERRED_ERROR_VECTOR, sysvec_deferred_error),
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ SYSV(LOCAL_TIMER_VECTOR, sysvec_apic_timer_interrupt),
+ SYSV(X86_PLATFORM_IPI_VECTOR, sysvec_x86_platform_ipi),
+# ifdef CONFIG_HAVE_KVM
+ SYSV(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi),
+ SYSV(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi),
+ SYSV(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi),
+# endif
+# ifdef CONFIG_IRQ_WORK
+ SYSV(IRQ_WORK_VECTOR, sysvec_irq_work),
+# endif
+ SYSV(SPURIOUS_APIC_VECTOR, sysvec_spurious_apic_interrupt),
+ SYSV(ERROR_APIC_VECTOR, sysvec_error_interrupt),
+#endif
+};
+
+#pragma GCC diagnostic pop
+
+#undef SYSV
+
void __init trap_init(void)
{
/* Init cpu_entry_area before IST entries are set up */
--
2.34.1
On Wed, Nov 09, 2022 at 10:15:44PM -0800, Xin Li wrote:
> To eliminate dispatching NMI/IRQ through the IDT, add
> kvm_vmx_reinject_nmi_irq(), which calls external_interrupt()
> for IRQ reinjection.
>
> Lastly replace calling a NMI/IRQ handler in an IDT descriptor
> with calling kvm_vmx_reinject_nmi_irq().
>
> Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
> Signed-off-by: Xin Li <[email protected]>
Idem.
> +#if IS_ENABLED(CONFIG_KVM_INTEL)
> +/*
> + * KVM VMX reinjects NMI/IRQ on its current stack, it's a sync
> + * call thus the values in the pt_regs structure are not used in
> + * executing NMI/IRQ handlers, except cs.RPL and flags.IF, which
> + * are both always 0 in the VMX NMI/IRQ reinjection context. Thus
> + * we simply allocate a zeroed pt_regs structure on current stack
> + * to call external_interrupt().
> + */
> +void kvm_vmx_reinject_nmi_irq(u32 vector)
noinstr ?
> +{
> + struct pt_regs irq_regs;
> +
> + memset(&irq_regs, 0, sizeof(irq_regs));
> +
> + if (vector == NMI_VECTOR)
> + return exc_nmi(&irq_regs);
> +
> + external_interrupt(&irq_regs, vector);
> +}
> +EXPORT_SYMBOL_GPL(kvm_vmx_reinject_nmi_irq);
> +#endif
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 63247c57c72c..b457e4888468 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -46,6 +46,7 @@
> #include <asm/mshyperv.h>
> #include <asm/mwait.h>
> #include <asm/spec-ctrl.h>
> +#include <asm/traps.h>
> #include <asm/virtext.h>
> #include <asm/vmx.h>
>
> @@ -6758,15 +6759,11 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
> memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
> }
>
> -void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
> -
> -static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
> - unsigned long entry)
> +static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 vector)
> {
> - bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
> -
> - kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
> - vmx_do_interrupt_nmi_irqoff(entry);
> + kvm_before_interrupt(vcpu, vector == NMI_VECTOR ?
> + KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
> + kvm_vmx_reinject_nmi_irq(vector);
> kvm_after_interrupt(vcpu);
> }
>
> @@ -6792,7 +6789,6 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
>
> static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
> {
> - const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
> u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
>
> /* if exit due to PF check for async PF */
> @@ -6806,20 +6802,19 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
> kvm_machine_check();
> /* We need to handle NMIs before interrupts are enabled */
> else if (is_nmi(intr_info))
> - handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
> + handle_interrupt_nmi_irqoff(&vmx->vcpu, NMI_VECTOR);
> }
>
> static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
> {
> u32 intr_info = vmx_get_intr_info(vcpu);
> unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
> - gate_desc *desc = (gate_desc *)host_idt_base + vector;
>
> if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
> "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
> return;
>
> - handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
> + handle_interrupt_nmi_irqoff(vcpu, vector);
> vcpu->arch.at_instruction_boundary = true;
> }
How does any of this work? You're calling into entry/noinstr code from a
random context.
On Wed, Nov 09, 2022 at 10:15:41PM -0800, Xin Li wrote:
> Upon receiving an external interrupt, KVM VMX reinjects it through
> calling the interrupt handler in its IDT descriptor on the current
> kernel stack, which essentially uses the IDT as an interrupt dispatch
> table.
>
> However the IDT is one of the lowest level critical data structures
> between a x86 CPU and the Linux kernel, we should avoid using it
> *directly* whenever possible, espeically in a software defined manner.
>
> On x86, external interrupts are divided into the following groups
> 1) system interrupts
> 2) external device interrupts
> With the IDT, system interrupts are dispatched through the IDT
> directly, while external device interrupts are all routed to the
> external interrupt dispatch function common_interrupt(), which
> dispatches external device interrupts through a per-CPU external
> interrupt dispatch table vector_irq.
>
> To eliminate dispatching external interrupts through the IDT, add
> a system interrupt handler table for dispatching a system interrupt
> to its corresponding handler directly. Thus a software based dispatch
> function will be:
>
> void external_interrupt(struct pt_regs *regs, u8 vector)
> {
> if (is_system_interrupt(vector))
> system_interrupt_handler_table[vector_to_sysvec(vector)](regs);
> else /* external device interrupt */
> common_interrupt(regs, vector);
> }
>
> What's more, with the Intel FRED (Flexible Return and Event Delivery)
> architecture, IDT, the hardware based event dispatch table, is gone,
> and the Linux kernel needs to dispatch events to their handlers with
> vector to handler mappings, the dispatch function external_interrupt()
> is also needed.
>
> Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
> Signed-off-by: Xin Li <[email protected]>
This is not a valid SOB, it would suggest hpa is the author, but he's
not in in From.
> ---
> arch/x86/include/asm/traps.h | 8 ++++++
> arch/x86/kernel/traps.c | 55 ++++++++++++++++++++++++++++++++++++
> 2 files changed, 63 insertions(+)
>
> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
> index 47ecfff2c83d..3dc63d753bda 100644
> --- a/arch/x86/include/asm/traps.h
> +++ b/arch/x86/include/asm/traps.h
> @@ -47,4 +47,12 @@ void __noreturn handle_stack_overflow(struct pt_regs *regs,
> struct stack_info *info);
> #endif
>
> +/*
> + * How system interrupt handlers are called.
> + */
> +#define DECLARE_SYSTEM_INTERRUPT_HANDLER(f) \
> + void f (struct pt_regs *regs __maybe_unused, \
> + unsigned long vector __maybe_unused)
> +typedef DECLARE_SYSTEM_INTERRUPT_HANDLER((*system_interrupt_handler));
> +
> #endif /* _ASM_X86_TRAPS_H */
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 178015a820f0..95dd917ef9ad 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -1444,6 +1444,61 @@ DEFINE_IDTENTRY_SW(iret_error)
> }
> #endif
>
> +#define SYSV(x,y) [(x) - FIRST_SYSTEM_VECTOR] = (system_interrupt_handler)y
> +
> +#pragma GCC diagnostic push
> +#pragma GCC diagnostic ignored "-Wcast-function-type"
How does this not break CFI ?
On Wed, Nov 09, 2022 at 10:15:40PM -0800, Xin Li wrote:
> From: "H. Peter Anvin (Intel)" <[email protected]>
>
> IRQ_MOVE_CLEANUP_VECTOR is the only one of the system IRQ vectors that
> is *below* FIRST_SYSTEM_VECTOR. It is a slow path, so just push it
> into common_interrupt() just before the spurios interrupt handling.
nit:
s/spurios/spurious
>
> Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
> Signed-off-by: Xin Li <[email protected]>
> ---
> arch/x86/kernel/irq.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
> index 766ffe3ba313..7e125fff45ab 100644
> --- a/arch/x86/kernel/irq.c
> +++ b/arch/x86/kernel/irq.c
> @@ -248,6 +248,10 @@ DEFINE_IDTENTRY_IRQ(common_interrupt)
> desc = __this_cpu_read(vector_irq[vector]);
> if (likely(!IS_ERR_OR_NULL(desc))) {
> handle_irq(desc, regs);
> +#ifdef CONFIG_SMP
> + } else if (vector == IRQ_MOVE_CLEANUP_VECTOR) {
> + sysvec_irq_move_cleanup(regs);
> +#endif
> } else {
> ack_APIC_irq();
>
> --
> 2.34.1
>
> > Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
> > Signed-off-by: Xin Li <[email protected]>
>
> This is not a valid SOB, it would suggest hpa is the author, but he's not in in
> From.
HPA wrote the initial dispatch code for FRED, and I worked with him to
refactor it for KVM/VMX NMI/IRQ dispatch. So use SOB from both. No?
> > diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index
> > 178015a820f0..95dd917ef9ad 100644
> > --- a/arch/x86/kernel/traps.c
> > +++ b/arch/x86/kernel/traps.c
> > @@ -1444,6 +1444,61 @@ DEFINE_IDTENTRY_SW(iret_error) } #endif
> >
> > +#define SYSV(x,y) [(x) - FIRST_SYSTEM_VECTOR] =
> > +(system_interrupt_handler)y
> > +
> > +#pragma GCC diagnostic push
> > +#pragma GCC diagnostic ignored "-Wcast-function-type"
>
> How does this not break CFI ?
I wasn't aware of it, will check.
> > > +#pragma GCC diagnostic push
> > > +#pragma GCC diagnostic ignored "-Wcast-function-type"
> >
> > How does this not break CFI ?
>
> I wasn't aware of it, will check.
CFI needs $(cc-option,-fsanitize=kcfi), which, reported on LWN on Jun, 2002,
had not yet landed in the LLVM mainline (I'm using GCC). So looks we are
replying on people keeping an eye on it to make sure it's not broken?
Even we are unable to test it now, we should find a solution.
Thanks!
Xin
On Thu, Nov 10, 2022 at 08:36:30PM +0000, Li, Xin3 wrote:
> > > > +#pragma GCC diagnostic push
> > > > +#pragma GCC diagnostic ignored "-Wcast-function-type"
> > >
> > > How does this not break CFI ?
> >
> > I wasn't aware of it, will check.
>
> CFI needs $(cc-option,-fsanitize=kcfi), which, reported on LWN on Jun, 2002,
> had not yet landed in the LLVM mainline (I'm using GCC). So looks we are
> replying on people keeping an eye on it to make sure it's not broken?
Well, the entire point of the warning that you are disabling here is to
catch potential CFI failures at compile time, rather than run time :)
Clang also has -Wcast-function-type-strict, which Gustavo and Kees are
working on getting enabled, so that even more CFI failures can be caught
at compile time.
https://github.com/ClangBuiltLinux/linux/issues/1724
https://lore.kernel.org/all/?q=-Wcast-function-type-strict
Cheers,
Nathan
> > To eliminate dispatching NMI/IRQ through the IDT, add
> > kvm_vmx_reinject_nmi_irq(), which calls external_interrupt() for IRQ
> > reinjection.
> >
> > Lastly replace calling a NMI/IRQ handler in an IDT descriptor with
> > calling kvm_vmx_reinject_nmi_irq().
> >
> > Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
> > Signed-off-by: Xin Li <[email protected]>
>
> Idem.
>
>
> > +#if IS_ENABLED(CONFIG_KVM_INTEL)
> > +/*
> > + * KVM VMX reinjects NMI/IRQ on its current stack, it's a sync
> > + * call thus the values in the pt_regs structure are not used in
> > + * executing NMI/IRQ handlers, except cs.RPL and flags.IF, which
> > + * are both always 0 in the VMX NMI/IRQ reinjection context. Thus
> > + * we simply allocate a zeroed pt_regs structure on current stack
> > + * to call external_interrupt().
> > + */
> > +void kvm_vmx_reinject_nmi_irq(u32 vector)
>
> noinstr ?
Seems it's not needed to me.
>
> > +{
> > + struct pt_regs irq_regs;
> > +
> > + memset(&irq_regs, 0, sizeof(irq_regs));
> > +
> > + if (vector == NMI_VECTOR)
> > + return exc_nmi(&irq_regs);
> > +
> > + external_interrupt(&irq_regs, vector); }
> > +EXPORT_SYMBOL_GPL(kvm_vmx_reinject_nmi_irq);
> > +#endif
>
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> > 63247c57c72c..b457e4888468 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -46,6 +46,7 @@
> > #include <asm/mshyperv.h>
> > #include <asm/mwait.h>
> > #include <asm/spec-ctrl.h>
> > +#include <asm/traps.h>
> > #include <asm/virtext.h>
> > #include <asm/vmx.h>
> >
> > @@ -6758,15 +6759,11 @@ static void vmx_apicv_post_state_restore(struct
> kvm_vcpu *vcpu)
> > memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); }
> >
> > -void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
> > -
> > -static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
> > - unsigned long entry)
> > +static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32
> > +vector)
> > {
> > - bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
> > -
> > - kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI :
> KVM_HANDLING_IRQ);
> > - vmx_do_interrupt_nmi_irqoff(entry);
> > + kvm_before_interrupt(vcpu, vector == NMI_VECTOR ?
> > + KVM_HANDLING_NMI :
> KVM_HANDLING_IRQ);
> > + kvm_vmx_reinject_nmi_irq(vector);
> > kvm_after_interrupt(vcpu);
> > }
> >
> > @@ -6792,7 +6789,6 @@ static void handle_nm_fault_irqoff(struct
> > kvm_vcpu *vcpu)
> >
> > static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) {
> > - const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
> > u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
> >
> > /* if exit due to PF check for async PF */ @@ -6806,20 +6802,19 @@
> > static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
> > kvm_machine_check();
> > /* We need to handle NMIs before interrupts are enabled */
> > else if (is_nmi(intr_info))
> > - handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
> > + handle_interrupt_nmi_irqoff(&vmx->vcpu, NMI_VECTOR);
> > }
> >
> > static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
> > {
> > u32 intr_info = vmx_get_intr_info(vcpu);
> > unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
> > - gate_desc *desc = (gate_desc *)host_idt_base + vector;
> >
> > if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
> > "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
> > return;
> >
> > - handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
> > + handle_interrupt_nmi_irqoff(vcpu, vector);
> > vcpu->arch.at_instruction_boundary = true; }
>
> How does any of this work? You're calling into entry/noinstr code from a
> random context.
Can you please elaborate your concern a bit more?
We are here in handle_external_interrupt_irqoff () because an external
interrupt happened when a guest was running and the CPU vm-exits to host
to dispatch to the IRQ handler with IRQ disabled.
> > IRQ_MOVE_CLEANUP_VECTOR is the only one of the system IRQ vectors that
> > is *below* FIRST_SYSTEM_VECTOR. It is a slow path, so just push it
> > into common_interrupt() just before the spurios interrupt handling.
>
> nit:
> s/spurios/spurious
Thank you!
> > > > > +#pragma GCC diagnostic push
> > > > > +#pragma GCC diagnostic ignored "-Wcast-function-type"
> > > >
> > > > How does this not break CFI ?
> > >
> > > I wasn't aware of it, will check.
> >
> > CFI needs $(cc-option,-fsanitize=kcfi), which, reported on LWN on Jun,
> > 2002, had not yet landed in the LLVM mainline (I'm using GCC). So
> > looks we are replying on people keeping an eye on it to make sure it's not
> broken?
>
> Well, the entire point of the warning that you are disabling here is to catch
> potential CFI failures at compile time, rather than run time :)
Oh, of course I didn't intend to be opposed to CFI.
>
> Clang also has -Wcast-function-type-strict, which Gustavo and Kees are working
> on getting enabled, so that even more CFI failures can be caught at compile
> time.
>
> https://github.com/ClangBuiltLinux/linux/issues/1724
> https://lore.kernel.org/all/?q=-Wcast-function-type-strict
>
> Cheers,
> Nathan
On Thu, Nov 10, 2022 at 11:00:51PM +0000, Li, Xin3 wrote:
> > > > > > +#pragma GCC diagnostic push
> > > > > > +#pragma GCC diagnostic ignored "-Wcast-function-type"
> > > > >
> > > > > How does this not break CFI ?
> > > >
> > > > I wasn't aware of it, will check.
> > >
> > > CFI needs $(cc-option,-fsanitize=kcfi), which, reported on LWN on Jun,
> > > 2002, had not yet landed in the LLVM mainline (I'm using GCC). So
> > > looks we are replying on people keeping an eye on it to make sure it's not
> > broken?
> >
> > Well, the entire point of the warning that you are disabling here is to catch
> > potential CFI failures at compile time, rather than run time :)
>
> Oh, of course I didn't intend to be opposed to CFI.
Oh, I apologize if I gave the impression that you were, I did not mean
to put words in your mouth! I was just giving additional context as to
why we have that warning enable and how we use it to help catch
potential run time failures at compile time, which does not require
running CFI to keep CFI happy.
Cheers,
Nathan
> > Clang also has -Wcast-function-type-strict, which Gustavo and Kees are working
> > on getting enabled, so that even more CFI failures can be caught at compile
> > time.
> >
> > https://github.com/ClangBuiltLinux/linux/issues/1724
> > https://lore.kernel.org/all/?q=-Wcast-function-type-strict
> >
> > Cheers,
> > Nathan
> From: Li, Xin3 <[email protected]>
> Sent: Friday, November 11, 2022 3:55 AM
>
> > > Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
> > > Signed-off-by: Xin Li <[email protected]>
> >
> > This is not a valid SOB, it would suggest hpa is the author, but he's not in in
> > From.
>
> HPA wrote the initial dispatch code for FRED, and I worked with him to
> refactor it for KVM/VMX NMI/IRQ dispatch. So use SOB from both. No?
>
If main work was done by HPA while your work is trivial:
From: H. Peter Anvin (Intel) <[email protected]>
commit msg
Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
Signed-off-by: Xin Li <[email protected]>
if your contribution is non-trivial:
From: H. Peter Anvin (Intel) <[email protected]>
commit msg
Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
Co-developed-by: Xin Li <[email protected]>
Signed-off-by: Xin Li <[email protected]>
It's documented in Documentation/process/submitting-patches.rst
> > > Well, the entire point of the warning that you are disabling here is
> > > to catch potential CFI failures at compile time, rather than run
> > > time :)
> >
> > Oh, of course I didn't intend to be opposed to CFI.
>
> Oh, I apologize if I gave the impression that you were, I did not mean to put
> words in your mouth! I was just giving additional context as to why we have
> that warning enable and how we use it to help catch potential run time failures
> at compile time, which does not require running CFI to keep CFI happy.
No, I don't feel impression at all :), maybe just a little bit upset that
I didn't realize it at the beginning.
I actually appreciate you provided additional context into this thread.
Xin
> > > > Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
> > > > Signed-off-by: Xin Li <[email protected]>
> > >
> > > This is not a valid SOB, it would suggest hpa is the author, but
> > > he's not in in From.
> >
> > HPA wrote the initial dispatch code for FRED, and I worked with him to
> > refactor it for KVM/VMX NMI/IRQ dispatch. So use SOB from both. No?
> >
>
> If main work was done by HPA while your work is trivial:
>
> From: H. Peter Anvin (Intel) <[email protected]>
>
> commit msg
>
> Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
> Signed-off-by: Xin Li <[email protected]>
>
> if your contribution is non-trivial:
>
> From: H. Peter Anvin (Intel) <[email protected]>
>
> commit msg
>
> Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
> Co-developed-by: Xin Li <[email protected]>
> Signed-off-by: Xin Li <[email protected]>
>
> It's documented in Documentation/process/submitting-patches.rst
Thanks, Kevin.
>
On Thu, Nov 10, 2022 at 07:55:22PM +0000, Li, Xin3 wrote:
> > > Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
> > > Signed-off-by: Xin Li <[email protected]>
> >
> > This is not a valid SOB, it would suggest hpa is the author, but he's not in in
> > From.
>
> HPA wrote the initial dispatch code for FRED, and I worked with him to
> refactor it for KVM/VMX NMI/IRQ dispatch. So use SOB from both. No?
Yes, but not like this. Please review the documentation we have on this.
On Thu, Nov 10, 2022 at 08:53:30PM +0000, Li, Xin3 wrote:
> > > static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
> > > {
> > > u32 intr_info = vmx_get_intr_info(vcpu);
> > > unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
> > > - gate_desc *desc = (gate_desc *)host_idt_base + vector;
> > >
> > > if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
> > > "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
> > > return;
> > >
> > > - handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
> > > + handle_interrupt_nmi_irqoff(vcpu, vector);
> > > vcpu->arch.at_instruction_boundary = true; }
> >
> > How does any of this work? You're calling into entry/noinstr code from a
> > random context.
>
> Can you please elaborate your concern a bit more?
>
> We are here in handle_external_interrupt_irqoff () because an external
> interrupt happened when a guest was running and the CPU vm-exits to host
> to dispatch to the IRQ handler with IRQ disabled.
I don't speak virt (but this all sounds disguisting) -- but what appears
to be the case is you calling into entry code from regular kernel
context, which is odd at best.
Specifically, going by the fact that all this is not noinstr code, the
assumption is that RCU/lockdep/etc.. is all set-up and running. This
means you should not be calling DEFINE_IDTENTRY_*(func) functions
because those will try and set all that up again.
Granted, irqentry_{enter,exit}() do nest, but *yuck*.
On Thu, Nov 10, 2022 at 08:36:30PM +0000, Li, Xin3 wrote:
> > > > +#pragma GCC diagnostic push
> > > > +#pragma GCC diagnostic ignored "-Wcast-function-type"
> > >
> > > How does this not break CFI ?
> >
> > I wasn't aware of it, will check.
>
> CFI needs $(cc-option,-fsanitize=kcfi), which, reported on LWN on Jun, 2002,
> had not yet landed in the LLVM mainline (I'm using GCC). So looks we are
> replying on people keeping an eye on it to make sure it's not broken?
We're relying on that warning you disabled.
> Even we are unable to test it now, we should find a solution.
You can test this just fine. Build llvm.git and compile a kernel with
CFI_CLANG=y. This is not hard, my ADL runs such a kernel as we speak.
On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
> On 11/11/22 10:15, Peter Zijlstra wrote:
> > I don't speak virt (but this all sounds disguisting)
>
> Yes, it is. AMD does not need this, they just hold onto the interrupt
> until the host has issued both STGI (for NMIs) and STI (for IRQs).
Right -- Cooper just explained that to me.
> On Intel you can optionally make it hold onto IRQs, but NMIs are always
> eaten by the VMEXIT and have to be reinjected manually.
That 'optionally' thing worries me -- as in, KVM is currently
opting-out?
Since much of this is about preparing for FRED, can we pretty *PLEASE*
fix this VMX NMI hole instead so that all this nonsense isn't needed
anymore, please, really please?
Have FRED imply a GI flag for VMX or something and then we can all
forget about these reinjection horrors.
On 11/11/22 13:19, Peter Zijlstra wrote:
> On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
>> On Intel you can optionally make it hold onto IRQs, but NMIs are always
>> eaten by the VMEXIT and have to be reinjected manually.
>
> That 'optionally' thing worries me -- as in, KVM is currently
> opting-out?
Yes, because "If the “process posted interrupts” VM-execution control is
1, the “acknowledge interrupt on exit” VM-exit control is 1" (SDM
26.2.1.1, checks on VM-Execution Control Fields). Ipse dixit. Posted
interrupts are available and used on all processors since I think Ivy
Bridge.
(sorry about splitting the replies across two threads)
Paolo
On 11/11/22 10:15, Peter Zijlstra wrote:
> I don't speak virt (but this all sounds disguisting)
Yes, it is. AMD does not need this, they just hold onto the interrupt
until the host has issued both STGI (for NMIs) and STI (for IRQs).
On Intel you can optionally make it hold onto IRQs, but NMIs are always
eaten by the VMEXIT and have to be reinjected manually.
> -- but what appears to be the case is you calling into entry code
> from regular kernel context, which is odd at best.
>
> Specifically, going by the fact that all this is not noinstr code,
> the assumption is that RCU/lockdep/etc.. is all set-up and running.
Indeed it is. This is called long after the noinstr area has been left:
vcpu_enter_guest
...
static_call(kvm_x86_vcpu_run) -> vmx_vcpu_run
vmx_vcpu_enter_exit (noinstr)
guest_state_enter_irqoff
__vmx_vcpu_run
guest_state_exit_irqoff
...
static_call(kvm_x86_handle_exit_irqoff) -> vmx_handle_exit_irqoff
handle_external_interrupt_irqoff
...
Paolo
> This means you should not be calling DEFINE_IDTENTRY_*(func)
> functions because those will try and set all that up again.
>
> Granted, irqentry_{enter,exit}() do nest, but*yuck*.
On Fri, Nov 11, 2022 at 01:48:26PM +0100, Paolo Bonzini wrote:
> On 11/11/22 13:19, Peter Zijlstra wrote:
> > On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
> > > On Intel you can optionally make it hold onto IRQs, but NMIs are always
> > > eaten by the VMEXIT and have to be reinjected manually.
> >
> > That 'optionally' thing worries me -- as in, KVM is currently
> > opting-out?
>
> Yes, because "If the “process posted interrupts” VM-execution control is 1,
> the “acknowledge interrupt on exit” VM-exit control is 1" (SDM 26.2.1.1,
> checks on VM-Execution Control Fields). Ipse dixit. Posted interrupts are
> available and used on all processors since I think Ivy Bridge.
(imagine the non-coc compliant reaction here)
So instead of fixing it, they made it worse :-(
And now FRED is arguably making it worse again, and people wonder why I
hate virt...
> On Fri, Nov 11, 2022 at 01:48:26PM +0100, Paolo Bonzini wrote:
> > On 11/11/22 13:19, Peter Zijlstra wrote:
> > > On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
> > > > On Intel you can optionally make it hold onto IRQs, but NMIs are
> > > > always eaten by the VMEXIT and have to be reinjected manually.
> > >
> > > That 'optionally' thing worries me -- as in, KVM is currently
> > > opting-out?
> >
> > Yes, because "If the “process posted interrupts” VM-execution control
> > is 1, the “acknowledge interrupt on exit” VM-exit control is 1" (SDM
> > 26.2.1.1, checks on VM-Execution Control Fields). Ipse dixit. Posted
> > interrupts are available and used on all processors since I think Ivy Bridge.
>
> (imagine the non-coc compliant reaction here)
>
> So instead of fixing it, they made it worse :-(
>
> And now FRED is arguably making it worse again, and people wonder why I
> hate virt...
Maybe I take it wrong, but FRED doesn't make anything worse. Fred entry
code will call external_interrupt() immediately for IRQs.
You really really don't like the context how VMX dispatches NMI/IRQs (which has
been there for a long time), right?
As you said, what if we do not call DEFINE_IDTENTRY_*(func) but the internal
IRQ handlers __func? That way we take out the setup for RCU/lockdep/etc.
On Fri, Nov 11, 2022 at 06:06:12PM +0000, Li, Xin3 wrote:
> > On Fri, Nov 11, 2022 at 01:48:26PM +0100, Paolo Bonzini wrote:
> > > On 11/11/22 13:19, Peter Zijlstra wrote:
> > > > On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
> > > > > On Intel you can optionally make it hold onto IRQs, but NMIs are
> > > > > always eaten by the VMEXIT and have to be reinjected manually.
> > > >
> > > > That 'optionally' thing worries me -- as in, KVM is currently
> > > > opting-out?
> > >
> > > Yes, because "If the “process posted interrupts” VM-execution control
> > > is 1, the “acknowledge interrupt on exit” VM-exit control is 1" (SDM
> > > 26.2.1.1, checks on VM-Execution Control Fields). Ipse dixit. Posted
> > > interrupts are available and used on all processors since I think Ivy Bridge.
> >
> > (imagine the non-coc compliant reaction here)
> >
> > So instead of fixing it, they made it worse :-(
> >
> > And now FRED is arguably making it worse again, and people wonder why I
> > hate virt...
>
> Maybe I take it wrong, but FRED doesn't make anything worse. Fred entry
> code will call external_interrupt() immediately for IRQs.
But what about NMIs, afaict this is all horribly broken for NMIs.
So the whole VMX thing latches the NMI (which stops NMI recursion),
right?
But then you drop out of noinstr code, which means any random exception
can happen (kprobes #BP, hw_breakpoint #DB, or even #PF due to random
nonsense like *SAN). This exception will do IRET and clear the NMI
latch, all before you get to run any of the NMI code.
Note how the normal NMI code is very careful to clear DR7 and both
kprobes and hw_breakpoint know not to accept noinstr code as targets.
You threw all that out the window.
Also, NMI is IST, and with FRED it will run on a different stack as
well, directly calling external_interrupt() doesn't honour that either.
> You really really don't like the context how VMX dispatches NMI/IRQs (which has
> been there for a long time), right?
I really really hate this with a passion. The fact that it's been this
way is no justification for keeping it. Crap is crap.
Intel should have taken an example of SVM in this regard, and not
doubled down and extended this NMI hole to regular IRQs. These are
exactly the kind of exception delivery trainwrecks FRED is supposed to
fix, except in this case it appears it doesn't :/
On November 11, 2022 8:35:30 AM PST, Andrew Cooper <[email protected]> wrote:
>On 11/11/2022 14:23, Peter Zijlstra wrote:
>> On Fri, Nov 11, 2022 at 01:48:26PM +0100, Paolo Bonzini wrote:
>>> On 11/11/22 13:19, Peter Zijlstra wrote:
>>>> On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
>>>>> On Intel you can optionally make it hold onto IRQs, but NMIs are always
>>>>> eaten by the VMEXIT and have to be reinjected manually.
>>>> That 'optionally' thing worries me -- as in, KVM is currently
>>>> opting-out?
>>> Yes, because "If the “process posted interrupts” VM-execution control is 1,
>>> the “acknowledge interrupt on exit” VM-exit control is 1" (SDM 26.2.1.1,
>>> checks on VM-Execution Control Fields). Ipse dixit. Posted interrupts are
>>> available and used on all processors since I think Ivy Bridge.
>
>On server SKUs. Client only got "virtual interrupt processing" fairly
>recently IIRC, which is the CPU-side property which matters.
>
>> (imagine the non-coc compliant reaction here)
>>
>> So instead of fixing it, they made it worse :-(
>>
>> And now FRED is arguably making it worse again, and people wonder why I
>> hate virt...
>
>The only FRED-compatible fix is to send a self-NMI, because because you
>may need a CSL change too.
>
>VT-x *does* hold the NMI latch (for VMEXIT_REASON NMI), so it's self-NMI
>and then enable_nmi()s.
>
>Except the IRET to self won't work - it will need to be ERETS-to-self.
>Which I think is fine.
>
>But what isn't fine is the fact that a self-NMI doesn't deliver
>synchronously, so you need to wait until it is pending, before enabling
>NMIs. (Well, actually you need to ensure that it's definitely delivered
>before re-entering the VM).
>
>And I'm totally out of ideas here...
>
>~Andrew
>
There is no fundamental reason to do a CSL/IST change if you happen to know a priori that the stack is in a valid state to have the NMI frame on it; that is:
1. Not deep into a nested I/O layer;
2. Valid, and not in flux in any way.
Since this reinject will always be in a well-defined location, that's fine.
So I think *that* concern is not actually an issue.
Again, note that this is not a FRED-specific problem.
On November 11, 2022 6:23:13 AM PST, Peter Zijlstra <[email protected]> wrote:
>On Fri, Nov 11, 2022 at 01:48:26PM +0100, Paolo Bonzini wrote:
>> On 11/11/22 13:19, Peter Zijlstra wrote:
>> > On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
>> > > On Intel you can optionally make it hold onto IRQs, but NMIs are always
>> > > eaten by the VMEXIT and have to be reinjected manually.
>> >
>> > That 'optionally' thing worries me -- as in, KVM is currently
>> > opting-out?
>>
>> Yes, because "If the “process posted interrupts” VM-execution control is 1,
>> the “acknowledge interrupt on exit” VM-exit control is 1" (SDM 26.2.1.1,
>> checks on VM-Execution Control Fields). Ipse dixit. Posted interrupts are
>> available and used on all processors since I think Ivy Bridge.
>
>(imagine the non-coc compliant reaction here)
>
>So instead of fixing it, they made it worse :-(
>
>And now FRED is arguably making it worse again, and people wonder why I
>hate virt...
I object to saying that FRED is making it worse. Xin's patchset gets rid of low-level assembly magic across the board and at least makes it obvious what the code actually *does*, right now, as opposed to being buried in highly questionable assembly.
It would also, regardless, be good to narrow down the set of possible events that may have to be reinjected to the absolute minimum, *and* document that in the code.
That being said, if there are better ways of doing it, we should, and you are certainly right that we may not have properly dug into if this code is even exercised on platforms which will have FRED.
On November 10, 2022 11:55:22 AM PST, "Li, Xin3" <[email protected]> wrote:
>> > Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
>> > Signed-off-by: Xin Li <[email protected]>
>>
>> This is not a valid SOB, it would suggest hpa is the author, but he's not in in
>> From.
>
>HPA wrote the initial dispatch code for FRED, and I worked with him to
>refactor it for KVM/VMX NMI/IRQ dispatch. So use SOB from both. No?
>
>> > diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index
>> > 178015a820f0..95dd917ef9ad 100644
>> > --- a/arch/x86/kernel/traps.c
>> > +++ b/arch/x86/kernel/traps.c
>> > @@ -1444,6 +1444,61 @@ DEFINE_IDTENTRY_SW(iret_error) } #endif
>> >
>> > +#define SYSV(x,y) [(x) - FIRST_SYSTEM_VECTOR] =
>> > +(system_interrupt_handler)y
>> > +
>> > +#pragma GCC diagnostic push
>> > +#pragma GCC diagnostic ignored "-Wcast-function-type"
>>
>> How does this not break CFI ?
>
>I wasn't aware of it, will check.
>
It doesn't break CFI because the arguments passed is always a strict superset of the ones expected and they are free enough that they are always passed in registers.
> > > So instead of fixing it, they made it worse :-(
> > >
> > > And now FRED is arguably making it worse again, and people wonder
> > > why I hate virt...
> >
> > Maybe I take it wrong, but FRED doesn't make anything worse. Fred
> > entry code will call external_interrupt() immediately for IRQs.
>
> But what about NMIs, afaict this is all horribly broken for NMIs.
NMIs are NOT handled by external_interrupt(), which is introduced in patch 4.
The NMI handling is added to kvm_vmx_reinject_nmi_irq() in patch 5 purely for VMX only.
>
> So the whole VMX thing latches the NMI (which stops NMI recursion), right?
>
> But then you drop out of noinstr code, which means any random exception can
> happen (kprobes #BP, hw_breakpoint #DB, or even #PF due to random
> nonsense like *SAN). This exception will do IRET and clear the NMI latch, all
> before you get to run any of the NMI code.
>
> Note how the normal NMI code is very careful to clear DR7 and both kprobes
> and hw_breakpoint know not to accept noinstr code as targets.
>
> You threw all that out the window.
>
> Also, NMI is IST, and with FRED it will run on a different stack as well, directly
> calling external_interrupt() doesn't honour that either.
>
> > You really really don't like the context how VMX dispatches NMI/IRQs
> > (which has been there for a long time), right?
>
> I really really hate this with a passion. The fact that it's been this way is no
> justification for keeping it. Crap is crap.
>
> Intel should have taken an example of SVM in this regard, and not doubled
> down and extended this NMI hole to regular IRQs. These are exactly the kind of
> exception delivery trainwrecks FRED is supposed to fix, except in this case it
> appears it doesn't :/
On Fri, Nov 11, 2022 at 02:07:05PM -0800, H. Peter Anvin wrote:
> On November 10, 2022 11:55:22 AM PST, "Li, Xin3" <[email protected]> wrote:
> >> > Signed-off-by: H. Peter Anvin (Intel) <[email protected]>
> >> > Signed-off-by: Xin Li <[email protected]>
> >>
> >> This is not a valid SOB, it would suggest hpa is the author, but he's not in in
> >> From.
> >
> >HPA wrote the initial dispatch code for FRED, and I worked with him to
> >refactor it for KVM/VMX NMI/IRQ dispatch. So use SOB from both. No?
> >
> >> > diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index
> >> > 178015a820f0..95dd917ef9ad 100644
> >> > --- a/arch/x86/kernel/traps.c
> >> > +++ b/arch/x86/kernel/traps.c
> >> > @@ -1444,6 +1444,61 @@ DEFINE_IDTENTRY_SW(iret_error) } #endif
> >> >
> >> > +#define SYSV(x,y) [(x) - FIRST_SYSTEM_VECTOR] =
> >> > +(system_interrupt_handler)y
> >> > +
> >> > +#pragma GCC diagnostic push
> >> > +#pragma GCC diagnostic ignored "-Wcast-function-type"
> >>
> >> How does this not break CFI ?
> >
> >I wasn't aware of it, will check.
> >
>
> It doesn't break CFI because the arguments passed is always a strict
> superset of the ones expected and they are free enough that they are
> always passed in registers.
It does break CFI because the signature hash doesn't match and you'll
trigger an explicit UD2.
> > > > > > On Intel you can optionally make it hold onto IRQs, but NMIs
> > > > > > are always eaten by the VMEXIT and have to be reinjected manually.
> > > > >
> > > > > That 'optionally' thing worries me -- as in, KVM is currently
> > > > > opting-out?
> > > >
> > > > Yes, because "If the “process posted interrupts” VM-execution
> > > > control is 1, the “acknowledge interrupt on exit” VM-exit control
> > > > is 1" (SDM 26.2.1.1, checks on VM-Execution Control Fields). Ipse
> > > > dixit. Posted interrupts are available and used on all processors since I
> think Ivy Bridge.
> > >
> > > (imagine the non-coc compliant reaction here)
> > >
> > > So instead of fixing it, they made it worse :-(
> > >
> > > And now FRED is arguably making it worse again, and people wonder
> > > why I hate virt...
> >
> > Maybe I take it wrong, but FRED doesn't make anything worse. Fred
> > entry code will call external_interrupt() immediately for IRQs.
>
> But what about NMIs, afaict this is all horribly broken for NMIs.
>
> So the whole VMX thing latches the NMI (which stops NMI recursion), right?
>
> But then you drop out of noinstr code, which means any random exception can
> happen (kprobes #BP, hw_breakpoint #DB, or even #PF due to random
> nonsense like *SAN). This exception will do IRET and clear the NMI latch, all
> before you get to run any of the NMI code.
What you said here implies that we have this problem in the existing code.
Because a fake iret stack is created to call the NMI handler in the IDT NMI
descriptor, which lastly executes the IRET instruction.
>
> Note how the normal NMI code is very careful to clear DR7 and both kprobes
> and hw_breakpoint know not to accept noinstr code as targets.
>
> You threw all that out the window.
>
> Also, NMI is IST, and with FRED it will run on a different stack as well, directly
> calling external_interrupt() doesn't honour that either.
>
> > You really really don't like the context how VMX dispatches NMI/IRQs
> > (which has been there for a long time), right?
>
> I really really hate this with a passion. The fact that it's been this way is no
> justification for keeping it. Crap is crap.
>
> Intel should have taken an example of SVM in this regard, and not doubled
> down and extended this NMI hole to regular IRQs. These are exactly the kind of
> exception delivery trainwrecks FRED is supposed to fix, except in this case it
> appears it doesn't :/
On Mon, Nov 14, 2022 at 04:39:40AM +0000, Li, Xin3 wrote:
> > But what about NMIs, afaict this is all horribly broken for NMIs.
> >
> > So the whole VMX thing latches the NMI (which stops NMI recursion), right?
> >
> > But then you drop out of noinstr code, which means any random exception can
> > happen (kprobes #BP, hw_breakpoint #DB, or even #PF due to random
> > nonsense like *SAN). This exception will do IRET and clear the NMI latch, all
> > before you get to run any of the NMI code.
>
> What you said here implies that we have this problem in the existing code.
> Because a fake iret stack is created to call the NMI handler in the IDT NMI
> descriptor, which lastly executes the IRET instruction.
I can't follow; of course the IDT handler terminates with IRET, it has
to no?
And yes, the current code appears to suffer the same defect.
> > > But what about NMIs, afaict this is all horribly broken for NMIs.
> > >
> > > So the whole VMX thing latches the NMI (which stops NMI recursion),
> right?
> > >
> > > But then you drop out of noinstr code, which means any random
> > > exception can happen (kprobes #BP, hw_breakpoint #DB, or even #PF
> > > due to random nonsense like *SAN). This exception will do IRET and
> > > clear the NMI latch, all before you get to run any of the NMI code.
> >
> > What you said here implies that we have this problem in the existing code.
> > Because a fake iret stack is created to call the NMI handler in the
> > IDT NMI descriptor, which lastly executes the IRET instruction.
>
> I can't follow; of course the IDT handler terminates with IRET, it has to no?
With FRED, ERETS/ERETU replace IRET, and use bit 28 of the popped CS field
to control whether to unblock NMI. If bit 28 of the field (above the selector)
is 1, ERETS/ERETU unblocks NMIs.
>
> And yes, the current code appears to suffer the same defect.
On Tue, Nov 15, 2022 at 07:50:49AM +0000, Li, Xin3 wrote:
> > > > But what about NMIs, afaict this is all horribly broken for NMIs.
> > > >
> > > > So the whole VMX thing latches the NMI (which stops NMI recursion),
> > right?
> > > >
> > > > But then you drop out of noinstr code, which means any random
> > > > exception can happen (kprobes #BP, hw_breakpoint #DB, or even #PF
> > > > due to random nonsense like *SAN). This exception will do IRET and
> > > > clear the NMI latch, all before you get to run any of the NMI code.
> > >
> > > What you said here implies that we have this problem in the existing code.
> > > Because a fake iret stack is created to call the NMI handler in the
> > > IDT NMI descriptor, which lastly executes the IRET instruction.
> >
> > I can't follow; of course the IDT handler terminates with IRET, it has to no?
>
> With FRED, ERETS/ERETU replace IRET, and use bit 28 of the popped CS field
> to control whether to unblock NMI. If bit 28 of the field (above the selector)
> is 1, ERETS/ERETU unblocks NMIs.
Yes, I know that. It is one of the many improvements FRED brings.
Ideally the IBT WAIT-FOR-ENDBR state also gets squirreled away in the
hardware exception frame, but that's still up in the air I believe :/
Anyway.. given there is interrupt priority and NMI is pretty much on top
of everything else the reinject crap *should* run NMI first. That way
NMI runs with the latch disabled and whatever other pending interrupts
will run later.
But that all is still broken because afaict the current code also leaves
noinstr -- and once you leave noinstr (or use a static_key, static_call
or anything else that *can* change at runtime) you can't guarantee
nothing.
> > > > > But what about NMIs, afaict this is all horribly broken for NMIs.
> > > > >
> > > > > So the whole VMX thing latches the NMI (which stops NMI
> > > > > recursion),
> > > right?
> > > > >
> > > > > But then you drop out of noinstr code, which means any random
> > > > > exception can happen (kprobes #BP, hw_breakpoint #DB, or even
> > > > > #PF due to random nonsense like *SAN). This exception will do
> > > > > IRET and clear the NMI latch, all before you get to run any of the NMI
> code.
> > > >
> > > > What you said here implies that we have this problem in the existing
> code.
> > > > Because a fake iret stack is created to call the NMI handler in
> > > > the IDT NMI descriptor, which lastly executes the IRET instruction.
> > >
> > > I can't follow; of course the IDT handler terminates with IRET, it has to no?
> >
> > With FRED, ERETS/ERETU replace IRET, and use bit 28 of the popped CS
> > field to control whether to unblock NMI. If bit 28 of the field (above
> > the selector) is 1, ERETS/ERETU unblocks NMIs.
>
> Yes, I know that. It is one of the many improvements FRED brings.
> Ideally the IBT WAIT-FOR-ENDBR state also gets squirreled away in the
> hardware exception frame, but that's still up in the air I believe :/
>
> Anyway.. given there is interrupt priority and NMI is pretty much on top of
> everything else the reinject crap *should* run NMI first. That way NMI runs
> with the latch disabled and whatever other pending interrupts will run later.
>
> But that all is still broken because afaict the current code also leaves noinstr --
> and once you leave noinstr (or use a static_key, static_call or anything else that
> *can* change at runtime) you can't guarantee nothing.
For NMI, HPA asked me to use "int $2", as it switches to the NMI IST stack to
execute the NMI handler, essentially like how HW deals with a NMI in host. and
I tested it with NMI watchdog, it looks working fine.
For IRQs, we still use the dispatch table, but with a new func added in
DEFINE_IDTENTRY_SYSVEC with the noinstr entry/exit code removed:
#define DEFINE_IDTENTRY_SYSVEC(func) \
static void __##func(struct pt_regs *regs); \
\
__visible noinstr void func(struct pt_regs *regs) \
{ \
irqentry_state_t state = irqentry_enter(regs); \
\
instrumentation_begin(); \
kvm_set_cpu_l1tf_flush_l1d(); \
run_sysvec_on_irqstack_cond(__##func, regs); \
instrumentation_end(); \
irqentry_exit(regs, state); \
} \
\
+void dispatch_table_##func(struct pt_regs *regs) \
+{ \
+ kvm_set_cpu_l1tf_flush_l1d(); \
+ run_sysvec_on_irqstack_cond(__##func, regs); \
+}
+ \
\
static noinline void __##func(struct pt_regs *regs)
How do you think?
On Thu, Nov 17, 2022, Li, Xin3 wrote:
>
> > > > > > But what about NMIs, afaict this is all horribly broken for NMIs.
> > > > > > So the whole VMX thing latches the NMI (which stops NMI
> > > > > > recursion), right?
> > > > > >
> > > > > > But then you drop out of noinstr code, which means any random
> > > > > > exception can happen (kprobes #BP, hw_breakpoint #DB, or even
> > > > > > #PF due to random nonsense like *SAN). This exception will do
> > > > > > IRET and clear the NMI latch, all before you get to run any of the
> > > > > > NMI code.
> > > > >
> > > > > What you said here implies that we have this problem in the existing code.
> > > > > Because a fake iret stack is created to call the NMI handler in
> > > > > the IDT NMI descriptor, which lastly executes the IRET instruction.
> > > >
> > > > I can't follow; of course the IDT handler terminates with IRET, it has to no?
> > > >
> > > > And yes, the current code appears to suffer the same defect.
That defect isn't going to be fixed simply by changing how KVM forwards NMIs
though. IIUC, _everything_ between VM-Exit and the invocation of the NMI handler
needs to be noinstr. On VM-Exit due to NMI, NMIs are blocked. If a #BP/#DB/#PF
occurs before KVM gets to kvm_x86_handle_exit_irqoff(), the subsequent IRET will
unblock NMIs before the original NMI is serviced, i.e. a second NMI could come in
at anytime regardless of how KVM forwards the NMI to the kernel.
Is there any way to solve this without tagging everything noinstr? There is a
metric shit ton of code between VM-Exit and the handling of NMIs, and much of that
code is common helpers. It might be possible to hoist NMI handler much earlier,
though we'd need to do a super thorough audit to ensure all necessary host state
is restored.
> > > With FRED, ERETS/ERETU replace IRET, and use bit 28 of the popped CS
> > > field to control whether to unblock NMI. If bit 28 of the field (above
> > > the selector) is 1, ERETS/ERETU unblocks NMIs.
Side topic, there's a bug in the ISE docs. Section "9.4.2 NMI Blocking" states
that bit 16 holds the "unblock NMI" magic, which I'm guessing is a holdover from
an earlier revision of FRED.
As specified in Section 6.1.3 and Section 6.2.3, ERETS and ERETU each unblocks NMIs
if bit 16 of the popped CS field is 1. The following items detail how this behavior may be
changed in VMX non-root operation, depending on the settings of certain VM-execution
controls:
> > Yes, I know that. It is one of the many improvements FRED brings.
> > Ideally the IBT WAIT-FOR-ENDBR state also gets squirreled away in the
> > hardware exception frame, but that's still up in the air I believe :/
> >
> > Anyway.. given there is interrupt priority and NMI is pretty much on top of
> > everything else the reinject crap *should* run NMI first. That way NMI runs
> > with the latch disabled and whatever other pending interrupts will run later.
> >
> > But that all is still broken because afaict the current code also leaves noinstr --
> > and once you leave noinstr (or use a static_key, static_call or anything else that
> > *can* change at runtime) you can't guarantee nothing.
>
> For NMI, HPA asked me to use "int $2", as it switches to the NMI IST stack to
> execute the NMI handler, essentially like how HW deals with a NMI in host. and
> I tested it with NMI watchdog, it looks working fine.
Heh, well yeah, because that's how KVM used to handle NMIs back before I reworked
NMI handling to use the direct call method. Ironically, that original change was
done in part to try and make it _easier_ to deal with FRED (back before FRED was
publicly disclosed).
If KVM reverts to INTn, the fix to route KVM=>NMI through the non-IST entry can
be reverted too.
a217a6593cec ("KVM/VMX: Invoke NMI non-IST entry instead of IST entry")
1a5488ef0dcf ("KVM: VMX: Invoke NMI handler via indirect call instead of INTn")
> > > > > > > But what about NMIs, afaict this is all horribly broken for NMIs.
> > > > > > > So the whole VMX thing latches the NMI (which stops NMI
> > > > > > > recursion), right?
> > > > > > >
> > > > > > > But then you drop out of noinstr code, which means any
> > > > > > > random exception can happen (kprobes #BP, hw_breakpoint #DB,
> > > > > > > or even #PF due to random nonsense like *SAN). This
> > > > > > > exception will do IRET and clear the NMI latch, all before
> > > > > > > you get to run any of the NMI code.
> > > > > >
> > > > > > What you said here implies that we have this problem in the existing
> code.
> > > > > > Because a fake iret stack is created to call the NMI handler
> > > > > > in the IDT NMI descriptor, which lastly executes the IRET instruction.
> > > > >
> > > > > I can't follow; of course the IDT handler terminates with IRET, it has to
> no?
> > > > >
> > > > > And yes, the current code appears to suffer the same defect.
>
> That defect isn't going to be fixed simply by changing how KVM forwards NMIs
> though. IIUC, _everything_ between VM-Exit and the invocation of the NMI
> handler needs to be noinstr. On VM-Exit due to NMI, NMIs are blocked. If a
> #BP/#DB/#PF occurs before KVM gets to kvm_x86_handle_exit_irqoff(), the
> subsequent IRET will unblock NMIs before the original NMI is serviced, i.e. a
> second NMI could come in at anytime regardless of how KVM forwards the NMI
> to the kernel.
>
> Is there any way to solve this without tagging everything noinstr? There is a
> metric shit ton of code between VM-Exit and the handling of NMIs, and much
> of that code is common helpers. It might be possible to hoist NMI handler
> much earlier, though we'd need to do a super thorough audit to ensure all
> necessary host state is restored.
As NMI is the only vector with this potential issue, it sounds a good idea
to only promote its handling.
> > > > With FRED, ERETS/ERETU replace IRET, and use bit 28 of the popped
> > > > CS field to control whether to unblock NMI. If bit 28 of the field
> > > > (above the selector) is 1, ERETS/ERETU unblocks NMIs.
>
> Side topic, there's a bug in the ISE docs. Section "9.4.2 NMI Blocking" states
> that bit 16 holds the "unblock NMI" magic, which I'm guessing is a holdover
> from an earlier revision of FRED.
Good catch, the latest 3.0 draft spec changed it to bit 28, but section 9.4.2
didn't get a proper update.
>
> As specified in Section 6.1.3 and Section 6.2.3, ERETS and ERETU each
> unblocks NMIs
> if bit 16 of the popped CS field is 1. The following items detail how this
> behavior may be
> changed in VMX non-root operation, depending on the settings of certain VM-
> execution
> controls:
>
> > > Yes, I know that. It is one of the many improvements FRED brings.
> > > Ideally the IBT WAIT-FOR-ENDBR state also gets squirreled away in
> > > the hardware exception frame, but that's still up in the air I
> > > believe :/
> > >
> > > Anyway.. given there is interrupt priority and NMI is pretty much on
> > > top of everything else the reinject crap *should* run NMI first.
> > > That way NMI runs with the latch disabled and whatever other pending
> interrupts will run later.
> > >
> > > But that all is still broken because afaict the current code also
> > > leaves noinstr -- and once you leave noinstr (or use a static_key,
> > > static_call or anything else that
> > > *can* change at runtime) you can't guarantee nothing.
> >
> > For NMI, HPA asked me to use "int $2", as it switches to the NMI IST
> > stack to execute the NMI handler, essentially like how HW deals with a
> > NMI in host. and I tested it with NMI watchdog, it looks working fine.
>
> Heh, well yeah, because that's how KVM used to handle NMIs back before I
> reworked NMI handling to use the direct call method. Ironically, that original
> change was done in part to try and make it _easier_ to deal with FRED (back
> before FRED was publicly disclosed).
>
> If KVM reverts to INTn, the fix to route KVM=>NMI through the non-IST entry
> can be reverted too.
>
> a217a6593cec ("KVM/VMX: Invoke NMI non-IST entry instead of IST entry")
> 1a5488ef0dcf ("KVM: VMX: Invoke NMI handler via indirect call instead of
> INTn")
Sure, I'm just thinking to put asm("int $2") in a new function exc_raise_nmi()
defined in arch/x86/kernel/traps.c. Thus we will need to change it only
whenever we have any better facility, and no KVM VMX change required.
> > > > > > And yes, the current code appears to suffer the same defect.
> >
> > That defect isn't going to be fixed simply by changing how KVM
> > forwards NMIs though. IIUC, _everything_ between VM-Exit and the
> > invocation of the NMI handler needs to be noinstr. On VM-Exit due to
> > NMI, NMIs are blocked. If a #BP/#DB/#PF occurs before KVM gets to
> > kvm_x86_handle_exit_irqoff(), the subsequent IRET will unblock NMIs
> > before the original NMI is serviced, i.e. a second NMI could come in
> > at anytime regardless of how KVM forwards the NMI to the kernel.
> >
> > Is there any way to solve this without tagging everything noinstr?
> > There is a metric shit ton of code between VM-Exit and the handling of
> > NMIs, and much of that code is common helpers. It might be possible
> > to hoist NMI handler much earlier, though we'd need to do a super
> > thorough audit to ensure all necessary host state is restored.
>
> As NMI is the only vector with this potential issue, it sounds a good idea to only
> promote its handling.
>
Hi Peter/Sean,
I prefer to move _everything_ between VM-Exit and the invocation of the NMI
handler into the noinstr section in the next patch set, how do you think?
Xin
>
> > > > > With FRED, ERETS/ERETU replace IRET, and use bit 28 of the
> > > > > popped CS field to control whether to unblock NMI. If bit 28 of
> > > > > the field (above the selector) is 1, ERETS/ERETU unblocks NMIs.
> >
> > Side topic, there's a bug in the ISE docs. Section "9.4.2 NMI
> > Blocking" states that bit 16 holds the "unblock NMI" magic, which I'm
> > guessing is a holdover from an earlier revision of FRED.
>
> Good catch, the latest 3.0 draft spec changed it to bit 28, but section 9.4.2
> didn't get a proper update.
>
> >
> > As specified in Section 6.1.3 and Section 6.2.3, ERETS and ERETU
> > each unblocks NMIs
> > if bit 16 of the popped CS field is 1. The following items detail
> > how this behavior may be
> > changed in VMX non-root operation, depending on the settings of
> > certain VM- execution
> > controls:
> >
> > > > Yes, I know that. It is one of the many improvements FRED brings.
> > > > Ideally the IBT WAIT-FOR-ENDBR state also gets squirreled away in
> > > > the hardware exception frame, but that's still up in the air I
> > > > believe :/
> > > >
> > > > Anyway.. given there is interrupt priority and NMI is pretty much
> > > > on top of everything else the reinject crap *should* run NMI first.
> > > > That way NMI runs with the latch disabled and whatever other
> > > > pending
> > interrupts will run later.
> > > >
> > > > But that all is still broken because afaict the current code also
> > > > leaves noinstr -- and once you leave noinstr (or use a static_key,
> > > > static_call or anything else that
> > > > *can* change at runtime) you can't guarantee nothing.
> > >
> > > For NMI, HPA asked me to use "int $2", as it switches to the NMI IST
> > > stack to execute the NMI handler, essentially like how HW deals with
> > > a NMI in host. and I tested it with NMI watchdog, it looks working fine.
> >
> > Heh, well yeah, because that's how KVM used to handle NMIs back before
> > I reworked NMI handling to use the direct call method. Ironically,
> > that original change was done in part to try and make it _easier_ to
> > deal with FRED (back before FRED was publicly disclosed).
> >
> > If KVM reverts to INTn, the fix to route KVM=>NMI through the non-IST
> > entry can be reverted too.
> >
> > a217a6593cec ("KVM/VMX: Invoke NMI non-IST entry instead of IST entry")
> > 1a5488ef0dcf ("KVM: VMX: Invoke NMI handler via indirect call
> > instead of
> > INTn")
>
> Sure, I'm just thinking to put asm("int $2") in a new function exc_raise_nmi()
> defined in arch/x86/kernel/traps.c. Thus we will need to change it only
> whenever we have any better facility, and no KVM VMX change required.
On Tue, Nov 22, 2022, Li, Xin3 wrote:
> > > > > > > And yes, the current code appears to suffer the same defect.
> > >
> > > That defect isn't going to be fixed simply by changing how KVM
> > > forwards NMIs though. IIUC, _everything_ between VM-Exit and the
> > > invocation of the NMI handler needs to be noinstr. On VM-Exit due to
> > > NMI, NMIs are blocked. If a #BP/#DB/#PF occurs before KVM gets to
> > > kvm_x86_handle_exit_irqoff(), the subsequent IRET will unblock NMIs
> > > before the original NMI is serviced, i.e. a second NMI could come in
> > > at anytime regardless of how KVM forwards the NMI to the kernel.
> > >
> > > Is there any way to solve this without tagging everything noinstr?
> > > There is a metric shit ton of code between VM-Exit and the handling of
> > > NMIs, and much of that code is common helpers. It might be possible
> > > to hoist NMI handler much earlier, though we'd need to do a super
> > > thorough audit to ensure all necessary host state is restored.
> >
> > As NMI is the only vector with this potential issue, it sounds a good idea to only
> > promote its handling.
> >
>
> Hi Peter/Sean,
>
> I prefer to move _everything_ between VM-Exit and the invocation of the NMI
> handler into the noinstr section in the next patch set, how do you think?
That's likely going to be beyond painful and will have a _lot_ of collateral
damage in the sense that other paths will end up calling noinstr function just
because of VMX. E.g. hw_breakpoint_restore(), fpu_sync_guest_vmexit_xfd_state(),
kvm_get_apic_mode(), multiple static calls in KVM... the list goes on and on and on.
The ongoing maintenance for that would also be quite painful.
Actually, SVM already enables NMIs far earlier, which means the probability of
breaking something by moving NMI handling earlier is lower. Not zero, as I don't
trust that SVM gets all the corner right, but definitely low.
E.g. this should be doable
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index cea8c07f5229..2fec93f38960 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7249,6 +7249,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(vmx->exit_reason.failed_vmentry))
return EXIT_FASTPATH_NONE;
+ <handle NMIs here>
+
vmx->loaded_vmcs->launched = 1;
vmx_recover_nmi_blocking(vmx);
thouh we'd like want a fair bit of refactoring so that all of vmx_vcpu_run() and
svm_vcpu_run() don't need to be noinstr.
Another wart that needs to be addressed is trace_kvm_exit(). IIRC, tracepoints
must be outside of noinstr, though maybe I'm misremembering that. And conversely,
SVM doesn't trace exits that are handled in the fastpath. Best option is probably
to move VMX's trace_kvm_exit() call to vmx_handle_exit(), and then figure out a
common way to trace exits that are handled in the fastpath (which can reside outside
of the noinstr section).
> > > > > > > > And yes, the current code appears to suffer the same defect.
> > > >
> > > > That defect isn't going to be fixed simply by changing how KVM
> > > > forwards NMIs though. IIUC, _everything_ between VM-Exit and the
> > > > invocation of the NMI handler needs to be noinstr. On VM-Exit due
> > > > to NMI, NMIs are blocked. If a #BP/#DB/#PF occurs before KVM gets
> > > > to kvm_x86_handle_exit_irqoff(), the subsequent IRET will unblock
> > > > NMIs before the original NMI is serviced, i.e. a second NMI could
> > > > come in at anytime regardless of how KVM forwards the NMI to the
> kernel.
> > > >
> > > > Is there any way to solve this without tagging everything noinstr?
> > > > There is a metric shit ton of code between VM-Exit and the
> > > > handling of NMIs, and much of that code is common helpers. It
> > > > might be possible to hoist NMI handler much earlier, though we'd
> > > > need to do a super thorough audit to ensure all necessary host state is
> restored.
> > >
> > > As NMI is the only vector with this potential issue, it sounds a
> > > good idea to only promote its handling.
> > >
> >
> > Hi Peter/Sean,
> >
> > I prefer to move _everything_ between VM-Exit and the invocation of
> > the NMI handler into the noinstr section in the next patch set, how do you
> think?
>
> That's likely going to be beyond painful and will have a _lot_ of collateral
> damage in the sense that other paths will end up calling noinstr function just
> because of VMX. E.g. hw_breakpoint_restore(),
> fpu_sync_guest_vmexit_xfd_state(),
> kvm_get_apic_mode(), multiple static calls in KVM... the list goes on and on
> and on.
>
> The ongoing maintenance for that would also be quite painful.
This is very complicated and I'm lost after reading the code around it.
Maybe some comments would help to explain the steps and order after VM exit.
(of course some people know it quite well)
>
> Actually, SVM already enables NMIs far earlier, which means the probability of
> breaking something by moving NMI handling earlier is lower. Not zero, as I
> don't trust that SVM gets all the corner right, but definitely low.
>
> E.g. this should be doable
Do we need to recover *all* host states before switching to NMI stack and
handler? if not what is the minimal set? If we restore the minimal set and
do "int $2" as early as possible, is it a quick and dirty approach?
>
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> cea8c07f5229..2fec93f38960 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -7249,6 +7249,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu
> *vcpu)
> if (unlikely(vmx->exit_reason.failed_vmentry))
> return EXIT_FASTPATH_NONE;
>
> + <handle NMIs here>
> +
> vmx->loaded_vmcs->launched = 1;
>
> vmx_recover_nmi_blocking(vmx);
>
>
> thouh we'd like want a fair bit of refactoring so that all of vmx_vcpu_run() and
> svm_vcpu_run() don't need to be noinstr.
This sounds reasonable to me, however from Documentation/core-api/entry.rst,
we do need it. Maybe we could argue guest is logically orthogonal to host,
and the CPU is borrowed to another OS... (which also kind of explains nested)
>
> Another wart that needs to be addressed is trace_kvm_exit(). IIRC, tracepoints
> must be outside of noinstr, though maybe I'm misremembering that. And
> conversely, SVM doesn't trace exits that are handled in the fastpath. Best
> option is probably to move VMX's trace_kvm_exit() call to vmx_handle_exit(),
> and then figure out a common way to trace exits that are handled in the
> fastpath (which can reside outside of the noinstr section).
On Tue, Nov 22, 2022 at 08:52:35PM +0000, Sean Christopherson wrote:
> Another wart that needs to be addressed is trace_kvm_exit(). IIRC, tracepoints
> must be outside of noinstr, though maybe I'm misremembering that.
You are not, that is correct. Another point to be careful with is usage
of jump_label and static_call, both can be used in noinstr *provided*
they don't actually ever change -- so boot time setup only.
If either of them were to change, text_poke_bp() has a clue in the name.
On Wed, Nov 23, 2022, Peter Zijlstra wrote:
> On Tue, Nov 22, 2022 at 08:52:35PM +0000, Sean Christopherson wrote:
>
> > Another wart that needs to be addressed is trace_kvm_exit(). IIRC, tracepoints
> > must be outside of noinstr, though maybe I'm misremembering that.
>
> You are not, that is correct. Another point to be careful with is usage
> of jump_label and static_call, both can be used in noinstr *provided*
> they don't actually ever change -- so boot time setup only.
>
> If either of them were to change, text_poke_bp() has a clue in the name.
I think we're mostly ok on that front. kvm_wait_lapic_expire() consumes multiple
static keys that can change at will, but that can be kept outside of the noinstr
section.
Thanks!
On Wed, Nov 23, 2022, Li, Xin3 wrote:
> > Actually, SVM already enables NMIs far earlier, which means the probability of
> > breaking something by moving NMI handling earlier is lower. Not zero, as I
> > don't trust that SVM gets all the corner right, but definitely low.
Duh. Moving NMI handling much earlier in VMX's VM-Exit path _must_ be safe,
otherwise we already have problems. On VMX, NMIs are enabled right up until
VM-Enter (VMLAUNCH/VMRESUME), and unless the VM-Exit is due to an NMI, NMIs are
enabled immediately after VM-Exit.
This case is problematic because the CPU can end up running the NMI handler with
NMIs enabled, not because it might run with incorrect state loaded.
> > E.g. this should be doable
>
> Do we need to recover *all* host states before switching to NMI stack and
> handler?
As above, no.
> if not what is the minimal set?
The absolute minimal set is what is context switched via the VMCS.
> If we restore the minimal set and do "int $2" as early as possible, is it a
> quick and dirty approach?
No, it is simply "the approach". If there are other problems with respect to
noinstr, then they can/should be fixed separately.
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> > cea8c07f5229..2fec93f38960 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -7249,6 +7249,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu
> > *vcpu)
> > if (unlikely(vmx->exit_reason.failed_vmentry))
> > return EXIT_FASTPATH_NONE;
> >
> > + <handle NMIs here>
> > +
> > vmx->loaded_vmcs->launched = 1;
> >
> > vmx_recover_nmi_blocking(vmx);
> >
> >
> > thouh we'd like want a fair bit of refactoring so that all of vmx_vcpu_run() and
> > svm_vcpu_run() don't need to be noinstr.
For the record, svm_vcpu_run() is fine, at least as far as NMIs are concerned.
> This sounds reasonable to me, however from Documentation/core-api/entry.rst,
> we do need it.
Why do you say that?
I believe this is what we should end up with. Compile tested only, and needs to
split up into 4+ patches. I'll test and massage this into a proper series next
week (US holiday the rest of this week).
---
arch/x86/kvm/kvm_cache_regs.h | 16 +++++------
arch/x86/kvm/vmx/vmenter.S | 4 +--
arch/x86/kvm/vmx/vmx.c | 51 ++++++++++++++++++-----------------
arch/x86/kvm/vmx/vmx.h | 2 +-
arch/x86/kvm/x86.h | 6 ++---
5 files changed, 41 insertions(+), 38 deletions(-)
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index c09174f73a34..af9bd0374915 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -50,26 +50,26 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15)
* 1 0 register in vcpu->arch
* 1 1 register in vcpu->arch, needs to be stored back
*/
-static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
+static __always_inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
{
return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
}
-static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
+static __always_inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
{
return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
}
-static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
+static __always_inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
{
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
}
-static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
+static __always_inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
{
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 0b5db4de4d09..b104dfd282ed 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -318,7 +318,7 @@ SYM_FUNC_START(vmread_error_trampoline)
RET
SYM_FUNC_END(vmread_error_trampoline)
-SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
+SYM_FUNC_START(vmx_do_interrupt_irqoff)
/*
* Unconditionally create a stack frame, getting the correct RSP on the
* stack (for x86-64) would take two instructions anyways, and RBP can
@@ -349,4 +349,4 @@ SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
mov %_ASM_BP, %_ASM_SP
pop %_ASM_BP
RET
-SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
+SYM_FUNC_END(vmx_do_interrupt_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index cea8c07f5229..b721fde4f5af 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5064,8 +5064,13 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
vect_info = vmx->idt_vectoring_info;
intr_info = vmx_get_intr_info(vcpu);
+ /*
+ * Machine checks are handled by handle_exception_irqoff(), or by
+ * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
+ * vmx_vcpu_enter_exit().
+ */
if (is_machine_check(intr_info) || is_nmi(intr_info))
- return 1; /* handled by handle_exception_nmi_irqoff() */
+ return 1;
/*
* Queue the exception here instead of in handle_nm_fault_irqoff().
@@ -6755,18 +6760,6 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
}
-void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
-
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
- unsigned long entry)
-{
- bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
-
- kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
- vmx_do_interrupt_nmi_irqoff(entry);
- kvm_after_interrupt(vcpu);
-}
-
static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
{
/*
@@ -6787,9 +6780,8 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
-static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
+static void handle_exception_irqoff(struct vcpu_vmx *vmx)
{
- const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
/* if exit due to PF check for async PF */
@@ -6801,11 +6793,10 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
- /* We need to handle NMIs before interrupts are enabled */
- else if (is_nmi(intr_info))
- handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
}
+void vmx_do_interrupt_irqoff(unsigned long entry);
+
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
u32 intr_info = vmx_get_intr_info(vcpu);
@@ -6816,7 +6807,10 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
+ vmx_do_interrupt_irqoff(gate_offset(desc));
+ kvm_after_interrupt(vcpu);
+
vcpu->arch.at_instruction_boundary = true;
}
@@ -6830,7 +6824,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu);
else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
- handle_exception_nmi_irqoff(vmx);
+ handle_exception_irqoff(vmx);
}
/*
@@ -7091,6 +7085,18 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vmx_enable_fb_clear(vmx);
+ if (unlikely(vmx->fail))
+ vmx->exit_reason.full = 0xdead;
+ else
+ vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+
+ if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
+ is_nmi(vmx_get_intr_info(vcpu))) {
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ asm("int $2");
+ kvm_after_interrupt(vcpu);
+ }
+
guest_state_exit_irqoff();
}
@@ -7232,12 +7238,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->idt_vectoring_info = 0;
- if (unlikely(vmx->fail)) {
- vmx->exit_reason.full = 0xdead;
+ if (unlikely(vmx->fail))
return EXIT_FASTPATH_NONE;
- }
- vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index a3da84f4ea45..eb52cfde5ec2 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -680,7 +680,7 @@ static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
return vmx->exit_qualification;
}
-static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
+static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 9de72586f406..44d1827f0a30 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -382,13 +382,13 @@ enum kvm_intr_type {
KVM_HANDLING_NMI,
};
-static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
- enum kvm_intr_type intr)
+static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
+ enum kvm_intr_type intr)
{
WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr);
}
-static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
+static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
{
WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0);
}
base-commit: 0fa32dad1e78629cb42999dacd82489503fdf4c2
--
> > > thouh we'd like want a fair bit of refactoring so that all of
> > > vmx_vcpu_run() and
> > > svm_vcpu_run() don't need to be noinstr.
>
> For the record, svm_vcpu_run() is fine, at least as far as NMIs are concerned.
>
> > This sounds reasonable to me, however from
> > Documentation/core-api/entry.rst, we do need it.
>
> Why do you say that?
>
Copy/Paste from Documentation/core-api/entry.rst:
KVM
---
Entering or exiting guest mode is very similar to syscalls. From the host
kernel point of view the CPU goes off into user space when entering the
guest and returns to the kernel on exit.
kvm_guest_enter_irqoff() is a KVM-specific variant of exit_to_user_mode()
and kvm_guest_exit_irqoff() is the KVM variant of enter_from_user_mode().
The state operations have the same ordering.
Task work handling is done separately for guest at the boundary of the
vcpu_run() loop via xfer_to_guest_mode_handle_work() which is a subset of
the work handled on return to user space.
Do not nest KVM entry/exit transitions because doing so is nonsensical.
>
> I believe this is what we should end up with. Compile tested only, and needs to
> split up into 4+ patches. I'll test and massage this into a proper series next
> week (US holiday the rest of this week).
Great, thanks for doing it quickly.
Xin
>
> ---
> arch/x86/kvm/kvm_cache_regs.h | 16 +++++------
> arch/x86/kvm/vmx/vmenter.S | 4 +--
> arch/x86/kvm/vmx/vmx.c | 51 ++++++++++++++++++-----------------
> arch/x86/kvm/vmx/vmx.h | 2 +-
> arch/x86/kvm/x86.h | 6 ++---
> 5 files changed, 41 insertions(+), 38 deletions(-)
>
> diff --git a/arch/x86/kvm/kvm_cache_regs.h
> b/arch/x86/kvm/kvm_cache_regs.h index c09174f73a34..af9bd0374915
> 100644
> --- a/arch/x86/kvm/kvm_cache_regs.h
> +++ b/arch/x86/kvm/kvm_cache_regs.h
> @@ -50,26 +50,26 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15)
> * 1 0 register in vcpu->arch
> * 1 1 register in vcpu->arch, needs to be stored back
> */
> -static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> - enum kvm_reg reg)
> +static __always_inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> + enum kvm_reg reg)
> {
> return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); }
>
> -static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> - enum kvm_reg reg)
> +static __always_inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> + enum kvm_reg reg)
> {
> return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); }
>
> -static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
> - enum kvm_reg reg)
> +static __always_inline void kvm_register_mark_available(struct kvm_vcpu
> *vcpu,
> + enum kvm_reg reg)
> {
> __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); }
>
> -static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> - enum kvm_reg reg)
> +static __always_inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> + enum kvm_reg reg)
> {
> __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
> __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); diff --git
> a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index
> 0b5db4de4d09..b104dfd282ed 100644
> --- a/arch/x86/kvm/vmx/vmenter.S
> +++ b/arch/x86/kvm/vmx/vmenter.S
> @@ -318,7 +318,7 @@ SYM_FUNC_START(vmread_error_trampoline)
> RET
> SYM_FUNC_END(vmread_error_trampoline)
>
> -SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
> +SYM_FUNC_START(vmx_do_interrupt_irqoff)
> /*
> * Unconditionally create a stack frame, getting the correct RSP on the
> * stack (for x86-64) would take two instructions anyways, and RBP can
> @@ -349,4 +349,4 @@ SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
> mov %_ASM_BP, %_ASM_SP
> pop %_ASM_BP
> RET
> -SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
> +SYM_FUNC_END(vmx_do_interrupt_irqoff)
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> cea8c07f5229..b721fde4f5af 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -5064,8 +5064,13 @@ static int handle_exception_nmi(struct kvm_vcpu
> *vcpu)
> vect_info = vmx->idt_vectoring_info;
> intr_info = vmx_get_intr_info(vcpu);
>
> + /*
> + * Machine checks are handled by handle_exception_irqoff(), or by
> + * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
> + * vmx_vcpu_enter_exit().
> + */
> if (is_machine_check(intr_info) || is_nmi(intr_info))
> - return 1; /* handled by handle_exception_nmi_irqoff() */
> + return 1;
>
> /*
> * Queue the exception here instead of in handle_nm_fault_irqoff().
> @@ -6755,18 +6760,6 @@ static void vmx_apicv_post_state_restore(struct
> kvm_vcpu *vcpu)
> memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); }
>
> -void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
> -
> -static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
> - unsigned long entry)
> -{
> - bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
> -
> - kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI :
> KVM_HANDLING_IRQ);
> - vmx_do_interrupt_nmi_irqoff(entry);
> - kvm_after_interrupt(vcpu);
> -}
> -
> static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) {
> /*
> @@ -6787,9 +6780,8 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu
> *vcpu)
> rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); }
>
> -static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
> +static void handle_exception_irqoff(struct vcpu_vmx *vmx)
> {
> - const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
> u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
>
> /* if exit due to PF check for async PF */ @@ -6801,11 +6793,10 @@
> static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
> /* Handle machine checks before interrupts are enabled */
> else if (is_machine_check(intr_info))
> kvm_machine_check();
> - /* We need to handle NMIs before interrupts are enabled */
> - else if (is_nmi(intr_info))
> - handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
> }
>
> +void vmx_do_interrupt_irqoff(unsigned long entry);
> +
> static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) {
> u32 intr_info = vmx_get_intr_info(vcpu); @@ -6816,7 +6807,10 @@
> static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
> "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
> return;
>
> - handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
> + kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
> + vmx_do_interrupt_irqoff(gate_offset(desc));
> + kvm_after_interrupt(vcpu);
> +
> vcpu->arch.at_instruction_boundary = true; }
>
> @@ -6830,7 +6824,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu
> *vcpu)
> if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
> handle_external_interrupt_irqoff(vcpu);
> else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
> - handle_exception_nmi_irqoff(vmx);
> + handle_exception_irqoff(vmx);
> }
>
> /*
> @@ -7091,6 +7085,18 @@ static noinstr void vmx_vcpu_enter_exit(struct
> kvm_vcpu *vcpu,
>
> vmx_enable_fb_clear(vmx);
>
> + if (unlikely(vmx->fail))
> + vmx->exit_reason.full = 0xdead;
> + else
> + vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
> +
> + if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
> + is_nmi(vmx_get_intr_info(vcpu))) {
> + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
> + asm("int $2");
> + kvm_after_interrupt(vcpu);
> + }
> +
> guest_state_exit_irqoff();
> }
>
> @@ -7232,12 +7238,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu
> *vcpu)
>
> vmx->idt_vectoring_info = 0;
>
> - if (unlikely(vmx->fail)) {
> - vmx->exit_reason.full = 0xdead;
> + if (unlikely(vmx->fail))
> return EXIT_FASTPATH_NONE;
> - }
>
> - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
> if (unlikely((u16)vmx->exit_reason.basic ==
> EXIT_REASON_MCE_DURING_VMENTRY))
> kvm_machine_check();
>
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index
> a3da84f4ea45..eb52cfde5ec2 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -680,7 +680,7 @@ static inline unsigned long vmx_get_exit_qual(struct
> kvm_vcpu *vcpu)
> return vmx->exit_qualification;
> }
>
> -static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
> +static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
> {
> struct vcpu_vmx *vmx = to_vmx(vcpu);
>
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index
> 9de72586f406..44d1827f0a30 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -382,13 +382,13 @@ enum kvm_intr_type {
> KVM_HANDLING_NMI,
> };
>
> -static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
> - enum kvm_intr_type intr)
> +static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
> + enum kvm_intr_type intr)
> {
> WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr); }
>
> -static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
> +static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
> {
> WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0); }
>
> base-commit: 0fa32dad1e78629cb42999dacd82489503fdf4c2
> --
On Wed, Nov 23, 2022 at 08:42:51PM +0000, Sean Christopherson wrote:
> arch/x86/kvm/kvm_cache_regs.h | 16 +++++------
> arch/x86/kvm/vmx/vmenter.S | 4 +--
> arch/x86/kvm/vmx/vmx.c | 51 ++++++++++++++++++-----------------
> arch/x86/kvm/vmx/vmx.h | 2 +-
> arch/x86/kvm/x86.h | 6 ++---
> 5 files changed, 41 insertions(+), 38 deletions(-)
>
> diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
> index c09174f73a34..af9bd0374915 100644
> --- a/arch/x86/kvm/kvm_cache_regs.h
> +++ b/arch/x86/kvm/kvm_cache_regs.h
> @@ -50,26 +50,26 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15)
> * 1 0 register in vcpu->arch
> * 1 1 register in vcpu->arch, needs to be stored back
> */
> -static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> - enum kvm_reg reg)
> +static __always_inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> + enum kvm_reg reg)
> {
> return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
> }
>
> -static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> - enum kvm_reg reg)
> +static __always_inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> + enum kvm_reg reg)
> {
> return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
> }
>
> -static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
> - enum kvm_reg reg)
> +static __always_inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
> + enum kvm_reg reg)
> {
> __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
> }
>
> -static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> - enum kvm_reg reg)
> +static __always_inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> + enum kvm_reg reg)
> {
> __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
> __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
You'll have to consider include/asm-generic/bitops/instrumented-non-atomic.h
and friend, and the above should probably switch to using:
arch_test_bit(), arch___set_bit() resp.
to avoid the explicit instrumentation.
On Thu, Nov 24, 2022, Li, Xin3 wrote:
> > > > thouh we'd like want a fair bit of refactoring so that all of
> > > > vmx_vcpu_run() and
> > > > svm_vcpu_run() don't need to be noinstr.
> >
> > For the record, svm_vcpu_run() is fine, at least as far as NMIs are concerned.
> >
> > > This sounds reasonable to me, however from
> > > Documentation/core-api/entry.rst, we do need it.
> >
> > Why do you say that?
> >
>
> Copy/Paste from Documentation/core-api/entry.rst:
I'm very confused. What do you mean by "we do need it". What is "it"? And what
does "it" have to do with the below documentation? The documentation does nothing
more than explain how KVM handles task work.
> KVM
> ---
>
> Entering or exiting guest mode is very similar to syscalls. From the host
> kernel point of view the CPU goes off into user space when entering the
> guest and returns to the kernel on exit.
>
> kvm_guest_enter_irqoff() is a KVM-specific variant of exit_to_user_mode()
> and kvm_guest_exit_irqoff() is the KVM variant of enter_from_user_mode().
> The state operations have the same ordering.
>
> Task work handling is done separately for guest at the boundary of the
> vcpu_run() loop via xfer_to_guest_mode_handle_work() which is a subset of
> the work handled on return to user space.
>
> Do not nest KVM entry/exit transitions because doing so is nonsensical.
On Thu, Nov 24, 2022, Peter Zijlstra wrote:
> On Wed, Nov 23, 2022 at 08:42:51PM +0000, Sean Christopherson wrote:
> > arch/x86/kvm/kvm_cache_regs.h | 16 +++++------
> > arch/x86/kvm/vmx/vmenter.S | 4 +--
> > arch/x86/kvm/vmx/vmx.c | 51 ++++++++++++++++++-----------------
> > arch/x86/kvm/vmx/vmx.h | 2 +-
> > arch/x86/kvm/x86.h | 6 ++---
> > 5 files changed, 41 insertions(+), 38 deletions(-)
> >
> > diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
> > index c09174f73a34..af9bd0374915 100644
> > --- a/arch/x86/kvm/kvm_cache_regs.h
> > +++ b/arch/x86/kvm/kvm_cache_regs.h
> > @@ -50,26 +50,26 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15)
> > * 1 0 register in vcpu->arch
> > * 1 1 register in vcpu->arch, needs to be stored back
> > */
> > -static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> > - enum kvm_reg reg)
> > +static __always_inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> > + enum kvm_reg reg)
> > {
> > return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
> > }
> >
> > -static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> > - enum kvm_reg reg)
> > +static __always_inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> > + enum kvm_reg reg)
> > {
> > return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
> > }
> >
> > -static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
> > - enum kvm_reg reg)
> > +static __always_inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
> > + enum kvm_reg reg)
> > {
> > __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
> > }
> >
> > -static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> > - enum kvm_reg reg)
> > +static __always_inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> > + enum kvm_reg reg)
> > {
> > __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
> > __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
>
> You'll have to consider include/asm-generic/bitops/instrumented-non-atomic.h
> and friend, and the above should probably switch to using:
>
> arch_test_bit(), arch___set_bit() resp.
>
> to avoid the explicit instrumentation.
Well that's just mean. I'll figure out a solution, thanks for the heads up!