VT-d Posted-Interrupts is an enhancement to CPU side Posted-Interrupt.
With VT-d Posted-Interrupts enabled, external interrupts from
direct-assigned devices can be delivered to guests without VMM
intervention when guest is running in non-root mode.
You can find the VT-d Posted-Interrtups Spec. in the following URL:
http://www.intel.com/content/www/us/en/intelligent-systems/intel-technology/vt-directed-io-spec.html
This series implement some prerequisite parts for VT-d posted-interrupts. It was part of
http://thread.gmane.org/gmane.linux.kernel.iommu/7708. To make things clear, I will divide
the whole series which contain multiple components into three parts:
- prerequisite changes (included in this series)
- IOMMU part (v4 was reviewed, some comments need to be addressed)
- KVM and VFIO parts (will send out this part once the first two parts are accepted)
This series is rebased on the x86-apic branch of tip tree.
v4 --> v5:
- Move the declaration of "irq_chip_set_vcpu_affinity_parent()" to [1/3].
- Use the accessor to get "struct irq_data", "struct irq_chip".
- Use "irq_get_desc_lock()" instead of "irq_to_desc()".
- Declare "wakeup_handler_callback" in "asm/irq.h".
- Use entering_ack_irq()/exiting_irq() in smp_kvm_posted_intr_wakeup_ipi().
Feng Wu (2):
x86, irq: Implement irq_set_vcpu_affinity for pci_msi_ir_controller
x86, irq: Define a global vector for VT-d Posted-Interrupts
Jiang Liu (1):
genirq: Introduce irq_set_vcpu_affinity() to target an interrupt to a
VCPU
arch/x86/include/asm/entry_arch.h | 2 ++
arch/x86/include/asm/hardirq.h | 1 +
arch/x86/include/asm/hw_irq.h | 2 ++
arch/x86/include/asm/irq.h | 4 ++++
arch/x86/include/asm/irq_vectors.h | 1 +
arch/x86/kernel/apic/msi.c | 1 +
arch/x86/kernel/entry_64.S | 2 ++
arch/x86/kernel/irq.c | 23 +++++++++++++++++++++++
arch/x86/kernel/irqinit.c | 2 ++
include/linux/irq.h | 6 ++++++
kernel/irq/chip.c | 14 ++++++++++++++
kernel/irq/manage.c | 21 +++++++++++++++++++++
12 files changed, 79 insertions(+)
--
2.1.0
From: Jiang Liu <[email protected]>
With Posted-Interrupts support in Intel CPU and IOMMU, an external
interrupt from assigned-devices could be directly delivered to a
virtual CPU in a virtual machine. Instead of hacking KVM and Intel
IOMMU drivers, we propose a platform independent interface to target
an interrupt to a specific virtual CPU in a virtual machine, or set
virtual CPU affinity for an interrupt.
By adopting this new interface and the hierarchy irqdomain, we could
easily support posted-interrupts on Intel platforms, and also provide
flexible enough interfaces for other platforms to support similar
features.
Here is the usage scenario for this interface:
Guest update MSI/MSI-X interrupt configuration
-->QEMU and KVM handle this
-->KVM call this interface (passing posted interrupts descriptor
and guest vector)
-->irq core will transfer the control to IOMMU
-->IOMMU will do the real work of updating IRTE (IRTE has new
format for VT-d Posted-Interrupts)
Signed-off-by: Jiang Liu <[email protected]>
Signed-off-by: Feng Wu <[email protected]>
---
include/linux/irq.h | 6 ++++++
kernel/irq/chip.c | 14 ++++++++++++++
kernel/irq/manage.c | 21 +++++++++++++++++++++
3 files changed, 41 insertions(+)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 62c6901..cb688fb 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -327,6 +327,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
* @irq_write_msi_msg: optional to write message content for MSI
* @irq_get_irqchip_state: return the internal state of an interrupt
* @irq_set_irqchip_state: set the internal state of a interrupt
+ * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine
* @flags: chip specific flags
*/
struct irq_chip {
@@ -369,6 +370,8 @@ struct irq_chip {
int (*irq_get_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool *state);
int (*irq_set_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool state);
+ int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info);
+
unsigned long flags;
};
@@ -422,6 +425,7 @@ extern void irq_cpu_online(void);
extern void irq_cpu_offline(void);
extern int irq_set_affinity_locked(struct irq_data *data,
const struct cpumask *cpumask, bool force);
+extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info);
#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
void irq_move_irq(struct irq_data *data);
@@ -467,6 +471,8 @@ extern int irq_chip_set_affinity_parent(struct irq_data *data,
const struct cpumask *dest,
bool force);
extern int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on);
+extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data,
+ void *vcpu_info);
#endif
/* Handling of unhandled and spurious interrupts: */
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index eb9a4ea..55016b2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -950,6 +950,20 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
}
/**
+ * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @dest: The vcpu affinity information
+ */
+int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_vcpu_affinity)
+ return data->chip->irq_set_vcpu_affinity(data, vcpu_info);
+
+ return -ENOSYS;
+}
+
+/**
* irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
* @data: Pointer to interrupt specific data
* @on: Whether to set or reset the wake-up capability of this irq
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e68932b..7167825 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -256,6 +256,27 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
}
EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+ struct irq_data *data;
+ struct irq_chip *chip;
+ int ret = -ENOSYS;
+
+ if (!desc)
+ return -EINVAL;
+
+ data = irq_desc_get_irq_data(desc);
+ chip = irq_data_get_irq_chip(data);
+ if (chip && chip->irq_set_vcpu_affinity)
+ ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
+ irq_put_desc_unlock(desc, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
+
static void irq_affinity_notify(struct work_struct *work)
{
struct irq_affinity_notify *notify =
--
2.1.0
Implement irq_set_vcpu_affinity for pci_msi_ir_controller.
Signed-off-by: Feng Wu <[email protected]>
Reviewed-by: Jiang Liu <[email protected]>
---
arch/x86/kernel/apic/msi.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 58fde66..d2d95e2 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -152,6 +152,7 @@ static struct irq_chip pci_msi_ir_controller = {
.irq_mask = pci_msi_mask_irq,
.irq_ack = irq_chip_ack_parent,
.irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
--
2.1.0
Currently, we use a global vector as the Posted-Interrupts
Notification Event for all the vCPUs in the system. We need
to introduce another global vector for VT-d Posted-Interrtups,
which will be used to wakeup the sleep vCPU when an external
interrupt from a direct-assigned device happens for that vCPU.
Signed-off-by: Feng Wu <[email protected]>
Suggested-by: Yang Zhang <[email protected]>
Acked-by: H. Peter Anvin <[email protected]>
---
arch/x86/include/asm/entry_arch.h | 2 ++
arch/x86/include/asm/hardirq.h | 1 +
arch/x86/include/asm/hw_irq.h | 2 ++
arch/x86/include/asm/irq.h | 4 ++++
arch/x86/include/asm/irq_vectors.h | 1 +
arch/x86/kernel/entry_64.S | 2 ++
arch/x86/kernel/irq.c | 23 +++++++++++++++++++++++
arch/x86/kernel/irqinit.c | 2 ++
8 files changed, 37 insertions(+)
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index dc5fa66..27ca0af 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -23,6 +23,8 @@ BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
#ifdef CONFIG_HAVE_KVM
BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR,
smp_kvm_posted_intr_ipi)
+BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR,
+ smp_kvm_posted_intr_wakeup_ipi)
#endif
/*
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 0f5fb6b..9866065 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,6 +14,7 @@ typedef struct {
#endif
#ifdef CONFIG_HAVE_KVM
unsigned int kvm_posted_intr_ipis;
+ unsigned int kvm_posted_intr_wakeup_ipis;
#endif
unsigned int x86_platform_ipis; /* arch dependent */
unsigned int apic_perf_irqs;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 1f88e71..6ffc847 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,7 @@
extern asmlinkage void apic_timer_interrupt(void);
extern asmlinkage void x86_platform_ipi(void);
extern asmlinkage void kvm_posted_intr_ipi(void);
+extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
extern asmlinkage void error_interrupt(void);
extern asmlinkage void irq_work_interrupt(void);
@@ -92,6 +93,7 @@ extern void trace_call_function_single_interrupt(void);
#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt
#define trace_reboot_interrupt reboot_interrupt
#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
+#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi
#endif /* CONFIG_TRACING */
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index a80cbb8..16bfe4e 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -30,6 +30,10 @@ extern void fixup_irqs(void);
extern void irq_force_complete_move(int);
#endif
+#ifdef CONFIG_HAVE_KVM
+ extern void (*wakeup_handler_callback)(void);
+#endif
+
extern void (*x86_platform_ipi_callback)(void);
extern void native_init_IRQ(void);
extern bool handle_irq(unsigned irq, struct pt_regs *regs);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index b26cb12..dca94f2 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -105,6 +105,7 @@
/* Vector for KVM to deliver posted interrupt IPI */
#ifdef CONFIG_HAVE_KVM
#define POSTED_INTR_VECTOR 0xf2
+#define POSTED_INTR_WAKEUP_VECTOR 0xf1
#endif
/*
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c7b2384..177feec 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -919,6 +919,8 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
#ifdef CONFIG_HAVE_KVM
apicinterrupt3 POSTED_INTR_VECTOR \
kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
+apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR \
+ kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
#endif
#ifdef CONFIG_X86_MCE_THRESHOLD
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e5952c2..87620bf 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -237,6 +237,9 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs)
}
#ifdef CONFIG_HAVE_KVM
+void (*wakeup_handler_callback)(void);
+EXPORT_SYMBOL_GPL(wakeup_handler_callback);
+
/*
* Handler for POSTED_INTERRUPT_VECTOR.
*/
@@ -256,6 +259,26 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
set_irq_regs(old_regs);
}
+
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_ack_irq();
+
+ inc_irq_stat(kvm_posted_intr_wakeup_ipis);
+
+ if (wakeup_handler_callback)
+ wakeup_handler_callback();
+
+ exiting_irq();
+
+ set_irq_regs(old_regs);
+}
+
#endif
__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index cd10a64..895941d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -144,6 +144,8 @@ static void __init apic_intr_init(void)
#ifdef CONFIG_HAVE_KVM
/* IPI for KVM to deliver posted interrupt */
alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
+ /* IPI for KVM to deliver interrupt to wake up tasks */
+ alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi);
#endif
/* IPI vectors for APIC spurious and error interrupts */
--
2.1.0
On Mon, 18 May 2015, Feng Wu wrote:
> diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
> index 0f5fb6b..9866065 100644
> --- a/arch/x86/include/asm/hardirq.h
> +++ b/arch/x86/include/asm/hardirq.h
> @@ -14,6 +14,7 @@ typedef struct {
> #endif
> #ifdef CONFIG_HAVE_KVM
> unsigned int kvm_posted_intr_ipis;
> + unsigned int kvm_posted_intr_wakeup_ipis;
So now we have another IPI with statistics and nothing which makes it
accessible. kvm_posted_intr_ipis lacks a printout in
arch_show_interrupts() as well.
> #ifdef CONFIG_HAVE_KVM
> +void (*wakeup_handler_callback)(void);
> +EXPORT_SYMBOL_GPL(wakeup_handler_callback);
> +
The naming sucks. Which wakeup?
As this is kvm specific, it should have a kvm_ prefix. And it should
tell what it actually does:
kvm_posted_intr_wakeup_handler
Hmm?
> /*
> * Handler for POSTED_INTERRUPT_VECTOR.
> */
> @@ -256,6 +259,26 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
>
> set_irq_regs(old_regs);
> }
> +
> +/*
> + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
> + */
> +__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
> +{
> + struct pt_regs *old_regs = set_irq_regs(regs);
> +
> + entering_ack_irq();
> +
> + inc_irq_stat(kvm_posted_intr_wakeup_ipis);
> +
> + if (wakeup_handler_callback)
> + wakeup_handler_callback();
Why do we need a conditional here?
staic void dummy_handler(void) { }
static void *kvm_posted_intr_wakeup_handler = dummy_handler;
void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
{
if (handler)
kvm_posted_intr_wakeup_handler = handler;
else
kvm_posted_intr_wakeup_handler = dummy_handler;
}
avoids the conditional in the exception handler....
Thanks,
tglx
> -----Original Message-----
> From: Thomas Gleixner [mailto:[email protected]]
> Sent: Monday, May 18, 2015 10:18 PM
> To: Wu, Feng
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]
> Subject: Re: [v5 3/3] x86, irq: Define a global vector for VT-d Posted-Interrupts
>
> On Mon, 18 May 2015, Feng Wu wrote:
> > diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
> > index 0f5fb6b..9866065 100644
> > --- a/arch/x86/include/asm/hardirq.h
> > +++ b/arch/x86/include/asm/hardirq.h
> > @@ -14,6 +14,7 @@ typedef struct {
> > #endif
> > #ifdef CONFIG_HAVE_KVM
> > unsigned int kvm_posted_intr_ipis;
> > + unsigned int kvm_posted_intr_wakeup_ipis;
>
> So now we have another IPI with statistics and nothing which makes it
> accessible. kvm_posted_intr_ipis lacks a printout in
> arch_show_interrupts() as well.
>
I will add the printouts for these two IPIs.
> > #ifdef CONFIG_HAVE_KVM
> > +void (*wakeup_handler_callback)(void);
> > +EXPORT_SYMBOL_GPL(wakeup_handler_callback);
> > +
>
> The naming sucks. Which wakeup?
>
> As this is kvm specific, it should have a kvm_ prefix. And it should
> tell what it actually does:
>
> kvm_posted_intr_wakeup_handler
>
> Hmm?
Good suggestion!
>
> > /*
> > * Handler for POSTED_INTERRUPT_VECTOR.
> > */
> > @@ -256,6 +259,26 @@ __visible void smp_kvm_posted_intr_ipi(struct
> pt_regs *regs)
> >
> > set_irq_regs(old_regs);
> > }
> > +
> > +/*
> > + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
> > + */
> > +__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
> > +{
> > + struct pt_regs *old_regs = set_irq_regs(regs);
> > +
> > + entering_ack_irq();
> > +
> > + inc_irq_stat(kvm_posted_intr_wakeup_ipis);
> > +
> > + if (wakeup_handler_callback)
> > + wakeup_handler_callback();
>
> Why do we need a conditional here?
>
> staic void dummy_handler(void) { }
> static void *kvm_posted_intr_wakeup_handler = dummy_handler;
>
> void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
> {
> if (handler)
> kvm_posted_intr_wakeup_handler = handler;
> else
> kvm_posted_intr_wakeup_handler = dummy_handler;
> }
>
> avoids the conditional in the exception handler....
Got it. Conditional branch in a critical path is really a bad thing...
Thanks,
Feng
>
> Thanks,
>
> tglx