VT-d Posted-Interrupts is an enhancement to CPU side Posted-Interrupt.
With VT-d Posted-Interrupts enabled, external interrupts from
direct-assigned devices can be delivered to guests without VMM
intervention when guest is running in non-root mode.
You can find the VT-d Posted-Interrupts Spec. at the following URL:
http://www.intel.com/content/www/us/en/intelligent-systems/intel-technology/vt-directed-io-spec.html
v1->v2:
* Use VFIO framework to enable this feature, the VFIO part of this series is
based on Eric's patch "[PATCH v3 0/8] KVM-VFIO IRQ forward control"
* Rebase this patchset on git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git,
then revise some irq logic based on the new hierarchy irqdomain patches provided
by Jiang Liu <[email protected]>
This patch series is made of the following groups:
1-6: Some preparation changes in iommu and irq component, this is based on the
new hierarchy irqdomain logic.
7-9, 25: IOMMU changes for VT-d Posted-Interrupts, such as, feature detection,
command line parameter.
10-16, 21-24: Changes related to KVM itself.
17-19: Changes in VFIO component, this part was previously sent out as
"[RFC PATCH v2 0/2] kvm-vfio: implement the vfio skeleton for VT-d Posted-Interrupts"
20: x86 irq related changes
Feng Wu (25):
genirq: Introduce irq_set_vcpu_affinity() to target an interrupt to a
VCPU
iommu: Add new member capability to struct irq_remap_ops
iommu, x86: Define new irte structure for VT-d Posted-Interrupts
iommu, x86: Implement irq_set_vcpu_affinity for intel_ir_chip
x86, irq: Implement irq_set_vcpu_affinity for pci_msi_ir_controller
iommu, x86: No need to migrating irq for VT-d Posted-Interrupts
iommu, x86: Add cap_pi_support() to detect VT-d PI capability
iommu, x86: Add intel_irq_remapping_capability() for Intel
iommu, x86: define irq_remapping_cap()
KVM: change struct pi_desc for VT-d Posted-Interrupts
KVM: Add some helper functions for Posted-Interrupts
KVM: Initialize VT-d Posted-Interrupts Descriptor
KVM: Define a new interface kvm_find_dest_vcpu() for VT-d PI
KVM: Get Posted-Interrupts descriptor address from struct kvm_vcpu
KVM: Make struct kvm_irq_routing_table accessible
KVM: make kvm_set_msi_irq() public
KVM: kvm-vfio: User API for VT-d Posted-Interrupts
KVM: kvm-vfio: implement the VFIO skeleton for VT-d Posted-Interrupts
KVM: x86: kvm-vfio: VT-d posted-interrupts setup
x86, irq: Define a global vector for VT-d Posted-Interrupts
KVM: Update Posted-Interrupts descriptor during vCPU scheduling
KVM: Change NDST field after vCPU scheduling
KVM: Add the handler for Wake-up Vector
KVM: Suppress posted-interrupt when 'SN' is set
iommu/vt-d: Add a command line parameter for VT-d posted-interrupts
Documentation/kernel-parameters.txt | 1 +
Documentation/virtual/kvm/devices/vfio.txt | 9 +
arch/x86/include/asm/entry_arch.h | 2 +
arch/x86/include/asm/hardirq.h | 1 +
arch/x86/include/asm/hw_irq.h | 2 +
arch/x86/include/asm/irq_remapping.h | 11 ++
arch/x86/include/asm/irq_vectors.h | 1 +
arch/x86/include/asm/kvm_host.h | 14 ++
arch/x86/kernel/apic/msi.c | 1 +
arch/x86/kernel/entry_64.S | 2 +
arch/x86/kernel/irq.c | 27 +++
arch/x86/kernel/irqinit.c | 2 +
arch/x86/kvm/Makefile | 2 +-
arch/x86/kvm/kvm_vfio_x86.c | 68 ++++++++
arch/x86/kvm/vmx.c | 251 +++++++++++++++++++++++++++-
arch/x86/kvm/x86.c | 38 ++++-
drivers/iommu/intel_irq_remapping.c | 64 +++++++
drivers/iommu/irq_remapping.c | 24 +++-
drivers/iommu/irq_remapping.h | 8 +
include/linux/dmar.h | 32 ++++
include/linux/intel-iommu.h | 1 +
include/linux/irq.h | 7 +
include/linux/kvm_host.h | 43 +++++
include/uapi/linux/kvm.h | 10 +
kernel/irq/chip.c | 14 ++
kernel/irq/manage.c | 20 +++
virt/kvm/irq_comm.c | 43 +++++-
virt/kvm/irqchip.c | 11 --
virt/kvm/kvm_main.c | 14 ++
virt/kvm/vfio.c | 103 ++++++++++++
30 files changed, 799 insertions(+), 27 deletions(-)
create mode 100644 arch/x86/kvm/kvm_vfio_x86.c
This patch adds a new member capability to struct irq_remap_ops,
this new function ops can be used to check whether some
features are supported, such as VT-d Posted-Interrupts.
Signed-off-by: Feng Wu <[email protected]>
Reviewed-by: Jiang Liu <[email protected]>
---
arch/x86/include/asm/irq_remapping.h | 4 ++++
drivers/iommu/irq_remapping.h | 4 ++++
2 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 6ba2431..f67ae08 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -31,6 +31,10 @@ struct irq_alloc_info;
#ifdef CONFIG_IRQ_REMAP
+/*
+ * Optional features that can be queried through the 'capability'
+ * callback of struct irq_remap_ops.
+ */
+enum irq_remap_cap {
+ /* Interrupt posting, e.g. VT-d Posted-Interrupts */
+ IRQ_POSTING_CAP = 0,
+};
+
extern void setup_irq_remapping_ops(void);
extern int irq_remapping_supported(void);
extern void set_irq_remapping_broken(void);
diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h
index 4bd791d..2d991b2 100644
--- a/drivers/iommu/irq_remapping.h
+++ b/drivers/iommu/irq_remapping.h
@@ -28,6 +28,7 @@ struct irq_data;
struct msi_msg;
struct irq_domain;
struct irq_alloc_info;
+enum irq_remap_cap;
extern int disable_irq_remap;
extern int irq_remap_broken;
@@ -39,6 +40,9 @@ struct irq_remap_ops {
/* Check whether Interrupt Remapping is supported */
int (*supported)(void);
+ /* Check some capability is supported */
+ bool (*capability)(enum irq_remap_cap);
+
/* Initializes hardware and makes it ready for remapping interrupts */
int (*prepare)(void);
--
1.7.1
Add a new irte_pi structure for VT-d Posted-Interrupts.
Signed-off-by: Feng Wu <[email protected]>
Reviewed-by: Jiang Liu <[email protected]>
---
include/linux/dmar.h | 32 ++++++++++++++++++++++++++++++++
1 files changed, 32 insertions(+), 0 deletions(-)
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 8473756..c7f9cda 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -212,6 +212,38 @@ struct irte {
};
};
+/*
+ * Posted-interrupt view of an interrupt remapping table entry: two
+ * 64-bit words, each accessible as bitfields or as a raw value
+ * ('low' / 'high').  The bitfield widths of each word add up to 64.
+ */
+struct irte_pi {
+ union {
+ struct {
+ /*
+  * pst:    set when the entry is used for posting
+  * vector: vector delivered to the guest
+  * pda_l:  bits 31:6 of the Posted-Interrupt descriptor address
+  * urg:    presumably the "urgent" flag - TODO confirm against the spec
+  */
+ __u64 present : 1,
+ fpd : 1,
+ __reserved_1 : 6,
+ avail : 4,
+ __reserved_2 : 2,
+ urg : 1,
+ pst : 1,
+ vector : 8,
+ __reserved_3 : 14,
+ pda_l : 26;
+ };
+ __u64 low;
+ };
+
+ union {
+ struct {
+ /* pda_h: bits 63:32 of the Posted-Interrupt descriptor address */
+ __u64 sid : 16,
+ sq : 2,
+ svt : 2,
+ __reserved_4 : 12,
+ pda_h : 32;
+ };
+ __u64 high;
+ };
+};
+
+/* Widths, in bits, of the pda_l and pda_h fields of struct irte_pi. */
+#define PDA_LOW_BIT 26
+#define PDA_HIGH_BIT 32
+
enum {
IRQ_REMAP_XAPIC_MODE,
IRQ_REMAP_X2APIC_MODE,
--
1.7.1
Implement irq_set_vcpu_affinity for intel_ir_chip.
Signed-off-by: Feng Wu <[email protected]>
Reviewed-by: Jiang Liu <[email protected]>
---
arch/x86/include/asm/irq_remapping.h | 5 +++++
drivers/iommu/intel_irq_remapping.c | 27 +++++++++++++++++++++++++++
2 files changed, 32 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index f67ae08..f87ac70 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -60,6 +60,11 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void)
return x86_vector_domain;
}
+/*
+ * Posting information for one interrupt.  The interrupt-remapping
+ * irq_set_vcpu_affinity implementation receives a pointer to this
+ * structure as its opaque vcpu_info argument.
+ */
+struct vcpu_data {
+ u64 pi_desc_addr; /* Physical address of PI Descriptor */
+ u32 vector; /* Guest vector of the interrupt */
+};
+
#else /* CONFIG_IRQ_REMAP */
static inline void setup_irq_remapping_ops(void) { }
diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
index f6da3b2..749cb93 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -1010,10 +1010,37 @@ static void intel_ir_compose_msi_msg(struct irq_data *irq_data,
*msg = ir_data->msi_entry;
}
+/*
+ * intel_ir_set_vcpu_affinity - switch an irq's IRTE to posted format
+ * @data:	irq_data whose chip_data is the irq's struct intel_ir_data
+ * @vcpu_info:	pointer to a struct vcpu_data carrying the guest vector and
+ *		the physical address of the Posted-Interrupt descriptor
+ *
+ * Rewrites the cached IRTE in place through its struct irte_pi view, marks
+ * it as posted ('pst') and writes it out with modify_irte().
+ *
+ * Returns 0.
+ */
+static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+{
+	struct intel_ir_data *ir_data = data->chip_data;
+	struct irte *irte = &ir_data->irte_entry;
+	struct irte_pi *irte_pi = (struct irte_pi *)irte;
+	struct vcpu_data *vcpu_pi_info = vcpu_info;
+
+	irte_pi->urg = 0;
+	irte_pi->vector = vcpu_pi_info->vector;
+
+	/*
+	 * pda_l takes descriptor address bits 31:6 and pda_h bits 63:32.
+	 * The masks are built from 64-bit constants so that the shift by
+	 * PDA_HIGH_BIT (32) stays defined even where unsigned long is
+	 * only 32 bits wide.
+	 */
+	irte_pi->pda_l = (vcpu_pi_info->pi_desc_addr >> (32 - PDA_LOW_BIT)) &
+			 ((1ULL << PDA_LOW_BIT) - 1);
+	irte_pi->pda_h = (vcpu_pi_info->pi_desc_addr >> 32) &
+			 ((1ULL << PDA_HIGH_BIT) - 1);
+
+	irte_pi->__reserved_1 = 0;
+	irte_pi->__reserved_2 = 0;
+	irte_pi->__reserved_3 = 0;
+	irte_pi->__reserved_4 = 0;
+
+	irte_pi->pst = 1;
+
+	modify_irte(&ir_data->irq_2_iommu, irte);
+
+	return 0;
+}
+
static struct irq_chip intel_ir_chip = {
.irq_ack = ir_ack_apic_edge,
.irq_set_affinity = intel_ir_set_affinity,
.irq_compose_msi_msg = intel_ir_compose_msi_msg,
+ .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
};
static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
--
1.7.1
Add helper function to detect VT-d Posted-Interrupts capability.
Signed-off-by: Feng Wu <[email protected]>
Reviewed-by: Jiang Liu <[email protected]>
---
include/linux/intel-iommu.h | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index ecaf3a9..8174ae8 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -87,6 +87,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
/*
* Decoding Capability Register
*/
+#define cap_pi_support(c) (((c) >> 59) & 1)
#define cap_read_drain(c) (((c) >> 55) & 1)
#define cap_write_drain(c) (((c) >> 54) & 1)
#define cap_max_amask_val(c) (((c) >> 48) & 0x3f)
--
1.7.1
We don't need to migrate the irqs for VT-d Posted-Interrupts here.
When 'pst' is set in the IRTE, the associated irq is posted to the
guest instead of being remapped. The destination of the interrupt
is set in the Posted-Interrupts Descriptor, and the migration
happens during vCPU scheduling.
Signed-off-by: Feng Wu <[email protected]>
Reviewed-by: Jiang Liu <[email protected]>
---
drivers/iommu/intel_irq_remapping.c | 10 ++++++++++
1 files changed, 10 insertions(+), 0 deletions(-)
diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
index 749cb93..01786a8 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -976,10 +976,20 @@ intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask,
{
struct intel_ir_data *ir_data = data->chip_data;
struct irte *irte = &ir_data->irte_entry;
+ struct irte_pi *irte_pi = (struct irte_pi *)irte;
struct irq_cfg *cfg = irqd_cfg(data);
struct irq_data *parent = data->parent_data;
int ret;
+ /*
+ * If the interrupt is for posting, it is used by guests,
+ * we cannot set irq affinity here.
+ */
+ if (irte_pi->pst == 1) {
+ pr_warn("cannot set irq affinity for posted-interrupts\n");
+ return -EBUSY;
+ }
+
ret = parent->chip->irq_set_affinity(parent, mask, force);
if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
return ret;
--
1.7.1
Add the Intel side implementation for capability in
struct irq_remap_ops.
Signed-off-by: Feng Wu <[email protected]>
Reviewed-by: Jiang Liu <[email protected]>
---
drivers/iommu/intel_irq_remapping.c | 27 +++++++++++++++++++++++++++
drivers/iommu/irq_remapping.c | 2 ++
drivers/iommu/irq_remapping.h | 4 ++++
3 files changed, 33 insertions(+), 0 deletions(-)
diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
index 01786a8..827aeff 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -651,6 +651,32 @@ error:
return -1;
}
+/*
+ * Intel implementation of the irq_remap_ops 'capability' callback.
+ *
+ * IRQ_POSTING_CAP is reported only when posting has not been disabled,
+ * interrupt remapping is enabled and every IOMMU advertises
+ * posted-interrupt support.  Any other capability is unknown: warn and
+ * report it as unsupported.
+ */
+static bool intel_irq_remapping_capability(enum irq_remap_cap cap)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+
+	if (cap != IRQ_POSTING_CAP) {
+		pr_warn("Unknown irq remapping capability.\n");
+		return false;
+	}
+
+	/* Posting disabled by the user, or no irq remapping to build on. */
+	if (disable_irq_post || !irq_remapping_enabled)
+		return false;
+
+	/* A single IOMMU without PI support rules posting out. */
+	for_each_iommu(iommu, drhd) {
+		if (!cap_pi_support(iommu->cap))
+			return false;
+	}
+
+	return true;
+}
+
static int ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope,
struct intel_iommu *iommu,
struct acpi_dmar_hardware_unit *drhd)
@@ -947,6 +973,7 @@ static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info)
struct irq_remap_ops intel_irq_remap_ops = {
.supported = intel_irq_remapping_supported,
+ .capability = intel_irq_remapping_capability,
.prepare = dmar_table_init,
.enable = intel_enable_irq_remapping,
.disable = disable_irq_remapping,
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index 3c3da04..e63e969 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -24,6 +24,8 @@ int irq_remap_broken;
int disable_sourceid_checking;
int no_x2apic_optout;
+int disable_irq_post = 1;
+
static struct irq_remap_ops *remap_ops;
static void irq_remapping_disable_io_apic(void)
diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h
index 2d991b2..cb1f46d 100644
--- a/drivers/iommu/irq_remapping.h
+++ b/drivers/iommu/irq_remapping.h
@@ -36,6 +36,8 @@ extern int disable_sourceid_checking;
extern int no_x2apic_optout;
extern int irq_remapping_enabled;
+extern int disable_irq_post;
+
struct irq_remap_ops {
/* Check whether Interrupt Remapping is supported */
int (*supported)(void);
@@ -76,6 +78,8 @@ extern void ir_ack_apic_edge(struct irq_data *data);
#define disable_irq_remap 1
#define irq_remap_broken 0
+#define disable_irq_post 1
+
#endif /* CONFIG_IRQ_REMAP */
#endif /* __IRQ_REMAPPING_H */
--
1.7.1
This patch adds some helper functions to manipulate the
Posted-Interrupts Descriptor.
Signed-off-by: Feng Wu <[email protected]>
---
arch/x86/kvm/vmx.c | 26 ++++++++++++++++++++++++++
1 files changed, 26 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index abdb84f..0b1383e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -408,6 +408,8 @@ struct nested_vmx {
};
#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
/* Posted-Interrupt Descriptor */
struct pi_desc {
u32 pir[8]; /* Posted interrupt requested */
@@ -443,6 +445,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
+/* Clear the 'SN' bit (POSTED_INTR_SN) in the descriptor's control word. */
+static void pi_clear_sn(struct pi_desc *pi_desc)
+{
+	/* No 'return <expr>;' here: the function returns void. */
+	clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+/* Set the 'SN' bit (POSTED_INTR_SN) in the descriptor's control word. */
+static void pi_set_sn(struct pi_desc *pi_desc)
+{
+	/* No 'return <expr>;' here: the function returns void. */
+	set_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+/* Return the state of the 'ON' bit (POSTED_INTR_ON) of the control word. */
+static int pi_test_on(struct pi_desc *pi_desc)
+{
+	unsigned long *ctrl = (unsigned long *)&pi_desc->control;
+
+	return test_bit(POSTED_INTR_ON, ctrl);
+}
+
+/* Return the state of the 'SN' bit (POSTED_INTR_SN) of the control word. */
+static int pi_test_sn(struct pi_desc *pi_desc)
+{
+	unsigned long *ctrl = (unsigned long *)&pi_desc->control;
+
+	return test_bit(POSTED_INTR_SN, ctrl);
+}
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
--
1.7.1
This patch initializes the VT-d Posted-Interrupts Descriptor.
Signed-off-by: Feng Wu <[email protected]>
---
arch/x86/kvm/vmx.c | 27 +++++++++++++++++++++++++++
1 files changed, 27 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0b1383e..66ca275 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -45,6 +45,7 @@
#include <asm/perf_event.h>
#include <asm/debugreg.h>
#include <asm/kexec.h>
+#include <asm/irq_remapping.h>
#include "trace.h"
@@ -4433,6 +4434,30 @@ static void ept_set_mmio_spte_mask(void)
kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
}
+/*
+ * Initialize the notification fields of a vCPU's Posted-Interrupt
+ * descriptor.  Does nothing when interrupt posting is not supported.
+ */
+static void pi_desc_init(struct vcpu_vmx *vmx)
+{
+	struct pi_desc *desc = &vmx->pi_desc;
+	unsigned int apic_id;
+
+	if (!irq_remapping_cap(IRQ_POSTING_CAP))
+		return;
+
+	pi_clear_sn(desc);
+	desc->nv = POSTED_INTR_VECTOR;
+
+	/* Physical mode for Notification Event */
+	desc->ndm = 0;
+
+	/*
+	 * With x2APIC the destination is stored as is; otherwise it is
+	 * placed in bits 15:8 of 'ndst'.
+	 */
+	apic_id = cpu_physical_id(vmx->vcpu.cpu);
+	desc->ndst = x2apic_enabled() ? apic_id : (apic_id << 8) & 0xFF00;
+}
+
/*
* Sets up the vmcs for emulated real mode.
*/
@@ -4476,6 +4501,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
+
+ pi_desc_init(vmx);
}
if (ple_gap) {
--
1.7.1
Make kvm_set_msi_irq() public, we can use this function outside.
Signed-off-by: Feng Wu <[email protected]>
---
include/linux/kvm_host.h | 2 ++
virt/kvm/irq_comm.c | 2 +-
2 files changed, 3 insertions(+), 1 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cfa85ac..5cd4420 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -785,6 +785,8 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
int kvm_request_irq_source_id(struct kvm *kvm);
void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq);
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index f3c5d69..231671a 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -106,7 +106,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
return r;
}
-static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq)
{
trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
--
1.7.1
This patch adds the kvm-vfio interface for VT-d Posted-Interrupts.
When a guest updates the MSI/MSI-X information of an assigned device,
QEMU will use the KVM_DEV_VFIO_DEVICE_POSTING_IRQ attribute to set up
the IRTE for VT-d PI. This patch implements this IRQ attribute.
Signed-off-by: Feng Wu <[email protected]>
---
include/linux/kvm_host.h | 19 ++++++++
virt/kvm/vfio.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 122 insertions(+), 0 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5cd4420..8d06678 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1134,6 +1134,25 @@ static inline int kvm_arch_vfio_set_forward(struct kvm_fwd_irq *fwd_irq,
}
#endif
+#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
+/*
+ * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * returns 0 on success, < 0 on failure
+ */
+int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq);
+#else
+/*
+ * Stub used when __KVM_HAVE_ARCH_KVM_VFIO_POSTING is not defined: there
+ * is nothing to update, so report success.  It must be 'static inline'
+ * so that including this header does not emit an unused function in
+ * every translation unit.
+ */
+static inline int kvm_arch_vfio_update_pi_irte(struct kvm *kvm,
+					       unsigned int host_irq,
+					       uint32_t guest_irq)
+{
+	return 0;
+}
+#endif
+
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index 6bc7001..5e5515f 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -446,6 +446,99 @@ out:
return ret;
}
+/*
+ * Return the number of interrupts @pdev exposes for VFIO index @irq_type:
+ * 1 for INTx when the device reports an interrupt pin, the MSI or MSI-X
+ * vector count for those indexes, and 0 for anything else.
+ */
+static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)
+{
+	u8 pin;
+
+	switch (irq_type) {
+	case VFIO_PCI_INTX_IRQ_INDEX:
+		pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
+		return pin ? 1 : 0;
+	case VFIO_PCI_MSI_IRQ_INDEX:
+		return pci_msi_vec_count(pdev);
+	case VFIO_PCI_MSIX_IRQ_INDEX:
+		return pci_msix_vec_count(pdev);
+	default:
+		return 0;
+	}
+}
+
+/*
+ * kvm_vfio_set_pi - handle KVM_DEV_VFIO_DEVICE_POSTING_IRQ
+ * @kdev: the kvm-vfio device
+ * @argp: user pointer to a struct kvm_vfio_dev_irq header followed by
+ *	  'count' 32-bit gsi values
+ *
+ * Looks up the VFIO device named by the header, checks that it is a PCI
+ * device and that [start, start + count) lies within the interrupts of
+ * the requested index, then calls kvm_arch_vfio_update_pi_irte() for
+ * every matching MSI descriptor.
+ *
+ * Returns 0 on success, -EFAULT or -EINVAL on bad arguments, or the error
+ * from the device lookup / memdup_user().
+ */
+static int kvm_vfio_set_pi(struct kvm_device *kdev, int32_t __user *argp)
+{
+	struct kvm_vfio_dev_irq pi_info;
+	uint32_t *gsi;
+	unsigned long minsz;
+	struct vfio_device *vdev;
+	struct msi_desc *entry;
+	struct device *dev;
+	struct pci_dev *pdev;
+	int i, max, ret;
+
+	minsz = offsetofend(struct kvm_vfio_dev_irq, count);
+
+	if (copy_from_user(&pi_info, (void __user *)argp, minsz))
+		return -EFAULT;
+
+	if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)
+		return -EINVAL;
+
+	vdev = kvm_vfio_get_vfio_device(pi_info.fd);
+	if (IS_ERR(vdev))
+		return PTR_ERR(vdev);
+
+	dev = kvm_vfio_external_base_device(vdev);
+	if (!dev || !dev_is_pci(dev)) {
+		ret = -EFAULT;
+		goto put_vfio_device;
+	}
+
+	pdev = to_pci_dev(dev);
+
+	max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);
+	if (max <= 0) {
+		ret = -EFAULT;
+		goto put_vfio_device;
+	}
+
+	/*
+	 * All three values come from userspace.  Compare by division and
+	 * subtraction so that neither 'count * element size' nor
+	 * 'start + count' can wrap around and slip past the checks.
+	 * NOTE(review): assumes start/count are unsigned fields - confirm
+	 * against the definition of struct kvm_vfio_dev_irq.
+	 */
+	if ((pi_info.argsz - minsz) / sizeof(*gsi) < pi_info.count ||
+	    pi_info.start >= max || pi_info.count > max - pi_info.start) {
+		ret = -EINVAL;
+		goto put_vfio_device;
+	}
+
+	/* count <= max here, so the size computation cannot overflow. */
+	gsi = memdup_user((void __user *)((unsigned long)argp + minsz),
+			  pi_info.count * sizeof(*gsi));
+	if (IS_ERR(gsi)) {
+		ret = PTR_ERR(gsi);
+		goto put_vfio_device;
+	}
+
+#ifdef CONFIG_PCI_MSI
+	for (i = 0; i < pi_info.count; i++) {
+		list_for_each_entry(entry, &pdev->msi_list, list) {
+			if (entry->msi_attrib.entry_nr != pi_info.start+i)
+				continue;
+
+			ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,
+							   entry->irq,
+							   gsi[i]);
+			if (ret) {
+				/* Any arch failure is reported as -EFAULT. */
+				ret = -EFAULT;
+				goto free_gsi;
+			}
+		}
+	}
+#endif
+
+	ret = 0;
+
+free_gsi:
+	kfree(gsi);
+
+put_vfio_device:
+	kvm_vfio_put_vfio_device(vdev);
+	return ret;
+}
+
static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
{
int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
@@ -456,6 +549,11 @@ static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ:
ret = kvm_vfio_control_irq_forward(kdev, attr, argp);
break;
+#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
+ case KVM_DEV_VFIO_DEVICE_POSTING_IRQ:
+ ret = kvm_vfio_set_pi(kdev, argp);
+ break;
+#endif
default:
ret = -ENXIO;
}
@@ -511,6 +609,11 @@ static int kvm_vfio_has_attr(struct kvm_device *dev,
case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ:
return 0;
#endif
+#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
+ case KVM_DEV_VFIO_DEVICE_POSTING_IRQ:
+ return 0;
+#endif
+
}
break;
}
--
1.7.1
This patch changes the NDST field of the Posted-Interrupts
Descriptor after the vCPU is scheduled to another physical
CPU.
Signed-off-by: Feng Wu <[email protected]>
---
arch/x86/kvm/vmx.c | 25 +++++++++++++++++++++++++
1 files changed, 25 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a1966b9..e71bf3b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1906,6 +1906,31 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
vmx->loaded_vmcs->cpu = cpu;
}
+
+ if (irq_remapping_cap(IRQ_POSTING_CAP) && (vcpu->cpu != cpu)) {
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ memset(&old, 0, sizeof(old));
+ memset(&new, 0, sizeof(new));
+
+ pi_set_sn(pi_desc);
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ dest = cpu_physical_id(cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+ pi_clear_sn(pi_desc);
+ }
}
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
--
1.7.1
Enable VT-d Posted-Interrupts and add a command line
parameter for it.
Signed-off-by: Feng Wu <[email protected]>
---
Documentation/kernel-parameters.txt | 1 +
drivers/iommu/irq_remapping.c | 12 ++++++++----
2 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 838f377..324b790 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1453,6 +1453,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nosid disable Source ID checking
no_x2apic_optout
BIOS x2APIC opt-out request will be ignored
+ nopost disable Interrupt Posting
iomem= Disable strict checking of access to MMIO memory
strict regions from userspace.
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index b008663..aa3cd23 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -24,7 +24,7 @@ int irq_remap_broken;
int disable_sourceid_checking;
int no_x2apic_optout;
-int disable_irq_post = 1;
+int disable_irq_post = 0;
static struct irq_remap_ops *remap_ops;
@@ -59,14 +59,18 @@ static __init int setup_irqremap(char *str)
return -EINVAL;
while (*str) {
- if (!strncmp(str, "on", 2))
+ if (!strncmp(str, "on", 2)) {
disable_irq_remap = 0;
- else if (!strncmp(str, "off", 3))
+ disable_irq_post = 0;
+ } else if (!strncmp(str, "off", 3)) {
disable_irq_remap = 1;
- else if (!strncmp(str, "nosid", 5))
+ disable_irq_post = 1;
+ } else if (!strncmp(str, "nosid", 5))
disable_sourceid_checking = 1;
else if (!strncmp(str, "no_x2apic_optout", 16))
no_x2apic_optout = 1;
+ else if (!strncmp(str, "nopost", 6))
+ disable_irq_post = 1;
str += strcspn(str, ",");
while (*str == ',')
--
1.7.1
Currently, we don't support urgent interrupts; all interrupts
are treated as non-urgent, so we cannot send a
posted-interrupt when 'SN' is set.
Signed-off-by: Feng Wu <[email protected]>
---
arch/x86/kvm/vmx.c | 11 +++++++++--
1 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index dc6fd84..6b2f3e7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4306,15 +4306,22 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int r;
+ int r, sn;
if (pi_test_and_set_pir(vector, &vmx->pi_desc))
return;
+ /*
+ * Currently, we don't support urgent interrupt, all interrupts
+ * are recognized as non-urgent interrupt, so we cannot send
+ * posted-interrupt when 'SN' is set.
+ */
+ sn = pi_test_sn(&vmx->pi_desc);
+
r = pi_test_and_set_on(&vmx->pi_desc);
kvm_make_request(KVM_REQ_EVENT, vcpu);
#ifdef CONFIG_SMP
- if (!r && (vcpu->mode == IN_GUEST_MODE))
+ if (!r && !sn && (vcpu->mode == IN_GUEST_MODE))
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
POSTED_INTR_VECTOR);
else
--
1.7.1
When a vCPU is blocked and an external interrupt from an assigned
device is delivered to it, the VT-d Posted-Interrupts mechanism
will deliver an interrupt to the associated physical CPU with the
Wake-up Vector. In its handler, we find the destination vCPU
and wake it up.
Signed-off-by: Feng Wu <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 2 +
arch/x86/kvm/vmx.c | 52 +++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/x86.c | 22 +++++++++++-----
include/linux/kvm_host.h | 3 ++
virt/kvm/kvm_main.c | 3 ++
5 files changed, 75 insertions(+), 7 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2fd85a5..76fc32d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -101,6 +101,8 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
#define ASYNC_PF_PER_VCPU 64
+extern void (*wakeup_handler_callback)(void);
+
enum kvm_reg {
VCPU_REGS_RAX = 0,
VCPU_REGS_RCX = 1,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e71bf3b..dc6fd84 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -822,6 +822,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
+/*
+ * We maintain a per-CPU linked list of blocked vCPUs, so that
+ * wakeup_handler() can find which vCPU should be woken up.  The list
+ * of each CPU is protected by that CPU's blocked_vcpu_on_cpu_lock.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+
static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
@@ -2813,6 +2820,8 @@ static int hardware_enable(void)
return -EBUSY;
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
/*
* Now we can enable the vmclear operation in kdump
@@ -9177,6 +9186,7 @@ static int vmx_vcpu_pre_block(struct kvm_vcpu *vcpu)
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old;
struct pi_desc new;
+ unsigned long flags;
if (!irq_remapping_cap(IRQ_POSTING_CAP))
return 0;
@@ -9216,9 +9226,22 @@ static int vmx_vcpu_pre_block(struct kvm_vcpu *vcpu)
/* set 'NV' to 'wakeup vector' */
new.nv = POSTED_INTR_WAKEUP_VECTOR;
+
+ /*
+ * We should save physical cpu id here, vcpu->cpu may
+ * be changed due to preemption, in that case, this
+ * do-while loop will run again.
+ */
+ vcpu->wakeup_cpu = vcpu->cpu;
} while (cmpxchg(&pi_desc->control, old.control, new.control)
!= old.control);
+ spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->wakeup_cpu), flags);
+ list_add_tail(&vcpu->blocked_vcpu_list,
+ &per_cpu(blocked_vcpu_on_cpu, vcpu->wakeup_cpu));
+ spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->wakeup_cpu), flags);
return 0;
}
@@ -9228,6 +9251,7 @@ static void vmx_vcpu_post_block(struct kvm_vcpu *vcpu)
struct pi_desc old;
struct pi_desc new;
unsigned int dest = 0;
+ unsigned long flags;
if (!irq_remapping_cap(IRQ_POSTING_CAP))
return;
@@ -9249,6 +9273,13 @@ static void vmx_vcpu_post_block(struct kvm_vcpu *vcpu)
} while (cmpxchg(&pi_desc->control, old.control, new.control)
!= old.control);
+ spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->wakeup_cpu), flags);
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->wakeup_cpu), flags);
+ vcpu->wakeup_cpu = -1;
+
pi_clear_sn(pi_desc);
}
@@ -9366,6 +9397,25 @@ static struct kvm_x86_ops vmx_x86_ops = {
.vcpu_post_block = vmx_vcpu_post_block,
};
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+/*
+ * Handler for the posted-interrupt wake-up vector, published through
+ * wakeup_handler_callback.  Walks the blocked-vCPU list of the current
+ * CPU under its lock and kicks every vCPU whose descriptor has 'ON' set.
+ *
+ * Kept static: it is reached only through the callback pointer, so the
+ * generic name does not need to be visible outside this file.
+ */
+static void wakeup_handler(void)
+{
+	struct kvm_vcpu *vcpu;
+	int cpu = smp_processor_id();
+
+	spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+	list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+			    blocked_vcpu_list) {
+		struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+		if (pi_test_on(pi_desc) == 1)
+			kvm_vcpu_kick(vcpu);
+	}
+	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
static int __init vmx_init(void)
{
int r, i, msr;
@@ -9480,6 +9530,8 @@ static int __init vmx_init(void)
update_ple_window_actual_max();
+ wakeup_handler_callback = wakeup_handler;
+
return 0;
out7:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9706984..37dd307 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6152,6 +6152,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_vcpu_reload_apic_access_page(vcpu);
}
+ /*
+ * Since posted-interrupts can be set by VT-d HW now, in this
+ * case, KVM_REQ_EVENT is not set. We move the following
+ * operations out of the if statement.
+ */
+ if (kvm_lapic_enabled(vcpu)) {
+ /*
+ * Update architecture specific hints for APIC
+ * virtual interrupt delivery.
+ */
+ if (kvm_x86_ops->hwapic_irr_update)
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ kvm_lapic_find_highest_irr(vcpu));
+ }
+
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
kvm_apic_accept_events(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
@@ -6168,13 +6183,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
- /*
- * Update architecture specific hints for APIC
- * virtual interrupt delivery.
- */
- if (kvm_x86_ops->hwapic_irr_update)
- kvm_x86_ops->hwapic_irr_update(vcpu,
- kvm_lapic_find_highest_irr(vcpu));
update_cr8_intercept(vcpu);
kvm_lapic_sync_to_vapic(vcpu);
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8d06678..a1cb764 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -239,6 +239,9 @@ struct kvm_vcpu {
unsigned long requests;
unsigned long guest_debug;
+ int wakeup_cpu;
+ struct list_head blocked_vcpu_list;
+
struct mutex mutex;
struct kvm_run *run;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1be1a45..fb3e504 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -224,6 +224,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu);
+ vcpu->wakeup_cpu = -1;
+ INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
+
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) {
r = -ENOMEM;
--
1.7.1
Update Posted-Interrupts descriptor according to the
following rules:
- Before vCPU block, set 'NV' to POSTED_INTR_WAKEUP_VECTOR
- After vCPU block, set 'NV' back to POSTED_INTR_VECTOR
Signed-off-by: Feng Wu <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 5 ++
arch/x86/kvm/vmx.c | 83 +++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/x86.c | 16 +++++++
virt/kvm/kvm_main.c | 11 +++++
4 files changed, 115 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6878429..2fd85a5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -775,6 +775,8 @@ struct kvm_x86_ops {
void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
u64 (*get_pi_desc_addr)(struct kvm_vcpu *vcpu);
+ int (*vcpu_pre_block)(struct kvm_vcpu *vcpu);
+ void (*vcpu_post_block)(struct kvm_vcpu *vcpu);
};
struct kvm_arch_async_pf {
@@ -1100,4 +1102,7 @@ void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
bool kvm_find_dest_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
+int kvm_arch_vcpu_pre_block(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_post_block(struct kvm_vcpu *vcpu);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81f239b..a1966b9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9147,6 +9147,86 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
shrink_ple_window(vcpu);
}
+/*
+ * vmx_vcpu_pre_block - prepare the posted-interrupt descriptor before the
+ * vCPU blocks.
+ *
+ * Returns 0 without touching the descriptor when IRQ_POSTING_CAP is not
+ * available. Otherwise atomically clears 'SN' and switches 'NV' to
+ * POSTED_INTR_WAKEUP_VECTOR and returns 0. Returns 1, after requesting
+ * KVM_REQ_EVENT, when 'ON' is already set; the caller then skips blocking.
+ */
+static int vmx_vcpu_pre_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old;
+ struct pi_desc new;
+
+ if (!irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ memset(&old, 0, sizeof(old));
+ memset(&new, 0, sizeof(new));
+
+ /*
+ * Snapshot the control word, modify the copy, and retry the
+ * cmpxchg until no concurrent update slipped in between.
+ */
+ do {
+ old.control = new.control = pi_desc->control;
+
+ /*
+ * A posted-interrupt happened in one of the
+ * following two cases:
+ * 1. After the latest pir-to-virr sync operation
+ * in kvm_arch_vcpu_runnable() function
+ * 2. In this do-while() loop, a posted-interrupt
+ * occurs.
+ *
+ * For either of above cases, we should not block
+ * the VCPU.
+ */
+ if (pi_test_on(pi_desc) == 1) {
+ /*
+ * Need to set this flag, then the interrupt will
+ * be synced from PIR to vIRR before VM-ENTRY.
+ * In fact, for guest IPI case, in function
+ * vmx_deliver_posted_interrupt(), this flag
+ * has already been set, but if the interrupt
+ * is injected by VT-d PI hardware, we need
+ * to set this.
+ */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 1;
+ }
+
+ /* only the local copy is changed here; cmpxchg publishes it */
+ pi_clear_sn(&new);
+
+ /* set 'NV' to 'wakeup vector' */
+ new.nv = POSTED_INTR_WAKEUP_VECTOR;
+ } while (cmpxchg(&pi_desc->control, old.control, new.control)
+ != old.control);
+
+ return 0;
+}
+
+/*
+ * vmx_vcpu_post_block - restore the posted-interrupt descriptor after the
+ * vCPU has been unblocked.
+ *
+ * Does nothing when IRQ_POSTING_CAP is not available. Otherwise sets 'SN',
+ * atomically rewrites 'NDST' from the physical APIC id of vcpu->cpu and
+ * 'NV' back to POSTED_INTR_VECTOR, then clears 'SN' again.
+ */
+static void vmx_vcpu_post_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old;
+ struct pi_desc new;
+ unsigned int dest = 0;
+
+ if (!irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ /*
+ * NOTE(review): 'SN' is presumably "suppress notification", held
+ * while NDST/NV are being rewritten - confirm against the VT-d spec.
+ */
+ pi_set_sn(pi_desc);
+
+ /* retry until the control word is swapped without interference */
+ do {
+ old.control = new.control = pi_desc->control;
+
+ dest = cpu_physical_id(vcpu->cpu);
+
+ /* outside x2apic mode the id is placed in bits 15:8 of NDST */
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (cmpxchg(&pi_desc->control, old.control, new.control)
+ != old.control);
+
+ pi_clear_sn(pi_desc);
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -9256,6 +9336,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.sched_in = vmx_sched_in,
.get_pi_desc_addr = vmx_get_pi_desc_addr,
+
+ .vcpu_pre_block = vmx_vcpu_pre_block,
+ .vcpu_post_block = vmx_vcpu_post_block,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0033df3..9706984 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7731,6 +7731,22 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
+/*
+ * Arch hook run before a vCPU blocks: forwards to the vendor
+ * vcpu_pre_block callback and returns its result, or returns 0 when no
+ * callback is installed.
+ */
+int kvm_arch_vcpu_pre_block(struct kvm_vcpu *vcpu)
+{
+	if (!kvm_x86_ops->vcpu_pre_block)
+		return 0;
+
+	return kvm_x86_ops->vcpu_pre_block(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_vcpu_pre_block);
+
+/*
+ * Arch hook run after a vCPU has been blocked: forwards to the vendor
+ * vcpu_post_block callback when one is installed, otherwise does nothing.
+ */
+void kvm_arch_vcpu_post_block(struct kvm_vcpu *vcpu)
+{
+	if (!kvm_x86_ops->vcpu_post_block)
+		return;
+
+	kvm_x86_ops->vcpu_post_block(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_vcpu_post_block);
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 25ffac9..1be1a45 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1754,7 +1754,18 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
if (signal_pending(current))
break;
+#ifdef CONFIG_X86
+ if (kvm_arch_vcpu_pre_block(vcpu) == 1) {
+ kvm_make_request(KVM_REQ_UNHALT, vcpu);
+ break;
+ }
+#endif
+
schedule();
+
+#ifdef CONFIG_X86
+ kvm_arch_vcpu_post_block(vcpu);
+#endif
}
finish_wait(&vcpu->wq, &wait);
--
1.7.1
This patch defines the macro __KVM_HAVE_ARCH_KVM_VFIO_POSTING and
implements kvm_arch_vfio_update_pi_irte for the x86 architecture.
Signed-off-by: Feng Wu <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 2 +
arch/x86/kvm/Makefile | 2 +-
arch/x86/kvm/kvm_vfio_x86.c | 68 +++++++++++++++++++++++++++++++++++++++
3 files changed, 71 insertions(+), 1 deletions(-)
create mode 100644 arch/x86/kvm/kvm_vfio_x86.c
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9b45b78..6878429 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -82,6 +82,8 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}
+#define __KVM_HAVE_ARCH_KVM_VFIO_POSTING
+
#define SELECTOR_TI_MASK (1 << 2)
#define SELECTOR_RPL_MASK 0x03
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 25d22b2..8809d58 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o cpuid.o pmu.o
+ i8254.o cpuid.o pmu.o kvm_vfio_x86.o
kvm-intel-y += vmx.o
kvm-amd-y += svm.o
diff --git a/arch/x86/kvm/kvm_vfio_x86.c b/arch/x86/kvm/kvm_vfio_x86.c
new file mode 100644
index 0000000..c59a31a
--- /dev/null
+++ b/arch/x86/kvm/kvm_vfio_x86.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2014 Intel Corporation.
+ * Authors: Feng Wu <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/irq_remapping.h>
+
+/*
+ * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ *
+ * Walks the MSI routing entries mapped to @guest_irq. For every entry for
+ * which kvm_find_dest_vcpu() reports a destination vCPU, the vCPU's PI
+ * descriptor address and the guest vector are handed to
+ * irq_set_vcpu_affinity() for @host_irq.
+ *
+ * returns 0 on success, < 0 on failure (including a @guest_irq that lies
+ * outside the routing table)
+ */
+int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+				 uint32_t guest_irq)
+{
+	struct kvm_kernel_irq_routing_entry *e;
+	struct kvm_irq_routing_table *irq_rt;
+	struct kvm_lapic_irq irq;
+	struct kvm_vcpu *vcpu;
+	struct vcpu_data vcpu_info;
+	int idx, ret = -EINVAL;
+
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+
+	/*
+	 * An out-of-range gsi is a caller error, not a reason to bring the
+	 * host down: fail the request instead of BUG_ON(). 'ret' still
+	 * holds -EINVAL here and 'out' drops the SRCU read lock.
+	 */
+	if (guest_irq >= irq_rt->nr_rt_entries)
+		goto out;
+
+	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+		if (e->type != KVM_IRQ_ROUTING_MSI)
+			continue;
+
+		kvm_set_msi_irq(e, &irq);
+
+		/*
+		 * VT-d PI cannot support posting multicast/broadcast
+		 * interrupts to a VCPU, we still use interrupt remapping
+		 * for these kind of interrupts.
+		 */
+		if (!kvm_find_dest_vcpu(kvm, &irq, &vcpu))
+			continue;
+
+		vcpu_info.pi_desc_addr = kvm_x86_ops->get_pi_desc_addr(vcpu);
+		vcpu_info.vector = irq.vector;
+
+		if (irq_set_vcpu_affinity(host_irq, &vcpu_info) < 0) {
+			printk(KERN_INFO "%s: failed to update PI IRTE\n",
+			       __func__);
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	srcu_read_unlock(&kvm->irq_srcu, idx);
+	return ret;
+}
--
1.7.1
Currently, we use a global vector as the Posted-Interrupts
Notification Event for all the vCPUs in the system. We need
to introduce another global vector for VT-d Posted-Interrupts,
which will be used to wake up the sleeping vCPU when an external
interrupt from a direct-assigned device happens for that vCPU.
Signed-off-by: Feng Wu <[email protected]>
---
arch/x86/include/asm/entry_arch.h | 2 ++
arch/x86/include/asm/hardirq.h | 1 +
arch/x86/include/asm/hw_irq.h | 2 ++
arch/x86/include/asm/irq_vectors.h | 1 +
arch/x86/kernel/entry_64.S | 2 ++
arch/x86/kernel/irq.c | 27 +++++++++++++++++++++++++++
arch/x86/kernel/irqinit.c | 2 ++
7 files changed, 37 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index dc5fa66..27ca0af 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -23,6 +23,8 @@ BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
#ifdef CONFIG_HAVE_KVM
BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR,
smp_kvm_posted_intr_ipi)
+BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR,
+ smp_kvm_posted_intr_wakeup_ipi)
#endif
/*
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 0f5fb6b..9866065 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,6 +14,7 @@ typedef struct {
#endif
#ifdef CONFIG_HAVE_KVM
unsigned int kvm_posted_intr_ipis;
+ unsigned int kvm_posted_intr_wakeup_ipis;
#endif
unsigned int x86_platform_ipis; /* arch dependent */
unsigned int apic_perf_irqs;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index e7ae6eb..38fac9b 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,7 @@
extern asmlinkage void apic_timer_interrupt(void);
extern asmlinkage void x86_platform_ipi(void);
extern asmlinkage void kvm_posted_intr_ipi(void);
+extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
extern asmlinkage void error_interrupt(void);
extern asmlinkage void irq_work_interrupt(void);
@@ -92,6 +93,7 @@ extern void trace_call_function_single_interrupt(void);
#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt
#define trace_reboot_interrupt reboot_interrupt
#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
+#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi
#endif /* CONFIG_TRACING */
struct irq_domain;
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index b26cb12..dca94f2 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -105,6 +105,7 @@
/* Vector for KVM to deliver posted interrupt IPI */
#ifdef CONFIG_HAVE_KVM
#define POSTED_INTR_VECTOR 0xf2
+#define POSTED_INTR_WAKEUP_VECTOR 0xf1
#endif
/*
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e61c14a..a598447 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -960,6 +960,8 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
#ifdef CONFIG_HAVE_KVM
apicinterrupt3 POSTED_INTR_VECTOR \
kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
+apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR \
+ kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
#endif
#ifdef CONFIG_X86_MCE_THRESHOLD
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 922d285..47408c3 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -237,6 +237,9 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs)
}
#ifdef CONFIG_HAVE_KVM
+void (*wakeup_handler_callback)(void) = NULL;
+EXPORT_SYMBOL_GPL(wakeup_handler_callback);
+
/*
* Handler for POSTED_INTERRUPT_VECTOR.
*/
@@ -256,6 +259,30 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
set_irq_regs(old_regs);
}
+
+/*
+ * Handler for POSTED_INTR_WAKEUP_VECTOR.
+ *
+ * Acknowledges the interrupt at the local APIC, accounts it in the
+ * kvm_posted_intr_wakeup_ipis statistic and, when a handler has been
+ * registered through wakeup_handler_callback, invokes it.
+ */
+__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ ack_APIC_irq();
+
+ irq_enter();
+
+ exit_idle();
+
+ inc_irq_stat(kvm_posted_intr_wakeup_ipis);
+
+ /* the callback pointer stays NULL until someone installs a handler */
+ if (wakeup_handler_callback)
+ wakeup_handler_callback();
+
+ irq_exit();
+
+ set_irq_regs(old_regs);
+}
+
#endif
__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 70e181e..844673c 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -144,6 +144,8 @@ static void __init apic_intr_init(void)
#ifdef CONFIG_HAVE_KVM
/* IPI for KVM to deliver posted interrupt */
alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
+ /* IPI for KVM to deliver interrupt to wake up tasks */
+ alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi);
#endif
/* IPI vectors for APIC spurious and error interrupts */
--
1.7.1
This patch adds and documents a new attribute
KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE group.
This new attribute is used for VT-d Posted-Interrupts.
When the guest OS changes the interrupt configuration for an
assigned device, such as the MSI/MSI-X data/address fields,
QEMU will use this IRQ attribute to tell KVM to update the
related IRTE according to the VT-d Posted-Interrupts Specification;
for example, the guest vector should be updated in the related IRTE.
Signed-off-by: Feng Wu <[email protected]>
---
Documentation/virtual/kvm/devices/vfio.txt | 9 +++++++++
include/uapi/linux/kvm.h | 10 ++++++++++
2 files changed, 19 insertions(+), 0 deletions(-)
diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt
index f7aff29..41e12b7 100644
--- a/Documentation/virtual/kvm/devices/vfio.txt
+++ b/Documentation/virtual/kvm/devices/vfio.txt
@@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has been called to trigger the IRQ
or associate an eventfd to it. Unforwarding can only be called while the
signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this condition is
not satisfied, the command returns an -EBUSY.
+
+ KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrupts mechanism to post
+ the IRQ to guests.
+For this attribute, kvm_device_attr.addr points to a kvm_vfio_dev_irq struct.
+
+When guest OS changes the interrupt configuration for an assigned device,
+such as, MSI/MSIx data/address fields, QEMU will use this IRQ attribute
+to tell KVM to update the related IRTE according to the VT-d Posted-Interrupts
+Specification, such as, the guest vector should be updated in the related IRTE.
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a269a42..7d98650 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -949,6 +949,7 @@ struct kvm_device_attr {
#define KVM_DEV_VFIO_DEVICE 2
#define KVM_DEV_VFIO_DEVICE_FORWARD_IRQ 1
#define KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ 2
+#define KVM_DEV_VFIO_DEVICE_POSTING_IRQ 3
enum kvm_device_type {
KVM_DEV_TYPE_FSL_MPIC_20 = 1,
@@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
__u32 gsi; /* gsi, ie. virtual IRQ number */
};
+/*
+ * Argument of KVM_DEV_VFIO_DEVICE_POSTING_IRQ: kvm_device_attr.addr points
+ * to one of these, followed by the variable-length gsi[] array.
+ */
+struct kvm_vfio_dev_irq {
+ __u32 argsz; /* NOTE(review): presumably total size in bytes, gsi[] included - confirm */
+ __u32 fd; /* file descriptor of the VFIO device */
+ __u32 index; /* VFIO device IRQ index */
+ __u32 start; /* NOTE(review): presumably first sub-index within 'index' - confirm */
+ __u32 count; /* NOTE(review): presumably number of entries in gsi[] - confirm */
+ __u32 gsi[]; /* gsi, ie. virtual IRQ number */
+};
+
/*
* ioctls for VM fds
*/
--
1.7.1
Define an interface to get the PI descriptor address from the vCPU structure.
Signed-off-by: Feng Wu <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/vmx.c | 12 ++++++++++++
2 files changed, 13 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7a41808..9b45b78 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -772,6 +772,7 @@ struct kvm_x86_ops {
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+ u64 (*get_pi_desc_addr)(struct kvm_vcpu *vcpu);
};
struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 66ca275..81f239b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -562,6 +562,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
+/* Return the address of the pi_desc member of the vcpu_vmx that owns @vcpu. */
+struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	return &vmx->pi_desc;
+}
+
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
@@ -4298,6 +4303,11 @@ static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
return;
}
+/* Return __pa() of @vcpu's posted-interrupt descriptor. */
+static u64 vmx_get_pi_desc_addr(struct kvm_vcpu *vcpu)
+{
+	struct pi_desc *desc = vcpu_to_pi_desc(vcpu);
+
+	return __pa((u64)desc);
+}
+
/*
* Set up the vmcs's constant host-state fields, i.e., host-state fields that
* will not change in the lifetime of the guest.
@@ -9244,6 +9254,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.check_nested_events = vmx_check_nested_events,
.sched_in = vmx_sched_in,
+
+ .get_pi_desc_addr = vmx_get_pi_desc_addr,
};
static int __init vmx_init(void)
--
1.7.1
Move struct kvm_irq_routing_table from irqchip.c to kvm_host.h,
so we can use it outside of irqchip.c.
Signed-off-by: Feng Wu <[email protected]>
---
include/linux/kvm_host.h | 19 +++++++++++++++++++
virt/kvm/irqchip.c | 11 -----------
2 files changed, 19 insertions(+), 11 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0b9659d..cfa85ac 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -335,6 +335,25 @@ struct kvm_kernel_irq_routing_entry {
struct hlist_node link;
};
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
+
+struct kvm_irq_routing_table {
+ int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
+ struct kvm_kernel_irq_routing_entry *rt_entries;
+ u32 nr_rt_entries;
+ /*
+ * Array indexed by gsi. Each entry contains list of irq chips
+ * the gsi is connected to.
+ */
+ struct hlist_head map[0];
+};
+
+#else
+
+struct kvm_irq_routing_table {};
+
+#endif
+
#ifndef KVM_PRIVATE_MEM_SLOTS
#define KVM_PRIVATE_MEM_SLOTS 0
#endif
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 7f256f3..cdf29a6 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -31,17 +31,6 @@
#include <trace/events/kvm.h>
#include "irq.h"
-struct kvm_irq_routing_table {
- int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
- struct kvm_kernel_irq_routing_entry *rt_entries;
- u32 nr_rt_entries;
- /*
- * Array indexed by gsi. Each entry contains list of irq chips
- * the gsi is connected to.
- */
- struct hlist_head map[0];
-};
-
int kvm_irq_map_gsi(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *entries, int gsi)
{
--
1.7.1
This patch defines a new interface kvm_find_dest_vcpu for
VT-d PI, which returns the destination vCPU of the
interrupt for guests.
Since VT-d PI cannot handle broadcast/multicast interrupts,
here we only handle Fixed and Lowest priority interrupts.
The current method of handling guest lowest priority interrupts
is to use a counter 'apic_arb_prio' for each vCPU: we choose the
vCPU with the smallest 'apic_arb_prio' and then increase it by 1.
However, for VT-d PI, we cannot re-use this, since we no longer
have control of 'apic_arb_prio' with posted interrupt direct
delivery by hardware.
Here, we introduce a scheme similar to 'apic_arb_prio' to handle
guest lowest priority interrupts when VT-d PI is used. Here is the
idea:
- Each vCPU has a counter 'round_robin_counter'.
- When a guest sets an interrupt to lowest priority, we choose
the vCPU with the smallest 'round_robin_counter' as the destination,
then increase it.
Signed-off-by: Feng Wu <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 4 +++
virt/kvm/irq_comm.c | 41 +++++++++++++++++++++++++++++++++++++++
2 files changed, 45 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6ed0c30..7a41808 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -358,6 +358,7 @@ struct kvm_vcpu_arch {
struct kvm_lapic *apic; /* kernel irqchip context */
unsigned long apic_attention;
int32_t apic_arb_prio;
+ int32_t round_robin_counter;
int mp_state;
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
@@ -1093,4 +1094,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
+bool kvm_find_dest_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 963b899..f3c5d69 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -317,6 +317,47 @@ out:
return r;
}
+/*
+ * Compare two vCPUs by round_robin_counter.
+ * Returns vcpu1's counter minus vcpu2's counter.
+ */
+int kvm_compare_rr_counter(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
+{
+	int32_t lhs = vcpu1->arch.round_robin_counter;
+	int32_t rhs = vcpu2->arch.round_robin_counter;
+
+	return lhs - rhs;
+}
+
+/*
+ * kvm_find_dest_vcpu - pick the single destination vCPU of @irq.
+ *
+ * For a lowest-priority interrupt, the matching vCPU with an enabled LAPIC
+ * and the smallest round_robin_counter is chosen and its counter is
+ * incremented. For any other delivery mode the lookup succeeds only when
+ * exactly one vCPU matches.
+ *
+ * Returns true with *dest_vcpu set on success, false otherwise. Note that
+ * *dest_vcpu may have been written even when false is returned.
+ */
+bool kvm_find_dest_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+			struct kvm_vcpu **dest_vcpu)
+{
+	struct kvm_vcpu *vcpu, *lowest = NULL;
+	int i, nr_matches = 0;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (!kvm_apic_present(vcpu) ||
+		    !kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+					 irq->dest_id, irq->dest_mode))
+			continue;
+
+		if (!kvm_is_dm_lowest_prio(irq)) {
+			/* remember the match and count it */
+			nr_matches++;
+			*dest_vcpu = vcpu;
+			continue;
+		}
+
+		if (!kvm_lapic_enabled(vcpu))
+			continue;
+
+		/* keep the candidate with the smallest counter so far */
+		if (!lowest || kvm_compare_rr_counter(vcpu, lowest) < 0)
+			lowest = vcpu;
+	}
+
+	if (lowest) {
+		lowest->arch.round_robin_counter++;
+		*dest_vcpu = lowest;
+		return true;
+	}
+
+	return nr_matches == 1;
+}
+
#define IOAPIC_ROUTING_ENTRY(irq) \
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
.u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
--
1.7.1
Change struct pi_desc for VT-d Posted-Interrupts.
Signed-off-by: Feng Wu <[email protected]>
---
arch/x86/kvm/vmx.c | 15 +++++++++++++--
1 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3e556c6..abdb84f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -411,8 +411,19 @@ struct nested_vmx {
/* Posted-Interrupt Descriptor */
struct pi_desc {
u32 pir[8]; /* Posted interrupt requested */
- u32 control; /* bit 0 of control is outstanding notification bit */
- u32 rsvd[7];
+ union {
+ struct {
+ u64 on : 1,
+ sn : 1,
+ rsvd_1 : 13,
+ ndm : 1,
+ nv : 8,
+ rsvd_2 : 8,
+ ndst : 32;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
} __aligned(64);
static bool pi_test_and_set_on(struct pi_desc *pi_desc)
--
1.7.1
This patch adds a new interface irq_remapping_cap() to detect
whether irq remapping supports new features, such as VT-d
Posted-Interrupts. We export this function out, so that KVM
code can check this and use this mechanism properly.
Signed-off-by: Feng Wu <[email protected]>
Reviewed-by: Jiang Liu <[email protected]>
---
arch/x86/include/asm/irq_remapping.h | 2 ++
drivers/iommu/irq_remapping.c | 12 ++++++++++++
2 files changed, 14 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index f87ac70..b3ad067 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -37,6 +37,7 @@ enum irq_remap_cap {
extern void setup_irq_remapping_ops(void);
extern int irq_remapping_supported(void);
+extern bool irq_remapping_cap(enum irq_remap_cap cap);
extern void set_irq_remapping_broken(void);
extern int irq_remapping_prepare(void);
extern int irq_remapping_enable(void);
@@ -69,6 +70,7 @@ struct vcpu_data {
static inline void setup_irq_remapping_ops(void) { }
static inline int irq_remapping_supported(void) { return 0; }
+/* Stub variant: reports every capability as unsupported. */
+static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return false; }
static inline void set_irq_remapping_broken(void) { }
static inline int irq_remapping_prepare(void) { return -ENODEV; }
static inline int irq_remapping_enable(void) { return -ENODEV; }
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index e63e969..b008663 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -103,6 +103,18 @@ int irq_remapping_supported(void)
return remap_ops->supported();
}
+/*
+ * irq_remapping_cap - query an optional capability of the irq remapping
+ * backend.
+ *
+ * Returns false when posting is disabled via disable_irq_post, or when no
+ * remap_ops or no capability callback is registered; otherwise returns
+ * whatever the backend's capability callback reports for @cap.
+ */
+bool irq_remapping_cap(enum irq_remap_cap cap)
+{
+	if (disable_irq_post || !remap_ops || !remap_ops->capability)
+		return false;
+
+	return remap_ops->capability(cap);
+}
+EXPORT_SYMBOL_GPL(irq_remapping_cap);
+
int __init irq_remapping_prepare(void)
{
if (!remap_ops || !remap_ops->prepare)
--
1.7.1
Implement irq_set_vcpu_affinity for pci_msi_ir_controller.
Signed-off-by: Feng Wu <[email protected]>
Reviewed-by: Jiang Liu <[email protected]>
---
arch/x86/kernel/apic/msi.c | 1 +
include/linux/irq.h | 3 +++
2 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index da163da..b0ed073 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -152,6 +152,7 @@ static struct irq_chip pci_msi_ir_controller = {
.irq_mask = pci_msi_mask_irq,
.irq_ack = irq_chip_ack_parent,
.irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 83abafc..5dcaa7f 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -464,6 +464,9 @@ extern void irq_chip_eoi_parent(struct irq_data *data);
extern int irq_chip_set_affinity_parent(struct irq_data *data,
const struct cpumask *dest,
bool force);
+extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data,
+ void *vcpu_info);
+
#endif
static inline void irq_chip_write_msi_msg(struct irq_data *data,
--
1.7.1
From: Jiang Liu <[email protected]>
With Posted-Interrupts support in Intel CPU and IOMMU, an external
interrupt from assigned-devices could be directly delivered to a
virtual CPU in a virtual machine. Instead of hacking KVM and Intel
IOMMU drivers, we propose a platform independent interface to target
an interrupt to a specific virtual CPU in a virtual machine, or set
virtual CPU affinity for an interrupt.
By adopting this new interface and the hierarchy irqdomain, we could
easily support posted-interrupts on Intel platforms, and also provide
flexible enough interfaces for other platforms to support similar
features.
We may also cooperate between set_affinity() and set_vcpu_affinity()
in IRQ core or irq chip drivers.
Here is the usage scenario for this interface:
Guest updates MSI/MSI-X interrupt configuration
-->QEMU and KVM handle this
-->KVM calls this interface (passing posted interrupts descriptor
and guest vector)
-->irq core will transfer the control to IOMMU
-->IOMMU will do the real work of updating IRTE (IRTE has new
format for VT-d Posted-Interrupts)
Signed-off-by: Jiang Liu <[email protected]>
Signed-off-by: Feng Wu <[email protected]>
---
include/linux/irq.h | 4 ++++
kernel/irq/chip.c | 14 ++++++++++++++
kernel/irq/manage.c | 20 ++++++++++++++++++++
3 files changed, 38 insertions(+), 0 deletions(-)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index f26e736..83abafc 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -324,6 +324,8 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
* irq_request_resources
* @irq_compose_msi_msg: optional to compose message content for MSI
* @irq_write_msi_msg: optional to write message content for MSI
+ * @irq_set_vcpu_affinity: optional to target a virtual CPU in a virtual
+ * machine
* @flags: chip specific flags
*/
struct irq_chip {
@@ -362,6 +364,7 @@ struct irq_chip {
void (*irq_compose_msi_msg)(struct irq_data *data, struct msi_msg *msg);
void (*irq_write_msi_msg)(struct irq_data *data, struct msi_msg *msg);
+ int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info);
unsigned long flags;
};
@@ -416,6 +419,7 @@ extern void irq_cpu_online(void);
extern void irq_cpu_offline(void);
extern int irq_set_affinity_locked(struct irq_data *data,
const struct cpumask *cpumask, bool force);
+extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info);
#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
void irq_move_irq(struct irq_data *data);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6f1c7a5..fe0908f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -948,6 +948,20 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
return -ENOSYS;
}
+
+/**
+ * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @vcpu_info: The vcpu affinity information
+ *
+ * Returns the result of the parent chip's irq_set_vcpu_affinity callback,
+ * or -ENOSYS when the parent chip does not provide one.
+ */
+int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_vcpu_affinity)
+ return data->chip->irq_set_vcpu_affinity(data, vcpu_info);
+
+ return -ENOSYS;
+}
#endif
/**
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8069237..bd3a1ba 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -247,6 +247,26 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
}
EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+/*
+ * irq_set_vcpu_affinity - hand vcpu affinity information to the irq chip
+ * @irq: interrupt number
+ * @vcpu_info: opaque vcpu affinity information, passed through unchanged
+ *
+ * The chip callback runs with desc->lock held and interrupts disabled.
+ *
+ * Returns -EINVAL when @irq has no descriptor, -ENOSYS when the chip has
+ * no irq_set_vcpu_affinity callback, otherwise the callback's result.
+ */
+int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_chip *chip;
+	unsigned long flags;
+	int ret = -ENOSYS;
+
+	if (!desc)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+
+	chip = desc->irq_data.chip;
+	if (chip && chip->irq_set_vcpu_affinity) {
+		struct irq_data *data = irq_desc_get_irq_data(desc);
+
+		ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
+	}
+
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
+
static void irq_affinity_notify(struct work_struct *work)
{
struct irq_affinity_notify *notify =
--
1.7.1
Hi Feng,
On 12/03/2014 08:39 AM, Feng Wu wrote:
> This patch adds and documents a new attribute
> KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE group.
> This new attribute is used for VT-d Posted-Interrupts.
>
> When guest OS changes the interrupt configuration for an
> assigned device, such as, MSI/MSIx data/address fields,
> QEMU will use this IRQ attribute to tell KVM to update the
> related IRTE according the VT-d Posted-Interrrupts Specification,
> such as, the guest vector should be updated in the related IRTE.
>
> Signed-off-by: Feng Wu <[email protected]>
> ---
> Documentation/virtual/kvm/devices/vfio.txt | 9 +++++++++
> include/uapi/linux/kvm.h | 10 ++++++++++
> 2 files changed, 19 insertions(+), 0 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt
> index f7aff29..41e12b7 100644
> --- a/Documentation/virtual/kvm/devices/vfio.txt
> +++ b/Documentation/virtual/kvm/devices/vfio.txt
> @@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has been called to trigger the IRQ
> or associate an eventfd to it. Unforwarding can only be called while the
> signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this condition is
> not satisfied, the command returns an -EBUSY.
> +
> + KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrtups mechanism to post
typo
> + the IRQ to guests.
> +For this attribute, kvm_device_attr.addr points to a kvm_vfio_dev_irq struct.
> +
> +When guest OS changes the interrupt configuration for an assigned device,
> +such as, MSI/MSIx data/address fields, QEMU will use this IRQ attribute
> +to tell KVM to update the related IRTE according the VT-d Posted-Interrrupts
> +Specification, such as, the guest vector should be updated in the related IRTE.
Out of curiosity, are there any restrictions about the instant at which
the change can be done?
I do not get here how you deactivate the posting?
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index a269a42..7d98650 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -949,6 +949,7 @@ struct kvm_device_attr {
> #define KVM_DEV_VFIO_DEVICE 2
> #define KVM_DEV_VFIO_DEVICE_FORWARD_IRQ 1
> #define KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ 2
> +#define KVM_DEV_VFIO_DEVICE_POSTING_IRQ 3
Maybe we should align our naming verb vs verbing here?
Best Regards
Eric
>
> enum kvm_device_type {
> KVM_DEV_TYPE_FSL_MPIC_20 = 1,
> @@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
> __u32 gsi; /* gsi, ie. virtual IRQ number */
> };
>
> +struct kvm_vfio_dev_irq {
> + __u32 argsz;
> + __u32 fd; /* file descriptor of the VFIO device */
> + __u32 index; /* VFIO device IRQ index */
> + __u32 start;
> + __u32 count;
> + __u32 gsi[]; /* gsi, ie. virtual IRQ number */
> +};
> +
> /*
> * ioctls for VM fds
> */
>
Hi Feng,
On 12/03/2014 08:39 AM, Feng Wu wrote:
> This patch adds the kvm-vfio interface for VT-d Posted-Interrrupts.
> When guests updates MSI/MSI-x information for an assigned-device,
update
> QEMU will use KVM_DEV_VFIO_DEVICE_POSTING_IRQ attribute to setup
> IRTE for VT-d PI. This patch implement this IRQ attribute.
s/implement/implements
>
> Signed-off-by: Feng Wu <[email protected]>
> ---
> include/linux/kvm_host.h | 19 ++++++++
> virt/kvm/vfio.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 122 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 5cd4420..8d06678 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1134,6 +1134,25 @@ static inline int kvm_arch_vfio_set_forward(struct kvm_fwd_irq *fwd_irq,
> }
> #endif
>
> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
> +/*
> + * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
> + *
> + * @kvm: kvm
> + * @host_irq: host irq of the interrupt
> + * @guest_irq: gsi of the interrupt
> + * returns 0 on success, < 0 on failure
> + */
> +int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
> + uint32_t guest_irq);
> +#else
> +static int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
> + uint32_t guest_irq)
> +{
> + return 0;
> +}
> +#endif
> +
> #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>
> static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> index 6bc7001..5e5515f 100644
> --- a/virt/kvm/vfio.c
> +++ b/virt/kvm/vfio.c
> @@ -446,6 +446,99 @@ out:
> return ret;
> }
>
> +static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)
> +{
> + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
> + u8 pin;
> +
> + pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
> + if (pin)
> + return 1;
> + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX)
> + return pci_msi_vec_count(pdev);
> + else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
> + return pci_msix_vec_count(pdev);
> +
> + return 0;
> +}
for platform case I was asked to move the retrieval of absolute irq
number to the architecture specific part. I don't know if it should
apply to PCI stuff as well? This explains why I need to pass the VFIO
device (or struct device handle) to the arch specific part. Actually we
do the same job, we provide a phys/virt IRQ mapping to KVM, right? So to
me our architecture specific API should look quite similar?
> +
> +static int kvm_vfio_set_pi(struct kvm_device *kdev, int32_t __user *argp)
> +{
> + struct kvm_vfio_dev_irq pi_info;
> + uint32_t *gsi;
> + unsigned long minsz;
> + struct vfio_device *vdev;
> + struct msi_desc *entry;
> + struct device *dev;
> + struct pci_dev *pdev;
> + int i, max, ret;
> +
> + minsz = offsetofend(struct kvm_vfio_dev_irq, count);
> +
> + if (copy_from_user(&pi_info, (void __user *)argp, minsz))
> + return -EFAULT;
> +
> + if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)
PCI specific check, same remark as above but I will let Alex further
comment on this and possibly invalidate this comment ;-)
> + return -EINVAL;
> +
> + vdev = kvm_vfio_get_vfio_device(pi_info.fd);
> + if (IS_ERR(vdev))
> + return PTR_ERR(vdev);
> +
> + dev = kvm_vfio_external_base_device(vdev);
> + if (!dev || !dev_is_pci(dev)) {
> + ret = -EFAULT;
> + goto put_vfio_device;
> + }
> +
> + pdev = to_pci_dev(dev);
> +
> + max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);
> + if (max <= 0) {
> + ret = -EFAULT;
> + goto put_vfio_device;
> + }
> +
> + if (pi_info.argsz - minsz < pi_info.count * sizeof(int) ||
shouldn't we use the actual datatype?
> + pi_info.start >= max || pi_info.start + pi_info.count > max) {
> + ret = -EINVAL;
> + goto put_vfio_device;
> + }
> +
> + gsi = memdup_user((void __user *)((unsigned long)argp + minsz),
> + pi_info.count * sizeof(int));
same question as above
> + if (IS_ERR(gsi)) {
> + ret = PTR_ERR(gsi);
> + goto put_vfio_device;
> + }
> +
> +#ifdef CONFIG_PCI_MSI
> + for (i = 0; i < pi_info.count; i++) {
> + list_for_each_entry(entry, &pdev->msi_list, list) {
> + if (entry->msi_attrib.entry_nr != pi_info.start+i)
> + continue;
> +
> + ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,
> + entry->irq,
> + gsi[i]);
> + if (ret) {
> + ret = -EFAULT;
why -EFAULT? and not propagation of original error code?
you may have posting set for part of the subindexes and unset for rest.
Isn't it an issue?
Best Regards
Eric
> + goto free_gsi;
> + }
> + }
> + }
> +#endif
> +
> + ret = 0;
> +
> +free_gsi:
> + kfree(gsi);
> +
> +put_vfio_device:
> + kvm_vfio_put_vfio_device(vdev);
> + return ret;
> +}
> +
> static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
> {
> int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
> @@ -456,6 +549,11 @@ static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
> case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ:
> ret = kvm_vfio_control_irq_forward(kdev, attr, argp);
> break;
> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
> + case KVM_DEV_VFIO_DEVICE_POSTING_IRQ:
> + ret = kvm_vfio_set_pi(kdev, argp);
> + break;
> +#endif
> default:
> ret = -ENXIO;
> }
> @@ -511,6 +609,11 @@ static int kvm_vfio_has_attr(struct kvm_device *dev,
> case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ:
> return 0;
> #endif
> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
> + case KVM_DEV_VFIO_DEVICE_POSTING_IRQ:
> + return 0;
> +#endif
> +
> }
> break;
> }
>
> -----Original Message-----
> From: Eric Auger [mailto:[email protected]]
> Sent: Thursday, December 04, 2014 11:36 PM
> To: Wu, Feng; [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]
> Cc: [email protected]; [email protected];
> [email protected]
> Subject: Re: [v2 18/25] KVM: kvm-vfio: implement the VFIO skeleton for VT-d
> Posted-Interrupts
>
> Hi Feng,
>
> On 12/03/2014 08:39 AM, Feng Wu wrote:
> > This patch adds the kvm-vfio interface for VT-d Posted-Interrrupts.
> > When guests updates MSI/MSI-x information for an assigned-device,
> update
> > QEMU will use KVM_DEV_VFIO_DEVICE_POSTING_IRQ attribute to setup
> > IRTE for VT-d PI. This patch implement this IRQ attribute.
> s/implement/implements
> >
> > Signed-off-by: Feng Wu <[email protected]>
> > ---
> > include/linux/kvm_host.h | 19 ++++++++
> > virt/kvm/vfio.c | 103
> ++++++++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 122 insertions(+), 0 deletions(-)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 5cd4420..8d06678 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1134,6 +1134,25 @@ static inline int
> kvm_arch_vfio_set_forward(struct kvm_fwd_irq *fwd_irq,
> > }
> > #endif
> >
> > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
> > +/*
> > + * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
> > + *
> > + * @kvm: kvm
> > + * @host_irq: host irq of the interrupt
> > + * @guest_irq: gsi of the interrupt
> > + * returns 0 on success, < 0 on failure
> > + */
> > +int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
> > + uint32_t guest_irq);
> > +#else
> > +static int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int
> host_irq,
> > + uint32_t guest_irq)
> > +{
> > + return 0;
> > +}
> > +#endif
> > +
> > #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
> >
> > static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool
> val)
> > diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> > index 6bc7001..5e5515f 100644
> > --- a/virt/kvm/vfio.c
> > +++ b/virt/kvm/vfio.c
> > @@ -446,6 +446,99 @@ out:
> > return ret;
> > }
> >
> > +static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)
> > +{
> > + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
> > + u8 pin;
> > +
> > + pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
> > + if (pin)
> > + return 1;
> > + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX)
> > + return pci_msi_vec_count(pdev);
> > + else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
> > + return pci_msix_vec_count(pdev);
> > +
> > + return 0;
> > +}
> for platform case I was asked to move the retrieval of absolute irq
> number to the architecture specific part. I don't know if it should
> apply to PCI stuff as well? This explains why I need to pass the VFIO
> device (or struct device handle) to the arch specific part. Actually we
> do the same job, we provide a phys/virt IRQ mapping to KVM, right? So to
> me our architecture specific API should look quite similar?
In my patch, QEMU passes IRQ type(MSI/MSIx in my case), VFIO device index,
and sub-index via "struct kvm_vfio_dev_irq" to KVM, then KVM will find the
real host irq from the VFIO device index and the IRQ type. Is this something
similar to your patch?
>
> > +
> > +static int kvm_vfio_set_pi(struct kvm_device *kdev, int32_t __user *argp)
> > +{
> > + struct kvm_vfio_dev_irq pi_info;
> > + uint32_t *gsi;
> > + unsigned long minsz;
> > + struct vfio_device *vdev;
> > + struct msi_desc *entry;
> > + struct device *dev;
> > + struct pci_dev *pdev;
> > + int i, max, ret;
> > +
> > + minsz = offsetofend(struct kvm_vfio_dev_irq, count);
> > +
> > + if (copy_from_user(&pi_info, (void __user *)argp, minsz))
> > + return -EFAULT;
> > +
> > + if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)
> PCI specific check, same remark as above but I will let Alex further
> comment on this and possibly invalidate this commeny ;-)
> > + return -EINVAL;
> > +
> > + vdev = kvm_vfio_get_vfio_device(pi_info.fd);
> > + if (IS_ERR(vdev))
> > + return PTR_ERR(vdev);
> > +
> > + dev = kvm_vfio_external_base_device(vdev);
> > + if (!dev || !dev_is_pci(dev)) {
> > + ret = -EFAULT;
> > + goto put_vfio_device;
> > + }
> > +
> > + pdev = to_pci_dev(dev);
> > +
> > + max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);
> > + if (max <= 0) {
> > + ret = -EFAULT;
> > + goto put_vfio_device;
> > + }
> > +
> > + if (pi_info.argsz - minsz < pi_info.count * sizeof(int) ||
> shouldn' we use the actual datatype?
I am afraid I don't get this, could you please be more specific? Thanks a lot!
> > + pi_info.start >= max || pi_info.start + pi_info.count > max) {
> > + ret = -EINVAL;
> > + goto put_vfio_device;
> > + }
> > +
> > + gsi = memdup_user((void __user *)((unsigned long)argp + minsz),
> > + pi_info.count * sizeof(int));
> same question as above
> > + if (IS_ERR(gsi)) {
> > + ret = PTR_ERR(gsi);
> > + goto put_vfio_device;
> > + }
> > +
> > +#ifdef CONFIG_PCI_MSI
> > + for (i = 0; i < pi_info.count; i++) {
> > + list_for_each_entry(entry, &pdev->msi_list, list) {
> > + if (entry->msi_attrib.entry_nr != pi_info.start+i)
> > + continue;
> > +
> > + ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,
> > + entry->irq,
> > + gsi[i]);
> > + if (ret) {
> > + ret = -EFAULT;
> why -EFAULT? and not propagation of original error code?
Yes, you are right. Thanks for the comments!
> you may have posting set for part of the subindexes and unset for rest.
> Isn't it an issue?
QEMU will always set the posting for all the sub-indexes for MSI/MSIx,
once the guest updates the configuration of some sub-indexes, KVM will
update it accordingly. So in which case will what you mentioned above
happen?
Thanks,
Feng
>
> Best Regards
>
> Eric
> > + goto free_gsi;
> > + }
> > + }
> > + }
> > +#endif
> > +
> > + ret = 0;
> > +
> > +free_gsi:
> > + kfree(gsi);
> > +
> > +put_vfio_device:
> > + kvm_vfio_put_vfio_device(vdev);
> > + return ret;
> > +}
> > +
> > static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
> > {
> > int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
> > @@ -456,6 +549,11 @@ static int kvm_vfio_set_device(struct kvm_device
> *kdev, long attr, u64 arg)
> > case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ:
> > ret = kvm_vfio_control_irq_forward(kdev, attr, argp);
> > break;
> > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
> > + case KVM_DEV_VFIO_DEVICE_POSTING_IRQ:
> > + ret = kvm_vfio_set_pi(kdev, argp);
> > + break;
> > +#endif
> > default:
> > ret = -ENXIO;
> > }
> > @@ -511,6 +609,11 @@ static int kvm_vfio_has_attr(struct kvm_device
> *dev,
> > case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ:
> > return 0;
> > #endif
> > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
> > + case KVM_DEV_VFIO_DEVICE_POSTING_IRQ:
> > + return 0;
> > +#endif
> > +
> > }
> > break;
> > }
> >
> -----Original Message-----
> From: [email protected] [mailto:[email protected]] On
> Behalf Of Eric Auger
> Sent: Thursday, December 04, 2014 10:05 PM
> To: Wu, Feng; [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]
> Cc: [email protected]; [email protected];
> [email protected]
> Subject: Re: [v2 17/25] KVM: kvm-vfio: User API for VT-d Posted-Interrupts
>
> Hi Feng,
> On 12/03/2014 08:39 AM, Feng Wu wrote:
> > This patch adds and documents a new attribute
> > KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE group.
> > This new attribute is used for VT-d Posted-Interrupts.
> >
> > When guest OS changes the interrupt configuration for an
> > assigned device, such as, MSI/MSIx data/address fields,
> > QEMU will use this IRQ attribute to tell KVM to update the
> > related IRTE according the VT-d Posted-Interrrupts Specification,
> > such as, the guest vector should be updated in the related IRTE.
> >
> > Signed-off-by: Feng Wu <[email protected]>
> > ---
> > Documentation/virtual/kvm/devices/vfio.txt | 9 +++++++++
> > include/uapi/linux/kvm.h | 10 ++++++++++
> > 2 files changed, 19 insertions(+), 0 deletions(-)
> >
> > diff --git a/Documentation/virtual/kvm/devices/vfio.txt
> b/Documentation/virtual/kvm/devices/vfio.txt
> > index f7aff29..41e12b7 100644
> > --- a/Documentation/virtual/kvm/devices/vfio.txt
> > +++ b/Documentation/virtual/kvm/devices/vfio.txt
> > @@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has been
> called to trigger the IRQ
> > or associate an eventfd to it. Unforwarding can only be called while the
> > signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this condition
> is
> > not satisfied, the command returns an -EBUSY.
> > +
> > + KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrtups
> mechanism to post
> typo
> > + the IRQ to guests.
> > +For this attribute, kvm_device_attr.addr points to a kvm_vfio_dev_irq
> struct.
> > +
> > +When guest OS changes the interrupt configuration for an assigned device,
> > +such as, MSI/MSIx data/address fields, QEMU will use this IRQ attribute
> > +to tell KVM to update the related IRTE according the VT-d
> Posted-Interrrupts
> > +Specification, such as, the guest vector should be updated in the related
> IRTE.
> For my curiosity are there any restrictions about the instant at which
> the change can be done?
> I do not get here how you deactivate the posting?
The current method is if the hardware supports interrupts posting, we will
use it instead of interrupts remapping, since it has good performance. Why
do I need to deactivate interrupts posting?
Here is the reply to Alex for the same question:
"In fact, I don't think we need to stop the posted-interrupts. For setting
posted interrupts, we update the related IRTE according to the new
format. If the guest reboots, or unload the drivers, or some other
operations, the msi/msix will be disabled first, in this path, the irq
will be disabled and the related IRTE is not used anymore."
>
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index a269a42..7d98650 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -949,6 +949,7 @@ struct kvm_device_attr {
> > #define KVM_DEV_VFIO_DEVICE 2
> > #define KVM_DEV_VFIO_DEVICE_FORWARD_IRQ 1
> > #define KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ 2
> > +#define KVM_DEV_VFIO_DEVICE_POSTING_IRQ 3
> Maybe we should align our naming verb vs verbing here?
> Best Regards
> Eric
No problem, I will align my patch in the next version. Thanks!
Thanks,
Feng
> >
> > enum kvm_device_type {
> > KVM_DEV_TYPE_FSL_MPIC_20 = 1,
> > @@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
> > __u32 gsi; /* gsi, ie. virtual IRQ number */
> > };
> >
> > +struct kvm_vfio_dev_irq {
> > + __u32 argsz;
> > + __u32 fd; /* file descriptor of the VFIO device */
> > + __u32 index; /* VFIO device IRQ index */
> > + __u32 start;
> > + __u32 count;
> > + __u32 gsi[]; /* gsi, ie. virtual IRQ number */
> > +};
> > +
> > /*
> > * ioctls for VM fds
> > */
> >
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2014-12-08 at 04:58 +0000, Wu, Feng wrote:
>
> > -----Original Message-----
> > From: Eric Auger [mailto:[email protected]]
> > Sent: Thursday, December 04, 2014 11:36 PM
> > To: Wu, Feng; [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]
> > Cc: [email protected]; [email protected];
> > [email protected]
> > Subject: Re: [v2 18/25] KVM: kvm-vfio: implement the VFIO skeleton for VT-d
> > Posted-Interrupts
> >
> > Hi Feng,
> >
> > On 12/03/2014 08:39 AM, Feng Wu wrote:
> > > This patch adds the kvm-vfio interface for VT-d Posted-Interrrupts.
> > > When guests updates MSI/MSI-x information for an assigned-device,
> > update
> > > QEMU will use KVM_DEV_VFIO_DEVICE_POSTING_IRQ attribute to setup
> > > IRTE for VT-d PI. This patch implement this IRQ attribute.
> > s/implement/implements
> > >
> > > Signed-off-by: Feng Wu <[email protected]>
> > > ---
> > > include/linux/kvm_host.h | 19 ++++++++
> > > virt/kvm/vfio.c | 103
> > ++++++++++++++++++++++++++++++++++++++++++++++
> > > 2 files changed, 122 insertions(+), 0 deletions(-)
> > >
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 5cd4420..8d06678 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -1134,6 +1134,25 @@ static inline int
> > kvm_arch_vfio_set_forward(struct kvm_fwd_irq *fwd_irq,
> > > }
> > > #endif
> > >
> > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
> > > +/*
> > > + * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
> > > + *
> > > + * @kvm: kvm
> > > + * @host_irq: host irq of the interrupt
> > > + * @guest_irq: gsi of the interrupt
> > > + * returns 0 on success, < 0 on failure
> > > + */
> > > +int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
> > > + uint32_t guest_irq);
> > > +#else
> > > +static int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int
> > host_irq,
> > > + uint32_t guest_irq)
> > > +{
> > > + return 0;
> > > +}
> > > +#endif
> > > +
> > > #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
> > >
> > > static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool
> > val)
> > > diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> > > index 6bc7001..5e5515f 100644
> > > --- a/virt/kvm/vfio.c
> > > +++ b/virt/kvm/vfio.c
> > > @@ -446,6 +446,99 @@ out:
> > > return ret;
> > > }
> > >
> > > +static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)
> > > +{
> > > + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
> > > + u8 pin;
> > > +
> > > + pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
> > > + if (pin)
> > > + return 1;
> > > + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX)
> > > + return pci_msi_vec_count(pdev);
> > > + else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
> > > + return pci_msix_vec_count(pdev);
> > > +
> > > + return 0;
> > > +}
> > for platform case I was asked to move the retrieval of absolute irq
> > number to the architecture specific part. I don't know if it should
> > apply to PCI stuff as well? This explains why I need to pass the VFIO
> > device (or struct device handle) to the arch specific part. Actually we
> > do the same job, we provide a phys/virt IRQ mapping to KVM, right? So to
> > me our architecture specific API should look quite similar?
>
> In my patch, QEMU passes IRQ type(MSI/MSIx in my case), VFIO device index,
> and sub-index via "struct kvm_vfio_dev_irq" to KVM, then KVM will find the
> real host irq from the VFIO device index and the IRQ type. Is this something
> similar with your patch?
>
> >
> > > +
> > > +static int kvm_vfio_set_pi(struct kvm_device *kdev, int32_t __user *argp)
> > > +{
> > > + struct kvm_vfio_dev_irq pi_info;
> > > + uint32_t *gsi;
> > > + unsigned long minsz;
> > > + struct vfio_device *vdev;
> > > + struct msi_desc *entry;
> > > + struct device *dev;
> > > + struct pci_dev *pdev;
> > > + int i, max, ret;
> > > +
> > > + minsz = offsetofend(struct kvm_vfio_dev_irq, count);
> > > +
> > > + if (copy_from_user(&pi_info, (void __user *)argp, minsz))
> > > + return -EFAULT;
> > > +
> > > + if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)
> > PCI specific check, same remark as above but I will let Alex further
> > comment on this and possibly invalidate this commeny ;-)
> > > + return -EINVAL;
> > > +
> > > + vdev = kvm_vfio_get_vfio_device(pi_info.fd);
> > > + if (IS_ERR(vdev))
> > > + return PTR_ERR(vdev);
> > > +
> > > + dev = kvm_vfio_external_base_device(vdev);
> > > + if (!dev || !dev_is_pci(dev)) {
> > > + ret = -EFAULT;
> > > + goto put_vfio_device;
> > > + }
> > > +
> > > + pdev = to_pci_dev(dev);
> > > +
> > > + max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);
> > > + if (max <= 0) {
> > > + ret = -EFAULT;
> > > + goto put_vfio_device;
> > > + }
> > > +
> > > + if (pi_info.argsz - minsz < pi_info.count * sizeof(int) ||
> > shouldn' we use the actual datatype?
>
> I am afraid I don't get this, could you please be more specific? Thanks a lot!
We could have a platform that supports 64bit INTs.
> > > + pi_info.start >= max || pi_info.start + pi_info.count > max) {
> > > + ret = -EINVAL;
> > > + goto put_vfio_device;
> > > + }
> > > +
> > > + gsi = memdup_user((void __user *)((unsigned long)argp + minsz),
> > > + pi_info.count * sizeof(int));
> > same question as above
> > > + if (IS_ERR(gsi)) {
> > > + ret = PTR_ERR(gsi);
> > > + goto put_vfio_device;
> > > + }
> > > +
> > > +#ifdef CONFIG_PCI_MSI
> > > + for (i = 0; i < pi_info.count; i++) {
> > > + list_for_each_entry(entry, &pdev->msi_list, list) {
> > > + if (entry->msi_attrib.entry_nr != pi_info.start+i)
> > > + continue;
> > > +
> > > + ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,
> > > + entry->irq,
> > > + gsi[i]);
> > > + if (ret) {
> > > + ret = -EFAULT;
> > why -EFAULT? and not propagation of original error code?
> Yes, you are right. Thanks for the comments!
>
> > you may have posting set for part of the subindexes and unset for rest.
> > Isn't it an issue?
>
> QEMU will always set the posting for all the sub-indexes for MSI/MSIx,
> once the guest updates the configuration of some sub-indexes, KVM will
> update it accordingly. So in which case will what you mentioned above
> happen?
QEMU is just one userspace, not necessarily the only userspace. The
kernel shouldn't expect a specific userspace behavior.
> > > + goto free_gsi;
> > > + }
> > > + }
> > > + }
> > > +#endif
> > > +
> > > + ret = 0;
> > > +
> > > +free_gsi:
> > > + kfree(gsi);
> > > +
> > > +put_vfio_device:
> > > + kvm_vfio_put_vfio_device(vdev);
> > > + return ret;
> > > +}
> > > +
> > > static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
> > > {
> > > int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
> > > @@ -456,6 +549,11 @@ static int kvm_vfio_set_device(struct kvm_device
> > *kdev, long attr, u64 arg)
> > > case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ:
> > > ret = kvm_vfio_control_irq_forward(kdev, attr, argp);
> > > break;
> > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
> > > + case KVM_DEV_VFIO_DEVICE_POSTING_IRQ:
> > > + ret = kvm_vfio_set_pi(kdev, argp);
> > > + break;
> > > +#endif
> > > default:
> > > ret = -ENXIO;
> > > }
> > > @@ -511,6 +609,11 @@ static int kvm_vfio_has_attr(struct kvm_device
> > *dev,
> > > case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ:
> > > return 0;
> > > #endif
> > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
> > > + case KVM_DEV_VFIO_DEVICE_POSTING_IRQ:
> > > + return 0;
> > > +#endif
> > > +
> > > }
> > > break;
> > > }
> > >
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2014-12-08 at 04:58 +0000, Wu, Feng wrote:
>
> > -----Original Message-----
> > From: [email protected] [mailto:[email protected]] On
> > Behalf Of Eric Auger
> > Sent: Thursday, December 04, 2014 10:05 PM
> > To: Wu, Feng; [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]
> > Cc: [email protected]; [email protected];
> > [email protected]
> > Subject: Re: [v2 17/25] KVM: kvm-vfio: User API for VT-d Posted-Interrupts
> >
> > Hi Feng,
> > On 12/03/2014 08:39 AM, Feng Wu wrote:
> > > This patch adds and documents a new attribute
> > > KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE group.
> > > This new attribute is used for VT-d Posted-Interrupts.
> > >
> > > When guest OS changes the interrupt configuration for an
> > > assigned device, such as, MSI/MSIx data/address fields,
> > > QEMU will use this IRQ attribute to tell KVM to update the
> > > related IRTE according the VT-d Posted-Interrrupts Specification,
> > > such as, the guest vector should be updated in the related IRTE.
> > >
> > > Signed-off-by: Feng Wu <[email protected]>
> > > ---
> > > Documentation/virtual/kvm/devices/vfio.txt | 9 +++++++++
> > > include/uapi/linux/kvm.h | 10 ++++++++++
> > > 2 files changed, 19 insertions(+), 0 deletions(-)
> > >
> > > diff --git a/Documentation/virtual/kvm/devices/vfio.txt
> > b/Documentation/virtual/kvm/devices/vfio.txt
> > > index f7aff29..41e12b7 100644
> > > --- a/Documentation/virtual/kvm/devices/vfio.txt
> > > +++ b/Documentation/virtual/kvm/devices/vfio.txt
> > > @@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has been
> > called to trigger the IRQ
> > > or associate an eventfd to it. Unforwarding can only be called while the
> > > signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this condition
> > is
> > > not satisfied, the command returns an -EBUSY.
> > > +
> > > + KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrtups
> > mechanism to post
> > typo
> > > + the IRQ to guests.
> > > +For this attribute, kvm_device_attr.addr points to a kvm_vfio_dev_irq
> > struct.
> > > +
> > > +When guest OS changes the interrupt configuration for an assigned device,
> > > +such as, MSI/MSIx data/address fields, QEMU will use this IRQ attribute
> > > +to tell KVM to update the related IRTE according the VT-d
> > Posted-Interrrupts
> > > +Specification, such as, the guest vector should be updated in the related
> > IRTE.
> > For my curiosity are there any restrictions about the instant at which
> > the change can be done?
> > I do not get here how you deactivate the posting?
>
> The current method is if the hardware supports interrupts posting, we will
> use it instead of interrupts remapping, since it has good performance. Why
> do I need deactivate interrupts posting?
>
> Here is the reply to Alex for the same question:
> "In fact, I don't think we need to stop the posted-interrupts. For setting
> posted interrupts, we update the related IRTE according to the new
> format. If the guest reboots, or unload the drivers, or some other
> operations, the msi/msix will be disabled first, in this path, the irq
> will be disabled the related IRTE is not used anymore."
Right, and I'm still not sure I agree with that reasoning. We need to
build the kernel interface to be generic, not tailored for a specific
userspace. I don't really feel comfortable having something that can't
be disabled via a similar path to it being enabled. For instance, what
about a dynamic debug interface where we want to enable tracing and see
each interrupt injected into the guest. At that point we'd want to
disable posted interrupts and direct KVM injection and route via QEMU.
Thanks,
Alex
> > > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > > index a269a42..7d98650 100644
> > > --- a/include/uapi/linux/kvm.h
> > > +++ b/include/uapi/linux/kvm.h
> > > @@ -949,6 +949,7 @@ struct kvm_device_attr {
> > > #define KVM_DEV_VFIO_DEVICE 2
> > > #define KVM_DEV_VFIO_DEVICE_FORWARD_IRQ 1
> > > #define KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ 2
> > > +#define KVM_DEV_VFIO_DEVICE_POSTING_IRQ 3
> > Maybe we should align our naming verb vs verbing here?
> > Best Regards
> > Eric
>
> No problem, I will align my patch in the next version. Thanks!
>
> Thanks,
> Feng
>
> > >
> > > enum kvm_device_type {
> > > KVM_DEV_TYPE_FSL_MPIC_20 = 1,
> > > @@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
> > > __u32 gsi; /* gsi, ie. virtual IRQ number */
> > > };
> > >
> > > +struct kvm_vfio_dev_irq {
> > > + __u32 argsz;
> > > + __u32 fd; /* file descriptor of the VFIO device */
> > > + __u32 index; /* VFIO device IRQ index */
> > > + __u32 start;
> > > + __u32 count;
> > > + __u32 gsi[]; /* gsi, ie. virtual IRQ number */
> > > +};
> > > +
> > > /*
> > > * ioctls for VM fds
> > > */
> > >
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to [email protected]
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On 12/08/2014 06:12 AM, Alex Williamson wrote:
> On Mon, 2014-12-08 at 04:58 +0000, Wu, Feng wrote:
>>
>>> -----Original Message-----
>>> From: Eric Auger [mailto:[email protected]]
>>> Sent: Thursday, December 04, 2014 11:36 PM
>>> To: Wu, Feng; [email protected]; [email protected]; [email protected];
>>> [email protected]; [email protected]; [email protected];
>>> [email protected]; [email protected]; [email protected];
>>> [email protected]
>>> Cc: [email protected]; [email protected];
>>> [email protected]
>>> Subject: Re: [v2 18/25] KVM: kvm-vfio: implement the VFIO skeleton for VT-d
>>> Posted-Interrupts
>>>
>>> Hi Feng,
>>>
>>> On 12/03/2014 08:39 AM, Feng Wu wrote:
>>>> This patch adds the kvm-vfio interface for VT-d Posted-Interrupts.
>>>> When a guest updates MSI/MSI-x information for an assigned-device,
>>> update
>>>> QEMU will use KVM_DEV_VFIO_DEVICE_POSTING_IRQ attribute to setup
>>>> IRTE for VT-d PI. This patch implement this IRQ attribute.
>>> s/implement/implements
>>>>
>>>> Signed-off-by: Feng Wu <[email protected]>
>>>> ---
>>>> include/linux/kvm_host.h | 19 ++++++++
>>>> virt/kvm/vfio.c | 103
>>> ++++++++++++++++++++++++++++++++++++++++++++++
>>>> 2 files changed, 122 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>>>> index 5cd4420..8d06678 100644
>>>> --- a/include/linux/kvm_host.h
>>>> +++ b/include/linux/kvm_host.h
>>>> @@ -1134,6 +1134,25 @@ static inline int
>>> kvm_arch_vfio_set_forward(struct kvm_fwd_irq *fwd_irq,
>>>> }
>>>> #endif
>>>>
>>>> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
>>>> +/*
>>>> + * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
>>>> + *
>>>> + * @kvm: kvm
>>>> + * @host_irq: host irq of the interrupt
>>>> + * @guest_irq: gsi of the interrupt
>>>> + * returns 0 on success, < 0 on failure
>>>> + */
>>>> +int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
>>>> + uint32_t guest_irq);
>>>> +#else
>>>> +static int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int
>>> host_irq,
>>>> + uint32_t guest_irq)
>>>> +{
>>>> + return 0;
>>>> +}
>>>> +#endif
>>>> +
>>>> #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>>>>
>>>> static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool
>>> val)
>>>> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
>>>> index 6bc7001..5e5515f 100644
>>>> --- a/virt/kvm/vfio.c
>>>> +++ b/virt/kvm/vfio.c
>>>> @@ -446,6 +446,99 @@ out:
>>>> return ret;
>>>> }
>>>>
>>>> +static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)
>>>> +{
>>>> + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
>>>> + u8 pin;
>>>> +
>>>> + pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
>>>> + if (pin)
>>>> + return 1;
>>>> + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX)
>>>> + return pci_msi_vec_count(pdev);
>>>> + else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
>>>> + return pci_msix_vec_count(pdev);
>>>> +
>>>> + return 0;
>>>> +}
>>> for platform case I was asked to move the retrieval of absolute irq
>>> number to the architecture specific part. I don't know if it should
>>> apply to PCI stuff as well? This explains why I need to pass the VFIO
>>> device (or struct device handle) to the arch specific part. Actually we
>>> do the same job, we provide a phys/virt IRQ mapping to KVM, right? So to
>>> me our architecture specific API should look quite similar?
>>
>> In my patch, QEMU passes IRQ type(MSI/MSIx in my case), VFIO device index,
>> and sub-index via "struct kvm_vfio_dev_irq" to KVM, then KVM will find the
>> real host irq from the VFIO device index and the IRQ type. Is this something
>> similar with your patch?
>>
>>>
>>>> +
>>>> +static int kvm_vfio_set_pi(struct kvm_device *kdev, int32_t __user *argp)
>>>> +{
>>>> + struct kvm_vfio_dev_irq pi_info;
>>>> + uint32_t *gsi;
>>>> + unsigned long minsz;
>>>> + struct vfio_device *vdev;
>>>> + struct msi_desc *entry;
>>>> + struct device *dev;
>>>> + struct pci_dev *pdev;
>>>> + int i, max, ret;
>>>> +
>>>> + minsz = offsetofend(struct kvm_vfio_dev_irq, count);
>>>> +
>>>> + if (copy_from_user(&pi_info, (void __user *)argp, minsz))
>>>> + return -EFAULT;
>>>> +
>>>> + if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)
>>> PCI specific check, same remark as above but I will let Alex further
>>> comment on this and possibly invalidate this comment ;-)
>>>> + return -EINVAL;
>>>> +
>>>> + vdev = kvm_vfio_get_vfio_device(pi_info.fd);
>>>> + if (IS_ERR(vdev))
>>>> + return PTR_ERR(vdev);
>>>> +
>>>> + dev = kvm_vfio_external_base_device(vdev);
>>>> + if (!dev || !dev_is_pci(dev)) {
>>>> + ret = -EFAULT;
>>>> + goto put_vfio_device;
>>>> + }
>>>> +
>>>> + pdev = to_pci_dev(dev);
>>>> +
>>>> + max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);
>>>> + if (max <= 0) {
>>>> + ret = -EFAULT;
>>>> + goto put_vfio_device;
>>>> + }
>>>> +
>>>> + if (pi_info.argsz - minsz < pi_info.count * sizeof(int) ||
>>> shouldn't we use the actual datatype?
>>
>> I am afraid I don't get this, could you please be more specific? Thanks a lot!
>
> We could have a platform that supports 64bit INTs.
yes this is what I meant (struct datatype is __u32).
Thanks
Eric
>
>>>> + pi_info.start >= max || pi_info.start + pi_info.count > max) {
>>>> + ret = -EINVAL;
>>>> + goto put_vfio_device;
>>>> + }
>>>> +
>>>> + gsi = memdup_user((void __user *)((unsigned long)argp + minsz),
>>>> + pi_info.count * sizeof(int));
>>> same question as above
>>>> + if (IS_ERR(gsi)) {
>>>> + ret = PTR_ERR(gsi);
>>>> + goto put_vfio_device;
>>>> + }
>>>> +
>>>> +#ifdef CONFIG_PCI_MSI
>>>> + for (i = 0; i < pi_info.count; i++) {
>>>> + list_for_each_entry(entry, &pdev->msi_list, list) {
>>>> + if (entry->msi_attrib.entry_nr != pi_info.start+i)
>>>> + continue;
>>>> +
>>>> + ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,
>>>> + entry->irq,
>>>> + gsi[i]);
>>>> + if (ret) {
>>>> + ret = -EFAULT;
>>> why -EFAULT? and not propagation of original error code?
>> Yes, you are right. Thanks for the comments!
>>
>>> you may have posting set for part of the subindexes and unset for rest.
>>> Isn't it an issue?
>>
>> QEMU will always set the posting for all the sub-indexes for MSI/MSIx,
>> once the guest updates the configuration of some sub-indexes, KVM will
>> update it accordingly. So in which case will what you mentioned above
>> happen?
Was pointing out you handle the case where kvm_arch_vfio_update_pi_irte
could fail and you still continue looping thru the other indexes. So
theoretically you could have a mix of non-posted IRQs and posted IRQs.
Best Regards
Eric
>
> QEMU is just one userspace, not necessarily the only userspace. The
> kernel shouldn't expect a specific userspace behavior.
>
>>>> + goto free_gsi;
>>>> + }
>>>> + }
>>>> + }
>>>> +#endif
>>>> +
>>>> + ret = 0;
>>>> +
>>>> +free_gsi:
>>>> + kfree(gsi);
>>>> +
>>>> +put_vfio_device:
>>>> + kvm_vfio_put_vfio_device(vdev);
>>>> + return ret;
>>>> +}
>>>> +
>>>> static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
>>>> {
>>>> int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
>>>> @@ -456,6 +549,11 @@ static int kvm_vfio_set_device(struct kvm_device
>>> *kdev, long attr, u64 arg)
>>>> case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ:
>>>> ret = kvm_vfio_control_irq_forward(kdev, attr, argp);
>>>> break;
>>>> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
>>>> + case KVM_DEV_VFIO_DEVICE_POSTING_IRQ:
>>>> + ret = kvm_vfio_set_pi(kdev, argp);
>>>> + break;
>>>> +#endif
>>>> default:
>>>> ret = -ENXIO;
>>>> }
>>>> @@ -511,6 +609,11 @@ static int kvm_vfio_has_attr(struct kvm_device
>>> *dev,
>>>> case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ:
>>>> return 0;
>>>> #endif
>>>> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
>>>> + case KVM_DEV_VFIO_DEVICE_POSTING_IRQ:
>>>> + return 0;
>>>> +#endif
>>>> +
>>>> }
>>>> break;
>>>> }
>>>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
>
Ping...
Thanks,
Feng
> -----Original Message-----
> From: Wu, Feng
> Sent: Wednesday, December 03, 2014 3:39 PM
> To: [email protected]; [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected]
> Cc: [email protected]; [email protected];
> [email protected]; Wu, Feng
> Subject: [v2 00/25] Add VT-d Posted-Interrupts support
>
> VT-d Posted-Interrupts is an enhancement to CPU side Posted-Interrupt.
> With VT-d Posted-Interrupts enabled, external interrupts from
> direct-assigned devices can be delivered to guests without VMM
> intervention when guest is running in non-root mode.
>
> You can find the VT-d Posted-Interrupts Spec. at the following URL:
> http://www.intel.com/content/www/us/en/intelligent-systems/intel-technology/vt-directed-io-spec.html
>
> v1->v2:
> * Use VFIO framework to enable this feature, the VFIO part of this series is
> based on Eric's patch "[PATCH v3 0/8] KVM-VFIO IRQ forward control"
> * Rebase this patchset on
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git,
> then revise some irq logic based on the new hierarchy irqdomain patches
> provided
> by Jiang Liu <[email protected]>
>
> This patch series is made of the following groups:
> 1-6: Some preparation changes in iommu and irq component, this is based on
> the
> new hierarchy irqdomain logic.
> 7-9, 25: IOMMU changes for VT-d Posted-Interrupts, such as, feature detection,
> command line parameter.
> 10-16, 21-24: Changes related to KVM itself.
> 17-19: Changes in VFIO component, this part was previously sent out as
> "[RFC PATCH v2 0/2] kvm-vfio: implement the vfio skeleton for VT-d
> Posted-Interrupts"
> 20: x86 irq related changes
>
> Feng Wu (25):
> genirq: Introduce irq_set_vcpu_affinity() to target an interrupt to a
> VCPU
> iommu: Add new member capability to struct irq_remap_ops
> iommu, x86: Define new irte structure for VT-d Posted-Interrupts
> iommu, x86: Implement irq_set_vcpu_affinity for intel_ir_chip
> x86, irq: Implement irq_set_vcpu_affinity for pci_msi_ir_controller
> iommu, x86: No need to migrating irq for VT-d Posted-Interrupts
> iommu, x86: Add cap_pi_support() to detect VT-d PI capability
> iommu, x86: Add intel_irq_remapping_capability() for Intel
> iommu, x86: define irq_remapping_cap()
> KVM: change struct pi_desc for VT-d Posted-Interrupts
> KVM: Add some helper functions for Posted-Interrupts
> KVM: Initialize VT-d Posted-Interrupts Descriptor
> KVM: Define a new interface kvm_find_dest_vcpu() for VT-d PI
> KVM: Get Posted-Interrupts descriptor address from struct kvm_vcpu
> KVM: Make struct kvm_irq_routing_table accessible
> KVM: make kvm_set_msi_irq() public
> KVM: kvm-vfio: User API for VT-d Posted-Interrupts
> KVM: kvm-vfio: implement the VFIO skeleton for VT-d Posted-Interrupts
> KVM: x86: kvm-vfio: VT-d posted-interrupts setup
> x86, irq: Define a global vector for VT-d Posted-Interrupts
> KVM: Update Posted-Interrupts descriptor during vCPU scheduling
> KVM: Change NDST field after vCPU scheduling
> KVM: Add the handler for Wake-up Vector
> KVM: Suppress posted-interrupt when 'SN' is set
> iommu/vt-d: Add a command line parameter for VT-d posted-interrupts
>
> Documentation/kernel-parameters.txt | 1 +
> Documentation/virtual/kvm/devices/vfio.txt | 9 +
> arch/x86/include/asm/entry_arch.h | 2 +
> arch/x86/include/asm/hardirq.h | 1 +
> arch/x86/include/asm/hw_irq.h | 2 +
> arch/x86/include/asm/irq_remapping.h | 11 ++
> arch/x86/include/asm/irq_vectors.h | 1 +
> arch/x86/include/asm/kvm_host.h | 14 ++
> arch/x86/kernel/apic/msi.c | 1 +
> arch/x86/kernel/entry_64.S | 2 +
> arch/x86/kernel/irq.c | 27 +++
> arch/x86/kernel/irqinit.c | 2 +
> arch/x86/kvm/Makefile | 2 +-
> arch/x86/kvm/kvm_vfio_x86.c | 68 ++++++++
> arch/x86/kvm/vmx.c | 251
> +++++++++++++++++++++++++++-
> arch/x86/kvm/x86.c | 38 ++++-
> drivers/iommu/intel_irq_remapping.c | 64 +++++++
> drivers/iommu/irq_remapping.c | 24 +++-
> drivers/iommu/irq_remapping.h | 8 +
> include/linux/dmar.h | 32 ++++
> include/linux/intel-iommu.h | 1 +
> include/linux/irq.h | 7 +
> include/linux/kvm_host.h | 43 +++++
> include/uapi/linux/kvm.h | 10 +
> kernel/irq/chip.c | 14 ++
> kernel/irq/manage.c | 20 +++
> virt/kvm/irq_comm.c | 43 +++++-
> virt/kvm/irqchip.c | 11 --
> virt/kvm/kvm_main.c | 14 ++
> virt/kvm/vfio.c | 103 ++++++++++++
> 30 files changed, 799 insertions(+), 27 deletions(-)
> create mode 100644 arch/x86/kvm/kvm_vfio_x86.c
> -----Original Message-----
> From: Alex Williamson [mailto:[email protected]]
> Sent: Monday, December 08, 2014 1:21 PM
> To: Wu, Feng
> Cc: Eric Auger; [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]
> Subject: Re: [v2 17/25] KVM: kvm-vfio: User API for VT-d Posted-Interrupts
>
> On Mon, 2014-12-08 at 04:58 +0000, Wu, Feng wrote:
> >
> > > -----Original Message-----
> > > From: [email protected] [mailto:[email protected]]
> On
> > > Behalf Of Eric Auger
> > > Sent: Thursday, December 04, 2014 10:05 PM
> > > To: Wu, Feng; [email protected]; [email protected]; [email protected];
> > > [email protected]; [email protected]; [email protected];
> > > [email protected]; [email protected]; [email protected];
> > > [email protected]
> > > Cc: [email protected]; [email protected];
> > > [email protected]
> > > Subject: Re: [v2 17/25] KVM: kvm-vfio: User API for VT-d Posted-Interrupts
> > >
> > > Hi Feng,
> > > On 12/03/2014 08:39 AM, Feng Wu wrote:
> > > > This patch adds and documents a new attribute
> > > > KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE
> group.
> > > > This new attribute is used for VT-d Posted-Interrupts.
> > > >
> > > > When guest OS changes the interrupt configuration for an
> > > > assigned device, such as, MSI/MSIx data/address fields,
> > > > QEMU will use this IRQ attribute to tell KVM to update the
> > > > related IRTE according to the VT-d Posted-Interrupts Specification,
> > > > such as, the guest vector should be updated in the related IRTE.
> > > >
> > > > Signed-off-by: Feng Wu <[email protected]>
> > > > ---
> > > > Documentation/virtual/kvm/devices/vfio.txt | 9 +++++++++
> > > > include/uapi/linux/kvm.h | 10 ++++++++++
> > > > 2 files changed, 19 insertions(+), 0 deletions(-)
> > > >
> > > > diff --git a/Documentation/virtual/kvm/devices/vfio.txt
> > > b/Documentation/virtual/kvm/devices/vfio.txt
> > > > index f7aff29..41e12b7 100644
> > > > --- a/Documentation/virtual/kvm/devices/vfio.txt
> > > > +++ b/Documentation/virtual/kvm/devices/vfio.txt
> > > > @@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has been
> > > called to trigger the IRQ
> > > > or associate an eventfd to it. Unforwarding can only be called while the
> > > > signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this
> condition
> > > is
> > > > not satisfied, the command returns an -EBUSY.
> > > > +
> > > > + KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrtups
> > > mechanism to post
> > > typo
> > > > + the IRQ to guests.
> > > > +For this attribute, kvm_device_attr.addr points to a kvm_vfio_dev_irq
> > > struct.
> > > > +
> > > > +When guest OS changes the interrupt configuration for an assigned
> device,
> > > > +such as, MSI/MSIx data/address fields, QEMU will use this IRQ attribute
> > > > +to tell KVM to update the related IRTE according to the VT-d
> > > Posted-Interrupts
> > > > +Specification, such as, the guest vector should be updated in the related
> > > IRTE.
> > > For my curiosity are there any restrictions about the instant at which
> > > the change can be done?
> > > I do not get here how you deactivate the posting?
> >
> > The current method is: if the hardware supports interrupt posting, we will
> > use it instead of interrupt remapping, since it has good performance. Why
> > do I need to deactivate interrupt posting?
> >
> > Here is the reply to Alex for the same question:
> > "In fact, I don't think we need to stop the posted-interrupts. For setting
> > posted interrupts, we update the related IRTE according to the new
> > format. If the guest reboots, or unloads the drivers, or performs some other
> > operation, the msi/msix will be disabled first; in this path, the irq
> > will be disabled and the related IRTE is not used anymore."
>
> Right, and I'm still not sure I agree with that reasoning. We need to
> build the kernel interface to be generic, not tailored for a specific
> userspace. I don't really feel comfortable having something that can't
> be disabled via a similar path to it being enabled. For instance, what
> about a dynamic debug interface where we want to enable tracing and see
> each interrupt injected into the guest. At that point we'd want to
> disable posted interrupts and direct KVM injection and route via QEMU.
> Thanks,
>
> Alex
Okay, I will think about this.
Thanks,
Feng
>
> > > > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > > > index a269a42..7d98650 100644
> > > > --- a/include/uapi/linux/kvm.h
> > > > +++ b/include/uapi/linux/kvm.h
> > > > @@ -949,6 +949,7 @@ struct kvm_device_attr {
> > > > #define KVM_DEV_VFIO_DEVICE 2
> > > > #define KVM_DEV_VFIO_DEVICE_FORWARD_IRQ 1
> > > > #define KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ 2
> > > > +#define KVM_DEV_VFIO_DEVICE_POSTING_IRQ 3
> > > Maybe we should align our naming verb vs verbing here?
> > > Best Regards
> > > Eric
> >
> > No problem, I will align my patch in the next version. Thanks!
> >
> > Thanks,
> > Feng
> >
> > > >
> > > > enum kvm_device_type {
> > > > KVM_DEV_TYPE_FSL_MPIC_20 = 1,
> > > > @@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
> > > > __u32 gsi; /* gsi, ie. virtual IRQ number */
> > > > };
> > > >
> > > > +struct kvm_vfio_dev_irq {
> > > > + __u32 argsz;
> > > > + __u32 fd; /* file descriptor of the VFIO device */
> > > > + __u32 index; /* VFIO device IRQ index */
> > > > + __u32 start;
> > > > + __u32 count;
> > > > + __u32 gsi[]; /* gsi, ie. virtual IRQ number */
> > > > +};
> > > > +
> > > > /*
> > > > * ioctls for VM fds
> > > > */
> > > >
> > >
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > > the body of a message to [email protected]
> > > More majordomo info at http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to [email protected]
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html
> -----Original Message-----
> From: Eric Auger [mailto:[email protected]]
> Sent: Monday, December 08, 2014 6:16 PM
> To: Alex Williamson; Wu, Feng
> Cc: [email protected]; [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]
> Subject: Re: [v2 18/25] KVM: kvm-vfio: implement the VFIO skeleton for VT-d
> Posted-Interrupts
>
> On 12/08/2014 06:12 AM, Alex Williamson wrote:
> > On Mon, 2014-12-08 at 04:58 +0000, Wu, Feng wrote:
> >>
> >>> -----Original Message-----
> >>> From: Eric Auger [mailto:[email protected]]
> >>> Sent: Thursday, December 04, 2014 11:36 PM
> >>> To: Wu, Feng; [email protected]; [email protected]; [email protected];
> >>> [email protected]; [email protected]; [email protected];
> >>> [email protected]; [email protected]; [email protected];
> >>> [email protected]
> >>> Cc: [email protected]; [email protected];
> >>> [email protected]
> >>> Subject: Re: [v2 18/25] KVM: kvm-vfio: implement the VFIO skeleton for
> VT-d
> >>> Posted-Interrupts
> >>>
> >>> Hi Feng,
> >>>
> >>> On 12/03/2014 08:39 AM, Feng Wu wrote:
> >>>> This patch adds the kvm-vfio interface for VT-d Posted-Interrupts.
> >>>> When a guest updates MSI/MSI-x information for an assigned-device,
> >>> update
> >>>> QEMU will use KVM_DEV_VFIO_DEVICE_POSTING_IRQ attribute to setup
> >>>> IRTE for VT-d PI. This patch implement this IRQ attribute.
> >>> s/implement/implements
> >>>>
> >>>> Signed-off-by: Feng Wu <[email protected]>
> >>>> ---
> >>>> include/linux/kvm_host.h | 19 ++++++++
> >>>> virt/kvm/vfio.c | 103
> >>> ++++++++++++++++++++++++++++++++++++++++++++++
> >>>> 2 files changed, 122 insertions(+), 0 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> >>>> index 5cd4420..8d06678 100644
> >>>> --- a/include/linux/kvm_host.h
> >>>> +++ b/include/linux/kvm_host.h
> >>>> @@ -1134,6 +1134,25 @@ static inline int
> >>> kvm_arch_vfio_set_forward(struct kvm_fwd_irq *fwd_irq,
> >>>> }
> >>>> #endif
> >>>>
> >>>> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
> >>>> +/*
> >>>> + * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
> >>>> + *
> >>>> + * @kvm: kvm
> >>>> + * @host_irq: host irq of the interrupt
> >>>> + * @guest_irq: gsi of the interrupt
> >>>> + * returns 0 on success, < 0 on failure
> >>>> + */
> >>>> +int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int
> host_irq,
> >>>> + uint32_t guest_irq);
> >>>> +#else
> >>>> +static int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int
> >>> host_irq,
> >>>> + uint32_t guest_irq)
> >>>> +{
> >>>> + return 0;
> >>>> +}
> >>>> +#endif
> >>>> +
> >>>> #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
> >>>>
> >>>> static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu,
> bool
> >>> val)
> >>>> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> >>>> index 6bc7001..5e5515f 100644
> >>>> --- a/virt/kvm/vfio.c
> >>>> +++ b/virt/kvm/vfio.c
> >>>> @@ -446,6 +446,99 @@ out:
> >>>> return ret;
> >>>> }
> >>>>
> >>>> +static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)
> >>>> +{
> >>>> + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
> >>>> + u8 pin;
> >>>> +
> >>>> + pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
> >>>> + if (pin)
> >>>> + return 1;
> >>>> + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX)
> >>>> + return pci_msi_vec_count(pdev);
> >>>> + else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
> >>>> + return pci_msix_vec_count(pdev);
> >>>> +
> >>>> + return 0;
> >>>> +}
> >>> for platform case I was asked to move the retrieval of absolute irq
> >>> number to the architecture specific part. I don't know if it should
> >>> apply to PCI stuff as well? This explains why I need to pass the VFIO
> >>> device (or struct device handle) to the arch specific part. Actually we
> >>> do the same job, we provide a phys/virt IRQ mapping to KVM, right? So to
> >>> me our architecture specific API should look quite similar?
> >>
> >> In my patch, QEMU passes IRQ type(MSI/MSIx in my case), VFIO device
> index,
> >> and sub-index via "struct kvm_vfio_dev_irq" to KVM, then KVM will find the
> >> real host irq from the VFIO device index and the IRQ type. Is this something
> >> similar with your patch?
> >>
> >>>
> >>>> +
> >>>> +static int kvm_vfio_set_pi(struct kvm_device *kdev, int32_t __user
> *argp)
> >>>> +{
> >>>> + struct kvm_vfio_dev_irq pi_info;
> >>>> + uint32_t *gsi;
> >>>> + unsigned long minsz;
> >>>> + struct vfio_device *vdev;
> >>>> + struct msi_desc *entry;
> >>>> + struct device *dev;
> >>>> + struct pci_dev *pdev;
> >>>> + int i, max, ret;
> >>>> +
> >>>> + minsz = offsetofend(struct kvm_vfio_dev_irq, count);
> >>>> +
> >>>> + if (copy_from_user(&pi_info, (void __user *)argp, minsz))
> >>>> + return -EFAULT;
> >>>> +
> >>>> + if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)
> >>> PCI specific check, same remark as above but I will let Alex further
> >>> comment on this and possibly invalidate this comment ;-)
> >>>> + return -EINVAL;
> >>>> +
> >>>> + vdev = kvm_vfio_get_vfio_device(pi_info.fd);
> >>>> + if (IS_ERR(vdev))
> >>>> + return PTR_ERR(vdev);
> >>>> +
> >>>> + dev = kvm_vfio_external_base_device(vdev);
> >>>> + if (!dev || !dev_is_pci(dev)) {
> >>>> + ret = -EFAULT;
> >>>> + goto put_vfio_device;
> >>>> + }
> >>>> +
> >>>> + pdev = to_pci_dev(dev);
> >>>> +
> >>>> + max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);
> >>>> + if (max <= 0) {
> >>>> + ret = -EFAULT;
> >>>> + goto put_vfio_device;
> >>>> + }
> >>>> +
> >>>> + if (pi_info.argsz - minsz < pi_info.count * sizeof(int) ||
> >>> shouldn't we use the actual datatype?
> >>
> >> I am afraid I don't get this, could you please be more specific? Thanks a lot!
> >
> > We could have a platform that supports 64bit INTs.
> yes this is what I meant (struct datatype is __u32).
>
> Thanks
>
> Eric
Oh, I got it. In fact, I changed the type of gsi[] from "int" to "u32" in
struct kvm_vfio_dev_irq, but I forgot to change this place. I will correct it.
Thanks for the comments!
> >
> >>>> + pi_info.start >= max || pi_info.start + pi_info.count > max) {
> >>>> + ret = -EINVAL;
> >>>> + goto put_vfio_device;
> >>>> + }
> >>>> +
> >>>> + gsi = memdup_user((void __user *)((unsigned long)argp + minsz),
> >>>> + pi_info.count * sizeof(int));
> >>> same question as above
> >>>> + if (IS_ERR(gsi)) {
> >>>> + ret = PTR_ERR(gsi);
> >>>> + goto put_vfio_device;
> >>>> + }
> >>>> +
> >>>> +#ifdef CONFIG_PCI_MSI
> >>>> + for (i = 0; i < pi_info.count; i++) {
> >>>> + list_for_each_entry(entry, &pdev->msi_list, list) {
> >>>> + if (entry->msi_attrib.entry_nr != pi_info.start+i)
> >>>> + continue;
> >>>> +
> >>>> + ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,
> >>>> + entry->irq,
> >>>> + gsi[i]);
> >>>> + if (ret) {
> >>>> + ret = -EFAULT;
> >>> why -EFAULT? and not propagation of original error code?
> >> Yes, you are right. Thanks for the comments!
> >>
> >>> you may have posting set for part of the subindexes and unset for rest.
> >>> Isn't it an issue?
> >>
> >> QEMU will always set the posting for all the sub-indexes for MSI/MSIx,
> >> once the guest updates the configuration of some sub-indexes, KVM will
> >> update it accordingly. So in which case will what you mentioned above
> >> happen?
>
> Was pointing out you handle the case where kvm_arch_vfio_update_pi_irte
> could fail and you still continue looping thru the other indexes. So
> theoretically you could have a mix of non-posted IRQs and posted IRQs.
>
Okay, I got your point. In fact, remapped IRQs and Posted IRQs are independent.
We can make some of the IRQs posted and the rest remapped. They use
different IRTEs in the remapping structure.
Thanks,
Feng
> Best Regards
>
> Eric
> >
> > QEMU is just one userspace, not necessarily the only userspace. The
> > kernel shouldn't expect a specific userspace behavior.
> >
> >>>> + goto free_gsi;
> >>>> + }
> >>>> + }
> >>>> + }
> >>>> +#endif
> >>>> +
> >>>> + ret = 0;
> >>>> +
> >>>> +free_gsi:
> >>>> + kfree(gsi);
> >>>> +
> >>>> +put_vfio_device:
> >>>> + kvm_vfio_put_vfio_device(vdev);
> >>>> + return ret;
> >>>> +}
> >>>> +
> >>>> static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64
> arg)
> >>>> {
> >>>> int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
> >>>> @@ -456,6 +549,11 @@ static int kvm_vfio_set_device(struct
> kvm_device
> >>> *kdev, long attr, u64 arg)
> >>>> case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ:
> >>>> ret = kvm_vfio_control_irq_forward(kdev, attr, argp);
> >>>> break;
> >>>> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
> >>>> + case KVM_DEV_VFIO_DEVICE_POSTING_IRQ:
> >>>> + ret = kvm_vfio_set_pi(kdev, argp);
> >>>> + break;
> >>>> +#endif
> >>>> default:
> >>>> ret = -ENXIO;
> >>>> }
> >>>> @@ -511,6 +609,11 @@ static int kvm_vfio_has_attr(struct kvm_device
> >>> *dev,
> >>>> case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ:
> >>>> return 0;
> >>>> #endif
> >>>> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
> >>>> + case KVM_DEV_VFIO_DEVICE_POSTING_IRQ:
> >>>> + return 0;
> >>>> +#endif
> >>>> +
> >>>> }
> >>>> break;
> >>>> }
> >>>>
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe kvm" in
> >> the body of a message to [email protected]
> >> More majordomo info at http://vger.kernel.org/majordomo-info.html
> >
> >
> >
????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m????????????I?
> -----Original Message-----
> From: Alex Williamson [mailto:[email protected]]
> Sent: Monday, December 08, 2014 1:21 PM
> To: Wu, Feng
> Cc: Eric Auger; [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]
> Subject: Re: [v2 17/25] KVM: kvm-vfio: User API for VT-d Posted-Interrupts
>
> On Mon, 2014-12-08 at 04:58 +0000, Wu, Feng wrote:
> >
> > > -----Original Message-----
> > > From: [email protected] [mailto:[email protected]]
> On
> > > Behalf Of Eric Auger
> > > Sent: Thursday, December 04, 2014 10:05 PM
> > > To: Wu, Feng; [email protected]; [email protected]; [email protected];
> > > [email protected]; [email protected]; [email protected];
> > > [email protected]; [email protected]; [email protected];
> > > [email protected]
> > > Cc: [email protected]; [email protected];
> > > [email protected]
> > > Subject: Re: [v2 17/25] KVM: kvm-vfio: User API for VT-d Posted-Interrupts
> > >
> > > Hi Feng,
> > > On 12/03/2014 08:39 AM, Feng Wu wrote:
> > > > This patch adds and documents a new attribute
> > > > KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE
> group.
> > > > This new attribute is used for VT-d Posted-Interrupts.
> > > >
> > > > When guest OS changes the interrupt configuration for an
> > > > assigned device, such as, MSI/MSIx data/address fields,
> > > > QEMU will use this IRQ attribute to tell KVM to update the
> > > > related IRTE according the VT-d Posted-Interrrupts Specification,
> > > > such as, the guest vector should be updated in the related IRTE.
> > > >
> > > > Signed-off-by: Feng Wu <[email protected]>
> > > > ---
> > > > Documentation/virtual/kvm/devices/vfio.txt | 9 +++++++++
> > > > include/uapi/linux/kvm.h | 10 ++++++++++
> > > > 2 files changed, 19 insertions(+), 0 deletions(-)
> > > >
> > > > diff --git a/Documentation/virtual/kvm/devices/vfio.txt
> > > b/Documentation/virtual/kvm/devices/vfio.txt
> > > > index f7aff29..41e12b7 100644
> > > > --- a/Documentation/virtual/kvm/devices/vfio.txt
> > > > +++ b/Documentation/virtual/kvm/devices/vfio.txt
> > > > @@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has
> been
> > > called to trigger the IRQ
> > > > or associate an eventfd to it. Unforwarding can only be called while the
> > > > signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this
> condition
> > > is
> > > > not satisfied, the command returns an -EBUSY.
> > > > +
> > > > + KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrtups
> > > mechanism to post
> > > typo
> > > > + the IRQ to guests.
> > > > +For this attribute, kvm_device_attr.addr points to a kvm_vfio_dev_irq
> > > struct.
> > > > +
> > > > +When guest OS changes the interrupt configuration for an assigned
> device,
> > > > +such as, MSI/MSIx data/address fields, QEMU will use this IRQ attribute
> > > > +to tell KVM to update the related IRTE according the VT-d
> > > Posted-Interrrupts
> > > > +Specification, such as, the guest vector should be updated in the related
> > > IRTE.
> > > For my curiosity are there any restrictions about the instant at which
> > > the change can be done?
> > > I do not get here how you deactivate the posting?
> >
> > The current method is if the hardware supports interrupts posting, we will
> > use it instead of interrupts remapping, since it has good performance. Why
> > do I need to deactivate interrupts posting?
> >
> > Here is the reply to Alex for the same question:
> > "In fact, I don't think we need to stop the posted-interrupts. For setting
> > posted interrupts, we update the related IRTE according to the new
> > format. If the guest reboots, or unload the drivers, or some other
> > operations, the msi/msix will be disabled first, in this path, the irq
> > will be disabled and the related IRTE is not used anymore."
>
> Right, and I'm still not sure I agree with that reasoning. We need to
> build the kernel interface to be generic, not tailored for a specific
> userspace. I don't really feel comfortable having something that can't
> be disabled via a similar path to it being enabled. For instance, what
> about a dynamic debug interface where we want to enable tracing and see
> each interrupt injected into the guest. At that point we'd want to
> disable posted interrupts and direct KVM injection and route via QEMU.
> Thanks,
>
> Alex
I don't quite understand why we need to debug the software
delivery path for interrupts when PI is used; in this case, the software
injection code will have no chance to execute. If we don't want to use
PI, we can disable it from the kernel command line.
Thanks,
Feng
>
> > > > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > > > index a269a42..7d98650 100644
> > > > --- a/include/uapi/linux/kvm.h
> > > > +++ b/include/uapi/linux/kvm.h
> > > > @@ -949,6 +949,7 @@ struct kvm_device_attr {
> > > > #define KVM_DEV_VFIO_DEVICE 2
> > > > #define KVM_DEV_VFIO_DEVICE_FORWARD_IRQ 1
> > > > #define KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ 2
> > > > +#define KVM_DEV_VFIO_DEVICE_POSTING_IRQ 3
> > > Maybe we should align our naming verb vs verbing here?
> > > Best Regards
> > > Eric
> >
> > No problem, I will align my patch in the next version. Thanks!
> >
> > Thanks,
> > Feng
> >
> > > >
> > > > enum kvm_device_type {
> > > > KVM_DEV_TYPE_FSL_MPIC_20 = 1,
> > > > @@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
> > > > __u32 gsi; /* gsi, ie. virtual IRQ number */
> > > > };
> > > >
> > > > +struct kvm_vfio_dev_irq {
> > > > + __u32 argsz;
> > > > + __u32 fd; /* file descriptor of the VFIO device */
> > > > + __u32 index; /* VFIO device IRQ index */
> > > > + __u32 start;
> > > > + __u32 count;
> > > > + __u32 gsi[]; /* gsi, ie. virtual IRQ number */
> > > > +};
> > > > +
> > > > /*
> > > > * ioctls for VM fds
> > > > */
> > > >
> > >
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > > the body of a message to [email protected]
> > > More majordomo info at http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to [email protected]
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m????????????I?
On Thu, 2014-12-11 at 05:55 +0000, Wu, Feng wrote:
>
> > -----Original Message-----
> > From: Alex Williamson [mailto:[email protected]]
> > Sent: Monday, December 08, 2014 1:21 PM
> > To: Wu, Feng
> > Cc: Eric Auger; [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected];
> > [email protected]
> > Subject: Re: [v2 17/25] KVM: kvm-vfio: User API for VT-d Posted-Interrupts
> >
> > On Mon, 2014-12-08 at 04:58 +0000, Wu, Feng wrote:
> > >
> > > > -----Original Message-----
> > > > From: [email protected] [mailto:[email protected]]
> > On
> > > > Behalf Of Eric Auger
> > > > Sent: Thursday, December 04, 2014 10:05 PM
> > > > To: Wu, Feng; [email protected]; [email protected]; [email protected];
> > > > [email protected]; [email protected]; [email protected];
> > > > [email protected]; [email protected]; [email protected];
> > > > [email protected]
> > > > Cc: [email protected]; [email protected];
> > > > [email protected]
> > > > Subject: Re: [v2 17/25] KVM: kvm-vfio: User API for VT-d Posted-Interrupts
> > > >
> > > > Hi Feng,
> > > > On 12/03/2014 08:39 AM, Feng Wu wrote:
> > > > > This patch adds and documents a new attribute
> > > > > KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE
> > group.
> > > > > This new attribute is used for VT-d Posted-Interrupts.
> > > > >
> > > > > When guest OS changes the interrupt configuration for an
> > > > > assigned device, such as, MSI/MSIx data/address fields,
> > > > > QEMU will use this IRQ attribute to tell KVM to update the
> > > > > related IRTE according the VT-d Posted-Interrrupts Specification,
> > > > > such as, the guest vector should be updated in the related IRTE.
> > > > >
> > > > > Signed-off-by: Feng Wu <[email protected]>
> > > > > ---
> > > > > Documentation/virtual/kvm/devices/vfio.txt | 9 +++++++++
> > > > > include/uapi/linux/kvm.h | 10 ++++++++++
> > > > > 2 files changed, 19 insertions(+), 0 deletions(-)
> > > > >
> > > > > diff --git a/Documentation/virtual/kvm/devices/vfio.txt
> > > > b/Documentation/virtual/kvm/devices/vfio.txt
> > > > > index f7aff29..41e12b7 100644
> > > > > --- a/Documentation/virtual/kvm/devices/vfio.txt
> > > > > +++ b/Documentation/virtual/kvm/devices/vfio.txt
> > > > > @@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has
> > been
> > > > called to trigger the IRQ
> > > > > or associate an eventfd to it. Unforwarding can only be called while the
> > > > > signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this
> > condition
> > > > is
> > > > > not satisfied, the command returns an -EBUSY.
> > > > > +
> > > > > + KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrtups
> > > > mechanism to post
> > > > typo
> > > > > + the IRQ to guests.
> > > > > +For this attribute, kvm_device_attr.addr points to a kvm_vfio_dev_irq
> > > > struct.
> > > > > +
> > > > > +When guest OS changes the interrupt configuration for an assigned
> > device,
> > > > > +such as, MSI/MSIx data/address fields, QEMU will use this IRQ attribute
> > > > > +to tell KVM to update the related IRTE according the VT-d
> > > > Posted-Interrrupts
> > > > > +Specification, such as, the guest vector should be updated in the related
> > > > IRTE.
> > > > For my curiosity are there any restrictions about the instant at which
> > > > the change can be done?
> > > > I do not get here how you deactivate the posting?
> > >
> > > The current method is if the hardware supports interrupts posting, we will
> > > use it instead of interrupts remapping, since it has good performance. Why
> > > do I need to deactivate interrupts posting?
> > >
> > > Here is the reply to Alex for the same question:
> > > "In fact, I don't think we need to stop the posted-interrupts. For setting
> > > posted interrupts, we update the related IRTE according to the new
> > > format. If the guest reboots, or unload the drivers, or some other
> > > operations, the msi/msix will be disabled first, in this path, the irq
> > > will be disabled and the related IRTE is not used anymore."
> >
> > Right, and I'm still not sure I agree with that reasoning. We need to
> > build the kernel interface to be generic, not tailored for a specific
> > userspace. I don't really feel comfortable having something that can't
> > be disabled via a similar path to it being enabled. For instance, what
> > about a dynamic debug interface where we want to enable tracing and see
> > each interrupt injected into the guest. At that point we'd want to
> > disable posted interrupts and direct KVM injection and route via QEMU.
> > Thanks,
> >
> > Alex
>
> I don't quite understand why we need to debug the software
> delivery path for interrupts when PI is used; in this case, the software
> injection code will have no chance to execute. If we don't want to use
> PI, we can disable it from the kernel command line.
Well, first off, I think it's just good interface design that if we
introduce the ability to enable something we also provide the ability to
disable it without making assumptions about how a specific userspace is
expected to make use of the interface. In this specific case, you're
missing the "dynamic" aspect of the switch. What if we want to attach
gdb to the guest and at that point disable all KVM accelerations for an
assigned device so that we can see every register access and every
interrupt? I don't think the interface proposed here allows for that
because it expects the guest to switch to a different interrupt mode for
the device in order to disable PI. Don't design the interface tailored
only for a specific use case. Thanks,
Alex
> > > > > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > > > > index a269a42..7d98650 100644
> > > > > --- a/include/uapi/linux/kvm.h
> > > > > +++ b/include/uapi/linux/kvm.h
> > > > > @@ -949,6 +949,7 @@ struct kvm_device_attr {
> > > > > #define KVM_DEV_VFIO_DEVICE 2
> > > > > #define KVM_DEV_VFIO_DEVICE_FORWARD_IRQ 1
> > > > > #define KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ 2
> > > > > +#define KVM_DEV_VFIO_DEVICE_POSTING_IRQ 3
> > > > Maybe we should align our naming verb vs verbing here?
> > > > Best Regards
> > > > Eric
> > >
> > > No problem, I will align my patch in the next version. Thanks!
> > >
> > > Thanks,
> > > Feng
> > >
> > > > >
> > > > > enum kvm_device_type {
> > > > > KVM_DEV_TYPE_FSL_MPIC_20 = 1,
> > > > > @@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
> > > > > __u32 gsi; /* gsi, ie. virtual IRQ number */
> > > > > };
> > > > >
> > > > > +struct kvm_vfio_dev_irq {
> > > > > + __u32 argsz;
> > > > > + __u32 fd; /* file descriptor of the VFIO device */
> > > > > + __u32 index; /* VFIO device IRQ index */
> > > > > + __u32 start;
> > > > > + __u32 count;
> > > > > + __u32 gsi[]; /* gsi, ie. virtual IRQ number */
> > > > > +};
> > > > > +
> > > > > /*
> > > > > * ioctls for VM fds
> > > > > */
> > > > >
> > > >
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > > > the body of a message to [email protected]
> > > > More majordomo info at http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > > the body of a message to [email protected]
> > > More majordomo info at http://vger.kernel.org/majordomo-info.html
> >
> >
>