LinuxLists.cc - [KVM PATCH v3 0/3] irqfd enhancements, and irq

2009-10-26 16:22:03

Subject: [KVM PATCH v3 0/3] irqfd enhancements, and irq_routing fixes

(Applies to kvm.git/master:11b06403)

The following patches are cleanups/enhancements for IRQFD now that
we have lockless interrupt injection. For more details, please see
the patch headers.

These patches pass checkpatch, and are fully tested. Please consider
for merging. Patch 1/3 is a fix for an issue that may exist upstream
and should be considered for a more timely push upstream. Patches 2/3
- 3/3 are an enhancement only, so there is no urgency to push to
mainline until a suitable merge window presents itself.

Kind Regards,
-Greg

[ Change log:

v3:
*) Added patch 1/3 as a fix for a race condition
*) Minor cleanup to 2/3 to ensure that all shared vectors conform
to a unified locking model.

v2:
*) dropped original cleanup which relied on the user registering
MSI based GSIs or we may crash at runtime. Instead, we now
check at registration whether the GSI supports lockless
operation and dynamically adapt to either the original
deferred path for lock-based injections, or direct for lockless.

v1:
*) original release
]

---

Gregory Haskins (3):
KVM: Directly inject interrupts if they support lockless operation
KVM: export lockless GSI attribute
KVM: fix race in irq_routing logic

include/linux/kvm_host.h | 8 ++++
virt/kvm/eventfd.c | 31 +++++++++++++++--
virt/kvm/irq_comm.c | 85 ++++++++++++++++++++++++++++++++++------------
virt/kvm/kvm_main.c | 1 +
4 files changed, 98 insertions(+), 27 deletions(-)

--
Signature

2009-10-26 16:22:21

by Gregory Haskins

[permalink] [raw]

Subject: [KVM PATCH v3 1/3] KVM: fix race in irq_routing logic

The current code suffers from the following race condition:

thread-1 thread-2
-----------------------------------------------------------

kvm_set_irq() {
rcu_read_lock()
irq_rt = rcu_dereference(table);
rcu_read_unlock();

kvm_set_irq_routing() {
mutex_lock();
irq_rt = table;
rcu_assign_pointer();
mutex_unlock();
synchronize_rcu();

kfree(irq_rt);

irq_rt->entry->set(); /* bad */

-------------------------------------------------------------

Because the pointer is accessed outside of the read-side critical
section. There are two basic patterns we can use to fix this bug:

1) Switch to sleeping-rcu and encompass the ->set() access within the
read-side critical section,

OR

2) Add reference counting to the irq_rt structure, and simply acquire
the reference from within the RSCS.

This patch implements solution (1).

Signed-off-by: Gregory Haskins <[email protected]>
---

include/linux/kvm_host.h | 6 +++++-
virt/kvm/irq_comm.c | 50 +++++++++++++++++++++++++++-------------------
virt/kvm/kvm_main.c | 1 +
3 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bd5a616..1fe135d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -185,7 +185,10 @@ struct kvm {

struct mutex irq_lock;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
- struct kvm_irq_routing_table *irq_routing;
+ struct {
+ struct srcu_struct srcu;
+ struct kvm_irq_routing_table *table;
+ } irq_routing;
struct hlist_head mask_notifier_list;
struct hlist_head irq_ack_notifier_list;
#endif
@@ -541,6 +544,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
const struct kvm_irq_routing_entry *entries,
unsigned nr,
unsigned flags);
+void kvm_init_irq_routing(struct kvm *kvm);
void kvm_free_irq_routing(struct kvm *kvm);

#else
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 00c68d2..db2553f 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -144,10 +144,11 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
*/
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
{
- struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
- int ret = -1, i = 0;
+ struct kvm_kernel_irq_routing_entry *e;
+ int ret = -1;
struct kvm_irq_routing_table *irq_rt;
struct hlist_node *n;
+ int idx;

trace_kvm_set_irq(irq, level, irq_source_id);

@@ -155,21 +156,19 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
* IOAPIC. So set the bit in both. The guest will ignore
* writes to the unused one.
*/
- rcu_read_lock();
- irq_rt = rcu_dereference(kvm->irq_routing);
+ idx = srcu_read_lock(&kvm->irq_routing.srcu);
+ irq_rt = rcu_dereference(kvm->irq_routing.table);
if (irq < irq_rt->nr_rt_entries)
- hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
- irq_set[i++] = *e;
- rcu_read_unlock();
+ hlist_for_each_entry(e, n, &irq_rt->map[irq], link) {
+ int r;

- while(i--) {
- int r;
- r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level);
- if (r < 0)
- continue;
+ r = e->set(e, kvm, irq_source_id, level);
+ if (r < 0)
+ continue;

- ret = r + ((ret < 0) ? 0 : ret);
- }
+ ret = r + ((ret < 0) ? 0 : ret);
+ }
+ srcu_read_unlock(&kvm->irq_routing.srcu, idx);

return ret;
}
@@ -179,17 +178,18 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
struct kvm_irq_ack_notifier *kian;
struct hlist_node *n;
int gsi;
+ int idx;

trace_kvm_ack_irq(irqchip, pin);

- rcu_read_lock();
- gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+ idx = srcu_read_lock(&kvm->irq_routing.srcu);
+ gsi = rcu_dereference(kvm->irq_routing.table)->chip[irqchip][pin];
if (gsi != -1)
hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list,
link)
if (kian->gsi == gsi)
kian->irq_acked(kian);
- rcu_read_unlock();
+ srcu_read_unlock(&kvm->irq_routing.srcu, idx);
}

void kvm_register_irq_ack_notifier(struct kvm *kvm,
@@ -287,11 +287,19 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
rcu_read_unlock();
}

+void kvm_init_irq_routing(struct kvm *kvm)
+{
+ init_srcu_struct(&kvm->irq_routing.srcu);
+}
+
void kvm_free_irq_routing(struct kvm *kvm)
{
/* Called only during vm destruction. Nobody can use the pointer
at this stage */
- kfree(kvm->irq_routing);
+ synchronize_srcu(&kvm->irq_routing.srcu);
+ cleanup_srcu_struct(&kvm->irq_routing.srcu);
+
+ kfree(kvm->irq_routing.table);
}

static int setup_routing_entry(struct kvm_irq_routing_table *rt,
@@ -396,10 +404,10 @@ int kvm_set_irq_routing(struct kvm *kvm,
}

mutex_lock(&kvm->irq_lock);
- old = kvm->irq_routing;
- rcu_assign_pointer(kvm->irq_routing, new);
+ old = kvm->irq_routing.table;
+ rcu_assign_pointer(kvm->irq_routing.table, new);
mutex_unlock(&kvm->irq_lock);
- synchronize_rcu();
+ synchronize_srcu(&kvm->irq_routing.srcu);

new = old;
r = 0;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cac69c4..ba94159 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -388,6 +388,7 @@ static struct kvm *kvm_create_vm(void)
atomic_inc(&kvm->mm->mm_count);
spin_lock_init(&kvm->mmu_lock);
spin_lock_init(&kvm->requests_lock);
+ kvm_init_irq_routing(kvm);
kvm_io_bus_init(&kvm->pio_bus);
kvm_eventfd_init(kvm);
mutex_init(&kvm->lock);

2009-10-26 16:22:39

by Gregory Haskins

[permalink] [raw]

Subject: [KVM PATCH v3 2/3] KVM: export lockless GSI attribute

Certain GSI's support lockless injecton, but we have no way to detect
which ones at the GSI level. Knowledge of this attribute will be
useful later in the series so that we can optimize irqfd injection
paths for cases where we know the code will not sleep. Therefore,
we provide an API to query a specific GSI.

Signed-off-by: Gregory Haskins <[email protected]>
---

include/linux/kvm_host.h | 2 ++
virt/kvm/irq_comm.c | 35 ++++++++++++++++++++++++++++++++++-
2 files changed, 36 insertions(+), 1 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1fe135d..01151a6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -119,6 +119,7 @@ struct kvm_memory_slot {
struct kvm_kernel_irq_routing_entry {
u32 gsi;
u32 type;
+ bool lockless;
int (*set)(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level);
union {
@@ -420,6 +421,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
unsigned long *deliver_bitmask);
#endif
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
+int kvm_irq_check_lockless(struct kvm *kvm, u32 irq);
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
void kvm_register_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index db2553f..a7fd487 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -173,6 +173,35 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
return ret;
}

+int kvm_irq_check_lockless(struct kvm *kvm, u32 irq)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ struct hlist_node *n;
+ int ret = -ENOENT;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->irq_routing.srcu);
+ irq_rt = rcu_dereference(kvm->irq_routing.table);
+ if (irq < irq_rt->nr_rt_entries)
+ hlist_for_each_entry(e, n, &irq_rt->map[irq], link) {
+ if (!e->lockless) {
+ /*
+ * all destinations need to be lockless to
+ * declare that the GSI as a whole is also
+ * lockless
+ */
+ ret = 0;
+ break;
+ }
+
+ ret = 1;
+ }
+ srcu_read_unlock(&kvm->irq_routing.srcu, idx);
+
+ return ret;
+}
+
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
struct kvm_irq_ack_notifier *kian;
@@ -310,18 +339,22 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
int delta;
struct kvm_kernel_irq_routing_entry *ei;
struct hlist_node *n;
+ bool lockless = ue->type == KVM_IRQ_ROUTING_MSI;

/*
* Do not allow GSI to be mapped to the same irqchip more than once.
* Allow only one to one mapping between GSI and MSI.
+ * Do not allow mixed lockless vs locked variants to coexist.
*/
hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link)
if (ei->type == KVM_IRQ_ROUTING_MSI ||
- ue->u.irqchip.irqchip == ei->irqchip.irqchip)
+ ue->u.irqchip.irqchip == ei->irqchip.irqchip ||
+ ei->lockless != lockless)
return r;

e->gsi = ue->gsi;
e->type = ue->type;
+ e->lockless = lockless;
switch (ue->type) {
case KVM_IRQ_ROUTING_IRQCHIP:
delta = 0;

2009-10-26 16:22:17

by Gregory Haskins

[permalink] [raw]

Subject: [KVM PATCH v3 3/3] KVM: Directly inject interrupts if they support lockless operation

IRQFD currently uses a deferred workqueue item to execute the injection
operation. It was originally designed this way because kvm_set_irq()
required the caller to hold the irq_lock mutex, and the eventfd callback
is invoked from within a non-preemptible critical section.

With the advent of lockless injection support for certain GSIs, the
deferment mechanism is no longer technically needed in all cases.
Since context switching to the workqueue is a source of interrupt
latency, lets switch to a direct method whenever possible. Fortunately
for us, the most common use of irqfd (MSI-based GSIs) readily support
lockless injection.

Signed-off-by: Gregory Haskins <[email protected]>
---

virt/kvm/eventfd.c | 31 +++++++++++++++++++++++++++----
1 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 30f70fd..e6cc958 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -51,20 +51,34 @@ struct _irqfd {
wait_queue_t wait;
struct work_struct inject;
struct work_struct shutdown;
+ void (*execute)(struct _irqfd *);
};

static struct workqueue_struct *irqfd_cleanup_wq;

static void
-irqfd_inject(struct work_struct *work)
+irqfd_inject(struct _irqfd *irqfd)
{
- struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
struct kvm *kvm = irqfd->kvm;

kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
}

+static void
+irqfd_deferred_inject(struct work_struct *work)
+{
+ struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
+
+ irqfd_inject(irqfd);
+}
+
+static void
+irqfd_schedule(struct _irqfd *irqfd)
+{
+ schedule_work(&irqfd->inject);
+}
+
/*
* Race-free decouple logic (ordering is critical)
*/
@@ -126,7 +140,7 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)

if (flags & POLLIN)
/* An event has been signaled, inject an interrupt */
- schedule_work(&irqfd->inject);
+ irqfd->execute(irqfd);

if (flags & POLLHUP) {
/* The eventfd is closing, detach from KVM */
@@ -179,7 +193,7 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
irqfd->kvm = kvm;
irqfd->gsi = gsi;
INIT_LIST_HEAD(&irqfd->list);
- INIT_WORK(&irqfd->inject, irqfd_inject);
+ INIT_WORK(&irqfd->inject, irqfd_deferred_inject);
INIT_WORK(&irqfd->shutdown, irqfd_shutdown);

file = eventfd_fget(fd);
@@ -209,6 +223,15 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
list_add_tail(&irqfd->list, &kvm->irqfds.items);
spin_unlock_irq(&kvm->irqfds.lock);

+ ret = kvm_irq_check_lockless(kvm, gsi);
+ if (ret < 0)
+ goto fail;
+
+ if (ret)
+ irqfd->execute = &irqfd_inject;
+ else
+ irqfd->execute = &irqfd_schedule;
+
/*
* Check if there was an event already pending on the eventfd
* before we registered, and trigger it as if we didn't miss it.

2009-10-27 03:46:10

by Paul E. McKenney

[permalink] [raw]

Subject: Re: [KVM PATCH v3 1/3] KVM: fix race in irq_routing logic

2009-10-27 06:45:45

by Gleb Natapov

by Michael S. Tsirkin

[permalink] [raw]

Subject: Re: [KVM PATCH v3 3/3] KVM: Directly inject interrupts if they support lockless operation

On Mon, Oct 26, 2009 at 12:22:08PM -0400, Gregory Haskins wrote:
> IRQFD currently uses a deferred workqueue item to execute the injection
> operation. It was originally designed this way because kvm_set_irq()
> required the caller to hold the irq_lock mutex, and the eventfd callback
> is invoked from within a non-preemptible critical section.
>
> With the advent of lockless injection support for certain GSIs, the
> deferment mechanism is no longer technically needed in all cases.
> Since context switching to the workqueue is a source of interrupt
> latency, lets switch to a direct method whenever possible. Fortunately
> for us, the most common use of irqfd (MSI-based GSIs) readily support
> lockless injection.
>
> Signed-off-by: Gregory Haskins <[email protected]>

This is a useful optimization I think.
Some comments below.

> ---
>
> virt/kvm/eventfd.c | 31 +++++++++++++++++++++++++++----
> 1 files changed, 27 insertions(+), 4 deletions(-)
>
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 30f70fd..e6cc958 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -51,20 +51,34 @@ struct _irqfd {
> wait_queue_t wait;
> struct work_struct inject;
> struct work_struct shutdown;
> + void (*execute)(struct _irqfd *);
> };
>
> static struct workqueue_struct *irqfd_cleanup_wq;
>
> static void
> -irqfd_inject(struct work_struct *work)
> +irqfd_inject(struct _irqfd *irqfd)
> {
> - struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
> struct kvm *kvm = irqfd->kvm;
>
> kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> }
>
> +static void
> +irqfd_deferred_inject(struct work_struct *work)
> +{
> + struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
> +
> + irqfd_inject(irqfd);
> +}
> +
> +static void
> +irqfd_schedule(struct _irqfd *irqfd)
> +{
> + schedule_work(&irqfd->inject);
> +}
> +
> /*
> * Race-free decouple logic (ordering is critical)
> */
> @@ -126,7 +140,7 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
>
> if (flags & POLLIN)
> /* An event has been signaled, inject an interrupt */
> - schedule_work(&irqfd->inject);
> + irqfd->execute(irqfd);
>
> if (flags & POLLHUP) {
> /* The eventfd is closing, detach from KVM */
> @@ -179,7 +193,7 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> irqfd->kvm = kvm;
> irqfd->gsi = gsi;
> INIT_LIST_HEAD(&irqfd->list);
> - INIT_WORK(&irqfd->inject, irqfd_inject);
> + INIT_WORK(&irqfd->inject, irqfd_deferred_inject);
> INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
>
> file = eventfd_fget(fd);
> @@ -209,6 +223,15 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> list_add_tail(&irqfd->list, &kvm->irqfds.items);
> spin_unlock_irq(&kvm->irqfds.lock);
>
> + ret = kvm_irq_check_lockless(kvm, gsi);
> + if (ret < 0)
> + goto fail;
> +
> + if (ret)
> + irqfd->execute = &irqfd_inject;
> + else
> + irqfd->execute = &irqfd_schedule;
> +

Can't gsi get converted from lockless to non-lockless
after it's checked (by the routing ioctl)? Kernel will crash then.

How about, each time we get event from eventfd, we implement
kvm_irqfd_toggle_lockless, which does a single scan, and returns
true/false status (and I really mean toggle, let's not do set 1 / set 0
as well) telling us whether interrupts could be delivered in a lockless
manner?

Then we can either just ignore the error (no one uses eventfd this way),
or handle the mostly irrelevant case of level by means of the workqueue,
like we did previously.

> /*
> * Check if there was an event already pending on the eventfd
> * before we registered, and trigger it as if we didn't miss it.
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2009-10-27 18:54:42

Michael S. Tsirkin wrote:
> On Tue, Oct 27, 2009 at 02:54:40PM -0400, Gregory Haskins wrote:
>> Michael S. Tsirkin wrote:
>>> On Mon, Oct 26, 2009 at 12:22:08PM -0400, Gregory Haskins wrote:
>>>> IRQFD currently uses a deferred workqueue item to execute the injection
>>>> operation. It was originally designed this way because kvm_set_irq()
>>>> required the caller to hold the irq_lock mutex, and the eventfd callback
>>>> is invoked from within a non-preemptible critical section.
>>>>
>>>> With the advent of lockless injection support for certain GSIs, the
>>>> deferment mechanism is no longer technically needed in all cases.
>>>> Since context switching to the workqueue is a source of interrupt
>>>> latency, lets switch to a direct method whenever possible. Fortunately
>>>> for us, the most common use of irqfd (MSI-based GSIs) readily support
>>>> lockless injection.
>>>>
>>>> Signed-off-by: Gregory Haskins <[email protected]>
>>> This is a useful optimization I think.
>>> Some comments below.
>>>
>>>> ---
>>>>
>>>> virt/kvm/eventfd.c | 31 +++++++++++++++++++++++++++----
>>>> 1 files changed, 27 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
>>>> index 30f70fd..e6cc958 100644
>>>> --- a/virt/kvm/eventfd.c
>>>> +++ b/virt/kvm/eventfd.c
>>>> @@ -51,20 +51,34 @@ struct _irqfd {
>>>> wait_queue_t wait;
>>>> struct work_struct inject;
>>>> struct work_struct shutdown;
>>>> + void (*execute)(struct _irqfd *);
>>>> };
>>>>
>>>> static struct workqueue_struct *irqfd_cleanup_wq;
>>>>
>>>> static void
>>>> -irqfd_inject(struct work_struct *work)
>>>> +irqfd_inject(struct _irqfd *irqfd)
>>>> {
>>>> - struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
>>>> struct kvm *kvm = irqfd->kvm;
>>>>
>>>> kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
>>>> kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
>>>> }
>>>>
>>>> +static void
>>>> +irqfd_deferred_inject(struct work_struct *work)
>>>> +{
>>>> + struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
>>>> +
>>>> + irqfd_inject(irqfd);
>>>> +}
>>>> +
>>>> +static void
>>>> +irqfd_schedule(struct _irqfd *irqfd)
>>>> +{
>>>> + schedule_work(&irqfd->inject);
>>>> +}
>>>> +
>>>> /*
>>>> * Race-free decouple logic (ordering is critical)
>>>> */
>>>> @@ -126,7 +140,7 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
>>>>
>>>> if (flags & POLLIN)
>>>> /* An event has been signaled, inject an interrupt */
>>>> - schedule_work(&irqfd->inject);
>>>> + irqfd->execute(irqfd);
>>>>
>>>> if (flags & POLLHUP) {
>>>> /* The eventfd is closing, detach from KVM */
>>>> @@ -179,7 +193,7 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
>>>> irqfd->kvm = kvm;
>>>> irqfd->gsi = gsi;
>>>> INIT_LIST_HEAD(&irqfd->list);
>>>> - INIT_WORK(&irqfd->inject, irqfd_inject);
>>>> + INIT_WORK(&irqfd->inject, irqfd_deferred_inject);
>>>> INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
>>>>
>>>> file = eventfd_fget(fd);
>>>> @@ -209,6 +223,15 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
>>>> list_add_tail(&irqfd->list, &kvm->irqfds.items);
>>>> spin_unlock_irq(&kvm->irqfds.lock);
>>>>
>>>> + ret = kvm_irq_check_lockless(kvm, gsi);
>>>> + if (ret < 0)
>>>> + goto fail;
>>>> +
>>>> + if (ret)
>>>> + irqfd->execute = &irqfd_inject;
>>>> + else
>>>> + irqfd->execute = &irqfd_schedule;
>>>> +
>>> Can't gsi get converted from lockless to non-lockless
>>> after it's checked (by the routing ioctl)?
>> I think I protect against this in patch 2/3 by ensuring that any vectors
>> that are added have to conform to the same locking rules. The code
>> doesn't support deleting routes, so we really only need to make sure
>> that new routes do not change.
>
> What I refer to, is when userspace calls KVM_SET_GSI_ROUTING.
> I don't see how your patch helps here: can't a GSI formerly
> used for MSI become unused, and then reused for non-MSI?
> If not, it's a problem I think, because I think userspace currently does this
> sometimes.

I see your point. I was thinking vectors could only be added, not
deleted, but I see upon further inspection that is not the case.

>
>>> Kernel will crash then.
>>>
>>> How about, each time we get event from eventfd, we implement
>>> kvm_irqfd_toggle_lockless, which does a single scan, and returns
>>> true/false status (and I really mean toggle, let's not do set 1 / set 0
>>> as well) telling us whether interrupts could be delivered in a lockless
>>> manner?
>> I am not sure I like this idea in general given that I believe I already
>> handle the error case you are concerned with.
>>
>> However, the concept of providing a "toggle" option so we can avoid
>> scanning the list twice is a good one. That can be done as a new patch
>> series, but it would be a nice addition.
>>
>> Thanks Michael,
>> -Greg
>>
>
>

Attachments:

signature.asc (267.00 B)
OpenPGP digital signature

2009-10-28 13:24:36

by Gregory Haskins

[permalink] [raw]

Subject: Re: [KVM PATCH v3 2/3] KVM: export lockless GSI attribute

Michael S. Tsirkin wrote:
> On Mon, Oct 26, 2009 at 12:22:03PM -0400, Gregory Haskins wrote:
>> Certain GSI's support lockless injecton, but we have no way to detect
>> which ones at the GSI level. Knowledge of this attribute will be
>> useful later in the series so that we can optimize irqfd injection
>> paths for cases where we know the code will not sleep. Therefore,
>> we provide an API to query a specific GSI.
>>
>> Signed-off-by: Gregory Haskins <[email protected]>
>> ---
>>
>> include/linux/kvm_host.h | 2 ++
>> virt/kvm/irq_comm.c | 35 ++++++++++++++++++++++++++++++++++-
>> 2 files changed, 36 insertions(+), 1 deletions(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 1fe135d..01151a6 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -119,6 +119,7 @@ struct kvm_memory_slot {
>> struct kvm_kernel_irq_routing_entry {
>> u32 gsi;
>> u32 type;
>> + bool lockless;
>
> So lockless is the same as type == MSI from below?

Yep, today anyway.

> If the idea is to make it extensible for the future,
> let's just add an inline function, we don't need a field for this.
>

This makes sense.

>> int (*set)(struct kvm_kernel_irq_routing_entry *e,
>> struct kvm *kvm, int irq_source_id, int level);
>> union {
>> @@ -420,6 +421,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
>> unsigned long *deliver_bitmask);
>> #endif
>> int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
>> +int kvm_irq_check_lockless(struct kvm *kvm, u32 irq);
>> void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
>> void kvm_register_irq_ack_notifier(struct kvm *kvm,
>> struct kvm_irq_ack_notifier *kian);
>> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
>> index db2553f..a7fd487 100644
>> --- a/virt/kvm/irq_comm.c
>> +++ b/virt/kvm/irq_comm.c
>> @@ -173,6 +173,35 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
>> return ret;
>> }
>>
>> +int kvm_irq_check_lockless(struct kvm *kvm, u32 irq)
>> +{
>> + struct kvm_kernel_irq_routing_entry *e;
>> + struct kvm_irq_routing_table *irq_rt;
>> + struct hlist_node *n;
>> + int ret = -ENOENT;
>> + int idx;
>> +
>> + idx = srcu_read_lock(&kvm->irq_routing.srcu);
>> + irq_rt = rcu_dereference(kvm->irq_routing.table);
>> + if (irq < irq_rt->nr_rt_entries)
>> + hlist_for_each_entry(e, n, &irq_rt->map[irq], link) {
>> + if (!e->lockless) {
>> + /*
>> + * all destinations need to be lockless to
>> + * declare that the GSI as a whole is also
>> + * lockless
>> + */
>> + ret = 0;
>> + break;
>> + }
>> +
>> + ret = 1;
>> + }
>> + srcu_read_unlock(&kvm->irq_routing.srcu, idx);
>> +
>> + return ret;
>> +}
>> +
>> void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
>> {
>> struct kvm_irq_ack_notifier *kian;
>> @@ -310,18 +339,22 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
>> int delta;
>> struct kvm_kernel_irq_routing_entry *ei;
>> struct hlist_node *n;
>> + bool lockless = ue->type == KVM_IRQ_ROUTING_MSI;
>>
>> /*
>> * Do not allow GSI to be mapped to the same irqchip more than once.
>> * Allow only one to one mapping between GSI and MSI.
>> + * Do not allow mixed lockless vs locked variants to coexist.
>
> Userspace has no idea which entries are lockless and which are not:
> this is an implementation detail - so it might not be able to avoid
> illegal combinations.
> Since this is called on an ioctl, can the rule be formulated in a way
> that makes sense for userspace?
>

I'm not sure.

>> */
>> hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link)
>> if (ei->type == KVM_IRQ_ROUTING_MSI ||
>> - ue->u.irqchip.irqchip == ei->irqchip.irqchip)
>> + ue->u.irqchip.irqchip == ei->irqchip.irqchip ||
>> + ei->lockless != lockless)
>
> So this check seems like it does nothing, because lockless is same as
> MSI, and MSI is always 1:1? Intentional?
>

Yeah, it was more to guard-against/document the dependency, in case the
1:1 with MSI ever changes in the future.

Kind Regards,
-Greg

Attachments:

signature.asc (267.00 B)
OpenPGP digital signature