Gregory Haskins wrote:
> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
> "release" callback. This lets eventfd clients know if the eventfd is about
> to go away and is very useful particularly for in-kernel clients. However,
> until recently it is not possible to use this feature of eventfd in a
> race-free way. This patch utilizes a new eventfd interface to rectify
> the problem.
>
> Note that one final race is known to exist: the slow-work thread may race
> with module removal. We are currently working with slow-work upstream
> to fix this issue as well. Since the code prior to this patch also
> races with module_put(), we are not making anything worse, but rather
> shifting the cause of the race. Once the slow-work code is patched we
> will be fixing the last remaining issue.
>
> Signed-off-by: Gregory Haskins <[email protected]>
> ---
>
> include/linux/kvm_host.h | 7 +-
> virt/kvm/Kconfig | 1
> virt/kvm/eventfd.c | 199 ++++++++++++++++++++++++++++++++++++++++------
> 3 files changed, 179 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 2451f48..d94ee72 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -141,7 +141,12 @@ struct kvm {
> struct kvm_io_bus mmio_bus;
> struct kvm_io_bus pio_bus;
> #ifdef CONFIG_HAVE_KVM_EVENTFD
> - struct list_head irqfds;
> + struct {
> + spinlock_t lock;
> + struct list_head items;
> + atomic_t outstanding;
> + wait_queue_head_t wqh;
> + } irqfds;
> #endif
> struct kvm_vm_stat stat;
> struct kvm_arch arch;
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index daece36..ab7848a 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -9,6 +9,7 @@ config HAVE_KVM_IRQCHIP
> config HAVE_KVM_EVENTFD
> bool
> select EVENTFD
> + select SLOW_WORK
>
> config KVM_APIC_ARCHITECTURE
> bool
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 9656027..ca21e8a 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -28,6 +28,7 @@
> #include <linux/file.h>
> #include <linux/list.h>
> #include <linux/eventfd.h>
> +#include <linux/slow-work.h>
>
> /*
> * --------------------------------------------------------------------
> @@ -36,17 +37,36 @@
> * Credit goes to Avi Kivity for the original idea.
> * --------------------------------------------------------------------
> */
> +
> struct _irqfd {
> struct kvm *kvm;
> + struct eventfd_ctx *eventfd;
> int gsi;
> struct list_head list;
> poll_table pt;
> wait_queue_head_t *wqh;
> wait_queue_t wait;
> struct work_struct inject;
> + struct slow_work shutdown;
> + int active:1;
> };
>
> static void
> +irqfd_release(struct _irqfd *irqfd)
> +{
> + eventfd_ctx_put(irqfd->eventfd);
> + kfree(irqfd);
> +}
> +
> +/* assumes kvm->irqfds.lock is held */
> +static void
> +irqfd_deactivate(struct _irqfd *irqfd)
> +{
> + irqfd->active = false;
> + list_del(&irqfd->list);
> +}
> +
> +static void
> irqfd_inject(struct work_struct *work)
> {
> struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
> @@ -58,6 +78,55 @@ irqfd_inject(struct work_struct *work)
> mutex_unlock(&kvm->irq_lock);
> }
>
> +static struct _irqfd *
> +work_to_irqfd(struct slow_work *work)
> +{
> + return container_of(work, struct _irqfd, shutdown);
> +}
> +
> +static int
> +irqfd_shutdown_get_ref(struct slow_work *work)
> +{
> + struct _irqfd *irqfd = work_to_irqfd(work);
> + struct kvm *kvm = irqfd->kvm;
> +
> + atomic_inc(&kvm->irqfds.outstanding);
> +
> + return 0;
> +}
> +
> +static void
> +irqfd_shutdown_put_ref(struct slow_work *work)
> +{
> + struct _irqfd *irqfd = work_to_irqfd(work);
> + struct kvm *kvm = irqfd->kvm;
> +
> + irqfd_release(irqfd);
> +
> + if (atomic_dec_and_test(&kvm->irqfds.outstanding))
> + wake_up(&kvm->irqfds.wqh);
> +}
> +
> +static void
> +irqfd_shutdown_execute(struct slow_work *work)
> +{
> + struct _irqfd *irqfd = work_to_irqfd(work);
> +
> + /*
> + * Ensure there are no outstanding "inject" work-items before we blow
> + * away our state. Once this job completes, the slow_work
> + * infrastructure will drop the irqfd object completely via put_ref
> + */
> + flush_work(&irqfd->inject);
> +}
> +
> +const static struct slow_work_ops irqfd_shutdown_work_ops = {
> + .get_ref = irqfd_shutdown_get_ref,
> + .put_ref = irqfd_shutdown_put_ref,
> + .execute = irqfd_shutdown_execute,
> +};
> +
> +
> static int
> irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> {
> @@ -65,25 +134,39 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> unsigned long flags = (unsigned long)key;
>
> /*
> - * Assume we will be called with interrupts disabled
> + * Called with interrupts disabled
> */
> if (flags & POLLIN)
> - /*
> - * Defer the IRQ injection until later since we need to
> - * acquire the kvm->lock to do so.
> - */
> + /* An event has been signaled, inject an interrupt */
> schedule_work(&irqfd->inject);
>
> if (flags & POLLHUP) {
> - /*
> - * for now, just remove ourselves from the list and let
> - * the rest dangle. We will fix this up later once
> - * the races in eventfd are fixed
> - */
> + /* The eventfd is closing, detach from KVM */
> + struct kvm *kvm = irqfd->kvm;
> + unsigned long flags;
> +
> __remove_wait_queue(irqfd->wqh, &irqfd->wait);
> - irqfd->wqh = NULL;
> +
> + spin_lock_irqsave(&kvm->irqfds.lock, flags);
> +
> + if (irqfd->active) {
> + /*
> + * If the item is still active we can be sure that
> + * no-one else is trying to shutdown this object at
> + * the same time.
> + *
> + * Defer the shutdown to a thread so we can flush
> + * all remaining inject jobs. We use a slow-work
> + * item to prevent a deadlock against the work-queue
> + */
> + irqfd_deactivate(irqfd);
> + slow_work_enqueue(&irqfd->shutdown);
> + }
> +
> + spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
> }
>
> +
> return 0;
> }
>
> @@ -102,6 +185,7 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> {
> struct _irqfd *irqfd;
> struct file *file = NULL;
> + struct eventfd_ctx *eventfd = NULL;
> int ret;
> unsigned int events;
>
> @@ -113,6 +197,8 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> irqfd->gsi = gsi;
> INIT_LIST_HEAD(&irqfd->list);
> INIT_WORK(&irqfd->inject, irqfd_inject);
> + slow_work_init(&irqfd->shutdown, &irqfd_shutdown_work_ops);
> + irqfd->active = true;
>
> file = eventfd_fget(fd);
> if (IS_ERR(file)) {
> @@ -129,12 +215,21 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
>
> events = file->f_op->poll(file, &irqfd->pt);
>
> - mutex_lock(&kvm->lock);
> - list_add_tail(&irqfd->list, &kvm->irqfds);
> - mutex_unlock(&kvm->lock);
> + spin_lock_irq(&kvm->irqfds.lock);
> + list_add_tail(&irqfd->list, &kvm->irqfds.items);
> + spin_unlock_irq(&kvm->irqfds.lock);
> +
> + eventfd = eventfd_ctx_fileget(file);
> + if (IS_ERR(file)) {
> + ret = PTR_ERR(file);
> + goto fail;
> + }
> +
> + irqfd->eventfd = eventfd;
>

<sigh> I just noticed this while doing a self review: I need to assign
the eventfd context *before* putting the item on the list. Not sure why
I even did that. I suspect I re-arranged the code at the last minute
and didn't notice what a dumb thing I was doing.

So this will need at least a v6, but I will wait to hear if there are
any other comments from Michael et. al.

-Greg

>
> /*
> - * Check if there was an event already queued
> + * Check if there was an event already pending on the eventfd
> + * before we registered, and trigger it as if we didn't miss it.
> */
> if (events & POLLIN)
> schedule_work(&irqfd->inject);
> @@ -148,6 +243,9 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> return 0;
>
> fail:
> + if (eventfd && !IS_ERR(eventfd))
> + eventfd_ctx_put(eventfd);
> +
> if (file && !IS_ERR(file))
> fput(file);
>
> @@ -158,24 +256,71 @@ fail:
> void
> kvm_irqfd_init(struct kvm *kvm)
> {
> - INIT_LIST_HEAD(&kvm->irqfds);
> + slow_work_register_user();
> +
> + spin_lock_init(&kvm->irqfds.lock);
> + INIT_LIST_HEAD(&kvm->irqfds.items);
> + atomic_set(&kvm->irqfds.outstanding, 0);
> + init_waitqueue_head(&kvm->irqfds.wqh);
> +}
> +
> +static struct _irqfd *
> +irqfd_pop(struct kvm *kvm)
> +{
> + struct _irqfd *irqfd = NULL;
> +
> + spin_lock_irq(&kvm->irqfds.lock);
> +
> + if (!list_empty(&kvm->irqfds.items)) {
> + irqfd = list_first_entry(&kvm->irqfds.items,
> + struct _irqfd, list);
> + irqfd_deactivate(irqfd);
> + }
> +
> + spin_unlock_irq(&kvm->irqfds.lock);
> +
> + return irqfd;
> +}
> +
> +/*
> + * locally releases the irqfd
> + *
> + * This function is called when KVM won the race with eventfd (signalled by
> + * finding the item active on the kvm->irqfds.item list). We are now guaranteed
> + * that we will never schedule a deferred shutdown task against this object,
> + * so we take steps to perform the shutdown ourselves.
> + *
> + * 1) We must remove ourselves from the wait-queue to prevent further events,
> + * which will simultaneously act to sync us with eventfd (via wqh->lock)
> + * 2) Flush any outstanding inject-tasks to ensure its safe to free memory
> + * 3) Delete the object
> + */
> +static void
> +irqfd_shutdown(struct _irqfd *irqfd)
> +{
> + remove_wait_queue(irqfd->wqh, &irqfd->wait);
> + flush_work(&irqfd->inject);
> + irqfd_release(irqfd);
> }
>
> void
> kvm_irqfd_release(struct kvm *kvm)
> {
> - struct _irqfd *irqfd, *tmp;
> -
> - list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) {
> - if (irqfd->wqh)
> - remove_wait_queue(irqfd->wqh, &irqfd->wait);
> + struct _irqfd *irqfd;
>
> - flush_work(&irqfd->inject);
> + /*
> + * Shutdown all irqfds that still remain
> + */
> + while ((irqfd = irqfd_pop(kvm)))
> + irqfd_shutdown(irqfd);
>
> - mutex_lock(&kvm->lock);
> - list_del(&irqfd->list);
> - mutex_unlock(&kvm->lock);
> + /*
> + * irqfds.outstanding tracks the number of outstanding "shutdown"
> + * jobs pending at any given time. Once we get here, we know that
> + * no more jobs will get scheduled, so go ahead and block until all
> + * of them complete
> + */
> + wait_event(kvm->irqfds.wqh, (!atomic_read(&kvm->irqfds.outstanding)));
>
> - kfree(irqfd);
> - }
> + slow_work_unregister_user();
> }
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>

Attachments:

signature.asc (266.00 B)
OpenPGP digital signature

2009-06-28 10:47:03

by Michael S. Tsirkin

[permalink] [raw]

Subject: Re: [KVM PATCH v5 4/4] KVM: add irqfd DEASSIGN feature

On Thu, Jun 25, 2009 at 09:28:32AM -0400, Gregory Haskins wrote:
> DEASSIGN allows us to optionally disassociate an IRQFD from its underlying
> eventfd without destroying the eventfd in the process. This is useful
> for conditions like live-migration which may have an eventfd associated
> with a device and an IRQFD. We need to be able to decouple the guest
> from the event source while not perturbing the event source itself.
>
> Signed-off-by: Gregory Haskins <[email protected]>
> CC: Michael S. Tsirkin <[email protected]>
> ---
>
> include/linux/kvm.h | 2 ++
> virt/kvm/eventfd.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++--
> 2 files changed, 56 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 38ff31e..6710518 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -490,6 +490,8 @@ struct kvm_x86_mce {
> };
> #endif
>
> +#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
> +
> struct kvm_irqfd {
> __u32 fd;
> __u32 gsi;
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index ca21e8a..2d4549c 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -180,8 +180,8 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> add_wait_queue(wqh, &irqfd->wait);
> }
>
> -int
> -kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> +static int
> +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> {
> struct _irqfd *irqfd;
> struct file *file = NULL;
> @@ -303,6 +303,58 @@ irqfd_shutdown(struct _irqfd *irqfd)
> irqfd_release(irqfd);
> }
>
> +/*
> + * assumes kvm->irqfds.lock is held
> + */
> +static struct _irqfd *
> +irqfd_find(struct kvm *kvm, int fd, int gsi)
> +{
> + struct _irqfd *irqfd, *tmp, *ret = ERR_PTR(-ENOENT);
> + struct eventfd_ctx *eventfd;
> +
> + eventfd = eventfd_ctx_fdget(fd);
> + if (IS_ERR(eventfd))
> + return ERR_PTR(PTR_ERR(eventfd));
> +
> + spin_lock_irq(&kvm->irqfds.lock);
> +
> + list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
> + if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
> + irqfd_deactivate(irqfd);
> + ret = irqfd;
> + break;
> + }
> + }
> +
> + spin_unlock_irq(&kvm->irqfds.lock);
> + eventfd_ctx_put(eventfd);
> +
> + return ret;
> +}
> +
> +static int
> +kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
> +{
> + struct _irqfd *irqfd;
> +
> + irqfd = irqfd_find(kvm, fd, gsi);
> + if (IS_ERR(irqfd))
> + return PTR_ERR(irqfd);
> +
> + irqfd_shutdown(irqfd);
> +
> + return 0;
> +}

I think that, to make this work properly, you must
add irqfd to list the last thing so do.
As it is, when you assign irqfd, the last thing you do is

irqfd->eventfd = eventfd;

I think you should move this to within a spinlock.

> +
> +int
> +kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> +{
> + if (flags & KVM_IRQFD_FLAG_DEASSIGN)
> + return kvm_irqfd_deassign(kvm, fd, gsi);
> +
> + return kvm_irqfd_assign(kvm, fd, gsi);
> +}
> +

At some point we discussed limiting the number of
irqfds that can be created in some way, so that userspace
can not consume unlimited amount of kernel memory.

What happened to that idea?

This will happen naturally if
- we keep fget on the file until irqfd goes away
- we allow the same file be bound to only one irqfd
but there might be other ways to do this

--
MST

2009-06-28 11:02:20

by Avi Kivity

[permalink] [raw]

Subject: Re: [KVM PATCH v5 0/4] irqfd fixes and enhancements

On 06/25/2009 04:59 PM, Gregory Haskins wrote:
> Gregory Haskins wrote:
>
>> (Applies to kvm.git/master:4631e094)
>>
>> The following is the latest attempt to fix the races in irqfd/eventfd, as
>> well as restore DEASSIGN support. For more details, please read the patch
>> headers.
>>
>> This series has been tested against the kvm-eventfd unit test, and
>> appears to be functioning properly. You can download this test here:
>>
>> ftp://ftp.novell.com/dev/ghaskins/kvm-eventfd.tar.bz2
>>
>> I've included version 4 of Davide's eventfd patch (ported to kvm.git) so
>> that its a complete reviewable series. Note, however, that there may be
>> later versions of his patch to consider for merging, so we should
>> coordinate with him.
>>
>>
>
> So I know we talked yesterday in the review session about whether it was
> actually worth all this complexity to deal with the POLLHUP or if we
> should just revert to the prior "two syscall" model and be done with
> it. Rusty reflected these same sentiments this morning in response to
> Davide's patch in a different thread.
>
> I am a bit torn myself, tbh. I do feel as though I have a good handle
> on the issue and that it is indeed now fixed (at least, if this series
> is applied and the slow-work issue is fixed, still pending upstream
> ACK). I have a lot invested in going the POLLHUP direction having spent
> so much time thinking about the problem and working on the patches, so I
> a bit of a biased opinion, I know.
>
> The reason why I am pushing this series out now is at least partly so we
> can tie up these loose ends. We have both solutions in front of us and
> can make a decision either way. At least the solution is formally
> documented in the internet archives forever this way ;)
>
> I took the review comments to heart that the shutdown code was
> substantially larger and more complex than the actual fast-path code. I
> went though last night and simplified and clarified it. I think the
> latest result is leaner and clearer, so please give it another review
> (particularly for races) before dismissing it.
>

Yes, it's much nicer. I can't say I'm certain it's race free but it's a
lot more digestible

> Ultimately, I think the concept of a release notification for eventfd is
> a good thing for all eventfd users, so I don't think this thing should
> go away per se even if irqfd decides to not use it.
>

I agree that we want POLLHUP support, it's better than holding on to the
eventfd. But I think we can make it even cleaner by merging it with
deassign. Basically, when we get POLLHUP, we launch a slow_work (or
something) that does a regular deassign. That slow_work can grab a ref
to the vm, so we don't race with the VM disappearing.

But given that the current slow_work does almost nothing, I'm not sure
it's worth it.

--
error compiling committee.c: too many arguments to function

2009-06-28 11:07:34

by Michael S. Tsirkin

On 06/28/2009 03:59 PM, Gregory Haskins wrote:
>> I agree that we want POLLHUP support, it's better than holding on to
>> the eventfd. But I think we can make it even cleaner by merging it
>> with deassign. Basically, when we get POLLHUP, we launch a slow_work
>> (or something) that does a regular deassign. That slow_work can grab
>> a ref to the vm, so we don't race with the VM disappearing.
>>
>> But given that the current slow_work does almost nothing, I'm not sure
>> it's worth it.
>>
>
> Yeah, and also note that the algorithm to unhook each side is not quite
> symmetrical. I think I've captured all the common parts (in things like
> irqfd_deactivate(), etc). A minor change in kvm_irqfd_release() could
> technically use a deferred job to release instead of doing it inline,
> but I do not think it buys us very much to do so (as you pointed out,
> the defered part is actually fairly simple). The important parts of the
> protocol lie outside of the work we can do in the work-item anyway.
>

Is the case of deassign vs POLLHUP covered?

Reusing deassign in POLLHUP at least makes it easy to verify that it is.

--
error compiling committee.c: too many arguments to function