Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754727AbZFEEzZ (ORCPT ); Fri, 5 Jun 2009 00:55:25 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1751050AbZFEEzK (ORCPT ); Fri, 5 Jun 2009 00:55:10 -0400 Received: from ozlabs.org ([203.10.76.45]:40570 "EHLO ozlabs.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750867AbZFEEzI (ORCPT ); Fri, 5 Jun 2009 00:55:08 -0400 From: Rusty Russell To: Gregory Haskins Subject: Re: [RFC PATCH v2 00/19] virtual-bus Date: Fri, 5 Jun 2009 14:25:01 +0930 User-Agent: KMail/1.11.2 (Linux/2.6.28-11-generic; KDE/4.2.2; i686; ; ) Cc: "Michael S. Tsirkin" , Avi Kivity , Gregory Haskins , linux-kernel@vger.kernel.org, agraf@suse.de, pmullaney@novell.com, pmorreale@novell.com, anthony@codemonkey.ws, netdev@vger.kernel.org, kvm@vger.kernel.org, bhutchings@solarflare.com, andi@firstfloor.org, gregkh@suse.de, herber@gondor.apana.org.au, chrisw@sous-sol.org, shemminger@vyatta.com References: <20090409155200.32740.19358.stgit@dev.haskins.net> <49E0C93E.5030205@redhat.com> <4A28172D.6010906@gmail.com> In-Reply-To: <4A28172D.6010906@gmail.com> MIME-Version: 1.0 Content-Type: Text/Plain; charset="iso-8859-15" Content-Transfer-Encoding: 7bit Content-Disposition: inline Message-Id: <200906051425.02924.rusty@rustcorp.com.au> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 7014 Lines: 228 On Fri, 5 Jun 2009 04:19:17 am Gregory Haskins wrote: > Avi Kivity wrote: > > Gregory Haskins wrote: > > One idea is similar to signalfd() or eventfd() > > And thus the "kvm-eventfd" (irqfd/iosignalfd) interface project was born. > ;) The lguest patch queue already has such an interface :) And I have a partially complete in-kernel virtio_pci patch with the same trick. I switched from "kernel created eventfd" to "userspace passes in eventfd" after a while though; it lets you connect multiple virtqueues to a single fd if you want. Combined with a minor change to allow any process with access to the lguest fd to queue interrupts, this allowed lguest to move to a thread-per-virtqueue model which was a significant speedup as well as nice code reduction. Here's the relevant kernel patch for reading. Thanks! Rusty. lguest: use eventfds for device notification Currently, when a Guest wants to perform I/O it calls LHCALL_NOTIFY with an address: the main Launcher process returns with this address, and figures out what device to run. A far nicer model is to let processes bind an eventfd to an address: if we find one, we simply signal the eventfd. Signed-off-by: Rusty Russell Cc: Davide Libenzi --- drivers/lguest/Kconfig | 2 - drivers/lguest/core.c | 8 ++-- drivers/lguest/lg.h | 9 ++++ drivers/lguest/lguest_user.c | 73 ++++++++++++++++++++++++++++++++++++++++ include/linux/lguest_launcher.h | 1 5 files changed, 89 insertions(+), 4 deletions(-) diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig @@ -1,6 +1,6 @@ config LGUEST tristate "Linux hypervisor example code" - depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX + depends on X86_32 && EXPERIMENTAL && !X86_PAE && EVENTFD select HVC_DRIVER ---help--- This is a very simple module which allows you to run diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -198,9 +198,11 @@ int run_guest(struct lg_cpu *cpu, unsign /* It's possible the Guest did a NOTIFY hypercall to the * Launcher, in which case we return from the read() now. */ if (cpu->pending_notify) { - if (put_user(cpu->pending_notify, user)) - return -EFAULT; - return sizeof(cpu->pending_notify); + if (!send_notify_to_eventfd(cpu)) { + if (put_user(cpu->pending_notify, user)) + return -EFAULT; + return sizeof(cpu->pending_notify); + } } /* Check for signals */ diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -82,6 +82,11 @@ struct lg_cpu { struct lg_cpu_arch arch; }; +struct lg_eventfds { + unsigned long addr; + struct file *event; +}; + /* The private info the thread maintains about the guest. */ struct lguest { @@ -102,6 +107,9 @@ struct lguest unsigned int stack_pages; u32 tsc_khz; + unsigned int num_eventfds; + struct lg_eventfds *eventfds; + /* Dead? */ const char *dead; }; @@ -152,6 +160,7 @@ void setup_default_idt_entries(struct lg void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, const unsigned long *def); void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); +bool send_notify_to_eventfd(struct lg_cpu *cpu); void init_clockdev(struct lg_cpu *cpu); bool check_syscall_vector(struct lguest *lg); int init_interrupts(void); diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include "lg.h" /*L:055 When something happens, the Waker process needs a way to stop the @@ -35,6 +37,70 @@ static int break_guest_out(struct lg_cpu } } +bool send_notify_to_eventfd(struct lg_cpu *cpu) +{ + unsigned int i; + + /* lg->eventfds is RCU-protected */ + preempt_disable(); + for (i = 0; i < cpu->lg->num_eventfds; i++) { + if (cpu->lg->eventfds[i].addr == cpu->pending_notify) { + eventfd_signal(cpu->lg->eventfds[i].event, 1); + cpu->pending_notify = 0; + break; + } + } + preempt_enable(); + return cpu->pending_notify == 0; +} + +static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) +{ + struct lg_eventfds *new, *old; + + if (!addr) + return -EINVAL; + + /* Replace the old array with the new one, carefully: others can + * be accessing it at the same time */ + new = kmalloc(sizeof(*new) * (lg->num_eventfds + 1), GFP_KERNEL); + if (!new) + return -ENOMEM; + + memcpy(new, lg->eventfds, sizeof(*new) * lg->num_eventfds); + old = lg->eventfds; + lg->eventfds = new; + synchronize_rcu(); + kfree(old); + + lg->eventfds[lg->num_eventfds].addr = addr; + lg->eventfds[lg->num_eventfds].event = eventfd_fget(fd); + if (IS_ERR(lg->eventfds[lg->num_eventfds].event)) + return PTR_ERR(lg->eventfds[lg->num_eventfds].event); + + wmb(); + lg->num_eventfds++; + return 0; +} + +static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) +{ + unsigned long addr, fd; + int err; + + if (get_user(addr, input) != 0) + return -EFAULT; + input++; + if (get_user(fd, input) != 0) + return -EFAULT; + + mutex_lock(&lguest_lock); + err = add_eventfd(lg, addr, fd); + mutex_unlock(&lguest_lock); + + return 0; +} + /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt * number to /dev/lguest. */ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) @@ -260,6 +326,8 @@ static ssize_t write(struct file *file, return user_send_irq(cpu, input); case LHREQ_BREAK: return break_guest_out(cpu, input); + case LHREQ_EVENTFD: + return attach_eventfd(lg, input); default: return -EINVAL; } @@ -297,6 +365,11 @@ static int close(struct inode *inode, st * the Launcher's memory management structure. */ mmput(lg->cpus[i].mm); } + + /* Release any eventfds they registered. */ + for (i = 0; i < lg->num_eventfds; i++) + fput(lg->eventfds[i].event); + /* If lg->dead doesn't contain an error code it will be NULL or a * kmalloc()ed string, either of which is ok to hand to kfree(). */ if (!IS_ERR(lg->dead)) diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -58,6 +58,7 @@ enum lguest_req LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ + LHREQ_EVENTFD, /* + address, fd. */ }; /* The alignment to use between consumer and producer parts of vring. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/