This is release v2. Changes since v1:
*) Incorporated review feedback from Stephen Hemminger on vbus-enet driver
*) Added support for connecting to vbus devices from userspace
*) Added support for a virtio-vbus transport to allow virtio drivers to
work with vbus (needs testing and backend models).
(Avi, I know I still owe you a reply re the PCI debate)
Todo:
*) Develop some kind of hypercall registration mechanism for KVM so that
we can use that as an integration point instead of directly hooking
kvm hypercalls
*) Beef up the userspace event channel ABI to support different event types
*) Add memory-registration support
*) Integrate with qemu PCI device model to render vbus objects as PCI
*) Develop some virtio backend devices.
*) Support ethtool_ops for venet.
---------------------------------------
RFC: Virtual-bus
applies to v2.6.29 (will port to git HEAD soon)
FIRST OFF: Let me state that this is _not_ a KVM- or networking-specific
technology. Virtual-Bus is a mechanism for defining and deploying
software "devices" directly in a Linux kernel. These devices are designed
to be directly accessed from a variety of environments in an arbitrarily
nested fashion. The goal is to provide the potential for maximum IO
performance by providing the shortest and most efficient path to the
"bare metal" kernel, and thus to the actual IO resources. For instance,
an application can be written to run the same on bare metal as it does in
guest userspace nested 10 levels deep, all the while retaining direct
access to the resource, thus reducing latency and boosting throughput. A
good way to think of this is as a software-based SR-IOV that supports
nesting of the pass-through.
Due to its design as an in-kernel resource, it also provides very strong
notions of protection and isolation, so as not to introduce a security
compromise compared to traditional/alternative models where such
guarantees are provided by, say, userspace or hardware.
The example use-case we have provided supports a "virtual-ethernet"
device being utilized in a KVM guest environment, so comparisons to
virtio-net will be natural. However, please note that this is but one
use-case of many we have planned for the future (such as userspace bypass
and RT guest support).
The goal for right now is to describe what a virtual-bus is and why we
believe it is useful.
We intend to get this core technology merged, even if the networking
components are not accepted as is. It should be noted that, in many ways,
virtio could be considered complementary to the technology. We could, in
fact, have implemented the virtual-ethernet using a virtio-ring, but it
would have required ABI changes that we didn't yet want to propose
without having the concept in general vetted and accepted by the
community.
[Update: this release includes a virtio-vbus transport, so virtio-net and
other such drivers can now run over vbus in addition to the venet system
provided]
To cut to the chase, we recently measured our virtual-ethernet on
v2.6.29 on two 8-core x86_64 boxes with Chelsio T3 10GE NICs connected
back to back via a crossover cable. We measured bare-metal performance,
as well as a kvm guest (running the same kernel) connected to the T3 via
a linux-bridge+tap configuration with a 1500 MTU. The results are as
follows:
Bare metal: tput = 4078Mb/s, round-trip = 25593pps (39us rtt)
Virtio-net: tput = 4003Mb/s, round-trip = 320pps (3125us rtt)
Venet: tput = 4050Mb/s, round-trip = 15255pps (65us rtt)
As you can see, all three technologies can achieve (MTU-limited) line
rate, but the virtio-net solution is severely limited on the latency
front (by a factor of 48:1).
Note that the 320pps figure is technically artificially low for
virtio-net, caused by a known design limitation: the use of a timer for
tx-mitigation. However, note that even with the timer removed from the
path, the best we could achieve was 350us-450us of latency, and doing so
caused the tput to drop to 1300Mb/s. So even in that case, I think the
in-kernel results present a compelling argument for the new model
presented.
[Update: Anthony Liguori is working on this userspace implementation
problem currently and has obtained significant performance gains by
utilizing some of the techniques we use in this patch set as well. More
details to come.]
When we jump to a 9000 byte MTU, the situation looks similar:
Bare metal: tput = 9717Mb/s, round-trip = 30396pps (33us rtt)
Virtio-net: tput = 4578Mb/s, round-trip = 249pps (4016us rtt)
Venet: tput = 5802Mb/s, round-trip = 15127pps (66us rtt)
Note that venet's throughput was also slightly better in this test,
though neither venet nor virtio-net could achieve line rate. I suspect
some tuning may allow these numbers to improve (TBD).
So with that said, let's jump into the description:
Virtual-Bus: What is it?
------------------------
Virtual-Bus is a kernel based IO resource container technology. It is modeled
on a concept similar to the Linux Device-Model (LDM), where we have buses,
devices, and drivers as the primary actors. However, VBUS has several
distinctions when contrasted with LDM:
1) "Buses" in LDM are relatively static and global to the kernel (e.g.
"PCI", "USB", etc). VBUS buses are arbitrarily created and destroyed
dynamically, and are not globally visible. Instead they are defined as
visible only to a specific subset of the system (the contained context).
2) "Devices" in LDM are typically tangible physical (or sometimes logical)
devices. VBUS devices are purely software abstractions (which may or
may not have one or more physical devices behind them). Devices may
also be arbitrarily created or destroyed by software/administrative action
as opposed to by a hardware discovery mechanism.
3) "Drivers" in LDM sit within the same kernel context as the buses and
devices they interact with. VBUS drivers live in a foreign
context (such as userspace, or a virtual-machine guest).
The idea is that a vbus is created to contain access to some IO services.
Virtual devices are then instantiated and linked to a bus to grant access to
drivers actively present on the bus. Drivers will only have visibility to
devices present on their respective bus, and nothing else.
Virtual devices are defined by modules which register a deviceclass with the
system. A deviceclass simply represents a type of device that _may_ be
instantiated into a device, should an administrator wish to do so. Once
this has happened, the device may be associated with one or more buses where
it will become visible to all clients of those respective buses.
Why do we need this?
----------------------
There are various reasons why such a construct may be useful. One of the
most interesting use-cases is virtualization, such as KVM. Hypervisors
today provide virtualized IO resources to a guest, but this is often at a cost
in both latency and throughput compared to bare metal performance. Utilizing
para-virtual resources instead of emulated devices helps to mitigate this
penalty, but even these techniques to date have not fully realized the
potential of the underlying bare-metal hardware.
Some of the performance differential is unavoidable just given the extra
processing that occurs due to the deeper stack (guest+host). However, some of
this overhead is a direct result of the rather indirect path most hypervisors
use to route IO. For instance, KVM uses PIO faults from the guest to trigger
a guest->host-kernel->host-userspace->host-kernel sequence of events.
Contrast this to a typical userspace application on the host which must only
traverse app->kernel for most IO.
The fact is that the Linux kernel is already great at managing access to
IO resources. Therefore, if you have a hypervisor that is based on the
Linux kernel, is there some way that we can allow the hypervisor to
manage IO directly instead of forcing this convoluted path?
The short answer is: "not yet" ;)
In order to use such a concept, we need some new facilities. For one, we
need to be able to define containers with their corresponding
access-control so that guests do not have unmitigated access to anything
they wish. Second, we also need to define forms of memory access that
are uniform in the face of various clients (e.g. "copy_to_user()" cannot
be assumed to work for, say, a KVM vcpu context). Lastly, we need to
provide access to these resources in a way that makes sense for the
application, such as asynchronous communication paths and minimizing
context switches.
For more details, please visit our wiki at:
http://developer.novell.com/wiki/index.php/Virtual-bus
Regards,
-Greg
---
Gregory Haskins (19):
virtio: add a vbus transport
vbus: add a userspace connector
kvm: Add guest-side support for VBUS
kvm: Add VBUS support to the host
kvm: add dynamic IRQ support
kvm: add a reset capability
x86: allow the irq->vector translation to be determined outside of ioapic
venettap: add scatter-gather support
venet: add scatter-gather support
venet-tap: Adds a "venet" compatible "tap" device to VBUS
net: Add vbus_enet driver
venet: add the ABI definitions for an 802.x packet interface
ioq: add vbus helpers
ioq: Add basic definitions for a shared-memory, lockless queue
vbus: add a "vbus-proxy" bus model for vbus_driver objects
vbus: add bus-registration notifiers
vbus: add connection-client helper infrastructure
vbus: add virtual-bus definitions
shm-signal: shared-memory signals
Documentation/vbus.txt | 386 +++++++++
arch/x86/Kconfig | 16
arch/x86/Makefile | 3
arch/x86/include/asm/irq.h | 6
arch/x86/include/asm/kvm_host.h | 9
arch/x86/include/asm/kvm_para.h | 12
arch/x86/kernel/io_apic.c | 25 +
arch/x86/kvm/Kconfig | 9
arch/x86/kvm/Makefile | 6
arch/x86/kvm/dynirq.c | 329 ++++++++
arch/x86/kvm/guest/Makefile | 2
arch/x86/kvm/guest/dynirq.c | 95 ++
arch/x86/kvm/x86.c | 13
arch/x86/kvm/x86.h | 12
drivers/Makefile | 2
drivers/net/Kconfig | 13
drivers/net/Makefile | 1
drivers/net/vbus-enet.c | 907 +++++++++++++++++++++
drivers/vbus/devices/Kconfig | 17
drivers/vbus/devices/Makefile | 1
drivers/vbus/devices/venet-tap.c | 1609 ++++++++++++++++++++++++++++++++++++++
drivers/vbus/proxy/Makefile | 2
drivers/vbus/proxy/kvm.c | 726 +++++++++++++++++
drivers/virtio/Kconfig | 15
drivers/virtio/Makefile | 1
drivers/virtio/virtio_vbus.c | 496 ++++++++++++
fs/proc/base.c | 96 ++
include/linux/ioq.h | 410 ++++++++++
include/linux/kvm.h | 4
include/linux/kvm_guest.h | 7
include/linux/kvm_host.h | 27 +
include/linux/kvm_para.h | 60 +
include/linux/sched.h | 4
include/linux/shm_signal.h | 188 ++++
include/linux/vbus.h | 166 ++++
include/linux/vbus_client.h | 115 +++
include/linux/vbus_device.h | 424 ++++++++++
include/linux/vbus_driver.h | 80 ++
include/linux/vbus_userspace.h | 48 +
include/linux/venet.h | 82 ++
include/linux/virtio_vbus.h | 163 ++++
kernel/Makefile | 1
kernel/exit.c | 2
kernel/fork.c | 2
kernel/vbus/Kconfig | 55 +
kernel/vbus/Makefile | 11
kernel/vbus/attribute.c | 52 +
kernel/vbus/client.c | 543 +++++++++++++
kernel/vbus/config.c | 275 ++++++
kernel/vbus/core.c | 626 +++++++++++++++
kernel/vbus/devclass.c | 124 +++
kernel/vbus/map.c | 72 ++
kernel/vbus/map.h | 41 +
kernel/vbus/proxy.c | 216 +++++
kernel/vbus/shm-ioq.c | 89 ++
kernel/vbus/userspace-client.c | 485 +++++++++++
kernel/vbus/vbus.h | 117 +++
kernel/vbus/virtio.c | 628 +++++++++++++++
lib/Kconfig | 22 +
lib/Makefile | 2
lib/ioq.c | 298 +++++++
lib/shm_signal.c | 186 ++++
virt/kvm/kvm_main.c | 37 +
virt/kvm/vbus.c | 1307 +++++++++++++++++++++++++++++++
64 files changed, 11777 insertions(+), 1 deletions(-)
create mode 100644 Documentation/vbus.txt
create mode 100644 arch/x86/kvm/dynirq.c
create mode 100644 arch/x86/kvm/guest/Makefile
create mode 100644 arch/x86/kvm/guest/dynirq.c
create mode 100644 drivers/net/vbus-enet.c
create mode 100644 drivers/vbus/devices/Kconfig
create mode 100644 drivers/vbus/devices/Makefile
create mode 100644 drivers/vbus/devices/venet-tap.c
create mode 100644 drivers/vbus/proxy/Makefile
create mode 100644 drivers/vbus/proxy/kvm.c
create mode 100644 drivers/virtio/virtio_vbus.c
create mode 100644 include/linux/ioq.h
create mode 100644 include/linux/kvm_guest.h
create mode 100644 include/linux/shm_signal.h
create mode 100644 include/linux/vbus.h
create mode 100644 include/linux/vbus_client.h
create mode 100644 include/linux/vbus_device.h
create mode 100644 include/linux/vbus_driver.h
create mode 100644 include/linux/vbus_userspace.h
create mode 100644 include/linux/venet.h
create mode 100644 include/linux/virtio_vbus.h
create mode 100644 kernel/vbus/Kconfig
create mode 100644 kernel/vbus/Makefile
create mode 100644 kernel/vbus/attribute.c
create mode 100644 kernel/vbus/client.c
create mode 100644 kernel/vbus/config.c
create mode 100644 kernel/vbus/core.c
create mode 100644 kernel/vbus/devclass.c
create mode 100644 kernel/vbus/map.c
create mode 100644 kernel/vbus/map.h
create mode 100644 kernel/vbus/proxy.c
create mode 100644 kernel/vbus/shm-ioq.c
create mode 100644 kernel/vbus/userspace-client.c
create mode 100644 kernel/vbus/vbus.h
create mode 100644 kernel/vbus/virtio.c
create mode 100644 lib/ioq.c
create mode 100644 lib/shm_signal.c
create mode 100644 virt/kvm/vbus.c
--
This interface provides a bidirectional shared-memory based signaling
mechanism. It can be used by any entities which desire efficient
communication via shared memory. The implementation details of the
signaling are abstracted so that they may transcend a wide variety
of locale boundaries (e.g. userspace/kernel, guest/host, etc).
The shm_signal mechanism supports event masking as well as spurious
event delivery mitigation.
Signed-off-by: Gregory Haskins <[email protected]>
---
include/linux/shm_signal.h | 188 ++++++++++++++++++++++++++++++++++++++++++++
lib/Kconfig | 10 ++
lib/Makefile | 1
lib/shm_signal.c | 186 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 385 insertions(+), 0 deletions(-)
create mode 100644 include/linux/shm_signal.h
create mode 100644 lib/shm_signal.c
diff --git a/include/linux/shm_signal.h b/include/linux/shm_signal.h
new file mode 100644
index 0000000..a65e54e
--- /dev/null
+++ b/include/linux/shm_signal.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_SHM_SIGNAL_H
+#define _LINUX_SHM_SIGNAL_H
+
+#include <asm/types.h>
+
+/*
+ *---------
+ * The following structures represent data that is shared across boundaries
+ * which may be quite disparate from one another (e.g. Windows vs Linux,
+ * 32 vs 64 bit, etc). Therefore, care has been taken to make sure they
+ * present data in a manner that is independent of the environment.
+ *-----------
+ */
+
+#define SHM_SIGNAL_MAGIC 0x58fa39df
+#define SHM_SIGNAL_VER 1
+
+struct shm_signal_irq {
+ __u8 enabled;
+ __u8 pending;
+ __u8 dirty;
+};
+
+enum shm_signal_locality {
+ shm_locality_north,
+ shm_locality_south,
+};
+
+struct shm_signal_desc {
+ __u32 magic;
+ __u32 ver;
+ struct shm_signal_irq irq[2];
+};
+
+/* --- END SHARED STRUCTURES --- */
+
+#ifdef __KERNEL__
+
+#include <linux/interrupt.h>
+
+struct shm_signal_notifier {
+ void (*signal)(struct shm_signal_notifier *);
+};
+
+struct shm_signal;
+
+struct shm_signal_ops {
+ int (*inject)(struct shm_signal *s);
+ void (*fault)(struct shm_signal *s, const char *fmt, ...);
+ void (*release)(struct shm_signal *s);
+};
+
+enum {
+ shm_signal_in_wakeup,
+};
+
+struct shm_signal {
+ atomic_t refs;
+ spinlock_t lock;
+ enum shm_signal_locality locale;
+ unsigned long flags;
+ struct shm_signal_ops *ops;
+ struct shm_signal_desc *desc;
+ struct shm_signal_notifier *notifier;
+ struct tasklet_struct deferred_notify;
+};
+
+#define SHM_SIGNAL_FAULT(s, fmt, args...) \
+ ((s)->ops->fault ? (s)->ops->fault((s), fmt, ## args) : panic(fmt, ## args))
+
+ /*
+ * These functions should only be used internally
+ */
+void _shm_signal_release(struct shm_signal *s);
+void _shm_signal_wakeup(struct shm_signal *s);
+
+/**
+ * shm_signal_init() - initialize an SHM_SIGNAL
+ * @s: SHM_SIGNAL context
+ *
+ * Initializes SHM_SIGNAL context before first use
+ *
+ **/
+void shm_signal_init(struct shm_signal *s);
+
+/**
+ * shm_signal_get() - acquire an SHM_SIGNAL context reference
+ * @s: SHM_SIGNAL context
+ *
+ **/
+static inline struct shm_signal *shm_signal_get(struct shm_signal *s)
+{
+ atomic_inc(&s->refs);
+
+ return s;
+}
+
+/**
+ * shm_signal_put() - release an SHM_SIGNAL context reference
+ * @s: SHM_SIGNAL context
+ *
+ **/
+static inline void shm_signal_put(struct shm_signal *s)
+{
+ if (atomic_dec_and_test(&s->refs))
+ _shm_signal_release(s);
+}
+
+/**
+ * shm_signal_enable() - enables local notifications on an SHM_SIGNAL
+ * @s: SHM_SIGNAL context
+ * @flags: Reserved for future use, must be 0
+ *
+ * Enables/unmasks the registered notifier (if applicable) to receive wakeups
+ * whenever the remote side performs an shm_signal() operation. A notification
+ * will be dispatched immediately if any pending signals have already been
+ * issued prior to invoking this call.
+ *
+ * This is synonymous with unmasking an interrupt.
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int shm_signal_enable(struct shm_signal *s, int flags);
+
+/**
+ * shm_signal_disable() - disable local notifications on an SHM_SIGNAL
+ * @s: SHM_SIGNAL context
+ * @flags: Reserved for future use, must be 0
+ *
+ * Disables/masks the registered shm_signal_notifier (if applicable) from
+ * receiving any further notifications. Any subsequent calls to shm_signal()
+ * by the remote side will update the shm as dirty, but will not traverse the
+ * locale boundary and will not invoke the notifier callback. Signals
+ * delivered while masked will be deferred until shm_signal_enable() is
+ * invoked.
+ *
+ * This is synonymous with masking an interrupt
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int shm_signal_disable(struct shm_signal *s, int flags);
+
+/**
+ * shm_signal_inject() - notify the remote side about shm changes
+ * @s: SHM_SIGNAL context
+ * @flags: Reserved for future use, must be 0
+ *
+ * Marks the shm state as "dirty" and, if enabled, will traverse
+ * a locale boundary to inject a remote notification. The remote
+ * side controls whether the notification should be delivered via
+ * the shm_signal_enable/disable() interface.
+ *
+ * The specifics of how to traverse a locale boundary are abstracted
+ * by the shm_signal_ops->signal() interface and provided by a particular
+ * implementation. However, typically going north to south would be
+ * something like a syscall/hypercall, and going south to north would be
+ * something like a posix-signal/guest-interrupt.
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int shm_signal_inject(struct shm_signal *s, int flags);
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_SHM_SIGNAL_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 03c2c24..32d82fe 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -174,4 +174,14 @@ config DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
bool "Disable obsolete cpumask functions" if DEBUG_PER_CPU_MAPS
depends on EXPERIMENTAL && BROKEN
+config SHM_SIGNAL
+ boolean "SHM Signal - Generic shared-memory signaling mechanism"
+ default n
+ help
+	  Provides a shared-memory based signaling mechanism to indicate
+ memory-dirty notifications between two end-points.
+
+ If unsure, say N
+
+
endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 32b0e64..bc36327 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -71,6 +71,7 @@ obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
obj-$(CONFIG_SMP) += percpu_counter.o
obj-$(CONFIG_AUDIT_GENERIC) += audit.o
+obj-$(CONFIG_SHM_SIGNAL) += shm_signal.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
diff --git a/lib/shm_signal.c b/lib/shm_signal.c
new file mode 100644
index 0000000..fa1770c
--- /dev/null
+++ b/lib/shm_signal.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * See include/linux/shm_signal.h for documentation
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/shm_signal.h>
+
+int shm_signal_enable(struct shm_signal *s, int flags)
+{
+ struct shm_signal_irq *irq = &s->desc->irq[s->locale];
+ unsigned long iflags;
+
+ spin_lock_irqsave(&s->lock, iflags);
+
+ irq->enabled = 1;
+ wmb();
+
+ if ((irq->dirty || irq->pending)
+ && !test_bit(shm_signal_in_wakeup, &s->flags)) {
+ rmb();
+ tasklet_schedule(&s->deferred_notify);
+ }
+
+ spin_unlock_irqrestore(&s->lock, iflags);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(shm_signal_enable);
+
+int shm_signal_disable(struct shm_signal *s, int flags)
+{
+ struct shm_signal_irq *irq = &s->desc->irq[s->locale];
+
+ irq->enabled = 0;
+ wmb();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(shm_signal_disable);
+
+/*
+ * signaling protocol:
+ *
+ * each side of the shm_signal has an "irq" structure with the following
+ * fields:
+ *
+ * - enabled: controlled by shm_signal_enable/disable() to mask/unmask
+ * the notification locally
+ * - dirty: indicates if the shared-memory is dirty or clean. This
+ * is updated regardless of the enabled/pending state so that
+ * the state is always accurately tracked.
+ * - pending: indicates if a signal is pending to the remote locale.
+ * This allows us to determine if a remote-notification is
+ * already in flight to optimize spurious notifications away.
+ */
+int shm_signal_inject(struct shm_signal *s, int flags)
+{
+ /* Load the irq structure from the other locale */
+ struct shm_signal_irq *irq = &s->desc->irq[!s->locale];
+
+ /*
+ * We always mark the remote side as dirty regardless of whether
+ * they need to be notified.
+ */
+ irq->dirty = 1;
+ wmb(); /* dirty must be visible before we test the pending state */
+
+ if (irq->enabled && !irq->pending) {
+ rmb();
+
+ /*
+ * If the remote side has enabled notifications, and we do
+ * not see a notification pending, we must inject a new one.
+ */
+ irq->pending = 1;
+ wmb(); /* make it visible before we do the injection */
+
+ s->ops->inject(s);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(shm_signal_inject);
+
+void _shm_signal_wakeup(struct shm_signal *s)
+{
+ struct shm_signal_irq *irq = &s->desc->irq[s->locale];
+ int dirty;
+ unsigned long flags;
+
+ spin_lock_irqsave(&s->lock, flags);
+
+ __set_bit(shm_signal_in_wakeup, &s->flags);
+
+ /*
+ * The outer loop protects against race conditions between
+ * irq->dirty and irq->pending updates
+ */
+ while (irq->enabled && (irq->dirty || irq->pending)) {
+
+ /*
+ * Run until we completely exhaust irq->dirty (it may
+ * be re-dirtied by the remote side while we are in the
+ * callback). We let "pending" remain untouched until we have
+ * processed them all so that the remote side knows we do not
+ * need a new notification (yet).
+ */
+ do {
+ irq->dirty = 0;
+ /* the unlock is an implicit wmb() for dirty = 0 */
+ spin_unlock_irqrestore(&s->lock, flags);
+
+ if (s->notifier)
+ s->notifier->signal(s->notifier);
+
+ spin_lock_irqsave(&s->lock, flags);
+ dirty = irq->dirty;
+ rmb();
+
+ } while (irq->enabled && dirty);
+
+ barrier();
+
+ /*
+ * We can finally acknowledge the notification by clearing
+ * "pending" after all of the dirty memory has been processed
+ * Races against this clearing are handled by the outer loop.
+ * Subsequent iterations of this loop will execute with
+ * pending=0 potentially leading to future spurious
+ * notifications, but this is an acceptable tradeoff as this
+ * will be rare and harmless.
+ */
+ irq->pending = 0;
+ wmb();
+
+ }
+
+ __clear_bit(shm_signal_in_wakeup, &s->flags);
+ spin_unlock_irqrestore(&s->lock, flags);
+
+}
+EXPORT_SYMBOL_GPL(_shm_signal_wakeup);
+
+void _shm_signal_release(struct shm_signal *s)
+{
+ s->ops->release(s);
+}
+EXPORT_SYMBOL_GPL(_shm_signal_release);
+
+static void
+deferred_notify(unsigned long data)
+{
+ struct shm_signal *s = (struct shm_signal *)data;
+
+ _shm_signal_wakeup(s);
+}
+
+void shm_signal_init(struct shm_signal *s)
+{
+ memset(s, 0, sizeof(*s));
+ atomic_set(&s->refs, 1);
+ spin_lock_init(&s->lock);
+ tasklet_init(&s->deferred_notify,
+ deferred_notify,
+ (unsigned long)s);
+}
+EXPORT_SYMBOL_GPL(shm_signal_init);
This will generally be used by hypervisors to publish any host-side
virtual devices up to a guest. The guest will have the opportunity
to consume any devices present on the vbus-proxy as if they were
platform devices, similar to existing buses like PCI.
Signed-off-by: Gregory Haskins <[email protected]>
---
include/linux/vbus_driver.h | 73 +++++++++++++++++++++
kernel/vbus/Kconfig | 9 +++
kernel/vbus/Makefile | 4 +
kernel/vbus/proxy.c | 152 +++++++++++++++++++++++++++++++++++++++++++
4 files changed, 238 insertions(+), 0 deletions(-)
create mode 100644 include/linux/vbus_driver.h
create mode 100644 kernel/vbus/proxy.c
diff --git a/include/linux/vbus_driver.h b/include/linux/vbus_driver.h
new file mode 100644
index 0000000..c53e13f
--- /dev/null
+++ b/include/linux/vbus_driver.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Mediates access to a host VBUS from a guest kernel by providing a
+ * global view of all VBUS devices
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_VBUS_DRIVER_H
+#define _LINUX_VBUS_DRIVER_H
+
+#include <linux/device.h>
+#include <linux/shm_signal.h>
+
+struct vbus_device_proxy;
+struct vbus_driver;
+
+struct vbus_device_proxy_ops {
+ int (*open)(struct vbus_device_proxy *dev, int version, int flags);
+ int (*close)(struct vbus_device_proxy *dev, int flags);
+ int (*shm)(struct vbus_device_proxy *dev, int id, int prio,
+ void *ptr, size_t len,
+ struct shm_signal_desc *sigdesc, struct shm_signal **signal,
+ int flags);
+ int (*call)(struct vbus_device_proxy *dev, u32 func,
+ void *data, size_t len, int flags);
+ void (*release)(struct vbus_device_proxy *dev);
+};
+
+struct vbus_device_proxy {
+ char *type;
+ u64 id;
+ void *priv; /* Used by drivers */
+ struct vbus_device_proxy_ops *ops;
+ struct device dev;
+};
+
+int vbus_device_proxy_register(struct vbus_device_proxy *dev);
+void vbus_device_proxy_unregister(struct vbus_device_proxy *dev);
+
+struct vbus_device_proxy *vbus_device_proxy_find(u64 id);
+
+struct vbus_driver_ops {
+ int (*probe)(struct vbus_device_proxy *dev);
+ int (*remove)(struct vbus_device_proxy *dev);
+};
+
+struct vbus_driver {
+ char *type;
+ struct module *owner;
+ struct vbus_driver_ops *ops;
+ struct device_driver drv;
+};
+
+int vbus_driver_register(struct vbus_driver *drv);
+void vbus_driver_unregister(struct vbus_driver *drv);
+
+#endif /* _LINUX_VBUS_DRIVER_H */
diff --git a/kernel/vbus/Kconfig b/kernel/vbus/Kconfig
index f2b92f5..3aaa085 100644
--- a/kernel/vbus/Kconfig
+++ b/kernel/vbus/Kconfig
@@ -12,3 +12,12 @@ config VBUS
various tasks and devices which reside on the bus.
If unsure, say N
+
+config VBUS_DRIVERS
+ tristate "VBUS Driver support"
+ default n
+ help
+ Adds support for a virtual bus model for proxying drivers.
+
+ If unsure, say N
+
diff --git a/kernel/vbus/Makefile b/kernel/vbus/Makefile
index 4d440e5..d028ece 100644
--- a/kernel/vbus/Makefile
+++ b/kernel/vbus/Makefile
@@ -1 +1,5 @@
obj-$(CONFIG_VBUS) += core.o devclass.o config.o attribute.o map.o client.o
+
+vbus-proxy-objs += proxy.o
+obj-$(CONFIG_VBUS_DRIVERS) += vbus-proxy.o
+
diff --git a/kernel/vbus/proxy.c b/kernel/vbus/proxy.c
new file mode 100644
index 0000000..ea48f00
--- /dev/null
+++ b/kernel/vbus/proxy.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/vbus_driver.h>
+
+MODULE_AUTHOR("Gregory Haskins");
+MODULE_LICENSE("GPL");
+
+#define VBUS_PROXY_NAME "vbus-proxy"
+
+static struct vbus_device_proxy *to_dev(struct device *_dev)
+{
+ return _dev ? container_of(_dev, struct vbus_device_proxy, dev) : NULL;
+}
+
+static struct vbus_driver *to_drv(struct device_driver *_drv)
+{
+ return container_of(_drv, struct vbus_driver, drv);
+}
+
+/*
+ * This function is invoked whenever a new driver and/or device is added
+ * to check if there is a match
+ */
+static int vbus_dev_proxy_match(struct device *_dev, struct device_driver *_drv)
+{
+ struct vbus_device_proxy *dev = to_dev(_dev);
+ struct vbus_driver *drv = to_drv(_drv);
+
+ return !strcmp(dev->type, drv->type);
+}
+
+/*
+ * This function is invoked after the bus infrastructure has already made a
+ * match. The device will contain a reference to the paired driver which
+ * we will extract.
+ */
+static int vbus_dev_proxy_probe(struct device *_dev)
+{
+ int ret = 0;
+ struct vbus_device_proxy *dev = to_dev(_dev);
+ struct vbus_driver *drv = to_drv(_dev->driver);
+
+ if (drv->ops->probe)
+ ret = drv->ops->probe(dev);
+
+ return ret;
+}
+
+static struct bus_type vbus_proxy = {
+ .name = VBUS_PROXY_NAME,
+ .match = vbus_dev_proxy_match,
+};
+
+static struct device vbus_proxy_rootdev = {
+ .parent = NULL,
+ .bus_id = VBUS_PROXY_NAME,
+};
+
+static int __init vbus_init(void)
+{
+ int ret;
+
+ ret = bus_register(&vbus_proxy);
+ BUG_ON(ret < 0);
+
+ ret = device_register(&vbus_proxy_rootdev);
+ BUG_ON(ret < 0);
+
+ return 0;
+}
+
+postcore_initcall(vbus_init);
+
+static void device_release(struct device *dev)
+{
+ struct vbus_device_proxy *_dev;
+
+ _dev = container_of(dev, struct vbus_device_proxy, dev);
+
+ _dev->ops->release(_dev);
+}
+
+int vbus_device_proxy_register(struct vbus_device_proxy *new)
+{
+ new->dev.parent = &vbus_proxy_rootdev;
+ new->dev.bus = &vbus_proxy;
+ new->dev.release = &device_release;
+
+ return device_register(&new->dev);
+}
+EXPORT_SYMBOL_GPL(vbus_device_proxy_register);
+
+void vbus_device_proxy_unregister(struct vbus_device_proxy *dev)
+{
+ device_unregister(&dev->dev);
+}
+EXPORT_SYMBOL_GPL(vbus_device_proxy_unregister);
+
+static int match_device_id(struct device *_dev, void *data)
+{
+ struct vbus_device_proxy *dev = to_dev(_dev);
+ u64 id = *(u64 *)data;
+
+ return dev->id == id;
+}
+
+struct vbus_device_proxy *vbus_device_proxy_find(u64 id)
+{
+ struct device *dev;
+
+ dev = bus_find_device(&vbus_proxy, NULL, &id, &match_device_id);
+
+ return to_dev(dev);
+}
+EXPORT_SYMBOL_GPL(vbus_device_proxy_find);
+
+int vbus_driver_register(struct vbus_driver *new)
+{
+ new->drv.bus = &vbus_proxy;
+ new->drv.name = new->type;
+ new->drv.owner = new->owner;
+ new->drv.probe = vbus_dev_proxy_probe;
+
+ return driver_register(&new->drv);
+}
+EXPORT_SYMBOL_GPL(vbus_driver_register);
+
+void vbus_driver_unregister(struct vbus_driver *drv)
+{
+ driver_unregister(&drv->drv);
+}
+EXPORT_SYMBOL_GPL(vbus_driver_unregister);
+
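The to_dev()/to_drv() helpers in this file are instances of the kernel's container_of() idiom: the generic `struct device` is embedded inside the larger proxy structure, and the helper recovers the outer object from a pointer to the embedded member. A minimal userspace sketch of the same idiom (all type names here are illustrative stand-ins, not from the patch):

```c
#include <assert.h>
#include <stddef.h>

/* Userspace re-implementation of the kernel's container_of() macro */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Stand-ins for struct device / struct vbus_device_proxy */
struct fake_device {
	const char *name;
};

struct fake_proxy {
	int id;
	struct fake_device dev;	/* embedded, like vbus_device_proxy.dev */
};

/* Equivalent of to_dev(): recover the outer proxy from the inner device */
static struct fake_proxy *fake_to_dev(struct fake_device *_dev)
{
	return _dev ? container_of(_dev, struct fake_proxy, dev) : NULL;
}
```

The NULL check mirrors to_dev() above, which must tolerate a NULL `struct device` from the bus core.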
We need to get hotswap events in environments which cannot use existing
facilities (e.g. inotify). So we add a notifier-chain to allow client
callbacks whenever an interface is {un}registered.
Signed-off-by: Gregory Haskins <[email protected]>
---
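The registration semantics this patch implements (replay a DEVADD for every device already on the bus before linking the new client into the chain) can be sketched in plain C. This is a single-threaded toy chain for illustration only; the kernel code uses `struct notifier_block` and serializes under vbus->lock:

```c
#include <assert.h>
#include <stddef.h>

/* Event types mirroring the patch (values are illustrative) */
enum { EV_DEVADD, EV_DEVDROP };

struct notifier {
	int (*call)(struct notifier *nb, int event, void *data);
	struct notifier *next;
};

/* A bus with a fixed-size device table and a notifier chain head */
struct bus {
	unsigned long devs[8];
	int ndevs;
	struct notifier *chain;
};

/* Mirror of vbus_notifier_register(): replay existing devices, then link */
static int bus_notifier_register(struct bus *b, struct notifier *nb)
{
	int i;

	for (i = 0; i < b->ndevs; i++)
		if (nb->call(nb, EV_DEVADD, &b->devs[i]) < 0)
			return -1;	/* client rejected the resync */

	nb->next = b->chain;
	b->chain = nb;
	return 0;
}

/* Mirror of raw_notifier_call_chain() for later hotswap events */
static void bus_notify(struct bus *b, int event, void *data)
{
	struct notifier *nb;

	for (nb = b->chain; nb; nb = nb->next)
		nb->call(nb, event, data);
}

/* Example client: counts the DEVADD events it has seen */
static int seen;
static int count_call(struct notifier *nb, int event, void *data)
{
	(void)nb; (void)data;
	if (event == EV_DEVADD)
		seen++;
	return 0;
}

static int demo(void)
{
	struct bus b = { .devs = { 1, 2 }, .ndevs = 2, .chain = NULL };
	struct notifier nb = { .call = count_call, .next = NULL };
	unsigned long newdev = 3;

	bus_notifier_register(&b, &nb);	/* replays devices 1 and 2 */
	bus_notify(&b, EV_DEVADD, &newdev);	/* later hotswap event */
	return seen;
}
```

The replay step is why clients cannot miss devices that were registered before they attached.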
include/linux/vbus.h | 15 +++++++++++++
kernel/vbus/core.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/vbus/vbus.h | 1 +
3 files changed, 75 insertions(+), 0 deletions(-)
diff --git a/include/linux/vbus.h b/include/linux/vbus.h
index 5f0566c..04db4ff 100644
--- a/include/linux/vbus.h
+++ b/include/linux/vbus.h
@@ -29,6 +29,7 @@
#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/vbus_device.h>
+#include <linux/notifier.h>
struct vbus;
struct task_struct;
@@ -137,6 +138,20 @@ static inline void task_vbus_disassociate(struct task_struct *p)
}
}
+enum {
+ VBUS_EVENT_DEVADD,
+ VBUS_EVENT_DEVDROP,
+};
+
+struct vbus_event_devadd {
+ const char *type;
+ unsigned long id;
+};
+
+int vbus_notifier_register(struct vbus *vbus, struct notifier_block *nb);
+int vbus_notifier_unregister(struct vbus *vbus, struct notifier_block *nb);
+
+
#else /* CONFIG_VBUS */
#define fork_vbus(p) do { } while (0)
diff --git a/kernel/vbus/core.c b/kernel/vbus/core.c
index 033999f..b6df487 100644
--- a/kernel/vbus/core.c
+++ b/kernel/vbus/core.c
@@ -89,6 +89,7 @@ int vbus_device_interface_register(struct vbus_device *dev,
{
int ret;
struct vbus_devshell *ds = to_devshell(dev->kobj);
+ struct vbus_event_devadd ev;
mutex_lock(&vbus->lock);
@@ -124,6 +125,14 @@ int vbus_device_interface_register(struct vbus_device *dev,
if (ret)
goto error;
+ ev.type = intf->type;
+ ev.id = intf->id;
+
+ /* and let any clients know about the new device */
+ ret = raw_notifier_call_chain(&vbus->notifier, VBUS_EVENT_DEVADD, &ev);
+ if (ret < 0)
+ goto error;
+
mutex_unlock(&vbus->lock);
return 0;
@@ -144,6 +153,7 @@ int vbus_device_interface_unregister(struct vbus_device_interface *intf)
mutex_lock(&vbus->lock);
_interface_unregister(intf);
+ raw_notifier_call_chain(&vbus->notifier, VBUS_EVENT_DEVDROP, &intf->id);
mutex_unlock(&vbus->lock);
kobject_put(&intf->kobj);
@@ -346,6 +356,8 @@ int vbus_create(const char *name, struct vbus **bus)
_bus->next_id = 0;
+ RAW_INIT_NOTIFIER_HEAD(&_bus->notifier);
+
mutex_lock(&vbus_root.lock);
ret = map_add(&vbus_root.buses.map, &_bus->node);
@@ -358,6 +370,53 @@ int vbus_create(const char *name, struct vbus **bus)
return 0;
}
+#define for_each_rbnode(node, root) \
+ for (node = rb_first(root); node != NULL; node = rb_next(node))
+
+int vbus_notifier_register(struct vbus *vbus, struct notifier_block *nb)
+{
+ int ret;
+ struct rb_node *node;
+
+ mutex_lock(&vbus->lock);
+
+ /*
+ * resync the client for any devices we might already have
+ */
+ for_each_rbnode(node, &vbus->devices.map.root) {
+ struct vbus_device_interface *intf = node_to_intf(node);
+ struct vbus_event_devadd ev = {
+ .type = intf->type,
+ .id = intf->id,
+ };
+
+ ret = nb->notifier_call(nb, VBUS_EVENT_DEVADD, &ev);
+ if (ret & NOTIFY_STOP_MASK) {
+ mutex_unlock(&vbus->lock);
+ return -EPERM;
+ }
+ }
+
+ ret = raw_notifier_chain_register(&vbus->notifier, nb);
+
+ mutex_unlock(&vbus->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vbus_notifier_register);
+
+int vbus_notifier_unregister(struct vbus *vbus, struct notifier_block *nb)
+{
+ int ret;
+
+ mutex_lock(&vbus->lock);
+ ret = raw_notifier_chain_unregister(&vbus->notifier, nb);
+ mutex_unlock(&vbus->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vbus_notifier_unregister);
+
static void devshell_release(struct kobject *kobj)
{
struct vbus_devshell *ds = container_of(kobj,
diff --git a/kernel/vbus/vbus.h b/kernel/vbus/vbus.h
index 1266d69..cd2676b 100644
--- a/kernel/vbus/vbus.h
+++ b/kernel/vbus/vbus.h
@@ -51,6 +51,7 @@ struct vbus {
struct vbus_subdir members;
unsigned long next_id;
struct rb_node node;
+ struct raw_notifier_head notifier;
};
struct vbus_member {
We expect to have various types of connection-clients (e.g. userspace,
kvm, etc), each of which is likely to have common access patterns and
marshalling duties. Therefore we create a "client" API to simplify
client development by helping with mundane tasks such as handle-to-pointer
translation, etc.
Special thanks to Pat Mullaney for suggesting the optimization to pass
a cookie object down during DEVICESHM operations to save lookup overhead
on the event channel.
Signed-off-by: Gregory Haskins <[email protected]>
---
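The handle-to-pointer translation mentioned above works by handing the client an opaque 64-bit handle and then validating it against a map of live objects on every call, rather than trusting a cast of untrusted client data. A userspace sketch of that validation pattern (a linear table stands in for the patch's rb-tree map; names are illustrative):

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>

/* A "connection" object; its address doubles as the opaque handle
 * returned to the client, as _deviceopen() does in this patch. */
struct conn {
	int devid;
};

/* Registry of live connections (the patch uses an rb-tree map) */
static struct conn *live[8];

static uint64_t conn_register(struct conn *c)
{
	size_t i;

	for (i = 0; i < sizeof(live) / sizeof(live[0]); i++)
		if (!live[i]) {
			live[i] = c;
			return (uint64_t)(uintptr_t)c;
		}
	return 0;
}

/* Mirror of connection_find(): never cast the client-supplied handle
 * straight to a pointer; only believe it if it is in the registry. */
static struct conn *conn_find(uint64_t handle)
{
	size_t i;

	for (i = 0; i < sizeof(live) / sizeof(live[0]); i++)
		if (live[i] && (uint64_t)(uintptr_t)live[i] == handle)
			return live[i];
	return NULL;
}
```

The kernel version additionally takes a reference on the found object under the client lock before returning it, so it cannot be torn down while in use.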
include/linux/vbus_client.h | 115 +++++++++
kernel/vbus/Makefile | 2
kernel/vbus/client.c | 543 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 659 insertions(+), 1 deletions(-)
create mode 100644 include/linux/vbus_client.h
create mode 100644 kernel/vbus/client.c
diff --git a/include/linux/vbus_client.h b/include/linux/vbus_client.h
new file mode 100644
index 0000000..62dab78
--- /dev/null
+++ b/include/linux/vbus_client.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Virtual-Bus - Client interface
+ *
+ * We expect to have various types of connection-clients (e.g. userspace,
+ * kvm, etc). Each client will be connecting from some environment outside
+ * of the kernel, and therefore will not have direct access to the API as
+ * presented in ./linux/vbus.h. There will undoubtedly be some parameter
+ * marshalling that must occur, as well as common patterns for the handling
+ * of those marshalled parameters (e.g. translating a handle into a pointer,
+ * etc).
+ *
+ * Therefore this "client" API is provided to simplify the development
+ * of any clients. Of course, a client is free to bypass this API entirely
+ * and communicate with the direct VBUS API if desired.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_VBUS_CLIENT_H
+#define _LINUX_VBUS_CLIENT_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+struct vbus_deviceopen {
+ __u32 devid;
+ __u32 version; /* device ABI version */
+ __u64 handle; /* return value for devh */
+};
+
+struct vbus_devicecall {
+ __u64 devh; /* device-handle (returned from DEVICEOPEN) */
+ __u32 func;
+ __u32 len;
+ __u32 flags;
+ __u64 datap;
+};
+
+struct vbus_deviceshm {
+ __u64 devh; /* device-handle (returned from DEVICEOPEN) */
+ __u32 id;
+ __u32 len;
+ __u32 flags;
+ struct {
+ __u32 offset;
+ __u32 prio;
+ __u64 cookie; /* token to pass back when signaling client */
+ } signal;
+ __u64 datap;
+ __u64 handle; /* return value for signaling from client to kernel */
+};
+
+#ifdef __KERNEL__
+
+#include <linux/ioq.h>
+#include <linux/module.h>
+#include <asm/atomic.h>
+
+struct vbus_client;
+
+struct vbus_client_ops {
+ int (*deviceopen)(struct vbus_client *client, struct vbus_memctx *ctx,
+ __u32 devid, __u32 version, __u64 *devh);
+ int (*deviceclose)(struct vbus_client *client, __u64 devh);
+ int (*devicecall)(struct vbus_client *client,
+ __u64 devh, __u32 func,
+ void *data, __u32 len, __u32 flags);
+ int (*deviceshm)(struct vbus_client *client,
+ __u64 devh, __u32 id,
+ struct vbus_shm *shm, struct shm_signal *signal,
+ __u32 flags, __u64 *handle);
+ int (*shmsignal)(struct vbus_client *client, __u64 handle);
+ void (*release)(struct vbus_client *client);
+};
+
+struct vbus_client {
+ atomic_t refs;
+ struct vbus_client_ops *ops;
+};
+
+static inline void vbus_client_get(struct vbus_client *client)
+{
+ atomic_inc(&client->refs);
+}
+
+static inline void vbus_client_put(struct vbus_client *client)
+{
+ if (atomic_dec_and_test(&client->refs))
+ client->ops->release(client);
+}
+
+struct vbus_client *vbus_client_attach(struct vbus *bus);
+
+extern struct vbus_memctx *current_memctx;
+struct vbus_memctx *task_memctx_alloc(struct task_struct *task);
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_VBUS_CLIENT_H */
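The vbus_client_get()/vbus_client_put() inlines above implement release-on-last-put reference counting. A single-threaded userspace sketch of the same pattern (a plain int stands in for atomic_t, and the struct names are illustrative):

```c
#include <assert.h>
#include <stdlib.h>

struct obj {
	int refs;
	int *released;	/* test hook: set when release fires */
};

static void obj_release(struct obj *o)
{
	if (o->released)
		*o->released = 1;
	free(o);
}

static void obj_get(struct obj *o)
{
	o->refs++;
}

/* The last put runs the release callback, as vbus_client_put() does */
static void obj_put(struct obj *o)
{
	if (--o->refs == 0)
		obj_release(o);
}

static struct obj *obj_alloc(int *released)
{
	struct obj *o = calloc(1, sizeof(*o));

	if (!o)
		return NULL;
	o->refs = 1;	/* caller owns the initial reference */
	o->released = released;
	return o;
}
```

In the kernel the count must be an atomic_t because gets and puts can race from multiple contexts; the structure of the logic is otherwise the same.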
diff --git a/kernel/vbus/Makefile b/kernel/vbus/Makefile
index 367f65b..4d440e5 100644
--- a/kernel/vbus/Makefile
+++ b/kernel/vbus/Makefile
@@ -1 +1 @@
-obj-$(CONFIG_VBUS) += core.o devclass.o config.o attribute.o map.o
+obj-$(CONFIG_VBUS) += core.o devclass.o config.o attribute.o map.o client.o
diff --git a/kernel/vbus/client.c b/kernel/vbus/client.c
new file mode 100644
index 0000000..f9c3dcf
--- /dev/null
+++ b/kernel/vbus/client.c
@@ -0,0 +1,543 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/uaccess.h>
+#include <linux/vbus.h>
+#include <linux/vbus_client.h>
+#include "vbus.h"
+
+static int
+nodeptr_item_compare(struct rb_node *lhs, struct rb_node *rhs)
+{
+ unsigned long l = (unsigned long)lhs;
+ unsigned long r = (unsigned long)rhs;
+
+ return (l < r) ? -1 : (l > r);
+}
+
+static int
+nodeptr_key_compare(const void *key, struct rb_node *node)
+{
+ unsigned long item = (unsigned long)node;
+ unsigned long _key = *(unsigned long *)key;
+
+ return (_key < item) ? -1 : (_key > item);
+}
+
+static struct map_ops nodeptr_map_ops = {
+ .key_compare = &nodeptr_key_compare,
+ .item_compare = &nodeptr_item_compare,
+};
+
+struct _signal {
+ atomic_t refs;
+ struct rb_node node;
+ struct list_head list;
+ struct shm_signal *signal;
+};
+
+struct _connection {
+ atomic_t refs;
+ struct rb_node node;
+ struct list_head signals;
+ struct vbus_connection *conn;
+ unsigned int closed:1;
+};
+
+static inline void _signal_get(struct _signal *_signal)
+{
+ atomic_inc(&_signal->refs);
+}
+
+static inline void _signal_put(struct _signal *_signal)
+{
+ if (atomic_dec_and_test(&_signal->refs)) {
+ shm_signal_put(_signal->signal);
+ kfree(_signal);
+ }
+}
+
+static inline void conn_get(struct _connection *_conn)
+{
+ atomic_inc(&_conn->refs);
+}
+
+static inline void conn_close(struct _connection *_conn)
+{
+ struct vbus_connection *conn = _conn->conn;
+
+ if (conn->ops->close)
+ conn->ops->close(conn);
+
+ _conn->closed = true;
+}
+
+static inline void conn_put(struct _connection *_conn)
+{
+ if (atomic_dec_and_test(&_conn->refs)) {
+ struct _signal *_signal, *tmp;
+
+ if (!_conn->closed)
+ conn_close(_conn);
+
+ list_for_each_entry_safe(_signal, tmp, &_conn->signals,
+ list) {
+ list_del(&_signal->list);
+ _signal_put(_signal);
+ }
+
+ vbus_connection_put(_conn->conn);
+ kfree(_conn);
+ }
+}
+
+struct _client {
+ struct mutex lock;
+ struct map conn_map;
+ struct map signal_map;
+ struct vbus *vbus;
+ struct vbus_client client;
+};
+
+static struct _connection *to_conn(struct rb_node *node)
+{
+ return node ? container_of(node, struct _connection, node) : NULL;
+}
+
+static struct _signal *to_signal(struct rb_node *node)
+{
+ return node ? container_of(node, struct _signal, node) : NULL;
+}
+
+static struct _client *to_client(struct vbus_client *client)
+{
+ return container_of(client, struct _client, client);
+}
+
+static struct _connection *
+connection_find(struct _client *c, unsigned long devid)
+{
+ struct _connection *_conn;
+
+ /*
+ * We could, in theory, cast devid to _conn->node, but this would
+ * be pretty stupid to trust. Therefore, we must validate that
+ * the pointer is legit by seeing if it exists in our conn_map
+ */
+
+ mutex_lock(&c->lock);
+
+ _conn = to_conn(map_find(&c->conn_map, &devid));
+ if (likely(_conn))
+ conn_get(_conn);
+
+ mutex_unlock(&c->lock);
+
+ return _conn;
+}
+
+static int
+_deviceopen(struct vbus_client *client, struct vbus_memctx *ctx,
+ __u32 devid, __u32 version, __u64 *devh)
+{
+ struct _client *c = to_client(client);
+ struct vbus_connection *conn;
+ struct _connection *_conn;
+ struct vbus_device_interface *intf = NULL;
+ int ret;
+
+ /*
+ * We only get here if the device has never been opened before,
+ * so we need to create a new connection
+ */
+ ret = vbus_interface_find(c->vbus, devid, &intf);
+ if (ret < 0)
+ return ret;
+
+ ret = intf->ops->open(intf, ctx, version, &conn);
+ kobject_put(&intf->kobj);
+ if (ret < 0)
+ return ret;
+
+ _conn = kzalloc(sizeof(*_conn), GFP_KERNEL);
+ if (!_conn) {
+ vbus_connection_put(conn);
+ return -ENOMEM;
+ }
+
+ atomic_set(&_conn->refs, 1);
+ _conn->conn = conn;
+
+ INIT_LIST_HEAD(&_conn->signals);
+
+ mutex_lock(&c->lock);
+ ret = map_add(&c->conn_map, &_conn->node);
+ mutex_unlock(&c->lock);
+
+ if (ret < 0) {
+ conn_put(_conn);
+ return ret;
+ }
+
+ /* in theory, &_conn->node should be unique */
+ *devh = (__u64)(unsigned long)&_conn->node;
+
+ return 0;
+
+}
+
+/*
+ * Assumes client->lock is held (or we are releasing and don't need to lock)
+ */
+static void
+conn_del(struct _client *c, struct _connection *_conn)
+{
+ struct _signal *_signal, *tmp;
+
+ /* Delete and release each opened queue */
+ list_for_each_entry_safe(_signal, tmp, &_conn->signals, list) {
+ map_del(&c->signal_map, &_signal->node);
+ _signal_put(_signal);
+ }
+
+ map_del(&c->conn_map, &_conn->node);
+}
+
+static int
+_deviceclose(struct vbus_client *client, __u64 devh)
+{
+ struct _client *c = to_client(client);
+ struct _connection *_conn;
+
+ mutex_lock(&c->lock);
+
+ _conn = to_conn(map_find(&c->conn_map, &devh));
+ if (likely(_conn))
+ conn_del(c, _conn);
+
+ mutex_unlock(&c->lock);
+
+ if (unlikely(!_conn))
+ return -ENOENT;
+
+ conn_close(_conn);
+
+ /* this _put is the complement to the _get performed at _deviceopen */
+ conn_put(_conn);
+
+ return 0;
+}
+
+static int
+_devicecall(struct vbus_client *client,
+ __u64 devh, __u32 func, void *data, __u32 len, __u32 flags)
+{
+ struct _client *c = to_client(client);
+ struct _connection *_conn;
+ struct vbus_connection *conn;
+ int ret;
+
+ _conn = connection_find(c, devh);
+ if (!_conn)
+ return -ENOENT;
+
+ conn = _conn->conn;
+
+ ret = conn->ops->call(conn, func, data, len, flags);
+
+ conn_put(_conn);
+
+ return ret;
+}
+
+static int
+_deviceshm(struct vbus_client *client,
+ __u64 devh,
+ __u32 id,
+ struct vbus_shm *shm,
+ struct shm_signal *signal,
+ __u32 flags,
+ __u64 *handle)
+{
+ struct _client *c = to_client(client);
+ struct _signal *_signal = NULL;
+ struct _connection *_conn;
+ struct vbus_connection *conn;
+ int ret;
+
+ *handle = 0;
+
+ _conn = connection_find(c, devh);
+ if (!_conn)
+ return -ENOENT;
+
+ conn = _conn->conn;
+
+ ret = conn->ops->shm(conn, id, shm, signal, flags);
+ if (ret < 0) {
+ conn_put(_conn);
+ return ret;
+ }
+
+ if (signal) {
+ _signal = kzalloc(sizeof(*_signal), GFP_KERNEL);
+ if (!_signal) {
+ conn_put(_conn);
+ return -ENOMEM;
+ }
+
+ /* one for map-ref, one for list-ref */
+ atomic_set(&_signal->refs, 2);
+ _signal->signal = signal;
+ shm_signal_get(signal);
+
+ mutex_lock(&c->lock);
+ ret = map_add(&c->signal_map, &_signal->node);
+ list_add_tail(&_signal->list, &_conn->signals);
+ mutex_unlock(&c->lock);
+
+ if (!ret)
+ *handle = (__u64)(unsigned long)&_signal->node;
+ }
+
+ conn_put(_conn);
+
+ return 0;
+}
+
+static int
+_shmsignal(struct vbus_client *client, __u64 handle)
+{
+ struct _client *c = to_client(client);
+ struct _signal *_signal;
+
+ mutex_lock(&c->lock);
+
+ _signal = to_signal(map_find(&c->signal_map, &handle));
+ if (likely(_signal))
+ _signal_get(_signal);
+
+ mutex_unlock(&c->lock);
+
+ if (!_signal)
+ return -ENOENT;
+
+ _shm_signal_wakeup(_signal->signal);
+
+ _signal_put(_signal);
+
+ return 0;
+}
+
+static void
+_release(struct vbus_client *client)
+{
+ struct _client *c = to_client(client);
+ struct rb_node *node;
+
+ /* Drop all of our open connections */
+ while ((node = rb_first(&c->conn_map.root))) {
+ struct _connection *_conn = to_conn(node);
+
+ conn_del(c, _conn);
+ conn_put(_conn);
+ }
+
+ vbus_put(c->vbus);
+ kfree(c);
+}
+
+static struct vbus_client_ops _client_ops = {
+ .deviceopen = _deviceopen,
+ .deviceclose = _deviceclose,
+ .devicecall = _devicecall,
+ .deviceshm = _deviceshm,
+ .shmsignal = _shmsignal,
+ .release = _release,
+};
+
+struct vbus_client *vbus_client_attach(struct vbus *vbus)
+{
+ struct _client *c;
+
+ BUG_ON(!vbus);
+
+ c = kzalloc(sizeof(*c), GFP_KERNEL);
+ if (!c)
+ return NULL;
+
+ atomic_set(&c->client.refs, 1);
+ c->client.ops = &_client_ops;
+
+ mutex_init(&c->lock);
+ map_init(&c->conn_map, &nodeptr_map_ops);
+ map_init(&c->signal_map, &nodeptr_map_ops);
+ c->vbus = vbus_get(vbus);
+
+ return &c->client;
+}
+EXPORT_SYMBOL_GPL(vbus_client_attach);
+
+/*
+ * memory context helpers
+ */
+
+static unsigned long
+current_memctx_copy_to(struct vbus_memctx *ctx, void *dst, const void *src,
+ unsigned long len)
+{
+ return copy_to_user(dst, src, len);
+}
+
+static unsigned long
+current_memctx_copy_from(struct vbus_memctx *ctx, void *dst, const void *src,
+ unsigned long len)
+{
+ return copy_from_user(dst, src, len);
+}
+
+static void
+current_memctx_release(struct vbus_memctx *ctx)
+{
+ panic("dropped last reference to current_memctx");
+}
+
+static struct vbus_memctx_ops current_memctx_ops = {
+ .copy_to = &current_memctx_copy_to,
+ .copy_from = &current_memctx_copy_from,
+ .release = &current_memctx_release,
+};
+
+static struct vbus_memctx _current_memctx =
+ VBUS_MEMCTX_INIT((&current_memctx_ops));
+
+struct vbus_memctx *current_memctx = &_current_memctx;
+
+/*
+ * task_mem allows you to have a copy_from_user/copy_to_user like
+ * environment, except that it supports copying to tasks other
+ * than "current" as ctu/cfu() do
+ */
+struct task_memctx {
+ struct task_struct *task;
+ struct vbus_memctx ctx;
+};
+
+static struct task_memctx *to_task_memctx(struct vbus_memctx *ctx)
+{
+ return container_of(ctx, struct task_memctx, ctx);
+}
+
+static unsigned long
+task_memctx_copy_to(struct vbus_memctx *ctx, void *dst, const void *src,
+ unsigned long n)
+{
+ struct task_memctx *tm = to_task_memctx(ctx);
+ struct task_struct *p = tm->task;
+
+ while (n) {
+ unsigned long offset = ((unsigned long)dst)%PAGE_SIZE;
+ unsigned long len = PAGE_SIZE - offset;
+ int ret;
+ struct page *pg;
+ void *maddr;
+
+ if (len > n)
+ len = n;
+
+ down_read(&p->mm->mmap_sem);
+ ret = get_user_pages(p, p->mm,
+ (unsigned long)dst, 1, 1, 0, &pg, NULL);
+
+ if (ret != 1) {
+ up_read(&p->mm->mmap_sem);
+ break;
+ }
+
+ maddr = kmap_atomic(pg, KM_USER0);
+ memcpy(maddr + offset, src, len);
+ kunmap_atomic(maddr, KM_USER0);
+ set_page_dirty_lock(pg);
+ put_page(pg);
+ up_read(&p->mm->mmap_sem);
+
+ src += len;
+ dst += len;
+ n -= len;
+ }
+
+ return n;
+}
+
+static unsigned long
+task_memctx_copy_from(struct vbus_memctx *ctx, void *dst, const void *src,
+ unsigned long n)
+{
+ struct task_memctx *tm = to_task_memctx(ctx);
+ struct task_struct *p = tm->task;
+
+ while (n) {
+ unsigned long offset = ((unsigned long)src)%PAGE_SIZE;
+ unsigned long len = PAGE_SIZE - offset;
+ int ret;
+ struct page *pg;
+ void *maddr;
+
+ if (len > n)
+ len = n;
+
+ down_read(&p->mm->mmap_sem);
+ ret = get_user_pages(p, p->mm,
+ (unsigned long)src, 1, 0, 0, &pg, NULL);
+
+ if (ret != 1) {
+ up_read(&p->mm->mmap_sem);
+ break;
+ }
+
+ maddr = kmap_atomic(pg, KM_USER0);
+ memcpy(dst, maddr + offset, len);
+ kunmap_atomic(maddr, KM_USER0);
+ put_page(pg);
+ up_read(&p->mm->mmap_sem);
+
+ src += len;
+ dst += len;
+ n -= len;
+ }
+
+ return n;
+}
+
+static void
+task_memctx_release(struct vbus_memctx *ctx)
+{
+ struct task_memctx *tm = to_task_memctx(ctx);
+
+ put_task_struct(tm->task);
+ kfree(tm);
+}
+
+static struct vbus_memctx_ops task_memctx_ops = {
+ .copy_to = &task_memctx_copy_to,
+ .copy_from = &task_memctx_copy_from,
+ .release = &task_memctx_release,
+};
+
+struct vbus_memctx *task_memctx_alloc(struct task_struct *task)
+{
+ struct task_memctx *tm;
+
+ tm = kzalloc(sizeof(*tm), GFP_KERNEL);
+ if (!tm)
+ return NULL;
+
+ get_task_struct(task);
+
+ tm->task = task;
+ vbus_memctx_init(&tm->ctx, &task_memctx_ops);
+
+ return &tm->ctx;
+}
+EXPORT_SYMBOL_GPL(task_memctx_alloc);
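The task_memctx copy routines above walk the transfer one page at a time because get_user_pages()/kmap_atomic() can only map a single page, so each chunk is clipped to never cross a page boundary. The chunking arithmetic in isolation, as a userspace sketch with a tiny fake page size (memcpy stands in for the map-and-copy step):

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define FAKE_PAGE_SIZE 16UL	/* tiny "page" so a test crosses boundaries */

/* Mirror of the task_memctx_copy_to() loop: split the copy into chunks
 * that never cross a page boundary in the destination, the way the
 * kernel code must pin and map one page at a time. */
static unsigned long chunked_copy(void *dst, const void *src, unsigned long n)
{
	char *d = dst;
	const char *s = src;

	while (n) {
		unsigned long offset = (unsigned long)(uintptr_t)d % FAKE_PAGE_SIZE;
		unsigned long len = FAKE_PAGE_SIZE - offset;

		if (len > n)
			len = n;

		/* stands in for get_user_pages() + kmap_atomic() + memcpy() */
		memcpy(d, s, len);

		s += len;
		d += len;
		n -= len;
	}

	return n;	/* bytes NOT copied, mirroring copy_to_user() */
}
```

Note the return convention: like copy_to_user(), the function returns the number of bytes left uncopied, so 0 means success.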
See Documentation/vbus.txt for details
Signed-off-by: Gregory Haskins <[email protected]>
---
Documentation/vbus.txt | 386 +++++++++++++++++++++++++++++
arch/x86/Kconfig | 2
fs/proc/base.c | 96 +++++++
include/linux/sched.h | 4
include/linux/vbus.h | 147 +++++++++++
include/linux/vbus_device.h | 417 ++++++++++++++++++++++++++++++++
kernel/Makefile | 1
kernel/exit.c | 2
kernel/fork.c | 2
kernel/vbus/Kconfig | 14 +
kernel/vbus/Makefile | 1
kernel/vbus/attribute.c | 52 ++++
kernel/vbus/config.c | 275 +++++++++++++++++++++
kernel/vbus/core.c | 567 +++++++++++++++++++++++++++++++++++++++++++
kernel/vbus/devclass.c | 124 +++++++++
kernel/vbus/map.c | 72 +++++
kernel/vbus/map.h | 41 +++
kernel/vbus/vbus.h | 116 +++++++++
18 files changed, 2319 insertions(+), 0 deletions(-)
create mode 100644 Documentation/vbus.txt
create mode 100644 include/linux/vbus.h
create mode 100644 include/linux/vbus_device.h
create mode 100644 kernel/vbus/Kconfig
create mode 100644 kernel/vbus/Makefile
create mode 100644 kernel/vbus/attribute.c
create mode 100644 kernel/vbus/config.c
create mode 100644 kernel/vbus/core.c
create mode 100644 kernel/vbus/devclass.c
create mode 100644 kernel/vbus/map.c
create mode 100644 kernel/vbus/map.h
create mode 100644 kernel/vbus/vbus.h
diff --git a/Documentation/vbus.txt b/Documentation/vbus.txt
new file mode 100644
index 0000000..e8a05da
--- /dev/null
+++ b/Documentation/vbus.txt
@@ -0,0 +1,386 @@
+
+Virtual-Bus:
+======================
+Author: Gregory Haskins <[email protected]>
+
+
+
+
+What is it?
+--------------------
+
+Virtual-Bus is a kernel-based IO resource container technology. It is modeled
+on a concept similar to the Linux Device-Model (LDM), where we have buses,
+devices, and drivers as the primary actors. However, VBUS has several
+distinctions when contrasted with LDM:
+
+ 1) "Busses" in LDM are relatively static and global to the kernel (e.g.
+ "PCI", "USB", etc). VBUS buses are arbitrarily created and destroyed
+ dynamically, and are not globally visible. Instead they are defined as
+ visible only to a specific subset of the system (the contained context).
+ 2) "Devices" in LDM are typically tangible physical (or sometimes logical)
+ devices. VBUS devices are purely software abstractions (which may or
+ may not have one or more physical devices behind them). Devices may
+ also be arbitrarily created or destroyed by software/administrative action
+ as opposed to by a hardware discovery mechanism.
+ 3) "Drivers" in LDM sit within the same kernel context as the busses and
+ devices they interact with. VBUS drivers live in a foreign
+ context (such as userspace, or a virtual-machine guest).
+
+The idea is that a vbus is created to contain access to some IO services.
+Virtual devices are then instantiated and linked to a bus to grant access to
+drivers actively present on the bus. Drivers will only have visibility to
+devices present on their respective bus, and nothing else.
+
+Virtual devices are defined by modules which register a deviceclass with the
+system. A deviceclass simply represents a type of device that _may_ be
+instantiated into a device, should an administrator wish to do so. Once
+this has happened, the device may be associated with one or more buses where
+it will become visible to all clients of those respective buses.
+
+Why do we need this?
+----------------------
+
+There are various reasons why such a construct may be useful. One of the
+most interesting use cases is for virtualization, such as KVM. Hypervisors
+today provide virtualized IO resources to a guest, but this is often at a cost
+in both latency and throughput compared to bare metal performance. Utilizing
+para-virtual resources instead of emulated devices helps to mitigate this
+penalty, but even these techniques to date have not fully realized the
+potential of the underlying bare-metal hardware.
+
+Some of the performance differential is unavoidable just given the extra
+processing that occurs due to the deeper stack (guest+host). However, some of
+this overhead is a direct result of the rather indirect path most hypervisors
+use to route IO. For instance, KVM uses PIO faults from the guest to trigger
+a guest->host-kernel->host-userspace->host-kernel sequence of events.
+Contrast this to a typical userspace application on the host which must only
+traverse app->kernel for most IO.
+
+The fact is that the Linux kernel is already great at managing access to IO
+resources. Therefore, if you have a hypervisor that is based on the Linux
+kernel, is there some way that we can allow the hypervisor to manage IO
+directly instead of forcing this convoluted path?
+
+The short answer is: "not yet" ;)
+
+In order to use such a concept, we need some new facilities. For one, we
+need to be able to define containers with their corresponding access-control so
+that guests do not have unmitigated access to anything they wish. Second,
+we also need to define some form of memory access that is uniform in the face
+of various clients (e.g. "copy_to_user()" cannot be assumed to work for, say,
+a KVM vcpu context). Lastly, we need to provide access to these resources in
+a way that makes sense for the application, such as asynchronous communication
+paths and minimizing context switches.
+
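The uniform memory access described here is what the vbus_memctx abstraction (introduced with the client API in this series) provides: device code copies through an ops table instead of calling copy_to_user() directly, so the same code serves userspace tasks, kvm vcpus, and so on. A userspace sketch of the shape of that abstraction (illustrative only, not the actual kernel API):

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>

struct memctx;

struct memctx_ops {
	unsigned long (*copy_to)(struct memctx *ctx, void *dst,
				 const void *src, unsigned long len);
};

struct memctx {
	struct memctx_ops *ops;
};

/* "current" context: the client's memory is directly addressable, so
 * this stands in for the copy_to_user() based implementation. */
static unsigned long direct_copy_to(struct memctx *ctx, void *dst,
				    const void *src, unsigned long len)
{
	(void)ctx;
	memcpy(dst, src, len);
	return 0;	/* bytes left uncopied, like copy_to_user() */
}

static struct memctx_ops direct_ops = { .copy_to = direct_copy_to };
static struct memctx direct_ctx = { .ops = &direct_ops };

/* Device-side code only ever sees the abstraction */
static unsigned long memctx_copy_to(struct memctx *ctx, void *dst,
				    const void *src, unsigned long len)
{
	return ctx->ops->copy_to(ctx, dst, src, len);
}
```

A kvm connector would supply a different ops table that resolves guest-physical addresses instead; the device model is unchanged either way.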
+So we introduce VBUS as a framework to provide such facilities. The net
+result is a *substantial* reduction in IO overhead, even when compared to
+state of the art para-virtualization techniques (such as virtio-net).
+
+How do I use it?
+------------------------
+
+There are two components to utilizing a virtual-bus. One is the
+administrative function (creating and configuring a bus and its devices). The
+other is the consumption of the resources on the bus by a client (e.g. a
+virtual machine, or a userspace application). The former occurs on the host
+kernel by means of interacting with various special filesystems (e.g. sysfs,
+configfs, etc). The latter occurs by means of a "vbus connector" which must
+be developed specifically to bridge a particular environment. To date, we
+have developed such connectors for host-userspace and kvm-guests. Conceivably
+we could develop other connectors as needs arise (e.g. lguest, xen,
+guest-userspace, etc). This document deals with the administrative interface.
+Details about developing a connector are out of scope for this document.
+
+Interacting with vbus
+------------------------
+
+The first step is to enable virtual-bus support (CONFIG_VBUS) as well as any
+desired vbus-device modules (e.g. CONFIG_VBUS_VENETTAP), and ensure that your
+environment mounts both sysfs and configfs somewhere in the filesystem. This
+document will assume they are mounted to /sys and /config, respectively.
+
+VBUS will create a top-level directory "vbus" in each of the two respective
+filesystems. At boot-up, they will look like the following:
+
+/sys/vbus/
+|-- deviceclass
+|-- devices
+|-- instances
+`-- version
+
+/config/vbus/
+|-- devices
+`-- instances
+
+Following their respective roles, /config/vbus is for userspace to manage the
+lifetime of some number of objects/attributes. This is in contrast to
+/sys/vbus which is a reflection of objects managed by the kernel. It is
+assumed the reader is already familiar with these two facilities, so we will
+not go into depth about their general operation. Suffice to say that vbus
+consists of objects that are managed both by userspace and the kernel.
+Modification of objects via /config/vbus will typically be reflected in the
+/sys/vbus area.
+
+It all starts with a deviceclass
+--------------------------------
+
+Before you can do anything useful with vbus, you need some registered
+deviceclasses. A deviceclass provides the implementation of a specific type
+of virtual device. A deviceclass will typically be registered by loading a
+kernel-module. Once loaded, the available device types are enumerated under
+/sys/vbus/deviceclass. For example, we will load our "venet-tap" module,
+which provides network services:
+
+# modprobe venet-tap
+# tree /sys/vbus
+/sys/vbus
+|-- deviceclass
+| `-- venet-tap
+|-- devices
+|-- instances
+`-- version
+
+An administrative agent should be able to enumerate /sys/vbus/deviceclass to
+determine what services are available on a given platform.
+
+Create the container
+-------------------
+
+The next step is to create a new container. In vbus, this comes in the form
+of a vbus-instance and it is created by a simple "mkdir" in the
+/config/vbus/instances area. The only requirement is that the instance is
+given a host-wide unique name. This may be some kind of association to the
+application (e.g. the unique VM GUID) or it can be arbitrary. For the
+purposes of example, we will let $(uuidgen) generate a random UUID for us.
+
+# mkdir /config/vbus/instances/$(uuidgen)
+# tree /sys/vbus/
+/sys/vbus/
+|-- deviceclass
+| `-- venet-tap
+|-- devices
+|-- instances
+| `-- beb4df8f-7483-4028-b3f7-767512e2a18c
+| |-- devices
+| `-- members
+`-- version
+
+So we can see that we have now created a vbus called
+
+ "beb4df8f-7483-4028-b3f7-767512e2a18c"
+
+in the /config area, and it was immediately reflected in the
+/sys/vbus/instances area as well (with a few subobjects of its own: "devices"
+and "members"). The "devices" object denotes any devices that are present on
+the bus (in this case: none). Likewise, "members" denotes the pids of any
+tasks that are members of the bus (in this case: none). We will come back to
+this later. For now, we move on to the next step.
+
+Create a device instance
+------------------------
+
+Devices are instantiated by again utilizing the /config/vbus configfs area.
+At first you may suspect that devices are created as subordinate objects of a
+bus/container instance, but you would be mistaken. Devices are actually
+root-level objects in vbus specifically to allow greater flexibility in the
+association of a device. For instance, it may be desirable to have a single
+device that spans multiple VMs (consider an ethernet switch, or a shared disk
+for a cluster). Therefore, device lifecycles are managed by creating/deleting
+objects in /config/vbus/devices.
+
+Note: Creating a device instance is actually a two-step process: We need to
+give the device instance a unique name, and we also need to give it a specific
+device type. It is hard to express both parameters using standard filesystem
+operations like mkdir, so the design decision was made to require performing
+the operation in two steps.
+
+Our first step is to create a unique instance. We will again utilize
+$(uuidgen) to yield an arbitrary name. Any name will suffice as long as it is
+unique on this particular host.
+
+# mkdir /config/vbus/devices/$(uuidgen)
+# tree /sys/vbus
+/sys/vbus
+|-- deviceclass
+| `-- venet-tap
+|-- devices
+| `-- 6a1aff24-5dc0-4aea-9c35-435daef90e55
+| `-- interfaces
+|-- instances
+| `-- beb4df8f-7483-4028-b3f7-767512e2a18c
+| |-- devices
+| `-- members
+`-- version
+
+At this point we have created a partial instance, since we have not yet
+assigned a type to the device. Even so, we can see that some state has
+changed under /sys/vbus/devices. We now have an instance named
+
+ 6a1aff24-5dc0-4aea-9c35-435daef90e55
+
+and it has a single subordinate object: "interfaces". This object in
+particular is provided by the infrastructure, though do note that a
+deviceclass may also provide its own attributes/objects once it is created.
+
+We will go ahead and give this device a type to complete its construction. We
+do this by setting the /config/vbus/devices/$devname/type attribute with a
+valid deviceclass type:
+
+# echo foo > /config/vbus/devices/6a1aff24-5dc0-4aea-9c35-435daef90e55/type
+bash: echo: write error: No such file or directory
+
+Oops! What happened? "foo" is not a valid deviceclass. We need to consult
+the /sys/vbus/deviceclass area to find out what our options are:
+
+# tree /sys/vbus/deviceclass/
+/sys/vbus/deviceclass/
+`-- venet-tap
+
+Let's try again:
+
+# echo venet-tap > /config/vbus/devices/6a1aff24-5dc0-4aea-9c35-435daef90e55/type
+# tree /sys/vbus/
+/sys/vbus/
+|-- deviceclass
+| `-- venet-tap
+|-- devices
+| `-- 6a1aff24-5dc0-4aea-9c35-435daef90e55
+| |-- class -> ../../deviceclass/venet-tap
+| |-- client_mac
+| |-- enabled
+| |-- host_mac
+| |-- ifname
+| `-- interfaces
+|-- instances
+| `-- beb4df8f-7483-4028-b3f7-767512e2a18c
+| |-- devices
+| `-- members
+`-- version
+
+Ok, that looks better. And note that /sys/vbus/devices now has some more
+subordinate objects. Most of those were registered when the venet-tap
+deviceclass was given a chance to create an instance of itself. Those
+attributes are a property of the venet-tap and therefore are out of scope
+for this document. Please see the documentation that accompanies a particular
+module for more details.
+
+Put the device on the bus
+-------------------------
+
+The next administrative step is to associate our new device with our bus.
+This is accomplished using a symbolic link from the bus instance to our device
+instance.
+
+# ln -s /config/vbus/devices/6a1aff24-5dc0-4aea-9c35-435daef90e55/ /config/vbus/instances/beb4df8f-7483-4028-b3f7-767512e2a18c/
+# tree /sys/vbus/
+/sys/vbus/
+|-- deviceclass
+| `-- venet-tap
+|-- devices
+| `-- 6a1aff24-5dc0-4aea-9c35-435daef90e55
+| |-- class -> ../../deviceclass/venet-tap
+| |-- client_mac
+| |-- enabled
+| |-- host_mac
+| |-- ifname
+| `-- interfaces
+| `-- 0 -> ../../../instances/beb4df8f-7483-4028-b3f7-767512e2a18c/devices/0
+|-- instances
+| `-- beb4df8f-7483-4028-b3f7-767512e2a18c
+| |-- devices
+| | `-- 0
+| | |-- device -> ../../../../devices/6a1aff24-5dc0-4aea-9c35-435daef90e55
+| | `-- type
+| `-- members
+`-- version
+
+We can now see that the device indicates that it has an interface registered
+to a bus:
+
+/sys/vbus/devices/6a1aff24-5dc0-4aea-9c35-435daef90e55/interfaces/
+`-- 0 -> ../../../instances/beb4df8f-7483-4028-b3f7-767512e2a18c/devices/0
+
+Likewise, we can see that the bus has a device listed (id = "0"):
+
+/sys/vbus/instances/beb4df8f-7483-4028-b3f7-767512e2a18c/devices/
+`-- 0
+ |-- device -> ../../../../devices/6a1aff24-5dc0-4aea-9c35-435daef90e55
+ `-- type
+
+At this point, our container is ready for use. However, it currently has 0
+members, so let's fix that.
+
+Add some members
+----------------
+
+Membership is controlled by an attribute: /proc/$pid/vbus. A pid can only be
+a member of one (or zero) busses at a time. To establish membership, we set
+the name of the bus, like so:
+
+# echo beb4df8f-7483-4028-b3f7-767512e2a18c > /proc/self/vbus
+# tree /sys/vbus/
+/sys/vbus/
+|-- deviceclass
+| `-- venet-tap
+|-- devices
+| `-- 6a1aff24-5dc0-4aea-9c35-435daef90e55
+| |-- class -> ../../deviceclass/venet-tap
+| |-- client_mac
+| |-- enabled
+| |-- host_mac
+| |-- ifname
+| `-- interfaces
+| `-- 0 -> ../../../instances/beb4df8f-7483-4028-b3f7-767512e2a18c/devices/0
+|-- instances
+| `-- beb4df8f-7483-4028-b3f7-767512e2a18c
+| |-- devices
+| | `-- 0
+| | |-- device -> ../../../../devices/6a1aff24-5dc0-4aea-9c35-435daef90e55
+| | `-- type
+| `-- members
+| |-- 4382
+| `-- 4588
+`-- version
+
+Whoa! Why are there two members? VBUS membership is inherited by forked
+tasks. Therefore, 4382 is the pid of our shell (which we set via /proc/self),
+and 4588 is the pid of the forked/exec'ed "tree" process. This property can
+be useful for having things like qemu set up the bus and then forking each
+vcpu which will inherit access.
+
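A task's current association can be read back through the same /proc attribute (the handler prints "<none>" for an unassociated task). Continuing the session above, a sketch of what you might see:

```shell
# cat /proc/self/vbus
beb4df8f-7483-4028-b3f7-767512e2a18c
```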
+At this point, we are ready to roll. Pid 4382 has access to a virtual-bus
+namespace with one device, id=0. Its type is:
+
+# cat /sys/vbus/instances/beb4df8f-7483-4028-b3f7-767512e2a18c/devices/0/type
+virtual-ethernet
+
+"virtual-ethernet"? Why is it not "venet-tap"? Device-classes are allowed to
+register their interfaces under an id that is not required to be the same as
+their deviceclass. This supports device polymorphism. For instance,
+consider that an interface "virtual-ethernet" may provide basic 802.x packet
+exchange. However, we could have various implementations of a device that
+support this 802.x interface, each with a completely different backend.
+
+For instance, "venet-tap" might act like a tuntap module, while
+"venet-loopback" would loop packets back and "venet-switch" would form a
+layer-2 domain among the participating guests. All three modules would
+presumably support the same basic 802.x interface, yet all three have
+completely different implementations.
+
+Drivers on this particular bus would see this instance id=0 as a type
+"virtual-ethernet" even though the underlying implementation happens to be a
+tap device. This means a single driver that supports the protocol advertised
+by the "virtual-ethernet" type would be able to support the plethora of
+available device types that we may wish to create.
+
+Teardown
+--------
+
+We can deconstruct a vbus container by doing pretty much the opposite of what
+we did to create it. Echo "0" into /proc/self/vbus, rm the symlink between the
+bus and device, and rmdir the bus and device objects. Once that is done, we
+can even rmmod the venet-tap module. Note that the infrastructure will
+maintain a module-ref while it is configured in a container, so be sure to
+completely tear down the vbus/device before trying this.
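Putting the prose above together, a hypothetical teardown session for the example bus/device (your uuidgen-generated names will differ) might look like:

```shell
# echo 0 > /proc/self/vbus
# rm /config/vbus/instances/beb4df8f-7483-4028-b3f7-767512e2a18c/6a1aff24-5dc0-4aea-9c35-435daef90e55
# rmdir /config/vbus/devices/6a1aff24-5dc0-4aea-9c35-435daef90e55
# rmdir /config/vbus/instances/beb4df8f-7483-4028-b3f7-767512e2a18c
# rmmod venet-tap
```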
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bc2fbad..3fca247 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1939,6 +1939,8 @@ source "drivers/pcmcia/Kconfig"
source "drivers/pci/hotplug/Kconfig"
+source "kernel/vbus/Kconfig"
+
endmenu
diff --git a/fs/proc/base.c b/fs/proc/base.c
index beaa0ce..03993fb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -80,6 +80,7 @@
#include <linux/oom.h>
#include <linux/elf.h>
#include <linux/pid_namespace.h>
+#include <linux/vbus.h>
#include "internal.h"
/* NOTE:
@@ -1065,6 +1066,98 @@ static const struct file_operations proc_oom_adjust_operations = {
.write = oom_adjust_write,
};
+#ifdef CONFIG_VBUS
+
+static ssize_t vbus_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+ struct vbus *vbus;
+ const char *name;
+ char buffer[256];
+ size_t len;
+
+ if (!task)
+ return -ESRCH;
+
+ vbus = task_vbus_get(task);
+
+ put_task_struct(task);
+
+ name = vbus_name(vbus);
+
+ len = snprintf(buffer, sizeof(buffer), "%s\n", name ? name : "<none>");
+
+ vbus_put(vbus);
+
+ return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t vbus_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task;
+ struct vbus *vbus = NULL;
+ char buffer[256];
+ int disable = 0;
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count))
+ return -EFAULT;
+
+	if (count && buffer[count-1] == '\n')
+ buffer[count-1] = 0;
+
+ task = get_proc_task(file->f_path.dentry->d_inode);
+ if (!task)
+ return -ESRCH;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ put_task_struct(task);
+ return -EACCES;
+ }
+
+ if (strcmp(buffer, "0") == 0)
+ disable = 1;
+ else
+ vbus = vbus_find(buffer);
+
+ if (disable || vbus)
+ task_vbus_disassociate(task);
+
+ if (vbus) {
+ int ret = vbus_associate(vbus, task);
+
+ if (ret < 0)
+			printk(KERN_ERR
+			       "vbus: could not associate %s/%d with bus %s\n",
+			       task->comm, task->pid, vbus_name(vbus));
+ else
+ rcu_assign_pointer(task->vbus, vbus);
+
+ vbus_put(vbus); /* Counter the vbus_find() */
+ } else if (!disable) {
+ put_task_struct(task);
+ return -ENOENT;
+ }
+
+ put_task_struct(task);
+
+ if (count == sizeof(buffer)-1)
+ return -EIO;
+
+ return count;
+}
+
+static const struct file_operations proc_vbus_operations = {
+ .read = vbus_read,
+ .write = vbus_write,
+};
+
+#endif /* CONFIG_VBUS */
+
#ifdef CONFIG_AUDITSYSCALL
#define TMPBUFLEN 21
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
@@ -2556,6 +2649,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
INF("io", S_IRUGO, proc_tgid_io_accounting),
#endif
+#ifdef CONFIG_VBUS
+ REG("vbus", S_IRUGO|S_IWUSR, proc_vbus_operations),
+#endif
};
static int proc_tgid_base_readdir(struct file * filp,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 011db2f..cd2f9b1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -97,6 +97,7 @@ struct futex_pi_state;
struct robust_list_head;
struct bio;
struct bts_tracer;
+struct vbus;
/*
* List of flags we want to share for kernel threads,
@@ -1329,6 +1330,9 @@ struct task_struct {
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
#endif
+#ifdef CONFIG_VBUS
+ struct vbus *vbus;
+#endif
/* journalling filesystem info */
void *journal_info;
diff --git a/include/linux/vbus.h b/include/linux/vbus.h
new file mode 100644
index 0000000..5f0566c
--- /dev/null
+++ b/include/linux/vbus.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Virtual-Bus
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_VBUS_H
+#define _LINUX_VBUS_H
+
+#ifdef CONFIG_VBUS
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/rcupdate.h>
+#include <linux/vbus_device.h>
+
+struct vbus;
+struct task_struct;
+
+/**
+ * vbus_associate() - associate a task with a vbus
+ * @vbus: The bus context to associate with
+ * @p: The task to associate
+ *
+ * This function adds a task as a member of a vbus. Tasks must be members
+ * of a bus before they are allowed to use its resources. Tasks may only
+ * associate with a single bus at a time.
+ *
+ * Note: children inherit any association present at fork().
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int vbus_associate(struct vbus *vbus, struct task_struct *p);
+
+/**
+ * vbus_disassociate() - disassociate a task from a vbus
+ * @vbus: The bus context to disassociate from
+ * @p: The task to disassociate
+ *
+ * This function removes a task as a member of a vbus.
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int vbus_disassociate(struct vbus *vbus, struct task_struct *p);
+
+struct vbus *vbus_get(struct vbus *);
+void vbus_put(struct vbus *);
+
+/**
+ * vbus_name() - returns the name of a bus
+ * @vbus: The bus context
+ *
+ * Returns: (char *) name of bus
+ *
+ **/
+const char *vbus_name(struct vbus *vbus);
+
+/**
+ * vbus_find() - retrieves a vbus pointer from its name
+ * @name: The name of the bus to find
+ *
+ * Returns: NULL = failure, non-null = (vbus *)bus-pointer
+ *
+ **/
+struct vbus *vbus_find(const char *name);
+
+/**
+ * task_vbus_get() - retrieves an associated vbus pointer from a task
+ * @p: The task context
+ *
+ * Safely retrieves a pointer to an associated (if any) vbus from a task
+ *
+ * Returns: NULL = no association, non-null = (vbus *)bus-pointer
+ *
+ **/
+static inline struct vbus *task_vbus_get(struct task_struct *p)
+{
+ struct vbus *vbus;
+
+ rcu_read_lock();
+ vbus = rcu_dereference(p->vbus);
+ if (vbus)
+ vbus_get(vbus);
+ rcu_read_unlock();
+
+ return vbus;
+}
+
+/**
+ * fork_vbus() - Helper function to handle associated task forking
+ * @p: The task context
+ *
+ **/
+static inline void fork_vbus(struct task_struct *p)
+{
+ struct vbus *vbus = task_vbus_get(p);
+
+	if (vbus) {
+		int ret = vbus_associate(vbus, p);
+
+		BUG_ON(ret < 0);
+		vbus_put(vbus);
+	}
+}
+
+/**
+ * task_vbus_disassociate() - Helper function to handle disassociating tasks
+ * @p: The task context
+ *
+ **/
+static inline void task_vbus_disassociate(struct task_struct *p)
+{
+ struct vbus *vbus = task_vbus_get(p);
+
+ if (vbus) {
+ rcu_assign_pointer(p->vbus, NULL);
+ synchronize_rcu();
+
+ vbus_disassociate(vbus, p);
+ vbus_put(vbus);
+ }
+}
+
+#else /* CONFIG_VBUS */
+
+#define fork_vbus(p) do { } while (0)
+#define task_vbus_disassociate(p) do { } while (0)
+
+#endif /* CONFIG_VBUS */
+
+#endif /* _LINUX_VBUS_H */
diff --git a/include/linux/vbus_device.h b/include/linux/vbus_device.h
new file mode 100644
index 0000000..f73cd86
--- /dev/null
+++ b/include/linux/vbus_device.h
@@ -0,0 +1,417 @@
+/*
+ * VBUS device models
+ *
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file deals primarily with the definitions for interfacing a virtual
+ * device model to a virtual bus. In a nutshell, a devclass begets a device,
+ * which begets a device_interface, which begets a connection.
+ *
+ * devclass
+ * -------
+ *
+ * To develop a vbus device, it all starts with a devclass. You must register
+ * a devclass using vbus_devclass_register(). Each registered devclass is
+ * enumerated under /sys/vbus/deviceclass.
+ *
+ * In and of itself, a devclass doesn't do much. It is just an object factory
+ * a device whose lifetime is managed by userspace. When userspace decides
+ * it would like to create an instance of a particular devclass, the
+ * devclass::create() callback is invoked (registered as part of the ops
+ * structure during vbus_devclass_register()). How and when userspace decides
+ * to do this is beyond the scope of this document. Please see:
+ *
+ * Documentation/vbus.txt
+ *
+ * for more details.
+ *
+ * device
+ * -------
+ *
+ * A vbus device is created by a particular devclass during the invocation
+ * of its devclass::create() callback. A device is initially created without
+ * any association with a bus. One or more buses may attempt to connect to
+ * a device (controlled, again, by userspace). When this occurs, a
+ * device::bus_connect() callback is invoked.
+ *
+ * This bus_connect() callback gives the device a chance to decide if it will
+ * accept the connection, and if so, to register its interfaces. Most devices
+ * will likely only allow a connection to one bus. Therefore, they may return
+ * -EBUSY if another bus is already connected.
+ *
+ * If the device accepts the connection, it should register one or more
+ * interfaces with the bus using vbus_device_interface_register(). Most
+ * devices will only support one interface, and therefore will only invoke
+ * this method once. However, some more elaborate devices may have multiple
+ * functions, or abstracted topologies. Therefore they may opt at their own
+ * discretion to register more than one interface. The interfaces do not need
+ * to be uniform in type.
+ *
+ * device_interface
+ * -------------------
+ *
+ * The purpose of an interface is twofold: 1) advertise a particular ABI
+ * for communication to a driver, 2) handle the initial connection of a driver.
+ *
+ * As such, a device_interface has a string "type" (which is akin to the
+ * abi type that this interface supports, like a PCI-ID). It also sports
+ * an interface::open() method.
+ *
+ * The interface::open callback is invoked whenever a driver attempts to
+ * connect to this device. The device implements its own policy regarding
+ * whether it accepts multiple connections or not. Most devices will likely
+ * only accept one connection at a time, and therefore will return -EBUSY if
+ * subsequent attempts are made.
+ *
+ * However, if successful, interface::open() should return a
+ * vbus_connection object.
+ *
+ * connections
+ * -----------
+ *
+ * A connection represents an interface that is successfully opened. It will
+ * remain in an active state as long as the client retains the connection.
+ * The connection::release() method is invoked if the client should die,
+ * restart, or explicitly close the connection. The device-model should use
+ * this release() callback as the indication to clean up any resources
+ * associated with a particular connection such as allocated queues, etc.
+ *
+ * ---
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_VBUS_DEVICE_H
+#define _LINUX_VBUS_DEVICE_H
+
+#include <linux/module.h>
+#include <linux/configfs.h>
+#include <linux/rbtree.h>
+#include <linux/shm_signal.h>
+#include <linux/vbus.h>
+#include <asm/atomic.h>
+
+struct vbus_device_interface;
+struct vbus_connection;
+struct vbus_device;
+struct vbus_devclass;
+struct vbus_memctx;
+
+/*
+ * ----------------------
+ * devclass
+ * ----------------------
+ */
+struct vbus_devclass_ops {
+ int (*create)(struct vbus_devclass *dc,
+ struct vbus_device **dev);
+ void (*release)(struct vbus_devclass *dc);
+};
+
+struct vbus_devclass {
+ const char *name;
+ struct vbus_devclass_ops *ops;
+ struct rb_node node;
+ struct kobject kobj;
+ struct module *owner;
+};
+
+/**
+ * vbus_devclass_register() - register a devclass with the system
+ * @devclass: The devclass context to register
+ *
+ * Establishes a new device-class for consumption. Registered device-classes
+ * are enumerated under /sys/vbus/deviceclass. For more details, please see
+ * Documentation/vbus*
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int vbus_devclass_register(struct vbus_devclass *devclass);
+
+/**
+ * vbus_devclass_unregister() - unregister a devclass with the system
+ * @devclass: The devclass context to unregister
+ *
+ * Removes a devclass from the system
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int vbus_devclass_unregister(struct vbus_devclass *devclass);
+
+/**
+ * vbus_devclass_get() - acquire a devclass context reference
+ * @devclass: devclass context
+ *
+ **/
+static inline struct vbus_devclass *
+vbus_devclass_get(struct vbus_devclass *devclass)
+{
+ if (!try_module_get(devclass->owner))
+ return NULL;
+
+ kobject_get(&devclass->kobj);
+ return devclass;
+}
+
+/**
+ * vbus_devclass_put() - release a devclass context reference
+ * @devclass: devclass context
+ *
+ **/
+static inline void
+vbus_devclass_put(struct vbus_devclass *devclass)
+{
+ kobject_put(&devclass->kobj);
+ module_put(devclass->owner);
+}
+
+/*
+ * ----------------------
+ * device
+ * ----------------------
+ */
+struct vbus_device_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct vbus_device *dev,
+ struct vbus_device_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct vbus_device *dev,
+ struct vbus_device_attribute *attr,
+ const char *buf, size_t count);
+};
+
+struct vbus_device_ops {
+ int (*bus_connect)(struct vbus_device *dev, struct vbus *vbus);
+ int (*bus_disconnect)(struct vbus_device *dev, struct vbus *vbus);
+ void (*release)(struct vbus_device *dev);
+};
+
+struct vbus_device {
+ const char *type;
+ struct vbus_device_ops *ops;
+ struct attribute_group *attrs;
+ struct kobject *kobj;
+};
+
+/*
+ * ----------------------
+ * device_interface
+ * ----------------------
+ */
+struct vbus_device_interface_ops {
+ int (*open)(struct vbus_device_interface *intf,
+ struct vbus_memctx *ctx,
+ int version,
+ struct vbus_connection **conn);
+ void (*release)(struct vbus_device_interface *intf);
+};
+
+struct vbus_device_interface {
+ const char *name;
+ const char *type;
+ struct vbus_device_interface_ops *ops;
+ unsigned long id;
+ struct vbus_device *dev;
+ struct vbus *vbus;
+ struct rb_node node;
+ struct kobject kobj;
+};
+
+/**
+ * vbus_device_interface_register() - register an interface with a bus
+ * @dev: The device context of the caller
+ * @vbus: The bus context to register with
+ * @intf: The interface context to register
+ *
+ * This function is invoked (usually in the context of a device::bus_connect()
+ * callback) to register a interface on a bus. We make this an explicit
+ * operation instead of implicit on the bus_connect() to facilitate devices
+ * that may present multiple interfaces to a bus. In those cases, a device
+ * may invoke this function multiple times (one per supported interface).
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int vbus_device_interface_register(struct vbus_device *dev,
+ struct vbus *vbus,
+ struct vbus_device_interface *intf);
+
+/**
+ * vbus_device_interface_unregister() - unregister an interface with a bus
+ * @intf: The interface context to unregister
+ *
+ * This function is the converse of interface_register. It is typically
+ * invoked in the context of a device::bus_disconnect().
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int vbus_device_interface_unregister(struct vbus_device_interface *intf);
+
+/*
+ * ----------------------
+ * memory context
+ * ----------------------
+ */
+struct vbus_memctx_ops {
+ unsigned long (*copy_to)(struct vbus_memctx *ctx,
+ void *dst,
+ const void *src,
+ unsigned long len);
+ unsigned long (*copy_from)(struct vbus_memctx *ctx,
+ void *dst,
+ const void *src,
+ unsigned long len);
+ void (*release)(struct vbus_memctx *ctx);
+};
+
+struct vbus_memctx {
+ atomic_t refs;
+ struct vbus_memctx_ops *ops;
+};
+
+static inline void
+vbus_memctx_init(struct vbus_memctx *ctx, struct vbus_memctx_ops *ops)
+{
+ memset(ctx, 0, sizeof(*ctx));
+ atomic_set(&ctx->refs, 1);
+ ctx->ops = ops;
+}
+
+#define VBUS_MEMCTX_INIT(_ops) { \
+ .refs = ATOMIC_INIT(1), \
+ .ops = _ops, \
+}
+
+static inline void
+vbus_memctx_get(struct vbus_memctx *ctx)
+{
+ atomic_inc(&ctx->refs);
+}
+
+static inline void
+vbus_memctx_put(struct vbus_memctx *ctx)
+{
+ if (atomic_dec_and_test(&ctx->refs))
+ ctx->ops->release(ctx);
+}
+
+/*
+ * ----------------------
+ * shared memory
+ * ----------------------
+ */
+struct vbus_shm;
+
+struct vbus_shm_ops {
+ void (*release)(struct vbus_shm *shm);
+};
+
+struct vbus_shm {
+ atomic_t refs;
+ struct vbus_shm_ops *ops;
+ void *ptr;
+ size_t len;
+};
+
+static inline void
+vbus_shm_init(struct vbus_shm *shm, struct vbus_shm_ops *ops,
+ void *ptr, size_t len)
+{
+ memset(shm, 0, sizeof(*shm));
+ atomic_set(&shm->refs, 1);
+ shm->ops = ops;
+ shm->ptr = ptr;
+ shm->len = len;
+}
+
+static inline void
+vbus_shm_get(struct vbus_shm *shm)
+{
+ atomic_inc(&shm->refs);
+}
+
+static inline void
+vbus_shm_put(struct vbus_shm *shm)
+{
+ if (atomic_dec_and_test(&shm->refs))
+ shm->ops->release(shm);
+}
+
+/*
+ * ----------------------
+ * connection
+ * ----------------------
+ */
+struct vbus_connection_ops {
+ int (*call)(struct vbus_connection *conn,
+ unsigned long func,
+ void *data,
+ unsigned long len,
+ unsigned long flags);
+ int (*shm)(struct vbus_connection *conn,
+ unsigned long id,
+ struct vbus_shm *shm,
+ struct shm_signal *signal,
+ unsigned long flags);
+ void (*close)(struct vbus_connection *conn);
+ void (*release)(struct vbus_connection *conn);
+};
+
+struct vbus_connection {
+ atomic_t refs;
+ struct vbus_connection_ops *ops;
+};
+
+/**
+ * vbus_connection_init() - initialize a vbus_connection
+ * @conn: connection context
+ * @ops: ops structure to assign to context
+ *
+ **/
+static inline void vbus_connection_init(struct vbus_connection *conn,
+ struct vbus_connection_ops *ops)
+{
+ memset(conn, 0, sizeof(*conn));
+ atomic_set(&conn->refs, 1);
+ conn->ops = ops;
+}
+
+/**
+ * vbus_connection_get() - acquire a connection context reference
+ * @conn: connection context
+ *
+ **/
+static inline void vbus_connection_get(struct vbus_connection *conn)
+{
+ atomic_inc(&conn->refs);
+}
+
+/**
+ * vbus_connection_put() - release a connection context reference
+ * @conn: connection context
+ *
+ **/
+static inline void vbus_connection_put(struct vbus_connection *conn)
+{
+ if (atomic_dec_and_test(&conn->refs))
+ conn->ops->release(conn);
+}
+
+#endif /* _LINUX_VBUS_DEVICE_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index e4791b3..99a98a7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_VBUS) += vbus/
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index efd30cc..8736de6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,6 +48,7 @@
#include <linux/tracehook.h>
#include <linux/init_task.h>
#include <trace/sched.h>
+#include <linux/vbus.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -1081,6 +1082,7 @@ NORET_TYPE void do_exit(long code)
check_stack_usage();
exit_thread();
cgroup_exit(tsk, 1);
+ task_vbus_disassociate(tsk);
if (group_dead && tsk->signal->leader)
disassociate_ctty(1);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4854c2c..5536053 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -61,6 +61,7 @@
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <trace/sched.h>
+#include <linux/vbus.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -1274,6 +1275,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ fork_vbus(p);
return p;
bad_fork_free_graph:
diff --git a/kernel/vbus/Kconfig b/kernel/vbus/Kconfig
new file mode 100644
index 0000000..f2b92f5
--- /dev/null
+++ b/kernel/vbus/Kconfig
@@ -0,0 +1,14 @@
+#
+# Virtual-Bus (VBus) configuration
+#
+
+config VBUS
+ bool "Virtual Bus"
+ select CONFIGFS_FS
+ select SHM_SIGNAL
+ default n
+ help
+	  Provides a mechanism for declaring virtual-bus objects and binding
+	  various tasks and devices which reside on the bus.
+
+	  If unsure, say N.
diff --git a/kernel/vbus/Makefile b/kernel/vbus/Makefile
new file mode 100644
index 0000000..367f65b
--- /dev/null
+++ b/kernel/vbus/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_VBUS) += core.o devclass.o config.o attribute.o map.o
diff --git a/kernel/vbus/attribute.c b/kernel/vbus/attribute.c
new file mode 100644
index 0000000..3928228
--- /dev/null
+++ b/kernel/vbus/attribute.c
@@ -0,0 +1,52 @@
+#include <linux/vbus.h>
+#include <linux/uaccess.h>
+#include <linux/kobject.h>
+#include <linux/kallsyms.h>
+
+#include "vbus.h"
+
+static struct vbus_device_attribute *to_vattr(struct attribute *attr)
+{
+ return container_of(attr, struct vbus_device_attribute, attr);
+}
+
+static struct vbus_devshell *to_devshell(struct kobject *kobj)
+{
+ return container_of(kobj, struct vbus_devshell, kobj);
+}
+
+static ssize_t _dev_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct vbus_devshell *ds = to_devshell(kobj);
+ struct vbus_device_attribute *vattr = to_vattr(attr);
+ ssize_t ret = -EIO;
+
+ if (vattr->show)
+ ret = vattr->show(ds->dev, vattr, buf);
+
+ if (ret >= (ssize_t)PAGE_SIZE) {
+ print_symbol("vbus_attr_show: %s returned bad count\n",
+ (unsigned long)vattr->show);
+ }
+
+ return ret;
+}
+
+static ssize_t _dev_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct vbus_devshell *ds = to_devshell(kobj);
+ struct vbus_device_attribute *vattr = to_vattr(attr);
+ ssize_t ret = -EIO;
+
+ if (vattr->store)
+ ret = vattr->store(ds->dev, vattr, buf, count);
+
+ return ret;
+}
+
+struct sysfs_ops vbus_dev_attr_ops = {
+ .show = _dev_attr_show,
+ .store = _dev_attr_store,
+};
diff --git a/kernel/vbus/config.c b/kernel/vbus/config.c
new file mode 100644
index 0000000..a40dbf1
--- /dev/null
+++ b/kernel/vbus/config.c
@@ -0,0 +1,275 @@
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/vbus.h>
+#include <linux/configfs.h>
+
+#include "vbus.h"
+
+static struct config_item_type perms_type = {
+ .ct_owner = THIS_MODULE,
+};
+
+static struct vbus *to_vbus(struct config_group *group)
+{
+ return group ? container_of(group, struct vbus, ci.group) : NULL;
+}
+
+static struct vbus *item_to_vbus(struct config_item *item)
+{
+ return to_vbus(to_config_group(item));
+}
+
+static struct vbus_devshell *to_devshell(struct config_group *group)
+{
+ return group ? container_of(group, struct vbus_devshell, ci_group)
+ : NULL;
+}
+
+static struct vbus_devshell *to_vbus_devshell(struct config_item *item)
+{
+ return to_devshell(to_config_group(item));
+}
+
+static int
+device_bus_connect(struct config_item *src, struct config_item *target)
+{
+ struct vbus *vbus = item_to_vbus(src);
+ struct vbus_devshell *ds;
+
+ /* We only allow connections to devices */
+ if (target->ci_parent != &vbus_root.devices.ci_group.cg_item)
+ return -EINVAL;
+
+ ds = to_vbus_devshell(target);
+ BUG_ON(!ds);
+
+ if (!ds->dev)
+ return -EINVAL;
+
+ return ds->dev->ops->bus_connect(ds->dev, vbus);
+}
+
+static int
+device_bus_disconnect(struct config_item *src, struct config_item *target)
+{
+ struct vbus *vbus = item_to_vbus(src);
+ struct vbus_devshell *ds;
+
+ ds = to_vbus_devshell(target);
+ BUG_ON(!ds);
+
+ if (!ds->dev)
+ return -EINVAL;
+
+ return ds->dev->ops->bus_disconnect(ds->dev, vbus);
+}
+
+struct configfs_item_operations bus_ops = {
+ .allow_link = device_bus_connect,
+ .drop_link = device_bus_disconnect,
+};
+
+static struct config_item_type bus_type = {
+ .ct_item_ops = &bus_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_group *bus_create(struct config_group *group,
+ const char *name)
+{
+ struct vbus *bus = NULL;
+ int ret;
+
+ ret = vbus_create(name, &bus);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ config_group_init_type_name(&bus->ci.group, name, &bus_type);
+ bus->ci.group.default_groups = bus->ci.defgroups;
+ bus->ci.group.default_groups[0] = &bus->ci.perms;
+ bus->ci.group.default_groups[1] = NULL;
+
+ config_group_init_type_name(&bus->ci.perms, "perms", &perms_type);
+
+ return &bus->ci.group;
+}
+
+static void bus_destroy(struct config_group *group, struct config_item *item)
+{
+ struct vbus *vbus = item_to_vbus(item);
+
+ vbus_put(vbus);
+}
+
+static struct configfs_group_operations buses_ops = {
+ .make_group = bus_create,
+ .drop_item = bus_destroy,
+};
+
+static struct config_item_type buses_type = {
+ .ct_group_ops = &buses_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+CONFIGFS_ATTR_STRUCT(vbus_devshell);
+#define DEVSHELL_ATTR(_name, _mode, _show, _store) \
+struct vbus_devshell_attribute vbus_devshell_attr_##_name = \
+ __CONFIGFS_ATTR(_name, _mode, _show, _store)
+
+static ssize_t devshell_type_read(struct vbus_devshell *ds, char *page)
+{
+ if (ds->dev)
+ return sprintf(page, "%s\n", ds->dev->type);
+ else
+ return sprintf(page, "\n");
+}
+
+static ssize_t devshell_type_write(struct vbus_devshell *ds, const char *page,
+ size_t count)
+{
+ struct vbus_devclass *dc;
+ struct vbus_device *dev;
+ char name[256];
+ int ret;
+
+ /*
+	 * The device-type can only be set once, and then it is permanent.
+ * The admin should delete the device-shell if they want to create
+ * a new type
+ */
+ if (ds->dev)
+ return -EINVAL;
+
+	if (!count || count >= sizeof(name))
+		return -EINVAL;
+
+	memcpy(name, page, count);
+	name[count] = 0;
+	if (name[count-1] == '\n')
+		name[count-1] = 0;
+
+ dc = vbus_devclass_find(name);
+ if (!dc)
+ return -ENOENT;
+
+ ret = dc->ops->create(dc, &dev);
+ if (ret < 0) {
+ vbus_devclass_put(dc);
+ return ret;
+ }
+
+ ds->dev = dev;
+ ds->dc = dc;
+ dev->kobj = &ds->kobj;
+
+ ret = vbus_devshell_type_set(ds);
+ if (ret < 0) {
+ vbus_devclass_put(dc);
+ return ret;
+ }
+
+ return count;
+}
+
+DEVSHELL_ATTR(type, S_IRUGO | S_IWUSR, devshell_type_read,
+ devshell_type_write);
+
+static struct configfs_attribute *devshell_attrs[] = {
+ &vbus_devshell_attr_type.attr,
+ NULL,
+};
+
+CONFIGFS_ATTR_OPS(vbus_devshell);
+static struct configfs_item_operations devshell_item_ops = {
+ .show_attribute = vbus_devshell_attr_show,
+ .store_attribute = vbus_devshell_attr_store,
+};
+
+static struct config_item_type devshell_type = {
+ .ct_item_ops = &devshell_item_ops,
+ .ct_attrs = devshell_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_group *devshell_create(struct config_group *group,
+ const char *name)
+{
+ struct vbus_devshell *ds = NULL;
+ int ret;
+
+ ret = vbus_devshell_create(name, &ds);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ config_group_init_type_name(&ds->ci_group, name, &devshell_type);
+
+ return &ds->ci_group;
+}
+
+static void devshell_release(struct config_group *group,
+ struct config_item *item)
+{
+ struct vbus_devshell *ds = to_vbus_devshell(item);
+
+ kobject_put(&ds->kobj);
+
+ if (ds->dc)
+ vbus_devclass_put(ds->dc);
+}
+
+static struct configfs_group_operations devices_ops = {
+ .make_group = devshell_create,
+ .drop_item = devshell_release,
+};
+
+static struct config_item_type devices_type = {
+ .ct_group_ops = &devices_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type root_type = {
+ .ct_owner = THIS_MODULE,
+};
+
+int __init vbus_config_init(void)
+{
+ int ret;
+ struct configfs_subsystem *subsys = &vbus_root.ci.subsys;
+
+ config_group_init_type_name(&subsys->su_group, "vbus", &root_type);
+ mutex_init(&subsys->su_mutex);
+
+ subsys->su_group.default_groups = vbus_root.ci.defgroups;
+ subsys->su_group.default_groups[0] = &vbus_root.buses.ci_group;
+ subsys->su_group.default_groups[1] = &vbus_root.devices.ci_group;
+ subsys->su_group.default_groups[2] = NULL;
+
+ config_group_init_type_name(&vbus_root.buses.ci_group,
+ "instances", &buses_type);
+
+ config_group_init_type_name(&vbus_root.devices.ci_group,
+ "devices", &devices_type);
+
+ ret = configfs_register_subsystem(subsys);
+	if (ret) {
+		printk(KERN_ERR "Error %d while registering subsystem %s\n",
+		       ret,
+		       subsys->su_group.cg_item.ci_namebuf);
+		return ret;
+	}
+
+	return 0;
+}
+
+void __exit vbus_config_exit(void)
+{
+ configfs_unregister_subsystem(&vbus_root.ci.subsys);
+}
diff --git a/kernel/vbus/core.c b/kernel/vbus/core.c
new file mode 100644
index 0000000..033999f
--- /dev/null
+++ b/kernel/vbus/core.c
@@ -0,0 +1,567 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/vbus.h>
+#include <linux/uaccess.h>
+
+#include "vbus.h"
+
+static struct vbus_device_interface *kobj_to_intf(struct kobject *kobj)
+{
+ return container_of(kobj, struct vbus_device_interface, kobj);
+}
+
+static struct vbus_devshell *to_devshell(struct kobject *kobj)
+{
+ return container_of(kobj, struct vbus_devshell, kobj);
+}
+
+static void interface_release(struct kobject *kobj)
+{
+ struct vbus_device_interface *intf = kobj_to_intf(kobj);
+
+ if (intf->ops->release)
+ intf->ops->release(intf);
+}
+
+static struct kobj_type interface_ktype = {
+ .release = interface_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+};
+
+static ssize_t
+type_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct vbus_device_interface *intf = kobj_to_intf(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%s\n", intf->type);
+}
+
+static struct kobj_attribute devattr_type =
+ __ATTR_RO(type);
+
+static struct attribute *attrs[] = {
+ &devattr_type.attr,
+ NULL,
+};
+
+static struct attribute_group attr_group = {
+ .attrs = attrs,
+};
+
+/*
+ * Assumes dev->bus->lock is held
+ */
+static void _interface_unregister(struct vbus_device_interface *intf)
+{
+ struct vbus *vbus = intf->vbus;
+ struct vbus_devshell *ds = to_devshell(intf->dev->kobj);
+
+ map_del(&vbus->devices.map, &intf->node);
+ sysfs_remove_link(&ds->intfs, intf->name);
+ sysfs_remove_link(&intf->kobj, "device");
+ sysfs_remove_group(&intf->kobj, &attr_group);
+}
+
+int vbus_device_interface_register(struct vbus_device *dev,
+ struct vbus *vbus,
+ struct vbus_device_interface *intf)
+{
+ int ret;
+ struct vbus_devshell *ds = to_devshell(dev->kobj);
+
+ mutex_lock(&vbus->lock);
+
+ if (vbus->next_id == -1) {
+ mutex_unlock(&vbus->lock);
+ return -ENOSPC;
+ }
+
+ intf->id = vbus->next_id++;
+ intf->dev = dev;
+ intf->vbus = vbus;
+
+ ret = map_add(&vbus->devices.map, &intf->node);
+ if (ret < 0) {
+ mutex_unlock(&vbus->lock);
+ return ret;
+ }
+
+ kobject_init_and_add(&intf->kobj, &interface_ktype,
+			     &vbus->devices.kobj, "%lu", intf->id);
+
+ /* Create the basic attribute files associated with this kobject */
+ ret = sysfs_create_group(&intf->kobj, &attr_group);
+ if (ret)
+ goto error;
+
+ /* Create cross-referencing links between the device and bus */
+ ret = sysfs_create_link(&intf->kobj, dev->kobj, "device");
+ if (ret)
+ goto error;
+
+ ret = sysfs_create_link(&ds->intfs, &intf->kobj, intf->name);
+ if (ret)
+ goto error;
+
+ mutex_unlock(&vbus->lock);
+
+ return 0;
+
+error:
+ _interface_unregister(intf);
+ mutex_unlock(&vbus->lock);
+
+ kobject_put(&intf->kobj);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vbus_device_interface_register);
+
+int vbus_device_interface_unregister(struct vbus_device_interface *intf)
+{
+ struct vbus *vbus = intf->vbus;
+
+ mutex_lock(&vbus->lock);
+ _interface_unregister(intf);
+ mutex_unlock(&vbus->lock);
+
+ kobject_put(&intf->kobj);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vbus_device_interface_unregister);
+
+static struct vbus_device_interface *node_to_intf(struct rb_node *node)
+{
+ return node ? container_of(node, struct vbus_device_interface, node)
+ : NULL;
+}
+
+static int interface_item_compare(struct rb_node *lhs, struct rb_node *rhs)
+{
+ struct vbus_device_interface *lintf = node_to_intf(lhs);
+ struct vbus_device_interface *rintf = node_to_intf(rhs);
+
+ return lintf->id - rintf->id;
+}
+
+static int interface_key_compare(const void *key, struct rb_node *node)
+{
+ struct vbus_device_interface *intf = node_to_intf(node);
+ unsigned long id = *(unsigned long *)key;
+
+ return id - intf->id;
+}
+
+static struct map_ops interface_map_ops = {
+ .key_compare = &interface_key_compare,
+ .item_compare = &interface_item_compare,
+};
+
+/*
+ *-----------------
+ * member
+ *-----------------
+ */
+
+static struct vbus_member *node_to_member(struct rb_node *node)
+{
+ return node ? container_of(node, struct vbus_member, node) : NULL;
+}
+
+static struct vbus_member *kobj_to_member(struct kobject *kobj)
+{
+ return kobj ? container_of(kobj, struct vbus_member, kobj) : NULL;
+}
+
+static int member_item_compare(struct rb_node *lhs, struct rb_node *rhs)
+{
+ struct vbus_member *lmember = node_to_member(lhs);
+ struct vbus_member *rmember = node_to_member(rhs);
+
+ return lmember->tsk->pid - rmember->tsk->pid;
+}
+
+static int member_key_compare(const void *key, struct rb_node *node)
+{
+ struct vbus_member *member = node_to_member(node);
+ pid_t pid = *(pid_t *)key;
+
+ return pid - member->tsk->pid;
+}
+
+static struct map_ops member_map_ops = {
+ .key_compare = &member_key_compare,
+ .item_compare = &member_item_compare,
+};
+
+static void member_release(struct kobject *kobj)
+{
+ struct vbus_member *member = kobj_to_member(kobj);
+
+ vbus_put(member->vbus);
+ put_task_struct(member->tsk);
+
+ kfree(member);
+}
+
+static struct kobj_type member_ktype = {
+ .release = member_release,
+};
+
+int vbus_associate(struct vbus *vbus, struct task_struct *tsk)
+{
+ struct vbus_member *member;
+ int ret;
+
+ member = kzalloc(sizeof(struct vbus_member), GFP_KERNEL);
+ if (!member)
+ return -ENOMEM;
+
+ mutex_lock(&vbus->lock);
+
+ get_task_struct(tsk);
+ vbus_get(vbus);
+
+ member->vbus = vbus;
+ member->tsk = tsk;
+
+ ret = kobject_init_and_add(&member->kobj, &member_ktype,
+ &vbus->members.kobj,
+ "%d", tsk->pid);
+ if (ret < 0)
+ goto error;
+
+ ret = map_add(&vbus->members.map, &member->node);
+ if (ret < 0)
+ goto error;
+
+out:
+ mutex_unlock(&vbus->lock);
+	return ret;
+
+error:
+ kobject_put(&member->kobj);
+ goto out;
+}
+
+int vbus_disassociate(struct vbus *vbus, struct task_struct *tsk)
+{
+ struct vbus_member *member;
+
+ mutex_lock(&vbus->lock);
+
+ member = node_to_member(map_find(&vbus->members.map, &tsk->pid));
+ BUG_ON(!member);
+
+ map_del(&vbus->members.map, &member->node);
+
+ mutex_unlock(&vbus->lock);
+
+ kobject_put(&member->kobj);
+
+ return 0;
+}
+
+/*
+ *-----------------
+ * vbus_subdir
+ *-----------------
+ */
+
+static void vbus_subdir_init(struct vbus_subdir *subdir,
+ const char *name,
+ struct kobject *parent,
+ struct kobj_type *type,
+ struct map_ops *map_ops)
+{
+ int ret;
+
+ map_init(&subdir->map, map_ops);
+
+ ret = kobject_init_and_add(&subdir->kobj, type, parent, name);
+ BUG_ON(ret < 0);
+}
+
+/*
+ *-----------------
+ * vbus
+ *-----------------
+ */
+
+static void vbus_destroy(struct kobject *kobj)
+{
+ struct vbus *vbus = container_of(kobj, struct vbus, kobj);
+
+ kfree(vbus);
+}
+
+static struct kobj_type vbus_ktype = {
+ .release = vbus_destroy,
+};
+
+static struct kobj_type null_ktype = {
+};
+
+int vbus_create(const char *name, struct vbus **bus)
+{
+ struct vbus *_bus = NULL;
+ int ret;
+
+ _bus = kzalloc(sizeof(struct vbus), GFP_KERNEL);
+ if (!_bus)
+ return -ENOMEM;
+
+ atomic_set(&_bus->refs, 1);
+ mutex_init(&_bus->lock);
+
+ kobject_init_and_add(&_bus->kobj, &vbus_ktype,
+ vbus_root.buses.kobj, name);
+
+ vbus_subdir_init(&_bus->devices, "devices", &_bus->kobj,
+ &null_ktype, &interface_map_ops);
+ vbus_subdir_init(&_bus->members, "members", &_bus->kobj,
+ &null_ktype, &member_map_ops);
+
+ _bus->next_id = 0;
+
+ mutex_lock(&vbus_root.lock);
+
+ ret = map_add(&vbus_root.buses.map, &_bus->node);
+ BUG_ON(ret < 0);
+
+ mutex_unlock(&vbus_root.lock);
+
+ *bus = _bus;
+
+ return 0;
+}
+
+static void devshell_release(struct kobject *kobj)
+{
+ struct vbus_devshell *ds = container_of(kobj,
+ struct vbus_devshell, kobj);
+
+ if (ds->dev) {
+ if (ds->dev->attrs)
+ sysfs_remove_group(&ds->kobj, ds->dev->attrs);
+
+ if (ds->dev->ops->release)
+ ds->dev->ops->release(ds->dev);
+ }
+
+ if (ds->dc)
+ sysfs_remove_link(&ds->kobj, "class");
+
+ kobject_put(&ds->intfs);
+ kfree(ds);
+}
+
+static struct kobj_type devshell_ktype = {
+ .release = devshell_release,
+ .sysfs_ops = &vbus_dev_attr_ops,
+};
+
+static void _interfaces_init(struct vbus_devshell *ds)
+{
+ kobject_init_and_add(&ds->intfs, &null_ktype, &ds->kobj, "interfaces");
+}
+
+int vbus_devshell_create(const char *name, struct vbus_devshell **ds)
+{
+ struct vbus_devshell *_ds = NULL;
+
+ _ds = kzalloc(sizeof(*_ds), GFP_KERNEL);
+ if (!_ds)
+ return -ENOMEM;
+
+ kobject_init_and_add(&_ds->kobj, &devshell_ktype,
+ vbus_root.devices.kobj, name);
+
+ _interfaces_init(_ds);
+
+ *ds = _ds;
+
+ return 0;
+}
+
+int vbus_devshell_type_set(struct vbus_devshell *ds)
+{
+ int ret;
+
+ if (!ds->dev)
+ return -EINVAL;
+
+ if (!ds->dev->attrs)
+ return 0;
+
+ ret = sysfs_create_link(&ds->kobj, &ds->dc->kobj, "class");
+ if (ret < 0)
+ return ret;
+
+ return sysfs_create_group(&ds->kobj, ds->dev->attrs);
+}
+
+struct vbus *vbus_get(struct vbus *vbus)
+{
+ if (vbus)
+ atomic_inc(&vbus->refs);
+
+ return vbus;
+}
+EXPORT_SYMBOL_GPL(vbus_get);
+
+void vbus_put(struct vbus *vbus)
+{
+ if (vbus && atomic_dec_and_test(&vbus->refs)) {
+ kobject_put(&vbus->devices.kobj);
+ kobject_put(&vbus->members.kobj);
+ kobject_put(&vbus->kobj);
+ }
+}
+EXPORT_SYMBOL_GPL(vbus_put);
+
+long vbus_interface_find(struct vbus *bus,
+ unsigned long id,
+ struct vbus_device_interface **intf)
+{
+ struct vbus_device_interface *_intf;
+
+ BUG_ON(!bus);
+
+ mutex_lock(&bus->lock);
+
+ _intf = node_to_intf(map_find(&bus->devices.map, &id));
+ if (likely(_intf))
+ kobject_get(&_intf->kobj);
+
+ mutex_unlock(&bus->lock);
+
+ if (!_intf)
+ return -ENOENT;
+
+ *intf = _intf;
+
+ return 0;
+}
+
+const char *vbus_name(struct vbus *vbus)
+{
+ return vbus ? vbus->kobj.name : NULL;
+}
+
+/*
+ *---------------------
+ * vbus_buses
+ *---------------------
+ */
+
+static struct vbus *node_to_bus(struct rb_node *node)
+{
+ return node ? container_of(node, struct vbus, node) : NULL;
+}
+
+static int bus_item_compare(struct rb_node *lhs, struct rb_node *rhs)
+{
+ struct vbus *lbus = node_to_bus(lhs);
+ struct vbus *rbus = node_to_bus(rhs);
+
+ return strcmp(lbus->kobj.name, rbus->kobj.name);
+}
+
+static int bus_key_compare(const void *key, struct rb_node *node)
+{
+ struct vbus *bus = node_to_bus(node);
+
+ return strcmp(key, bus->kobj.name);
+}
+
+static struct map_ops bus_map_ops = {
+ .key_compare = &bus_key_compare,
+ .item_compare = &bus_item_compare,
+};
+
+struct vbus *vbus_find(const char *name)
+{
+ struct vbus *bus;
+
+ mutex_lock(&vbus_root.lock);
+
+ bus = node_to_bus(map_find(&vbus_root.buses.map, name));
+ if (!bus)
+ goto out;
+
+ vbus_get(bus);
+
+out:
+ mutex_unlock(&vbus_root.lock);
+
+	return bus;
+}
+
+struct vbus_root vbus_root;
+
+static ssize_t version_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", VBUS_VERSION);
+}
+
+static struct kobj_attribute version_attr =
+ __ATTR(version, S_IRUGO, version_show, NULL);
+
+static int __init vbus_init(void)
+{
+ int ret;
+
+ mutex_init(&vbus_root.lock);
+
+ ret = vbus_config_init();
+ BUG_ON(ret < 0);
+
+ vbus_root.kobj = kobject_create_and_add("vbus", NULL);
+ BUG_ON(!vbus_root.kobj);
+
+ ret = sysfs_create_file(vbus_root.kobj, &version_attr.attr);
+ BUG_ON(ret);
+
+ ret = vbus_devclass_init();
+ BUG_ON(ret < 0);
+
+ map_init(&vbus_root.buses.map, &bus_map_ops);
+ vbus_root.buses.kobj = kobject_create_and_add("instances",
+ vbus_root.kobj);
+ BUG_ON(!vbus_root.buses.kobj);
+
+ vbus_root.devices.kobj = kobject_create_and_add("devices",
+ vbus_root.kobj);
+ BUG_ON(!vbus_root.devices.kobj);
+
+ return 0;
+}
+
+late_initcall(vbus_init);
diff --git a/kernel/vbus/devclass.c b/kernel/vbus/devclass.c
new file mode 100644
index 0000000..3f5ef0d
--- /dev/null
+++ b/kernel/vbus/devclass.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/vbus.h>
+
+#include "vbus.h"
+
+static struct vbus_devclass *node_to_devclass(struct rb_node *node)
+{
+ return node ? container_of(node, struct vbus_devclass, node) : NULL;
+}
+
+static int devclass_item_compare(struct rb_node *lhs, struct rb_node *rhs)
+{
+ struct vbus_devclass *ldc = node_to_devclass(lhs);
+ struct vbus_devclass *rdc = node_to_devclass(rhs);
+
+ return strcmp(ldc->name, rdc->name);
+}
+
+static int devclass_key_compare(const void *key, struct rb_node *node)
+{
+ struct vbus_devclass *dc = node_to_devclass(node);
+
+ return strcmp((const char *)key, dc->name);
+}
+
+static struct map_ops devclass_map_ops = {
+ .key_compare = &devclass_key_compare,
+ .item_compare = &devclass_item_compare,
+};
+
+int __init vbus_devclass_init(void)
+{
+ struct vbus_devclasses *c = &vbus_root.devclasses;
+
+ map_init(&c->map, &devclass_map_ops);
+
+ c->kobj = kobject_create_and_add("deviceclass", vbus_root.kobj);
+ BUG_ON(!c->kobj);
+
+ return 0;
+}
+
+static void devclass_release(struct kobject *kobj)
+{
+ struct vbus_devclass *dc = container_of(kobj,
+ struct vbus_devclass,
+ kobj);
+
+ if (dc->ops->release)
+ dc->ops->release(dc);
+}
+
+static struct kobj_type devclass_ktype = {
+ .release = devclass_release,
+};
+
+int vbus_devclass_register(struct vbus_devclass *dc)
+{
+ int ret;
+
+ mutex_lock(&vbus_root.lock);
+
+ ret = map_add(&vbus_root.devclasses.map, &dc->node);
+ if (ret < 0)
+ goto out;
+
+ ret = kobject_init_and_add(&dc->kobj, &devclass_ktype,
+ vbus_root.devclasses.kobj, dc->name);
+ if (ret < 0) {
+ map_del(&vbus_root.devclasses.map, &dc->node);
+ goto out;
+ }
+
+out:
+ mutex_unlock(&vbus_root.lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vbus_devclass_register);
+
+int vbus_devclass_unregister(struct vbus_devclass *dc)
+{
+ mutex_lock(&vbus_root.lock);
+ map_del(&vbus_root.devclasses.map, &dc->node);
+ mutex_unlock(&vbus_root.lock);
+
+ kobject_put(&dc->kobj);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vbus_devclass_unregister);
+
+struct vbus_devclass *vbus_devclass_find(const char *name)
+{
+ struct vbus_devclass *dev;
+
+ mutex_lock(&vbus_root.lock);
+ dev = node_to_devclass(map_find(&vbus_root.devclasses.map, name));
+ if (dev)
+ dev = vbus_devclass_get(dev);
+ mutex_unlock(&vbus_root.lock);
+
+ return dev;
+}
diff --git a/kernel/vbus/map.c b/kernel/vbus/map.c
new file mode 100644
index 0000000..a3bd841
--- /dev/null
+++ b/kernel/vbus/map.c
@@ -0,0 +1,72 @@
+
+#include <linux/errno.h>
+
+#include "map.h"
+
+void map_init(struct map *map, struct map_ops *ops)
+{
+ map->root = RB_ROOT;
+ map->ops = ops;
+}
+
+int map_add(struct map *map, struct rb_node *node)
+{
+ int ret = 0;
+ struct rb_root *root;
+ struct rb_node **new, *parent = NULL;
+
+ root = &map->root;
+ new = &(root->rb_node);
+
+ /* Figure out where to put new node */
+ while (*new) {
+ int val;
+
+ parent = *new;
+
+ val = map->ops->item_compare(node, *new);
+ if (val < 0)
+ new = &((*new)->rb_left);
+ else if (val > 0)
+ new = &((*new)->rb_right);
+ else {
+ ret = -EEXIST;
+ break;
+ }
+ }
+
+ if (!ret) {
+ /* Add new node and rebalance tree. */
+ rb_link_node(node, parent, new);
+ rb_insert_color(node, root);
+ }
+
+ return ret;
+}
+
+struct rb_node *map_find(struct map *map, const void *key)
+{
+ struct rb_node *node;
+
+ node = map->root.rb_node;
+
+ while (node) {
+ int val;
+
+ val = map->ops->key_compare(key, node);
+ if (val < 0)
+ node = node->rb_left;
+ else if (val > 0)
+ node = node->rb_right;
+ else
+ break;
+ }
+
+ return node;
+}
+
+void map_del(struct map *map, struct rb_node *node)
+{
+ rb_erase(node, &map->root);
+}
diff --git a/kernel/vbus/map.h b/kernel/vbus/map.h
new file mode 100644
index 0000000..7fb5164
--- /dev/null
+++ b/kernel/vbus/map.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __VBUS_MAP_H__
+#define __VBUS_MAP_H__
+
+#include <linux/rbtree.h>
+
+struct map_ops {
+ int (*item_compare)(struct rb_node *lhs, struct rb_node *rhs);
+ int (*key_compare)(const void *key, struct rb_node *item);
+};
+
+struct map {
+ struct rb_root root;
+ struct map_ops *ops;
+};
+
+void map_init(struct map *map, struct map_ops *ops);
+int map_add(struct map *map, struct rb_node *node);
+struct rb_node *map_find(struct map *map, const void *key);
+void map_del(struct map *map, struct rb_node *node);
+
+#endif /* __VBUS_MAP_H__ */
diff --git a/kernel/vbus/vbus.h b/kernel/vbus/vbus.h
new file mode 100644
index 0000000..1266d69
--- /dev/null
+++ b/kernel/vbus/vbus.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __VBUS_H__
+#define __VBUS_H__
+
+#include <linux/configfs.h>
+#include <linux/rbtree.h>
+#include <linux/mutex.h>
+#include <linux/kobject.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+
+#include "map.h"
+
+#define VBUS_VERSION 1
+
+struct vbus_subdir {
+ struct map map;
+ struct kobject kobj;
+};
+
+struct vbus {
+ struct {
+ struct config_group group;
+ struct config_group perms;
+ struct config_group *defgroups[2];
+ } ci;
+
+ atomic_t refs;
+ struct mutex lock;
+ struct kobject kobj;
+ struct vbus_subdir devices;
+ struct vbus_subdir members;
+ unsigned long next_id;
+ struct rb_node node;
+};
+
+struct vbus_member {
+ struct rb_node node;
+ struct task_struct *tsk;
+ struct vbus *vbus;
+ struct kobject kobj;
+};
+
+struct vbus_devclasses {
+ struct kobject *kobj;
+ struct map map;
+};
+
+struct vbus_buses {
+ struct config_group ci_group;
+ struct map map;
+ struct kobject *kobj;
+};
+
+struct vbus_devshell {
+ struct config_group ci_group;
+ struct vbus_device *dev;
+ struct vbus_devclass *dc;
+ struct kobject kobj;
+ struct kobject intfs;
+};
+
+struct vbus_devices {
+ struct config_group ci_group;
+ struct kobject *kobj;
+};
+
+struct vbus_root {
+ struct {
+ struct configfs_subsystem subsys;
+ struct config_group *defgroups[3];
+ } ci;
+
+ struct mutex lock;
+ struct kobject *kobj;
+ struct vbus_devclasses devclasses;
+ struct vbus_buses buses;
+ struct vbus_devices devices;
+};
+
+extern struct vbus_root vbus_root;
+extern struct sysfs_ops vbus_dev_attr_ops;
+
+int vbus_config_init(void);
+int vbus_devclass_init(void);
+
+int vbus_create(const char *name, struct vbus **bus);
+
+int vbus_devshell_create(const char *name, struct vbus_devshell **ds);
+struct vbus_devclass *vbus_devclass_find(const char *name);
+int vbus_devshell_type_set(struct vbus_devshell *ds);
+
+long vbus_interface_find(struct vbus *vbus,
+ unsigned long id,
+ struct vbus_device_interface **intf);
+
+#endif /* __VBUS_H__ */
Signed-off-by: Gregory Haskins <[email protected]>
---
drivers/net/Kconfig | 13 +
drivers/net/Makefile | 1
drivers/net/vbus-enet.c | 680 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 694 insertions(+), 0 deletions(-)
create mode 100644 drivers/net/vbus-enet.c
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 62d732a..ac9dabd 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -3099,4 +3099,17 @@ config VIRTIO_NET
This is the virtual network driver for virtio. It can be used with
lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
+config VBUS_ENET
+ tristate "Virtual Ethernet Driver"
+ depends on VBUS_DRIVERS
+ help
+ A virtualized 802.x network device based on the VBUS interface.
+ It can be used with any hypervisor/kernel that supports the
+ vbus protocol.
+
+config VBUS_ENET_DEBUG
+ bool "Enable Debugging"
+ depends on VBUS_ENET
+ default n
+
endif # NETDEVICES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 471baaf..61db928 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -264,6 +264,7 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
obj-$(CONFIG_NETXEN_NIC) += netxen/
obj-$(CONFIG_NIU) += niu.o
obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
+obj-$(CONFIG_VBUS_ENET) += vbus-enet.o
obj-$(CONFIG_SFC) += sfc/
obj-$(CONFIG_WIMAX) += wimax/
diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
new file mode 100644
index 0000000..3779f77
--- /dev/null
+++ b/drivers/net/vbus-enet.c
@@ -0,0 +1,680 @@
+/*
+ * vbus_enet - A virtualized 802.x network device based on the VBUS interface
+ *
+ * Copyright (C) 2009 Novell, Gregory Haskins <[email protected]>
+ *
+ * Derived from the SNULL example from the book "Linux Device Drivers" by
+ * Alessandro Rubini, Jonathan Corbet, and Greg Kroah-Hartman, published
+ * by O'Reilly & Associates.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/skbuff.h>
+#include <linux/ioq.h>
+#include <linux/vbus_driver.h>
+
+#include <linux/in6.h>
+#include <asm/checksum.h>
+
+#include <linux/venet.h>
+
+MODULE_AUTHOR("Gregory Haskins");
+MODULE_LICENSE("GPL");
+
+static int napi_weight = 128;
+module_param(napi_weight, int, 0444);
+static int rx_ringlen = 256;
+module_param(rx_ringlen, int, 0444);
+static int tx_ringlen = 256;
+module_param(tx_ringlen, int, 0444);
+
+#undef PDEBUG /* undef it, just in case */
+#ifdef CONFIG_VBUS_ENET_DEBUG
+# define PDEBUG(fmt, args...) printk(KERN_DEBUG "vbus_enet: " fmt, ## args)
+#else
+# define PDEBUG(fmt, args...) /* not debugging: nothing */
+#endif
+
+struct vbus_enet_queue {
+ struct ioq *queue;
+ struct ioq_notifier notifier;
+};
+
+struct vbus_enet_priv {
+ spinlock_t lock;
+ struct net_device *dev;
+ struct vbus_device_proxy *vdev;
+ struct napi_struct napi;
+ struct vbus_enet_queue rxq;
+ struct vbus_enet_queue txq;
+ struct tasklet_struct txtask;
+};
+
+static struct vbus_enet_priv *
+napi_to_priv(struct napi_struct *napi)
+{
+ return container_of(napi, struct vbus_enet_priv, napi);
+}
+
+static int
+queue_init(struct vbus_enet_priv *priv,
+ struct vbus_enet_queue *q,
+ int qid,
+ size_t ringsize,
+ void (*func)(struct ioq_notifier *))
+{
+ struct vbus_device_proxy *dev = priv->vdev;
+ int ret;
+
+ ret = vbus_driver_ioq_alloc(dev, qid, 0, ringsize, &q->queue);
+ if (ret < 0)
+ panic("ioq_alloc failed: %d\n", ret);
+
+ if (func) {
+ q->notifier.signal = func;
+ q->queue->notifier = &q->notifier;
+ }
+
+ return 0;
+}
+
+static int
+devcall(struct vbus_enet_priv *priv, u32 func, void *data, size_t len)
+{
+ struct vbus_device_proxy *dev = priv->vdev;
+
+ return dev->ops->call(dev, func, data, len, 0);
+}
+
+/*
+ * ---------------
+ * rx descriptors
+ * ---------------
+ */
+
+static void
+rxdesc_alloc(struct ioq_ring_desc *desc, size_t len)
+{
+ struct sk_buff *skb;
+
+ len += ETH_HLEN;
+
+ skb = dev_alloc_skb(len + 2);
+ BUG_ON(!skb);
+
+ skb_reserve(skb, 2); /* align IP on 16B boundary */
+
+	desc->cookie = (u64)(unsigned long)skb;
+ desc->ptr = (u64)__pa(skb->data);
+ desc->len = len; /* total length */
+ desc->valid = 1;
+}
+
+static void
+rx_setup(struct vbus_enet_priv *priv)
+{
+ struct ioq *ioq = priv->rxq.queue;
+ struct ioq_iterator iter;
+ int ret;
+
+ /*
+ * We want to iterate on the "valid" index. By default the iterator
+ * will not "autoupdate" which means it will not hypercall the host
+ * with our changes. This is good, because we are really just
+ * initializing stuff here anyway. Note that you can always manually
+ * signal the host with ioq_signal() if the autoupdate feature is not
+ * used.
+ */
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * Seek to the tail of the valid index (which should be our first
+ * item, since the queue is brand-new)
+ */
+ ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * Now populate each descriptor with an empty SKB and mark it valid
+ */
+ while (!iter.desc->valid) {
+ rxdesc_alloc(iter.desc, priv->dev->mtu);
+
+ /*
+ * This push operation will simultaneously advance the
+ * valid-head index and increment our position in the queue
+ * by one.
+ */
+ ret = ioq_iter_push(&iter, 0);
+ BUG_ON(ret < 0);
+ }
+}
+
+static void
+rx_teardown(struct vbus_enet_priv *priv)
+{
+ struct ioq *ioq = priv->rxq.queue;
+ struct ioq_iterator iter;
+ int ret;
+
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * free each valid descriptor
+ */
+ while (iter.desc->valid) {
+		struct sk_buff *skb =
+			(struct sk_buff *)(unsigned long)iter.desc->cookie;
+
+ iter.desc->valid = 0;
+ wmb();
+
+ iter.desc->ptr = 0;
+ iter.desc->cookie = 0;
+
+ ret = ioq_iter_pop(&iter, 0);
+ BUG_ON(ret < 0);
+
+ dev_kfree_skb(skb);
+ }
+}
+
+/*
+ * Open and close
+ */
+
+static int
+vbus_enet_open(struct net_device *dev)
+{
+ struct vbus_enet_priv *priv = netdev_priv(dev);
+ int ret;
+
+ ret = devcall(priv, VENET_FUNC_LINKUP, NULL, 0);
+ BUG_ON(ret < 0);
+
+ napi_enable(&priv->napi);
+
+ return 0;
+}
+
+static int
+vbus_enet_stop(struct net_device *dev)
+{
+ struct vbus_enet_priv *priv = netdev_priv(dev);
+ int ret;
+
+ napi_disable(&priv->napi);
+
+ ret = devcall(priv, VENET_FUNC_LINKDOWN, NULL, 0);
+ BUG_ON(ret < 0);
+
+ return 0;
+}
+
+/*
+ * Configuration changes (passed on by ifconfig)
+ */
+static int
+vbus_enet_config(struct net_device *dev, struct ifmap *map)
+{
+ if (dev->flags & IFF_UP) /* can't act on a running interface */
+ return -EBUSY;
+
+ /* Don't allow changing the I/O address */
+ if (map->base_addr != dev->base_addr) {
+ printk(KERN_WARNING "vbus_enet: Can't change I/O address\n");
+ return -EOPNOTSUPP;
+ }
+
+ /* ignore other fields */
+ return 0;
+}
+
+static void
+vbus_enet_schedule_rx(struct vbus_enet_priv *priv)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (netif_rx_schedule_prep(&priv->napi)) {
+ /* Disable further interrupts */
+ ioq_notify_disable(priv->rxq.queue, 0);
+ __netif_rx_schedule(&priv->napi);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static int
+vbus_enet_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct vbus_enet_priv *priv = netdev_priv(dev);
+ int ret;
+
+ dev->mtu = new_mtu;
+
+ /*
+ * FLUSHRX will cause the device to flush any outstanding
+ * RX buffers. They will appear to come in as 0 length
+ * packets which we can simply discard, and then replace with
+ * buffers sized for the new MTU.
+ */
+ ret = devcall(priv, VENET_FUNC_FLUSHRX, NULL, 0);
+ BUG_ON(ret < 0);
+
+ vbus_enet_schedule_rx(priv);
+
+ return 0;
+}
+
+/*
+ * The poll implementation.
+ */
+static int
+vbus_enet_poll(struct napi_struct *napi, int budget)
+{
+ struct vbus_enet_priv *priv = napi_to_priv(napi);
+ int npackets = 0;
+ struct ioq_iterator iter;
+ int ret;
+
+ PDEBUG("%lld: polling...\n", priv->vdev->id);
+
+ /* We want to iterate on the head of the in-use index */
+ ret = ioq_iter_init(priv->rxq.queue, &iter, ioq_idxtype_inuse,
+ IOQ_ITER_AUTOUPDATE);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * We stop if we have met the quota or there are no more packets.
+ * The EOM is indicated by finding a packet that is still owned by
+ * the south side
+ */
+ while ((npackets < budget) && (!iter.desc->sown)) {
+ struct sk_buff *skb = (struct sk_buff *)iter.desc->cookie;
+
+ if (iter.desc->len) {
+ skb_put(skb, iter.desc->len);
+
+ /* Maintain stats */
+ npackets++;
+ priv->dev->stats.rx_packets++;
+ priv->dev->stats.rx_bytes += iter.desc->len;
+
+ /* Pass the buffer up to the stack */
+ skb->dev = priv->dev;
+ skb->protocol = eth_type_trans(skb, priv->dev);
+ netif_receive_skb(skb);
+
+ mb();
+ } else
+ /*
+ * the device may send a zero-length packet when it is
+ * flushing references on the ring. We can just drop
+ * these on the floor
+ */
+ dev_kfree_skb(skb);
+
+ /* Grab a new buffer to put in the ring */
+ rxdesc_alloc(iter.desc, priv->dev->mtu);
+
+ /* Advance the in-use tail */
+ ret = ioq_iter_pop(&iter, 0);
+ BUG_ON(ret < 0);
+ }
+
+ PDEBUG("%lld poll: %d packets received\n", priv->vdev->id, npackets);
+
+ /*
+ * If we processed all packets, we're done; tell the kernel and
+ * reenable ints
+ */
+ if (ioq_empty(priv->rxq.queue, ioq_idxtype_inuse)) {
+ netif_rx_complete(napi);
+ ioq_notify_enable(priv->rxq.queue, 0);
+ ret = 0;
+ } else
+ /* We couldn't process everything. */
+ ret = 1;
+
+ return ret;
+}
+
+/*
+ * Transmit a packet (called by the kernel)
+ */
+static int
+vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
+{
+ struct vbus_enet_priv *priv = netdev_priv(dev);
+ struct ioq_iterator iter;
+ int ret;
+ unsigned long flags;
+
+ PDEBUG("%lld: sending %d bytes\n", priv->vdev->id, skb->len);
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (ioq_full(priv->txq.queue, ioq_idxtype_valid)) {
+ /*
+ * We must flow-control the kernel by disabling the
+ * queue
+ */
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_stop_queue(dev);
+ printk(KERN_ERR "VBUS_ENET: tx on full queue bug on device %lld\n",
+ priv->vdev->id);
+ return NETDEV_TX_BUSY;
+ }
+
+ /*
+ * We want to iterate on the tail of both the "inuse" and "valid" index
+ * so we specify the "both" index
+ */
+ ret = ioq_iter_init(priv->txq.queue, &iter, ioq_idxtype_both,
+ IOQ_ITER_AUTOUPDATE);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
+ BUG_ON(ret < 0);
+ BUG_ON(iter.desc->sown);
+
+ /*
+ * We simply put the skb right onto the ring. We will get an interrupt
+ * later when the data has been consumed and we can reap the pointers
+ * at that time
+ */
+ iter.desc->cookie = (u64)skb;
+ iter.desc->len = (u64)skb->len;
+ iter.desc->ptr = (u64)__pa(skb->data);
+ iter.desc->valid = 1;
+
+ priv->dev->stats.tx_packets++;
+ priv->dev->stats.tx_bytes += skb->len;
+
+ /*
+ * This advances both indexes together implicitly, and then
+ * signals the south side to consume the packet
+ */
+ ret = ioq_iter_push(&iter, 0);
+ BUG_ON(ret < 0);
+
+ dev->trans_start = jiffies; /* save the timestamp */
+
+ if (ioq_full(priv->txq.queue, ioq_idxtype_valid)) {
+ /*
+ * If the queue is congested, we must flow-control the kernel
+ */
+ PDEBUG("%lld: backpressure tx queue\n", priv->vdev->id);
+ netif_stop_queue(dev);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ return NETDEV_TX_OK;
+}
+
+/*
+ * reclaim any outstanding completed tx packets
+ *
+ * assumes priv->lock held
+ */
+static void
+vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force)
+{
+ struct ioq_iterator iter;
+ int ret;
+
+ /*
+ * We want to iterate on the head of the valid index, but we
+ * do not want the iter_pop (below) to flip the ownership, so
+ * we set the NOFLIPOWNER option
+ */
+ ret = ioq_iter_init(priv->txq.queue, &iter, ioq_idxtype_valid,
+ IOQ_ITER_NOFLIPOWNER);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * We are done once we find the first packet either invalid or still
+ * owned by the south-side
+ */
+ while (iter.desc->valid && (!iter.desc->sown || force)) {
+ struct sk_buff *skb = (struct sk_buff *)iter.desc->cookie;
+
+ PDEBUG("%lld: completed sending %d bytes\n",
+ priv->vdev->id, skb->len);
+
+ /* Reset the descriptor */
+ iter.desc->valid = 0;
+
+ dev_kfree_skb(skb);
+
+ /* Advance the valid-index head */
+ ret = ioq_iter_pop(&iter, 0);
+ BUG_ON(ret < 0);
+ }
+
+ /*
+ * If we were previously stopped due to flow control, restart the
+ * processing
+ */
+ if (netif_queue_stopped(priv->dev)
+ && !ioq_full(priv->txq.queue, ioq_idxtype_valid)) {
+ PDEBUG("%lld: re-enabling tx queue\n", priv->vdev->id);
+ netif_wake_queue(priv->dev);
+ }
+}
+
+static void
+vbus_enet_timeout(struct net_device *dev)
+{
+ struct vbus_enet_priv *priv = netdev_priv(dev);
+ unsigned long flags;
+
+ printk(KERN_DEBUG "VBUS_ENET %lld: Transmit timeout\n", priv->vdev->id);
+
+ spin_lock_irqsave(&priv->lock, flags);
+ vbus_enet_tx_reap(priv, 0);
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static void
+rx_isr(struct ioq_notifier *notifier)
+{
+ struct vbus_enet_priv *priv;
+ struct net_device *dev;
+
+ priv = container_of(notifier, struct vbus_enet_priv, rxq.notifier);
+ dev = priv->dev;
+
+ if (!ioq_empty(priv->rxq.queue, ioq_idxtype_inuse))
+ vbus_enet_schedule_rx(priv);
+}
+
+static void
+deferred_tx_isr(unsigned long data)
+{
+ struct vbus_enet_priv *priv = (struct vbus_enet_priv *)data;
+ unsigned long flags;
+
+ PDEBUG("deferred_tx_isr for %lld\n", priv->vdev->id);
+
+ spin_lock_irqsave(&priv->lock, flags);
+ vbus_enet_tx_reap(priv, 0);
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ ioq_notify_enable(priv->txq.queue, 0);
+}
+
+static void
+tx_isr(struct ioq_notifier *notifier)
+{
+ struct vbus_enet_priv *priv;
+ unsigned long flags;
+
+ priv = container_of(notifier, struct vbus_enet_priv, txq.notifier);
+
+ PDEBUG("tx_isr for %lld\n", priv->vdev->id);
+
+ ioq_notify_disable(priv->txq.queue, 0);
+ tasklet_schedule(&priv->txtask);
+}
+
+static const struct net_device_ops vbus_enet_netdev_ops = {
+ .ndo_open = vbus_enet_open,
+ .ndo_stop = vbus_enet_stop,
+ .ndo_set_config = vbus_enet_config,
+ .ndo_start_xmit = vbus_enet_tx_start,
+ .ndo_change_mtu = vbus_enet_change_mtu,
+ .ndo_tx_timeout = vbus_enet_timeout,
+};
+
+/*
+ * This is called whenever a new vbus_device_proxy is added to the vbus
+ * with the matching VENET_ID
+ */
+static int
+vbus_enet_probe(struct vbus_device_proxy *vdev)
+{
+ struct net_device *dev;
+ struct vbus_enet_priv *priv;
+ int ret;
+
+ printk(KERN_INFO "VBUS_ENET: Found new device at %lld\n", vdev->id);
+
+ ret = vdev->ops->open(vdev, VENET_VERSION, 0);
+ if (ret < 0)
+ return ret;
+
+ dev = alloc_etherdev(sizeof(struct vbus_enet_priv));
+ if (!dev)
+ return -ENOMEM;
+
+ priv = netdev_priv(dev);
+
+ spin_lock_init(&priv->lock);
+ priv->dev = dev;
+ priv->vdev = vdev;
+
+ tasklet_init(&priv->txtask, deferred_tx_isr, (unsigned long)priv);
+
+ queue_init(priv, &priv->rxq, VENET_QUEUE_RX, rx_ringlen, rx_isr);
+ queue_init(priv, &priv->txq, VENET_QUEUE_TX, tx_ringlen, tx_isr);
+
+ rx_setup(priv);
+
+ ioq_notify_enable(priv->rxq.queue, 0); /* enable interrupts */
+ ioq_notify_enable(priv->txq.queue, 0);
+
+ dev->netdev_ops = &vbus_enet_netdev_ops;
+ dev->watchdog_timeo = 5 * HZ;
+
+ netif_napi_add(dev, &priv->napi, vbus_enet_poll, napi_weight);
+
+ ret = devcall(priv, VENET_FUNC_MACQUERY, priv->dev->dev_addr, ETH_ALEN);
+ if (ret < 0) {
+ printk(KERN_INFO
+ "VENET: Error obtaining MAC address for %lld\n",
+ priv->vdev->id);
+ goto out_free;
+ }
+
+ dev->features |= NETIF_F_HIGHDMA;
+
+ ret = register_netdev(dev);
+ if (ret < 0) {
+ printk(KERN_INFO "VENET: error %i registering device \"%s\"\n",
+ ret, dev->name);
+ goto out_free;
+ }
+
+ vdev->priv = priv;
+
+ return 0;
+
+ out_free:
+ free_netdev(dev);
+
+ return ret;
+}
+
+static int
+vbus_enet_remove(struct vbus_device_proxy *vdev)
+{
+ struct vbus_enet_priv *priv = (struct vbus_enet_priv *)vdev->priv;
+ struct vbus_device_proxy *dev = priv->vdev;
+
+ unregister_netdev(priv->dev);
+ napi_disable(&priv->napi);
+
+ rx_teardown(priv);
+ vbus_enet_tx_reap(priv, 1);
+
+ ioq_put(priv->rxq.queue);
+ ioq_put(priv->txq.queue);
+
+ dev->ops->close(dev, 0);
+
+ free_netdev(priv->dev);
+
+ return 0;
+}
+
+/*
+ * Finally, the module stuff
+ */
+
+static struct vbus_driver_ops vbus_enet_driver_ops = {
+ .probe = vbus_enet_probe,
+ .remove = vbus_enet_remove,
+};
+
+static struct vbus_driver vbus_enet_driver = {
+ .type = VENET_TYPE,
+ .owner = THIS_MODULE,
+ .ops = &vbus_enet_driver_ops,
+};
+
+static __init int
+vbus_enet_init_module(void)
+{
+ printk(KERN_INFO "Virtual Ethernet: Copyright (C) 2009 Novell, Gregory Haskins\n");
+ printk(KERN_DEBUG "VBUSENET: Using %d/%d queue depth\n",
+ rx_ringlen, tx_ringlen);
+ return vbus_driver_register(&vbus_enet_driver);
+}
+
+static __exit void
+vbus_enet_cleanup(void)
+{
+ vbus_driver_unregister(&vbus_enet_driver);
+}
+
+module_init(vbus_enet_init_module);
+module_exit(vbus_enet_cleanup);
It will be common to map an IOQ over the VBUS shared-memory interfaces,
so let's generalize their setup so that we can reuse the pattern.
Signed-off-by: Gregory Haskins <[email protected]>
---
include/linux/vbus_device.h | 7 +++
include/linux/vbus_driver.h | 7 +++
kernel/vbus/Kconfig | 2 +
kernel/vbus/Makefile | 1
kernel/vbus/proxy.c | 64 +++++++++++++++++++++++++++++++
kernel/vbus/shm-ioq.c | 89 +++++++++++++++++++++++++++++++++++++++++++
6 files changed, 170 insertions(+), 0 deletions(-)
create mode 100644 kernel/vbus/shm-ioq.c
diff --git a/include/linux/vbus_device.h b/include/linux/vbus_device.h
index f73cd86..c91bce4 100644
--- a/include/linux/vbus_device.h
+++ b/include/linux/vbus_device.h
@@ -102,6 +102,7 @@
#include <linux/configfs.h>
#include <linux/rbtree.h>
#include <linux/shm_signal.h>
+#include <linux/ioq.h>
#include <linux/vbus.h>
#include <asm/atomic.h>
@@ -414,4 +415,10 @@ static inline void vbus_connection_put(struct vbus_connection *conn)
conn->ops->release(conn);
}
+/*
+ * device-side IOQ helper - dereferences device-shm as an IOQ
+ */
+int vbus_shm_ioq_attach(struct vbus_shm *shm, struct shm_signal *signal,
+ int maxcount, struct ioq **ioq);
+
#endif /* _LINUX_VBUS_DEVICE_H */
diff --git a/include/linux/vbus_driver.h b/include/linux/vbus_driver.h
index c53e13f..9cfbf60 100644
--- a/include/linux/vbus_driver.h
+++ b/include/linux/vbus_driver.h
@@ -26,6 +26,7 @@
#include <linux/device.h>
#include <linux/shm_signal.h>
+#include <linux/ioq.h>
struct vbus_device_proxy;
struct vbus_driver;
@@ -70,4 +71,10 @@ struct vbus_driver {
int vbus_driver_register(struct vbus_driver *drv);
void vbus_driver_unregister(struct vbus_driver *drv);
+/*
+ * driver-side IOQ helper - allocates device-shm and maps an IOQ on it
+ */
+int vbus_driver_ioq_alloc(struct vbus_device_proxy *dev, int id, int prio,
+ size_t ringsize, struct ioq **ioq);
+
#endif /* _LINUX_VBUS_DRIVER_H */
diff --git a/kernel/vbus/Kconfig b/kernel/vbus/Kconfig
index 3aaa085..71acd6f 100644
--- a/kernel/vbus/Kconfig
+++ b/kernel/vbus/Kconfig
@@ -6,6 +6,7 @@ config VBUS
bool "Virtual Bus"
select CONFIGFS_FS
select SHM_SIGNAL
+ select IOQ
default n
help
Provides a mechansism for declaring virtual-bus objects and binding
@@ -15,6 +16,7 @@ config VBUS
config VBUS_DRIVERS
tristate "VBUS Driver support"
+ select IOQ
default n
help
Adds support for a virtual bus model for proxying drivers.
diff --git a/kernel/vbus/Makefile b/kernel/vbus/Makefile
index d028ece..45f6503 100644
--- a/kernel/vbus/Makefile
+++ b/kernel/vbus/Makefile
@@ -1,4 +1,5 @@
obj-$(CONFIG_VBUS) += core.o devclass.o config.o attribute.o map.o client.o
+obj-$(CONFIG_VBUS) += shm-ioq.o
vbus-proxy-objs += proxy.o
obj-$(CONFIG_VBUS_DRIVERS) += vbus-proxy.o
diff --git a/kernel/vbus/proxy.c b/kernel/vbus/proxy.c
index ea48f00..75b0cb1 100644
--- a/kernel/vbus/proxy.c
+++ b/kernel/vbus/proxy.c
@@ -150,3 +150,67 @@ void vbus_driver_unregister(struct vbus_driver *drv)
}
EXPORT_SYMBOL_GPL(vbus_driver_unregister);
+/*
+ *---------------------------------
+ * driver-side IOQ helper
+ *---------------------------------
+ */
+static void
+vbus_driver_ioq_release(struct ioq *ioq)
+{
+ kfree(ioq->head_desc);
+ kfree(ioq);
+}
+
+static struct ioq_ops vbus_driver_ioq_ops = {
+ .release = vbus_driver_ioq_release,
+};
+
+
+int vbus_driver_ioq_alloc(struct vbus_device_proxy *dev, int id, int prio,
+ size_t count, struct ioq **ioq)
+{
+ struct ioq *_ioq;
+ struct ioq_ring_head *head = NULL;
+ struct shm_signal *signal = NULL;
+ size_t len = IOQ_HEAD_DESC_SIZE(count);
+ int ret = -ENOMEM;
+
+ _ioq = kzalloc(sizeof(*_ioq), GFP_KERNEL);
+ if (!_ioq)
+ goto error;
+
+ head = kzalloc(len, GFP_KERNEL | GFP_DMA);
+ if (!head)
+ goto error;
+
+ head->magic = IOQ_RING_MAGIC;
+ head->ver = IOQ_RING_VER;
+ head->count = count;
+
+ ret = dev->ops->shm(dev, id, prio, head, len,
+ &head->signal, &signal, 0);
+ if (ret < 0)
+ goto error;
+
+ ioq_init(_ioq,
+ &vbus_driver_ioq_ops,
+ ioq_locality_north,
+ head,
+ signal,
+ count);
+
+ *ioq = _ioq;
+
+ return 0;
+
+ error:
+ kfree(_ioq);
+ kfree(head);
+
+ if (signal)
+ shm_signal_put(signal);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vbus_driver_ioq_alloc);
diff --git a/kernel/vbus/shm-ioq.c b/kernel/vbus/shm-ioq.c
new file mode 100644
index 0000000..a627337
--- /dev/null
+++ b/kernel/vbus/shm-ioq.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * IOQ helper for devices - This module implements an IOQ which has
+ * been shared with a device via a vbus_shm segment.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/ioq.h>
+#include <linux/vbus_device.h>
+
+struct _ioq {
+ struct vbus_shm *shm;
+ struct ioq ioq;
+};
+
+static void
+_shm_ioq_release(struct ioq *ioq)
+{
+ struct _ioq *_ioq = container_of(ioq, struct _ioq, ioq);
+
+ /* the signal is released by the IOQ infrastructure */
+ vbus_shm_put(_ioq->shm);
+ kfree(_ioq);
+}
+
+static struct ioq_ops _shm_ioq_ops = {
+ .release = _shm_ioq_release,
+};
+
+int vbus_shm_ioq_attach(struct vbus_shm *shm, struct shm_signal *signal,
+ int maxcount, struct ioq **ioq)
+{
+ struct _ioq *_ioq;
+ struct ioq_ring_head *head = NULL;
+ size_t ringcount;
+
+ if (!signal)
+ return -EINVAL;
+
+ head = (struct ioq_ring_head *)shm->ptr;
+
+ if (head->magic != IOQ_RING_MAGIC)
+ return -EINVAL;
+
+ if (head->ver != IOQ_RING_VER)
+ return -EINVAL;
+
+ ringcount = head->count;
+
+ if ((maxcount != -1) && (ringcount > maxcount))
+ return -EINVAL;
+
+ /*
+ * Sanity check the ringcount against the actual length of the segment
+ */
+ if (IOQ_HEAD_DESC_SIZE(ringcount) != shm->len)
+ return -EINVAL;
+
+ _ioq = kzalloc(sizeof(*_ioq), GFP_KERNEL);
+ if (!_ioq)
+ return -ENOMEM;
+
+ _ioq->shm = shm;
+
+ ioq_init(&_ioq->ioq, &_shm_ioq_ops, ioq_locality_south, head,
+ signal, ringcount);
+
+ *ioq = &_ioq->ioq;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vbus_shm_ioq_attach);
+
Signed-off-by: Gregory Haskins <[email protected]>
---
drivers/vbus/devices/venet-tap.c | 235 +++++++++++++++++++++++++++++++++++++-
1 files changed, 228 insertions(+), 7 deletions(-)
diff --git a/drivers/vbus/devices/venet-tap.c b/drivers/vbus/devices/venet-tap.c
index 148e2c8..a1f2dc6 100644
--- a/drivers/vbus/devices/venet-tap.c
+++ b/drivers/vbus/devices/venet-tap.c
@@ -81,6 +81,13 @@ enum {
TX_IOQ_CONGESTED,
};
+struct venettap;
+
+struct venettap_rx_ops {
+ int (*decode)(struct venettap *priv, void *ptr, int len);
+ int (*import)(struct venettap *, struct sk_buff *, void *, int);
+};
+
struct venettap {
spinlock_t lock;
unsigned char hmac[ETH_ALEN]; /* host-mac */
@@ -108,7 +115,13 @@ struct venettap {
struct vbus_memctx *ctx;
struct venettap_queue rxq;
struct venettap_queue txq;
+ struct venettap_rx_ops *rx_ops;
wait_queue_head_t rx_empty;
+ struct {
+ struct venet_sg *desc;
+ size_t len;
+ int enabled:1;
+ } sg;
int connected:1;
int opened:1;
int link:1;
@@ -290,6 +303,183 @@ venettap_change_mtu(struct net_device *dev, int new_mtu)
}
/*
+ * ---------------------------
+ * Scatter-Gather support
+ * ---------------------------
+ */
+
+/* assumes reference to priv->vbus.conn held */
+static int
+venettap_sg_decode(struct venettap *priv, void *ptr, int len)
+{
+ struct venet_sg *vsg;
+ struct vbus_memctx *ctx;
+ int ret;
+
+ /*
+ * SG is enabled, so we need to pull in the venet_sg
+ * header before we can interpret the rest of the
+ * packet
+ *
+ * FIXME: Make sure this is not too big
+ */
+ if (unlikely(len > priv->vbus.sg.len)) {
+ kfree(priv->vbus.sg.desc);
+ priv->vbus.sg.desc = kzalloc(len, GFP_KERNEL);
+ if (!priv->vbus.sg.desc)
+ return -ENOMEM;
+ priv->vbus.sg.len = len;
+ }
+
+ vsg = priv->vbus.sg.desc;
+ ctx = priv->vbus.ctx;
+
+ ret = ctx->ops->copy_from(ctx, vsg, ptr, len);
+ BUG_ON(ret);
+
+ /*
+ * Non GSO type packets should be constrained by the MTU setting
+ * on the host
+ */
+ if (!(vsg->flags & VENET_SG_FLAG_GSO)
+ && (vsg->len > (priv->netif.dev->mtu + ETH_HLEN)))
+ return -1;
+
+ return vsg->len;
+}
+
+/*
+ * venettap_sg_import - import an skb in scatter-gather mode
+ *
+ * assumes reference to priv->vbus.conn held
+ */
+static int
+venettap_sg_import(struct venettap *priv, struct sk_buff *skb,
+ void *ptr, int len)
+{
+ struct venet_sg *vsg = priv->vbus.sg.desc;
+ struct vbus_memctx *ctx = priv->vbus.ctx;
+ int remain = len;
+ int ret;
+ int i;
+
+ PDEBUG("Importing %d bytes in %d segments\n", len, vsg->count);
+
+ for (i = 0; i < vsg->count; i++) {
+ struct venet_iov *iov = &vsg->iov[i];
+
+ if (remain < iov->len)
+ return -EINVAL;
+
+ PDEBUG("Segment %d: %p/%d\n", i, iov->ptr, iov->len);
+
+ ret = ctx->ops->copy_from(ctx, skb_tail_pointer(skb),
+ (void *)iov->ptr,
+ iov->len);
+ if (ret)
+ return -EFAULT;
+
+ skb_put(skb, iov->len);
+ remain -= iov->len;
+ }
+
+ if (vsg->flags & VENET_SG_FLAG_NEEDS_CSUM
+ && !skb_partial_csum_set(skb, vsg->csum.start, vsg->csum.offset))
+ return -EINVAL;
+
+ if (vsg->flags & VENET_SG_FLAG_GSO) {
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+ PDEBUG("GSO packet detected\n");
+
+ switch (vsg->gso.type) {
+ case VENET_GSO_TYPE_TCPV4:
+ sinfo->gso_type = SKB_GSO_TCPV4;
+ break;
+ case VENET_GSO_TYPE_TCPV6:
+ sinfo->gso_type = SKB_GSO_TCPV6;
+ break;
+ case VENET_GSO_TYPE_UDP:
+ sinfo->gso_type = SKB_GSO_UDP;
+ break;
+ default:
+ PDEBUG("Illegal GSO type: %d\n", vsg->gso.type);
+ priv->netif.stats.rx_frame_errors++;
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ if (vsg->flags & VENET_SG_FLAG_ECN)
+ sinfo->gso_type |= SKB_GSO_TCP_ECN;
+
+ sinfo->gso_size = vsg->gso.size;
+ if (skb_shinfo(skb)->gso_size == 0) {
+ PDEBUG("Illegal GSO size: %d\n", vsg->gso.size);
+ priv->netif.stats.rx_frame_errors++;
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ /* Header must be checked, and gso_segs computed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+ }
+
+ return 0;
+}
+
+static struct venettap_rx_ops venettap_sg_rx_ops = {
+ .decode = venettap_sg_decode,
+ .import = venettap_sg_import,
+};
+
+/*
+ * ---------------------------
+ * Flat (non Scatter-Gather) support
+ * ---------------------------
+ */
+
+/* assumes reference to priv->vbus.conn held */
+static int
+venettap_flat_decode(struct venettap *priv, void *ptr, int len)
+{
+ size_t maxlen = priv->netif.dev->mtu + ETH_HLEN;
+
+ if (len > maxlen)
+ return -1;
+
+ /*
+ * If SG is *not* enabled, the length is simply the
+ * descriptor length
+ */
+
+ return len;
+}
+
+/*
+ * venettap_rx_flat - import an skb in non scatter-gather mode
+ *
+ * assumes reference to priv->vbus.conn held
+ */
+static int
+venettap_flat_import(struct venettap *priv, struct sk_buff *skb,
+ void *ptr, int len)
+{
+ struct vbus_memctx *ctx = priv->vbus.ctx;
+ int ret;
+
+ ret = ctx->ops->copy_from(ctx, skb_tail_pointer(skb), ptr, len);
+ if (ret)
+ return -EFAULT;
+
+ skb_put(skb, len);
+
+ return 0;
+}
+
+static struct venettap_rx_ops venettap_flat_rx_ops = {
+ .decode = venettap_flat_decode,
+ .import = venettap_flat_import,
+};
+
+/*
* The poll implementation.
*/
static int
@@ -303,6 +493,7 @@ venettap_rx(struct venettap *priv)
int ret;
unsigned long flags;
struct vbus_connection *conn;
+ struct venettap_rx_ops *rx_ops;
PDEBUG("polling...\n");
@@ -326,6 +517,8 @@ venettap_rx(struct venettap *priv)
ioq = priv->vbus.rxq.queue;
ctx = priv->vbus.ctx;
+ rx_ops = priv->vbus.rx_ops;
+
spin_unlock_irqrestore(&priv->lock, flags);
/* We want to iterate on the head of the in-use index */
@@ -340,11 +533,14 @@ venettap_rx(struct venettap *priv)
* the north side
*/
while (iter.desc->sown) {
- size_t len = iter.desc->len;
- size_t maxlen = priv->netif.dev->mtu + ETH_HLEN;
struct sk_buff *skb = NULL;
+ ssize_t len;
+
+ len = rx_ops->decode(priv,
+ (void *)iter.desc->ptr,
+ iter.desc->len);
- if (unlikely(len > maxlen)) {
+ if (unlikely(len < 0)) {
priv->netif.stats.rx_errors++;
priv->netif.stats.rx_length_errors++;
goto next;
@@ -362,10 +558,8 @@ venettap_rx(struct venettap *priv)
/* align IP on 16B boundary */
skb_reserve(skb, 2);
- ret = ctx->ops->copy_from(ctx, skb->data,
- (void *)iter.desc->ptr,
- len);
- if (unlikely(ret)) {
+ ret = rx_ops->import(priv, skb, (void *)iter.desc->ptr, len);
+ if (unlikely(ret < 0)) {
priv->netif.stats.rx_errors++;
goto next;
}
@@ -846,6 +1040,23 @@ venettap_macquery(struct venettap *priv, void *data, unsigned long len)
return 0;
}
+static u32
+venettap_negcap_sg(struct venettap *priv, u32 requested)
+{
+ u32 available = VENET_CAP_SG|VENET_CAP_TSO4|VENET_CAP_TSO6
+ |VENET_CAP_ECN;
+ u32 ret;
+
+ ret = available & requested;
+
+ if (ret & VENET_CAP_SG) {
+ priv->vbus.sg.enabled = true;
+ priv->vbus.rx_ops = &venettap_sg_rx_ops;
+ }
+
+ return ret;
+}
+
/*
* Negotiate Capabilities - This function is provided so that the
* interface may be extended without breaking ABI compatability
@@ -873,6 +1084,9 @@ venettap_negcap(struct venettap *priv, void *data, unsigned long len)
return -EFAULT;
switch (caps.gid) {
+ case VENET_CAP_GROUP_SG:
+ caps.bits = venettap_negcap_sg(priv, caps.bits);
+ break;
default:
caps.bits = 0;
break;
@@ -1057,6 +1271,12 @@ venettap_vlink_release(struct vbus_connection *conn)
vbus_memctx_put(priv->vbus.ctx);
kobject_put(priv->vbus.dev.kobj);
+
+ priv->vbus.sg.enabled = false;
+ priv->vbus.rx_ops = &venettap_flat_rx_ops;
+ kfree(priv->vbus.sg.desc);
+ priv->vbus.sg.desc = NULL;
+ priv->vbus.sg.len = 0;
}
static struct vbus_connection_ops venettap_vbus_link_ops = {
@@ -1336,6 +1556,7 @@ venettap_device_create(struct vbus_devclass *dc,
_vdev->ops = &venettap_device_ops;
_vdev->attrs = &venettap_attr_group;
+ priv->vbus.rx_ops = &venettap_flat_rx_ops;
init_waitqueue_head(&priv->vbus.rx_empty);
/*
Later in the series we will need a way to detect when a VM is reset, so
let's add a capability for userspace to signal a VM reset down to the kernel.
Signed-off-by: Gregory Haskins <[email protected]>
---
arch/x86/kvm/x86.c | 1 +
include/linux/kvm.h | 2 ++
include/linux/kvm_host.h | 6 ++++++
virt/kvm/kvm_main.c | 36 ++++++++++++++++++++++++++++++++++++
4 files changed, 45 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 758b7a1..9b0a649 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -971,6 +971,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
+ case KVM_CAP_RESET:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 0424326..7ffd8f5 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -396,6 +396,7 @@ struct kvm_trace_rec {
#ifdef __KVM_HAVE_USER_NMI
#define KVM_CAP_USER_NMI 22
#endif
+#define KVM_CAP_RESET 23
/*
* ioctls for VM fds
@@ -429,6 +430,7 @@ struct kvm_trace_rec {
struct kvm_assigned_pci_dev)
#define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
struct kvm_assigned_irq)
+#define KVM_RESET _IO(KVMIO, 0x67)
/*
* ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bf6f703..506eca1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -17,6 +17,7 @@
#include <linux/preempt.h>
#include <linux/marker.h>
#include <linux/msi.h>
+#include <linux/notifier.h>
#include <asm/signal.h>
#include <linux/kvm.h>
@@ -132,6 +133,8 @@ struct kvm {
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
#endif
+
+ struct raw_notifier_head reset_notifier; /* triggers when VM reboots */
};
/* The guest did something we don't support. */
@@ -158,6 +161,9 @@ void kvm_exit(void);
void kvm_get_kvm(struct kvm *kvm);
void kvm_put_kvm(struct kvm *kvm);
+int kvm_reset_notifier_register(struct kvm *kvm, struct notifier_block *nb);
+int kvm_reset_notifier_unregister(struct kvm *kvm, struct notifier_block *nb);
+
#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 29a667c..fca2d25 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -868,6 +868,8 @@ static struct kvm *kvm_create_vm(void)
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
kvm_coalesced_mmio_init(kvm);
#endif
+ RAW_INIT_NOTIFIER_HEAD(&kvm->reset_notifier);
+
out:
return kvm;
}
@@ -1485,6 +1487,35 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
}
}
+static void kvm_notify_reset(struct kvm *kvm)
+{
+ mutex_lock(&kvm->lock);
+ raw_notifier_call_chain(&kvm->reset_notifier, 0, kvm);
+ mutex_unlock(&kvm->lock);
+}
+
+int kvm_reset_notifier_register(struct kvm *kvm, struct notifier_block *nb)
+{
+ int ret;
+
+ mutex_lock(&kvm->lock);
+ ret = raw_notifier_chain_register(&kvm->reset_notifier, nb);
+ mutex_unlock(&kvm->lock);
+
+ return ret;
+}
+
+int kvm_reset_notifier_unregister(struct kvm *kvm, struct notifier_block *nb)
+{
+ int ret;
+
+ mutex_lock(&kvm->lock);
+ ret = raw_notifier_chain_unregister(&kvm->reset_notifier, nb);
+ mutex_unlock(&kvm->lock);
+
+ return ret;
+}
+
/*
* The vCPU has executed a HLT instruction with in-kernel mode enabled.
*/
@@ -1929,6 +1960,11 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
#endif
+ case KVM_RESET: {
+ kvm_notify_reset(kvm);
+ r = 0;
+ break;
+ }
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
}
This patch provides the ability to dynamically declare and map an
interrupt-request handle to an x86 8-bit vector.
Problem Statement: Emulated devices (such as PCI, ISA, etc) have
interrupt routing done via standard PC mechanisms (MP-table, ACPI,
etc). However, we also want to support a new class of devices
which exist in a new virtualized namespace and therefore should
not try to piggyback on these emulated mechanisms. Rather, we
create a way to dynamically register interrupt resources that
act independently of the emulated counterpart.
On x86, a simplistic view of the interrupt model is that each core
has a local-APIC which can receive messages from APIC-compliant
routing devices (such as IO-APIC and MSI) regarding details about
an interrupt (such as which vector to raise). These routing devices
are controlled by the OS so they may translate a physical event
(such as "e1000: raise an RX interrupt") to a logical destination
(such as "inject IDT vector 46 on core 3"). A dynirq is a virtual
implementation of such a router (think of it as a virtual-MSI, but
without the coupling to an existing standard, such as PCI).
The model is simple: A guest OS can allocate the mapping of "IRQ"
handle to "vector/core" in any way it sees fit, and provide this
information to the dynirq module running in the host. The assigned
IRQ then becomes the sole handle needed to inject an IDT vector
to the guest from a host. A host entity that wishes to raise an
interrupt simply needs to call kvm_inject_dynirq(irq), and the routing
is performed transparently.
Signed-off-by: Gregory Haskins <[email protected]>
---
arch/x86/Kconfig | 5 +
arch/x86/Makefile | 3
arch/x86/include/asm/kvm_host.h | 9 +
arch/x86/include/asm/kvm_para.h | 11 +
arch/x86/kvm/Makefile | 3
arch/x86/kvm/dynirq.c | 329 +++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/guest/Makefile | 2
arch/x86/kvm/guest/dynirq.c | 95 +++++++++++
arch/x86/kvm/x86.c | 6 +
include/linux/kvm.h | 1
include/linux/kvm_guest.h | 7 +
include/linux/kvm_host.h | 1
include/linux/kvm_para.h | 1
13 files changed, 472 insertions(+), 1 deletions(-)
create mode 100644 arch/x86/kvm/dynirq.c
create mode 100644 arch/x86/kvm/guest/Makefile
create mode 100644 arch/x86/kvm/guest/dynirq.c
create mode 100644 include/linux/kvm_guest.h
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3fca247..91fefd5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -446,6 +446,11 @@ config KVM_GUEST
This option enables various optimizations for running under the KVM
hypervisor.
+config KVM_GUEST_DYNIRQ
+ bool "KVM Dynamic IRQ support"
+ depends on KVM_GUEST
+ default y
+
source "arch/x86/lguest/Kconfig"
config PARAVIRT
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index d1a47ad..d788815 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -147,6 +147,9 @@ core-$(CONFIG_XEN) += arch/x86/xen/
# lguest paravirtualization support
core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
+# kvm paravirtualization support
+core-$(CONFIG_KVM_GUEST) += arch/x86/kvm/guest/
+
core-y += arch/x86/kernel/
core-y += arch/x86/mm/
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 730843d..9ae398a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -346,6 +346,12 @@ struct kvm_mem_alias {
gfn_t target_gfn;
};
+struct kvm_dynirq {
+ spinlock_t lock;
+ struct rb_root map;
+ struct kvm *kvm;
+};
+
struct kvm_arch{
int naliases;
struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
@@ -363,6 +369,7 @@ struct kvm_arch{
struct iommu_domain *iommu_domain;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
+ struct kvm_dynirq *dynirq;
struct kvm_pit *vpit;
struct hlist_head irq_ack_notifier_list;
int vapics_in_nmi_mode;
@@ -519,6 +526,8 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
gpa_t addr, unsigned long *ret);
+int kvm_dynirq_hc(struct kvm_vcpu *vcpu, int nr, gpa_t gpa, size_t len);
+void kvm_free_dynirq(struct kvm *kvm);
extern bool tdp_enabled;
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index b8a3305..fba210e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -13,6 +13,7 @@
#define KVM_FEATURE_CLOCKSOURCE 0
#define KVM_FEATURE_NOP_IO_DELAY 1
#define KVM_FEATURE_MMU_OP 2
+#define KVM_FEATURE_DYNIRQ 3
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
@@ -45,6 +46,16 @@ struct kvm_mmu_op_release_pt {
__u64 pt_phys;
};
+/* Operations for KVM_HC_DYNIRQ */
+#define KVM_DYNIRQ_OP_SET 1
+#define KVM_DYNIRQ_OP_CLEAR 2
+
+struct kvm_dynirq_set {
+ __u32 irq;
+ __u32 vec; /* x86 IDT vector */
+ __u32 dest; /* 0-based vcpu id */
+};
+
#ifdef __KERNEL__
#include <asm/processor.h>
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d3ec292..d5676f5 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,9 +14,10 @@ endif
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
- i8254.o
+ i8254.o dynirq.o
obj-$(CONFIG_KVM) += kvm.o
kvm-intel-objs = vmx.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
kvm-amd-objs = svm.o
obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+
diff --git a/arch/x86/kvm/dynirq.c b/arch/x86/kvm/dynirq.c
new file mode 100644
index 0000000..54162dd
--- /dev/null
+++ b/arch/x86/kvm/dynirq.c
@@ -0,0 +1,329 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Dynamic-Interrupt-Request (dynirq): This module provides the ability
+ * to dynamically declare and map an interrupt-request handle to an
+ * x86 8-bit vector.
+ *
+ * Problem Statement: Emulated devices (such as PCI, ISA, etc) have
+ * interrupt routing done via standard PC mechanisms (MP-table, ACPI,
+ * etc). However, we also want to support a new class of devices
+ * which exist in a new virtualized namespace and therefore should
+ * not try to piggyback on these emulated mechanisms. Rather, we
+ * create a way to dynamically register interrupt resources that
+ * acts independently of the emulated counterpart.
+ *
+ * On x86, a simplistic view of the interrupt model is that each core
+ * has a local-APIC which can receive messages from APIC-compliant
+ * routing devices (such as IO-APIC and MSI) regarding details about
+ * an interrupt (such as which vector to raise). These routing devices
+ * are controlled by the OS so they may translate a physical event
+ * (such as "e1000: raise an RX interrupt") to a logical destination
+ * (such as "inject IDT vector 46 on core 3"). A dynirq is a virtual
+ * implementation of such a router (think of it as a virtual-MSI, but
+ * without the coupling to an existing standard, such as PCI).
+ *
+ * The model is simple: A guest OS can allocate the mapping of "IRQ"
+ * handle to "vector/core" in any way it sees fit, and provide this
+ * information to the dynirq module running in the host. The assigned
+ * IRQ then becomes the sole handle needed to inject an IDT vector
+ * to the guest from a host. A host entity that wishes to raise an
+ * interrupt simply needs to call kvm_inject_dynirq(irq) and the routing
+ * is performed transparently.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include <linux/module.h>
+#include <linux/rbtree.h>
+#include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_para.h>
+#include <linux/workqueue.h>
+#include <linux/hardirq.h>
+
+#include "lapic.h"
+
+struct dynirq {
+ struct kvm_dynirq *parent;
+ unsigned int irq;
+ unsigned short vec;
+ unsigned int dest;
+ struct rb_node node;
+ struct work_struct work;
+};
+
+static inline struct dynirq *
+to_dynirq(struct rb_node *node)
+{
+ return node ? container_of(node, struct dynirq, node) : NULL;
+}
+
+static int
+map_add(struct rb_root *root, struct dynirq *entry)
+{
+ int ret = 0;
+ struct rb_node **new, *parent = NULL;
+ struct rb_node *node = &entry->node;
+
+ new = &(root->rb_node);
+
+ /* Figure out where to put new node */
+ while (*new) {
+ int val;
+
+ parent = *new;
+
+ val = to_dynirq(node)->irq - to_dynirq(*new)->irq;
+ if (val < 0)
+ new = &((*new)->rb_left);
+ else if (val > 0)
+ new = &((*new)->rb_right);
+ else {
+ ret = -EEXIST;
+ break;
+ }
+ }
+
+ if (!ret) {
+ /* Add new node and rebalance tree. */
+ rb_link_node(node, parent, new);
+ rb_insert_color(node, root);
+ }
+
+ return ret;
+}
+
+static struct dynirq *
+map_find(struct rb_root *root, unsigned int key)
+{
+ struct rb_node *node;
+
+ node = root->rb_node;
+
+ while (node) {
+ int val;
+
+ val = key - to_dynirq(node)->irq;
+ if (val < 0)
+ node = node->rb_left;
+ else if (val > 0)
+ node = node->rb_right;
+ else
+ break;
+ }
+
+ return to_dynirq(node);
+}
+
+static void
+dynirq_add(struct kvm_dynirq *dynirq, struct dynirq *entry)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&dynirq->lock, flags);
+ ret = map_add(&dynirq->map, entry);
+ spin_unlock_irqrestore(&dynirq->lock, flags);
+}
+
+static struct dynirq *
+dynirq_find(struct kvm_dynirq *dynirq, int irq)
+{
+ struct dynirq *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dynirq->lock, flags);
+ entry = map_find(&dynirq->map, irq);
+ spin_unlock_irqrestore(&dynirq->lock, flags);
+
+ return entry;
+}
+
+static int
+_kvm_inject_dynirq(struct kvm *kvm, struct dynirq *entry)
+{
+ struct kvm_vcpu *vcpu;
+ int ret;
+
+ mutex_lock(&kvm->lock);
+
+ vcpu = kvm->vcpus[entry->dest];
+ if (!vcpu) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = kvm_apic_set_irq(vcpu, entry->vec, 1);
+
+out:
+ mutex_unlock(&kvm->lock);
+
+ return ret;
+}
+
+static void
+deferred_inject_dynirq(struct work_struct *work)
+{
+ struct dynirq *entry = container_of(work, struct dynirq, work);
+ struct kvm_dynirq *dynirq = entry->parent;
+ struct kvm *kvm = dynirq->kvm;
+
+ _kvm_inject_dynirq(kvm, entry);
+}
+
+int
+kvm_inject_dynirq(struct kvm *kvm, int irq)
+{
+ struct kvm_dynirq *dynirq = kvm->arch.dynirq;
+ struct dynirq *entry;
+
+ entry = dynirq_find(dynirq, irq);
+ if (!entry)
+ return -EINVAL;
+
+ if (preemptible())
+ return _kvm_inject_dynirq(kvm, entry);
+
+ schedule_work(&entry->work);
+ return 0;
+}
+
+static int
+hc_set(struct kvm_vcpu *vcpu, gpa_t gpa, size_t len)
+{
+ struct kvm_dynirq_set args;
+ struct kvm_dynirq *dynirq = vcpu->kvm->arch.dynirq;
+ struct dynirq *entry;
+ int ret;
+
+ if (len != sizeof(args))
+ return -EINVAL;
+
+ ret = kvm_read_guest(vcpu->kvm, gpa, &args, len);
+ if (ret < 0)
+ return ret;
+
+ if (args.dest >= KVM_MAX_VCPUS)
+ return -EINVAL;
+
+ entry = dynirq_find(dynirq, args.irq);
+ if (!entry) {
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+ INIT_WORK(&entry->work, deferred_inject_dynirq);
+ } else
+ rb_erase(&entry->node, &dynirq->map);
+
+ entry->irq = args.irq;
+ entry->vec = args.vec;
+ entry->dest = args.dest;
+
+ dynirq_add(dynirq, entry);
+
+ return 0;
+}
+
+static int
+hc_clear(struct kvm_vcpu *vcpu, gpa_t gpa, size_t len)
+{
+ struct kvm_dynirq *dynirq = vcpu->kvm->arch.dynirq;
+ struct dynirq *entry;
+ unsigned long flags;
+ u32 irq;
+ int ret;
+
+ if (len != sizeof(irq))
+ return -EINVAL;
+
+ ret = kvm_read_guest(vcpu->kvm, gpa, &irq, len);
+ if (ret < 0)
+ return ret;
+
+ spin_lock_irqsave(&dynirq->lock, flags);
+
+ entry = map_find(&dynirq->map, irq);
+ if (entry)
+ rb_erase(&entry->node, &dynirq->map);
+
+ spin_unlock_irqrestore(&dynirq->lock, flags);
+
+ if (!entry)
+ return -ENOENT;
+
+ kfree(entry);
+ return 0;
+}
+
+/*
+ * Our hypercall format will always follow with the call-id in arg[0],
+ * a pointer to the arguments in arg[1], and the argument length in arg[2]
+ */
+int
+kvm_dynirq_hc(struct kvm_vcpu *vcpu, int nr, gpa_t gpa, size_t len)
+{
+ int ret = -EINVAL;
+
+ mutex_lock(&vcpu->kvm->lock);
+
+ if (unlikely(!vcpu->kvm->arch.dynirq)) {
+ struct kvm_dynirq *dynirq;
+
+ dynirq = kzalloc(sizeof(*dynirq), GFP_KERNEL);
+ if (!dynirq) {
+ mutex_unlock(&vcpu->kvm->lock);
+ return -ENOMEM;
+ }
+
+ spin_lock_init(&dynirq->lock);
+ dynirq->map = RB_ROOT;
+ dynirq->kvm = vcpu->kvm;
+ vcpu->kvm->arch.dynirq = dynirq;
+ }
+
+ switch (nr) {
+ case KVM_DYNIRQ_OP_SET:
+ ret = hc_set(vcpu, gpa, len);
+ break;
+ case KVM_DYNIRQ_OP_CLEAR:
+ ret = hc_clear(vcpu, gpa, len);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ mutex_unlock(&vcpu->kvm->lock);
+
+ return ret;
+}
+
+void
+kvm_free_dynirq(struct kvm *kvm)
+{
+ struct kvm_dynirq *dynirq = kvm->arch.dynirq;
+ struct rb_node *node;
+
+ while ((node = rb_first(&dynirq->map))) {
+ struct dynirq *entry = to_dynirq(node);
+
+ rb_erase(node, &dynirq->map);
+ kfree(entry);
+ }
+
+ kfree(dynirq);
+}
diff --git a/arch/x86/kvm/guest/Makefile b/arch/x86/kvm/guest/Makefile
new file mode 100644
index 0000000..de8f824
--- /dev/null
+++ b/arch/x86/kvm/guest/Makefile
@@ -0,0 +1,2 @@
+
+obj-$(CONFIG_KVM_GUEST_DYNIRQ) += dynirq.o
\ No newline at end of file
diff --git a/arch/x86/kvm/guest/dynirq.c b/arch/x86/kvm/guest/dynirq.c
new file mode 100644
index 0000000..a5cf55e
--- /dev/null
+++ b/arch/x86/kvm/guest/dynirq.c
@@ -0,0 +1,95 @@
+#include <linux/module.h>
+#include <linux/irq.h>
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+
+#include <asm/irq.h>
+#include <asm/apic.h>
+
+/*
+ * -----------------------
+ * Dynamic-IRQ support
+ * -----------------------
+ */
+
+static int dynirq_set(int irq, int dest)
+{
+ struct kvm_dynirq_set op = {
+ .irq = irq,
+ .vec = irq_to_vector(irq),
+ .dest = dest,
+ };
+
+ return kvm_hypercall3(KVM_HC_DYNIRQ, KVM_DYNIRQ_OP_SET,
+ __pa(&op), sizeof(op));
+}
+
+static void dynirq_chip_noop(unsigned int irq)
+{
+}
+
+static void dynirq_chip_eoi(unsigned int irq)
+{
+ ack_APIC_irq();
+}
+
+struct irq_chip kvm_irq_chip = {
+ .name = "KVM-DYNIRQ",
+ .mask = dynirq_chip_noop,
+ .unmask = dynirq_chip_noop,
+ .eoi = dynirq_chip_eoi,
+};
+
+int create_kvm_dynirq(int cpu)
+{
+ const cpumask_t *mask = get_cpu_mask(cpu);
+ int irq;
+ int ret;
+
+ ret = kvm_para_has_feature(KVM_FEATURE_DYNIRQ);
+ if (!ret)
+ return -ENOENT;
+
+ irq = create_irq();
+ if (irq < 0)
+ return -ENOSPC;
+
+#ifdef CONFIG_SMP
+ ret = set_irq_affinity(irq, *mask);
+ if (ret < 0)
+ goto error;
+#endif
+
+ set_irq_chip_and_handler_name(irq,
+ &kvm_irq_chip,
+ handle_percpu_irq,
+ "apiceoi");
+
+ ret = dynirq_set(irq, cpu);
+ if (ret < 0)
+ goto error;
+
+ return irq;
+
+error:
+ destroy_irq(irq);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(create_kvm_dynirq);
+
+int destroy_kvm_dynirq(int irq)
+{
+ __u32 _irq = irq;
+
+ if (kvm_para_has_feature(KVM_FEATURE_DYNIRQ))
+ kvm_hypercall3(KVM_HC_DYNIRQ,
+ KVM_DYNIRQ_OP_CLEAR,
+ __pa(&_irq),
+ sizeof(_irq));
+
+ destroy_irq(irq);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(destroy_kvm_dynirq);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9b0a649..e24f0a5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -972,6 +972,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
case KVM_CAP_RESET:
+ case KVM_CAP_DYNIRQ:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2684,6 +2685,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_MMU_OP:
r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
break;
+ case KVM_HC_DYNIRQ:
+ ret = kvm_dynirq_hc(vcpu, a0, a1, a2);
+ break;
default:
ret = -KVM_ENOSYS;
break;
@@ -4141,6 +4145,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
+ if (kvm->arch.dynirq)
+ kvm_free_dynirq(kvm);
kvm_free_vcpus(kvm);
kvm_free_physmem(kvm);
if (kvm->arch.apic_access_page)
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 7ffd8f5..349d273 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -397,6 +397,7 @@ struct kvm_trace_rec {
#define KVM_CAP_USER_NMI 22
#endif
#define KVM_CAP_RESET 23
+#define KVM_CAP_DYNIRQ 24
/*
* ioctls for VM fds
diff --git a/include/linux/kvm_guest.h b/include/linux/kvm_guest.h
new file mode 100644
index 0000000..7dd7930
--- /dev/null
+++ b/include/linux/kvm_guest.h
@@ -0,0 +1,7 @@
+#ifndef __LINUX_KVM_GUEST_H
+#define __LINUX_KVM_GUEST_H
+
+extern int create_kvm_dynirq(int cpu);
+extern int destroy_kvm_dynirq(int irq);
+
+#endif /* __LINUX_KVM_GUEST_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 506eca1..bec9b35 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -297,6 +297,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+int kvm_inject_dynirq(struct kvm *kvm, int irq);
int kvm_is_mmio_pfn(pfn_t pfn);
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 3ddce03..a2de904 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -16,6 +16,7 @@
#define KVM_HC_VAPIC_POLL_IRQ 1
#define KVM_HC_MMU_OP 2
+#define KVM_HC_DYNIRQ 3
/*
* hypercalls use architecture specific
This patch adds support for guest access to a VBUS assigned to the same
context as the VM. It utilizes an IOQ+IRQ to move events from host->guest,
and provides a hypercall interface to move events guest->host.
Signed-off-by: Gregory Haskins <[email protected]>
---
arch/x86/include/asm/kvm_para.h | 1
arch/x86/kvm/Kconfig | 9
arch/x86/kvm/Makefile | 3
arch/x86/kvm/x86.c | 6
arch/x86/kvm/x86.h | 12
include/linux/kvm.h | 1
include/linux/kvm_host.h | 20 +
include/linux/kvm_para.h | 59 ++
virt/kvm/kvm_main.c | 1
virt/kvm/vbus.c | 1307 +++++++++++++++++++++++++++++++++++++++
10 files changed, 1419 insertions(+), 0 deletions(-)
create mode 100644 virt/kvm/vbus.c
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index fba210e..19d81e0 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -14,6 +14,7 @@
#define KVM_FEATURE_NOP_IO_DELAY 1
#define KVM_FEATURE_MMU_OP 2
#define KVM_FEATURE_DYNIRQ 3
+#define KVM_FEATURE_VBUS 4
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b81125f..875e96e 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,15 @@ config KVM_TRACE
relayfs. Note the ABI is not considered stable and will be
modified in future updates.
+config KVM_HOST_VBUS
+ bool "KVM virtual-bus (VBUS) host-side support"
+ depends on KVM
+ select VBUS
+ default n
+ ---help---
+ This option enables host-side support for accessing virtual-bus
+ devices.
+
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/lguest/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d5676f5..f749ec9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -15,6 +15,9 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
i8254.o dynirq.o
+ifeq ($(CONFIG_KVM_HOST_VBUS),y)
+kvm-objs += $(addprefix ../../../virt/kvm/, vbus.o)
+endif
obj-$(CONFIG_KVM) += kvm.o
kvm-intel-objs = vmx.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e24f0a5..2369d84 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -996,6 +996,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_CLOCKSOURCE:
r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC);
break;
+ case KVM_CAP_VBUS:
+ r = kvm_vbus_support();
+ break;
default:
r = 0;
break;
@@ -2688,6 +2691,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_DYNIRQ:
ret = kvm_dynirq_hc(vcpu, a0, a1, a2);
break;
+ case KVM_HC_VBUS:
+ ret = kvm_vbus_hc(vcpu, a0, a1, a2);
+ break;
default:
ret = -KVM_ENOSYS;
break;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6a4be78..b6c682b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -3,6 +3,18 @@
#include <linux/kvm_host.h>
+#ifdef CONFIG_KVM_HOST_VBUS
+static inline int kvm_vbus_support(void)
+{
+ return 1;
+}
+#else
+static inline int kvm_vbus_support(void)
+{
+ return 0;
+}
+#endif
+
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.exception.pending = false;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 349d273..077daac 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -398,6 +398,7 @@ struct kvm_trace_rec {
#endif
#define KVM_CAP_RESET 23
#define KVM_CAP_DYNIRQ 24
+#define KVM_CAP_VBUS 25
/*
* ioctls for VM fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bec9b35..757f998 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -120,6 +120,9 @@ struct kvm {
struct list_head vm_list;
struct kvm_io_bus mmio_bus;
struct kvm_io_bus pio_bus;
+#ifdef CONFIG_KVM_HOST_VBUS
+ struct kvm_vbus *kvbus;
+#endif
struct kvm_vm_stat stat;
struct kvm_arch arch;
atomic_t users_count;
@@ -471,4 +474,21 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se
}
#endif
+#ifdef CONFIG_KVM_HOST_VBUS
+
+int kvm_vbus_hc(struct kvm_vcpu *vcpu, int nr, gpa_t gpa, size_t len);
+void kvm_vbus_release(struct kvm_vbus *kvbus);
+
+#else /* CONFIG_KVM_HOST_VBUS */
+
+static inline int
+kvm_vbus_hc(struct kvm_vcpu *vcpu, int nr, gpa_t gpa, size_t len)
+{
+ return -EINVAL;
+}
+
+#define kvm_vbus_release(kvbus) do {} while (0)
+
+#endif /* CONFIG_KVM_HOST_VBUS */
+
#endif
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index a2de904..ca5203c 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -17,6 +17,65 @@
#define KVM_HC_VAPIC_POLL_IRQ 1
#define KVM_HC_MMU_OP 2
#define KVM_HC_DYNIRQ 3
+#define KVM_HC_VBUS 4
+
+/* Payload of KVM_HC_VBUS */
+#define KVM_VBUS_MAGIC 0x27fdab45
+#define KVM_VBUS_VERSION 1
+
+enum kvm_vbus_op {
+ KVM_VBUS_OP_BUSOPEN,
+ KVM_VBUS_OP_BUSREG,
+ KVM_VBUS_OP_DEVOPEN,
+ KVM_VBUS_OP_DEVCLOSE,
+ KVM_VBUS_OP_DEVCALL,
+ KVM_VBUS_OP_DEVSHM,
+ KVM_VBUS_OP_SHMSIGNAL,
+};
+
+struct kvm_vbus_busopen {
+ __u32 magic;
+ __u32 version;
+ __u64 capabilities;
+};
+
+struct kvm_vbus_eventqreg {
+ __u32 irq;
+ __u32 count;
+ __u64 ring;
+ __u64 data;
+};
+
+struct kvm_vbus_busreg {
+ __u32 count; /* supporting multiple queues allows for prio, etc */
+ struct kvm_vbus_eventqreg eventq[1];
+};
+
+enum kvm_vbus_eventid {
+ KVM_VBUS_EVENT_DEVADD,
+ KVM_VBUS_EVENT_DEVDROP,
+ KVM_VBUS_EVENT_SHMSIGNAL,
+ KVM_VBUS_EVENT_SHMCLOSE,
+};
+
+#define VBUS_MAX_DEVTYPE_LEN 128
+
+struct kvm_vbus_add_event {
+ __u64 id;
+ char type[VBUS_MAX_DEVTYPE_LEN];
+};
+
+struct kvm_vbus_handle_event {
+ __u64 handle;
+};
+
+struct kvm_vbus_event {
+ __u32 eventid;
+ union {
+ struct kvm_vbus_add_event add;
+ struct kvm_vbus_handle_event handle;
+ } data;
+};
/*
* hypercalls use architecture specific
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fca2d25..2e4ba8b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -942,6 +942,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
{
struct kvm *kvm = filp->private_data;
+ kvm_vbus_release(kvm->kvbus);
kvm_put_kvm(kvm);
return 0;
}
diff --git a/virt/kvm/vbus.c b/virt/kvm/vbus.c
new file mode 100644
index 0000000..17b3392
--- /dev/null
+++ b/virt/kvm/vbus.c
@@ -0,0 +1,1307 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/highmem.h>
+#include <linux/workqueue.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/ioq.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_para.h>
+#include <linux/vbus.h>
+#include <linux/vbus_client.h>
+
+#undef PDEBUG
+#ifdef KVMVBUS_DEBUG
+#include <linux/ftrace.h>
+# define PDEBUG(fmt, args...) ftrace_printk(fmt, ## args)
+#else
+# define PDEBUG(fmt, args...)
+#endif
+
+struct kvm_vbus_eventq {
+ spinlock_t lock;
+ struct ioq *ioq;
+ struct ioq_notifier notifier;
+ struct list_head backlog;
+ struct {
+ u64 gpa;
+ size_t len;
+ void *ptr;
+ } ringdata;
+ struct work_struct work;
+ unsigned int backpressure:1;
+};
+
+enum kvm_vbus_state {
+ kvm_vbus_state_init,
+ kvm_vbus_state_registration,
+ kvm_vbus_state_running,
+};
+
+struct kvm_vbus {
+ struct mutex lock;
+ enum kvm_vbus_state state;
+ struct kvm *kvm;
+ struct vbus *vbus;
+ struct vbus_client *client;
+ struct kvm_vbus_eventq eventq;
+ struct work_struct destruct;
+ struct vbus_memctx *ctx;
+ struct {
+ struct notifier_block vbus;
+ struct notifier_block reset;
+ } notify;
+};
+
+static struct vbus_client *to_client(struct kvm_vcpu *vcpu)
+{
+ return vcpu ? vcpu->kvm->kvbus->client : NULL;
+}
+
+static void*
+kvm_vmap(struct kvm *kvm, gpa_t gpa, size_t len)
+{
+ struct page **page_list;
+ void *ptr = NULL;
+ unsigned long addr;
+ off_t offset;
+ size_t npages;
+ int ret;
+
+ addr = gfn_to_hva(kvm, gpa >> PAGE_SHIFT);
+
+ offset = offset_in_page(gpa);
+ npages = PAGE_ALIGN(len + offset) >> PAGE_SHIFT;
+
+ if (npages > (PAGE_SIZE / sizeof(struct page *)))
+ return NULL;
+
+ page_list = (struct page **) __get_free_page(GFP_KERNEL);
+ if (!page_list)
+ return NULL;
+
+ ret = get_user_pages_fast(addr, npages, 1, page_list);
+ if (ret < 0)
+ goto out;
+
+ down_write(&current->mm->mmap_sem);
+
+ ptr = vmap(page_list, npages, VM_MAP, PAGE_KERNEL);
+ if (ptr)
+ current->mm->locked_vm += npages;
+
+ up_write(&current->mm->mmap_sem);
+
+ if (ptr)
+ ptr += offset;
+
+out:
+ free_page((unsigned long)page_list);
+
+ return ptr;
+}
+
+static void
+kvm_vunmap(void *ptr)
+{
+ /* FIXME: do we need to adjust current->mm->locked_vm? */
+ vunmap((void *)((unsigned long)ptr & PAGE_MASK));
+}
+
+/*
+ * -----------------
+ * kvm_shm routines
+ * -----------------
+ */
+
+struct kvm_shm {
+ struct kvm_vbus *kvbus;
+ struct vbus_shm shm;
+};
+
+static void
+kvm_shm_release(struct vbus_shm *shm)
+{
+ struct kvm_shm *_shm = container_of(shm, struct kvm_shm, shm);
+
+ kvm_vunmap(_shm->shm.ptr);
+ kfree(_shm);
+}
+
+static struct vbus_shm_ops kvm_shm_ops = {
+ .release = kvm_shm_release,
+};
+
+static int
+kvm_shm_map(struct kvm_vbus *kvbus, __u64 ptr, __u32 len, struct kvm_shm **kshm)
+{
+ struct kvm_shm *_shm;
+ void *vmap;
+
+ if (!can_do_mlock())
+ return -EPERM;
+
+ _shm = kzalloc(sizeof(*_shm), GFP_KERNEL);
+ if (!_shm)
+ return -ENOMEM;
+
+ _shm->kvbus = kvbus;
+
+ vmap = kvm_vmap(kvbus->kvm, ptr, len);
+ if (!vmap) {
+ kfree(_shm);
+ return -EFAULT;
+ }
+
+ vbus_shm_init(&_shm->shm, &kvm_shm_ops, vmap, len);
+
+ *kshm = _shm;
+
+ return 0;
+}
+
+/*
+ * -----------------
+ * vbus_memctx routines
+ * -----------------
+ */
+
+struct kvm_memctx {
+ struct kvm *kvm;
+ struct vbus_memctx *taskmem;
+ struct vbus_memctx ctx;
+};
+
+static struct kvm_memctx *to_kvm_memctx(struct vbus_memctx *ctx)
+{
+ return container_of(ctx, struct kvm_memctx, ctx);
+}
+
+
+static unsigned long
+kvm_memctx_copy_to(struct vbus_memctx *ctx, void *dst, const void *src,
+ unsigned long n)
+{
+ struct kvm_memctx *kvm_memctx = to_kvm_memctx(ctx);
+ struct vbus_memctx *tm = kvm_memctx->taskmem;
+ gpa_t gpa = (gpa_t)dst;
+ unsigned long addr;
+ int offset;
+
+ addr = gfn_to_hva(kvm_memctx->kvm, gpa >> PAGE_SHIFT);
+ offset = offset_in_page(gpa);
+
+ return tm->ops->copy_to(tm, (void *)(addr + offset), src, n);
+}
+
+static unsigned long
+kvm_memctx_copy_from(struct vbus_memctx *ctx, void *dst, const void *src,
+ unsigned long n)
+{
+ struct kvm_memctx *kvm_memctx = to_kvm_memctx(ctx);
+ struct vbus_memctx *tm = kvm_memctx->taskmem;
+ gpa_t gpa = (gpa_t)src;
+ unsigned long addr;
+ int offset;
+
+ addr = gfn_to_hva(kvm_memctx->kvm, gpa >> PAGE_SHIFT);
+ offset = offset_in_page(gpa);
+
+ return tm->ops->copy_from(tm, dst, (void *)(addr + offset), n);
+}
+
+static void
+kvm_memctx_release(struct vbus_memctx *ctx)
+{
+ struct kvm_memctx *kvm_memctx = to_kvm_memctx(ctx);
+
+ vbus_memctx_put(kvm_memctx->taskmem);
+ kvm_put_kvm(kvm_memctx->kvm);
+
+ kfree(kvm_memctx);
+}
+
+static struct vbus_memctx_ops kvm_memctx_ops = {
+ .copy_to = &kvm_memctx_copy_to,
+ .copy_from = &kvm_memctx_copy_from,
+ .release = &kvm_memctx_release,
+};
+
+struct vbus_memctx *kvm_memctx_alloc(struct kvm *kvm)
+{
+ struct kvm_memctx *kvm_memctx;
+
+ kvm_memctx = kzalloc(sizeof(*kvm_memctx), GFP_KERNEL);
+ if (!kvm_memctx)
+ return NULL;
+
+ kvm_get_kvm(kvm);
+ kvm_memctx->kvm = kvm;
+
+ kvm_memctx->taskmem = task_memctx_alloc(current);
+ vbus_memctx_init(&kvm_memctx->ctx, &kvm_memctx_ops);
+
+ return &kvm_memctx->ctx;
+}
+
+/*
+ * -----------------
+ * general routines
+ * -----------------
+ */
+
+static int
+_signal_init(struct kvm *kvm, struct shm_signal_desc *desc,
+ struct shm_signal *signal, struct shm_signal_ops *ops)
+{
+ if (desc->magic != SHM_SIGNAL_MAGIC)
+ return -EINVAL;
+
+ if (desc->ver != SHM_SIGNAL_VER)
+ return -EINVAL;
+
+ shm_signal_init(signal);
+
+ signal->locale = shm_locality_south;
+ signal->ops = ops;
+ signal->desc = desc;
+
+ return 0;
+}
+
+static struct kvm_vbus_event *
+event_ptr_translate(struct kvm_vbus_eventq *eventq, u64 ptr)
+{
+ u64 off = ptr - eventq->ringdata.gpa;
+
+ if ((ptr < eventq->ringdata.gpa)
+ || (off > (eventq->ringdata.len - sizeof(struct kvm_vbus_event))))
+ return NULL;
+
+ return eventq->ringdata.ptr + off;
+}
+
+/*
+ * ------------------
+ * event-object code
+ * ------------------
+ */
+
+struct _event {
+ atomic_t refs;
+ struct list_head list;
+ struct kvm_vbus_event data;
+};
+
+static void
+_event_init(struct _event *event)
+{
+ memset(event, 0, sizeof(*event));
+ atomic_set(&event->refs, 1);
+ INIT_LIST_HEAD(&event->list);
+}
+
+static void
+_event_get(struct _event *event)
+{
+ atomic_inc(&event->refs);
+}
+
+static inline void
+_event_put(struct _event *event)
+{
+ if (atomic_dec_and_test(&event->refs))
+ kfree(event);
+}
+
+/*
+ * ------------------
+ * event-inject code
+ * ------------------
+ */
+
+static struct kvm_vbus_eventq *notify_to_eventq(struct ioq_notifier *notifier)
+{
+ return container_of(notifier, struct kvm_vbus_eventq, notifier);
+}
+
+static struct kvm_vbus_eventq *work_to_eventq(struct work_struct *work)
+{
+ return container_of(work, struct kvm_vbus_eventq, work);
+}
+
+/*
+ * This is invoked whenever the guest signals our eventq while
+ * we have notifications enabled
+ */
+static void
+eventq_notify(struct ioq_notifier *notifier)
+{
+ struct kvm_vbus_eventq *eventq = notify_to_eventq(notifier);
+ unsigned long flags;
+
+ spin_lock_irqsave(&eventq->lock, flags);
+
+ if (!ioq_full(eventq->ioq, ioq_idxtype_inuse)) {
+ eventq->backpressure = false;
+ ioq_notify_disable(eventq->ioq, 0);
+ schedule_work(&eventq->work);
+ }
+
+ spin_unlock_irqrestore(&eventq->lock, flags);
+}
+
+static void
+events_flush(struct kvm_vbus_eventq *eventq)
+{
+ struct ioq_iterator iter;
+ int ret;
+ unsigned long flags;
+ struct _event *_event, *tmp;
+ int dirty = 0;
+
+ spin_lock_irqsave(&eventq->lock, flags);
+
+ /* We want to iterate on the tail of the in-use index */
+ ret = ioq_iter_init(eventq->ioq, &iter, ioq_idxtype_inuse, 0);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
+ BUG_ON(ret < 0);
+
+ list_for_each_entry_safe(_event, tmp, &eventq->backlog, list) {
+ struct kvm_vbus_event *ev;
+
+ if (!iter.desc->sown) {
+ eventq->backpressure = true;
+ ioq_notify_enable(eventq->ioq, 0);
+ break;
+ }
+
+ if (iter.desc->len < sizeof(*ev)) {
+ SHM_SIGNAL_FAULT(eventq->ioq->signal,
+ "Desc too small on eventq: %p: %d<%d",
+ iter.desc->ptr,
+ iter.desc->len, sizeof(*ev));
+ break;
+ }
+
+ ev = event_ptr_translate(eventq, iter.desc->ptr);
+ if (!ev) {
+ SHM_SIGNAL_FAULT(eventq->ioq->signal,
+ "Invalid address on eventq: %p",
+ iter.desc->ptr);
+ break;
+ }
+
+ memcpy(ev, &_event->data, sizeof(*ev));
+
+ list_del_init(&_event->list);
+ _event_put(_event);
+
+ ret = ioq_iter_push(&iter, 0);
+ BUG_ON(ret < 0);
+
+ dirty = 1;
+ }
+
+ spin_unlock_irqrestore(&eventq->lock, flags);
+
+ /*
+ * Signal the IOQ outside of the spinlock so that we can potentially
+ * directly inject this interrupt instead of deferring it
+ */
+ if (dirty)
+ ioq_signal(eventq->ioq, 0);
+}
+
+static int
+event_inject(struct kvm_vbus_eventq *eventq, struct _event *_event)
+{
+ unsigned long flags;
+
+ if (!list_empty(&_event->list))
+ return -EBUSY;
+
+ spin_lock_irqsave(&eventq->lock, flags);
+ list_add_tail(&_event->list, &eventq->backlog);
+ spin_unlock_irqrestore(&eventq->lock, flags);
+
+ events_flush(eventq);
+
+ return 0;
+}
+
+static void
+eventq_reinject(struct work_struct *work)
+{
+ struct kvm_vbus_eventq *eventq = work_to_eventq(work);
+
+ events_flush(eventq);
+}
+
+/*
+ * devadd/drop are in the slow path and are rare enough that we will
+ * simply allocate memory for the event from the heap
+ */
+static int
+devadd_inject(struct kvm_vbus_eventq *eventq, const char *type, u64 id)
+{
+ struct _event *_event;
+ struct kvm_vbus_add_event *ae;
+ int ret;
+
+ _event = kmalloc(sizeof(*_event), GFP_KERNEL);
+ if (!_event)
+ return -ENOMEM;
+
+ _event_init(_event);
+
+ _event->data.eventid = KVM_VBUS_EVENT_DEVADD;
+ ae = (struct kvm_vbus_add_event *)&_event->data.data;
+ ae->id = id;
+ strlcpy(ae->type, type, VBUS_MAX_DEVTYPE_LEN);
+
+ ret = event_inject(eventq, _event);
+ if (ret < 0)
+ _event_put(_event);
+
+ return ret;
+}
+
+/*
+ * "handle" events are used to send any kind of event that simply
+ * uses a handle as a parameter. This includes things like DEVDROP
+ * and SHMSIGNAL, etc.
+ */
+static struct _event *
+handle_event_alloc(u64 id, u64 handle)
+{
+ struct _event *_event;
+ struct kvm_vbus_handle_event *he;
+
+ _event = kmalloc(sizeof(*_event), GFP_KERNEL);
+ if (!_event)
+ return NULL;
+
+ _event_init(_event);
+ _event->data.eventid = id;
+
+ he = (struct kvm_vbus_handle_event *)&_event->data.data;
+ he->handle = handle;
+
+ return _event;
+}
+
+static int
+devdrop_inject(struct kvm_vbus_eventq *eventq, u64 id)
+{
+ struct _event *_event;
+ int ret;
+
+ _event = handle_event_alloc(KVM_VBUS_EVENT_DEVDROP, id);
+ if (!_event)
+ return -ENOMEM;
+
+ ret = event_inject(eventq, _event);
+ if (ret < 0)
+ _event_put(_event);
+
+ return ret;
+}
+
+static struct kvm_vbus_eventq *
+prio_to_eventq(struct kvm_vbus *kvbus, int prio)
+{
+ /*
+ * NOTE: priority is ignored for now...all events aggregate onto a
+ * single queue
+ */
+
+ return &kvbus->eventq;
+}
+
+/*
+ * -----------------
+ * event ioq
+ *
+ * This queue is used by the infrastructure to transmit events (such as
+ * "new device", or "signal an ioq") to the guest. We do this so that
+ * we minimize the number of hypercalls required to inject an event.
+ * In theory, the guest only needs to process a single interrupt vector
+ * and it doesn't require switching back to host context since the state
+ * is placed within the ring
+ * -----------------
+ */
+
+struct eventq_signal {
+ struct kvm_vbus *kvbus;
+ struct vbus_shm *shm;
+ struct shm_signal signal;
+ int irq;
+};
+
+static struct eventq_signal *signal_to_eventq(struct shm_signal *signal)
+{
+ return container_of(signal, struct eventq_signal, signal);
+}
+
+static int
+eventq_signal_inject(struct shm_signal *signal)
+{
+ struct eventq_signal *_signal = signal_to_eventq(signal);
+ struct kvm *kvm = _signal->kvbus->kvm;
+
+ /* Inject an interrupt to the guest */
+ kvm_inject_dynirq(kvm, _signal->irq);
+
+ return 0;
+}
+
+static void
+eventq_signal_release(struct shm_signal *signal)
+{
+ struct eventq_signal *_signal = signal_to_eventq(signal);
+
+ vbus_shm_put(_signal->shm);
+ kfree(_signal);
+}
+
+static struct shm_signal_ops eventq_signal_ops = {
+ .inject = eventq_signal_inject,
+ .release = eventq_signal_release,
+};
+
+static int
+_eventq_attach(struct kvm_vbus *kvbus, __u32 count, __u64 ptr, int irq,
+ struct ioq **ioq)
+{
+ struct ioq_ring_head *desc;
+ struct eventq_signal *_signal = NULL;
+ struct kvm_shm *_shm = NULL;
+ size_t len = IOQ_HEAD_DESC_SIZE(count);
+ int ret;
+
+ ret = kvm_shm_map(kvbus, ptr, len, &_shm);
+ if (ret < 0)
+ return ret;
+
+ _signal = kzalloc(sizeof(*_signal), GFP_KERNEL);
+ if (!_signal) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ desc = _shm->shm.ptr;
+
+ ret = _signal_init(kvbus->kvm,
+ &desc->signal,
+ &_signal->signal,
+ &eventq_signal_ops);
+ if (ret < 0) {
+ kfree(_signal);
+ _signal = NULL;
+ goto error;
+ }
+
+ _signal->kvbus = kvbus;
+ _signal->irq = irq;
+ _signal->shm = &_shm->shm;
+ vbus_shm_get(&_shm->shm); /* dropped when the signal releases */
+
+ /* FIXME: we should make maxcount configurable */
+ ret = vbus_shm_ioq_attach(&_shm->shm, &_signal->signal, 2048, ioq);
+ if (ret < 0)
+ goto error;
+
+ return 0;
+
+error:
+ if (_signal)
+ shm_signal_put(&_signal->signal);
+
+ if (_shm)
+ vbus_shm_put(&_shm->shm);
+
+ return ret;
+}
+
+/*
+ * -----------------
+ * device_signal routines
+ *
+ * This is the more standard signal that is allocated to communicate
+ * with a specific device's shm region
+ * -----------------
+ */
+
+struct device_signal {
+ struct kvm_vbus *kvbus;
+ struct vbus_shm *shm;
+ struct shm_signal signal;
+ struct _event *inject;
+ int prio;
+ u64 handle;
+};
+
+static struct device_signal *to_dsig(struct shm_signal *signal)
+{
+ return container_of(signal, struct device_signal, signal);
+}
+
+static void
+_device_signal_inject(struct device_signal *_signal)
+{
+ struct kvm_vbus_eventq *eventq;
+ int ret;
+
+ eventq = prio_to_eventq(_signal->kvbus, _signal->prio);
+
+ ret = event_inject(eventq, _signal->inject);
+ if (ret < 0)
+ _event_put(_signal->inject);
+}
+
+static int
+device_signal_inject(struct shm_signal *signal)
+{
+ struct device_signal *_signal = to_dsig(signal);
+
+ _event_get(_signal->inject); /* will be dropped by injection code */
+ _device_signal_inject(_signal);
+
+ return 0;
+}
+
+static void
+device_signal_release(struct shm_signal *signal)
+{
+ struct device_signal *_signal = to_dsig(signal);
+ struct kvm_vbus_eventq *eventq;
+ unsigned long flags;
+
+ eventq = prio_to_eventq(_signal->kvbus, _signal->prio);
+
+ /*
+ * Change the event-type while holding the lock so we do not race
+ * with any potential threads already processing the queue
+ */
+ spin_lock_irqsave(&eventq->lock, flags);
+ _signal->inject->data.eventid = KVM_VBUS_EVENT_SHMCLOSE;
+ spin_unlock_irqrestore(&eventq->lock, flags);
+
+ /*
+ * Do not take a reference to the event; the last reference will be
+ * dropped once it has been transmitted.
+ */
+ _device_signal_inject(_signal);
+
+ vbus_shm_put(_signal->shm);
+ kfree(_signal);
+}
+
+static struct shm_signal_ops device_signal_ops = {
+ .inject = device_signal_inject,
+ .release = device_signal_release,
+};
+
+static int
+device_signal_alloc(struct kvm_vbus *kvbus, struct vbus_shm *shm,
+ u32 offset, u32 prio, u64 cookie,
+ struct device_signal **dsignal)
+{
+ struct device_signal *_signal;
+ int ret;
+
+ _signal = kzalloc(sizeof(*_signal), GFP_KERNEL);
+ if (!_signal)
+ return -ENOMEM;
+
+ ret = _signal_init(kvbus->kvm, shm->ptr + offset,
+ &_signal->signal,
+ &device_signal_ops);
+ if (ret < 0) {
+ kfree(_signal);
+ return ret;
+ }
+
+ _signal->inject = handle_event_alloc(KVM_VBUS_EVENT_SHMSIGNAL, cookie);
+ if (!_signal->inject) {
+ shm_signal_put(&_signal->signal);
+ return -ENOMEM;
+ }
+
+ _signal->kvbus = kvbus;
+ _signal->shm = shm;
+ _signal->prio = prio;
+ vbus_shm_get(shm); /* dropped when the signal is released */
+
+ *dsignal = _signal;
+
+ return 0;
+}
+
+/*
+ * ------------------
+ * notifiers
+ * ------------------
+ */
+
+/*
+ * This is called whenever our associated vbus emits an event. We inject
+ * these events at the highest logical priority
+ */
+static int
+vbus_notifier(struct notifier_block *nb, unsigned long nr, void *data)
+{
+ struct kvm_vbus *kvbus = container_of(nb, struct kvm_vbus, notify.vbus);
+ struct kvm_vbus_eventq *eventq = prio_to_eventq(kvbus, 0);
+
+ switch (nr) {
+ case VBUS_EVENT_DEVADD: {
+ struct vbus_event_devadd *ev = data;
+
+ devadd_inject(eventq, ev->type, ev->id);
+ break;
+ }
+ case VBUS_EVENT_DEVDROP: {
+ unsigned long id = *(unsigned long *)data;
+
+ devdrop_inject(eventq, id);
+ break;
+ }
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static void
+deferred_destruct(struct work_struct *work)
+{
+ struct kvm_vbus *kvbus = container_of(work, struct kvm_vbus, destruct);
+
+ kvm_vbus_release(kvbus);
+}
+
+/*
+ * This is called if the guest reboots...we should release our association
+ * with the vbus (if any)
+ */
+static int
+reset_notifier(struct notifier_block *nb, unsigned long nr, void *data)
+{
+ struct kvm_vbus *kvbus = container_of(nb, struct kvm_vbus,
+ notify.reset);
+
+ /* Clear the association first so new hypercalls see no bus */
+ kvbus->kvm->kvbus = NULL;
+ schedule_work(&kvbus->destruct);
+
+ return NOTIFY_DONE;
+}
+
+static int
+kvm_vbus_eventq_attach(struct kvm_vbus *kvbus, struct kvm_vbus_eventq *eventq,
+ u32 count, u64 ring, u64 data, int irq)
+{
+ struct ioq *ioq;
+ size_t len;
+ void *ptr;
+ int ret;
+
+ if (eventq->ioq)
+ return -EINVAL;
+
+ ret = _eventq_attach(kvbus, count, ring, irq, &ioq);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * We are going to pre-vmap the eventq data for performance reasons
+ */
+ len = count * sizeof(struct kvm_vbus_event);
+ ptr = kvm_vmap(kvbus->kvm, data, len);
+ if (!ptr) {
+ ioq_put(ioq);
+ return -EFAULT;
+ }
+
+ spin_lock_init(&eventq->lock);
+ eventq->ioq = ioq;
+ INIT_WORK(&eventq->work, eventq_reinject);
+
+ eventq->notifier.signal = eventq_notify;
+ ioq->notifier = &eventq->notifier;
+
+ INIT_LIST_HEAD(&eventq->backlog);
+
+ eventq->ringdata.len = len;
+ eventq->ringdata.gpa = data;
+ eventq->ringdata.ptr = ptr;
+
+ return 0;
+}
+
+static void
+kvm_vbus_eventq_detach(struct kvm_vbus_eventq *eventq)
+{
+ if (eventq->ioq)
+ ioq_put(eventq->ioq);
+
+ if (eventq->ringdata.ptr)
+ kvm_vunmap(eventq->ringdata.ptr);
+}
+
+static int
+kvm_vbus_alloc(struct kvm_vcpu *vcpu)
+{
+ struct vbus *vbus = task_vbus_get(current);
+ struct vbus_client *client;
+ struct kvm_vbus *kvbus;
+ int ret;
+
+ if (!vbus)
+ return -EPERM;
+
+ client = vbus_client_attach(vbus);
+ if (!client) {
+ vbus_put(vbus);
+ return -ENOMEM;
+ }
+
+ kvbus = kzalloc(sizeof(*kvbus), GFP_KERNEL);
+ if (!kvbus) {
+ vbus_put(vbus);
+ vbus_client_put(client);
+ return -ENOMEM;
+ }
+
+ mutex_init(&kvbus->lock);
+ kvbus->state = kvm_vbus_state_registration;
+ kvbus->kvm = vcpu->kvm;
+ kvbus->vbus = vbus;
+ kvbus->client = client;
+
+ vcpu->kvm->kvbus = kvbus;
+
+ INIT_WORK(&kvbus->destruct, deferred_destruct);
+ kvbus->ctx = kvm_memctx_alloc(vcpu->kvm);
+
+ kvbus->notify.vbus.notifier_call = vbus_notifier;
+ kvbus->notify.vbus.priority = 0;
+
+ kvbus->notify.reset.notifier_call = reset_notifier;
+ kvbus->notify.reset.priority = 0;
+
+ ret = kvm_reset_notifier_register(vcpu->kvm, &kvbus->notify.reset);
+ if (ret < 0) {
+ kvm_vbus_release(kvbus);
+ return ret;
+ }
+
+ return 0;
+}
+
+void
+kvm_vbus_release(struct kvm_vbus *kvbus)
+{
+ if (!kvbus)
+ return;
+
+ if (kvbus->ctx)
+ vbus_memctx_put(kvbus->ctx);
+
+ kvm_vbus_eventq_detach(&kvbus->eventq);
+
+ if (kvbus->client)
+ vbus_client_put(kvbus->client);
+
+ if (kvbus->vbus) {
+ vbus_notifier_unregister(kvbus->vbus, &kvbus->notify.vbus);
+ vbus_put(kvbus->vbus);
+ }
+
+ kvm_reset_notifier_unregister(kvbus->kvm, &kvbus->notify.reset);
+
+ flush_scheduled_work();
+
+ kvbus->kvm->kvbus = NULL;
+
+ kfree(kvbus);
+}
+
+/*
+ * ------------------
+ * hypercall implementation
+ * ------------------
+ */
+
+static int
+hc_busopen(struct kvm_vcpu *vcpu, void *data)
+{
+ struct kvm_vbus_busopen *args = data;
+
+ if (vcpu->kvm->kvbus)
+ return -EEXIST;
+
+ if (args->magic != KVM_VBUS_MAGIC)
+ return -EINVAL;
+
+ if (args->version != KVM_VBUS_VERSION)
+ return -EINVAL;
+
+ args->capabilities = 0;
+
+ return kvm_vbus_alloc(vcpu);
+}
+
+static int
+hc_busreg(struct kvm_vcpu *vcpu, void *data)
+{
+ struct kvm_vbus_busreg *args = data;
+ struct kvm_vbus_eventqreg *qreg = &args->eventq[0];
+ struct kvm_vbus *kvbus = vcpu->kvm->kvbus;
+ int ret;
+
+ if (args->count != 1)
+ return -EINVAL;
+
+ ret = kvm_vbus_eventq_attach(kvbus,
+ &kvbus->eventq,
+ qreg->count,
+ qreg->ring,
+ qreg->data,
+ qreg->irq);
+ if (ret < 0)
+ return ret;
+
+ ret = vbus_notifier_register(kvbus->vbus, &kvbus->notify.vbus);
+ if (ret < 0)
+ return ret;
+
+ kvbus->state = kvm_vbus_state_running;
+
+ return 0;
+}
+
+static int
+hc_deviceopen(struct kvm_vcpu *vcpu, void *data)
+{
+ struct vbus_deviceopen *args = data;
+ struct kvm_vbus *kvbus = vcpu->kvm->kvbus;
+ struct vbus_client *c = kvbus->client;
+
+ return c->ops->deviceopen(c, kvbus->ctx,
+ args->devid, args->version, &args->handle);
+}
+
+static int
+hc_deviceclose(struct kvm_vcpu *vcpu, void *data)
+{
+ __u64 devh = *(__u64 *)data;
+ struct vbus_client *c = to_client(vcpu);
+
+ return c->ops->deviceclose(c, devh);
+}
+
+static int
+hc_devicecall(struct kvm_vcpu *vcpu, void *data)
+{
+ struct vbus_devicecall *args = data;
+ struct vbus_client *c = to_client(vcpu);
+
+ return c->ops->devicecall(c, args->devh, args->func,
+ (void *)args->datap, args->len, args->flags);
+}
+
+static int
+hc_deviceshm(struct kvm_vcpu *vcpu, void *data)
+{
+ struct vbus_deviceshm *args = data;
+ struct kvm_vbus *kvbus = vcpu->kvm->kvbus;
+ struct vbus_client *c = to_client(vcpu);
+ struct device_signal *_signal = NULL;
+ struct shm_signal *signal = NULL;
+ struct kvm_shm *_shm;
+ u64 handle;
+ int ret;
+
+ ret = kvm_shm_map(kvbus, args->datap, args->len, &_shm);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Establishing a signal is optional
+ */
+ if (args->signal.offset != -1) {
+ ret = device_signal_alloc(kvbus, &_shm->shm,
+ args->signal.offset,
+ args->signal.prio,
+ args->signal.cookie,
+ &_signal);
+ if (ret < 0)
+ goto out;
+
+ signal = &_signal->signal;
+ }
+
+ ret = c->ops->deviceshm(c, args->devh, args->id,
+ &_shm->shm, signal,
+ args->flags, &handle);
+ if (ret < 0)
+ goto out;
+
+ args->handle = handle;
+ if (_signal)
+ _signal->handle = handle;
+
+ return 0;
+
+out:
+ if (signal)
+ shm_signal_put(signal);
+
+ vbus_shm_put(&_shm->shm);
+ return ret;
+}
+
+static int
+hc_shmsignal(struct kvm_vcpu *vcpu, void *data)
+{
+ __u64 handle = *(__u64 *)data;
+ struct kvm_vbus *kvbus;
+ struct vbus_client *c = to_client(vcpu);
+
+ /* A non-zero handle is targeted at a device's shm */
+ if (handle)
+ return c->ops->shmsignal(c, handle);
+
+ kvbus = vcpu->kvm->kvbus;
+
+ /* A null handle is signaling our eventq */
+ _shm_signal_wakeup(kvbus->eventq.ioq->signal);
+
+ return 0;
+}
+
+struct hc_op {
+ int nr;
+ int len;
+ int dirty;
+ int (*func)(struct kvm_vcpu *vcpu, void *args);
+};
+
+static struct hc_op _hc_busopen = {
+ .nr = KVM_VBUS_OP_BUSOPEN,
+ .len = sizeof(struct kvm_vbus_busopen),
+ .dirty = 1,
+ .func = &hc_busopen,
+};
+
+static struct hc_op _hc_busreg = {
+ .nr = KVM_VBUS_OP_BUSREG,
+ .len = sizeof(struct kvm_vbus_busreg),
+ .func = &hc_busreg,
+};
+
+static struct hc_op _hc_devopen = {
+ .nr = KVM_VBUS_OP_DEVOPEN,
+ .len = sizeof(struct vbus_deviceopen),
+ .dirty = 1,
+ .func = &hc_deviceopen,
+};
+
+static struct hc_op _hc_devclose = {
+ .nr = KVM_VBUS_OP_DEVCLOSE,
+ .len = sizeof(u64),
+ .func = &hc_deviceclose,
+};
+
+static struct hc_op _hc_devcall = {
+ .nr = KVM_VBUS_OP_DEVCALL,
+ .len = sizeof(struct vbus_devicecall),
+ .func = &hc_devicecall,
+};
+
+static struct hc_op _hc_devshm = {
+ .nr = KVM_VBUS_OP_DEVSHM,
+ .len = sizeof(struct vbus_deviceshm),
+ .dirty = 1,
+ .func = &hc_deviceshm,
+};
+
+static struct hc_op _hc_shmsignal = {
+ .nr = KVM_VBUS_OP_SHMSIGNAL,
+ .len = sizeof(u64),
+ .func = &hc_shmsignal,
+};
+
+static struct hc_op *hc_ops[] = {
+ &_hc_busopen,
+ &_hc_busreg,
+ &_hc_devopen,
+ &_hc_devclose,
+ &_hc_devcall,
+ &_hc_devshm,
+ &_hc_shmsignal,
+ NULL,
+};
+
+static int
+hc_execute_indirect(struct kvm_vcpu *vcpu, struct hc_op *op, gpa_t gpa)
+{
+ struct kvm *kvm = vcpu->kvm;
+ char *args = NULL;
+ int ret;
+
+ BUG_ON(!op->len);
+
+ args = kmalloc(op->len, GFP_KERNEL);
+ if (!args)
+ return -ENOMEM;
+
+ ret = kvm_read_guest(kvm, gpa, args, op->len);
+ if (ret < 0)
+ goto out;
+
+ ret = op->func(vcpu, args);
+
+ if (ret >= 0 && op->dirty)
+ ret = kvm_write_guest(kvm, gpa, args, op->len);
+
+out:
+ kfree(args);
+
+ return ret;
+}
+
+static int
+hc_execute_direct(struct kvm_vcpu *vcpu, struct hc_op *op, gpa_t gpa)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct page *page;
+ char *kaddr = NULL;
+ void *args;
+ int ret;
+
+ page = gfn_to_page(kvm, gpa >> PAGE_SHIFT);
+ if (page == bad_page) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ kaddr = kmap(page);
+ if (!kaddr) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ args = kaddr + offset_in_page(gpa);
+
+ ret = op->func(vcpu, args);
+
+out:
+ if (kaddr)
+ kunmap(page);
+
+ if (ret >= 0 && op->dirty)
+ kvm_release_page_dirty(page);
+ else
+ kvm_release_page_clean(page);
+
+ return ret;
+}
+
+static int
+hc_execute(struct kvm_vcpu *vcpu, struct hc_op *op, gpa_t gpa, size_t len)
+{
+ if (len != op->len)
+ return -EINVAL;
+
+ /*
+ * Execute-immediate if there is no data
+ */
+ if (!len)
+ return op->func(vcpu, NULL);
+
+ /*
+ * We will need to copy the arguments in the unlikely case that the
+ * gpa pointer crosses a page boundary
+ *
+ * FIXME: Is it safe to assume PAGE_SIZE is relevant to gpa?
+ */
+ if (unlikely((offset_in_page(gpa) + len) > PAGE_SIZE))
+ return hc_execute_indirect(vcpu, op, gpa);
+
+ /*
+ * Otherwise just execute with zero-copy by mapping the arguments
+ */
+ return hc_execute_direct(vcpu, op, gpa);
+}
+
+/*
+ * Our hypercall format will always follow with the call-id in arg[0],
+ * a pointer to the arguments in arg[1], and the argument length in arg[2]
+ */
+int
+kvm_vbus_hc(struct kvm_vcpu *vcpu, int nr, gpa_t gpa, size_t len)
+{
+ struct kvm_vbus *kvbus = vcpu->kvm->kvbus;
+ enum kvm_vbus_state state = kvbus ? kvbus->state : kvm_vbus_state_init;
+ int i;
+
+ PDEBUG("nr=%d, state=%d\n", nr, state);
+
+ switch (state) {
+ case kvm_vbus_state_init:
+ if (nr != KVM_VBUS_OP_BUSOPEN) {
+ PDEBUG("expected BUSOPEN\n");
+ return -EINVAL;
+ }
+ break;
+ case kvm_vbus_state_registration:
+ if (nr != KVM_VBUS_OP_BUSREG) {
+ PDEBUG("expected BUSREG\n");
+ return -EINVAL;
+ }
+ break;
+ default:
+ break;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(hc_ops); i++) {
+ struct hc_op *op = hc_ops[i];
+
+ if (!op)
+ break;
+
+ if (op->nr != nr)
+ continue;
+
+ return hc_execute(vcpu, op, gpa, len);
+ }
+
+ PDEBUG("error: no matching function for nr=%d\n", nr);
+
+ return -EINVAL;
+}
This allows userspace applications to access vbus devices.
Signed-off-by: Gregory Haskins <[email protected]>
---
include/linux/vbus.h | 4
include/linux/vbus_client.h | 2
include/linux/vbus_userspace.h | 48 ++++
kernel/vbus/Kconfig | 10 +
kernel/vbus/Makefile | 2
kernel/vbus/userspace-client.c | 485 ++++++++++++++++++++++++++++++++++++++++
6 files changed, 550 insertions(+), 1 deletions(-)
create mode 100644 include/linux/vbus_userspace.h
create mode 100644 kernel/vbus/userspace-client.c
diff --git a/include/linux/vbus.h b/include/linux/vbus.h
index 04db4ff..f967e59 100644
--- a/include/linux/vbus.h
+++ b/include/linux/vbus.h
@@ -23,6 +23,8 @@
#ifndef _LINUX_VBUS_H
#define _LINUX_VBUS_H
+#ifdef __KERNEL__
+
#ifdef CONFIG_VBUS
#include <linux/module.h>
@@ -159,4 +161,6 @@ int vbus_notifier_unregister(struct vbus *vbus, struct notifier_block *nb);
#endif /* CONFIG_VBUS */
+#endif /* __KERNEL__ */
+
#endif /* _LINUX_VBUS_H */
diff --git a/include/linux/vbus_client.h b/include/linux/vbus_client.h
index 62dab78..4c82822 100644
--- a/include/linux/vbus_client.h
+++ b/include/linux/vbus_client.h
@@ -35,7 +35,7 @@
#ifndef _LINUX_VBUS_CLIENT_H
#define _LINUX_VBUS_CLIENT_H
-#include <linux/types.h>
+#include <asm/types.h>
#include <linux/compiler.h>
struct vbus_deviceopen {
diff --git a/include/linux/vbus_userspace.h b/include/linux/vbus_userspace.h
new file mode 100644
index 0000000..0b78686
--- /dev/null
+++ b/include/linux/vbus_userspace.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Virtual-Bus
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_VBUS_USERSPACE_H
+#define _LINUX_VBUS_USERSPACE_H
+
+#include <linux/ioctl.h>
+#include <linux/vbus.h>
+#include <linux/vbus_client.h>
+
+#define VBUS_USERSPACE_ABI_MAGIC 0x4fa23b58
+#define VBUS_USERSPACE_ABI_VERSION 1
+
+struct vbus_userspace_busopen {
+ __u32 magic;
+ __u32 version;
+ __u64 capabilities;
+};
+
+#define VBUS_IOCTL_MAGIC 'v'
+
+#define VBUS_BUSOPEN _IOWR(VBUS_IOCTL_MAGIC, 0x00, struct vbus_userspace_busopen)
+#define VBUS_DEVICEOPEN _IOWR(VBUS_IOCTL_MAGIC, 0x01, struct vbus_deviceopen)
+#define VBUS_DEVICECLOSE _IOWR(VBUS_IOCTL_MAGIC, 0x02, __u64)
+#define VBUS_DEVICECALL _IOWR(VBUS_IOCTL_MAGIC, 0x03, struct vbus_devicecall)
+#define VBUS_DEVICESHM _IOWR(VBUS_IOCTL_MAGIC, 0x04, struct vbus_deviceshm)
+#define VBUS_SHMSIGNAL _IOWR(VBUS_IOCTL_MAGIC, 0x05, __u64)
+
+#endif /* _LINUX_VBUS_USERSPACE_H */
diff --git a/kernel/vbus/Kconfig b/kernel/vbus/Kconfig
index 3ce0adc..b894dd1 100644
--- a/kernel/vbus/Kconfig
+++ b/kernel/vbus/Kconfig
@@ -25,6 +25,16 @@ config VBUS_DEVICES
source "drivers/vbus/devices/Kconfig"
+config VBUS_USERSPACE
+ tristate "Virtual-Bus userspace client support"
+ depends on VBUS
+ default y
+ help
+ Provides facilities for userspace applications to access
+ virtual-bus objects.
+
+ If unsure, say N.
+
config VBUS_DRIVERS
tristate "VBUS Driver support"
select IOQ
diff --git a/kernel/vbus/Makefile b/kernel/vbus/Makefile
index 45f6503..61d0371 100644
--- a/kernel/vbus/Makefile
+++ b/kernel/vbus/Makefile
@@ -4,3 +4,5 @@ obj-$(CONFIG_VBUS) += shm-ioq.o
vbus-proxy-objs += proxy.o
obj-$(CONFIG_VBUS_DRIVERS) += vbus-proxy.o
+vbus-userspace-objs += userspace-client.o
+obj-$(CONFIG_VBUS_USERSPACE) += vbus-userspace.o
diff --git a/kernel/vbus/userspace-client.c b/kernel/vbus/userspace-client.c
new file mode 100644
index 0000000..b2fe447
--- /dev/null
+++ b/kernel/vbus/userspace-client.c
@@ -0,0 +1,485 @@
+#include <linux/ioctl.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <linux/vbus.h>
+#include <linux/vbus_userspace.h>
+
+#include "vbus.h"
+
+MODULE_AUTHOR("Gregory Haskins");
+MODULE_LICENSE("GPL");
+
+struct userspace_chardev;
+
+struct userspace_signal {
+ struct userspace_chardev *cd;
+ struct vbus_shm *shm;
+ struct shm_signal signal;
+ struct list_head list;
+ int signaled;
+ int prio;
+ u64 cookie;
+};
+
+struct userspace_shm {
+ struct vbus_shm shm;
+};
+
+struct userspace_chardev {
+ spinlock_t lock;
+ int opened;
+ struct vbus_memctx *ctx;
+ struct vbus_client *client;
+ struct list_head signal_list;
+ wait_queue_head_t wq;
+ struct vbus *vbus;
+};
+
+static long
+_busopen(struct userspace_chardev *cd, struct vbus_userspace_busopen *args)
+{
+ if (cd->opened)
+ return -EINVAL;
+
+ if (args->magic != VBUS_USERSPACE_ABI_MAGIC)
+ return -EINVAL;
+
+ if (args->version != VBUS_USERSPACE_ABI_VERSION)
+ return -EINVAL;
+
+ /*
+ * We have no extended capabilities yet, so we don't care if they set
+ * any option bits. Just clear them all.
+ */
+ args->capabilities = 0;
+
+ cd->opened = 1;
+
+ return 0;
+}
+
+static long
+_deviceopen(struct userspace_chardev *cd, struct vbus_deviceopen *args)
+{
+ struct vbus_client *c = cd->client;
+
+ return c->ops->deviceopen(c, cd->ctx, args->devid, args->version,
+ &args->handle);
+}
+
+static long
+_deviceclose(struct userspace_chardev *cd, unsigned long devh)
+{
+ struct vbus_client *c = cd->client;
+
+ return c->ops->deviceclose(c, devh);
+}
+
+static long
+_devicecall(struct userspace_chardev *cd, struct vbus_devicecall *args)
+{
+ struct vbus_client *c = cd->client;
+
+ return c->ops->devicecall(c, args->devh, args->func,
+ (void *)args->datap,
+ args->len, args->flags);
+}
+
+static void *
+userspace_vmap(__u64 addr, size_t len)
+{
+ struct page **page_list;
+ void *ptr = NULL;
+ unsigned long base;
+ off_t offset;
+ size_t npages;
+ int ret;
+
+ base = (unsigned long)addr & PAGE_MASK;
+ offset = (unsigned long)addr & ~PAGE_MASK;
+ npages = PAGE_ALIGN(len + offset) >> PAGE_SHIFT;
+
+ if (npages > (PAGE_SIZE / sizeof(struct page *)))
+ return NULL;
+
+ page_list = (struct page **) __get_free_page(GFP_KERNEL);
+ if (!page_list)
+ return NULL;
+
+ down_write(&current->mm->mmap_sem);
+
+ ret = get_user_pages(current, current->mm, base, npages,
+ 1, 0, page_list, NULL);
+ if (ret < 0)
+ goto out;
+
+ ptr = vmap(page_list, npages, VM_MAP, PAGE_KERNEL);
+ if (ptr)
+ current->mm->locked_vm += npages;
+
+out:
+ up_write(&current->mm->mmap_sem);
+ free_page((unsigned long)page_list);
+
+ if (!ptr)
+ return NULL;
+
+ return ptr + offset;
+}
+
+static struct userspace_signal *to_userspace(struct shm_signal *signal)
+{
+ return container_of(signal, struct userspace_signal, signal);
+}
+
+static int
+userspace_signal_inject(struct shm_signal *signal)
+{
+ struct userspace_signal *_signal = to_userspace(signal);
+ struct userspace_chardev *cd = _signal->cd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cd->lock, flags);
+
+ if (!_signal->signaled) {
+ _signal->signaled = 1;
+ list_add_tail(&_signal->list, &cd->signal_list);
+ wake_up_interruptible(&cd->wq);
+ }
+
+ spin_unlock_irqrestore(&cd->lock, flags);
+
+ return 0;
+}
+
+static void
+userspace_signal_release(struct shm_signal *signal)
+{
+ struct userspace_signal *_signal = to_userspace(signal);
+
+ vbus_shm_put(_signal->shm);
+ kfree(_signal);
+}
+
+static struct shm_signal_ops userspace_signal_ops = {
+ .inject = userspace_signal_inject,
+ .release = userspace_signal_release,
+};
+
+static long
+userspace_signal_alloc(struct vbus_shm *shm,
+ u32 offset, u32 prio, u64 cookie,
+ struct userspace_signal **usignal)
+{
+ struct userspace_signal *_signal;
+ struct shm_signal *signal;
+ struct shm_signal_desc *desc;
+ int ret = -EINVAL;
+
+ _signal = kzalloc(sizeof(*_signal), GFP_KERNEL);
+ if (!_signal)
+ return -ENOMEM;
+
+ desc = (struct shm_signal_desc *)(shm->ptr + offset);
+
+ if (desc->magic != SHM_SIGNAL_MAGIC)
+ goto out;
+
+ if (desc->ver != SHM_SIGNAL_VER)
+ goto out;
+
+ signal = &_signal->signal;
+
+ shm_signal_init(signal);
+
+ signal->locale = shm_locality_south;
+ signal->ops = &userspace_signal_ops;
+ signal->desc = desc;
+
+ _signal->shm = shm;
+ _signal->prio = prio;
+ _signal->cookie = cookie;
+ vbus_shm_get(shm); /* dropped when the signal is released */
+
+ *usignal = _signal;
+
+ return 0;
+
+out:
+ kfree(_signal);
+
+ return ret;
+}
+
+static void
+userspace_shm_release(struct vbus_shm *shm)
+{
+ struct userspace_shm *_shm = container_of(shm, struct userspace_shm,
+ shm);
+
+ /* FIXME: do we need to adjust current->mm->locked_vm? */
+ vunmap((void *)((unsigned long)shm->ptr & PAGE_MASK));
+ kfree(_shm);
+}
+
+static struct vbus_shm_ops userspace_shm_ops = {
+ .release = userspace_shm_release,
+};
+
+static int
+userspace_shm_map(struct userspace_chardev *cd,
+ __u64 ptr, __u32 len,
+ struct userspace_shm **ushm)
+{
+ struct userspace_shm *_shm;
+ struct vbus_shm *shm;
+ void *vmap;
+
+ _shm = kzalloc(sizeof(*_shm), GFP_KERNEL);
+ if (!_shm)
+ return -ENOMEM;
+
+ shm = &_shm->shm;
+
+ vmap = userspace_vmap(ptr, len);
+ if (!vmap) {
+ kfree(_shm);
+ return -EFAULT;
+ }
+
+ vbus_shm_init(shm, &userspace_shm_ops, vmap, len);
+
+ *ushm = _shm;
+
+ return 0;
+}
+
+static long
+_deviceshm(struct userspace_chardev *cd, struct vbus_deviceshm *args)
+{
+ struct vbus_client *c = cd->client;
+ struct userspace_signal *_signal = NULL;
+ struct shm_signal *signal = NULL;
+ struct userspace_shm *_shm;
+ u64 handle;
+ long ret;
+
+ ret = userspace_shm_map(cd, args->datap, args->len, &_shm);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Establishing a signal is optional
+ */
+ if (args->signal.offset != -1) {
+ ret = userspace_signal_alloc(&_shm->shm,
+ args->signal.offset,
+ args->signal.prio,
+ args->signal.cookie,
+ &_signal);
+ if (ret < 0)
+ goto out;
+
+ _signal->cd = cd;
+ signal = &_signal->signal;
+ }
+
+ ret = c->ops->deviceshm(c, args->devh, args->id,
+ &_shm->shm, signal,
+ args->flags, &handle);
+ if (ret < 0)
+ goto out;
+
+ args->handle = handle;
+
+ return 0;
+
+out:
+ if (signal)
+ shm_signal_put(signal);
+
+ vbus_shm_put(&_shm->shm);
+ return ret;
+}
+
+static long
+_shmsignal(struct userspace_chardev *cd, unsigned long handle)
+{
+ struct vbus_client *c = cd->client;
+
+ return c->ops->shmsignal(c, handle);
+}
+
+static int
+vbus_chardev_open(struct inode *inode, struct file *filp)
+{
+ struct vbus *vbus = task_vbus_get(current);
+ struct vbus_client *client;
+ struct vbus_memctx *ctx;
+ struct userspace_chardev *cd;
+
+ if (!vbus)
+ return -EPERM;
+
+ client = vbus_client_attach(vbus);
+ vbus_put(vbus);
+ if (!client)
+ return -ENOMEM;
+
+ ctx = task_memctx_alloc(current);
+ if (!ctx) {
+ vbus_client_put(client);
+ return -ENOMEM;
+ }
+
+ cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+ if (!cd) {
+ vbus_memctx_put(ctx);
+ vbus_client_put(client);
+ return -ENOMEM;
+ }
+
+ spin_lock_init(&cd->lock);
+ cd->opened = 0;
+ cd->client = client;
+ cd->ctx = ctx;
+
+ INIT_LIST_HEAD(&cd->signal_list);
+ init_waitqueue_head(&cd->wq);
+
+ filp->private_data = cd;
+
+ return 0;
+}
+
+static long
+vbus_chardev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ struct userspace_chardev *cd = filp->private_data;
+
+ if (!cd->opened && ioctl != VBUS_BUSOPEN)
+ return -EINVAL;
+
+ switch (ioctl) {
+ case VBUS_BUSOPEN:
+ return _busopen(cd, (struct vbus_userspace_busopen *)arg);
+ case VBUS_DEVICEOPEN:
+ return _deviceopen(cd, (struct vbus_deviceopen *)arg);
+ case VBUS_DEVICECLOSE:
+ return _deviceclose(cd, *(__u64 *)arg);
+ case VBUS_DEVICECALL:
+ return _devicecall(cd, (struct vbus_devicecall *)arg);
+ case VBUS_DEVICESHM:
+ return _deviceshm(cd, (struct vbus_deviceshm *)arg);
+ case VBUS_SHMSIGNAL:
+ return _shmsignal(cd, *(__u64 *)arg);
+ default:
+ return -EINVAL;
+ }
+}
+
+static ssize_t
+vbus_chardev_read(struct file *filp, char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ DEFINE_WAIT(wait);
+ struct userspace_chardev *cd = filp->private_data;
+ ssize_t bytes = 0;
+ int count, i;
+ __u64 __user *p = (__u64 __user *)buf;
+ unsigned long flags;
+
+ count = len/sizeof(__u64);
+
+ if (!count)
+ return -EINVAL;
+
+ spin_lock_irqsave(&cd->lock, flags);
+
+ for (;;) {
+ prepare_to_wait(&cd->wq, &wait, TASK_INTERRUPTIBLE);
+
+ if (!list_empty(&cd->signal_list))
+ break;
+
+ if (signal_pending(current)) {
+ finish_wait(&cd->wq, &wait);
+ spin_unlock_irqrestore(&cd->lock, flags);
+ return -EINTR;
+ }
+
+ spin_unlock_irqrestore(&cd->lock, flags);
+ schedule();
+ spin_lock_irqsave(&cd->lock, flags);
+ }
+
+ finish_wait(&cd->wq, &wait);
+
+ for (i = 0; i < count; i++) {
+ struct userspace_signal *_signal;
+ __u64 cookie;
+
+ if (list_empty(&cd->signal_list))
+ break;
+
+ _signal = list_first_entry(&cd->signal_list,
+ struct userspace_signal, list);
+
+ _signal->signaled = 0;
+ list_del(&_signal->list);
+
+ cookie = _signal->cookie;
+
+ put_user(cookie, p++);
+
+ bytes += sizeof(cookie);
+ }
+
+ spin_unlock_irqrestore(&cd->lock, flags);
+
+ return bytes;
+}
+
+static int
+vbus_chardev_release(struct inode *inode, struct file *filp)
+{
+ struct userspace_chardev *cd = filp->private_data;
+
+ vbus_memctx_put(cd->ctx);
+ vbus_client_put(cd->client);
+ kfree(cd);
+
+ return 0;
+}
+
+static const struct file_operations vbus_chardev_ops = {
+ .open = vbus_chardev_open,
+ .read = vbus_chardev_read,
+ .unlocked_ioctl = vbus_chardev_ioctl,
+ .compat_ioctl = vbus_chardev_ioctl,
+ .release = vbus_chardev_release,
+};
+
+static struct miscdevice vbus_chardev = {
+ MISC_DYNAMIC_MINOR,
+ "vbus",
+ &vbus_chardev_ops,
+};
+
+static int __init
+vbus_userspace_init(void)
+{
+ return misc_register(&vbus_chardev);
+}
+
+static void __exit
+vbus_userspace_cleanup(void)
+{
+ misc_deregister(&vbus_chardev);
+}
+
+module_init(vbus_userspace_init);
+module_exit(vbus_userspace_cleanup);
We add a new virtio transport for accessing backends located on vbus. This
complements the existing transports: virtio-pci, virtio-s390, and
virtio-lguest.
Signed-off-by: Gregory Haskins <[email protected]>
---
drivers/virtio/Kconfig | 15 +
drivers/virtio/Makefile | 1
drivers/virtio/virtio_vbus.c | 496 +++++++++++++++++++++++++++++++++
include/linux/virtio_vbus.h | 163 +++++++++++
kernel/vbus/Kconfig | 7
kernel/vbus/Makefile | 3
kernel/vbus/virtio.c | 628 ++++++++++++++++++++++++++++++++++++++++++
7 files changed, 1313 insertions(+), 0 deletions(-)
create mode 100644 drivers/virtio/virtio_vbus.c
create mode 100644 include/linux/virtio_vbus.h
create mode 100644 kernel/vbus/virtio.c
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 3dd6294..e8562ee 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -23,6 +23,21 @@ config VIRTIO_PCI
If unsure, say M.
+config VIRTIO_VBUS
+ tristate "VBUS driver for virtio devices (EXPERIMENTAL)"
+ depends on VBUS_DRIVERS && EXPERIMENTAL
+ select VIRTIO
+ select VIRTIO_RING
+ ---help---
+ This driver provides support for virtio-based paravirtual device
+ drivers over VBUS. This requires that your VMM has appropriate VBUS
+ virtio backends.
+
+ Currently, the ABI is not considered stable so there is no guarantee
+ that this version of the driver will work with your VMM.
+
+ If unsure, say M.
+
config VIRTIO_BALLOON
tristate "Virtio balloon driver (EXPERIMENTAL)"
select VIRTIO
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index 6738c44..0342e42 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1,4 +1,5 @@
obj-$(CONFIG_VIRTIO) += virtio.o
obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
+obj-$(CONFIG_VIRTIO_VBUS) += virtio_vbus.o
obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
diff --git a/drivers/virtio/virtio_vbus.c b/drivers/virtio/virtio_vbus.c
new file mode 100644
index 0000000..ebefcf2
--- /dev/null
+++ b/drivers/virtio/virtio_vbus.c
@@ -0,0 +1,496 @@
+/*
+ * Virtio VBUS driver
+ *
+ * This module allows virtio devices to be used over a virtual-bus device.
+ *
+ * Copyright: Novell, 2009
+ *
+ * Authors:
+ * Gregory Haskins <[email protected]>
+ *
+ * Derived from virtio-pci, written by
+ * Anthony Liguori <[email protected]>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/interrupt.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_vbus.h>
+#include <linux/vbus_driver.h>
+#include <linux/spinlock.h>
+
+MODULE_AUTHOR("Gregory Haskins <[email protected]>");
+MODULE_DESCRIPTION("virtio-vbus");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1");
+
+struct virtio_vbus_priv {
+ struct virtio_device virtio_dev;
+ struct vbus_device_proxy *vbus_dev;
+ struct {
+ struct virtio_vbus_shm *shm;
+ struct shm_signal *signal;
+ struct shm_signal_notifier notifier;
+ } config;
+};
+
+struct vbus_virtqueue {
+ struct virtqueue *vq;
+ u64 index;
+ int num;
+ struct virtio_vbus_shm *shm;
+ size_t size;
+ struct shm_signal *signal;
+ struct shm_signal_notifier notifier;
+};
+
+static struct virtio_vbus_priv *
+virtio_to_priv(struct virtio_device *virtio_dev)
+{
+ return container_of(virtio_dev, struct virtio_vbus_priv, virtio_dev);
+}
+
+static int
+devcall(struct virtio_vbus_priv *priv, u32 func, void *data, size_t len)
+{
+ struct vbus_device_proxy *dev = priv->vbus_dev;
+
+ return dev->ops->call(dev, func, data, len, 0);
+}
+
+/*
+ * This is called whenever the host signals our config-space shm
+ */
+static void
+config_isr(struct shm_signal_notifier *notifier)
+{
+ struct virtio_vbus_priv *priv = container_of(notifier,
+ struct virtio_vbus_priv,
+ config.notifier);
+ struct virtio_driver *drv = container_of(priv->virtio_dev.dev.driver,
+ struct virtio_driver, driver);
+
+ if (drv && drv->config_changed)
+ drv->config_changed(&priv->virtio_dev);
+}
+
+/*
+ * ------------------
+ * virtio config ops
+ * ------------------
+ */
+
+static u32
+_virtio_get_features(struct virtio_device *dev)
+{
+ struct virtio_vbus_priv *priv = virtio_to_priv(dev);
+ u32 features;
+ int ret;
+
+ ret = devcall(priv, VIRTIO_VBUS_FUNC_GET_FEATURES,
+ &features, sizeof(features));
+ BUG_ON(ret < 0);
+
+ /*
+ * When someone needs more than 32 feature bits, we'll need to
+ * steal a bit to indicate that the rest are somewhere else.
+ */
+ return features;
+}
+
+static void
+_virtio_finalize_features(struct virtio_device *dev)
+{
+ struct virtio_vbus_priv *priv = virtio_to_priv(dev);
+ int ret;
+
+ /* Give virtio_ring a chance to accept features. */
+ vring_transport_features(dev);
+
+ /* We only support 32 feature bits. */
+ BUILD_BUG_ON(ARRAY_SIZE(dev->features) != 1);
+
+ ret = devcall(priv, VIRTIO_VBUS_FUNC_FINALIZE_FEATURES,
+ &dev->features[0], sizeof(dev->features[0]));
+ BUG_ON(ret < 0);
+}
+
+static void
+_virtio_get(struct virtio_device *vdev, unsigned offset,
+ void *buf, unsigned len)
+{
+ struct virtio_vbus_priv *priv = virtio_to_priv(vdev);
+
+ BUG_ON((offset + len) > VIRTIO_VBUS_CONFIGSPACE_LEN);
+ memcpy(buf, &priv->config.shm->data[offset], len);
+}
+
+static void
+_virtio_set(struct virtio_device *vdev, unsigned offset,
+ const void *buf, unsigned len)
+{
+ struct virtio_vbus_priv *priv = virtio_to_priv(vdev);
+ int ret;
+
+ BUG_ON((offset + len) > VIRTIO_VBUS_CONFIGSPACE_LEN);
+ memcpy(&priv->config.shm->data[offset], buf, len);
+
+ ret = shm_signal_inject(priv->config.signal, 0);
+ BUG_ON(ret < 0);
+}
+
+static u8
+_virtio_get_status(struct virtio_device *vdev)
+{
+ struct virtio_vbus_priv *priv = virtio_to_priv(vdev);
+ u8 data;
+ int ret;
+
+ ret = devcall(priv, VIRTIO_VBUS_FUNC_GET_STATUS, &data, sizeof(data));
+ BUG_ON(ret < 0);
+
+ return data;
+}
+
+static void
+_virtio_set_status(struct virtio_device *vdev, u8 status)
+{
+ struct virtio_vbus_priv *priv = virtio_to_priv(vdev);
+ int ret;
+
+ /* We should never be setting status to 0. */
+ BUG_ON(status == 0);
+
+ ret = devcall(priv, VIRTIO_VBUS_FUNC_SET_STATUS, &status,
+ sizeof(status));
+ BUG_ON(ret < 0);
+}
+
+static void
+_virtio_reset(struct virtio_device *vdev)
+{
+ struct virtio_vbus_priv *priv = virtio_to_priv(vdev);
+ int ret;
+
+ ret = devcall(priv, VIRTIO_VBUS_FUNC_RESET, NULL, 0);
+ BUG_ON(ret < 0);
+}
+
+/*
+ * ------------------
+ * virtqueue ops
+ * ------------------
+ */
+
+static int
+_vq_getlen(struct virtio_vbus_priv *priv, int index)
+{
+ struct virtio_vbus_queryqueue query = {
+ .index = index,
+ };
+ int ret;
+
+ ret = devcall(priv, VIRTIO_VBUS_FUNC_QUERY_QUEUE,
+ &query, sizeof(query));
+ if (ret < 0)
+ return ret;
+
+ return query.num;
+}
+
+static void
+_vq_kick(struct virtqueue *vq)
+{
+ struct vbus_virtqueue *_vq = vq->priv;
+ int ret;
+
+ ret = shm_signal_inject(_vq->signal, 0);
+ BUG_ON(ret < 0);
+}
+
+/*
+ * This is called whenever the host signals our virtqueue
+ */
+static void
+_vq_isr(struct shm_signal_notifier *notifier)
+{
+ struct vbus_virtqueue *_vq = container_of(notifier,
+ struct vbus_virtqueue,
+ notifier);
+ vring_interrupt(0, _vq->vq);
+}
+
+static struct virtqueue *
+_virtio_find_vq(struct virtio_device *vdev, unsigned index,
+ void (*callback)(struct virtqueue *vq))
+{
+ struct virtio_vbus_priv *priv = virtio_to_priv(vdev);
+ struct vbus_device_proxy *dev = priv->vbus_dev;
+ struct vbus_virtqueue *_vq;
+ struct virtqueue *vq;
+ unsigned long ringsize;
+ int num;
+ int ret;
+
+ num = _vq_getlen(priv, index);
+ if (num < 0)
+ return ERR_PTR(num);
+
+	_vq = kzalloc(sizeof(struct vbus_virtqueue), GFP_KERNEL);
+ if (!_vq)
+ return ERR_PTR(-ENOMEM);
+
+ ringsize = vring_size(num, PAGE_SIZE);
+
+ _vq->index = index;
+ _vq->num = num;
+ _vq->size = PAGE_ALIGN(sizeof(struct virtio_vbus_shm) + ringsize - 1);
+
+ _vq->shm = alloc_pages_exact(_vq->size, GFP_KERNEL|__GFP_ZERO);
+ if (!_vq->shm) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* initialize the shm with a ring */
+ vq = vring_new_virtqueue(_vq->num, PAGE_SIZE, vdev,
+ &_vq->shm->data[0],
+ _vq_kick,
+ callback);
+ if (!vq) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ /* register the shm with an id of the vq index + RING_OFFSET */
+ ret = dev->ops->shm(dev, index + VIRTIO_VBUS_RING_OFFSET, 0,
+ _vq->shm, _vq->size,
+ &_vq->shm->signal, &_vq->signal, 0);
+ if (ret < 0)
+ goto out_free;
+
+ _vq->notifier.signal = &_vq_isr;
+ _vq->signal->notifier = &_vq->notifier;
+
+ shm_signal_enable(_vq->signal, 0);
+
+ vq->priv = _vq;
+ _vq->vq = vq;
+
+ return vq;
+
+out_free:
+ free_pages_exact(_vq->shm, _vq->size);
+out:
+ if (_vq && _vq->signal)
+ shm_signal_put(_vq->signal);
+ kfree(_vq);
+ return ERR_PTR(ret);
+}
+
+/* the config->del_vq() implementation */
+static void
+_virtio_del_vq(struct virtqueue *vq)
+{
+ struct virtio_vbus_priv *priv = virtio_to_priv(vq->vdev);
+ struct vbus_virtqueue *_vq = vq->priv;
+
+ devcall(priv, VIRTIO_VBUS_FUNC_DEL_QUEUE,
+ &_vq->index, sizeof(_vq->index));
+
+ vring_del_virtqueue(vq);
+
+ shm_signal_put(_vq->signal);
+ free_pages_exact(_vq->shm, _vq->size);
+ kfree(_vq);
+}
+
+/*
+ * ------------------
+ * general setup
+ * ------------------
+ */
+
+static struct virtio_config_ops virtio_vbus_config_ops = {
+ .get = _virtio_get,
+ .set = _virtio_set,
+ .get_status = _virtio_get_status,
+ .set_status = _virtio_set_status,
+ .reset = _virtio_reset,
+ .find_vq = _virtio_find_vq,
+ .del_vq = _virtio_del_vq,
+ .get_features = _virtio_get_features,
+ .finalize_features = _virtio_finalize_features,
+};
+
+/*
+ * Negotiate vbus transport features. This is not to be confused with the
+ * higher-level function FUNC_GET/FINALIZE_FEATURES, which is specifically
+ * for the virtio transport
+ */
+static void
+virtio_vbus_negcap(struct virtio_vbus_priv *priv)
+{
+ u64 features = 0; /* We do not have any advanced features to enable */
+ int ret;
+
+ ret = devcall(priv, VIRTIO_VBUS_FUNC_NEG_CAP,
+ &features, sizeof(features));
+ BUG_ON(ret < 0);
+}
+
+static void
+virtio_vbus_getid(struct virtio_vbus_priv *priv)
+{
+ struct virtio_vbus_id id;
+ int ret;
+
+ ret = devcall(priv, VIRTIO_VBUS_FUNC_GET_ID, &id, sizeof(id));
+ BUG_ON(ret < 0);
+
+ priv->virtio_dev.id.vendor = id.vendor;
+ priv->virtio_dev.id.device = id.device;
+}
+
+static int
+virtio_vbus_initconfig(struct virtio_vbus_priv *priv)
+{
+ struct vbus_device_proxy *vdev = priv->vbus_dev;
+ size_t len;
+ int ret;
+
+ len = sizeof(struct virtio_vbus_shm) + VIRTIO_VBUS_CONFIGSPACE_LEN - 1;
+
+ priv->config.shm = kzalloc(len, GFP_KERNEL);
+ if (!priv->config.shm)
+ return -ENOMEM;
+
+ ret = vdev->ops->shm(vdev, 0, 0,
+			     priv->config.shm, len,
+ &priv->config.shm->signal, &priv->config.signal,
+ 0);
+ BUG_ON(ret < 0);
+
+ priv->config.notifier.signal = &config_isr;
+ priv->config.signal->notifier = &priv->config.notifier;
+
+ shm_signal_enable(priv->config.signal, 0);
+
+ return 0;
+}
+
+/* the VBUS probing function */
+static int
+virtio_vbus_probe(struct vbus_device_proxy *vdev)
+{
+ struct virtio_vbus_priv *priv;
+ int ret;
+
+ printk(KERN_INFO "VIRTIO-VBUS: Found new device at %lld\n", vdev->id);
+
+ ret = vdev->ops->open(vdev, VIRTIO_VBUS_ABI_VERSION, 0);
+ if (ret < 0) {
+		printk(KERN_ERR "virtio_vbus: failed to open ABI version %d: %d\n",
+ VIRTIO_VBUS_ABI_VERSION, ret);
+ return ret;
+ }
+
+ priv = kzalloc(sizeof(struct virtio_vbus_priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->virtio_dev.config = &virtio_vbus_config_ops;
+ priv->vbus_dev = vdev;
+
+ /*
+ * Negotiate for any vbus specific features
+ */
+ virtio_vbus_negcap(priv);
+
+ /*
+ * This probe occurs for any "virtio" device on the vbus, so we need
+	 * to call into the host to figure out which specific device this
+	 * is (i.e. its virtio vendor/device ID)
+ */
+ virtio_vbus_getid(priv);
+
+	/*
+	 * Map our config-space to the device, and establish a signal-path
+	 * for config-space updates
+	 */
+	ret = virtio_vbus_initconfig(priv);
+	if (ret < 0)
+		goto out;
+
+ /* finally register the virtio device */
+ ret = register_virtio_device(&priv->virtio_dev);
+ if (ret)
+ goto out;
+
+ vdev->priv = priv;
+
+ return 0;
+
+out:
+ kfree(priv);
+ return ret;
+}
+
+#ifdef NOTYET
+/* FIXME: wire this up */
+static void
+virtio_vbus_release(struct virtio_vbus_priv *priv)
+{
+ shm_signal_put(priv->config.signal);
+ kfree(priv->config.shm);
+ kfree(priv);
+}
+
+#endif
+
+static int
+virtio_vbus_remove(struct vbus_device_proxy *vdev)
+{
+ struct virtio_vbus_priv *priv = vdev->priv;
+
+ unregister_virtio_device(&priv->virtio_dev);
+
+ return 0;
+}
+
+/*
+ * Finally, the module stuff
+ */
+
+static struct vbus_driver_ops virtio_vbus_driver_ops = {
+ .probe = virtio_vbus_probe,
+ .remove = virtio_vbus_remove,
+};
+
+static struct vbus_driver virtio_vbus_driver = {
+ .type = "virtio",
+ .owner = THIS_MODULE,
+ .ops = &virtio_vbus_driver_ops,
+};
+
+static __init int
+virtio_vbus_init_module(void)
+{
+ printk(KERN_INFO "Virtio-VBUS: Copyright (C) 2009 Novell, Gregory Haskins\n");
+ return vbus_driver_register(&virtio_vbus_driver);
+}
+
+static __exit void
+virtio_vbus_cleanup(void)
+{
+ vbus_driver_unregister(&virtio_vbus_driver);
+}
+
+module_init(virtio_vbus_init_module);
+module_exit(virtio_vbus_cleanup);
+
diff --git a/include/linux/virtio_vbus.h b/include/linux/virtio_vbus.h
new file mode 100644
index 0000000..05791bf
--- /dev/null
+++ b/include/linux/virtio_vbus.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Virtio VBUS driver
+ *
+ * This module allows virtio devices to be used over a VBUS interface
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_VIRTIO_VBUS_H
+#define _LINUX_VIRTIO_VBUS_H
+
+#include <linux/shm_signal.h>
+
+#define VIRTIO_VBUS_ABI_VERSION 1
+
+enum {
+ VIRTIO_VBUS_FUNC_NEG_CAP,
+ VIRTIO_VBUS_FUNC_GET_ID,
+ VIRTIO_VBUS_FUNC_GET_FEATURES,
+ VIRTIO_VBUS_FUNC_FINALIZE_FEATURES,
+ VIRTIO_VBUS_FUNC_GET_STATUS,
+ VIRTIO_VBUS_FUNC_SET_STATUS,
+ VIRTIO_VBUS_FUNC_RESET,
+ VIRTIO_VBUS_FUNC_QUERY_QUEUE,
+ VIRTIO_VBUS_FUNC_DEL_QUEUE,
+};
+
+struct virtio_vbus_id {
+ u16 vendor;
+ u16 device;
+};
+
+struct virtio_vbus_queryqueue {
+ u64 index; /* in: queue index */
+ u32 num; /* out: number of entries */
+ u32 pad[0];
+};
+
+#define VIRTIO_VBUS_CONFIGSPACE_LEN 1024
+#define VIRTIO_VBUS_RING_OFFSET 10000 /* shm-index where rings start */
+
+struct virtio_vbus_shm {
+ struct shm_signal_desc signal;
+ char data[1];
+};
+
+/*
+ * --------------------------------------------------
+ * Backend support - These components are only needed
+ * for interfacing a virtio-backend to the vbus-backend
+ * --------------------------------------------------
+ */
+
+#include <linux/vbus_device.h>
+
+struct virtio_device_interface;
+struct virtio_connection;
+
+struct virtio_queue_def {
+ int index;
+ int entries;
+};
+
+/*
+ * ----------------------
+ * interface
+ * ----------------------
+ */
+
+struct virtio_device_interface_ops {
+ int (*open)(struct virtio_device_interface *intf,
+ struct vbus_memctx *ctx,
+ struct virtio_connection **conn);
+ void (*release)(struct virtio_device_interface *intf);
+};
+
+struct virtio_device_interface {
+ struct virtio_vbus_id id;
+ struct virtio_device_interface_ops *ops;
+ struct virtio_queue_def *queues;
+ struct vbus_device_interface *parent;
+};
+
+/**
+ * virtio_device_interface_register() - register an interface with a bus
+ * @dev: The device context of the caller
+ * @vbus: The bus context to register with
+ * @intf: The interface context to register
+ *
+ * This function is invoked (usually in the context of a device::bus_connect()
+ * callback) to register an interface on a bus.  We make this an explicit
+ * operation instead of implicit on the bus_connect() to facilitate devices
+ * that may present multiple interfaces to a bus. In those cases, a device
+ * may invoke this function multiple times (one per supported interface).
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int virtio_device_interface_register(struct vbus_device *dev,
+ struct vbus *vbus,
+ struct virtio_device_interface *intf);
+
+/**
+ * virtio_device_interface_unregister() - unregister an interface with a bus
+ * @intf: The interface context to unregister
+ *
+ * This function is the converse of interface_register. It is typically
+ * invoked in the context of a device::bus_disconnect().
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int virtio_device_interface_unregister(struct virtio_device_interface *intf);
+
+/*
+ * ----------------------
+ * connection
+ * ----------------------
+ */
+struct virtqueue;
+
+struct virtio_connection_ops {
+ void (*config_changed)(struct virtio_connection *vconn);
+ u8 (*get_status)(struct virtio_connection *vconn);
+ void (*set_status)(struct virtio_connection *vconn, u8 status);
+ void (*reset)(struct virtio_connection *vconn);
+ u32 (*get_features)(struct virtio_connection *vconn);
+ void (*finalize_features)(struct virtio_connection *vconn);
+ void (*add_vq)(struct virtio_connection *vconn, int index,
+ struct virtqueue *vq);
+ void (*del_vq)(struct virtio_connection *vconn, int index);
+ void (*notify_vq)(struct virtio_connection *vconn, int index);
+ void (*release)(struct virtio_connection *conn);
+};
+
+struct virtio_connection {
+ struct virtio_connection_ops *ops;
+ struct vbus_connection *parent;
+};
+
+int virtio_connection_config_get(struct virtio_connection *vconn,
+ int offset, void *buf, size_t len);
+
+int virtio_connection_config_set(struct virtio_connection *vconn,
+ int offset, void *buf, size_t len);
+
+#endif /* _LINUX_VIRTIO_VBUS_H */
diff --git a/kernel/vbus/Kconfig b/kernel/vbus/Kconfig
index b894dd1..5eeced2 100644
--- a/kernel/vbus/Kconfig
+++ b/kernel/vbus/Kconfig
@@ -14,6 +14,13 @@ config VBUS
If unsure, say N
+config VBUS_VIRTIO_BACKEND
+ tristate "Virtio VBUS Backend"
+ depends on VBUS
+ default n
+ help
+ Provides backend support for virtio devices over vbus
+
config VBUS_DEVICES
bool "Virtual-Bus Devices"
depends on VBUS
diff --git a/kernel/vbus/Makefile b/kernel/vbus/Makefile
index 61d0371..c2bd140 100644
--- a/kernel/vbus/Makefile
+++ b/kernel/vbus/Makefile
@@ -1,6 +1,9 @@
obj-$(CONFIG_VBUS) += core.o devclass.o config.o attribute.o map.o client.o
obj-$(CONFIG_VBUS) += shm-ioq.o
+virtio-backend-objs += virtio.o
+obj-$(CONFIG_VBUS_VIRTIO_BACKEND) += virtio-backend.o
+
vbus-proxy-objs += proxy.o
obj-$(CONFIG_VBUS_DRIVERS) += vbus-proxy.o
diff --git a/kernel/vbus/virtio.c b/kernel/vbus/virtio.c
new file mode 100644
index 0000000..dac5cd4
--- /dev/null
+++ b/kernel/vbus/virtio.c
@@ -0,0 +1,628 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_vbus.h>
+
+MODULE_AUTHOR("Gregory Haskins");
+MODULE_LICENSE("GPL");
+
+#undef PDEBUG
+#ifdef VIRTIO_VBUS_DEBUG
+# define PDEBUG(fmt, args...) printk(KERN_DEBUG "virtio-vbus: " fmt, ## args)
+#else
+# define PDEBUG(fmt, args...)
+#endif
+
+struct _virtio_device_interface {
+ struct virtio_device_interface *vintf;
+ struct vbus_device_interface intf;
+};
+
+struct _virtio_connection {
+ struct _virtio_device_interface *_vintf;
+ struct virtio_connection *vconn;
+ struct vbus_connection conn;
+ struct vbus_memctx *ctx;
+ struct list_head queues;
+
+ struct {
+ struct vbus_shm *shm;
+ struct shm_signal *signal;
+ struct shm_signal_notifier notifier;
+ } config;
+
+	unsigned int running:1;
+};
+
+struct _virtio_queue {
+ int index;
+ int num;
+ struct _virtio_connection *_vconn;
+ struct virtqueue *vq;
+
+ struct vbus_shm *shm;
+ struct shm_signal *signal;
+ struct shm_signal_notifier notifier;
+
+ struct list_head node;
+};
+
+static struct _virtio_device_interface *
+to_vintf(struct vbus_device_interface *intf)
+{
+ return container_of(intf, struct _virtio_device_interface, intf);
+}
+
+static struct _virtio_connection *
+to_vconn(struct vbus_connection *conn)
+{
+ return container_of(conn, struct _virtio_connection, conn);
+}
+
+int virtio_connection_config_get(struct virtio_connection *vconn,
+ int offset, void *buf, size_t len)
+{
+ struct _virtio_connection *_vconn = to_vconn(vconn->parent);
+ char *data;
+
+ if (!_vconn->config.shm)
+ return -EINVAL;
+
+ if (offset + len > _vconn->config.shm->len)
+ return -EINVAL;
+
+ data = _vconn->config.shm->ptr;
+
+ memcpy(buf, &data[offset], len);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_connection_config_get);
+
+int virtio_connection_config_set(struct virtio_connection *vconn,
+ int offset, void *buf, size_t len)
+{
+ struct _virtio_connection *_vconn = to_vconn(vconn->parent);
+ char *data;
+
+ if (!_vconn->config.shm)
+ return -EINVAL;
+
+ if (offset + len > _vconn->config.shm->len)
+ return -EINVAL;
+
+ data = _vconn->config.shm->ptr;
+
+ memcpy(&data[offset], buf, len);
+
+ if (_vconn->config.signal)
+ shm_signal_inject(_vconn->config.signal, 0);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_connection_config_set);
+
+/*
+ * Negotiate Capabilities - This function is provided so that the
+ * interface may be extended without breaking ABI compatibility
+ *
+ * The caller is expected to send down any capabilities they would like
+ * to enable, and the device will mask (AND) them against the capabilities
+ * that it supports.  The result is then returned so that both sides may
+ * ascertain the lowest common denominator of features to enable
+ */
+static int
+_virtio_connection_negcap(struct _virtio_connection *_vconn,
+ void *data, unsigned long len)
+{
+ struct vbus_memctx *ctx = _vconn->ctx;
+ u64 features;
+ int ret;
+
+ if (len != sizeof(features))
+ return -EINVAL;
+
+ if (_vconn->running)
+ return -EINVAL;
+
+#ifdef NOTYET
+ ret = ctx->ops->copy_from(ctx, &features, data, sizeof(features));
+ if (ret)
+ return -EFAULT;
+#endif
+
+	/*
+	 * right now we don't support any advanced features, so just clear
+	 * all bits
+	 */
+ features = 0;
+
+ ret = ctx->ops->copy_to(ctx, data, &features, sizeof(features));
+ if (ret)
+ return -EFAULT;
+
+ return 0;
+}
+
+static int
+_virtio_connection_getid(struct _virtio_connection *_vconn,
+ void *data, unsigned long len)
+{
+ struct vbus_memctx *ctx = _vconn->ctx;
+ struct virtio_vbus_id *id = &_vconn->_vintf->vintf->id;
+ int ret;
+
+ if (len != sizeof(*id))
+ return -EINVAL;
+
+ ret = ctx->ops->copy_to(ctx, data, id, sizeof(*id));
+ if (ret)
+ return -EFAULT;
+
+ return 0;
+}
+
+static int
+_virtio_connection_getstatus(struct _virtio_connection *_vconn,
+ void *data, unsigned long len)
+{
+ struct virtio_connection *vconn = _vconn->vconn;
+ struct vbus_memctx *ctx = _vconn->ctx;
+ u8 val = 0;
+ int ret;
+
+ if (len != sizeof(val))
+ return -EINVAL;
+
+ if (vconn->ops->get_status)
+ val = vconn->ops->get_status(vconn);
+
+ ret = ctx->ops->copy_to(ctx, data, &val, sizeof(val));
+ if (ret)
+ return -EFAULT;
+
+ return 0;
+}
+
+static int
+_virtio_connection_setstatus(struct _virtio_connection *_vconn,
+ void *data, unsigned long len)
+{
+ struct virtio_connection *vconn = _vconn->vconn;
+ struct vbus_memctx *ctx = _vconn->ctx;
+ u8 val;
+ int ret;
+
+ if (len != sizeof(val))
+ return -EINVAL;
+
+ if (!vconn->ops->set_status)
+ return 0;
+
+ ret = ctx->ops->copy_from(ctx, &val, data, sizeof(val));
+ if (ret)
+ return -EFAULT;
+
+ vconn->ops->set_status(vconn, val);
+
+ return 0;
+}
+
+static int
+_virtio_connection_getfeatures(struct _virtio_connection *_vconn,
+ void *data, unsigned long len)
+{
+ struct virtio_connection *vconn = _vconn->vconn;
+ struct vbus_memctx *ctx = _vconn->ctx;
+ u32 val = 0;
+ int ret;
+
+ if (len != sizeof(val))
+ return -EINVAL;
+
+ if (vconn->ops->get_features)
+ val = vconn->ops->get_features(vconn);
+
+ ret = ctx->ops->copy_to(ctx, data, &val, sizeof(val));
+ if (ret)
+ return -EFAULT;
+
+ return 0;
+}
+
+static int
+_virtio_connection_finalizefeatures(struct _virtio_connection *_vconn)
+{
+ struct virtio_connection *vconn = _vconn->vconn;
+
+ if (vconn->ops->finalize_features)
+ vconn->ops->finalize_features(vconn);
+
+ return 0;
+}
+
+static int
+_virtio_connection_reset(struct _virtio_connection *_vconn)
+{
+ struct virtio_connection *vconn = _vconn->vconn;
+
+ if (vconn->ops->reset)
+ vconn->ops->reset(vconn);
+
+ return 0;
+}
+
+static struct _virtio_queue *
+_virtio_find_queue(struct _virtio_connection *_vconn, int index)
+{
+ struct _virtio_queue *vq;
+
+ list_for_each_entry(vq, &_vconn->queues, node) {
+ if (vq->index == index)
+ return vq;
+ }
+
+ return NULL;
+}
+
+static int
+_virtio_connection_queryqueue(struct _virtio_connection *_vconn,
+ void *data, unsigned long len)
+{
+ struct vbus_memctx *ctx = _vconn->ctx;
+ struct virtio_vbus_queryqueue val;
+ struct _virtio_queue *vq;
+ int ret;
+
+ if (len != sizeof(val))
+ return -EINVAL;
+
+ ret = ctx->ops->copy_from(ctx, &val, data, sizeof(val));
+ if (ret)
+ return -EFAULT;
+
+ vq = _virtio_find_queue(_vconn, val.index);
+
+ if (!vq)
+ return -EINVAL;
+
+ if (vq->shm)
+ return -EEXIST;
+
+ val.num = vq->num;
+
+ ret = ctx->ops->copy_to(ctx, data, &val, sizeof(val));
+ if (ret)
+ return -EFAULT;
+
+ return 0;
+}
+
+static int
+_virtio_connection_call(struct vbus_connection *conn,
+ unsigned long func,
+ void *data,
+ unsigned long len,
+ unsigned long flags)
+{
+ struct _virtio_connection *_vconn = to_vconn(conn);
+ int ret = 0;
+
+	PDEBUG("call -> %lu with %p/%lu\n", func, data, len);
+
+ switch (func) {
+ case VIRTIO_VBUS_FUNC_NEG_CAP:
+ ret = _virtio_connection_negcap(_vconn, data, len);
+ break;
+ case VIRTIO_VBUS_FUNC_GET_ID:
+ ret = _virtio_connection_getid(_vconn, data, len);
+ break;
+ case VIRTIO_VBUS_FUNC_GET_FEATURES:
+ ret = _virtio_connection_getfeatures(_vconn, data, len);
+ break;
+ case VIRTIO_VBUS_FUNC_FINALIZE_FEATURES:
+ _virtio_connection_finalizefeatures(_vconn);
+ break;
+ case VIRTIO_VBUS_FUNC_GET_STATUS:
+ ret = _virtio_connection_getstatus(_vconn, data, len);
+ break;
+ case VIRTIO_VBUS_FUNC_SET_STATUS:
+ ret = _virtio_connection_setstatus(_vconn, data, len);
+ break;
+ case VIRTIO_VBUS_FUNC_RESET:
+ _virtio_connection_reset(_vconn);
+ break;
+ case VIRTIO_VBUS_FUNC_QUERY_QUEUE:
+ ret = _virtio_connection_queryqueue(_vconn, data, len);
+ break;
+ case VIRTIO_VBUS_FUNC_DEL_QUEUE:
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static void _virtio_config_isr(struct shm_signal_notifier *notifier)
+{
+ struct _virtio_connection *_vconn;
+ struct virtio_connection *vconn;
+
+ _vconn = container_of(notifier, struct _virtio_connection,
+ config.notifier);
+
+ vconn = _vconn->vconn;
+
+ if (vconn->ops->config_changed)
+ vconn->ops->config_changed(vconn);
+}
+
+static int
+_virtio_connection_open(struct _virtio_connection *_vconn)
+
+{
+ struct virtio_device_interface *vintf = _vconn->_vintf->vintf;
+ struct virtio_connection *vconn;
+ struct virtio_queue_def *def = vintf->queues;
+ int ret;
+
+ ret = vintf->ops->open(vintf, _vconn->ctx, &vconn);
+ if (ret < 0)
+ return ret;
+
+ while (def && def->index != -1) {
+ struct _virtio_queue *vq;
+
+ vq = kzalloc(sizeof(*vq), GFP_KERNEL);
+ if (!vq)
+ return -ENOMEM;
+
+ vq->index = def->index;
+ vq->num = def->entries;
+ vq->_vconn = _vconn;
+
+ list_add_tail(&vq->node, &_vconn->queues);
+
+ def++;
+ }
+
+ _vconn->vconn = vconn;
+ vconn->parent = &_vconn->conn;
+
+ return 0;
+}
+
+static int
+_virtio_connection_initconfig(struct _virtio_connection *_vconn,
+ struct vbus_shm *shm,
+ struct shm_signal *signal)
+{
+ int ret;
+
+ if (_vconn->running)
+ return -EINVAL;
+
+ _vconn->config.signal = signal;
+ _vconn->config.shm = shm;
+ _vconn->config.notifier.signal = &_virtio_config_isr;
+ signal->notifier = &_vconn->config.notifier;
+
+ shm_signal_enable(signal, 0);
+
+ ret = _virtio_connection_open(_vconn);
+ if (ret < 0)
+ return ret;
+
+ _vconn->running = 1;
+
+ return 0;
+}
+
+static void _vq_isr(struct shm_signal_notifier *notifier)
+{
+ struct _virtio_queue *vq;
+
+ vq = container_of(notifier, struct _virtio_queue, notifier);
+
+ vring_interrupt(0, vq->vq);
+}
+
+static void _vq_notify(struct virtqueue *vq)
+{
+ struct _virtio_queue *_vq = vq->priv;
+
+ shm_signal_inject(_vq->signal, 0);
+}
+
+static void _vq_callback(struct virtqueue *vq)
+{
+ struct _virtio_queue *_vq = vq->priv;
+ struct virtio_connection *vconn = _vq->_vconn->vconn;
+
+ vconn->ops->notify_vq(vconn, _vq->index);
+}
+
+static int
+_virtio_connection_shm(struct vbus_connection *conn,
+ unsigned long id,
+ struct vbus_shm *shm,
+ struct shm_signal *signal,
+ unsigned long flags)
+{
+ struct _virtio_connection *_vconn = to_vconn(conn);
+ struct virtio_connection *vconn = _vconn->vconn;
+ struct _virtio_queue *vq;
+ struct virtio_vbus_shm *_shm = shm->ptr;
+
+ /* All shm connections that we support require a signal */
+ if (!signal)
+ return -EINVAL;
+
+ if (!id)
+ return _virtio_connection_initconfig(_vconn, shm, signal);
+
+ vq = _virtio_find_queue(_vconn, id - VIRTIO_VBUS_RING_OFFSET);
+ if (!vq)
+ return -EINVAL;
+
+ if (vq->shm)
+ return -EEXIST;
+
+ vq->shm = shm;
+ vq->signal = signal;
+
+ vq->notifier.signal = &_vq_isr;
+ signal->notifier = &vq->notifier;
+
+ shm_signal_enable(signal, 0);
+
+	vq->vq = vring_new_virtqueue(vq->num, PAGE_SIZE, NULL,
+				     &_shm->data[0],
+				     _vq_notify, _vq_callback);
+	if (!vq->vq)
+		return -ENOMEM;
+
+	vq->vq->priv = vq;
+
+ vconn->ops->add_vq(vconn, vq->index, vq->vq);
+
+ return 0;
+}
+
+static void
+_virtio_connection_release(struct vbus_connection *conn)
+{
+ struct _virtio_connection *_vconn = to_vconn(conn);
+ struct virtio_connection *vconn = _vconn->vconn;
+ struct _virtio_queue *vq, *tmp;
+
+ vconn->ops->release(vconn);
+
+ list_for_each_entry_safe(vq, tmp, &_vconn->queues, node) {
+ if (vq->vq)
+ vring_del_virtqueue(vq->vq);
+
+ if (vq->shm)
+ vbus_shm_put(vq->shm);
+
+ if (vq->signal)
+ shm_signal_put(vq->signal);
+
+ list_del(&vq->node);
+ kfree(vq);
+ }
+
+ if (_vconn->config.signal)
+ shm_signal_put(_vconn->config.signal);
+
+ if (_vconn->config.shm)
+ vbus_shm_put(_vconn->config.shm);
+
+ kobject_put(&_vconn->_vintf->intf.kobj);
+ vbus_memctx_put(_vconn->ctx);
+
+ kfree(_vconn);
+}
+
+static struct vbus_connection_ops _virtio_connection_ops = {
+ .call = _virtio_connection_call,
+ .shm = _virtio_connection_shm,
+ .release = _virtio_connection_release,
+};
+
+static int
+_virtio_intf_open(struct vbus_device_interface *intf,
+ struct vbus_memctx *ctx,
+ int version,
+ struct vbus_connection **conn)
+{
+ struct _virtio_device_interface *_vintf = to_vintf(intf);
+ struct _virtio_connection *_vconn;
+ int ret;
+
+ if (version != VIRTIO_VBUS_ABI_VERSION)
+ return -EINVAL;
+
+ _vconn = kzalloc(sizeof(*_vconn), GFP_KERNEL);
+ if (!_vconn)
+ return -ENOMEM;
+
+ vbus_connection_init(&_vconn->conn, &_virtio_connection_ops);
+ _vconn->_vintf = _vintf;
+ _vconn->ctx = ctx;
+ INIT_LIST_HEAD(&_vconn->queues);
+
+ vbus_memctx_get(ctx);
+ kobject_get(&intf->kobj);
+
+ *conn = &_vconn->conn;
+
+ return 0;
+}
+
+static void
+_virtio_intf_release(struct vbus_device_interface *intf)
+{
+ struct _virtio_device_interface *_vintf = to_vintf(intf);
+ struct virtio_device_interface *vintf = _vintf->vintf;
+
+ if (vintf && vintf->ops->release)
+ vintf->ops->release(vintf);
+ kfree(_vintf);
+}
+
+static struct vbus_device_interface_ops _virtio_device_interface_ops = {
+ .open = _virtio_intf_open,
+ .release = _virtio_intf_release,
+};
+
+int
+virtio_device_interface_register(struct vbus_device *dev,
+ struct vbus *vbus,
+ struct virtio_device_interface *vintf)
+{
+ struct _virtio_device_interface *_vintf;
+ struct vbus_device_interface *intf;
+
+ _vintf = kzalloc(sizeof(*_vintf), GFP_KERNEL);
+ if (!_vintf)
+ return -ENOMEM;
+
+ _vintf->vintf = vintf;
+
+ intf = &_vintf->intf;
+
+ intf->name = "0"; /* FIXME */
+ intf->type = "virtio";
+ intf->ops = &_virtio_device_interface_ops;
+
+ return vbus_device_interface_register(dev, vbus, intf);
+}
+EXPORT_SYMBOL_GPL(virtio_device_interface_register);
+
+int
+virtio_device_interface_unregister(struct virtio_device_interface *intf)
+{
+ return vbus_device_interface_unregister(intf->parent);
+}
+EXPORT_SYMBOL_GPL(virtio_device_interface_unregister);
This adds a driver to interface between the host VBUS support and the
guest-vbus bus model.
Signed-off-by: Gregory Haskins <[email protected]>
---
arch/x86/Kconfig | 9 +
drivers/Makefile | 1
drivers/vbus/proxy/Makefile | 2
drivers/vbus/proxy/kvm.c | 726 +++++++++++++++++++++++++++++++++++++++++++
4 files changed, 738 insertions(+), 0 deletions(-)
create mode 100644 drivers/vbus/proxy/Makefile
create mode 100644 drivers/vbus/proxy/kvm.c
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 91fefd5..8661495 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -451,6 +451,15 @@ config KVM_GUEST_DYNIRQ
depends on KVM_GUEST
default y
+config KVM_GUEST_VBUS
+ tristate "KVM virtual-bus (VBUS) guest-side support"
+ depends on KVM_GUEST
+ select VBUS_DRIVERS
+ default y
+ ---help---
+ This option enables guest-side support for accessing virtual-bus
+ devices.
+
source "arch/x86/lguest/Kconfig"
config PARAVIRT
diff --git a/drivers/Makefile b/drivers/Makefile
index 98fab51..4f2cb93 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -107,3 +107,4 @@ obj-$(CONFIG_VIRTIO) += virtio/
obj-$(CONFIG_STAGING) += staging/
obj-y += platform/
obj-$(CONFIG_VBUS_DEVICES) += vbus/devices/
+obj-$(CONFIG_VBUS_DRIVERS) += vbus/proxy/
diff --git a/drivers/vbus/proxy/Makefile b/drivers/vbus/proxy/Makefile
new file mode 100644
index 0000000..c18d58d
--- /dev/null
+++ b/drivers/vbus/proxy/Makefile
@@ -0,0 +1,2 @@
+kvm-guest-vbus-objs += kvm.o
+obj-$(CONFIG_KVM_GUEST_VBUS) += kvm-guest-vbus.o
diff --git a/drivers/vbus/proxy/kvm.c b/drivers/vbus/proxy/kvm.c
new file mode 100644
index 0000000..82e28b4
--- /dev/null
+++ b/drivers/vbus/proxy/kvm.c
@@ -0,0 +1,726 @@
+/*
+ * Copyright (C) 2009 Novell. All Rights Reserved.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/vbus.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/ioq.h>
+#include <linux/interrupt.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm_guest.h>
+#include <linux/vbus_client.h>
+#include <linux/vbus_driver.h>
+
+MODULE_AUTHOR("Gregory Haskins");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1");
+
+static int kvm_vbus_hypercall(unsigned long nr, void *data, unsigned long len)
+{
+ return kvm_hypercall3(KVM_HC_VBUS, nr, __pa(data), len);
+}
+
+struct kvm_vbus {
+ spinlock_t lock;
+ struct ioq eventq;
+ struct kvm_vbus_event *ring;
+ int irq;
+};
+
+static struct kvm_vbus kvm_vbus;
+
+struct kvm_vbus_device {
+ char type[VBUS_MAX_DEVTYPE_LEN];
+ u64 handle;
+ struct list_head shms;
+ struct vbus_device_proxy vdev;
+};
+
+/*
+ * -------------------
+ * common routines
+ * -------------------
+ */
+
+struct kvm_vbus_device *
+to_dev(struct vbus_device_proxy *vdev)
+{
+ return container_of(vdev, struct kvm_vbus_device, vdev);
+}
+
+static void
+_signal_init(struct shm_signal *signal, struct shm_signal_desc *desc,
+ struct shm_signal_ops *ops)
+{
+ desc->magic = SHM_SIGNAL_MAGIC;
+ desc->ver = SHM_SIGNAL_VER;
+
+ shm_signal_init(signal);
+
+ signal->locale = shm_locality_north;
+ signal->ops = ops;
+ signal->desc = desc;
+}
+
+/*
+ * -------------------
+ * _signal
+ * -------------------
+ */
+
+struct _signal {
+ struct kvm_vbus *kvbus;
+ struct shm_signal signal;
+ u64 handle;
+ struct rb_node node;
+ struct list_head list;
+};
+
+static struct _signal *
+to_signal(struct shm_signal *signal)
+{
+ return container_of(signal, struct _signal, signal);
+}
+
+static struct _signal *
+node_to_signal(struct rb_node *node)
+{
+ return container_of(node, struct _signal, node);
+}
+
+static int
+_signal_inject(struct shm_signal *signal)
+{
+ struct _signal *_signal = to_signal(signal);
+
+ kvm_vbus_hypercall(KVM_VBUS_OP_SHMSIGNAL,
+ &_signal->handle, sizeof(_signal->handle));
+
+ return 0;
+}
+
+static void
+_signal_release(struct shm_signal *signal)
+{
+ struct _signal *_signal = to_signal(signal);
+
+ kfree(_signal);
+}
+
+static struct shm_signal_ops _signal_ops = {
+ .inject = _signal_inject,
+ .release = _signal_release,
+};
+
+/*
+ * -------------------
+ * vbus_device_proxy routines
+ * -------------------
+ */
+
+static int
+kvm_vbus_device_open(struct vbus_device_proxy *vdev, int version, int flags)
+{
+ struct kvm_vbus_device *dev = to_dev(vdev);
+ struct vbus_deviceopen params;
+ int ret;
+
+ if (dev->handle)
+ return -EINVAL;
+
+ params.devid = vdev->id;
+ params.version = version;
+
+ ret = kvm_vbus_hypercall(KVM_VBUS_OP_DEVOPEN,
+ &params, sizeof(params));
+ if (ret < 0)
+ return ret;
+
+ dev->handle = params.handle;
+
+ return 0;
+}
+
+static int
+kvm_vbus_device_close(struct vbus_device_proxy *vdev, int flags)
+{
+ struct kvm_vbus_device *dev = to_dev(vdev);
+ unsigned long iflags;
+ int ret;
+
+ if (!dev->handle)
+ return -EINVAL;
+
+ spin_lock_irqsave(&kvm_vbus.lock, iflags);
+
+ while (!list_empty(&dev->shms)) {
+ struct _signal *_signal;
+
+ _signal = list_first_entry(&dev->shms, struct _signal, list);
+
+ list_del(&_signal->list);
+
+ spin_unlock_irqrestore(&kvm_vbus.lock, iflags);
+ shm_signal_put(&_signal->signal);
+ spin_lock_irqsave(&kvm_vbus.lock, iflags);
+ }
+
+ spin_unlock_irqrestore(&kvm_vbus.lock, iflags);
+
+ /*
+ * The DEVICECLOSE will implicitly close all of the shm on the
+ * host-side, so there is no need to do an explicit per-shm
+ * hypercall
+ */
+ ret = kvm_vbus_hypercall(KVM_VBUS_OP_DEVCLOSE,
+ &dev->handle, sizeof(dev->handle));
+
+ if (ret < 0)
+ printk(KERN_ERR "KVM-VBUS: Error closing device %s/%lld: %d\n",
+ vdev->type, vdev->id, ret);
+
+ dev->handle = 0;
+
+ return 0;
+}
+
+static int
+kvm_vbus_device_shm(struct vbus_device_proxy *vdev, int id, int prio,
+ void *ptr, size_t len,
+ struct shm_signal_desc *sdesc, struct shm_signal **signal,
+ int flags)
+{
+ struct kvm_vbus_device *dev = to_dev(vdev);
+ struct _signal *_signal = NULL;
+ struct vbus_deviceshm params;
+ unsigned long iflags;
+ int ret;
+
+ if (!dev->handle)
+ return -EINVAL;
+
+ params.devh = dev->handle;
+ params.id = id;
+ params.flags = flags;
+ params.datap = (u64)__pa(ptr);
+ params.len = len;
+
+ if (signal) {
+ /*
+ * The signal descriptor must be embedded within the
+ * provided ptr
+ */
+ if (!sdesc
+ || (len < sizeof(*sdesc))
+ || ((void *)sdesc < ptr)
+ || ((void *)sdesc > (ptr + len - sizeof(*sdesc))))
+ return -EINVAL;
+
+ _signal = kzalloc(sizeof(*_signal), GFP_KERNEL);
+ if (!_signal)
+ return -ENOMEM;
+
+ _signal_init(&_signal->signal, sdesc, &_signal_ops);
+
+ /*
+ * take another reference for the host. This is dropped
+ * by a SHMCLOSE event
+ */
+ shm_signal_get(&_signal->signal);
+
+ params.signal.offset = (u64)sdesc - (u64)ptr;
+ params.signal.prio = prio;
+ params.signal.cookie = (u64)_signal;
+
+ } else
+ params.signal.offset = -1; /* yes, this is a u32, but it's ok */
+
+ ret = kvm_vbus_hypercall(KVM_VBUS_OP_DEVSHM,
+ &params, sizeof(params));
+ if (ret < 0) {
+ if (_signal) {
+ /*
+ * We held two references above, so we need to drop
+ * both of them
+ */
+ shm_signal_put(&_signal->signal);
+ shm_signal_put(&_signal->signal);
+ }
+
+ return ret;
+ }
+
+ if (signal) {
+ _signal->handle = params.handle;
+
+ spin_lock_irqsave(&kvm_vbus.lock, iflags);
+
+ list_add_tail(&_signal->list, &dev->shms);
+
+ spin_unlock_irqrestore(&kvm_vbus.lock, iflags);
+
+ shm_signal_get(&_signal->signal);
+ *signal = &_signal->signal;
+ }
+
+ return 0;
+}
+
+static int
+kvm_vbus_device_call(struct vbus_device_proxy *vdev, u32 func, void *data,
+ size_t len, int flags)
+{
+ struct kvm_vbus_device *dev = to_dev(vdev);
+ struct vbus_devicecall params = {
+ .devh = dev->handle,
+ .func = func,
+ .datap = (u64)__pa(data),
+ .len = len,
+ .flags = flags,
+ };
+
+ if (!dev->handle)
+ return -EINVAL;
+
+ return kvm_vbus_hypercall(KVM_VBUS_OP_DEVCALL, &params, sizeof(params));
+}
+
+static void
+kvm_vbus_device_release(struct vbus_device_proxy *vdev)
+{
+ struct kvm_vbus_device *_dev = to_dev(vdev);
+
+ kvm_vbus_device_close(vdev, 0);
+
+ kfree(_dev);
+}
+
+struct vbus_device_proxy_ops kvm_vbus_device_ops = {
+ .open = kvm_vbus_device_open,
+ .close = kvm_vbus_device_close,
+ .shm = kvm_vbus_device_shm,
+ .call = kvm_vbus_device_call,
+ .release = kvm_vbus_device_release,
+};
+
+/*
+ * -------------------
+ * vbus events
+ * -------------------
+ */
+
+static void
+event_devadd(struct kvm_vbus_add_event *event)
+{
+ int ret;
+ struct kvm_vbus_device *new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new) {
+ printk(KERN_ERR "KVM_VBUS: Out of memory on add_event\n");
+ return;
+ }
+
+ INIT_LIST_HEAD(&new->shms);
+
+ memcpy(new->type, event->type, VBUS_MAX_DEVTYPE_LEN);
+ new->vdev.type = new->type;
+ new->vdev.id = event->id;
+ new->vdev.ops = &kvm_vbus_device_ops;
+
+ sprintf(new->vdev.dev.bus_id, "%lld", event->id);
+
+ ret = vbus_device_proxy_register(&new->vdev);
+ if (ret < 0)
+ panic("failed to register device %lld(%s): %d\n",
+ event->id, event->type, ret);
+}
+
+static void
+event_devdrop(struct kvm_vbus_handle_event *event)
+{
+ struct vbus_device_proxy *dev = vbus_device_proxy_find(event->handle);
+
+ if (!dev) {
+ printk(KERN_WARNING "KVM-VBUS: devdrop failed: %lld\n",
+ event->handle);
+ return;
+ }
+
+ vbus_device_proxy_unregister(dev);
+}
+
+static void
+event_shmsignal(struct kvm_vbus_handle_event *event)
+{
+ struct _signal *_signal = (struct _signal *)event->handle;
+
+ _shm_signal_wakeup(&_signal->signal);
+}
+
+static void
+event_shmclose(struct kvm_vbus_handle_event *event)
+{
+ struct _signal *_signal = (struct _signal *)event->handle;
+
+ /*
+ * This reference was taken during the DEVICESHM call
+ */
+ shm_signal_put(&_signal->signal);
+}
+
+/*
+ * -------------------
+ * eventq routines
+ * -------------------
+ */
+
+static struct ioq_notifier eventq_notifier;
+
+static int __init
+eventq_init(int qlen)
+{
+ struct ioq_iterator iter;
+ int ret;
+ int i;
+
+ kvm_vbus.ring = kzalloc(sizeof(struct kvm_vbus_event) * qlen,
+ GFP_KERNEL);
+ if (!kvm_vbus.ring)
+ return -ENOMEM;
+
+ /*
+ * We want to iterate on the "valid" index. By default the iterator
+ * will not "autoupdate" which means it will not hypercall the host
+ * with our changes. This is good, because we are really just
+ * initializing stuff here anyway. Note that you can always manually
+ * signal the host with ioq_signal() if the autoupdate feature is not
+ * used.
+ */
+ ret = ioq_iter_init(&kvm_vbus.eventq, &iter, ioq_idxtype_valid, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * Seek to the tail of the valid index (which should be our first
+ * item since the queue is brand-new)
+ */
+ ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * Now populate each descriptor with an empty vbus_event and mark it
+ * valid
+ */
+ for (i = 0; i < qlen; i++) {
+ struct kvm_vbus_event *event = &kvm_vbus.ring[i];
+ size_t len = sizeof(*event);
+ struct ioq_ring_desc *desc = iter.desc;
+
+ BUG_ON(iter.desc->valid);
+
+ desc->cookie = (u64)event;
+ desc->ptr = (u64)__pa(event);
+ desc->len = len; /* total length */
+ desc->valid = 1;
+
+ /*
+ * This push operation will simultaneously advance the
+ * valid-tail index and increment our position in the queue
+ * by one.
+ */
+ ret = ioq_iter_push(&iter, 0);
+ BUG_ON(ret < 0);
+ }
+
+ kvm_vbus.eventq.notifier = &eventq_notifier;
+
+ /*
+ * And finally, ensure that we can receive notification
+ */
+ ioq_notify_enable(&kvm_vbus.eventq, 0);
+
+ return 0;
+}
+
+/* Invoked whenever the hypervisor ioq_signal()s our eventq */
+static void
+eventq_wakeup(struct ioq_notifier *notifier)
+{
+ struct ioq_iterator iter;
+ int ret;
+
+ /* We want to iterate on the head of the in-use index */
+ ret = ioq_iter_init(&kvm_vbus.eventq, &iter, ioq_idxtype_inuse, 0);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * The EOM is indicated by finding a packet that is still owned by
+ * the south side.
+ *
+ * FIXME: This in theory could run indefinitely if the host keeps
+ * feeding us events since there is nothing like a NAPI budget. We
+ * might need to address that
+ */
+ while (!iter.desc->sown) {
+ struct ioq_ring_desc *desc = iter.desc;
+ struct kvm_vbus_event *event;
+
+ event = (struct kvm_vbus_event *)desc->cookie;
+
+ switch (event->eventid) {
+ case KVM_VBUS_EVENT_DEVADD:
+ event_devadd(&event->data.add);
+ break;
+ case KVM_VBUS_EVENT_DEVDROP:
+ event_devdrop(&event->data.handle);
+ break;
+ case KVM_VBUS_EVENT_SHMSIGNAL:
+ event_shmsignal(&event->data.handle);
+ break;
+ case KVM_VBUS_EVENT_SHMCLOSE:
+ event_shmclose(&event->data.handle);
+ break;
+ default:
+ printk(KERN_WARNING "KVM_VBUS: Unexpected event %d\n",
+ event->eventid);
+ break;
+ }
+
+ memset(event, 0, sizeof(*event));
+
+ /* Advance the in-use head */
+ ret = ioq_iter_pop(&iter, 0);
+ BUG_ON(ret < 0);
+ }
+
+ /* And let the south side know that we changed the queue */
+ ioq_signal(&kvm_vbus.eventq, 0);
+}
+
+static struct ioq_notifier eventq_notifier = {
+ .signal = &eventq_wakeup,
+};
+
+/* Injected whenever the host issues an ioq_signal() on the eventq */
+irqreturn_t
+eventq_intr(int irq, void *dev)
+{
+ _shm_signal_wakeup(kvm_vbus.eventq.signal);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * -------------------
+ */
+
+static int
+eventq_signal_inject(struct shm_signal *signal)
+{
+ u64 handle = 0; /* The eventq uses the special-case handle=0 */
+
+ kvm_vbus_hypercall(KVM_VBUS_OP_SHMSIGNAL, &handle, sizeof(handle));
+
+ return 0;
+}
+
+static void
+eventq_signal_release(struct shm_signal *signal)
+{
+ kfree(signal);
+}
+
+static struct shm_signal_ops eventq_signal_ops = {
+ .inject = eventq_signal_inject,
+ .release = eventq_signal_release,
+};
+
+/*
+ * -------------------
+ */
+
+static void
+eventq_ioq_release(struct ioq *ioq)
+{
+ /* released as part of the kvm_vbus object */
+}
+
+static struct ioq_ops eventq_ioq_ops = {
+ .release = eventq_ioq_release,
+};
+
+/*
+ * -------------------
+ */
+
+static void
+kvm_vbus_release(void)
+{
+ if (kvm_vbus.irq > 0) {
+ free_irq(kvm_vbus.irq, NULL);
+ destroy_kvm_dynirq(kvm_vbus.irq);
+ }
+
+ kfree(kvm_vbus.eventq.head_desc);
+ kfree(kvm_vbus.ring);
+}
+
+static int __init
+kvm_vbus_open(void)
+{
+ struct kvm_vbus_busopen params = {
+ .magic = KVM_VBUS_MAGIC,
+ .version = KVM_VBUS_VERSION,
+ .capabilities = 0,
+ };
+
+ return kvm_vbus_hypercall(KVM_VBUS_OP_BUSOPEN, &params, sizeof(params));
+}
+
+#define QLEN 1024
+
+static int __init
+kvm_vbus_register(void)
+{
+ struct kvm_vbus_busreg params = {
+ .count = 1,
+ .eventq = {
+ {
+ .irq = kvm_vbus.irq,
+ .count = QLEN,
+ .ring = (u64)__pa(kvm_vbus.eventq.head_desc),
+ .data = (u64)__pa(kvm_vbus.ring),
+ },
+ },
+ };
+
+ return kvm_vbus_hypercall(KVM_VBUS_OP_BUSREG, &params, sizeof(params));
+}
+
+static int __init
+_ioq_init(size_t ringsize, struct ioq *ioq, struct ioq_ops *ops)
+{
+ struct shm_signal *signal = NULL;
+ struct ioq_ring_head *head = NULL;
+ size_t len = IOQ_HEAD_DESC_SIZE(ringsize);
+
+ head = kzalloc(len, GFP_KERNEL | GFP_DMA);
+ if (!head)
+ return -ENOMEM;
+
+ signal = kzalloc(sizeof(*signal), GFP_KERNEL);
+ if (!signal) {
+ kfree(head);
+ return -ENOMEM;
+ }
+
+ head->magic = IOQ_RING_MAGIC;
+ head->ver = IOQ_RING_VER;
+ head->count = ringsize;
+
+ _signal_init(signal, &head->signal, &eventq_signal_ops);
+
+ ioq_init(ioq, ops, ioq_locality_north, head, signal, ringsize);
+
+ return 0;
+}
+
+int __init
+kvm_vbus_init(void)
+{
+ int ret;
+
+ memset(&kvm_vbus, 0, sizeof(kvm_vbus));
+
+ ret = kvm_para_has_feature(KVM_FEATURE_VBUS);
+ if (!ret)
+ return -ENOENT;
+
+ ret = kvm_vbus_open();
+ if (ret < 0) {
+ printk(KERN_ERR "KVM_VBUS: Could not register with host: %d\n",
+ ret);
+ goto out_fail;
+ }
+
+ spin_lock_init(&kvm_vbus.lock);
+
+ /*
+ * Allocate an IOQ to use for host-2-guest event notification
+ */
+ ret = _ioq_init(QLEN, &kvm_vbus.eventq, &eventq_ioq_ops);
+ if (ret < 0) {
+ printk(KERN_ERR "KVM_VBUS: Cound not init eventq\n");
+ goto out_fail;
+ }
+
+ ret = eventq_init(QLEN);
+ if (ret < 0) {
+ printk(KERN_ERR "KVM_VBUS: Cound not setup ring\n");
+ goto out_fail;
+ }
+
+ /*
+ * Dynamically assign a free IRQ to this resource
+ */
+ kvm_vbus.irq = create_kvm_dynirq(0);
+ if (kvm_vbus.irq < 0) {
+ printk(KERN_ERR "KVM_VBUS: Failed to create IRQ\n");
+ goto out_fail;
+ }
+
+ ret = request_irq(kvm_vbus.irq, eventq_intr, 0, "vbus", NULL);
+ if (ret < 0) {
+ printk(KERN_ERR "KVM_VBUS: Failed to register IRQ %d\n: %d",
+ kvm_vbus.irq, ret);
+ goto out_fail;
+ }
+
+ /*
+ * Finally register our queue on the host to start receiving events
+ */
+ ret = kvm_vbus_register();
+ if (ret < 0) {
+ printk(KERN_ERR "KVM_VBUS: Could not register with host: %d\n",
+ ret);
+ goto out_fail;
+ }
+
+ return 0;
+
+ out_fail:
+ kvm_vbus_release();
+
+ return ret;
+}
+
+static void __exit
+kvm_vbus_exit(void)
+{
+ kvm_vbus_release();
+}
+
+module_init(kvm_vbus_init);
+module_exit(kvm_vbus_exit);
+
The ioapic code currently privately manages the mapping between irq
and vector. This results in some layering violations, as the support
for certain MSI operations needs this info. As a result, the MSI
code itself was moved to the ioapic module. This is not really
optimal.
We now have another need to gain access to the vector assignment on
x86. However, rather than put yet another inappropriately placed
function into io-apic, let's create a way to export this simple data
and thereby allow the logic to sit closer to where it belongs.
Ideally we should abstract the entire notion of irq->vector management
out of io-apic, but we leave that as an exercise for another day.
Signed-off-by: Gregory Haskins <[email protected]>
---
arch/x86/include/asm/irq.h | 6 ++++++
arch/x86/kernel/io_apic.c | 25 +++++++++++++++++++++++++
2 files changed, 31 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 592688e..b1726d8 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -40,6 +40,12 @@ extern unsigned int do_IRQ(struct pt_regs *regs);
extern void init_IRQ(void);
extern void native_init_IRQ(void);
+#ifdef CONFIG_SMP
+extern int set_irq_affinity(int irq, cpumask_t mask);
+#endif
+
+extern int irq_to_vector(int irq);
+
/* Interrupt vector management */
extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
extern int vector_used_by_percpu_irq(unsigned int vector);
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index bc7ac4d..86a2c36 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -614,6 +614,14 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
set_ioapic_affinity_irq_desc(desc, mask);
}
+
+int set_irq_affinity(int irq, cpumask_t mask)
+{
+ set_ioapic_affinity_irq(irq, &mask);
+
+ return 0;
+}
+
#endif /* CONFIG_SMP */
/*
@@ -3249,6 +3257,23 @@ void destroy_irq(unsigned int irq)
spin_unlock_irqrestore(&vector_lock, flags);
}
+int irq_to_vector(int irq)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ int ret = -ENOENT;
+
+ spin_lock_irqsave(&vector_lock, flags);
+
+ cfg = irq_cfg(irq);
+ if (cfg && cfg->vector != 0)
+ ret = cfg->vector;
+
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ return ret;
+}
+
/*
* MSI message composition
*/
This module is similar in concept to a "tuntap". A tuntap module provides
a netif() interface on one side and a char-dev interface on the other.
Packets that ingress on one interface egress on the other (and vice versa).
This module offers a similar concept, except that it substitutes the
char-dev for a VBUS/IOQ interface. This allows a VBUS-compatible entity
(e.g. userspace or a guest) to directly inject and receive packets
from the host/kernel stack.
Thanks to Pat Mullaney for contributing the maxcount modification
Signed-off-by: Gregory Haskins <[email protected]>
---
drivers/Makefile | 1
drivers/vbus/devices/Kconfig | 17
drivers/vbus/devices/Makefile | 1
drivers/vbus/devices/venet-tap.c | 1388 ++++++++++++++++++++++++++++++++++++++
kernel/vbus/Kconfig | 13
5 files changed, 1420 insertions(+), 0 deletions(-)
create mode 100644 drivers/vbus/devices/Kconfig
create mode 100644 drivers/vbus/devices/Makefile
create mode 100644 drivers/vbus/devices/venet-tap.c
diff --git a/drivers/Makefile b/drivers/Makefile
index c1bf417..98fab51 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -106,3 +106,4 @@ obj-$(CONFIG_SSB) += ssb/
obj-$(CONFIG_VIRTIO) += virtio/
obj-$(CONFIG_STAGING) += staging/
obj-y += platform/
+obj-$(CONFIG_VBUS_DEVICES) += vbus/devices/
diff --git a/drivers/vbus/devices/Kconfig b/drivers/vbus/devices/Kconfig
new file mode 100644
index 0000000..64e4731
--- /dev/null
+++ b/drivers/vbus/devices/Kconfig
@@ -0,0 +1,17 @@
+#
+# Virtual-Bus (VBus) configuration
+#
+
+config VBUS_VENETTAP
+ tristate "Virtual-Bus Ethernet Tap Device"
+ depends on VBUS_DEVICES
+ default n
+ help
+ Provides a virtual ethernet adapter to a vbus, which in turn
+ manifests itself as a standard netif based adapter to the
+ kernel. It can be used similarly to a "tuntap" device,
+ except that the char-dev transport is replaced with a vbus/ioq
+ interface.
+
+ If unsure, say N.
+
diff --git a/drivers/vbus/devices/Makefile b/drivers/vbus/devices/Makefile
new file mode 100644
index 0000000..2ea7d2a
--- /dev/null
+++ b/drivers/vbus/devices/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_VBUS_VENETTAP) += venet-tap.o
diff --git a/drivers/vbus/devices/venet-tap.c b/drivers/vbus/devices/venet-tap.c
new file mode 100644
index 0000000..148e2c8
--- /dev/null
+++ b/drivers/vbus/devices/venet-tap.c
@@ -0,0 +1,1388 @@
+/*
+ * venettap - A 802.x virtual network device based on the VBUS/IOQ interface
+ *
+ * Copyright (C) 2009 Novell, Gregory Haskins <[email protected]>
+ *
+ * Derived from the SNULL example from the book "Linux Device Drivers" by
+ * Alessandro Rubini, Jonathan Corbet, and Greg Kroah-Hartman, published
+ * by O'Reilly & Associates.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/wait.h>
+
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/skbuff.h>
+#include <linux/ioq.h>
+#include <linux/vbus.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+
+#include <linux/venet.h>
+
+#include <linux/in6.h>
+#include <asm/checksum.h>
+
+MODULE_AUTHOR("Gregory Haskins");
+MODULE_LICENSE("GPL");
+
+#undef PDEBUG /* undef it, just in case */
+#ifdef VENETTAP_DEBUG
+# define PDEBUG(fmt, args...) printk(KERN_DEBUG "venet-tap: " fmt, ## args)
+#else
+# define PDEBUG(fmt, args...) /* not debugging: nothing */
+#endif
+
+static int maxcount = 2048;
+module_param(maxcount, int, 0600);
+MODULE_PARM_DESC(maxcount, "maximum size for rx/tx ioq ring");
+
+static void venettap_tx_isr(struct ioq_notifier *notifier);
+static int venettap_rx_thread(void *__priv);
+static int venettap_tx_thread(void *__priv);
+
+struct venettap_queue {
+ struct ioq *queue;
+ struct ioq_notifier notifier;
+};
+
+struct venettap;
+
+enum {
+ RX_SCHED,
+ TX_SCHED,
+ TX_NETIF_CONGESTED,
+ TX_IOQ_CONGESTED,
+};
+
+struct venettap {
+ spinlock_t lock;
+ unsigned char hmac[ETH_ALEN]; /* host-mac */
+ unsigned char cmac[ETH_ALEN]; /* client-mac */
+ struct task_struct *rxthread;
+ struct task_struct *txthread;
+ unsigned long flags;
+
+ struct {
+ struct net_device *dev;
+ struct net_device_stats stats;
+ struct {
+ struct sk_buff_head list;
+ size_t len;
+ int irqdepth;
+ } txq;
+ int enabled:1;
+ int link:1;
+ } netif;
+
+ struct {
+ struct vbus_device dev;
+ struct vbus_device_interface intf;
+ struct vbus_connection conn;
+ struct vbus_memctx *ctx;
+ struct venettap_queue rxq;
+ struct venettap_queue txq;
+ wait_queue_head_t rx_empty;
+ int connected:1;
+ int opened:1;
+ int link:1;
+ } vbus;
+};
+
+static int
+venettap_queue_init(struct venettap_queue *q,
+ struct vbus_shm *shm,
+ struct shm_signal *signal,
+ void (*func)(struct ioq_notifier *))
+{
+ struct ioq *ioq;
+ int ret;
+
+ if (q->queue)
+ return -EEXIST;
+
+ /* FIXME: make maxcount a tunable */
+ ret = vbus_shm_ioq_attach(shm, signal, maxcount, &ioq);
+ if (ret < 0)
+ return ret;
+
+ q->queue = ioq;
+ ioq_get(ioq);
+
+ if (func) {
+ q->notifier.signal = func;
+ q->queue->notifier = &q->notifier;
+ }
+
+ return 0;
+}
+
+static void
+venettap_queue_release(struct venettap_queue *q)
+{
+ if (!q->queue)
+ return;
+
+ ioq_put(q->queue);
+ q->queue = NULL;
+}
+
+/* Assumes priv->lock is held */
+static void
+venettap_txq_notify_inc(struct venettap *priv)
+{
+ priv->netif.txq.irqdepth++;
+ if (priv->netif.txq.irqdepth == 1 && priv->vbus.link)
+ ioq_notify_enable(priv->vbus.txq.queue, 0);
+}
+
+/* Assumes priv->lock is held */
+static void
+venettap_txq_notify_dec(struct venettap *priv)
+{
+ BUG_ON(!priv->netif.txq.irqdepth);
+ priv->netif.txq.irqdepth--;
+ if (!priv->netif.txq.irqdepth && priv->vbus.link)
+ ioq_notify_disable(priv->vbus.txq.queue, 0);
+}
+
+/*
+ *----------------------------------------------------------------------
+ * netif link
+ *----------------------------------------------------------------------
+ */
+
+static struct venettap *conn_to_priv(struct vbus_connection *conn)
+{
+ return container_of(conn, struct venettap, vbus.conn);
+}
+
+static struct venettap *intf_to_priv(struct vbus_device_interface *intf)
+{
+ return container_of(intf, struct venettap, vbus.intf);
+}
+
+static struct venettap *vdev_to_priv(struct vbus_device *vdev)
+{
+ return container_of(vdev, struct venettap, vbus.dev);
+}
+
+static int
+venettap_netdev_open(struct net_device *dev)
+{
+ struct venettap *priv = netdev_priv(dev);
+ unsigned long flags;
+
+ BUG_ON(priv->netif.link);
+
+ /*
+ * We need rx-polling to be done in process context, and we want
+ * ingress processing to occur independent of the producer thread
+ * to maximize multi-core distribution. Since the built-in NAPI uses a
+ * softirq, we cannot guarantee it won't call us back in interrupt
+ * context, so we can't use it. And both work-queue and softirq
+ * solutions would tend to process requests on the same CPU as the
+ * producer. Therefore, we create a special thread to handle ingress.
+ *
+ * The downside to this type of approach is that we may still need to
+ * ctx-switch to the NAPI polling thread (presumably running on the same
+ * core as the rx-thread) by virtue of the netif_rx() backlog mechanism.
+ * However, this can be mitigated by the use of netif_rx_ni().
+ */
+ priv->rxthread = kthread_create(venettap_rx_thread, priv,
+ "%s-rx", priv->netif.dev->name);
+
+ priv->txthread = kthread_create(venettap_tx_thread, priv,
+ "%s-tx", priv->netif.dev->name);
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ priv->netif.link = true;
+
+ if (!priv->vbus.link)
+ netif_carrier_off(dev);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ return 0;
+}
+
+static int
+venettap_netdev_stop(struct net_device *dev)
+{
+ struct venettap *priv = netdev_priv(dev);
+ unsigned long flags;
+ int needs_stop = false;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (priv->netif.link) {
+ needs_stop = true;
+ priv->netif.link = false;
+ }
+
+ /* FIXME: free priv->netif.txq */
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ if (needs_stop) {
+ kthread_stop(priv->rxthread);
+ priv->rxthread = NULL;
+
+ kthread_stop(priv->txthread);
+ priv->txthread = NULL;
+ }
+
+ return 0;
+}
+
+/*
+ * Configuration changes (passed on by ifconfig)
+ */
+static int
+venettap_netdev_config(struct net_device *dev, struct ifmap *map)
+{
+ if (dev->flags & IFF_UP) /* can't act on a running interface */
+ return -EBUSY;
+
+ /* Don't allow changing the I/O address */
+ if (map->base_addr != dev->base_addr) {
+ printk(KERN_WARNING "venettap: Can't change I/O address\n");
+ return -EOPNOTSUPP;
+ }
+
+ /* ignore other fields */
+ return 0;
+}
+
+static int
+venettap_change_mtu(struct net_device *dev, int new_mtu)
+{
+ dev->mtu = new_mtu;
+
+ return 0;
+}
+
+/*
+ * The poll implementation.
+ */
+static int
+venettap_rx(struct venettap *priv)
+{
+ struct ioq *ioq;
+ struct vbus_memctx *ctx;
+ int npackets = 0;
+ int dirty = 0;
+ struct ioq_iterator iter;
+ int ret;
+ unsigned long flags;
+ struct vbus_connection *conn;
+
+ PDEBUG("polling...\n");
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (!priv->vbus.link) {
+ spin_unlock_irqrestore(&priv->lock, flags);
+ return 0;
+ }
+
+ /*
+ * We take a reference to the connection object to ensure that the
+ * ioq/ctx references do not disappear out from under us. We could
+ * accomplish the same thing more directly by acquiring a reference
+ * to the ioq and ctx explicitly, but this would require an extra
+ * atomic_inc+dec pair, for no additional benefit
+ */
+ conn = &priv->vbus.conn;
+ vbus_connection_get(conn);
+
+ ioq = priv->vbus.rxq.queue;
+ ctx = priv->vbus.ctx;
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ /* We want to iterate on the head of the in-use index */
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_inuse, 0);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * The EOM is indicated by finding a packet that is still owned by
+ * the north side
+ */
+ while (iter.desc->sown) {
+ size_t len = iter.desc->len;
+ size_t maxlen = priv->netif.dev->mtu + ETH_HLEN;
+ struct sk_buff *skb = NULL;
+
+ if (unlikely(len > maxlen)) {
+ priv->netif.stats.rx_errors++;
+ priv->netif.stats.rx_length_errors++;
+ goto next;
+ }
+
+ skb = dev_alloc_skb(len+2);
+ if (unlikely(!skb)) {
+ printk(KERN_INFO "VENETTAP: skb alloc failed:" \
+ " memory squeeze.\n");
+ priv->netif.stats.rx_errors++;
+ priv->netif.stats.rx_dropped++;
+ goto next;
+ }
+
+ /* align IP on 16B boundary */
+ skb_reserve(skb, 2);
+
+ ret = ctx->ops->copy_from(ctx, skb->data,
+ (void *)iter.desc->ptr,
+ len);
+ if (unlikely(ret)) {
+ priv->netif.stats.rx_errors++;
+ goto next;
+ }
+
+ /* Maintain stats */
+ npackets++;
+ priv->netif.stats.rx_packets++;
+ priv->netif.stats.rx_bytes += len;
+
+ /* Pass the buffer up to the stack */
+ skb->dev = priv->netif.dev;
+ skb->protocol = eth_type_trans(skb, priv->netif.dev);
+
+ netif_rx_ni(skb);
+next:
+ dirty = 1;
+
+ /* Advance the in-use head */
+ ret = ioq_iter_pop(&iter, 0);
+ BUG_ON(ret < 0);
+
+ /* send up to N packets before sending tx-complete */
+ if (!(npackets % 10)) {
+ ioq_signal(ioq, 0);
+ dirty = 0;
+ }
+
+ }
+
+ PDEBUG("poll: %d packets received\n", npackets);
+
+ if (dirty)
+ ioq_signal(ioq, 0);
+
+ /*
+ * If we processed all packets we're done, so reenable ints
+ */
+ if (ioq_empty(ioq, ioq_idxtype_inuse)) {
+ clear_bit(RX_SCHED, &priv->flags);
+ ioq_notify_enable(ioq, 0);
+ wake_up(&priv->vbus.rx_empty);
+ }
+
+ vbus_connection_put(conn);
+
+ return 0;
+}
+
+static int venettap_rx_thread(void *__priv)
+{
+ struct venettap *priv = __priv;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!freezing(current) &&
+ !kthread_should_stop() &&
+ !test_bit(RX_SCHED, &priv->flags))
+ schedule();
+ set_current_state(TASK_RUNNING);
+
+ try_to_freeze();
+
+ if (kthread_should_stop())
+ break;
+
+ venettap_rx(priv);
+ }
+
+ return 0;
+}
+
+/* assumes priv->lock is held */
+static void
+venettap_check_netif_congestion(struct venettap *priv)
+{
+ struct ioq *ioq = priv->vbus.txq.queue;
+
+ if (priv->vbus.link
+ && priv->netif.txq.len < ioq_remain(ioq, ioq_idxtype_inuse)
+ && test_and_clear_bit(TX_NETIF_CONGESTED, &priv->flags)) {
+ PDEBUG("NETIF congestion cleared\n");
+ venettap_txq_notify_dec(priv);
+
+ if (priv->netif.link)
+ netif_wake_queue(priv->netif.dev);
+ }
+}
+
+static int
+venettap_tx(struct venettap *priv)
+{
+ struct sk_buff *skb;
+ struct ioq_iterator iter;
+ struct ioq *ioq = NULL;
+ struct vbus_memctx *ctx;
+ int ret;
+ int npackets = 0;
+ unsigned long flags;
+ struct vbus_connection *conn;
+
+ PDEBUG("tx-thread\n");
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (unlikely(!priv->vbus.link)) {
+ spin_unlock_irqrestore(&priv->lock, flags);
+ return 0;
+ }
+
+ /*
+ * We take a reference to the connection object to ensure that the
+ * ioq/ctx references do not disappear out from under us. We could
+ * accomplish the same thing more directly by acquiring a reference
+ * to the ioq and ctx explicitly, but this would require an extra
+ * atomic_inc+dec pair, for no additional benefit
+ */
+ conn = &priv->vbus.conn;
+ vbus_connection_get(conn);
+
+ ioq = priv->vbus.txq.queue;
+ ctx = priv->vbus.ctx;
+
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_inuse, IOQ_ITER_AUTOUPDATE);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
+ BUG_ON(ret < 0);
+
+ while (priv->vbus.link && iter.desc->sown && priv->netif.txq.len) {
+
+ skb = __skb_dequeue(&priv->netif.txq.list);
+ if (!skb)
+ break;
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ PDEBUG("tx-thread: sending %d bytes\n", skb->len);
+
+ if (skb->len <= iter.desc->len) {
+ ret = ctx->ops->copy_to(ctx, (void *)iter.desc->ptr,
+ skb->data, skb->len);
+ BUG_ON(ret);
+
+ iter.desc->len = skb->len;
+
+ npackets++;
+ priv->netif.stats.tx_packets++;
+ priv->netif.stats.tx_bytes += skb->len;
+
+ ret = ioq_iter_push(&iter, 0);
+ BUG_ON(ret < 0);
+ } else {
+ printk(KERN_WARNING
+ "VENETTAP: discarding packet: buf too small "
+ "(%u > %llu)\n", skb->len, (unsigned long long)iter.desc->len);
+ priv->netif.stats.tx_errors++;
+ }
+
+ dev_kfree_skb(skb);
+ priv->netif.dev->trans_start = jiffies; /* save the timestamp */
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ priv->netif.txq.len--;
+ }
+
+ PDEBUG("send complete\n");
+
+ if (!priv->vbus.link || !priv->netif.txq.len) {
+ PDEBUG("descheduling TX: link=%d, len=%d\n",
+ priv->vbus.link, priv->netif.txq.len);
+ clear_bit(TX_SCHED, &priv->flags);
+ } else if (!test_and_set_bit(TX_IOQ_CONGESTED, &priv->flags)) {
+ PDEBUG("congested with %d packets still queued\n",
+ priv->netif.txq.len);
+ venettap_txq_notify_inc(priv);
+ }
+
+ venettap_check_netif_congestion(priv);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ vbus_connection_put(conn);
+
+ return npackets;
+}
+
+static int venettap_tx_thread(void *__priv)
+{
+ struct venettap *priv = __priv;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!freezing(current) &&
+ !kthread_should_stop() &&
+ (test_bit(TX_IOQ_CONGESTED, &priv->flags) ||
+ !test_bit(TX_SCHED, &priv->flags)))
+ schedule();
+ set_current_state(TASK_RUNNING);
+
+ PDEBUG("tx wakeup: %s%s%s\n",
+ test_bit(TX_SCHED, &priv->flags) ? "s" : "-",
+ test_bit(TX_IOQ_CONGESTED, &priv->flags) ? "c" : "-",
+ test_bit(TX_NETIF_CONGESTED, &priv->flags) ? "b" : "-"
+ );
+
+ try_to_freeze();
+
+ if (kthread_should_stop())
+ break;
+
+ venettap_tx(priv);
+ }
+
+ return 0;
+}
+
+static void
+venettap_deferred_tx(struct venettap *priv)
+{
+ PDEBUG("wake up txthread\n");
+ wake_up_process(priv->txthread);
+}
+
+/* assumes priv->lock is held */
+static void
+venettap_apply_backpressure(struct venettap *priv)
+{
+ PDEBUG("backpressure\n");
+
+ if (!test_and_set_bit(TX_NETIF_CONGESTED, &priv->flags)) {
+ /*
+ * We must flow-control the kernel by disabling the queue
+ */
+ netif_stop_queue(priv->netif.dev);
+ venettap_txq_notify_inc(priv);
+ }
+}
+
+/*
+ * Transmit a packet (called by the kernel)
+ *
+ * We want to perform ctx->copy_to() operations from a sleepable process
+ * context, so we defer the actual tx operations to a thread.
+ * However, we want to be careful that we do not double-buffer the
+ * queue, so we create a buffer whose space dynamically grows and
+ * shrinks with the availability of the actual IOQ. This means that
+ * the netif flow control is still managed by the actual consumer,
+ * thereby avoiding the introduction of a second flow-control feedback loop.
+ */
+static int
+venettap_netdev_tx(struct sk_buff *skb, struct net_device *dev)
+{
+ struct venettap *priv = netdev_priv(dev);
+ struct ioq *ioq = NULL;
+ unsigned long flags;
+
+ PDEBUG("queuing %d bytes\n", skb->len);
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ ioq = priv->vbus.txq.queue;
+
+ BUG_ON(test_bit(TX_NETIF_CONGESTED, &priv->flags));
+
+ if (!priv->vbus.link) {
+ /*
+ * We have a link-down condition
+ */
+ printk(KERN_ERR "VENETTAP: tx on link down\n");
+ goto flowcontrol;
+ }
+
+ __skb_queue_tail(&priv->netif.txq.list, skb);
+ priv->netif.txq.len++;
+ set_bit(TX_SCHED, &priv->flags);
+
+ if (priv->netif.txq.len >= ioq_remain(ioq, ioq_idxtype_inuse))
+ venettap_apply_backpressure(priv);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ venettap_deferred_tx(priv);
+
+ return NETDEV_TX_OK;
+
+flowcontrol:
+ venettap_apply_backpressure(priv);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ return NETDEV_TX_BUSY;
+}
+
+/*
+ * Ioctl commands
+ */
+static int
+venettap_netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+ PDEBUG("ioctl\n");
+ return 0;
+}
+
+/*
+ * Return statistics to the caller
+ */
+static struct net_device_stats *
+venettap_netdev_stats(struct net_device *dev)
+{
+ struct venettap *priv = netdev_priv(dev);
+ return &priv->netif.stats;
+}
+
+static void
+venettap_netdev_unregister(struct venettap *priv)
+{
+ if (priv->netif.enabled) {
+ venettap_netdev_stop(priv->netif.dev);
+ unregister_netdev(priv->netif.dev);
+ }
+}
+
+/*
+ * Assumes priv->lock held
+ */
+static void
+venettap_rx_schedule(struct venettap *priv)
+{
+ if (!priv->vbus.link)
+ return;
+
+ if (priv->netif.link
+ && !ioq_empty(priv->vbus.rxq.queue, ioq_idxtype_inuse)) {
+ ioq_notify_disable(priv->vbus.rxq.queue, 0);
+
+ if (!test_and_set_bit(RX_SCHED, &priv->flags))
+ wake_up_process(priv->rxthread);
+ }
+}
+
+/*
+ * receive interrupt-service-routine - called whenever the vbus-driver signals
+ * our IOQ to indicate more inbound packets are ready.
+ */
+static void
+venettap_rx_isr(struct ioq_notifier *notifier)
+{
+ struct venettap *priv;
+ unsigned long flags;
+
+ priv = container_of(notifier, struct venettap, vbus.rxq.notifier);
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ /* Disable future interrupts and schedule our napi-poll */
+ venettap_rx_schedule(priv);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+/*
+ * transmit interrupt-service-routine - called whenever the vbus-driver signals
+ * our IOQ to indicate there is more room in the TX queue
+ */
+static void
+venettap_tx_isr(struct ioq_notifier *notifier)
+{
+ struct venettap *priv;
+ unsigned long flags;
+
+ priv = container_of(notifier, struct venettap, vbus.txq.notifier);
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (priv->vbus.link
+ && !ioq_full(priv->vbus.txq.queue, ioq_idxtype_inuse)
+ && test_and_clear_bit(TX_IOQ_CONGESTED, &priv->flags)) {
+ PDEBUG("IOQ congestion cleared\n");
+ venettap_txq_notify_dec(priv);
+
+ if (priv->netif.link)
+ wake_up_process(priv->txthread);
+ }
+
+ venettap_check_netif_congestion(priv);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static int
+venettap_vlink_up(struct venettap *priv)
+{
+ int ret = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (priv->vbus.link) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ if (!priv->vbus.rxq.queue || !priv->vbus.txq.queue) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ priv->vbus.link = 1;
+
+ if (priv->netif.link)
+ netif_carrier_on(priv->netif.dev);
+
+ venettap_check_netif_congestion(priv);
+
+ ioq_notify_enable(priv->vbus.rxq.queue, 0);
+
+out:
+ spin_unlock_irqrestore(&priv->lock, flags);
+ return ret;
+}
+
+/* Assumes priv->lock held */
+static int
+_venettap_vlink_down(struct venettap *priv)
+{
+ struct sk_buff *skb;
+
+ if (!priv->vbus.link)
+ return -ENOENT;
+
+ priv->vbus.link = 0;
+
+ if (priv->netif.link)
+ netif_carrier_off(priv->netif.dev);
+
+ /* just trash whatever might have been pending */
+ while ((skb = __skb_dequeue(&priv->netif.txq.list)))
+ dev_kfree_skb(skb);
+
+ priv->netif.txq.len = 0;
+
+ /* And deschedule any pending processing */
+ clear_bit(RX_SCHED, &priv->flags);
+ clear_bit(TX_SCHED, &priv->flags);
+
+ ioq_notify_disable(priv->vbus.rxq.queue, 0);
+
+ return 0;
+}
+
+static int
+venettap_vlink_down(struct venettap *priv)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&priv->lock, flags);
+ ret = _venettap_vlink_down(priv);
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ return ret;
+}
+
+static int
+venettap_macquery(struct venettap *priv, void *data, unsigned long len)
+{
+ struct vbus_memctx *ctx = priv->vbus.ctx;
+ int ret;
+
+ if (len != ETH_ALEN)
+ return -EINVAL;
+
+ ret = ctx->ops->copy_to(ctx, data, priv->cmac, ETH_ALEN);
+ if (ret)
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * Negotiate Capabilities - This function is provided so that the
+ * interface may be extended without breaking ABI compatibility
+ *
+ * The caller is expected to send down any capabilities they would like
+ * to enable, and the device will mask them against the capabilities it
+ * supports. This value is then returned so that both sides may
+ * ascertain the lowest-common-denominator of features to enable
+ */
+static int
+venettap_negcap(struct venettap *priv, void *data, unsigned long len)
+{
+ struct vbus_memctx *ctx = priv->vbus.ctx;
+ struct venet_capabilities caps;
+ int ret;
+
+ if (len != sizeof(caps))
+ return -EINVAL;
+
+ if (priv->vbus.link)
+ return -EINVAL;
+
+ ret = ctx->ops->copy_from(ctx, &caps, data, sizeof(caps));
+ if (ret)
+ return -EFAULT;
+
+ switch (caps.gid) {
+ default:
+ caps.bits = 0;
+ break;
+ }
+
+ ret = ctx->ops->copy_to(ctx, data, &caps, sizeof(caps));
+ if (ret)
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * Walk through and flush each remaining descriptor by returning
+ * a zero length packet.
+ *
+ * This is useful, for instance, when the driver is changing the MTU
+ * and wants to reclaim all the existing buffers outstanding which
+ * are a different size than the new MTU
+ */
+static int
+venettap_flushrx(struct venettap *priv)
+{
+ struct ioq_iterator iter;
+ struct ioq *ioq = NULL;
+ int ret;
+ unsigned long flags;
+
+ PDEBUG("flushrx\n");
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (unlikely(!priv->vbus.link)) {
+ spin_unlock_irqrestore(&priv->lock, flags);
+ return -EINVAL;
+ }
+
+ ioq = priv->vbus.txq.queue;
+
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_inuse, 0);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
+ BUG_ON(ret < 0);
+
+ while (iter.desc->sown) {
+ iter.desc->len = 0;
+ ret = ioq_iter_push(&iter, 0);
+ if (ret < 0)
+ SHM_SIGNAL_FAULT(ioq->signal, "could not flushrx");
+ }
+
+ PDEBUG("flushrx complete\n");
+
+ if (!test_and_set_bit(TX_IOQ_CONGESTED, &priv->flags)) {
+ PDEBUG("congested with %d packets still queued\n",
+ priv->netif.txq.len);
+ venettap_txq_notify_inc(priv);
+ }
+
+ /*
+ * we purposely do not ioq_signal() the other side here. Since
+ * this function was invoked by the client, they can take care
+ * of explicitly calling any reclaim code if they like. This also
+ * avoids a potential deadlock in case turning around and injecting
+ * a signal while we are in a call() is problematic to the
+ * connector design
+ */
+
+ venettap_check_netif_congestion(priv);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ return 0;
+}
+
+/*
+ * This is called whenever a driver wants to perform a synchronous
+ * "function call" to our device. It is similar to the notion of
+ * an ioctl(). The parameters are part of the ABI between the device
+ * and driver.
+ */
+static int
+venettap_vlink_call(struct vbus_connection *conn,
+ unsigned long func,
+ void *data,
+ unsigned long len,
+ unsigned long flags)
+{
+ struct venettap *priv = conn_to_priv(conn);
+
+ PDEBUG("call -> %lu with %p/%lu\n", func, data, len);
+
+ switch (func) {
+ case VENET_FUNC_LINKUP:
+ return venettap_vlink_up(priv);
+ case VENET_FUNC_LINKDOWN:
+ return venettap_vlink_down(priv);
+ case VENET_FUNC_MACQUERY:
+ return venettap_macquery(priv, data, len);
+ case VENET_FUNC_NEGCAP:
+ return venettap_negcap(priv, data, len);
+ case VENET_FUNC_FLUSHRX:
+ return venettap_flushrx(priv);
+ default:
+ return -EINVAL;
+ }
+}
+
+/*
+ * This is called whenever a driver wants to open a new IOQ between itself
+ * and our device. The "id" field is meant to convey meaning to the device
+ * as to what the intended use of this IOQ is. For instance, for venet "id=0"
+ * means "rx" and "id=1" means "tx". That namespace is managed by the device
+ * and should be understood by the driver as part of its ABI agreement.
+ *
+ * The device should take a reference to the IOQ via ioq_get() and hold it
+ * until the connection is released.
+ */
+static int
+venettap_vlink_shm(struct vbus_connection *conn,
+ unsigned long id,
+ struct vbus_shm *shm,
+ struct shm_signal *signal,
+ unsigned long flags)
+{
+ struct venettap *priv = conn_to_priv(conn);
+
+ PDEBUG("queue -> %p/%lu attached\n", shm, id);
+
+ switch (id) {
+ case VENET_QUEUE_RX:
+ return venettap_queue_init(&priv->vbus.txq, shm, signal,
+ venettap_tx_isr);
+ case VENET_QUEUE_TX:
+ return venettap_queue_init(&priv->vbus.rxq, shm, signal,
+ venettap_rx_isr);
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void
+venettap_vlink_close(struct vbus_connection *conn)
+{
+ struct venettap *priv = conn_to_priv(conn);
+ DEFINE_WAIT(wait);
+ unsigned long flags;
+
+ PDEBUG("connection closed\n");
+
+ /* Block until all posted packets from the client have been processed */
+ prepare_to_wait(&priv->vbus.rx_empty, &wait, TASK_UNINTERRUPTIBLE);
+
+ while (test_bit(RX_SCHED, &priv->flags))
+ schedule();
+
+ finish_wait(&priv->vbus.rx_empty, &wait);
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ priv->vbus.opened = false;
+ _venettap_vlink_down(priv);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+/*
+ * This is called whenever the driver closes all references to our device
+ */
+static void
+venettap_vlink_release(struct vbus_connection *conn)
+{
+ struct venettap *priv = conn_to_priv(conn);
+
+ PDEBUG("connection released\n");
+
+ venettap_queue_release(&priv->vbus.rxq);
+ venettap_queue_release(&priv->vbus.txq);
+ vbus_memctx_put(priv->vbus.ctx);
+
+ kobject_put(priv->vbus.dev.kobj);
+}
+
+static struct vbus_connection_ops venettap_vbus_link_ops = {
+ .call = venettap_vlink_call,
+ .shm = venettap_vlink_shm,
+ .close = venettap_vlink_close,
+ .release = venettap_vlink_release,
+};
+
+/*
+ * This is called whenever a driver wants to open our device_interface
+ * for communication. The connection is represented by a
+ * vbus_connection object. It is up to the implementation to decide
+ * if it allows more than one connection at a time. This simple example
+ * does not.
+ */
+static int
+venettap_intf_open(struct vbus_device_interface *intf,
+ struct vbus_memctx *ctx,
+ int version,
+ struct vbus_connection **conn)
+{
+ struct venettap *priv = intf_to_priv(intf);
+ unsigned long flags;
+
+ PDEBUG("open\n");
+
+ if (version != VENET_VERSION)
+ return -EINVAL;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ /*
+ * We only allow one connection to this device
+ */
+ if (priv->vbus.opened) {
+ spin_unlock_irqrestore(&priv->lock, flags);
+ return -EBUSY;
+ }
+
+ kobject_get(intf->dev->kobj);
+
+ vbus_connection_init(&priv->vbus.conn, &venettap_vbus_link_ops);
+
+ priv->vbus.opened = true;
+ priv->vbus.ctx = ctx;
+
+ vbus_memctx_get(ctx);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ *conn = &priv->vbus.conn;
+
+ return 0;
+}
+
+static void
+venettap_intf_release(struct vbus_device_interface *intf)
+{
+ kobject_put(intf->dev->kobj);
+}
+
+static struct vbus_device_interface_ops venettap_device_interface_ops = {
+ .open = venettap_intf_open,
+ .release = venettap_intf_release,
+};
+
+/*
+ * This is called whenever the admin creates a symbolic link between
+ * a bus in /config/vbus/buses and our device. It represents a bus
+ * connection. Your device can choose to allow more than one bus to
+ * connect, or it can restrict it to one bus. It can also choose to
+ * register one or more device_interfaces on each bus that it
+ * successfully connects to.
+ *
+ * This example device only registers a single interface
+ */
+static int
+venettap_device_bus_connect(struct vbus_device *dev, struct vbus *vbus)
+{
+ struct venettap *priv = vdev_to_priv(dev);
+ struct vbus_device_interface *intf = &priv->vbus.intf;
+
+ /* We only allow one bus to connect */
+ if (priv->vbus.connected)
+ return -EBUSY;
+
+ kobject_get(dev->kobj);
+
+ intf->name = "0";
+ intf->type = VENET_TYPE;
+ intf->ops = &venettap_device_interface_ops;
+
+ priv->vbus.connected = true;
+
+ /*
+ * Our example only registers one interface. If you need
+ * more, simply call interface_register() multiple times
+ */
+ return vbus_device_interface_register(dev, vbus, intf);
+}
+
+/*
+ * This is called whenever the admin removes the symbolic link between
+ * a bus in /config/vbus/buses and our device.
+ */
+static int
+venettap_device_bus_disconnect(struct vbus_device *dev, struct vbus *vbus)
+{
+ struct venettap *priv = vdev_to_priv(dev);
+ struct vbus_device_interface *intf = &priv->vbus.intf;
+
+ if (!priv->vbus.connected)
+ return -EINVAL;
+
+ vbus_device_interface_unregister(intf);
+
+ priv->vbus.connected = false;
+ kobject_put(dev->kobj);
+
+ return 0;
+}
+
+static void
+venettap_device_release(struct vbus_device *dev)
+{
+ struct venettap *priv = vdev_to_priv(dev);
+
+ venettap_netdev_unregister(priv);
+ free_netdev(priv->netif.dev);
+ module_put(THIS_MODULE);
+}
+
+
+static struct vbus_device_ops venettap_device_ops = {
+ .bus_connect = venettap_device_bus_connect,
+ .bus_disconnect = venettap_device_bus_disconnect,
+ .release = venettap_device_release,
+};
+
+#define VENETTAP_TYPE "venet-tap"
+
+/*
+ * Interface attributes show up as files under
+ * /sys/vbus/devices/$devid
+ */
+static ssize_t
+host_mac_show(struct vbus_device *dev, struct vbus_device_attribute *attr,
+ char *buf)
+{
+ struct venettap *priv = vdev_to_priv(dev);
+
+ return sysfs_format_mac(buf, priv->hmac, ETH_ALEN);
+}
+
+static struct vbus_device_attribute attr_hmac =
+ __ATTR_RO(host_mac);
+
+static ssize_t
+client_mac_show(struct vbus_device *dev, struct vbus_device_attribute *attr,
+ char *buf)
+{
+ struct venettap *priv = vdev_to_priv(dev);
+
+ return sysfs_format_mac(buf, priv->cmac, ETH_ALEN);
+}
+
+static struct vbus_device_attribute attr_cmac =
+ __ATTR_RO(client_mac);
+
+static ssize_t
+enabled_show(struct vbus_device *dev, struct vbus_device_attribute *attr,
+ char *buf)
+{
+ struct venettap *priv = vdev_to_priv(dev);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", priv->netif.enabled);
+}
+
+static ssize_t
+enabled_store(struct vbus_device *dev, struct vbus_device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct venettap *priv = vdev_to_priv(dev);
+ int enabled = -1;
+ int ret = 0;
+
+ if (count > 0)
+ sscanf(buf, "%d", &enabled);
+
+ if (enabled != 0 && enabled != 1)
+ return -EINVAL;
+
+ if (enabled && !priv->netif.enabled)
+ ret = register_netdev(priv->netif.dev);
+
+ if (!enabled && priv->netif.enabled)
+ venettap_netdev_unregister(priv);
+
+ if (ret < 0)
+ return ret;
+
+ priv->netif.enabled = enabled;
+
+ return count;
+}
+
+static struct vbus_device_attribute attr_enabled =
+ __ATTR(enabled, S_IRUGO | S_IWUSR, enabled_show, enabled_store);
+
+static ssize_t
+ifname_show(struct vbus_device *dev, struct vbus_device_attribute *attr,
+ char *buf)
+{
+ struct venettap *priv = vdev_to_priv(dev);
+
+ if (!priv->netif.enabled)
+ return sprintf(buf, "<disabled>\n");
+
+ return snprintf(buf, PAGE_SIZE, "%s\n", priv->netif.dev->name);
+}
+
+static struct vbus_device_attribute attr_ifname =
+ __ATTR_RO(ifname);
+
+static struct attribute *attrs[] = {
+ &attr_hmac.attr,
+ &attr_cmac.attr,
+ &attr_enabled.attr,
+ &attr_ifname.attr,
+ NULL,
+};
+
+static struct attribute_group venettap_attr_group = {
+ .attrs = attrs,
+};
+
+static struct net_device_ops venettap_netdev_ops = {
+ .ndo_open = venettap_netdev_open,
+ .ndo_stop = venettap_netdev_stop,
+ .ndo_set_config = venettap_netdev_config,
+ .ndo_change_mtu = venettap_change_mtu,
+ .ndo_start_xmit = venettap_netdev_tx,
+ .ndo_do_ioctl = venettap_netdev_ioctl,
+ .ndo_get_stats = venettap_netdev_stats,
+};
+
+/*
+ * This is called whenever the admin instantiates our devclass via
+ * "mkdir /config/vbus/devices/$(inst)/venet-tap"
+ */
+static int
+venettap_device_create(struct vbus_devclass *dc,
+ struct vbus_device **vdev)
+{
+ struct net_device *dev;
+ struct venettap *priv;
+ struct vbus_device *_vdev;
+
+ dev = alloc_etherdev(sizeof(struct venettap));
+ if (!dev)
+ return -ENOMEM;
+
+ priv = netdev_priv(dev);
+ memset(priv, 0, sizeof(*priv));
+
+ spin_lock_init(&priv->lock);
+ random_ether_addr(priv->hmac);
+ random_ether_addr(priv->cmac);
+
+ /*
+ * vbus init
+ */
+ _vdev = &priv->vbus.dev;
+
+ _vdev->type = VENETTAP_TYPE;
+ _vdev->ops = &venettap_device_ops;
+ _vdev->attrs = &venettap_attr_group;
+
+ init_waitqueue_head(&priv->vbus.rx_empty);
+
+ /*
+ * netif init
+ */
+ skb_queue_head_init(&priv->netif.txq.list);
+ priv->netif.txq.len = 0;
+
+ priv->netif.dev = dev;
+
+ ether_setup(dev); /* assign some of the fields */
+
+ dev->netdev_ops = &venettap_netdev_ops;
+ memcpy(dev->dev_addr, priv->hmac, ETH_ALEN);
+
+ dev->features |= NETIF_F_HIGHDMA;
+
+ *vdev = _vdev;
+
+ /*
+ * We don't need a try_get because the reference is held by the
+ * infrastructure during a create() operation
+ */
+ __module_get(THIS_MODULE);
+
+ return 0;
+}
+
+static struct vbus_devclass_ops venettap_devclass_ops = {
+ .create = venettap_device_create,
+};
+
+static struct vbus_devclass venettap_devclass = {
+ .name = VENETTAP_TYPE,
+ .ops = &venettap_devclass_ops,
+ .owner = THIS_MODULE,
+};
+
+static int __init venettap_init(void)
+{
+ return vbus_devclass_register(&venettap_devclass);
+}
+
+static void __exit venettap_cleanup(void)
+{
+ vbus_devclass_unregister(&venettap_devclass);
+}
+
+module_init(venettap_init);
+module_exit(venettap_cleanup);
diff --git a/kernel/vbus/Kconfig b/kernel/vbus/Kconfig
index 71acd6f..3ce0adc 100644
--- a/kernel/vbus/Kconfig
+++ b/kernel/vbus/Kconfig
@@ -14,6 +14,17 @@ config VBUS
If unsure, say N
+config VBUS_DEVICES
+ bool "Virtual-Bus Devices"
+ depends on VBUS
+ default n
+ help
+ Provides device-class modules for instantiation on a virtual-bus
+
+ If unsure, say N
+
+source "drivers/vbus/devices/Kconfig"
+
config VBUS_DRIVERS
tristate "VBUS Driver support"
select IOQ
@@ -23,3 +34,5 @@ config VBUS_DRIVERS
If unsure, say N
+
+
Signed-off-by: Gregory Haskins <[email protected]>
---
drivers/net/vbus-enet.c | 249 +++++++++++++++++++++++++++++++++++++++++++++--
include/linux/venet.h | 39 +++++++
2 files changed, 275 insertions(+), 13 deletions(-)
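As background for the NEGCAP changes in the patch below: the handshake boils down to a per-group bitwise intersection, where the driver proposes the feature bits it wants and the device returns only the subset it supports. A minimal userspace sketch of that intended lowest-common-denominator semantics (the helper names and the `devcall` stand-in are hypothetical, not the real ABI):

```c
/* Hypothetical sketch of the VENET_FUNC_NEGCAP handshake: the driver
 * proposes feature bits for one capability group, the device masks them
 * down to what it supports, and both sides enable the common subset.
 * device_negcap()/driver_negcap() are illustrative names only. */
#include <stdint.h>

struct venet_capabilities {
	uint32_t gid;	/* capability group id */
	uint32_t bits;	/* requested bits in, granted bits out */
};

#define CAP_GROUP_SG	0
#define CAP_SG		(1u << 0)
#define CAP_TSO4	(1u << 1)

/* Device side: grant only the intersection of requested and supported */
static void device_negcap(struct venet_capabilities *caps, uint32_t supported)
{
	caps->bits &= supported;
}

/* Driver side: propose features, then configure from what was granted */
static uint32_t driver_negcap(uint32_t wanted, uint32_t device_supported)
{
	struct venet_capabilities caps = { .gid = CAP_GROUP_SG, .bits = wanted };

	device_negcap(&caps, device_supported);	/* stands in for devcall() */
	return caps.bits;			/* enable only these */
}
```

Because the returned word can only lose bits, an old driver talking to a new device (or vice versa) degrades gracefully to the features both understand.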
diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index 3779f77..2a190e0 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -42,6 +42,8 @@ static int rx_ringlen = 256;
module_param(rx_ringlen, int, 0444);
static int tx_ringlen = 256;
module_param(tx_ringlen, int, 0444);
+static int sg_enabled = 1;
+module_param(sg_enabled, int, 0444);
#undef PDEBUG /* undef it, just in case */
#ifdef VBUS_ENET_DEBUG
@@ -63,8 +65,17 @@ struct vbus_enet_priv {
struct vbus_enet_queue rxq;
struct vbus_enet_queue txq;
struct tasklet_struct txtask;
+ struct {
+ int sg:1;
+ int tso:1;
+ int ufo:1;
+ int tso6:1;
+ int ecn:1;
+ } flags;
};
+static void vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force);
+
static struct vbus_enet_priv *
napi_to_priv(struct napi_struct *napi)
{
@@ -198,6 +209,93 @@ rx_teardown(struct vbus_enet_priv *priv)
}
}
+static int
+tx_setup(struct vbus_enet_priv *priv)
+{
+ struct ioq *ioq = priv->txq.queue;
+ struct ioq_iterator iter;
+ int i;
+ int ret;
+
+ if (!priv->flags.sg)
+ /*
+ * There is nothing to do for a ring that is not using
+ * scatter-gather
+ */
+ return 0;
+
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_set, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * Now populate each descriptor with an empty SG descriptor
+ */
+ for (i = 0; i < tx_ringlen; i++) {
+ struct venet_sg *vsg;
+ size_t iovlen = sizeof(struct venet_iov) * (MAX_SKB_FRAGS-1);
+ size_t len = sizeof(*vsg) + iovlen;
+
+ vsg = kzalloc(len, GFP_KERNEL);
+ if (!vsg)
+ return -ENOMEM;
+
+ iter.desc->cookie = (u64)vsg;
+ iter.desc->len = len;
+ iter.desc->ptr = (u64)__pa(vsg);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_next, 0, 0);
+ BUG_ON(ret < 0);
+ }
+
+ return 0;
+}
+
+static void
+tx_teardown(struct vbus_enet_priv *priv)
+{
+ struct ioq *ioq = priv->txq.queue;
+ struct ioq_iterator iter;
+ int ret;
+
+ /* forcefully free all outstanding transmissions */
+ vbus_enet_tx_reap(priv, 1);
+
+ if (!priv->flags.sg)
+ /*
+ * There is nothing else to do for a ring that is not using
+ * scatter-gather
+ */
+ return;
+
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
+ BUG_ON(ret < 0);
+
+ /* seek to position 0 */
+ ret = ioq_iter_seek(&iter, ioq_seek_set, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * free each valid descriptor
+ */
+ while (iter.desc->cookie) {
+ struct venet_sg *vsg = (struct venet_sg *)iter.desc->cookie;
+
+ iter.desc->valid = 0;
+ wmb();
+
+ iter.desc->ptr = 0;
+ iter.desc->cookie = 0;
+
+ ret = ioq_iter_seek(&iter, ioq_seek_next, 0, 0);
+ BUG_ON(ret < 0);
+
+ kfree(vsg);
+ }
+}
+
/*
* Open and close
*/
@@ -402,14 +500,67 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
BUG_ON(ret < 0);
BUG_ON(iter.desc->sown);
- /*
- * We simply put the skb right onto the ring. We will get an interrupt
- * later when the data has been consumed and we can reap the pointers
- * at that time
- */
- iter.desc->cookie = (u64)skb;
- iter.desc->len = (u64)skb->len;
- iter.desc->ptr = (u64)__pa(skb->data);
+ if (priv->flags.sg) {
+ struct venet_sg *vsg = (struct venet_sg *)iter.desc->cookie;
+ struct scatterlist sgl[MAX_SKB_FRAGS+1];
+ struct scatterlist *sg;
+ int count, maxcount = ARRAY_SIZE(sgl);
+
+ sg_init_table(sgl, maxcount);
+
+ memset(vsg, 0, sizeof(*vsg));
+
+ vsg->cookie = (u64)skb;
+ vsg->len = skb->len;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ vsg->flags |= VENET_SG_FLAG_NEEDS_CSUM;
+ vsg->csum.start = skb->csum_start - skb_headroom(skb);
+ vsg->csum.offset = skb->csum_offset;
+ }
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+ vsg->flags |= VENET_SG_FLAG_GSO;
+
+ vsg->gso.hdrlen = skb_transport_header(skb) - skb->data;
+ vsg->gso.size = sinfo->gso_size;
+ if (sinfo->gso_type & SKB_GSO_TCPV4)
+ vsg->gso.type = VENET_GSO_TYPE_TCPV4;
+ else if (sinfo->gso_type & SKB_GSO_TCPV6)
+ vsg->gso.type = VENET_GSO_TYPE_TCPV6;
+ else if (sinfo->gso_type & SKB_GSO_UDP)
+ vsg->gso.type = VENET_GSO_TYPE_UDP;
+ else
+ panic("Virtual-Ethernet: unknown GSO type "
+ "0x%x\n", sinfo->gso_type);
+
+ if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+ vsg->flags |= VENET_SG_FLAG_ECN;
+ }
+
+ count = skb_to_sgvec(skb, sgl, 0, skb->len);
+
+ BUG_ON(count > maxcount);
+
+ for (sg = &sgl[0]; sg; sg = sg_next(sg)) {
+ struct venet_iov *iov = &vsg->iov[vsg->count++];
+
+ iov->len = sg->length;
+ iov->ptr = (u64)sg_phys(sg);
+ }
+
+ } else {
+ /*
+ * non scatter-gather mode: simply put the skb right onto the
+ * ring.
+ */
+ iter.desc->cookie = (u64)skb;
+ iter.desc->len = (u64)skb->len;
+ iter.desc->ptr = (u64)__pa(skb->data);
+ }
+
iter.desc->valid = 1;
priv->dev->stats.tx_packets++;
@@ -465,7 +616,17 @@ vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force)
* owned by the south-side
*/
while (iter.desc->valid && (!iter.desc->sown || force)) {
- struct sk_buff *skb = (struct sk_buff *)iter.desc->cookie;
+ struct sk_buff *skb;
+
+ if (priv->flags.sg) {
+ struct venet_sg *vsg;
+
+ vsg = (struct venet_sg *)iter.desc->cookie;
+ skb = (struct sk_buff *)vsg->cookie;
+
+ } else {
+ skb = (struct sk_buff *)iter.desc->cookie;
+ }
PDEBUG("%lld: completed sending %d bytes\n",
priv->vdev->id, skb->len);
@@ -546,6 +707,47 @@ tx_isr(struct ioq_notifier *notifier)
tasklet_schedule(&priv->txtask);
}
+static int
+vbus_enet_negcap(struct vbus_enet_priv *priv)
+{
+ int ret;
+ struct venet_capabilities caps;
+
+ memset(&caps, 0, sizeof(caps));
+
+ if (sg_enabled) {
+ caps.gid = VENET_CAP_GROUP_SG;
+ caps.bits |= (VENET_CAP_SG|VENET_CAP_TSO4|VENET_CAP_TSO6
+ |VENET_CAP_ECN|VENET_CAP_UFO);
+ }
+
+ ret = devcall(priv, VENET_FUNC_NEGCAP, &caps, sizeof(caps));
+ if (ret < 0)
+ return ret;
+
+ if (caps.bits & VENET_CAP_SG) {
+ priv->flags.sg = true;
+
+ if (caps.bits & VENET_CAP_TSO4)
+ priv->flags.tso = true;
+ if (caps.bits & VENET_CAP_TSO6)
+ priv->flags.tso6 = true;
+ if (caps.bits & VENET_CAP_UFO)
+ priv->flags.ufo = true;
+ if (caps.bits & VENET_CAP_ECN)
+ priv->flags.ecn = true;
+
+ printk(KERN_INFO "VBUSENET %lld: "
+ "Detected GSO features %s%s%s%s\n", priv->vdev->id,
+ priv->flags.tso ? "t" : "-",
+ priv->flags.tso6 ? "T" : "-",
+ priv->flags.ufo ? "u" : "-",
+ priv->flags.ecn ? "e" : "-");
+ }
+
+ return 0;
+}
+
static const struct net_device_ops vbus_enet_netdev_ops = {
.ndo_open = vbus_enet_open,
.ndo_stop = vbus_enet_stop,
@@ -582,12 +784,21 @@ vbus_enet_probe(struct vbus_device_proxy *vdev)
priv->dev = dev;
priv->vdev = vdev;
+ ret = vbus_enet_negcap(priv);
+ if (ret < 0) {
+ printk(KERN_INFO "VENET: Error negotiating capabilities for "
+ "%lld\n",
+ priv->vdev->id);
+ goto out_free;
+ }
+
tasklet_init(&priv->txtask, deferred_tx_isr, (unsigned long)priv);
queue_init(priv, &priv->rxq, VENET_QUEUE_RX, rx_ringlen, rx_isr);
queue_init(priv, &priv->txq, VENET_QUEUE_TX, tx_ringlen, tx_isr);
rx_setup(priv);
+ tx_setup(priv);
ioq_notify_enable(priv->rxq.queue, 0); /* enable interrupts */
ioq_notify_enable(priv->txq.queue, 0);
@@ -607,6 +818,22 @@ vbus_enet_probe(struct vbus_device_proxy *vdev)
dev->features |= NETIF_F_HIGHDMA;
+ if (priv->flags.sg) {
+ dev->features |= NETIF_F_SG|NETIF_F_HW_CSUM|NETIF_F_FRAGLIST;
+
+ if (priv->flags.tso)
+ dev->features |= NETIF_F_TSO;
+
+ if (priv->flags.ufo)
+ dev->features |= NETIF_F_UFO;
+
+ if (priv->flags.tso6)
+ dev->features |= NETIF_F_TSO6;
+
+ if (priv->flags.ecn)
+ dev->features |= NETIF_F_TSO_ECN;
+ }
+
ret = register_netdev(dev);
if (ret < 0) {
printk(KERN_INFO "VENET: error %i registering device \"%s\"\n",
@@ -634,9 +861,9 @@ vbus_enet_remove(struct vbus_device_proxy *vdev)
napi_disable(&priv->napi);
rx_teardown(priv);
- vbus_enet_tx_reap(priv, 1);
-
ioq_put(priv->rxq.queue);
+
+ tx_teardown(priv);
ioq_put(priv->txq.queue);
dev->ops->close(dev, 0);
diff --git a/include/linux/venet.h b/include/linux/venet.h
index ef6b199..1c96b90 100644
--- a/include/linux/venet.h
+++ b/include/linux/venet.h
@@ -35,8 +35,43 @@ struct venet_capabilities {
__u32 bits;
};
-/* CAPABILITIES-GROUP 0 */
-/* #define VENET_CAP_FOO 0 (No capabilities defined yet, for now) */
+#define VENET_CAP_GROUP_SG 0
+
+/* CAPABILITIES-GROUP SG */
+#define VENET_CAP_SG (1 << 0)
+#define VENET_CAP_TSO4 (1 << 1)
+#define VENET_CAP_TSO6 (1 << 2)
+#define VENET_CAP_ECN (1 << 3)
+#define VENET_CAP_UFO (1 << 4)
+
+struct venet_iov {
+ __u32 len;
+ __u64 ptr;
+};
+
+#define VENET_SG_FLAG_NEEDS_CSUM (1 << 0)
+#define VENET_SG_FLAG_GSO (1 << 1)
+#define VENET_SG_FLAG_ECN (1 << 2)
+
+struct venet_sg {
+ __u64 cookie;
+ __u32 flags;
+ __u32 len; /* total length of all iovs */
+ struct {
+ __u16 start; /* csum starting position */
+ __u16 offset; /* offset to place csum */
+ } csum;
+ struct {
+#define VENET_GSO_TYPE_TCPV4 0 /* IPv4 TCP (TSO) */
+#define VENET_GSO_TYPE_UDP 1 /* IPv4 UDP (UFO) */
+#define VENET_GSO_TYPE_TCPV6 2 /* IPv6 TCP */
+ __u8 type;
+ __u16 hdrlen;
+ __u16 size;
+ } gso;
+ __u32 count; /* nr of iovs */
+ struct venet_iov iov[1];
+};
#define VENET_FUNC_LINKUP 0
#define VENET_FUNC_LINKDOWN 1
Signed-off-by: Gregory Haskins <[email protected]>
---
include/linux/venet.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 47 insertions(+), 0 deletions(-)
create mode 100644 include/linux/venet.h
diff --git a/include/linux/venet.h b/include/linux/venet.h
new file mode 100644
index 0000000..ef6b199
--- /dev/null
+++ b/include/linux/venet.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2008 Novell. All Rights Reserved.
+ *
+ * Virtual-Ethernet adapter
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_VENET_H
+#define _LINUX_VENET_H
+
+#define VENET_VERSION 1
+
+#define VENET_TYPE "virtual-ethernet"
+
+#define VENET_QUEUE_RX 0
+#define VENET_QUEUE_TX 1
+
+struct venet_capabilities {
+ __u32 gid;
+ __u32 bits;
+};
+
+/* CAPABILITIES-GROUP 0 */
+/* #define VENET_CAP_FOO 0 (No capabilities defined yet, for now) */
+
+#define VENET_FUNC_LINKUP 0
+#define VENET_FUNC_LINKDOWN 1
+#define VENET_FUNC_MACQUERY 2
+#define VENET_FUNC_NEGCAP 3 /* negotiate capabilities */
+#define VENET_FUNC_FLUSHRX 4
+
+#endif /* _LINUX_VENET_H */
We can map these over VBUS shared memory (or really any shared-memory
architecture if it supports shm-signals) to allow asynchronous
communication between two end-points. Memory is synchronized using
pure barriers (i.e. lockless), so IOQs are friendly in many contexts,
even if the memory is remote.
Signed-off-by: Gregory Haskins <[email protected]>
---
include/linux/ioq.h | 410 +++++++++++++++++++++++++++++++++++++++++++++++++++
lib/Kconfig | 12 +
lib/Makefile | 1
lib/ioq.c | 298 +++++++++++++++++++++++++++++++++++++
4 files changed, 721 insertions(+), 0 deletions(-)
create mode 100644 include/linux/ioq.h
create mode 100644 lib/ioq.c
diff --git a/include/linux/ioq.h b/include/linux/ioq.h
new file mode 100644
index 0000000..d450d9a
--- /dev/null
+++ b/include/linux/ioq.h
@@ -0,0 +1,410 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * IOQ is a generic shared-memory, lockless queue mechanism. It can be used
+ * in a variety of ways, though its intended purpose is to become the
+ * asynchronous communication path for virtual-bus drivers.
+ *
+ * The following are a list of key design points:
+ *
+ * #) All shared-memory is always allocated on exactly one side of the
+ * link. This typically would be the guest side in a VM/VMM scenario.
+ * #) Each IOQ has the concept of "north" and "south" locales, where
+ * north denotes the memory-owner side (e.g. guest).
+ * #) An IOQ is manipulated using an iterator idiom.
+ * #) Provides a bi-directional signaling/notification infrastructure on
+ * a per-queue basis, which includes an event mitigation strategy
+ * to reduce boundary switching.
+ * #) The signaling path is abstracted so that various technologies and
+ * topologies can define their own specific implementation while sharing
+ * the basic structures and code.
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_IOQ_H
+#define _LINUX_IOQ_H
+
+#include <asm/types.h>
+#include <linux/shm_signal.h>
+
+/*
+ *---------
+ * The following structures represent data that is shared across boundaries
+ * which may be quite disparate from one another (e.g. Windows vs Linux,
+ * 32 vs 64 bit, etc). Therefore, care has been taken to make sure they
+ * present data in a manner that is independent of the environment.
+ *-----------
+ */
+struct ioq_ring_desc {
+ __u64 cookie; /* for arbitrary use by north-side */
+ __u64 ptr;
+ __u64 len;
+ __u8 valid;
+ __u8 sown; /* South owned = 1, North owned = 0 */
+};
+
+#define IOQ_RING_MAGIC 0x47fa2fe4
+#define IOQ_RING_VER 4
+
+struct ioq_ring_idx {
+ __u32 head; /* 0 based index to head of ptr array */
+ __u32 tail; /* 0 based index to tail of ptr array */
+ __u8 full;
+};
+
+enum ioq_locality {
+ ioq_locality_north,
+ ioq_locality_south,
+};
+
+struct ioq_ring_head {
+ __u32 magic;
+ __u32 ver;
+ struct shm_signal_desc signal;
+ struct ioq_ring_idx idx[2];
+ __u32 count;
+ struct ioq_ring_desc ring[1]; /* "count" elements will be allocated */
+};
+
+#define IOQ_HEAD_DESC_SIZE(count) \
+ (sizeof(struct ioq_ring_head) + sizeof(struct ioq_ring_desc) * ((count) - 1))
+
+/* --- END SHARED STRUCTURES --- */
+
+#ifdef __KERNEL__
+
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/interrupt.h>
+#include <linux/shm_signal.h>
+#include <asm/atomic.h>
+
+enum ioq_idx_type {
+ ioq_idxtype_valid,
+ ioq_idxtype_inuse,
+ ioq_idxtype_both,
+ ioq_idxtype_invalid,
+};
+
+enum ioq_seek_type {
+ ioq_seek_tail,
+ ioq_seek_next,
+ ioq_seek_head,
+ ioq_seek_set
+};
+
+struct ioq_iterator {
+ struct ioq *ioq;
+ struct ioq_ring_idx *idx;
+ u32 pos;
+ struct ioq_ring_desc *desc;
+ int update:1;
+ int dualidx:1;
+ int flipowner:1;
+};
+
+struct ioq_notifier {
+ void (*signal)(struct ioq_notifier *);
+};
+
+struct ioq_ops {
+ void (*release)(struct ioq *ioq);
+};
+
+struct ioq {
+ struct ioq_ops *ops;
+
+ atomic_t refs;
+ enum ioq_locality locale;
+ struct ioq_ring_head *head_desc;
+ struct ioq_ring_desc *ring;
+ struct shm_signal *signal;
+ wait_queue_head_t wq;
+ struct ioq_notifier *notifier;
+ size_t count;
+ struct shm_signal_notifier shm_notifier;
+};
+
+#define IOQ_ITER_AUTOUPDATE (1 << 0)
+#define IOQ_ITER_NOFLIPOWNER (1 << 1)
+
+/**
+ * ioq_init() - initialize an IOQ
+ * @ioq: IOQ context
+ * @ops: ops vector (provides the release() callback)
+ * @locale: locality of this end-point (north = memory owner, south = remote)
+ * @head: pointer to the shared ring-head descriptor
+ * @signal: shm_signal to use for cross-locale notifications
+ * @count: number of descriptors in the ring
+ *
+ * Initializes the IOQ context before first use.
+ *
+ **/
+void ioq_init(struct ioq *ioq,
+ struct ioq_ops *ops,
+ enum ioq_locality locale,
+ struct ioq_ring_head *head,
+ struct shm_signal *signal,
+ size_t count);
+
+/**
+ * ioq_get() - acquire an IOQ context reference
+ * @ioq: IOQ context
+ *
+ **/
+static inline struct ioq *ioq_get(struct ioq *ioq)
+{
+ atomic_inc(&ioq->refs);
+
+ return ioq;
+}
+
+/**
+ * ioq_put() - release an IOQ context reference
+ * @ioq: IOQ context
+ *
+ **/
+static inline void ioq_put(struct ioq *ioq)
+{
+ if (atomic_dec_and_test(&ioq->refs)) {
+ shm_signal_put(ioq->signal);
+ ioq->ops->release(ioq);
+ }
+}
+
+/**
+ * ioq_notify_enable() - enables local notifications on an IOQ
+ * @ioq: IOQ context
+ * @flags: Reserved for future use, must be 0
+ *
+ * Enables/unmasks the registered ioq_notifier (if applicable) and waitq to
+ * receive wakeups whenever the remote side performs an ioq_signal() operation.
+ * A notification will be dispatched immediately if any pending signals have
+ * already been issued prior to invoking this call.
+ *
+ * This is synonymous with unmasking an interrupt.
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+static inline int ioq_notify_enable(struct ioq *ioq, int flags)
+{
+ return shm_signal_enable(ioq->signal, 0);
+}
+
+/**
+ * ioq_notify_disable() - disable local notifications on an IOQ
+ * @ioq: IOQ context
+ * @flags: Reserved for future use, must be 0
+ *
+ * Disables/masks the registered ioq_notifier (if applicable) and waitq
+ * from receiving any further notifications. Any subsequent calls to
+ * ioq_signal() by the remote side will update the ring as dirty, but
+ * will not traverse the locale boundary and will not invoke the notifier
+ * callback or wakeup the waitq. Signals delivered while masked will
+ * be deferred until ioq_notify_enable() is invoked.
+ *
+ * This is synonymous with masking an interrupt.
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+static inline int ioq_notify_disable(struct ioq *ioq, int flags)
+{
+ return shm_signal_disable(ioq->signal, 0);
+}
+
+/**
+ * ioq_signal() - notify the remote side about ring changes
+ * @ioq: IOQ context
+ * @flags: Reserved for future use, must be 0
+ *
+ * Marks the ring state as "dirty" and, if enabled, will traverse
+ * a locale boundary to invoke a remote notification. The remote
+ * side controls whether the notification should be delivered via
+ * the ioq_notify_enable/disable() interface.
+ *
+ * The specifics of how to traverse a locale boundary are abstracted
+ * by the ioq_ops->signal() interface and provided by a particular
+ * implementation. However, typically going north to south would be
+ * something like a syscall/hypercall, and going south to north would be
+ * something like a posix-signal/guest-interrupt.
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+static inline int ioq_signal(struct ioq *ioq, int flags)
+{
+ return shm_signal_inject(ioq->signal, 0);
+}
+
+/**
+ * ioq_count() - counts the number of outstanding descriptors in an index
+ * @ioq: IOQ context
+ * @type: Specifies the index type
+ * (*) valid: the descriptor is valid. This is usually
+ * used to keep track of descriptors that may not
+ * be carrying a useful payload, but still need to
+ * be tracked carefully.
+ * (*) inuse: Descriptors that carry useful payload
+ *
+ * Returns:
+ * (*) >=0: # of descriptors outstanding in the index
+ * (*) <0 = ERRNO
+ *
+ **/
+int ioq_count(struct ioq *ioq, enum ioq_idx_type type);
+
+/**
+ * ioq_remain() - counts the number of remaining descriptors in an index
+ * @ioq: IOQ context
+ * @type: Specifies the index type
+ * (*) valid: the descriptor is valid. This is usually
+ * used to keep track of descriptors that may not
+ * be carrying a useful payload, but still need to
+ * be tracked carefully.
+ * (*) inuse: Descriptors that carry useful payload
+ *
+ * This is the converse of ioq_count(). This function returns the number
+ * of "free" descriptors left in a particular index
+ *
+ * Returns:
+ * (*) >=0: # of descriptors remaining in the index
+ * (*) <0 = ERRNO
+ *
+ **/
+int ioq_remain(struct ioq *ioq, enum ioq_idx_type type);
+
+/**
+ * ioq_size() - counts the maximum number of descriptors in a ring
+ * @ioq: IOQ context
+ *
+ * This function returns the maximum number of descriptors supported in
+ * a ring, regardless of their current state (free or inuse).
+ *
+ * Returns:
+ * (*) >=0: total # of descriptors in the ring
+ * (*) <0 = ERRNO
+ *
+ **/
+int ioq_size(struct ioq *ioq);
+
+/**
+ * ioq_full() - determines if a specific index is "full"
+ * @ioq: IOQ context
+ * @type: Specifies the index type
+ * (*) valid: the descriptor is valid. This is usually
+ * used to keep track of descriptors that may not
+ * be carrying a useful payload, but still need to
+ * be tracked carefully.
+ * (*) inuse: Descriptors that carry useful payload
+ *
+ * Returns:
+ * (*) 0: index is not full
+ * (*) 1: index is full
+ * (*) <0 = ERRNO
+ *
+ **/
+int ioq_full(struct ioq *ioq, enum ioq_idx_type type);
+
+/**
+ * ioq_empty() - determines if a specific index is "empty"
+ * @ioq: IOQ context
+ * @type: Specifies the index type
+ * (*) valid: the descriptor is valid. This is usually
+ * used to keep track of descriptors that may not
+ * be carrying a useful payload, but still need to
+ * be tracked carefully.
+ * (*) inuse: Descriptors that carry useful payload
+ *
+ * Returns:
+ * (*) 0: index is not empty
+ * (*) 1: index is empty
+ * (*) <0 = ERRNO
+ *
+ **/
+static inline int ioq_empty(struct ioq *ioq, enum ioq_idx_type type)
+{
+ return !ioq_count(ioq, type);
+}
+
+/**
+ * ioq_iter_init() - initialize an iterator for IOQ descriptor traversal
+ * @ioq: IOQ context to iterate on
+ * @iter: Iterator context to init (usually from stack)
+ * @type: Specifies the index type to iterate against
+ * (*) valid: iterate against the "valid" index
+ * (*) inuse: iterate against the "inuse" index
+ * (*) both: iterate against both indexes simultaneously
+ * @flags: Bitfield with 0 or more bits set to alter behavior
+ * (*) autoupdate: automatically signal the remote side
+ * whenever the iterator pushes/pops to a new desc
+ * (*) noflipowner: do not flip the ownership bit during
+ * a push/pop operation
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int ioq_iter_init(struct ioq *ioq, struct ioq_iterator *iter,
+ enum ioq_idx_type type, int flags);
+
+/**
+ * ioq_iter_seek() - seek to a specific location in the IOQ ring
+ * @iter: Iterator context (must be initialized with ioq_iter_init)
+ * @type: Specifies the type of seek operation
+ * (*) tail: seek to the absolute tail, offset is ignored
+ * (*) next: seek to the relative next, offset is ignored
+ * (*) head: seek to the absolute head, offset is ignored
+ * (*) set: seek to the absolute offset
+ * @offset: Offset for ioq_seek_set operations
+ * @flags: Reserved for future use, must be 0
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int ioq_iter_seek(struct ioq_iterator *iter, enum ioq_seek_type type,
+ long offset, int flags);
+
+/**
+ * ioq_iter_push() - push the tail pointer forward
+ * @iter: Iterator context (must be initialized with ioq_iter_init)
+ * @flags: Reserved for future use, must be 0
+ *
+ * This function will simultaneously advance the tail ptr in the current
+ * index (valid/inuse, as specified in the ioq_iter_init) as well as
+ * perform a seek(next) operation. This effectively "pushes" a new pointer
+ * onto the tail of the index.
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int ioq_iter_push(struct ioq_iterator *iter, int flags);
+
+/**
+ * ioq_iter_pop() - pop the head pointer from the ring
+ * @iter: Iterator context (must be initialized with ioq_iter_init)
+ * @flags: Reserved for future use, must be 0
+ *
+ * This function will simultaneously advance the head ptr in the current
+ * index (valid/inuse, as specified in the ioq_iter_init) as well as
+ * perform a seek(next) operation. This effectively "pops" a pointer
+ * from the head of the index.
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int ioq_iter_pop(struct ioq_iterator *iter, int flags);
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_IOQ_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 32d82fe..1e66f8e 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -183,5 +183,17 @@ config SHM_SIGNAL
If unsure, say N
+config IOQ
+ boolean "IO-Queue library - Generic shared-memory queue"
+ select SHM_SIGNAL
+ default n
+ help
+ IOQ is a generic shared-memory-queue mechanism that happens to be
+ friendly to virtualization boundaries. It can be used in a variety
+ of ways, though its intended purpose is to become the low-level
+ communication path for paravirtualized drivers.
+
+ If unsure, say N
+
endmenu
diff --git a/lib/Makefile b/lib/Makefile
index bc36327..98cd332 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -72,6 +72,7 @@ obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
obj-$(CONFIG_SMP) += percpu_counter.o
obj-$(CONFIG_AUDIT_GENERIC) += audit.o
obj-$(CONFIG_SHM_SIGNAL) += shm_signal.o
+obj-$(CONFIG_IOQ) += ioq.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
diff --git a/lib/ioq.c b/lib/ioq.c
new file mode 100644
index 0000000..803b5d6
--- /dev/null
+++ b/lib/ioq.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright 2008 Novell. All Rights Reserved.
+ *
+ * See include/linux/ioq.h for documentation
+ *
+ * Author:
+ * Gregory Haskins <[email protected]>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/ioq.h>
+#include <asm/bitops.h>
+#include <linux/module.h>
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+static int ioq_iter_setpos(struct ioq_iterator *iter, u32 pos)
+{
+ struct ioq *ioq = iter->ioq;
+
+ BUG_ON(pos >= ioq->count);
+
+ iter->pos = pos;
+ iter->desc = &ioq->ring[pos];
+
+ return 0;
+}
+
+static inline u32 modulo_inc(u32 val, u32 mod)
+{
+ BUG_ON(val >= mod);
+
+ if (val == (mod - 1))
+ return 0;
+
+ return val + 1;
+}
+
+static inline int idx_full(struct ioq_ring_idx *idx)
+{
+ return idx->full && (idx->head == idx->tail);
+}
+
+int ioq_iter_seek(struct ioq_iterator *iter, enum ioq_seek_type type,
+ long offset, int flags)
+{
+ struct ioq_ring_idx *idx = iter->idx;
+ u32 pos;
+
+ switch (type) {
+ case ioq_seek_next:
+ pos = modulo_inc(iter->pos, iter->ioq->count);
+ break;
+ case ioq_seek_tail:
+ pos = idx->tail;
+ break;
+ case ioq_seek_head:
+ pos = idx->head;
+ break;
+ case ioq_seek_set:
+ if (offset >= iter->ioq->count)
+ return -EINVAL;
+ pos = offset;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return ioq_iter_setpos(iter, pos);
+}
+EXPORT_SYMBOL_GPL(ioq_iter_seek);
+
+static int ioq_ring_count(struct ioq_ring_idx *idx, int count)
+{
+ if (idx->full && (idx->head == idx->tail))
+ return count;
+ else if (idx->tail >= idx->head)
+ return idx->tail - idx->head;
+ else
+ return (idx->tail + count) - idx->head;
+}
+
+static void idx_tail_push(struct ioq_ring_idx *idx, int count)
+{
+ u32 tail = modulo_inc(idx->tail, count);
+
+ if (idx->head == tail) {
+ rmb();
+
+ /*
+ * Setting full here may look racy, but note that we haven't
+ * flipped the owner bit yet. So it is impossible for the
+ * remote locale to move head in such a way that this operation
+ * becomes invalid
+ */
+ idx->full = 1;
+ wmb();
+ }
+
+ idx->tail = tail;
+}
+
+int ioq_iter_push(struct ioq_iterator *iter, int flags)
+{
+ struct ioq_ring_head *head_desc = iter->ioq->head_desc;
+ struct ioq_ring_idx *idx = iter->idx;
+ int ret;
+
+ /*
+ * It's only valid to push if we are currently pointed at the tail
+ */
+ if (iter->pos != idx->tail || iter->desc->sown != iter->ioq->locale)
+ return -EINVAL;
+
+ idx_tail_push(idx, iter->ioq->count);
+ if (iter->dualidx) {
+ idx_tail_push(&head_desc->idx[ioq_idxtype_inuse],
+ iter->ioq->count);
+ if (head_desc->idx[ioq_idxtype_inuse].tail !=
+ head_desc->idx[ioq_idxtype_valid].tail) {
+ SHM_SIGNAL_FAULT(iter->ioq->signal,
+ "Tails not synchronized");
+ return -EINVAL;
+ }
+ }
+
+ wmb(); /* the index must be visible before the sown, or signal */
+
+ if (iter->flipowner) {
+ iter->desc->sown = !iter->ioq->locale;
+ wmb(); /* sown must be visible before we signal */
+ }
+
+ ret = ioq_iter_seek(iter, ioq_seek_next, 0, flags);
+
+ if (iter->update)
+ ioq_signal(iter->ioq, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ioq_iter_push);
+
+int ioq_iter_pop(struct ioq_iterator *iter, int flags)
+{
+ struct ioq_ring_idx *idx = iter->idx;
+ int full;
+ int ret;
+
+ /*
+ * It's only valid to pop if we are currently pointed at the head
+ */
+ if (iter->pos != idx->head || iter->desc->sown != iter->ioq->locale)
+ return -EINVAL;
+
+ full = idx_full(idx);
+ rmb();
+
+ idx->head = modulo_inc(idx->head, iter->ioq->count);
+ wmb(); /* head must be visible before full */
+
+ if (full) {
+ idx->full = 0;
+ wmb(); /* full must be visible before sown */
+ }
+
+ if (iter->flipowner) {
+ iter->desc->sown = !iter->ioq->locale;
+ wmb(); /* sown must be visible before we signal */
+ }
+
+ ret = ioq_iter_seek(iter, ioq_seek_next, 0, flags);
+
+ if (iter->update)
+ ioq_signal(iter->ioq, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ioq_iter_pop);
+
+static struct ioq_ring_idx *idxtype_to_idx(struct ioq *ioq,
+ enum ioq_idx_type type)
+{
+ struct ioq_ring_idx *idx;
+
+ switch (type) {
+ case ioq_idxtype_valid:
+ case ioq_idxtype_inuse:
+ idx = &ioq->head_desc->idx[type];
+ break;
+ default:
+ panic("IOQ: illegal index type: %d", type);
+ break;
+ }
+
+ return idx;
+}
+
+int ioq_iter_init(struct ioq *ioq, struct ioq_iterator *iter,
+ enum ioq_idx_type type, int flags)
+{
+ iter->ioq = ioq;
+ iter->update = (flags & IOQ_ITER_AUTOUPDATE);
+ iter->flipowner = !(flags & IOQ_ITER_NOFLIPOWNER);
+ iter->pos = -1;
+ iter->desc = NULL;
+ iter->dualidx = 0;
+
+ if (type == ioq_idxtype_both) {
+ /*
+ * "both" is a special case, so we set the dualidx flag.
+ *
+ * However, we also just want to use the valid-index
+ * for normal processing, so override that here
+ */
+ type = ioq_idxtype_valid;
+ iter->dualidx = 1;
+ }
+
+ iter->idx = idxtype_to_idx(ioq, type);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ioq_iter_init);
+
+int ioq_count(struct ioq *ioq, enum ioq_idx_type type)
+{
+ return ioq_ring_count(idxtype_to_idx(ioq, type), ioq->count);
+}
+EXPORT_SYMBOL_GPL(ioq_count);
+
+int ioq_remain(struct ioq *ioq, enum ioq_idx_type type)
+{
+ int count = ioq_ring_count(idxtype_to_idx(ioq, type), ioq->count);
+
+ return ioq->count - count;
+}
+EXPORT_SYMBOL_GPL(ioq_remain);
+
+int ioq_size(struct ioq *ioq)
+{
+ return ioq->count;
+}
+EXPORT_SYMBOL_GPL(ioq_size);
+
+int ioq_full(struct ioq *ioq, enum ioq_idx_type type)
+{
+ struct ioq_ring_idx *idx = idxtype_to_idx(ioq, type);
+
+ return idx_full(idx);
+}
+EXPORT_SYMBOL_GPL(ioq_full);
+
+static void ioq_shm_signal(struct shm_signal_notifier *notifier)
+{
+ struct ioq *ioq = container_of(notifier, struct ioq, shm_notifier);
+
+ wake_up(&ioq->wq);
+ if (ioq->notifier)
+ ioq->notifier->signal(ioq->notifier);
+}
+
+void ioq_init(struct ioq *ioq,
+ struct ioq_ops *ops,
+ enum ioq_locality locale,
+ struct ioq_ring_head *head,
+ struct shm_signal *signal,
+ size_t count)
+{
+ memset(ioq, 0, sizeof(*ioq));
+ atomic_set(&ioq->refs, 1);
+ init_waitqueue_head(&ioq->wq);
+
+ ioq->ops = ops;
+ ioq->locale = locale;
+ ioq->head_desc = head;
+ ioq->ring = &head->ring[0];
+ ioq->count = count;
+ ioq->signal = signal;
+
+ ioq->shm_notifier.signal = &ioq_shm_signal;
+ signal->notifier = &ioq->shm_notifier;
+}
+EXPORT_SYMBOL_GPL(ioq_init);
On Thu, 09 Apr 2009 12:31:29 -0400
Gregory Haskins <[email protected]> wrote:
> Signed-off-by: Gregory Haskins <[email protected]>
> ---
>
> drivers/net/Kconfig | 13 +
> drivers/net/Makefile | 1
> drivers/net/vbus-enet.c | 680 +++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 694 insertions(+), 0 deletions(-)
> create mode 100644 drivers/net/vbus-enet.c
>
> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
> index 62d732a..ac9dabd 100644
> --- a/drivers/net/Kconfig
> +++ b/drivers/net/Kconfig
> @@ -3099,4 +3099,17 @@ config VIRTIO_NET
> This is the virtual network driver for virtio. It can be used with
> lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
>
> +config VBUS_ENET
> + tristate "Virtual Ethernet Driver"
> + depends on VBUS_DRIVERS
> + help
> + A virtualized 802.x network device based on the VBUS interface.
> + It can be used with any hypervisor/kernel that supports the
> + vbus protocol.
> +
> +config VBUS_ENET_DEBUG
> + bool "Enable Debugging"
> + depends on VBUS_ENET
> + default n
> +
> endif # NETDEVICES
> diff --git a/drivers/net/Makefile b/drivers/net/Makefile
> index 471baaf..61db928 100644
> --- a/drivers/net/Makefile
> +++ b/drivers/net/Makefile
> @@ -264,6 +264,7 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
> obj-$(CONFIG_NETXEN_NIC) += netxen/
> obj-$(CONFIG_NIU) += niu.o
> obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
> +obj-$(CONFIG_VBUS_ENET) += vbus-enet.o
> obj-$(CONFIG_SFC) += sfc/
>
> obj-$(CONFIG_WIMAX) += wimax/
> diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
> new file mode 100644
> index 0000000..3779f77
> --- /dev/null
> +++ b/drivers/net/vbus-enet.c
> @@ -0,0 +1,680 @@
> +/*
> + * vbus_enet - A virtualized 802.x network device based on the VBUS interface
> + *
> + * Copyright (C) 2009 Novell, Gregory Haskins <[email protected]>
> + *
> + * Derived from the SNULL example from the book "Linux Device Drivers" by
> + * Alessandro Rubini, Jonathan Corbet, and Greg Kroah-Hartman, published
> + * by O'Reilly & Associates.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <linux/moduleparam.h>
> +
> +#include <linux/sched.h>
> +#include <linux/kernel.h>
> +#include <linux/slab.h>
> +#include <linux/errno.h>
> +#include <linux/types.h>
> +#include <linux/interrupt.h>
> +
> +#include <linux/in.h>
> +#include <linux/netdevice.h>
> +#include <linux/etherdevice.h>
> +#include <linux/ip.h>
> +#include <linux/tcp.h>
> +#include <linux/skbuff.h>
> +#include <linux/ioq.h>
> +#include <linux/vbus_driver.h>
> +
> +#include <linux/in6.h>
> +#include <asm/checksum.h>
> +
> +#include <linux/venet.h>
> +
> +MODULE_AUTHOR("Gregory Haskins");
> +MODULE_LICENSE("GPL");
MODULE_DESCRIPTION ?
MODULE_VERSION ?
> +static int napi_weight = 128;
> +module_param(napi_weight, int, 0444);
Already accessible through sysfs
> +static int rx_ringlen = 256;
> +module_param(rx_ringlen, int, 0444);
An API for ring length already exists via ethtool. If you used that,
there would be no need for a device-specific module parameter.
> +static int tx_ringlen = 256;
> +module_param(tx_ringlen, int, 0444);
> +
> +#undef PDEBUG /* undef it, just in case */
> +#ifdef CONFIG_VBUS_ENET_DEBUG
> +# define PDEBUG(fmt, args...) printk(KERN_DEBUG "vbus_enet: " fmt, ## args)
> +#else
> +# define PDEBUG(fmt, args...) /* not debugging: nothing */
> +#endif
Why reinvent pr_debug()?
> +
> +struct vbus_enet_queue {
> + struct ioq *queue;
> + struct ioq_notifier notifier;
> +};
> +
> +struct vbus_enet_priv {
> + spinlock_t lock;
> + struct net_device *dev;
> + struct vbus_device_proxy *vdev;
> + struct napi_struct napi;
> + struct vbus_enet_queue rxq;
> + struct vbus_enet_queue txq;
> + struct tasklet_struct txtask;
> +};
> +
> +static struct vbus_enet_priv *
> +napi_to_priv(struct napi_struct *napi)
> +{
> + return container_of(napi, struct vbus_enet_priv, napi);
> +}
> +
> +static int
> +queue_init(struct vbus_enet_priv *priv,
> + struct vbus_enet_queue *q,
> + int qid,
> + size_t ringsize,
> + void (*func)(struct ioq_notifier *))
> +{
> + struct vbus_device_proxy *dev = priv->vdev;
> + int ret;
> +
> + ret = vbus_driver_ioq_alloc(dev, qid, 0, ringsize, &q->queue);
> + if (ret < 0)
> + panic("ioq_alloc failed: %d\n", ret);
> +
> + if (func) {
> + q->notifier.signal = func;
> + q->queue->notifier = &q->notifier;
> + }
> +
> + return 0;
> +}
> +
> +static int
> +devcall(struct vbus_enet_priv *priv, u32 func, void *data, size_t len)
> +{
> + struct vbus_device_proxy *dev = priv->vdev;
> +
> + return dev->ops->call(dev, func, data, len, 0);
> +}
> +
> +/*
> + * ---------------
> + * rx descriptors
> + * ---------------
> + */
> +
> +static void
> +rxdesc_alloc(struct ioq_ring_desc *desc, size_t len)
> +{
> + struct sk_buff *skb;
> +
> + len += ETH_HLEN;
> +
> + skb = dev_alloc_skb(len + 2);
> + BUG_ON(!skb);
> +
> + skb_reserve(skb, 2); /* align IP on 16B boundary */
Use NET_IP_ALIGN rather than 2.
Use netdev_alloc_skb() because it is NUMA aware.
> +
> + desc->cookie = (u64)skb;
> + desc->ptr = (u64)__pa(skb->data);
> + desc->len = len; /* total length */
> + desc->valid = 1;
> +}
> +
> +static void
> +rx_setup(struct vbus_enet_priv *priv)
> +{
> + struct ioq *ioq = priv->rxq.queue;
> + struct ioq_iterator iter;
> + int ret;
> +
> + /*
> + * We want to iterate on the "valid" index. By default the iterator
> + * will not "autoupdate" which means it will not hypercall the host
> + * with our changes. This is good, because we are really just
> + * initializing stuff here anyway. Note that you can always manually
> + * signal the host with ioq_signal() if the autoupdate feature is not
> + * used.
> + */
> + ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
> + BUG_ON(ret < 0);
Why not do proper initialization error handling, i.e. fail the
attempt to bring the device up with an error code (-ENOMEM)...
> + /*
> + * Seek to the tail of the valid index (which should be our first
> + * item, since the queue is brand-new)
> + */
> + ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
> + BUG_ON(ret < 0);
> +
> + /*
> + * Now populate each descriptor with an empty SKB and mark it valid
> + */
> + while (!iter.desc->valid) {
> + rxdesc_alloc(iter.desc, priv->dev->mtu);
> +
> + /*
> + * This push operation will simultaneously advance the
> + * valid-head index and increment our position in the queue
> + * by one.
> + */
> + ret = ioq_iter_push(&iter, 0);
> + BUG_ON(ret < 0);
> + }
> +}
> +
> +static void
> +rx_teardown(struct vbus_enet_priv *priv)
> +{
> + struct ioq *ioq = priv->rxq.queue;
> + struct ioq_iterator iter;
> + int ret;
> +
> + ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
> + BUG_ON(ret < 0);
> +
> + ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
> + BUG_ON(ret < 0);
> +
> + /*
> + * free each valid descriptor
> + */
> + while (iter.desc->valid) {
> + struct sk_buff *skb = (struct sk_buff *)iter.desc->cookie;
> +
> + iter.desc->valid = 0;
> + wmb();
> +
> + iter.desc->ptr = 0;
> + iter.desc->cookie = 0;
> +
> + ret = ioq_iter_pop(&iter, 0);
> + BUG_ON(ret < 0);
> +
> + dev_kfree_skb(skb);
> + }
> +}
> +
> +/*
> + * Open and close
> + */
> +
> +static int
> +vbus_enet_open(struct net_device *dev)
> +{
> + struct vbus_enet_priv *priv = netdev_priv(dev);
> + int ret;
> +
> + ret = devcall(priv, VENET_FUNC_LINKUP, NULL, 0);
> + BUG_ON(ret < 0);
> +
> + napi_enable(&priv->napi);
> +
> + return 0;
> +}
> +
> +static int
> +vbus_enet_stop(struct net_device *dev)
> +{
> + struct vbus_enet_priv *priv = netdev_priv(dev);
> + int ret;
> +
> + napi_disable(&priv->napi);
> +
> + ret = devcall(priv, VENET_FUNC_LINKDOWN, NULL, 0);
> + BUG_ON(ret < 0);
> +
> + return 0;
> +}
> +
> +/*
> + * Configuration changes (passed on by ifconfig)
> + */
> +static int
> +vbus_enet_config(struct net_device *dev, struct ifmap *map)
> +{
> + if (dev->flags & IFF_UP) /* can't act on a running interface */
> + return -EBUSY;
> +
> + /* Don't allow changing the I/O address */
> + if (map->base_addr != dev->base_addr) {
> + printk(KERN_WARNING "vbus_enet: Can't change I/O address\n");
> + return -EOPNOTSUPP;
> + }
> +
> + /* ignore other fields */
> + return 0;
> +}
> +
> +static void
> +vbus_enet_schedule_rx(struct vbus_enet_priv *priv)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&priv->lock, flags);
> +
> + if (netif_rx_schedule_prep(&priv->napi)) {
> + /* Disable further interrupts */
> + ioq_notify_disable(priv->rxq.queue, 0);
> + __netif_rx_schedule(&priv->napi);
> + }
> +
> + spin_unlock_irqrestore(&priv->lock, flags);
> +}
> +
> +static int
> +vbus_enet_change_mtu(struct net_device *dev, int new_mtu)
> +{
> + struct vbus_enet_priv *priv = netdev_priv(dev);
> + int ret;
> +
> + dev->mtu = new_mtu;
> +
> + /*
> + * FLUSHRX will cause the device to flush any outstanding
> + * RX buffers. They will appear to come in as 0 length
> + * packets which we can simply discard and replace with new_mtu
> + * buffers for the future.
> + */
> + ret = devcall(priv, VENET_FUNC_FLUSHRX, NULL, 0);
> + BUG_ON(ret < 0);
> +
> + vbus_enet_schedule_rx(priv);
> +
> + return 0;
> +}
> +
> +/*
> + * The poll implementation.
> + */
> +static int
> +vbus_enet_poll(struct napi_struct *napi, int budget)
> +{
> + struct vbus_enet_priv *priv = napi_to_priv(napi);
> + int npackets = 0;
> + struct ioq_iterator iter;
> + int ret;
> +
> + PDEBUG("%lld: polling...\n", priv->vdev->id);
> +
> + /* We want to iterate on the head of the in-use index */
> + ret = ioq_iter_init(priv->rxq.queue, &iter, ioq_idxtype_inuse,
> + IOQ_ITER_AUTOUPDATE);
> + BUG_ON(ret < 0);
> +
> + ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
> + BUG_ON(ret < 0);
> +
> + /*
> + * We stop if we have met the quota or there are no more packets.
> + * The EOM is indicated by finding a packet that is still owned by
> + * the south side
> + */
> + while ((npackets < budget) && (!iter.desc->sown)) {
> + struct sk_buff *skb = (struct sk_buff *)iter.desc->cookie;
> +
> + if (iter.desc->len) {
> + skb_put(skb, iter.desc->len);
> +
> + /* Maintain stats */
> + npackets++;
> + priv->dev->stats.rx_packets++;
> + priv->dev->stats.rx_bytes += iter.desc->len;
> +
> + /* Pass the buffer up to the stack */
> + skb->dev = priv->dev;
> + skb->protocol = eth_type_trans(skb, priv->dev);
> + netif_receive_skb(skb);
> +
> + mb();
> + } else
> + /*
> + * the device may send a zero-length packet when it's
> + * flushing references on the ring. We can just drop
> + * these on the floor
> + */
> + dev_kfree_skb(skb);
> +
> + /* Grab a new buffer to put in the ring */
> + rxdesc_alloc(iter.desc, priv->dev->mtu);
> +
> + /* Advance the in-use tail */
> + ret = ioq_iter_pop(&iter, 0);
> + BUG_ON(ret < 0);
> + }
> +
> + PDEBUG("%lld poll: %d packets received\n", priv->vdev->id, npackets);
> +
> + /*
> + * If we processed all packets, we're done; tell the kernel and
> + * reenable ints
> + */
> + if (ioq_empty(priv->rxq.queue, ioq_idxtype_inuse)) {
> + netif_rx_complete(napi);
> + ioq_notify_enable(priv->rxq.queue, 0);
> + ret = 0;
> + } else
> + /* We couldn't process everything. */
> + ret = 1;
> +
> + return ret;
> +}
> +
> +/*
> + * Transmit a packet (called by the kernel)
> + */
> +static int
> +vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
> +{
> + struct vbus_enet_priv *priv = netdev_priv(dev);
> + struct ioq_iterator iter;
> + int ret;
> + unsigned long flags;
> +
> + PDEBUG("%lld: sending %d bytes\n", priv->vdev->id, skb->len);
> +
> + spin_lock_irqsave(&priv->lock, flags);
> +
> + if (ioq_full(priv->txq.queue, ioq_idxtype_valid)) {
> + /*
> + * We must flow-control the kernel by disabling the
> + * queue
> + */
> + spin_unlock_irqrestore(&priv->lock, flags);
> + netif_stop_queue(dev);
> + printk(KERN_ERR "VBUS_ENET: tx on full queue bug " \
> + "on device %lld\n", priv->vdev->id);
> + return 1;
> + }
> +
> + /*
> + * We want to iterate on the tail of both the "inuse" and "valid" index
> + * so we specify the "both" index
> + */
> + ret = ioq_iter_init(priv->txq.queue, &iter, ioq_idxtype_both,
> + IOQ_ITER_AUTOUPDATE);
> + BUG_ON(ret < 0);
> +
> + ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
> + BUG_ON(ret < 0);
> + BUG_ON(iter.desc->sown);
> +
> + /*
> + * We simply put the skb right onto the ring. We will get an interrupt
> + * later when the data has been consumed and we can reap the pointers
> + * at that time
> + */
> + iter.desc->cookie = (u64)skb;
> + iter.desc->len = (u64)skb->len;
> + iter.desc->ptr = (u64)__pa(skb->data);
> + iter.desc->valid = 1;
> +
> + priv->dev->stats.tx_packets++;
> + priv->dev->stats.tx_bytes += skb->len;
> +
> + /*
> + * This advances both indexes together implicitly, and then
> + * signals the south side to consume the packet
> + */
> + ret = ioq_iter_push(&iter, 0);
> + BUG_ON(ret < 0);
> +
> + dev->trans_start = jiffies; /* save the timestamp */
> +
> + if (ioq_full(priv->txq.queue, ioq_idxtype_valid)) {
> + /*
> + * If the queue is congested, we must flow-control the kernel
> + */
> + PDEBUG("%lld: backpressure tx queue\n", priv->vdev->id);
> + netif_stop_queue(dev);
> + }
> +
> + spin_unlock_irqrestore(&priv->lock, flags);
> +
> + return 0;
> +}
> +
> +/*
> + * reclaim any outstanding completed tx packets
> + *
> + * assumes priv->lock held
> + */
> +static void
> +vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force)
> +{
> + struct ioq_iterator iter;
> + int ret;
> +
> + /*
> + * We want to iterate on the head of the valid index, but we
> + * do not want the iter_pop (below) to flip the ownership, so
> + * we set the NOFLIPOWNER option
> + */
> + ret = ioq_iter_init(priv->txq.queue, &iter, ioq_idxtype_valid,
> + IOQ_ITER_NOFLIPOWNER);
> + BUG_ON(ret < 0);
> +
> + ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
> + BUG_ON(ret < 0);
> +
> + /*
> + * We are done once we find the first packet either invalid or still
> + * owned by the south-side
> + */
> + while (iter.desc->valid && (!iter.desc->sown || force)) {
> + struct sk_buff *skb = (struct sk_buff *)iter.desc->cookie;
> +
> + PDEBUG("%lld: completed sending %d bytes\n",
> + priv->vdev->id, skb->len);
> +
> + /* Reset the descriptor */
> + iter.desc->valid = 0;
> +
> + dev_kfree_skb(skb);
> +
> + /* Advance the valid-index head */
> + ret = ioq_iter_pop(&iter, 0);
> + BUG_ON(ret < 0);
> + }
> +
> + /*
> + * If we were previously stopped due to flow control, restart the
> + * processing
> + */
> + if (netif_queue_stopped(priv->dev)
> + && !ioq_full(priv->txq.queue, ioq_idxtype_valid)) {
> + PDEBUG("%lld: re-enabling tx queue\n", priv->vdev->id);
> + netif_wake_queue(priv->dev);
> + }
> +}
> +
> +static void
> +vbus_enet_timeout(struct net_device *dev)
> +{
> + struct vbus_enet_priv *priv = netdev_priv(dev);
> + unsigned long flags;
> +
> + printk(KERN_DEBUG "VBUS_ENET %lld: Transmit timeout\n", priv->vdev->id);
> +
> + spin_lock_irqsave(&priv->lock, flags);
> + vbus_enet_tx_reap(priv, 0);
> + spin_unlock_irqrestore(&priv->lock, flags);
> +}
> +
> +static void
> +rx_isr(struct ioq_notifier *notifier)
> +{
> + struct vbus_enet_priv *priv;
> + struct net_device *dev;
> +
> + priv = container_of(notifier, struct vbus_enet_priv, rxq.notifier);
> + dev = priv->dev;
> +
> + if (!ioq_empty(priv->rxq.queue, ioq_idxtype_inuse))
> + vbus_enet_schedule_rx(priv);
> +}
> +
> +static void
> +deferred_tx_isr(unsigned long data)
> +{
> + struct vbus_enet_priv *priv = (struct vbus_enet_priv *)data;
> + unsigned long flags;
> +
> + PDEBUG("deferred_tx_isr for %lld\n", priv->vdev->id);
> +
> + spin_lock_irqsave(&priv->lock, flags);
> + vbus_enet_tx_reap(priv, 0);
> + spin_unlock_irqrestore(&priv->lock, flags);
> +
> + ioq_notify_enable(priv->txq.queue, 0);
> +}
> +
> +static void
> +tx_isr(struct ioq_notifier *notifier)
> +{
> + struct vbus_enet_priv *priv;
> + unsigned long flags;
> +
> + priv = container_of(notifier, struct vbus_enet_priv, txq.notifier);
> +
> + PDEBUG("tx_isr for %lld\n", priv->vdev->id);
> +
> + ioq_notify_disable(priv->txq.queue, 0);
> + tasklet_schedule(&priv->txtask);
> +}
> +
> +static const struct net_device_ops vbus_enet_netdev_ops = {
> + .ndo_open = vbus_enet_open,
> + .ndo_stop = vbus_enet_stop,
> + .ndo_set_config = vbus_enet_config,
> + .ndo_start_xmit = vbus_enet_tx_start,
> + .ndo_change_mtu = vbus_enet_change_mtu,
> + .ndo_tx_timeout = vbus_enet_timeout,
add .ndo_validate_addr = eth_validate_addr?
multicast list?
> +};
> +
> +/*
> + * This is called whenever a new vbus_device_proxy is added to the vbus
> + * with the matching VENET_ID
> + */
> +static int
> +vbus_enet_probe(struct vbus_device_proxy *vdev)
> +{
> + struct net_device *dev;
> + struct vbus_enet_priv *priv;
> + int ret;
> +
> + printk(KERN_INFO "VBUS_ENET: Found new device at %lld\n", vdev->id);
> +
> + ret = vdev->ops->open(vdev, VENET_VERSION, 0);
> + if (ret < 0)
> + return ret;
> +
> + dev = alloc_etherdev(sizeof(struct vbus_enet_priv));
> + if (!dev)
> + return -ENOMEM;
> +
> + priv = netdev_priv(dev);
> +
> + spin_lock_init(&priv->lock);
> + priv->dev = dev;
> + priv->vdev = vdev;
> +
> + tasklet_init(&priv->txtask, deferred_tx_isr, (unsigned long)priv);
> +
> + queue_init(priv, &priv->rxq, VENET_QUEUE_RX, rx_ringlen, rx_isr);
> + queue_init(priv, &priv->txq, VENET_QUEUE_TX, tx_ringlen, tx_isr);
> +
> + rx_setup(priv);
> +
> + ioq_notify_enable(priv->rxq.queue, 0); /* enable interrupts */
> + ioq_notify_enable(priv->txq.queue, 0);
> +
> + dev->netdev_ops = &vbus_enet_netdev_ops;
> + dev->watchdog_timeo = 5 * HZ;
> +
> + netif_napi_add(dev, &priv->napi, vbus_enet_poll, napi_weight);
> +
> + ret = devcall(priv, VENET_FUNC_MACQUERY, priv->dev->dev_addr, ETH_ALEN);
> + if (ret < 0) {
> + printk(KERN_INFO "VENET: Error obtaining MAC address for " \
> + "%lld\n",
> + priv->vdev->id);
> + goto out_free;
> + }
> +
> + dev->features |= NETIF_F_HIGHDMA;
> +
> + ret = register_netdev(dev);
> + if (ret < 0) {
> + printk(KERN_INFO "VENET: error %i registering device \"%s\"\n",
> + ret, dev->name);
> + goto out_free;
> + }
> +
> + vdev->priv = priv;
> +
> + return 0;
> +
> + out_free:
> + free_netdev(dev);
> +
> + return ret;
> +}
> +
> +static int
> +vbus_enet_remove(struct vbus_device_proxy *vdev)
> +{
> + struct vbus_enet_priv *priv = (struct vbus_enet_priv *)vdev->priv;
> + struct vbus_device_proxy *dev = priv->vdev;
> +
> + unregister_netdev(priv->dev);
> + napi_disable(&priv->napi);
> +
> + rx_teardown(priv);
> + vbus_enet_tx_reap(priv, 1);
> +
> + ioq_put(priv->rxq.queue);
> + ioq_put(priv->txq.queue);
> +
> + dev->ops->close(dev, 0);
> +
> + free_netdev(priv->dev);
> +
> + return 0;
> +}
> +
> +/*
> + * Finally, the module stuff
> + */
> +
> +static struct vbus_driver_ops vbus_enet_driver_ops = {
> + .probe = vbus_enet_probe,
> + .remove = vbus_enet_remove,
> +};
> +
> +static struct vbus_driver vbus_enet_driver = {
> + .type = VENET_TYPE,
> + .owner = THIS_MODULE,
> + .ops = &vbus_enet_driver_ops,
> +};
> +
> +static __init int
> +vbus_enet_init_module(void)
> +{
> + printk(KERN_INFO "Virtual Ethernet: Copyright (C) 2009 Novell, Gregory Haskins\n");
> + printk(KERN_DEBUG "VBUSENET: Using %d/%d queue depth\n",
> + rx_ringlen, tx_ringlen);
> + return vbus_driver_register(&vbus_enet_driver);
> +}
> +
> +static __exit void
> +vbus_enet_cleanup(void)
> +{
> + vbus_driver_unregister(&vbus_enet_driver);
> +}
> +
> +module_init(vbus_enet_init_module);
> +module_exit(vbus_enet_cleanup);
>
Avi,
Gregory Haskins wrote:
>
> Todo:
> *) Develop some kind of hypercall registration mechanism for KVM so that
> we can use that as an integration point instead of directly hooking
> kvm hypercalls
>
What would you like to see here? I now remember why I removed the
original patch I had for registration...it requires some kind of
discovery mechanism on its own. Note that this is hard, but I figured
it would make the overall series simpler if I didn't go this route and
instead just integrated with a statically allocated vector. That being
said, I have no problem adding this back in but figure we should discuss
the approach so I don't go down a rat-hole ;)
So, one thing we could do is use a string-identifier to discover
hypercall resources. In this model, we would have one additional
hypercall registered with kvm (in addition to the mmu-ops, etc) called
KVM_HC_DYNHC or something like that. The support for DYNHC could be
indicated in the cpuid (much like I do with the RESET, DYNIRQ, and VBUS
support today). When hypercall providers register, they could provide a
string such as "vbus", and they would be allocated a hypercall id.
Likewise, the HC_DYNHC interface would allow a guest to query the cpuid
for the DYNHC feature, and then query the HC_DYNHC vector for a
string-to-hc# translation. If the provider is not present, we return -1 for
the hc#, otherwise we return the one that was allocated.
I know how you feel about string-ids in general, but I am not quite sure
how to design this otherwise without it looking eerily similar to what I
already have (which is registering a new HC vector in kvm_para.h)
Thoughts?
-Greg
On Thu, Apr 09, 2009 at 09:37:10AM -0700, Stephen Hemminger wrote:
> > +static int tx_ringlen = 256;
> > +module_param(tx_ringlen, int, 0444);
> > +
> > +#undef PDEBUG /* undef it, just in case */
> > +#ifdef VBUS_ENET_DEBUG
> > +# define PDEBUG(fmt, args...) printk(KERN_DEBUG "vbus_enet: " fmt, ## args)
> > +#else
> > +# define PDEBUG(fmt, args...) /* not debugging: nothing */
> > +#endif
>
> Why reinvent pr_debug()?
Even more important, use dev_dbg() instead please; that uniquely
describes your device and driver together, which is what you need/want,
and it ties into the dynamic debug work, so you don't need a special
kernel config option.
thanks,
greg k-h
Gregory Haskins wrote:
> Avi,
>
> Gregory Haskins wrote:
>
>> Todo:
>> *) Develop some kind of hypercall registration mechanism for KVM so that
>> we can use that as an integration point instead of directly hooking
>> kvm hypercalls
>>
>>
>
> What would you like to see here? I now remember why I removed the
> original patch I had for registration...it requires some kind of
> discovery mechanism on its own. Note that this is hard, but I figured
> it would make the overall series simpler if I didn't go this route and
> instead just integrated with a statically allocated vector. That being
> said, I have no problem adding this back in but figure we should discuss
> the approach so I don't go down a rat-hole ;)
>
>
One idea is similar to signalfd() or eventfd(). Provide a kvm ioctl
that takes a gsi and returns an fd. Writes to the fd change the state
of the line, possibly triggering an interrupt. Another ioctl takes a
hypercall number or pio port as well as an existing fd. Invocations of
the hypercall or writes to the port write to the fd (using the same
protocol as eventfd), so the other end can respond.
The nice thing is that this can be used by both kernel and userspace
components, and for kernel components, hypercalls can be either buffered
or unbuffered.
> So, one thing we could do is use a string-identifier to discover
> hypercall resources. In this model, we would have one additional
> hypercall registered with kvm (in addition to the mmu-ops, etc) called
> KVM_HC_DYNHC or something like that. The support for DYNHC could be
> indicated in the cpuid (much like I do with the RESET, DYNIRQ, and VBUS
> support today). When hypercall providers register, they could provide a
> string such as "vbus", and they would be allocated a hypercall id.
> Likewise, the HC_DYNHC interface would allow a guest to query the cpuid
> for the DYNHC feature, and then query the HC_DYNHC vector for a
> string-to-hc# translation. If the provider is not present, we return -1 for
> the hc#, otherwise we return the one that was allocated.
>
> I know how you feel about string-ids in general, but I am not quite sure
> how to design this otherwise without it looking eerily similar to what I
> already have (which is registering a new HC vector in kvm_para.h)
>
No need for a string ID. Reserve a range of hypercall numbers for
dynamic IDs. Userspace allocates one and gives it to the device using
its configuration space (as applies to whatever bus it is using).
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
Gregory Haskins wrote:
> We need a way to detect if a VM is reset later in the series, so lets
> add a capability for userspace to signal a VM reset down to the kernel.
>
As I mentioned, this won't be reliable. It needs to be driven from
userspace and be per-device.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
Gregory Haskins wrote:
> This patch provides the ability to dynamically declare and map an
> interrupt-request handle to an x86 8-bit vector.
>
> Problem Statement: Emulated devices (such as PCI, ISA, etc) have
> interrupt routing done via standard PC mechanisms (MP-table, ACPI,
> etc). However, we also want to support a new class of devices
> which exist in a new virtualized namespace and therefore should
> not try to piggyback on these emulated mechanisms. Rather, we
> create a way to dynamically register interrupt resources that
> acts independently of the emulated counterpart.
>
> On x86, a simplistic view of the interrupt model is that each core
> has a local-APIC which can receive messages from APIC-compliant
> routing devices (such as IO-APIC and MSI) regarding details about
> an interrupt (such as which vector to raise). These routing devices
> are controlled by the OS so they may translate a physical event
> (such as "e1000: raise an RX interrupt") to a logical destination
> (such as "inject IDT vector 46 on core 3"). A dynirq is a virtual
> implementation of such a router (think of it as a virtual-MSI, but
> without the coupling to an existing standard, such as PCI).
>
> The model is simple: A guest OS can allocate the mapping of "IRQ"
> handle to "vector/core" in any way it sees fit, and provide this
> information to the dynirq module running in the host. The assigned
> IRQ then becomes the sole handle needed to inject an IDT vector
> to the guest from a host. A host entity that wishes to raise an
> interrupt simply needs to call kvm_inject_dynirq(irq) and the routing
> is performed transparently.
>
> +static int
> +_kvm_inject_dynirq(struct kvm *kvm, struct dynirq *entry)
> +{
> + struct kvm_vcpu *vcpu;
> + int ret;
> +
> + mutex_lock(&kvm->lock);
> +
> + vcpu = kvm->vcpus[entry->dest];
> + if (!vcpu) {
> + ret = -ENOENT;
> + goto out;
> + }
> +
> + ret = kvm_apic_set_irq(vcpu, entry->vec, 1);
> +
> +out:
> + mutex_unlock(&kvm->lock);
> +
> + return ret;
> +}
> +
>
Given that you're using the apic to inject the IRQ, you'll need an EOI.
So what's the difference between dynirq and MSI, performance wise?
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
Avi Kivity wrote:
>>
>
> Given that you're using the apic to inject the IRQ, you'll need an
> EOI. So what's the difference between dynirq and MSI, performance wise?
>
I would have loved to eliminate that EOI completely, but that is a much
broader problem to solve ;)
Actually, dynirq wasn't introduced as a performance alternative to MSI.
Rather, I was trying to eliminate the complexity of needing to sync
between the userspace PCI emulation and the in-kernel models.
However, it's moot. Since v2 introduced virtio-vbus, I now have a much
clearer picture of what is needed here, and v3 will simply integrate
with MSI interrupts and drop dynirq completely. So please ignore this
patch.
-Greg
On Thu, Apr 09, 2009 at 12:30:57PM -0400, Gregory Haskins wrote:
> +static unsigned long
> +task_memctx_copy_to(struct vbus_memctx *ctx, void *dst, const void *src,
> + unsigned long n)
> +{
> + struct task_memctx *tm = to_task_memctx(ctx);
> + struct task_struct *p = tm->task;
> +
> + while (n) {
> + unsigned long offset = ((unsigned long)dst)%PAGE_SIZE;
> + unsigned long len = PAGE_SIZE - offset;
> + int ret;
> + struct page *pg;
> + void *maddr;
> +
> + if (len > n)
> + len = n;
> +
> + down_read(&p->mm->mmap_sem);
> + ret = get_user_pages(p, p->mm,
> + (unsigned long)dst, 1, 1, 0, &pg, NULL);
> +
> + if (ret != 1) {
> + up_read(&p->mm->mmap_sem);
> + break;
> + }
> +
> + maddr = kmap_atomic(pg, KM_USER0);
> + memcpy(maddr + offset, src, len);
> + kunmap_atomic(maddr, KM_USER0);
> + set_page_dirty_lock(pg);
> + put_page(pg);
> + up_read(&p->mm->mmap_sem);
> +
> + src += len;
> + dst += len;
> + n -= len;
> + }
> +
> + return n;
> +}
BTW, why did you decide to use get_user_pages?
Would switch_mm + copy_to_user work as well
avoiding page walk if all pages are present?
Also - if we just had vmexit because a process executed
io (or hypercall), can't we just do copy_to_user there?
Avi, I think at some point you said that we can?
--
MST
Michael S. Tsirkin wrote:
> On Thu, Apr 09, 2009 at 12:30:57PM -0400, Gregory Haskins wrote:
>
>> +static unsigned long
>> +task_memctx_copy_to(struct vbus_memctx *ctx, void *dst, const void *src,
>> + unsigned long n)
>> +{
>> + struct task_memctx *tm = to_task_memctx(ctx);
>> + struct task_struct *p = tm->task;
>> +
>> + while (n) {
>> + unsigned long offset = ((unsigned long)dst)%PAGE_SIZE;
>> + unsigned long len = PAGE_SIZE - offset;
>> + int ret;
>> + struct page *pg;
>> + void *maddr;
>> +
>> + if (len > n)
>> + len = n;
>> +
>> + down_read(&p->mm->mmap_sem);
>> + ret = get_user_pages(p, p->mm,
>> + (unsigned long)dst, 1, 1, 0, &pg, NULL);
>> +
>> + if (ret != 1) {
>> + up_read(&p->mm->mmap_sem);
>> + break;
>> + }
>> +
>> + maddr = kmap_atomic(pg, KM_USER0);
>> + memcpy(maddr + offset, src, len);
>> + kunmap_atomic(maddr, KM_USER0);
>> + set_page_dirty_lock(pg);
>> + put_page(pg);
>> + up_read(&p->mm->mmap_sem);
>> +
>> + src += len;
>> + dst += len;
>> + n -= len;
>> + }
>> +
>> + return n;
>> +}
>>
>
> BTW, why did you decide to use get_user_pages?
> Would switch_mm + copy_to_user work as well
> avoiding page walk if all pages are present?
>
Well, basic c_t_u() won't work because it's likely not "current" if you
are updating the ring from some other task, but I think you have already
figured that out based on the switch_mm suggestion. The simple truth is
I was not familiar with switch_mm at the time I wrote this (nor am I
now). If this is a superior method that allows you to acquire
c_t_u(some_other_ctx) like behavior, I see no problem in changing. I
will look into this, and thanks for the suggestion!
> Also - if we just had vmexit because a process executed
> io (or hypercall), can't we just do copy_to_user there?
> Avi, I think at some point you said that we can?
>
Right, and yes, that will work I believe. We could always do an "if (p ==
current)" check to test for this. To date, I don't typically do
anything mem-ops related directly in vcpu context so this wasn't an
issue...but that doesn't mean someone wont try in the future.
Therefore, I agree we should strive to optimize it if we can.
>
>
Thanks Michael,
-Greg
Michael S. Tsirkin wrote:
> Also - if we just had vmexit because a process executed
> io (or hypercall), can't we just do copy_to_user there?
> Avi, I think at some point you said that we can?
>
You can do copy_to_user() whereever it is legal in Linux. Almost all of
kvm runs in process context, preemptible, and with interrupts enabled.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
Gregory Haskins wrote:
>> BTW, why did you decide to use get_user_pages?
>> Would switch_mm + copy_to_user work as well
>> avoiding page walk if all pages are present?
>>
>>
>
> Well, basic c_t_u() won't work because it's likely not "current" if you
> are updating the ring from some other task, but I think you have already
> figured that out based on the switch_mm suggestion. The simple truth is
> I was not familiar with switch_mm at the time I wrote this (nor am I
> now). If this is a superior method that allows you to acquire
> c_t_u(some_other_ctx) like behavior, I see no problem in changing. I
> will look into this, and thanks for the suggestion!
>
copy_to_user() is significantly faster than get_user_pages() + kmap() +
memcpy() (or their variants).
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
Avi Kivity wrote:
> Gregory Haskins wrote:
>
>
>>> BTW, why did you decide to use get_user_pages?
>>> Would switch_mm + copy_to_user work as well
>>> avoiding page walk if all pages are present?
>>>
>>
>> Well, basic c_t_u() won't work because it's likely not "current" if you
>> are updating the ring from some other task, but I think you have already
>> figured that out based on the switch_mm suggestion. The simple truth is
>> I was not familiar with switch_mm at the time I wrote this (nor am I
>> now). If this is a superior method that allows you to acquire
>> c_t_u(some_other_ctx) like behavior, I see no problem in changing. I
>> will look into this, and thanks for the suggestion!
>>
>
> copy_to_user() is significantly faster than get_user_pages() + kmap()
> + memcpy() (or their variants).
>
Oh, I don't doubt that (in fact, I was pretty sure that was the case
based on some of the optimizations I could see in studying the c_t_u()
path). I just didn't realize there were other ways to do it if it's a
non "current" task. ;)
I guess the enigma for me right now is what cost does switch_mm have?
(That's not a slam against the suggested approach...I really do not know
and am curious).
As an aside, note that we seem to be reviewing v2, where v3 is really
the last set I pushed. I think this patch is more or less the same
across both iterations, but FYI that I would recommend looking at v3
instead.
-Greg
Avi Kivity wrote:
> Gregory Haskins wrote:
>> Avi,
>>
>> Gregory Haskins wrote:
>>
>>> Todo:
>>> *) Develop some kind of hypercall registration mechanism for KVM so
>>> that
>>> we can use that as an integration point instead of directly hooking
>>> kvm hypercalls
>>>
>>
>> What would you like to see here? I now remember why I removed the
>> original patch I had for registration...it requires some kind of
>> discovery mechanism on its own. Note that this is hard, but I figured
>> it would make the overall series simpler if I didn't go this route and
>> instead just integrated with a statically allocated vector. That being
>> said, I have no problem adding this back in but figure we should discuss
>> the approach so I don't go down a rat-hole ;)
>>
>>
>
>
> One idea is similar to signalfd() or eventfd(). Provide a kvm ioctl
> that takes a gsi and returns an fd. Writes to the fd change the state
> of the line, possibly triggering an interrupt. Another ioctl takes a
> hypercall number or pio port as well as an existing fd. Invocations
> of the hypercall or writes to the port write to the fd (using the same
> protocol as eventfd), so the other end can respond.
>
> The nice thing is that this can be used by both kernel and userspace
> components, and for kernel components, hypercalls can be either
> buffered or unbuffered.
And thus the "kvm-eventfd" (irqfd/iosignalfd) interface project was born. ;)
(Michael FYI: so I will be pushing a vbus-v4 series at some point in the
near future that is expressed in terms of irqfd/iosignalfd, per the
conversation above. The patches in v3 and earlier are more intrusive to
the KVM core than they will be in final form)
-Greg
Gregory Haskins wrote:
> Oh, I don't doubt that (in fact, I was pretty sure that was the case
> based on some of the optimizations I could see in studying the c_t_u()
> path). I just didn't realize there were other ways to do it if it's a
> non "current" task. ;)
>
> I guess the enigma for me right now is what cost does switch_mm have?
> (That's not a slam against the suggested approach...I really do not know
> and am curious).
>
switch_mm() is probably very cheap (reloads cr3), but it does dirty the
current cpu's tlb. When the kernel needs to flush a process' tlb, it
will have to IPI that cpu in addition to all others. This takes place,
for example, after munmap() or after a page is swapped out (though
significant batching is done there).
It's still plenty cheaper in my estimation.
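For concreteness, the mm-borrowing approach being weighed might look roughly like the following kernel-side sketch. This is pseudocode in the style of use_mm()/unuse_mm() from fs/aio.c, not compilable as-is: the function name is invented, and locking and error handling are simplified assumptions, not taken from the actual series.

```
/*
 * Hypothetical fast path: temporarily adopt the target task's mm so
 * plain copy_to_user() works, instead of get_user_pages() + kmap().
 */
static unsigned long
task_memctx_copy_to_fast(struct task_struct *p, void __user *dst,
			 const void *src, unsigned long n)
{
	struct mm_struct *mm = get_task_mm(p);
	unsigned long ret = n;

	if (!mm)
		return n;

	if (p == current) {
		/* vmexit path: we are already in the right mm */
		ret = copy_to_user(dst, src, n);
	} else {
		use_mm(mm);		/* switch_mm() under the covers */
		ret = copy_to_user(dst, src, n);
		unuse_mm(mm);
	}

	mmput(mm);
	return ret;		/* bytes NOT copied, as with c_t_u() */
}
```

The p == current test covers the common exit-driven case with no mm switch at all, while the use_mm() branch pays the cr3 reload plus the TLB-dirtying cost Avi describes above.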
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
On Fri, 5 Jun 2009 04:19:17 am Gregory Haskins wrote:
> Avi Kivity wrote:
> > Gregory Haskins wrote:
> > One idea is similar to signalfd() or eventfd()
>
> And thus the "kvm-eventfd" (irqfd/iosignalfd) interface project was born.
> ;)
The lguest patch queue already has such an interface :) And I have a
partially complete in-kernel virtio_pci patch with the same trick.
I switched from "kernel created eventfd" to "userspace passes in eventfd"
after a while though; it lets you connect multiple virtqueues to a single fd
if you want.
Combined with a minor change to allow any process with access to the lguest fd
to queue interrupts, this allowed lguest to move to a thread-per-virtqueue
model which was a significant speedup as well as nice code reduction.
Here's the relevant kernel patch for reading.
Thanks!
Rusty.
lguest: use eventfds for device notification
Currently, when a Guest wants to perform I/O it calls LHCALL_NOTIFY with
an address: the main Launcher process returns with this address, and figures
out what device to run.
A far nicer model is to let processes bind an eventfd to an address: if we
find one, we simply signal the eventfd.
Signed-off-by: Rusty Russell <[email protected]>
Cc: Davide Libenzi <[email protected]>
---
drivers/lguest/Kconfig | 2 -
drivers/lguest/core.c | 8 ++--
drivers/lguest/lg.h | 9 ++++
drivers/lguest/lguest_user.c | 73 ++++++++++++++++++++++++++++++++++++++++
include/linux/lguest_launcher.h | 1
5 files changed, 89 insertions(+), 4 deletions(-)
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
config LGUEST
tristate "Linux hypervisor example code"
- depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
+ depends on X86_32 && EXPERIMENTAL && !X86_PAE && EVENTFD
select HVC_DRIVER
---help---
This is a very simple module which allows you to run
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -198,9 +198,11 @@ int run_guest(struct lg_cpu *cpu, unsign
/* It's possible the Guest did a NOTIFY hypercall to the
* Launcher, in which case we return from the read() now. */
if (cpu->pending_notify) {
- if (put_user(cpu->pending_notify, user))
- return -EFAULT;
- return sizeof(cpu->pending_notify);
+ if (!send_notify_to_eventfd(cpu)) {
+ if (put_user(cpu->pending_notify, user))
+ return -EFAULT;
+ return sizeof(cpu->pending_notify);
+ }
}
/* Check for signals */
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -82,6 +82,11 @@ struct lg_cpu {
struct lg_cpu_arch arch;
};
+struct lg_eventfds {
+ unsigned long addr;
+ struct file *event;
+};
+
/* The private info the thread maintains about the guest. */
struct lguest
{
@@ -102,6 +107,9 @@ struct lguest
unsigned int stack_pages;
u32 tsc_khz;
+ unsigned int num_eventfds;
+ struct lg_eventfds *eventfds;
+
/* Dead? */
const char *dead;
};
@@ -152,6 +160,7 @@ void setup_default_idt_entries(struct lg
void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
const unsigned long *def);
void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
+bool send_notify_to_eventfd(struct lg_cpu *cpu);
void init_clockdev(struct lg_cpu *cpu);
bool check_syscall_vector(struct lguest *lg);
int init_interrupts(void);
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -7,6 +7,8 @@
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/sched.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
#include "lg.h"
/*L:055 When something happens, the Waker process needs a way to stop the
@@ -35,6 +37,70 @@ static int break_guest_out(struct lg_cpu
}
}
+bool send_notify_to_eventfd(struct lg_cpu *cpu)
+{
+ unsigned int i;
+
+ /* lg->eventfds is RCU-protected */
+ preempt_disable();
+ for (i = 0; i < cpu->lg->num_eventfds; i++) {
+ if (cpu->lg->eventfds[i].addr == cpu->pending_notify) {
+ eventfd_signal(cpu->lg->eventfds[i].event, 1);
+ cpu->pending_notify = 0;
+ break;
+ }
+ }
+ preempt_enable();
+ return cpu->pending_notify == 0;
+}
+
+static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
+{
+ struct lg_eventfds *new, *old;
+
+ if (!addr)
+ return -EINVAL;
+
+ /* Replace the old array with the new one, carefully: others can
+ * be accessing it at the same time */
+ new = kmalloc(sizeof(*new) * (lg->num_eventfds + 1), GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ memcpy(new, lg->eventfds, sizeof(*new) * lg->num_eventfds);
+ old = lg->eventfds;
+ lg->eventfds = new;
+ synchronize_rcu();
+ kfree(old);
+
+ lg->eventfds[lg->num_eventfds].addr = addr;
+ lg->eventfds[lg->num_eventfds].event = eventfd_fget(fd);
+ if (IS_ERR(lg->eventfds[lg->num_eventfds].event))
+ return PTR_ERR(lg->eventfds[lg->num_eventfds].event);
+
+ wmb();
+ lg->num_eventfds++;
+ return 0;
+}
+
+static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
+{
+ unsigned long addr, fd;
+ int err;
+
+ if (get_user(addr, input) != 0)
+ return -EFAULT;
+ input++;
+ if (get_user(fd, input) != 0)
+ return -EFAULT;
+
+ mutex_lock(&lguest_lock);
+ err = add_eventfd(lg, addr, fd);
+ mutex_unlock(&lguest_lock);
+
+	return err;
+}
+
/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
* number to /dev/lguest. */
static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
@@ -260,6 +326,8 @@ static ssize_t write(struct file *file,
return user_send_irq(cpu, input);
case LHREQ_BREAK:
return break_guest_out(cpu, input);
+ case LHREQ_EVENTFD:
+ return attach_eventfd(lg, input);
default:
return -EINVAL;
}
@@ -297,6 +365,11 @@ static int close(struct inode *inode, st
* the Launcher's memory management structure. */
mmput(lg->cpus[i].mm);
}
+
+ /* Release any eventfds they registered. */
+ for (i = 0; i < lg->num_eventfds; i++)
+ fput(lg->eventfds[i].event);
+
/* If lg->dead doesn't contain an error code it will be NULL or a
* kmalloc()ed string, either of which is ok to hand to kfree(). */
if (!IS_ERR(lg->dead))
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -58,6 +58,7 @@ enum lguest_req
LHREQ_GETDMA, /* No longer used */
LHREQ_IRQ, /* + irq */
LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
+ LHREQ_EVENTFD, /* + address, fd. */
};
/* The alignment to use between consumer and producer parts of vring.
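For reference, the Launcher side of the new request is just a three-word write() to /dev/lguest. A hedged sketch of building that request follows; the helper name is hypothetical and the enum values are an assumption taken from the enum order above:

```c
#include <assert.h>
#include <stddef.h>

/* Request codes in enum order from lguest_launcher.h above; the exact
 * numeric values are an assumption based on that order. */
enum lguest_req {
	LHREQ_INITIALIZE,
	LHREQ_GETDMA,	/* No longer used */
	LHREQ_IRQ,
	LHREQ_BREAK,
	LHREQ_EVENTFD,
};

/* Hypothetical helper: lay out the LHREQ_EVENTFD request the Launcher
 * would write() to /dev/lguest: request code, guest address, eventfd. */
static size_t build_eventfd_req(unsigned long *buf, unsigned long addr, int efd)
{
	buf[0] = LHREQ_EVENTFD;
	buf[1] = addr;
	buf[2] = (unsigned long)efd;
	return 3 * sizeof(unsigned long);
}
```

The Launcher would create efd with eventfd(2) beforehand and then do write(lguest_fd, buf, len) with the buffer built above.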
On Fri, Jun 05, 2009 at 02:25:01PM +0930, Rusty Russell wrote:
> On Fri, 5 Jun 2009 04:19:17 am Gregory Haskins wrote:
> > Avi Kivity wrote:
> > > Gregory Haskins wrote:
> > > One idea is similar to signalfd() or eventfd()
> >
> > And thus the "kvm-eventfd" (irqfd/iosignalfd) interface project was born.
> > ;)
>
> The lguest patch queue already has such an interface :) And I have a
> partially complete in-kernel virtio_pci patch with the same trick.
>
> I switched from "kernel created eventfd" to "userspace passes in eventfd"
> after a while though; it lets you connect multiple virtqueues to a single fd
> if you want.
>
> Combined with a minor change to allow any process with access to the lguest fd
> to queue interrupts, this allowed lguest to move to a thread-per-virtqueue
> model which was a significant speedup as well as nice code reduction.
>
> Here's the relevant kernel patch for reading.
>
> Thanks!
> Rusty.
>
> lguest: use eventfds for device notification
>
> Currently, when a Guest wants to perform I/O it calls LHCALL_NOTIFY with
> an address: the main Launcher process returns with this address, and figures
> out what device to run.
>
> A far nicer model is to let processes bind an eventfd to an address: if we
> find one, we simply signal the eventfd.
A couple of (probably misguided) RCU questions/suggestions interspersed.
> Signed-off-by: Rusty Russell <[email protected]>
> Cc: Davide Libenzi <[email protected]>
> ---
> drivers/lguest/Kconfig | 2 -
> drivers/lguest/core.c | 8 ++--
> drivers/lguest/lg.h | 9 ++++
> drivers/lguest/lguest_user.c | 73 ++++++++++++++++++++++++++++++++++++++++
> include/linux/lguest_launcher.h | 1
> 5 files changed, 89 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
> --- a/drivers/lguest/Kconfig
> +++ b/drivers/lguest/Kconfig
> @@ -1,6 +1,6 @@
> config LGUEST
> tristate "Linux hypervisor example code"
> - depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
> + depends on X86_32 && EXPERIMENTAL && !X86_PAE && EVENTFD
> select HVC_DRIVER
> ---help---
> This is a very simple module which allows you to run
> diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
> --- a/drivers/lguest/core.c
> +++ b/drivers/lguest/core.c
> @@ -198,9 +198,11 @@ int run_guest(struct lg_cpu *cpu, unsign
> /* It's possible the Guest did a NOTIFY hypercall to the
> * Launcher, in which case we return from the read() now. */
> if (cpu->pending_notify) {
> - if (put_user(cpu->pending_notify, user))
> - return -EFAULT;
> - return sizeof(cpu->pending_notify);
> + if (!send_notify_to_eventfd(cpu)) {
> + if (put_user(cpu->pending_notify, user))
> + return -EFAULT;
> + return sizeof(cpu->pending_notify);
> + }
> }
>
> /* Check for signals */
> diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
> --- a/drivers/lguest/lg.h
> +++ b/drivers/lguest/lg.h
> @@ -82,6 +82,11 @@ struct lg_cpu {
> struct lg_cpu_arch arch;
> };
>
> +struct lg_eventfds {
> + unsigned long addr;
> + struct file *event;
> +};
> +
> /* The private info the thread maintains about the guest. */
> struct lguest
> {
> @@ -102,6 +107,9 @@ struct lguest
> unsigned int stack_pages;
> u32 tsc_khz;
>
> + unsigned int num_eventfds;
> + struct lg_eventfds *eventfds;
> +
> /* Dead? */
> const char *dead;
> };
> @@ -152,6 +160,7 @@ void setup_default_idt_entries(struct lg
> void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
> const unsigned long *def);
> void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
> +bool send_notify_to_eventfd(struct lg_cpu *cpu);
> void init_clockdev(struct lg_cpu *cpu);
> bool check_syscall_vector(struct lguest *lg);
> int init_interrupts(void);
> diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
> --- a/drivers/lguest/lguest_user.c
> +++ b/drivers/lguest/lguest_user.c
> @@ -7,6 +7,8 @@
> #include <linux/miscdevice.h>
> #include <linux/fs.h>
> #include <linux/sched.h>
> +#include <linux/eventfd.h>
> +#include <linux/file.h>
> #include "lg.h"
>
> /*L:055 When something happens, the Waker process needs a way to stop the
> @@ -35,6 +37,70 @@ static int break_guest_out(struct lg_cpu
> }
> }
>
> +bool send_notify_to_eventfd(struct lg_cpu *cpu)
> +{
> + unsigned int i;
> +
> + /* lg->eventfds is RCU-protected */
> + preempt_disable();
Suggest changing to rcu_read_lock() to match the synchronize_rcu().
> + for (i = 0; i < cpu->lg->num_eventfds; i++) {
> + if (cpu->lg->eventfds[i].addr == cpu->pending_notify) {
> + eventfd_signal(cpu->lg->eventfds[i].event, 1);
Shouldn't this be something like the following?
p = rcu_dereference(cpu->lg->eventfds);
if (p[i].addr == cpu->pending_notify) {
eventfd_signal(p[i].event, 1);
> + cpu->pending_notify = 0;
> + break;
> + }
> + }
> + preempt_enable();
And of course, rcu_read_unlock() here.
> + return cpu->pending_notify == 0;
> +}
> +
> +static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
> +{
> + struct lg_eventfds *new, *old;
> +
> + if (!addr)
> + return -EINVAL;
> +
> + /* Replace the old array with the new one, carefully: others can
> + * be accessing it at the same time */
> + new = kmalloc(sizeof(*new) * (lg->num_eventfds + 1), GFP_KERNEL);
> + if (!new)
> + return -ENOMEM;
> +
> + memcpy(new, lg->eventfds, sizeof(*new) * lg->num_eventfds);
> + old = lg->eventfds;
> + lg->eventfds = new;
> + synchronize_rcu();
> + kfree(old);
> +
> + lg->eventfds[lg->num_eventfds].addr = addr;
> + lg->eventfds[lg->num_eventfds].event = eventfd_fget(fd);
> + if (IS_ERR(lg->eventfds[lg->num_eventfds].event))
> + return PTR_ERR(lg->eventfds[lg->num_eventfds].event);
> +
> + wmb();
> + lg->num_eventfds++;
Doesn't the synchronize_rcu() need to be synchronize_sched() to match the
preempt_disable() in send_notify_to_eventfd()? Or, alternatively, use
rcu_read_lock() instead of preempt_disable() in send_notify_to_eventfd().
This last is preferred.
Although you have the wmb() above, there is no ordering in
send_notify_to_eventfd(). Would the following work?
old = lg->eventfds;
lg->eventfds = new;
lg->eventfds[lg->num_eventfds].addr = addr;
lg->eventfds[lg->num_eventfds].event = eventfd_fget(fd);
if (IS_ERR(lg->eventfds[lg->num_eventfds].event))
return PTR_ERR(lg->eventfds[lg->num_eventfds].event);
synchronize_rcu();
kfree(old);
lg->num_eventfds++;
Here, synchronize_rcu() is doing two things:
1. ensuring that old readers who might be referencing "old" are
done before the kfree(), and
2. wait for the completion of all old readers who might (a) be
referencing the short "old" array and (b) be unaware of the
initialization of the new element.
Or do we also need to wait for anyone who might still be using the
old value of lg->num_eventfds? If so, the usual trick is to put
this value behind the same pointer that references the array, so
that any given rcu_dereference() is guaranteed to see matching
array and size.
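The "size behind the same pointer" trick can be sketched in plain, single-threaded userspace C; names here are illustrative, the kernel version would publish with rcu_assign_pointer() and kfree() the old copy only after synchronize_rcu(), and the real entries pair an address with a struct file *:

```c
#include <assert.h>
#include <stdlib.h>
#include <string.h>

/* Keep the array length behind the same pointer as the array itself,
 * so a reader that dereferences the pointer always sees a matching
 * (num, contents) pair and no rmb()/wmb() pairing is needed. */
struct eventfd_map {
	unsigned int num;
	unsigned long addr[];	/* flexible array member */
};

static struct eventfd_map *map_append(const struct eventfd_map *old,
				      unsigned long addr)
{
	struct eventfd_map *new;

	new = malloc(sizeof(*new) + (old->num + 1) * sizeof(new->addr[0]));
	if (!new)
		return NULL;

	/* Identical copy first, then append, then the caller publishes:
	 * in the kernel, rcu_assign_pointer(lg->eventfds, new) followed
	 * by synchronize_rcu(); kfree(old). */
	memcpy(new->addr, old->addr, old->num * sizeof(old->addr[0]));
	new->addr[old->num] = addr;
	new->num = old->num + 1;
	return new;
}
```

Because readers only ever see either the complete old array or the complete new one, the single grace period covers both the kfree() and the visibility of the appended entry.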
> + return 0;
> +}
> +
> +static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
> +{
> + unsigned long addr, fd;
> + int err;
> +
> + if (get_user(addr, input) != 0)
> + return -EFAULT;
> + input++;
> + if (get_user(fd, input) != 0)
> + return -EFAULT;
> +
> + mutex_lock(&lguest_lock);
> + err = add_eventfd(lg, addr, fd);
> + mutex_unlock(&lguest_lock);
> +
> + return err;
> +}
> +
> /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
> * number to /dev/lguest. */
> static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
> @@ -260,6 +326,8 @@ static ssize_t write(struct file *file,
> return user_send_irq(cpu, input);
> case LHREQ_BREAK:
> return break_guest_out(cpu, input);
> + case LHREQ_EVENTFD:
> + return attach_eventfd(lg, input);
> default:
> return -EINVAL;
> }
> @@ -297,6 +365,11 @@ static int close(struct inode *inode, st
> * the Launcher's memory management structure. */
> mmput(lg->cpus[i].mm);
> }
> +
> + /* Release any eventfds they registered. */
> + for (i = 0; i < lg->num_eventfds; i++)
> + fput(lg->eventfds[i].event);
> +
> /* If lg->dead doesn't contain an error code it will be NULL or a
> * kmalloc()ed string, either of which is ok to hand to kfree(). */
> if (!IS_ERR(lg->dead))
> diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
> --- a/include/linux/lguest_launcher.h
> +++ b/include/linux/lguest_launcher.h
> @@ -58,6 +58,7 @@ enum lguest_req
> LHREQ_GETDMA, /* No longer used */
> LHREQ_IRQ, /* + irq */
> LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
> + LHREQ_EVENTFD, /* + address, fd. */
> };
>
> /* The alignment to use between consumer and producer parts of vring.
>
>
>
Hi Rusty,
Rusty Russell wrote:
> On Fri, 5 Jun 2009 04:19:17 am Gregory Haskins wrote:
>
>> Avi Kivity wrote:
>>
>>> Gregory Haskins wrote:
>>> One idea is similar to signalfd() or eventfd()
>>>
>> And thus the "kvm-eventfd" (irqfd/iosignalfd) interface project was born.
>> ;)
>>
>
> The lguest patch queue already has such an interface :)
Cool! Ultimately I think it will be easier if both lguest+kvm support
the same eventfd notion so this is good you are already moving in the
same direction.
> And I have a partially complete in-kernel virtio_pci patch with the same trick.
>
I thought lguest didn't use pci? Or do you just mean that you have an
in-kernel virtio-net for lguest?
As a follow up question, I wonder if we can easily port that to vbus so
that it will work in both lguest and kvm? (note to self: push a skeleton
example today)
> I switched from "kernel created eventfd" to "userspace passes in eventfd"
> after a while though; it lets you connect multiple virtqueues to a single fd
> if you want.
>
Yeah, actually we switched to that model, too. Aside from the
limitation you point out, there were some problems that Al Viro had
raised trying to do it in kernel w.r.t. fd abuse.
> Combined with a minor change to allow any process with access to the lguest fd
> to queue interrupts, this allowed lguest to move to a thread-per-virtqueue
> model which was a significant speedup as well as nice code reduction.
>
Yep, that was one of my findings on venet as well so I was looking
forward to trying to get virtio-net to do the same.
> Here's the relevant kernel patch for reading.
>
Thanks Rusty! Will take a look.
> Thanks!
> Rusty.
>
> lguest: use eventfds for device notification
>
> Currently, when a Guest wants to perform I/O it calls LHCALL_NOTIFY with
> an address: the main Launcher process returns with this address, and figures
> out what device to run.
>
> A far nicer model is to let processes bind an eventfd to an address: if we
> find one, we simply signal the eventfd.
>
> Signed-off-by: Rusty Russell <[email protected]>
> Cc: Davide Libenzi <[email protected]>
> ---
> drivers/lguest/Kconfig | 2 -
> drivers/lguest/core.c | 8 ++--
> drivers/lguest/lg.h | 9 ++++
> drivers/lguest/lguest_user.c | 73 ++++++++++++++++++++++++++++++++++++++++
> include/linux/lguest_launcher.h | 1
> 5 files changed, 89 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
> --- a/drivers/lguest/Kconfig
> +++ b/drivers/lguest/Kconfig
> @@ -1,6 +1,6 @@
> config LGUEST
> tristate "Linux hypervisor example code"
> - depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
> + depends on X86_32 && EXPERIMENTAL && !X86_PAE && EVENTFD
>
Note to self: we probably need a similar line in KVM now.
> select HVC_DRIVER
> ---help---
> This is a very simple module which allows you to run
> diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
> --- a/drivers/lguest/core.c
> +++ b/drivers/lguest/core.c
> @@ -198,9 +198,11 @@ int run_guest(struct lg_cpu *cpu, unsign
> /* It's possible the Guest did a NOTIFY hypercall to the
> * Launcher, in which case we return from the read() now. */
> if (cpu->pending_notify) {
> - if (put_user(cpu->pending_notify, user))
> - return -EFAULT;
> - return sizeof(cpu->pending_notify);
> + if (!send_notify_to_eventfd(cpu)) {
> + if (put_user(cpu->pending_notify, user))
> + return -EFAULT;
> + return sizeof(cpu->pending_notify);
> + }
> }
>
> /* Check for signals */
> diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
> --- a/drivers/lguest/lg.h
> +++ b/drivers/lguest/lg.h
> @@ -82,6 +82,11 @@ struct lg_cpu {
> struct lg_cpu_arch arch;
> };
>
> +struct lg_eventfds {
> + unsigned long addr;
> + struct file *event;
> +};
> +
> /* The private info the thread maintains about the guest. */
> struct lguest
> {
> @@ -102,6 +107,9 @@ struct lguest
> unsigned int stack_pages;
> u32 tsc_khz;
>
> + unsigned int num_eventfds;
> + struct lg_eventfds *eventfds;
> +
> /* Dead? */
> const char *dead;
> };
> @@ -152,6 +160,7 @@ void setup_default_idt_entries(struct lg
> void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
> const unsigned long *def);
> void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
> +bool send_notify_to_eventfd(struct lg_cpu *cpu);
> void init_clockdev(struct lg_cpu *cpu);
> bool check_syscall_vector(struct lguest *lg);
> int init_interrupts(void);
> diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
> --- a/drivers/lguest/lguest_user.c
> +++ b/drivers/lguest/lguest_user.c
> @@ -7,6 +7,8 @@
> #include <linux/miscdevice.h>
> #include <linux/fs.h>
> #include <linux/sched.h>
> +#include <linux/eventfd.h>
> +#include <linux/file.h>
> #include "lg.h"
>
> /*L:055 When something happens, the Waker process needs a way to stop the
> @@ -35,6 +37,70 @@ static int break_guest_out(struct lg_cpu
> }
> }
>
> +bool send_notify_to_eventfd(struct lg_cpu *cpu)
> +{
> + unsigned int i;
> +
> + /* lg->eventfds is RCU-protected */
> + preempt_disable();
> + for (i = 0; i < cpu->lg->num_eventfds; i++) {
> + if (cpu->lg->eventfds[i].addr == cpu->pending_notify) {
> + eventfd_signal(cpu->lg->eventfds[i].event, 1);
> + cpu->pending_notify = 0;
> + break;
> + }
> + }
> + preempt_enable();
> + return cpu->pending_notify == 0;
> +}
> +
> +static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
> +{
> + struct lg_eventfds *new, *old;
> +
> + if (!addr)
> + return -EINVAL;
> +
> + /* Replace the old array with the new one, carefully: others can
> + * be accessing it at the same time */
> + new = kmalloc(sizeof(*new) * (lg->num_eventfds + 1), GFP_KERNEL);
> + if (!new)
> + return -ENOMEM;
> +
> + memcpy(new, lg->eventfds, sizeof(*new) * lg->num_eventfds);
> + old = lg->eventfds;
> + lg->eventfds = new;
> + synchronize_rcu();
> + kfree(old);
> +
> + lg->eventfds[lg->num_eventfds].addr = addr;
> + lg->eventfds[lg->num_eventfds].event = eventfd_fget(fd);
> + if (IS_ERR(lg->eventfds[lg->num_eventfds].event))
> + return PTR_ERR(lg->eventfds[lg->num_eventfds].event);
> +
> + wmb();
> + lg->num_eventfds++;
> + return 0;
> +}
> +
> +static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
> +{
> + unsigned long addr, fd;
> + int err;
> +
> + if (get_user(addr, input) != 0)
> + return -EFAULT;
> + input++;
> + if (get_user(fd, input) != 0)
> + return -EFAULT;
> +
> + mutex_lock(&lguest_lock);
> + err = add_eventfd(lg, addr, fd);
> + mutex_unlock(&lguest_lock);
> +
> + return err;
> +}
> +
> /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
> * number to /dev/lguest. */
> static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
> @@ -260,6 +326,8 @@ static ssize_t write(struct file *file,
> return user_send_irq(cpu, input);
> case LHREQ_BREAK:
> return break_guest_out(cpu, input);
> + case LHREQ_EVENTFD:
> + return attach_eventfd(lg, input);
> default:
> return -EINVAL;
> }
> @@ -297,6 +365,11 @@ static int close(struct inode *inode, st
> * the Launcher's memory management structure. */
> mmput(lg->cpus[i].mm);
> }
> +
> + /* Release any eventfds they registered. */
> + for (i = 0; i < lg->num_eventfds; i++)
> + fput(lg->eventfds[i].event);
> +
> /* If lg->dead doesn't contain an error code it will be NULL or a
> * kmalloc()ed string, either of which is ok to hand to kfree(). */
> if (!IS_ERR(lg->dead))
> diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
> --- a/include/linux/lguest_launcher.h
> +++ b/include/linux/lguest_launcher.h
> @@ -58,6 +58,7 @@ enum lguest_req
> LHREQ_GETDMA, /* No longer used */
> LHREQ_IRQ, /* + irq */
> LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
> + LHREQ_EVENTFD, /* + address, fd. */
> };
>
> /* The alignment to use between consumer and producer parts of vring.
>
>
>
>
Other than the potential rcu issues that Paul already addressed, looks
good. FWIW: this looks like what we are calling "iosignalfd" in KVM
land (unless I am misunderstanding). Do you have the equivalent of
"irqfd" going the other way?
Thanks Rusty,
-Greg
Avi Kivity wrote:
> Gregory Haskins wrote:
>>> @@ -1,6 +1,6 @@
>>> config LGUEST
>>> tristate "Linux hypervisor example code"
>>> - depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
>>> + depends on X86_32 && EXPERIMENTAL && !X86_PAE && EVENTFD
>>>
>>
>> Note to self: we probably need a similar line in KVM now.
>>
>>
>
> 'select EVENTFD' is more appropriate.
>
>
Yeah, I was thinking the same...
Gregory Haskins wrote:
>> @@ -1,6 +1,6 @@
>> config LGUEST
>> tristate "Linux hypervisor example code"
>> - depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
>> + depends on X86_32 && EXPERIMENTAL && !X86_PAE && EVENTFD
>>
>>
>
> Note to self: we probably need a similar line in KVM now.
>
>
'select EVENTFD' is more appropriate.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
On Fri, 5 Jun 2009 09:26:48 pm Gregory Haskins wrote:
> Hi Rusty,
>
> Rusty Russell wrote:
> > On Fri, 5 Jun 2009 04:19:17 am Gregory Haskins wrote:
> >> Avi Kivity wrote:
> >>> Gregory Haskins wrote:
> >>> One idea is similar to signalfd() or eventfd()
> >>
> >> And thus the "kvm-eventfd" (irqfd/iosignalfd) interface project was
> >> born. ;)
> >
> > The lguest patch queue already has such an interface :)
>
> Cool! Ultimately I think it will be easier if both lguest+kvm support
> the same eventfd notion so this is good you are already moving in the
> same direction.
Not really; lguest doesn't do PCI.
> > And I have a partially complete in-kernel virtio_pci patch with the same
> > trick.
>
> I thought lguest didn't use pci? Or do you just mean that you have an
> in-kernel virtio-net for lguest?
No, this was for kvm. Sorry for the confusion.
> Other than the potential rcu issues that Paul already addressed, looks
> good. FWIW: this looks like what we are calling "iosignalfd" in KVM
> land (unless I am misunderstanding). Do you have the equivalent of
> "irqfd" going the other way?
Yes; lguest uses write() (offset indicates cpu #) rather than ioctls, but
anyone can do the LHREQ_IRQ write to queue an interrupt for delivery.
So the threads just get the same /dev/lguest fd and it's simple.
Thanks!
Rusty.
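The eventfd primitive that both directions rely on can be exercised entirely from userspace. A minimal sketch (Linux-only, not lguest-specific) of the signal/consume round trip: the write() below stands in for what eventfd_signal() does on the kernel side, and the read() is what a blocked virtqueue thread would observe:

```c
#include <stdint.h>
#include <sys/eventfd.h>
#include <unistd.h>

/* Demonstrate the notify/wake round trip on a single eventfd. */
int demo_eventfd_roundtrip(void)
{
	uint64_t val;
	int efd = eventfd(0, 0);

	if (efd < 0)
		return -1;

	/* "Kernel" side: bump the counter by 1 (eventfd_signal(ctx, 1)). */
	val = 1;
	if (write(efd, &val, sizeof(val)) != sizeof(val))
		return -1;

	/* Virtqueue-thread side: read() blocks until the counter is
	 * nonzero, then returns and resets it. */
	if (read(efd, &val, sizeof(val)) != sizeof(val))
		return -1;

	close(efd);
	return (int)val;
}
```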
Rusty Russell wrote:
> On Fri, 5 Jun 2009 09:26:48 pm Gregory Haskins wrote:
>
>> Hi Rusty,
>>
>> Rusty Russell wrote:
>>
>>> On Fri, 5 Jun 2009 04:19:17 am Gregory Haskins wrote:
>>>
>>>> Avi Kivity wrote:
>>>>
>>>>> Gregory Haskins wrote:
>>>>> One idea is similar to signalfd() or eventfd()
>>>>>
>>>> And thus the "kvm-eventfd" (irqfd/iosignalfd) interface project was
>>>> born. ;)
>>>>
>>> The lguest patch queue already has such an interface :)
>>>
>> Cool! Ultimately I think it will be easier if both lguest+kvm support
>> the same eventfd notion so this is good you are already moving in the
>> same direction.
>>
>
> Not really; lguest doesn't do PCI.
>
That's ok. I see these eventfd interfaces as somewhat orthogonal to
PCI. I.e. if both lguest and kvm have an eventfd mechanism for signaling
in both directions (e.g. interrupts and io), it would make it easier to
support the kind of thing I am striving for with a unified backend.
That is: one in-kernel virtio-net that works in both (or even many) HV
environments. I see that as a higher layer abstraction than PCI, per se.
>
>>> And I have a partially complete in-kernel virtio_pci patch with the same
>>> trick.
>>>
>> I thought lguest didn't use pci? Or do you just mean that you have an
>> in-kernel virtio-net for lguest?
>>
>
> No, this was for kvm. Sorry for the confusion.
>
Ah, sorry. Well, if it's in any kind of shape to see the light of day,
please forward it over. Perhaps Michael and I can craft it into a
working solution.
>
>> Other than the potential rcu issues that Paul already addressed, looks
>> good. FWIW: this looks like what we are calling "iosignalfd" in KVM
>> land (unless I am misunderstanding). Do you have the equivalent of
>> "irqfd" going the other way?
>>
>
> Yes; lguest uses write() (offset indicates cpu #) rather than ioctls, but
> anyone can do the LHREQ_IRQ write to queue an interrupt for delivery.
>
> So the threads just get the same /dev/lguest fd and it's simple.
>
Ah, ok. That's workable, too. (This kind of detail would be buried in
the "lguest connector" for vbus anyway, so it doesn't have to have a
uniform "eventfd_signal()" interface to work. The fd concept alone is
sufficiently flexible).
Thanks Rusty,
-Greg
On Fri, 5 Jun 2009 03:00:10 pm Paul E. McKenney wrote:
> On Fri, Jun 05, 2009 at 02:25:01PM +0930, Rusty Russell wrote:
> > + /* lg->eventfds is RCU-protected */
> > + preempt_disable();
>
> Suggest changing to rcu_read_lock() to match the synchronize_rcu().
Ah yes, much better. As I was implementing it I warred with myself since
lguest aims for simplicity above all else. But since we only ever add things
to the array, RCU probably is simpler.
> > + for (i = 0; i < cpu->lg->num_eventfds; i++) {
> > + if (cpu->lg->eventfds[i].addr == cpu->pending_notify) {
> > + eventfd_signal(cpu->lg->eventfds[i].event, 1);
>
> Shouldn't this be something like the following?
>
> p = rcu_dereference(cpu->lg->eventfds);
> if (p[i].addr == cpu->pending_notify) {
> eventfd_signal(p[i].event, 1);
Hmm, need to read num_eventfds first, too. It doesn't matter if we get the old
->num_eventfds and the new ->eventfds, but the other way around would be bad.
Here's the inter-diff:
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -39,18 +39,24 @@ static int break_guest_out(struct lg_cpu
bool send_notify_to_eventfd(struct lg_cpu *cpu)
{
- unsigned int i;
+ unsigned int i, num;
+ struct lg_eventfds *eventfds;
+
+ /* Make sure we grab the total number before accessing the array. */
+ num = cpu->lg->num_eventfds;
+ rmb();
/* lg->eventfds is RCU-protected */
rcu_read_lock();
- for (i = 0; i < cpu->lg->num_eventfds; i++) {
- if (cpu->lg->eventfds[i].addr == cpu->pending_notify) {
- eventfd_signal(cpu->lg->eventfds[i].event, 1);
+ eventfds = rcu_dereference(cpu->lg->eventfds);
+ for (i = 0; i < num; i++) {
+ if (eventfds[i].addr == cpu->pending_notify) {
+ eventfd_signal(eventfds[i].event, 1);
cpu->pending_notify = 0;
break;
}
}
- preempt_enable();
+ rcu_read_unlock();
return cpu->pending_notify == 0;
}
Thanks!
Rusty.
On Sat, Jun 06, 2009 at 12:25:57AM +0930, Rusty Russell wrote:
> On Fri, 5 Jun 2009 03:00:10 pm Paul E. McKenney wrote:
> > On Fri, Jun 05, 2009 at 02:25:01PM +0930, Rusty Russell wrote:
> > > + /* lg->eventfds is RCU-protected */
> > > + preempt_disable();
> >
> > Suggest changing to rcu_read_lock() to match the synchronize_rcu().
>
> Ah yes, much better. As I was implementing it I warred with myself since
> lguest aims for simplicity above all else. But since we only ever add things
> to the array, RCU probably is simpler.
;-)
> > > + for (i = 0; i < cpu->lg->num_eventfds; i++) {
> > > + if (cpu->lg->eventfds[i].addr == cpu->pending_notify) {
> > > + eventfd_signal(cpu->lg->eventfds[i].event, 1);
> >
> > Shouldn't this be something like the following?
> >
> > p = rcu_dereference(cpu->lg->eventfds);
> > if (p[i].addr == cpu->pending_notify) {
> > eventfd_signal(p[i].event, 1);
>
> Hmm, need to read num_eventfds first, too. It doesn't matter if we get the old
> ->num_eventfds and the new ->eventfds, but the other way around would be bad.
Yep!!! ;-)
> Here's the inter-diff:
>
> diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
> --- a/drivers/lguest/lguest_user.c
> +++ b/drivers/lguest/lguest_user.c
> @@ -39,18 +39,24 @@ static int break_guest_out(struct lg_cpu
>
> bool send_notify_to_eventfd(struct lg_cpu *cpu)
> {
> - unsigned int i;
> + unsigned int i, num;
> + struct lg_eventfds *eventfds;
> +
> + /* Make sure we grab the total number before accessing the array. */
> + num = cpu->lg->num_eventfds;
> + rmb();
>
> /* lg->eventfds is RCU-protected */
> rcu_read_lock();
> - for (i = 0; i < cpu->lg->num_eventfds; i++) {
> - if (cpu->lg->eventfds[i].addr == cpu->pending_notify) {
> - eventfd_signal(cpu->lg->eventfds[i].event, 1);
> + eventfds = rcu_dereference(cpu->lg->eventfds);
> + for (i = 0; i < num; i++) {
> + if (eventfds[i].addr == cpu->pending_notify) {
> + eventfd_signal(eventfds[i].event, 1);
> cpu->pending_notify = 0;
> break;
> }
> }
> - preempt_enable();
> + rcu_read_unlock();
> return cpu->pending_notify == 0;
> }
It is possible to get rid of the rmb() and wmb() as well, doing
something like the following:
struct lg_eventfds_num {
unsigned int n;
struct lg_eventfds a[0];
}
Then the rcu_dereference() gets you a pointer to a struct lg_eventfds_num,
which has the array and its length in guaranteed synchronization without
the need for barriers.
Does this work for you, or is there some complication that I am missing?
Thanx, Paul
On Sat, 6 Jun 2009 01:55:53 am Paul E. McKenney wrote:
> It is possible to get rid of the rmb() and wmb() as well, doing
> something like the following:
>
> struct lg_eventfds_num {
> unsigned int n;
> struct lg_eventfds a[0];
> }
>
> Then the rcu_dereference() gets you a pointer to a struct lg_eventfds_num,
> which has the array and its length in guaranteed synchronization without
> the need for barriers.
Yep, that's actually quite nice. The only wart is that it needs to be
allocated even when n == 0, but IMHO worth it for barrier avoidance.
This is what I ended up with:
lguest: use eventfds for device notification
Currently, when a Guest wants to perform I/O it calls LHCALL_NOTIFY with
an address: the main Launcher process returns with this address, and figures
out what device to run.
A far nicer model is to let processes bind an eventfd to an address: if we
find one, we simply signal the eventfd.
Signed-off-by: Rusty Russell <[email protected]>
Cc: Davide Libenzi <[email protected]>
---
drivers/lguest/Kconfig | 2
drivers/lguest/core.c | 8 ++-
drivers/lguest/lg.h | 13 +++++
drivers/lguest/lguest_user.c | 98 +++++++++++++++++++++++++++++++++++++++-
include/linux/lguest_launcher.h | 1
5 files changed, 116 insertions(+), 6 deletions(-)
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
config LGUEST
tristate "Linux hypervisor example code"
- depends on X86_32 && EXPERIMENTAL && FUTEX
+ depends on X86_32 && EXPERIMENTAL && EVENTFD
select HVC_DRIVER
---help---
This is a very simple module which allows you to run
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -198,9 +198,11 @@ int run_guest(struct lg_cpu *cpu, unsign
/* It's possible the Guest did a NOTIFY hypercall to the
* Launcher, in which case we return from the read() now. */
if (cpu->pending_notify) {
- if (put_user(cpu->pending_notify, user))
- return -EFAULT;
- return sizeof(cpu->pending_notify);
+ if (!send_notify_to_eventfd(cpu)) {
+ if (put_user(cpu->pending_notify, user))
+ return -EFAULT;
+ return sizeof(cpu->pending_notify);
+ }
}
/* Check for signals */
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -82,6 +82,16 @@ struct lg_cpu {
struct lg_cpu_arch arch;
};
+struct lg_eventfd {
+ unsigned long addr;
+ struct file *event;
+};
+
+struct lg_eventfd_map {
+ unsigned int num;
+ struct lg_eventfd map[];
+};
+
/* The private info the thread maintains about the guest. */
struct lguest
{
@@ -102,6 +112,8 @@ struct lguest
unsigned int stack_pages;
u32 tsc_khz;
+ struct lg_eventfd_map *eventfds;
+
/* Dead? */
const char *dead;
};
@@ -154,6 +166,7 @@ void setup_default_idt_entries(struct lg
void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
const unsigned long *def);
void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
+bool send_notify_to_eventfd(struct lg_cpu *cpu);
void init_clockdev(struct lg_cpu *cpu);
bool check_syscall_vector(struct lguest *lg);
int init_interrupts(void);
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -7,6 +7,8 @@
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/sched.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
#include "lg.h"
/*L:055 When something happens, the Waker process needs a way to stop the
@@ -35,6 +37,81 @@ static int break_guest_out(struct lg_cpu
}
}
+bool send_notify_to_eventfd(struct lg_cpu *cpu)
+{
+ unsigned int i;
+ struct lg_eventfd_map *map;
+
+ /* lg->eventfds is RCU-protected */
+ rcu_read_lock();
+ map = rcu_dereference(cpu->lg->eventfds);
+ for (i = 0; i < map->num; i++) {
+ if (map->map[i].addr == cpu->pending_notify) {
+ eventfd_signal(map->map[i].event, 1);
+ cpu->pending_notify = 0;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return cpu->pending_notify == 0;
+}
+
+static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
+{
+ struct lg_eventfd_map *new, *old = lg->eventfds;
+
+ if (!addr)
+ return -EINVAL;
+
+ /* Replace the old array with the new one, carefully: others can
+ * be accessing it at the same time */
+ new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
+ GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ /* First make identical copy. */
+ memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
+ new->num = old->num;
+
+ /* Now append new entry. */
+ new->map[new->num].addr = addr;
+ new->map[new->num].event = eventfd_fget(fd);
+ if (IS_ERR(new->map[new->num].event)) {
+ kfree(new);
+ return PTR_ERR(new->map[new->num].event);
+ }
+ new->num++;
+
+ /* Now put new one in place. */
+ rcu_assign_pointer(lg->eventfds, new);
+
+ /* We're not in a big hurry. Wait until no one's looking at the old
+ * version, then delete it. */
+ synchronize_rcu();
+ kfree(old);
+
+ return 0;
+}
+
+static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
+{
+ unsigned long addr, fd;
+ int err;
+
+ if (get_user(addr, input) != 0)
+ return -EFAULT;
+ input++;
+ if (get_user(fd, input) != 0)
+ return -EFAULT;
+
+ mutex_lock(&lguest_lock);
+ err = add_eventfd(lg, addr, fd);
+ mutex_unlock(&lguest_lock);
+
+ return err;
+}
+
/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
* number to /dev/lguest. */
static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
@@ -184,6 +261,13 @@ static int initialize(struct file *file,
goto unlock;
}
+ lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
+ if (!lg->eventfds) {
+ err = -ENOMEM;
+ goto free_lg;
+ }
+ lg->eventfds->num = 0;
+
/* Populate the easy fields of our "struct lguest" */
lg->mem_base = (void __user *)args[0];
lg->pfn_limit = args[1];
@@ -191,7 +275,7 @@ static int initialize(struct file *file,
/* This is the first cpu (cpu 0) and it will start booting at args[2] */
err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
if (err)
- goto release_guest;
+ goto free_eventfds;
/* Initialize the Guest's shadow page tables, using the toplevel
* address the Launcher gave us. This allocates memory, so can fail. */
@@ -210,7 +294,9 @@ static int initialize(struct file *file,
free_regs:
/* FIXME: This should be in free_vcpu */
free_page(lg->cpus[0].regs_page);
-release_guest:
+free_eventfds:
+ kfree(lg->eventfds);
+free_lg:
kfree(lg);
unlock:
mutex_unlock(&lguest_lock);
@@ -260,6 +346,8 @@ static ssize_t write(struct file *file,
return user_send_irq(cpu, input);
case LHREQ_BREAK:
return break_guest_out(cpu, input);
+ case LHREQ_EVENTFD:
+ return attach_eventfd(lg, input);
default:
return -EINVAL;
}
@@ -297,6 +385,12 @@ static int close(struct inode *inode, st
* the Launcher's memory management structure. */
mmput(lg->cpus[i].mm);
}
+
+ /* Release any eventfds they registered. */
+ for (i = 0; i < lg->eventfds->num; i++)
+ fput(lg->eventfds->map[i].event);
+ kfree(lg->eventfds);
+
/* If lg->dead doesn't contain an error code it will be NULL or a
* kmalloc()ed string, either of which is ok to hand to kfree(). */
if (!IS_ERR(lg->dead))
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -58,6 +58,7 @@ enum lguest_req
LHREQ_GETDMA, /* No longer used */
LHREQ_IRQ, /* + irq */
LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
+ LHREQ_EVENTFD, /* + address, fd. */
};
/* The alignment to use between consumer and producer parts of vring.
On Thu, Jun 11, 2009 at 10:51:20PM +0930, Rusty Russell wrote:
> On Sat, 6 Jun 2009 01:55:53 am Paul E. McKenney wrote:
> > It is possible to get rid of the rmb() and wmb() as well, doing
> > something like the following:
> >
> > struct lg_eventfds_num {
> > unsigned int n;
> > struct lg_eventfds a[0];
> > }
> >
> > Then the rcu_dereference() gets you a pointer to a struct lg_eventfds_num,
> > which has the array and its length in guaranteed synchronization without
> > the need for barriers.
>
> Yep, that's actually quite nice. The only wart is that it needs to be
> allocated even when n == 0, but IMHO worth it for barrier avoidance.
Well, I suppose that you -could- statically allocate one in struct
lguest, but it is not clear to me that this cure would be better than
the always-allocate disease in this case. But either way, you would
be allocating an instance, so your statement above is correct. ;-)
> This is what I ended up with:
>
> lguest: use eventfds for device notification
>
> Currently, when a Guest wants to perform I/O it calls LHCALL_NOTIFY with
> an address: the main Launcher process returns with this address, and figures
> out what device to run.
>
> A far nicer model is to let processes bind an eventfd to an address: if we
> find one, we simply signal the eventfd.
Looks very good to me from an RCU viewpoint!!!
Reviewed-by: Paul E. McKenney <[email protected]>
> Signed-off-by: Rusty Russell <[email protected]>
> Cc: Davide Libenzi <[email protected]>
> ---
> drivers/lguest/Kconfig | 2
> drivers/lguest/core.c | 8 ++-
> drivers/lguest/lg.h | 13 +++++
> drivers/lguest/lguest_user.c | 98 +++++++++++++++++++++++++++++++++++++++-
> include/linux/lguest_launcher.h | 1
> 5 files changed, 116 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
> --- a/drivers/lguest/Kconfig
> +++ b/drivers/lguest/Kconfig
> @@ -1,6 +1,6 @@
> config LGUEST
> tristate "Linux hypervisor example code"
> - depends on X86_32 && EXPERIMENTAL && FUTEX
> + depends on X86_32 && EXPERIMENTAL && EVENTFD
> select HVC_DRIVER
> ---help---
> This is a very simple module which allows you to run
> diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
> --- a/drivers/lguest/core.c
> +++ b/drivers/lguest/core.c
> @@ -198,9 +198,11 @@ int run_guest(struct lg_cpu *cpu, unsign
> /* It's possible the Guest did a NOTIFY hypercall to the
> * Launcher, in which case we return from the read() now. */
> if (cpu->pending_notify) {
> - if (put_user(cpu->pending_notify, user))
> - return -EFAULT;
> - return sizeof(cpu->pending_notify);
> + if (!send_notify_to_eventfd(cpu)) {
> + if (put_user(cpu->pending_notify, user))
> + return -EFAULT;
> + return sizeof(cpu->pending_notify);
> + }
> }
>
> /* Check for signals */
> diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
> --- a/drivers/lguest/lg.h
> +++ b/drivers/lguest/lg.h
> @@ -82,6 +82,16 @@ struct lg_cpu {
> struct lg_cpu_arch arch;
> };
>
> +struct lg_eventfd {
> + unsigned long addr;
> + struct file *event;
> +};
> +
> +struct lg_eventfd_map {
> + unsigned int num;
> + struct lg_eventfd map[];
> +};
> +
> /* The private info the thread maintains about the guest. */
> struct lguest
> {
> @@ -102,6 +112,8 @@ struct lguest
> unsigned int stack_pages;
> u32 tsc_khz;
>
> + struct lg_eventfd_map *eventfds;
> +
> /* Dead? */
> const char *dead;
> };
> @@ -154,6 +166,7 @@ void setup_default_idt_entries(struct lg
> void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
> const unsigned long *def);
> void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
> +bool send_notify_to_eventfd(struct lg_cpu *cpu);
> void init_clockdev(struct lg_cpu *cpu);
> bool check_syscall_vector(struct lguest *lg);
> int init_interrupts(void);
> diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
> --- a/drivers/lguest/lguest_user.c
> +++ b/drivers/lguest/lguest_user.c
> @@ -7,6 +7,8 @@
> #include <linux/miscdevice.h>
> #include <linux/fs.h>
> #include <linux/sched.h>
> +#include <linux/eventfd.h>
> +#include <linux/file.h>
> #include "lg.h"
>
> /*L:055 When something happens, the Waker process needs a way to stop the
> @@ -35,6 +37,81 @@ static int break_guest_out(struct lg_cpu
> }
> }
>
> +bool send_notify_to_eventfd(struct lg_cpu *cpu)
> +{
> + unsigned int i;
> + struct lg_eventfd_map *map;
> +
> + /* lg->eventfds is RCU-protected */
> + rcu_read_lock();
> + map = rcu_dereference(cpu->lg->eventfds);
> + for (i = 0; i < map->num; i++) {
> + if (map->map[i].addr == cpu->pending_notify) {
> + eventfd_signal(map->map[i].event, 1);
> + cpu->pending_notify = 0;
> + break;
> + }
> + }
> + rcu_read_unlock();
> + return cpu->pending_notify == 0;
> +}
> +
> +static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
> +{
> + struct lg_eventfd_map *new, *old = lg->eventfds;
> +
> + if (!addr)
> + return -EINVAL;
> +
> + /* Replace the old array with the new one, carefully: others can
> + * be accessing it at the same time */
> + new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
> + GFP_KERNEL);
> + if (!new)
> + return -ENOMEM;
> +
> + /* First make identical copy. */
> + memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
> + new->num = old->num;
> +
> + /* Now append new entry. */
> + new->map[new->num].addr = addr;
> + new->map[new->num].event = eventfd_fget(fd);
> + if (IS_ERR(new->map[new->num].event)) {
> + kfree(new);
> + return PTR_ERR(new->map[new->num].event);
> + }
> + new->num++;
> +
> + /* Now put new one in place. */
> + rcu_assign_pointer(lg->eventfds, new);
> +
> + /* We're not in a big hurry. Wait until no one's looking at the old
> + * version, then delete it. */
> + synchronize_rcu();
> + kfree(old);
> +
> + return 0;
> +}
> +
> +static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
> +{
> + unsigned long addr, fd;
> + int err;
> +
> + if (get_user(addr, input) != 0)
> + return -EFAULT;
> + input++;
> + if (get_user(fd, input) != 0)
> + return -EFAULT;
> +
> + mutex_lock(&lguest_lock);
> + err = add_eventfd(lg, addr, fd);
> + mutex_unlock(&lguest_lock);
> +
> + return err;
> +}
> +
> /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
> * number to /dev/lguest. */
> static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
> @@ -184,6 +261,13 @@ static int initialize(struct file *file,
> goto unlock;
> }
>
> + lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
> + if (!lg->eventfds) {
> + err = -ENOMEM;
> + goto free_lg;
> + }
> + lg->eventfds->num = 0;
> +
> /* Populate the easy fields of our "struct lguest" */
> lg->mem_base = (void __user *)args[0];
> lg->pfn_limit = args[1];
> @@ -191,7 +275,7 @@ static int initialize(struct file *file,
> /* This is the first cpu (cpu 0) and it will start booting at args[2] */
> err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
> if (err)
> - goto release_guest;
> + goto free_eventfds;
>
> /* Initialize the Guest's shadow page tables, using the toplevel
> * address the Launcher gave us. This allocates memory, so can fail. */
> @@ -210,7 +294,9 @@ static int initialize(struct file *file,
> free_regs:
> /* FIXME: This should be in free_vcpu */
> free_page(lg->cpus[0].regs_page);
> -release_guest:
> +free_eventfds:
> + kfree(lg->eventfds);
> +free_lg:
> kfree(lg);
> unlock:
> mutex_unlock(&lguest_lock);
> @@ -260,6 +346,8 @@ static ssize_t write(struct file *file,
> return user_send_irq(cpu, input);
> case LHREQ_BREAK:
> return break_guest_out(cpu, input);
> + case LHREQ_EVENTFD:
> + return attach_eventfd(lg, input);
> default:
> return -EINVAL;
> }
> @@ -297,6 +385,12 @@ static int close(struct inode *inode, st
> * the Launcher's memory management structure. */
> mmput(lg->cpus[i].mm);
> }
> +
> + /* Release any eventfds they registered. */
> + for (i = 0; i < lg->eventfds->num; i++)
> + fput(lg->eventfds->map[i].event);
> + kfree(lg->eventfds);
> +
> /* If lg->dead doesn't contain an error code it will be NULL or a
> * kmalloc()ed string, either of which is ok to hand to kfree(). */
> if (!IS_ERR(lg->dead))
> diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
> --- a/include/linux/lguest_launcher.h
> +++ b/include/linux/lguest_launcher.h
> @@ -58,6 +58,7 @@ enum lguest_req
> LHREQ_GETDMA, /* No longer used */
> LHREQ_IRQ, /* + irq */
> LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
> + LHREQ_EVENTFD, /* + address, fd. */
> };
>
> /* The alignment to use between consumer and producer parts of vring.
>