2010-06-01 09:35:18

by Tejun Heo

Subject: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread

Replace vhost_workqueue with a per-vhost kthread. Other than the
callback argument changing from struct work_struct * to struct
vhost_work *, there's no visible change to the vhost_poll_*()
interface.

This conversion is to make each vhost use a dedicated kthread so that
resource control via cgroup can be applied.

Partially based on Sridhar Samudrala's patch.

* Updated to use the substructure vhost_work instead of directly using
vhost_poll, at Michael's suggestion.

* Added flusher wake_up() optimization at Michael's suggestion.

NOT_SIGNED_OFF_YET
Cc: Michael S. Tsirkin <[email protected]>
Cc: Sridhar Samudrala <[email protected]>
---
Okay, here's the updated version. I'm still in the process of setting
up the testing environment. I'll be traveling from this afternoon
till tomorrow, so I don't think I'll be able to test it before then.
If you can give it a shot, that would be great.

Thanks.
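
The only subtle part of the conversion is the flush protocol, so here
is a condensed restatement of its two sides (the same logic as the
patch below, repeated only to spell out the ordering; fields as
declared in vhost.h):

	/* Flusher side (vhost_poll_flush): snapshot the queue sequence
	 * and wait for the worker to catch up.  Signed subtraction
	 * keeps the comparison correct across queue_seq wraparound. */
	int seq = work->queue_seq;

	atomic_inc(&work->flushing);
	smp_mb__after_atomic_inc();	/* pairs with worker's smp_mb() */
	wait_event(work->done, seq - work->done_seq <= 0);
	atomic_dec(&work->flushing);

	/* Worker side (vhost_worker), after running work->fn(work):
	 * publish completion, then wake flushers only if one exists --
	 * this is the wake_up() optimization mentioned above.  The full
	 * barrier orders the done_seq store before the flushing load,
	 * so a flusher can't both miss the new done_seq and be missed
	 * by the atomic_read(). */
	smp_wmb();			/* order fn's writes before done_seq */
	work->done_seq = work->queue_seq;
	smp_mb();
	if (atomic_read(&work->flushing))
		wake_up_all(&work->done);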

drivers/vhost/net.c | 56 ++++++++++---------------
drivers/vhost/vhost.c | 110 ++++++++++++++++++++++++++++++++++++++------------
drivers/vhost/vhost.h | 38 +++++++++++------
3 files changed, 133 insertions(+), 71 deletions(-)

Index: work/drivers/vhost/net.c
===================================================================
--- work.orig/drivers/vhost/net.c
+++ work/drivers/vhost/net.c
@@ -294,54 +294,58 @@ static void handle_rx(struct vhost_net *
unuse_mm(net->dev.mm);
}

-static void handle_tx_kick(struct work_struct *work)
+static void handle_tx_kick(struct vhost_work *work)
{
- struct vhost_virtqueue *vq;
- struct vhost_net *net;
- vq = container_of(work, struct vhost_virtqueue, poll.work);
- net = container_of(vq->dev, struct vhost_net, dev);
+ struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+ poll.work);
+ struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
handle_tx(net);
}

-static void handle_rx_kick(struct work_struct *work)
+static void handle_rx_kick(struct vhost_work *work)
{
- struct vhost_virtqueue *vq;
- struct vhost_net *net;
- vq = container_of(work, struct vhost_virtqueue, poll.work);
- net = container_of(vq->dev, struct vhost_net, dev);
+ struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+ poll.work);
+ struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
handle_rx(net);
}

-static void handle_tx_net(struct work_struct *work)
+static void handle_tx_net(struct vhost_work *work)
{
- struct vhost_net *net;
- net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
+ struct vhost_net *net = container_of(work, struct vhost_net,
+ poll[VHOST_NET_VQ_TX].work);
handle_tx(net);
}

-static void handle_rx_net(struct work_struct *work)
+static void handle_rx_net(struct vhost_work *work)
{
- struct vhost_net *net;
- net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
+ struct vhost_net *net = container_of(work, struct vhost_net,
+ poll[VHOST_NET_VQ_RX].work);
handle_rx(net);
}

static int vhost_net_open(struct inode *inode, struct file *f)
{
struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
+ struct vhost_dev *dev;
int r;
+
if (!n)
return -ENOMEM;
+
+ dev = &n->dev;
n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
- r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
+ r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
if (r < 0) {
kfree(n);
return r;
}

- vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
- vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
+ vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+ vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
n->tx_poll_state = VHOST_NET_POLL_DISABLED;

f->private_data = n;
@@ -644,25 +648,13 @@ static struct miscdevice vhost_net_misc

static int vhost_net_init(void)
{
- int r = vhost_init();
- if (r)
- goto err_init;
- r = misc_register(&vhost_net_misc);
- if (r)
- goto err_reg;
- return 0;
-err_reg:
- vhost_cleanup();
-err_init:
- return r;
-
+ return misc_register(&vhost_net_misc);
}
module_init(vhost_net_init);

static void vhost_net_exit(void)
{
misc_deregister(&vhost_net_misc);
- vhost_cleanup();
}
module_exit(vhost_net_exit);

Index: work/drivers/vhost/vhost.c
===================================================================
--- work.orig/drivers/vhost/vhost.c
+++ work/drivers/vhost/vhost.c
@@ -17,12 +17,12 @@
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
-#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/slab.h>
+#include <linux/kthread.h>

#include <linux/net.h>
#include <linux/if_packet.h>
@@ -37,8 +37,6 @@ enum {
VHOST_MEMORY_F_LOG = 0x1,
};

-static struct workqueue_struct *vhost_workqueue;
-
static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
@@ -52,23 +50,31 @@ static void vhost_poll_func(struct file
static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
void *key)
{
- struct vhost_poll *poll;
- poll = container_of(wait, struct vhost_poll, wait);
+ struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
+
if (!((unsigned long)key & poll->mask))
return 0;

- queue_work(vhost_workqueue, &poll->work);
+ vhost_poll_queue(poll);
return 0;
}

/* Init poll structure */
-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
- unsigned long mask)
+void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
+ unsigned long mask, struct vhost_dev *dev)
{
- INIT_WORK(&poll->work, func);
+ struct vhost_work *work = &poll->work;
+
init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
init_poll_funcptr(&poll->table, vhost_poll_func);
poll->mask = mask;
+ poll->dev = dev;
+
+ INIT_LIST_HEAD(&work->node);
+ work->fn = fn;
+ init_waitqueue_head(&work->done);
+ atomic_set(&work->flushing, 0);
+ work->queue_seq = work->done_seq = 0;
}

/* Start polling a file. We add ourselves to file's wait queue. The caller must
@@ -92,12 +98,28 @@ void vhost_poll_stop(struct vhost_poll *
* locks that are also used by the callback. */
void vhost_poll_flush(struct vhost_poll *poll)
{
- flush_work(&poll->work);
+ struct vhost_work *work = &poll->work;
+ int seq = work->queue_seq;
+
+ atomic_inc(&work->flushing);
+ smp_mb__after_atomic_inc(); /* mb flush-b0 paired with worker-b1 */
+ wait_event(work->done, seq - work->done_seq <= 0);
+ atomic_dec(&work->flushing);
+ smp_mb__after_atomic_dec(); /* rmb flush-b1 paired with worker-b0 */
}

void vhost_poll_queue(struct vhost_poll *poll)
{
- queue_work(vhost_workqueue, &poll->work);
+ struct vhost_dev *dev = poll->dev;
+ struct vhost_work *work = &poll->work;
+
+ spin_lock(&dev->work_lock);
+ if (list_empty(&work->node)) {
+ list_add_tail(&work->node, &dev->work_list);
+ work->queue_seq++;
+ wake_up_process(dev->worker);
+ }
+ spin_unlock(&dev->work_lock);
}

static void vhost_vq_reset(struct vhost_dev *dev,
@@ -125,10 +147,52 @@ static void vhost_vq_reset(struct vhost_
vq->log_ctx = NULL;
}

+static int vhost_worker(void *data)
+{
+ struct vhost_dev *dev = data;
+ struct vhost_work *work;
+
+repeat:
+ set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ work = NULL;
+ spin_lock(&dev->work_lock);
+ if (!list_empty(&dev->work_list)) {
+ work = list_first_entry(&dev->work_list,
+ struct vhost_work, node);
+ list_del_init(&work->node);
+ }
+ spin_unlock(&dev->work_lock);
+
+ if (work) {
+ __set_current_state(TASK_RUNNING);
+ work->fn(work);
+ smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
+ work->done_seq = work->queue_seq;
+ smp_mb(); /* mb worker-b1 paired with flush-b0 */
+ if (atomic_read(&work->flushing))
+ wake_up_all(&work->done);
+ } else
+ schedule();
+
+ goto repeat;
+}
+
long vhost_dev_init(struct vhost_dev *dev,
struct vhost_virtqueue *vqs, int nvqs)
{
+ struct task_struct *worker;
int i;
+
+ worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
+ if (IS_ERR(worker))
+ return PTR_ERR(worker);
+
dev->vqs = vqs;
dev->nvqs = nvqs;
mutex_init(&dev->mutex);
@@ -136,6 +200,9 @@ long vhost_dev_init(struct vhost_dev *de
dev->log_file = NULL;
dev->memory = NULL;
dev->mm = NULL;
+ spin_lock_init(&dev->work_lock);
+ INIT_LIST_HEAD(&dev->work_list);
+ dev->worker = worker;

for (i = 0; i < dev->nvqs; ++i) {
dev->vqs[i].dev = dev;
@@ -143,9 +210,10 @@ long vhost_dev_init(struct vhost_dev *de
vhost_vq_reset(dev, dev->vqs + i);
if (dev->vqs[i].handle_kick)
vhost_poll_init(&dev->vqs[i].poll,
- dev->vqs[i].handle_kick,
- POLLIN);
+ dev->vqs[i].handle_kick, POLLIN, dev);
}
+
+ wake_up_process(worker); /* avoid contributing to loadavg */
return 0;
}

@@ -217,6 +285,9 @@ void vhost_dev_cleanup(struct vhost_dev
if (dev->mm)
mmput(dev->mm);
dev->mm = NULL;
+
+ WARN_ON(!list_empty(&dev->work_list));
+ kthread_stop(dev->worker);
}

static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
@@ -1113,16 +1184,3 @@ void vhost_disable_notify(struct vhost_v
vq_err(vq, "Failed to enable notification at %p: %d\n",
&vq->used->flags, r);
}
-
-int vhost_init(void)
-{
- vhost_workqueue = create_singlethread_workqueue("vhost");
- if (!vhost_workqueue)
- return -ENOMEM;
- return 0;
-}
-
-void vhost_cleanup(void)
-{
- destroy_workqueue(vhost_workqueue);
-}
Index: work/drivers/vhost/vhost.h
===================================================================
--- work.orig/drivers/vhost/vhost.h
+++ work/drivers/vhost/vhost.h
@@ -5,13 +5,13 @@
#include <linux/vhost.h>
#include <linux/mm.h>
#include <linux/mutex.h>
-#include <linux/workqueue.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/skbuff.h>
#include <linux/uio.h>
#include <linux/virtio_config.h>
#include <linux/virtio_ring.h>
+#include <asm/atomic.h>

struct vhost_device;

@@ -20,19 +20,31 @@ enum {
VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
};

+struct vhost_work;
+typedef void (*vhost_work_fn_t)(struct vhost_work *work);
+
+struct vhost_work {
+ struct list_head node;
+ vhost_work_fn_t fn;
+ wait_queue_head_t done;
+ atomic_t flushing;
+ int queue_seq;
+ int done_seq;
+};
+
/* Poll a file (eventfd or socket) */
/* Note: there's nothing vhost specific about this structure. */
struct vhost_poll {
poll_table table;
wait_queue_head_t *wqh;
wait_queue_t wait;
- /* struct which will handle all actual work. */
- struct work_struct work;
+ struct vhost_work work;
unsigned long mask;
+ struct vhost_dev *dev;
};

-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
- unsigned long mask);
+void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
+ unsigned long mask, struct vhost_dev *dev);
void vhost_poll_start(struct vhost_poll *poll, struct file *file);
void vhost_poll_stop(struct vhost_poll *poll);
void vhost_poll_flush(struct vhost_poll *poll);
@@ -63,7 +75,7 @@ struct vhost_virtqueue {
struct vhost_poll poll;

/* The routine to call when the Guest pings us, or timeout. */
- work_func_t handle_kick;
+ vhost_work_fn_t handle_kick;

/* Last available index we saw. */
u16 last_avail_idx;
@@ -86,11 +98,11 @@ struct vhost_virtqueue {
struct iovec hdr[VHOST_NET_MAX_SG];
size_t hdr_size;
/* We use a kind of RCU to access private pointer.
- * All readers access it from workqueue, which makes it possible to
- * flush the workqueue instead of synchronize_rcu. Therefore readers do
+ * All readers access it from worker, which makes it possible to
+ * flush the vhost_work instead of synchronize_rcu. Therefore readers do
* not need to call rcu_read_lock/rcu_read_unlock: the beginning of
- * work item execution acts instead of rcu_read_lock() and the end of
- * work item execution acts instead of rcu_read_lock().
+ * vhost_work execution acts instead of rcu_read_lock() and the end of
+ * vhost_work execution acts instead of rcu_read_lock().
* Writers use virtqueue mutex. */
void *private_data;
/* Log write descriptors */
@@ -110,6 +122,9 @@ struct vhost_dev {
int nvqs;
struct file *log_file;
struct eventfd_ctx *log_ctx;
+ spinlock_t work_lock;
+ struct list_head work_list;
+ struct task_struct *worker;
};

long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
@@ -136,9 +151,6 @@ bool vhost_enable_notify(struct vhost_vi
int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
unsigned int log_num, u64 len);

-int vhost_init(void);
-void vhost_cleanup(void);
-
#define vq_err(vq, fmt, ...) do { \
pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
if ((vq)->error_ctx) \


2010-06-01 14:15:04

by Paul E. McKenney

Subject: Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Mon, May 31, 2010 at 06:22:21PM +0300, Michael S. Tsirkin wrote:
> On Sun, May 30, 2010 at 10:24:01PM +0200, Tejun Heo wrote:
> > Replace vhost_workqueue with per-vhost kthread. Other than callback
> > argument change from struct work_struct * to struct vhost_poll *,
> > there's no visible change to vhost_poll_*() interface.
>
> I would prefer a substructure vhost_work, even just to make
> the code easier to review and compare to workqueue.c.

Either way this plays out, the rcu_dereference_check() calls will need
to be adjusted to reflect the change.
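
For example (a hypothetical sketch, not from the patch -- the exact
lockdep condition depends on which structure ends up carrying the
worker task), the vq->private_data readers might become something
like:

	/* hypothetical helper, for illustration only */
	static inline void *vhost_vq_get_private(struct vhost_virtqueue *vq)
	{
		return rcu_dereference_check(vq->private_data,
					     lockdep_is_held(&vq->mutex) ||
					     vq->dev->worker == current);
	}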

Thanx, Paul

> > This conversion is to make each vhost use a dedicated kthread so that
> > resource control via cgroup can be applied.
> >
> > Partially based on Sridhar Samudrala's patch.
> >
> > Cc: Michael S. Tsirkin <[email protected]>
> > Cc: Sridhar Samudrala <[email protected]>
> > ---
> > Okay, here is a three-patch series to convert vhost to use a per-vhost
> > kthread, add cgroup_attach_task_current_cg() and apply it to the vhost
> > kthreads. The conversion is mostly straightforward although flush is
> > slightly tricky.
> >
> > The problem is that I have no idea how to test this.
>
> It's a 3-step process:
>
> 1.
> Install qemu-kvm under fc13, or build recent one from source,
> get it from here:
> git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git
>
> 2. install guest under it:
> qemu-img create -f qcow2 disk.qcow2 100G
> Now get some image (e.g. fedora 13 in fc13.iso)
> and install guest:
> qemu-kvm -enable-kvm -m 1G -cdrom fc13.iso -drive file=disk.qcow2
>
>
> 3. set up networking. I usually simply do host to guest
> on a special subnet for testing purposes:
>
> Set up a bridge named mstbr0:
>
> ifconfig mstbr0 down
> brctl delbr mstbr0
> brctl addbr mstbr0
> brctl setfd mstbr0 0
> ifconfig mstbr0 11.0.0.1
>
> cat > ifup << EOF
> #!/bin/sh -x
> /sbin/ifconfig msttap0 0.0.0.0 up
> brctl addif mstbr0 msttap0
> EOF
>
>
> qemu-kvm -enable-kvm -m 1G -cdrom fc13.iso -drive file=disk.qcow2
> -net nic,model=virtio,netdev=foo -netdev
> tap,id=foo,ifname=msttap0,script=/home/mst/ifup,downscript=no,vhost=on
>
> after you set up the guest, log into it and
> ifconfig eth0 11.0.0.2
>
> You should now be able to ping guest to host and back.
> Use something like netperf to stress the connection.
> Close qemu with kill -9 and unload module to test flushing code.
>
>
>
> > Index: work/drivers/vhost/vhost.c
> > ===================================================================
> > --- work.orig/drivers/vhost/vhost.c
> > +++ work/drivers/vhost/vhost.c
>
> ...
>
> > @@ -125,10 +139,50 @@ static void vhost_vq_reset(struct vhost_
> > vq->log_ctx = NULL;
> > }
> >
> > +static int vhost_poller(void *data)
> > +{
> > + struct vhost_dev *dev = data;
> > + struct vhost_poll *poll;
> > +
> > +repeat:
> > + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
> > +
> > + if (kthread_should_stop()) {
> > + __set_current_state(TASK_RUNNING);
> > + return 0;
> > + }
> > +
> > + poll = NULL;
> > + spin_lock(&dev->poller_lock);
> > + if (!list_empty(&dev->poll_list)) {
> > + poll = list_first_entry(&dev->poll_list,
> > + struct vhost_poll, node);
> > + list_del_init(&poll->node);
> > + }
> > + spin_unlock(&dev->poller_lock);
> > +
> > + if (poll) {
> > + __set_current_state(TASK_RUNNING);
> > + poll->fn(poll);
> > + smp_wmb(); /* paired with rmb in vhost_poll_flush() */
> > + poll->done_seq = poll->queue_seq;
> > + wake_up_all(&poll->done);
>
>
> This seems to add wakeups on the data path, which uses spinlocks etc.
> OTOH workqueue.c adds a special barrier
> entry which only does a wakeup when needed.
> Right?
>
> > + } else
> > + schedule();
> > +
> > + goto repeat;
> > +}
> > +
> > long vhost_dev_init(struct vhost_dev *dev,
> > struct vhost_virtqueue *vqs, int nvqs)
> > {
> > + struct task_struct *poller;
> > int i;
> > +
> > + poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
> > + if (IS_ERR(poller))
> > + return PTR_ERR(poller);
> > +
> > dev->vqs = vqs;
> > dev->nvqs = nvqs;
> > mutex_init(&dev->mutex);
> > @@ -136,6 +190,9 @@ long vhost_dev_init(struct vhost_dev *de
> > dev->log_file = NULL;
> > dev->memory = NULL;
> > dev->mm = NULL;
> > + spin_lock_init(&dev->poller_lock);
> > + INIT_LIST_HEAD(&dev->poll_list);
> > + dev->poller = poller;
> >
> > for (i = 0; i < dev->nvqs; ++i) {
> > dev->vqs[i].dev = dev;
> > @@ -143,8 +200,7 @@ long vhost_dev_init(struct vhost_dev *de
> > vhost_vq_reset(dev, dev->vqs + i);
> > if (dev->vqs[i].handle_kick)
> > vhost_poll_init(&dev->vqs[i].poll,
> > - dev->vqs[i].handle_kick,
> > - POLLIN);
> > + dev->vqs[i].handle_kick, POLLIN, dev);
> > }
> > return 0;
> > }
> > @@ -217,6 +273,8 @@ void vhost_dev_cleanup(struct vhost_dev
> > if (dev->mm)
> > mmput(dev->mm);
> > dev->mm = NULL;
> > +
> > + kthread_stop(dev->poller);
> > }
> >
> > static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
> > @@ -1113,16 +1171,3 @@ void vhost_disable_notify(struct vhost_v
> > vq_err(vq, "Failed to enable notification at %p: %d\n",
> > &vq->used->flags, r);
> > }
> > -
> > -int vhost_init(void)
> > -{
> > - vhost_workqueue = create_singlethread_workqueue("vhost");
> > - if (!vhost_workqueue)
> > - return -ENOMEM;
> > - return 0;
> > -}
> > -
> > -void vhost_cleanup(void)
> > -{
> > - destroy_workqueue(vhost_workqueue);
>
> I note that destroy_workqueue does a flush while kthread_stop
> doesn't. Right? Are we sure we don't need to check that nothing is in
> one of the lists? Maybe add a BUG_ON?
>
> > -}
> > Index: work/drivers/vhost/vhost.h
> > ===================================================================
> > --- work.orig/drivers/vhost/vhost.h
> > +++ work/drivers/vhost/vhost.h
> > @@ -5,7 +5,6 @@
> > #include <linux/vhost.h>
> > #include <linux/mm.h>
> > #include <linux/mutex.h>
> > -#include <linux/workqueue.h>
> > #include <linux/poll.h>
> > #include <linux/file.h>
> > #include <linux/skbuff.h>
> > @@ -20,19 +19,26 @@ enum {
> > VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
> > };
> >
> > +struct vhost_poll;
> > +typedef void (*vhost_poll_fn_t)(struct vhost_poll *poll);
> > +
> > /* Poll a file (eventfd or socket) */
> > /* Note: there's nothing vhost specific about this structure. */
> > struct vhost_poll {
> > + vhost_poll_fn_t fn;
> > poll_table table;
> > wait_queue_head_t *wqh;
> > wait_queue_t wait;
> > - /* struct which will handle all actual work. */
> > - struct work_struct work;
> > + struct list_head node;
> > + wait_queue_head_t done;
> > unsigned long mask;
> > + struct vhost_dev *dev;
> > + int queue_seq;
> > + int done_seq;
> > };
> >
> > -void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
> > - unsigned long mask);
> > +void vhost_poll_init(struct vhost_poll *poll, vhost_poll_fn_t fn,
> > + unsigned long mask, struct vhost_dev *dev);
> > void vhost_poll_start(struct vhost_poll *poll, struct file *file);
> > void vhost_poll_stop(struct vhost_poll *poll);
> > void vhost_poll_flush(struct vhost_poll *poll);
> > @@ -63,7 +69,7 @@ struct vhost_virtqueue {
> > struct vhost_poll poll;
> >
> > /* The routine to call when the Guest pings us, or timeout. */
> > - work_func_t handle_kick;
> > + vhost_poll_fn_t handle_kick;
> >
> > /* Last available index we saw. */
> > u16 last_avail_idx;
> > @@ -86,11 +92,11 @@ struct vhost_virtqueue {
> > struct iovec hdr[VHOST_NET_MAX_SG];
> > size_t hdr_size;
> > /* We use a kind of RCU to access private pointer.
> > - * All readers access it from workqueue, which makes it possible to
> > - * flush the workqueue instead of synchronize_rcu. Therefore readers do
> > + * All readers access it from poller, which makes it possible to
> > + * flush the vhost_poll instead of synchronize_rcu. Therefore readers do
> > * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
> > - * work item execution acts instead of rcu_read_lock() and the end of
> > - * work item execution acts instead of rcu_read_lock().
> > + * vhost_poll execution acts instead of rcu_read_lock() and the end of
> > + * vhost_poll execution acts instead of rcu_read_lock().
> > * Writers use virtqueue mutex. */
> > void *private_data;
> > /* Log write descriptors */
> > @@ -110,6 +116,9 @@ struct vhost_dev {
> > int nvqs;
> > struct file *log_file;
> > struct eventfd_ctx *log_ctx;
> > + spinlock_t poller_lock;
> > + struct list_head poll_list;
> > + struct task_struct *poller;
> > };
> >
> > long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
> > @@ -136,9 +145,6 @@ bool vhost_enable_notify(struct vhost_vi
> > int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
> > unsigned int log_num, u64 len);
> >
> > -int vhost_init(void);
> > -void vhost_cleanup(void);
> > -
> > #define vq_err(vq, fmt, ...) do { \
> > pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
> > if ((vq)->error_ctx) \

2010-06-02 18:41:16

by Tejun Heo

Subject: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

Replace vhost_workqueue with a per-vhost kthread. Other than the
callback argument changing from struct work_struct * to struct
vhost_work *, there's no visible change to the vhost_poll_*()
interface.

This conversion is to make each vhost use a dedicated kthread so that
resource control via cgroup can be applied.

Partially based on Sridhar Samudrala's patch.

* Updated to use the substructure vhost_work instead of directly using
vhost_poll, at Michael's suggestion.

* Added flusher wake_up() optimization at Michael's suggestion.

Signed-off-by: Tejun Heo <[email protected]>
Cc: Michael S. Tsirkin <[email protected]>
Cc: Sridhar Samudrala <[email protected]>
---
Okay, just tested it. dev->work_lock had to be updated to use irq
operations, but other than that it worked just fine. I copied a large
file using scp and it seems to perform pretty well, although I don't
have a reference point for comparison. So, here's the updated version
with the sign-off.
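
(Why irq operations: this is a reconstruction from the code above, not
anything new in the patch. vhost_poll_queue() can be called through
vhost_poll_wakeup() from the socket's wait queue, i.e. from softirq
context on the network receive path, while vhost_worker() takes the
same lock in process context:

	/* softirq: sk_data_ready -> wake_up -> vhost_poll_wakeup() */
	spin_lock_irqsave(&dev->work_lock, flags);

	/* process context: vhost_worker() popping the next work item */
	spin_lock_irq(&dev->work_lock);

Without disabling interrupts on the worker side, a wakeup arriving on
the same CPU while the lock is held would deadlock.)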

Thanks.

drivers/vhost/net.c | 56 ++++++++++---------------
drivers/vhost/vhost.c | 111 ++++++++++++++++++++++++++++++++++++++------------
drivers/vhost/vhost.h | 38 +++++++++++------
3 files changed, 134 insertions(+), 71 deletions(-)

Index: work/drivers/vhost/net.c
===================================================================
--- work.orig/drivers/vhost/net.c
+++ work/drivers/vhost/net.c
@@ -294,54 +294,58 @@ static void handle_rx(struct vhost_net *
unuse_mm(net->dev.mm);
}

-static void handle_tx_kick(struct work_struct *work)
+static void handle_tx_kick(struct vhost_work *work)
{
- struct vhost_virtqueue *vq;
- struct vhost_net *net;
- vq = container_of(work, struct vhost_virtqueue, poll.work);
- net = container_of(vq->dev, struct vhost_net, dev);
+ struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+ poll.work);
+ struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
handle_tx(net);
}

-static void handle_rx_kick(struct work_struct *work)
+static void handle_rx_kick(struct vhost_work *work)
{
- struct vhost_virtqueue *vq;
- struct vhost_net *net;
- vq = container_of(work, struct vhost_virtqueue, poll.work);
- net = container_of(vq->dev, struct vhost_net, dev);
+ struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+ poll.work);
+ struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
handle_rx(net);
}

-static void handle_tx_net(struct work_struct *work)
+static void handle_tx_net(struct vhost_work *work)
{
- struct vhost_net *net;
- net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
+ struct vhost_net *net = container_of(work, struct vhost_net,
+ poll[VHOST_NET_VQ_TX].work);
handle_tx(net);
}

-static void handle_rx_net(struct work_struct *work)
+static void handle_rx_net(struct vhost_work *work)
{
- struct vhost_net *net;
- net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
+ struct vhost_net *net = container_of(work, struct vhost_net,
+ poll[VHOST_NET_VQ_RX].work);
handle_rx(net);
}

static int vhost_net_open(struct inode *inode, struct file *f)
{
struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
+ struct vhost_dev *dev;
int r;
+
if (!n)
return -ENOMEM;
+
+ dev = &n->dev;
n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
- r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
+ r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
if (r < 0) {
kfree(n);
return r;
}

- vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
- vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
+ vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+ vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
n->tx_poll_state = VHOST_NET_POLL_DISABLED;

f->private_data = n;
@@ -644,25 +648,13 @@ static struct miscdevice vhost_net_misc

static int vhost_net_init(void)
{
- int r = vhost_init();
- if (r)
- goto err_init;
- r = misc_register(&vhost_net_misc);
- if (r)
- goto err_reg;
- return 0;
-err_reg:
- vhost_cleanup();
-err_init:
- return r;
-
+ return misc_register(&vhost_net_misc);
}
module_init(vhost_net_init);

static void vhost_net_exit(void)
{
misc_deregister(&vhost_net_misc);
- vhost_cleanup();
}
module_exit(vhost_net_exit);

Index: work/drivers/vhost/vhost.c
===================================================================
--- work.orig/drivers/vhost/vhost.c
+++ work/drivers/vhost/vhost.c
@@ -17,12 +17,12 @@
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
-#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/slab.h>
+#include <linux/kthread.h>

#include <linux/net.h>
#include <linux/if_packet.h>
@@ -37,8 +37,6 @@ enum {
VHOST_MEMORY_F_LOG = 0x1,
};

-static struct workqueue_struct *vhost_workqueue;
-
static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
@@ -52,23 +50,31 @@ static void vhost_poll_func(struct file
static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
void *key)
{
- struct vhost_poll *poll;
- poll = container_of(wait, struct vhost_poll, wait);
+ struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
+
if (!((unsigned long)key & poll->mask))
return 0;

- queue_work(vhost_workqueue, &poll->work);
+ vhost_poll_queue(poll);
return 0;
}

/* Init poll structure */
-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
- unsigned long mask)
+void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
+ unsigned long mask, struct vhost_dev *dev)
{
- INIT_WORK(&poll->work, func);
+ struct vhost_work *work = &poll->work;
+
init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
init_poll_funcptr(&poll->table, vhost_poll_func);
poll->mask = mask;
+ poll->dev = dev;
+
+ INIT_LIST_HEAD(&work->node);
+ work->fn = fn;
+ init_waitqueue_head(&work->done);
+ atomic_set(&work->flushing, 0);
+ work->queue_seq = work->done_seq = 0;
}

/* Start polling a file. We add ourselves to file's wait queue. The caller must
@@ -92,12 +98,29 @@ void vhost_poll_stop(struct vhost_poll *
* locks that are also used by the callback. */
void vhost_poll_flush(struct vhost_poll *poll)
{
- flush_work(&poll->work);
+ struct vhost_work *work = &poll->work;
+ int seq = work->queue_seq;
+
+ atomic_inc(&work->flushing);
+ smp_mb__after_atomic_inc(); /* mb flush-b0 paired with worker-b1 */
+ wait_event(work->done, seq - work->done_seq <= 0);
+ atomic_dec(&work->flushing);
+ smp_mb__after_atomic_dec(); /* rmb flush-b1 paired with worker-b0 */
}

void vhost_poll_queue(struct vhost_poll *poll)
{
- queue_work(vhost_workqueue, &poll->work);
+ struct vhost_dev *dev = poll->dev;
+ struct vhost_work *work = &poll->work;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->work_lock, flags);
+ if (list_empty(&work->node)) {
+ list_add_tail(&work->node, &dev->work_list);
+ work->queue_seq++;
+ wake_up_process(dev->worker);
+ }
+ spin_unlock_irqrestore(&dev->work_lock, flags);
}

static void vhost_vq_reset(struct vhost_dev *dev,
@@ -125,10 +148,52 @@ static void vhost_vq_reset(struct vhost_
vq->log_ctx = NULL;
}

+static int vhost_worker(void *data)
+{
+ struct vhost_dev *dev = data;
+ struct vhost_work *work;
+
+repeat:
+ set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ work = NULL;
+ spin_lock_irq(&dev->work_lock);
+ if (!list_empty(&dev->work_list)) {
+ work = list_first_entry(&dev->work_list,
+ struct vhost_work, node);
+ list_del_init(&work->node);
+ }
+ spin_unlock_irq(&dev->work_lock);
+
+ if (work) {
+ __set_current_state(TASK_RUNNING);
+ work->fn(work);
+ smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
+ work->done_seq = work->queue_seq;
+ smp_mb(); /* mb worker-b1 paired with flush-b0 */
+ if (atomic_read(&work->flushing))
+ wake_up_all(&work->done);
+ } else
+ schedule();
+
+ goto repeat;
+}
+
long vhost_dev_init(struct vhost_dev *dev,
struct vhost_virtqueue *vqs, int nvqs)
{
+ struct task_struct *worker;
int i;
+
+ worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
+ if (IS_ERR(worker))
+ return PTR_ERR(worker);
+
dev->vqs = vqs;
dev->nvqs = nvqs;
mutex_init(&dev->mutex);
@@ -136,6 +201,9 @@ long vhost_dev_init(struct vhost_dev *de
dev->log_file = NULL;
dev->memory = NULL;
dev->mm = NULL;
+ spin_lock_init(&dev->work_lock);
+ INIT_LIST_HEAD(&dev->work_list);
+ dev->worker = worker;

for (i = 0; i < dev->nvqs; ++i) {
dev->vqs[i].dev = dev;
@@ -143,9 +211,10 @@ long vhost_dev_init(struct vhost_dev *de
vhost_vq_reset(dev, dev->vqs + i);
if (dev->vqs[i].handle_kick)
vhost_poll_init(&dev->vqs[i].poll,
- dev->vqs[i].handle_kick,
- POLLIN);
+ dev->vqs[i].handle_kick, POLLIN, dev);
}
+
+ wake_up_process(worker); /* avoid contributing to loadavg */
return 0;
}

@@ -217,6 +286,9 @@ void vhost_dev_cleanup(struct vhost_dev
if (dev->mm)
mmput(dev->mm);
dev->mm = NULL;
+
+ WARN_ON(!list_empty(&dev->work_list));
+ kthread_stop(dev->worker);
}

static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
@@ -1113,16 +1185,3 @@ void vhost_disable_notify(struct vhost_v
vq_err(vq, "Failed to enable notification at %p: %d\n",
&vq->used->flags, r);
}
-
-int vhost_init(void)
-{
- vhost_workqueue = create_singlethread_workqueue("vhost");
- if (!vhost_workqueue)
- return -ENOMEM;
- return 0;
-}
-
-void vhost_cleanup(void)
-{
- destroy_workqueue(vhost_workqueue);
-}
Index: work/drivers/vhost/vhost.h
===================================================================
--- work.orig/drivers/vhost/vhost.h
+++ work/drivers/vhost/vhost.h
@@ -5,13 +5,13 @@
#include <linux/vhost.h>
#include <linux/mm.h>
#include <linux/mutex.h>
-#include <linux/workqueue.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/skbuff.h>
#include <linux/uio.h>
#include <linux/virtio_config.h>
#include <linux/virtio_ring.h>
+#include <asm/atomic.h>

struct vhost_device;

@@ -20,19 +20,31 @@ enum {
VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
};

+struct vhost_work;
+typedef void (*vhost_work_fn_t)(struct vhost_work *work);
+
+struct vhost_work {
+ struct list_head node;
+ vhost_work_fn_t fn;
+ wait_queue_head_t done;
+ atomic_t flushing;
+ int queue_seq;
+ int done_seq;
+};
+
/* Poll a file (eventfd or socket) */
/* Note: there's nothing vhost specific about this structure. */
struct vhost_poll {
poll_table table;
wait_queue_head_t *wqh;
wait_queue_t wait;
- /* struct which will handle all actual work. */
- struct work_struct work;
+ struct vhost_work work;
unsigned long mask;
+ struct vhost_dev *dev;
};

-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
- unsigned long mask);
+void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
+ unsigned long mask, struct vhost_dev *dev);
void vhost_poll_start(struct vhost_poll *poll, struct file *file);
void vhost_poll_stop(struct vhost_poll *poll);
void vhost_poll_flush(struct vhost_poll *poll);
@@ -63,7 +75,7 @@ struct vhost_virtqueue {
struct vhost_poll poll;

/* The routine to call when the Guest pings us, or timeout. */
- work_func_t handle_kick;
+ vhost_work_fn_t handle_kick;

/* Last available index we saw. */
u16 last_avail_idx;
@@ -86,11 +98,11 @@ struct vhost_virtqueue {
struct iovec hdr[VHOST_NET_MAX_SG];
size_t hdr_size;
/* We use a kind of RCU to access private pointer.
- * All readers access it from workqueue, which makes it possible to
- * flush the workqueue instead of synchronize_rcu. Therefore readers do
+ * All readers access it from worker, which makes it possible to
+ * flush the vhost_work instead of synchronize_rcu. Therefore readers do
* not need to call rcu_read_lock/rcu_read_unlock: the beginning of
- * work item execution acts instead of rcu_read_lock() and the end of
- * work item execution acts instead of rcu_read_lock().
+ * vhost_work execution acts instead of rcu_read_lock() and the end of
+ * vhost_work execution acts instead of rcu_read_lock().
* Writers use virtqueue mutex. */
void *private_data;
/* Log write descriptors */
@@ -110,6 +122,9 @@ struct vhost_dev {
int nvqs;
struct file *log_file;
struct eventfd_ctx *log_ctx;
+ spinlock_t work_lock;
+ struct list_head work_list;
+ struct task_struct *worker;
};

long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
@@ -136,9 +151,6 @@ bool vhost_enable_notify(struct vhost_vi
int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
unsigned int log_num, u64 len);

-int vhost_init(void);
-void vhost_cleanup(void);
-
#define vq_err(vq, fmt, ...) do { \
pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
if ((vq)->error_ctx) \

2010-06-02 21:35:18

by Sridhar Samudrala

Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On 6/2/2010 11:40 AM, Tejun Heo wrote:
> Replace vhost_workqueue with per-vhost kthread. Other than callback
> argument change from struct work_struct * to struct vhost_work *,
> there's no visible change to vhost_poll_*() interface.
>
> This conversion is to make each vhost use a dedicated kthread so that
> resource control via cgroup can be applied.
>
> Partially based on Sridhar Samudrala's patch.
>
> * Updated to use sub structure vhost_work instead of directly using
> vhost_poll at Michael's suggestion.
>
> * Added flusher wake_up() optimization at Michael's suggestion.
>
> Signed-off-by: Tejun Heo<[email protected]>
> Cc: Michael S. Tsirkin<[email protected]>
> Cc: Sridhar Samudrala<[email protected]>
> ---
> Okay, just tested it. dev->work_lock had to be updated to use irq
> operations, but other than that it worked just fine. I copied a large
> file using scp and it seems to perform pretty well, although I don't
> have a reference point for comparison. So, here's the updated version
> with the sign-off.
>

I tested this with 4 VMs running netperf TCP stream tests from guest to
host, and I am seeing a similar level of throughput scalability to what
I saw with the multi-thread workqueue patch.

11200Mb/s - default (host cpu utilization: 40%)
21600Mb/s - multi-thread (host cpu utilization: 86%)

Signed-off-by: Sridhar Samudrala <[email protected]>

Thanks
Sridhar

2010-07-22 16:04:35

by Michael S. Tsirkin

Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Wed, Jun 02, 2010 at 08:40:00PM +0200, Tejun Heo wrote:
> Replace vhost_workqueue with per-vhost kthread. Other than callback
> argument change from struct work_struct * to struct vhost_work *,
> there's no visible change to vhost_poll_*() interface.
>
> This conversion is to make each vhost use a dedicated kthread so that
> resource control via cgroup can be applied.
>
> Partially based on Sridhar Samudrala's patch.
>
> * Updated to use sub structure vhost_work instead of directly using
> vhost_poll at Michael's suggestion.
>
> * Added flusher wake_up() optimization at Michael's suggestion.
>
> Signed-off-by: Tejun Heo <[email protected]>
> Cc: Michael S. Tsirkin <[email protected]>
> Cc: Sridhar Samudrala <[email protected]>

All the tricky barrier pairing made me uncomfortable. So I came up with
this on top (untested): if we do all operations under the spinlock, we
can get by without barriers and atomics. And since we need the lock for
list operations anyway, this should have no performance impact.

What do you think?
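
A note on the comparison used below: with done_seq and seq unsigned,
subtraction is modular, so the "has done_seq reached seq" test is
wraparound-safe. A minimal sketch of the invariant (illustration only,
not part of the patch):

	/* True exactly when done_seq has reached or passed seq, even
	 * across the 0xffffffff -> 0 wrap, as long as the two never
	 * drift more than UINT_MAX/2 apart. */
	static inline bool flush_done(unsigned done_seq, unsigned seq)
	{
		return done_seq - seq < UINT_MAX / 2;
	}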


diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 0c6b533..7730a30 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -73,7 +73,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
INIT_LIST_HEAD(&work->node);
work->fn = fn;
init_waitqueue_head(&work->done);
- atomic_set(&work->flushing, 0);
+ work->flushing = 0;
work->queue_seq = work->done_seq = 0;
}

@@ -99,13 +99,23 @@ void vhost_poll_stop(struct vhost_poll *poll)
void vhost_poll_flush(struct vhost_poll *poll)
{
struct vhost_work *work = &poll->work;
- int seq = work->queue_seq;
+ unsigned seq, left;
+ int flushing;

- atomic_inc(&work->flushing);
- smp_mb__after_atomic_inc(); /* mb flush-b0 paired with worker-b1 */
- wait_event(work->done, seq - work->done_seq <= 0);
- atomic_dec(&work->flushing);
- smp_mb__after_atomic_dec(); /* rmb flush-b1 paired with worker-b0 */
+ spin_lock_irq(&poll->dev->work_lock);
+ seq = work->queue_seq;
+ work->flushing++;
+ spin_unlock_irq(&poll->dev->work_lock);
+ wait_event(work->done, ({
+ spin_lock_irq(&poll->dev->work_lock);
+ left = work->done_seq - seq;
+ spin_unlock_irq(&poll->dev->work_lock);
+ left < UINT_MAX / 2;
+ }));
+ spin_lock_irq(&poll->dev->work_lock);
+ flushing = --work->flushing;
+ spin_unlock_irq(&poll->dev->work_lock);
+ BUG_ON(flushing < 0);
}

void vhost_poll_queue(struct vhost_poll *poll)
@@ -151,37 +161,37 @@ static void vhost_vq_reset(struct vhost_dev *dev,
static int vhost_worker(void *data)
{
struct vhost_dev *dev = data;
- struct vhost_work *work;
+ struct vhost_work *work = NULL;

-repeat:
- set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */

- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- return 0;
- }
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }

- work = NULL;
- spin_lock_irq(&dev->work_lock);
- if (!list_empty(&dev->work_list)) {
- work = list_first_entry(&dev->work_list,
- struct vhost_work, node);
- list_del_init(&work->node);
- }
- spin_unlock_irq(&dev->work_lock);
+ spin_lock_irq(&dev->work_lock);
+ if (work) {
+ work->done_seq = work->queue_seq;
+ if (work->flushing)
+ wake_up_all(&work->done);
+ }
+ if (!list_empty(&dev->work_list)) {
+ work = list_first_entry(&dev->work_list,
+ struct vhost_work, node);
+ list_del_init(&work->node);
+ } else
+ work = NULL;
+ spin_unlock_irq(&dev->work_lock);
+
+ if (work) {
+ __set_current_state(TASK_RUNNING);
+ work->fn(work);
+ } else
+ schedule();

- if (work) {
- __set_current_state(TASK_RUNNING);
- work->fn(work);
- smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
- work->done_seq = work->queue_seq;
- smp_mb(); /* mb worker-b1 paired with flush-b0 */
- if (atomic_read(&work->flushing))
- wake_up_all(&work->done);
- } else
- schedule();
-
- goto repeat;
+ }
}

long vhost_dev_init(struct vhost_dev *dev,
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 0e63091..3693327 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -27,9 +27,9 @@ struct vhost_work {
struct list_head node;
vhost_work_fn_t fn;
wait_queue_head_t done;
- atomic_t flushing;
- int queue_seq;
- int done_seq;
+ int flushing;
+ unsigned queue_seq;
+ unsigned done_seq;
};

/* Poll a file (eventfd or socket) */

2010-07-22 21:22:48

by Tejun Heo

Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

Hello,

On 07/22/2010 05:58 PM, Michael S. Tsirkin wrote:
> All the tricky barrier pairing made me uncomfortable. So I came up with
> this on top (untested): if we do all operations under the spinlock, we
> can get by without barriers and atomics. And since we need the lock for
> list operations anyway, this should have no performance impact.
>
> What do you think?

I've created kthread_worker in wq#for-next tree and already converted
ivtv to use it. Once this lands in mainline, I think converting vhost
to use it would be the better choice. The kthread worker code uses basically
the same logic used in the vhost_workqueue code but is better
organized and documented. So, I think it would be better to stick
with the original implementation, as otherwise we're likely to just
decrease test coverage without much gain.

http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439
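
For reference, a minimal sketch of what such a conversion might look
like (hypothetical: the API names follow the commit above, while the
example_* names are made up for illustration):

	#include <linux/kthread.h>

	struct vhost_dev_example {
		struct kthread_worker kworker;	/* replaces work_lock/work_list */
		struct task_struct *worker_task;
	};

	static long example_dev_init(struct vhost_dev_example *dev)
	{
		struct task_struct *task;

		init_kthread_worker(&dev->kworker);
		task = kthread_run(kthread_worker_fn, &dev->kworker,
				   "vhost-%d", current->pid);
		if (IS_ERR(task))
			return PTR_ERR(task);
		dev->worker_task = task;
		return 0;
	}

	/* Queueing and flushing then collapse to one-liners. */
	static void example_poll_queue(struct vhost_dev_example *dev,
				       struct kthread_work *work)
	{
		queue_kthread_work(&dev->kworker, work);
	}

	static void example_poll_flush(struct kthread_work *work)
	{
		flush_kthread_work(work);
	}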

> @@ -151,37 +161,37 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> static int vhost_worker(void *data)
> {
> struct vhost_dev *dev = data;
> - struct vhost_work *work;
> + struct vhost_work *work = NULL;
>
> -repeat:
> - set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
> + for (;;) {
> + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
>
> - if (kthread_should_stop()) {
> - __set_current_state(TASK_RUNNING);
> - return 0;
> - }
> + if (kthread_should_stop()) {
> + __set_current_state(TASK_RUNNING);
> + return 0;
> + }
>
> - work = NULL;
> - spin_lock_irq(&dev->work_lock);
> - if (!list_empty(&dev->work_list)) {
> - work = list_first_entry(&dev->work_list,
> - struct vhost_work, node);
> - list_del_init(&work->node);
> - }
> - spin_unlock_irq(&dev->work_lock);
> + spin_lock_irq(&dev->work_lock);
> + if (work) {
> + work->done_seq = work->queue_seq;
> + if (work->flushing)
> + wake_up_all(&work->done);

I don't think doing this before executing the function is correct, so
you'll have to release the lock, execute the function, regrab the lock
and then do the flush processing.

Thanks.

--
tejun

2010-07-24 19:21:20

by Michael S. Tsirkin

Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Thu, Jul 22, 2010 at 11:21:40PM +0200, Tejun Heo wrote:
> Hello,
>
> On 07/22/2010 05:58 PM, Michael S. Tsirkin wrote:
> > All the tricky barrier pairing made me uncomfortable. So I came up with
> > this on top (untested): if we do all operations under the spinlock, we
> > can get by without barriers and atomics. And since we need the lock for
> > list operations anyway, this should have no performance impact.
> >
> > What do you think?
>
> I've created kthread_worker in wq#for-next tree and already converted
> ivtv to use it. Once this lands in mainline, I think converting vhost
> to use it would be the better choice. The kthread worker code uses basically
> the same logic used in the vhost_workqueue code but is better
> organized and documented. So, I think it would be better to stick
> with the original implementation, as otherwise we're likely to just
> decrease test coverage without much gain.
>
> http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439

Sure, if we keep using a workqueue. But I'd like to investigate this
direction a bit more because there's discussion of switching from
kthreads to regular threads altogether.

> > @@ -151,37 +161,37 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> > static int vhost_worker(void *data)
> > {
> > struct vhost_dev *dev = data;
> > - struct vhost_work *work;
> > + struct vhost_work *work = NULL;
> >
> > -repeat:
> > - set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
> > + for (;;) {
> > + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
> >
> > - if (kthread_should_stop()) {
> > - __set_current_state(TASK_RUNNING);
> > - return 0;
> > - }
> > + if (kthread_should_stop()) {
> > + __set_current_state(TASK_RUNNING);
> > + return 0;
> > + }
> >
> > - work = NULL;
> > - spin_lock_irq(&dev->work_lock);
> > - if (!list_empty(&dev->work_list)) {
> > - work = list_first_entry(&dev->work_list,
> > - struct vhost_work, node);
> > - list_del_init(&work->node);
> > - }
> > - spin_unlock_irq(&dev->work_lock);
> > + spin_lock_irq(&dev->work_lock);
> > + if (work) {
> > + work->done_seq = work->queue_seq;
> > + if (work->flushing)
> > + wake_up_all(&work->done);
>
> I don't think doing this before executing the function is correct,

Well, before I execute the function work is NULL, so this is skipped.
Correct?

> so
> you'll have to release the lock, execute the function, regrab the lock
> and then do the flush processing.
>
> Thanks.

It's done in the loop, so I thought we can reuse the locking
done for the sake of processing the next work item.
Makes sense?


> --
> tejun

2010-07-25 07:42:09

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

Hello,

On 07/24/2010 09:14 PM, Michael S. Tsirkin wrote:
>> I've created kthread_worker in wq#for-next tree and already converted
>> ivtv to use it. Once this lands in mainline, I think converting vhost
>> to use it would be a better choice. The kthread worker code uses basically
>> the same logic used in the vhost_workqueue code but is better
>> organized and documented. So, I think it would be better to stick
>> with the original implementation, as otherwise we're likely to just
>> decrease test coverage without much gain.
>>
>> http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439
>
> Sure, if we keep using workqueue. But I'd like to investigate this
> direction a bit more because there's discussion of switching from kthreads to
> regular threads altogether.

Hmmm? It doesn't have much to do with workqueue. kthread_worker is a
simple wrapper around kthread. It now assumes kthread but changing it
to be usable with any thread shouldn't be too hard. Wouldn't that be
better?

>> I don't think doing this before executing the function is correct,
>
> Well, before I execute the function work is NULL, so this is skipped.
> Correct?
>
>> so
>> you'll have to release the lock, execute the function, regrab the lock
>> and then do the flush processing.
>>
>> Thanks.
>
> It's done in the loop, so I thought we can reuse the locking
> done for the sake of processing the next work item.
> Makes sense?

Yeap, right. I think it would make much more sense to use common code
when it becomes available but if you think the posted change is
necessary till then, please feel free to go ahead.

Thanks.

--
tejun

2010-07-25 10:10:35

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Sun, Jul 25, 2010 at 09:41:22AM +0200, Tejun Heo wrote:
> Hello,
>
> On 07/24/2010 09:14 PM, Michael S. Tsirkin wrote:
> >> I've created kthread_worker in wq#for-next tree and already converted
> >> ivtv to use it. Once this lands in mainline, I think converting vhost
> >> to use it would be a better choice. The kthread worker code uses basically
> >> the same logic used in the vhost_workqueue code but is better
> >> organized and documented. So, I think it would be better to stick
> >> with the original implementation, as otherwise we're likely to just
> >> decrease test coverage without much gain.
> >>
> >> http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439
> >
> > Sure, if we keep using workqueue. But I'd like to investigate this
> > direction a bit more because there's discussion of switching from kthreads to
> > regular threads altogether.
>
> Hmmm? It doesn't have much to do with workqueue. kthread_worker is a
> simple wrapper around kthread. It now assumes kthread but changing it
> to be usable with any thread shouldn't be too hard. Wouldn't that be
> better?

Yes, of course, when common code becomes available we should
switch to that.

> >> I don't think doing this before executing the function is correct,
> >
> > Well, before I execute the function work is NULL, so this is skipped.
> > Correct?
> >
> >> so
> >> you'll have to release the lock, execute the function, regrab the lock
> >> and then do the flush processing.
> >>
> >> Thanks.
> >
> > It's done in the loop, so I thought we can reuse the locking
> > done for the sake of processing the next work item.
> > Makes sense?
>
> Yeap, right. I think it would make much more sense to use common code
> when it becomes available but if you think the posted change is
> necessary till then, please feel free to go ahead.
>
> Thanks.
>
> --
> tejun

2010-07-26 15:31:37

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Sun, Jul 25, 2010 at 09:41:22AM +0200, Tejun Heo wrote:
> Hello,
>
> On 07/24/2010 09:14 PM, Michael S. Tsirkin wrote:
> >> I've created kthread_worker in wq#for-next tree and already converted
> >> ivtv to use it. Once this lands in mainline, I think converting vhost
> >> to use it would be a better choice. The kthread worker code uses basically
> >> the same logic used in the vhost_workqueue code but is better
> >> organized and documented. So, I think it would be better to stick
> >> with the original implementation, as otherwise we're likely to just
> >> decrease test coverage without much gain.
> >>
> >> http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439
> >
> > Sure, if we keep using workqueue. But I'd like to investigate this
> > direction a bit more because there's discussion of switching from kthreads to
> > regular threads altogether.
>
> Hmmm? It doesn't have much to do with workqueue. kthread_worker is a
> simple wrapper around kthread. It now assumes kthread but changing it
> > to be usable with any thread shouldn't be too hard. Wouldn't that be
> better?

BTW, kthread_worker would benefit from the optimization I implemented
here as well.

--
MST

2010-07-26 15:35:54

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

Hello,

On 07/26/2010 05:25 PM, Michael S. Tsirkin wrote:
> BTW, kthread_worker would benefit from the optimization I implemented
> here as well.

Hmmm... I'm not quite sure whether it's an optimization. I thought
the patch was due to feeling uncomfortable about using barriers? Is
it an optimization?

Thanks.

--
tejun

2010-07-26 15:47:36

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On 07/26/2010 05:34 PM, Tejun Heo wrote:
> Hello,
>
> On 07/26/2010 05:25 PM, Michael S. Tsirkin wrote:
>> BTW, kthread_worker would benefit from the optimization I implemented
>> here as well.
>
> Hmmm... I'm not quite sure whether it's an optimization. I thought
> the patch was due to feeling uncomfortable about using barriers? Is
> it an optimization?

Yeah, one less smp_mb() in the execution path. The lock dancing in
flush() is ugly but then again mucking with barriers could be harder
to understand. Care to send a patch against wq#for-next tree?

Thanks.

--
tejun

2010-07-26 15:56:52

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Mon, Jul 26, 2010 at 05:34:44PM +0200, Tejun Heo wrote:
> Hello,
>
> On 07/26/2010 05:25 PM, Michael S. Tsirkin wrote:
> > BTW, kthread_worker would benefit from the optimization I implemented
> > here as well.
>
> Hmmm... I'm not quite sure whether it's an optimization. I thought
> the patch was due to feeling uncomfortable about using barriers?

Oh yes. But getting rid of barriers is what motivated me originally.

> Is it an optimization?
>
> Thanks.

Yes, sure. This removes an atomic read and two barrier operations on
the data path. And it does not add any new synchronization: instead,
we reuse the lock that we take anyway. The relevant part is:


+ if (work) {
+ __set_current_state(TASK_RUNNING);
+ work->fn(work);
+ } else
+ schedule();

- if (work) {
- __set_current_state(TASK_RUNNING);
- work->fn(work);
- smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
- work->done_seq = work->queue_seq;
- smp_mb(); /* mb worker-b1 paired with flush-b0 */
- if (atomic_read(&work->flushing))
- wake_up_all(&work->done);
- } else
- schedule();
-
- goto repeat;

Is there a git tree with kthread_worker applied?
I might do this just for fun ...


> --
> tejun

2010-07-26 15:57:31

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Mon, Jul 26, 2010 at 05:46:30PM +0200, Tejun Heo wrote:
> On 07/26/2010 05:34 PM, Tejun Heo wrote:
> > Hello,
> >
> > On 07/26/2010 05:25 PM, Michael S. Tsirkin wrote:
> >> BTW, kthread_worker would benefit from the optimization I implemented
> >> here as well.
> >
> > Hmmm... I'm not quite sure whether it's an optimization. I thought
> > the patch was due to feeling uncomfortable about using barriers? Is
> > it an optimization?
>
> Yeah, one less smp_mb() in the execution path. The lock dancing in
> flush() is ugly but then again mucking with barriers could be harder
> to understand. Care to send a patch against wq#for-next tree?
>
> Thanks.

Sure. Where's that, exactly?

> --
> tejun

2010-07-26 16:06:59

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

Hello,

On 07/26/2010 05:50 PM, Michael S. Tsirkin wrote:
>> Hmmm... I'm not quite sure whether it's an optimization. I thought
>> the patch was due to feeling uncomfortable about using barriers?
>
> Oh yes. But getting rid of barriers is what motivated me originally.

Yeah, getting rid of barriers is always good. :-)

> Is there a git tree with kthread_worker applied?
> I might do this just for fun ...

git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git for-next

For the original implementation, please take a look at commit
b56c0d8937e665a27d90517ee7a746d0aa05af46.

* Can you please keep the outer goto repeat loop? I just don't like
outermost for (;;).

* Placing try_to_freeze() could be a bit annoying. It shouldn't be
executed when there's a work to flush.

* I think A - B <= 0 test would be more familiar. At least
time_before/after() are implemented that way.
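
For illustration, the kind of test I mean (sketch, in the style of
time_before()/time_after(), not code from the tree):

	/* true once @done_seq has caught up with @seq; the signed
	 * subtraction keeps this correct across counter wrap */
	static inline bool flush_seq_done(int seq, int done_seq)
	{
		return seq - done_seq <= 0;
	}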

Thanks.

--
tejun

2010-07-26 16:15:30

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

Just one more thing.

On 07/26/2010 06:05 PM, Tejun Heo wrote:
> * Placing try_to_freeze() could be a bit annoying. It shouldn't be
> executed when there's a work to flush.

* Similar issue exists for kthread_stop(). The kthread shouldn't exit
while there's a work to flush (please note that the kthread_worker
interface allows detaching / attaching the worker kthread during
operation, so it should remain in a consistent state with regard to
flushing).

Thanks.

--
tejun

2010-07-26 16:37:16

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Mon, Jul 26, 2010 at 06:14:30PM +0200, Tejun Heo wrote:
> Just one more thing.
>
> On 07/26/2010 06:05 PM, Tejun Heo wrote:
> > * Placing try_to_freeze() could be a bit annoying. It shouldn't be
> > executed when there's a work to flush.

BTW why is this important?
We could always get another work and flush right after
try_to_freeze, and then flush would block for a long time.


BTW the vhost patch you sent does not do this at all.
I am guessing it is because our thread is not freezable?

> * Similar issue exists for kthread_stop(). The kthread shouldn't exit
> while there's a work to flush (please note that the kthread_worker
> interface allows detaching / attaching the worker kthread during
> operation, so it should remain in a consistent state with regard to
> flushing).
>
> Thanks.

Not sure I agree here. Users must synchronise flush and stop calls.
Otherwise a work might get queued after stop is called, and
you won't be able to flush it.


> --
> tejun

2010-07-26 16:57:14

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Mon, Jul 26, 2010 at 06:14:30PM +0200, Tejun Heo wrote:
> Just one more thing.

I noticed that with vhost, flush_work was getting the worker
pointer as well. Can we live with this API change?

--
MST

2010-07-26 17:03:16

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

Here's an untested patch forward-ported from vhost
(works fine for vhost).

kthread_worker: replace barriers+atomics with a lock

We can save some cycles and make the code simpler by
reusing the worker lock for flush, instead of atomics.
flush_kthread_work needs to get a worker pointer for
this to work.

Signed-off-by: Michael S. Tsirkin <[email protected]>

---

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 685ea65..19ae9f2 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -58,7 +58,7 @@ struct kthread_work {
struct list_head node;
kthread_work_func_t func;
wait_queue_head_t done;
- atomic_t flushing;
+ int flushing;
int queue_seq;
int done_seq;
};
@@ -72,7 +72,7 @@ struct kthread_work {
.node = LIST_HEAD_INIT((work).node), \
.func = (fn), \
.done = __WAIT_QUEUE_HEAD_INITIALIZER((work).done), \
- .flushing = ATOMIC_INIT(0), \
+ .flushing = 0, \
}

#define DEFINE_KTHREAD_WORKER(worker) \
@@ -96,7 +96,8 @@ int kthread_worker_fn(void *worker_ptr);

bool queue_kthread_work(struct kthread_worker *worker,
struct kthread_work *work);
-void flush_kthread_work(struct kthread_work *work);
+void flush_kthread_work(struct kthread_worker *worker,
+ struct kthread_work *work);
void flush_kthread_worker(struct kthread_worker *worker);

#endif /* _LINUX_KTHREAD_H */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2dc3786..461f58d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -283,10 +283,12 @@ int kthreadd(void *unused)
int kthread_worker_fn(void *worker_ptr)
{
struct kthread_worker *worker = worker_ptr;
- struct kthread_work *work;
+ struct kthread_work *work = NULL;

+ spin_lock_irq(&worker->lock);
WARN_ON(worker->task);
worker->task = current;
+ spin_unlock_irq(&worker->lock);
repeat:
set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */

@@ -298,23 +300,23 @@ repeat:
return 0;
}

- work = NULL;
spin_lock_irq(&worker->lock);
+ if (work) {
+ work->done_seq = work->queue_seq;
+ if (work->flushing)
+ wake_up_all(&work->done);
+ }
if (!list_empty(&worker->work_list)) {
work = list_first_entry(&worker->work_list,
struct kthread_work, node);
list_del_init(&work->node);
- }
+ } else
+ work = NULL;
spin_unlock_irq(&worker->lock);

if (work) {
__set_current_state(TASK_RUNNING);
work->func(work);
- smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
- work->done_seq = work->queue_seq;
- smp_mb(); /* mb worker-b1 paired with flush-b0 */
- if (atomic_read(&work->flushing))
- wake_up_all(&work->done);
} else if (!freezing(current))
schedule();

@@ -353,31 +355,33 @@ EXPORT_SYMBOL_GPL(queue_kthread_work);

/**
* flush_kthread_work - flush a kthread_work
+ * @worker: where work might be running
* @work: work to flush
*
* If @work is queued or executing, wait for it to finish execution.
*/
-void flush_kthread_work(struct kthread_work *work)
+void flush_kthread_work(struct kthread_worker *worker,
+ struct kthread_work *work)
{
- int seq = work->queue_seq;
+ int seq;

- atomic_inc(&work->flushing);
-
- /*
- * mb flush-b0 paired with worker-b1, to make sure either
- * worker sees the above increment or we see done_seq update.
- */
- smp_mb__after_atomic_inc();
+ spin_lock_irq(&worker->lock);
+ seq = work->queue_seq;
+ ++work->flushing;
+ spin_unlock_irq(&worker->lock);

/* A - B <= 0 tests whether B is in front of A regardless of overflow */
- wait_event(work->done, seq - work->done_seq <= 0);
- atomic_dec(&work->flushing);
-
- /*
- * rmb flush-b1 paired with worker-b0, to make sure our caller
- * sees every change made by work->func().
- */
- smp_mb__after_atomic_dec();
+ wait_event(work->done,
+ ({
+ int done;
+ spin_lock_irq(&worker->lock);
+ done = seq - work->done_seq <= 0;
+ spin_unlock_irq(&worker->lock);
+ done;
+ }));
+ spin_lock_irq(&worker->lock);
+ --work->flushing;
+ spin_unlock_irq(&worker->lock);
}
EXPORT_SYMBOL_GPL(flush_kthread_work);

2010-07-26 17:16:01

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Mon, Jul 26, 2010 at 06:05:27PM +0200, Tejun Heo wrote:
> Hello,
>
> On 07/26/2010 05:50 PM, Michael S. Tsirkin wrote:
> >> Hmmm... I'm not quite sure whether it's an optimization. I thought
> >> the patch was due to feeling uncomfortable about using barriers?
> >
> > Oh yes. But getting rid of barriers is what motivated me originally.
>
> Yeah, getting rid of barriers is always good. :-)
>
> > Is there a git tree with kthread_worker applied?
> > I might do this just for fun ...
>
> git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git for-next
>
> For the original implementation, please take a look at commit
> b56c0d8937e665a27d90517ee7a746d0aa05af46.
>
> * Can you please keep the outer goto repeat loop? I just don't like
> outermost for (;;).

Okay ... can we put the code in a {} scope to make it clear
where the loop starts and ends?

> * Placing try_to_freeze() could be a bit annoying. It shouldn't be
> executed when there's a work to flush.

It currently seems to be executed when there is work to flush.
Is this wrong?

> * I think A - B <= 0 test would be more familiar. At least
> time_before/after() are implemented that way.

I am concerned that this overflows a signed integer -
which I seem to remember C99 disallows.
The timer macros are on the data path so it might be worth the risk there,
but flush is a slow path so better to be safe?

> Thanks.
>
> --
> tejun

2010-07-26 18:52:32

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

Hello,

On 07/26/2010 06:31 PM, Michael S. Tsirkin wrote:
>> On 07/26/2010 06:05 PM, Tejun Heo wrote:
>>> * Placing try_to_freeze() could be a bit annoying. It shouldn't be
>>> executed when there's a work to flush.
>
> BTW why is this important?
> We could always get another work and flush right after
> try_to_freeze, and then flush would block for a long time.
>
> BTW the vhost patch you sent does not do this at all.
> I am guessing it is because our thread is not freezable?

Yeap, I think so.

>> * Similar issue exists for kthread_stop(). The kthread shouldn't exit
>> while there's a work to flush (please note that the kthread_worker
>> interface allows detaching / attaching the worker kthread during
>> operation, so it should remain in a consistent state with regard to
>> flushing).
>
> Not sure I agree here. Users must synchronise flush and stop calls.
> Otherwise a work might get queued after stop is called, and
> you won't be able to flush it.

For freeze, it probably is okay but for stop, I think it's better to
keep the semantics straightforward. It may be okay to do otherwise
but having such an oddity in a generic interface is nasty and may lead to
surprises which can be pretty difficult to track down later on. It's
just a bit more annoyance while writing the generic code, so...

Thanks.

--
tejun

2010-07-26 19:05:34

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On 07/26/2010 06:23 PM, Michael S. Tsirkin wrote:
>> * Can you please keep the outer goto repeat loop? I just don't like
>> outermost for (;;).
>
> Okay ... can we put the code in a {} scope to make it clear
> where the loop starts and ends?

If we're gonna do that, it would be better to put it inside a loop
construct. The reason why I don't like it is that loops like that
don't really help read/writeability much while indenting the whole
logic unnecessarily and look more like a result of obsession against
goto rather than any practical reason. It's just a cosmetic
preference and I might as well be the weirdo here, so if you feel
strong about it, please feel free to put everything in a loop.

>> * Placing try_to_freeze() could be a bit annoying. It shouldn't be
>> executed when there's a work to flush.
>
> It currently seems to be executed when there is work to flush.
> Is this wrong?

Oh, does it? As I wrote in the other mail, things like that wouldn't
necessarily break correctness but I think it would be better to avoid
surprises in the generic code if not too difficult.

>> * I think A - B <= 0 test would be more familiar. At least
>> time_before/after() are implemented that way.
>
> I am concerned that this overflows a signed integer -
> which I seem to remember C99 disallows.

Really? Pointer overflow isn't expected and that's why we have the
weird RELOC_HIDE() macro for such calculations, but integers not
being expected to overflow is news to me. Are you sure? That basically
means time_before/after() aren't safe either.

> The timer macros are on the data path so it might be worth the risk there,
> but flush is a slow path so better to be safe?

I don't think performance matters much here. I just think the sign
test is clearer / more familiar for the logic.

Thanks.

--
tejun

2010-07-26 19:15:18

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On 07/26/2010 06:51 PM, Michael S. Tsirkin wrote:
> On Mon, Jul 26, 2010 at 06:14:30PM +0200, Tejun Heo wrote:
>> Just one more thing.
>
> I noticed that with vhost, flush_work was getting the worker
> pointer as well. Can we live with this API change?

Yeah, the flushing mechanism wouldn't work reliably if the work is
queued to a different worker without flushing, so yeah passing in
@worker might actually be better.

Thanks.

--
tejun

2010-07-26 19:33:12

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

Hello,

On 07/26/2010 09:14 PM, Tejun Heo wrote:
> On 07/26/2010 06:51 PM, Michael S. Tsirkin wrote:
>> I noticed that with vhost, flush_work was getting the worker
>> pointer as well. Can we live with this API change?
>
> Yeah, the flushing mechanism wouldn't work reliably if the work is
> queued to a different worker without flushing, so yeah passing in
> @worker might actually be better.

Thinking a bit more about it, it kind of sucks that queueing to
another worker from worker->func() breaks flush. Maybe the right
thing to do there is using atomic_t for done_seq? It pays a bit more
overhead but maybe that's justifiable to keep the API saner? It would
be great if it can be fixed somehow even if it means that the work has
to be separately flushed for each worker it has been on before being
destroyed.

Or, if flushing has to be associated with a specific worker anyway,
maybe it would be better to move the sequence counter to
kthread_worker and do it similarly to the original workqueue so that
work can be destroyed once execution starts? Then, it can at least
remain semantically identical to the original workqueue.
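
Something along these lines, I mean (rough sketch, field names made
up):

	struct kthread_worker {
		spinlock_t		lock;
		struct list_head	work_list;
		struct task_struct	*task;
		/* flush state moved here from kthread_work */
		int			queue_seq;	/* bumped on queueing */
		int			done_seq;	/* bumped on completion */
		wait_queue_head_t	done;
	};

Flush would then snapshot worker->queue_seq and wait for
worker->done_seq to catch up, so the work itself could go away once
its callback has started.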

Thanks.

--
tejun

2010-07-26 20:03:46

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Mon, Jul 26, 2010 at 08:51:50PM +0200, Tejun Heo wrote:
> Hello,
>
> On 07/26/2010 06:31 PM, Michael S. Tsirkin wrote:
> >> On 07/26/2010 06:05 PM, Tejun Heo wrote:
> >>> * Placing try_to_freeze() could be a bit annoying. It shouldn't be
> >>> executed when there's a work to flush.
> >
> > BTW why is this important?
> > We could always get another work and flush right after
> > try_to_freeze, and then flush would block for a long time.
> >
> > BTW the vhost patch you sent does not do this at all.
> > I am guessing it is because our thread is not freezable?
>
> Yeap, I think so.
>
> >> * Similar issue exists for kthread_stop(). The kthread shouldn't exit
> >> while there's a work to flush (please note that the kthread_worker
> >> interface allows detaching / attaching the worker kthread during
> >> operation, so it should remain in a consistent state with regard to
> >> flushing).
> >
> > Not sure I agree here. Users must synchronise flush and stop calls.
> > Otherwise a work might get queued after stop is called, and
> > you won't be able to flush it.
>
> For freeze, it probably is okay but for stop, I think it's better to
> keep the semantics straightforward.

What are the semantics then? What do we want stop followed
by queue and flush to do?

> It may be okay to do otherwise
> but having such an oddity in a generic interface is nasty and may lead to
> surprises which can be pretty difficult to track down later on. It's
> just a bit more annoyance while writing the generic code, so...
>
> Thanks.
>
> --
> tejun

2010-07-26 20:05:20

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Mon, Jul 26, 2010 at 09:31:58PM +0200, Tejun Heo wrote:
> Hello,
>
> On 07/26/2010 09:14 PM, Tejun Heo wrote:
> > On 07/26/2010 06:51 PM, Michael S. Tsirkin wrote:
> >> I noticed that with vhost, flush_work was getting the worker
> >> pointer as well. Can we live with this API change?
> >
> > Yeah, the flushing mechanism wouldn't work reliably if the work is
> > queued to a different worker without flushing, so yeah passing in
> > @worker might actually be better.
>
> Thinking a bit more about it, it kind of sucks that queueing to
> another worker from worker->func() breaks flush. Maybe the right
> thing to do there is using atomic_t for done_seq? It pays a bit more
> overhead but maybe that's justifiable to keep the API saner? It would
> be great if it can be fixed somehow even if it means that the work has
> to be separately flushed for each worker it has been on before being
> destroyed.
>
> Or, if flushing has to be associated with a specific worker anyway,
> maybe it would be better to move the sequence counter to
> kthread_worker and do it similarly to the original workqueue so that
> work can be destroyed once execution starts? Then, it can at least
> remain semantically identical to the original workqueue.
>
> Thanks.

This last option sounds sane: in fact, I didn't know there was any difference.

> --
> tejun

2010-07-26 20:25:31

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Mon, Jul 26, 2010 at 09:04:17PM +0200, Tejun Heo wrote:
> On 07/26/2010 06:23 PM, Michael S. Tsirkin wrote:
> >> * Can you please keep the outer goto repeat loop? I just don't like
> >> outermost for (;;).
> >
> > Okay ... can we put the code in a {} scope to make it clear
> > where the loop starts and ends?
>
> If we're gonna do that, it would be better to put it inside a loop
> construct. The reason why I don't like it is that loops like that
> don't really help read/writeability much while indenting the whole
> logic unnecessarily and look more like a result of obsession against
> goto rather than any practical reason. It's just a cosmetic
> preference and I might as well be the weirdo here, so if you feel
> strong about it, please feel free to put everything in a loop.
>
> >> * Placing try_to_freeze() could be a bit annoying. It shouldn't be
> >> executed when there's a work to flush.
> >
> > It currently seems to be executed when there is work to flush.
> > Is this wrong?
>
> Oh, does it? As I wrote in the other mail, things like that wouldn't
> necessarily break correctness but I think it would be better to avoid
> surprises in the generic code if not too difficult.

Let's try to define what we want to achieve then.
Do you want code that flushes workers not to block
when workers are frozen? How will we handle work
submitted while the worker is frozen?


> >> * I think A - B <= 0 test would be more familiar. At least
> >> time_before/after() are implemented that way.
> >
> > I am concerned that this overflows a signed integer -
> > which I seem to remember C99 disallows.
>
> Really? Pointer overflow isn't expected and that's why we have the
> weird RELOC_HIDE() macro for such calculations, but integers not
> being expected to overflow is news to me. Are you sure? That basically
> means time_before/after() aren't safe either.

As I said, in C99.
However, the kernel is built with -fno-strict-overflow, so it will work.
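
For example (sketch):

	int a = INT_MAX, b = INT_MIN;
	/* a - b is not representable in int: undefined behaviour per
	 * C99, but plain two's-complement wrap-around with
	 * -fno-strict-overflow */
	int d = a - b;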

> > The timer macros are on the data path so it might be worth the risk there,
> > but flush is a slow path so better to be safe?
>
> I don't think performance matters much here. I just think the sign
> test is clearer / more familiar for the logic.
>
> Thanks.
>
> --
> tejun

2010-07-27 08:18:41

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

Hello,

On 07/26/2010 09:57 PM, Michael S. Tsirkin wrote:
>> For freeze, it probably is okay but for stop, I think it's better to
>> keep the semantics straightforward.
>
> What are the semantics then? What do we want stop followed
> by queue and flush to do?

One scenario I can think of is the following.

kthread_worker allows kthreads to be attached and stopped anytime, so
if the caller stops the current worker while flushing is pending and
attaches a new worker, the flushing which was pending will never
happen.

But, in general, it's nasty to allow execution and its completion to
be separated. Things like that are likely to bite us back in obscure
ways. I think it would be silly to have such an oddity in generic code
when it can be avoided without too much trouble.

Thanks.

--
tejun

2010-07-27 08:22:29

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

Hello,

On 07/26/2010 10:19 PM, Michael S. Tsirkin wrote:
> Let's try to define what we want to achieve then. Do you want
> code that flushes workers not to block when workers are frozen? How
> will we handle work submitted while the worker is frozen?

As I wrote earlier, it's not necessarily about correctness but rather
about avoiding unnecessary surprises. Of course flushing can and should
stall if the queue is frozen, but let's not separate execution of a
work and its completion with something which can take an indeterminate
amount of time.

Thanks.

--
tejun

2010-07-27 19:25:19

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Mon, Jul 26, 2010 at 09:31:58PM +0200, Tejun Heo wrote:
> Hello,
>
> On 07/26/2010 09:14 PM, Tejun Heo wrote:
> > On 07/26/2010 06:51 PM, Michael S. Tsirkin wrote:
> >> I noticed that with vhost, flush_work was getting the worker
> >> pointer as well. Can we live with this API change?
> >
> > Yeah, the flushing mechanism wouldn't work reliably if the work is
> > queued to a different worker without flushing, so yeah passing in
> > @worker might actually be better.
>
> Thinking a bit more about it, it kind of sucks that queueing to
> another worker from worker->func() breaks flush. Maybe the right
> thing to do there is using atomic_t for done_seq?

I don't believe it will help: we might have:

worker1 runs work
work requeues itself queued index = 1
worker1 reads queued index = 1
worker2 runs work
work requeues itself queued index = 2
worker2 runs work
worker2 reads queued index = 2
worker2 writes done index = 2
worker1 writes done index = 1

As you see, the done index got moved back.



> It pays a bit more
> overhead but maybe that's justifiable to keep the API saner? It would
> be great if it can be fixed somehow even if it means that the work has
> to be separately flushed for each worker it has been on before being
> destroyed.
>
> Or, if flushing has to be associated with a specific worker anyway,
> maybe it would be better to move the sequence counter to
> kthread_worker and do it similarly to the original workqueue so that
> work can be destroyed once execution starts? Then, it can at least
> remain semantically identical to the original workqueue.
>
> Thanks.
>
> --
> tejun

2010-07-28 07:49:19

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On 07/27/2010 09:19 PM, Michael S. Tsirkin wrote:
>> Thinking a bit more about it, it kind of sucks that queueing to
>> another worker from worker->func() breaks flush. Maybe the right
>> thing to do there is using atomic_t for done_seq?
>
> I don't believe it will help: we might have:
>
> worker1 runs work
> work requeues itself queued index = 1
> worker1 reads queued index = 1
> worker2 runs work
> work requeues itself queued index = 2
> worker2 runs work
> worker2 reads queued index = 2
> worker2 writes done index = 2
> worker1 writes done index = 1
>
> As you see, the done index got moved back.

Yeah, I think the flushing logic should be moved to the worker. Are
you interested in doing it w/ your change?

Thanks.

--
tejun

2010-07-28 10:55:04

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

On Wed, Jul 28, 2010 at 09:48:31AM +0200, Tejun Heo wrote:
> On 07/27/2010 09:19 PM, Michael S. Tsirkin wrote:
> >> Thinking a bit more about it, it kind of sucks that queueing to
> >> another worker from worker->func() breaks flush. Maybe the right
> >> thing to do there is using atomic_t for done_seq?
> >
> > I don't believe it will help: we might have:
> >
> > worker1 runs work
> > work requeues itself queued index = 1
> > worker1 reads queued index = 1
> > worker2 runs work
> > work requeues itself queued index = 2
> > worker2 runs work
> > worker2 reads queued index = 2
> > worker2 writes done index = 2
> > worker1 writes done index = 1
> >
> > As you see, the done index got moved back.
>
> Yeah, I think the flushing logic should be moved to the worker.
> Are you interested in doing it w/ your change?
>
> Thanks.

I'm unsure how flush_work operates under these conditions. E.g. in
workqueue.c, this seems to work by keeping a pointer to the current
workqueue in the work. But what prevents us from destroying the
workqueue when the work might not be running?

Is this currently broken if you use multiple workqueues
for the same work? If yes, I propose we do as I did,
making flush_work take a worker pointer, and only flushing
on that worker.

> --
> tejun

2010-07-28 12:02:09

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread

Hello,

On 07/28/2010 12:48 PM, Michael S. Tsirkin wrote:
> I'm unsure how flush_work operates under these conditions. E.g. in
> workqueue.c, this seems to work by keeping a pointer to the current
> workqueue in the work. But what prevents us from destroying the
> workqueue when the work might not be running?

In cmwq, a work points to the gcwq it was on, which keeps track of all
the works in progress, so flushing a work which is on a destroyed
workqueue should be fine, but in the original implementation it would
end up accessing freed memory.

> Is this currently broken if you use multiple workqueues
> for the same work? If yes, I propose we do as I did,
> making flush_work take a worker pointer, and only flushing
> on that worker.

The original semantics of workqueue are that flush_work() guarantees
that the work has finished executing on the workqueue it was last
queued on. Adding @worker to flush_work() is okay, I think.
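
That is, callers would do something like (sketch):

	queue_kthread_work(worker, &work);
	...
	/* wait for @work to finish on the worker it was last queued on */
	flush_kthread_work(worker, &work);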

Thanks.

--
tejun