From: Liu Ping Fan
To: kvm@vger.kernel.org, netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, qemu-devel@nongnu.org, Avi Kivity,
	"Michael S. Tsirkin", Srivatsa Vaddagiri, Rusty Russell,
	Anthony Liguori, Ryan Harper, Shirley Ma, Krishna Kumar,
	Tom Lendacky
Subject: [PATCH 1/2] [kvm/vhost]: make vhost support NUMA model
Date: Thu, 17 May 2012 17:20:53 +0800
Message-Id: <1337246456-30909-2-git-send-email-kernelfans@gmail.com>
X-Mailer: git-send-email 1.7.4.4
In-Reply-To: <1337246456-30909-1-git-send-email-kernelfans@gmail.com>
References: <1337246456-30909-1-git-send-email-kernelfans@gmail.com>

From: Liu Ping Fan

Make vhost allocate each vhost_virtqueue on the host NUMA node requested
by the backend, and run one vhost worker thread per requested node.

Signed-off-by: Liu Ping Fan
---
 drivers/vhost/vhost.c |  380 +++++++++++++++++++++++++++++++++++--------------
 drivers/vhost/vhost.h |   41 ++++--
 include/linux/vhost.h |    2 +-
 3 files changed, 304 insertions(+), 119 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 51e4c1e..b0d2855 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -37,12 +38,11 @@ enum {
 	VHOST_MEMORY_F_LOG = 0x1,
 };
 
-static unsigned vhost_zcopy_mask __read_mostly;
 #define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
 #define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
 
-static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
+void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 			    poll_table *pt)
 {
 	struct vhost_poll *poll;
@@ -75,12 +75,12 @@ static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
 
 /* Init poll structure */
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-		     unsigned long mask, struct vhost_dev *dev)
+		     unsigned long mask, struct vhost_sub_dev *dev)
 {
 	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 	init_poll_funcptr(&poll->table, vhost_poll_func);
 	poll->mask = mask;
-	poll->dev = dev;
+	poll->subdev = dev;
 
 	vhost_work_init(&poll->work, fn);
 }
@@ -103,7 +103,7 @@ void vhost_poll_stop(struct vhost_poll *poll)
 	remove_wait_queue(poll->wqh, &poll->wait);
 }
 
-static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
+static bool vhost_work_seq_done(struct vhost_sub_dev *dev, struct vhost_work *work,
 				unsigned seq)
 {
 	int left;
@@ -114,19 +114,19 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
 	return left <= 0;
 }
 
-static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
+static void vhost_work_flush(struct vhost_sub_dev *sub, struct vhost_work *work)
 {
 	unsigned seq;
 	int flushing;
 
-	spin_lock_irq(&dev->work_lock);
+	spin_lock_irq(&sub->work_lock);
 	seq = work->queue_seq;
 	work->flushing++;
-	spin_unlock_irq(&dev->work_lock);
-	wait_event(work->done, vhost_work_seq_done(dev, work, seq));
-	spin_lock_irq(&dev->work_lock);
+	spin_unlock_irq(&sub->work_lock);
+	wait_event(work->done, vhost_work_seq_done(sub, work, seq));
+	spin_lock_irq(&sub->work_lock);
 	flushing = --work->flushing;
-	spin_unlock_irq(&dev->work_lock);
+	spin_unlock_irq(&sub->work_lock);
 	BUG_ON(flushing < 0);
 }
 
@@ -134,26 +134,26 @@ static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
  * locks that are also used by the callback. */
 void vhost_poll_flush(struct vhost_poll *poll)
 {
-	vhost_work_flush(poll->dev, &poll->work);
+	vhost_work_flush(poll->subdev, &poll->work);
 }
 
-static inline void vhost_work_queue(struct vhost_dev *dev,
+static inline void vhost_work_queue(struct vhost_sub_dev *sub,
 				    struct vhost_work *work)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&dev->work_lock, flags);
+	spin_lock_irqsave(&sub->work_lock, flags);
 	if (list_empty(&work->node)) {
-		list_add_tail(&work->node, &dev->work_list);
+		list_add_tail(&work->node, &sub->work_list);
 		work->queue_seq++;
-		wake_up_process(dev->worker);
+		wake_up_process(sub->worker);
 	}
-	spin_unlock_irqrestore(&dev->work_lock, flags);
+	spin_unlock_irqrestore(&sub->work_lock, flags);
 }
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
-	vhost_work_queue(poll->dev, &poll->work);
+	vhost_work_queue(poll->subdev, &poll->work);
 }
 
 static void vhost_vq_reset(struct vhost_dev *dev,
@@ -188,7 +188,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 
 static int vhost_worker(void *data)
 {
-	struct vhost_dev *dev = data;
+	struct vhost_sub_dev *sub = data;
+	struct vhost_dev *dev = sub->owner;
 	struct vhost_work *work = NULL;
 	unsigned uninitialized_var(seq);
 
@@ -198,7 +199,7 @@ static int vhost_worker(void *data)
 		/* mb paired w/ kthread_stop */
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		spin_lock_irq(&dev->work_lock);
+		spin_lock_irq(&sub->work_lock);
 		if (work) {
 			work->done_seq = seq;
 			if (work->flushing)
@@ -206,18 +207,18 @@ static int vhost_worker(void *data)
 		}
 
 		if (kthread_should_stop()) {
-			spin_unlock_irq(&dev->work_lock);
+			spin_unlock_irq(&sub->work_lock);
 			__set_current_state(TASK_RUNNING);
 			break;
 		}
-		if (!list_empty(&dev->work_list)) {
-			work = list_first_entry(&dev->work_list,
+		if (!list_empty(&sub->work_list)) {
+			work = list_first_entry(&sub->work_list,
 						struct vhost_work, node);
 			list_del_init(&work->node);
 			seq = work->queue_seq;
 		} else
 			work = NULL;
-		spin_unlock_irq(&dev->work_lock);
+		spin_unlock_irq(&sub->work_lock);
 
 		if (work) {
 			__set_current_state(TASK_RUNNING);
@@ -244,54 +245,189 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
 	vq->ubuf_info = NULL;
 }
 
-void vhost_enable_zcopy(int vq)
+void vhost_enable_zcopy(struct vhost_dev *dev, int rx)
 {
-	vhost_zcopy_mask |= 0x1 << vq;
+	int i;
+	if (rx == 0)
+		for (i = 0; i < dev->node_cnt; i++)
+			dev->zcopy_mask |= 0x1 << (2*i+1);
 }
 
-/* Helper to allocate iovec buffers for all vqs. */
-static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
+/* Needed for dynamic vq allocation, which matters when migrating among NUMA nodes */
+static int vhost_vq_alloc_iovecs(struct vhost_virtqueue *vq)
 {
-	int i;
-	bool zcopy;
+	bool zcopy = false;
+	int i;
+	struct vhost_dev *dev = vq->dev;
+	int node = vq->node_id;
 
+	vq->indirect = kmalloc_node(sizeof *vq->indirect *
+				    UIO_MAXIOV, GFP_KERNEL, node);
+	vq->log = kmalloc_node(sizeof *vq->log * UIO_MAXIOV,
+			       GFP_KERNEL, node);
+	vq->heads = kmalloc_node(sizeof *vq->heads *
+				 UIO_MAXIOV, GFP_KERNEL, node);
+	for (i = 0; i < dev->node_cnt*2; i++) {
+		if (dev->vqs[i] == vq) {
+			zcopy = dev->zcopy_mask & (0x1 << i);
+			break;
+		}
+	}
+	if (zcopy)
+		vq->ubuf_info =
+			kmalloc_node(sizeof *vq->ubuf_info *
+				     UIO_MAXIOV, GFP_KERNEL, node);
+	if (!vq->indirect || !vq->log || !vq->heads ||
+	    (zcopy && !vq->ubuf_info)) {
+		kfree(vq->indirect);
+		kfree(vq->log);
+		kfree(vq->heads);
+		kfree(vq->ubuf_info);
+		return -ENOMEM;
+	} else
+		return 0;
+}
 
-	for (i = 0; i < dev->nvqs; ++i) {
-		dev->vqs[i].indirect = kmalloc(sizeof *dev->vqs[i].indirect *
-					       UIO_MAXIOV, GFP_KERNEL);
-		dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV,
-					  GFP_KERNEL);
-		dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
-					    UIO_MAXIOV, GFP_KERNEL);
-		zcopy = vhost_zcopy_mask & (0x1 << i);
-		if (zcopy)
-			dev->vqs[i].ubuf_info =
-				kmalloc(sizeof *dev->vqs[i].ubuf_info *
-					UIO_MAXIOV, GFP_KERNEL);
-		if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
-		    !dev->vqs[i].heads ||
-		    (zcopy && !dev->vqs[i].ubuf_info))
+/* Helper to allocate iovec buffers for all vqs. */
+static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
+{
+	int i, ret;
+	for (i = 0; i < dev->nvqs; i++) {
+		ret = vhost_vq_alloc_iovecs(dev->vqs[i]);
+		if (ret < 0) {
+			i -= 1;
 			goto err_nomem;
+		}
 	}
 	return 0;
 
 err_nomem:
 	for (; i >= 0; --i)
-		vhost_vq_free_iovecs(&dev->vqs[i]);
+		vhost_vq_free_iovecs(dev->vqs[i]);
 	return -ENOMEM;
 }
 
 static void vhost_dev_free_iovecs(struct vhost_dev *dev)
 {
 	int i;
 	for (i = 0; i < dev->nvqs; ++i)
-		vhost_vq_free_iovecs(&dev->vqs[i]);
+		vhost_vq_free_iovecs(dev->vqs[i]);
 }
 
-long vhost_dev_init(struct vhost_dev *dev,
-		    struct vhost_virtqueue *vqs, int nvqs)
+int vhost_dev_alloc_subdevs(struct vhost_dev *dev, unsigned long *numa_map,
+	int sz)
+{
+	int i, j = 0;
+	int cur, prev = 0;
+	struct vhost_sub_dev *sub;
+
+	/* Todo: replace allow_map with a dynamically allocated bitmap */
+	dev->allow_map = *numa_map;
+	dev->node_cnt = bitmap_weight(numa_map, sz);
+	dev->sub_devs = kmalloc(dev->node_cnt*sizeof(void *), GFP_KERNEL);
+	if (dev->sub_devs == NULL)
+		return -ENOMEM;
+
+	while (1) {
+		cur = find_next_bit(numa_map, sz, prev);
+		if (cur >= sz)
+			break;
+		prev = cur + 1;
+		sub = kmalloc_node(sizeof(struct vhost_sub_dev), GFP_KERNEL, cur);
+		if (sub == NULL)
+			goto err;
+		sub->node_id = cur;
+		sub->owner = dev;
+		spin_lock_init(&sub->work_lock);
+		INIT_LIST_HEAD(&sub->work_list);
+		dev->sub_devs[j] = sub;
+		j++;
+	}
+
+	dev->node_cnt = j;
+	return 0;
+err:
+	for (i = 0; i < j; i++) {
+		kfree(dev->sub_devs[i]);
+		dev->sub_devs[i] = NULL;
+	}
+	return -ENOMEM;
+}
+
+void vhost_dev_free_subdevs(struct vhost_dev *dev)
 {
 	int i;
+	for (i = 0; i < dev->node_cnt; i++)
+		kfree(dev->sub_devs[i]);
+	return;
+}
+
+static int check_numa(int *vqs_map, int sz)
+{
+	int i, node;
+
+	for (i = 0; i < sz; i++) {
+		for_each_online_node(node)
+			if (vqs_map[i] == node)
+				break;
+		if (vqs_map[i] != node)
+			return -1;
+	}
+	return 0;
+}
+
+int check_numa_bmp(unsigned long *numa_bmp, int sz)
+{
+	int i, node, cur, prev = 0;
+
+	for (i = 0; i < sz; i++) {
+		cur = find_next_bit(numa_bmp, sz, prev);
+		if (cur >= sz)
+			return 0;
+		prev = cur + 1;
+		for_each_online_node(node)
+			if (cur == node)
+				break;
+		if (cur != node)
+			return -1;
+	}
+	return 0;
+}
+
+/* allocate vqs on the nodes given by the request map */
+int vhost_dev_alloc_vqs(struct vhost_dev *dev, struct vhost_virtqueue **vqs, int cnt,
+	int *vqs_map, int sz, vhost_work_fn_t *handle_kick)
+{
+	int r, i, j;
+
+	r = check_numa(vqs_map, sz);
+	if (r < 0)
+		return -EINVAL;
+
+	for (i = 0; i < cnt; i++) {
+		vqs[i] = kmalloc_node(sizeof(struct vhost_virtqueue),
+			GFP_KERNEL, vqs_map[i]);
+		if (vqs[i] == NULL)
+			goto err;
+		vqs[i]->node_id = vqs_map[i];
+		vqs[i]->handle_kick = handle_kick[i];
+	}
+	return 0;
+err:
+	for (j = 0; j < i; j++)
+		kfree(vqs[j]);
+	return -ENOMEM;
+}
+
+void vhost_dev_free_vqs(struct vhost_dev *dev, struct vhost_virtqueue **vqs,
+	int cnt)
+{
+	int i;
+	for (i = 0; i < cnt; i++)
+		kfree(vqs[i]);
+	return;
+}
+
+long vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue **vqs, int nvqs)
+{
+	int i, j, ret = 0;
+	struct vhost_sub_dev *subdev;
+	struct vhost_virtqueue *vq;
 	dev->vqs = vqs;
 	dev->nvqs = nvqs;
@@ -300,24 +436,32 @@ long vhost_dev_init(struct vhost_dev *dev,
 	dev->log_file = NULL;
 	dev->memory = NULL;
 	dev->mm = NULL;
-	spin_lock_init(&dev->work_lock);
-	INIT_LIST_HEAD(&dev->work_list);
-	dev->worker = NULL;
 
 	for (i = 0; i < dev->nvqs; ++i) {
-		dev->vqs[i].log = NULL;
-		dev->vqs[i].indirect = NULL;
-		dev->vqs[i].heads = NULL;
-		dev->vqs[i].ubuf_info = NULL;
-		dev->vqs[i].dev = dev;
-		mutex_init(&dev->vqs[i].mutex);
-		vhost_vq_reset(dev, dev->vqs + i);
-		if (dev->vqs[i].handle_kick)
-			vhost_poll_init(&dev->vqs[i].poll,
-					dev->vqs[i].handle_kick, POLLIN, dev);
-	}
+		vq = dev->vqs[i];
+		/* for each numa node there is an in-vq/out-vq pair */
+		vq->log = NULL;
+		vq->indirect = NULL;
+		vq->heads = NULL;
+		vq->ubuf_info = NULL;
+		vq->dev = dev;
+		mutex_init(&vq->mutex);
+		vhost_vq_reset(dev, vq);
+
+		if (vq->handle_kick) {
+			for (j = 0; j < dev->node_cnt; j++) {
+				subdev = dev->sub_devs[j];
+				if (vq->node_id == subdev->node_id)
+					break;
+			}
+			if (j < dev->node_cnt)
+				vhost_poll_init(&vq->poll, vq->handle_kick,
+						POLLIN, subdev);
+			else {
+				/* no sub device on the requested node, fall back */
+				vhost_poll_init(&vq->poll, vq->handle_kick,
+						POLLIN, dev->sub_devs[0]);
+				ret = 1;
+			}
+		}
+	}
 
-	return 0;
+	return ret;
 }
 
 /* Caller should have device mutex */
@@ -344,19 +488,26 @@ static void vhost_attach_cgroups_work(struct vhost_work *work)
 static int vhost_attach_cgroups(struct vhost_dev *dev)
 {
 	struct vhost_attach_cgroups_struct attach;
-
+	int i, ret = 0;
+	struct vhost_sub_dev *sub;
 	attach.owner = current;
-	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
-	vhost_work_queue(dev, &attach.work);
-	vhost_work_flush(dev, &attach.work);
-	return attach.ret;
+	for (i = 0; i < dev->node_cnt; i++) {
+		sub = dev->sub_devs[i];
+		vhost_work_init(&attach.work, vhost_attach_cgroups_work);
+		vhost_work_queue(sub, &attach.work);
+		vhost_work_flush(sub, &attach.work);
+		ret |= attach.ret;
+	}
+	return ret;
 }
 
 /* Caller should have device mutex */
 static long vhost_dev_set_owner(struct vhost_dev *dev)
 {
 	struct task_struct *worker;
-	int err;
+	int err, i, j, cur, prev = 0;
+	int sz = BITS_PER_LONG;
+	const struct cpumask *mask;
 
 	/* Is there an owner already? */
 	if (dev->mm) {
@@ -366,14 +517,19 @@ static long vhost_dev_set_owner(struct vhost_dev *dev)
 
 	/* No owner, become one */
 	dev->mm = get_task_mm(current);
-	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
-	if (IS_ERR(worker)) {
-		err = PTR_ERR(worker);
-		goto err_worker;
+
+	for (i = 0, j = 0; i < dev->node_cnt; i++, j++) {
+		cur = find_next_bit(&dev->allow_map, sz, prev);
+		prev = cur + 1;
+		worker = kthread_create_on_node(vhost_worker, dev->sub_devs[i],
+				cur, "vhost-%d-node-%d", current->pid, cur);
+		if (IS_ERR(worker)) {
+			err = PTR_ERR(worker);
+			goto err_cgroup;
+		}
+		dev->sub_devs[i]->worker = worker;
+		mask = cpumask_of_node(cur);
+		do_set_cpus_allowed(worker, mask);
 	}
-	dev->worker = worker;
 
-	wake_up_process(worker);	/* avoid contributing to loadavg */
+	/* avoid contributing to loadavg */
+	for (i = 0; i < dev->node_cnt; i++)
+		wake_up_process(dev->sub_devs[i]->worker);
 
 	err = vhost_attach_cgroups(dev);
 	if (err)
@@ -385,9 +541,12 @@ static long vhost_dev_set_owner(struct vhost_dev *dev)
 	return 0;
 err_cgroup:
-	kthread_stop(worker);
-	dev->worker = NULL;
-err_worker:
+	for (i = 0; i < j; i++) {
+		kthread_stop(dev->sub_devs[i]->worker);
+		dev->sub_devs[i]->worker = NULL;
+	}
+
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
@@ -442,28 +601,28 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
 	int i;
 
 	for (i = 0; i < dev->nvqs; ++i) {
-		if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
-			vhost_poll_stop(&dev->vqs[i].poll);
-			vhost_poll_flush(&dev->vqs[i].poll);
+		if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) {
+			vhost_poll_stop(&dev->vqs[i]->poll);
+			vhost_poll_flush(&dev->vqs[i]->poll);
 		}
 		/* Wait for all lower device DMAs done. */
-		if (dev->vqs[i].ubufs)
-			vhost_ubuf_put_and_wait(dev->vqs[i].ubufs);
+		if (dev->vqs[i]->ubufs)
+			vhost_ubuf_put_and_wait(dev->vqs[i]->ubufs);
 
 		/* Signal guest as appropriate. */
-		vhost_zerocopy_signal_used(&dev->vqs[i]);
-
-		if (dev->vqs[i].error_ctx)
-			eventfd_ctx_put(dev->vqs[i].error_ctx);
-		if (dev->vqs[i].error)
-			fput(dev->vqs[i].error);
-		if (dev->vqs[i].kick)
-			fput(dev->vqs[i].kick);
-		if (dev->vqs[i].call_ctx)
-			eventfd_ctx_put(dev->vqs[i].call_ctx);
-		if (dev->vqs[i].call)
-			fput(dev->vqs[i].call);
-		vhost_vq_reset(dev, dev->vqs + i);
+		vhost_zerocopy_signal_used(dev->vqs[i]);
+
+		if (dev->vqs[i]->error_ctx)
+			eventfd_ctx_put(dev->vqs[i]->error_ctx);
+		if (dev->vqs[i]->error)
+			fput(dev->vqs[i]->error);
+		if (dev->vqs[i]->kick)
+			fput(dev->vqs[i]->kick);
+		if (dev->vqs[i]->call_ctx)
+			eventfd_ctx_put(dev->vqs[i]->call_ctx);
+		if (dev->vqs[i]->call)
+			fput(dev->vqs[i]->call);
+		vhost_vq_reset(dev, dev->vqs[i]);
 	}
 	vhost_dev_free_iovecs(dev);
 	if (dev->log_ctx)
@@ -477,11 +636,15 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
 					locked ==
 						lockdep_is_held(&dev->mutex)));
 	RCU_INIT_POINTER(dev->memory, NULL);
-	WARN_ON(!list_empty(&dev->work_list));
-	if (dev->worker) {
-		kthread_stop(dev->worker);
-		dev->worker = NULL;
-	}
+
+	/* fixme: per-node worker teardown to be reworked in the next version */
+	for (i = 0; i < dev->node_cnt; i++) {
+		WARN_ON(!list_empty(&dev->sub_devs[i]->work_list));
+		if (dev->sub_devs[i]->worker) {
+			kthread_stop(dev->sub_devs[i]->worker);
+			dev->sub_devs[i]->worker = NULL;
+		}
+	}
+
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
@@ -534,14 +697,14 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
 	for (i = 0; i < d->nvqs; ++i) {
 		int ok;
-		mutex_lock(&d->vqs[i].mutex);
+		mutex_lock(&d->vqs[i]->mutex);
 		/* If ring is inactive, will check when it's enabled. */
-		if (d->vqs[i].private_data)
-			ok = vq_memory_access_ok(d->vqs[i].log_base, mem,
+		if (d->vqs[i]->private_data)
+			ok = vq_memory_access_ok(d->vqs[i]->log_base, mem,
 						 log_all);
 		else
 			ok = 1;
-		mutex_unlock(&d->vqs[i].mutex);
+		mutex_unlock(&d->vqs[i]->mutex);
 		if (!ok)
 			return 0;
 	}
@@ -650,8 +813,7 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 		return r;
 	if (idx >= d->nvqs)
 		return -ENOBUFS;
-
-	vq = d->vqs + idx;
+	vq = d->vqs[idx];
 
 	mutex_lock(&vq->mutex);
@@ -750,6 +912,7 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 		vq->log_addr = a.log_guest_addr;
 		vq->used = (void __user *)(unsigned long)a.used_user_addr;
 		break;
+
 	case VHOST_SET_VRING_KICK:
 		if (copy_from_user(&f, argp, sizeof f)) {
 			r = -EFAULT;
@@ -766,6 +929,7 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 		} else
 			filep = eventfp;
 		break;
+
 	case VHOST_SET_VRING_CALL:
 		if (copy_from_user(&f, argp, sizeof f)) {
 			r = -EFAULT;
@@ -863,7 +1027,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
 		for (i = 0; i < d->nvqs; ++i) {
 			struct vhost_virtqueue *vq;
 			void __user *base = (void __user *)(unsigned long)p;
-			vq = d->vqs + i;
+			vq = d->vqs[i];
 			mutex_lock(&vq->mutex);
 			/* If ring is inactive, will check when it's enabled. */
 			if (vq->private_data && !vq_log_access_ok(d, vq, base))
@@ -890,9 +1054,9 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
 		} else
 			filep = eventfp;
 		for (i = 0; i < d->nvqs; ++i) {
-			mutex_lock(&d->vqs[i].mutex);
-			d->vqs[i].log_ctx = d->log_ctx;
-			mutex_unlock(&d->vqs[i].mutex);
+			mutex_lock(&d->vqs[i]->mutex);
+			d->vqs[i]->log_ctx = d->log_ctx;
+			mutex_unlock(&d->vqs[i]->mutex);
 		}
 		if (ctx)
 			eventfd_ctx_put(ctx);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8de1fd5..12d4237 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -13,12 +13,13 @@
 #include
 #include
 
+#define VHOST_NUMA
 /* This is for zerocopy, used buffer len is set to 1 when lower device DMA
  * done */
 #define VHOST_DMA_DONE_LEN	1
 #define VHOST_DMA_CLEAR_LEN	0
 
-struct vhost_device;
+struct vhost_dev;
 struct vhost_work;
 
 typedef void (*vhost_work_fn_t)(struct vhost_work *work);
@@ -32,6 +33,8 @@ struct vhost_work {
 	unsigned		  done_seq;
 };
 
+struct vhost_sub_dev;
+
 /* Poll a file (eventfd or socket) */
 /* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
@@ -40,11 +43,13 @@ struct vhost_poll {
 	wait_queue_t              wait;
 	struct vhost_work	  work;
 	unsigned long		  mask;
-	struct vhost_dev	 *dev;
+	struct vhost_sub_dev	 *subdev;
 };
 
+void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
+		     poll_table *pt);
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-		     unsigned long mask, struct vhost_dev *dev);
+		     unsigned long mask, struct vhost_sub_dev *dev);
 void vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
@@ -70,7 +75,7 @@ void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *);
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
-
+	int node_id;
 	/* The actual ring of buffers. */
 	struct mutex mutex;
 	unsigned int num;
@@ -143,6 +148,14 @@ struct vhost_virtqueue {
 	struct vhost_ubuf_ref *ubufs;
 };
 
+struct vhost_sub_dev {
+	struct vhost_dev *owner;
+	int node_id;
+	spinlock_t work_lock;
+	struct list_head work_list;
+	struct task_struct *worker;
+};
+
 struct vhost_dev {
 	/* Readers use RCU to access memory table pointer
 	 * log base pointer and features.
@@ -151,16 +164,24 @@ struct vhost_dev {
 	struct mm_struct *mm;
 	struct mutex mutex;
 	unsigned acked_features;
-	struct vhost_virtqueue *vqs;
+	struct vhost_virtqueue **vqs;
 	int nvqs;
 	struct file *log_file;
 	struct eventfd_ctx *log_ctx;
-	spinlock_t work_lock;
-	struct list_head work_list;
-	struct task_struct *worker;
+	/* todo: change it to a bitmap */
+	unsigned long allow_map;
+	unsigned long node_cnt;
+	unsigned long zcopy_mask;
+	struct vhost_sub_dev **sub_devs;
 };
 
-long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
+int check_numa_bmp(unsigned long *numa_bmp, int sz);
+int vhost_dev_alloc_subdevs(struct vhost_dev *dev, unsigned long *numa_map,
+	int sz);
+void vhost_dev_free_subdevs(struct vhost_dev *dev);
+int vhost_dev_alloc_vqs(struct vhost_dev *dev, struct vhost_virtqueue **vqs,
+	int cnt, int *vqs_map, int sz, vhost_work_fn_t *handle_kick);
+long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
 long vhost_dev_check_owner(struct vhost_dev *);
 long vhost_dev_reset_owner(struct vhost_dev *);
 void vhost_dev_cleanup(struct vhost_dev *, bool locked);
@@ -216,6 +237,6 @@ static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
 	return acked_features & (1 << bit);
 }
 
-void vhost_enable_zcopy(int vq);
+void vhost_enable_zcopy(struct vhost_dev *dev, int rx);
 
 #endif
diff --git a/include/linux/vhost.h b/include/linux/vhost.h
index e847f1e..d8c76f1 100644
--- a/include/linux/vhost.h
+++ b/include/linux/vhost.h
@@ -120,7 +120,7 @@ struct vhost_memory {
  * used for transmit. Pass fd -1 to unbind from the socket and the transmit
  * device. This can be used to stop the ring (e.g. for migration). */
 #define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
-
+#define VHOST_NET_SET_NUMA _IOW(VHOST_VIRTIO, 0x31, unsigned long)
 /* Feature bits */
 /* Log all write descriptors. Can be changed while device is active. */
 #define VHOST_F_LOG_ALL 26
-- 
1.7.4.4
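
Note on intended usage: the vhost-net consumer of this API is in patch 2/2 and
is not shown here, so the following is only a rough, illustrative sketch of how
a backend might drive the new per-node interface when userspace issues
VHOST_NET_SET_NUMA. The names vhn_set_numa, vhn_handle_rx_kick,
vhn_handle_tx_kick and the VHN_MAX_NODES bound are hypothetical, invented for
this sketch; they are not code from the series.

#include <linux/bitops.h>
#include <linux/slab.h>
#include "vhost.h"

#define VHN_MAX_NODES 8					/* assumed bound, sketch only */

extern void vhn_handle_rx_kick(struct vhost_work *work);	/* assumed handler */
extern void vhn_handle_tx_kick(struct vhost_work *work);	/* assumed handler */

static long vhn_set_numa(struct vhost_dev *dev, unsigned long numa_map)
{
	/* one rx/tx virtqueue pair per requested node, matching the
	 * 2 * node_cnt layout that vhost_enable_zcopy() assumes */
	struct vhost_virtqueue *vqs[2 * VHN_MAX_NODES];
	vhost_work_fn_t kicks[2 * VHN_MAX_NODES];
	int vqs_map[2 * VHN_MAX_NODES];
	int node, i = 0;
	long r;

	if (hweight_long(numa_map) > VHN_MAX_NODES)
		return -EINVAL;
	/* reject bitmaps naming offline nodes */
	if (check_numa_bmp(&numa_map, BITS_PER_LONG) < 0)
		return -EINVAL;

	/* one sub device (and one worker thread) per requested node */
	r = vhost_dev_alloc_subdevs(dev, &numa_map, BITS_PER_LONG);
	if (r < 0)
		return r;

	/* place each node's rx/tx virtqueues on that node */
	for_each_set_bit(node, &numa_map, BITS_PER_LONG) {
		vqs_map[i] = node;
		kicks[i++] = vhn_handle_rx_kick;
		vqs_map[i] = node;
		kicks[i++] = vhn_handle_tx_kick;
	}

	r = vhost_dev_alloc_vqs(dev, vqs, i, vqs_map, i, kicks);
	if (r < 0) {
		vhost_dev_free_subdevs(dev);
		return r;
	}

	/* vhost_dev_init() returns 1 if some vq fell back to sub_devs[0]
	 * because no sub device exists on its requested node */
	return vhost_dev_init(dev, vqs, i);
}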