Ref to previous version discussion:
[PATCH 0/5] vhost: support upto 509 memory regions
http://www.spinics.net/lists/kvm/msg117654.html
Changelog v1->v2:
* fix spelling errors
* move "vhost: support upto 509 memory regions" to the end of queue
* move kvfree() from 1/6 to 2/6 where it belongs
* add vhost module parameter to enable/disable translation caching
Series extends vhost to support up to 509 memory regions,
and adds some vhost:translate_desc() performance improvements
so it won't regress when memslots are increased to 509.
It fixes a running VM crashing during memory hotplug due
to vhost refusing to accept more than 64 memory regions.
It's only host kernel side fix to make it work with QEMU
versions that support memory hotplug. But I'll continue
to work on QEMU side solution to reduce amount of memory
regions to make things even better.
Performance wise for guest with (in my case 3 memory regions)
and netperf's UDP_RR workload translate_desc() execution
time from total workload takes:
Memory |1G RAM|cached|non cached
regions # | 3 | 53 | 53
------------------------------------
upstream | 0.3% | - | 3.5%
------------------------------------
this series | 0.2% | 0.5% | 0.7%
where "non cached" column reflects a thrashing workload
with constant cache miss. More details on timing in
respective patches.
Igor Mammedov (6):
vhost: use binary search instead of linear in find_region()
vhost: extend memory regions allocation to vmalloc
vhost: add per VQ memory region caching
vhost: translate_desc: optimization for desc.len < region size
vhost: add 'translation_cache' module parameter
vhost: support upto 509 memory regions
drivers/vhost/vhost.c | 105 ++++++++++++++++++++++++++++++++++++++------------
drivers/vhost/vhost.h | 1 +
2 files changed, 82 insertions(+), 24 deletions(-)
--
1.8.3.1
For default region layouts performance stays the same
as linear search i.e. it takes around 210ns average for
translate_desc() that inlines find_region().
But it scales better with larger amount of regions,
235ns BS vs 300ns LS with 55 memory regions
and it will be about the same values when allowed number
of slots is increased to 509 like it has been done in KVM.
Signed-off-by: Igor Mammedov <[email protected]>
---
v2:
move kvfree() to 2/2 where it belongs
---
drivers/vhost/vhost.c | 36 +++++++++++++++++++++++++++---------
1 file changed, 27 insertions(+), 9 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ee2826..f1e07b8 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -25,6 +25,7 @@
#include <linux/kthread.h>
#include <linux/cgroup.h>
#include <linux/module.h>
+#include <linux/sort.h>
#include "vhost.h"
@@ -590,6 +591,16 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
}
EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
+static int vhost_memory_reg_sort_cmp(const void *p1, const void *p2)
+{
+ const struct vhost_memory_region *r1 = p1, *r2 = p2;
+ if (r1->guest_phys_addr < r2->guest_phys_addr)
+ return 1;
+ if (r1->guest_phys_addr > r2->guest_phys_addr)
+ return -1;
+ return 0;
+}
+
static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
{
struct vhost_memory mem, *newmem, *oldmem;
@@ -612,6 +623,8 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
kfree(newmem);
return -EFAULT;
}
+ sort(newmem->regions, newmem->nregions, sizeof(*newmem->regions),
+ vhost_memory_reg_sort_cmp, NULL);
if (!memory_access_ok(d, newmem, 0)) {
kfree(newmem);
@@ -913,17 +926,22 @@ EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
__u64 addr, __u32 len)
{
- struct vhost_memory_region *reg;
- int i;
+ const struct vhost_memory_region *reg;
+ int start = 0, end = mem->nregions;
- /* linear search is not brilliant, but we really have on the order of 6
- * regions in practice */
- for (i = 0; i < mem->nregions; ++i) {
- reg = mem->regions + i;
- if (reg->guest_phys_addr <= addr &&
- reg->guest_phys_addr + reg->memory_size - 1 >= addr)
- return reg;
+ while (start < end) {
+ int slot = start + (end - start) / 2;
+ reg = mem->regions + slot;
+ if (addr >= reg->guest_phys_addr)
+ end = slot;
+ else
+ start = slot + 1;
}
+
+ reg = mem->regions + start;
+ if (addr >= reg->guest_phys_addr &&
+ reg->guest_phys_addr + reg->memory_size > addr)
+ return reg;
return NULL;
}
--
1.8.3.1
with large number of memory regions we could end up with
high order allocations and kmalloc could fail if
host is under memory pressure.
Considering that memory regions array is used on hot path
try harder to allocate using kmalloc and if it fails resort
to vmalloc.
It's still better than just failing vhost_set_memory() and
causing guest crash due to it when a new memory hotplugged
to guest.
I'll still look at QEMU side solution to reduce amount of
memory regions it feeds to vhost to make things even better,
but it doesn't hurt for kernel to behave smarter and don't
crash older QEMU's which could use large amount of memory
regions.
Signed-off-by: Igor Mammedov <[email protected]>
---
drivers/vhost/vhost.c | 22 +++++++++++++++++-----
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f1e07b8..99931a0 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -471,7 +471,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
fput(dev->log_file);
dev->log_file = NULL;
/* No one will access memory at this point */
- kfree(dev->memory);
+ kvfree(dev->memory);
dev->memory = NULL;
WARN_ON(!list_empty(&dev->work_list));
if (dev->worker) {
@@ -601,6 +601,18 @@ static int vhost_memory_reg_sort_cmp(const void *p1, const void *p2)
return 0;
}
+static void *vhost_kvzalloc(unsigned long size)
+{
+ void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+
+ if (!n) {
+ n = vzalloc(size);
+ if (!n)
+ return ERR_PTR(-ENOMEM);
+ }
+ return n;
+}
+
static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
{
struct vhost_memory mem, *newmem, *oldmem;
@@ -613,21 +625,21 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
return -EOPNOTSUPP;
if (mem.nregions > VHOST_MEMORY_MAX_NREGIONS)
return -E2BIG;
- newmem = kmalloc(size + mem.nregions * sizeof *m->regions, GFP_KERNEL);
+ newmem = vhost_kvzalloc(size + mem.nregions * sizeof(*m->regions));
if (!newmem)
return -ENOMEM;
memcpy(newmem, &mem, size);
if (copy_from_user(newmem->regions, m->regions,
mem.nregions * sizeof *m->regions)) {
- kfree(newmem);
+ kvfree(newmem);
return -EFAULT;
}
sort(newmem->regions, newmem->nregions, sizeof(*newmem->regions),
vhost_memory_reg_sort_cmp, NULL);
if (!memory_access_ok(d, newmem, 0)) {
- kfree(newmem);
+ kvfree(newmem);
return -EFAULT;
}
oldmem = d->memory;
@@ -639,7 +651,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
d->vqs[i]->memory = newmem;
mutex_unlock(&d->vqs[i]->mutex);
}
- kfree(oldmem);
+ kvfree(oldmem);
return 0;
}
--
1.8.3.1
that brings down translate_desc() cost to around 210ns
if accessed descriptors are from the same memory region.
Signed-off-by: Igor Mammedov <[email protected]>
---
that's what netperf/iperf workloads were during testing.
---
drivers/vhost/vhost.c | 16 +++++++++++++---
drivers/vhost/vhost.h | 1 +
2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 99931a0..5c39a1e 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -200,6 +200,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->call = NULL;
vq->log_ctx = NULL;
vq->memory = NULL;
+ vq->cached_reg = 0;
}
static int vhost_worker(void *data)
@@ -649,6 +650,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
for (i = 0; i < d->nvqs; ++i) {
mutex_lock(&d->vqs[i]->mutex);
d->vqs[i]->memory = newmem;
+ d->vqs[i]->cached_reg = 0;
mutex_unlock(&d->vqs[i]->mutex);
}
kvfree(oldmem);
@@ -936,11 +938,17 @@ done:
EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
- __u64 addr, __u32 len)
+ __u64 addr, __u32 len,
+ int *cached_reg)
{
const struct vhost_memory_region *reg;
int start = 0, end = mem->nregions;
+ reg = mem->regions + *cached_reg;
+ if (likely(addr >= reg->guest_phys_addr &&
+ reg->guest_phys_addr + reg->memory_size > addr))
+ return reg;
+
while (start < end) {
int slot = start + (end - start) / 2;
reg = mem->regions + slot;
@@ -952,8 +960,10 @@ static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
reg = mem->regions + start;
if (addr >= reg->guest_phys_addr &&
- reg->guest_phys_addr + reg->memory_size > addr)
+ reg->guest_phys_addr + reg->memory_size > addr) {
+ *cached_reg = start;
return reg;
+ }
return NULL;
}
@@ -1107,7 +1117,7 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
ret = -ENOBUFS;
break;
}
- reg = find_region(mem, addr, len);
+ reg = find_region(mem, addr, len, &vq->cached_reg);
if (unlikely(!reg)) {
ret = -EFAULT;
break;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8c1c792..68bd00f 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -106,6 +106,7 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log *log;
+ int cached_reg;
};
struct vhost_dev {
--
1.8.3.1
when translating descriptors they are typically smaller than
the memory region that holds them and translated into 1 iov
entry, so it's not necessary to check remaining length
twice and calculate used length and next address
in such cases.
replace a remaining length and 'size' increment branches
with a single remaining length check and execute
next iov steps only when needed.
It saves a tiny 2% of translate_desc() execution time.
Signed-off-by: Igor Mammedov <[email protected]>
---
PS:
I'm not sure if iov_size > 0 is always true, if it's not
then better to drop this patch.
---
drivers/vhost/vhost.c | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5c39a1e..5bcb323 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1111,12 +1111,8 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
int ret = 0;
mem = vq->memory;
- while ((u64)len > s) {
+ do {
u64 size;
- if (unlikely(ret >= iov_size)) {
- ret = -ENOBUFS;
- break;
- }
reg = find_region(mem, addr, len, &vq->cached_reg);
if (unlikely(!reg)) {
ret = -EFAULT;
@@ -1124,13 +1120,22 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
}
_iov = iov + ret;
size = reg->memory_size - addr + reg->guest_phys_addr;
- _iov->iov_len = min((u64)len - s, size);
_iov->iov_base = (void __user *)(unsigned long)
(reg->userspace_addr + addr - reg->guest_phys_addr);
+ ++ret;
+ if (likely((u64)len - s < size)) {
+ _iov->iov_len = (u64)len - s;
+ break;
+ }
+
+ if (unlikely(ret >= iov_size)) {
+ ret = -ENOBUFS;
+ break;
+ }
+ _iov->iov_len = size;
s += size;
addr += size;
- ++ret;
- }
+ } while (1);
return ret;
}
--
1.8.3.1
by default translation of virtqueue descriptors is done with
caching enabled, but caching will add only extra cost
in cases of a thrashing workload where the majority of descriptors
are translated to different memory regions.
So add an option to allow excluding cache miss cost for such cases.
Performance with caching enabled for sequential workload
doesn't seem to be affected much vs version without static key switch,
i.e. still the same 0.2% of total time with key(NOPs) consuming
5ms on 5min workload.
Signed-off-by: Igor Mammedov <[email protected]>
---
I don't have a test case for a thrashing workload though, but jmp
instruction adds up ~6ms(55M instructions) minus excluded caching
around 24ms on 5min workload.
---
drivers/vhost/vhost.c | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5bcb323..78290b7 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -29,6 +29,13 @@
#include "vhost.h"
+struct static_key translation_cache_key = STATIC_KEY_INIT_TRUE;
+static bool translation_cache = true;
+module_param(translation_cache, bool, 0444);
+MODULE_PARM_DESC(translation_cache,
+ "Enables/disables virtqueue descriptor translation caching,"
+ " Set to 0 to disable. (default: 1)");
+
enum {
VHOST_MEMORY_MAX_NREGIONS = 64,
VHOST_MEMORY_F_LOG = 0x1,
@@ -944,10 +951,12 @@ static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
const struct vhost_memory_region *reg;
int start = 0, end = mem->nregions;
- reg = mem->regions + *cached_reg;
- if (likely(addr >= reg->guest_phys_addr &&
- reg->guest_phys_addr + reg->memory_size > addr))
- return reg;
+ if (static_key_true(&translation_cache_key)) {
+ reg = mem->regions + *cached_reg;
+ if (likely(addr >= reg->guest_phys_addr &&
+ reg->guest_phys_addr + reg->memory_size > addr))
+ return reg;
+ }
while (start < end) {
int slot = start + (end - start) / 2;
@@ -1612,6 +1621,9 @@ EXPORT_SYMBOL_GPL(vhost_disable_notify);
static int __init vhost_init(void)
{
+ if (!translation_cache)
+ static_key_slow_dec(&translation_cache_key);
+
return 0;
}
--
1.8.3.1
since commit
1d4e7e3 kvm: x86: increase user memory slots to 509
it became possible to use a bigger amount of memory
slots, which is used by memory hotplug for
registering hotplugged memory.
However QEMU crashes if it's used with more than ~60
pc-dimm devices and vhost-net since host kernel
in module vhost-net refuses to accept more than 64
memory regions.
Increase VHOST_MEMORY_MAX_NREGIONS limit from 64 to 509
to match KVM_USER_MEM_SLOTS to fix issue for vhost-net
and current QEMU versions.
Signed-off-by: Igor Mammedov <[email protected]>
---
drivers/vhost/vhost.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 78290b7..e93023e 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -37,7 +37,7 @@ MODULE_PARM_DESC(translation_cache,
" Set to 0 to disable. (default: 1)");
enum {
- VHOST_MEMORY_MAX_NREGIONS = 64,
+ VHOST_MEMORY_MAX_NREGIONS = 509,
VHOST_MEMORY_F_LOG = 0x1,
};
--
1.8.3.1