2020-01-15 03:52:28

by Yan Zhao

Subject: [PATCH v2 0/2] use vfio_dma_rw to read/write IOVAs from CPU side

It is better for a device model to use IOVAs to read/write guest memory.
And because the read/write operations are performed by the CPU, it is not
necessary to call vfio_pin_pages() to pin those pages.

Patch 1 introduces the interface vfio_dma_rw in vfio to read/write IOVAs
without pinning user space pages.

Patch 2 lets GVT switch from the kvm-side rw interfaces to vfio_dma_rw.
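
For reference, a minimal sketch of the intended calling convention from a
vendor driver (helper name hypothetical; the signature is the one
introduced in patch 1):

#include <linux/device.h>
#include <linux/vfio.h>

/* Hypothetical vendor-driver helper: fetch a guest-programmed
 * descriptor through its IOVA.  The copy is performed by the CPU,
 * so no page pinning is required. */
static int demo_fetch_desc(struct device *mdev_dev, dma_addr_t iova,
                           void *buf, size_t len)
{
        return vfio_dma_rw(mdev_dev, iova, buf, len, false /* read */);
}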

v2 changelog:
- rename vfio_iova_rw to vfio_dma_rw, and the vfio iommu driver op
.iova_rw to .dma_rw. (Alex)
- change iova and len from unsigned long to dma_addr_t and size_t,
respectively. (Alex)
- fix a possible overflow in dma->vaddr + iova - dma->iova + offset. (Alex)
- split DMAs on the max available size rather than on page boundaries, to
eliminate redundant vfio_dma searching and mm switching (Alex); see the
sketch below.
- add a check for the IOMMU_WRITE permission.
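
A sketch of the new per-chunk length computation (helper name
hypothetical; it mirrors the "count > dma->size - offset" clamp in
patch 1):

#include <linux/types.h>

/* Bytes that one iteration may copy: everything up to the end of the
 * containing vfio_dma, not merely up to the next page boundary. */
static size_t demo_chunk_len(dma_addr_t iova, dma_addr_t dma_iova,
                             size_t dma_size, size_t count)
{
        size_t avail = dma_size - (size_t)(iova - dma_iova);

        return count < avail ? count : avail;
}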

Yan Zhao (2):
vfio: introduce vfio_dma_rw to read/write a range of IOVAs
drm/i915/gvt: substitute kvm_read/write_guest with vfio_dma_rw

drivers/gpu/drm/i915/gvt/kvmgt.c | 26 +++--------
drivers/vfio/vfio.c | 45 +++++++++++++++++++
drivers/vfio/vfio_iommu_type1.c | 76 ++++++++++++++++++++++++++++++++
include/linux/vfio.h | 5 +++
4 files changed, 133 insertions(+), 19 deletions(-)

--
2.17.1


2020-01-15 04:04:59

by Yan Zhao

Subject: [PATCH v2 1/2] vfio: introduce vfio_dma_rw to read/write a range of IOVAs

vfio_dma_rw will read/write a range of user space memory pointed to by
IOVA into/from a kernel buffer without pinning the user space memory.

TODO: mark the IOVAs to user space memory dirty if they are written in
vfio_dma_rw().

Cc: Kevin Tian <[email protected]>
Signed-off-by: Yan Zhao <[email protected]>
---
drivers/vfio/vfio.c | 45 +++++++++++++++++++
drivers/vfio/vfio_iommu_type1.c | 76 +++++++++++++++++++++++++++++++++
include/linux/vfio.h | 5 +++
3 files changed, 126 insertions(+)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index c8482624ca34..8bd52bc841cf 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1961,6 +1961,51 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
}
EXPORT_SYMBOL(vfio_unpin_pages);

+/*
+ * Read/Write a range of IOVAs pointing to user space memory into/from a kernel
+ * buffer without pinning the user space memory
+ * @dev [in] : device
+ * @iova [in] : base IOVA of a user space buffer
+ * @data [in] : pointer to kernel buffer
+ * @len [in] : kernel buffer length
+ * @write : indicate read or write
+ * Return error code on failure or 0 on success.
+ */
+int vfio_dma_rw(struct device *dev, dma_addr_t iova, void *data,
+ size_t len, bool write)
+{
+ struct vfio_container *container;
+ struct vfio_group *group;
+ struct vfio_iommu_driver *driver;
+ int ret = 0;
+
+ if (!dev || !data || len <= 0)
+ return -EINVAL;
+
+ group = vfio_group_get_from_dev(dev);
+ if (!group)
+ return -ENODEV;
+
+ ret = vfio_group_add_container_user(group);
+ if (ret)
+ goto out;
+
+ container = group->container;
+ driver = container->iommu_driver;
+
+ if (likely(driver && driver->ops->dma_rw))
+ ret = driver->ops->dma_rw(container->iommu_data,
+ iova, data, len, write);
+ else
+ ret = -ENOTTY;
+
+ vfio_group_try_dissolve_container(group);
+out:
+ vfio_group_put(group);
+ return ret;
+}
+EXPORT_SYMBOL(vfio_dma_rw);
+
static int vfio_register_iommu_notifier(struct vfio_group *group,
unsigned long *events,
struct notifier_block *nb)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 2ada8e6cdb88..a2d850b97ae6 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -27,6 +27,7 @@
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mm.h>
+#include <linux/mmu_context.h>
#include <linux/rbtree.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
@@ -2326,6 +2327,80 @@ static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
return blocking_notifier_chain_unregister(&iommu->notifier, nb);
}

+static size_t vfio_iommu_type1_rw_dma_nopin(struct vfio_iommu *iommu,
+ dma_addr_t iova, void *data,
+ size_t count, bool write)
+{
+ struct mm_struct *mm;
+ unsigned long vaddr;
+ struct vfio_dma *dma;
+ bool kthread = current->mm == NULL;
+ size_t done = 0;
+ size_t offset;
+
+ dma = vfio_find_dma(iommu, iova, 1);
+ if (!dma)
+ return 0;
+
+ if (write && !(dma->prot & IOMMU_WRITE))
+ return 0;
+
+ mm = get_task_mm(dma->task);
+
+ if (!mm)
+ return 0;
+
+ if (kthread)
+ use_mm(mm);
+ else if (current->mm != mm)
+ goto out;
+
+ offset = iova - dma->iova;
+
+ if (count > dma->size - offset)
+ count = dma->size - offset;
+
+ vaddr = dma->vaddr + offset;
+
+ if (write)
+ done = __copy_to_user((void __user *)vaddr, data, count) ?
+ 0 : count;
+ else
+ done = __copy_from_user(data, (void __user *)vaddr, count) ?
+ 0 : count;
+
+ if (kthread)
+ unuse_mm(mm);
+out:
+ mmput(mm);
+ return done;
+}
+
+static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t iova,
+ void *data, size_t count, bool write)
+{
+ struct vfio_iommu *iommu = iommu_data;
+ int ret = 0;
+ size_t done = 0;
+
+ mutex_lock(&iommu->lock);
+ while (count > 0) {
+ done = vfio_iommu_type1_rw_dma_nopin(iommu, iova, data,
+ count, write);
+ if (!done) {
+ ret = -EFAULT;
+ break;
+ }
+
+ count -= done;
+ data += done;
+ iova += done;
+ }
+
+ mutex_unlock(&iommu->lock);
+ return ret;
+}
+
static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
.name = "vfio-iommu-type1",
.owner = THIS_MODULE,
@@ -2338,6 +2413,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
.unpin_pages = vfio_iommu_type1_unpin_pages,
.register_notifier = vfio_iommu_type1_register_notifier,
.unregister_notifier = vfio_iommu_type1_unregister_notifier,
+ .dma_rw = vfio_iommu_type1_dma_rw,
};

static int __init vfio_iommu_type1_init(void)
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index e42a711a2800..962f76a2d668 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -82,6 +82,8 @@ struct vfio_iommu_driver_ops {
struct notifier_block *nb);
int (*unregister_notifier)(void *iommu_data,
struct notifier_block *nb);
+ int (*dma_rw)(void *iommu_data, dma_addr_t iova,
+ void *data, size_t count, bool write);
};

extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
@@ -107,6 +109,9 @@ extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn,
int npage);

+extern int vfio_dma_rw(struct device *dev, dma_addr_t iova, void *data,
+ size_t len, bool write);
+
/* each type has independent events */
enum vfio_notify_type {
VFIO_IOMMU_NOTIFY = 0,
--
2.17.1

2020-01-15 04:05:26

by Yan Zhao

Subject: [PATCH v2 2/2] drm/i915/gvt: substitute kvm_read/write_guest with vfio_dma_rw

As a device model, it is better to read/write guest memory using the vfio
interface, so that vfio is able to maintain dirty info of device IOVAs.

Compared to the kvm interfaces kvm_read/write_guest(), vfio_dma_rw() has
~600 more cycles of overhead on average.

-------------------------------------
| interface | avg cpu cycles |
|-----------------------------------|
| kvm_write_guest | 1554 |
| ----------------------------------|
| kvm_read_guest | 707 |
|-----------------------------------|
| vfio_dma_rw(w) | 2274 |
|-----------------------------------|
| vfio_dma_rw(r) | 1378 |
-------------------------------------

A comparison of benchmark scores is below:
------------------------------------------------------
| avg score | kvm_read/write_guest | vfio_dma_rw |
|----------------------------------------------------|
| Glmark2 | 1284 | 1296 |
|----------------------------------------------------|
| Lightsmark | 61.24 | 61.27 |
|----------------------------------------------------|
| OpenArena | 140.9 | 137.4 |
|----------------------------------------------------|
| Heaven | 671 | 670 |
------------------------------------------------------
No obvious performance downgrade found.

Cc: Kevin Tian <[email protected]>
Signed-off-by: Yan Zhao <[email protected]>
---
drivers/gpu/drm/i915/gvt/kvmgt.c | 26 +++++++-------------------
1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index bd79a9718cc7..17edc9a7ff05 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1966,31 +1966,19 @@ static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
void *buf, unsigned long len, bool write)
{
struct kvmgt_guest_info *info;
- struct kvm *kvm;
- int idx, ret;
- bool kthread = current->mm == NULL;
+ int ret;
+ struct intel_vgpu *vgpu;
+ struct device *dev;

if (!handle_valid(handle))
return -ESRCH;

info = (struct kvmgt_guest_info *)handle;
- kvm = info->kvm;
-
- if (kthread) {
- if (!mmget_not_zero(kvm->mm))
- return -EFAULT;
- use_mm(kvm->mm);
- }
-
- idx = srcu_read_lock(&kvm->srcu);
- ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
- kvm_read_guest(kvm, gpa, buf, len);
- srcu_read_unlock(&kvm->srcu, idx);
+ vgpu = info->vgpu;
+ dev = mdev_dev(vgpu->vdev.mdev);

- if (kthread) {
- unuse_mm(kvm->mm);
- mmput(kvm->mm);
- }
+ ret = write ? vfio_dma_rw(dev, gpa, buf, len, true) :
+ vfio_dma_rw(dev, gpa, buf, len, false);

return ret;
}
--
2.17.1

2020-01-15 20:12:19

by Alex Williamson

Subject: Re: [PATCH v2 1/2] vfio: introduce vfio_dma_rw to read/write a range of IOVAs

On Tue, 14 Jan 2020 22:53:03 -0500
Yan Zhao <[email protected]> wrote:

> vfio_dma_rw will read/write a range of user space memory pointed to by
> IOVA into/from a kernel buffer without pinning the user space memory.
>
> TODO: mark the IOVAs to user space memory dirty if they are written in
> vfio_dma_rw().
>
> Cc: Kevin Tian <[email protected]>
> Signed-off-by: Yan Zhao <[email protected]>
> ---
> drivers/vfio/vfio.c | 45 +++++++++++++++++++
> drivers/vfio/vfio_iommu_type1.c | 76 +++++++++++++++++++++++++++++++++
> include/linux/vfio.h | 5 +++
> 3 files changed, 126 insertions(+)
>
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index c8482624ca34..8bd52bc841cf 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1961,6 +1961,51 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
> }
> EXPORT_SYMBOL(vfio_unpin_pages);
>
> +/*
> + * Read/Write a range of IOVAs pointing to user space memory into/from a kernel
> + * buffer without pinning the user space memory
> + * @dev [in] : device
> + * @iova [in] : base IOVA of a user space buffer
> + * @data [in] : pointer to kernel buffer
> + * @len [in] : kernel buffer length
> + * @write : indicate read or write
> + * Return error code on failure or 0 on success.
> + */
> +int vfio_dma_rw(struct device *dev, dma_addr_t iova, void *data,
> + size_t len, bool write)
> +{
> + struct vfio_container *container;
> + struct vfio_group *group;
> + struct vfio_iommu_driver *driver;
> + int ret = 0;
> +
> + if (!dev || !data || len <= 0)
> + return -EINVAL;
> +
> + group = vfio_group_get_from_dev(dev);
> + if (!group)
> + return -ENODEV;
> +
> + ret = vfio_group_add_container_user(group);
> + if (ret)
> + goto out;
> +
> + container = group->container;
> + driver = container->iommu_driver;
> +
> + if (likely(driver && driver->ops->dma_rw))
> + ret = driver->ops->dma_rw(container->iommu_data,
> + iova, data, len, write);
> + else
> + ret = -ENOTTY;
> +
> + vfio_group_try_dissolve_container(group);
> +out:
> + vfio_group_put(group);
> + return ret;
> +}
> +EXPORT_SYMBOL(vfio_dma_rw);
> +
> static int vfio_register_iommu_notifier(struct vfio_group *group,
> unsigned long *events,
> struct notifier_block *nb)
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 2ada8e6cdb88..a2d850b97ae6 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -27,6 +27,7 @@
> #include <linux/iommu.h>
> #include <linux/module.h>
> #include <linux/mm.h>
> +#include <linux/mmu_context.h>
> #include <linux/rbtree.h>
> #include <linux/sched/signal.h>
> #include <linux/sched/mm.h>
> @@ -2326,6 +2327,80 @@ static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
> return blocking_notifier_chain_unregister(&iommu->notifier, nb);
> }
>
> +static size_t vfio_iommu_type1_rw_dma_nopin(struct vfio_iommu *iommu,
> + dma_addr_t iova, void *data,
> + size_t count, bool write)

"_nopin"? It might be pinned, but that's irrelevant to this interface.
Maybe "_chunk" as we're only trying to operate on the chunk of the whole
that fits within the next vfio_dma? Also swapping rw_dma vs dma_rw,
pick one.

> +{
> + struct mm_struct *mm;
> + unsigned long vaddr;
> + struct vfio_dma *dma;
> + bool kthread = current->mm == NULL;
> + size_t done = 0;
> + size_t offset;
> +
> + dma = vfio_find_dma(iommu, iova, 1);
> + if (!dma)
> + return 0;
> +
> + if (write && !(dma->prot & IOMMU_WRITE))
> + return 0;

Good catch, but users can also designate a mapping without read
permissions, in which case this interface should not allow read either.
Thanks,

Alex
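
The combined check would be a small extension of the hunk above (a
sketch; IOMMU_READ is the matching read-permission flag from
linux/iommu.h):

        if (write && !(dma->prot & IOMMU_WRITE))
                return 0;
        if (!write && !(dma->prot & IOMMU_READ))
                return 0;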

> +
> + mm = get_task_mm(dma->task);
> +
> + if (!mm)
> + return 0;
> +
> + if (kthread)
> + use_mm(mm);
> + else if (current->mm != mm)
> + goto out;
> +
> + offset = iova - dma->iova;
> +
> + if (count > dma->size - offset)
> + count = dma->size - offset;
> +
> + vaddr = dma->vaddr + offset;
> +
> + if (write)
> + done = __copy_to_user((void __user *)vaddr, data, count) ?
> + 0 : count;
> + else
> + done = __copy_from_user(data, (void __user *)vaddr, count) ?
> + 0 : count;
> +
> + if (kthread)
> + unuse_mm(mm);
> +out:
> + mmput(mm);
> + return done;
> +}
> +
> +static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t iova,
> + void *data, size_t count, bool write)
> +{
> + struct vfio_iommu *iommu = iommu_data;
> + int ret = 0;
> + size_t done = 0;
> +
> + mutex_lock(&iommu->lock);
> + while (count > 0) {
> + done = vfio_iommu_type1_rw_dma_nopin(iommu, iova, data,
> + count, write);
> + if (!done) {
> + ret = -EFAULT;
> + break;
> + }
> +
> + count -= done;
> + data += done;
> + iova += done;
> + }
> +
> + mutex_unlock(&iommu->lock);
> + return ret;
> +}
> +
> static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
> .name = "vfio-iommu-type1",
> .owner = THIS_MODULE,
> @@ -2338,6 +2413,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
> .unpin_pages = vfio_iommu_type1_unpin_pages,
> .register_notifier = vfio_iommu_type1_register_notifier,
> .unregister_notifier = vfio_iommu_type1_unregister_notifier,
> + .dma_rw = vfio_iommu_type1_dma_rw,
> };
>
> static int __init vfio_iommu_type1_init(void)
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index e42a711a2800..962f76a2d668 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -82,6 +82,8 @@ struct vfio_iommu_driver_ops {
> struct notifier_block *nb);
> int (*unregister_notifier)(void *iommu_data,
> struct notifier_block *nb);
> + int (*dma_rw)(void *iommu_data, dma_addr_t iova,
> + void *data, size_t count, bool write);
> };
>
> extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
> @@ -107,6 +109,9 @@ extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn,
> int npage);
>
> +extern int vfio_dma_rw(struct device *dev, dma_addr_t iova, void *data,
> + size_t len, bool write);
> +
> /* each type has independent events */
> enum vfio_notify_type {
> VFIO_IOMMU_NOTIFY = 0,

2020-01-15 20:12:29

by Alex Williamson

Subject: Re: [PATCH v2 2/2] drm/i915/gvt: substitute kvm_read/write_guest with vfio_dma_rw

On Tue, 14 Jan 2020 22:54:55 -0500
Yan Zhao <[email protected]> wrote:

> As a device model, it is better to read/write guest memory using the vfio
> interface, so that vfio is able to maintain dirty info of device IOVAs.
>
> Compared to the kvm interfaces kvm_read/write_guest(), vfio_dma_rw() has
> ~600 more cycles of overhead on average.
>
> -------------------------------------
> | interface | avg cpu cycles |
> |-----------------------------------|
> | kvm_write_guest | 1554 |
> | ----------------------------------|
> | kvm_read_guest | 707 |
> |-----------------------------------|
> | vfio_dma_rw(w) | 2274 |
> |-----------------------------------|
> | vfio_dma_rw(r) | 1378 |
> -------------------------------------

In v1 you had:

-------------------------------------
| interface | avg cpu cycles |
|-----------------------------------|
| kvm_write_guest | 1546 |
| ----------------------------------|
| kvm_read_guest | 686 |
|-----------------------------------|
| vfio_iova_rw(w) | 2233 |
|-----------------------------------|
| vfio_iova_rw(r) | 1262 |
-------------------------------------

So the kvm numbers remained within +0.5-3% while the vfio numbers are
now +1.8-9.2%. I would have expected the algorithm change to at least
not be worse for small accesses and be better for accesses crossing
page boundaries. Do you know what happened?

> A comparison of benchmark scores is below:
> ------------------------------------------------------
> | avg score | kvm_read/write_guest | vfio_dma_rw |
> |----------------------------------------------------|
> | Glmark2 | 1284 | 1296 |
> |----------------------------------------------------|
> | Lightsmark | 61.24 | 61.27 |
> |----------------------------------------------------|
> | OpenArena | 140.9 | 137.4 |
> |----------------------------------------------------|
> | Heaven | 671 | 670 |
> ------------------------------------------------------
> No obvious performance downgrade found.
>
> Cc: Kevin Tian <[email protected]>
> Signed-off-by: Yan Zhao <[email protected]>
> ---
> drivers/gpu/drm/i915/gvt/kvmgt.c | 26 +++++++-------------------
> 1 file changed, 7 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
> index bd79a9718cc7..17edc9a7ff05 100644
> --- a/drivers/gpu/drm/i915/gvt/kvmgt.c
> +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
> @@ -1966,31 +1966,19 @@ static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
> void *buf, unsigned long len, bool write)
> {
> struct kvmgt_guest_info *info;
> - struct kvm *kvm;
> - int idx, ret;
> - bool kthread = current->mm == NULL;
> + int ret;
> + struct intel_vgpu *vgpu;
> + struct device *dev;
>
> if (!handle_valid(handle))
> return -ESRCH;
>
> info = (struct kvmgt_guest_info *)handle;
> - kvm = info->kvm;
> -
> - if (kthread) {
> - if (!mmget_not_zero(kvm->mm))
> - return -EFAULT;
> - use_mm(kvm->mm);
> - }
> -
> - idx = srcu_read_lock(&kvm->srcu);
> - ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
> - kvm_read_guest(kvm, gpa, buf, len);
> - srcu_read_unlock(&kvm->srcu, idx);
> + vgpu = info->vgpu;
> + dev = mdev_dev(vgpu->vdev.mdev);
>
> - if (kthread) {
> - unuse_mm(kvm->mm);
> - mmput(kvm->mm);
> - }
> + ret = write ? vfio_dma_rw(dev, gpa, buf, len, true) :
> + vfio_dma_rw(dev, gpa, buf, len, false);

As Paolo suggested previously, this can be simplified:

ret = vfio_dma_rw(dev, gpa, buf, len, write);

>
> return ret;

Or even more simple, remove the ret variable:

return vfio_dma_rw(dev, gpa, buf, len, write);

Thanks,
Alex

> }

2020-01-16 02:48:36

by Mika Penttilä

Subject: Re: [PATCH v2 1/2] vfio: introduce vfio_dma_rw to read/write a range of IOVAs



On 15.1.2020 22.06, Alex Williamson wrote:
> On Tue, 14 Jan 2020 22:53:03 -0500
> Yan Zhao <[email protected]> wrote:
>
>> vfio_dma_rw will read/write a range of user space memory pointed to by
>> IOVA into/from a kernel buffer without pinning the user space memory.
>>
>> TODO: mark the IOVAs to user space memory dirty if they are written in
>> vfio_dma_rw().
>>
>> Cc: Kevin Tian <[email protected]>
>> Signed-off-by: Yan Zhao <[email protected]>
>> ---
>> drivers/vfio/vfio.c | 45 +++++++++++++++++++
>> drivers/vfio/vfio_iommu_type1.c | 76 +++++++++++++++++++++++++++++++++
>> include/linux/vfio.h | 5 +++
>> 3 files changed, 126 insertions(+)
>>
>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>> index c8482624ca34..8bd52bc841cf 100644
>> --- a/drivers/vfio/vfio.c
>> +++ b/drivers/vfio/vfio.c
>> @@ -1961,6 +1961,51 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
>> }
>> EXPORT_SYMBOL(vfio_unpin_pages);
>>
>> +/*
>> + * Read/Write a range of IOVAs pointing to user space memory into/from a kernel
>> + * buffer without pinning the user space memory
>> + * @dev [in] : device
>> + * @iova [in] : base IOVA of a user space buffer
>> + * @data [in] : pointer to kernel buffer
>> + * @len [in] : kernel buffer length
>> + * @write : indicate read or write
>> + * Return error code on failure or 0 on success.
>> + */
>> +int vfio_dma_rw(struct device *dev, dma_addr_t iova, void *data,
>> + size_t len, bool write)
>> +{
>> + struct vfio_container *container;
>> + struct vfio_group *group;
>> + struct vfio_iommu_driver *driver;
>> + int ret = 0;

Do you know that the iova given to vfio_dma_rw() is indeed a gpa and not
an iova from an iommu mapping? So aren't you actually assuming that all
guest memory is pinned, as with device assignment?

Or who adds the vfio mapping before vfio_dma_rw(), and how?

Thanks,
Mika

2020-01-16 03:28:49

by Mika Penttilä

Subject: Re: [PATCH v2 1/2] vfio: introduce vfio_dma_rw to read/write a range of IOVAs



On 16.1.2020 4.59, Alex Williamson wrote:
> On Thu, 16 Jan 2020 02:30:52 +0000
> Mika Penttilä <[email protected]> wrote:
>
>> On 15.1.2020 22.06, Alex Williamson wrote:
>>> On Tue, 14 Jan 2020 22:53:03 -0500
>>> Yan Zhao <[email protected]> wrote:
>>>
>>>> vfio_dma_rw will read/write a range of user space memory pointed to by
>>>> IOVA into/from a kernel buffer without pinning the user space memory.
>>>>
>>>> TODO: mark the IOVAs to user space memory dirty if they are written in
>>>> vfio_dma_rw().
>>>>
>>>> Cc: Kevin Tian <[email protected]>
>>>> Signed-off-by: Yan Zhao <[email protected]>
>>>> ---
>>>> drivers/vfio/vfio.c | 45 +++++++++++++++++++
>>>> drivers/vfio/vfio_iommu_type1.c | 76 +++++++++++++++++++++++++++++++++
>>>> include/linux/vfio.h | 5 +++
>>>> 3 files changed, 126 insertions(+)
>>>>
>>>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>>>> index c8482624ca34..8bd52bc841cf 100644
>>>> --- a/drivers/vfio/vfio.c
>>>> +++ b/drivers/vfio/vfio.c
>>>> @@ -1961,6 +1961,51 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
>>>> }
>>>> EXPORT_SYMBOL(vfio_unpin_pages);
>>>>
>>>> +/*
>>>> + * Read/Write a range of IOVAs pointing to user space memory into/from a kernel
>>>> + * buffer without pinning the user space memory
>>>> + * @dev [in] : device
>>>> + * @iova [in] : base IOVA of a user space buffer
>>>> + * @data [in] : pointer to kernel buffer
>>>> + * @len [in] : kernel buffer length
>>>> + * @write : indicate read or write
>>>> + * Return error code on failure or 0 on success.
>>>> + */
>>>> +int vfio_dma_rw(struct device *dev, dma_addr_t iova, void *data,
>>>> + size_t len, bool write)
>>>> +{
>>>> + struct vfio_container *container;
>>>> + struct vfio_group *group;
>>>> + struct vfio_iommu_driver *driver;
>>>> + int ret = 0;
>> Do you know that the iova given to vfio_dma_rw() is indeed a gpa and not
>> an iova from an iommu mapping? So aren't you actually assuming that all
>> guest memory is pinned, as with device assignment?
>>
>> Or who adds the vfio mapping before vfio_dma_rw(), and how?
> vfio only knows about IOVAs, not GPAs. It's possible that IOVAs are
> identity mapped to the GPA space, but a VM with a vIOMMU would quickly
> break any such assumption. Pinning is also not required. This access
> is via the CPU, not the I/O device, so we don't require the memory to
> be pinned, and it potentially won't be for a non-IOMMU backed mediated
> device. The intention here is that via the mediation of an mdev
> device, a vendor driver would already know IOVA ranges for the device
> to access via the guest driver programming of the device. Thanks,
>
> Alex

Thanks Alex... you mean the IOVA is, in the iommu case, already an
iommu-translated address pointing to a user space VA in VM host space?
How does it get hold of that? What piece of mediation is responsible
for this?

--Mika

2020-01-16 03:54:15

by Alex Williamson

Subject: Re: [PATCH v2 1/2] vfio: introduce vfio_dma_rw to read/write a range of IOVAs

On Thu, 16 Jan 2020 02:30:52 +0000
Mika Penttilä <[email protected]> wrote:

> On 15.1.2020 22.06, Alex Williamson wrote:
> > On Tue, 14 Jan 2020 22:53:03 -0500
> > Yan Zhao <[email protected]> wrote:
> >
> >> vfio_dma_rw will read/write a range of user space memory pointed to by
> >> IOVA into/from a kernel buffer without pinning the user space memory.
> >>
> >> TODO: mark the IOVAs to user space memory dirty if they are written in
> >> vfio_dma_rw().
> >>
> >> Cc: Kevin Tian <[email protected]>
> >> Signed-off-by: Yan Zhao <[email protected]>
> >> ---
> >> drivers/vfio/vfio.c | 45 +++++++++++++++++++
> >> drivers/vfio/vfio_iommu_type1.c | 76 +++++++++++++++++++++++++++++++++
> >> include/linux/vfio.h | 5 +++
> >> 3 files changed, 126 insertions(+)
> >>
> >> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> >> index c8482624ca34..8bd52bc841cf 100644
> >> --- a/drivers/vfio/vfio.c
> >> +++ b/drivers/vfio/vfio.c
> >> @@ -1961,6 +1961,51 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
> >> }
> >> EXPORT_SYMBOL(vfio_unpin_pages);
> >>
> >> +/*
> >> + * Read/Write a range of IOVAs pointing to user space memory into/from a kernel
> >> + * buffer without pinning the user space memory
> >> + * @dev [in] : device
> >> + * @iova [in] : base IOVA of a user space buffer
> >> + * @data [in] : pointer to kernel buffer
> >> + * @len [in] : kernel buffer length
> >> + * @write : indicate read or write
> >> + * Return error code on failure or 0 on success.
> >> + */
> >> +int vfio_dma_rw(struct device *dev, dma_addr_t iova, void *data,
> >> + size_t len, bool write)
> >> +{
> >> + struct vfio_container *container;
> >> + struct vfio_group *group;
> >> + struct vfio_iommu_driver *driver;
> >> + int ret = 0;
>
> Do you know that the iova given to vfio_dma_rw() is indeed a gpa and not
> an iova from an iommu mapping? So aren't you actually assuming that all
> guest memory is pinned, as with device assignment?
>
> Or who adds the vfio mapping before vfio_dma_rw(), and how?

vfio only knows about IOVAs, not GPAs. It's possible that IOVAs are
identity mapped to the GPA space, but a VM with a vIOMMU would quickly
break any such assumption. Pinning is also not required. This access
is via the CPU, not the I/O device, so we don't require the memory to
be pinned, and it potentially won't be for a non-IOMMU backed mediated
device. The intention here is that via the mediation of an mdev
device, a vendor driver would already know IOVA ranges for the device
to access via the guest driver programming of the device. Thanks,

Alex

2020-01-16 05:45:00

by Yan Zhao

Subject: Re: [PATCH v2 1/2] vfio: introduce vfio_dma_rw to read/write a range of IOVAs

On Thu, Jan 16, 2020 at 04:06:38AM +0800, Alex Williamson wrote:
> On Tue, 14 Jan 2020 22:53:03 -0500
> Yan Zhao <[email protected]> wrote:
>
> > vfio_dma_rw will read/write a range of user space memory pointed to by
> > IOVA into/from a kernel buffer without pinning the user space memory.
> >
> > TODO: mark the IOVAs to user space memory dirty if they are written in
> > vfio_dma_rw().
> >
> > Cc: Kevin Tian <[email protected]>
> > Signed-off-by: Yan Zhao <[email protected]>
> > ---
> > drivers/vfio/vfio.c | 45 +++++++++++++++++++
> > drivers/vfio/vfio_iommu_type1.c | 76 +++++++++++++++++++++++++++++++++
> > include/linux/vfio.h | 5 +++
> > 3 files changed, 126 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> > index c8482624ca34..8bd52bc841cf 100644
> > --- a/drivers/vfio/vfio.c
> > +++ b/drivers/vfio/vfio.c
> > @@ -1961,6 +1961,51 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
> > }
> > EXPORT_SYMBOL(vfio_unpin_pages);
> >
> > +/*
> > + * Read/Write a range of IOVAs pointing to user space memory into/from a kernel
> > + * buffer without pinning the user space memory
> > + * @dev [in] : device
> > + * @iova [in] : base IOVA of a user space buffer
> > + * @data [in] : pointer to kernel buffer
> > + * @len [in] : kernel buffer length
> > + * @write : indicate read or write
> > + * Return error code on failure or 0 on success.
> > + */
> > +int vfio_dma_rw(struct device *dev, dma_addr_t iova, void *data,
> > + size_t len, bool write)
> > +{
> > + struct vfio_container *container;
> > + struct vfio_group *group;
> > + struct vfio_iommu_driver *driver;
> > + int ret = 0;
> > +
> > + if (!dev || !data || len <= 0)
> > + return -EINVAL;
> > +
> > + group = vfio_group_get_from_dev(dev);
> > + if (!group)
> > + return -ENODEV;
> > +
> > + ret = vfio_group_add_container_user(group);
> > + if (ret)
> > + goto out;
> > +
> > + container = group->container;
> > + driver = container->iommu_driver;
> > +
> > + if (likely(driver && driver->ops->dma_rw))
> > + ret = driver->ops->dma_rw(container->iommu_data,
> > + iova, data, len, write);
> > + else
> > + ret = -ENOTTY;
> > +
> > + vfio_group_try_dissolve_container(group);
> > +out:
> > + vfio_group_put(group);
> > + return ret;
> > +}
> > +EXPORT_SYMBOL(vfio_dma_rw);
> > +
> > static int vfio_register_iommu_notifier(struct vfio_group *group,
> > unsigned long *events,
> > struct notifier_block *nb)
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 2ada8e6cdb88..a2d850b97ae6 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -27,6 +27,7 @@
> > #include <linux/iommu.h>
> > #include <linux/module.h>
> > #include <linux/mm.h>
> > +#include <linux/mmu_context.h>
> > #include <linux/rbtree.h>
> > #include <linux/sched/signal.h>
> > #include <linux/sched/mm.h>
> > @@ -2326,6 +2327,80 @@ static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
> > return blocking_notifier_chain_unregister(&iommu->notifier, nb);
> > }
> >
> > +static size_t vfio_iommu_type1_rw_dma_nopin(struct vfio_iommu *iommu,
> > + dma_addr_t iova, void *data,
> > + size_t count, bool write)
>
> "_nopin"? It might be pinned, but that's irrelevant to this interface.
> Maybe "_chunk" as we're only trying to operate on the chunk of the whole
> that fits within the next vfio_dma? Also swapping rw_dma vs dma_rw,
> pick one.
>
got it!

> > +{
> > + struct mm_struct *mm;
> > + unsigned long vaddr;
> > + struct vfio_dma *dma;
> > + bool kthread = current->mm == NULL;
> > + size_t done = 0;
> > + size_t offset;
> > +
> > + dma = vfio_find_dma(iommu, iova, 1);
> > + if (!dma)
> > + return 0;
> > +
> > + if (write && !(dma->prot & IOMMU_WRITE))
> > + return 0;
>
> Good catch, but users can also designate a mapping without read
> permissions, in which case this interface should not allow read either.
> Thanks,
>
yes, will add it too. thanks :)

Yan

> > +
> > + mm = get_task_mm(dma->task);
> > +
> > + if (!mm)
> > + return 0;
> > +
> > + if (kthread)
> > + use_mm(mm);
> > + else if (current->mm != mm)
> > + goto out;
> > +
> > + offset = iova - dma->iova;
> > +
> > + if (count > dma->size - offset)
> > + count = dma->size - offset;
> > +
> > + vaddr = dma->vaddr + offset;
> > +
> > + if (write)
> > + done = __copy_to_user((void __user *)vaddr, data, count) ?
> > + 0 : count;
> > + else
> > + done = __copy_from_user(data, (void __user *)vaddr, count) ?
> > + 0 : count;
> > +
> > + if (kthread)
> > + unuse_mm(mm);
> > +out:
> > + mmput(mm);
> > + return done;
> > +}
> > +
> > +static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t iova,
> > + void *data, size_t count, bool write)
> > +{
> > + struct vfio_iommu *iommu = iommu_data;
> > + int ret = 0;
> > + size_t done = 0;
> > +
> > + mutex_lock(&iommu->lock);
> > + while (count > 0) {
> > + done = vfio_iommu_type1_rw_dma_nopin(iommu, iova, data,
> > + count, write);
> > + if (!done) {
> > + ret = -EFAULT;
> > + break;
> > + }
> > +
> > + count -= done;
> > + data += done;
> > + iova += done;
> > + }
> > +
> > + mutex_unlock(&iommu->lock);
> > + return ret;
> > +}
> > +
> > static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
> > .name = "vfio-iommu-type1",
> > .owner = THIS_MODULE,
> > @@ -2338,6 +2413,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
> > .unpin_pages = vfio_iommu_type1_unpin_pages,
> > .register_notifier = vfio_iommu_type1_register_notifier,
> > .unregister_notifier = vfio_iommu_type1_unregister_notifier,
> > + .dma_rw = vfio_iommu_type1_dma_rw,
> > };
> >
> > static int __init vfio_iommu_type1_init(void)
> > diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> > index e42a711a2800..962f76a2d668 100644
> > --- a/include/linux/vfio.h
> > +++ b/include/linux/vfio.h
> > @@ -82,6 +82,8 @@ struct vfio_iommu_driver_ops {
> > struct notifier_block *nb);
> > int (*unregister_notifier)(void *iommu_data,
> > struct notifier_block *nb);
> > + int (*dma_rw)(void *iommu_data, dma_addr_t iova,
> > + void *data, size_t count, bool write);
> > };
> >
> > extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
> > @@ -107,6 +109,9 @@ extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> > extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn,
> > int npage);
> >
> > +extern int vfio_dma_rw(struct device *dev, dma_addr_t iova, void *data,
> > + size_t len, bool write);
> > +
> > /* each type has independent events */
> > enum vfio_notify_type {
> > VFIO_IOMMU_NOTIFY = 0,
>

2020-01-16 06:15:51

by Alex Williamson

Subject: Re: [PATCH v2 1/2] vfio: introduce vfio_dma_rw to read/write a range of IOVAs

On Thu, 16 Jan 2020 03:15:58 +0000
Mika Penttilä <[email protected]> wrote:

> On 16.1.2020 4.59, Alex Williamson wrote:
> > On Thu, 16 Jan 2020 02:30:52 +0000
> > Mika Penttilä <[email protected]> wrote:
> >
> >> On 15.1.2020 22.06, Alex Williamson wrote:
> >>> On Tue, 14 Jan 2020 22:53:03 -0500
> >>> Yan Zhao <[email protected]> wrote:
> >>>
> >>>> vfio_dma_rw will read/write a range of user space memory pointed to by
> >>>> IOVA into/from a kernel buffer without pinning the user space memory.
> >>>>
> >>>> TODO: mark the IOVAs to user space memory dirty if they are written in
> >>>> vfio_dma_rw().
> >>>>
> >>>> Cc: Kevin Tian <[email protected]>
> >>>> Signed-off-by: Yan Zhao <[email protected]>
> >>>> ---
> >>>> drivers/vfio/vfio.c | 45 +++++++++++++++++++
> >>>> drivers/vfio/vfio_iommu_type1.c | 76 +++++++++++++++++++++++++++++++++
> >>>> include/linux/vfio.h | 5 +++
> >>>> 3 files changed, 126 insertions(+)
> >>>>
> >>>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> >>>> index c8482624ca34..8bd52bc841cf 100644
> >>>> --- a/drivers/vfio/vfio.c
> >>>> +++ b/drivers/vfio/vfio.c
> >>>> @@ -1961,6 +1961,51 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
> >>>> }
> >>>> EXPORT_SYMBOL(vfio_unpin_pages);
> >>>>
> >>>> +/*
> >>>> + * Read/Write a range of IOVAs pointing to user space memory into/from a kernel
> >>>> + * buffer without pinning the user space memory
> >>>> + * @dev [in] : device
> >>>> + * @iova [in] : base IOVA of a user space buffer
> >>>> + * @data [in] : pointer to kernel buffer
> >>>> + * @len [in] : kernel buffer length
> >>>> + * @write : indicate read or write
> >>>> + * Return error code on failure or 0 on success.
> >>>> + */
> >>>> +int vfio_dma_rw(struct device *dev, dma_addr_t iova, void *data,
> >>>> + size_t len, bool write)
> >>>> +{
> >>>> + struct vfio_container *container;
> >>>> + struct vfio_group *group;
> >>>> + struct vfio_iommu_driver *driver;
> >>>> + int ret = 0;
> >> Do you know that the iova given to vfio_dma_rw() is indeed a gpa and not
> >> an iova from an iommu mapping? So aren't you actually assuming that all
> >> guest memory is pinned, as with device assignment?
> >>
> >> Or who adds the vfio mapping before vfio_dma_rw(), and how?
> > vfio only knows about IOVAs, not GPAs. It's possible that IOVAs are
> > identity mapped to the GPA space, but a VM with a vIOMMU would quickly
> > break any such assumption. Pinning is also not required. This access
> > is via the CPU, not the I/O device, so we don't require the memory to
> > be pinned, and it potentially won't be for a non-IOMMU backed mediated
> > device. The intention here is that via the mediation of an mdev
> > device, a vendor driver would already know IOVA ranges for the device
> > to access via the guest driver programming of the device. Thanks,
> >
> > Alex
>
> Thanks Alex... you mean the IOVA is, in the iommu case, already an
> iommu-translated address pointing to a user space VA in VM host space?

The user (QEMU in the case of device assignment) performs ioctls to map
user VAs to IOVAs for the device. With IOMMU backing the VAs are
pinned to get HPA and the IOVA to HPA mappings are programmed into the
IOMMU. Thus the device accesses the IOVA to get to the HPA, which is
the backing for the VA. In this case we're simply using the IOVA to
lookup the VA and access it with the CPU directly. The IOMMU isn't
involved, but we're still performing an access as if we were the device
doing a DMA. Let me know if that doesn't answer your question.
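
Concretely, that user mapping is established with the VFIO_IOMMU_MAP_DMA
ioctl; a minimal sketch of the userspace side (container/group setup and
error handling omitted):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Map 'size' bytes of process memory at 'vaddr' so that the device
 * (and vfio_dma_rw()) can reach them at 'iova'. */
int demo_map_dma(int container_fd, void *vaddr, uint64_t iova, uint64_t size)
{
        struct vfio_iommu_type1_dma_map map = {
                .argsz = sizeof(map),
                .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
                .vaddr = (uintptr_t)vaddr,
                .iova  = iova,
                .size  = size,
        };

        /* This call is what populates the vfio_dma tree that
         * vfio_find_dma() walks in the patch above. */
        return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
}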

> How does it get hold of that? What piece of mediation is responsible
> for this?

It's device specific. The mdev vendor driver is mediating a specific
hardware device where user accesses to MMIO on the device configure
DMA targets. The mediation needs to trap those accesses in order to
pin pages and program the real hardware with real physical addresses (be
they HPA or host-IOVAs depending on the host IOMMU config) to perform
those DMAs. For cases where the CPU might choose to perform some sort
of virtual DMA on behalf of the device itself, this interface would be
used. Thanks,

Alex

2020-01-16 06:39:03

by Yan Zhao

Subject: Re: [PATCH v2 2/2] drm/i915/gvt: substitute kvm_read/write_guest with vfio_dma_rw

On Thu, Jan 16, 2020 at 04:06:51AM +0800, Alex Williamson wrote:
> On Tue, 14 Jan 2020 22:54:55 -0500
> Yan Zhao <[email protected]> wrote:
>
> > As a device model, it is better to read/write guest memory using the vfio
> > interface, so that vfio is able to maintain dirty info of device IOVAs.
> >
> > Compared to the kvm interfaces kvm_read/write_guest(), vfio_dma_rw() has
> > ~600 more cycles of overhead on average.
> >
> > -------------------------------------
> > | interface | avg cpu cycles |
> > |-----------------------------------|
> > | kvm_write_guest | 1554 |
> > | ----------------------------------|
> > | kvm_read_guest | 707 |
> > |-----------------------------------|
> > | vfio_dma_rw(w) | 2274 |
> > |-----------------------------------|
> > | vfio_dma_rw(r) | 1378 |
> > -------------------------------------
>
> In v1 you had:
>
> -------------------------------------
> | interface | avg cpu cycles |
> |-----------------------------------|
> | kvm_write_guest | 1546 |
> | ----------------------------------|
> | kvm_read_guest | 686 |
> |-----------------------------------|
> | vfio_iova_rw(w) | 2233 |
> |-----------------------------------|
> | vfio_iova_rw(r) | 1262 |
> -------------------------------------
>
> So the kvm numbers remained within +0.5-3% while the vfio numbers are
> now +1.8-9.2%. I would have expected the algorithm change to at least
> not be worse for small accesses and be better for accesses crossing
> page boundaries. Do you know what happened?
>
I only tested the 4 interfaces in GVT's environment, where most of the
guest memory accesses are less than one page.
And the different fluctuations should be caused by the locks.
vfio_dma_rw contends for locks with other vfio accesses, which are assumed
to be abundant in the GVT case.

> > A comparison of benchmark scores is below:
> > ------------------------------------------------------
> > | avg score | kvm_read/write_guest | vfio_dma_rw |
> > |----------------------------------------------------|
> > | Glmark2 | 1284 | 1296 |
> > |----------------------------------------------------|
> > | Lightsmark | 61.24 | 61.27 |
> > |----------------------------------------------------|
> > | OpenArena | 140.9 | 137.4 |
> > |----------------------------------------------------|
> > | Heaven | 671 | 670 |
> > ------------------------------------------------------
> > No obvious performance downgrade found.
> >
> > Cc: Kevin Tian <[email protected]>
> > Signed-off-by: Yan Zhao <[email protected]>
> > ---
> > drivers/gpu/drm/i915/gvt/kvmgt.c | 26 +++++++-------------------
> > 1 file changed, 7 insertions(+), 19 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
> > index bd79a9718cc7..17edc9a7ff05 100644
> > --- a/drivers/gpu/drm/i915/gvt/kvmgt.c
> > +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
> > @@ -1966,31 +1966,19 @@ static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
> > void *buf, unsigned long len, bool write)
> > {
> > struct kvmgt_guest_info *info;
> > - struct kvm *kvm;
> > - int idx, ret;
> > - bool kthread = current->mm == NULL;
> > + int ret;
> > + struct intel_vgpu *vgpu;
> > + struct device *dev;
> >
> > if (!handle_valid(handle))
> > return -ESRCH;
> >
> > info = (struct kvmgt_guest_info *)handle;
> > - kvm = info->kvm;
> > -
> > - if (kthread) {
> > - if (!mmget_not_zero(kvm->mm))
> > - return -EFAULT;
> > - use_mm(kvm->mm);
> > - }
> > -
> > - idx = srcu_read_lock(&kvm->srcu);
> > - ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
> > - kvm_read_guest(kvm, gpa, buf, len);
> > - srcu_read_unlock(&kvm->srcu, idx);
> > + vgpu = info->vgpu;
> > + dev = mdev_dev(vgpu->vdev.mdev);
> >
> > - if (kthread) {
> > - unuse_mm(kvm->mm);
> > - mmput(kvm->mm);
> > - }
> > + ret = write ? vfio_dma_rw(dev, gpa, buf, len, true) :
> > + vfio_dma_rw(dev, gpa, buf, len, false);
>
> As Paolo suggested previously, this can be simplified:
>
> ret = vfio_dma_rw(dev, gpa, buf, len, write);
>
> >
> > return ret;
>
> Or even more simple, remove the ret variable:
>
> return vfio_dma_rw(dev, gpa, buf, len, write);
>
oh, it seems that I missed Paolo's mail. will change it. thank you!

Thanks
Yan
>
> > }
>

2020-01-16 15:38:56

by Alex Williamson

Subject: Re: [PATCH v2 2/2] drm/i915/gvt: substitute kvm_read/write_guest with vfio_dma_rw

On Thu, 16 Jan 2020 00:49:41 -0500
Yan Zhao <[email protected]> wrote:

> On Thu, Jan 16, 2020 at 04:06:51AM +0800, Alex Williamson wrote:
> > On Tue, 14 Jan 2020 22:54:55 -0500
> > Yan Zhao <[email protected]> wrote:
> >
> > > As a device model, it is better to read/write guest memory using the vfio
> > > interface, so that vfio is able to maintain dirty info of device IOVAs.
> > >
> > > Compared to the kvm interfaces kvm_read/write_guest(), vfio_dma_rw() has
> > > ~600 more cycles of overhead on average.
> > >
> > > -------------------------------------
> > > | interface | avg cpu cycles |
> > > |-----------------------------------|
> > > | kvm_write_guest | 1554 |
> > > | ----------------------------------|
> > > | kvm_read_guest | 707 |
> > > |-----------------------------------|
> > > | vfio_dma_rw(w) | 2274 |
> > > |-----------------------------------|
> > > | vfio_dma_rw(r) | 1378 |
> > > -------------------------------------
> >
> > In v1 you had:
> >
> > -------------------------------------
> > | interface | avg cpu cycles |
> > |-----------------------------------|
> > | kvm_write_guest | 1546 |
> > | ----------------------------------|
> > | kvm_read_guest | 686 |
> > |-----------------------------------|
> > | vfio_iova_rw(w) | 2233 |
> > |-----------------------------------|
> > | vfio_iova_rw(r) | 1262 |
> > -------------------------------------
> >
> > So the kvm numbers remained within +0.5-3% while the vfio numbers are
> > now +1.8-9.2%. I would have expected the algorithm change to at least
> > not be worse for small accesses and be better for accesses crossing
> > page boundaries. Do you know what happened?
> >
> I only tested the 4 interfaces in GVT's environment, where most of the
> guest memory accesses are less than one page.
> And the different fluctuations should be caused by the locks.
> vfio_dma_rw contends for locks with other vfio accesses, which are assumed
> to be abundant in the GVT case.

Hmm, so maybe it's time to convert vfio_iommu.lock from a mutex to a
rwsem? Thanks,

Alex
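
A sketch of the suggested conversion (hypothetical; the real change
would touch every iommu->lock user in vfio_iommu_type1.c). Read-mostly
paths such as dma_rw would take the semaphore shared, while map/unmap
paths that mutate the vfio_dma tree keep exclusive access:

#include <linux/rwsem.h>

static DECLARE_RWSEM(demo_lock);        /* stands in for iommu->lock */

static void demo_dma_rw_path(void)
{
        down_read(&demo_lock);          /* was mutex_lock(); readers now run concurrently */
        /* vfio_find_dma() lookup, copy to/from user */
        up_read(&demo_lock);
}

static void demo_map_unmap_path(void)
{
        down_write(&demo_lock);         /* tree updates stay exclusive */
        /* insert or remove vfio_dma nodes */
        up_write(&demo_lock);
}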

> > > A comparison of benchmark scores is below:
> > > ------------------------------------------------------
> > > | avg score | kvm_read/write_guest | vfio_dma_rw |
> > > |----------------------------------------------------|
> > > | Glmark2 | 1284 | 1296 |
> > > |----------------------------------------------------|
> > > | Lightsmark | 61.24 | 61.27 |
> > > |----------------------------------------------------|
> > > | OpenArena | 140.9 | 137.4 |
> > > |----------------------------------------------------|
> > > | Heaven | 671 | 670 |
> > > ------------------------------------------------------
> > > No obvious performance downgrade found.
> > >
> > > Cc: Kevin Tian <[email protected]>
> > > Signed-off-by: Yan Zhao <[email protected]>
> > > ---
> > > drivers/gpu/drm/i915/gvt/kvmgt.c | 26 +++++++-------------------
> > > 1 file changed, 7 insertions(+), 19 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
> > > index bd79a9718cc7..17edc9a7ff05 100644
> > > --- a/drivers/gpu/drm/i915/gvt/kvmgt.c
> > > +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
> > > @@ -1966,31 +1966,19 @@ static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
> > > void *buf, unsigned long len, bool write)
> > > {
> > > struct kvmgt_guest_info *info;
> > > - struct kvm *kvm;
> > > - int idx, ret;
> > > - bool kthread = current->mm == NULL;
> > > + int ret;
> > > + struct intel_vgpu *vgpu;
> > > + struct device *dev;
> > >
> > > if (!handle_valid(handle))
> > > return -ESRCH;
> > >
> > > info = (struct kvmgt_guest_info *)handle;
> > > - kvm = info->kvm;
> > > -
> > > - if (kthread) {
> > > - if (!mmget_not_zero(kvm->mm))
> > > - return -EFAULT;
> > > - use_mm(kvm->mm);
> > > - }
> > > -
> > > - idx = srcu_read_lock(&kvm->srcu);
> > > - ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
> > > - kvm_read_guest(kvm, gpa, buf, len);
> > > - srcu_read_unlock(&kvm->srcu, idx);
> > > + vgpu = info->vgpu;
> > > + dev = mdev_dev(vgpu->vdev.mdev);
> > >
> > > - if (kthread) {
> > > - unuse_mm(kvm->mm);
> > > - mmput(kvm->mm);
> > > - }
> > > + ret = write ? vfio_dma_rw(dev, gpa, buf, len, true) :
> > > + vfio_dma_rw(dev, gpa, buf, len, false);
> >
> > As Paolo suggested previously, this can be simplified:
> >
> > ret = vfio_dma_rw(dev, gpa, buf, len, write);
> >
> > >
> > > return ret;
> >
> > Or even more simple, remove the ret variable:
> >
> > return vfio_dma_rw(dev, gpa, buf, len, write);
> >
> oh, it seems that I missed Paolo's mail. will change it. thank you!
>
> Thanks
> Yan
> >
> > > }
> >
>

2020-01-19 10:17:01

by Yan Zhao

Subject: Re: [PATCH v2 2/2] drm/i915/gvt: substitute kvm_read/write_guest with vfio_dma_rw

On Thu, Jan 16, 2020 at 11:37:29PM +0800, Alex Williamson wrote:
> On Thu, 16 Jan 2020 00:49:41 -0500
> Yan Zhao <[email protected]> wrote:
>
> > On Thu, Jan 16, 2020 at 04:06:51AM +0800, Alex Williamson wrote:
> > > On Tue, 14 Jan 2020 22:54:55 -0500
> > > Yan Zhao <[email protected]> wrote:
> > >
> > > > As a device model, it is better to read/write guest memory using the vfio
> > > > interface, so that vfio is able to maintain dirty info of device IOVAs.
> > > >
> > > > Compared to the kvm interfaces kvm_read/write_guest(), vfio_dma_rw() has
> > > > ~600 more cycles of overhead on average.
> > > >
> > > > -------------------------------------
> > > > | interface | avg cpu cycles |
> > > > |-----------------------------------|
> > > > | kvm_write_guest | 1554 |
> > > > | ----------------------------------|
> > > > | kvm_read_guest | 707 |
> > > > |-----------------------------------|
> > > > | vfio_dma_rw(w) | 2274 |
> > > > |-----------------------------------|
> > > > | vfio_dma_rw(r) | 1378 |
> > > > -------------------------------------
> > >
> > > In v1 you had:
> > >
> > > -------------------------------------
> > > | interface | avg cpu cycles |
> > > |-----------------------------------|
> > > | kvm_write_guest | 1546 |
> > > | ----------------------------------|
> > > | kvm_read_guest | 686 |
> > > |-----------------------------------|
> > > | vfio_iova_rw(w) | 2233 |
> > > |-----------------------------------|
> > > | vfio_iova_rw(r) | 1262 |
> > > -------------------------------------
> > >
> > > So the kvm numbers remained within +0.5-3% while the vfio numbers are
> > > now +1.8-9.2%. I would have expected the algorithm change to at least
> > > not be worse for small accesses and be better for accesses crossing
> > > page boundaries. Do you know what happened?
> > >
> > I only tested the 4 interfaces in GVT's environment, where most of the
> > guest memory accesses are less than one page.
> > And the different fluctuations should be caused by the locks.
> > vfio_dma_rw contends for locks with other vfio accesses, which are assumed
> > to be abundant in the GVT case.
>
> Hmm, so maybe it's time to convert vfio_iommu.lock from a mutex to a
> rwsem? Thanks,
>

hi Alex
I tested your rwsem patches at (https://lkml.org/lkml/2020/1/16/1869).
They work without any runtime errors on my side. :)
However, I found out that the previous fluctuation may be because I didn't
take read/write counts into account.
For example, though the two tests have different avg read/write cycles,
their overall average cycles are almost the same.
______________________________________________________________________
| | avg read | | avg write | | |
| | cycles | read cnt | cycles | write cnt | avg cycles |
|----------------------------------------------------------------------|
| test 1 | 1339 | 29,587,120 | 2258 | 17,098,364 | 1676 |
| test 2 | 1340 | 28,454,262 | 2238 | 16,501,788 | 1670 |
----------------------------------------------------------------------

After measuring the exact read/write cnt and cycles of a specific workload,
I get below findings:

(1) with single VM running glmark2 inside.
glmark2: 40M+ read+write cnt, among which 63% is read.
among reads, 48% is of PAGE_SIZE, the rest is less than a page.
among writes, 100% is less than a page.

__________________________________________________
| cycles | read | write | avg | inc |
|--------------------------------------------------|
| kvm_read/write_page | 694 | 1506 | 993 | / |
|--------------------------------------------------|
| vfio_dma_rw(mutex) | 1340 | 2248 | 1673 | 680 |
|--------------------------------------------------|
| vfio_dma_rw(rwsem r) | 1323 | 2198 | 1645 | 653 |
---------------------------------------------------

So vfio_dma_rw generally has 650+ more cycles per read/write.
While kvm->srcu costs 160 cycles on average with one VM running, the
cycles spent on locks in vfio_dma_rw break down like this:
___________________________
| cycles | avg |
|---------------------------|
| iommu->lock | 117 |
|---------------------------|
| vfio.group_lock | 108 |
|---------------------------|
| group->unbound_lock | 114 |
|---------------------------|
| group->device_lock | 115 |
|---------------------------|
| group->mutex | 113 |
---------------------------

I measured that a mutex without any contention costs 104 cycles on
average (including the time for get_cycles(), measured in the same way
as the other locks). So the contention on a single lock in a single-VM
environment is light, probably because there's a vgpu lock held in GVT
already.
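
For reference, a sketch of the measurement pattern implied above (the
actual instrumentation was not posted; the names here are made up):

#include <asm/timex.h>
#include <linux/mutex.h>

static u64 demo_cycles_sum;     /* accumulated lock-acquisition cycles */
static u64 demo_samples;

static void demo_time_lock(struct mutex *lock)
{
        cycles_t t0 = get_cycles();

        mutex_lock(lock);
        /* includes ~20 cycles of get_cycles() overhead, as noted below */
        demo_cycles_sum += get_cycles() - t0;
        demo_samples++;

        mutex_unlock(lock);
}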

(2) with two VMs each running glmark2 inside.
The contention increases a little.

___________________________________________________
| cycles | read | write | avg | inc |
|---------------------------------------------------|
| kvm_read/write_page | 1035 | 1832 | 1325 | / |
|---------------------------------------------------|
| vfio_dma_rw(mutex) | 2104 | 2886 | 2390 | 1065 |
|---------------------------------------------------|
| vfio_dma_rw(rwsem r) | 1965 | 2778 | 2260 | 935 |
---------------------------------------------------


-----------------------------------------------
| avg cycles | one VM | two VMs |
|-----------------------------------------------|
| iommu lock (mutex) | 117 | 150 |
|-----------------------------------|-----------|
| iommu lock (rwsem r) | 117 | 156 |
|-----------------------------------|-----------|
| kvm->srcu | 160 | 213 |
-----------------------------------------------

In the kvm case, avg cycles increased by 332, while kvm->srcu only cost
213 cycles. The rest, 109 cycles, may be spent on atomic operations.
But I didn't measure them, as the get_cycles() operation itself would
skew the final cycles by ~20.


Thanks
Yan




>
> > > > A comparison of benchmark scores is below:
> > > > ------------------------------------------------------
> > > > | avg score | kvm_read/write_guest | vfio_dma_rw |
> > > > |----------------------------------------------------|
> > > > | Glmark2 | 1284 | 1296 |
> > > > |----------------------------------------------------|
> > > > | Lightsmark | 61.24 | 61.27 |
> > > > |----------------------------------------------------|
> > > > | OpenArena | 140.9 | 137.4 |
> > > > |----------------------------------------------------|
> > > > | Heaven | 671 | 670 |
> > > > ------------------------------------------------------
> > > > No obvious performance downgrade found.
> > > >
> > > > Cc: Kevin Tian <[email protected]>
> > > > Signed-off-by: Yan Zhao <[email protected]>
> > > > ---
> > > > drivers/gpu/drm/i915/gvt/kvmgt.c | 26 +++++++-------------------
> > > > 1 file changed, 7 insertions(+), 19 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
> > > > index bd79a9718cc7..17edc9a7ff05 100644
> > > > --- a/drivers/gpu/drm/i915/gvt/kvmgt.c
> > > > +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
> > > > @@ -1966,31 +1966,19 @@ static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
> > > > void *buf, unsigned long len, bool write)
> > > > {
> > > > struct kvmgt_guest_info *info;
> > > > - struct kvm *kvm;
> > > > - int idx, ret;
> > > > - bool kthread = current->mm == NULL;
> > > > + int ret;
> > > > + struct intel_vgpu *vgpu;
> > > > + struct device *dev;
> > > >
> > > > if (!handle_valid(handle))
> > > > return -ESRCH;
> > > >
> > > > info = (struct kvmgt_guest_info *)handle;
> > > > - kvm = info->kvm;
> > > > -
> > > > - if (kthread) {
> > > > - if (!mmget_not_zero(kvm->mm))
> > > > - return -EFAULT;
> > > > - use_mm(kvm->mm);
> > > > - }
> > > > -
> > > > - idx = srcu_read_lock(&kvm->srcu);
> > > > - ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
> > > > - kvm_read_guest(kvm, gpa, buf, len);
> > > > - srcu_read_unlock(&kvm->srcu, idx);
> > > > + vgpu = info->vgpu;
> > > > + dev = mdev_dev(vgpu->vdev.mdev);
> > > >
> > > > - if (kthread) {
> > > > - unuse_mm(kvm->mm);
> > > > - mmput(kvm->mm);
> > > > - }
> > > > + ret = write ? vfio_dma_rw(dev, gpa, buf, len, true) :
> > > > + vfio_dma_rw(dev, gpa, buf, len, false);
> > >
> > > As Paolo suggested previously, this can be simplified:
> > >
> > > ret = vfio_dma_rw(dev, gpa, buf, len, write);
> > >
> > > >
> > > > return ret;
> > >
> > > Or even more simple, remove the ret variable:
> > >
> > > return vfio_dma_rw(dev, gpa, buf, len, write);
> > >
> > oh, it seems that I missed Paolo's mail. will change it. thank you!
> >
> > Thanks
> > Yan
> > >
> > > > }
> > >
> >
>

2020-01-20 20:03:18

by Alex Williamson

Subject: Re: [PATCH v2 2/2] drm/i915/gvt: substitute kvm_read/write_guest with vfio_dma_rw

On Sun, 19 Jan 2020 05:06:37 -0500
Yan Zhao <[email protected]> wrote:

> On Thu, Jan 16, 2020 at 11:37:29PM +0800, Alex Williamson wrote:
> > On Thu, 16 Jan 2020 00:49:41 -0500
> > Yan Zhao <[email protected]> wrote:
> >
> > > On Thu, Jan 16, 2020 at 04:06:51AM +0800, Alex Williamson wrote:
> > > > On Tue, 14 Jan 2020 22:54:55 -0500
> > > > Yan Zhao <[email protected]> wrote:
> > > >
> > > > > As a device model, it is better to read/write guest memory using the vfio
> > > > > interface, so that vfio is able to maintain dirty info of device IOVAs.
> > > > >
> > > > > Compared to the kvm interfaces kvm_read/write_guest(), vfio_dma_rw() has
> > > > > ~600 more cycles of overhead on average.
> > > > >
> > > > > -------------------------------------
> > > > > | interface | avg cpu cycles |
> > > > > |-----------------------------------|
> > > > > | kvm_write_guest | 1554 |
> > > > > | ----------------------------------|
> > > > > | kvm_read_guest | 707 |
> > > > > |-----------------------------------|
> > > > > | vfio_dma_rw(w) | 2274 |
> > > > > |-----------------------------------|
> > > > > | vfio_dma_rw(r) | 1378 |
> > > > > -------------------------------------
> > > >
> > > > In v1 you had:
> > > >
> > > > -------------------------------------
> > > > | interface | avg cpu cycles |
> > > > |-----------------------------------|
> > > > | kvm_write_guest | 1546 |
> > > > | ----------------------------------|
> > > > | kvm_read_guest | 686 |
> > > > |-----------------------------------|
> > > > | vfio_iova_rw(w) | 2233 |
> > > > |-----------------------------------|
> > > > | vfio_iova_rw(r) | 1262 |
> > > > -------------------------------------
> > > >
> > > > So the kvm numbers remained within +0.5-3% while the vfio numbers are
> > > > now +1.8-9.2%. I would have expected the algorithm change to at least
> > > > not be worse for small accesses and be better for accesses crossing
> > > > page boundaries. Do you know what happened?
> > > >
> > > I only tested the 4 interfaces in GVT's environment, where most of the
> > > guest memory accesses are less than one page.
> > > And the different fluctuations should be caused by the locks.
> > > vfio_dma_rw contends for locks with other vfio accesses, which are assumed to
> > > be abundant in the case of GVT.
> >
> > Hmm, so maybe it's time to convert vfio_iommu.lock from a mutex to a
> > rwsem? Thanks,
> >
>
> hi Alex
> > I tested your rwsem patches at (https://lkml.org/lkml/2020/1/16/1869).
> > They work without any runtime error on my side. :)
> > However, I found out that the previous fluctuation may be because I didn't
> > take read/write counts into account.
> > For example, though the two tests have different avg read/write cycles,
> > their average cycles are almost the same.
> ______________________________________________________________________
> | | avg read | | avg write | | |
> | | cycles | read cnt | cycles | write cnt | avg cycles |
> |----------------------------------------------------------------------|
> | test 1 | 1339 | 29,587,120 | 2258 | 17,098,364 | 1676 |
> | test 2 | 1340 | 28,454,262 | 2238 | 16,501,788 | 1670 |
> ----------------------------------------------------------------------
>
> > After measuring the exact read/write cnt and cycles of a specific workload,
> > I got the findings below:
>
> (1) with single VM running glmark2 inside.
> > glmark2: 40M+ read+write cnt, among which 63% is read.
> > Among reads, 48% are of PAGE_SIZE; the rest are less than a page.
> > Among writes, 100% are less than a page.
>
> __________________________________________________
> | cycles | read | write | avg | inc |
> |--------------------------------------------------|
> | kvm_read/write_page | 694 | 1506 | 993 | / |
> |--------------------------------------------------|
> | vfio_dma_rw(mutex) | 1340 | 2248 | 1673 | 680 |
> |--------------------------------------------------|
> | vfio_dma_rw(rwsem r) | 1323 | 2198 | 1645 | 653 |
> ---------------------------------------------------
>
> > So vfio_dma_rw generally has 650+ more cycles per read/write.
> > While kvm->srcu costs 160 cycles on average with one VM running, the
> > cycles spent on locks for vfio_dma_rw spread like this:
> ___________________________
> | cycles | avg |
> |---------------------------|
> | iommu->lock | 117 |
> |---------------------------|
> | vfio.group_lock | 108 |
> |---------------------------|
> | group->unbound_lock | 114 |
> |---------------------------|
> | group->device_lock | 115 |
> |---------------------------|
> | group->mutex | 113 |
> ---------------------------
>
> > I measured that a mutex without any contention costs 104 cycles
> > on average (including time for get_cycles(), and measured in the same way
> > as the other locks). So the contention on a single lock in a single-VM
> > environment is light, probably because there's a vgpu lock held in GVT already.
>
> (2) with two VMs each running glmark2 inside.
> The contention increases a little.
>
> ___________________________________________________
> | cycles | read | write | avg | inc |
> |---------------------------------------------------|
> | kvm_read/write_page | 1035 | 1832 | 1325 | / |
> |---------------------------------------------------|
> | vfio_dma_rw(mutex) | 2104 | 2886 | 2390 | 1065 |
> |---------------------------------------------------|
> | vfio_dma_rw(rwsem r) | 1965 | 2778 | 2260 | 935 |
> ---------------------------------------------------
>
>
> -----------------------------------------------
> | avg cycles | one VM | two VMs |
> |-----------------------------------------------|
> | iommu lock (mutex) | 117 | 150 |
> |-----------------------------------|-----------|
> | iommu lock (rwsem r) | 117 | 156 |
> |-----------------------------------|-----------|
> | kvm->srcu | 160 | 213 |
> -----------------------------------------------
>
> > In the kvm case, avg cycles increased by 332 cycles, while kvm->srcu only cost
> > 213 cycles. The remaining 109 cycles may be spent on atomic operations.
> > But I didn't measure those, as the get_cycles() operation itself would influence
> > the final cycles by ~20 cycles.

It seems like we need to extend the vfio external user interface so
that GVT-g can hold the group and container user references across
multiple calls. For instance if we had a
vfio_group_get_external_user_from_dev() (based on
vfio_group_get_external_user()) then i915 could get an opaque
vfio_group pointer which it could use to call vfio_group_dma_rw() which
would leave us with only the iommu rw_sem locking. i915 would release
the reference with vfio_group_put_external_user() when the device is
released. The same could be done with the pin pages interface to
streamline that as well. Thoughts? Thanks,

Alex
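
For illustration, a minimal sketch of what the proposed interface could
look like, composed from the same vfio.c helpers that vfio_dma_rw() in
patch 1 uses; the names, signatures, and error handling here are
assumptions, not merged code:

struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
{
	struct vfio_group *group;
	int ret;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return ERR_PTR(-ENODEV);

	/* hold the container user reference until the caller drops it */
	ret = vfio_group_add_container_user(group);
	if (ret) {
		vfio_group_put(group);
		return ERR_PTR(ret);
	}

	return group;
}

int vfio_group_dma_rw(struct vfio_group *group, dma_addr_t iova,
		      void *data, size_t len, bool write)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	/* no per-call group/container lookup: only the iommu lock remains */
	if (likely(driver && driver->ops->dma_rw))
		return driver->ops->dma_rw(container->iommu_data,
					   iova, data, len, write);
	return -ENOTTY;
}

i915 would resolve the group once when the device is opened and drop it
with vfio_group_put_external_user() when the device is released, as
described above.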

2020-01-21 08:23:35

by Yan Zhao

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] drm/i915/gvt: substitute kvm_read/write_guest with vfio_dma_rw

On Tue, Jan 21, 2020 at 04:01:57AM +0800, Alex Williamson wrote:
> [...]
>
> It seems like we need to extend the vfio external user interface so
> that GVT-g can hold the group and container user references across
> multiple calls. For instance if we had a
> vfio_group_get_external_user_from_dev() (based on
> vfio_group_get_external_user()) then i915 could get an opaque
> vfio_group pointer which it could use to call vfio_group_dma_rw() which
> would leave us with only the iommu rw_sem locking. i915 would release
> the reference with vfio_group_put_external_user() when the device is
> released. The same could be done with the pin pages interface to
> streamline that as well. Thoughts? Thanks,
>
hi Alex,
it works!
now the average vfio_dma_rw cycles can be reduced to 1198.
One thing I want to propose: given that dma->task is always a user
space process, instead of calling get_task_mm(dma->task), can we just use
"mmget_not_zero(dma->task->mm)"? In this way, the avg cycles can
be further reduced to 1051.
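
For illustration, a minimal sketch of the proposed change (the helper
name vfio_dma_get_mm() is hypothetical, and the follow-up messages
examine whether skipping task_lock() like this is safe):

static struct mm_struct *vfio_dma_get_mm(struct vfio_dma *dma)
{
	/* today: get_task_mm(dma->task), which takes task_lock() internally */
	struct mm_struct *mm = dma->task->mm;

	/* proposal: bump mm_users directly, without task_lock() */
	if (!mm || !mmget_not_zero(mm))
		return NULL;

	return mm;
}

The caller would pair this with mmput() exactly as it pairs
get_task_mm() today.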

Thanks
Yan


2020-01-21 16:52:42

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] drm/i915/gvt: substitute kvm_read/write_guest with vfio_dma_rw

On Tue, 21 Jan 2020 03:12:07 -0500
Yan Zhao <[email protected]> wrote:

> [...]
> hi Alex,
> it works!

Hurrah!

> now the average vfio_dma_rw cycles can be reduced to 1198.
> One thing I want to propose: given that dma->task is always a user
> space process, instead of calling get_task_mm(dma->task), can we just use
> "mmget_not_zero(dma->task->mm)"? In this way, the avg cycles can
> be further reduced to 1051.

I'm not an expert there. As noted in the type1 code we hold a
reference to the task because it's not advised to hold a long term
reference to the mm, so do we know we can look at task->mm without
acquiring task_lock()? It's possible this is safe, but it's not
abundantly obvious to me. Please research further and provide
justification if you think it's correct. Thanks,

Alex

2020-01-21 22:21:05

by Yan Zhao

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] drm/i915/gvt: substitute kvm_read/write_guest with vfio_dma_rw

On Wed, Jan 22, 2020 at 12:51:16AM +0800, Alex Williamson wrote:
> [...]
> > now the average vfio_dma_rw cycles can be reduced to 1198.
> > One thing I want to propose: given that dma->task is always a user
> > space process, instead of calling get_task_mm(dma->task), can we just use
> > "mmget_not_zero(dma->task->mm)"? In this way, the avg cycles can
> > be further reduced to 1051.
>
> I'm not an expert there. As noted in the type1 code we hold a
> reference to the task because it's not advised to hold a long term
> reference to the mm, so do we know we can look at task->mm without
> acquiring task_lock()? It's possible this is safe, but it's not
> abundantly obvious to me. Please research further and provide
> justification if you think it's correct. Thanks,
>
In get_task_mm():
struct mm_struct *get_task_mm(struct task_struct *task)
{
	struct mm_struct *mm;

	task_lock(task);
	mm = task->mm;
	if (mm) {
		if (task->flags & PF_KTHREAD)
			mm = NULL;
		else
			mmget(mm);
	}
	task_unlock(task);
	return mm;
}
The task lock is held only during the call, so its purpose is to
ensure task->flags and task->mm are not changed or gone before mmget(mm)
or the function returns.
So, if we know for sure the task never has the flag PF_KTHREAD,
then we only need to ensure mm is not gone before mmget(mm) is done.

static inline void mmget(struct mm_struct *mm)
{
	atomic_inc(&mm->mm_users);
}

static inline bool mmget_not_zero(struct mm_struct *mm)
{
	return atomic_inc_not_zero(&mm->mm_users);
}

The atomic_inc_not_zero() in mmget_not_zero() ensures mm is not gone
before its reference count is incremented.

So, I think the only thing we need to make sure of is that dma->task is not a
kernel thread.
Do you think I can make this assumption?

Thanks
Yan


2020-01-22 03:19:19

by Yan Zhao

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] drm/i915/gvt: substitute kvm_read/write_guest with vfio_dma_rw

On Wed, Jan 22, 2020 at 06:10:38AM +0800, Yan Zhao wrote:
> [...]
> So, I think the only thing we need to make sure of is that dma->task is not a
> kernel thread.
> Do you think I can make this assumption?
>
hi Alex
Maybe I can still test PF_KTHREAD without holding task_lock
(task->alloc_lock), as it is only used to protect
"->fs, ->files, ->mm, ->group_info, ->comm, keyring
subscriptions and synchronises with wait4(). Also used in procfs. Also
pins the final release of task.io_context. Also protects ->cpuset and
->cgroup.subsys[]. And ->vfork_done."

I checked elsewhere in the kernel, e.g.
try_to_wake_up
  |->select_task_rq
      |->is_per_cpu_kthread
          |->if (!(p->flags & PF_KTHREAD))
task->alloc_lock is not held there.

So, I would split get_task_mm(dma->task) into two steps:
(1) check dma->task->flags & PF_KTHREAD, and (2) mmget_not_zero(mm).
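
A minimal sketch of that two-step variant (the helper name is
hypothetical, and the next message revisits whether skipping
task_lock() this way is actually safe):

static struct mm_struct *vfio_dma_get_mm(struct vfio_dma *dma)
{
	/* (1) unlocked PF_KTHREAD test, as try_to_wake_up() does */
	if (dma->task->flags & PF_KTHREAD)
		return NULL;

	/* (2) take an mm_users reference without task_lock() */
	if (!dma->task->mm || !mmget_not_zero(dma->task->mm))
		return NULL;

	return dma->task->mm;
}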

I'll do more tests and send out new patches after Chinese new year.

Thanks
Yan




2020-01-23 10:13:52

by Yan Zhao

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] drm/i915/gvt: substitute kvm_read/write_guest with vfio_dma_rw

On Wed, Jan 22, 2020 at 11:07:58AM +0800, Yan Zhao wrote:
> [...]
> So, I would split get_task_mm(dma->task) into two steps:
> (1) check dma->task->flags & PF_KTHREAD, and (2) mmget_not_zero(mm).
>
> I'll do more tests and send out new patches after Chinese new year.
>
> Thanks
> Yan
>
hi Alex
On second thought, I found out that the task->alloc_lock cannot be
dropped, as there is a chance that dma->task->mm is set to NULL between
checking "dma->task->mm != NULL" and "mmget_not_zero(dma->task->mm)".

The chance of hitting this condition can be increased by adding an msleep()
in between the two lines of code to mimic a possible task reschedule. This
proved that dma->task->mm could be set to NULL in exit_mm() by killing QEMU.
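
For illustration only, a sketch of that experiment (the helper name,
msleep() duration, and error codes are assumptions I added, not the
actual test code):

static int vfio_dma_get_mm_racy(struct vfio_dma *dma, struct mm_struct **mmp)
{
	if (dma->task->mm == NULL)	/* step 1: unlocked NULL check */
		return -EFAULT;

	msleep(100);	/* widen the race window to mimic a reschedule */

	/*
	 * If QEMU is killed here, exit_mm() sets dma->task->mm = NULL,
	 * so the re-read below dereferences a NULL pointer.
	 */
	if (!mmget_not_zero(dma->task->mm))	/* step 2 */
		return -EFAULT;

	*mmp = dma->task->mm;
	return 0;
}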

So I have to keep the call to get_task_mm(). :)

Thanks
Yan