2021-01-25 09:32:19

by Zhou Wang

Subject: [RFC PATCH v2] uacce: Add uacce_ctrl misc device

The uacce driver lets devices be used safely from user space via SVA.
However, IO page faults may happen during DMA operations, which hurts
DMA performance severely.

For accelerators that need stable performance, it is better to avoid
IO page faults entirely. Current memory-related APIs, such as mlock,
cannot achieve this. Ideally there would be a system call, something
like "mpin", to pin the relevant pages. However, no such API exists in
the kernel today; drivers that need to pin pages implement their own
ioctl interfaces, e.g. v4l2, gpu, infiniband, media, vfio, etc.

This patch takes the same approach by introducing a pin user page
interface in the uacce driver. It adds a new char device named
/dev/uacce_ctrl to manage pinned pages. User space pins and unpins
pages via ioctls on an open file of /dev/uacce_ctrl, and all pages
pinned under one file are unpinned when that file is released.
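
A rough user-space usage sketch (buffer size, alignment and error
handling here are illustrative only, and the uapi header path may
differ depending on how headers are installed):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <misc/uacce/uacce.h>	/* UACCE_CMD_PIN/UNPIN, struct uacce_pin_address */

int main(void)
{
	struct uacce_pin_address pin;
	size_t size = 2 * 1024 * 1024;	/* example DMA buffer size */
	void *buf;
	int fd;

	fd = open("/dev/uacce_ctrl", O_RDWR);
	if (fd < 0) {
		perror("open /dev/uacce_ctrl");
		return 1;
	}

	if (posix_memalign(&buf, 4096, size)) {
		close(fd);
		return 1;
	}

	pin.addr = (unsigned long)buf;
	pin.size = size;

	/* pin the buffer: pages stay resident until UACCE_CMD_UNPIN or close(fd) */
	if (ioctl(fd, UACCE_CMD_PIN, &pin))
		perror("UACCE_CMD_PIN");

	/* ... submit DMA to this buffer through the accelerator's uacce queue ... */

	if (ioctl(fd, UACCE_CMD_UNPIN, &pin))
		perror("UACCE_CMD_UNPIN");

	close(fd);	/* releasing the fd would also unpin anything left pinned */
	free(buf);
	return 0;
}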

Signed-off-by: Zhou Wang <[email protected]>
Signed-off-by: Sihang Chen <[email protected]>
Suggested-by: Barry Song <[email protected]>
---
Changes v1 -> v2:
- Some tiny fixes
- Follow Greg's suggestion to get mm-list and iommu-list involved.

v1: https://lwn.net/Articles/843432/
---
drivers/misc/uacce/uacce.c | 172 +++++++++++++++++++++++++++++++++++++++-
include/uapi/misc/uacce/uacce.h | 16 ++++
2 files changed, 187 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
index d07af4e..69d3ba8 100644
--- a/drivers/misc/uacce/uacce.c
+++ b/drivers/misc/uacce/uacce.c
@@ -2,16 +2,28 @@
#include <linux/compat.h>
#include <linux/dma-mapping.h>
#include <linux/iommu.h>
+#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/uacce.h>
+#include <linux/vmalloc.h>

static struct class *uacce_class;
static dev_t uacce_devt;
static DEFINE_MUTEX(uacce_mutex);
static DEFINE_XARRAY_ALLOC(uacce_xa);

+struct uacce_pin_container {
+ struct xarray array;
+};
+
+struct pin_pages {
+ unsigned long first;
+ unsigned long nr_pages;
+ struct page **pages;
+};
+
static int uacce_start_queue(struct uacce_queue *q)
{
int ret = 0;
@@ -497,6 +509,151 @@ void uacce_remove(struct uacce_device *uacce)
}
EXPORT_SYMBOL_GPL(uacce_remove);

+static int uacce_ctrl_open(struct inode *inode, struct file *file)
+{
+ struct uacce_pin_container *p;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ file->private_data = p;
+
+ xa_init(&p->array);
+
+ return 0;
+}
+
+static int uacce_ctrl_release(struct inode *inode, struct file *file)
+{
+ struct uacce_pin_container *priv = file->private_data;
+ struct pin_pages *p;
+ unsigned long idx;
+
+ xa_for_each(&priv->array, idx, p) {
+ unpin_user_pages(p->pages, p->nr_pages);
+ xa_erase(&priv->array, p->first);
+ vfree(p->pages);
+ kfree(p);
+ }
+
+ xa_destroy(&priv->array);
+ kfree(priv);
+
+ return 0;
+}
+
+static int uacce_pin_page(struct uacce_pin_container *priv,
+ struct uacce_pin_address *addr)
+{
+ unsigned int flags = FOLL_FORCE | FOLL_WRITE;
+ unsigned long first, last, nr_pages;
+ struct page **pages;
+ struct pin_pages *p;
+ int ret;
+
+ first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
+ last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
+ nr_pages = last - first + 1;
+
+ pages = vmalloc(nr_pages * sizeof(struct page *));
+ if (!pages)
+ return -ENOMEM;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
+ flags | FOLL_LONGTERM, pages);
+ if (ret != nr_pages) {
+ pr_err("uacce: Failed to pin page\n");
+ goto free_p;
+ }
+ p->first = first;
+ p->nr_pages = nr_pages;
+ p->pages = pages;
+
+ ret = xa_err(xa_store(&priv->array, p->first, p, GFP_KERNEL));
+ if (ret)
+ goto unpin_pages;
+
+ return 0;
+
+unpin_pages:
+ unpin_user_pages(pages, nr_pages);
+free_p:
+ kfree(p);
+free:
+ vfree(pages);
+ return ret;
+}
+
+static int uacce_unpin_page(struct uacce_pin_container *priv,
+ struct uacce_pin_address *addr)
+{
+ unsigned long first, last, nr_pages;
+ struct pin_pages *p;
+
+ first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
+ last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
+ nr_pages = last - first + 1;
+
+ /* find pin_pages */
+ p = xa_load(&priv->array, first);
+ if (!p)
+ return -ENODEV;
+
+ if (p->nr_pages != nr_pages)
+ return -EINVAL;
+
+ /* unpin */
+ unpin_user_pages(p->pages, p->nr_pages);
+
+ /* release resource */
+ xa_erase(&priv->array, first);
+ vfree(p->pages);
+ kfree(p);
+
+ return 0;
+}
+
+static long uacce_ctrl_unl_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ struct uacce_pin_container *p = filep->private_data;
+ struct uacce_pin_address addr;
+
+ if (copy_from_user(&addr, (void __user *)arg,
+ sizeof(struct uacce_pin_address)))
+ return -EFAULT;
+
+ switch (cmd) {
+ case UACCE_CMD_PIN:
+ return uacce_pin_page(p, &addr);
+
+ case UACCE_CMD_UNPIN:
+ return uacce_unpin_page(p, &addr);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+static const struct file_operations uacce_ctrl_fops = {
+ .owner = THIS_MODULE,
+ .open = uacce_ctrl_open,
+ .release = uacce_ctrl_release,
+ .unlocked_ioctl = uacce_ctrl_unl_ioctl,
+};
+
+static struct miscdevice uacce_ctrl_miscdev = {
+ .name = "uacce_ctrl",
+ .minor = MISC_DYNAMIC_MINOR,
+ .fops = &uacce_ctrl_fops,
+};
+
static int __init uacce_init(void)
{
int ret;
@@ -507,13 +664,26 @@ static int __init uacce_init(void)

ret = alloc_chrdev_region(&uacce_devt, 0, MINORMASK, UACCE_NAME);
if (ret)
- class_destroy(uacce_class);
+ goto destroy_class;
+
+ ret = misc_register(&uacce_ctrl_miscdev);
+ if (ret) {
+ pr_err("uacce: ctrl dev registration failed\n");
+ goto unregister_cdev;
+ }

+ return 0;
+
+unregister_cdev:
+ unregister_chrdev_region(uacce_devt, MINORMASK);
+destroy_class:
+ class_destroy(uacce_class);
return ret;
}

static __exit void uacce_exit(void)
{
+ misc_deregister(&uacce_ctrl_miscdev);
unregister_chrdev_region(uacce_devt, MINORMASK);
class_destroy(uacce_class);
}
diff --git a/include/uapi/misc/uacce/uacce.h b/include/uapi/misc/uacce/uacce.h
index cc71856..0b10551 100644
--- a/include/uapi/misc/uacce/uacce.h
+++ b/include/uapi/misc/uacce/uacce.h
@@ -35,4 +35,20 @@ enum uacce_qfrt {
UACCE_QFRT_DUS = 1,
};

+/**
+ * struct uacce_pin_address - Expected pin user space address and size
+ * @addr: Address to pin
+ * @size: Size of pin address
+ */
+struct uacce_pin_address {
+ __u64 addr;
+ __u64 size;
+};
+
+/* UACCE_CMD_PIN: Pin a range of memory */
+#define UACCE_CMD_PIN _IOW('W', 2, struct uacce_pin_address)
+
+/* UACCE_CMD_UNPIN: Unpin a range of memory */
+#define UACCE_CMD_UNPIN _IOW('W', 3, struct uacce_pin_address)
+
#endif
--
2.8.1


2021-01-25 12:59:21

by Zhou Wang

Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device

On 2021/1/25 17:28, Greg Kroah-Hartman wrote:
> On Mon, Jan 25, 2021 at 04:34:56PM +0800, Zhou Wang wrote:
>> +static int uacce_pin_page(struct uacce_pin_container *priv,
>> + struct uacce_pin_address *addr)
>> +{
>> + unsigned int flags = FOLL_FORCE | FOLL_WRITE;
>> + unsigned long first, last, nr_pages;
>> + struct page **pages;
>> + struct pin_pages *p;
>> + int ret;
>> +
>> + first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
>> + last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
>> + nr_pages = last - first + 1;
>> +
>> + pages = vmalloc(nr_pages * sizeof(struct page *));
>> + if (!pages)
>> + return -ENOMEM;
>> +
>> + p = kzalloc(sizeof(*p), GFP_KERNEL);
>> + if (!p) {
>> + ret = -ENOMEM;
>> + goto free;
>> + }
>> +
>> + ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
>> + flags | FOLL_LONGTERM, pages);
>> + if (ret != nr_pages) {
>> + pr_err("uacce: Failed to pin page\n");
>> + goto free_p;
>> + }
>> + p->first = first;
>> + p->nr_pages = nr_pages;
>> + p->pages = pages;
>> +
>> + ret = xa_err(xa_store(&priv->array, p->first, p, GFP_KERNEL));
>> + if (ret)
>> + goto unpin_pages;
>> +
>> + return 0;
>> +
>> +unpin_pages:
>> + unpin_user_pages(pages, nr_pages);
>> +free_p:
>> + kfree(p);
>> +free:
>> + vfree(pages);
>> + return ret;
>> +}
>
> No error checking on the memory locations or size of memory to be
> 'pinned', what could ever go wrong?

These problems have been considered, if I understand you correctly.

I checked pin_user_pages_fast: it validates the memory location with
access_ok. For the size of memory to pin, we had added a limitation
(e.g. capping the pinned size at 1GB), but it was removed in the
posted patch. The reason is that the permission of /dev/uacce_ctrl is
600 root:root, so /dev/uacce_ctrl has to be added to trusted groups by
root before it can be used.
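
For illustration only (the value is arbitrary and this check is not in
the posted patch), such a cap would be a couple of lines at the top of
uacce_pin_page(), using SZ_1G from linux/sizes.h:

	/* hypothetical per-request size cap */
	if (!addr->size || addr->size > SZ_1G)
		return -EINVAL;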

>
> Note, this opens a huge hole in the kernel that needs to be documented
> really really really well somewhere, as it can cause very strange
> results if you do not know exactly what you are doing, which is why I am
> going to require that the mm developers sign off on this type of thing.
>
> And to give more context, I really don't think this is needed, but if it

Maybe I did not explain the problem clearly. Let me try again.

From a functionality point of view, pinning pages is not needed at
all. From a performance point of view, however, we need to keep the
DMA physical pages fixed, because the latency of an IO page fault is
currently relatively high; an ARM SMMUv3 IO page fault, for example,
takes at least 20us. When a DMA transaction triggers an IO page fault,
performance suffers, and over the long term DMA performance is not
stable.

Here we use pinned pages to build a memory pool in user space; users'
libs/apps can allocate memory from this pinned-page-backed pool to
avoid IO page faults.

Best,
Zhou

> is, it should be a new syscall, not buried in an ioctl for a random
> misc driver, but the author seems to want it tied to this specific
> driver...
>
> thanks,
>
> greg k-h
>
> .
>

2021-01-25 15:52:23

by Jason Gunthorpe

Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device

On Mon, Jan 25, 2021 at 04:34:56PM +0800, Zhou Wang wrote:

> +static int uacce_pin_page(struct uacce_pin_container *priv,
> + struct uacce_pin_address *addr)
> +{
> + unsigned int flags = FOLL_FORCE | FOLL_WRITE;
> + unsigned long first, last, nr_pages;
> + struct page **pages;
> + struct pin_pages *p;
> + int ret;
> +
> + first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
> + last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
> + nr_pages = last - first + 1;
> +
> + pages = vmalloc(nr_pages * sizeof(struct page *));
> + if (!pages)
> + return -ENOMEM;
> +
> + p = kzalloc(sizeof(*p), GFP_KERNEL);
> + if (!p) {
> + ret = -ENOMEM;
> + goto free;
> + }
> +
> + ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
> + flags | FOLL_LONGTERM, pages);

This needs to copy the RLIMIT_MEMLOCK and can_do_mlock() stuff from
other places, like ib_umem_get
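
Roughly this pattern (a simplified sketch based on ib_umem_get; mm
refcounting and the unpin/error paths are omitted):

	unsigned long lock_limit, new_pinned;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	new_pinned = atomic64_add_return(nr_pages, &current->mm->pinned_vm);
	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
		atomic64_sub(nr_pages, &current->mm->pinned_vm);
		return -ENOMEM;
	}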

> + ret = xa_err(xa_store(&priv->array, p->first, p, GFP_KERNEL));

And this is really weird, I don't think it makes sense to make handles
for DMA based on the starting VA.

> +static int uacce_unpin_page(struct uacce_pin_container *priv,
> + struct uacce_pin_address *addr)
> +{
> + unsigned long first, last, nr_pages;
> + struct pin_pages *p;
> +
> + first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
> + last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
> + nr_pages = last - first + 1;
> +
> + /* find pin_pages */
> + p = xa_load(&priv->array, first);
> + if (!p)
> + return -ENODEV;
> +
> + if (p->nr_pages != nr_pages)
> + return -EINVAL;
> +
> + /* unpin */
> + unpin_user_pages(p->pages, p->nr_pages);

And unpinning without guaranteeing there is no ongoing DMA is really
weird

Are you abusing this in conjunction with a SVA scheme just to prevent
page motion? Why wasn't mlock good enough?

Jason

Subject: RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device



> -----Original Message-----
> From: Jason Gunthorpe [mailto:[email protected]]
> Sent: Tuesday, January 26, 2021 4:47 AM
> To: Wangzhou (B) <[email protected]>
> Cc: Greg Kroah-Hartman <[email protected]>; Arnd Bergmann
> <[email protected]>; Zhangfei Gao <[email protected]>;
> [email protected]; [email protected];
> [email protected]; [email protected]; Song Bao Hua (Barry Song)
> <[email protected]>; Liguozhu (Kenneth) <[email protected]>;
> chensihang (A) <[email protected]>
> Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
>
> On Mon, Jan 25, 2021 at 04:34:56PM +0800, Zhou Wang wrote:
>
> > +static int uacce_pin_page(struct uacce_pin_container *priv,
> > + struct uacce_pin_address *addr)
> > +{
> > + unsigned int flags = FOLL_FORCE | FOLL_WRITE;
> > + unsigned long first, last, nr_pages;
> > + struct page **pages;
> > + struct pin_pages *p;
> > + int ret;
> > +
> > + first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
> > + last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
> > + nr_pages = last - first + 1;
> > +
> > + pages = vmalloc(nr_pages * sizeof(struct page *));
> > + if (!pages)
> > + return -ENOMEM;
> > +
> > + p = kzalloc(sizeof(*p), GFP_KERNEL);
> > + if (!p) {
> > + ret = -ENOMEM;
> > + goto free;
> > + }
> > +
> > + ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
> > + flags | FOLL_LONGTERM, pages);
>
> This needs to copy the RLIMIT_MEMLOCK and can_do_mlock() stuff from
> other places, like ib_umem_get
>
> > + ret = xa_err(xa_store(&priv->array, p->first, p, GFP_KERNEL));
>
> And this is really weird, I don't think it makes sense to make handles
> for DMA based on the starting VA.
>
> > +static int uacce_unpin_page(struct uacce_pin_container *priv,
> > + struct uacce_pin_address *addr)
> > +{
> > + unsigned long first, last, nr_pages;
> > + struct pin_pages *p;
> > +
> > + first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
> > + last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
> > + nr_pages = last - first + 1;
> > +
> > + /* find pin_pages */
> > + p = xa_load(&priv->array, first);
> > + if (!p)
> > + return -ENODEV;
> > +
> > + if (p->nr_pages != nr_pages)
> > + return -EINVAL;
> > +
> > + /* unpin */
> > + unpin_user_pages(p->pages, p->nr_pages);
>
> And unpinning without guaranteeing there is no ongoing DMA is really
> weird

In the SVA case, the kernel has no idea whether accelerators are
accessing the memory, so I would assume SVA has a method to keep the
pages from being migrated or released while in use. Otherwise, SVA
would crash easily on a system under high memory pressure.

Anyway, this is a problem worth investigating further.

>
> Are you abusing this in conjunction with a SVA scheme just to prevent
> page motion? Why wasn't mlock good enough?

Page migration won't cause any malfunction in the SVA case, as the IO
page fault will fetch a valid page again. It is only a performance
issue: an IO page fault has a much larger latency than a normal page
fault, around 3-80x slower than a CPU page fault [1].

mlock, while certainly be able to prevent swapping out, it won't
be able to stop page moving due to:
* memory compaction in alloc_pages()
* making huge pages
* numa balance
* memory compaction in CMA
etc.

[1] https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7482091&tag=1
>
> Jason

Thanks
Barry

2021-01-25 23:21:24

by Jason Gunthorpe

Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device

On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song) wrote:
> mlock, while certainly be able to prevent swapping out, it won't
> be able to stop page moving due to:
> * memory compaction in alloc_pages()
> * making huge pages
> * numa balance
> * memory compaction in CMA

Enabling those things is a major reason to have SVA device in the
first place, providing a SW API to turn it all off seems like the
wrong direction.

If the device doesn't want to use SVA then don't use it, use normal
DMA pinning like everything else.

Jason

Subject: RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device



> -----Original Message-----
> From: [email protected] [mailto:[email protected]] On Behalf Of
> Jason Gunthorpe
> Sent: Tuesday, January 26, 2021 12:16 PM
> To: Song Bao Hua (Barry Song) <[email protected]>
> Cc: Wangzhou (B) <[email protected]>; Greg Kroah-Hartman
> <[email protected]>; Arnd Bergmann <[email protected]>; Zhangfei Gao
> <[email protected]>; [email protected];
> [email protected]; [email protected];
> [email protected]; Liguozhu (Kenneth) <[email protected]>; chensihang
> (A) <[email protected]>
> Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
>
> On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song) wrote:
> > mlock, while certainly be able to prevent swapping out, it won't
> > be able to stop page moving due to:
> > * memory compaction in alloc_pages()
> > * making huge pages
> > * numa balance
> > * memory compaction in CMA
>
> Enabling those things is a major reason to have SVA device in the
> first place, providing a SW API to turn it all off seems like the
> wrong direction.

I wouldn't say this is a major reason to have SVA. If we read the
history of SVA and papers, people would think easy programming due
to data struct sharing between cpu and device, and process space
isolation in device would be the major reasons for SVA. SVA also
declares it supports zero-copy while zero-copy doesn't necessarily
depend on SVA.

Page migration and I/O page fault overhead, on the other hand, are
probably the major problems blocking SVA from becoming a
high-performance and more popular solution.

>
> If the device doesn't want to use SVA then don't use it, use normal
> DMA pinning like everything else.
>

If we disable SVA, we lose SVA's benefits of address sharing and
process space isolation.

> Jason

Thanks
Barry

Subject: RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device



> -----Original Message-----
> From: Jason Gunthorpe [mailto:[email protected]]
> Sent: Tuesday, January 26, 2021 2:13 PM
> To: Song Bao Hua (Barry Song) <[email protected]>
> Cc: Wangzhou (B) <[email protected]>; Greg Kroah-Hartman
> <[email protected]>; Arnd Bergmann <[email protected]>; Zhangfei Gao
> <[email protected]>; [email protected];
> [email protected]; [email protected];
> [email protected]; Liguozhu (Kenneth) <[email protected]>; chensihang
> (A) <[email protected]>
> Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
>
> On Mon, Jan 25, 2021 at 11:35:22PM +0000, Song Bao Hua (Barry Song) wrote:
>
> > > On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song) wrote:
> > > > mlock, while certainly be able to prevent swapping out, it won't
> > > > be able to stop page moving due to:
> > > > * memory compaction in alloc_pages()
> > > > * making huge pages
> > > > * numa balance
> > > > * memory compaction in CMA
> > >
> > > Enabling those things is a major reason to have SVA device in the
> > > first place, providing a SW API to turn it all off seems like the
> > > wrong direction.
> >
> > I wouldn't say this is a major reason to have SVA. If we read the
> > history of SVA and papers, people would think easy programming due
> > to data struct sharing between cpu and device, and process space
> > isolation in device would be the major reasons for SVA. SVA also
> > declares it supports zero-copy while zero-copy doesn't necessarily
> > depend on SVA.
>
> Once you have to explicitly make system calls to declare memory under
> IO, you loose all of that.
>
> Since you've asked the app to be explicit about the DMAs it intends to
> do, there is not really much reason to use SVA for those DMAs anymore.

Let's see a non-SVA case. We are not using SVA, we can have
a memory pool by hugetlb or pin, and app can allocate memory
from this pool, and get stable I/O performance on the memory
from the pool. But device has its separate page table which
is not bound with this process, thus lacking the protection
of process space isolation. Plus, CPU and device are using
different address.

And then we move to SVA case, we can still have a memory pool
by hugetlb or pin, and app can allocate memory from this pool
since this pool is mapped to the address space of the process,
and we are able to get stable I/O performance since it is always
there. But in this case, device is using the page table of
process with the full permission control.
And they are using same address and can possibly enjoy the easy
programming if HW supports.

SVA is not doom to work with IO page fault only. If we have SVA+pin,
we would get both sharing address and stable I/O latency.

>
> Jason

Thanks
Barry

2021-01-26 05:14:25

by Greg Kroah-Hartman

Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device

On Mon, Jan 25, 2021 at 04:34:56PM +0800, Zhou Wang wrote:
> +static int uacce_pin_page(struct uacce_pin_container *priv,
> + struct uacce_pin_address *addr)
> +{
> + unsigned int flags = FOLL_FORCE | FOLL_WRITE;
> + unsigned long first, last, nr_pages;
> + struct page **pages;
> + struct pin_pages *p;
> + int ret;
> +
> + first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
> + last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
> + nr_pages = last - first + 1;
> +
> + pages = vmalloc(nr_pages * sizeof(struct page *));
> + if (!pages)
> + return -ENOMEM;
> +
> + p = kzalloc(sizeof(*p), GFP_KERNEL);
> + if (!p) {
> + ret = -ENOMEM;
> + goto free;
> + }
> +
> + ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
> + flags | FOLL_LONGTERM, pages);
> + if (ret != nr_pages) {
> + pr_err("uacce: Failed to pin page\n");
> + goto free_p;
> + }
> + p->first = first;
> + p->nr_pages = nr_pages;
> + p->pages = pages;
> +
> + ret = xa_err(xa_store(&priv->array, p->first, p, GFP_KERNEL));
> + if (ret)
> + goto unpin_pages;
> +
> + return 0;
> +
> +unpin_pages:
> + unpin_user_pages(pages, nr_pages);
> +free_p:
> + kfree(p);
> +free:
> + vfree(pages);
> + return ret;
> +}

No error checking on the memory locations or size of memory to be
'pinned', what could ever go wrong?

Note, this opens a huge hole in the kernel that needs to be documented
really really really well somewhere, as it can cause very strange
results if you do not know exactly what you are doing, which is why I am
going to require that the mm developers sign off on this type of thing.

And to give more context, I really don't think this is needed, but if it
is, it should be a new syscall, not buried in an ioctl for a random
misc driver, but the author seems to want it tied to this specific
driver...

thanks,

greg k-h

2021-01-26 10:30:16

by Jason Gunthorpe

Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device

On Mon, Jan 25, 2021 at 11:35:22PM +0000, Song Bao Hua (Barry Song) wrote:

> > On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song) wrote:
> > > mlock, while certainly be able to prevent swapping out, it won't
> > > be able to stop page moving due to:
> > > * memory compaction in alloc_pages()
> > > * making huge pages
> > > * numa balance
> > > * memory compaction in CMA
> >
> > Enabling those things is a major reason to have SVA device in the
> > first place, providing a SW API to turn it all off seems like the
> > wrong direction.
>
> I wouldn't say this is a major reason to have SVA. If we read the
> history of SVA and papers, people would think easy programming due
> to data struct sharing between cpu and device, and process space
> isolation in device would be the major reasons for SVA. SVA also
> declares it supports zero-copy while zero-copy doesn't necessarily
> depend on SVA.

Once you have to explicitly make system calls to declare memory under
IO, you loose all of that.

Since you've asked the app to be explicit about the DMAs it intends to
do, there is not really much reason to use SVA for those DMAs anymore.

Jason

2021-01-27 01:01:55

by Jason Gunthorpe

Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device

On Tue, Jan 26, 2021 at 01:26:45AM +0000, Song Bao Hua (Barry Song) wrote:
> > On Mon, Jan 25, 2021 at 11:35:22PM +0000, Song Bao Hua (Barry Song) wrote:
> >
> > > > On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song) wrote:
> > > > > mlock, while certainly be able to prevent swapping out, it won't
> > > > > be able to stop page moving due to:
> > > > > * memory compaction in alloc_pages()
> > > > > * making huge pages
> > > > > * numa balance
> > > > > * memory compaction in CMA
> > > >
> > > > Enabling those things is a major reason to have SVA device in the
> > > > first place, providing a SW API to turn it all off seems like the
> > > > wrong direction.
> > >
> > > I wouldn't say this is a major reason to have SVA. If we read the
> > > history of SVA and papers, people would think easy programming due
> > > to data struct sharing between cpu and device, and process space
> > > isolation in device would be the major reasons for SVA. SVA also
> > > declares it supports zero-copy while zero-copy doesn't necessarily
> > > depend on SVA.
> >
> > Once you have to explicitly make system calls to declare memory under
> > IO, you loose all of that.
> >
> > Since you've asked the app to be explicit about the DMAs it intends to
> > do, there is not really much reason to use SVA for those DMAs anymore.
>
> Let's see a non-SVA case. We are not using SVA, we can have
> a memory pool by hugetlb or pin, and app can allocate memory
> from this pool, and get stable I/O performance on the memory
> from the pool. But device has its separate page table which
> is not bound with this process, thus lacking the protection
> of process space isolation. Plus, CPU and device are using
> different address.

So you are relying on the platform to do the SVA for the device?

This feels like it goes back to another topic where I felt the SVA
setup uAPI should be shared and not buried into every driver's unique
ioctls.

Having something like this in a shared SVA system is somewhat less
strange.

Jason

2021-01-27 06:07:03

by Zhou Wang

Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device

On 2021/1/25 23:47, Jason Gunthorpe wrote:
> On Mon, Jan 25, 2021 at 04:34:56PM +0800, Zhou Wang wrote:
>
>> +static int uacce_pin_page(struct uacce_pin_container *priv,
>> + struct uacce_pin_address *addr)
>> +{
>> + unsigned int flags = FOLL_FORCE | FOLL_WRITE;
>> + unsigned long first, last, nr_pages;
>> + struct page **pages;
>> + struct pin_pages *p;
>> + int ret;
>> +
>> + first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
>> + last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
>> + nr_pages = last - first + 1;
>> +
>> + pages = vmalloc(nr_pages * sizeof(struct page *));
>> + if (!pages)
>> + return -ENOMEM;
>> +
>> + p = kzalloc(sizeof(*p), GFP_KERNEL);
>> + if (!p) {
>> + ret = -ENOMEM;
>> + goto free;
>> + }
>> +
>> + ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
>> + flags | FOLL_LONGTERM, pages);
>
> This needs to copy the RLIMIT_MEMLOCK and can_do_mlock() stuff from
> other places, like ib_umem_get

I am confused: can_do_mlock seems to be about the mlock limit, which
has a different meaning from pinning memory?

>
>> + ret = xa_err(xa_store(&priv->array, p->first, p, GFP_KERNEL));
>
> And this is really weird, I don't think it makes sense to make handles
> for DMA based on the starting VA.

Here the starting VA is used as an index for the pinned pages. When
unpinning, the starting VA is used to look up the pinned pages and to
check the unpinned VA/size.

But there is a problem with using xa_store here, as a new entry will
replace an old one :( It seems we can use xa_insert instead, which
returns -EBUSY if the entry at an index is not empty. The design would
then be that the same VA can only be pinned once.
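
Something like this (untested):

	ret = xa_insert(&priv->array, p->first, p, GFP_KERNEL);
	if (ret)	/* -EBUSY if this VA is already pinned, or -ENOMEM */
		goto unpin_pages;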

>
>> +static int uacce_unpin_page(struct uacce_pin_container *priv,
>> + struct uacce_pin_address *addr)
>> +{
>> + unsigned long first, last, nr_pages;
>> + struct pin_pages *p;
>> +
>> + first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
>> + last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
>> + nr_pages = last - first + 1;
>> +
>> + /* find pin_pages */
>> + p = xa_load(&priv->array, first);
>> + if (!p)
>> + return -ENODEV;
>> +
>> + if (p->nr_pages != nr_pages)
>> + return -EINVAL;
>> +
>> + /* unpin */
>> + unpin_user_pages(p->pages, p->nr_pages);
>
> And unpinning without guaranteeing there is no ongoing DMA is really
> weird
>
> Are you abusing this in conjunction with a SVA scheme just to prevent
> page motion? Why wasn't mlock good enough?

Just as Barry said, mlock can not avoid IOPF from page migration.

Best,
Zhou

>
> Jason
>
> .
>

Subject: RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device



> -----Original Message-----
> From: Jason Gunthorpe [mailto:[email protected]]
> Sent: Wednesday, January 27, 2021 7:20 AM
> To: Song Bao Hua (Barry Song) <[email protected]>
> Cc: Wangzhou (B) <[email protected]>; Greg Kroah-Hartman
> <[email protected]>; Arnd Bergmann <[email protected]>; Zhangfei Gao
> <[email protected]>; [email protected];
> [email protected]; [email protected];
> [email protected]; Liguozhu (Kenneth) <[email protected]>; chensihang
> (A) <[email protected]>
> Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
>
> On Tue, Jan 26, 2021 at 01:26:45AM +0000, Song Bao Hua (Barry Song) wrote:
> > > On Mon, Jan 25, 2021 at 11:35:22PM +0000, Song Bao Hua (Barry Song) wrote:
> > >
> > > > > On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song)
> wrote:
> > > > > > mlock, while certainly be able to prevent swapping out, it won't
> > > > > > be able to stop page moving due to:
> > > > > > * memory compaction in alloc_pages()
> > > > > > * making huge pages
> > > > > > * numa balance
> > > > > > * memory compaction in CMA
> > > > >
> > > > > Enabling those things is a major reason to have SVA device in the
> > > > > first place, providing a SW API to turn it all off seems like the
> > > > > wrong direction.
> > > >
> > > > I wouldn't say this is a major reason to have SVA. If we read the
> > > > history of SVA and papers, people would think easy programming due
> > > > to data struct sharing between cpu and device, and process space
> > > > isolation in device would be the major reasons for SVA. SVA also
> > > > declares it supports zero-copy while zero-copy doesn't necessarily
> > > > depend on SVA.
> > >
> > > Once you have to explicitly make system calls to declare memory under
> > > IO, you loose all of that.
> > >
> > > Since you've asked the app to be explicit about the DMAs it intends to
> > > do, there is not really much reason to use SVA for those DMAs anymore.
> >
> > Let's see a non-SVA case. We are not using SVA, we can have
> > a memory pool by hugetlb or pin, and app can allocate memory
> > from this pool, and get stable I/O performance on the memory
> > from the pool. But device has its separate page table which
> > is not bound with this process, thus lacking the protection
> > of process space isolation. Plus, CPU and device are using
> > different address.
>
> So you are relying on the platform to do the SVA for the device?
>

Sorry for the late response.

uacce and its userspace framework UADK depend on SVA, leveraging the
enhanced security of an isolated process address space.

This patch is mainly a performance-optimization extension to get
stable, high-performance I/O on pinned memory, even though the
hardware supports IO page faults to bring pages back after swap-out or
page migration. The problem is that IO page faults cause serious
latency jitter for high-speed I/O. Slow devices don't need this
extension.

> This feels like it goes back to another topic where I felt the SVA
> setup uAPI should be shared and not buried into every driver's unique
> ioctls.
>
> Having something like this in a shared SVA system is somewhat less
> strange.

Sounds reasonable. On the other hand, uacce seems to be a common
uAPI for SVA, and probably the only one at the moment.

uacce is a framework, not a specific driver: any accelerator can hook
into it as long as the device provides uacce_ops and registers itself
via uacce_register(). uacce itself is not bound to any specific
hardware, so the uacce interfaces are a kind of common uAPI :-)

>
> Jason

Thanks
Barry

2021-02-01 23:49:39

by Jason Gunthorpe

Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device

On Fri, Jan 29, 2021 at 10:09:03AM +0000, Tian, Kevin wrote:
> > SVA is not doom to work with IO page fault only. If we have SVA+pin,
> > we would get both sharing address and stable I/O latency.
>
> Isn't it like a traditional MAP_DMA API (imply pinning) plus specifying
> cpu_va of the memory pool as the iova?

I think their issue is the HW can't do the cpu_va trick without also
involving the system IOMMU in a SVA mode

It really is something that belongs under some general /dev/sva as we
talked on the vfio thread

Jason

Subject: RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device



> -----Original Message-----
> From: Jason Gunthorpe [mailto:[email protected]]
> Sent: Tuesday, February 2, 2021 12:44 PM
> To: Tian, Kevin <[email protected]>
> Cc: Song Bao Hua (Barry Song) <[email protected]>; chensihang (A)
> <[email protected]>; Arnd Bergmann <[email protected]>; Greg
> Kroah-Hartman <[email protected]>; [email protected];
> [email protected]; [email protected]; Zhangfei Gao
> <[email protected]>; Liguozhu (Kenneth) <[email protected]>;
> [email protected]
> Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
>
> On Fri, Jan 29, 2021 at 10:09:03AM +0000, Tian, Kevin wrote:
> > > SVA is not doom to work with IO page fault only. If we have SVA+pin,
> > > we would get both sharing address and stable I/O latency.
> >
> > Isn't it like a traditional MAP_DMA API (imply pinning) plus specifying
> > cpu_va of the memory pool as the iova?
>
> I think their issue is the HW can't do the cpu_va trick without also
> involving the system IOMMU in a SVA mode
>
> It really is something that belongs under some general /dev/sva as we
> talked on the vfio thread

AFAIK, there is no such /dev/sva yet, so /dev/uacce is a uAPI that
belongs to SVA.

Another option is to add a system call, like fs/userfaultfd.c does,
and move the file_operations and ioctl to an anon inode created via
anon_inode_getfd(). Then nothing would be buried in uacce.
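
A very rough sketch of that direction (the syscall name, flags and the
pin_container/pin_fops names are made up here; the pin/unpin ioctls
and the release-time unpinning would stay the same as in this patch):

SYSCALL_DEFINE1(mem_pin_open, unsigned int, flags)
{
	struct pin_container *p;
	int fd;

	if (flags)
		return -EINVAL;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	xa_init(&p->array);

	/* release on the anon fd unpins everything, as uacce_ctrl does today */
	fd = anon_inode_getfd("[mem_pin]", &pin_fops, p, O_RDWR | O_CLOEXEC);
	if (fd < 0) {
		xa_destroy(&p->array);
		kfree(p);
	}
	return fd;
}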

>
> Jason

Thanks
Barry

2021-02-02 02:54:22

by Tian, Kevin

Subject: RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device

> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, February 2, 2021 7:44 AM
>
> On Fri, Jan 29, 2021 at 10:09:03AM +0000, Tian, Kevin wrote:
> > > SVA is not doom to work with IO page fault only. If we have SVA+pin,
> > > we would get both sharing address and stable I/O latency.
> >
> > Isn't it like a traditional MAP_DMA API (imply pinning) plus specifying
> > cpu_va of the memory pool as the iova?
>
> I think their issue is the HW can't do the cpu_va trick without also
> involving the system IOMMU in a SVA mode
>

This is the part that I didn't understand. Using cpu_va in a MAP_DMA
interface doesn't require device support. It's just an user-specified
address to be mapped into the IOMMU page table. On the other hand,
sharing CPU page table through a SVA interface for an usage where I/O
page faults must be completely avoided seems a misleading attempt.
Even if people do want this model (e.g. mix pinning+fault), it should be
a mm syscall as Greg pointed out, not specific to sva.
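
For reference, what I mean by the traditional model is roughly this
(a VFIO type1 sketch; container/group setup is omitted and
container_fd/buf/size are placeholders):

	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (__u64)(uintptr_t)buf,
		.iova  = (__u64)(uintptr_t)buf,	/* iova == cpu_va of the pool */
		.size  = size,
	};

	/* pages are pinned and mapped at this fixed iova until UNMAP_DMA */
	if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map))
		perror("VFIO_IOMMU_MAP_DMA");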

Thanks
Kevin

Subject: RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device



> -----Original Message-----
> From: Tian, Kevin [mailto:[email protected]]
> Sent: Tuesday, February 2, 2021 3:52 PM
> To: Jason Gunthorpe <[email protected]>
> Cc: Song Bao Hua (Barry Song) <[email protected]>; chensihang (A)
> <[email protected]>; Arnd Bergmann <[email protected]>; Greg
> Kroah-Hartman <[email protected]>; [email protected];
> [email protected]; [email protected]; Zhangfei Gao
> <[email protected]>; Liguozhu (Kenneth) <[email protected]>;
> [email protected]
> Subject: RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
>
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Tuesday, February 2, 2021 7:44 AM
> >
> > On Fri, Jan 29, 2021 at 10:09:03AM +0000, Tian, Kevin wrote:
> > > > SVA is not doom to work with IO page fault only. If we have SVA+pin,
> > > > we would get both sharing address and stable I/O latency.
> > >
> > > Isn't it like a traditional MAP_DMA API (imply pinning) plus specifying
> > > cpu_va of the memory pool as the iova?
> >
> > I think their issue is the HW can't do the cpu_va trick without also
> > involving the system IOMMU in a SVA mode
> >
>
> This is the part that I didn't understand. Using cpu_va in a MAP_DMA
> interface doesn't require device support. It's just an user-specified
> address to be mapped into the IOMMU page table. On the other hand,

The background is that uacce is based on SVA and we are building
applications on top of uacce:
https://www.kernel.org/doc/html/v5.10/misc-devices/uacce.html
so the IOMMU simply uses the MMU's page table and does not do any
special mapping for a user-specified address. We don't want to break
the basic assumption that uacce is using SVA; otherwise, we would need
to rebuild uacce and its whole foundation.

> sharing CPU page table through a SVA interface for an usage where I/O
> page faults must be completely avoided seems a misleading attempt.

This is not about completely avoiding IO page faults; it is just an
extension for the high-performance I/O case, providing a way to avoid
IO latency jitter. Whether to use it is entirely up to users.

> Even if people do want this model (e.g. mix pinning+fault), it should be
> a mm syscall as Greg pointed out, not specific to sva.
>

We would be glad to make it a syscall if people are happy with that.
The simplest way would be a syscall similar to userfaultfd, if we
don't want to mess with mm_struct.

> Thanks
> Kevin

Thanks
Barry