2010-04-15 20:58:17

by Tom Lyon

[permalink] [raw]
Subject: [PATCH V2] drivers/uio/uio.c: DMA mapping, interrupt extensions, etc.

This is the second of 2 related, but independent, patches. This is for
uio.c, the previous is for uio_pci_generic.c.

The 2 patches were previously one large patch. Changes for this version:
- uio_pci_generic.c just gets extensions so that a single fd can be used
by non-privileged processes for interrupt control and mmaps
- All of the DMA and IOMMU related stuff move to uio.c; no longer a need
to pass ioctls to individual uio drivers. It turns out that the code
is not PCI specific anyways.
- A new ioctl to pin DMA buffers to certain IO virtual addresses for KVM.
- New eventfd based interrupt notifications, including support for PCI
specific MSI and MSI-X interrupts.
- PCI specific code to reset PCI functions before and after use

diff -ruNP linux-2.6.33/drivers/uio/uio.c uio-2.6.33/drivers/uio/uio.c
--- linux-2.6.33/drivers/uio/uio.c 2010-02-24 10:52:17.000000000 -0800
+++ uio-2.6.33/drivers/uio/uio.c 2010-04-15 12:39:02.000000000 -0700
@@ -23,6 +23,11 @@
#include <linux/string.h>
#include <linux/kobject.h>
#include <linux/uio_driver.h>
+#include <linux/mmu_notifier.h>
+#include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/eventfd.h>
+#include <linux/semaphore.h>

#define UIO_MAX_DEVICES 255

@@ -37,8 +42,37 @@
struct uio_info *info;
struct kobject *map_dir;
struct kobject *portio_dir;
+ struct pci_dev *pdev;
+ int pmaster;
+ struct semaphore gate;
+ int listeners;
+ atomic_t mapcount;
+ struct msix_entry *msix;
+ int nvec;
+ struct iommu_domain *domain;
+ int cachec;
+ struct eventfd_ctx *ev_irq;
+ struct eventfd_ctx *ev_msi;
+ struct eventfd_ctx **ev_msix;
};

+/*
+ * Structure for keeping track of memory nailed down by the
+ * user for DMA
+ */
+struct dma_map_page {
+ struct list_head list;
+ struct page **pages;
+ struct scatterlist *sg;
+ dma_addr_t daddr;
+ unsigned long vaddr;
+ int npage;
+ int rdwr;
+};
+
+static void uio_disable_msi(struct uio_device *);
+static void uio_disable_msix(struct uio_device *);
+
static int uio_major;
static DEFINE_IDR(uio_idr);
static const struct file_operations uio_fops;
@@ -440,17 +474,38 @@
struct uio_device *idev = (struct uio_device *)dev_id;
irqreturn_t ret = idev->info->handler(irq, idev->info);

- if (ret == IRQ_HANDLED)
+ if (ret != IRQ_HANDLED)
+ return ret;
+ if (idev->ev_irq)
+ eventfd_signal(idev->ev_irq, 1);
+ else
uio_event_notify(idev->info);

return ret;
}

+/*
+ * MSI and MSI-X Interrupt handler.
+ * Just record an event
+ */
+static irqreturn_t msihandler(int irq, void *arg)
+{
+ struct eventfd_ctx *ctx = arg;
+
+ eventfd_signal(ctx, 1);
+ return IRQ_HANDLED;
+}
+
struct uio_listener {
struct uio_device *dev;
s32 event_count;
+ struct mm_struct *mm;
+ struct mmu_notifier mmu_notifier;
+ struct list_head dm_list;
};

+static void uio_dma_unmapall(struct uio_listener *);
+
static int uio_open(struct inode *inode, struct file *filep)
{
struct uio_device *idev;
@@ -470,7 +525,7 @@
goto out;
}

- listener = kmalloc(sizeof(*listener), GFP_KERNEL);
+ listener = kzalloc(sizeof(*listener), GFP_KERNEL);
if (!listener) {
ret = -ENOMEM;
goto err_alloc_listener;
@@ -478,8 +533,22 @@

listener->dev = idev;
listener->event_count = atomic_read(&idev->event);
+ INIT_LIST_HEAD(&listener->dm_list);
filep->private_data = listener;

+ down(&idev->gate);
+ if (idev->listeners == 0) { /* first open */
+ if (idev->pmaster && !iommu_found() && !capable(CAP_SYS_RAWIO)) {
+ up(&idev->gate);
+ return -EPERM;
+ }
+ /* reset to known state if we can */
+ if (idev->pdev)
+ (void) pci_reset_function(idev->pdev);
+ }
+ idev->listeners++;
+ up(&idev->gate);
+
if (idev->info->open) {
ret = idev->info->open(idev->info, inode);
if (ret)
@@ -514,6 +583,34 @@
if (idev->info->release)
ret = idev->info->release(idev->info, inode);

+ uio_dma_unmapall(listener);
+ if (listener->mm) {
+ mmu_notifier_unregister(&listener->mmu_notifier, listener->mm);
+ listener->mm = NULL;
+ }
+
+ down(&idev->gate);
+ if (--idev->listeners <= 0) {
+ if (idev->msix) {
+ uio_disable_msix(idev);
+ }
+ if (idev->ev_msi) {
+ uio_disable_msi(idev);
+ }
+ if (idev->ev_irq) {
+ eventfd_ctx_put(idev->ev_msi);
+ idev->ev_irq = NULL;
+ }
+ if (idev->domain) {
+ iommu_domain_free(idev->domain);
+ idev->domain = NULL;
+ }
+ /* reset to known state if we can */
+ if (idev->pdev)
+ (void) pci_reset_function(idev->pdev);
+ }
+ up(&idev->gate);
+
module_put(idev->owner);
kfree(listener);
return ret;
@@ -730,12 +827,510 @@
}
}

+/* Unmap DMA region */
+static void uio_dma_unmap(struct uio_listener *listener,
+ struct dma_map_page *mlp)
+{
+ int i;
+ struct uio_device *idev = listener->dev;
+
+ list_del(&mlp->list);
+ if (mlp->sg) {
+ dma_unmap_sg(idev->dev->parent, mlp->sg, mlp->npage, DMA_BIDIRECTIONAL);
+ kfree(mlp->sg);
+ } else {
+ for (i=0; i<mlp->npage; i++)
+ (void) iommu_unmap_range(idev->domain,
+ mlp->daddr + i*PAGE_SIZE, PAGE_SIZE);
+ }
+ for (i=0; i<mlp->npage; i++) {
+ if (mlp->rdwr)
+ SetPageDirty(mlp->pages[i]);
+ put_page(mlp->pages[i]);
+ }
+ listener->mm->locked_vm -= mlp->npage;
+ kfree(mlp->pages);
+ kfree(mlp);
+ atomic_dec(&idev->mapcount);
+}
+
+/* Unmap ALL DMA regions */
+static void uio_dma_unmapall(struct uio_listener *listener)
+{
+ struct list_head *pos, *pos2;
+ struct dma_map_page *mlp;
+
+ list_for_each_safe(pos, pos2, &listener->dm_list) {
+ mlp = list_entry(pos, struct dma_map_page, list);
+ uio_dma_unmap(listener, mlp);
+ }
+}
+
+int uio_dma_unmap_dm(struct uio_listener *listener, struct uio_dma_map *dmp)
+{
+ unsigned long start, npage;
+ struct dma_map_page *mlp;
+ struct list_head *pos, *pos2;
+ int ret;
+
+ start = dmp->vaddr & ~PAGE_SIZE;
+ npage = dmp->size >> PAGE_SHIFT;
+
+ ret = -ENXIO;
+ list_for_each_safe(pos, pos2, &listener->dm_list) {
+ mlp = list_entry(pos, struct dma_map_page, list);
+ if (dmp->vaddr != mlp->vaddr || mlp->npage != npage)
+ continue;
+ ret = 0;
+ uio_dma_unmap(listener, mlp);
+ break;
+ }
+ return ret;
+}
+
+/* Handle MMU notifications - user process freed or realloced memory
+ * which may be in use in a DMA region. Clean up region if so.
+ */
+static void uio_dma_handle_mmu_notify(struct mmu_notifier *mn,
+ unsigned long start, unsigned long end)
+{
+ struct uio_listener *listener;
+ unsigned long myend;
+ struct list_head *pos, *pos2;
+ struct dma_map_page *mlp;
+
+ listener = container_of(mn, struct uio_listener, mmu_notifier);
+ list_for_each_safe(pos, pos2, &listener->dm_list) {
+ mlp = list_entry(pos, struct dma_map_page, list);
+ if (mlp->vaddr >= end)
+ continue;
+ /*
+ * Ranges overlap if they're not disjoint; and they're
+ * disjoint if the end of one is before the start of
+ * the other one.
+ */
+ myend = mlp->vaddr + (mlp->npage << PAGE_SHIFT) - 1;
+ if (!(myend <= start || end <= mlp->vaddr)) {
+ printk(KERN_WARNING
+ "%s: demap start %lx end %lx va %lx pa %lx\n",
+ __func__, start, end, mlp->vaddr, (long)mlp->daddr);
+ uio_dma_unmap(listener, mlp);
+ }
+ }
+}
+
+static void uio_dma_inval_page(struct mmu_notifier *mn,
+ struct mm_struct *mm, unsigned long addr)
+{
+ uio_dma_handle_mmu_notify(mn, addr, addr + PAGE_SIZE);
+}
+
+static void uio_dma_inval_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+ uio_dma_handle_mmu_notify(mn, start, end);
+}
+
+static const struct mmu_notifier_ops uio_dma_mmu_notifier_ops = {
+ .invalidate_page = uio_dma_inval_page,
+ .invalidate_range_start = uio_dma_inval_range_start,
+};
+
+static int uio_dma_map_iova(
+ struct uio_listener *listener,
+ unsigned long start_iova,
+ struct page **pages,
+ int npage,
+ int rdwr,
+ struct dma_map_page **mlpp)
+{
+ struct uio_device *idev = listener->dev;
+ int ret;
+ int i;
+ phys_addr_t hpa;
+ struct dma_map_page *mlp;
+ unsigned long iova = start_iova;
+
+ if (idev->domain == NULL) {
+ if (!list_empty(&listener->dm_list)) /* no mix with anywhere */
+ return -EINVAL;
+ if (!iommu_found())
+ return -EINVAL;
+ idev->domain = iommu_domain_alloc();
+ if (idev->domain == NULL)
+ return -ENXIO;
+ idev->cachec = iommu_domain_has_cap(idev->domain,
+ IOMMU_CAP_CACHE_COHERENCY);
+ ret = iommu_attach_device(idev->domain, idev->dev->parent);
+ if (ret) {
+ iommu_domain_free(idev->domain);
+ idev->domain = NULL;
+ printk(KERN_ERR "%s: device_attach failed %d\n", __func__, ret);
+ return ret;
+ }
+ }
+ for (i=0; i<npage; i++) {
+ if (iommu_iova_to_phys(idev->domain, iova + i*PAGE_SIZE))
+ return -EBUSY;
+ }
+ rdwr = rdwr ? IOMMU_READ|IOMMU_WRITE : IOMMU_READ;
+ if (idev->cachec) rdwr |= IOMMU_CACHE;
+ for (i=0; i<npage; i++) {
+ hpa = page_to_phys(pages[i]);
+ ret = iommu_map_range(idev->domain, iova,
+ hpa, PAGE_SIZE, rdwr);
+ if (ret) {
+ while (--i > 0) {
+ iova -= PAGE_SIZE;
+ (void) iommu_unmap_range(idev->domain, iova, PAGE_SIZE);
+ }
+ return ret;
+ }
+ iova += PAGE_SIZE;
+ }
+
+ mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
+ mlp->pages = pages;
+ mlp->daddr = start_iova;
+ mlp->npage = npage;
+ *mlpp = mlp;
+ atomic_inc(&idev->mapcount);
+ return 0;
+}
+
+static int uio_dma_map_anywhere(
+ struct uio_listener *listener,
+ struct page **pages,
+ int npage,
+ int rdwr,
+ struct dma_map_page **mlpp)
+{
+ struct uio_device *idev = listener->dev;
+ struct scatterlist *sg, *nsg;
+ int i, nents;
+ struct dma_map_page *mlp;
+ unsigned long length;
+
+ if (idev->domain) {
+ /* map anywhere and map iova don't mix */
+ if (atomic_read(&idev->mapcount) == 0) {
+ iommu_domain_free(idev->domain);
+ idev->domain = NULL;
+ } else
+ return -EINVAL;
+ }
+ sg = kzalloc(npage * sizeof(struct scatterlist), GFP_KERNEL);
+ if (sg == NULL)
+ return -ENOMEM;
+ for (i=0; i<npage; i++) {
+ sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+ }
+ nents = dma_map_sg(idev->dev->parent, sg, npage,
+ rdwr ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE);
+ /* The API for dma_map_sg suggests that it may squash together
+ * adjacent pages, but noone seems to really do that. So we squash
+ * it ourselves, because the user level wants a single buffer.
+ * This works if (a) there is an iommu, or (b) the user allocates
+ * large buffers from a huge page
+ */
+ nsg = sg;
+ for (i=1; i<nents; i++) {
+ length = sg[i].dma_length;
+ sg[i].dma_length = 0;
+ if (sg[i].dma_address == (nsg->dma_address + nsg->dma_length)) {
+ nsg->dma_length += length;
+ } else {
+ nsg++;
+ nsg->dma_address = sg[i].dma_address;
+ nsg->dma_length = length;
+ }
+ }
+ nents = 1 + (nsg - sg);
+ if (nents != 1) {
+ if (nents > 0)
+ dma_unmap_sg(idev->dev->parent, sg, npage, DMA_BIDIRECTIONAL);
+ for (i=0; i<npage; i++)
+ put_page(pages[i]);
+ kfree(sg);
+ printk(KERN_ERR "%s: sequential dma mapping failed\n", __func__);
+ return -EFAULT;
+ }
+
+ mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
+ mlp->pages = pages;
+ mlp->sg = sg;
+ mlp->daddr = sg_dma_address(sg);
+ mlp->npage = npage;
+ *mlpp = mlp;
+ atomic_inc(&idev->mapcount);
+ return 0;
+}
+
+static int uio_dma_map_common(struct uio_listener *listener,
+ unsigned int cmd, struct uio_dma_map *dmp)
+{
+ int locked, lock_limit;
+ struct page **pages;
+ int npage;
+ struct dma_map_page *mlp = NULL;
+ int ret=0;
+
+ if (dmp->vaddr & (PAGE_SIZE-1))
+ return -EINVAL;
+ if (dmp->size & (PAGE_SIZE-1))
+ return -EINVAL;
+ if (dmp->size <= 0)
+ return -EINVAL;
+ npage = dmp->size >> PAGE_SHIFT;
+
+ /* account for locked pages */
+ locked = npage + current->mm->locked_vm;
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+ if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ printk(KERN_WARNING "%s: RLIMIT_MEMLOCK exceeded\n",
+ __func__);
+ return -ENOMEM;
+ }
+ /* only 1 address space per fd */
+ if (current->mm != listener->mm) {
+ if (listener->mm != NULL)
+ return -EINVAL;
+ listener->mm = current->mm;
+ listener->mmu_notifier.ops = &uio_dma_mmu_notifier_ops;
+ ret = mmu_notifier_register(&listener->mmu_notifier,
+ listener->mm);
+ if (ret)
+ printk(KERN_ERR "%s: mmu_notifier_register failed %d\n",
+ __func__, ret);
+ }
+
+ pages = kzalloc(npage * sizeof(struct page *), GFP_KERNEL);
+ if (pages == NULL)
+ return -ENOMEM;
+ ret = get_user_pages_fast(dmp->vaddr, npage, dmp->rdwr, pages);
+ if (ret != npage) {
+ printk(KERN_ERR "%s: get_user_pages_fast returns %d, not %d\n",
+ __func__, ret, npage);
+ kfree(pages);
+ return -EFAULT;
+ }
+
+ if (cmd == UIO_DMA_MAP_IOVA)
+ ret = uio_dma_map_iova(listener, dmp->dmaaddr,
+ pages, npage, dmp->rdwr, &mlp);
+ else
+ ret = uio_dma_map_anywhere(listener, pages,
+ npage, dmp->rdwr, &mlp);
+ if (ret) {
+ kfree(pages);
+ return ret;
+ }
+ mlp->vaddr = dmp->vaddr;
+ mlp->rdwr = dmp->rdwr;
+ dmp->dmaaddr = mlp->daddr;
+ list_add(&mlp->list, &listener->dm_list);
+
+ current->mm->locked_vm += npage;
+ return 0;
+}
+
+static void uio_disable_msi(struct uio_device *idev)
+{
+ struct pci_dev *pdev = idev->pdev;
+
+ if (!pdev) /* pci specific */
+ return;
+ if (idev->ev_msi) {
+ eventfd_ctx_put(idev->ev_msi);
+ free_irq(pdev->irq, idev->ev_msi);
+ idev->ev_msi = NULL;
+ }
+ pci_disable_msi(pdev);
+}
+
+static int uio_enable_msi(struct uio_device *idev, int fd)
+{
+ struct pci_dev *pdev = idev->pdev;
+ int ret;
+
+ if (!pdev) /* pci specific */
+ return -EINVAL;
+ idev->ev_msi = eventfd_ctx_fdget(fd);
+ if (idev->ev_msi == NULL)
+ return -EINVAL;
+ pci_enable_msi(pdev);
+ ret = request_irq(pdev->irq, msihandler, 0,
+ idev->info->name, idev->ev_msi);
+ if (ret) {
+ eventfd_ctx_put(idev->ev_msi);
+ pci_disable_msi(pdev);
+ idev->ev_msi = NULL;
+ }
+ return ret;
+}
+
+static void uio_disable_msix(struct uio_device *idev)
+{
+ struct pci_dev *pdev = idev->pdev;
+ int i;
+
+ if (!pdev) /* pci specific */
+ return;
+ if (idev->ev_msix && idev->msix) {
+ for (i=0; i<idev->nvec; i++) {
+ free_irq(idev->msix[i].vector, idev->ev_msix[i]);
+ eventfd_ctx_put(idev->ev_msix[i]);
+ }
+ }
+ if (idev->ev_msix) {
+ kfree(idev->ev_msix);
+ idev->ev_msix = NULL;
+ }
+ if (idev->msix) {
+ kfree(idev->msix);
+ idev->msix = NULL;
+ }
+ idev->nvec = 0;
+ pci_disable_msix(pdev);
+}
+
+static int uio_enable_msix(struct uio_device *idev, int nvec, void __user *uarg)
+{
+ struct pci_dev *pdev = idev->pdev;
+ struct eventfd_ctx *ctx;
+ int ret = 0;
+ int i;
+ int fd;
+
+ if (!pdev) /* pci specific */
+ return -EINVAL;
+ idev->msix = kzalloc(nvec * sizeof(struct msix_entry), GFP_KERNEL);
+ idev->ev_msix = kzalloc(nvec * sizeof(struct eventfd_ctx *), GFP_KERNEL);
+ if (idev->msix == NULL || idev->ev_msix == NULL)
+ ret = -ENOMEM;
+ else {
+ for (i=0; i<nvec; i++) {
+ if (copy_from_user(&fd, uarg, sizeof fd)) {
+ ret = -EFAULT;
+ break;
+ }
+ uarg += sizeof fd;
+ ctx = eventfd_ctx_fdget(fd);
+ if (ctx == NULL) {
+ ret = -EINVAL;
+ break;
+ }
+ idev->msix[i].entry = i;
+ idev->ev_msix[i] = ctx;
+ }
+ }
+ if (!ret)
+ ret = pci_enable_msix(pdev, idev->msix, nvec);
+ idev->nvec = 0;
+ for (i=0; i<nvec && !ret; i++) {
+ ret = request_irq(idev->msix[i].vector, msihandler, 0,
+ idev->info->name, idev->ev_msix[i]);
+ if (ret)
+ break;
+ idev->nvec = i+1;
+ }
+ if (ret)
+ uio_disable_msix(idev);
+ return ret;
+}
+
+static int uio_ioctl(struct inode *inode, struct file *filep,
+ unsigned int cmd, unsigned long arg)
+{
+ struct uio_listener *listener = filep->private_data;
+ struct uio_device *idev = listener->dev;
+ void __user *uarg = (void __user *)arg;
+ struct pci_dev *pdev = idev->pdev;
+ struct uio_dma_map dm;
+ int ret = 0;
+ u64 mask;
+ int fd, nfd;
+
+ if (idev == NULL || idev->info == NULL)
+ return -EINVAL;
+
+ switch(cmd) {
+
+ case UIO_DMA_MAP_ANYWHERE:
+ case UIO_DMA_MAP_IOVA:
+ if (copy_from_user(&dm, uarg, sizeof dm))
+ return -EFAULT;
+ ret = uio_dma_map_common(listener, cmd, &dm);
+ if (!ret && copy_to_user(uarg, &dm, sizeof dm))
+ ret = -EFAULT;
+ break;
+
+ case UIO_DMA_UNMAP:
+ if (copy_from_user(&dm, uarg, sizeof dm))
+ return -EFAULT;
+ ret = uio_dma_unmap_dm(listener, &dm);
+ break;
+
+ case UIO_DMA_MASK: /* set master mode and DMA mask */
+ if (copy_from_user(&mask, uarg, sizeof mask))
+ return -EFAULT;
+ if (pdev) {
+ pci_set_master(pdev);
+ ret = pci_set_dma_mask(pdev, mask);
+ } else {
+ ret = dma_set_mask(idev->dev->parent, mask);
+ }
+ break;
+
+ case UIO_EVENTFD_IRQ:
+ if (copy_from_user(&fd, uarg, sizeof fd))
+ return -EFAULT;
+ if (idev->ev_irq)
+ eventfd_ctx_put(idev->ev_irq);
+ if (fd >= 0) {
+ idev->ev_irq = eventfd_ctx_fdget(fd);
+ if (idev->ev_irq == NULL)
+ ret = -EINVAL;
+ }
+ break;
+
+ case UIO_EVENTFD_MSI:
+ if (copy_from_user(&fd, uarg, sizeof fd))
+ return -EFAULT;
+ if (fd >= 0 && idev->ev_msi == NULL && idev->ev_msix == NULL) {
+ ret = uio_enable_msi(idev, fd);
+ } else if (fd < 0 && idev->ev_msi) {
+ uio_disable_msi(idev);
+ } else
+ ret = -EINVAL;
+ break;
+
+ case UIO_EVENTFDS_MSIX:
+ if (copy_from_user(&nfd, uarg, sizeof nfd))
+ return -EFAULT;
+ uarg += sizeof nfd;
+ if (nfd > 0 && idev->ev_msi == NULL && idev->ev_msix == NULL)
+ ret = uio_enable_msix(idev, nfd, uarg);
+ else if (nfd == 0 && idev->ev_msix)
+ uio_disable_msix(idev);
+ else {
+ ret = -EINVAL;
+ }
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ return ret;
+}
+
static const struct file_operations uio_fops = {
.owner = THIS_MODULE,
.open = uio_open,
.release = uio_release,
.read = uio_read,
.write = uio_write,
+ .ioctl = uio_ioctl,
.mmap = uio_mmap,
.poll = uio_poll,
.fasync = uio_fasync,
@@ -836,6 +1431,7 @@
ret = -ENOMEM;
goto err_kzalloc;
}
+ init_MUTEX(&idev->gate);

idev->owner = owner;
idev->info = info;
@@ -861,6 +1457,22 @@

info->uio_dev = idev;

+ /* See if we have a PCI device */
+ if (parent->bus == &pci_bus_type) {
+ u16 old_cmd, cmd;
+ struct pci_dev *pdev = to_pci_dev(parent);
+
+ idev->pdev = pdev;
+ /* see if its a master */
+ pci_read_config_word(pdev, PCI_COMMAND, &old_cmd);
+ cmd = old_cmd | PCI_COMMAND_MASTER;
+ pci_write_config_word(pdev, PCI_COMMAND, cmd);
+ pci_read_config_word(pdev, PCI_COMMAND, &cmd);
+ pci_write_config_word(pdev, PCI_COMMAND, old_cmd);
+ if (cmd & PCI_COMMAND_MASTER)
+ idev->pmaster = 1;
+ }
+
if (idev->info->irq >= 0) {
ret = request_irq(idev->info->irq, uio_interrupt,
idev->info->irq_flags, idev->info->name, idev);
diff -ruNP linux-2.6.33/include/linux/uio_driver.h uio-2.6.33/include/linux/uio_driver.h
--- linux-2.6.33/include/linux/uio_driver.h 2010-02-24 10:52:17.000000000 -0800
+++ uio-2.6.33/include/linux/uio_driver.h 2010-04-14 15:42:57.000000000 -0700
@@ -14,8 +14,12 @@
#ifndef _UIO_DRIVER_H_
#define _UIO_DRIVER_H_

-#include <linux/module.h>
#include <linux/fs.h>
+#include <linux/ioctl.h>
+
+#ifdef __KERNEL__
+
+#include <linux/module.h>
#include <linux/interrupt.h>

struct uio_map;
@@ -122,4 +126,23 @@
#define UIO_PORT_GPIO 2
#define UIO_PORT_OTHER 3

+#endif /* __KERNEL__ */
+
+// Kernel & User level defines for ioctls
+
+struct uio_dma_map {
+ unsigned long vaddr;
+ unsigned long long dmaaddr;
+ int size;
+ int rdwr;
+};
+
+#define UIO_DMA_MAP_ANYWHERE _IOWR(';', 100, struct uio_dma_map)
+#define UIO_DMA_MAP_IOVA _IOWR(';', 101, struct uio_dma_map)
+#define UIO_DMA_UNMAP _IOW(';', 102, struct uio_dma_map)
+#define UIO_DMA_MASK _IOW(';', 103, unsigned long long)
+#define UIO_EVENTFD_IRQ _IOW(';', 104, int)
+#define UIO_EVENTFD_MSI _IOW(';', 105, int)
+#define UIO_EVENTFDS_MSIX _IOW(';', 106, int)
+
#endif /* _LINUX_UIO_DRIVER_H_ */


2010-04-17 10:43:13

by Joerg Roedel

[permalink] [raw]
Subject: Re: [PATCH V2] drivers/uio/uio.c: DMA mapping, interrupt extensions, etc.

On Thu, Apr 15, 2010 at 01:55:29PM -0700, Tom Lyon wrote:

> + down(&idev->gate);
> + if (idev->listeners == 0) { /* first open */
> + if (idev->pmaster && !iommu_found() && !capable(CAP_SYS_RAWIO)) {
> + up(&idev->gate);
> + return -EPERM;
> + }
> + /* reset to known state if we can */
> + if (idev->pdev)
> + (void) pci_reset_function(idev->pdev);
> + }
> + idev->listeners++;
> + up(&idev->gate);

Why do you want to allow multiple opens for a device? Is it not
sufficient to allow only one?


> + if (idev->domain == NULL) {
> + if (!list_empty(&listener->dm_list)) /* no mix with anywhere */
> + return -EINVAL;
> + if (!iommu_found())
> + return -EINVAL;
> + idev->domain = iommu_domain_alloc();
> + if (idev->domain == NULL)
> + return -ENXIO;
> + idev->cachec = iommu_domain_has_cap(idev->domain,
> + IOMMU_CAP_CACHE_COHERENCY);
> + ret = iommu_attach_device(idev->domain, idev->dev->parent);
> + if (ret) {
> + iommu_domain_free(idev->domain);
> + idev->domain = NULL;
> + printk(KERN_ERR "%s: device_attach failed %d\n", __func__, ret);
> + return ret;
> + }
> + }

If userspace calls this path this will make all the addresses mapped
with DMA-API paths unusable by the device. This doesn't look like a sane
userspace interface.

For better and more in-depth review I suggest that you split up this
large patch into a series of smaler which implement specific aspects of
your work.

Joerg

P.S.: I got these warning when applying your patches ...

Applying: drivers/uio/uio_pci_generic.c: allow access for non-privileged processes
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:86: trailing whitespace.
info->mem[j].name = name;
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:87: trailing whitespace.
info->mem[j].addr = pci_resource_start(pdev, i);
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:89: trailing whitespace.
info->mem[j].memtype = UIO_MEM_PHYS;
warning: 3 lines add whitespace errors.
Applying: drivers/uio/uio.c: DMA mapping, interrupt extensions, etc.
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:315: trailing whitespace.
ret = iommu_map_range(idev->domain, iova,
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:325: trailing whitespace.
}
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:366: trailing whitespace.
* adjacent pages, but noone seems to really do that. So we squash
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:368: trailing whitespace.
* This works if (a) there is an iommu, or (b) the user allocates
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:578: trailing whitespace.
unsigned int cmd, unsigned long arg)
warning: squelched 1 whitespace error
warning: 6 lines add whitespace errors.

And there are also some coding-style issues.

2010-04-17 17:07:16

by Tom Lyon

[permalink] [raw]
Subject: Re: [PATCH V2] drivers/uio/uio.c: DMA mapping, interrupt extensions, etc.

The current uio and uio_pci_generic allow multiple opens; I was just preserving that behavior.

On Saturday 17 April 2010 03:43:09 am Joerg Roedel wrote:
> On Thu, Apr 15, 2010 at 01:55:29PM -0700, Tom Lyon wrote:
>
> > + down(&idev->gate);
> > + if (idev->listeners == 0) { /* first open */
> > + if (idev->pmaster && !iommu_found() && !capable(CAP_SYS_RAWIO)) {
> > + up(&idev->gate);
> > + return -EPERM;
> > + }
> > + /* reset to known state if we can */
> > + if (idev->pdev)
> > + (void) pci_reset_function(idev->pdev);
> > + }
> > + idev->listeners++;
> > + up(&idev->gate);
>
> Why do you want to allow multiple opens for a device? Is it not
> sufficient to allow only one?
>
>
> > + if (idev->domain == NULL) {
> > + if (!list_empty(&listener->dm_list)) /* no mix with anywhere */
> > + return -EINVAL;
> > + if (!iommu_found())
> > + return -EINVAL;
> > + idev->domain = iommu_domain_alloc();
> > + if (idev->domain == NULL)
> > + return -ENXIO;
> > + idev->cachec = iommu_domain_has_cap(idev->domain,
> > + IOMMU_CAP_CACHE_COHERENCY);
> > + ret = iommu_attach_device(idev->domain, idev->dev->parent);
> > + if (ret) {
> > + iommu_domain_free(idev->domain);
> > + idev->domain = NULL;
> > + printk(KERN_ERR "%s: device_attach failed %d\n", __func__, ret);
> > + return ret;
> > + }
> > + }
>
> If userspace calls this path this will make all the addresses mapped
> with DMA-API paths unusable by the device. This doesn't look like a sane
> userspace interface.
>
> For better and more in-depth review I suggest that you split up this
> large patch into a series of smaler which implement specific aspects of
> your work.
>
> Joerg
>
> P.S.: I got these warning when applying your patches ...
>
> Applying: drivers/uio/uio_pci_generic.c: allow access for non-privileged processes
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:86: trailing whitespace.
> info->mem[j].name = name;
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:87: trailing whitespace.
> info->mem[j].addr = pci_resource_start(pdev, i);
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:89: trailing whitespace.
> info->mem[j].memtype = UIO_MEM_PHYS;
> warning: 3 lines add whitespace errors.
> Applying: drivers/uio/uio.c: DMA mapping, interrupt extensions, etc.
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:315: trailing whitespace.
> ret = iommu_map_range(idev->domain, iova,
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:325: trailing whitespace.
> }
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:366: trailing whitespace.
> * adjacent pages, but noone seems to really do that. So we squash
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:368: trailing whitespace.
> * This works if (a) there is an iommu, or (b) the user allocates
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:578: trailing whitespace.
> unsigned int cmd, unsigned long arg)
> warning: squelched 1 whitespace error
> warning: 6 lines add whitespace errors.
>
> And there are also some coding-style issues.
>

2010-04-17 18:35:19

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH V2] drivers/uio/uio.c: DMA mapping, interrupt extensions, etc.

On 04/15/2010 11:55 PM, Tom Lyon wrote:
> This is the second of 2 related, but independent, patches. This is for
> uio.c, the previous is for uio_pci_generic.c.
>
> The 2 patches were previously one large patch. Changes for this version:
> - uio_pci_generic.c just gets extensions so that a single fd can be used
> by non-privileged processes for interrupt control and mmaps
> - All of the DMA and IOMMU related stuff move to uio.c; no longer a need
> to pass ioctls to individual uio drivers. It turns out that the code
> is not PCI specific anyways.
> - A new ioctl to pin DMA buffers to certain IO virtual addresses for KVM.
> - New eventfd based interrupt notifications, including support for PCI
> specific MSI and MSI-X interrupts.
> - PCI specific code to reset PCI functions before and after use
>
>
> @@ -122,4 +126,23 @@
> #define UIO_PORT_GPIO 2
> #define UIO_PORT_OTHER 3
>
> +#endif /* __KERNEL__ */
> +
> +// Kernel& User level defines for ioctls
> +
> +struct uio_dma_map {
> + unsigned long vaddr;
> + unsigned long long dmaaddr;
>

Use __u64 for both, otherwise you need to rewrite the structures in the
kernel in case 32-bit userspace calls a 64-bit kernel.

> + int size;
>

What units? Size is probably too small. Suggest unsigned type to avoid
an extra check in the kernel.

> + int rdwr;
>

What values can this hold?

> +};
> +
> +#define UIO_DMA_MAP_ANYWHERE _IOWR(';', 100, struct uio_dma_map)
>

What does this do? Ignore vaddr?

> +#define UIO_DMA_MAP_IOVA _IOWR(';', 101, struct uio_dma_map)
> +#define UIO_DMA_UNMAP _IOW(';', 102, struct uio_dma_map)
> +#define UIO_DMA_MASK _IOW(';', 103, unsigned long long)
> +#define UIO_EVENTFD_IRQ _IOW(';', 104, int)
> +#define UIO_EVENTFD_MSI _IOW(';', 105, int)
> +#define UIO_EVENTFDS_MSIX _IOW(';', 106, int)
>

These three need some documentation.


--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.