2012-11-20 00:49:12

by Alexey Kardashevskiy

[permalink] [raw]
Subject: [PATCH] vfio powerpc: enabled and supported on powernv platform

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.
The platform dependent part includes IOMMU initialization
and handling.

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan, only POWERNV
platform is supported at the moment.

Also the patch implements an VFIO-IOMMU driver which
manages DMA mapping/unmapping requests coming from
the client (now QEMU). It also returns a DMA window
information to let the guest initialize the device tree
for a guest OS properly. Although this driver has been
tested only on POWERNV, it should work on any platform
supporting TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option.

Cc: David Gibson <[email protected]>
Signed-off-by: Alexey Kardashevskiy <[email protected]>
---
arch/powerpc/include/asm/iommu.h | 6 +
arch/powerpc/kernel/iommu.c | 140 +++++++++++++++++++
arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++
drivers/iommu/Kconfig | 8 ++
drivers/vfio/Kconfig | 6 +
drivers/vfio/Makefile | 1 +
drivers/vfio/vfio_iommu_spapr_tce.c | 247 ++++++++++++++++++++++++++++++++++
include/linux/vfio.h | 20 +++
8 files changed, 563 insertions(+)
create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5ba66cb 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -64,30 +64,33 @@ struct iommu_pool {
} ____cacheline_aligned_in_smp;

struct iommu_table {
unsigned long it_busno; /* Bus number this table belongs to */
unsigned long it_size; /* Size of iommu table in entries */
unsigned long it_offset; /* Offset into global table */
unsigned long it_base; /* mapped address of tce table */
unsigned long it_index; /* which iommu table this is */
unsigned long it_type; /* type: PCI or Virtual Bus */
unsigned long it_blocksize; /* Entries in each block (cacheline) */
unsigned long poolsize;
unsigned long nr_pools;
struct iommu_pool large_pool;
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map; /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+ struct iommu_group *it_group;
+#endif
};

struct scatterlist;

static inline void set_iommu_table_base(struct device *dev, void *base)
{
dev->archdata.dma_data.iommu_table_base = base;
}

static inline void *get_iommu_table_base(struct device *dev)
{
return dev->archdata.dma_data.iommu_table_base;
}

/* Frees table for an individual device node */
@@ -135,17 +138,20 @@ static inline void pci_iommu_init(void) { }
extern void alloc_dart_table(void);
#if defined(CONFIG_PPC64) && defined(CONFIG_PM)
static inline void iommu_save(void)
{
if (ppc_md.iommu_save)
ppc_md.iommu_save();
}

static inline void iommu_restore(void)
{
if (ppc_md.iommu_restore)
ppc_md.iommu_restore();
}
#endif

+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
+ enum dma_data_direction direction, unsigned long pages);
+
#endif /* __KERNEL__ */
#endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..94f614b 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -32,30 +32,31 @@
#include <linux/dma-mapping.h>
#include <linux/bitmap.h>
#include <linux/iommu-helper.h>
#include <linux/crash_dump.h>
#include <linux/hash.h>
#include <linux/fault-inject.h>
#include <linux/pci.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/kdump.h>
#include <asm/fadump.h>
#include <asm/vio.h>
+#include <asm/tce.h>

#define DBG(...)

static int novmerge;

static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);

static int __init setup_iommu(char *str)
{
if (!strcmp(str, "novmerge"))
novmerge = 1;
else if (!strcmp(str, "vmerge"))
novmerge = 0;
return 1;
}
@@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
}

void iommu_free_coherent(struct iommu_table *tbl, size_t size,
void *vaddr, dma_addr_t dma_handle)
{
if (tbl) {
unsigned int nio_pages;

size = PAGE_ALIGN(size);
nio_pages = size >> IOMMU_PAGE_SHIFT;
iommu_free(tbl, dma_handle, nio_pages);
size = PAGE_ALIGN(size);
free_pages((unsigned long)vaddr, get_order(size));
}
}
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static struct page *free_tce(struct iommu_table *tbl, unsigned long entry)
+{
+ struct page *page = NULL;
+ unsigned long oldtce;
+
+ oldtce = ppc_md.tce_get(tbl, entry);
+
+ if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+ return NULL;
+
+ page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+ WARN_ON(!page);
+ if (page && (oldtce & TCE_PCI_WRITE))
+ SetPageDirty(page);
+ ppc_md.tce_free(tbl, entry, 1);
+
+ return page;
+}
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+ uint64_t tce, enum dma_data_direction direction)
+{
+ int ret;
+ struct page *page = NULL;
+ unsigned long kva, offset;
+
+ /* Map new TCE */
+ offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+ ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+ direction != DMA_TO_DEVICE, &page);
+ if (ret < 1) {
+ printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+ tce, entry << IOMMU_PAGE_SHIFT, ret);
+ if (!ret)
+ ret = -EFAULT;
+ }
+
+ kva = (unsigned long) page_address(page);
+ kva += offset;
+
+ /* tce_build receives a virtual address */
+ entry += tbl->it_offset; /* Offset into real TCE table */
+ ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+ /* tce_build() only returns non-zero for transient errors */
+ if (unlikely(ret)) {
+ printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+ tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+ put_page(page);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static void tce_flush(struct iommu_table *tbl)
+{
+ /* Flush/invalidate TLB caches if necessary */
+ if (ppc_md.tce_flush)
+ ppc_md.tce_flush(tbl);
+
+ /* Make sure updates are seen by hardware */
+ mb();
+}
+
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
+ enum dma_data_direction direction, unsigned long pages)
+{
+ int i, ret = 0, pages_to_put = 0;
+ struct page *page;
+ struct iommu_pool *pool = get_pool(tbl, entry);
+ struct page **oldpages;
+ const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
+
+ BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+
+ /* Handle a single page request without allocation
+ of pages-to-release array */
+ if (pages == 1) {
+ spin_lock(&(pool->lock));
+ page = free_tce(tbl, entry);
+
+ if (direction != DMA_NONE)
+ ret = put_tce(tbl, entry, tce, direction);
+
+ tce_flush(tbl);
+
+ if (page)
+ put_page(page);
+
+ spin_unlock(&(pool->lock));
+ return ret;
+ }
+
+ /* Releasing multiple pages */
+ /* Allocate an array for pages to be released after TCE table
+ is updated */
+ oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!oldpages)
+ return -ENOMEM;
+
+ spin_lock(&(pool->lock));
+
+ for (i = 0; (i < pages) && !ret; ++i, ++entry, tce += IOMMU_PAGE_SIZE) {
+ page = free_tce(tbl, entry);
+ if (page) {
+ oldpages[pages_to_put] = page;
+ ++pages_to_put;
+ }
+
+ if (direction != DMA_NONE)
+ ret = put_tce(tbl, entry, tce, direction);
+
+ /* Release old pages if we reached the end of oldpages[] or
+ it is the last page or we are about to exit the loop */
+ if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret) {
+ tce_flush(tbl);
+
+ /* Release pages after removing them from TCE table */
+ while (pages_to_put) {
+ --pages_to_put;
+ put_page(oldpages[pages_to_put]);
+ }
+ }
+ }
+
+ spin_unlock(&(pool->lock));
+ kfree(oldpages);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..676f4d9 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -8,30 +8,31 @@
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/

#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/delay.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/irq.h>
#include <linux/io.h>
#include <linux/msi.h>
+#include <linux/iommu.h>

#include <asm/sections.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/ppc-pci.h>
#include <asm/opal.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/abs_addr.h>
#include <asm/firmware.h>

#include "powernv.h"
#include "pci.h"
@@ -601,15 +602,149 @@ void __init pnv_pci_init(void)
/* Configure IOMMU DMA hooks */
ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
ppc_md.tce_build = pnv_tce_build;
ppc_md.tce_free = pnv_tce_free;
ppc_md.tce_get = pnv_tce_get;
ppc_md.pci_probe_mode = pnv_pci_probe_mode;
set_pci_dma_ops(&dma_iommu_ops);

/* Configure MSIs */
#ifdef CONFIG_PCI_MSI
ppc_md.msi_check_device = pnv_msi_check_device;
ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
#endif
}
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+ struct iommu_table *tbl;
+ int ret = 0;
+
+ if (WARN_ON(dev->iommu_group)) {
+ printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+ dev->kobj.name,
+ iommu_group_id(dev->iommu_group));
+ return -EBUSY;
+ }
+
+ tbl = get_iommu_table_base(dev);
+ if (!tbl) {
+ pr_debug("tce_vfio: skipping device %s with no tbl\n",
+ dev->kobj.name);
+ return 0;
+ }
+
+ pr_debug("tce_vfio: adding %s to iommu group %d\n",
+ dev->kobj.name, iommu_group_id(tbl->it_group));
+
+ ret = iommu_group_add_device(tbl->it_group, dev);
+ if (ret < 0)
+ printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+ dev->kobj.name, ret);
+
+ return ret;
+}
+
+static void del_device(struct device *dev)
+{
+ iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+
+ switch (action) {
+ case BUS_NOTIFY_ADD_DEVICE:
+ return add_device(dev);
+ case BUS_NOTIFY_DEL_DEVICE:
+ del_device(dev);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+ .notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+ struct iommu_table *tbl = iommu_data;
+ tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iommu_table *tbl;
+ struct iommu_group *grp;
+
+ bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+ /* Allocate and initialize IOMMU groups */
+ for_each_pci_dev(pdev) {
+ tbl = get_iommu_table_base(&pdev->dev);
+ if (!tbl)
+ continue;
+
+ /* Skip already initialized */
+ if (tbl->it_group)
+ continue;
+
+ grp = iommu_group_alloc();
+ if (IS_ERR(grp)) {
+ printk(KERN_INFO "tce_vfio: cannot create "
+ "new IOMMU group, ret=%ld\n",
+ PTR_ERR(grp));
+ return PTR_ERR(grp);
+ }
+ tbl->it_group = grp;
+ iommu_group_set_iommudata(grp, tbl, group_release);
+ }
+
+ /* Add PCI devices to VFIO groups */
+ for_each_pci_dev(pdev)
+ add_device(&pdev->dev);
+
+ return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iommu_table *tbl;
+ struct iommu_group *grp = NULL;
+
+ bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+ /* Delete PCI devices from VFIO groups */
+ for_each_pci_dev(pdev)
+ del_device(&pdev->dev);
+
+ /* Release VFIO groups */
+ for_each_pci_dev(pdev) {
+ tbl = get_iommu_table_base(&pdev->dev);
+ if (!tbl)
+ continue;
+ grp = tbl->it_group;
+
+ /* Skip (already) uninitialized */
+ if (!grp)
+ continue;
+
+ /* Do actual release, group_release() is expected to work */
+ iommu_group_put(grp);
+ BUG_ON(tbl->it_group);
+ }
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -175,16 +175,24 @@ config EXYNOS_IOMMU
processor family. This enables H/W multimedia accellerators to see
non-linear physical memory chunks as a linear memory in their
address spaces

If unsure, say N here.

config EXYNOS_IOMMU_DEBUG
bool "Debugging log for Exynos IOMMU"
depends on EXYNOS_IOMMU
help
Select this to see the detailed log message that shows what
happens in the IOMMU driver

Say N unless you need kernel log message for IOMMU debugging

+config SPAPR_TCE_IOMMU
+ bool "sPAPR TCE IOMMU Support"
+ depends on PPC_POWERNV
+ select IOMMU_API
+ help
+ Enables bits of IOMMU API required by VFIO. The iommu_ops is
+ still not implemented.
+
endif # IOMMU_SUPPORT
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -1,16 +1,22 @@
config VFIO_IOMMU_TYPE1
tristate
depends on VFIO
default n

+config VFIO_IOMMU_SPAPR_TCE
+ tristate
+ depends on VFIO && SPAPR_TCE_IOMMU
+ default n
+
menuconfig VFIO
tristate "VFIO Non-Privileged userspace driver framework"
depends on IOMMU_API
select VFIO_IOMMU_TYPE1 if X86
+ select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
help
VFIO provides a framework for secure userspace device drivers.
See Documentation/vfio.txt for more details.

If you don't know what to do here, say N.

source "drivers/vfio/pci/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
obj-$(CONFIG_VFIO) += vfio.o
obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..ac72c74d
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,247 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Author: Alexey Kardashevskiy <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
+ * Author: Alex Williamson <[email protected]>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION "0.1"
+#define DRIVER_AUTHOR "[email protected]"
+#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+ struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+ struct mutex lock;
+ struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+ struct tce_container *container;
+
+ if (arg != VFIO_SPAPR_TCE_IOMMU) {
+ printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ container = kzalloc(sizeof(*container), GFP_KERNEL);
+ if (!container)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&container->lock);
+
+ return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+ struct tce_container *container = iommu_data;
+
+ WARN_ON(container->tbl && !container->tbl->it_group);
+ if (container->tbl && container->tbl->it_group)
+ tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+ mutex_destroy(&container->lock);
+
+ kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+ unsigned int cmd, unsigned long arg)
+{
+ struct tce_container *container = iommu_data;
+ unsigned long minsz;
+
+ switch (cmd) {
+ case VFIO_CHECK_EXTENSION: {
+ return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+ }
+ case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+ struct vfio_iommu_spapr_tce_info info;
+ struct iommu_table *tbl = container->tbl;
+
+ if (WARN_ON(!tbl))
+ return -ENXIO;
+
+ minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+ dma64_window_size);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+ info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+ info.dma64_window_start = 0;
+ info.dma64_window_size = 0;
+ info.flags = 0;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+ case VFIO_IOMMU_MAP_DMA: {
+ vfio_iommu_spapr_tce_dma_map par;
+ struct iommu_table *tbl = container->tbl;
+ enum dma_data_direction direction = DMA_NONE;
+
+ if (WARN_ON(!tbl))
+ return -ENXIO;
+
+ minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
+
+ if (copy_from_user(&par, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (par.argsz < minsz)
+ return -EINVAL;
+
+ if ((par.flags & VFIO_DMA_MAP_FLAG_READ) &&
+ (par.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
+ direction = DMA_BIDIRECTIONAL;
+ } else if (par.flags & VFIO_DMA_MAP_FLAG_READ) {
+ direction = DMA_TO_DEVICE;
+ } else if (par.flags & VFIO_DMA_MAP_FLAG_WRITE) {
+ direction = DMA_FROM_DEVICE;
+ }
+
+ par.size += par.iova & ~IOMMU_PAGE_MASK;
+ par.size = _ALIGN_UP(par.size, IOMMU_PAGE_SIZE);
+
+ return iommu_put_tces(tbl, par.iova >> IOMMU_PAGE_SHIFT,
+ par.vaddr & IOMMU_PAGE_MASK, direction,
+ par.size >> IOMMU_PAGE_SHIFT);
+ }
+ case VFIO_IOMMU_UNMAP_DMA: {
+ vfio_iommu_spapr_tce_dma_unmap par;
+ struct iommu_table *tbl = container->tbl;
+
+ if (WARN_ON(!tbl))
+ return -ENXIO;
+
+ minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
+
+ if (copy_from_user(&par, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (par.argsz < minsz)
+ return -EINVAL;
+
+ par.size += par.iova & ~IOMMU_PAGE_MASK;
+ par.size = _ALIGN_UP(par.size, IOMMU_PAGE_SIZE);
+
+ return iommu_put_tces(tbl, par.iova >> IOMMU_PAGE_SHIFT,
+ 0, DMA_NONE, par.size >> IOMMU_PAGE_SHIFT);
+ }
+ default:
+ printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
+ }
+
+ return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+ struct iommu_group *iommu_group)
+{
+ struct tce_container *container = iommu_data;
+ struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+ BUG_ON(!tbl);
+ mutex_lock(&container->lock);
+ pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+ iommu_group_id(iommu_group), iommu_group);
+ if (container->tbl) {
+ printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+ iommu_group_id(container->tbl->it_group),
+ iommu_group_id(iommu_group));
+ mutex_unlock(&container->lock);
+ return -EBUSY;
+ }
+
+ container->tbl = tbl;
+ mutex_unlock(&container->lock);
+
+ return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+ struct iommu_group *iommu_group)
+{
+ struct tce_container *container = iommu_data;
+ struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+ BUG_ON(!tbl);
+ mutex_lock(&container->lock);
+ if (tbl != container->tbl) {
+ printk(KERN_WARNING "tce_vfio: detaching group #%u, expected group is #%u\n",
+ iommu_group_id(iommu_group),
+ iommu_group_id(tbl->it_group));
+ } else {
+
+ pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+ iommu_group_id(iommu_group), iommu_group);
+
+ iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);
+ container->tbl = NULL;
+ }
+ mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+ .name = "iommu-vfio-powerpc",
+ .owner = THIS_MODULE,
+ .open = tce_iommu_open,
+ .release = tce_iommu_release,
+ .ioctl = tce_iommu_ioctl,
+ .attach_group = tce_iommu_attach_group,
+ .detach_group = tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+ return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+ vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..3ecd65c 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -87,30 +87,31 @@ extern void vfio_unregister_iommu_driver(
* Simple helper macro for dealing with variable sized structures passed
* from user space. This allows us to easily determine if the provided
* structure is sized to include various fields.
*/
#define offsetofend(TYPE, MEMBER) ({ \
TYPE tmp; \
offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); }) \

#endif /* __KERNEL__ */

/* Kernel & User level defines for VFIO IOCTLs. */

/* Extensions */

#define VFIO_TYPE1_IOMMU 1
+#define VFIO_SPAPR_TCE_IOMMU 2

/*
* The IOCTL interface is designed for extensibility by embedding the
* structure length (argsz) and flags into structures passed between
* kernel and userspace. We therefore use the _IO() macro for these
* defines to avoid implicitly embedding a size into the ioctl request.
* As structure fields are added, argsz will increase to match and flag
* bits will be defined to indicate additional fields with valid data.
* It's *always* the caller's responsibility to indicate the size of
* the structure passed by setting argsz appropriately.
*/

#define VFIO_TYPE (';')
#define VFIO_BASE 100

@@ -430,16 +431,35 @@ struct vfio_iommu_type1_dma_map {
/**
* VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap)
*
* Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
* Caller sets argsz.
*/
struct vfio_iommu_type1_dma_unmap {
__u32 argsz;
__u32 flags;
__u64 iova; /* IO virtual address */
__u64 size; /* Size of mapping (bytes) */
};

#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)

+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+struct vfio_iommu_spapr_tce_info {
+ __u32 argsz;
+ __u32 flags;
+ __u32 dma32_window_start;
+ __u32 dma32_window_size;
+ __u64 dma64_window_start;
+ __u64 dma64_window_size;
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* Reuse type1 map/unmap structs as they are the same at the moment */
+typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
+typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
+
+/* ***************************************************************** */
+
#endif /* VFIO_H */
--
1.7.10.4


2012-11-20 18:20:00

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv platform

On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> VFIO implements platform independent stuff such as
> a PCI driver, BAR access (via read/write on a file descriptor
> or direct mapping when possible) and IRQ signaling.
> The platform dependent part includes IOMMU initialization
> and handling.
>
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan, only POWERNV
> platform is supported at the moment.
>
> Also the patch implements an VFIO-IOMMU driver which
> manages DMA mapping/unmapping requests coming from
> the client (now QEMU). It also returns a DMA window
> information to let the guest initialize the device tree
> for a guest OS properly. Although this driver has been
> tested only on POWERNV, it should work on any platform
> supporting TCE tables.
>
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option.
>
> Cc: David Gibson <[email protected]>
> Signed-off-by: Alexey Kardashevskiy <[email protected]>
> ---
> arch/powerpc/include/asm/iommu.h | 6 +
> arch/powerpc/kernel/iommu.c | 140 +++++++++++++++++++
> arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++
> drivers/iommu/Kconfig | 8 ++
> drivers/vfio/Kconfig | 6 +
> drivers/vfio/Makefile | 1 +
> drivers/vfio/vfio_iommu_spapr_tce.c | 247 ++++++++++++++++++++++++++++++++++
> include/linux/vfio.h | 20 +++
> 8 files changed, 563 insertions(+)
> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..5ba66cb 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -64,30 +64,33 @@ struct iommu_pool {
> } ____cacheline_aligned_in_smp;
>
> struct iommu_table {
> unsigned long it_busno; /* Bus number this table belongs to */
> unsigned long it_size; /* Size of iommu table in entries */
> unsigned long it_offset; /* Offset into global table */
> unsigned long it_base; /* mapped address of tce table */
> unsigned long it_index; /* which iommu table this is */
> unsigned long it_type; /* type: PCI or Virtual Bus */
> unsigned long it_blocksize; /* Entries in each block (cacheline) */
> unsigned long poolsize;
> unsigned long nr_pools;
> struct iommu_pool large_pool;
> struct iommu_pool pools[IOMMU_NR_POOLS];
> unsigned long *it_map; /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> + struct iommu_group *it_group;
> +#endif
> };
>
> struct scatterlist;
>
> static inline void set_iommu_table_base(struct device *dev, void *base)
> {
> dev->archdata.dma_data.iommu_table_base = base;
> }
>
> static inline void *get_iommu_table_base(struct device *dev)
> {
> return dev->archdata.dma_data.iommu_table_base;
> }
>
> /* Frees table for an individual device node */
> @@ -135,17 +138,20 @@ static inline void pci_iommu_init(void) { }
> extern void alloc_dart_table(void);
> #if defined(CONFIG_PPC64) && defined(CONFIG_PM)
> static inline void iommu_save(void)
> {
> if (ppc_md.iommu_save)
> ppc_md.iommu_save();
> }
>
> static inline void iommu_restore(void)
> {
> if (ppc_md.iommu_restore)
> ppc_md.iommu_restore();
> }
> #endif
>
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
> + enum dma_data_direction direction, unsigned long pages);
> +
> #endif /* __KERNEL__ */
> #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..94f614b 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -32,30 +32,31 @@
> #include <linux/dma-mapping.h>
> #include <linux/bitmap.h>
> #include <linux/iommu-helper.h>
> #include <linux/crash_dump.h>
> #include <linux/hash.h>
> #include <linux/fault-inject.h>
> #include <linux/pci.h>
> #include <asm/io.h>
> #include <asm/prom.h>
> #include <asm/iommu.h>
> #include <asm/pci-bridge.h>
> #include <asm/machdep.h>
> #include <asm/kdump.h>
> #include <asm/fadump.h>
> #include <asm/vio.h>
> +#include <asm/tce.h>
>
> #define DBG(...)
>
> static int novmerge;
>
> static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
>
> static int __init setup_iommu(char *str)
> {
> if (!strcmp(str, "novmerge"))
> novmerge = 1;
> else if (!strcmp(str, "vmerge"))
> novmerge = 0;
> return 1;
> }
> @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
> }
>
> void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> void *vaddr, dma_addr_t dma_handle)
> {
> if (tbl) {
> unsigned int nio_pages;
>
> size = PAGE_ALIGN(size);
> nio_pages = size >> IOMMU_PAGE_SHIFT;
> iommu_free(tbl, dma_handle, nio_pages);
> size = PAGE_ALIGN(size);
> free_pages((unsigned long)vaddr, get_order(size));
> }
> }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +static struct page *free_tce(struct iommu_table *tbl, unsigned long entry)
> +{
> + struct page *page = NULL;

NULL initialization doesn't appear to be necessary

> + unsigned long oldtce;
> +
> + oldtce = ppc_md.tce_get(tbl, entry);
> +
> + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> + return NULL;
> +
> + page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> + WARN_ON(!page);
> + if (page && (oldtce & TCE_PCI_WRITE))
> + SetPageDirty(page);
> + ppc_md.tce_free(tbl, entry, 1);
> +
> + return page;
> +}
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> + uint64_t tce, enum dma_data_direction direction)
> +{
> + int ret;
> + struct page *page = NULL;
> + unsigned long kva, offset;
> +
> + /* Map new TCE */
> + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> + direction != DMA_TO_DEVICE, &page);
> + if (ret < 1) {
> + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> + tce, entry << IOMMU_PAGE_SHIFT, ret);
> + if (!ret)
> + ret = -EFAULT;

Missing return ret? Otherwise we've got some bogus uses of page below
and we're setting ret for no reason here.

> + }
> +
> + kva = (unsigned long) page_address(page);
> + kva += offset;
> +
> + /* tce_build receives a virtual address */
> + entry += tbl->it_offset; /* Offset into real TCE table */
> + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> + /* tce_build() only returns non-zero for transient errors */
> + if (unlikely(ret)) {
> + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> + tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> + put_page(page);
> + return -EIO;
> + }
> +
> + return 0;
> +}
> +
> +static void tce_flush(struct iommu_table *tbl)
> +{
> + /* Flush/invalidate TLB caches if necessary */
> + if (ppc_md.tce_flush)
> + ppc_md.tce_flush(tbl);
> +
> + /* Make sure updates are seen by hardware */
> + mb();
> +}
> +
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
> + enum dma_data_direction direction, unsigned long pages)
> +{
> + int i, ret = 0, pages_to_put = 0;
> + struct page *page;
> + struct iommu_pool *pool = get_pool(tbl, entry);
> + struct page **oldpages;
> + const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> +
> + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +
> + /* Handle a single page request without allocation
> + of pages-to-release array */
> + if (pages == 1) {
> + spin_lock(&(pool->lock));
> + page = free_tce(tbl, entry);
> +
> + if (direction != DMA_NONE)
> + ret = put_tce(tbl, entry, tce, direction);
> +
> + tce_flush(tbl);
> +
> + if (page)
> + put_page(page);
> +
> + spin_unlock(&(pool->lock));
> + return ret;
> + }
> +
> + /* Releasing multiple pages */
> + /* Allocate an array for pages to be released after TCE table
> + is updated */
> + oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> + if (!oldpages)
> + return -ENOMEM;
> +
> + spin_lock(&(pool->lock));
> +
> + for (i = 0; (i < pages) && !ret; ++i, ++entry, tce += IOMMU_PAGE_SIZE) {
> + page = free_tce(tbl, entry);
> + if (page) {
> + oldpages[pages_to_put] = page;
> + ++pages_to_put;
> + }
> +
> + if (direction != DMA_NONE)
> + ret = put_tce(tbl, entry, tce, direction);
> +
> + /* Release old pages if we reached the end of oldpages[] or
> + it is the last page or we are about to exit the loop */
> + if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret) {
> + tce_flush(tbl);

Avoiding tce_flush() is the reason for all this extra overhead, right?
I wonder if it'd be cleaner separating map vs unmap, where the map case
can avoid the oldpages array... but that means inserting new mappings on
top of old ones wouldn't put the pages.

> +
> + /* Release pages after removing them from TCE table */
> + while (pages_to_put) {
> + --pages_to_put;
> + put_page(oldpages[pages_to_put]);
> + }
> + }
> + }
> +
> + spin_unlock(&(pool->lock));
> + kfree(oldpages);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..676f4d9 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -8,30 +8,31 @@
> * This program is free software; you can redistribute it and/or
> * modify it under the terms of the GNU General Public License
> * as published by the Free Software Foundation; either version
> * 2 of the License, or (at your option) any later version.
> */
>
> #include <linux/kernel.h>
> #include <linux/pci.h>
> #include <linux/delay.h>
> #include <linux/string.h>
> #include <linux/init.h>
> #include <linux/bootmem.h>
> #include <linux/irq.h>
> #include <linux/io.h>
> #include <linux/msi.h>
> +#include <linux/iommu.h>
>
> #include <asm/sections.h>
> #include <asm/io.h>
> #include <asm/prom.h>
> #include <asm/pci-bridge.h>
> #include <asm/machdep.h>
> #include <asm/ppc-pci.h>
> #include <asm/opal.h>
> #include <asm/iommu.h>
> #include <asm/tce.h>
> #include <asm/abs_addr.h>
> #include <asm/firmware.h>
>
> #include "powernv.h"
> #include "pci.h"
> @@ -601,15 +602,149 @@ void __init pnv_pci_init(void)
> /* Configure IOMMU DMA hooks */
> ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
> ppc_md.tce_build = pnv_tce_build;
> ppc_md.tce_free = pnv_tce_free;
> ppc_md.tce_get = pnv_tce_get;
> ppc_md.pci_probe_mode = pnv_pci_probe_mode;
> set_pci_dma_ops(&dma_iommu_ops);
>
> /* Configure MSIs */
> #ifdef CONFIG_PCI_MSI
> ppc_md.msi_check_device = pnv_msi_check_device;
> ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
> ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
> #endif
> }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> + struct iommu_table *tbl;
> + int ret = 0;
> +
> + if (WARN_ON(dev->iommu_group)) {
> + printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> + dev->kobj.name,

dev_name(dev)

> + iommu_group_id(dev->iommu_group));
> + return -EBUSY;
> + }
> +
> + tbl = get_iommu_table_base(dev);
> + if (!tbl) {
> + pr_debug("tce_vfio: skipping device %s with no tbl\n",
> + dev->kobj.name);
> + return 0;
> + }
> +
> + pr_debug("tce_vfio: adding %s to iommu group %d\n",
> + dev->kobj.name, iommu_group_id(tbl->it_group));
> +
> + ret = iommu_group_add_device(tbl->it_group, dev);
> + if (ret < 0)
> + printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> + dev->kobj.name, ret);
> +
> + return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> + iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> + unsigned long action, void *data)
> +{
> + struct device *dev = data;
> +
> + switch (action) {
> + case BUS_NOTIFY_ADD_DEVICE:
> + return add_device(dev);
> + case BUS_NOTIFY_DEL_DEVICE:
> + del_device(dev);
> + return 0;
> + default:
> + return 0;
> + }
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> + .notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> + struct iommu_table *tbl = iommu_data;
> + tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> + struct pci_dev *pdev = NULL;
> + struct iommu_table *tbl;
> + struct iommu_group *grp;
> +
> + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);

There's already a notifier in the iommu code if you were to register an
iommu_ops with the add/remove_device entries. That would allow you to
remove the notifier block and notifier function below and the second
loop below. Are you avoiding that to avoid the rest of iommu_ops?

Also, shouldn't this notifier only be registered after the first loop
below? Otherwise ADD_DEVICE could race with setting up groups, which we
assume are present in the add_device() above.

> +
> + /* Allocate and initialize IOMMU groups */
> + for_each_pci_dev(pdev) {
> + tbl = get_iommu_table_base(&pdev->dev);
> + if (!tbl)
> + continue;
> +
> + /* Skip already initialized */
> + if (tbl->it_group)
> + continue;
> +
> + grp = iommu_group_alloc();
> + if (IS_ERR(grp)) {
> + printk(KERN_INFO "tce_vfio: cannot create "
> + "new IOMMU group, ret=%ld\n",
> + PTR_ERR(grp));
> + return PTR_ERR(grp);
> + }
> + tbl->it_group = grp;
> + iommu_group_set_iommudata(grp, tbl, group_release);
> + }
> +
> + /* Add PCI devices to VFIO groups */
> + for_each_pci_dev(pdev)
> + add_device(&pdev->dev);
> +
> + return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> + struct pci_dev *pdev = NULL;
> + struct iommu_table *tbl;
> + struct iommu_group *grp = NULL;
> +
> + bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> + /* Delete PCI devices from VFIO groups */
> + for_each_pci_dev(pdev)
> + del_device(&pdev->dev);
> +
> + /* Release VFIO groups */
> + for_each_pci_dev(pdev) {
> + tbl = get_iommu_table_base(&pdev->dev);
> + if (!tbl)
> + continue;
> + grp = tbl->it_group;
> +
> + /* Skip (already) uninitialized */
> + if (!grp)
> + continue;
> +
> + /* Do actual release, group_release() is expected to work */
> + iommu_group_put(grp);
> + BUG_ON(tbl->it_group);
> + }
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -175,16 +175,24 @@ config EXYNOS_IOMMU
> processor family. This enables H/W multimedia accellerators to see
> non-linear physical memory chunks as a linear memory in their
> address spaces
>
> If unsure, say N here.
>
> config EXYNOS_IOMMU_DEBUG
> bool "Debugging log for Exynos IOMMU"
> depends on EXYNOS_IOMMU
> help
> Select this to see the detailed log message that shows what
> happens in the IOMMU driver
>
> Say N unless you need kernel log message for IOMMU debugging
>
> +config SPAPR_TCE_IOMMU
> + bool "sPAPR TCE IOMMU Support"
> + depends on PPC_POWERNV
> + select IOMMU_API
> + help
> + Enables bits of IOMMU API required by VFIO. The iommu_ops is
> + still not implemented.
> +
> endif # IOMMU_SUPPORT

How are you planning to split this up among maintainers? A powerpc
patch, an iommu kconfig patch, then the vfio changes below for me?

> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 7cd5dec..b464687 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -1,16 +1,22 @@
> config VFIO_IOMMU_TYPE1
> tristate
> depends on VFIO
> default n
>
> +config VFIO_IOMMU_SPAPR_TCE
> + tristate
> + depends on VFIO && SPAPR_TCE_IOMMU
> + default n
> +
> menuconfig VFIO
> tristate "VFIO Non-Privileged userspace driver framework"
> depends on IOMMU_API
> select VFIO_IOMMU_TYPE1 if X86
> + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> help
> VFIO provides a framework for secure userspace device drivers.
> See Documentation/vfio.txt for more details.
>
> If you don't know what to do here, say N.
>
> source "drivers/vfio/pci/Kconfig"
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 2398d4a..72bfabc 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,3 +1,4 @@
> obj-$(CONFIG_VFIO) += vfio.o
> obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> new file mode 100644
> index 0000000..ac72c74d
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -0,0 +1,247 @@
> +/*
> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> + *
> + * Copyright (C) 2012 IBM Corp. All rights reserved.
> + * Author: Alexey Kardashevskiy <[email protected]>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio_iommu_type1.c:
> + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
> + * Author: Alex Williamson <[email protected]>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/err.h>
> +#include <linux/vfio.h>
> +#include <asm/iommu.h>
> +
> +#define DRIVER_VERSION "0.1"
> +#define DRIVER_AUTHOR "[email protected]"
> +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> + struct iommu_group *iommu_group);
> +
> +/*
> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> + */
> +
> +/*
> + * The container descriptor supports only a single group per container.
> + * Required by the API as the container is not supplied with the IOMMU group
> + * at the moment of initialization.
> + */
> +struct tce_container {
> + struct mutex lock;
> + struct iommu_table *tbl;
> +};
> +
> +static void *tce_iommu_open(unsigned long arg)
> +{
> + struct tce_container *container;
> +
> + if (arg != VFIO_SPAPR_TCE_IOMMU) {
> + printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> + return ERR_PTR(-EINVAL);
> + }
> +
> + container = kzalloc(sizeof(*container), GFP_KERNEL);
> + if (!container)
> + return ERR_PTR(-ENOMEM);
> +
> + mutex_init(&container->lock);
> +
> + return container;
> +}
> +
> +static void tce_iommu_release(void *iommu_data)
> +{
> + struct tce_container *container = iommu_data;
> +
> + WARN_ON(container->tbl && !container->tbl->it_group);
> + if (container->tbl && container->tbl->it_group)
> + tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> +
> + mutex_destroy(&container->lock);
> +
> + kfree(container);
> +}
> +
> +static long tce_iommu_ioctl(void *iommu_data,
> + unsigned int cmd, unsigned long arg)
> +{
> + struct tce_container *container = iommu_data;
> + unsigned long minsz;
> +
> + switch (cmd) {
> + case VFIO_CHECK_EXTENSION: {
> + return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> + }
> + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> + struct vfio_iommu_spapr_tce_info info;
> + struct iommu_table *tbl = container->tbl;
> +
> + if (WARN_ON(!tbl))
> + return -ENXIO;
> +
> + minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> + dma64_window_size);
> +
> + if (copy_from_user(&info, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (info.argsz < minsz)
> + return -EINVAL;
> +
> + info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> + info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> + info.dma64_window_start = 0;
> + info.dma64_window_size = 0;
> + info.flags = 0;
> +
> + if (copy_to_user((void __user *)arg, &info, minsz))
> + return -EFAULT;
> +
> + return 0;
> + }
> + case VFIO_IOMMU_MAP_DMA: {
> + vfio_iommu_spapr_tce_dma_map par;

What does "par" stand for?

> + struct iommu_table *tbl = container->tbl;
> + enum dma_data_direction direction = DMA_NONE;
> +
> + if (WARN_ON(!tbl))
> + return -ENXIO;
> +
> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> +
> + if (copy_from_user(&par, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (par.argsz < minsz)
> + return -EINVAL;
> +
> + if ((par.flags & VFIO_DMA_MAP_FLAG_READ) &&
> + (par.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
> + direction = DMA_BIDIRECTIONAL;
> + } else if (par.flags & VFIO_DMA_MAP_FLAG_READ) {
> + direction = DMA_TO_DEVICE;
> + } else if (par.flags & VFIO_DMA_MAP_FLAG_WRITE) {
> + direction = DMA_FROM_DEVICE;
> + }
> +
> + par.size += par.iova & ~IOMMU_PAGE_MASK;
> + par.size = _ALIGN_UP(par.size, IOMMU_PAGE_SIZE);
> +
> + return iommu_put_tces(tbl, par.iova >> IOMMU_PAGE_SHIFT,
> + par.vaddr & IOMMU_PAGE_MASK, direction,
> + par.size >> IOMMU_PAGE_SHIFT);
> + }
> + case VFIO_IOMMU_UNMAP_DMA: {
> + vfio_iommu_spapr_tce_dma_unmap par;
> + struct iommu_table *tbl = container->tbl;
> +
> + if (WARN_ON(!tbl))
> + return -ENXIO;
> +
> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> +
> + if (copy_from_user(&par, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (par.argsz < minsz)
> + return -EINVAL;
> +
> + par.size += par.iova & ~IOMMU_PAGE_MASK;
> + par.size = _ALIGN_UP(par.size, IOMMU_PAGE_SIZE);
> +
> + return iommu_put_tces(tbl, par.iova >> IOMMU_PAGE_SHIFT,
> + 0, DMA_NONE, par.size >> IOMMU_PAGE_SHIFT);
> + }
> + default:
> + printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
> + }
> +
> + return -ENOTTY;
> +}
> +
> +static int tce_iommu_attach_group(void *iommu_data,
> + struct iommu_group *iommu_group)
> +{
> + struct tce_container *container = iommu_data;
> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> + BUG_ON(!tbl);
> + mutex_lock(&container->lock);
> + pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> + iommu_group_id(iommu_group), iommu_group);
> + if (container->tbl) {
> + printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> + iommu_group_id(container->tbl->it_group),
> + iommu_group_id(iommu_group));
> + mutex_unlock(&container->lock);
> + return -EBUSY;
> + }
> +
> + container->tbl = tbl;
> + mutex_unlock(&container->lock);
> +
> + return 0;
> +}
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> + struct iommu_group *iommu_group)
> +{
> + struct tce_container *container = iommu_data;
> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> + BUG_ON(!tbl);
> + mutex_lock(&container->lock);
> + if (tbl != container->tbl) {
> + printk(KERN_WARNING "tce_vfio: detaching group #%u, expected group is #%u\n",
> + iommu_group_id(iommu_group),
> + iommu_group_id(tbl->it_group));
> + } else {
> +
> + pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
> + iommu_group_id(iommu_group), iommu_group);
> +
> + iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);
> + container->tbl = NULL;
> + }
> + mutex_unlock(&container->lock);
> +}
> +
> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
> + .name = "iommu-vfio-powerpc",
> + .owner = THIS_MODULE,
> + .open = tce_iommu_open,
> + .release = tce_iommu_release,
> + .ioctl = tce_iommu_ioctl,
> + .attach_group = tce_iommu_attach_group,
> + .detach_group = tce_iommu_detach_group,
> +};
> +
> +static int __init tce_iommu_init(void)
> +{
> + return vfio_register_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> + vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> +
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0a4f180..3ecd65c 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -87,30 +87,31 @@ extern void vfio_unregister_iommu_driver(
> * Simple helper macro for dealing with variable sized structures passed
> * from user space. This allows us to easily determine if the provided
> * structure is sized to include various fields.
> */
> #define offsetofend(TYPE, MEMBER) ({ \
> TYPE tmp; \
> offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); }) \
>
> #endif /* __KERNEL__ */
>
> /* Kernel & User level defines for VFIO IOCTLs. */
>
> /* Extensions */
>
> #define VFIO_TYPE1_IOMMU 1
> +#define VFIO_SPAPR_TCE_IOMMU 2
>
> /*
> * The IOCTL interface is designed for extensibility by embedding the
> * structure length (argsz) and flags into structures passed between
> * kernel and userspace. We therefore use the _IO() macro for these
> * defines to avoid implicitly embedding a size into the ioctl request.
> * As structure fields are added, argsz will increase to match and flag
> * bits will be defined to indicate additional fields with valid data.
> * It's *always* the caller's responsibility to indicate the size of
> * the structure passed by setting argsz appropriately.
> */
>
> #define VFIO_TYPE (';')
> #define VFIO_BASE 100
>
> @@ -430,16 +431,35 @@ struct vfio_iommu_type1_dma_map {
> /**
> * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap)
> *
> * Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
> * Caller sets argsz.
> */
> struct vfio_iommu_type1_dma_unmap {
> __u32 argsz;
> __u32 flags;
> __u64 iova; /* IO virtual address */
> __u64 size; /* Size of mapping (bytes) */
> };
>
> #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>
> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> +
> +struct vfio_iommu_spapr_tce_info {
> + __u32 argsz;
> + __u32 flags;
> + __u32 dma32_window_start;
> + __u32 dma32_window_size;
> + __u64 dma64_window_start;
> + __u64 dma64_window_size;
> +};
> +
> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +/* Reuse type1 map/unmap structs as they are the same at the moment */
> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
> +
> +/* ***************************************************************** */
> +
> #endif /* VFIO_H */

I'm glad you were able to reuse these, after this gets merged we can
rename the structure to something more common and typedef for both type1
and spapr_tce so we don't forget it's shared. Thanks,

Alex

2012-11-22 18:28:24

by Sethi Varun-B16395

[permalink] [raw]
Subject: RE: [PATCH] vfio powerpc: enabled and supported on powernv platform



> -----Original Message-----
> From: [email protected] [mailto:linux-kernel-
> [email protected]] On Behalf Of Alex Williamson
> Sent: Tuesday, November 20, 2012 11:50 PM
> To: Alexey Kardashevskiy
> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
> [email protected]; [email protected]; [email protected];
> David Gibson
> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
> platform
>
> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> > VFIO implements platform independent stuff such as a PCI driver, BAR
> > access (via read/write on a file descriptor or direct mapping when
> > possible) and IRQ signaling.
> > The platform dependent part includes IOMMU initialization and
> > handling.
> >
> > This patch initializes IOMMU groups based on the IOMMU configuration
> > discovered during the PCI scan, only POWERNV platform is supported at
> > the moment.
> >
> > Also the patch implements an VFIO-IOMMU driver which manages DMA
> > mapping/unmapping requests coming from the client (now QEMU). It also
> > returns a DMA window information to let the guest initialize the
> > device tree for a guest OS properly. Although this driver has been
> > tested only on POWERNV, it should work on any platform supporting TCE
> > tables.
> >
> > To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
> >
> > Cc: David Gibson <[email protected]>
> > Signed-off-by: Alexey Kardashevskiy <[email protected]>
> > ---
> > arch/powerpc/include/asm/iommu.h | 6 +
> > arch/powerpc/kernel/iommu.c | 140 +++++++++++++++++++
> > arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++
> > drivers/iommu/Kconfig | 8 ++
> > drivers/vfio/Kconfig | 6 +
> > drivers/vfio/Makefile | 1 +
> > drivers/vfio/vfio_iommu_spapr_tce.c | 247
> ++++++++++++++++++++++++++++++++++
> > include/linux/vfio.h | 20 +++
> > 8 files changed, 563 insertions(+)
> > create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >
> > diff --git a/arch/powerpc/include/asm/iommu.h
> > b/arch/powerpc/include/asm/iommu.h
> > index cbfe678..5ba66cb 100644
> > --- a/arch/powerpc/include/asm/iommu.h
> > +++ b/arch/powerpc/include/asm/iommu.h
> > @@ -64,30 +64,33 @@ struct iommu_pool { }
> > ____cacheline_aligned_in_smp;
> >
> > struct iommu_table {
> > unsigned long it_busno; /* Bus number this table belongs to */
> > unsigned long it_size; /* Size of iommu table in entries */
> > unsigned long it_offset; /* Offset into global table */
> > unsigned long it_base; /* mapped address of tce table */
> > unsigned long it_index; /* which iommu table this is */
> > unsigned long it_type; /* type: PCI or Virtual Bus */
> > unsigned long it_blocksize; /* Entries in each block (cacheline)
> */
> > unsigned long poolsize;
> > unsigned long nr_pools;
> > struct iommu_pool large_pool;
> > struct iommu_pool pools[IOMMU_NR_POOLS];
> > unsigned long *it_map; /* A simple allocation bitmap for now
> */
> > +#ifdef CONFIG_IOMMU_API
> > + struct iommu_group *it_group;
> > +#endif
> > };
> >
> > struct scatterlist;
> >
> > static inline void set_iommu_table_base(struct device *dev, void
> > *base) {
> > dev->archdata.dma_data.iommu_table_base = base; }
> >
> > static inline void *get_iommu_table_base(struct device *dev) {
> > return dev->archdata.dma_data.iommu_table_base;
> > }
> >
> > /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
> > static inline void pci_iommu_init(void) { } extern void
> > alloc_dart_table(void); #if defined(CONFIG_PPC64) &&
> > defined(CONFIG_PM) static inline void iommu_save(void) {
> > if (ppc_md.iommu_save)
> > ppc_md.iommu_save();
> > }
> >
> > static inline void iommu_restore(void) {
> > if (ppc_md.iommu_restore)
> > ppc_md.iommu_restore();
> > }
> > #endif
> >
> > +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
> entry, uint64_t tce,
> > + enum dma_data_direction direction, unsigned long pages);
> > +
> > #endif /* __KERNEL__ */
> > #endif /* _ASM_IOMMU_H */
> > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > index ff5a6ce..94f614b 100644
> > --- a/arch/powerpc/kernel/iommu.c
> > +++ b/arch/powerpc/kernel/iommu.c
> > @@ -32,30 +32,31 @@
> > #include <linux/dma-mapping.h>
> > #include <linux/bitmap.h>
> > #include <linux/iommu-helper.h>
> > #include <linux/crash_dump.h>
> > #include <linux/hash.h>
> > #include <linux/fault-inject.h>
> > #include <linux/pci.h>
> > #include <asm/io.h>
> > #include <asm/prom.h>
> > #include <asm/iommu.h>
> > #include <asm/pci-bridge.h>
> > #include <asm/machdep.h>
> > #include <asm/kdump.h>
> > #include <asm/fadump.h>
> > #include <asm/vio.h>
> > +#include <asm/tce.h>
> >
> > #define DBG(...)
> >
> > static int novmerge;
> >
> > static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
> > int);
> >
> > static int __init setup_iommu(char *str) {
> > if (!strcmp(str, "novmerge"))
> > novmerge = 1;
> > else if (!strcmp(str, "vmerge"))
> > novmerge = 0;
> > return 1;
> > }
> > @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
> > struct iommu_table *tbl, }
> >
> > void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> > void *vaddr, dma_addr_t dma_handle) {
> > if (tbl) {
> > unsigned int nio_pages;
> >
> > size = PAGE_ALIGN(size);
> > nio_pages = size >> IOMMU_PAGE_SHIFT;
> > iommu_free(tbl, dma_handle, nio_pages);
> > size = PAGE_ALIGN(size);
> > free_pages((unsigned long)vaddr, get_order(size));
> > }
> > }
> > +
> > +#ifdef CONFIG_IOMMU_API
> > +/*
> > + * SPAPR TCE API
> > + */
> > +static struct page *free_tce(struct iommu_table *tbl, unsigned long
> > +entry) {
> > + struct page *page = NULL;
>
> NULL initialization doesn't appear to be necessary
>
> > + unsigned long oldtce;
> > +
> > + oldtce = ppc_md.tce_get(tbl, entry);
> > +
> > + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> > + return NULL;
> > +
> > + page = pfn_to_page(oldtce >> PAGE_SHIFT);
> > +
> > + WARN_ON(!page);
> > + if (page && (oldtce & TCE_PCI_WRITE))
> > + SetPageDirty(page);
> > + ppc_md.tce_free(tbl, entry, 1);
> > +
> > + return page;
> > +}
> > +
> > +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> > + uint64_t tce, enum dma_data_direction direction) {
> > + int ret;
> > + struct page *page = NULL;
> > + unsigned long kva, offset;
> > +
> > + /* Map new TCE */
> > + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> > + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> > + direction != DMA_TO_DEVICE, &page);
> > + if (ret < 1) {
> > + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
> tce=%llx ioba=%lx ret=%d\n",
> > + tce, entry << IOMMU_PAGE_SHIFT, ret);
> > + if (!ret)
> > + ret = -EFAULT;
>
> Missing return ret? Otherwise we've got some bogus uses of page below
> and we're setting ret for no reason here.
>
> > + }
> > +
> > + kva = (unsigned long) page_address(page);
> > + kva += offset;
> > +
> > + /* tce_build receives a virtual address */
> > + entry += tbl->it_offset; /* Offset into real TCE table */
> > + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> > +
> > + /* tce_build() only returns non-zero for transient errors */
> > + if (unlikely(ret)) {
> > + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
> ioba=%lx kva=%lx ret=%d\n",
> > + tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> > + put_page(page);
> > + return -EIO;
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +static void tce_flush(struct iommu_table *tbl) {
> > + /* Flush/invalidate TLB caches if necessary */
> > + if (ppc_md.tce_flush)
> > + ppc_md.tce_flush(tbl);
> > +
> > + /* Make sure updates are seen by hardware */
> > + mb();
> > +}
> > +
> > +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> uint64_t tce,
> > + enum dma_data_direction direction, unsigned long pages) {
> > + int i, ret = 0, pages_to_put = 0;
> > + struct page *page;
> > + struct iommu_pool *pool = get_pool(tbl, entry);
> > + struct page **oldpages;
> > + const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> > +
> > + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> > +
> > + /* Handle a single page request without allocation
> > + of pages-to-release array */
> > + if (pages == 1) {
> > + spin_lock(&(pool->lock));
> > + page = free_tce(tbl, entry);
> > +
> > + if (direction != DMA_NONE)
> > + ret = put_tce(tbl, entry, tce, direction);
> > +
> > + tce_flush(tbl);
> > +
> > + if (page)
> > + put_page(page);
> > +
> > + spin_unlock(&(pool->lock));
> > + return ret;
> > + }
> > +
> > + /* Releasing multiple pages */
> > + /* Allocate an array for pages to be released after TCE table
> > + is updated */
> > + oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> > + if (!oldpages)
> > + return -ENOMEM;
> > +
> > + spin_lock(&(pool->lock));
> > +
> > + for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
> IOMMU_PAGE_SIZE) {
> > + page = free_tce(tbl, entry);
> > + if (page) {
> > + oldpages[pages_to_put] = page;
> > + ++pages_to_put;
> > + }
> > +
> > + if (direction != DMA_NONE)
> > + ret = put_tce(tbl, entry, tce, direction);
> > +
> > + /* Release old pages if we reached the end of oldpages[] or
> > + it is the last page or we are about to exit the loop */
> > + if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
> {
> > + tce_flush(tbl);
>
> Avoiding tce_flush() is the reason for all this extra overhead, right?
> I wonder if it'd be cleaner separating map vs unmap, where the map case
> can avoid the oldpages array... but that means inserting new mappings on
> top of old ones wouldn't put the pages.
>
> > +
> > + /* Release pages after removing them from TCE table */
> > + while (pages_to_put) {
> > + --pages_to_put;
> > + put_page(oldpages[pages_to_put]);
> > + }
> > + }
> > + }
> > +
> > + spin_unlock(&(pool->lock));
> > + kfree(oldpages);
> > +
> > + return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(iommu_put_tces);
> > +#endif /* CONFIG_IOMMU_API */
> > diff --git a/arch/powerpc/platforms/powernv/pci.c
> > b/arch/powerpc/platforms/powernv/pci.c
> > index 05205cf..676f4d9 100644
> > --- a/arch/powerpc/platforms/powernv/pci.c
> > +++ b/arch/powerpc/platforms/powernv/pci.c
> > @@ -8,30 +8,31 @@
> > * This program is free software; you can redistribute it and/or
> > * modify it under the terms of the GNU General Public License
> > * as published by the Free Software Foundation; either version
> > * 2 of the License, or (at your option) any later version.
> > */
> >
> > #include <linux/kernel.h>
> > #include <linux/pci.h>
> > #include <linux/delay.h>
> > #include <linux/string.h>
> > #include <linux/init.h>
> > #include <linux/bootmem.h>
> > #include <linux/irq.h>
> > #include <linux/io.h>
> > #include <linux/msi.h>
> > +#include <linux/iommu.h>
> >
> > #include <asm/sections.h>
> > #include <asm/io.h>
> > #include <asm/prom.h>
> > #include <asm/pci-bridge.h>
> > #include <asm/machdep.h>
> > #include <asm/ppc-pci.h>
> > #include <asm/opal.h>
> > #include <asm/iommu.h>
> > #include <asm/tce.h>
> > #include <asm/abs_addr.h>
> > #include <asm/firmware.h>
> >
> > #include "powernv.h"
> > #include "pci.h"
> > @@ -601,15 +602,149 @@ void __init pnv_pci_init(void)
> > /* Configure IOMMU DMA hooks */
> > ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
> > ppc_md.tce_build = pnv_tce_build;
> > ppc_md.tce_free = pnv_tce_free;
> > ppc_md.tce_get = pnv_tce_get;
> > ppc_md.pci_probe_mode = pnv_pci_probe_mode;
> > set_pci_dma_ops(&dma_iommu_ops);
> >
> > /* Configure MSIs */
> > #ifdef CONFIG_PCI_MSI
> > ppc_md.msi_check_device = pnv_msi_check_device;
> > ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
> > ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs; #endif }
> > +
> > +#ifdef CONFIG_IOMMU_API
> > +/*
> > + * IOMMU groups support required by VFIO */ static int
> > +add_device(struct device *dev) {
> > + struct iommu_table *tbl;
> > + int ret = 0;
> > +
> > + if (WARN_ON(dev->iommu_group)) {
> > + printk(KERN_WARNING "tce_vfio: device %s is already in iommu
> group %d, skipping\n",
> > + dev->kobj.name,
>
> dev_name(dev)
>
> > + iommu_group_id(dev->iommu_group));
> > + return -EBUSY;
> > + }
> > +
> > + tbl = get_iommu_table_base(dev);
> > + if (!tbl) {
> > + pr_debug("tce_vfio: skipping device %s with no tbl\n",
> > + dev->kobj.name);
> > + return 0;
> > + }
> > +
> > + pr_debug("tce_vfio: adding %s to iommu group %d\n",
> > + dev->kobj.name, iommu_group_id(tbl->it_group));
> > +
> > + ret = iommu_group_add_device(tbl->it_group, dev);
> > + if (ret < 0)
> > + printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> > + dev->kobj.name, ret);
> > +
> > + return ret;
> > +}
> > +
> > +static void del_device(struct device *dev) {
> > + iommu_group_remove_device(dev);
> > +}
> > +
> > +static int iommu_bus_notifier(struct notifier_block *nb,
> > + unsigned long action, void *data) {
> > + struct device *dev = data;
> > +
> > + switch (action) {
> > + case BUS_NOTIFY_ADD_DEVICE:
> > + return add_device(dev);
> > + case BUS_NOTIFY_DEL_DEVICE:
> > + del_device(dev);
> > + return 0;
> > + default:
> > + return 0;
> > + }
> > +}
> > +
> > +static struct notifier_block tce_iommu_bus_nb = {
> > + .notifier_call = iommu_bus_notifier, };
> > +
> > +static void group_release(void *iommu_data) {
> > + struct iommu_table *tbl = iommu_data;
> > + tbl->it_group = NULL;
> > +}
> > +
> > +static int __init tce_iommu_init(void) {
> > + struct pci_dev *pdev = NULL;
> > + struct iommu_table *tbl;
> > + struct iommu_group *grp;
> > +
> > + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>
> There's already a notifier in the iommu code if you were to register an
> iommu_ops with the add/remove_device entries. That would allow you to
> remove the notifier block and notifier function below and the second loop
> below. Are you avoiding that to avoid the rest of iommu_ops?
>
[Sethi Varun-B16395] Could be one reason, also they are associating the iommu group with the tce table entry and not the device.

> Also, shouldn't this notifier only be registered after the first loop
> below? Otherwise ADD_DEVICE could race with setting up groups, which we
> assume are present in the add_device() above.
[Sethi Varun-B16395] Isn't this similar to how how the notifier is registered in iommu_bus_init? First a notifier is registered and then we check for devices that have already been probed.

-Varun
????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?Ý¢j"???m??????G????????????&???~???iO???z??v?^?m???? ????????I?

2012-11-23 02:03:10

by Alexey Kardashevskiy

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv platform

On 22/11/12 22:56, Sethi Varun-B16395 wrote:
>
>
>> -----Original Message-----
>> From: [email protected] [mailto:linux-kernel-
>> [email protected]] On Behalf Of Alex Williamson
>> Sent: Tuesday, November 20, 2012 11:50 PM
>> To: Alexey Kardashevskiy
>> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
>> [email protected]; [email protected]; [email protected];
>> David Gibson
>> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
>> platform
>>
>> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
>>> VFIO implements platform independent stuff such as a PCI driver, BAR
>>> access (via read/write on a file descriptor or direct mapping when
>>> possible) and IRQ signaling.
>>> The platform dependent part includes IOMMU initialization and
>>> handling.
>>>
>>> This patch initializes IOMMU groups based on the IOMMU configuration
>>> discovered during the PCI scan, only POWERNV platform is supported at
>>> the moment.
>>>
>>> Also the patch implements an VFIO-IOMMU driver which manages DMA
>>> mapping/unmapping requests coming from the client (now QEMU). It also
>>> returns a DMA window information to let the guest initialize the
>>> device tree for a guest OS properly. Although this driver has been
>>> tested only on POWERNV, it should work on any platform supporting TCE
>>> tables.
>>>
>>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
>>>
>>> Cc: David Gibson <[email protected]>
>>> Signed-off-by: Alexey Kardashevskiy <[email protected]>
>>> ---
>>> arch/powerpc/include/asm/iommu.h | 6 +
>>> arch/powerpc/kernel/iommu.c | 140 +++++++++++++++++++
>>> arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++
>>> drivers/iommu/Kconfig | 8 ++
>>> drivers/vfio/Kconfig | 6 +
>>> drivers/vfio/Makefile | 1 +
>>> drivers/vfio/vfio_iommu_spapr_tce.c | 247
>> ++++++++++++++++++++++++++++++++++
>>> include/linux/vfio.h | 20 +++
>>> 8 files changed, 563 insertions(+)
>>> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>>
>>> diff --git a/arch/powerpc/include/asm/iommu.h
>>> b/arch/powerpc/include/asm/iommu.h
>>> index cbfe678..5ba66cb 100644
>>> --- a/arch/powerpc/include/asm/iommu.h
>>> +++ b/arch/powerpc/include/asm/iommu.h
>>> @@ -64,30 +64,33 @@ struct iommu_pool { }
>>> ____cacheline_aligned_in_smp;
>>>
>>> struct iommu_table {
>>> unsigned long it_busno; /* Bus number this table belongs to */
>>> unsigned long it_size; /* Size of iommu table in entries */
>>> unsigned long it_offset; /* Offset into global table */
>>> unsigned long it_base; /* mapped address of tce table */
>>> unsigned long it_index; /* which iommu table this is */
>>> unsigned long it_type; /* type: PCI or Virtual Bus */
>>> unsigned long it_blocksize; /* Entries in each block (cacheline)
>> */
>>> unsigned long poolsize;
>>> unsigned long nr_pools;
>>> struct iommu_pool large_pool;
>>> struct iommu_pool pools[IOMMU_NR_POOLS];
>>> unsigned long *it_map; /* A simple allocation bitmap for now
>> */
>>> +#ifdef CONFIG_IOMMU_API
>>> + struct iommu_group *it_group;
>>> +#endif
>>> };
>>>
>>> struct scatterlist;
>>>
>>> static inline void set_iommu_table_base(struct device *dev, void
>>> *base) {
>>> dev->archdata.dma_data.iommu_table_base = base; }
>>>
>>> static inline void *get_iommu_table_base(struct device *dev) {
>>> return dev->archdata.dma_data.iommu_table_base;
>>> }
>>>
>>> /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
>>> static inline void pci_iommu_init(void) { } extern void
>>> alloc_dart_table(void); #if defined(CONFIG_PPC64) &&
>>> defined(CONFIG_PM) static inline void iommu_save(void) {
>>> if (ppc_md.iommu_save)
>>> ppc_md.iommu_save();
>>> }
>>>
>>> static inline void iommu_restore(void) {
>>> if (ppc_md.iommu_restore)
>>> ppc_md.iommu_restore();
>>> }
>>> #endif
>>>
>>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
>> entry, uint64_t tce,
>>> + enum dma_data_direction direction, unsigned long pages);
>>> +
>>> #endif /* __KERNEL__ */
>>> #endif /* _ASM_IOMMU_H */
>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>>> index ff5a6ce..94f614b 100644
>>> --- a/arch/powerpc/kernel/iommu.c
>>> +++ b/arch/powerpc/kernel/iommu.c
>>> @@ -32,30 +32,31 @@
>>> #include <linux/dma-mapping.h>
>>> #include <linux/bitmap.h>
>>> #include <linux/iommu-helper.h>
>>> #include <linux/crash_dump.h>
>>> #include <linux/hash.h>
>>> #include <linux/fault-inject.h>
>>> #include <linux/pci.h>
>>> #include <asm/io.h>
>>> #include <asm/prom.h>
>>> #include <asm/iommu.h>
>>> #include <asm/pci-bridge.h>
>>> #include <asm/machdep.h>
>>> #include <asm/kdump.h>
>>> #include <asm/fadump.h>
>>> #include <asm/vio.h>
>>> +#include <asm/tce.h>
>>>
>>> #define DBG(...)
>>>
>>> static int novmerge;
>>>
>>> static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
>>> int);
>>>
>>> static int __init setup_iommu(char *str) {
>>> if (!strcmp(str, "novmerge"))
>>> novmerge = 1;
>>> else if (!strcmp(str, "vmerge"))
>>> novmerge = 0;
>>> return 1;
>>> }
>>> @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
>>> struct iommu_table *tbl, }
>>>
>>> void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>>> void *vaddr, dma_addr_t dma_handle) {
>>> if (tbl) {
>>> unsigned int nio_pages;
>>>
>>> size = PAGE_ALIGN(size);
>>> nio_pages = size >> IOMMU_PAGE_SHIFT;
>>> iommu_free(tbl, dma_handle, nio_pages);
>>> size = PAGE_ALIGN(size);
>>> free_pages((unsigned long)vaddr, get_order(size));
>>> }
>>> }
>>> +
>>> +#ifdef CONFIG_IOMMU_API
>>> +/*
>>> + * SPAPR TCE API
>>> + */
>>> +static struct page *free_tce(struct iommu_table *tbl, unsigned long
>>> +entry) {
>>> + struct page *page = NULL;
>>
>> NULL initialization doesn't appear to be necessary
>>
>>> + unsigned long oldtce;
>>> +
>>> + oldtce = ppc_md.tce_get(tbl, entry);
>>> +
>>> + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
>>> + return NULL;
>>> +
>>> + page = pfn_to_page(oldtce >> PAGE_SHIFT);
>>> +
>>> + WARN_ON(!page);
>>> + if (page && (oldtce & TCE_PCI_WRITE))
>>> + SetPageDirty(page);
>>> + ppc_md.tce_free(tbl, entry, 1);
>>> +
>>> + return page;
>>> +}
>>> +
>>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
>>> + uint64_t tce, enum dma_data_direction direction) {
>>> + int ret;
>>> + struct page *page = NULL;
>>> + unsigned long kva, offset;
>>> +
>>> + /* Map new TCE */
>>> + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
>>> + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>>> + direction != DMA_TO_DEVICE, &page);
>>> + if (ret < 1) {
>>> + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
>> tce=%llx ioba=%lx ret=%d\n",
>>> + tce, entry << IOMMU_PAGE_SHIFT, ret);
>>> + if (!ret)
>>> + ret = -EFAULT;
>>
>> Missing return ret? Otherwise we've got some bogus uses of page below
>> and we're setting ret for no reason here.
>>
>>> + }
>>> +
>>> + kva = (unsigned long) page_address(page);
>>> + kva += offset;
>>> +
>>> + /* tce_build receives a virtual address */
>>> + entry += tbl->it_offset; /* Offset into real TCE table */
>>> + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
>>> +
>>> + /* tce_build() only returns non-zero for transient errors */
>>> + if (unlikely(ret)) {
>>> + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
>> ioba=%lx kva=%lx ret=%d\n",
>>> + tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
>>> + put_page(page);
>>> + return -EIO;
>>> + }
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static void tce_flush(struct iommu_table *tbl) {
>>> + /* Flush/invalidate TLB caches if necessary */
>>> + if (ppc_md.tce_flush)
>>> + ppc_md.tce_flush(tbl);
>>> +
>>> + /* Make sure updates are seen by hardware */
>>> + mb();
>>> +}
>>> +
>>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
>> uint64_t tce,
>>> + enum dma_data_direction direction, unsigned long pages) {
>>> + int i, ret = 0, pages_to_put = 0;
>>> + struct page *page;
>>> + struct iommu_pool *pool = get_pool(tbl, entry);
>>> + struct page **oldpages;
>>> + const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
>>> +
>>> + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
>>> +
>>> + /* Handle a single page request without allocation
>>> + of pages-to-release array */
>>> + if (pages == 1) {
>>> + spin_lock(&(pool->lock));
>>> + page = free_tce(tbl, entry);
>>> +
>>> + if (direction != DMA_NONE)
>>> + ret = put_tce(tbl, entry, tce, direction);
>>> +
>>> + tce_flush(tbl);
>>> +
>>> + if (page)
>>> + put_page(page);
>>> +
>>> + spin_unlock(&(pool->lock));
>>> + return ret;
>>> + }
>>> +
>>> + /* Releasing multiple pages */
>>> + /* Allocate an array for pages to be released after TCE table
>>> + is updated */
>>> + oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
>>> + if (!oldpages)
>>> + return -ENOMEM;
>>> +
>>> + spin_lock(&(pool->lock));
>>> +
>>> + for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
>> IOMMU_PAGE_SIZE) {
>>> + page = free_tce(tbl, entry);
>>> + if (page) {
>>> + oldpages[pages_to_put] = page;
>>> + ++pages_to_put;
>>> + }
>>> +
>>> + if (direction != DMA_NONE)
>>> + ret = put_tce(tbl, entry, tce, direction);
>>> +
>>> + /* Release old pages if we reached the end of oldpages[] or
>>> + it is the last page or we are about to exit the loop */
>>> + if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
>> {
>>> + tce_flush(tbl);
>>
>> Avoiding tce_flush() is the reason for all this extra overhead, right?
>> I wonder if it'd be cleaner separating map vs unmap, where the map case
>> can avoid the oldpages array... but that means inserting new mappings on
>> top of old ones wouldn't put the pages.


Yes, we do not want to loose pages if the guest forgot to unmap them.


>>> +
>>> + /* Release pages after removing them from TCE table */
>>> + while (pages_to_put) {
>>> + --pages_to_put;
>>> + put_page(oldpages[pages_to_put]);
>>> + }
>>> + }
>>> + }
>>> +
>>> + spin_unlock(&(pool->lock));
>>> + kfree(oldpages);
>>> +
>>> + return ret;
>>> +}
>>> +EXPORT_SYMBOL_GPL(iommu_put_tces);
>>> +#endif /* CONFIG_IOMMU_API */
>>> diff --git a/arch/powerpc/platforms/powernv/pci.c
>>> b/arch/powerpc/platforms/powernv/pci.c
>>> index 05205cf..676f4d9 100644
>>> --- a/arch/powerpc/platforms/powernv/pci.c
>>> +++ b/arch/powerpc/platforms/powernv/pci.c
>>> @@ -8,30 +8,31 @@
>>> * This program is free software; you can redistribute it and/or
>>> * modify it under the terms of the GNU General Public License
>>> * as published by the Free Software Foundation; either version
>>> * 2 of the License, or (at your option) any later version.
>>> */
>>>
>>> #include <linux/kernel.h>
>>> #include <linux/pci.h>
>>> #include <linux/delay.h>
>>> #include <linux/string.h>
>>> #include <linux/init.h>
>>> #include <linux/bootmem.h>
>>> #include <linux/irq.h>
>>> #include <linux/io.h>
>>> #include <linux/msi.h>
>>> +#include <linux/iommu.h>
>>>
>>> #include <asm/sections.h>
>>> #include <asm/io.h>
>>> #include <asm/prom.h>
>>> #include <asm/pci-bridge.h>
>>> #include <asm/machdep.h>
>>> #include <asm/ppc-pci.h>
>>> #include <asm/opal.h>
>>> #include <asm/iommu.h>
>>> #include <asm/tce.h>
>>> #include <asm/abs_addr.h>
>>> #include <asm/firmware.h>
>>>
>>> #include "powernv.h"
>>> #include "pci.h"
>>> @@ -601,15 +602,149 @@ void __init pnv_pci_init(void)
>>> /* Configure IOMMU DMA hooks */
>>> ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
>>> ppc_md.tce_build = pnv_tce_build;
>>> ppc_md.tce_free = pnv_tce_free;
>>> ppc_md.tce_get = pnv_tce_get;
>>> ppc_md.pci_probe_mode = pnv_pci_probe_mode;
>>> set_pci_dma_ops(&dma_iommu_ops);
>>>
>>> /* Configure MSIs */
>>> #ifdef CONFIG_PCI_MSI
>>> ppc_md.msi_check_device = pnv_msi_check_device;
>>> ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
>>> ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs; #endif }
>>> +
>>> +#ifdef CONFIG_IOMMU_API
>>> +/*
>>> + * IOMMU groups support required by VFIO */ static int
>>> +add_device(struct device *dev) {
>>> + struct iommu_table *tbl;
>>> + int ret = 0;
>>> +
>>> + if (WARN_ON(dev->iommu_group)) {
>>> + printk(KERN_WARNING "tce_vfio: device %s is already in iommu
>> group %d, skipping\n",
>>> + dev->kobj.name,
>>
>> dev_name(dev)
>>
>>> + iommu_group_id(dev->iommu_group));
>>> + return -EBUSY;
>>> + }
>>> +
>>> + tbl = get_iommu_table_base(dev);
>>> + if (!tbl) {
>>> + pr_debug("tce_vfio: skipping device %s with no tbl\n",
>>> + dev->kobj.name);
>>> + return 0;
>>> + }
>>> +
>>> + pr_debug("tce_vfio: adding %s to iommu group %d\n",
>>> + dev->kobj.name, iommu_group_id(tbl->it_group));
>>> +
>>> + ret = iommu_group_add_device(tbl->it_group, dev);
>>> + if (ret < 0)
>>> + printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
>>> + dev->kobj.name, ret);
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static void del_device(struct device *dev) {
>>> + iommu_group_remove_device(dev);
>>> +}
>>> +
>>> +static int iommu_bus_notifier(struct notifier_block *nb,
>>> + unsigned long action, void *data) {
>>> + struct device *dev = data;
>>> +
>>> + switch (action) {
>>> + case BUS_NOTIFY_ADD_DEVICE:
>>> + return add_device(dev);
>>> + case BUS_NOTIFY_DEL_DEVICE:
>>> + del_device(dev);
>>> + return 0;
>>> + default:
>>> + return 0;
>>> + }
>>> +}
>>> +
>>> +static struct notifier_block tce_iommu_bus_nb = {
>>> + .notifier_call = iommu_bus_notifier, };
>>> +
>>> +static void group_release(void *iommu_data) {
>>> + struct iommu_table *tbl = iommu_data;
>>> + tbl->it_group = NULL;
>>> +}
>>> +
>>> +static int __init tce_iommu_init(void) {
>>> + struct pci_dev *pdev = NULL;
>>> + struct iommu_table *tbl;
>>> + struct iommu_group *grp;
>>> +
>>> + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>>
>> There's already a notifier in the iommu code if you were to register an
>> iommu_ops with the add/remove_device entries. That would allow you to
>> remove the notifier block and notifier function below and the second loop
>> below. Are you avoiding that to avoid the rest of iommu_ops?

Yes. I need to implement either a small part of iommu_ops (especially the
part which I think should not be there at all) or notifier, cannot how how
the first is simpler.


> [Sethi Varun-B16395] Could be one reason, also they are associating the iommu group with the tce table entry and not the device.

Also true.
I would actually allocate IOMMU groups right after we discovered the new
one but this is done during PCI scan which works before
subsys_initcall(iommu_init) is called so I added this first loop.


>> Also, shouldn't this notifier only be registered after the first loop
>> below? Otherwise ADD_DEVICE could race with setting up groups, which we
>> assume are present in the add_device() above.
> [Sethi Varun-B16395] Isn't this similar to how how the notifier is registered in iommu_bus_init? First a notifier is registered and then we check for devices that have already been probed.

Yep. Not very familiar with this stuff but if it is done one way and it is
already upstream, I cannot see why I should go another way :)



--
Alexey

2012-11-23 09:03:58

by Alexey Kardashevskiy

[permalink] [raw]
Subject: [PATCH 0/2] vfio powerpc: implemented and enabled

The series includes IOMMU implementation and necessary IOMMU groups initialization.

Alexey Kardashevskiy (2):
vfio powerpc: implemented IOMMU driver for VFIO
vfio powerpc: enabled on powernv platform

arch/powerpc/include/asm/iommu.h | 6 +
arch/powerpc/kernel/iommu.c | 141 +++++++++++++++++++
arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++
drivers/iommu/Kconfig | 8 ++
drivers/vfio/Kconfig | 6 +
drivers/vfio/Makefile | 1 +
drivers/vfio/vfio_iommu_spapr_tce.c | 247 ++++++++++++++++++++++++++++++++++
include/linux/vfio.h | 20 +++
8 files changed, 564 insertions(+)
create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

--
1.7.10.4

2012-11-23 09:04:30

by Alexey Kardashevskiy

[permalink] [raw]
Subject: [PATCH 2/2] vfio powerpc: enabled on powernv platform

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <[email protected]>
Signed-off-by: Alexey Kardashevskiy <[email protected]>
---
arch/powerpc/include/asm/iommu.h | 6 ++
arch/powerpc/kernel/iommu.c | 141 ++++++++++++++++++++++++++++++++++
arch/powerpc/platforms/powernv/pci.c | 135 ++++++++++++++++++++++++++++++++
drivers/iommu/Kconfig | 8 ++
4 files changed, 290 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5ba66cb 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
struct iommu_pool large_pool;
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map; /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+ struct iommu_group *it_group;
+#endif
};

struct scatterlist;
@@ -147,5 +150,8 @@ static inline void iommu_restore(void)
}
#endif

+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
+ enum dma_data_direction direction, unsigned long pages);
+
#endif /* __KERNEL__ */
#endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..c8dad1f 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
#include <asm/kdump.h>
#include <asm/fadump.h>
#include <asm/vio.h>
+#include <asm/tce.h>

#define DBG(...)

@@ -856,3 +857,143 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
free_pages((unsigned long)vaddr, get_order(size));
}
}
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static struct page *free_tce(struct iommu_table *tbl, unsigned long entry)
+{
+ struct page *page;
+ unsigned long oldtce;
+
+ oldtce = ppc_md.tce_get(tbl, entry);
+
+ if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+ return NULL;
+
+ page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+ WARN_ON(!page);
+ if (page && (oldtce & TCE_PCI_WRITE))
+ SetPageDirty(page);
+ ppc_md.tce_free(tbl, entry, 1);
+
+ return page;
+}
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+ uint64_t tce, enum dma_data_direction direction)
+{
+ int ret;
+ struct page *page = NULL;
+ unsigned long kva, offset;
+
+ /* Map new TCE */
+ offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+ ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+ direction != DMA_TO_DEVICE, &page);
+ if (ret < 1) {
+ printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+ tce, entry << IOMMU_PAGE_SHIFT, ret);
+ if (!ret)
+ ret = -EFAULT;
+ return ret;
+ }
+
+ kva = (unsigned long) page_address(page);
+ kva += offset;
+
+ /* tce_build receives a virtual address */
+ entry += tbl->it_offset; /* Offset into real TCE table */
+ ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+ /* tce_build() only returns non-zero for transient errors */
+ if (unlikely(ret)) {
+ printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+ tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+ put_page(page);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static void tce_flush(struct iommu_table *tbl)
+{
+ /* Flush/invalidate TLB caches if necessary */
+ if (ppc_md.tce_flush)
+ ppc_md.tce_flush(tbl);
+
+ /* Make sure updates are seen by hardware */
+ mb();
+}
+
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
+ enum dma_data_direction direction, unsigned long pages)
+{
+ int i, ret = 0, pages_to_put = 0;
+ struct page *page;
+ struct iommu_pool *pool = get_pool(tbl, entry);
+ struct page **oldpages;
+ const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
+
+ BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+
+ /* Handle a single page request without allocation
+ of pages-to-release array */
+ if (pages == 1) {
+ spin_lock(&(pool->lock));
+ page = free_tce(tbl, entry);
+
+ if (direction != DMA_NONE)
+ ret = put_tce(tbl, entry, tce, direction);
+
+ tce_flush(tbl);
+
+ if (page)
+ put_page(page);
+
+ spin_unlock(&(pool->lock));
+ return ret;
+ }
+
+ /* Releasing multiple pages */
+ /* Allocate an array for pages to be released after TCE table
+ is updated */
+ oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!oldpages)
+ return -ENOMEM;
+
+ spin_lock(&(pool->lock));
+
+ for (i = 0; (i < pages) && !ret; ++i, ++entry, tce += IOMMU_PAGE_SIZE) {
+ page = free_tce(tbl, entry);
+ if (page) {
+ oldpages[pages_to_put] = page;
+ ++pages_to_put;
+ }
+
+ if (direction != DMA_NONE)
+ ret = put_tce(tbl, entry, tce, direction);
+
+ /* Release old pages if we reached the end of oldpages[] or
+ it is the last page or we are about to exit the loop */
+ if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret) {
+ tce_flush(tbl);
+
+ /* Release pages after removing them from TCE table */
+ while (pages_to_put) {
+ --pages_to_put;
+ put_page(oldpages[pages_to_put]);
+ }
+ }
+ }
+
+ spin_unlock(&(pool->lock));
+ kfree(oldpages);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..660dcc6 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
#include <linux/irq.h>
#include <linux/io.h>
#include <linux/msi.h>
+#include <linux/iommu.h>

#include <asm/sections.h>
#include <asm/io.h>
@@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
#endif
}
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+ struct iommu_table *tbl;
+ int ret = 0;
+
+ if (WARN_ON(dev->iommu_group)) {
+ printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+ dev_name(dev),
+ iommu_group_id(dev->iommu_group));
+ return -EBUSY;
+ }
+
+ tbl = get_iommu_table_base(dev);
+ if (!tbl) {
+ pr_debug("tce_vfio: skipping device %s with no tbl\n",
+ dev_name(dev));
+ return 0;
+ }
+
+ pr_debug("tce_vfio: adding %s to iommu group %d\n",
+ dev_name(dev), iommu_group_id(tbl->it_group));
+
+ ret = iommu_group_add_device(tbl->it_group, dev);
+ if (ret < 0)
+ printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+ dev_name(dev), ret);
+
+ return ret;
+}
+
+static void del_device(struct device *dev)
+{
+ iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+
+ switch (action) {
+ case BUS_NOTIFY_ADD_DEVICE:
+ return add_device(dev);
+ case BUS_NOTIFY_DEL_DEVICE:
+ del_device(dev);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+ .notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+ struct iommu_table *tbl = iommu_data;
+ tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iommu_table *tbl;
+ struct iommu_group *grp;
+
+ bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+ /* Allocate and initialize IOMMU groups */
+ for_each_pci_dev(pdev) {
+ tbl = get_iommu_table_base(&pdev->dev);
+ if (!tbl)
+ continue;
+
+ /* Skip already initialized */
+ if (tbl->it_group)
+ continue;
+
+ grp = iommu_group_alloc();
+ if (IS_ERR(grp)) {
+ printk(KERN_INFO "tce_vfio: cannot create "
+ "new IOMMU group, ret=%ld\n",
+ PTR_ERR(grp));
+ return PTR_ERR(grp);
+ }
+ tbl->it_group = grp;
+ iommu_group_set_iommudata(grp, tbl, group_release);
+ }
+
+ /* Add PCI devices to VFIO groups */
+ for_each_pci_dev(pdev)
+ add_device(&pdev->dev);
+
+ return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iommu_table *tbl;
+ struct iommu_group *grp = NULL;
+
+ bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+ /* Delete PCI devices from VFIO groups */
+ for_each_pci_dev(pdev)
+ del_device(&pdev->dev);
+
+ /* Release VFIO groups */
+ for_each_pci_dev(pdev) {
+ tbl = get_iommu_table_base(&pdev->dev);
+ if (!tbl)
+ continue;
+ grp = tbl->it_group;
+
+ /* Skip (already) uninitialized */
+ if (!grp)
+ continue;
+
+ /* Do actual release, group_release() is expected to work */
+ iommu_group_put(grp);
+ BUG_ON(tbl->it_group);
+ }
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG

Say N unless you need kernel log message for IOMMU debugging

+config SPAPR_TCE_IOMMU
+ bool "sPAPR TCE IOMMU Support"
+ depends on PPC_POWERNV
+ select IOMMU_API
+ help
+ Enables bits of IOMMU API required by VFIO. The iommu_ops is
+ still not implemented.
+
endif # IOMMU_SUPPORT
--
1.7.10.4

2012-11-23 09:04:35

by Alexey Kardashevskiy

[permalink] [raw]
Subject: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling. This patch implements an IOMMU driver for VFIO
which does mapping/unmapping pages for the guest IO and
provides information about DMA window (required by a POWERPC
guest).

The counterpart in QEMU is required to support this functionality.

Cc: David Gibson <[email protected]>
Signed-off-by: Alexey Kardashevskiy <[email protected]>
---
drivers/vfio/Kconfig | 6 +
drivers/vfio/Makefile | 1 +
drivers/vfio/vfio_iommu_spapr_tce.c | 247 +++++++++++++++++++++++++++++++++++
include/linux/vfio.h | 20 +++
4 files changed, 274 insertions(+)
create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
depends on VFIO
default n

+config VFIO_IOMMU_SPAPR_TCE
+ tristate
+ depends on VFIO && SPAPR_TCE_IOMMU
+ default n
+
menuconfig VFIO
tristate "VFIO Non-Privileged userspace driver framework"
depends on IOMMU_API
select VFIO_IOMMU_TYPE1 if X86
+ select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
help
VFIO provides a framework for secure userspace device drivers.
See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
obj-$(CONFIG_VFIO) += vfio.o
obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..46a6298
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,247 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Author: Alexey Kardashevskiy <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
+ * Author: Alex Williamson <[email protected]>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION "0.1"
+#define DRIVER_AUTHOR "[email protected]"
+#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+ struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+ struct mutex lock;
+ struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+ struct tce_container *container;
+
+ if (arg != VFIO_SPAPR_TCE_IOMMU) {
+ printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ container = kzalloc(sizeof(*container), GFP_KERNEL);
+ if (!container)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&container->lock);
+
+ return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+ struct tce_container *container = iommu_data;
+
+ WARN_ON(container->tbl && !container->tbl->it_group);
+ if (container->tbl && container->tbl->it_group)
+ tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+ mutex_destroy(&container->lock);
+
+ kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+ unsigned int cmd, unsigned long arg)
+{
+ struct tce_container *container = iommu_data;
+ unsigned long minsz;
+
+ switch (cmd) {
+ case VFIO_CHECK_EXTENSION: {
+ return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+ }
+ case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+ struct vfio_iommu_spapr_tce_info info;
+ struct iommu_table *tbl = container->tbl;
+
+ if (WARN_ON(!tbl))
+ return -ENXIO;
+
+ minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+ dma64_window_size);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+ info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+ info.dma64_window_start = 0;
+ info.dma64_window_size = 0;
+ info.flags = 0;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+ case VFIO_IOMMU_MAP_DMA: {
+ vfio_iommu_spapr_tce_dma_map param;
+ struct iommu_table *tbl = container->tbl;
+ enum dma_data_direction direction = DMA_NONE;
+
+ if (WARN_ON(!tbl))
+ return -ENXIO;
+
+ minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
+
+ if (copy_from_user(&param, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (param.argsz < minsz)
+ return -EINVAL;
+
+ if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
+ (param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
+ direction = DMA_BIDIRECTIONAL;
+ } else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
+ direction = DMA_TO_DEVICE;
+ } else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
+ direction = DMA_FROM_DEVICE;
+ }
+
+ param.size += param.iova & ~IOMMU_PAGE_MASK;
+ param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
+
+ return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+ param.vaddr & IOMMU_PAGE_MASK, direction,
+ param.size >> IOMMU_PAGE_SHIFT);
+ }
+ case VFIO_IOMMU_UNMAP_DMA: {
+ vfio_iommu_spapr_tce_dma_unmap param;
+ struct iommu_table *tbl = container->tbl;
+
+ if (WARN_ON(!tbl))
+ return -ENXIO;
+
+ minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
+
+ if (copy_from_user(&param, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (param.argsz < minsz)
+ return -EINVAL;
+
+ param.size += param.iova & ~IOMMU_PAGE_MASK;
+ param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
+
+ return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+ 0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
+ }
+ default:
+ printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
+ }
+
+ return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+ struct iommu_group *iommu_group)
+{
+ struct tce_container *container = iommu_data;
+ struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+ BUG_ON(!tbl);
+ mutex_lock(&container->lock);
+ pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+ iommu_group_id(iommu_group), iommu_group);
+ if (container->tbl) {
+ printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+ iommu_group_id(container->tbl->it_group),
+ iommu_group_id(iommu_group));
+ mutex_unlock(&container->lock);
+ return -EBUSY;
+ }
+
+ container->tbl = tbl;
+ mutex_unlock(&container->lock);
+
+ return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+ struct iommu_group *iommu_group)
+{
+ struct tce_container *container = iommu_data;
+ struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+ BUG_ON(!tbl);
+ mutex_lock(&container->lock);
+ if (tbl != container->tbl) {
+ printk(KERN_WARNING "tce_vfio: detaching group #%u, expected group is #%u\n",
+ iommu_group_id(iommu_group),
+ iommu_group_id(tbl->it_group));
+ } else {
+
+ pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+ iommu_group_id(iommu_group), iommu_group);
+
+ iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);
+ container->tbl = NULL;
+ }
+ mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+ .name = "iommu-vfio-powerpc",
+ .owner = THIS_MODULE,
+ .open = tce_iommu_open,
+ .release = tce_iommu_release,
+ .ioctl = tce_iommu_ioctl,
+ .attach_group = tce_iommu_attach_group,
+ .detach_group = tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+ return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+ vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..3ecd65c 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
/* Extensions */

#define VFIO_TYPE1_IOMMU 1
+#define VFIO_SPAPR_TCE_IOMMU 2

/*
* The IOCTL interface is designed for extensibility by embedding the
@@ -442,4 +443,23 @@ struct vfio_iommu_type1_dma_unmap {

#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)

+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+struct vfio_iommu_spapr_tce_info {
+ __u32 argsz;
+ __u32 flags;
+ __u32 dma32_window_start;
+ __u32 dma32_window_size;
+ __u64 dma64_window_start;
+ __u64 dma64_window_size;
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* Reuse type1 map/unmap structs as they are the same at the moment */
+typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
+typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
+
+/* ***************************************************************** */
+
#endif /* VFIO_H */
--
1.7.10.4

2012-11-26 15:08:56

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv platform

On Thu, 2012-11-22 at 11:56 +0000, Sethi Varun-B16395 wrote:
>
> > -----Original Message-----
> > From: [email protected] [mailto:linux-kernel-
> > [email protected]] On Behalf Of Alex Williamson
> > Sent: Tuesday, November 20, 2012 11:50 PM
> > To: Alexey Kardashevskiy
> > Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
> > [email protected]; [email protected]; [email protected];
> > David Gibson
> > Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
> > platform
> >
> > On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> > > VFIO implements platform independent stuff such as a PCI driver, BAR
> > > access (via read/write on a file descriptor or direct mapping when
> > > possible) and IRQ signaling.
> > > The platform dependent part includes IOMMU initialization and
> > > handling.
> > >
> > > This patch initializes IOMMU groups based on the IOMMU configuration
> > > discovered during the PCI scan, only POWERNV platform is supported at
> > > the moment.
> > >
> > > Also the patch implements an VFIO-IOMMU driver which manages DMA
> > > mapping/unmapping requests coming from the client (now QEMU). It also
> > > returns a DMA window information to let the guest initialize the
> > > device tree for a guest OS properly. Although this driver has been
> > > tested only on POWERNV, it should work on any platform supporting TCE
> > > tables.
> > >
> > > To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
> > >
> > > Cc: David Gibson <[email protected]>
> > > Signed-off-by: Alexey Kardashevskiy <[email protected]>
> > > ---
> > > arch/powerpc/include/asm/iommu.h | 6 +
> > > arch/powerpc/kernel/iommu.c | 140 +++++++++++++++++++
> > > arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++
> > > drivers/iommu/Kconfig | 8 ++
> > > drivers/vfio/Kconfig | 6 +
> > > drivers/vfio/Makefile | 1 +
> > > drivers/vfio/vfio_iommu_spapr_tce.c | 247
> > ++++++++++++++++++++++++++++++++++
> > > include/linux/vfio.h | 20 +++
> > > 8 files changed, 563 insertions(+)
> > > create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> > >
> > > diff --git a/arch/powerpc/include/asm/iommu.h
> > > b/arch/powerpc/include/asm/iommu.h
> > > index cbfe678..5ba66cb 100644
> > > --- a/arch/powerpc/include/asm/iommu.h
> > > +++ b/arch/powerpc/include/asm/iommu.h
> > > @@ -64,30 +64,33 @@ struct iommu_pool { }
> > > ____cacheline_aligned_in_smp;
> > >
> > > struct iommu_table {
> > > unsigned long it_busno; /* Bus number this table belongs to */
> > > unsigned long it_size; /* Size of iommu table in entries */
> > > unsigned long it_offset; /* Offset into global table */
> > > unsigned long it_base; /* mapped address of tce table */
> > > unsigned long it_index; /* which iommu table this is */
> > > unsigned long it_type; /* type: PCI or Virtual Bus */
> > > unsigned long it_blocksize; /* Entries in each block (cacheline)
> > */
> > > unsigned long poolsize;
> > > unsigned long nr_pools;
> > > struct iommu_pool large_pool;
> > > struct iommu_pool pools[IOMMU_NR_POOLS];
> > > unsigned long *it_map; /* A simple allocation bitmap for now
> > */
> > > +#ifdef CONFIG_IOMMU_API
> > > + struct iommu_group *it_group;
> > > +#endif
> > > };
> > >
> > > struct scatterlist;
> > >
> > > static inline void set_iommu_table_base(struct device *dev, void
> > > *base) {
> > > dev->archdata.dma_data.iommu_table_base = base; }
> > >
> > > static inline void *get_iommu_table_base(struct device *dev) {
> > > return dev->archdata.dma_data.iommu_table_base;
> > > }
> > >
> > > /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
> > > static inline void pci_iommu_init(void) { } extern void
> > > alloc_dart_table(void); #if defined(CONFIG_PPC64) &&
> > > defined(CONFIG_PM) static inline void iommu_save(void) {
> > > if (ppc_md.iommu_save)
> > > ppc_md.iommu_save();
> > > }
> > >
> > > static inline void iommu_restore(void) {
> > > if (ppc_md.iommu_restore)
> > > ppc_md.iommu_restore();
> > > }
> > > #endif
> > >
> > > +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
> > entry, uint64_t tce,
> > > + enum dma_data_direction direction, unsigned long pages);
> > > +
> > > #endif /* __KERNEL__ */
> > > #endif /* _ASM_IOMMU_H */
> > > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > > index ff5a6ce..94f614b 100644
> > > --- a/arch/powerpc/kernel/iommu.c
> > > +++ b/arch/powerpc/kernel/iommu.c
> > > @@ -32,30 +32,31 @@
> > > #include <linux/dma-mapping.h>
> > > #include <linux/bitmap.h>
> > > #include <linux/iommu-helper.h>
> > > #include <linux/crash_dump.h>
> > > #include <linux/hash.h>
> > > #include <linux/fault-inject.h>
> > > #include <linux/pci.h>
> > > #include <asm/io.h>
> > > #include <asm/prom.h>
> > > #include <asm/iommu.h>
> > > #include <asm/pci-bridge.h>
> > > #include <asm/machdep.h>
> > > #include <asm/kdump.h>
> > > #include <asm/fadump.h>
> > > #include <asm/vio.h>
> > > +#include <asm/tce.h>
> > >
> > > #define DBG(...)
> > >
> > > static int novmerge;
> > >
> > > static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
> > > int);
> > >
> > > static int __init setup_iommu(char *str) {
> > > if (!strcmp(str, "novmerge"))
> > > novmerge = 1;
> > > else if (!strcmp(str, "vmerge"))
> > > novmerge = 0;
> > > return 1;
> > > }
> > > @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
> > > struct iommu_table *tbl, }
> > >
> > > void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> > > void *vaddr, dma_addr_t dma_handle) {
> > > if (tbl) {
> > > unsigned int nio_pages;
> > >
> > > size = PAGE_ALIGN(size);
> > > nio_pages = size >> IOMMU_PAGE_SHIFT;
> > > iommu_free(tbl, dma_handle, nio_pages);
> > > size = PAGE_ALIGN(size);
> > > free_pages((unsigned long)vaddr, get_order(size));
> > > }
> > > }
> > > +
> > > +#ifdef CONFIG_IOMMU_API
> > > +/*
> > > + * SPAPR TCE API
> > > + */
> > > +static struct page *free_tce(struct iommu_table *tbl, unsigned long
> > > +entry) {
> > > + struct page *page = NULL;
> >
> > NULL initialization doesn't appear to be necessary
> >
> > > + unsigned long oldtce;
> > > +
> > > + oldtce = ppc_md.tce_get(tbl, entry);
> > > +
> > > + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> > > + return NULL;
> > > +
> > > + page = pfn_to_page(oldtce >> PAGE_SHIFT);
> > > +
> > > + WARN_ON(!page);
> > > + if (page && (oldtce & TCE_PCI_WRITE))
> > > + SetPageDirty(page);
> > > + ppc_md.tce_free(tbl, entry, 1);
> > > +
> > > + return page;
> > > +}
> > > +
> > > +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> > > + uint64_t tce, enum dma_data_direction direction) {
> > > + int ret;
> > > + struct page *page = NULL;
> > > + unsigned long kva, offset;
> > > +
> > > + /* Map new TCE */
> > > + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> > > + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> > > + direction != DMA_TO_DEVICE, &page);
> > > + if (ret < 1) {
> > > + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
> > tce=%llx ioba=%lx ret=%d\n",
> > > + tce, entry << IOMMU_PAGE_SHIFT, ret);
> > > + if (!ret)
> > > + ret = -EFAULT;
> >
> > Missing return ret? Otherwise we've got some bogus uses of page below
> > and we're setting ret for no reason here.
> >
> > > + }
> > > +
> > > + kva = (unsigned long) page_address(page);
> > > + kva += offset;
> > > +
> > > + /* tce_build receives a virtual address */
> > > + entry += tbl->it_offset; /* Offset into real TCE table */
> > > + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> > > +
> > > + /* tce_build() only returns non-zero for transient errors */
> > > + if (unlikely(ret)) {
> > > + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
> > ioba=%lx kva=%lx ret=%d\n",
> > > + tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> > > + put_page(page);
> > > + return -EIO;
> > > + }
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static void tce_flush(struct iommu_table *tbl) {
> > > + /* Flush/invalidate TLB caches if necessary */
> > > + if (ppc_md.tce_flush)
> > > + ppc_md.tce_flush(tbl);
> > > +
> > > + /* Make sure updates are seen by hardware */
> > > + mb();
> > > +}
> > > +
> > > +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> > uint64_t tce,
> > > + enum dma_data_direction direction, unsigned long pages) {
> > > + int i, ret = 0, pages_to_put = 0;
> > > + struct page *page;
> > > + struct iommu_pool *pool = get_pool(tbl, entry);
> > > + struct page **oldpages;
> > > + const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> > > +
> > > + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> > > +
> > > + /* Handle a single page request without allocation
> > > + of pages-to-release array */
> > > + if (pages == 1) {
> > > + spin_lock(&(pool->lock));
> > > + page = free_tce(tbl, entry);
> > > +
> > > + if (direction != DMA_NONE)
> > > + ret = put_tce(tbl, entry, tce, direction);
> > > +
> > > + tce_flush(tbl);
> > > +
> > > + if (page)
> > > + put_page(page);
> > > +
> > > + spin_unlock(&(pool->lock));
> > > + return ret;
> > > + }
> > > +
> > > + /* Releasing multiple pages */
> > > + /* Allocate an array for pages to be released after TCE table
> > > + is updated */
> > > + oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> > > + if (!oldpages)
> > > + return -ENOMEM;
> > > +
> > > + spin_lock(&(pool->lock));
> > > +
> > > + for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
> > IOMMU_PAGE_SIZE) {
> > > + page = free_tce(tbl, entry);
> > > + if (page) {
> > > + oldpages[pages_to_put] = page;
> > > + ++pages_to_put;
> > > + }
> > > +
> > > + if (direction != DMA_NONE)
> > > + ret = put_tce(tbl, entry, tce, direction);
> > > +
> > > + /* Release old pages if we reached the end of oldpages[] or
> > > + it is the last page or we are about to exit the loop */
> > > + if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
> > {
> > > + tce_flush(tbl);
> >
> > Avoiding tce_flush() is the reason for all this extra overhead, right?
> > I wonder if it'd be cleaner separating map vs unmap, where the map case
> > can avoid the oldpages array... but that means inserting new mappings on
> > top of old ones wouldn't put the pages.
> >
> > > +
> > > + /* Release pages after removing them from TCE table */
> > > + while (pages_to_put) {
> > > + --pages_to_put;
> > > + put_page(oldpages[pages_to_put]);
> > > + }
> > > + }
> > > + }
> > > +
> > > + spin_unlock(&(pool->lock));
> > > + kfree(oldpages);
> > > +
> > > + return ret;
> > > +}
> > > +EXPORT_SYMBOL_GPL(iommu_put_tces);
> > > +#endif /* CONFIG_IOMMU_API */
> > > diff --git a/arch/powerpc/platforms/powernv/pci.c
> > > b/arch/powerpc/platforms/powernv/pci.c
> > > index 05205cf..676f4d9 100644
> > > --- a/arch/powerpc/platforms/powernv/pci.c
> > > +++ b/arch/powerpc/platforms/powernv/pci.c
> > > @@ -8,30 +8,31 @@
> > > * This program is free software; you can redistribute it and/or
> > > * modify it under the terms of the GNU General Public License
> > > * as published by the Free Software Foundation; either version
> > > * 2 of the License, or (at your option) any later version.
> > > */
> > >
> > > #include <linux/kernel.h>
> > > #include <linux/pci.h>
> > > #include <linux/delay.h>
> > > #include <linux/string.h>
> > > #include <linux/init.h>
> > > #include <linux/bootmem.h>
> > > #include <linux/irq.h>
> > > #include <linux/io.h>
> > > #include <linux/msi.h>
> > > +#include <linux/iommu.h>
> > >
> > > #include <asm/sections.h>
> > > #include <asm/io.h>
> > > #include <asm/prom.h>
> > > #include <asm/pci-bridge.h>
> > > #include <asm/machdep.h>
> > > #include <asm/ppc-pci.h>
> > > #include <asm/opal.h>
> > > #include <asm/iommu.h>
> > > #include <asm/tce.h>
> > > #include <asm/abs_addr.h>
> > > #include <asm/firmware.h>
> > >
> > > #include "powernv.h"
> > > #include "pci.h"
> > > @@ -601,15 +602,149 @@ void __init pnv_pci_init(void)
> > > /* Configure IOMMU DMA hooks */
> > > ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
> > > ppc_md.tce_build = pnv_tce_build;
> > > ppc_md.tce_free = pnv_tce_free;
> > > ppc_md.tce_get = pnv_tce_get;
> > > ppc_md.pci_probe_mode = pnv_pci_probe_mode;
> > > set_pci_dma_ops(&dma_iommu_ops);
> > >
> > > /* Configure MSIs */
> > > #ifdef CONFIG_PCI_MSI
> > > ppc_md.msi_check_device = pnv_msi_check_device;
> > > ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
> > > ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs; #endif }
> > > +
> > > +#ifdef CONFIG_IOMMU_API
> > > +/*
> > > + * IOMMU groups support required by VFIO */ static int
> > > +add_device(struct device *dev) {
> > > + struct iommu_table *tbl;
> > > + int ret = 0;
> > > +
> > > + if (WARN_ON(dev->iommu_group)) {
> > > + printk(KERN_WARNING "tce_vfio: device %s is already in iommu
> > group %d, skipping\n",
> > > + dev->kobj.name,
> >
> > dev_name(dev)
> >
> > > + iommu_group_id(dev->iommu_group));
> > > + return -EBUSY;
> > > + }
> > > +
> > > + tbl = get_iommu_table_base(dev);
> > > + if (!tbl) {
> > > + pr_debug("tce_vfio: skipping device %s with no tbl\n",
> > > + dev->kobj.name);
> > > + return 0;
> > > + }
> > > +
> > > + pr_debug("tce_vfio: adding %s to iommu group %d\n",
> > > + dev->kobj.name, iommu_group_id(tbl->it_group));
> > > +
> > > + ret = iommu_group_add_device(tbl->it_group, dev);
> > > + if (ret < 0)
> > > + printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> > > + dev->kobj.name, ret);
> > > +
> > > + return ret;
> > > +}
> > > +
> > > +static void del_device(struct device *dev) {
> > > + iommu_group_remove_device(dev);
> > > +}
> > > +
> > > +static int iommu_bus_notifier(struct notifier_block *nb,
> > > + unsigned long action, void *data) {
> > > + struct device *dev = data;
> > > +
> > > + switch (action) {
> > > + case BUS_NOTIFY_ADD_DEVICE:
> > > + return add_device(dev);
> > > + case BUS_NOTIFY_DEL_DEVICE:
> > > + del_device(dev);
> > > + return 0;
> > > + default:
> > > + return 0;
> > > + }
> > > +}
> > > +
> > > +static struct notifier_block tce_iommu_bus_nb = {
> > > + .notifier_call = iommu_bus_notifier, };
> > > +
> > > +static void group_release(void *iommu_data) {
> > > + struct iommu_table *tbl = iommu_data;
> > > + tbl->it_group = NULL;
> > > +}
> > > +
> > > +static int __init tce_iommu_init(void) {
> > > + struct pci_dev *pdev = NULL;
> > > + struct iommu_table *tbl;
> > > + struct iommu_group *grp;
> > > +
> > > + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >
> > There's already a notifier in the iommu code if you were to register an
> > iommu_ops with the add/remove_device entries. That would allow you to
> > remove the notifier block and notifier function below and the second loop
> > below. Are you avoiding that to avoid the rest of iommu_ops?
> >
> [Sethi Varun-B16395] Could be one reason, also they are associating
> the iommu group with the tce table entry and not the device.

That's fine, the tce is the level they claim to have isolation.

>
> > Also, shouldn't this notifier only be registered after the first loop
> > below? Otherwise ADD_DEVICE could race with setting up groups, which we
> > assume are present in the add_device() above.
> [Sethi Varun-B16395] Isn't this similar to how how the notifier is
> registered in iommu_bus_init? First a notifier is registered and then
> we check for devices that have already been probed.

It's not quite the same because the existing notifier callbacks to add
devices also creates groups as necessary. My point here was that we
register a notifier that assumes a group prior to setting up the groups.
In the existing code the order doesn't matter so much because the system
isn't susceptible to hot device adds at that point. That's likely the
case here too, but registering a notifier before setting up the data the
callback references seems unnecessary. Thanks,

Alex

2012-11-26 15:19:02

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv platform

On Fri, 2012-11-23 at 13:02 +1100, Alexey Kardashevskiy wrote:
> On 22/11/12 22:56, Sethi Varun-B16395 wrote:
> >
> >
> >> -----Original Message-----
> >> From: [email protected] [mailto:linux-kernel-
> >> [email protected]] On Behalf Of Alex Williamson
> >> Sent: Tuesday, November 20, 2012 11:50 PM
> >> To: Alexey Kardashevskiy
> >> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
> >> [email protected]; [email protected]; [email protected];
> >> David Gibson
> >> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
> >> platform
> >>
> >> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> >>> VFIO implements platform independent stuff such as a PCI driver, BAR
> >>> access (via read/write on a file descriptor or direct mapping when
> >>> possible) and IRQ signaling.
> >>> The platform dependent part includes IOMMU initialization and
> >>> handling.
> >>>
> >>> This patch initializes IOMMU groups based on the IOMMU configuration
> >>> discovered during the PCI scan, only POWERNV platform is supported at
> >>> the moment.
> >>>
> >>> Also the patch implements an VFIO-IOMMU driver which manages DMA
> >>> mapping/unmapping requests coming from the client (now QEMU). It also
> >>> returns a DMA window information to let the guest initialize the
> >>> device tree for a guest OS properly. Although this driver has been
> >>> tested only on POWERNV, it should work on any platform supporting TCE
> >>> tables.
> >>>
> >>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
> >>>
> >>> Cc: David Gibson <[email protected]>
> >>> Signed-off-by: Alexey Kardashevskiy <[email protected]>
> >>> ---
> >>> arch/powerpc/include/asm/iommu.h | 6 +
> >>> arch/powerpc/kernel/iommu.c | 140 +++++++++++++++++++
> >>> arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++
> >>> drivers/iommu/Kconfig | 8 ++
> >>> drivers/vfio/Kconfig | 6 +
> >>> drivers/vfio/Makefile | 1 +
> >>> drivers/vfio/vfio_iommu_spapr_tce.c | 247
> >> ++++++++++++++++++++++++++++++++++
> >>> include/linux/vfio.h | 20 +++
> >>> 8 files changed, 563 insertions(+)
> >>> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>>
> >>> diff --git a/arch/powerpc/include/asm/iommu.h
> >>> b/arch/powerpc/include/asm/iommu.h
> >>> index cbfe678..5ba66cb 100644
> >>> --- a/arch/powerpc/include/asm/iommu.h
> >>> +++ b/arch/powerpc/include/asm/iommu.h
> >>> @@ -64,30 +64,33 @@ struct iommu_pool { }
> >>> ____cacheline_aligned_in_smp;
> >>>
> >>> struct iommu_table {
> >>> unsigned long it_busno; /* Bus number this table belongs to */
> >>> unsigned long it_size; /* Size of iommu table in entries */
> >>> unsigned long it_offset; /* Offset into global table */
> >>> unsigned long it_base; /* mapped address of tce table */
> >>> unsigned long it_index; /* which iommu table this is */
> >>> unsigned long it_type; /* type: PCI or Virtual Bus */
> >>> unsigned long it_blocksize; /* Entries in each block (cacheline)
> >> */
> >>> unsigned long poolsize;
> >>> unsigned long nr_pools;
> >>> struct iommu_pool large_pool;
> >>> struct iommu_pool pools[IOMMU_NR_POOLS];
> >>> unsigned long *it_map; /* A simple allocation bitmap for now
> >> */
> >>> +#ifdef CONFIG_IOMMU_API
> >>> + struct iommu_group *it_group;
> >>> +#endif
> >>> };
> >>>
> >>> struct scatterlist;
> >>>
> >>> static inline void set_iommu_table_base(struct device *dev, void
> >>> *base) {
> >>> dev->archdata.dma_data.iommu_table_base = base; }
> >>>
> >>> static inline void *get_iommu_table_base(struct device *dev) {
> >>> return dev->archdata.dma_data.iommu_table_base;
> >>> }
> >>>
> >>> /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
> >>> static inline void pci_iommu_init(void) { } extern void
> >>> alloc_dart_table(void); #if defined(CONFIG_PPC64) &&
> >>> defined(CONFIG_PM) static inline void iommu_save(void) {
> >>> if (ppc_md.iommu_save)
> >>> ppc_md.iommu_save();
> >>> }
> >>>
> >>> static inline void iommu_restore(void) {
> >>> if (ppc_md.iommu_restore)
> >>> ppc_md.iommu_restore();
> >>> }
> >>> #endif
> >>>
> >>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
> >> entry, uint64_t tce,
> >>> + enum dma_data_direction direction, unsigned long pages);
> >>> +
> >>> #endif /* __KERNEL__ */
> >>> #endif /* _ASM_IOMMU_H */
> >>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >>> index ff5a6ce..94f614b 100644
> >>> --- a/arch/powerpc/kernel/iommu.c
> >>> +++ b/arch/powerpc/kernel/iommu.c
> >>> @@ -32,30 +32,31 @@
> >>> #include <linux/dma-mapping.h>
> >>> #include <linux/bitmap.h>
> >>> #include <linux/iommu-helper.h>
> >>> #include <linux/crash_dump.h>
> >>> #include <linux/hash.h>
> >>> #include <linux/fault-inject.h>
> >>> #include <linux/pci.h>
> >>> #include <asm/io.h>
> >>> #include <asm/prom.h>
> >>> #include <asm/iommu.h>
> >>> #include <asm/pci-bridge.h>
> >>> #include <asm/machdep.h>
> >>> #include <asm/kdump.h>
> >>> #include <asm/fadump.h>
> >>> #include <asm/vio.h>
> >>> +#include <asm/tce.h>
> >>>
> >>> #define DBG(...)
> >>>
> >>> static int novmerge;
> >>>
> >>> static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
> >>> int);
> >>>
> >>> static int __init setup_iommu(char *str) {
> >>> if (!strcmp(str, "novmerge"))
> >>> novmerge = 1;
> >>> else if (!strcmp(str, "vmerge"))
> >>> novmerge = 0;
> >>> return 1;
> >>> }
> >>> @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
> >>> struct iommu_table *tbl, }
> >>>
> >>> void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> >>> void *vaddr, dma_addr_t dma_handle) {
> >>> if (tbl) {
> >>> unsigned int nio_pages;
> >>>
> >>> size = PAGE_ALIGN(size);
> >>> nio_pages = size >> IOMMU_PAGE_SHIFT;
> >>> iommu_free(tbl, dma_handle, nio_pages);
> >>> size = PAGE_ALIGN(size);
> >>> free_pages((unsigned long)vaddr, get_order(size));
> >>> }
> >>> }
> >>> +
> >>> +#ifdef CONFIG_IOMMU_API
> >>> +/*
> >>> + * SPAPR TCE API
> >>> + */
> >>> +static struct page *free_tce(struct iommu_table *tbl, unsigned long
> >>> +entry) {
> >>> + struct page *page = NULL;
> >>
> >> NULL initialization doesn't appear to be necessary
> >>
> >>> + unsigned long oldtce;
> >>> +
> >>> + oldtce = ppc_md.tce_get(tbl, entry);
> >>> +
> >>> + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> >>> + return NULL;
> >>> +
> >>> + page = pfn_to_page(oldtce >> PAGE_SHIFT);
> >>> +
> >>> + WARN_ON(!page);
> >>> + if (page && (oldtce & TCE_PCI_WRITE))
> >>> + SetPageDirty(page);
> >>> + ppc_md.tce_free(tbl, entry, 1);
> >>> +
> >>> + return page;
> >>> +}
> >>> +
> >>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> >>> + uint64_t tce, enum dma_data_direction direction) {
> >>> + int ret;
> >>> + struct page *page = NULL;
> >>> + unsigned long kva, offset;
> >>> +
> >>> + /* Map new TCE */
> >>> + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> >>> + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> >>> + direction != DMA_TO_DEVICE, &page);
> >>> + if (ret < 1) {
> >>> + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
> >> tce=%llx ioba=%lx ret=%d\n",
> >>> + tce, entry << IOMMU_PAGE_SHIFT, ret);
> >>> + if (!ret)
> >>> + ret = -EFAULT;
> >>
> >> Missing return ret? Otherwise we've got some bogus uses of page below
> >> and we're setting ret for no reason here.
> >>
> >>> + }
> >>> +
> >>> + kva = (unsigned long) page_address(page);
> >>> + kva += offset;
> >>> +
> >>> + /* tce_build receives a virtual address */
> >>> + entry += tbl->it_offset; /* Offset into real TCE table */
> >>> + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> >>> +
> >>> + /* tce_build() only returns non-zero for transient errors */
> >>> + if (unlikely(ret)) {
> >>> + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
> >> ioba=%lx kva=%lx ret=%d\n",
> >>> + tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> >>> + put_page(page);
> >>> + return -EIO;
> >>> + }
> >>> +
> >>> + return 0;
> >>> +}
> >>> +
> >>> +static void tce_flush(struct iommu_table *tbl) {
> >>> + /* Flush/invalidate TLB caches if necessary */
> >>> + if (ppc_md.tce_flush)
> >>> + ppc_md.tce_flush(tbl);
> >>> +
> >>> + /* Make sure updates are seen by hardware */
> >>> + mb();
> >>> +}
> >>> +
> >>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> >> uint64_t tce,
> >>> + enum dma_data_direction direction, unsigned long pages) {
> >>> + int i, ret = 0, pages_to_put = 0;
> >>> + struct page *page;
> >>> + struct iommu_pool *pool = get_pool(tbl, entry);
> >>> + struct page **oldpages;
> >>> + const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> >>> +
> >>> + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> >>> +
> >>> + /* Handle a single page request without allocation
> >>> + of pages-to-release array */
> >>> + if (pages == 1) {
> >>> + spin_lock(&(pool->lock));
> >>> + page = free_tce(tbl, entry);
> >>> +
> >>> + if (direction != DMA_NONE)
> >>> + ret = put_tce(tbl, entry, tce, direction);
> >>> +
> >>> + tce_flush(tbl);
> >>> +
> >>> + if (page)
> >>> + put_page(page);
> >>> +
> >>> + spin_unlock(&(pool->lock));
> >>> + return ret;
> >>> + }
> >>> +
> >>> + /* Releasing multiple pages */
> >>> + /* Allocate an array for pages to be released after TCE table
> >>> + is updated */
> >>> + oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> >>> + if (!oldpages)
> >>> + return -ENOMEM;
> >>> +
> >>> + spin_lock(&(pool->lock));
> >>> +
> >>> + for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
> >> IOMMU_PAGE_SIZE) {
> >>> + page = free_tce(tbl, entry);
> >>> + if (page) {
> >>> + oldpages[pages_to_put] = page;
> >>> + ++pages_to_put;
> >>> + }
> >>> +
> >>> + if (direction != DMA_NONE)
> >>> + ret = put_tce(tbl, entry, tce, direction);
> >>> +
> >>> + /* Release old pages if we reached the end of oldpages[] or
> >>> + it is the last page or we are about to exit the loop */
> >>> + if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
> >> {
> >>> + tce_flush(tbl);
> >>
> >> Avoiding tce_flush() is the reason for all this extra overhead, right?
> >> I wonder if it'd be cleaner separating map vs unmap, where the map case
> >> can avoid the oldpages array... but that means inserting new mappings on
> >> top of old ones wouldn't put the pages.
>
>
> Yes, we do not want to loose pages if the guest forgot to unmap them.

Hmm, does that mean we're not actively clearing tce entries or somehow
disabling the iommu window when the iommu is released through vfio?

> >>> +
> >>> + /* Release pages after removing them from TCE table */
> >>> + while (pages_to_put) {
> >>> + --pages_to_put;
> >>> + put_page(oldpages[pages_to_put]);
> >>> + }
> >>> + }
> >>> + }
> >>> +
> >>> + spin_unlock(&(pool->lock));
> >>> + kfree(oldpages);
> >>> +
> >>> + return ret;
> >>> +}
> >>> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> >>> +#endif /* CONFIG_IOMMU_API */
> >>> diff --git a/arch/powerpc/platforms/powernv/pci.c
> >>> b/arch/powerpc/platforms/powernv/pci.c
> >>> index 05205cf..676f4d9 100644
> >>> --- a/arch/powerpc/platforms/powernv/pci.c
> >>> +++ b/arch/powerpc/platforms/powernv/pci.c
> >>> @@ -8,30 +8,31 @@
> >>> * This program is free software; you can redistribute it and/or
> >>> * modify it under the terms of the GNU General Public License
> >>> * as published by the Free Software Foundation; either version
> >>> * 2 of the License, or (at your option) any later version.
> >>> */
> >>>
> >>> #include <linux/kernel.h>
> >>> #include <linux/pci.h>
> >>> #include <linux/delay.h>
> >>> #include <linux/string.h>
> >>> #include <linux/init.h>
> >>> #include <linux/bootmem.h>
> >>> #include <linux/irq.h>
> >>> #include <linux/io.h>
> >>> #include <linux/msi.h>
> >>> +#include <linux/iommu.h>
> >>>
> >>> #include <asm/sections.h>
> >>> #include <asm/io.h>
> >>> #include <asm/prom.h>
> >>> #include <asm/pci-bridge.h>
> >>> #include <asm/machdep.h>
> >>> #include <asm/ppc-pci.h>
> >>> #include <asm/opal.h>
> >>> #include <asm/iommu.h>
> >>> #include <asm/tce.h>
> >>> #include <asm/abs_addr.h>
> >>> #include <asm/firmware.h>
> >>>
> >>> #include "powernv.h"
> >>> #include "pci.h"
> >>> @@ -601,15 +602,149 @@ void __init pnv_pci_init(void)
> >>> /* Configure IOMMU DMA hooks */
> >>> ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
> >>> ppc_md.tce_build = pnv_tce_build;
> >>> ppc_md.tce_free = pnv_tce_free;
> >>> ppc_md.tce_get = pnv_tce_get;
> >>> ppc_md.pci_probe_mode = pnv_pci_probe_mode;
> >>> set_pci_dma_ops(&dma_iommu_ops);
> >>>
> >>> /* Configure MSIs */
> >>> #ifdef CONFIG_PCI_MSI
> >>> ppc_md.msi_check_device = pnv_msi_check_device;
> >>> ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
> >>> ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs; #endif }
> >>> +
> >>> +#ifdef CONFIG_IOMMU_API
> >>> +/*
> >>> + * IOMMU groups support required by VFIO */ static int
> >>> +add_device(struct device *dev) {
> >>> + struct iommu_table *tbl;
> >>> + int ret = 0;
> >>> +
> >>> + if (WARN_ON(dev->iommu_group)) {
> >>> + printk(KERN_WARNING "tce_vfio: device %s is already in iommu
> >> group %d, skipping\n",
> >>> + dev->kobj.name,
> >>
> >> dev_name(dev)
> >>
> >>> + iommu_group_id(dev->iommu_group));
> >>> + return -EBUSY;
> >>> + }
> >>> +
> >>> + tbl = get_iommu_table_base(dev);
> >>> + if (!tbl) {
> >>> + pr_debug("tce_vfio: skipping device %s with no tbl\n",
> >>> + dev->kobj.name);
> >>> + return 0;
> >>> + }
> >>> +
> >>> + pr_debug("tce_vfio: adding %s to iommu group %d\n",
> >>> + dev->kobj.name, iommu_group_id(tbl->it_group));
> >>> +
> >>> + ret = iommu_group_add_device(tbl->it_group, dev);
> >>> + if (ret < 0)
> >>> + printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> >>> + dev->kobj.name, ret);
> >>> +
> >>> + return ret;
> >>> +}
> >>> +
> >>> +static void del_device(struct device *dev) {
> >>> + iommu_group_remove_device(dev);
> >>> +}
> >>> +
> >>> +static int iommu_bus_notifier(struct notifier_block *nb,
> >>> + unsigned long action, void *data) {
> >>> + struct device *dev = data;
> >>> +
> >>> + switch (action) {
> >>> + case BUS_NOTIFY_ADD_DEVICE:
> >>> + return add_device(dev);
> >>> + case BUS_NOTIFY_DEL_DEVICE:
> >>> + del_device(dev);
> >>> + return 0;
> >>> + default:
> >>> + return 0;
> >>> + }
> >>> +}
> >>> +
> >>> +static struct notifier_block tce_iommu_bus_nb = {
> >>> + .notifier_call = iommu_bus_notifier, };
> >>> +
> >>> +static void group_release(void *iommu_data) {
> >>> + struct iommu_table *tbl = iommu_data;
> >>> + tbl->it_group = NULL;
> >>> +}
> >>> +
> >>> +static int __init tce_iommu_init(void) {
> >>> + struct pci_dev *pdev = NULL;
> >>> + struct iommu_table *tbl;
> >>> + struct iommu_group *grp;
> >>> +
> >>> + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >>
> >> There's already a notifier in the iommu code if you were to register an
> >> iommu_ops with the add/remove_device entries. That would allow you to
> >> remove the notifier block and notifier function below and the second loop
> >> below. Are you avoiding that to avoid the rest of iommu_ops?
>
> Yes. I need to implement either a small part of iommu_ops (especially the
> part which I think should not be there at all) or notifier, cannot how how
> the first is simpler.
>
>
> > [Sethi Varun-B16395] Could be one reason, also they are associating the iommu group with the tce table entry and not the device.
>
> Also true.
> I would actually allocate IOMMU groups right after we discovered the new
> one but this is done during PCI scan which works before
> subsys_initcall(iommu_init) is called so I added this first loop.
>
>
> >> Also, shouldn't this notifier only be registered after the first loop
> >> below? Otherwise ADD_DEVICE could race with setting up groups, which we
> >> assume are present in the add_device() above.
> > [Sethi Varun-B16395] Isn't this similar to how how the notifier is registered in iommu_bus_init? First a notifier is registered and then we check for devices that have already been probed.
>
> Yep. Not very familiar with this stuff but if it is done one way and it is
> already upstream, I cannot see why I should go another way :)

The existing notifier callback and loop should be able to operate in
parallel... of course they don't because we're not actively adding new
devices at the point where it's setup. IIRC, the notifier callback
blindly uses something that's not setup at the point it's registered.
That's a bit sloppy. Maybe I'm mis-remembering, I'll verify in your new
version. Thanks,

Alex


2012-11-26 18:04:48

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv platform

On Mon, 2012-11-26 at 08:18 -0700, Alex Williamson wrote:
> On Fri, 2012-11-23 at 13:02 +1100, Alexey Kardashevskiy wrote:
> > On 22/11/12 22:56, Sethi Varun-B16395 wrote:
> > >
> > >
> > >> -----Original Message-----
> > >> From: [email protected] [mailto:linux-kernel-
> > >> [email protected]] On Behalf Of Alex Williamson
> > >> Sent: Tuesday, November 20, 2012 11:50 PM
> > >> To: Alexey Kardashevskiy
> > >> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
> > >> [email protected]; [email protected]; [email protected];
> > >> David Gibson
> > >> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
> > >> platform
> > >>
> > >> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> > >>> VFIO implements platform independent stuff such as a PCI driver, BAR
> > >>> access (via read/write on a file descriptor or direct mapping when
> > >>> possible) and IRQ signaling.
> > >>> The platform dependent part includes IOMMU initialization and
> > >>> handling.
> > >>>
> > >>> This patch initializes IOMMU groups based on the IOMMU configuration
> > >>> discovered during the PCI scan, only POWERNV platform is supported at
> > >>> the moment.
> > >>>
> > >>> Also the patch implements an VFIO-IOMMU driver which manages DMA
> > >>> mapping/unmapping requests coming from the client (now QEMU). It also
> > >>> returns a DMA window information to let the guest initialize the
> > >>> device tree for a guest OS properly. Although this driver has been
> > >>> tested only on POWERNV, it should work on any platform supporting TCE
> > >>> tables.
> > >>>
> > >>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
> > >>>
> > >>> Cc: David Gibson <[email protected]>
> > >>> Signed-off-by: Alexey Kardashevskiy <[email protected]>
> > >>> ---
> > >>> arch/powerpc/include/asm/iommu.h | 6 +
> > >>> arch/powerpc/kernel/iommu.c | 140 +++++++++++++++++++
> > >>> arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++
> > >>> drivers/iommu/Kconfig | 8 ++
> > >>> drivers/vfio/Kconfig | 6 +
> > >>> drivers/vfio/Makefile | 1 +
> > >>> drivers/vfio/vfio_iommu_spapr_tce.c | 247
> > >> ++++++++++++++++++++++++++++++++++
> > >>> include/linux/vfio.h | 20 +++
> > >>> 8 files changed, 563 insertions(+)
> > >>> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> > >>>
> > >>> diff --git a/arch/powerpc/include/asm/iommu.h
> > >>> b/arch/powerpc/include/asm/iommu.h
> > >>> index cbfe678..5ba66cb 100644
> > >>> --- a/arch/powerpc/include/asm/iommu.h
> > >>> +++ b/arch/powerpc/include/asm/iommu.h
> > >>> @@ -64,30 +64,33 @@ struct iommu_pool { }
> > >>> ____cacheline_aligned_in_smp;
> > >>>
> > >>> struct iommu_table {
> > >>> unsigned long it_busno; /* Bus number this table belongs to */
> > >>> unsigned long it_size; /* Size of iommu table in entries */
> > >>> unsigned long it_offset; /* Offset into global table */
> > >>> unsigned long it_base; /* mapped address of tce table */
> > >>> unsigned long it_index; /* which iommu table this is */
> > >>> unsigned long it_type; /* type: PCI or Virtual Bus */
> > >>> unsigned long it_blocksize; /* Entries in each block (cacheline)
> > >> */
> > >>> unsigned long poolsize;
> > >>> unsigned long nr_pools;
> > >>> struct iommu_pool large_pool;
> > >>> struct iommu_pool pools[IOMMU_NR_POOLS];
> > >>> unsigned long *it_map; /* A simple allocation bitmap for now
> > >> */
> > >>> +#ifdef CONFIG_IOMMU_API
> > >>> + struct iommu_group *it_group;
> > >>> +#endif
> > >>> };
> > >>>
> > >>> struct scatterlist;
> > >>>
> > >>> static inline void set_iommu_table_base(struct device *dev, void
> > >>> *base) {
> > >>> dev->archdata.dma_data.iommu_table_base = base; }
> > >>>
> > >>> static inline void *get_iommu_table_base(struct device *dev) {
> > >>> return dev->archdata.dma_data.iommu_table_base;
> > >>> }
> > >>>
> > >>> /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
> > >>> static inline void pci_iommu_init(void) { } extern void
> > >>> alloc_dart_table(void); #if defined(CONFIG_PPC64) &&
> > >>> defined(CONFIG_PM) static inline void iommu_save(void) {
> > >>> if (ppc_md.iommu_save)
> > >>> ppc_md.iommu_save();
> > >>> }
> > >>>
> > >>> static inline void iommu_restore(void) {
> > >>> if (ppc_md.iommu_restore)
> > >>> ppc_md.iommu_restore();
> > >>> }
> > >>> #endif
> > >>>
> > >>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
> > >> entry, uint64_t tce,
> > >>> + enum dma_data_direction direction, unsigned long pages);
> > >>> +
> > >>> #endif /* __KERNEL__ */
> > >>> #endif /* _ASM_IOMMU_H */
> > >>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > >>> index ff5a6ce..94f614b 100644
> > >>> --- a/arch/powerpc/kernel/iommu.c
> > >>> +++ b/arch/powerpc/kernel/iommu.c
> > >>> @@ -32,30 +32,31 @@
> > >>> #include <linux/dma-mapping.h>
> > >>> #include <linux/bitmap.h>
> > >>> #include <linux/iommu-helper.h>
> > >>> #include <linux/crash_dump.h>
> > >>> #include <linux/hash.h>
> > >>> #include <linux/fault-inject.h>
> > >>> #include <linux/pci.h>
> > >>> #include <asm/io.h>
> > >>> #include <asm/prom.h>
> > >>> #include <asm/iommu.h>
> > >>> #include <asm/pci-bridge.h>
> > >>> #include <asm/machdep.h>
> > >>> #include <asm/kdump.h>
> > >>> #include <asm/fadump.h>
> > >>> #include <asm/vio.h>
> > >>> +#include <asm/tce.h>
> > >>>
> > >>> #define DBG(...)
> > >>>
> > >>> static int novmerge;
> > >>>
> > >>> static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
> > >>> int);
> > >>>
> > >>> static int __init setup_iommu(char *str) {
> > >>> if (!strcmp(str, "novmerge"))
> > >>> novmerge = 1;
> > >>> else if (!strcmp(str, "vmerge"))
> > >>> novmerge = 0;
> > >>> return 1;
> > >>> }
> > >>> @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
> > >>> struct iommu_table *tbl, }
> > >>>
> > >>> void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> > >>> void *vaddr, dma_addr_t dma_handle) {
> > >>> if (tbl) {
> > >>> unsigned int nio_pages;
> > >>>
> > >>> size = PAGE_ALIGN(size);
> > >>> nio_pages = size >> IOMMU_PAGE_SHIFT;
> > >>> iommu_free(tbl, dma_handle, nio_pages);
> > >>> size = PAGE_ALIGN(size);
> > >>> free_pages((unsigned long)vaddr, get_order(size));
> > >>> }
> > >>> }
> > >>> +
> > >>> +#ifdef CONFIG_IOMMU_API
> > >>> +/*
> > >>> + * SPAPR TCE API
> > >>> + */
> > >>> +static struct page *free_tce(struct iommu_table *tbl, unsigned long
> > >>> +entry) {
> > >>> + struct page *page = NULL;
> > >>
> > >> NULL initialization doesn't appear to be necessary
> > >>
> > >>> + unsigned long oldtce;
> > >>> +
> > >>> + oldtce = ppc_md.tce_get(tbl, entry);
> > >>> +
> > >>> + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> > >>> + return NULL;
> > >>> +
> > >>> + page = pfn_to_page(oldtce >> PAGE_SHIFT);
> > >>> +
> > >>> + WARN_ON(!page);
> > >>> + if (page && (oldtce & TCE_PCI_WRITE))
> > >>> + SetPageDirty(page);
> > >>> + ppc_md.tce_free(tbl, entry, 1);
> > >>> +
> > >>> + return page;
> > >>> +}
> > >>> +
> > >>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> > >>> + uint64_t tce, enum dma_data_direction direction) {
> > >>> + int ret;
> > >>> + struct page *page = NULL;
> > >>> + unsigned long kva, offset;
> > >>> +
> > >>> + /* Map new TCE */
> > >>> + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> > >>> + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> > >>> + direction != DMA_TO_DEVICE, &page);
> > >>> + if (ret < 1) {
> > >>> + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
> > >> tce=%llx ioba=%lx ret=%d\n",
> > >>> + tce, entry << IOMMU_PAGE_SHIFT, ret);
> > >>> + if (!ret)
> > >>> + ret = -EFAULT;
> > >>
> > >> Missing return ret? Otherwise we've got some bogus uses of page below
> > >> and we're setting ret for no reason here.
> > >>
> > >>> + }
> > >>> +
> > >>> + kva = (unsigned long) page_address(page);
> > >>> + kva += offset;
> > >>> +
> > >>> + /* tce_build receives a virtual address */
> > >>> + entry += tbl->it_offset; /* Offset into real TCE table */
> > >>> + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> > >>> +
> > >>> + /* tce_build() only returns non-zero for transient errors */
> > >>> + if (unlikely(ret)) {
> > >>> + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
> > >> ioba=%lx kva=%lx ret=%d\n",
> > >>> + tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> > >>> + put_page(page);
> > >>> + return -EIO;
> > >>> + }
> > >>> +
> > >>> + return 0;
> > >>> +}
> > >>> +
> > >>> +static void tce_flush(struct iommu_table *tbl) {
> > >>> + /* Flush/invalidate TLB caches if necessary */
> > >>> + if (ppc_md.tce_flush)
> > >>> + ppc_md.tce_flush(tbl);
> > >>> +
> > >>> + /* Make sure updates are seen by hardware */
> > >>> + mb();
> > >>> +}
> > >>> +
> > >>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> > >> uint64_t tce,
> > >>> + enum dma_data_direction direction, unsigned long pages) {
> > >>> + int i, ret = 0, pages_to_put = 0;
> > >>> + struct page *page;
> > >>> + struct iommu_pool *pool = get_pool(tbl, entry);
> > >>> + struct page **oldpages;
> > >>> + const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> > >>> +
> > >>> + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> > >>> +
> > >>> + /* Handle a single page request without allocation
> > >>> + of pages-to-release array */
> > >>> + if (pages == 1) {
> > >>> + spin_lock(&(pool->lock));
> > >>> + page = free_tce(tbl, entry);
> > >>> +
> > >>> + if (direction != DMA_NONE)
> > >>> + ret = put_tce(tbl, entry, tce, direction);
> > >>> +
> > >>> + tce_flush(tbl);
> > >>> +
> > >>> + if (page)
> > >>> + put_page(page);
> > >>> +
> > >>> + spin_unlock(&(pool->lock));
> > >>> + return ret;
> > >>> + }
> > >>> +
> > >>> + /* Releasing multiple pages */
> > >>> + /* Allocate an array for pages to be released after TCE table
> > >>> + is updated */
> > >>> + oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> > >>> + if (!oldpages)
> > >>> + return -ENOMEM;
> > >>> +
> > >>> + spin_lock(&(pool->lock));
> > >>> +
> > >>> + for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
> > >> IOMMU_PAGE_SIZE) {
> > >>> + page = free_tce(tbl, entry);
> > >>> + if (page) {
> > >>> + oldpages[pages_to_put] = page;
> > >>> + ++pages_to_put;
> > >>> + }
> > >>> +
> > >>> + if (direction != DMA_NONE)
> > >>> + ret = put_tce(tbl, entry, tce, direction);
> > >>> +
> > >>> + /* Release old pages if we reached the end of oldpages[] or
> > >>> + it is the last page or we are about to exit the loop */
> > >>> + if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
> > >> {
> > >>> + tce_flush(tbl);
> > >>
> > >> Avoiding tce_flush() is the reason for all this extra overhead, right?
> > >> I wonder if it'd be cleaner separating map vs unmap, where the map case
> > >> can avoid the oldpages array... but that means inserting new mappings on
> > >> top of old ones wouldn't put the pages.
> >
> >
> > Yes, we do not want to loose pages if the guest forgot to unmap them.
>
> Hmm, does that mean we're not actively clearing tce entries or somehow
> disabling the iommu window when the iommu is released through vfio?

Ok, I see tces are put on shutdown via tce_iommu_detach_group, so you're
more concerned about the guest simply mapping over top of it's own
mappings. Is that common? Is it common enough for every multi-page
mapping to assume it will happen? I know this is a performance
sensitive path for you and it seems like a map-only w/ fallback to
unmap, remap would be better in the general case.

On x86 we do exactly that, but we do the unmap, remap from userspace
when we get an EBUSY. Thanks,

Alex

2012-11-26 18:20:51

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO

On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
> VFIO implements platform independent stuff such as
> a PCI driver, BAR access (via read/write on a file descriptor
> or direct mapping when possible) and IRQ signaling.
>
> The platform dependent part includes IOMMU initialization
> and handling. This patch implements an IOMMU driver for VFIO
> which does mapping/unmapping pages for the guest IO and
> provides information about DMA window (required by a POWERPC
> guest).
>
> The counterpart in QEMU is required to support this functionality.
>
> Cc: David Gibson <[email protected]>
> Signed-off-by: Alexey Kardashevskiy <[email protected]>
> ---
> drivers/vfio/Kconfig | 6 +
> drivers/vfio/Makefile | 1 +
> drivers/vfio/vfio_iommu_spapr_tce.c | 247 +++++++++++++++++++++++++++++++++++
> include/linux/vfio.h | 20 +++
> 4 files changed, 274 insertions(+)
> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 7cd5dec..b464687 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
> depends on VFIO
> default n
>
> +config VFIO_IOMMU_SPAPR_TCE
> + tristate
> + depends on VFIO && SPAPR_TCE_IOMMU
> + default n
> +
> menuconfig VFIO
> tristate "VFIO Non-Privileged userspace driver framework"
> depends on IOMMU_API
> select VFIO_IOMMU_TYPE1 if X86
> + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> help
> VFIO provides a framework for secure userspace device drivers.
> See Documentation/vfio.txt for more details.
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 2398d4a..72bfabc 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,3 +1,4 @@
> obj-$(CONFIG_VFIO) += vfio.o
> obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> new file mode 100644
> index 0000000..46a6298
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -0,0 +1,247 @@
> +/*
> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> + *
> + * Copyright (C) 2012 IBM Corp. All rights reserved.
> + * Author: Alexey Kardashevskiy <[email protected]>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio_iommu_type1.c:
> + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
> + * Author: Alex Williamson <[email protected]>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/err.h>
> +#include <linux/vfio.h>
> +#include <asm/iommu.h>
> +
> +#define DRIVER_VERSION "0.1"
> +#define DRIVER_AUTHOR "[email protected]"
> +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> + struct iommu_group *iommu_group);
> +
> +/*
> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> + */
> +
> +/*
> + * The container descriptor supports only a single group per container.
> + * Required by the API as the container is not supplied with the IOMMU group
> + * at the moment of initialization.
> + */
> +struct tce_container {
> + struct mutex lock;
> + struct iommu_table *tbl;
> +};
> +
> +static void *tce_iommu_open(unsigned long arg)
> +{
> + struct tce_container *container;
> +
> + if (arg != VFIO_SPAPR_TCE_IOMMU) {
> + printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> + return ERR_PTR(-EINVAL);
> + }
> +
> + container = kzalloc(sizeof(*container), GFP_KERNEL);
> + if (!container)
> + return ERR_PTR(-ENOMEM);
> +
> + mutex_init(&container->lock);
> +
> + return container;
> +}
> +
> +static void tce_iommu_release(void *iommu_data)
> +{
> + struct tce_container *container = iommu_data;
> +
> + WARN_ON(container->tbl && !container->tbl->it_group);

I think your patch ordering is backwards here. it_group isn't added
until 2/2. I'd really like to see the arch/powerpc code approved and
merged by the powerpc maintainer before we add the code that makes use
of it into vfio. Otherwise we just get lots of churn if interfaces
change or they disapprove of it altogether.

> + if (container->tbl && container->tbl->it_group)
> + tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> +
> + mutex_destroy(&container->lock);
> +
> + kfree(container);
> +}
> +
> +static long tce_iommu_ioctl(void *iommu_data,
> + unsigned int cmd, unsigned long arg)
> +{
> + struct tce_container *container = iommu_data;
> + unsigned long minsz;
> +
> + switch (cmd) {
> + case VFIO_CHECK_EXTENSION: {
> + return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> + }
> + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> + struct vfio_iommu_spapr_tce_info info;
> + struct iommu_table *tbl = container->tbl;
> +
> + if (WARN_ON(!tbl))
> + return -ENXIO;
> +
> + minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> + dma64_window_size);
> +
> + if (copy_from_user(&info, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (info.argsz < minsz)
> + return -EINVAL;
> +
> + info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> + info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> + info.dma64_window_start = 0;
> + info.dma64_window_size = 0;
> + info.flags = 0;
> +
> + if (copy_to_user((void __user *)arg, &info, minsz))
> + return -EFAULT;
> +
> + return 0;
> + }
> + case VFIO_IOMMU_MAP_DMA: {
> + vfio_iommu_spapr_tce_dma_map param;
> + struct iommu_table *tbl = container->tbl;
> + enum dma_data_direction direction = DMA_NONE;
> +
> + if (WARN_ON(!tbl))
> + return -ENXIO;
> +
> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> +
> + if (copy_from_user(&param, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (param.argsz < minsz)
> + return -EINVAL;
> +
> + if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> + (param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
> + direction = DMA_BIDIRECTIONAL;
> + } else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
> + direction = DMA_TO_DEVICE;
> + } else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
> + direction = DMA_FROM_DEVICE;
> + }
> +
> + param.size += param.iova & ~IOMMU_PAGE_MASK;
> + param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);

On x86 we force iova, vaddr, and size to all be aligned to the smallest
page granularity of the iommu and return -EINVAL if it doesn't fit.
What does it imply to the user if they're always aligned to work here?
Won't this interface happily map overlapping entries with no indication
to the user that the previous mapping is no longer valid?

Maybe another reason why a combined unmap/map makes me nervous, we have
to assume the user knows what they're doing.

> +
> + return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> + param.vaddr & IOMMU_PAGE_MASK, direction,
> + param.size >> IOMMU_PAGE_SHIFT);
> + }
> + case VFIO_IOMMU_UNMAP_DMA: {
> + vfio_iommu_spapr_tce_dma_unmap param;
> + struct iommu_table *tbl = container->tbl;
> +
> + if (WARN_ON(!tbl))
> + return -ENXIO;
> +
> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> +
> + if (copy_from_user(&param, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (param.argsz < minsz)
> + return -EINVAL;
> +
> + param.size += param.iova & ~IOMMU_PAGE_MASK;
> + param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> +
> + return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> + 0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
> + }
> + default:
> + printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);

pr_warn

> + }
> +
> + return -ENOTTY;
> +}
> +
> +static int tce_iommu_attach_group(void *iommu_data,
> + struct iommu_group *iommu_group)
> +{
> + struct tce_container *container = iommu_data;
> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> + BUG_ON(!tbl);
> + mutex_lock(&container->lock);
> + pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> + iommu_group_id(iommu_group), iommu_group);
> + if (container->tbl) {
> + printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",

pr_warn

> + iommu_group_id(container->tbl->it_group),
> + iommu_group_id(iommu_group));
> + mutex_unlock(&container->lock);
> + return -EBUSY;
> + }
> +
> + container->tbl = tbl;

Would it be too much paranoia to clear all the tce here as you do below
on detach? ie. is there any risk that there's leftover programming?
x86 allocates a new domain on open of the iommu, so we always start out
clean.

> + mutex_unlock(&container->lock);
> +
> + return 0;
> +}
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> + struct iommu_group *iommu_group)
> +{
> + struct tce_container *container = iommu_data;
> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> + BUG_ON(!tbl);
> + mutex_lock(&container->lock);
> + if (tbl != container->tbl) {
> + printk(KERN_WARNING "tce_vfio: detaching group #%u, expected group is #%u\n",

pr_warn

> + iommu_group_id(iommu_group),
> + iommu_group_id(tbl->it_group));
> + } else {
> +
> + pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
> + iommu_group_id(iommu_group), iommu_group);
> +
> + iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);

So this cleans out any mappings when vfio is closed, good.

> + container->tbl = NULL;
> + }
> + mutex_unlock(&container->lock);
> +}
> +
> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
> + .name = "iommu-vfio-powerpc",
> + .owner = THIS_MODULE,
> + .open = tce_iommu_open,
> + .release = tce_iommu_release,
> + .ioctl = tce_iommu_ioctl,
> + .attach_group = tce_iommu_attach_group,
> + .detach_group = tce_iommu_detach_group,
> +};
> +
> +static int __init tce_iommu_init(void)
> +{
> + return vfio_register_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> + vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> +
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0a4f180..3ecd65c 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
> /* Extensions */
>
> #define VFIO_TYPE1_IOMMU 1
> +#define VFIO_SPAPR_TCE_IOMMU 2
>
> /*
> * The IOCTL interface is designed for extensibility by embedding the
> @@ -442,4 +443,23 @@ struct vfio_iommu_type1_dma_unmap {
>
> #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>
> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> +
> +struct vfio_iommu_spapr_tce_info {
> + __u32 argsz;
> + __u32 flags;
> + __u32 dma32_window_start;
> + __u32 dma32_window_size;
> + __u64 dma64_window_start;
> + __u64 dma64_window_size;
> +};

Is there anything we can document about this? It should probably list
that size is in bytes. Is there any need to communicate the IOMMU page
size here?

> +
> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +/* Reuse type1 map/unmap structs as they are the same at the moment */
> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
> +
> +/* ***************************************************************** */
> +
> #endif /* VFIO_H */

Thanks,

Alex


2012-11-27 00:22:16

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv platform

On Mon, 2012-11-26 at 11:04 -0700, Alex Williamson wrote:
> Ok, I see tces are put on shutdown via tce_iommu_detach_group, so you're
> more concerned about the guest simply mapping over top of it's own
> mappings. Is that common? Is it common enough for every multi-page
> mapping to assume it will happen? I know this is a performance
> sensitive path for you and it seems like a map-only w/ fallback to
> unmap, remap would be better in the general case.
>
> On x86 we do exactly that, but we do the unmap, remap from userspace
> when we get an EBUSY. Thanks,

Right, Linux as guest at least will never map "over" an existing
mapping. It will always unmap first. IE. The only transition we do on
H_PUT_TCE are 0 -> valid and valid -> 0.

So it would be fine to simplify the code and keep the "map over map" as
a slow fallback. I can't tell for other operating systems but we don't
care about those at this point :-)

Cheers,
Ben.

2012-11-27 03:28:30

by Alexey Kardashevskiy

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv platform

On 27/11/12 05:04, Alex Williamson wrote:
> On Mon, 2012-11-26 at 08:18 -0700, Alex Williamson wrote:
>> On Fri, 2012-11-23 at 13:02 +1100, Alexey Kardashevskiy wrote:
>>> On 22/11/12 22:56, Sethi Varun-B16395 wrote:
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: [email protected] [mailto:linux-kernel-
>>>>> [email protected]] On Behalf Of Alex Williamson
>>>>> Sent: Tuesday, November 20, 2012 11:50 PM
>>>>> To: Alexey Kardashevskiy
>>>>> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
>>>>> [email protected]; [email protected]; [email protected];
>>>>> David Gibson
>>>>> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
>>>>> platform
>>>>>
>>>>> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
>>>>>> VFIO implements platform independent stuff such as a PCI driver, BAR
>>>>>> access (via read/write on a file descriptor or direct mapping when
>>>>>> possible) and IRQ signaling.
>>>>>> The platform dependent part includes IOMMU initialization and
>>>>>> handling.
>>>>>>
>>>>>> This patch initializes IOMMU groups based on the IOMMU configuration
>>>>>> discovered during the PCI scan, only POWERNV platform is supported at
>>>>>> the moment.
>>>>>>
>>>>>> Also the patch implements an VFIO-IOMMU driver which manages DMA
>>>>>> mapping/unmapping requests coming from the client (now QEMU). It also
>>>>>> returns a DMA window information to let the guest initialize the
>>>>>> device tree for a guest OS properly. Although this driver has been
>>>>>> tested only on POWERNV, it should work on any platform supporting TCE
>>>>>> tables.
>>>>>>
>>>>>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
>>>>>>
>>>>>> Cc: David Gibson <[email protected]>
>>>>>> Signed-off-by: Alexey Kardashevskiy <[email protected]>
>>>>>> ---
>>>>>> arch/powerpc/include/asm/iommu.h | 6 +
>>>>>> arch/powerpc/kernel/iommu.c | 140 +++++++++++++++++++
>>>>>> arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++
>>>>>> drivers/iommu/Kconfig | 8 ++
>>>>>> drivers/vfio/Kconfig | 6 +
>>>>>> drivers/vfio/Makefile | 1 +
>>>>>> drivers/vfio/vfio_iommu_spapr_tce.c | 247
>>>>> ++++++++++++++++++++++++++++++++++
>>>>>> include/linux/vfio.h | 20 +++
>>>>>> 8 files changed, 563 insertions(+)
>>>>>> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>>>>>
>>>>>> diff --git a/arch/powerpc/include/asm/iommu.h
>>>>>> b/arch/powerpc/include/asm/iommu.h
>>>>>> index cbfe678..5ba66cb 100644
>>>>>> --- a/arch/powerpc/include/asm/iommu.h
>>>>>> +++ b/arch/powerpc/include/asm/iommu.h
>>>>>> @@ -64,30 +64,33 @@ struct iommu_pool { }
>>>>>> ____cacheline_aligned_in_smp;
>>>>>>
>>>>>> struct iommu_table {
>>>>>> unsigned long it_busno; /* Bus number this table belongs to */
>>>>>> unsigned long it_size; /* Size of iommu table in entries */
>>>>>> unsigned long it_offset; /* Offset into global table */
>>>>>> unsigned long it_base; /* mapped address of tce table */
>>>>>> unsigned long it_index; /* which iommu table this is */
>>>>>> unsigned long it_type; /* type: PCI or Virtual Bus */
>>>>>> unsigned long it_blocksize; /* Entries in each block (cacheline)
>>>>> */
>>>>>> unsigned long poolsize;
>>>>>> unsigned long nr_pools;
>>>>>> struct iommu_pool large_pool;
>>>>>> struct iommu_pool pools[IOMMU_NR_POOLS];
>>>>>> unsigned long *it_map; /* A simple allocation bitmap for now
>>>>> */
>>>>>> +#ifdef CONFIG_IOMMU_API
>>>>>> + struct iommu_group *it_group;
>>>>>> +#endif
>>>>>> };
>>>>>>
>>>>>> struct scatterlist;
>>>>>>
>>>>>> static inline void set_iommu_table_base(struct device *dev, void
>>>>>> *base) {
>>>>>> dev->archdata.dma_data.iommu_table_base = base; }
>>>>>>
>>>>>> static inline void *get_iommu_table_base(struct device *dev) {
>>>>>> return dev->archdata.dma_data.iommu_table_base;
>>>>>> }
>>>>>>
>>>>>> /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
>>>>>> static inline void pci_iommu_init(void) { } extern void
>>>>>> alloc_dart_table(void); #if defined(CONFIG_PPC64) &&
>>>>>> defined(CONFIG_PM) static inline void iommu_save(void) {
>>>>>> if (ppc_md.iommu_save)
>>>>>> ppc_md.iommu_save();
>>>>>> }
>>>>>>
>>>>>> static inline void iommu_restore(void) {
>>>>>> if (ppc_md.iommu_restore)
>>>>>> ppc_md.iommu_restore();
>>>>>> }
>>>>>> #endif
>>>>>>
>>>>>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
>>>>> entry, uint64_t tce,
>>>>>> + enum dma_data_direction direction, unsigned long pages);
>>>>>> +
>>>>>> #endif /* __KERNEL__ */
>>>>>> #endif /* _ASM_IOMMU_H */
>>>>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>>>>>> index ff5a6ce..94f614b 100644
>>>>>> --- a/arch/powerpc/kernel/iommu.c
>>>>>> +++ b/arch/powerpc/kernel/iommu.c
>>>>>> @@ -32,30 +32,31 @@
>>>>>> #include <linux/dma-mapping.h>
>>>>>> #include <linux/bitmap.h>
>>>>>> #include <linux/iommu-helper.h>
>>>>>> #include <linux/crash_dump.h>
>>>>>> #include <linux/hash.h>
>>>>>> #include <linux/fault-inject.h>
>>>>>> #include <linux/pci.h>
>>>>>> #include <asm/io.h>
>>>>>> #include <asm/prom.h>
>>>>>> #include <asm/iommu.h>
>>>>>> #include <asm/pci-bridge.h>
>>>>>> #include <asm/machdep.h>
>>>>>> #include <asm/kdump.h>
>>>>>> #include <asm/fadump.h>
>>>>>> #include <asm/vio.h>
>>>>>> +#include <asm/tce.h>
>>>>>>
>>>>>> #define DBG(...)
>>>>>>
>>>>>> static int novmerge;
>>>>>>
>>>>>> static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
>>>>>> int);
>>>>>>
>>>>>> static int __init setup_iommu(char *str) {
>>>>>> if (!strcmp(str, "novmerge"))
>>>>>> novmerge = 1;
>>>>>> else if (!strcmp(str, "vmerge"))
>>>>>> novmerge = 0;
>>>>>> return 1;
>>>>>> }
>>>>>> @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
>>>>>> struct iommu_table *tbl, }
>>>>>>
>>>>>> void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>>>>>> void *vaddr, dma_addr_t dma_handle) {
>>>>>> if (tbl) {
>>>>>> unsigned int nio_pages;
>>>>>>
>>>>>> size = PAGE_ALIGN(size);
>>>>>> nio_pages = size >> IOMMU_PAGE_SHIFT;
>>>>>> iommu_free(tbl, dma_handle, nio_pages);
>>>>>> size = PAGE_ALIGN(size);
>>>>>> free_pages((unsigned long)vaddr, get_order(size));
>>>>>> }
>>>>>> }
>>>>>> +
>>>>>> +#ifdef CONFIG_IOMMU_API
>>>>>> +/*
>>>>>> + * SPAPR TCE API
>>>>>> + */
>>>>>> +static struct page *free_tce(struct iommu_table *tbl, unsigned long
>>>>>> +entry) {
>>>>>> + struct page *page = NULL;
>>>>>
>>>>> NULL initialization doesn't appear to be necessary
>>>>>
>>>>>> + unsigned long oldtce;
>>>>>> +
>>>>>> + oldtce = ppc_md.tce_get(tbl, entry);
>>>>>> +
>>>>>> + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
>>>>>> + return NULL;
>>>>>> +
>>>>>> + page = pfn_to_page(oldtce >> PAGE_SHIFT);
>>>>>> +
>>>>>> + WARN_ON(!page);
>>>>>> + if (page && (oldtce & TCE_PCI_WRITE))
>>>>>> + SetPageDirty(page);
>>>>>> + ppc_md.tce_free(tbl, entry, 1);
>>>>>> +
>>>>>> + return page;
>>>>>> +}
>>>>>> +
>>>>>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
>>>>>> + uint64_t tce, enum dma_data_direction direction) {
>>>>>> + int ret;
>>>>>> + struct page *page = NULL;
>>>>>> + unsigned long kva, offset;
>>>>>> +
>>>>>> + /* Map new TCE */
>>>>>> + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
>>>>>> + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>>>>>> + direction != DMA_TO_DEVICE, &page);
>>>>>> + if (ret < 1) {
>>>>>> + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
>>>>> tce=%llx ioba=%lx ret=%d\n",
>>>>>> + tce, entry << IOMMU_PAGE_SHIFT, ret);
>>>>>> + if (!ret)
>>>>>> + ret = -EFAULT;
>>>>>
>>>>> Missing return ret? Otherwise we've got some bogus uses of page below
>>>>> and we're setting ret for no reason here.
>>>>>
>>>>>> + }
>>>>>> +
>>>>>> + kva = (unsigned long) page_address(page);
>>>>>> + kva += offset;
>>>>>> +
>>>>>> + /* tce_build receives a virtual address */
>>>>>> + entry += tbl->it_offset; /* Offset into real TCE table */
>>>>>> + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
>>>>>> +
>>>>>> + /* tce_build() only returns non-zero for transient errors */
>>>>>> + if (unlikely(ret)) {
>>>>>> + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
>>>>> ioba=%lx kva=%lx ret=%d\n",
>>>>>> + tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
>>>>>> + put_page(page);
>>>>>> + return -EIO;
>>>>>> + }
>>>>>> +
>>>>>> + return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static void tce_flush(struct iommu_table *tbl) {
>>>>>> + /* Flush/invalidate TLB caches if necessary */
>>>>>> + if (ppc_md.tce_flush)
>>>>>> + ppc_md.tce_flush(tbl);
>>>>>> +
>>>>>> + /* Make sure updates are seen by hardware */
>>>>>> + mb();
>>>>>> +}
>>>>>> +
>>>>>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
>>>>> uint64_t tce,
>>>>>> + enum dma_data_direction direction, unsigned long pages) {
>>>>>> + int i, ret = 0, pages_to_put = 0;
>>>>>> + struct page *page;
>>>>>> + struct iommu_pool *pool = get_pool(tbl, entry);
>>>>>> + struct page **oldpages;
>>>>>> + const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
>>>>>> +
>>>>>> + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
>>>>>> +
>>>>>> + /* Handle a single page request without allocation
>>>>>> + of pages-to-release array */
>>>>>> + if (pages == 1) {
>>>>>> + spin_lock(&(pool->lock));
>>>>>> + page = free_tce(tbl, entry);
>>>>>> +
>>>>>> + if (direction != DMA_NONE)
>>>>>> + ret = put_tce(tbl, entry, tce, direction);
>>>>>> +
>>>>>> + tce_flush(tbl);
>>>>>> +
>>>>>> + if (page)
>>>>>> + put_page(page);
>>>>>> +
>>>>>> + spin_unlock(&(pool->lock));
>>>>>> + return ret;
>>>>>> + }
>>>>>> +
>>>>>> + /* Releasing multiple pages */
>>>>>> + /* Allocate an array for pages to be released after TCE table
>>>>>> + is updated */
>>>>>> + oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
>>>>>> + if (!oldpages)
>>>>>> + return -ENOMEM;
>>>>>> +
>>>>>> + spin_lock(&(pool->lock));
>>>>>> +
>>>>>> + for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
>>>>> IOMMU_PAGE_SIZE) {
>>>>>> + page = free_tce(tbl, entry);
>>>>>> + if (page) {
>>>>>> + oldpages[pages_to_put] = page;
>>>>>> + ++pages_to_put;
>>>>>> + }
>>>>>> +
>>>>>> + if (direction != DMA_NONE)
>>>>>> + ret = put_tce(tbl, entry, tce, direction);
>>>>>> +
>>>>>> + /* Release old pages if we reached the end of oldpages[] or
>>>>>> + it is the last page or we are about to exit the loop */
>>>>>> + if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
>>>>> {
>>>>>> + tce_flush(tbl);
>>>>>
>>>>> Avoiding tce_flush() is the reason for all this extra overhead, right?
>>>>> I wonder if it'd be cleaner separating map vs unmap, where the map case
>>>>> can avoid the oldpages array... but that means inserting new mappings on
>>>>> top of old ones wouldn't put the pages.
>>>
>>>
>>> Yes, we do not want to loose pages if the guest forgot to unmap them.
>>
>> Hmm, does that mean we're not actively clearing tce entries or somehow
>> disabling the iommu window when the iommu is released through vfio?
>
> Ok, I see tces are put on shutdown via tce_iommu_detach_group, so you're
> more concerned about the guest simply mapping over top of it's own
> mappings. Is that common? Is it common enough for every multi-page
> mapping to assume it will happen? I know this is a performance
> sensitive path for you and it seems like a map-only w/ fallback to
> unmap, remap would be better in the general case.


I do not get it. Where exactly does the performance suffer?
iommu_put_tces() with non zero "tce" (i.e. "map") has to check if the entry
is not used, at least to return EBUSY when it is, and this check is
performed. If it is zero, there is no overhead at all. And it is going to
be the 99.(9)% case as the guest (un)maps one page per call.

Generally speaking we want to move "put tce" completely to the kernel for
the (much) better performance and vfio won't be dealing with it all.

We already agreed that SPAPR TCE driver uses x86 (aka type1) API but I do
not see why the powerpc implementation should look x86 alike as it still
operates with powerpc machine dependent callbacks so the reader has to have
some powerpc knowledge.


> On x86 we do exactly that, but we do the unmap, remap from userspace
> when we get an EBUSY. Thanks,



--
Alexey

2012-11-27 04:06:25

by Alexey Kardashevskiy

[permalink] [raw]
Subject: Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO

On 27/11/12 05:20, Alex Williamson wrote:
> On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
>> VFIO implements platform independent stuff such as
>> a PCI driver, BAR access (via read/write on a file descriptor
>> or direct mapping when possible) and IRQ signaling.
>>
>> The platform dependent part includes IOMMU initialization
>> and handling. This patch implements an IOMMU driver for VFIO
>> which does mapping/unmapping pages for the guest IO and
>> provides information about DMA window (required by a POWERPC
>> guest).
>>
>> The counterpart in QEMU is required to support this functionality.
>>
>> Cc: David Gibson <[email protected]>
>> Signed-off-by: Alexey Kardashevskiy <[email protected]>
>> ---
>> drivers/vfio/Kconfig | 6 +
>> drivers/vfio/Makefile | 1 +
>> drivers/vfio/vfio_iommu_spapr_tce.c | 247 +++++++++++++++++++++++++++++++++++
>> include/linux/vfio.h | 20 +++
>> 4 files changed, 274 insertions(+)
>> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>
>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
>> index 7cd5dec..b464687 100644
>> --- a/drivers/vfio/Kconfig
>> +++ b/drivers/vfio/Kconfig
>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>> depends on VFIO
>> default n
>>
>> +config VFIO_IOMMU_SPAPR_TCE
>> + tristate
>> + depends on VFIO && SPAPR_TCE_IOMMU
>> + default n
>> +
>> menuconfig VFIO
>> tristate "VFIO Non-Privileged userspace driver framework"
>> depends on IOMMU_API
>> select VFIO_IOMMU_TYPE1 if X86
>> + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>> help
>> VFIO provides a framework for secure userspace device drivers.
>> See Documentation/vfio.txt for more details.
>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
>> index 2398d4a..72bfabc 100644
>> --- a/drivers/vfio/Makefile
>> +++ b/drivers/vfio/Makefile
>> @@ -1,3 +1,4 @@
>> obj-$(CONFIG_VFIO) += vfio.o
>> obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>> obj-$(CONFIG_VFIO_PCI) += pci/
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> new file mode 100644
>> index 0000000..46a6298
>> --- /dev/null
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -0,0 +1,247 @@
>> +/*
>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
>> + *
>> + * Copyright (C) 2012 IBM Corp. All rights reserved.
>> + * Author: Alexey Kardashevskiy <[email protected]>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + *
>> + * Derived from original vfio_iommu_type1.c:
>> + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
>> + * Author: Alex Williamson <[email protected]>
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/pci.h>
>> +#include <linux/slab.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/err.h>
>> +#include <linux/vfio.h>
>> +#include <asm/iommu.h>
>> +
>> +#define DRIVER_VERSION "0.1"
>> +#define DRIVER_AUTHOR "[email protected]"
>> +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
>> +
>> +static void tce_iommu_detach_group(void *iommu_data,
>> + struct iommu_group *iommu_group);
>> +
>> +/*
>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
>> + */
>> +
>> +/*
>> + * The container descriptor supports only a single group per container.
>> + * Required by the API as the container is not supplied with the IOMMU group
>> + * at the moment of initialization.
>> + */
>> +struct tce_container {
>> + struct mutex lock;
>> + struct iommu_table *tbl;
>> +};
>> +
>> +static void *tce_iommu_open(unsigned long arg)
>> +{
>> + struct tce_container *container;
>> +
>> + if (arg != VFIO_SPAPR_TCE_IOMMU) {
>> + printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
>> + return ERR_PTR(-EINVAL);
>> + }
>> +
>> + container = kzalloc(sizeof(*container), GFP_KERNEL);
>> + if (!container)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + mutex_init(&container->lock);
>> +
>> + return container;
>> +}
>> +
>> +static void tce_iommu_release(void *iommu_data)
>> +{
>> + struct tce_container *container = iommu_data;
>> +
>> + WARN_ON(container->tbl && !container->tbl->it_group);
>
> I think your patch ordering is backwards here. it_group isn't added
> until 2/2. I'd really like to see the arch/powerpc code approved and
> merged by the powerpc maintainer before we add the code that makes use
> of it into vfio. Otherwise we just get lots of churn if interfaces
> change or they disapprove of it altogether.


Makes sense, thanks.


>> + if (container->tbl && container->tbl->it_group)
>> + tce_iommu_detach_group(iommu_data, container->tbl->it_group);
>> +
>> + mutex_destroy(&container->lock);
>> +
>> + kfree(container);
>> +}
>> +
>> +static long tce_iommu_ioctl(void *iommu_data,
>> + unsigned int cmd, unsigned long arg)
>> +{
>> + struct tce_container *container = iommu_data;
>> + unsigned long minsz;
>> +
>> + switch (cmd) {
>> + case VFIO_CHECK_EXTENSION: {
>> + return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
>> + }
>> + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>> + struct vfio_iommu_spapr_tce_info info;
>> + struct iommu_table *tbl = container->tbl;
>> +
>> + if (WARN_ON(!tbl))
>> + return -ENXIO;
>> +
>> + minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
>> + dma64_window_size);
>> +
>> + if (copy_from_user(&info, (void __user *)arg, minsz))
>> + return -EFAULT;
>> +
>> + if (info.argsz < minsz)
>> + return -EINVAL;
>> +
>> + info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
>> + info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
>> + info.dma64_window_start = 0;
>> + info.dma64_window_size = 0;
>> + info.flags = 0;
>> +
>> + if (copy_to_user((void __user *)arg, &info, minsz))
>> + return -EFAULT;
>> +
>> + return 0;
>> + }
>> + case VFIO_IOMMU_MAP_DMA: {
>> + vfio_iommu_spapr_tce_dma_map param;
>> + struct iommu_table *tbl = container->tbl;
>> + enum dma_data_direction direction = DMA_NONE;
>> +
>> + if (WARN_ON(!tbl))
>> + return -ENXIO;
>> +
>> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
>> +
>> + if (copy_from_user(&param, (void __user *)arg, minsz))
>> + return -EFAULT;
>> +
>> + if (param.argsz < minsz)
>> + return -EINVAL;
>> +
>> + if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
>> + (param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
>> + direction = DMA_BIDIRECTIONAL;
>> + } else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
>> + direction = DMA_TO_DEVICE;
>> + } else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
>> + direction = DMA_FROM_DEVICE;
>> + }
>> +
>> + param.size += param.iova & ~IOMMU_PAGE_MASK;
>> + param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
>
> On x86 we force iova, vaddr, and size to all be aligned to the smallest
> page granularity of the iommu and return -EINVAL if it doesn't fit.
> What does it imply to the user if they're always aligned to work here?
> Won't this interface happily map overlapping entries with no indication
> to the user that the previous mapping is no longer valid?
> Maybe another reason why a combined unmap/map makes me nervous, we have
> to assume the user knows what they're doing.


I got used to guests which do know what they are doing so I am pretty calm :)
but ok, I'll move alignment to the QEMU, it makes sense.


>> +
>> + return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>> + param.vaddr & IOMMU_PAGE_MASK, direction,
>> + param.size >> IOMMU_PAGE_SHIFT);
>> + }
>> + case VFIO_IOMMU_UNMAP_DMA: {
>> + vfio_iommu_spapr_tce_dma_unmap param;
>> + struct iommu_table *tbl = container->tbl;
>> +
>> + if (WARN_ON(!tbl))
>> + return -ENXIO;
>> +
>> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
>> +
>> + if (copy_from_user(&param, (void __user *)arg, minsz))
>> + return -EFAULT;
>> +
>> + if (param.argsz < minsz)
>> + return -EINVAL;
>> +
>> + param.size += param.iova & ~IOMMU_PAGE_MASK;
>> + param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
>> +
>> + return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>> + 0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
>> + }
>> + default:
>> + printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
>
> pr_warn
>
>> + }
>> +
>> + return -ENOTTY;
>> +}
>> +
>> +static int tce_iommu_attach_group(void *iommu_data,
>> + struct iommu_group *iommu_group)
>> +{
>> + struct tce_container *container = iommu_data;
>> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> + BUG_ON(!tbl);
>> + mutex_lock(&container->lock);
>> + pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
>> + iommu_group_id(iommu_group), iommu_group);
>> + if (container->tbl) {
>> + printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
>
> pr_warn
>
>> + iommu_group_id(container->tbl->it_group),
>> + iommu_group_id(iommu_group));
>> + mutex_unlock(&container->lock);
>> + return -EBUSY;
>> + }
>> +
>> + container->tbl = tbl;
>
> Would it be too much paranoia to clear all the tce here as you do below
> on detach?

Guess so. I do unmap on detach() and the guest calls put_tce(0) (i.e.
unmaps) the whole DMA window at the boot time.


> ie. is there any risk that there's leftover programming?
> x86 allocates a new domain on open of the iommu, so we always start out
> clean.


>> + mutex_unlock(&container->lock);
>> +
>> + return 0;
>> +}
>> +
>> +static void tce_iommu_detach_group(void *iommu_data,
>> + struct iommu_group *iommu_group)
>> +{
>> + struct tce_container *container = iommu_data;
>> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> + BUG_ON(!tbl);
>> + mutex_lock(&container->lock);
>> + if (tbl != container->tbl) {
>> + printk(KERN_WARNING "tce_vfio: detaching group #%u, expected group is #%u\n",
>
> pr_warn
>
>> + iommu_group_id(iommu_group),
>> + iommu_group_id(tbl->it_group));
>> + } else {
>> +
>> + pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
>> + iommu_group_id(iommu_group), iommu_group);
>> +
>> + iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);
>
> So this cleans out any mappings when vfio is closed, good.
>
>> + container->tbl = NULL;
>> + }
>> + mutex_unlock(&container->lock);
>> +}
>> +
>> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
>> + .name = "iommu-vfio-powerpc",
>> + .owner = THIS_MODULE,
>> + .open = tce_iommu_open,
>> + .release = tce_iommu_release,
>> + .ioctl = tce_iommu_ioctl,
>> + .attach_group = tce_iommu_attach_group,
>> + .detach_group = tce_iommu_detach_group,
>> +};
>> +
>> +static int __init tce_iommu_init(void)
>> +{
>> + return vfio_register_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +static void __exit tce_iommu_cleanup(void)
>> +{
>> + vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +module_init(tce_iommu_init);
>> +module_exit(tce_iommu_cleanup);
>> +
>> +MODULE_VERSION(DRIVER_VERSION);
>> +MODULE_LICENSE("GPL v2");
>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>> +MODULE_DESCRIPTION(DRIVER_DESC);
>> +
>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>> index 0a4f180..3ecd65c 100644
>> --- a/include/linux/vfio.h
>> +++ b/include/linux/vfio.h
>> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>> /* Extensions */
>>
>> #define VFIO_TYPE1_IOMMU 1
>> +#define VFIO_SPAPR_TCE_IOMMU 2
>>
>> /*
>> * The IOCTL interface is designed for extensibility by embedding the
>> @@ -442,4 +443,23 @@ struct vfio_iommu_type1_dma_unmap {
>>
>> #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>>
>> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>> +
>> +struct vfio_iommu_spapr_tce_info {
>> + __u32 argsz;
>> + __u32 flags;
>> + __u32 dma32_window_start;
>> + __u32 dma32_window_size;
>> + __u64 dma64_window_start;
>> + __u64 dma64_window_size;
>> +};
>
> Is there anything we can document about this?

I'll put some.

> It should probably list that size is in bytes. Is there any need to communicate the IOMMU page
> size here?

It is always 4k. I'll put it to comments.

>> +
>> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>> +
>> +/* Reuse type1 map/unmap structs as they are the same at the moment */
>> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
>> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
>> +
>> +/* ***************************************************************** */
>> +
>> #endif /* VFIO_H */
>
> Thanks,
>
> Alex
>
>
>


--
Alexey

2012-11-27 04:23:25

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv platform

On Tue, 2012-11-27 at 14:28 +1100, Alexey Kardashevskiy wrote:
> On 27/11/12 05:04, Alex Williamson wrote:
> > On Mon, 2012-11-26 at 08:18 -0700, Alex Williamson wrote:
> >> On Fri, 2012-11-23 at 13:02 +1100, Alexey Kardashevskiy wrote:
> >>> On 22/11/12 22:56, Sethi Varun-B16395 wrote:
> >>>>
> >>>>
> >>>>> -----Original Message-----
> >>>>> From: [email protected] [mailto:linux-kernel-
> >>>>> [email protected]] On Behalf Of Alex Williamson
> >>>>> Sent: Tuesday, November 20, 2012 11:50 PM
> >>>>> To: Alexey Kardashevskiy
> >>>>> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
> >>>>> [email protected]; [email protected]; [email protected];
> >>>>> David Gibson
> >>>>> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
> >>>>> platform
> >>>>>
> >>>>> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> >>>>>> VFIO implements platform independent stuff such as a PCI driver, BAR
> >>>>>> access (via read/write on a file descriptor or direct mapping when
> >>>>>> possible) and IRQ signaling.
> >>>>>> The platform dependent part includes IOMMU initialization and
> >>>>>> handling.
> >>>>>>
> >>>>>> This patch initializes IOMMU groups based on the IOMMU configuration
> >>>>>> discovered during the PCI scan, only POWERNV platform is supported at
> >>>>>> the moment.
> >>>>>>
> >>>>>> Also the patch implements an VFIO-IOMMU driver which manages DMA
> >>>>>> mapping/unmapping requests coming from the client (now QEMU). It also
> >>>>>> returns a DMA window information to let the guest initialize the
> >>>>>> device tree for a guest OS properly. Although this driver has been
> >>>>>> tested only on POWERNV, it should work on any platform supporting TCE
> >>>>>> tables.
> >>>>>>
> >>>>>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
> >>>>>>
> >>>>>> Cc: David Gibson <[email protected]>
> >>>>>> Signed-off-by: Alexey Kardashevskiy <[email protected]>
> >>>>>> ---
> >>>>>> arch/powerpc/include/asm/iommu.h | 6 +
> >>>>>> arch/powerpc/kernel/iommu.c | 140 +++++++++++++++++++
> >>>>>> arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++
> >>>>>> drivers/iommu/Kconfig | 8 ++
> >>>>>> drivers/vfio/Kconfig | 6 +
> >>>>>> drivers/vfio/Makefile | 1 +
> >>>>>> drivers/vfio/vfio_iommu_spapr_tce.c | 247
> >>>>> ++++++++++++++++++++++++++++++++++
> >>>>>> include/linux/vfio.h | 20 +++
> >>>>>> 8 files changed, 563 insertions(+)
> >>>>>> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>>>
> >>>>>> diff --git a/arch/powerpc/include/asm/iommu.h
> >>>>>> b/arch/powerpc/include/asm/iommu.h
> >>>>>> index cbfe678..5ba66cb 100644
> >>>>>> --- a/arch/powerpc/include/asm/iommu.h
> >>>>>> +++ b/arch/powerpc/include/asm/iommu.h
> >>>>>> @@ -64,30 +64,33 @@ struct iommu_pool { }
> >>>>>> ____cacheline_aligned_in_smp;
> >>>>>>
> >>>>>> struct iommu_table {
> >>>>>> unsigned long it_busno; /* Bus number this table belongs to */
> >>>>>> unsigned long it_size; /* Size of iommu table in entries */
> >>>>>> unsigned long it_offset; /* Offset into global table */
> >>>>>> unsigned long it_base; /* mapped address of tce table */
> >>>>>> unsigned long it_index; /* which iommu table this is */
> >>>>>> unsigned long it_type; /* type: PCI or Virtual Bus */
> >>>>>> unsigned long it_blocksize; /* Entries in each block (cacheline)
> >>>>> */
> >>>>>> unsigned long poolsize;
> >>>>>> unsigned long nr_pools;
> >>>>>> struct iommu_pool large_pool;
> >>>>>> struct iommu_pool pools[IOMMU_NR_POOLS];
> >>>>>> unsigned long *it_map; /* A simple allocation bitmap for now
> >>>>> */
> >>>>>> +#ifdef CONFIG_IOMMU_API
> >>>>>> + struct iommu_group *it_group;
> >>>>>> +#endif
> >>>>>> };
> >>>>>>
> >>>>>> struct scatterlist;
> >>>>>>
> >>>>>> static inline void set_iommu_table_base(struct device *dev, void
> >>>>>> *base) {
> >>>>>> dev->archdata.dma_data.iommu_table_base = base; }
> >>>>>>
> >>>>>> static inline void *get_iommu_table_base(struct device *dev) {
> >>>>>> return dev->archdata.dma_data.iommu_table_base;
> >>>>>> }
> >>>>>>
> >>>>>> /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
> >>>>>> static inline void pci_iommu_init(void) { } extern void
> >>>>>> alloc_dart_table(void); #if defined(CONFIG_PPC64) &&
> >>>>>> defined(CONFIG_PM) static inline void iommu_save(void) {
> >>>>>> if (ppc_md.iommu_save)
> >>>>>> ppc_md.iommu_save();
> >>>>>> }
> >>>>>>
> >>>>>> static inline void iommu_restore(void) {
> >>>>>> if (ppc_md.iommu_restore)
> >>>>>> ppc_md.iommu_restore();
> >>>>>> }
> >>>>>> #endif
> >>>>>>
> >>>>>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
> >>>>> entry, uint64_t tce,
> >>>>>> + enum dma_data_direction direction, unsigned long pages);
> >>>>>> +
> >>>>>> #endif /* __KERNEL__ */
> >>>>>> #endif /* _ASM_IOMMU_H */
> >>>>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >>>>>> index ff5a6ce..94f614b 100644
> >>>>>> --- a/arch/powerpc/kernel/iommu.c
> >>>>>> +++ b/arch/powerpc/kernel/iommu.c
> >>>>>> @@ -32,30 +32,31 @@
> >>>>>> #include <linux/dma-mapping.h>
> >>>>>> #include <linux/bitmap.h>
> >>>>>> #include <linux/iommu-helper.h>
> >>>>>> #include <linux/crash_dump.h>
> >>>>>> #include <linux/hash.h>
> >>>>>> #include <linux/fault-inject.h>
> >>>>>> #include <linux/pci.h>
> >>>>>> #include <asm/io.h>
> >>>>>> #include <asm/prom.h>
> >>>>>> #include <asm/iommu.h>
> >>>>>> #include <asm/pci-bridge.h>
> >>>>>> #include <asm/machdep.h>
> >>>>>> #include <asm/kdump.h>
> >>>>>> #include <asm/fadump.h>
> >>>>>> #include <asm/vio.h>
> >>>>>> +#include <asm/tce.h>
> >>>>>>
> >>>>>> #define DBG(...)
> >>>>>>
> >>>>>> static int novmerge;
> >>>>>>
> >>>>>> static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
> >>>>>> int);
> >>>>>>
> >>>>>> static int __init setup_iommu(char *str) {
> >>>>>> if (!strcmp(str, "novmerge"))
> >>>>>> novmerge = 1;
> >>>>>> else if (!strcmp(str, "vmerge"))
> >>>>>> novmerge = 0;
> >>>>>> return 1;
> >>>>>> }
> >>>>>> @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
> >>>>>> struct iommu_table *tbl, }
> >>>>>>
> >>>>>> void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> >>>>>> void *vaddr, dma_addr_t dma_handle) {
> >>>>>> if (tbl) {
> >>>>>> unsigned int nio_pages;
> >>>>>>
> >>>>>> size = PAGE_ALIGN(size);
> >>>>>> nio_pages = size >> IOMMU_PAGE_SHIFT;
> >>>>>> iommu_free(tbl, dma_handle, nio_pages);
> >>>>>> size = PAGE_ALIGN(size);
> >>>>>> free_pages((unsigned long)vaddr, get_order(size));
> >>>>>> }
> >>>>>> }
> >>>>>> +
> >>>>>> +#ifdef CONFIG_IOMMU_API
> >>>>>> +/*
> >>>>>> + * SPAPR TCE API
> >>>>>> + */
> >>>>>> +static struct page *free_tce(struct iommu_table *tbl, unsigned long
> >>>>>> +entry) {
> >>>>>> + struct page *page = NULL;
> >>>>>
> >>>>> NULL initialization doesn't appear to be necessary
> >>>>>
> >>>>>> + unsigned long oldtce;
> >>>>>> +
> >>>>>> + oldtce = ppc_md.tce_get(tbl, entry);
> >>>>>> +
> >>>>>> + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> >>>>>> + return NULL;
> >>>>>> +
> >>>>>> + page = pfn_to_page(oldtce >> PAGE_SHIFT);
> >>>>>> +
> >>>>>> + WARN_ON(!page);
> >>>>>> + if (page && (oldtce & TCE_PCI_WRITE))
> >>>>>> + SetPageDirty(page);
> >>>>>> + ppc_md.tce_free(tbl, entry, 1);
> >>>>>> +
> >>>>>> + return page;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> >>>>>> + uint64_t tce, enum dma_data_direction direction) {
> >>>>>> + int ret;
> >>>>>> + struct page *page = NULL;
> >>>>>> + unsigned long kva, offset;
> >>>>>> +
> >>>>>> + /* Map new TCE */
> >>>>>> + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> >>>>>> + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> >>>>>> + direction != DMA_TO_DEVICE, &page);
> >>>>>> + if (ret < 1) {
> >>>>>> + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
> >>>>> tce=%llx ioba=%lx ret=%d\n",
> >>>>>> + tce, entry << IOMMU_PAGE_SHIFT, ret);
> >>>>>> + if (!ret)
> >>>>>> + ret = -EFAULT;
> >>>>>
> >>>>> Missing return ret? Otherwise we've got some bogus uses of page below
> >>>>> and we're setting ret for no reason here.
> >>>>>
> >>>>>> + }
> >>>>>> +
> >>>>>> + kva = (unsigned long) page_address(page);
> >>>>>> + kva += offset;
> >>>>>> +
> >>>>>> + /* tce_build receives a virtual address */
> >>>>>> + entry += tbl->it_offset; /* Offset into real TCE table */
> >>>>>> + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> >>>>>> +
> >>>>>> + /* tce_build() only returns non-zero for transient errors */
> >>>>>> + if (unlikely(ret)) {
> >>>>>> + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
> >>>>> ioba=%lx kva=%lx ret=%d\n",
> >>>>>> + tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> >>>>>> + put_page(page);
> >>>>>> + return -EIO;
> >>>>>> + }
> >>>>>> +
> >>>>>> + return 0;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void tce_flush(struct iommu_table *tbl) {
> >>>>>> + /* Flush/invalidate TLB caches if necessary */
> >>>>>> + if (ppc_md.tce_flush)
> >>>>>> + ppc_md.tce_flush(tbl);
> >>>>>> +
> >>>>>> + /* Make sure updates are seen by hardware */
> >>>>>> + mb();
> >>>>>> +}
> >>>>>> +
> >>>>>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> >>>>> uint64_t tce,
> >>>>>> + enum dma_data_direction direction, unsigned long pages) {
> >>>>>> + int i, ret = 0, pages_to_put = 0;
> >>>>>> + struct page *page;
> >>>>>> + struct iommu_pool *pool = get_pool(tbl, entry);
> >>>>>> + struct page **oldpages;
> >>>>>> + const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> >>>>>> +
> >>>>>> + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> >>>>>> +
> >>>>>> + /* Handle a single page request without allocation
> >>>>>> + of pages-to-release array */
> >>>>>> + if (pages == 1) {
> >>>>>> + spin_lock(&(pool->lock));
> >>>>>> + page = free_tce(tbl, entry);
> >>>>>> +
> >>>>>> + if (direction != DMA_NONE)
> >>>>>> + ret = put_tce(tbl, entry, tce, direction);
> >>>>>> +
> >>>>>> + tce_flush(tbl);
> >>>>>> +
> >>>>>> + if (page)
> >>>>>> + put_page(page);
> >>>>>> +
> >>>>>> + spin_unlock(&(pool->lock));
> >>>>>> + return ret;
> >>>>>> + }
> >>>>>> +
> >>>>>> + /* Releasing multiple pages */
> >>>>>> + /* Allocate an array for pages to be released after TCE table
> >>>>>> + is updated */
> >>>>>> + oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> >>>>>> + if (!oldpages)
> >>>>>> + return -ENOMEM;
> >>>>>> +
> >>>>>> + spin_lock(&(pool->lock));
> >>>>>> +
> >>>>>> + for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
> >>>>> IOMMU_PAGE_SIZE) {
> >>>>>> + page = free_tce(tbl, entry);
> >>>>>> + if (page) {
> >>>>>> + oldpages[pages_to_put] = page;
> >>>>>> + ++pages_to_put;
> >>>>>> + }
> >>>>>> +
> >>>>>> + if (direction != DMA_NONE)
> >>>>>> + ret = put_tce(tbl, entry, tce, direction);
> >>>>>> +
> >>>>>> + /* Release old pages if we reached the end of oldpages[] or
> >>>>>> + it is the last page or we are about to exit the loop */
> >>>>>> + if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
> >>>>> {
> >>>>>> + tce_flush(tbl);
> >>>>>
> >>>>> Avoiding tce_flush() is the reason for all this extra overhead, right?
> >>>>> I wonder if it'd be cleaner separating map vs unmap, where the map case
> >>>>> can avoid the oldpages array... but that means inserting new mappings on
> >>>>> top of old ones wouldn't put the pages.
> >>>
> >>>
> >>> Yes, we do not want to loose pages if the guest forgot to unmap them.
> >>
> >> Hmm, does that mean we're not actively clearing tce entries or somehow
> >> disabling the iommu window when the iommu is released through vfio?
> >
> > Ok, I see tces are put on shutdown via tce_iommu_detach_group, so you're
> > more concerned about the guest simply mapping over top of it's own
> > mappings. Is that common? Is it common enough for every multi-page
> > mapping to assume it will happen? I know this is a performance
> > sensitive path for you and it seems like a map-only w/ fallback to
> > unmap, remap would be better in the general case.
>
>
> I do not get it. Where exactly does the performance suffer?
> iommu_put_tces() with non zero "tce" (i.e. "map") has to check if the entry
> is not used, at least to return EBUSY when it is, and this check is
> performed. If it is zero, there is no overhead at all. And it is going to
> be the 99.(9)% case as the guest (un)maps one page per call.

I was mostly concerned about the kmalloc in your mapping path. If you
had a map-only path it could scan the whole range to verify it's clear
and return EBUSY w/o allocating a buffer. A second pass could do the
actual mappings. Maybe it's not much of a win if you expect 99% of the
mappings to be single pages but since you effectively have a pv iommu
interface I wouldn't be surprised if they get batched in the guest.

> Generally speaking we want to move "put tce" completely to the kernel for
> the (much) better performance and vfio won't be dealing with it all.

Right, but let's not use that as an excuse to be lazy and not ask if we
can do better here.

> We already agreed that SPAPR TCE driver uses x86 (aka type1) API but I do
> not see why the powerpc implementation should look x86 alike as it still
> operates with powerpc machine dependent callbacks so the reader has to have
> some powerpc knowledge.

I'm only using x86 as an example because it's the only one we have. I
don't think anything we're talking about here is x86-ish or powerpc-ish.
There's a kmalloc in a performance path and I'm asking if we can get rid
of it. I'm also nervous that we're silently doing fixups on user
parameters to adjust mapping sizes and clear overlaps without any
warning to the user. Thanks,

Alex

2012-11-27 04:41:21

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH 2/2] vfio powerpc: enabled on powernv platform

On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
>
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
>
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
>
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
>
> Cc: David Gibson <[email protected]>
> Signed-off-by: Alexey Kardashevskiy <[email protected]>
> ---
> arch/powerpc/include/asm/iommu.h | 6 ++
> arch/powerpc/kernel/iommu.c | 141 ++++++++++++++++++++++++++++++++++
> arch/powerpc/platforms/powernv/pci.c | 135 ++++++++++++++++++++++++++++++++
> drivers/iommu/Kconfig | 8 ++
> 4 files changed, 290 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..5ba66cb 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
> struct iommu_pool large_pool;
> struct iommu_pool pools[IOMMU_NR_POOLS];
> unsigned long *it_map; /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> + struct iommu_group *it_group;
> +#endif
> };
>
> struct scatterlist;
> @@ -147,5 +150,8 @@ static inline void iommu_restore(void)
> }
> #endif
>
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
> + enum dma_data_direction direction, unsigned long pages);
> +
> #endif /* __KERNEL__ */
> #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..c8dad1f 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
> #include <asm/kdump.h>
> #include <asm/fadump.h>
> #include <asm/vio.h>
> +#include <asm/tce.h>
>
> #define DBG(...)
>
> @@ -856,3 +857,143 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> free_pages((unsigned long)vaddr, get_order(size));
> }
> }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +static struct page *free_tce(struct iommu_table *tbl, unsigned long entry)
> +{
> + struct page *page;
> + unsigned long oldtce;
> +
> + oldtce = ppc_md.tce_get(tbl, entry);
> +
> + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> + return NULL;
> +
> + page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> + WARN_ON(!page);
> + if (page && (oldtce & TCE_PCI_WRITE))
> + SetPageDirty(page);
> + ppc_md.tce_free(tbl, entry, 1);
> +
> + return page;
> +}
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> + uint64_t tce, enum dma_data_direction direction)
> +{
> + int ret;
> + struct page *page = NULL;
> + unsigned long kva, offset;
> +
> + /* Map new TCE */
> + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> + direction != DMA_TO_DEVICE, &page);

We're locking memory here on behalf of the user, but I don't see where
rlimit gets checked to verify the user has privileges to lock the pages.
I know you're locking a much smaller set of memory than x86 does, but
are we just foregoing that added security?

> + if (ret < 1) {
> + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> + tce, entry << IOMMU_PAGE_SHIFT, ret);
> + if (!ret)
> + ret = -EFAULT;
> + return ret;
> + }
> +
> + kva = (unsigned long) page_address(page);
> + kva += offset;
> +
> + /* tce_build receives a virtual address */
> + entry += tbl->it_offset; /* Offset into real TCE table */
> + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> + /* tce_build() only returns non-zero for transient errors */
> + if (unlikely(ret)) {
> + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> + tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> + put_page(page);
> + return -EIO;
> + }
> +
> + return 0;
> +}
> +
> +static void tce_flush(struct iommu_table *tbl)
> +{
> + /* Flush/invalidate TLB caches if necessary */
> + if (ppc_md.tce_flush)
> + ppc_md.tce_flush(tbl);
> +
> + /* Make sure updates are seen by hardware */
> + mb();
> +}
> +
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
> + enum dma_data_direction direction, unsigned long pages)
> +{
> + int i, ret = 0, pages_to_put = 0;
> + struct page *page;
> + struct iommu_pool *pool = get_pool(tbl, entry);
> + struct page **oldpages;
> + const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> +
> + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +
> + /* Handle a single page request without allocation
> + of pages-to-release array */

nit, this comment style doesn't seem to match anything existing in this
file. I'd also be tempted to use pr_err/warn in this file, but I'll
leave that for the maintainers. Thanks,

Alex

> + if (pages == 1) {
> + spin_lock(&(pool->lock));
> + page = free_tce(tbl, entry);
> +
> + if (direction != DMA_NONE)
> + ret = put_tce(tbl, entry, tce, direction);
> +
> + tce_flush(tbl);
> +
> + if (page)
> + put_page(page);
> +
> + spin_unlock(&(pool->lock));
> + return ret;
> + }
> +
> + /* Releasing multiple pages */
> + /* Allocate an array for pages to be released after TCE table
> + is updated */
> + oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> + if (!oldpages)
> + return -ENOMEM;
> +
> + spin_lock(&(pool->lock));
> +
> + for (i = 0; (i < pages) && !ret; ++i, ++entry, tce += IOMMU_PAGE_SIZE) {
> + page = free_tce(tbl, entry);
> + if (page) {
> + oldpages[pages_to_put] = page;
> + ++pages_to_put;
> + }
> +
> + if (direction != DMA_NONE)
> + ret = put_tce(tbl, entry, tce, direction);
> +
> + /* Release old pages if we reached the end of oldpages[] or
> + it is the last page or we are about to exit the loop */
> + if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret) {
> + tce_flush(tbl);
> +
> + /* Release pages after removing them from TCE table */
> + while (pages_to_put) {
> + --pages_to_put;
> + put_page(oldpages[pages_to_put]);
> + }
> + }
> + }
> +
> + spin_unlock(&(pool->lock));
> + kfree(oldpages);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..660dcc6 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
> #include <linux/irq.h>
> #include <linux/io.h>
> #include <linux/msi.h>
> +#include <linux/iommu.h>
>
> #include <asm/sections.h>
> #include <asm/io.h>
> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
> ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
> #endif
> }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> + struct iommu_table *tbl;
> + int ret = 0;
> +
> + if (WARN_ON(dev->iommu_group)) {
> + printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> + dev_name(dev),
> + iommu_group_id(dev->iommu_group));
> + return -EBUSY;
> + }
> +
> + tbl = get_iommu_table_base(dev);
> + if (!tbl) {
> + pr_debug("tce_vfio: skipping device %s with no tbl\n",
> + dev_name(dev));
> + return 0;
> + }
> +
> + pr_debug("tce_vfio: adding %s to iommu group %d\n",
> + dev_name(dev), iommu_group_id(tbl->it_group));
> +
> + ret = iommu_group_add_device(tbl->it_group, dev);
> + if (ret < 0)
> + printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> + dev_name(dev), ret);
> +
> + return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> + iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> + unsigned long action, void *data)
> +{
> + struct device *dev = data;
> +
> + switch (action) {
> + case BUS_NOTIFY_ADD_DEVICE:
> + return add_device(dev);
> + case BUS_NOTIFY_DEL_DEVICE:
> + del_device(dev);
> + return 0;
> + default:
> + return 0;
> + }
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> + .notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> + struct iommu_table *tbl = iommu_data;
> + tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> + struct pci_dev *pdev = NULL;
> + struct iommu_table *tbl;
> + struct iommu_group *grp;
> +
> + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> + /* Allocate and initialize IOMMU groups */
> + for_each_pci_dev(pdev) {
> + tbl = get_iommu_table_base(&pdev->dev);
> + if (!tbl)
> + continue;
> +
> + /* Skip already initialized */
> + if (tbl->it_group)
> + continue;
> +
> + grp = iommu_group_alloc();
> + if (IS_ERR(grp)) {
> + printk(KERN_INFO "tce_vfio: cannot create "
> + "new IOMMU group, ret=%ld\n",
> + PTR_ERR(grp));
> + return PTR_ERR(grp);
> + }
> + tbl->it_group = grp;
> + iommu_group_set_iommudata(grp, tbl, group_release);
> + }
> +
> + /* Add PCI devices to VFIO groups */
> + for_each_pci_dev(pdev)
> + add_device(&pdev->dev);
> +
> + return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> + struct pci_dev *pdev = NULL;
> + struct iommu_table *tbl;
> + struct iommu_group *grp = NULL;
> +
> + bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> + /* Delete PCI devices from VFIO groups */
> + for_each_pci_dev(pdev)
> + del_device(&pdev->dev);
> +
> + /* Release VFIO groups */
> + for_each_pci_dev(pdev) {
> + tbl = get_iommu_table_base(&pdev->dev);
> + if (!tbl)
> + continue;
> + grp = tbl->it_group;
> +
> + /* Skip (already) uninitialized */
> + if (!grp)
> + continue;
> +
> + /* Do actual release, group_release() is expected to work */
> + iommu_group_put(grp);
> + BUG_ON(tbl->it_group);
> + }
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>
> Say N unless you need kernel log message for IOMMU debugging
>
> +config SPAPR_TCE_IOMMU
> + bool "sPAPR TCE IOMMU Support"
> + depends on PPC_POWERNV
> + select IOMMU_API
> + help
> + Enables bits of IOMMU API required by VFIO. The iommu_ops is
> + still not implemented.
> +
> endif # IOMMU_SUPPORT


2012-11-27 04:58:26

by Alexey Kardashevskiy

[permalink] [raw]
Subject: Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO

On 27/11/12 15:29, Alex Williamson wrote:
> On Tue, 2012-11-27 at 15:06 +1100, Alexey Kardashevskiy wrote:
>> On 27/11/12 05:20, Alex Williamson wrote:
>>> On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
>>>> VFIO implements platform independent stuff such as
>>>> a PCI driver, BAR access (via read/write on a file descriptor
>>>> or direct mapping when possible) and IRQ signaling.
>>>>
>>>> The platform dependent part includes IOMMU initialization
>>>> and handling. This patch implements an IOMMU driver for VFIO
>>>> which does mapping/unmapping pages for the guest IO and
>>>> provides information about DMA window (required by a POWERPC
>>>> guest).
>>>>
>>>> The counterpart in QEMU is required to support this functionality.
>>>>
>>>> Cc: David Gibson <[email protected]>
>>>> Signed-off-by: Alexey Kardashevskiy <[email protected]>
>>>> ---
>>>> drivers/vfio/Kconfig | 6 +
>>>> drivers/vfio/Makefile | 1 +
>>>> drivers/vfio/vfio_iommu_spapr_tce.c | 247 +++++++++++++++++++++++++++++++++++
>>>> include/linux/vfio.h | 20 +++
>>>> 4 files changed, 274 insertions(+)
>>>> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>>>
>>>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
>>>> index 7cd5dec..b464687 100644
>>>> --- a/drivers/vfio/Kconfig
>>>> +++ b/drivers/vfio/Kconfig
>>>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>>>> depends on VFIO
>>>> default n
>>>>
>>>> +config VFIO_IOMMU_SPAPR_TCE
>>>> + tristate
>>>> + depends on VFIO && SPAPR_TCE_IOMMU
>>>> + default n
>>>> +
>>>> menuconfig VFIO
>>>> tristate "VFIO Non-Privileged userspace driver framework"
>>>> depends on IOMMU_API
>>>> select VFIO_IOMMU_TYPE1 if X86
>>>> + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>>>> help
>>>> VFIO provides a framework for secure userspace device drivers.
>>>> See Documentation/vfio.txt for more details.
>>>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
>>>> index 2398d4a..72bfabc 100644
>>>> --- a/drivers/vfio/Makefile
>>>> +++ b/drivers/vfio/Makefile
>>>> @@ -1,3 +1,4 @@
>>>> obj-$(CONFIG_VFIO) += vfio.o
>>>> obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
>>>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>>>> obj-$(CONFIG_VFIO_PCI) += pci/
>>>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>>>> new file mode 100644
>>>> index 0000000..46a6298
>>>> --- /dev/null
>>>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>>>> @@ -0,0 +1,247 @@
>>>> +/*
>>>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
>>>> + *
>>>> + * Copyright (C) 2012 IBM Corp. All rights reserved.
>>>> + * Author: Alexey Kardashevskiy <[email protected]>
>>>> + *
>>>> + * This program is free software; you can redistribute it and/or modify
>>>> + * it under the terms of the GNU General Public License version 2 as
>>>> + * published by the Free Software Foundation.
>>>> + *
>>>> + * Derived from original vfio_iommu_type1.c:
>>>> + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
>>>> + * Author: Alex Williamson <[email protected]>
>>>> + */
>>>> +
>>>> +#include <linux/module.h>
>>>> +#include <linux/pci.h>
>>>> +#include <linux/slab.h>
>>>> +#include <linux/uaccess.h>
>>>> +#include <linux/err.h>
>>>> +#include <linux/vfio.h>
>>>> +#include <asm/iommu.h>
>>>> +
>>>> +#define DRIVER_VERSION "0.1"
>>>> +#define DRIVER_AUTHOR "[email protected]"
>>>> +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
>>>> +
>>>> +static void tce_iommu_detach_group(void *iommu_data,
>>>> + struct iommu_group *iommu_group);
>>>> +
>>>> +/*
>>>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
>>>> + */
>>>> +
>>>> +/*
>>>> + * The container descriptor supports only a single group per container.
>>>> + * Required by the API as the container is not supplied with the IOMMU group
>>>> + * at the moment of initialization.
>>>> + */
>>>> +struct tce_container {
>>>> + struct mutex lock;
>>>> + struct iommu_table *tbl;
>>>> +};
>>>> +
>>>> +static void *tce_iommu_open(unsigned long arg)
>>>> +{
>>>> + struct tce_container *container;
>>>> +
>>>> + if (arg != VFIO_SPAPR_TCE_IOMMU) {
>>>> + printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
>>>> + return ERR_PTR(-EINVAL);
>>>> + }
>>>> +
>>>> + container = kzalloc(sizeof(*container), GFP_KERNEL);
>>>> + if (!container)
>>>> + return ERR_PTR(-ENOMEM);
>>>> +
>>>> + mutex_init(&container->lock);
>>>> +
>>>> + return container;
>>>> +}
>>>> +
>>>> +static void tce_iommu_release(void *iommu_data)
>>>> +{
>>>> + struct tce_container *container = iommu_data;
>>>> +
>>>> + WARN_ON(container->tbl && !container->tbl->it_group);
>>>
>>> I think your patch ordering is backwards here. it_group isn't added
>>> until 2/2. I'd really like to see the arch/powerpc code approved and
>>> merged by the powerpc maintainer before we add the code that makes use
>>> of it into vfio. Otherwise we just get lots of churn if interfaces
>>> change or they disapprove of it altogether.
>>
>>
>> Makes sense, thanks.
>>
>>
>>>> + if (container->tbl && container->tbl->it_group)
>>>> + tce_iommu_detach_group(iommu_data, container->tbl->it_group);
>>>> +
>>>> + mutex_destroy(&container->lock);
>>>> +
>>>> + kfree(container);
>>>> +}
>>>> +
>>>> +static long tce_iommu_ioctl(void *iommu_data,
>>>> + unsigned int cmd, unsigned long arg)
>>>> +{
>>>> + struct tce_container *container = iommu_data;
>>>> + unsigned long minsz;
>>>> +
>>>> + switch (cmd) {
>>>> + case VFIO_CHECK_EXTENSION: {
>>>> + return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
>>>> + }
>>>> + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>>>> + struct vfio_iommu_spapr_tce_info info;
>>>> + struct iommu_table *tbl = container->tbl;
>>>> +
>>>> + if (WARN_ON(!tbl))
>>>> + return -ENXIO;
>>>> +
>>>> + minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
>>>> + dma64_window_size);
>>>> +
>>>> + if (copy_from_user(&info, (void __user *)arg, minsz))
>>>> + return -EFAULT;
>>>> +
>>>> + if (info.argsz < minsz)
>>>> + return -EINVAL;
>>>> +
>>>> + info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
>>>> + info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
>>>> + info.dma64_window_start = 0;
>>>> + info.dma64_window_size = 0;
>>>> + info.flags = 0;
>>>> +
>>>> + if (copy_to_user((void __user *)arg, &info, minsz))
>>>> + return -EFAULT;
>>>> +
>>>> + return 0;
>>>> + }
>>>> + case VFIO_IOMMU_MAP_DMA: {
>>>> + vfio_iommu_spapr_tce_dma_map param;
>>>> + struct iommu_table *tbl = container->tbl;
>>>> + enum dma_data_direction direction = DMA_NONE;
>>>> +
>>>> + if (WARN_ON(!tbl))
>>>> + return -ENXIO;
>>>> +
>>>> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
>>>> +
>>>> + if (copy_from_user(&param, (void __user *)arg, minsz))
>>>> + return -EFAULT;
>>>> +
>>>> + if (param.argsz < minsz)
>>>> + return -EINVAL;
>>>> +
>>>> + if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
>>>> + (param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
>>>> + direction = DMA_BIDIRECTIONAL;
>>>> + } else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
>>>> + direction = DMA_TO_DEVICE;
>>>> + } else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
>>>> + direction = DMA_FROM_DEVICE;
>>>> + }
>>>> +
>>>> + param.size += param.iova & ~IOMMU_PAGE_MASK;
>>>> + param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
>>>
>>> On x86 we force iova, vaddr, and size to all be aligned to the smallest
>>> page granularity of the iommu and return -EINVAL if it doesn't fit.
>>> What does it imply to the user if they're always aligned to work here?
>>> Won't this interface happily map overlapping entries with no indication
>>> to the user that the previous mapping is no longer valid?
>>> Maybe another reason why a combined unmap/map makes me nervous, we have
>>> to assume the user knows what they're doing.
>>
>>
>> I got used to guests which do know what they are doing so I am pretty calm :)
>> but ok, I'll move alignment to the QEMU, it makes sense.
>>
>>
>>>> +
>>>> + return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>>>> + param.vaddr & IOMMU_PAGE_MASK, direction,
>>>> + param.size >> IOMMU_PAGE_SHIFT);
>>>> + }
>>>> + case VFIO_IOMMU_UNMAP_DMA: {
>>>> + vfio_iommu_spapr_tce_dma_unmap param;
>>>> + struct iommu_table *tbl = container->tbl;
>>>> +
>>>> + if (WARN_ON(!tbl))
>>>> + return -ENXIO;
>>>> +
>>>> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
>>>> +
>>>> + if (copy_from_user(&param, (void __user *)arg, minsz))
>>>> + return -EFAULT;
>>>> +
>>>> + if (param.argsz < minsz)
>>>> + return -EINVAL;
>>>> +
>>>> + param.size += param.iova & ~IOMMU_PAGE_MASK;
>>>> + param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
>>>> +
>>>> + return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>>>> + 0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
>>>> + }
>>>> + default:
>>>> + printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
>>>
>>> pr_warn
>>>
>>>> + }
>>>> +
>>>> + return -ENOTTY;
>>>> +}
>>>> +
>>>> +static int tce_iommu_attach_group(void *iommu_data,
>>>> + struct iommu_group *iommu_group)
>>>> +{
>>>> + struct tce_container *container = iommu_data;
>>>> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>>>> +
>>>> + BUG_ON(!tbl);
>>>> + mutex_lock(&container->lock);
>>>> + pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
>>>> + iommu_group_id(iommu_group), iommu_group);
>>>> + if (container->tbl) {
>>>> + printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
>>>
>>> pr_warn
>>>
>>>> + iommu_group_id(container->tbl->it_group),
>>>> + iommu_group_id(iommu_group));
>>>> + mutex_unlock(&container->lock);
>>>> + return -EBUSY;
>>>> + }
>>>> +
>>>> + container->tbl = tbl;
>>>
>>> Would it be too much paranoia to clear all the tce here as you do below
>>> on detach?
>>
>> Guess so. I do unmap on detach() and the guest calls put_tce(0) (i.e.
>> unmaps) the whole DMA window at the boot time.
>
> But that's just one user of this interface, we can't assume they'll all
> be so agreeable. If any tces were enabled here, a malicious user would
> have a window to host memory, right? Thanks,


But I still release pages on detach(), how can this code be not called on
the guest exit (normal or crashed)?



>
> Alex
>


--
Alexey

2012-11-27 05:02:32

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO

On Tue, 2012-11-27 at 15:06 +1100, Alexey Kardashevskiy wrote:
> On 27/11/12 05:20, Alex Williamson wrote:
> > On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
> >> VFIO implements platform independent stuff such as
> >> a PCI driver, BAR access (via read/write on a file descriptor
> >> or direct mapping when possible) and IRQ signaling.
> >>
> >> The platform dependent part includes IOMMU initialization
> >> and handling. This patch implements an IOMMU driver for VFIO
> >> which does mapping/unmapping pages for the guest IO and
> >> provides information about DMA window (required by a POWERPC
> >> guest).
> >>
> >> The counterpart in QEMU is required to support this functionality.
> >>
> >> Cc: David Gibson <[email protected]>
> >> Signed-off-by: Alexey Kardashevskiy <[email protected]>
> >> ---
> >> drivers/vfio/Kconfig | 6 +
> >> drivers/vfio/Makefile | 1 +
> >> drivers/vfio/vfio_iommu_spapr_tce.c | 247 +++++++++++++++++++++++++++++++++++
> >> include/linux/vfio.h | 20 +++
> >> 4 files changed, 274 insertions(+)
> >> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>
> >> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> >> index 7cd5dec..b464687 100644
> >> --- a/drivers/vfio/Kconfig
> >> +++ b/drivers/vfio/Kconfig
> >> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
> >> depends on VFIO
> >> default n
> >>
> >> +config VFIO_IOMMU_SPAPR_TCE
> >> + tristate
> >> + depends on VFIO && SPAPR_TCE_IOMMU
> >> + default n
> >> +
> >> menuconfig VFIO
> >> tristate "VFIO Non-Privileged userspace driver framework"
> >> depends on IOMMU_API
> >> select VFIO_IOMMU_TYPE1 if X86
> >> + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> >> help
> >> VFIO provides a framework for secure userspace device drivers.
> >> See Documentation/vfio.txt for more details.
> >> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> >> index 2398d4a..72bfabc 100644
> >> --- a/drivers/vfio/Makefile
> >> +++ b/drivers/vfio/Makefile
> >> @@ -1,3 +1,4 @@
> >> obj-$(CONFIG_VFIO) += vfio.o
> >> obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> >> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> >> obj-$(CONFIG_VFIO_PCI) += pci/
> >> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> >> new file mode 100644
> >> index 0000000..46a6298
> >> --- /dev/null
> >> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >> @@ -0,0 +1,247 @@
> >> +/*
> >> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> >> + *
> >> + * Copyright (C) 2012 IBM Corp. All rights reserved.
> >> + * Author: Alexey Kardashevskiy <[email protected]>
> >> + *
> >> + * This program is free software; you can redistribute it and/or modify
> >> + * it under the terms of the GNU General Public License version 2 as
> >> + * published by the Free Software Foundation.
> >> + *
> >> + * Derived from original vfio_iommu_type1.c:
> >> + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
> >> + * Author: Alex Williamson <[email protected]>
> >> + */
> >> +
> >> +#include <linux/module.h>
> >> +#include <linux/pci.h>
> >> +#include <linux/slab.h>
> >> +#include <linux/uaccess.h>
> >> +#include <linux/err.h>
> >> +#include <linux/vfio.h>
> >> +#include <asm/iommu.h>
> >> +
> >> +#define DRIVER_VERSION "0.1"
> >> +#define DRIVER_AUTHOR "[email protected]"
> >> +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
> >> +
> >> +static void tce_iommu_detach_group(void *iommu_data,
> >> + struct iommu_group *iommu_group);
> >> +
> >> +/*
> >> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> >> + */
> >> +
> >> +/*
> >> + * The container descriptor supports only a single group per container.
> >> + * Required by the API as the container is not supplied with the IOMMU group
> >> + * at the moment of initialization.
> >> + */
> >> +struct tce_container {
> >> + struct mutex lock;
> >> + struct iommu_table *tbl;
> >> +};
> >> +
> >> +static void *tce_iommu_open(unsigned long arg)
> >> +{
> >> + struct tce_container *container;
> >> +
> >> + if (arg != VFIO_SPAPR_TCE_IOMMU) {
> >> + printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> >> + return ERR_PTR(-EINVAL);
> >> + }
> >> +
> >> + container = kzalloc(sizeof(*container), GFP_KERNEL);
> >> + if (!container)
> >> + return ERR_PTR(-ENOMEM);
> >> +
> >> + mutex_init(&container->lock);
> >> +
> >> + return container;
> >> +}
> >> +
> >> +static void tce_iommu_release(void *iommu_data)
> >> +{
> >> + struct tce_container *container = iommu_data;
> >> +
> >> + WARN_ON(container->tbl && !container->tbl->it_group);
> >
> > I think your patch ordering is backwards here. it_group isn't added
> > until 2/2. I'd really like to see the arch/powerpc code approved and
> > merged by the powerpc maintainer before we add the code that makes use
> > of it into vfio. Otherwise we just get lots of churn if interfaces
> > change or they disapprove of it altogether.
>
>
> Makes sense, thanks.
>
>
> >> + if (container->tbl && container->tbl->it_group)
> >> + tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> >> +
> >> + mutex_destroy(&container->lock);
> >> +
> >> + kfree(container);
> >> +}
> >> +
> >> +static long tce_iommu_ioctl(void *iommu_data,
> >> + unsigned int cmd, unsigned long arg)
> >> +{
> >> + struct tce_container *container = iommu_data;
> >> + unsigned long minsz;
> >> +
> >> + switch (cmd) {
> >> + case VFIO_CHECK_EXTENSION: {
> >> + return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> >> + }
> >> + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> >> + struct vfio_iommu_spapr_tce_info info;
> >> + struct iommu_table *tbl = container->tbl;
> >> +
> >> + if (WARN_ON(!tbl))
> >> + return -ENXIO;
> >> +
> >> + minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> >> + dma64_window_size);
> >> +
> >> + if (copy_from_user(&info, (void __user *)arg, minsz))
> >> + return -EFAULT;
> >> +
> >> + if (info.argsz < minsz)
> >> + return -EINVAL;
> >> +
> >> + info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> >> + info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> >> + info.dma64_window_start = 0;
> >> + info.dma64_window_size = 0;
> >> + info.flags = 0;
> >> +
> >> + if (copy_to_user((void __user *)arg, &info, minsz))
> >> + return -EFAULT;
> >> +
> >> + return 0;
> >> + }
> >> + case VFIO_IOMMU_MAP_DMA: {
> >> + vfio_iommu_spapr_tce_dma_map param;
> >> + struct iommu_table *tbl = container->tbl;
> >> + enum dma_data_direction direction = DMA_NONE;
> >> +
> >> + if (WARN_ON(!tbl))
> >> + return -ENXIO;
> >> +
> >> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> >> +
> >> + if (copy_from_user(&param, (void __user *)arg, minsz))
> >> + return -EFAULT;
> >> +
> >> + if (param.argsz < minsz)
> >> + return -EINVAL;
> >> +
> >> + if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> >> + (param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
> >> + direction = DMA_BIDIRECTIONAL;
> >> + } else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
> >> + direction = DMA_TO_DEVICE;
> >> + } else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
> >> + direction = DMA_FROM_DEVICE;
> >> + }
> >> +
> >> + param.size += param.iova & ~IOMMU_PAGE_MASK;
> >> + param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> >
> > On x86 we force iova, vaddr, and size to all be aligned to the smallest
> > page granularity of the iommu and return -EINVAL if it doesn't fit.
> > What does it imply to the user if they're always aligned to work here?
> > Won't this interface happily map overlapping entries with no indication
> > to the user that the previous mapping is no longer valid?
> > Maybe another reason why a combined unmap/map makes me nervous, we have
> > to assume the user knows what they're doing.
>
>
> I got used to guests which do know what they are doing so I am pretty calm :)
> but ok, I'll move alignment to the QEMU, it makes sense.
>
>
> >> +
> >> + return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> >> + param.vaddr & IOMMU_PAGE_MASK, direction,
> >> + param.size >> IOMMU_PAGE_SHIFT);
> >> + }
> >> + case VFIO_IOMMU_UNMAP_DMA: {
> >> + vfio_iommu_spapr_tce_dma_unmap param;
> >> + struct iommu_table *tbl = container->tbl;
> >> +
> >> + if (WARN_ON(!tbl))
> >> + return -ENXIO;
> >> +
> >> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> >> +
> >> + if (copy_from_user(&param, (void __user *)arg, minsz))
> >> + return -EFAULT;
> >> +
> >> + if (param.argsz < minsz)
> >> + return -EINVAL;
> >> +
> >> + param.size += param.iova & ~IOMMU_PAGE_MASK;
> >> + param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> >> +
> >> + return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> >> + 0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
> >> + }
> >> + default:
> >> + printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
> >
> > pr_warn
> >
> >> + }
> >> +
> >> + return -ENOTTY;
> >> +}
> >> +
> >> +static int tce_iommu_attach_group(void *iommu_data,
> >> + struct iommu_group *iommu_group)
> >> +{
> >> + struct tce_container *container = iommu_data;
> >> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> >> +
> >> + BUG_ON(!tbl);
> >> + mutex_lock(&container->lock);
> >> + pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> >> + iommu_group_id(iommu_group), iommu_group);
> >> + if (container->tbl) {
> >> + printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> >
> > pr_warn
> >
> >> + iommu_group_id(container->tbl->it_group),
> >> + iommu_group_id(iommu_group));
> >> + mutex_unlock(&container->lock);
> >> + return -EBUSY;
> >> + }
> >> +
> >> + container->tbl = tbl;
> >
> > Would it be too much paranoia to clear all the tce here as you do below
> > on detach?
>
> Guess so. I do unmap on detach() and the guest calls put_tce(0) (i.e.
> unmaps) the whole DMA window at the boot time.

But that's just one user of this interface, we can't assume they'll all
be so agreeable. If any tces were enabled here, a malicious user would
have a window to host memory, right? Thanks,

Alex

2012-11-27 05:08:08

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO

On Tue, 2012-11-27 at 15:58 +1100, Alexey Kardashevskiy wrote:
> On 27/11/12 15:29, Alex Williamson wrote:
> > On Tue, 2012-11-27 at 15:06 +1100, Alexey Kardashevskiy wrote:
> >> On 27/11/12 05:20, Alex Williamson wrote:
> >>> On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
> >>>> VFIO implements platform independent stuff such as
> >>>> a PCI driver, BAR access (via read/write on a file descriptor
> >>>> or direct mapping when possible) and IRQ signaling.
> >>>>
> >>>> The platform dependent part includes IOMMU initialization
> >>>> and handling. This patch implements an IOMMU driver for VFIO
> >>>> which does mapping/unmapping pages for the guest IO and
> >>>> provides information about DMA window (required by a POWERPC
> >>>> guest).
> >>>>
> >>>> The counterpart in QEMU is required to support this functionality.
> >>>>
> >>>> Cc: David Gibson <[email protected]>
> >>>> Signed-off-by: Alexey Kardashevskiy <[email protected]>
> >>>> ---
> >>>> drivers/vfio/Kconfig | 6 +
> >>>> drivers/vfio/Makefile | 1 +
> >>>> drivers/vfio/vfio_iommu_spapr_tce.c | 247 +++++++++++++++++++++++++++++++++++
> >>>> include/linux/vfio.h | 20 +++
> >>>> 4 files changed, 274 insertions(+)
> >>>> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>
> >>>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> >>>> index 7cd5dec..b464687 100644
> >>>> --- a/drivers/vfio/Kconfig
> >>>> +++ b/drivers/vfio/Kconfig
> >>>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
> >>>> depends on VFIO
> >>>> default n
> >>>>
> >>>> +config VFIO_IOMMU_SPAPR_TCE
> >>>> + tristate
> >>>> + depends on VFIO && SPAPR_TCE_IOMMU
> >>>> + default n
> >>>> +
> >>>> menuconfig VFIO
> >>>> tristate "VFIO Non-Privileged userspace driver framework"
> >>>> depends on IOMMU_API
> >>>> select VFIO_IOMMU_TYPE1 if X86
> >>>> + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> >>>> help
> >>>> VFIO provides a framework for secure userspace device drivers.
> >>>> See Documentation/vfio.txt for more details.
> >>>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> >>>> index 2398d4a..72bfabc 100644
> >>>> --- a/drivers/vfio/Makefile
> >>>> +++ b/drivers/vfio/Makefile
> >>>> @@ -1,3 +1,4 @@
> >>>> obj-$(CONFIG_VFIO) += vfio.o
> >>>> obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> >>>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> >>>> obj-$(CONFIG_VFIO_PCI) += pci/
> >>>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>> new file mode 100644
> >>>> index 0000000..46a6298
> >>>> --- /dev/null
> >>>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>> @@ -0,0 +1,247 @@
> >>>> +/*
> >>>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> >>>> + *
> >>>> + * Copyright (C) 2012 IBM Corp. All rights reserved.
> >>>> + * Author: Alexey Kardashevskiy <[email protected]>
> >>>> + *
> >>>> + * This program is free software; you can redistribute it and/or modify
> >>>> + * it under the terms of the GNU General Public License version 2 as
> >>>> + * published by the Free Software Foundation.
> >>>> + *
> >>>> + * Derived from original vfio_iommu_type1.c:
> >>>> + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
> >>>> + * Author: Alex Williamson <[email protected]>
> >>>> + */
> >>>> +
> >>>> +#include <linux/module.h>
> >>>> +#include <linux/pci.h>
> >>>> +#include <linux/slab.h>
> >>>> +#include <linux/uaccess.h>
> >>>> +#include <linux/err.h>
> >>>> +#include <linux/vfio.h>
> >>>> +#include <asm/iommu.h>
> >>>> +
> >>>> +#define DRIVER_VERSION "0.1"
> >>>> +#define DRIVER_AUTHOR "[email protected]"
> >>>> +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
> >>>> +
> >>>> +static void tce_iommu_detach_group(void *iommu_data,
> >>>> + struct iommu_group *iommu_group);
> >>>> +
> >>>> +/*
> >>>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> >>>> + */
> >>>> +
> >>>> +/*
> >>>> + * The container descriptor supports only a single group per container.
> >>>> + * Required by the API as the container is not supplied with the IOMMU group
> >>>> + * at the moment of initialization.
> >>>> + */
> >>>> +struct tce_container {
> >>>> + struct mutex lock;
> >>>> + struct iommu_table *tbl;
> >>>> +};
> >>>> +
> >>>> +static void *tce_iommu_open(unsigned long arg)
> >>>> +{
> >>>> + struct tce_container *container;
> >>>> +
> >>>> + if (arg != VFIO_SPAPR_TCE_IOMMU) {
> >>>> + printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> >>>> + return ERR_PTR(-EINVAL);
> >>>> + }
> >>>> +
> >>>> + container = kzalloc(sizeof(*container), GFP_KERNEL);
> >>>> + if (!container)
> >>>> + return ERR_PTR(-ENOMEM);
> >>>> +
> >>>> + mutex_init(&container->lock);
> >>>> +
> >>>> + return container;
> >>>> +}
> >>>> +
> >>>> +static void tce_iommu_release(void *iommu_data)
> >>>> +{
> >>>> + struct tce_container *container = iommu_data;
> >>>> +
> >>>> + WARN_ON(container->tbl && !container->tbl->it_group);
> >>>
> >>> I think your patch ordering is backwards here. it_group isn't added
> >>> until 2/2. I'd really like to see the arch/powerpc code approved and
> >>> merged by the powerpc maintainer before we add the code that makes use
> >>> of it into vfio. Otherwise we just get lots of churn if interfaces
> >>> change or they disapprove of it altogether.
> >>
> >>
> >> Makes sense, thanks.
> >>
> >>
> >>>> + if (container->tbl && container->tbl->it_group)
> >>>> + tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> >>>> +
> >>>> + mutex_destroy(&container->lock);
> >>>> +
> >>>> + kfree(container);
> >>>> +}
> >>>> +
> >>>> +static long tce_iommu_ioctl(void *iommu_data,
> >>>> + unsigned int cmd, unsigned long arg)
> >>>> +{
> >>>> + struct tce_container *container = iommu_data;
> >>>> + unsigned long minsz;
> >>>> +
> >>>> + switch (cmd) {
> >>>> + case VFIO_CHECK_EXTENSION: {
> >>>> + return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> >>>> + }
> >>>> + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> >>>> + struct vfio_iommu_spapr_tce_info info;
> >>>> + struct iommu_table *tbl = container->tbl;
> >>>> +
> >>>> + if (WARN_ON(!tbl))
> >>>> + return -ENXIO;
> >>>> +
> >>>> + minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> >>>> + dma64_window_size);
> >>>> +
> >>>> + if (copy_from_user(&info, (void __user *)arg, minsz))
> >>>> + return -EFAULT;
> >>>> +
> >>>> + if (info.argsz < minsz)
> >>>> + return -EINVAL;
> >>>> +
> >>>> + info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> >>>> + info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> >>>> + info.dma64_window_start = 0;
> >>>> + info.dma64_window_size = 0;
> >>>> + info.flags = 0;
> >>>> +
> >>>> + if (copy_to_user((void __user *)arg, &info, minsz))
> >>>> + return -EFAULT;
> >>>> +
> >>>> + return 0;
> >>>> + }
> >>>> + case VFIO_IOMMU_MAP_DMA: {
> >>>> + vfio_iommu_spapr_tce_dma_map param;
> >>>> + struct iommu_table *tbl = container->tbl;
> >>>> + enum dma_data_direction direction = DMA_NONE;
> >>>> +
> >>>> + if (WARN_ON(!tbl))
> >>>> + return -ENXIO;
> >>>> +
> >>>> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> >>>> +
> >>>> + if (copy_from_user(&param, (void __user *)arg, minsz))
> >>>> + return -EFAULT;
> >>>> +
> >>>> + if (param.argsz < minsz)
> >>>> + return -EINVAL;
> >>>> +
> >>>> + if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> >>>> + (param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
> >>>> + direction = DMA_BIDIRECTIONAL;
> >>>> + } else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
> >>>> + direction = DMA_TO_DEVICE;
> >>>> + } else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
> >>>> + direction = DMA_FROM_DEVICE;
> >>>> + }
> >>>> +
> >>>> + param.size += param.iova & ~IOMMU_PAGE_MASK;
> >>>> + param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> >>>
> >>> On x86 we force iova, vaddr, and size to all be aligned to the smallest
> >>> page granularity of the iommu and return -EINVAL if it doesn't fit.
> >>> What does it imply to the user if they're always aligned to work here?
> >>> Won't this interface happily map overlapping entries with no indication
> >>> to the user that the previous mapping is no longer valid?
> >>> Maybe another reason why a combined unmap/map makes me nervous, we have
> >>> to assume the user knows what they're doing.
> >>
> >>
> >> I got used to guests which do know what they are doing so I am pretty calm :)
> >> but ok, I'll move alignment to the QEMU, it makes sense.
> >>
> >>
> >>>> +
> >>>> + return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> >>>> + param.vaddr & IOMMU_PAGE_MASK, direction,
> >>>> + param.size >> IOMMU_PAGE_SHIFT);
> >>>> + }
> >>>> + case VFIO_IOMMU_UNMAP_DMA: {
> >>>> + vfio_iommu_spapr_tce_dma_unmap param;
> >>>> + struct iommu_table *tbl = container->tbl;
> >>>> +
> >>>> + if (WARN_ON(!tbl))
> >>>> + return -ENXIO;
> >>>> +
> >>>> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> >>>> +
> >>>> + if (copy_from_user(&param, (void __user *)arg, minsz))
> >>>> + return -EFAULT;
> >>>> +
> >>>> + if (param.argsz < minsz)
> >>>> + return -EINVAL;
> >>>> +
> >>>> + param.size += param.iova & ~IOMMU_PAGE_MASK;
> >>>> + param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> >>>> +
> >>>> + return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> >>>> + 0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
> >>>> + }
> >>>> + default:
> >>>> + printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
> >>>
> >>> pr_warn
> >>>
> >>>> + }
> >>>> +
> >>>> + return -ENOTTY;
> >>>> +}
> >>>> +
> >>>> +static int tce_iommu_attach_group(void *iommu_data,
> >>>> + struct iommu_group *iommu_group)
> >>>> +{
> >>>> + struct tce_container *container = iommu_data;
> >>>> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> >>>> +
> >>>> + BUG_ON(!tbl);
> >>>> + mutex_lock(&container->lock);
> >>>> + pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> >>>> + iommu_group_id(iommu_group), iommu_group);
> >>>> + if (container->tbl) {
> >>>> + printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> >>>
> >>> pr_warn
> >>>
> >>>> + iommu_group_id(container->tbl->it_group),
> >>>> + iommu_group_id(iommu_group));
> >>>> + mutex_unlock(&container->lock);
> >>>> + return -EBUSY;
> >>>> + }
> >>>> +
> >>>> + container->tbl = tbl;
> >>>
> >>> Would it be too much paranoia to clear all the tce here as you do below
> >>> on detach?
> >>
> >> Guess so. I do unmap on detach() and the guest calls put_tce(0) (i.e.
> >> unmaps) the whole DMA window at the boot time.
> >
> > But that's just one user of this interface, we can't assume they'll all
> > be so agreeable. If any tces were enabled here, a malicious user would
> > have a window to host memory, right? Thanks,
>
>
> But I still release pages on detach(), how can this code be not called on
> the guest exit (normal or crashed)?

What's the initial state? You leave it clean, but who came before you?
Thanks,

Alex

2012-11-27 05:12:25

by David Gibson

[permalink] [raw]
Subject: Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO

On Tue, Nov 27, 2012 at 03:58:14PM +1100, Alexey Kardashevskiy wrote:
> On 27/11/12 15:29, Alex Williamson wrote:
> >On Tue, 2012-11-27 at 15:06 +1100, Alexey Kardashevskiy wrote:
> >>On 27/11/12 05:20, Alex Williamson wrote:
> >>>On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
> >>>>VFIO implements platform independent stuff such as
> >>>>a PCI driver, BAR access (via read/write on a file descriptor
> >>>>or direct mapping when possible) and IRQ signaling.
> >>>>
> >>>>The platform dependent part includes IOMMU initialization
> >>>>and handling. This patch implements an IOMMU driver for VFIO
> >>>>which does mapping/unmapping pages for the guest IO and
> >>>>provides information about DMA window (required by a POWERPC
> >>>>guest).
> >>>>
> >>>>The counterpart in QEMU is required to support this functionality.
> >>>>
> >>>>Cc: David Gibson <[email protected]>
> >>>>Signed-off-by: Alexey Kardashevskiy <[email protected]>
> >>>>---
> >>>> drivers/vfio/Kconfig | 6 +
> >>>> drivers/vfio/Makefile | 1 +
> >>>> drivers/vfio/vfio_iommu_spapr_tce.c | 247 +++++++++++++++++++++++++++++++++++
> >>>> include/linux/vfio.h | 20 +++
> >>>> 4 files changed, 274 insertions(+)
> >>>> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>
> >>>>diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> >>>>index 7cd5dec..b464687 100644
> >>>>--- a/drivers/vfio/Kconfig
> >>>>+++ b/drivers/vfio/Kconfig
> >>>>@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
> >>>> depends on VFIO
> >>>> default n
> >>>>
> >>>>+config VFIO_IOMMU_SPAPR_TCE
> >>>>+ tristate
> >>>>+ depends on VFIO && SPAPR_TCE_IOMMU
> >>>>+ default n
> >>>>+
> >>>> menuconfig VFIO
> >>>> tristate "VFIO Non-Privileged userspace driver framework"
> >>>> depends on IOMMU_API
> >>>> select VFIO_IOMMU_TYPE1 if X86
> >>>>+ select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> >>>> help
> >>>> VFIO provides a framework for secure userspace device drivers.
> >>>> See Documentation/vfio.txt for more details.
> >>>>diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> >>>>index 2398d4a..72bfabc 100644
> >>>>--- a/drivers/vfio/Makefile
> >>>>+++ b/drivers/vfio/Makefile
> >>>>@@ -1,3 +1,4 @@
> >>>> obj-$(CONFIG_VFIO) += vfio.o
> >>>> obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> >>>>+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> >>>> obj-$(CONFIG_VFIO_PCI) += pci/
> >>>>diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>new file mode 100644
> >>>>index 0000000..46a6298
> >>>>--- /dev/null
> >>>>+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>@@ -0,0 +1,247 @@
> >>>>+/*
> >>>>+ * VFIO: IOMMU DMA mapping support for TCE on POWER
> >>>>+ *
> >>>>+ * Copyright (C) 2012 IBM Corp. All rights reserved.
> >>>>+ * Author: Alexey Kardashevskiy <[email protected]>
> >>>>+ *
> >>>>+ * This program is free software; you can redistribute it and/or modify
> >>>>+ * it under the terms of the GNU General Public License version 2 as
> >>>>+ * published by the Free Software Foundation.
> >>>>+ *
> >>>>+ * Derived from original vfio_iommu_type1.c:
> >>>>+ * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
> >>>>+ * Author: Alex Williamson <[email protected]>
> >>>>+ */
> >>>>+
> >>>>+#include <linux/module.h>
> >>>>+#include <linux/pci.h>
> >>>>+#include <linux/slab.h>
> >>>>+#include <linux/uaccess.h>
> >>>>+#include <linux/err.h>
> >>>>+#include <linux/vfio.h>
> >>>>+#include <asm/iommu.h>
> >>>>+
> >>>>+#define DRIVER_VERSION "0.1"
> >>>>+#define DRIVER_AUTHOR "[email protected]"
> >>>>+#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
> >>>>+
> >>>>+static void tce_iommu_detach_group(void *iommu_data,
> >>>>+ struct iommu_group *iommu_group);
> >>>>+
> >>>>+/*
> >>>>+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> >>>>+ */
> >>>>+
> >>>>+/*
> >>>>+ * The container descriptor supports only a single group per container.
> >>>>+ * Required by the API as the container is not supplied with the IOMMU group
> >>>>+ * at the moment of initialization.
> >>>>+ */
> >>>>+struct tce_container {
> >>>>+ struct mutex lock;
> >>>>+ struct iommu_table *tbl;
> >>>>+};
> >>>>+
> >>>>+static void *tce_iommu_open(unsigned long arg)
> >>>>+{
> >>>>+ struct tce_container *container;
> >>>>+
> >>>>+ if (arg != VFIO_SPAPR_TCE_IOMMU) {
> >>>>+ printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> >>>>+ return ERR_PTR(-EINVAL);
> >>>>+ }
> >>>>+
> >>>>+ container = kzalloc(sizeof(*container), GFP_KERNEL);
> >>>>+ if (!container)
> >>>>+ return ERR_PTR(-ENOMEM);
> >>>>+
> >>>>+ mutex_init(&container->lock);
> >>>>+
> >>>>+ return container;
> >>>>+}
> >>>>+
> >>>>+static void tce_iommu_release(void *iommu_data)
> >>>>+{
> >>>>+ struct tce_container *container = iommu_data;
> >>>>+
> >>>>+ WARN_ON(container->tbl && !container->tbl->it_group);
> >>>
> >>>I think your patch ordering is backwards here. it_group isn't added
> >>>until 2/2. I'd really like to see the arch/powerpc code approved and
> >>>merged by the powerpc maintainer before we add the code that makes use
> >>>of it into vfio. Otherwise we just get lots of churn if interfaces
> >>>change or they disapprove of it altogether.
> >>
> >>
> >>Makes sense, thanks.
> >>
> >>
> >>>>+ if (container->tbl && container->tbl->it_group)
> >>>>+ tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> >>>>+
> >>>>+ mutex_destroy(&container->lock);
> >>>>+
> >>>>+ kfree(container);
> >>>>+}
> >>>>+
> >>>>+static long tce_iommu_ioctl(void *iommu_data,
> >>>>+ unsigned int cmd, unsigned long arg)
> >>>>+{
> >>>>+ struct tce_container *container = iommu_data;
> >>>>+ unsigned long minsz;
> >>>>+
> >>>>+ switch (cmd) {
> >>>>+ case VFIO_CHECK_EXTENSION: {
> >>>>+ return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> >>>>+ }
> >>>>+ case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> >>>>+ struct vfio_iommu_spapr_tce_info info;
> >>>>+ struct iommu_table *tbl = container->tbl;
> >>>>+
> >>>>+ if (WARN_ON(!tbl))
> >>>>+ return -ENXIO;
> >>>>+
> >>>>+ minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> >>>>+ dma64_window_size);
> >>>>+
> >>>>+ if (copy_from_user(&info, (void __user *)arg, minsz))
> >>>>+ return -EFAULT;
> >>>>+
> >>>>+ if (info.argsz < minsz)
> >>>>+ return -EINVAL;
> >>>>+
> >>>>+ info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> >>>>+ info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> >>>>+ info.dma64_window_start = 0;
> >>>>+ info.dma64_window_size = 0;
> >>>>+ info.flags = 0;
> >>>>+
> >>>>+ if (copy_to_user((void __user *)arg, &info, minsz))
> >>>>+ return -EFAULT;
> >>>>+
> >>>>+ return 0;
> >>>>+ }
> >>>>+ case VFIO_IOMMU_MAP_DMA: {
> >>>>+ vfio_iommu_spapr_tce_dma_map param;
> >>>>+ struct iommu_table *tbl = container->tbl;
> >>>>+ enum dma_data_direction direction = DMA_NONE;
> >>>>+
> >>>>+ if (WARN_ON(!tbl))
> >>>>+ return -ENXIO;
> >>>>+
> >>>>+ minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> >>>>+
> >>>>+ if (copy_from_user(&param, (void __user *)arg, minsz))
> >>>>+ return -EFAULT;
> >>>>+
> >>>>+ if (param.argsz < minsz)
> >>>>+ return -EINVAL;
> >>>>+
> >>>>+ if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> >>>>+ (param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
> >>>>+ direction = DMA_BIDIRECTIONAL;
> >>>>+ } else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
> >>>>+ direction = DMA_TO_DEVICE;
> >>>>+ } else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
> >>>>+ direction = DMA_FROM_DEVICE;
> >>>>+ }
> >>>>+
> >>>>+ param.size += param.iova & ~IOMMU_PAGE_MASK;
> >>>>+ param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> >>>
> >>>On x86 we force iova, vaddr, and size to all be aligned to the smallest
> >>>page granularity of the iommu and return -EINVAL if it doesn't fit.
> >>>What does it imply to the user if they're always aligned to work here?
> >>>Won't this interface happily map overlapping entries with no indication
> >>>to the user that the previous mapping is no longer valid?
> >>>Maybe another reason why a combined unmap/map makes me nervous, we have
> >>>to assume the user knows what they're doing.
> >>
> >>
> >>I got used to guests which do know what they are doing so I am pretty calm :)
> >>but ok, I'll move alignment to the QEMU, it makes sense.
> >>
> >>
> >>>>+
> >>>>+ return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> >>>>+ param.vaddr & IOMMU_PAGE_MASK, direction,
> >>>>+ param.size >> IOMMU_PAGE_SHIFT);
> >>>>+ }
> >>>>+ case VFIO_IOMMU_UNMAP_DMA: {
> >>>>+ vfio_iommu_spapr_tce_dma_unmap param;
> >>>>+ struct iommu_table *tbl = container->tbl;
> >>>>+
> >>>>+ if (WARN_ON(!tbl))
> >>>>+ return -ENXIO;
> >>>>+
> >>>>+ minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> >>>>+
> >>>>+ if (copy_from_user(&param, (void __user *)arg, minsz))
> >>>>+ return -EFAULT;
> >>>>+
> >>>>+ if (param.argsz < minsz)
> >>>>+ return -EINVAL;
> >>>>+
> >>>>+ param.size += param.iova & ~IOMMU_PAGE_MASK;
> >>>>+ param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> >>>>+
> >>>>+ return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> >>>>+ 0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
> >>>>+ }
> >>>>+ default:
> >>>>+ printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
> >>>
> >>>pr_warn
> >>>
> >>>>+ }
> >>>>+
> >>>>+ return -ENOTTY;
> >>>>+}
> >>>>+
> >>>>+static int tce_iommu_attach_group(void *iommu_data,
> >>>>+ struct iommu_group *iommu_group)
> >>>>+{
> >>>>+ struct tce_container *container = iommu_data;
> >>>>+ struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> >>>>+
> >>>>+ BUG_ON(!tbl);
> >>>>+ mutex_lock(&container->lock);
> >>>>+ pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> >>>>+ iommu_group_id(iommu_group), iommu_group);
> >>>>+ if (container->tbl) {
> >>>>+ printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> >>>
> >>>pr_warn
> >>>
> >>>>+ iommu_group_id(container->tbl->it_group),
> >>>>+ iommu_group_id(iommu_group));
> >>>>+ mutex_unlock(&container->lock);
> >>>>+ return -EBUSY;
> >>>>+ }
> >>>>+
> >>>>+ container->tbl = tbl;
> >>>
> >>>Would it be too much paranoia to clear all the tce here as you do below
> >>>on detach?
> >>
> >>Guess so. I do unmap on detach() and the guest calls put_tce(0) (i.e.
> >>unmaps) the whole DMA window at the boot time.
> >
> >But that's just one user of this interface, we can't assume they'll all
> >be so agreeable. If any tces were enabled here, a malicious user would
> >have a window to host memory, right? Thanks,
>
>
> But I still release pages on detach(), how can this code be not
> called on the guest exit (normal or crashed)?

I think the concern is about robustness if some bug elsewhere in the
kernel left some TCE entries in place before the table was handed over
to VFIO.

--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson

2012-11-28 07:18:49

by Alexey Kardashevskiy

[permalink] [raw]
Subject: [PATCH] vfio powerpc: enabled on powernv platform

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <[email protected]>
Signed-off-by: Alexey Kardashevskiy <[email protected]>
---
arch/powerpc/include/asm/iommu.h | 9 +++
arch/powerpc/kernel/iommu.c | 147 ++++++++++++++++++++++++++++++++++
arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++++++++++++++
drivers/iommu/Kconfig | 8 ++
4 files changed, 299 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5c7087a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
struct iommu_pool large_pool;
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map; /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+ struct iommu_group *it_group;
+#endif
};

struct scatterlist;
@@ -147,5 +150,11 @@ static inline void iommu_restore(void)
}
#endif

+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+ unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+ uint64_t tce, enum dma_data_direction direction,
+ unsigned long pages);
+
#endif /* __KERNEL__ */
#endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..1456b6e 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
#include <asm/kdump.h>
#include <asm/fadump.h>
#include <asm/vio.h>
+#include <asm/tce.h>

#define DBG(...)

@@ -856,3 +857,149 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
free_pages((unsigned long)vaddr, get_order(size));
}
}
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static void tce_flush(struct iommu_table *tbl)
+{
+ /* Flush/invalidate TLB caches if necessary */
+ if (ppc_md.tce_flush)
+ ppc_md.tce_flush(tbl);
+
+ /* Make sure updates are seen by hardware */
+ mb();
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number of pages
+ * which it called put_page() on.
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+ unsigned long pages)
+{
+ int i, pages_put = 0;
+ unsigned long oldtce;
+ struct page *page;
+
+ for (i = 0; i < pages; ++i) {
+ oldtce = ppc_md.tce_get(tbl, entry + i);
+ ppc_md.tce_free(tbl, entry + i, 1);
+
+ if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+ continue;
+
+ page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+ WARN_ON(!page);
+ if (!page)
+ continue;
+
+ if (oldtce & TCE_PCI_WRITE)
+ SetPageDirty(page);
+
+ ++pages_put;
+ put_page(page);
+ }
+
+ return pages_put;
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number of released pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+ unsigned long pages)
+{
+ int ret;
+ struct iommu_pool *pool = get_pool(tbl, entry);
+
+ spin_lock(&(pool->lock));
+ ret = clear_tces_nolock(tbl, entry, pages);
+ tce_flush(tbl);
+ spin_unlock(&(pool->lock));
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+ uint64_t tce, enum dma_data_direction direction)
+{
+ int ret;
+ struct page *page = NULL;
+ unsigned long kva, offset;
+
+ /* Map new TCE */
+ offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+
+ ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+ direction != DMA_TO_DEVICE, &page);
+ if (ret < 1) {
+ printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+ tce, entry << IOMMU_PAGE_SHIFT, ret);
+ if (!ret)
+ ret = -EFAULT;
+ return ret;
+ }
+
+ kva = (unsigned long) page_address(page);
+ kva += offset;
+
+ /* tce_build receives a virtual address */
+ entry += tbl->it_offset; /* Offset into real TCE table */
+ ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+ /* tce_build() only returns non-zero for transient errors */
+ if (unlikely(ret)) {
+ printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+ tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+ put_page(page);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * iommu_put_tces builds tces and returned the number of actually locked pages
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+ uint64_t tce, enum dma_data_direction direction,
+ unsigned long pages)
+{
+ int i, ret = 0;
+ struct iommu_pool *pool = get_pool(tbl, entry);
+
+ BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+ BUG_ON(direction == DMA_NONE);
+
+ spin_lock(&(pool->lock));
+
+ /* Check if any is in use */
+ for (i = 0; i < pages; ++i) {
+ unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
+ if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) {
+ spin_unlock(&(pool->lock));
+ return -EBUSY;
+ }
+ }
+
+ /* Put tces to the table */
+ for (i = 0; (i < pages) && !ret; ++i, tce += IOMMU_PAGE_SIZE)
+ ret = put_tce(tbl, entry + i, tce, direction);
+
+ /* If failed, release locked pages, otherwise return the number of pages */
+ if (ret)
+ clear_tces_nolock(tbl, entry, i);
+ else
+ ret = pages;
+
+ tce_flush(tbl);
+ spin_unlock(&(pool->lock));
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..21250ef 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
#include <linux/irq.h>
#include <linux/io.h>
#include <linux/msi.h>
+#include <linux/iommu.h>

#include <asm/sections.h>
#include <asm/io.h>
@@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
#endif
}
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+ struct iommu_table *tbl;
+ int ret = 0;
+
+ if (WARN_ON(dev->iommu_group)) {
+ printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+ dev_name(dev),
+ iommu_group_id(dev->iommu_group));
+ return -EBUSY;
+ }
+
+ tbl = get_iommu_table_base(dev);
+ if (!tbl) {
+ pr_debug("tce_vfio: skipping device %s with no tbl\n",
+ dev_name(dev));
+ return 0;
+ }
+
+ pr_debug("tce_vfio: adding %s to iommu group %d\n",
+ dev_name(dev), iommu_group_id(tbl->it_group));
+
+ ret = iommu_group_add_device(tbl->it_group, dev);
+ if (ret < 0)
+ printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+ dev_name(dev), ret);
+
+ return ret;
+}
+
+static void del_device(struct device *dev)
+{
+ iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+
+ switch (action) {
+ case BUS_NOTIFY_ADD_DEVICE:
+ return add_device(dev);
+ case BUS_NOTIFY_DEL_DEVICE:
+ del_device(dev);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+ .notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+ struct iommu_table *tbl = iommu_data;
+ tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iommu_table *tbl;
+ struct iommu_group *grp;
+
+ /* Allocate and initialize IOMMU groups */
+ for_each_pci_dev(pdev) {
+ tbl = get_iommu_table_base(&pdev->dev);
+ if (!tbl)
+ continue;
+
+ /* Skip already initialized */
+ if (tbl->it_group)
+ continue;
+
+ grp = iommu_group_alloc();
+ if (IS_ERR(grp)) {
+ printk(KERN_INFO "tce_vfio: cannot create "
+ "new IOMMU group, ret=%ld\n",
+ PTR_ERR(grp));
+ return PTR_ERR(grp);
+ }
+ tbl->it_group = grp;
+ iommu_group_set_iommudata(grp, tbl, group_release);
+ }
+
+ bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+ /* Add PCI devices to VFIO groups */
+ for_each_pci_dev(pdev)
+ add_device(&pdev->dev);
+
+ return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iommu_table *tbl;
+ struct iommu_group *grp = NULL;
+
+ bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+ /* Delete PCI devices from VFIO groups */
+ for_each_pci_dev(pdev)
+ del_device(&pdev->dev);
+
+ /* Release VFIO groups */
+ for_each_pci_dev(pdev) {
+ tbl = get_iommu_table_base(&pdev->dev);
+ if (!tbl)
+ continue;
+ grp = tbl->it_group;
+
+ /* Skip (already) uninitialized */
+ if (!grp)
+ continue;
+
+ /* Do actual release, group_release() is expected to work */
+ iommu_group_put(grp);
+ BUG_ON(tbl->it_group);
+ }
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG

Say N unless you need kernel log message for IOMMU debugging

+config SPAPR_TCE_IOMMU
+ bool "sPAPR TCE IOMMU Support"
+ depends on PPC_POWERNV
+ select IOMMU_API
+ help
+ Enables bits of IOMMU API required by VFIO. The iommu_ops is
+ still not implemented.
+
endif # IOMMU_SUPPORT
--
1.7.10.4

2012-11-28 07:21:51

by Alexey Kardashevskiy

[permalink] [raw]
Subject: [PATCH] vfio powerpc: implemented IOMMU driver for VFIO

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling. This patch implements an IOMMU driver for VFIO
which does mapping/unmapping pages for the guest IO and
provides information about DMA window (required by a POWERPC
guest).

The counterpart in QEMU is required to support this functionality.

Cc: David Gibson <[email protected]>
Signed-off-by: Alexey Kardashevskiy <[email protected]>
---
drivers/vfio/Kconfig | 6 +
drivers/vfio/Makefile | 1 +
drivers/vfio/vfio_iommu_spapr_tce.c | 332 +++++++++++++++++++++++++++++++++++
include/linux/vfio.h | 33 ++++
4 files changed, 372 insertions(+)
create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
depends on VFIO
default n

+config VFIO_IOMMU_SPAPR_TCE
+ tristate
+ depends on VFIO && SPAPR_TCE_IOMMU
+ default n
+
menuconfig VFIO
tristate "VFIO Non-Privileged userspace driver framework"
depends on IOMMU_API
select VFIO_IOMMU_TYPE1 if X86
+ select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
help
VFIO provides a framework for secure userspace device drivers.
See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
obj-$(CONFIG_VFIO) += vfio.o
obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..b98770e
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,332 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Author: Alexey Kardashevskiy <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
+ * Author: Alex Williamson <[email protected]>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION "0.1"
+#define DRIVER_AUTHOR "[email protected]"
+#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+ struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT)
+
+struct vwork {
+ struct mm_struct *mm;
+ long npage;
+ struct work_struct work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void lock_acct_bg(struct work_struct *work)
+{
+ struct vwork *vwork = container_of(work, struct vwork, work);
+ struct mm_struct *mm;
+
+ mm = vwork->mm;
+ down_write(&mm->mmap_sem);
+ mm->locked_vm += vwork->npage;
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+ kfree(vwork);
+}
+
+static void lock_acct(long npage)
+{
+ struct vwork *vwork;
+ struct mm_struct *mm;
+
+ if (!current->mm)
+ return; /* process exited */
+
+ if (down_write_trylock(&current->mm->mmap_sem)) {
+ current->mm->locked_vm += npage;
+ up_write(&current->mm->mmap_sem);
+ return;
+ }
+
+ /*
+ * Couldn't get mmap_sem lock, so must setup to update
+ * mm->locked_vm later. If locked_vm were atomic, we
+ * wouldn't need this silliness
+ */
+ vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+ if (!vwork)
+ return;
+ mm = get_task_mm(current);
+ if (!mm) {
+ kfree(vwork);
+ return;
+ }
+ INIT_WORK(&vwork->work, lock_acct_bg);
+ vwork->mm = mm;
+ vwork->npage = npage;
+ schedule_work(&vwork->work);
+}
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+ struct mutex lock;
+ struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+ struct tce_container *container;
+
+ if (arg != VFIO_SPAPR_TCE_IOMMU) {
+ pr_err("tce_vfio: Wrong IOMMU type\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ container = kzalloc(sizeof(*container), GFP_KERNEL);
+ if (!container)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&container->lock);
+
+ return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+ struct tce_container *container = iommu_data;
+
+ WARN_ON(container->tbl && !container->tbl->it_group);
+ if (container->tbl && container->tbl->it_group)
+ tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+ mutex_destroy(&container->lock);
+
+ kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+ unsigned int cmd, unsigned long arg)
+{
+ struct tce_container *container = iommu_data;
+ unsigned long minsz;
+ long ret;
+
+ switch (cmd) {
+ case VFIO_CHECK_EXTENSION: {
+ return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+ }
+ case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+ struct vfio_iommu_spapr_tce_info info;
+ struct iommu_table *tbl = container->tbl;
+
+ if (WARN_ON(!tbl))
+ return -ENXIO;
+
+ minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+ dma64_window_size);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+ info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+ info.dma64_window_start = 0;
+ info.dma64_window_size = 0;
+ info.flags = 0;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+ case VFIO_IOMMU_MAP_DMA: {
+ vfio_iommu_spapr_tce_dma_map param;
+ struct iommu_table *tbl = container->tbl;
+ enum dma_data_direction direction;
+ unsigned long locked, lock_limit;
+
+ if (WARN_ON(!tbl))
+ return -ENXIO;
+
+ minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
+
+ if (copy_from_user(&param, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (param.argsz < minsz)
+ return -EINVAL;
+
+ if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
+ (param.flags & VFIO_DMA_MAP_FLAG_WRITE))
+ direction = DMA_BIDIRECTIONAL;
+ else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
+ direction = DMA_TO_DEVICE;
+ else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+ direction = DMA_FROM_DEVICE;
+ else
+ return -EINVAL;
+
+ if ((param.size & ~IOMMU_PAGE_MASK) ||
+ (param.iova & ~IOMMU_PAGE_MASK) ||
+ (param.vaddr & ~IOMMU_PAGE_MASK))
+ return -EINVAL;
+
+ /* Account for locked pages */
+ locked = current->mm->locked_vm +
+ (param.size >> IOMMU_PAGE_SHIFT);
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+ pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+ rlimit(RLIMIT_MEMLOCK));
+ return -ENOMEM;
+ }
+
+ ret = iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+ param.vaddr, direction,
+ param.size >> IOMMU_PAGE_SHIFT);
+ if (ret > 0)
+ lock_acct(ret);
+
+ return ret;
+ }
+ case VFIO_IOMMU_UNMAP_DMA: {
+ vfio_iommu_spapr_tce_dma_unmap param;
+ struct iommu_table *tbl = container->tbl;
+
+ if (WARN_ON(!tbl))
+ return -ENXIO;
+
+ minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
+
+ if (copy_from_user(&param, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (param.argsz < minsz)
+ return -EINVAL;
+
+ if ((param.size & ~IOMMU_PAGE_MASK) ||
+ (param.iova & ~IOMMU_PAGE_MASK))
+ return -EINVAL;
+
+ ret = iommu_clear_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+ param.size >> IOMMU_PAGE_SHIFT);
+ if (ret > 0)
+ lock_acct(-ret);
+
+ return ret;
+ }
+ default:
+ pr_warn("tce_vfio: unexpected cmd %x\n", cmd);
+ }
+
+ return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+ struct iommu_group *iommu_group)
+{
+ struct tce_container *container = iommu_data;
+ struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+ BUG_ON(!tbl);
+ mutex_lock(&container->lock);
+ pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+ iommu_group_id(iommu_group), iommu_group);
+ if (container->tbl) {
+ pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+ iommu_group_id(container->tbl->it_group),
+ iommu_group_id(iommu_group));
+ mutex_unlock(&container->lock);
+ return -EBUSY;
+ }
+
+ container->tbl = tbl;
+ iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
+ mutex_unlock(&container->lock);
+
+ return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+ struct iommu_group *iommu_group)
+{
+ struct tce_container *container = iommu_data;
+ struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+ BUG_ON(!tbl);
+ mutex_lock(&container->lock);
+ if (tbl != container->tbl) {
+ pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+ iommu_group_id(iommu_group),
+ iommu_group_id(tbl->it_group));
+ } else {
+
+ pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+ iommu_group_id(iommu_group), iommu_group);
+
+ iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
+ container->tbl = NULL;
+ }
+ mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+ .name = "iommu-vfio-powerpc",
+ .owner = THIS_MODULE,
+ .open = tce_iommu_open,
+ .release = tce_iommu_release,
+ .ioctl = tce_iommu_ioctl,
+ .attach_group = tce_iommu_attach_group,
+ .detach_group = tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+ return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+ vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..820af1e 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
/* Extensions */

#define VFIO_TYPE1_IOMMU 1
+#define VFIO_SPAPR_TCE_IOMMU 2

/*
* The IOCTL interface is designed for extensibility by embedding the
@@ -442,4 +443,36 @@ struct vfio_iommu_type1_dma_unmap {

#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)

+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE info struct provides the information about the PCI bus
+ * address ranges available for DMA, these values are programmed into
+ * the hardware so the guest has to know that information.
+ *
+ * Pages within 32 bit window should be explicitely mapped/unmapped via ioctls.
+ * 64 bit window (not supported at the moment for the guest) is supposed to
+ * be mapped completely to the guest memory so the devices capable of 64bit
+ * DMA will not have to use map/unmap ioctls.
+ *
+ * The IOMMU page size is always 4K.
+ */
+
+struct vfio_iommu_spapr_tce_info {
+ __u32 argsz;
+ __u32 flags; /* reserved for future use */
+ __u32 dma32_window_start; /* 32 bit window start (bytes) */
+ __u32 dma32_window_size; /* 32 bit window size (bytes) */
+ __u64 dma64_window_start; /* 64 bit window start (bytes) */
+ __u64 dma64_window_size; /* 64 bit window size (bytes) */
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* Reuse type1 map/unmap structs as they are the same at the moment */
+typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
+typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
+
+/* ***************************************************************** */
+
#endif /* VFIO_H */
--
1.7.10.4

2012-11-28 21:02:06

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: implemented IOMMU driver for VFIO

On Wed, 2012-11-28 at 18:21 +1100, Alexey Kardashevskiy wrote:
> VFIO implements platform independent stuff such as
> a PCI driver, BAR access (via read/write on a file descriptor
> or direct mapping when possible) and IRQ signaling.
>
> The platform dependent part includes IOMMU initialization
> and handling. This patch implements an IOMMU driver for VFIO
> which does mapping/unmapping pages for the guest IO and
> provides information about DMA window (required by a POWERPC
> guest).
>
> The counterpart in QEMU is required to support this functionality.
>
> Cc: David Gibson <[email protected]>
> Signed-off-by: Alexey Kardashevskiy <[email protected]>
> ---
> drivers/vfio/Kconfig | 6 +
> drivers/vfio/Makefile | 1 +
> drivers/vfio/vfio_iommu_spapr_tce.c | 332 +++++++++++++++++++++++++++++++++++
> include/linux/vfio.h | 33 ++++
> 4 files changed, 372 insertions(+)
> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 7cd5dec..b464687 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
> depends on VFIO
> default n
>
> +config VFIO_IOMMU_SPAPR_TCE
> + tristate
> + depends on VFIO && SPAPR_TCE_IOMMU
> + default n
> +
> menuconfig VFIO
> tristate "VFIO Non-Privileged userspace driver framework"
> depends on IOMMU_API
> select VFIO_IOMMU_TYPE1 if X86
> + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> help
> VFIO provides a framework for secure userspace device drivers.
> See Documentation/vfio.txt for more details.
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 2398d4a..72bfabc 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,3 +1,4 @@
> obj-$(CONFIG_VFIO) += vfio.o
> obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> new file mode 100644
> index 0000000..b98770e
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -0,0 +1,332 @@
> +/*
> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> + *
> + * Copyright (C) 2012 IBM Corp. All rights reserved.
> + * Author: Alexey Kardashevskiy <[email protected]>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio_iommu_type1.c:
> + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
> + * Author: Alex Williamson <[email protected]>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/err.h>
> +#include <linux/vfio.h>
> +#include <asm/iommu.h>
> +
> +#define DRIVER_VERSION "0.1"
> +#define DRIVER_AUTHOR "[email protected]"
> +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> + struct iommu_group *iommu_group);
> +
> +/*
> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> + */
> +
> +/*
> + * This code handles mapping and unmapping of user data buffers
> + * into DMA'ble space using the IOMMU
> + */
> +
> +#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT)
> +
> +struct vwork {
> + struct mm_struct *mm;
> + long npage;
> + struct work_struct work;
> +};
> +
> +/* delayed decrement/increment for locked_vm */
> +static void lock_acct_bg(struct work_struct *work)
> +{
> + struct vwork *vwork = container_of(work, struct vwork, work);
> + struct mm_struct *mm;
> +
> + mm = vwork->mm;
> + down_write(&mm->mmap_sem);
> + mm->locked_vm += vwork->npage;
> + up_write(&mm->mmap_sem);
> + mmput(mm);
> + kfree(vwork);
> +}
> +
> +static void lock_acct(long npage)
> +{
> + struct vwork *vwork;
> + struct mm_struct *mm;
> +
> + if (!current->mm)
> + return; /* process exited */
> +
> + if (down_write_trylock(&current->mm->mmap_sem)) {
> + current->mm->locked_vm += npage;
> + up_write(&current->mm->mmap_sem);
> + return;
> + }
> +
> + /*
> + * Couldn't get mmap_sem lock, so must setup to update
> + * mm->locked_vm later. If locked_vm were atomic, we
> + * wouldn't need this silliness
> + */
> + vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> + if (!vwork)
> + return;
> + mm = get_task_mm(current);
> + if (!mm) {
> + kfree(vwork);
> + return;
> + }
> + INIT_WORK(&vwork->work, lock_acct_bg);
> + vwork->mm = mm;
> + vwork->npage = npage;
> + schedule_work(&vwork->work);
> +}

This looks familiar, should we split it out to a common file instead of
duplicating it?

> +
> +/*
> + * The container descriptor supports only a single group per container.
> + * Required by the API as the container is not supplied with the IOMMU group
> + * at the moment of initialization.
> + */
> +struct tce_container {
> + struct mutex lock;
> + struct iommu_table *tbl;
> +};
> +
> +static void *tce_iommu_open(unsigned long arg)
> +{
> + struct tce_container *container;
> +
> + if (arg != VFIO_SPAPR_TCE_IOMMU) {
> + pr_err("tce_vfio: Wrong IOMMU type\n");
> + return ERR_PTR(-EINVAL);
> + }
> +
> + container = kzalloc(sizeof(*container), GFP_KERNEL);
> + if (!container)
> + return ERR_PTR(-ENOMEM);
> +
> + mutex_init(&container->lock);
> +
> + return container;
> +}
> +
> +static void tce_iommu_release(void *iommu_data)
> +{
> + struct tce_container *container = iommu_data;
> +
> + WARN_ON(container->tbl && !container->tbl->it_group);
> + if (container->tbl && container->tbl->it_group)
> + tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> +
> + mutex_destroy(&container->lock);
> +
> + kfree(container);
> +}
> +
> +static long tce_iommu_ioctl(void *iommu_data,
> + unsigned int cmd, unsigned long arg)
> +{
> + struct tce_container *container = iommu_data;
> + unsigned long minsz;
> + long ret;
> +
> + switch (cmd) {
> + case VFIO_CHECK_EXTENSION: {
> + return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> + }
> + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> + struct vfio_iommu_spapr_tce_info info;
> + struct iommu_table *tbl = container->tbl;
> +
> + if (WARN_ON(!tbl))
> + return -ENXIO;
> +
> + minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> + dma64_window_size);
> +
> + if (copy_from_user(&info, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (info.argsz < minsz)
> + return -EINVAL;
> +
> + info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> + info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> + info.dma64_window_start = 0;
> + info.dma64_window_size = 0;
> + info.flags = 0;
> +
> + if (copy_to_user((void __user *)arg, &info, minsz))
> + return -EFAULT;
> +
> + return 0;
> + }
> + case VFIO_IOMMU_MAP_DMA: {
> + vfio_iommu_spapr_tce_dma_map param;
> + struct iommu_table *tbl = container->tbl;
> + enum dma_data_direction direction;
> + unsigned long locked, lock_limit;
> +
> + if (WARN_ON(!tbl))
> + return -ENXIO;
> +
> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> +
> + if (copy_from_user(&param, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (param.argsz < minsz)
> + return -EINVAL;
> +
> + if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> + (param.flags & VFIO_DMA_MAP_FLAG_WRITE))
> + direction = DMA_BIDIRECTIONAL;
> + else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
> + direction = DMA_TO_DEVICE;
> + else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
> + direction = DMA_FROM_DEVICE;
> + else
> + return -EINVAL;
> +
> + if ((param.size & ~IOMMU_PAGE_MASK) ||
> + (param.iova & ~IOMMU_PAGE_MASK) ||
> + (param.vaddr & ~IOMMU_PAGE_MASK))
> + return -EINVAL;
> +
> + /* Account for locked pages */
> + locked = current->mm->locked_vm +
> + (param.size >> IOMMU_PAGE_SHIFT);
> + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

This page accounting doesn't look right. PAGE_SIZE is several orders
bigger than IOMMU_PAGE_SIZE (right?), but we mix them here, which seems
like it will over penalize the user. For example, if a user maps 4x4k
(assume aligned and contiguous) IOMMU pages, isn't that only a single
pinned system page (assuming >=16k pages).

> + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> + pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
> + rlimit(RLIMIT_MEMLOCK));
> + return -ENOMEM;
> + }
> +
> + ret = iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> + param.vaddr, direction,
> + param.size >> IOMMU_PAGE_SHIFT);
> + if (ret > 0)
> + lock_acct(ret);
> +
> + return ret;
> + }
> + case VFIO_IOMMU_UNMAP_DMA: {
> + vfio_iommu_spapr_tce_dma_unmap param;
> + struct iommu_table *tbl = container->tbl;
> +
> + if (WARN_ON(!tbl))
> + return -ENXIO;
> +
> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> +
> + if (copy_from_user(&param, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (param.argsz < minsz)
> + return -EINVAL;
> +
> + if ((param.size & ~IOMMU_PAGE_MASK) ||
> + (param.iova & ~IOMMU_PAGE_MASK))
> + return -EINVAL;
> +
> + ret = iommu_clear_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> + param.size >> IOMMU_PAGE_SHIFT);
> + if (ret > 0)
> + lock_acct(-ret);
> +
> + return ret;
> + }
> + default:
> + pr_warn("tce_vfio: unexpected cmd %x\n", cmd);
> + }
> +
> + return -ENOTTY;
> +}
> +
> +static int tce_iommu_attach_group(void *iommu_data,
> + struct iommu_group *iommu_group)
> +{
> + struct tce_container *container = iommu_data;
> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> + BUG_ON(!tbl);
> + mutex_lock(&container->lock);
> + pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> + iommu_group_id(iommu_group), iommu_group);
> + if (container->tbl) {
> + pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> + iommu_group_id(container->tbl->it_group),
> + iommu_group_id(iommu_group));
> + mutex_unlock(&container->lock);
> + return -EBUSY;
> + }
> +
> + container->tbl = tbl;
> + iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
> + mutex_unlock(&container->lock);
> +
> + return 0;
> +}
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> + struct iommu_group *iommu_group)
> +{
> + struct tce_container *container = iommu_data;
> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> + BUG_ON(!tbl);
> + mutex_lock(&container->lock);
> + if (tbl != container->tbl) {
> + pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
> + iommu_group_id(iommu_group),
> + iommu_group_id(tbl->it_group));
> + } else {
> +
> + pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
> + iommu_group_id(iommu_group), iommu_group);
> +
> + iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
> + container->tbl = NULL;
> + }
> + mutex_unlock(&container->lock);
> +}
> +
> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
> + .name = "iommu-vfio-powerpc",
> + .owner = THIS_MODULE,
> + .open = tce_iommu_open,
> + .release = tce_iommu_release,
> + .ioctl = tce_iommu_ioctl,
> + .attach_group = tce_iommu_attach_group,
> + .detach_group = tce_iommu_detach_group,
> +};
> +
> +static int __init tce_iommu_init(void)
> +{
> + return vfio_register_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> + vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> +
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0a4f180..820af1e 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
> /* Extensions */
>
> #define VFIO_TYPE1_IOMMU 1
> +#define VFIO_SPAPR_TCE_IOMMU 2
>
> /*
> * The IOCTL interface is designed for extensibility by embedding the
> @@ -442,4 +443,36 @@ struct vfio_iommu_type1_dma_unmap {
>
> #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>
> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> +
> +/*
> + * The SPAPR TCE info struct provides the information about the PCI bus
> + * address ranges available for DMA, these values are programmed into
> + * the hardware so the guest has to know that information.
> + *
> + * Pages within 32 bit window should be explicitely mapped/unmapped via ioctls.
^^^^^^^^^^^
explicitly

> + * 64 bit window (not supported at the moment for the guest) is supposed to
> + * be mapped completely to the guest memory so the devices capable of 64bit
> + * DMA will not have to use map/unmap ioctls.
> + *
> + * The IOMMU page size is always 4K.
> + */

Thanks,

Alex

> +
> +struct vfio_iommu_spapr_tce_info {
> + __u32 argsz;
> + __u32 flags; /* reserved for future use */
> + __u32 dma32_window_start; /* 32 bit window start (bytes) */
> + __u32 dma32_window_size; /* 32 bit window size (bytes) */
> + __u64 dma64_window_start; /* 64 bit window start (bytes) */
> + __u64 dma64_window_size; /* 64 bit window size (bytes) */
> +};
> +
> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +/* Reuse type1 map/unmap structs as they are the same at the moment */
> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
> +
> +/* ***************************************************************** */
> +
> #endif /* VFIO_H */


2012-11-28 21:31:00

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: enabled on powernv platform

On Wed, 2012-11-28 at 18:18 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
>
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
>
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
>
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
>
> Cc: David Gibson <[email protected]>
> Signed-off-by: Alexey Kardashevskiy <[email protected]>
> ---
> arch/powerpc/include/asm/iommu.h | 9 +++
> arch/powerpc/kernel/iommu.c | 147 ++++++++++++++++++++++++++++++++++
> arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++++++++++++++
> drivers/iommu/Kconfig | 8 ++
> 4 files changed, 299 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..5c7087a 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
> struct iommu_pool large_pool;
> struct iommu_pool pools[IOMMU_NR_POOLS];
> unsigned long *it_map; /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> + struct iommu_group *it_group;
> +#endif
> };
>
> struct scatterlist;
> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
> }
> #endif
>
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> + unsigned long pages);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> + uint64_t tce, enum dma_data_direction direction,
> + unsigned long pages);
> +
> #endif /* __KERNEL__ */
> #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..1456b6e 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
> #include <asm/kdump.h>
> #include <asm/fadump.h>
> #include <asm/vio.h>
> +#include <asm/tce.h>
>
> #define DBG(...)
>
> @@ -856,3 +857,149 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> free_pages((unsigned long)vaddr, get_order(size));
> }
> }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +static void tce_flush(struct iommu_table *tbl)
> +{
> + /* Flush/invalidate TLB caches if necessary */
> + if (ppc_md.tce_flush)
> + ppc_md.tce_flush(tbl);
> +
> + /* Make sure updates are seen by hardware */
> + mb();
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number of pages
> + * which it called put_page() on.
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> + unsigned long pages)
> +{
> + int i, pages_put = 0;
> + unsigned long oldtce;
> + struct page *page;
> +
> + for (i = 0; i < pages; ++i) {
> + oldtce = ppc_md.tce_get(tbl, entry + i);
> + ppc_md.tce_free(tbl, entry + i, 1);
> +
> + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> + continue;
> +
> + page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> + WARN_ON(!page);
> + if (!page)
> + continue;
> +
> + if (oldtce & TCE_PCI_WRITE)
> + SetPageDirty(page);
> +
> + ++pages_put;
> + put_page(page);
> + }
> +
> + return pages_put;
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number of released pages
> + */
> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> + unsigned long pages)
> +{
> + int ret;
> + struct iommu_pool *pool = get_pool(tbl, entry);
> +
> + spin_lock(&(pool->lock));
> + ret = clear_tces_nolock(tbl, entry, pages);
> + tce_flush(tbl);
> + spin_unlock(&(pool->lock));
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> + uint64_t tce, enum dma_data_direction direction)
> +{
> + int ret;
> + struct page *page = NULL;
> + unsigned long kva, offset;
> +
> + /* Map new TCE */
> + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> +
> + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> + direction != DMA_TO_DEVICE, &page);
> + if (ret < 1) {
> + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> + tce, entry << IOMMU_PAGE_SHIFT, ret);
> + if (!ret)
> + ret = -EFAULT;
> + return ret;
> + }
> +
> + kva = (unsigned long) page_address(page);
> + kva += offset;
> +
> + /* tce_build receives a virtual address */
> + entry += tbl->it_offset; /* Offset into real TCE table */
> + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> + /* tce_build() only returns non-zero for transient errors */
> + if (unlikely(ret)) {
> + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> + tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> + put_page(page);
> + return -EIO;
> + }
> +
> + return 0;
> +}
> +
> +/*
> + * iommu_put_tces builds tces and returned the number of actually locked pages
> + */
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> + uint64_t tce, enum dma_data_direction direction,
> + unsigned long pages)
> +{
> + int i, ret = 0;
> + struct iommu_pool *pool = get_pool(tbl, entry);
> +
> + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> + BUG_ON(direction == DMA_NONE);
> +
> + spin_lock(&(pool->lock));
> +
> + /* Check if any is in use */
> + for (i = 0; i < pages; ++i) {
> + unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
> + if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) {
> + spin_unlock(&(pool->lock));
> + return -EBUSY;
> + }
> + }
> +
> + /* Put tces to the table */
> + for (i = 0; (i < pages) && !ret; ++i, tce += IOMMU_PAGE_SIZE)
> + ret = put_tce(tbl, entry + i, tce, direction);
> +
> + /* If failed, release locked pages, otherwise return the number of pages */
> + if (ret)
> + clear_tces_nolock(tbl, entry, i);
> + else
> + ret = pages;
> +
> + tce_flush(tbl);
> + spin_unlock(&(pool->lock));
> +
> + return ret;
> +}

Nice, no more kmalloc! I'm still concerned about the IOMMU page size
mismatch here. If nothing else, the comment is misleading since we're
locking system pages, but returning tce pages. The user would therefore
need to multiply their locked memory limit by (PAGE_SHIFT -
IOMMU_PAGE_SHIFT). Thanks,

Alex

> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..21250ef 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
> #include <linux/irq.h>
> #include <linux/io.h>
> #include <linux/msi.h>
> +#include <linux/iommu.h>
>
> #include <asm/sections.h>
> #include <asm/io.h>
> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
> ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
> #endif
> }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> + struct iommu_table *tbl;
> + int ret = 0;
> +
> + if (WARN_ON(dev->iommu_group)) {
> + printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> + dev_name(dev),
> + iommu_group_id(dev->iommu_group));
> + return -EBUSY;
> + }
> +
> + tbl = get_iommu_table_base(dev);
> + if (!tbl) {
> + pr_debug("tce_vfio: skipping device %s with no tbl\n",
> + dev_name(dev));
> + return 0;
> + }
> +
> + pr_debug("tce_vfio: adding %s to iommu group %d\n",
> + dev_name(dev), iommu_group_id(tbl->it_group));
> +
> + ret = iommu_group_add_device(tbl->it_group, dev);
> + if (ret < 0)
> + printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> + dev_name(dev), ret);
> +
> + return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> + iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> + unsigned long action, void *data)
> +{
> + struct device *dev = data;
> +
> + switch (action) {
> + case BUS_NOTIFY_ADD_DEVICE:
> + return add_device(dev);
> + case BUS_NOTIFY_DEL_DEVICE:
> + del_device(dev);
> + return 0;
> + default:
> + return 0;
> + }
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> + .notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> + struct iommu_table *tbl = iommu_data;
> + tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> + struct pci_dev *pdev = NULL;
> + struct iommu_table *tbl;
> + struct iommu_group *grp;
> +
> + /* Allocate and initialize IOMMU groups */
> + for_each_pci_dev(pdev) {
> + tbl = get_iommu_table_base(&pdev->dev);
> + if (!tbl)
> + continue;
> +
> + /* Skip already initialized */
> + if (tbl->it_group)
> + continue;
> +
> + grp = iommu_group_alloc();
> + if (IS_ERR(grp)) {
> + printk(KERN_INFO "tce_vfio: cannot create "
> + "new IOMMU group, ret=%ld\n",
> + PTR_ERR(grp));
> + return PTR_ERR(grp);
> + }
> + tbl->it_group = grp;
> + iommu_group_set_iommudata(grp, tbl, group_release);
> + }
> +
> + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> + /* Add PCI devices to VFIO groups */
> + for_each_pci_dev(pdev)
> + add_device(&pdev->dev);
> +
> + return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> + struct pci_dev *pdev = NULL;
> + struct iommu_table *tbl;
> + struct iommu_group *grp = NULL;
> +
> + bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> + /* Delete PCI devices from VFIO groups */
> + for_each_pci_dev(pdev)
> + del_device(&pdev->dev);
> +
> + /* Release VFIO groups */
> + for_each_pci_dev(pdev) {
> + tbl = get_iommu_table_base(&pdev->dev);
> + if (!tbl)
> + continue;
> + grp = tbl->it_group;
> +
> + /* Skip (already) uninitialized */
> + if (!grp)
> + continue;
> +
> + /* Do actual release, group_release() is expected to work */
> + iommu_group_put(grp);
> + BUG_ON(tbl->it_group);
> + }
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>
> Say N unless you need kernel log message for IOMMU debugging
>
> +config SPAPR_TCE_IOMMU
> + bool "sPAPR TCE IOMMU Support"
> + depends on PPC_POWERNV
> + select IOMMU_API
> + help
> + Enables bits of IOMMU API required by VFIO. The iommu_ops is
> + still not implemented.
> +
> endif # IOMMU_SUPPORT


2012-11-29 03:51:09

by Alexey Kardashevskiy

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: implemented IOMMU driver for VFIO

On 29/11/12 08:01, Alex Williamson wrote:
> On Wed, 2012-11-28 at 18:21 +1100, Alexey Kardashevskiy wrote:
>> VFIO implements platform independent stuff such as
>> a PCI driver, BAR access (via read/write on a file descriptor
>> or direct mapping when possible) and IRQ signaling.
>>
>> The platform dependent part includes IOMMU initialization
>> and handling. This patch implements an IOMMU driver for VFIO
>> which does mapping/unmapping pages for the guest IO and
>> provides information about DMA window (required by a POWERPC
>> guest).
>>
>> The counterpart in QEMU is required to support this functionality.
>>
>> Cc: David Gibson <[email protected]>
>> Signed-off-by: Alexey Kardashevskiy <[email protected]>
>> ---
>> drivers/vfio/Kconfig | 6 +
>> drivers/vfio/Makefile | 1 +
>> drivers/vfio/vfio_iommu_spapr_tce.c | 332 +++++++++++++++++++++++++++++++++++
>> include/linux/vfio.h | 33 ++++
>> 4 files changed, 372 insertions(+)
>> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>
>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
>> index 7cd5dec..b464687 100644
>> --- a/drivers/vfio/Kconfig
>> +++ b/drivers/vfio/Kconfig
>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>> depends on VFIO
>> default n
>>
>> +config VFIO_IOMMU_SPAPR_TCE
>> + tristate
>> + depends on VFIO && SPAPR_TCE_IOMMU
>> + default n
>> +
>> menuconfig VFIO
>> tristate "VFIO Non-Privileged userspace driver framework"
>> depends on IOMMU_API
>> select VFIO_IOMMU_TYPE1 if X86
>> + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>> help
>> VFIO provides a framework for secure userspace device drivers.
>> See Documentation/vfio.txt for more details.
>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
>> index 2398d4a..72bfabc 100644
>> --- a/drivers/vfio/Makefile
>> +++ b/drivers/vfio/Makefile
>> @@ -1,3 +1,4 @@
>> obj-$(CONFIG_VFIO) += vfio.o
>> obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>> obj-$(CONFIG_VFIO_PCI) += pci/
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> new file mode 100644
>> index 0000000..b98770e
>> --- /dev/null
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -0,0 +1,332 @@
>> +/*
>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
>> + *
>> + * Copyright (C) 2012 IBM Corp. All rights reserved.
>> + * Author: Alexey Kardashevskiy <[email protected]>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + *
>> + * Derived from original vfio_iommu_type1.c:
>> + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
>> + * Author: Alex Williamson <[email protected]>
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/pci.h>
>> +#include <linux/slab.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/err.h>
>> +#include <linux/vfio.h>
>> +#include <asm/iommu.h>
>> +
>> +#define DRIVER_VERSION "0.1"
>> +#define DRIVER_AUTHOR "[email protected]"
>> +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
>> +
>> +static void tce_iommu_detach_group(void *iommu_data,
>> + struct iommu_group *iommu_group);
>> +
>> +/*
>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
>> + */
>> +
>> +/*
>> + * This code handles mapping and unmapping of user data buffers
>> + * into DMA'ble space using the IOMMU
>> + */
>> +
>> +#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT)
>> +
>> +struct vwork {
>> + struct mm_struct *mm;
>> + long npage;
>> + struct work_struct work;
>> +};
>> +
>> +/* delayed decrement/increment for locked_vm */
>> +static void lock_acct_bg(struct work_struct *work)
>> +{
>> + struct vwork *vwork = container_of(work, struct vwork, work);
>> + struct mm_struct *mm;
>> +
>> + mm = vwork->mm;
>> + down_write(&mm->mmap_sem);
>> + mm->locked_vm += vwork->npage;
>> + up_write(&mm->mmap_sem);
>> + mmput(mm);
>> + kfree(vwork);
>> +}
>> +
>> +static void lock_acct(long npage)
>> +{
>> + struct vwork *vwork;
>> + struct mm_struct *mm;
>> +
>> + if (!current->mm)
>> + return; /* process exited */
>> +
>> + if (down_write_trylock(&current->mm->mmap_sem)) {
>> + current->mm->locked_vm += npage;
>> + up_write(&current->mm->mmap_sem);
>> + return;
>> + }
>> +
>> + /*
>> + * Couldn't get mmap_sem lock, so must setup to update
>> + * mm->locked_vm later. If locked_vm were atomic, we
>> + * wouldn't need this silliness
>> + */
>> + vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
>> + if (!vwork)
>> + return;
>> + mm = get_task_mm(current);
>> + if (!mm) {
>> + kfree(vwork);
>> + return;
>> + }
>> + INIT_WORK(&vwork->work, lock_acct_bg);
>> + vwork->mm = mm;
>> + vwork->npage = npage;
>> + schedule_work(&vwork->work);
>> +}
>
> This looks familiar, should we split it out to a common file instead of
> duplicating it?

It is simple cut-n-paste from type1 driver :)
Moving it to a separate file is up to you but it is quite small piece of
code to move it somewhere, and I have not fixed rlimit handling yet, so
wait a bit.


>> +
>> +/*
>> + * The container descriptor supports only a single group per container.
>> + * Required by the API as the container is not supplied with the IOMMU group
>> + * at the moment of initialization.
>> + */
>> +struct tce_container {
>> + struct mutex lock;
>> + struct iommu_table *tbl;
>> +};
>> +
>> +static void *tce_iommu_open(unsigned long arg)
>> +{
>> + struct tce_container *container;
>> +
>> + if (arg != VFIO_SPAPR_TCE_IOMMU) {
>> + pr_err("tce_vfio: Wrong IOMMU type\n");
>> + return ERR_PTR(-EINVAL);
>> + }
>> +
>> + container = kzalloc(sizeof(*container), GFP_KERNEL);
>> + if (!container)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + mutex_init(&container->lock);
>> +
>> + return container;
>> +}
>> +
>> +static void tce_iommu_release(void *iommu_data)
>> +{
>> + struct tce_container *container = iommu_data;
>> +
>> + WARN_ON(container->tbl && !container->tbl->it_group);
>> + if (container->tbl && container->tbl->it_group)
>> + tce_iommu_detach_group(iommu_data, container->tbl->it_group);
>> +
>> + mutex_destroy(&container->lock);
>> +
>> + kfree(container);
>> +}
>> +
>> +static long tce_iommu_ioctl(void *iommu_data,
>> + unsigned int cmd, unsigned long arg)
>> +{
>> + struct tce_container *container = iommu_data;
>> + unsigned long minsz;
>> + long ret;
>> +
>> + switch (cmd) {
>> + case VFIO_CHECK_EXTENSION: {
>> + return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
>> + }
>> + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>> + struct vfio_iommu_spapr_tce_info info;
>> + struct iommu_table *tbl = container->tbl;
>> +
>> + if (WARN_ON(!tbl))
>> + return -ENXIO;
>> +
>> + minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
>> + dma64_window_size);
>> +
>> + if (copy_from_user(&info, (void __user *)arg, minsz))
>> + return -EFAULT;
>> +
>> + if (info.argsz < minsz)
>> + return -EINVAL;
>> +
>> + info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
>> + info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
>> + info.dma64_window_start = 0;
>> + info.dma64_window_size = 0;
>> + info.flags = 0;
>> +
>> + if (copy_to_user((void __user *)arg, &info, minsz))
>> + return -EFAULT;
>> +
>> + return 0;
>> + }
>> + case VFIO_IOMMU_MAP_DMA: {
>> + vfio_iommu_spapr_tce_dma_map param;
>> + struct iommu_table *tbl = container->tbl;
>> + enum dma_data_direction direction;
>> + unsigned long locked, lock_limit;
>> +
>> + if (WARN_ON(!tbl))
>> + return -ENXIO;
>> +
>> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
>> +
>> + if (copy_from_user(&param, (void __user *)arg, minsz))
>> + return -EFAULT;
>> +
>> + if (param.argsz < minsz)
>> + return -EINVAL;
>> +
>> + if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
>> + (param.flags & VFIO_DMA_MAP_FLAG_WRITE))
>> + direction = DMA_BIDIRECTIONAL;
>> + else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
>> + direction = DMA_TO_DEVICE;
>> + else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
>> + direction = DMA_FROM_DEVICE;
>> + else
>> + return -EINVAL;
>> +
>> + if ((param.size & ~IOMMU_PAGE_MASK) ||
>> + (param.iova & ~IOMMU_PAGE_MASK) ||
>> + (param.vaddr & ~IOMMU_PAGE_MASK))
>> + return -EINVAL;
>> +
>> + /* Account for locked pages */
>> + locked = current->mm->locked_vm +
>> + (param.size >> IOMMU_PAGE_SHIFT);
>> + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>
> This page accounting doesn't look right. PAGE_SIZE is several orders
> bigger than IOMMU_PAGE_SIZE (right?), but we mix them here, which seems
> like it will over penalize the user. For example, if a user maps 4x4k
> (assume aligned and contiguous) IOMMU pages, isn't that only a single
> pinned system page (assuming >=16k pages).

Oops. My bad. IOMMU_PAGE_SHIFT should be PAGE_SHIFT and should return the
number of system pages.

But we do not track 4K pages so I do not see any easy solution here. Except
fixing iommu_put_tces/iommu_clear_tces (*) to return the number of the very
first 4K IOMMU pages within system 64K pages.
This won't be too accurate but should work, no?

I'll post it as a patch in reply to "vfio powerpc: enabled on powernv
platform".



>> + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
>> + pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
>> + rlimit(RLIMIT_MEMLOCK));
>> + return -ENOMEM;
>> + }
>> +
>> + ret = iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>> + param.vaddr, direction,
>> + param.size >> IOMMU_PAGE_SHIFT);
>> + if (ret > 0)
>> + lock_acct(ret);
>> +
>> + return ret;
>> + }
>> + case VFIO_IOMMU_UNMAP_DMA: {
>> + vfio_iommu_spapr_tce_dma_unmap param;
>> + struct iommu_table *tbl = container->tbl;
>> +
>> + if (WARN_ON(!tbl))
>> + return -ENXIO;
>> +
>> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
>> +
>> + if (copy_from_user(&param, (void __user *)arg, minsz))
>> + return -EFAULT;
>> +
>> + if (param.argsz < minsz)
>> + return -EINVAL;
>> +
>> + if ((param.size & ~IOMMU_PAGE_MASK) ||
>> + (param.iova & ~IOMMU_PAGE_MASK))
>> + return -EINVAL;
>> +
>> + ret = iommu_clear_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>> + param.size >> IOMMU_PAGE_SHIFT);
>> + if (ret > 0)
>> + lock_acct(-ret);
>> +
>> + return ret;
>> + }
>> + default:
>> + pr_warn("tce_vfio: unexpected cmd %x\n", cmd);
>> + }
>> +
>> + return -ENOTTY;
>> +}
>> +
>> +static int tce_iommu_attach_group(void *iommu_data,
>> + struct iommu_group *iommu_group)
>> +{
>> + struct tce_container *container = iommu_data;
>> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> + BUG_ON(!tbl);
>> + mutex_lock(&container->lock);
>> + pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
>> + iommu_group_id(iommu_group), iommu_group);
>> + if (container->tbl) {
>> + pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
>> + iommu_group_id(container->tbl->it_group),
>> + iommu_group_id(iommu_group));
>> + mutex_unlock(&container->lock);
>> + return -EBUSY;
>> + }
>> +
>> + container->tbl = tbl;
>> + iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
>> + mutex_unlock(&container->lock);
>> +
>> + return 0;
>> +}
>> +
>> +static void tce_iommu_detach_group(void *iommu_data,
>> + struct iommu_group *iommu_group)
>> +{
>> + struct tce_container *container = iommu_data;
>> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> + BUG_ON(!tbl);
>> + mutex_lock(&container->lock);
>> + if (tbl != container->tbl) {
>> + pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
>> + iommu_group_id(iommu_group),
>> + iommu_group_id(tbl->it_group));
>> + } else {
>> +
>> + pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
>> + iommu_group_id(iommu_group), iommu_group);
>> +
>> + iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
>> + container->tbl = NULL;
>> + }
>> + mutex_unlock(&container->lock);
>> +}
>> +
>> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
>> + .name = "iommu-vfio-powerpc",
>> + .owner = THIS_MODULE,
>> + .open = tce_iommu_open,
>> + .release = tce_iommu_release,
>> + .ioctl = tce_iommu_ioctl,
>> + .attach_group = tce_iommu_attach_group,
>> + .detach_group = tce_iommu_detach_group,
>> +};
>> +
>> +static int __init tce_iommu_init(void)
>> +{
>> + return vfio_register_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +static void __exit tce_iommu_cleanup(void)
>> +{
>> + vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +module_init(tce_iommu_init);
>> +module_exit(tce_iommu_cleanup);
>> +
>> +MODULE_VERSION(DRIVER_VERSION);
>> +MODULE_LICENSE("GPL v2");
>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>> +MODULE_DESCRIPTION(DRIVER_DESC);
>> +
>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>> index 0a4f180..820af1e 100644
>> --- a/include/linux/vfio.h
>> +++ b/include/linux/vfio.h
>> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>> /* Extensions */
>>
>> #define VFIO_TYPE1_IOMMU 1
>> +#define VFIO_SPAPR_TCE_IOMMU 2
>>
>> /*
>> * The IOCTL interface is designed for extensibility by embedding the
>> @@ -442,4 +443,36 @@ struct vfio_iommu_type1_dma_unmap {
>>
>> #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>>
>> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>> +
>> +/*
>> + * The SPAPR TCE info struct provides the information about the PCI bus
>> + * address ranges available for DMA, these values are programmed into
>> + * the hardware so the guest has to know that information.
>> + *
>> + * Pages within 32 bit window should be explicitely mapped/unmapped via ioctls.
> ^^^^^^^^^^^
> explicitly
>
>> + * 64 bit window (not supported at the moment for the guest) is supposed to
>> + * be mapped completely to the guest memory so the devices capable of 64bit
>> + * DMA will not have to use map/unmap ioctls.
>> + *
>> + * The IOMMU page size is always 4K.
>> + */
>
> Thanks,
>
> Alex
>
>> +
>> +struct vfio_iommu_spapr_tce_info {
>> + __u32 argsz;
>> + __u32 flags; /* reserved for future use */
>> + __u32 dma32_window_start; /* 32 bit window start (bytes) */
>> + __u32 dma32_window_size; /* 32 bit window size (bytes) */
>> + __u64 dma64_window_start; /* 64 bit window start (bytes) */
>> + __u64 dma64_window_size; /* 64 bit window size (bytes) */
>> +};
>> +
>> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>> +
>> +/* Reuse type1 map/unmap structs as they are the same at the moment */
>> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
>> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
>> +
>> +/* ***************************************************************** */
>> +
>> #endif /* VFIO_H */
>
>
>


--
Alexey

2012-11-29 03:54:11

by Alexey Kardashevskiy

[permalink] [raw]
Subject: [PATCH] vfio powerpc: enabled on powernv platform

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <[email protected]>
Signed-off-by: Alexey Kardashevskiy <[email protected]>
---
arch/powerpc/include/asm/iommu.h | 9 ++
arch/powerpc/kernel/iommu.c | 159 ++++++++++++++++++++++++++++++++++
arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++++++++++++
drivers/iommu/Kconfig | 8 ++
4 files changed, 311 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5c7087a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
struct iommu_pool large_pool;
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map; /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+ struct iommu_group *it_group;
+#endif
};

struct scatterlist;
@@ -147,5 +150,11 @@ static inline void iommu_restore(void)
}
#endif

+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+ unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+ uint64_t tce, enum dma_data_direction direction,
+ unsigned long pages);
+
#endif /* __KERNEL__ */
#endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..1225fbb 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
#include <asm/kdump.h>
#include <asm/fadump.h>
#include <asm/vio.h>
+#include <asm/tce.h>

#define DBG(...)

@@ -856,3 +857,161 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
free_pages((unsigned long)vaddr, get_order(size));
}
}
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static void tce_flush(struct iommu_table *tbl)
+{
+ /* Flush/invalidate TLB caches if necessary */
+ if (ppc_md.tce_flush)
+ ppc_md.tce_flush(tbl);
+
+ /* Make sure updates are seen by hardware */
+ mb();
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number of pages
+ * which it called put_page() on.
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+ unsigned long pages)
+{
+ int i, retpages = 0;
+ unsigned long oldtce;
+ struct page *page;
+
+ for (i = 0; i < pages; ++i) {
+ oldtce = ppc_md.tce_get(tbl, entry + i);
+ ppc_md.tce_free(tbl, entry + i, 1);
+
+ if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+ continue;
+
+ page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+ WARN_ON(!page);
+ if (!page)
+ continue;
+
+ if (oldtce & TCE_PCI_WRITE)
+ SetPageDirty(page);
+
+ if (!(oldtce & ~PAGE_MASK))
+ ++retpages;
+
+ put_page(page);
+ }
+
+ return retpages;
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number of released pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+ unsigned long pages)
+{
+ int ret;
+ struct iommu_pool *pool = get_pool(tbl, entry);
+
+ spin_lock(&(pool->lock));
+ ret = clear_tces_nolock(tbl, entry, pages);
+ tce_flush(tbl);
+ spin_unlock(&(pool->lock));
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+ uint64_t tce, enum dma_data_direction direction)
+{
+ int ret;
+ struct page *page = NULL;
+ unsigned long kva, offset;
+
+ /* Map new TCE */
+ offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+
+ ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+ direction != DMA_TO_DEVICE, &page);
+ if (ret < 1) {
+ printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+ tce, entry << IOMMU_PAGE_SHIFT, ret);
+ if (!ret)
+ ret = -EFAULT;
+ return ret;
+ }
+
+ kva = (unsigned long) page_address(page);
+ kva += offset;
+
+ /* tce_build receives a virtual address */
+ entry += tbl->it_offset; /* Offset into real TCE table */
+ ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+ /* tce_build() only returns non-zero for transient errors */
+ if (unlikely(ret)) {
+ printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+ tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+ put_page(page);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * iommu_put_tces builds tces and returned the number of actually
+ * locked system pages
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+ uint64_t tce, enum dma_data_direction direction,
+ unsigned long pages)
+{
+ int i, ret = 0, retpages = 0;
+ struct iommu_pool *pool = get_pool(tbl, entry);
+
+ BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+ BUG_ON(direction == DMA_NONE);
+
+ spin_lock(&(pool->lock));
+
+ /* Check if any is in use */
+ for (i = 0; i < pages; ++i) {
+ unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
+ if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) {
+ spin_unlock(&(pool->lock));
+ return -EBUSY;
+ }
+ }
+
+ /* Put tces to the table */
+ for (i = 0; (i < pages) && !ret; ++i, tce += IOMMU_PAGE_SIZE) {
+ ret = put_tce(tbl, entry + i, tce, direction);
+ /*
+ * As IOMMU page size is always 4K, the system page size
+ * can be 64K and there is no special tracking for IOMMU pages,
+ * we only do rlimit check/update for the very first
+ * 4K IOMMUpage within 64K system page.
+ */
+ if (!(tce & ~PAGE_MASK))
+ ++retpages;
+ }
+
+ /* If failed, release locked pages, otherwise return the number of pages */
+ if (ret)
+ clear_tces_nolock(tbl, entry, i);
+ else
+ ret = retpages;
+
+ tce_flush(tbl);
+ spin_unlock(&(pool->lock));
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..21250ef 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
#include <linux/irq.h>
#include <linux/io.h>
#include <linux/msi.h>
+#include <linux/iommu.h>

#include <asm/sections.h>
#include <asm/io.h>
@@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
#endif
}
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+ struct iommu_table *tbl;
+ int ret = 0;
+
+ if (WARN_ON(dev->iommu_group)) {
+ printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+ dev_name(dev),
+ iommu_group_id(dev->iommu_group));
+ return -EBUSY;
+ }
+
+ tbl = get_iommu_table_base(dev);
+ if (!tbl) {
+ pr_debug("tce_vfio: skipping device %s with no tbl\n",
+ dev_name(dev));
+ return 0;
+ }
+
+ pr_debug("tce_vfio: adding %s to iommu group %d\n",
+ dev_name(dev), iommu_group_id(tbl->it_group));
+
+ ret = iommu_group_add_device(tbl->it_group, dev);
+ if (ret < 0)
+ printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+ dev_name(dev), ret);
+
+ return ret;
+}
+
+static void del_device(struct device *dev)
+{
+ iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+
+ switch (action) {
+ case BUS_NOTIFY_ADD_DEVICE:
+ return add_device(dev);
+ case BUS_NOTIFY_DEL_DEVICE:
+ del_device(dev);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+ .notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+ struct iommu_table *tbl = iommu_data;
+ tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iommu_table *tbl;
+ struct iommu_group *grp;
+
+ /* Allocate and initialize IOMMU groups */
+ for_each_pci_dev(pdev) {
+ tbl = get_iommu_table_base(&pdev->dev);
+ if (!tbl)
+ continue;
+
+ /* Skip already initialized */
+ if (tbl->it_group)
+ continue;
+
+ grp = iommu_group_alloc();
+ if (IS_ERR(grp)) {
+ printk(KERN_INFO "tce_vfio: cannot create "
+ "new IOMMU group, ret=%ld\n",
+ PTR_ERR(grp));
+ return PTR_ERR(grp);
+ }
+ tbl->it_group = grp;
+ iommu_group_set_iommudata(grp, tbl, group_release);
+ }
+
+ bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+ /* Add PCI devices to VFIO groups */
+ for_each_pci_dev(pdev)
+ add_device(&pdev->dev);
+
+ return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iommu_table *tbl;
+ struct iommu_group *grp = NULL;
+
+ bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+ /* Delete PCI devices from VFIO groups */
+ for_each_pci_dev(pdev)
+ del_device(&pdev->dev);
+
+ /* Release VFIO groups */
+ for_each_pci_dev(pdev) {
+ tbl = get_iommu_table_base(&pdev->dev);
+ if (!tbl)
+ continue;
+ grp = tbl->it_group;
+
+ /* Skip (already) uninitialized */
+ if (!grp)
+ continue;
+
+ /* Do actual release, group_release() is expected to work */
+ iommu_group_put(grp);
+ BUG_ON(tbl->it_group);
+ }
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG

Say N unless you need kernel log message for IOMMU debugging

+config SPAPR_TCE_IOMMU
+ bool "sPAPR TCE IOMMU Support"
+ depends on PPC_POWERNV
+ select IOMMU_API
+ help
+ Enables bits of IOMMU API required by VFIO. The iommu_ops is
+ still not implemented.
+
endif # IOMMU_SUPPORT
--
1.7.10.4

2012-11-29 04:27:38

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: enabled on powernv platform

On Thu, 2012-11-29 at 14:53 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
>
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
>
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
>
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
>
> Cc: David Gibson <[email protected]>
> Signed-off-by: Alexey Kardashevskiy <[email protected]>
> ---
> arch/powerpc/include/asm/iommu.h | 9 ++
> arch/powerpc/kernel/iommu.c | 159 ++++++++++++++++++++++++++++++++++
> arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++++++++++++
> drivers/iommu/Kconfig | 8 ++
> 4 files changed, 311 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..5c7087a 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
> struct iommu_pool large_pool;
> struct iommu_pool pools[IOMMU_NR_POOLS];
> unsigned long *it_map; /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> + struct iommu_group *it_group;
> +#endif
> };
>
> struct scatterlist;
> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
> }
> #endif
>
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> + unsigned long pages);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> + uint64_t tce, enum dma_data_direction direction,
> + unsigned long pages);
> +
> #endif /* __KERNEL__ */
> #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..1225fbb 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
> #include <asm/kdump.h>
> #include <asm/fadump.h>
> #include <asm/vio.h>
> +#include <asm/tce.h>
>
> #define DBG(...)
>
> @@ -856,3 +857,161 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> free_pages((unsigned long)vaddr, get_order(size));
> }
> }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +static void tce_flush(struct iommu_table *tbl)
> +{
> + /* Flush/invalidate TLB caches if necessary */
> + if (ppc_md.tce_flush)
> + ppc_md.tce_flush(tbl);
> +
> + /* Make sure updates are seen by hardware */
> + mb();
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number of pages
> + * which it called put_page() on.
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> + unsigned long pages)
> +{
> + int i, retpages = 0;
> + unsigned long oldtce;
> + struct page *page;
> +
> + for (i = 0; i < pages; ++i) {
> + oldtce = ppc_md.tce_get(tbl, entry + i);
> + ppc_md.tce_free(tbl, entry + i, 1);
> +
> + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> + continue;
> +
> + page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> + WARN_ON(!page);
> + if (!page)
> + continue;
> +
> + if (oldtce & TCE_PCI_WRITE)
> + SetPageDirty(page);
> +
> + if (!(oldtce & ~PAGE_MASK))
> + ++retpages;

I'm confused, it looks like you're trying to only increment the counter
for tce pages aligned at the start of a page, but don't we need to mask
out the read/write and valid bits? Trickiness like this demands a
comment.

> +
> + put_page(page);
> + }
> +
> + return retpages;
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number of released pages
> + */
> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> + unsigned long pages)
> +{
> + int ret;
> + struct iommu_pool *pool = get_pool(tbl, entry);
> +
> + spin_lock(&(pool->lock));
> + ret = clear_tces_nolock(tbl, entry, pages);
> + tce_flush(tbl);
> + spin_unlock(&(pool->lock));
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> + uint64_t tce, enum dma_data_direction direction)
> +{
> + int ret;
> + struct page *page = NULL;
> + unsigned long kva, offset;
> +
> + /* Map new TCE */
> + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> +
> + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> + direction != DMA_TO_DEVICE, &page);
> + if (ret < 1) {
> + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> + tce, entry << IOMMU_PAGE_SHIFT, ret);
> + if (!ret)
> + ret = -EFAULT;
> + return ret;
> + }
> +
> + kva = (unsigned long) page_address(page);
> + kva += offset;
> +
> + /* tce_build receives a virtual address */
> + entry += tbl->it_offset; /* Offset into real TCE table */
> + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> + /* tce_build() only returns non-zero for transient errors */
> + if (unlikely(ret)) {
> + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> + tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> + put_page(page);
> + return -EIO;
> + }
> +
> + return 0;
> +}
> +
> +/*
> + * iommu_put_tces builds tces and returned the number of actually
> + * locked system pages
> + */
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> + uint64_t tce, enum dma_data_direction direction,
> + unsigned long pages)
> +{
> + int i, ret = 0, retpages = 0;
> + struct iommu_pool *pool = get_pool(tbl, entry);
> +
> + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> + BUG_ON(direction == DMA_NONE);
> +
> + spin_lock(&(pool->lock));
> +
> + /* Check if any is in use */
> + for (i = 0; i < pages; ++i) {
> + unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
> + if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) {
> + spin_unlock(&(pool->lock));
> + return -EBUSY;
> + }
> + }
> +
> + /* Put tces to the table */
> + for (i = 0; (i < pages) && !ret; ++i, tce += IOMMU_PAGE_SIZE) {
> + ret = put_tce(tbl, entry + i, tce, direction);
> + /*
> + * As IOMMU page size is always 4K, the system page size
> + * can be 64K and there is no special tracking for IOMMU pages,
> + * we only do rlimit check/update for the very first
> + * 4K IOMMUpage within 64K system page.
> + */
> + if (!(tce & ~PAGE_MASK))
> + ++retpages;

Ah, here's the comment I was looking for, though I'm still not sure
about the read/write bits.

Isn't there an exploit here that a user can lock pages beyond their
limits if they just skip mapping the first 4k of each page? Thanks,

Alex

> + }
> +
> + /* If failed, release locked pages, otherwise return the number of pages */
> + if (ret)
> + clear_tces_nolock(tbl, entry, i);
> + else
> + ret = retpages;
> +
> + tce_flush(tbl);
> + spin_unlock(&(pool->lock));
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..21250ef 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
> #include <linux/irq.h>
> #include <linux/io.h>
> #include <linux/msi.h>
> +#include <linux/iommu.h>
>
> #include <asm/sections.h>
> #include <asm/io.h>
> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
> ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
> #endif
> }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> + struct iommu_table *tbl;
> + int ret = 0;
> +
> + if (WARN_ON(dev->iommu_group)) {
> + printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> + dev_name(dev),
> + iommu_group_id(dev->iommu_group));
> + return -EBUSY;
> + }
> +
> + tbl = get_iommu_table_base(dev);
> + if (!tbl) {
> + pr_debug("tce_vfio: skipping device %s with no tbl\n",
> + dev_name(dev));
> + return 0;
> + }
> +
> + pr_debug("tce_vfio: adding %s to iommu group %d\n",
> + dev_name(dev), iommu_group_id(tbl->it_group));
> +
> + ret = iommu_group_add_device(tbl->it_group, dev);
> + if (ret < 0)
> + printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> + dev_name(dev), ret);
> +
> + return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> + iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> + unsigned long action, void *data)
> +{
> + struct device *dev = data;
> +
> + switch (action) {
> + case BUS_NOTIFY_ADD_DEVICE:
> + return add_device(dev);
> + case BUS_NOTIFY_DEL_DEVICE:
> + del_device(dev);
> + return 0;
> + default:
> + return 0;
> + }
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> + .notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> + struct iommu_table *tbl = iommu_data;
> + tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> + struct pci_dev *pdev = NULL;
> + struct iommu_table *tbl;
> + struct iommu_group *grp;
> +
> + /* Allocate and initialize IOMMU groups */
> + for_each_pci_dev(pdev) {
> + tbl = get_iommu_table_base(&pdev->dev);
> + if (!tbl)
> + continue;
> +
> + /* Skip already initialized */
> + if (tbl->it_group)
> + continue;
> +
> + grp = iommu_group_alloc();
> + if (IS_ERR(grp)) {
> + printk(KERN_INFO "tce_vfio: cannot create "
> + "new IOMMU group, ret=%ld\n",
> + PTR_ERR(grp));
> + return PTR_ERR(grp);
> + }
> + tbl->it_group = grp;
> + iommu_group_set_iommudata(grp, tbl, group_release);
> + }
> +
> + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> + /* Add PCI devices to VFIO groups */
> + for_each_pci_dev(pdev)
> + add_device(&pdev->dev);
> +
> + return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> + struct pci_dev *pdev = NULL;
> + struct iommu_table *tbl;
> + struct iommu_group *grp = NULL;
> +
> + bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> + /* Delete PCI devices from VFIO groups */
> + for_each_pci_dev(pdev)
> + del_device(&pdev->dev);
> +
> + /* Release VFIO groups */
> + for_each_pci_dev(pdev) {
> + tbl = get_iommu_table_base(&pdev->dev);
> + if (!tbl)
> + continue;
> + grp = tbl->it_group;
> +
> + /* Skip (already) uninitialized */
> + if (!grp)
> + continue;
> +
> + /* Do actual release, group_release() is expected to work */
> + iommu_group_put(grp);
> + BUG_ON(tbl->it_group);
> + }
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>
> Say N unless you need kernel log message for IOMMU debugging
>
> +config SPAPR_TCE_IOMMU
> + bool "sPAPR TCE IOMMU Support"
> + depends on PPC_POWERNV
> + select IOMMU_API
> + help
> + Enables bits of IOMMU API required by VFIO. The iommu_ops is
> + still not implemented.
> +
> endif # IOMMU_SUPPORT


2012-11-30 06:14:22

by Alexey Kardashevskiy

[permalink] [raw]
Subject: [PATCH] vfio powerpc: enabled on powernv platform

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <[email protected]>
Signed-off-by: Alexey Kardashevskiy <[email protected]>
---
arch/powerpc/include/asm/iommu.h | 9 ++
arch/powerpc/kernel/iommu.c | 186 ++++++++++++++++++++++++++++++++++
arch/powerpc/platforms/powernv/pci.c | 135 ++++++++++++++++++++++++
drivers/iommu/Kconfig | 8 ++
4 files changed, 338 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5c7087a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
struct iommu_pool large_pool;
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map; /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+ struct iommu_group *it_group;
+#endif
};

struct scatterlist;
@@ -147,5 +150,11 @@ static inline void iommu_restore(void)
}
#endif

+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+ unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+ uint64_t tce, enum dma_data_direction direction,
+ unsigned long pages);
+
#endif /* __KERNEL__ */
#endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..0646c50 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
#include <asm/kdump.h>
#include <asm/fadump.h>
#include <asm/vio.h>
+#include <asm/tce.h>

#define DBG(...)

@@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
free_pages((unsigned long)vaddr, get_order(size));
}
}
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+
+/*
+ * Returns the number of used IOMMU pages (4K) within
+ * the same system page (4K or 64K).
+ * bitmap_weight is not used as it does not support bigendian maps.
+ */
+static int syspage_weight(unsigned long *map, unsigned long entry)
+{
+ int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
+
+ /* Aligns TCE entry number to system page boundary */
+ entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
+
+ /* Count used 4K pages */
+ while (nbits--)
+ ret += (test_bit(entry++, map) == 0) ? 0 : 1;
+
+ return ret;
+}
+
+static void tce_flush(struct iommu_table *tbl)
+{
+ /* Flush/invalidate TLB caches if necessary */
+ if (ppc_md.tce_flush)
+ ppc_md.tce_flush(tbl);
+
+ /* Make sure updates are seen by hardware */
+ mb();
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number of system pages
+ * which it called put_page() on
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+ unsigned long pages)
+{
+ int i, retpages = 0;
+ unsigned long oldtce, oldweight;
+ struct page *page;
+
+ for (i = 0; i < pages; ++i) {
+ oldtce = ppc_md.tce_get(tbl, entry + i);
+ ppc_md.tce_free(tbl, entry + i, 1);
+
+ oldweight = syspage_weight(tbl->it_map, entry);
+ __clear_bit(entry, tbl->it_map);
+
+ if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+ continue;
+
+ page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+ WARN_ON(!page);
+ if (!page)
+ continue;
+
+ if (oldtce & TCE_PCI_WRITE)
+ SetPageDirty(page);
+
+ put_page(page);
+
+ /* That was the last IOMMU page within the system page */
+ if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
+ ++retpages;
+ }
+
+ return retpages;
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number
+ / of released system pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+ unsigned long pages)
+{
+ int ret;
+ struct iommu_pool *pool = get_pool(tbl, entry);
+
+ spin_lock(&(pool->lock));
+ ret = clear_tces_nolock(tbl, entry, pages);
+ tce_flush(tbl);
+ spin_unlock(&(pool->lock));
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+ uint64_t tce, enum dma_data_direction direction)
+{
+ int ret;
+ struct page *page = NULL;
+ unsigned long kva, offset, oldweight;
+
+ /* Map new TCE */
+ offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+ ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+ direction != DMA_TO_DEVICE, &page);
+ if (ret < 1) {
+ printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+ tce, entry << IOMMU_PAGE_SHIFT, ret);
+ if (!ret || (ret > 1))
+ ret = -EFAULT;
+ return ret;
+ }
+
+ kva = (unsigned long) page_address(page);
+ kva += offset;
+
+ /* tce_build receives a virtual address */
+ entry += tbl->it_offset; /* Offset into real TCE table */
+ ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+ /* tce_build() only returns non-zero for transient errors */
+ if (unlikely(ret)) {
+ printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+ tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+ put_page(page);
+ return -EIO;
+ }
+
+ /* Calculate if new system page has been locked */
+ oldweight = syspage_weight(tbl->it_map, entry);
+ __set_bit(entry, tbl->it_map);
+
+ return (oldweight == 0) ? 1 : 0;
+}
+
+/*
+ * iommu_put_tces builds tces and returned the number of actually
+ * locked system pages
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+ uint64_t tce, enum dma_data_direction direction,
+ unsigned long pages)
+{
+ int i, ret = 0, retpages = 0;
+ struct iommu_pool *pool = get_pool(tbl, entry);
+
+ BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+ BUG_ON(direction == DMA_NONE);
+
+ spin_lock(&(pool->lock));
+
+ /* Check if any is in use */
+ for (i = 0; i < pages; ++i) {
+ unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
+ if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
+ test_bit(entry + i, tbl->it_map)) {
+ WARN_ON(test_bit(entry + i, tbl->it_map));
+ spin_unlock(&(pool->lock));
+ return -EBUSY;
+ }
+ }
+
+ /* Put tces to the table */
+ for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
+ ret = put_tce(tbl, entry + i, tce, direction);
+ if (ret == 1)
+ ++retpages;
+ }
+
+ /*
+ * If failed, release locked pages, otherwise return the number
+ * of locked system pages
+ */
+ if (ret < 0)
+ clear_tces_nolock(tbl, entry, i);
+ else
+ ret = retpages;
+
+ tce_flush(tbl);
+ spin_unlock(&(pool->lock));
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..21250ef 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
#include <linux/irq.h>
#include <linux/io.h>
#include <linux/msi.h>
+#include <linux/iommu.h>

#include <asm/sections.h>
#include <asm/io.h>
@@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
#endif
}
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+ struct iommu_table *tbl;
+ int ret = 0;
+
+ if (WARN_ON(dev->iommu_group)) {
+ printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+ dev_name(dev),
+ iommu_group_id(dev->iommu_group));
+ return -EBUSY;
+ }
+
+ tbl = get_iommu_table_base(dev);
+ if (!tbl) {
+ pr_debug("tce_vfio: skipping device %s with no tbl\n",
+ dev_name(dev));
+ return 0;
+ }
+
+ pr_debug("tce_vfio: adding %s to iommu group %d\n",
+ dev_name(dev), iommu_group_id(tbl->it_group));
+
+ ret = iommu_group_add_device(tbl->it_group, dev);
+ if (ret < 0)
+ printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+ dev_name(dev), ret);
+
+ return ret;
+}
+
+static void del_device(struct device *dev)
+{
+ iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+
+ switch (action) {
+ case BUS_NOTIFY_ADD_DEVICE:
+ return add_device(dev);
+ case BUS_NOTIFY_DEL_DEVICE:
+ del_device(dev);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+ .notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+ struct iommu_table *tbl = iommu_data;
+ tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iommu_table *tbl;
+ struct iommu_group *grp;
+
+ /* Allocate and initialize IOMMU groups */
+ for_each_pci_dev(pdev) {
+ tbl = get_iommu_table_base(&pdev->dev);
+ if (!tbl)
+ continue;
+
+ /* Skip already initialized */
+ if (tbl->it_group)
+ continue;
+
+ grp = iommu_group_alloc();
+ if (IS_ERR(grp)) {
+ printk(KERN_INFO "tce_vfio: cannot create "
+ "new IOMMU group, ret=%ld\n",
+ PTR_ERR(grp));
+ return PTR_ERR(grp);
+ }
+ tbl->it_group = grp;
+ iommu_group_set_iommudata(grp, tbl, group_release);
+ }
+
+ bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+ /* Add PCI devices to VFIO groups */
+ for_each_pci_dev(pdev)
+ add_device(&pdev->dev);
+
+ return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iommu_table *tbl;
+ struct iommu_group *grp = NULL;
+
+ bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+ /* Delete PCI devices from VFIO groups */
+ for_each_pci_dev(pdev)
+ del_device(&pdev->dev);
+
+ /* Release VFIO groups */
+ for_each_pci_dev(pdev) {
+ tbl = get_iommu_table_base(&pdev->dev);
+ if (!tbl)
+ continue;
+ grp = tbl->it_group;
+
+ /* Skip (already) uninitialized */
+ if (!grp)
+ continue;
+
+ /* Do actual release, group_release() is expected to work */
+ iommu_group_put(grp);
+ BUG_ON(tbl->it_group);
+ }
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG

Say N unless you need kernel log message for IOMMU debugging

+config SPAPR_TCE_IOMMU
+ bool "sPAPR TCE IOMMU Support"
+ depends on PPC_POWERNV
+ select IOMMU_API
+ help
+ Enables bits of IOMMU API required by VFIO. The iommu_ops is
+ still not implemented.
+
endif # IOMMU_SUPPORT
--
1.7.10.4

2012-11-30 06:16:52

by Alexey Kardashevskiy

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: enabled on powernv platform

On 29/11/12 15:20, Alex Williamson wrote:

>> + /* Put tces to the table */
>> + for (i = 0; (i < pages) && !ret; ++i, tce += IOMMU_PAGE_SIZE) {
>> + ret = put_tce(tbl, entry + i, tce, direction);
>> + /*
>> + * As IOMMU page size is always 4K, the system page size
>> + * can be 64K and there is no special tracking for IOMMU pages,
>> + * we only do rlimit check/update for the very first
>> + * 4K IOMMUpage within 64K system page.
>> + */
>> + if (!(tce & ~PAGE_MASK))
>> + ++retpages;
>
> Ah, here's the comment I was looking for, though I'm still not sure
> about the read/write bits.
>
> Isn't there an exploit here that a user can lock pages beyond their
> limits if they just skip mapping the first 4k of each page? Thanks,


Heh. True. Posted another patch with 4K pages per system page usage tracking.



--
Alexey

2012-11-30 16:48:23

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: enabled on powernv platform

On Fri, 2012-11-30 at 17:14 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
>
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
>
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
>
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
>
> Cc: David Gibson <[email protected]>
> Signed-off-by: Alexey Kardashevskiy <[email protected]>
> ---
> arch/powerpc/include/asm/iommu.h | 9 ++
> arch/powerpc/kernel/iommu.c | 186 ++++++++++++++++++++++++++++++++++
> arch/powerpc/platforms/powernv/pci.c | 135 ++++++++++++++++++++++++
> drivers/iommu/Kconfig | 8 ++
> 4 files changed, 338 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..5c7087a 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
> struct iommu_pool large_pool;
> struct iommu_pool pools[IOMMU_NR_POOLS];
> unsigned long *it_map; /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> + struct iommu_group *it_group;
> +#endif
> };
>
> struct scatterlist;
> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
> }
> #endif
>
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> + unsigned long pages);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> + uint64_t tce, enum dma_data_direction direction,
> + unsigned long pages);
> +
> #endif /* __KERNEL__ */
> #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..0646c50 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
> #include <asm/kdump.h>
> #include <asm/fadump.h>
> #include <asm/vio.h>
> +#include <asm/tce.h>
>
> #define DBG(...)
>
> @@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> free_pages((unsigned long)vaddr, get_order(size));
> }
> }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +
> +/*
> + * Returns the number of used IOMMU pages (4K) within
> + * the same system page (4K or 64K).
> + * bitmap_weight is not used as it does not support bigendian maps.
> + */
> +static int syspage_weight(unsigned long *map, unsigned long entry)
> +{
> + int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
> +
> + /* Aligns TCE entry number to system page boundary */
> + entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> +
> + /* Count used 4K pages */
> + while (nbits--)
> + ret += (test_bit(entry++, map) == 0) ? 0 : 1;

Ok, entry is the iova page number. So presumably it's relative to the
start of dma32_window_start since you're unlikely to have a bitmap that
covers all of memory. I hadn't realized that previously. Doesn't that
mean that it's actually impossible to create an ioctl based interface to
the dma64_window since we're not going to know which window is the
target? I know you're not planning on one, but it seems limiting. We
at least need some documentation here, but I'm wondering if iova
shouldn't be zero based so we can determine which window it hits. Also,
now that I look at it, I can't find any range checking on the iova.
Thanks,

Alex

> +
> + return ret;
> +}
> +
> +static void tce_flush(struct iommu_table *tbl)
> +{
> + /* Flush/invalidate TLB caches if necessary */
> + if (ppc_md.tce_flush)
> + ppc_md.tce_flush(tbl);
> +
> + /* Make sure updates are seen by hardware */
> + mb();
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number of system pages
> + * which it called put_page() on
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> + unsigned long pages)
> +{
> + int i, retpages = 0;
> + unsigned long oldtce, oldweight;
> + struct page *page;
> +
> + for (i = 0; i < pages; ++i) {
> + oldtce = ppc_md.tce_get(tbl, entry + i);
> + ppc_md.tce_free(tbl, entry + i, 1);
> +
> + oldweight = syspage_weight(tbl->it_map, entry);
> + __clear_bit(entry, tbl->it_map);
> +
> + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> + continue;
> +
> + page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> + WARN_ON(!page);
> + if (!page)
> + continue;
> +
> + if (oldtce & TCE_PCI_WRITE)
> + SetPageDirty(page);
> +
> + put_page(page);
> +
> + /* That was the last IOMMU page within the system page */
> + if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
> + ++retpages;
> + }
> +
> + return retpages;
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number
> + / of released system pages
> + */
> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> + unsigned long pages)
> +{
> + int ret;
> + struct iommu_pool *pool = get_pool(tbl, entry);
> +
> + spin_lock(&(pool->lock));
> + ret = clear_tces_nolock(tbl, entry, pages);
> + tce_flush(tbl);
> + spin_unlock(&(pool->lock));
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> + uint64_t tce, enum dma_data_direction direction)
> +{
> + int ret;
> + struct page *page = NULL;
> + unsigned long kva, offset, oldweight;
> +
> + /* Map new TCE */
> + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> + direction != DMA_TO_DEVICE, &page);
> + if (ret < 1) {
> + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> + tce, entry << IOMMU_PAGE_SHIFT, ret);
> + if (!ret || (ret > 1))
> + ret = -EFAULT;
> + return ret;
> + }
> +
> + kva = (unsigned long) page_address(page);
> + kva += offset;
> +
> + /* tce_build receives a virtual address */
> + entry += tbl->it_offset; /* Offset into real TCE table */
> + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> + /* tce_build() only returns non-zero for transient errors */
> + if (unlikely(ret)) {
> + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> + tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> + put_page(page);
> + return -EIO;
> + }
> +
> + /* Calculate if new system page has been locked */
> + oldweight = syspage_weight(tbl->it_map, entry);
> + __set_bit(entry, tbl->it_map);
> +
> + return (oldweight == 0) ? 1 : 0;
> +}
> +
> +/*
> + * iommu_put_tces builds tces and returned the number of actually
> + * locked system pages
> + */
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> + uint64_t tce, enum dma_data_direction direction,
> + unsigned long pages)
> +{
> + int i, ret = 0, retpages = 0;
> + struct iommu_pool *pool = get_pool(tbl, entry);
> +
> + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> + BUG_ON(direction == DMA_NONE);
> +
> + spin_lock(&(pool->lock));
> +
> + /* Check if any is in use */
> + for (i = 0; i < pages; ++i) {
> + unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
> + if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
> + test_bit(entry + i, tbl->it_map)) {
> + WARN_ON(test_bit(entry + i, tbl->it_map));
> + spin_unlock(&(pool->lock));
> + return -EBUSY;
> + }
> + }
> +
> + /* Put tces to the table */
> + for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
> + ret = put_tce(tbl, entry + i, tce, direction);
> + if (ret == 1)
> + ++retpages;
> + }
> +
> + /*
> + * If failed, release locked pages, otherwise return the number
> + * of locked system pages
> + */
> + if (ret < 0)
> + clear_tces_nolock(tbl, entry, i);
> + else
> + ret = retpages;
> +
> + tce_flush(tbl);
> + spin_unlock(&(pool->lock));
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..21250ef 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
> #include <linux/irq.h>
> #include <linux/io.h>
> #include <linux/msi.h>
> +#include <linux/iommu.h>
>
> #include <asm/sections.h>
> #include <asm/io.h>
> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
> ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
> #endif
> }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> + struct iommu_table *tbl;
> + int ret = 0;
> +
> + if (WARN_ON(dev->iommu_group)) {
> + printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> + dev_name(dev),
> + iommu_group_id(dev->iommu_group));
> + return -EBUSY;
> + }
> +
> + tbl = get_iommu_table_base(dev);
> + if (!tbl) {
> + pr_debug("tce_vfio: skipping device %s with no tbl\n",
> + dev_name(dev));
> + return 0;
> + }
> +
> + pr_debug("tce_vfio: adding %s to iommu group %d\n",
> + dev_name(dev), iommu_group_id(tbl->it_group));
> +
> + ret = iommu_group_add_device(tbl->it_group, dev);
> + if (ret < 0)
> + printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> + dev_name(dev), ret);
> +
> + return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> + iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> + unsigned long action, void *data)
> +{
> + struct device *dev = data;
> +
> + switch (action) {
> + case BUS_NOTIFY_ADD_DEVICE:
> + return add_device(dev);
> + case BUS_NOTIFY_DEL_DEVICE:
> + del_device(dev);
> + return 0;
> + default:
> + return 0;
> + }
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> + .notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> + struct iommu_table *tbl = iommu_data;
> + tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> + struct pci_dev *pdev = NULL;
> + struct iommu_table *tbl;
> + struct iommu_group *grp;
> +
> + /* Allocate and initialize IOMMU groups */
> + for_each_pci_dev(pdev) {
> + tbl = get_iommu_table_base(&pdev->dev);
> + if (!tbl)
> + continue;
> +
> + /* Skip already initialized */
> + if (tbl->it_group)
> + continue;
> +
> + grp = iommu_group_alloc();
> + if (IS_ERR(grp)) {
> + printk(KERN_INFO "tce_vfio: cannot create "
> + "new IOMMU group, ret=%ld\n",
> + PTR_ERR(grp));
> + return PTR_ERR(grp);
> + }
> + tbl->it_group = grp;
> + iommu_group_set_iommudata(grp, tbl, group_release);
> + }
> +
> + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> + /* Add PCI devices to VFIO groups */
> + for_each_pci_dev(pdev)
> + add_device(&pdev->dev);
> +
> + return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> + struct pci_dev *pdev = NULL;
> + struct iommu_table *tbl;
> + struct iommu_group *grp = NULL;
> +
> + bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> + /* Delete PCI devices from VFIO groups */
> + for_each_pci_dev(pdev)
> + del_device(&pdev->dev);
> +
> + /* Release VFIO groups */
> + for_each_pci_dev(pdev) {
> + tbl = get_iommu_table_base(&pdev->dev);
> + if (!tbl)
> + continue;
> + grp = tbl->it_group;
> +
> + /* Skip (already) uninitialized */
> + if (!grp)
> + continue;
> +
> + /* Do actual release, group_release() is expected to work */
> + iommu_group_put(grp);
> + BUG_ON(tbl->it_group);
> + }
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>
> Say N unless you need kernel log message for IOMMU debugging
>
> +config SPAPR_TCE_IOMMU
> + bool "sPAPR TCE IOMMU Support"
> + depends on PPC_POWERNV
> + select IOMMU_API
> + help
> + Enables bits of IOMMU API required by VFIO. The iommu_ops is
> + still not implemented.
> +
> endif # IOMMU_SUPPORT


2012-12-01 00:14:53

by Alexey Kardashevskiy

[permalink] [raw]
Subject: Re: [PATCH] vfio powerpc: enabled on powernv platform

On 01/12/12 03:48, Alex Williamson wrote:
> On Fri, 2012-11-30 at 17:14 +1100, Alexey Kardashevskiy wrote:
>> This patch initializes IOMMU groups based on the IOMMU
>> configuration discovered during the PCI scan on POWERNV
>> (POWER non virtualized) platform. The IOMMU groups are
>> to be used later by VFIO driver (PCI pass through).
>>
>> It also implements an API for mapping/unmapping pages for
>> guest PCI drivers and providing DMA window properties.
>> This API is going to be used later by QEMU-VFIO to handle
>> h_put_tce hypercalls from the KVM guest.
>>
>> Although this driver has been tested only on the POWERNV
>> platform, it should work on any platform which supports
>> TCE tables.
>>
>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
>> option and configure VFIO as required.
>>
>> Cc: David Gibson <[email protected]>
>> Signed-off-by: Alexey Kardashevskiy <[email protected]>
>> ---
>> arch/powerpc/include/asm/iommu.h | 9 ++
>> arch/powerpc/kernel/iommu.c | 186 ++++++++++++++++++++++++++++++++++
>> arch/powerpc/platforms/powernv/pci.c | 135 ++++++++++++++++++++++++
>> drivers/iommu/Kconfig | 8 ++
>> 4 files changed, 338 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index cbfe678..5c7087a 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -76,6 +76,9 @@ struct iommu_table {
>> struct iommu_pool large_pool;
>> struct iommu_pool pools[IOMMU_NR_POOLS];
>> unsigned long *it_map; /* A simple allocation bitmap for now */
>> +#ifdef CONFIG_IOMMU_API
>> + struct iommu_group *it_group;
>> +#endif
>> };
>>
>> struct scatterlist;
>> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
>> }
>> #endif
>>
>> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
>> + unsigned long pages);
>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
>> + uint64_t tce, enum dma_data_direction direction,
>> + unsigned long pages);
>> +
>> #endif /* __KERNEL__ */
>> #endif /* _ASM_IOMMU_H */
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index ff5a6ce..0646c50 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -44,6 +44,7 @@
>> #include <asm/kdump.h>
>> #include <asm/fadump.h>
>> #include <asm/vio.h>
>> +#include <asm/tce.h>
>>
>> #define DBG(...)
>>
>> @@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>> free_pages((unsigned long)vaddr, get_order(size));
>> }
>> }
>> +
>> +#ifdef CONFIG_IOMMU_API
>> +/*
>> + * SPAPR TCE API
>> + */
>> +
>> +/*
>> + * Returns the number of used IOMMU pages (4K) within
>> + * the same system page (4K or 64K).
>> + * bitmap_weight is not used as it does not support bigendian maps.
>> + */
>> +static int syspage_weight(unsigned long *map, unsigned long entry)
>> +{
>> + int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
>> +
>> + /* Aligns TCE entry number to system page boundary */
>> + entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
>> +
>> + /* Count used 4K pages */
>> + while (nbits--)
>> + ret += (test_bit(entry++, map) == 0) ? 0 : 1;
>
> Ok, entry is the iova page number. So presumably it's relative to the
> start of dma32_window_start since you're unlikely to have a bitmap that
> covers all of memory. I hadn't realized that previously.

No, it is zero based. The DMA window is a filter but not offset. But you
are right, the it_map does not cover the whole global table (one per PHB,
roughly), will fix it, thanks for pointing. On my test system IOMMU group
is a whole PHB and DMA window always starts from 0 so tests do not show
everything :)

> Doesn't that
> mean that it's actually impossible to create an ioctl based interface to
> the dma64_window since we're not going to know which window is the
> target? I know you're not planning on one, but it seems limiting.

No ,it is not limiting as iova is zero based. Even if it was, there are
flags in map/unmap ioctls which we could use, no?

> We
> at least need some documentation here, but I'm wondering if iova
> shouldn't be zero based so we can determine which window it hits. Also,
> now that I look at it, I can't find any range checking on the iova.

True... Have not hit this problem yet :) Good point, will fix, thanks.



--
Alexey