2007-11-02 17:08:27

by FUJITA Tomonori

Subject: [PATCH -mm 0/3] convert IOMMUs to use iova

This patchset converts the PPC64 IOMMU to use the iova code for free
area management.

The IOMMUs ignore low-level drivers' restrictions: the maximum
segment size and the segment boundary.
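
For reference, a low-level driver typically declares these limits on its
request queue. A minimal sketch (the queue pointer and the values are
made up for illustration):

	/* hypothetical LLD setup: declare DMA segment restrictions */
	blk_queue_max_segment_size(sdev->request_queue, 0x10000);	/* 64KB segments */
	blk_queue_segment_boundary(sdev->request_queue, 0xffffffffUL);	/* don't cross 4GB */

The IOMMU's free area management has to honor both limits when it maps
segments.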

I fixed the former:

http://thread.gmane.org/gmane.linux.scsi/35602

The latter makes the free area management complicated. Rather than
fixing every IOMMU's free area management, I'd like to convert the
IOMMUs to use the iova code (which intel-iommu introduced) for free
area management and enable iova to handle segment boundary
restrictions.

I converted the PPC64 IOMMU to see how things work. The patchset has
been lightly tested on a pSeries box and seems to work.
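
To give an idea of the shape of the conversion, the bitmap scan in the
IOMMU's range allocator essentially collapses to the iova calls below
(condensed from patch 3/3; declarations, limits and locking omitted):

	/* allocate: was a find_next_zero_bit() search over tbl->it_map */
	iova = alloc_iova(&tbl->iovad, npages, limit_pfn, 1);
	entry = iova ? iova->pfn_lo : DMA_ERROR_CODE;

	/* free: was __clear_bit() over the allocated range */
	free_iova(&tbl->iovad, free_entry);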

This is against Linus' latest git tree plus David Miller's patch to
genericize iova:

http://www.ussg.iu.edu/hypermail/linux/kernel/0710.2/2910.html

This patchset is also available at:

git://git.kernel.org/pub/scm/linux/kernel/git/tomo/linux-2.6-misc.git iova


2007-11-02 17:07:44

by FUJITA Tomonori

Subject: [PATCH 2/3] move iova cache code to iova.c

This detaches the iova cache handling from intel-iommu.c and moves it
into iova.c so that other IOMMUs can use the iova code.
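
With this change the kmem cache for struct iova is owned by the user of
the iova code. A caller now sets up its domain roughly as below
(mirroring patch 3/3; the PAGE_SHIFT and -ENOMEM here are illustrative,
the real shift depends on the IOMMU's page size):

	static struct kmem_cache *iova_cachep;

	iova_cachep = KMEM_CACHE(iova, 0);
	if (!iova_cachep)
		return -ENOMEM;
	init_iova_domain(&iovad, DMA_32BIT_MASK >> PAGE_SHIFT, iova_cachep);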

Signed-off-by: FUJITA Tomonori <[email protected]>
---
drivers/pci/intel-iommu.c | 14 ++------------
include/linux/iova.h | 6 +++---
lib/iova.c | 34 ++++++++++++++++++++++++++++------
3 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 1a23b4c..0c41d79 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -141,16 +141,6 @@ static inline void free_devinfo_mem(void *vaddr)
kmem_cache_free(iommu_devinfo_cache, vaddr);
}

-struct iova *alloc_iova_mem(void)
-{
- return iommu_kmem_cache_alloc(iommu_iova_cache);
-}
-
-void free_iova_mem(struct iova *iova)
-{
- kmem_cache_free(iommu_iova_cache, iova);
-}
-
static inline void __iommu_flush_cache(
struct intel_iommu *iommu, void *addr, int size)
{
@@ -1087,7 +1077,7 @@ static void dmar_init_reserved_ranges(void)
int i;
u64 addr, size;

- init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
+ init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN, iommu_iova_cache);

/* IOAPIC ranges shouldn't be accessed by DMA */
iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
@@ -1141,7 +1131,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
int adjust_width, agaw;
unsigned long sagaw;

- init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
+ init_iova_domain(&domain->iovad, DMA_32BIT_PFN, iommu_iova_cache);
spin_lock_init(&domain->mapping_lock);

domain_reserve_special_ranges(domain);
diff --git a/include/linux/iova.h b/include/linux/iova.h
index d521b5b..41f189b 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -32,10 +32,9 @@ struct iova_domain {
struct rb_root rbroot; /* iova domain rbtree root */
struct rb_node *cached32_node; /* Save last alloced node */
unsigned long dma_32bit_pfn;
+ struct kmem_cache *iova_cachep;
};

-struct iova *alloc_iova_mem(void);
-void free_iova_mem(struct iova *iova);
void free_iova(struct iova_domain *iovad, unsigned long pfn);
void __free_iova(struct iova_domain *iovad, struct iova *iova);
struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
@@ -44,7 +43,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
unsigned long pfn_hi);
void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
-void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit);
+void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit,
+ struct kmem_cache *iova_cachep);
struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
void put_iova_domain(struct iova_domain *iovad);

diff --git a/lib/iova.c b/lib/iova.c
index 6e14a3f..d2612e5 100644
--- a/lib/iova.c
+++ b/lib/iova.c
@@ -6,16 +6,38 @@
* Copyright (C) 2006 Anil S Keshavamurthy <[email protected]>
*/

+#include <linux/sched.h>
#include <linux/iova.h>

+static struct iova *alloc_iova_mem(struct iova_domain *iovad)
+{
+ unsigned int flags;
+ void *vaddr;
+
+ /* trying to avoid low memory issues */
+ flags = current->flags & PF_MEMALLOC;
+ current->flags |= PF_MEMALLOC;
+ vaddr = kmem_cache_alloc(iovad->iova_cachep, GFP_ATOMIC);
+ current->flags &= (~PF_MEMALLOC | flags);
+
+ return vaddr;
+}
+
+static void free_iova_mem(struct iova_domain *iovad, struct iova *iova)
+{
+ kmem_cache_free(iovad->iova_cachep, iova);
+}
+
void
-init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
+init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit,
+ struct kmem_cache *iova_cachep)
{
spin_lock_init(&iovad->iova_alloc_lock);
spin_lock_init(&iovad->iova_rbtree_lock);
iovad->rbroot = RB_ROOT;
iovad->cached32_node = NULL;
iovad->dma_32bit_pfn = pfn_32bit;
+ iovad->iova_cachep = iova_cachep;
}

static struct rb_node *
@@ -160,7 +182,7 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
struct iova *new_iova;
int ret;

- new_iova = alloc_iova_mem();
+ new_iova = alloc_iova_mem(iovad);
if (!new_iova)
return NULL;

@@ -176,7 +198,7 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,

if (ret) {
spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
- free_iova_mem(new_iova);
+ free_iova_mem(iovad, new_iova);
return NULL;
}

@@ -246,7 +268,7 @@ __free_iova(struct iova_domain *iovad, struct iova *iova)
__cached_rbnode_delete_update(iovad, iova);
rb_erase(&iova->node, &iovad->rbroot);
spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- free_iova_mem(iova);
+ free_iova_mem(iovad, iova);
}

/**
@@ -280,7 +302,7 @@ void put_iova_domain(struct iova_domain *iovad)
while (node) {
struct iova *iova = container_of(node, struct iova, node);
rb_erase(node, &iovad->rbroot);
- free_iova_mem(iova);
+ free_iova_mem(iovad, iova);
node = rb_first(&iovad->rbroot);
}
spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
@@ -303,7 +325,7 @@ __insert_new_range(struct iova_domain *iovad,
{
struct iova *iova;

- iova = alloc_iova_mem();
+ iova = alloc_iova_mem(iovad);
if (!iova)
return iova;

--
1.5.2.4

2007-11-02 17:08:39

by FUJITA Tomonori

Subject: [PATCH 3/3] POWERPC: convert the IOMMU to use iova

This converts the PPC64 IOMMU to use iova for free area management.

TODO: we might need to modify iova to use tricks such as avoiding
cacheline bouncing. Performance tests are necessary.

Signed-off-by: FUJITA Tomonori <[email protected]>
---
arch/powerpc/kernel/iommu.c | 169 ++++++--------------------------
arch/powerpc/platforms/Kconfig.cputype | 1 +
arch/powerpc/platforms/cell/iommu.c | 3 +-
arch/powerpc/sysdev/dart_iommu.c | 4 +-
include/asm-powerpc/iommu.h | 6 +-
5 files changed, 36 insertions(+), 147 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2d0c9ef..34dcfd3 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -31,6 +31,7 @@
#include <linux/string.h>
#include <linux/dma-mapping.h>
#include <linux/bitops.h>
+#include <linux/iova.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/iommu.h>
@@ -82,106 +83,18 @@ __setup("protect4gb=", setup_protect4gb);
__setup("iommu=", setup_iommu);

static unsigned long iommu_range_alloc(struct iommu_table *tbl,
- unsigned long npages,
- unsigned long *handle,
- unsigned long mask,
- unsigned int align_order)
-{
- unsigned long n, end, i, start;
- unsigned long limit;
- int largealloc = npages > 15;
- int pass = 0;
- unsigned long align_mask;
-
- align_mask = 0xffffffffffffffffl >> (64 - align_order);
-
- /* This allocator was derived from x86_64's bit string search */
-
- /* Sanity check */
- if (unlikely(npages == 0)) {
- if (printk_ratelimit())
- WARN_ON(1);
- return DMA_ERROR_CODE;
- }
-
- if (handle && *handle)
- start = *handle;
- else
- start = largealloc ? tbl->it_largehint : tbl->it_hint;
-
- /* Use only half of the table for small allocs (15 pages or less) */
- limit = largealloc ? tbl->it_size : tbl->it_halfpoint;
-
- if (largealloc && start < tbl->it_halfpoint)
- start = tbl->it_halfpoint;
-
- /* The case below can happen if we have a small segment appended
- * to a large, or when the previous alloc was at the very end of
- * the available space. If so, go back to the initial start.
- */
- if (start >= limit)
- start = largealloc ? tbl->it_largehint : tbl->it_hint;
-
- again:
-
- if (limit + tbl->it_offset > mask) {
- limit = mask - tbl->it_offset + 1;
- /* If we're constrained on address range, first try
- * at the masked hint to avoid O(n) search complexity,
- * but on second pass, start at 0.
- */
- if ((start & mask) >= limit || pass > 0)
- start = 0;
- else
- start &= mask;
- }
-
- n = find_next_zero_bit(tbl->it_map, limit, start);
-
- /* Align allocation */
- n = (n + align_mask) & ~align_mask;
-
- end = n + npages;
-
- if (unlikely(end >= limit)) {
- if (likely(pass < 2)) {
- /* First failure, just rescan the half of the table.
- * Second failure, rescan the other half of the table.
- */
- start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
- limit = pass ? tbl->it_size : limit;
- pass++;
- goto again;
- } else {
- /* Third failure, give up */
- return DMA_ERROR_CODE;
- }
- }
-
- for (i = n; i < end; i++)
- if (test_bit(i, tbl->it_map)) {
- start = i+1;
- goto again;
- }
-
- for (i = n; i < end; i++)
- __set_bit(i, tbl->it_map);
-
- /* Bump the hint to a new block for small allocs. */
- if (largealloc) {
- /* Don't bump to new block to avoid fragmentation */
- tbl->it_largehint = end;
- } else {
- /* Overflow will be taken care of at the next allocation */
- tbl->it_hint = (end + tbl->it_blocksize - 1) &
- ~(tbl->it_blocksize - 1);
- }
+ unsigned long npages,
+ unsigned long mask,
+ unsigned int align_order)
+{
+ unsigned long end;
+ struct iova *iova;

- /* Update handle for SG allocations */
- if (handle)
- *handle = end;
+ end = min_t(unsigned long, DMA_32BIT_MASK >> IOMMU_PAGE_SHIFT,
+ tbl->it_size - 1);
+ iova = alloc_iova(&tbl->iovad, npages, end, 1);

- return n;
+ return iova ? iova->pfn_lo : DMA_ERROR_CODE;
}

static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page,
@@ -193,7 +106,7 @@ static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page,

spin_lock_irqsave(&(tbl->it_lock), flags);

- entry = iommu_range_alloc(tbl, npages, NULL, mask, align_order);
+ entry = iommu_range_alloc(tbl, npages, mask, align_order);

if (unlikely(entry == DMA_ERROR_CODE)) {
spin_unlock_irqrestore(&(tbl->it_lock), flags);
@@ -224,7 +137,6 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned int npages)
{
unsigned long entry, free_entry;
- unsigned long i;

entry = dma_addr >> IOMMU_PAGE_SHIFT;
free_entry = entry - tbl->it_offset;
@@ -246,9 +158,8 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
}

ppc_md.tce_free(tbl, entry, npages);
-
- for (i = 0; i < npages; i++)
- __clear_bit(free_entry+i, tbl->it_map);
+
+ free_iova(&tbl->iovad, free_entry);
}

static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -309,7 +220,7 @@ int iommu_map_sg(struct iommu_table *tbl, struct scatterlist *sglist,
/* Allocate iommu entries for that segment */
vaddr = (unsigned long) sg_virt(s);
npages = iommu_num_pages(vaddr, slen);
- entry = iommu_range_alloc(tbl, npages, &handle, mask >> IOMMU_PAGE_SHIFT, 0);
+ entry = iommu_range_alloc(tbl, npages, mask >> IOMMU_PAGE_SHIFT, 0);

DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen);

@@ -439,34 +350,28 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
spin_unlock_irqrestore(&(tbl->it_lock), flags);
}

+static struct kmem_cache *iova_cachep;
+
/*
* Build a iommu_table structure. This contains a bit map which
* is used to manage allocation of the tce space.
*/
struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
{
- unsigned long sz;
unsigned long start_index, end_index;
unsigned long entries_per_4g;
unsigned long index;
static int welcomed = 0;
- struct page *page;
-
- /* Set aside 1/4 of the table for large allocations. */
- tbl->it_halfpoint = tbl->it_size * 3 / 4;

- /* number of bytes needed for the bitmap */
- sz = (tbl->it_size + 7) >> 3;
-
- page = alloc_pages_node(nid, GFP_ATOMIC, get_order(sz));
- if (!page)
- panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
- tbl->it_map = page_address(page);
- memset(tbl->it_map, 0, sz);
+ if (!iova_cachep) {
+ iova_cachep = KMEM_CACHE(iova, 0);
+ if (!iova_cachep)
+ return NULL;
+ }

- tbl->it_hint = 0;
- tbl->it_largehint = tbl->it_halfpoint;
spin_lock_init(&tbl->it_lock);
+ init_iova_domain(&tbl->iovad, DMA_32BIT_MASK >> IOMMU_PAGE_SHIFT,
+ iova_cachep);

#ifdef CONFIG_CRASH_DUMP
if (ppc_md.tce_get) {
@@ -482,7 +387,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
* Freed TCE entry contains 0x7fffffffffffffff on JS20
*/
if (tceval && (tceval != 0x7fffffffffffffffUL)) {
- __set_bit(index, tbl->it_map);
+ free_iova(&tbl->iovad, index);
tcecount++;
}
}
@@ -492,7 +397,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
KDUMP_MIN_TCE_ENTRIES);
for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES;
index < tbl->it_size; index++)
- __clear_bit(index, tbl->it_map);
+ free_iova(&tbl->iovad, index);
}
}
#else
@@ -514,7 +419,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
end_index = tbl->it_size;

for (index = start_index; index < end_index - 1; index += entries_per_4g)
- __set_bit(index, tbl->it_map);
+ free_iova(&tbl->iovad, index);
}

if (!welcomed) {
@@ -530,31 +435,15 @@ void iommu_free_table(struct device_node *dn)
{
struct pci_dn *pdn = dn->data;
struct iommu_table *tbl = pdn->iommu_table;
- unsigned long bitmap_sz, i;
- unsigned int order;

- if (!tbl || !tbl->it_map) {
+ if (!tbl) {
printk(KERN_ERR "%s: expected TCE map for %s\n", __FUNCTION__,
dn->full_name);
return;
}

/* verify that table contains no entries */
- /* it_size is in entries, and we're examining 64 at a time */
- for (i = 0; i < (tbl->it_size/64); i++) {
- if (tbl->it_map[i] != 0) {
- printk(KERN_WARNING "%s: Unexpected TCEs for %s\n",
- __FUNCTION__, dn->full_name);
- break;
- }
- }
-
- /* calculate bitmap size in bytes */
- bitmap_sz = (tbl->it_size + 7) / 8;
-
- /* free bitmap */
- order = get_order(bitmap_sz);
- free_pages((unsigned long) tbl->it_map, order);
+ put_iova_domain(&tbl->iovad);

/* free table */
kfree(tbl);
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 99684ea..22483e5 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
config PPC64
bool "64-bit kernel"
default n
+ select IOVA
help
This option selects whether a 32-bit or a 64-bit kernel
will be built.
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index faabc3f..799e8e5 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -471,10 +471,9 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
* This code also assumes that we have a window that starts at 0,
* which is the case on all spider based blades.
*/
- __set_bit(0, window->table.it_map);
+ reserve_iova(&window->table.iovad, 0, 0);
tce_build_cell(&window->table, window->table.it_offset, 1,
(unsigned long)iommu->pad_page, DMA_TO_DEVICE);
- window->table.it_hint = window->table.it_blocksize;

return window;
}
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index e0e24b0..6c58ef5 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -37,6 +37,7 @@
#include <linux/dma-mapping.h>
#include <linux/vmalloc.h>
#include <linux/suspend.h>
+#include <linux/iova.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/iommu.h>
@@ -287,7 +288,8 @@ static void iommu_table_dart_setup(void)
/* Reserve the last page of the DART to avoid possible prefetch
* past the DART mapped area
*/
- set_bit(iommu_table_dart.it_size - 1, iommu_table_dart.it_map);
+ reserve_iova(&iommu_table_dart.iovad, iommu_table_dart.it_size - 1,
+ iommu_table_dart.it_size - 1);
}

static void pci_dma_dev_setup_dart(struct pci_dev *dev)
diff --git a/include/asm-powerpc/iommu.h b/include/asm-powerpc/iommu.h
index 4a82fdc..f497e7e 100644
--- a/include/asm-powerpc/iommu.h
+++ b/include/asm-powerpc/iommu.h
@@ -27,6 +27,7 @@
#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/bitops.h>
+#include <linux/iova.h>
#include <asm/machdep.h>
#include <asm/types.h>

@@ -61,11 +62,8 @@ struct iommu_table {
unsigned long it_index; /* which iommu table this is */
unsigned long it_type; /* type: PCI or Virtual Bus */
unsigned long it_blocksize; /* Entries in each block (cacheline) */
- unsigned long it_hint; /* Hint for next alloc */
- unsigned long it_largehint; /* Hint for large allocs */
- unsigned long it_halfpoint; /* Breaking point for small/large allocs */
spinlock_t it_lock; /* Protects it_map */
- unsigned long *it_map; /* A simple allocation bitmap for now */
+ struct iova_domain iovad;
};

struct scatterlist;
--
1.5.2.4

2007-11-02 17:11:00

by FUJITA Tomonori

Subject: [PATCH 1/3] move iova from drivers/pci/ to lib/

The iova code could be used by several IOMMUs. This patch just moves
iova from drivers/pci/ to lib/ and updates the appropriate Makefile and
Kconfig files.

Signed-off-by: FUJITA Tomonori <[email protected]>
---
arch/x86/Kconfig.x86_64 | 1 +
drivers/pci/Makefile | 2 +-
drivers/pci/intel-iommu.c | 1 -
drivers/pci/intel-iommu.h | 2 +-
drivers/pci/iova.c | 394 ---------------------------------------------
drivers/pci/iova.h | 51 ------
include/linux/iova.h | 51 ++++++
lib/Kconfig | 3 +
lib/Makefile | 2 +
lib/iova.c | 394 +++++++++++++++++++++++++++++++++++++++++++++
10 files changed, 453 insertions(+), 448 deletions(-)
delete mode 100644 drivers/pci/iova.c
delete mode 100644 drivers/pci/iova.h
create mode 100644 include/linux/iova.h
create mode 100644 lib/iova.c

diff --git a/arch/x86/Kconfig.x86_64 b/arch/x86/Kconfig.x86_64
index cc468ea..c10d3f0 100644
--- a/arch/x86/Kconfig.x86_64
+++ b/arch/x86/Kconfig.x86_64
@@ -749,6 +749,7 @@ config PCI_DOMAINS
config DMAR
bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
depends on PCI_MSI && ACPI && EXPERIMENTAL
+ select IOVA
help
DMA remapping (DMAR) devices support enables independent address
translations for Direct Memory Access (DMA) from devices.
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 5550556..00c3428 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -21,7 +21,7 @@ obj-$(CONFIG_PCI_MSI) += msi.o
obj-$(CONFIG_HT_IRQ) += htirq.o

# Build Intel IOMMU support
-obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
+obj-$(CONFIG_DMAR) += dmar.o intel-iommu.o

#
# Some architectures use the generic PCI setup functions
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index dc1edbd..1a23b4c 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -30,7 +30,6 @@
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
-#include "iova.h"
#include "intel-iommu.h"
#include <asm/proto.h> /* force_iommu in this header in x86-64*/
#include <asm/cacheflush.h>
diff --git a/drivers/pci/intel-iommu.h b/drivers/pci/intel-iommu.h
index f54efd1..750e1b5 100644
--- a/drivers/pci/intel-iommu.h
+++ b/drivers/pci/intel-iommu.h
@@ -23,7 +23,7 @@

#include <linux/types.h>
#include <linux/msi.h>
-#include "iova.h"
+#include <linux/iova.h>
#include <linux/io.h>

/*
diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c
deleted file mode 100644
index 8de7ab6..0000000
--- a/drivers/pci/iova.c
+++ /dev/null
@@ -1,394 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This file is released under the GPLv2.
- *
- * Copyright (C) 2006 Anil S Keshavamurthy <[email protected]>
- */
-
-#include "iova.h"
-
-void
-init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
-{
- spin_lock_init(&iovad->iova_alloc_lock);
- spin_lock_init(&iovad->iova_rbtree_lock);
- iovad->rbroot = RB_ROOT;
- iovad->cached32_node = NULL;
- iovad->dma_32bit_pfn = pfn_32bit;
-}
-
-static struct rb_node *
-__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
-{
- if ((*limit_pfn != iovad->dma_32bit_pfn) ||
- (iovad->cached32_node == NULL))
- return rb_last(&iovad->rbroot);
- else {
- struct rb_node *prev_node = rb_prev(iovad->cached32_node);
- struct iova *curr_iova =
- container_of(iovad->cached32_node, struct iova, node);
- *limit_pfn = curr_iova->pfn_lo - 1;
- return prev_node;
- }
-}
-
-static void
-__cached_rbnode_insert_update(struct iova_domain *iovad,
- unsigned long limit_pfn, struct iova *new)
-{
- if (limit_pfn != iovad->dma_32bit_pfn)
- return;
- iovad->cached32_node = &new->node;
-}
-
-static void
-__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
-{
- struct iova *cached_iova;
- struct rb_node *curr;
-
- if (!iovad->cached32_node)
- return;
- curr = iovad->cached32_node;
- cached_iova = container_of(curr, struct iova, node);
-
- if (free->pfn_lo >= cached_iova->pfn_lo)
- iovad->cached32_node = rb_next(&free->node);
-}
-
-/* Computes the padding size required, to make the
- * the start address naturally aligned on its size
- */
-static int
-iova_get_pad_size(int size, unsigned int limit_pfn)
-{
- unsigned int pad_size = 0;
- unsigned int order = ilog2(size);
-
- if (order)
- pad_size = (limit_pfn + 1) % (1 << order);
-
- return pad_size;
-}
-
-static int __alloc_iova_range(struct iova_domain *iovad, unsigned long size,
- unsigned long limit_pfn, struct iova *new, bool size_aligned)
-{
- struct rb_node *curr = NULL;
- unsigned long flags;
- unsigned long saved_pfn;
- unsigned int pad_size = 0;
-
- /* Walk the tree backwards */
- spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- saved_pfn = limit_pfn;
- curr = __get_cached_rbnode(iovad, &limit_pfn);
- while (curr) {
- struct iova *curr_iova = container_of(curr, struct iova, node);
- if (limit_pfn < curr_iova->pfn_lo)
- goto move_left;
- else if (limit_pfn < curr_iova->pfn_hi)
- goto adjust_limit_pfn;
- else {
- if (size_aligned)
- pad_size = iova_get_pad_size(size, limit_pfn);
- if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn)
- break; /* found a free slot */
- }
-adjust_limit_pfn:
- limit_pfn = curr_iova->pfn_lo - 1;
-move_left:
- curr = rb_prev(curr);
- }
-
- if (!curr) {
- if (size_aligned)
- pad_size = iova_get_pad_size(size, limit_pfn);
- if ((IOVA_START_PFN + size + pad_size) > limit_pfn) {
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- return -ENOMEM;
- }
- }
-
- /* pfn_lo will point to size aligned address if size_aligned is set */
- new->pfn_lo = limit_pfn - (size + pad_size) + 1;
- new->pfn_hi = new->pfn_lo + size - 1;
-
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- return 0;
-}
-
-static void
-iova_insert_rbtree(struct rb_root *root, struct iova *iova)
-{
- struct rb_node **new = &(root->rb_node), *parent = NULL;
- /* Figure out where to put new node */
- while (*new) {
- struct iova *this = container_of(*new, struct iova, node);
- parent = *new;
-
- if (iova->pfn_lo < this->pfn_lo)
- new = &((*new)->rb_left);
- else if (iova->pfn_lo > this->pfn_lo)
- new = &((*new)->rb_right);
- else
- BUG(); /* this should not happen */
- }
- /* Add new node and rebalance tree. */
- rb_link_node(&iova->node, parent, new);
- rb_insert_color(&iova->node, root);
-}
-
-/**
- * alloc_iova - allocates an iova
- * @iovad - iova domain in question
- * @size - size of page frames to allocate
- * @limit_pfn - max limit address
- * @size_aligned - set if size_aligned address range is required
- * This function allocates an iova in the range limit_pfn to IOVA_START_PFN
- * looking from limit_pfn instead from IOVA_START_PFN. If the size_aligned
- * flag is set then the allocated address iova->pfn_lo will be naturally
- * aligned on roundup_power_of_two(size).
- */
-struct iova *
-alloc_iova(struct iova_domain *iovad, unsigned long size,
- unsigned long limit_pfn,
- bool size_aligned)
-{
- unsigned long flags;
- struct iova *new_iova;
- int ret;
-
- new_iova = alloc_iova_mem();
- if (!new_iova)
- return NULL;
-
- /* If size aligned is set then round the size to
- * to next power of two.
- */
- if (size_aligned)
- size = __roundup_pow_of_two(size);
-
- spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
- ret = __alloc_iova_range(iovad, size, limit_pfn, new_iova,
- size_aligned);
-
- if (ret) {
- spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
- free_iova_mem(new_iova);
- return NULL;
- }
-
- /* Insert the new_iova into domain rbtree by holding writer lock */
- spin_lock(&iovad->iova_rbtree_lock);
- iova_insert_rbtree(&iovad->rbroot, new_iova);
- __cached_rbnode_insert_update(iovad, limit_pfn, new_iova);
- spin_unlock(&iovad->iova_rbtree_lock);
-
- spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
-
- return new_iova;
-}
-
-/**
- * find_iova - find's an iova for a given pfn
- * @iovad - iova domain in question.
- * pfn - page frame number
- * This function finds and returns an iova belonging to the
- * given doamin which matches the given pfn.
- */
-struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
-{
- unsigned long flags;
- struct rb_node *node;
-
- /* Take the lock so that no other thread is manipulating the rbtree */
- spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- node = iovad->rbroot.rb_node;
- while (node) {
- struct iova *iova = container_of(node, struct iova, node);
-
- /* If pfn falls within iova's range, return iova */
- if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- /* We are not holding the lock while this iova
- * is referenced by the caller as the same thread
- * which called this function also calls __free_iova()
- * and it is by desing that only one thread can possibly
- * reference a particular iova and hence no conflict.
- */
- return iova;
- }
-
- if (pfn < iova->pfn_lo)
- node = node->rb_left;
- else if (pfn > iova->pfn_lo)
- node = node->rb_right;
- }
-
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- return NULL;
-}
-
-/**
- * __free_iova - frees the given iova
- * @iovad: iova domain in question.
- * @iova: iova in question.
- * Frees the given iova belonging to the giving domain
- */
-void
-__free_iova(struct iova_domain *iovad, struct iova *iova)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- __cached_rbnode_delete_update(iovad, iova);
- rb_erase(&iova->node, &iovad->rbroot);
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- free_iova_mem(iova);
-}
-
-/**
- * free_iova - finds and frees the iova for a given pfn
- * @iovad: - iova domain in question.
- * @pfn: - pfn that is allocated previously
- * This functions finds an iova for a given pfn and then
- * frees the iova from that domain.
- */
-void
-free_iova(struct iova_domain *iovad, unsigned long pfn)
-{
- struct iova *iova = find_iova(iovad, pfn);
- if (iova)
- __free_iova(iovad, iova);
-
-}
-
-/**
- * put_iova_domain - destroys the iova doamin
- * @iovad: - iova domain in question.
- * All the iova's in that domain are destroyed.
- */
-void put_iova_domain(struct iova_domain *iovad)
-{
- struct rb_node *node;
- unsigned long flags;
-
- spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- node = rb_first(&iovad->rbroot);
- while (node) {
- struct iova *iova = container_of(node, struct iova, node);
- rb_erase(node, &iovad->rbroot);
- free_iova_mem(iova);
- node = rb_first(&iovad->rbroot);
- }
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-}
-
-static int
-__is_range_overlap(struct rb_node *node,
- unsigned long pfn_lo, unsigned long pfn_hi)
-{
- struct iova *iova = container_of(node, struct iova, node);
-
- if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
- return 1;
- return 0;
-}
-
-static struct iova *
-__insert_new_range(struct iova_domain *iovad,
- unsigned long pfn_lo, unsigned long pfn_hi)
-{
- struct iova *iova;
-
- iova = alloc_iova_mem();
- if (!iova)
- return iova;
-
- iova->pfn_hi = pfn_hi;
- iova->pfn_lo = pfn_lo;
- iova_insert_rbtree(&iovad->rbroot, iova);
- return iova;
-}
-
-static void
-__adjust_overlap_range(struct iova *iova,
- unsigned long *pfn_lo, unsigned long *pfn_hi)
-{
- if (*pfn_lo < iova->pfn_lo)
- iova->pfn_lo = *pfn_lo;
- if (*pfn_hi > iova->pfn_hi)
- *pfn_lo = iova->pfn_hi + 1;
-}
-
-/**
- * reserve_iova - reserves an iova in the given range
- * @iovad: - iova domain pointer
- * @pfn_lo: - lower page frame address
- * @pfn_hi:- higher pfn adderss
- * This function allocates reserves the address range from pfn_lo to pfn_hi so
- * that this address is not dished out as part of alloc_iova.
- */
-struct iova *
-reserve_iova(struct iova_domain *iovad,
- unsigned long pfn_lo, unsigned long pfn_hi)
-{
- struct rb_node *node;
- unsigned long flags;
- struct iova *iova;
- unsigned int overlap = 0;
-
- spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
- spin_lock(&iovad->iova_rbtree_lock);
- for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
- if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
- iova = container_of(node, struct iova, node);
- __adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
- if ((pfn_lo >= iova->pfn_lo) &&
- (pfn_hi <= iova->pfn_hi))
- goto finish;
- overlap = 1;
-
- } else if (overlap)
- break;
- }
-
- /* We are here either becasue this is the first reserver node
- * or need to insert remaining non overlap addr range
- */
- iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
-finish:
-
- spin_unlock(&iovad->iova_rbtree_lock);
- spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
- return iova;
-}
-
-/**
- * copy_reserved_iova - copies the reserved between domains
- * @from: - source doamin from where to copy
- * @to: - destination domin where to copy
- * This function copies reserved iova's from one doamin to
- * other.
- */
-void
-copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
-{
- unsigned long flags;
- struct rb_node *node;
-
- spin_lock_irqsave(&from->iova_alloc_lock, flags);
- spin_lock(&from->iova_rbtree_lock);
- for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
- struct iova *iova = container_of(node, struct iova, node);
- struct iova *new_iova;
- new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
- if (!new_iova)
- printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
- iova->pfn_lo, iova->pfn_lo);
- }
- spin_unlock(&from->iova_rbtree_lock);
- spin_unlock_irqrestore(&from->iova_alloc_lock, flags);
-}
diff --git a/drivers/pci/iova.h b/drivers/pci/iova.h
deleted file mode 100644
index d521b5b..0000000
--- a/drivers/pci/iova.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This file is released under the GPLv2.
- *
- * Copyright (C) 2006 Anil S Keshavamurthy <[email protected]>
- *
- */
-
-#ifndef _IOVA_H_
-#define _IOVA_H_
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/rbtree.h>
-#include <linux/dma-mapping.h>
-
-/* IO virtual address start page frame number */
-#define IOVA_START_PFN (1)
-
-/* iova structure */
-struct iova {
- struct rb_node node;
- unsigned long pfn_hi; /* IOMMU dish out addr hi */
- unsigned long pfn_lo; /* IOMMU dish out addr lo */
-};
-
-/* holds all the iova translations for a domain */
-struct iova_domain {
- spinlock_t iova_alloc_lock;/* Lock to protect iova allocation */
- spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */
- struct rb_root rbroot; /* iova domain rbtree root */
- struct rb_node *cached32_node; /* Save last alloced node */
- unsigned long dma_32bit_pfn;
-};
-
-struct iova *alloc_iova_mem(void);
-void free_iova_mem(struct iova *iova);
-void free_iova(struct iova_domain *iovad, unsigned long pfn);
-void __free_iova(struct iova_domain *iovad, struct iova *iova);
-struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
- unsigned long limit_pfn,
- bool size_aligned);
-struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
- unsigned long pfn_hi);
-void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
-void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit);
-struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
-void put_iova_domain(struct iova_domain *iovad);
-
-#endif
diff --git a/include/linux/iova.h b/include/linux/iova.h
new file mode 100644
index 0000000..d521b5b
--- /dev/null
+++ b/include/linux/iova.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This file is released under the GPLv2.
+ *
+ * Copyright (C) 2006 Anil S Keshavamurthy <[email protected]>
+ *
+ */
+
+#ifndef _IOVA_H_
+#define _IOVA_H_
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/rbtree.h>
+#include <linux/dma-mapping.h>
+
+/* IO virtual address start page frame number */
+#define IOVA_START_PFN (1)
+
+/* iova structure */
+struct iova {
+ struct rb_node node;
+ unsigned long pfn_hi; /* IOMMU dish out addr hi */
+ unsigned long pfn_lo; /* IOMMU dish out addr lo */
+};
+
+/* holds all the iova translations for a domain */
+struct iova_domain {
+ spinlock_t iova_alloc_lock;/* Lock to protect iova allocation */
+ spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */
+ struct rb_root rbroot; /* iova domain rbtree root */
+ struct rb_node *cached32_node; /* Save last alloced node */
+ unsigned long dma_32bit_pfn;
+};
+
+struct iova *alloc_iova_mem(void);
+void free_iova_mem(struct iova *iova);
+void free_iova(struct iova_domain *iovad, unsigned long pfn);
+void __free_iova(struct iova_domain *iovad, struct iova *iova);
+struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
+ unsigned long limit_pfn,
+ bool size_aligned);
+struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
+ unsigned long pfn_hi);
+void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
+void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit);
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
+void put_iova_domain(struct iova_domain *iovad);
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index ba3d104..4107547 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -141,4 +141,7 @@ config HAS_DMA
config CHECK_SIGNATURE
bool

+config IOVA
+ boolean
+
endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 3a0983b..7de33cc 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -66,6 +66,8 @@ obj-$(CONFIG_AUDIT_GENERIC) += audit.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o

+obj-$(CONFIG_IOVA) += iova.o
+
lib-$(CONFIG_GENERIC_BUG) += bug.o

hostprogs-y := gen_crc32table
diff --git a/lib/iova.c b/lib/iova.c
new file mode 100644
index 0000000..6e14a3f
--- /dev/null
+++ b/lib/iova.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This file is released under the GPLv2.
+ *
+ * Copyright (C) 2006 Anil S Keshavamurthy <[email protected]>
+ */
+
+#include <linux/iova.h>
+
+void
+init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
+{
+ spin_lock_init(&iovad->iova_alloc_lock);
+ spin_lock_init(&iovad->iova_rbtree_lock);
+ iovad->rbroot = RB_ROOT;
+ iovad->cached32_node = NULL;
+ iovad->dma_32bit_pfn = pfn_32bit;
+}
+
+static struct rb_node *
+__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
+{
+ if ((*limit_pfn != iovad->dma_32bit_pfn) ||
+ (iovad->cached32_node == NULL))
+ return rb_last(&iovad->rbroot);
+ else {
+ struct rb_node *prev_node = rb_prev(iovad->cached32_node);
+ struct iova *curr_iova =
+ container_of(iovad->cached32_node, struct iova, node);
+ *limit_pfn = curr_iova->pfn_lo - 1;
+ return prev_node;
+ }
+}
+
+static void
+__cached_rbnode_insert_update(struct iova_domain *iovad,
+ unsigned long limit_pfn, struct iova *new)
+{
+ if (limit_pfn != iovad->dma_32bit_pfn)
+ return;
+ iovad->cached32_node = &new->node;
+}
+
+static void
+__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
+{
+ struct iova *cached_iova;
+ struct rb_node *curr;
+
+ if (!iovad->cached32_node)
+ return;
+ curr = iovad->cached32_node;
+ cached_iova = container_of(curr, struct iova, node);
+
+ if (free->pfn_lo >= cached_iova->pfn_lo)
+ iovad->cached32_node = rb_next(&free->node);
+}
+
+/* Computes the padding size required, to make the
+ * the start address naturally aligned on its size
+ */
+static int
+iova_get_pad_size(int size, unsigned int limit_pfn)
+{
+ unsigned int pad_size = 0;
+ unsigned int order = ilog2(size);
+
+ if (order)
+ pad_size = (limit_pfn + 1) % (1 << order);
+
+ return pad_size;
+}
+
+static int __alloc_iova_range(struct iova_domain *iovad, unsigned long size,
+ unsigned long limit_pfn, struct iova *new, bool size_aligned)
+{
+ struct rb_node *curr = NULL;
+ unsigned long flags;
+ unsigned long saved_pfn;
+ unsigned int pad_size = 0;
+
+ /* Walk the tree backwards */
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ saved_pfn = limit_pfn;
+ curr = __get_cached_rbnode(iovad, &limit_pfn);
+ while (curr) {
+ struct iova *curr_iova = container_of(curr, struct iova, node);
+ if (limit_pfn < curr_iova->pfn_lo)
+ goto move_left;
+ else if (limit_pfn < curr_iova->pfn_hi)
+ goto adjust_limit_pfn;
+ else {
+ if (size_aligned)
+ pad_size = iova_get_pad_size(size, limit_pfn);
+ if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn)
+ break; /* found a free slot */
+ }
+adjust_limit_pfn:
+ limit_pfn = curr_iova->pfn_lo - 1;
+move_left:
+ curr = rb_prev(curr);
+ }
+
+ if (!curr) {
+ if (size_aligned)
+ pad_size = iova_get_pad_size(size, limit_pfn);
+ if ((IOVA_START_PFN + size + pad_size) > limit_pfn) {
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return -ENOMEM;
+ }
+ }
+
+ /* pfn_lo will point to size aligned address if size_aligned is set */
+ new->pfn_lo = limit_pfn - (size + pad_size) + 1;
+ new->pfn_hi = new->pfn_lo + size - 1;
+
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return 0;
+}
+
+static void
+iova_insert_rbtree(struct rb_root *root, struct iova *iova)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ /* Figure out where to put new node */
+ while (*new) {
+ struct iova *this = container_of(*new, struct iova, node);
+ parent = *new;
+
+ if (iova->pfn_lo < this->pfn_lo)
+ new = &((*new)->rb_left);
+ else if (iova->pfn_lo > this->pfn_lo)
+ new = &((*new)->rb_right);
+ else
+ BUG(); /* this should not happen */
+ }
+ /* Add new node and rebalance tree. */
+ rb_link_node(&iova->node, parent, new);
+ rb_insert_color(&iova->node, root);
+}
+
+/**
+ * alloc_iova - allocates an iova
+ * @iovad - iova domain in question
+ * @size - size of page frames to allocate
+ * @limit_pfn - max limit address
+ * @size_aligned - set if size_aligned address range is required
+ * This function allocates an iova in the range limit_pfn to IOVA_START_PFN
+ * looking from limit_pfn instead from IOVA_START_PFN. If the size_aligned
+ * flag is set then the allocated address iova->pfn_lo will be naturally
+ * aligned on roundup_power_of_two(size).
+ */
+struct iova *
+alloc_iova(struct iova_domain *iovad, unsigned long size,
+ unsigned long limit_pfn,
+ bool size_aligned)
+{
+ unsigned long flags;
+ struct iova *new_iova;
+ int ret;
+
+ new_iova = alloc_iova_mem();
+ if (!new_iova)
+ return NULL;
+
+ /* If size aligned is set then round the size to
+ * to next power of two.
+ */
+ if (size_aligned)
+ size = __roundup_pow_of_two(size);
+
+ spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
+ ret = __alloc_iova_range(iovad, size, limit_pfn, new_iova,
+ size_aligned);
+
+ if (ret) {
+ spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
+ free_iova_mem(new_iova);
+ return NULL;
+ }
+
+ /* Insert the new_iova into domain rbtree by holding writer lock */
+ spin_lock(&iovad->iova_rbtree_lock);
+ iova_insert_rbtree(&iovad->rbroot, new_iova);
+ __cached_rbnode_insert_update(iovad, limit_pfn, new_iova);
+ spin_unlock(&iovad->iova_rbtree_lock);
+
+ spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
+
+ return new_iova;
+}
+
+/**
+ * find_iova - find's an iova for a given pfn
+ * @iovad - iova domain in question.
+ * pfn - page frame number
+ * This function finds and returns an iova belonging to the
+ * given doamin which matches the given pfn.
+ */
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+ unsigned long flags;
+ struct rb_node *node;
+
+ /* Take the lock so that no other thread is manipulating the rbtree */
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ node = iovad->rbroot.rb_node;
+ while (node) {
+ struct iova *iova = container_of(node, struct iova, node);
+
+ /* If pfn falls within iova's range, return iova */
+ if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ /* We are not holding the lock while this iova
+ * is referenced by the caller as the same thread
+ * which called this function also calls __free_iova()
+ * and it is by desing that only one thread can possibly
+ * reference a particular iova and hence no conflict.
+ */
+ return iova;
+ }
+
+ if (pfn < iova->pfn_lo)
+ node = node->rb_left;
+ else if (pfn > iova->pfn_lo)
+ node = node->rb_right;
+ }
+
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return NULL;
+}
+
+/**
+ * __free_iova - frees the given iova
+ * @iovad: iova domain in question.
+ * @iova: iova in question.
+ * Frees the given iova belonging to the giving domain
+ */
+void
+__free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ __cached_rbnode_delete_update(iovad, iova);
+ rb_erase(&iova->node, &iovad->rbroot);
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ free_iova_mem(iova);
+}
+
+/**
+ * free_iova - finds and frees the iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - pfn that is allocated previously
+ * This functions finds an iova for a given pfn and then
+ * frees the iova from that domain.
+ */
+void
+free_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+ struct iova *iova = find_iova(iovad, pfn);
+ if (iova)
+ __free_iova(iovad, iova);
+
+}
+
+/**
+ * put_iova_domain - destroys the iova doamin
+ * @iovad: - iova domain in question.
+ * All the iova's in that domain are destroyed.
+ */
+void put_iova_domain(struct iova_domain *iovad)
+{
+ struct rb_node *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ node = rb_first(&iovad->rbroot);
+ while (node) {
+ struct iova *iova = container_of(node, struct iova, node);
+ rb_erase(node, &iovad->rbroot);
+ free_iova_mem(iova);
+ node = rb_first(&iovad->rbroot);
+ }
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+}
+
+static int
+__is_range_overlap(struct rb_node *node,
+ unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ struct iova *iova = container_of(node, struct iova, node);
+
+ if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
+ return 1;
+ return 0;
+}
+
+static struct iova *
+__insert_new_range(struct iova_domain *iovad,
+ unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ struct iova *iova;
+
+ iova = alloc_iova_mem();
+ if (!iova)
+ return iova;
+
+ iova->pfn_hi = pfn_hi;
+ iova->pfn_lo = pfn_lo;
+ iova_insert_rbtree(&iovad->rbroot, iova);
+ return iova;
+}
+
+static void
+__adjust_overlap_range(struct iova *iova,
+ unsigned long *pfn_lo, unsigned long *pfn_hi)
+{
+ if (*pfn_lo < iova->pfn_lo)
+ iova->pfn_lo = *pfn_lo;
+ if (*pfn_hi > iova->pfn_hi)
+ *pfn_lo = iova->pfn_hi + 1;
+}
+
+/**
+ * reserve_iova - reserves an iova in the given range
+ * @iovad: - iova domain pointer
+ * @pfn_lo: - lower page frame address
+ * @pfn_hi:- higher pfn adderss
+ * This function allocates reserves the address range from pfn_lo to pfn_hi so
+ * that this address is not dished out as part of alloc_iova.
+ */
+struct iova *
+reserve_iova(struct iova_domain *iovad,
+ unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ struct rb_node *node;
+ unsigned long flags;
+ struct iova *iova;
+ unsigned int overlap = 0;
+
+ spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
+ spin_lock(&iovad->iova_rbtree_lock);
+ for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
+ if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
+ iova = container_of(node, struct iova, node);
+ __adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
+ if ((pfn_lo >= iova->pfn_lo) &&
+ (pfn_hi <= iova->pfn_hi))
+ goto finish;
+ overlap = 1;
+
+ } else if (overlap)
+ break;
+ }
+
+ /* We are here either becasue this is the first reserver node
+ * or need to insert remaining non overlap addr range
+ */
+ iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
+finish:
+
+ spin_unlock(&iovad->iova_rbtree_lock);
+ spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
+ return iova;
+}
+
+/**
+ * copy_reserved_iova - copies the reserved between domains
+ * @from: - source doamin from where to copy
+ * @to: - destination domin where to copy
+ * This function copies reserved iova's from one doamin to
+ * other.
+ */
+void
+copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
+{
+ unsigned long flags;
+ struct rb_node *node;
+
+ spin_lock_irqsave(&from->iova_alloc_lock, flags);
+ spin_lock(&from->iova_rbtree_lock);
+ for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
+ struct iova *iova = container_of(node, struct iova, node);
+ struct iova *new_iova;
+ new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
+ if (!new_iova)
+ printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
+ iova->pfn_lo, iova->pfn_lo);
+ }
+ spin_unlock(&from->iova_rbtree_lock);
+ spin_unlock_irqrestore(&from->iova_alloc_lock, flags);
+}
--
1.5.2.4

2007-11-02 17:12:47

by Muli Ben-Yehuda

Subject: Re: [PATCH -mm 0/3] convert IOMMUs to use iova

On Sat, Nov 03, 2007 at 02:05:39AM +0900, FUJITA Tomonori wrote:

> This patchset converts the PPC64 IOMMU to use the iova code for free
> area management.
>
> The IOMMUs ignore low-level drivers' restrictions: the maximum
> segment size and the segment boundary.
>
> I fixed the former:
>
> http://thread.gmane.org/gmane.linux.scsi/35602
>
> The latter makes the free area management complicated. Rather than
> fixing every IOMMU's free area management, I'd like to convert the
> IOMMUs to use the iova code (which intel-iommu introduced) for free
> area management and enable iova to handle segment boundary
> restrictions.

In general it sounds like a great idea, but have you looked at what
impact this has on the performance of the IO path?

Cheers,
Muli

2007-11-02 18:17:27

by FUJITA Tomonori

Subject: Re: [PATCH -mm 0/3] convert IOMMUs to use iova

On Fri, 2 Nov 2007 19:12:27 +0200
Muli Ben-Yehuda <[email protected]> wrote:

> On Sat, Nov 03, 2007 at 02:05:39AM +0900, FUJITA Tomonori wrote:
>
> > This patchset converts the PPC64 IOMMU to use the iova code for
> > free area management.
> >
> > The IOMMUs ignore low-level drivers' restrictions: the maximum
> > segment size and the segment boundary.
> >
> > I fixed the former:
> >
> > http://thread.gmane.org/gmane.linux.scsi/35602
> >
> > The latter makes the free area management complicated. Rather than
> > fixing every IOMMU's free area management, I'd like to convert the
> > IOMMUs to use the iova code (which intel-iommu introduced) for
> > free area management and enable iova to handle segment boundary
> > restrictions.
>
> In general it sounds like a great idea, but have you looked at what
> impact this has on the performance of the IO path?

Not yet. I only have access to an entry-class pSeries server, so I
would appreciate it if the POWERPC people could try the patchset on
high-end servers.

The patchset doesn't use the tricks that the current POWERPC IOMMU
uses to improve performance (Anton and Olof told me about them,
thanks!), so I might need to modify iova to support them.

Some of the tricks used in the IOMMUs may be difficult to apply to
iova even if we modify it. If iova degrades performance, I think it
would be better to have new IOMMU library code that simply uses a
bitmap, as most of the IOMMUs do.

Another possible source of performance degradation is that we can't
allocate areas for map_sg in one stroke as some of the IOMMUs do; we
need to allocate an area per sg segment to handle the segment boundary.
If that is unacceptable, I guess there is only one way: LLDs allocate
many sg entries and then the IOMMU splits the sg entries.
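
To illustrate the per-segment case, map_sg would end up doing roughly
the following. This is only a sketch: alloc_iova_bounded() does not
exist and merely stands in for the proposed boundary-aware extension of
alloc_iova(); boundary, limit_pfn and the failure label are placeholders
as well.

	/* hypothetical: one iova allocation per sg segment */
	for_each_sg(sglist, s, nelems, i) {
		npages = iommu_num_pages((unsigned long)sg_virt(s), s->length);
		/* would refuse a range crossing the boundary mask */
		iova = alloc_iova_bounded(&tbl->iovad, npages, limit_pfn, boundary);
		if (!iova)
			goto failure;	/* unmap everything mapped so far */
		s->dma_address = iova->pfn_lo << IOMMU_PAGE_SHIFT;
		s->dma_length = s->length;
	}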

2007-11-07 13:39:20

by FUJITA Tomonori

Subject: Re: [PATCH -mm 0/3] convert IOMMUs to use iova

On Fri, 2 Nov 2007 19:12:27 +0200
Muli Ben-Yehuda <[email protected]> wrote:

> On Sat, Nov 03, 2007 at 02:05:39AM +0900, FUJITA Tomonori wrote:
>
> > This patchset converts the PPC64 IOMMU to use the iova code for
> > free area management.
> >
> > The IOMMUs ignore low-level drivers' restrictions: the maximum
> > segment size and the segment boundary.
> >
> > I fixed the former:
> >
> > http://thread.gmane.org/gmane.linux.scsi/35602
> >
> > The latter makes the free area management complicated. Rather than
> > fixing every IOMMU's free area management, I'd like to convert the
> > IOMMUs to use the iova code (which intel-iommu introduced) for
> > free area management and enable iova to handle segment boundary
> > restrictions.
>
> In general it sounds like a great idea, but have you looked at what
> impact this has on the performance of the IO path?

I converted swiotlb to use iova and compared it with the original
algorithm (which is better than the simple bitmap one that most of the
IOMMUs use, I think).

I used the 'swiotlb=force' boot option and ran netperf with "-m 128 -D"
(which leads to tons of dma_map_single calls).

The original produced 281.8 Mb/s, while the iova version produced 77.2 Mb/s.

Maybe it would be better to generalize the swiotlb algorithm (at least
for small I/O areas)? Or my patch might have bugs.

Here's the patch to convert swiotlb to use iova against my iova
patchset:

http://marc.info/?l=linux-kernel&m=119402340801254&w=2

diff --git a/arch/x86/Kconfig.x86_64 b/arch/x86/Kconfig.x86_64
index c10d3f0..8735822 100644
--- a/arch/x86/Kconfig.x86_64
+++ b/arch/x86/Kconfig.x86_64
@@ -524,6 +524,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
bool
+ select IOVA
help
Support for software bounce buffers used on x86-64 systems
which don't have a hardware IOMMU (e.g. the current generation
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
index aa805b1..8507402 100644
--- a/arch/x86/kernel/pci-dma_64.c
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -325,6 +325,9 @@ static int __init pci_iommu_init(void)
gart_iommu_init();
#endif

+#ifdef CONFIG_SWIOTLB
+ swiotlb_alloc();
+#endif
no_iommu_init();
return 0;
}
diff --git a/include/asm-x86/swiotlb.h b/include/asm-x86/swiotlb.h
index f9c5895..f00d20c 100644
--- a/include/asm-x86/swiotlb.h
+++ b/include/asm-x86/swiotlb.h
@@ -40,6 +40,7 @@ extern void swiotlb_free_coherent (struct device *hwdev, size_t size,
void *vaddr, dma_addr_t dma_handle);
extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
extern void swiotlb_init(void);
+extern void swiotlb_alloc(void);

extern int swiotlb_force;

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 1a8050a..54ecb87 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -24,6 +24,7 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ctype.h>
+#include <linux/iova.h>

#include <asm/io.h>
#include <asm/dma.h>
@@ -103,10 +104,7 @@ static unsigned int io_tlb_index;
*/
static unsigned char **io_tlb_orig_addr;

-/*
- * Protect the above data structures in the map and unmap calls
- */
-static DEFINE_SPINLOCK(io_tlb_lock);
+static struct iova_domain swiotlb_iovad;

static int __init
setup_io_tlb_npages(char *str)
@@ -272,6 +270,19 @@ cleanup1:
return -ENOMEM;
}

+static struct kmem_cache *iova_cachep;
+
+void __init
+swiotlb_alloc(void)
+{
+ if (!swiotlb)
+ return;
+
+ iova_cachep = KMEM_CACHE(iova, 0);
+ init_iova_domain(&swiotlb_iovad, DMA_32BIT_MASK >> IO_TLB_SHIFT,
+ iova_cachep);
+}
+
static int
address_needs_mapping(struct device *hwdev, dma_addr_t addr)
{
@@ -288,70 +299,20 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr)
static void *
map_single(struct device *hwdev, char *buffer, size_t size, int dir)
{
- unsigned long flags;
char *dma_addr;
- unsigned int nslots, stride, index, wrap;
+ unsigned int nslots, index;
int i;
+ struct iova *iova;

- /*
- * For mappings greater than a page, we limit the stride (and
- * hence alignment) to a page size.
- */
nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
- if (size > PAGE_SIZE)
- stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
- else
- stride = 1;
-
BUG_ON(!nslots);

- /*
- * Find suitable number of IO TLB entries size that will fit this
- * request and allocate a buffer from that IO TLB pool.
- */
- spin_lock_irqsave(&io_tlb_lock, flags);
- {
- wrap = index = ALIGN(io_tlb_index, stride);
-
- if (index >= io_tlb_nslabs)
- wrap = index = 0;
-
- do {
- /*
- * If we find a slot that indicates we have 'nslots'
- * number of contiguous buffers, we allocate the
- * buffers from that slot and mark the entries as '0'
- * indicating unavailable.
- */
- if (io_tlb_list[index] >= nslots) {
- int count = 0;
-
- for (i = index; i < (int) (index + nslots); i++)
- io_tlb_list[i] = 0;
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
- dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
-
- /*
- * Update the indices to avoid searching in
- * the next round.
- */
- io_tlb_index = ((index + nslots) < io_tlb_nslabs
- ? (index + nslots) : 0);
-
- goto found;
- }
- index += stride;
- if (index >= io_tlb_nslabs)
- index = 0;
- } while (index != wrap);
-
- spin_unlock_irqrestore(&io_tlb_lock, flags);
+ iova = alloc_iova(&swiotlb_iovad, nslots, io_tlb_nslabs - 1, 1);
+ if (!iova)
return NULL;
- }
- found:
- spin_unlock_irqrestore(&io_tlb_lock, flags);

+ index = iova->pfn_lo;
+ dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
/*
* Save away the mapping from the original address to the DMA address.
* This is needed when we sync the memory. Then we sync the buffer if
@@ -371,8 +332,6 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir)
static void
unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
{
- unsigned long flags;
- int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
char *buffer = io_tlb_orig_addr[index];

@@ -386,30 +345,7 @@ unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
*/
memcpy(buffer, dma_addr, size);

- /*
- * Return the buffer to the free list by setting the corresponding
- * entries to indicate the number of contigous entries available.
- * While returning the entries to the free list, we merge the entries
- * with slots below and above the pool being returned.
- */
- spin_lock_irqsave(&io_tlb_lock, flags);
- {
- count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
- io_tlb_list[index + nslots] : 0);
- /*
- * Step 1: return the slots to the free list, merging the
- * slots with superceeding slots
- */
- for (i = index + nslots - 1; i >= index; i--)
- io_tlb_list[i] = ++count;
- /*
- * Step 2: merge the returned slots with the preceding slots,
- * if available (non zero)
- */
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
- }
- spin_unlock_irqrestore(&io_tlb_lock, flags);
+ free_iova(&swiotlb_iovad, index);
}

static void
--
1.5.3.4