This patchset convert the PPC64 IOMMU to use the iova code for free
area management.
The IOMMUs ignores low level drivers' restrictions, the maximum
segment size and segment boundary.
I fixed the former:
http://thread.gmane.org/gmane.linux.scsi/35602
The latter makes the free area management complicated. I'd like to
convert IOMMUs to use the iova code (that intel-iommu introduced) for
free area management and enable iova to handle segment boundary
restrictions, rather than fixing all the IOMMUs' free area management,
I converted the PPC64 IOMMU to see how things work. The patchset was
slightly tested with a pSeries box and seems to work.
This is against the latest Linus' git tree with David Miller's
genericizing iova patch:
http://www.ussg.iu.edu/hypermail/linux/kernel/0710.2/2910.html
This patchset is also available:
git://git.kernel.org/pub/scm/linux/kernel/git/tomo/linux-2.6-misc.git iova
This detaches iova cache code from intel-iommu.c to iova.c in order to
enable IOMMUs to use iova code.
Signed-off-by: FUJITA Tomonori <[email protected]>
---
drivers/pci/intel-iommu.c | 14 ++------------
include/linux/iova.h | 6 +++---
lib/iova.c | 34 ++++++++++++++++++++++++++++------
3 files changed, 33 insertions(+), 21 deletions(-)
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 1a23b4c..0c41d79 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -141,16 +141,6 @@ static inline void free_devinfo_mem(void *vaddr)
kmem_cache_free(iommu_devinfo_cache, vaddr);
}
-struct iova *alloc_iova_mem(void)
-{
- return iommu_kmem_cache_alloc(iommu_iova_cache);
-}
-
-void free_iova_mem(struct iova *iova)
-{
- kmem_cache_free(iommu_iova_cache, iova);
-}
-
static inline void __iommu_flush_cache(
struct intel_iommu *iommu, void *addr, int size)
{
@@ -1087,7 +1077,7 @@ static void dmar_init_reserved_ranges(void)
int i;
u64 addr, size;
- init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
+ init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN, iommu_iova_cache);
/* IOAPIC ranges shouldn't be accessed by DMA */
iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
@@ -1141,7 +1131,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
int adjust_width, agaw;
unsigned long sagaw;
- init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
+ init_iova_domain(&domain->iovad, DMA_32BIT_PFN, iommu_iova_cache);
spin_lock_init(&domain->mapping_lock);
domain_reserve_special_ranges(domain);
diff --git a/include/linux/iova.h b/include/linux/iova.h
index d521b5b..41f189b 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -32,10 +32,9 @@ struct iova_domain {
struct rb_root rbroot; /* iova domain rbtree root */
struct rb_node *cached32_node; /* Save last alloced node */
unsigned long dma_32bit_pfn;
+ struct kmem_cache *iova_cachep;
};
-struct iova *alloc_iova_mem(void);
-void free_iova_mem(struct iova *iova);
void free_iova(struct iova_domain *iovad, unsigned long pfn);
void __free_iova(struct iova_domain *iovad, struct iova *iova);
struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
@@ -44,7 +43,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
unsigned long pfn_hi);
void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
-void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit);
+void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit,
+ struct kmem_cache *iova_cachep);
struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
void put_iova_domain(struct iova_domain *iovad);
diff --git a/lib/iova.c b/lib/iova.c
index 6e14a3f..d2612e5 100644
--- a/lib/iova.c
+++ b/lib/iova.c
@@ -6,16 +6,38 @@
* Copyright (C) 2006 Anil S Keshavamurthy <[email protected]>
*/
+#include <linux/sched.h>
#include <linux/iova.h>
+static struct iova *alloc_iova_mem(struct iova_domain *iovad)
+{
+ unsigned int flags;
+ void *vaddr;
+
+ /* trying to avoid low memory issues */
+ flags = current->flags & PF_MEMALLOC;
+ current->flags |= PF_MEMALLOC;
+ vaddr = kmem_cache_alloc(iovad->iova_cachep, GFP_ATOMIC);
+ current->flags &= (~PF_MEMALLOC | flags);
+
+ return vaddr;
+}
+
+static void free_iova_mem(struct iova_domain *iovad, struct iova *iova)
+{
+ kmem_cache_free(iovad->iova_cachep, iova);
+}
+
void
-init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
+init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit,
+ struct kmem_cache *iova_cachep)
{
spin_lock_init(&iovad->iova_alloc_lock);
spin_lock_init(&iovad->iova_rbtree_lock);
iovad->rbroot = RB_ROOT;
iovad->cached32_node = NULL;
iovad->dma_32bit_pfn = pfn_32bit;
+ iovad->iova_cachep = iova_cachep;
}
static struct rb_node *
@@ -160,7 +182,7 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
struct iova *new_iova;
int ret;
- new_iova = alloc_iova_mem();
+ new_iova = alloc_iova_mem(iovad);
if (!new_iova)
return NULL;
@@ -176,7 +198,7 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
if (ret) {
spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
- free_iova_mem(new_iova);
+ free_iova_mem(iovad, new_iova);
return NULL;
}
@@ -246,7 +268,7 @@ __free_iova(struct iova_domain *iovad, struct iova *iova)
__cached_rbnode_delete_update(iovad, iova);
rb_erase(&iova->node, &iovad->rbroot);
spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- free_iova_mem(iova);
+ free_iova_mem(iovad, iova);
}
/**
@@ -280,7 +302,7 @@ void put_iova_domain(struct iova_domain *iovad)
while (node) {
struct iova *iova = container_of(node, struct iova, node);
rb_erase(node, &iovad->rbroot);
- free_iova_mem(iova);
+ free_iova_mem(iovad, iova);
node = rb_first(&iovad->rbroot);
}
spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
@@ -303,7 +325,7 @@ __insert_new_range(struct iova_domain *iovad,
{
struct iova *iova;
- iova = alloc_iova_mem();
+ iova = alloc_iova_mem(iovad);
if (!iova)
return iova;
--
1.5.2.4
This converts the PPC64 IOMMU to use iova for free area management.
TODO: we might need to modify iova to use the tricks like avoiding
cacheline bouncing. Performance tests are necessary.
Signed-off-by: FUJITA Tomonori <[email protected]>
---
arch/powerpc/kernel/iommu.c | 169 ++++++--------------------------
arch/powerpc/platforms/Kconfig.cputype | 1 +
arch/powerpc/platforms/cell/iommu.c | 3 +-
arch/powerpc/sysdev/dart_iommu.c | 4 +-
include/asm-powerpc/iommu.h | 6 +-
5 files changed, 36 insertions(+), 147 deletions(-)
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2d0c9ef..34dcfd3 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -31,6 +31,7 @@
#include <linux/string.h>
#include <linux/dma-mapping.h>
#include <linux/bitops.h>
+#include <linux/iova.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/iommu.h>
@@ -82,106 +83,18 @@ __setup("protect4gb=", setup_protect4gb);
__setup("iommu=", setup_iommu);
static unsigned long iommu_range_alloc(struct iommu_table *tbl,
- unsigned long npages,
- unsigned long *handle,
- unsigned long mask,
- unsigned int align_order)
-{
- unsigned long n, end, i, start;
- unsigned long limit;
- int largealloc = npages > 15;
- int pass = 0;
- unsigned long align_mask;
-
- align_mask = 0xffffffffffffffffl >> (64 - align_order);
-
- /* This allocator was derived from x86_64's bit string search */
-
- /* Sanity check */
- if (unlikely(npages == 0)) {
- if (printk_ratelimit())
- WARN_ON(1);
- return DMA_ERROR_CODE;
- }
-
- if (handle && *handle)
- start = *handle;
- else
- start = largealloc ? tbl->it_largehint : tbl->it_hint;
-
- /* Use only half of the table for small allocs (15 pages or less) */
- limit = largealloc ? tbl->it_size : tbl->it_halfpoint;
-
- if (largealloc && start < tbl->it_halfpoint)
- start = tbl->it_halfpoint;
-
- /* The case below can happen if we have a small segment appended
- * to a large, or when the previous alloc was at the very end of
- * the available space. If so, go back to the initial start.
- */
- if (start >= limit)
- start = largealloc ? tbl->it_largehint : tbl->it_hint;
-
- again:
-
- if (limit + tbl->it_offset > mask) {
- limit = mask - tbl->it_offset + 1;
- /* If we're constrained on address range, first try
- * at the masked hint to avoid O(n) search complexity,
- * but on second pass, start at 0.
- */
- if ((start & mask) >= limit || pass > 0)
- start = 0;
- else
- start &= mask;
- }
-
- n = find_next_zero_bit(tbl->it_map, limit, start);
-
- /* Align allocation */
- n = (n + align_mask) & ~align_mask;
-
- end = n + npages;
-
- if (unlikely(end >= limit)) {
- if (likely(pass < 2)) {
- /* First failure, just rescan the half of the table.
- * Second failure, rescan the other half of the table.
- */
- start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
- limit = pass ? tbl->it_size : limit;
- pass++;
- goto again;
- } else {
- /* Third failure, give up */
- return DMA_ERROR_CODE;
- }
- }
-
- for (i = n; i < end; i++)
- if (test_bit(i, tbl->it_map)) {
- start = i+1;
- goto again;
- }
-
- for (i = n; i < end; i++)
- __set_bit(i, tbl->it_map);
-
- /* Bump the hint to a new block for small allocs. */
- if (largealloc) {
- /* Don't bump to new block to avoid fragmentation */
- tbl->it_largehint = end;
- } else {
- /* Overflow will be taken care of at the next allocation */
- tbl->it_hint = (end + tbl->it_blocksize - 1) &
- ~(tbl->it_blocksize - 1);
- }
+ unsigned long npages,
+ unsigned long mask,
+ unsigned int align_order)
+{
+ unsigned long end;
+ struct iova *iova;
- /* Update handle for SG allocations */
- if (handle)
- *handle = end;
+ end = min_t(unsigned long, DMA_32BIT_MASK >> IOMMU_PAGE_SHIFT,
+ tbl->it_size - 1);
+ iova = alloc_iova(&tbl->iovad, npages, end, 1);
- return n;
+ return iova ? iova->pfn_lo : DMA_ERROR_CODE;
}
static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page,
@@ -193,7 +106,7 @@ static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page,
spin_lock_irqsave(&(tbl->it_lock), flags);
- entry = iommu_range_alloc(tbl, npages, NULL, mask, align_order);
+ entry = iommu_range_alloc(tbl, npages, mask, align_order);
if (unlikely(entry == DMA_ERROR_CODE)) {
spin_unlock_irqrestore(&(tbl->it_lock), flags);
@@ -224,7 +137,6 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned int npages)
{
unsigned long entry, free_entry;
- unsigned long i;
entry = dma_addr >> IOMMU_PAGE_SHIFT;
free_entry = entry - tbl->it_offset;
@@ -246,9 +158,8 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
}
ppc_md.tce_free(tbl, entry, npages);
-
- for (i = 0; i < npages; i++)
- __clear_bit(free_entry+i, tbl->it_map);
+
+ free_iova(&tbl->iovad, free_entry);
}
static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -309,7 +220,7 @@ int iommu_map_sg(struct iommu_table *tbl, struct scatterlist *sglist,
/* Allocate iommu entries for that segment */
vaddr = (unsigned long) sg_virt(s);
npages = iommu_num_pages(vaddr, slen);
- entry = iommu_range_alloc(tbl, npages, &handle, mask >> IOMMU_PAGE_SHIFT, 0);
+ entry = iommu_range_alloc(tbl, npages, mask >> IOMMU_PAGE_SHIFT, 0);
DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen);
@@ -439,34 +350,28 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
spin_unlock_irqrestore(&(tbl->it_lock), flags);
}
+static struct kmem_cache *iova_cachep;
+
/*
* Build a iommu_table structure. This contains a bit map which
* is used to manage allocation of the tce space.
*/
struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
{
- unsigned long sz;
unsigned long start_index, end_index;
unsigned long entries_per_4g;
unsigned long index;
static int welcomed = 0;
- struct page *page;
-
- /* Set aside 1/4 of the table for large allocations. */
- tbl->it_halfpoint = tbl->it_size * 3 / 4;
- /* number of bytes needed for the bitmap */
- sz = (tbl->it_size + 7) >> 3;
-
- page = alloc_pages_node(nid, GFP_ATOMIC, get_order(sz));
- if (!page)
- panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
- tbl->it_map = page_address(page);
- memset(tbl->it_map, 0, sz);
+ if (!iova_cachep) {
+ iova_cachep = KMEM_CACHE(iova, 0);
+ if (!iova_cachep)
+ return NULL;
+ }
- tbl->it_hint = 0;
- tbl->it_largehint = tbl->it_halfpoint;
spin_lock_init(&tbl->it_lock);
+ init_iova_domain(&tbl->iovad, DMA_32BIT_MASK >> IOMMU_PAGE_SHIFT,
+ iova_cachep);
#ifdef CONFIG_CRASH_DUMP
if (ppc_md.tce_get) {
@@ -482,7 +387,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
* Freed TCE entry contains 0x7fffffffffffffff on JS20
*/
if (tceval && (tceval != 0x7fffffffffffffffUL)) {
- __set_bit(index, tbl->it_map);
+ free_iova(&tbl->iovad, index);
tcecount++;
}
}
@@ -492,7 +397,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
KDUMP_MIN_TCE_ENTRIES);
for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES;
index < tbl->it_size; index++)
- __clear_bit(index, tbl->it_map);
+ free_iova(&tbl->iovad, index);
}
}
#else
@@ -514,7 +419,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
end_index = tbl->it_size;
for (index = start_index; index < end_index - 1; index += entries_per_4g)
- __set_bit(index, tbl->it_map);
+ free_iova(&tbl->iovad, index);
}
if (!welcomed) {
@@ -530,31 +435,15 @@ void iommu_free_table(struct device_node *dn)
{
struct pci_dn *pdn = dn->data;
struct iommu_table *tbl = pdn->iommu_table;
- unsigned long bitmap_sz, i;
- unsigned int order;
- if (!tbl || !tbl->it_map) {
+ if (!tbl) {
printk(KERN_ERR "%s: expected TCE map for %s\n", __FUNCTION__,
dn->full_name);
return;
}
/* verify that table contains no entries */
- /* it_size is in entries, and we're examining 64 at a time */
- for (i = 0; i < (tbl->it_size/64); i++) {
- if (tbl->it_map[i] != 0) {
- printk(KERN_WARNING "%s: Unexpected TCEs for %s\n",
- __FUNCTION__, dn->full_name);
- break;
- }
- }
-
- /* calculate bitmap size in bytes */
- bitmap_sz = (tbl->it_size + 7) / 8;
-
- /* free bitmap */
- order = get_order(bitmap_sz);
- free_pages((unsigned long) tbl->it_map, order);
+ put_iova_domain(&tbl->iovad);
/* free table */
kfree(tbl);
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 99684ea..22483e5 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
config PPC64
bool "64-bit kernel"
default n
+ select IOVA
help
This option selects whether a 32-bit or a 64-bit kernel
will be built.
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index faabc3f..799e8e5 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -471,10 +471,9 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
* This code also assumes that we have a window that starts at 0,
* which is the case on all spider based blades.
*/
- __set_bit(0, window->table.it_map);
+ reserve_iova(&window->table.iovad, 0, 0);
tce_build_cell(&window->table, window->table.it_offset, 1,
(unsigned long)iommu->pad_page, DMA_TO_DEVICE);
- window->table.it_hint = window->table.it_blocksize;
return window;
}
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index e0e24b0..6c58ef5 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -37,6 +37,7 @@
#include <linux/dma-mapping.h>
#include <linux/vmalloc.h>
#include <linux/suspend.h>
+#include <linux/iova.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/iommu.h>
@@ -287,7 +288,8 @@ static void iommu_table_dart_setup(void)
/* Reserve the last page of the DART to avoid possible prefetch
* past the DART mapped area
*/
- set_bit(iommu_table_dart.it_size - 1, iommu_table_dart.it_map);
+ reserve_iova(&iommu_table_dart.iovad, iommu_table_dart.it_size - 1,
+ iommu_table_dart.it_size - 1);
}
static void pci_dma_dev_setup_dart(struct pci_dev *dev)
diff --git a/include/asm-powerpc/iommu.h b/include/asm-powerpc/iommu.h
index 4a82fdc..f497e7e 100644
--- a/include/asm-powerpc/iommu.h
+++ b/include/asm-powerpc/iommu.h
@@ -27,6 +27,7 @@
#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/bitops.h>
+#include <linux/iova.h>
#include <asm/machdep.h>
#include <asm/types.h>
@@ -61,11 +62,8 @@ struct iommu_table {
unsigned long it_index; /* which iommu table this is */
unsigned long it_type; /* type: PCI or Virtual Bus */
unsigned long it_blocksize; /* Entries in each block (cacheline) */
- unsigned long it_hint; /* Hint for next alloc */
- unsigned long it_largehint; /* Hint for large allocs */
- unsigned long it_halfpoint; /* Breaking point for small/large allocs */
spinlock_t it_lock; /* Protects it_map */
- unsigned long *it_map; /* A simple allocation bitmap for now */
+ struct iova_domain iovad;
};
struct scatterlist;
--
1.5.2.4
iova could be used by several IOMMUs. This patch just moves iova from
drivers/pci/ to lib/ and fixes the appropriate Makefile and Kconfig
files.
Signed-off-by: FUJITA Tomonori <[email protected]>
---
arch/x86/Kconfig.x86_64 | 1 +
drivers/pci/Makefile | 2 +-
drivers/pci/intel-iommu.c | 1 -
drivers/pci/intel-iommu.h | 2 +-
drivers/pci/iova.c | 394 ---------------------------------------------
drivers/pci/iova.h | 51 ------
include/linux/iova.h | 51 ++++++
lib/Kconfig | 3 +
lib/Makefile | 2 +
lib/iova.c | 394 +++++++++++++++++++++++++++++++++++++++++++++
10 files changed, 453 insertions(+), 448 deletions(-)
delete mode 100644 drivers/pci/iova.c
delete mode 100644 drivers/pci/iova.h
create mode 100644 include/linux/iova.h
create mode 100644 lib/iova.c
diff --git a/arch/x86/Kconfig.x86_64 b/arch/x86/Kconfig.x86_64
index cc468ea..c10d3f0 100644
--- a/arch/x86/Kconfig.x86_64
+++ b/arch/x86/Kconfig.x86_64
@@ -749,6 +749,7 @@ config PCI_DOMAINS
config DMAR
bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
depends on PCI_MSI && ACPI && EXPERIMENTAL
+ select IOVA
help
DMA remapping (DMAR) devices support enables independent address
translations for Direct Memory Access (DMA) from devices.
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 5550556..00c3428 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -21,7 +21,7 @@ obj-$(CONFIG_PCI_MSI) += msi.o
obj-$(CONFIG_HT_IRQ) += htirq.o
# Build Intel IOMMU support
-obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
+obj-$(CONFIG_DMAR) += dmar.o intel-iommu.o
#
# Some architectures use the generic PCI setup functions
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index dc1edbd..1a23b4c 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -30,7 +30,6 @@
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
-#include "iova.h"
#include "intel-iommu.h"
#include <asm/proto.h> /* force_iommu in this header in x86-64*/
#include <asm/cacheflush.h>
diff --git a/drivers/pci/intel-iommu.h b/drivers/pci/intel-iommu.h
index f54efd1..750e1b5 100644
--- a/drivers/pci/intel-iommu.h
+++ b/drivers/pci/intel-iommu.h
@@ -23,7 +23,7 @@
#include <linux/types.h>
#include <linux/msi.h>
-#include "iova.h"
+#include <linux/iova.h>
#include <linux/io.h>
/*
diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c
deleted file mode 100644
index 8de7ab6..0000000
--- a/drivers/pci/iova.c
+++ /dev/null
@@ -1,394 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This file is released under the GPLv2.
- *
- * Copyright (C) 2006 Anil S Keshavamurthy <[email protected]>
- */
-
-#include "iova.h"
-
-void
-init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
-{
- spin_lock_init(&iovad->iova_alloc_lock);
- spin_lock_init(&iovad->iova_rbtree_lock);
- iovad->rbroot = RB_ROOT;
- iovad->cached32_node = NULL;
- iovad->dma_32bit_pfn = pfn_32bit;
-}
-
-static struct rb_node *
-__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
-{
- if ((*limit_pfn != iovad->dma_32bit_pfn) ||
- (iovad->cached32_node == NULL))
- return rb_last(&iovad->rbroot);
- else {
- struct rb_node *prev_node = rb_prev(iovad->cached32_node);
- struct iova *curr_iova =
- container_of(iovad->cached32_node, struct iova, node);
- *limit_pfn = curr_iova->pfn_lo - 1;
- return prev_node;
- }
-}
-
-static void
-__cached_rbnode_insert_update(struct iova_domain *iovad,
- unsigned long limit_pfn, struct iova *new)
-{
- if (limit_pfn != iovad->dma_32bit_pfn)
- return;
- iovad->cached32_node = &new->node;
-}
-
-static void
-__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
-{
- struct iova *cached_iova;
- struct rb_node *curr;
-
- if (!iovad->cached32_node)
- return;
- curr = iovad->cached32_node;
- cached_iova = container_of(curr, struct iova, node);
-
- if (free->pfn_lo >= cached_iova->pfn_lo)
- iovad->cached32_node = rb_next(&free->node);
-}
-
-/* Computes the padding size required, to make the
- * the start address naturally aligned on its size
- */
-static int
-iova_get_pad_size(int size, unsigned int limit_pfn)
-{
- unsigned int pad_size = 0;
- unsigned int order = ilog2(size);
-
- if (order)
- pad_size = (limit_pfn + 1) % (1 << order);
-
- return pad_size;
-}
-
-static int __alloc_iova_range(struct iova_domain *iovad, unsigned long size,
- unsigned long limit_pfn, struct iova *new, bool size_aligned)
-{
- struct rb_node *curr = NULL;
- unsigned long flags;
- unsigned long saved_pfn;
- unsigned int pad_size = 0;
-
- /* Walk the tree backwards */
- spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- saved_pfn = limit_pfn;
- curr = __get_cached_rbnode(iovad, &limit_pfn);
- while (curr) {
- struct iova *curr_iova = container_of(curr, struct iova, node);
- if (limit_pfn < curr_iova->pfn_lo)
- goto move_left;
- else if (limit_pfn < curr_iova->pfn_hi)
- goto adjust_limit_pfn;
- else {
- if (size_aligned)
- pad_size = iova_get_pad_size(size, limit_pfn);
- if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn)
- break; /* found a free slot */
- }
-adjust_limit_pfn:
- limit_pfn = curr_iova->pfn_lo - 1;
-move_left:
- curr = rb_prev(curr);
- }
-
- if (!curr) {
- if (size_aligned)
- pad_size = iova_get_pad_size(size, limit_pfn);
- if ((IOVA_START_PFN + size + pad_size) > limit_pfn) {
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- return -ENOMEM;
- }
- }
-
- /* pfn_lo will point to size aligned address if size_aligned is set */
- new->pfn_lo = limit_pfn - (size + pad_size) + 1;
- new->pfn_hi = new->pfn_lo + size - 1;
-
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- return 0;
-}
-
-static void
-iova_insert_rbtree(struct rb_root *root, struct iova *iova)
-{
- struct rb_node **new = &(root->rb_node), *parent = NULL;
- /* Figure out where to put new node */
- while (*new) {
- struct iova *this = container_of(*new, struct iova, node);
- parent = *new;
-
- if (iova->pfn_lo < this->pfn_lo)
- new = &((*new)->rb_left);
- else if (iova->pfn_lo > this->pfn_lo)
- new = &((*new)->rb_right);
- else
- BUG(); /* this should not happen */
- }
- /* Add new node and rebalance tree. */
- rb_link_node(&iova->node, parent, new);
- rb_insert_color(&iova->node, root);
-}
-
-/**
- * alloc_iova - allocates an iova
- * @iovad - iova domain in question
- * @size - size of page frames to allocate
- * @limit_pfn - max limit address
- * @size_aligned - set if size_aligned address range is required
- * This function allocates an iova in the range limit_pfn to IOVA_START_PFN
- * looking from limit_pfn instead from IOVA_START_PFN. If the size_aligned
- * flag is set then the allocated address iova->pfn_lo will be naturally
- * aligned on roundup_power_of_two(size).
- */
-struct iova *
-alloc_iova(struct iova_domain *iovad, unsigned long size,
- unsigned long limit_pfn,
- bool size_aligned)
-{
- unsigned long flags;
- struct iova *new_iova;
- int ret;
-
- new_iova = alloc_iova_mem();
- if (!new_iova)
- return NULL;
-
- /* If size aligned is set then round the size to
- * to next power of two.
- */
- if (size_aligned)
- size = __roundup_pow_of_two(size);
-
- spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
- ret = __alloc_iova_range(iovad, size, limit_pfn, new_iova,
- size_aligned);
-
- if (ret) {
- spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
- free_iova_mem(new_iova);
- return NULL;
- }
-
- /* Insert the new_iova into domain rbtree by holding writer lock */
- spin_lock(&iovad->iova_rbtree_lock);
- iova_insert_rbtree(&iovad->rbroot, new_iova);
- __cached_rbnode_insert_update(iovad, limit_pfn, new_iova);
- spin_unlock(&iovad->iova_rbtree_lock);
-
- spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
-
- return new_iova;
-}
-
-/**
- * find_iova - find's an iova for a given pfn
- * @iovad - iova domain in question.
- * pfn - page frame number
- * This function finds and returns an iova belonging to the
- * given doamin which matches the given pfn.
- */
-struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
-{
- unsigned long flags;
- struct rb_node *node;
-
- /* Take the lock so that no other thread is manipulating the rbtree */
- spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- node = iovad->rbroot.rb_node;
- while (node) {
- struct iova *iova = container_of(node, struct iova, node);
-
- /* If pfn falls within iova's range, return iova */
- if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- /* We are not holding the lock while this iova
- * is referenced by the caller as the same thread
- * which called this function also calls __free_iova()
- * and it is by desing that only one thread can possibly
- * reference a particular iova and hence no conflict.
- */
- return iova;
- }
-
- if (pfn < iova->pfn_lo)
- node = node->rb_left;
- else if (pfn > iova->pfn_lo)
- node = node->rb_right;
- }
-
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- return NULL;
-}
-
-/**
- * __free_iova - frees the given iova
- * @iovad: iova domain in question.
- * @iova: iova in question.
- * Frees the given iova belonging to the giving domain
- */
-void
-__free_iova(struct iova_domain *iovad, struct iova *iova)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- __cached_rbnode_delete_update(iovad, iova);
- rb_erase(&iova->node, &iovad->rbroot);
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- free_iova_mem(iova);
-}
-
-/**
- * free_iova - finds and frees the iova for a given pfn
- * @iovad: - iova domain in question.
- * @pfn: - pfn that is allocated previously
- * This functions finds an iova for a given pfn and then
- * frees the iova from that domain.
- */
-void
-free_iova(struct iova_domain *iovad, unsigned long pfn)
-{
- struct iova *iova = find_iova(iovad, pfn);
- if (iova)
- __free_iova(iovad, iova);
-
-}
-
-/**
- * put_iova_domain - destroys the iova doamin
- * @iovad: - iova domain in question.
- * All the iova's in that domain are destroyed.
- */
-void put_iova_domain(struct iova_domain *iovad)
-{
- struct rb_node *node;
- unsigned long flags;
-
- spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- node = rb_first(&iovad->rbroot);
- while (node) {
- struct iova *iova = container_of(node, struct iova, node);
- rb_erase(node, &iovad->rbroot);
- free_iova_mem(iova);
- node = rb_first(&iovad->rbroot);
- }
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-}
-
-static int
-__is_range_overlap(struct rb_node *node,
- unsigned long pfn_lo, unsigned long pfn_hi)
-{
- struct iova *iova = container_of(node, struct iova, node);
-
- if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
- return 1;
- return 0;
-}
-
-static struct iova *
-__insert_new_range(struct iova_domain *iovad,
- unsigned long pfn_lo, unsigned long pfn_hi)
-{
- struct iova *iova;
-
- iova = alloc_iova_mem();
- if (!iova)
- return iova;
-
- iova->pfn_hi = pfn_hi;
- iova->pfn_lo = pfn_lo;
- iova_insert_rbtree(&iovad->rbroot, iova);
- return iova;
-}
-
-static void
-__adjust_overlap_range(struct iova *iova,
- unsigned long *pfn_lo, unsigned long *pfn_hi)
-{
- if (*pfn_lo < iova->pfn_lo)
- iova->pfn_lo = *pfn_lo;
- if (*pfn_hi > iova->pfn_hi)
- *pfn_lo = iova->pfn_hi + 1;
-}
-
-/**
- * reserve_iova - reserves an iova in the given range
- * @iovad: - iova domain pointer
- * @pfn_lo: - lower page frame address
- * @pfn_hi:- higher pfn adderss
- * This function allocates reserves the address range from pfn_lo to pfn_hi so
- * that this address is not dished out as part of alloc_iova.
- */
-struct iova *
-reserve_iova(struct iova_domain *iovad,
- unsigned long pfn_lo, unsigned long pfn_hi)
-{
- struct rb_node *node;
- unsigned long flags;
- struct iova *iova;
- unsigned int overlap = 0;
-
- spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
- spin_lock(&iovad->iova_rbtree_lock);
- for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
- if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
- iova = container_of(node, struct iova, node);
- __adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
- if ((pfn_lo >= iova->pfn_lo) &&
- (pfn_hi <= iova->pfn_hi))
- goto finish;
- overlap = 1;
-
- } else if (overlap)
- break;
- }
-
- /* We are here either becasue this is the first reserver node
- * or need to insert remaining non overlap addr range
- */
- iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
-finish:
-
- spin_unlock(&iovad->iova_rbtree_lock);
- spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
- return iova;
-}
-
-/**
- * copy_reserved_iova - copies the reserved between domains
- * @from: - source doamin from where to copy
- * @to: - destination domin where to copy
- * This function copies reserved iova's from one doamin to
- * other.
- */
-void
-copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
-{
- unsigned long flags;
- struct rb_node *node;
-
- spin_lock_irqsave(&from->iova_alloc_lock, flags);
- spin_lock(&from->iova_rbtree_lock);
- for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
- struct iova *iova = container_of(node, struct iova, node);
- struct iova *new_iova;
- new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
- if (!new_iova)
- printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
- iova->pfn_lo, iova->pfn_lo);
- }
- spin_unlock(&from->iova_rbtree_lock);
- spin_unlock_irqrestore(&from->iova_alloc_lock, flags);
-}
diff --git a/drivers/pci/iova.h b/drivers/pci/iova.h
deleted file mode 100644
index d521b5b..0000000
--- a/drivers/pci/iova.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This file is released under the GPLv2.
- *
- * Copyright (C) 2006 Anil S Keshavamurthy <[email protected]>
- *
- */
-
-#ifndef _IOVA_H_
-#define _IOVA_H_
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/rbtree.h>
-#include <linux/dma-mapping.h>
-
-/* IO virtual address start page frame number */
-#define IOVA_START_PFN (1)
-
-/* iova structure */
-struct iova {
- struct rb_node node;
- unsigned long pfn_hi; /* IOMMU dish out addr hi */
- unsigned long pfn_lo; /* IOMMU dish out addr lo */
-};
-
-/* holds all the iova translations for a domain */
-struct iova_domain {
- spinlock_t iova_alloc_lock;/* Lock to protect iova allocation */
- spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */
- struct rb_root rbroot; /* iova domain rbtree root */
- struct rb_node *cached32_node; /* Save last alloced node */
- unsigned long dma_32bit_pfn;
-};
-
-struct iova *alloc_iova_mem(void);
-void free_iova_mem(struct iova *iova);
-void free_iova(struct iova_domain *iovad, unsigned long pfn);
-void __free_iova(struct iova_domain *iovad, struct iova *iova);
-struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
- unsigned long limit_pfn,
- bool size_aligned);
-struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
- unsigned long pfn_hi);
-void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
-void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit);
-struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
-void put_iova_domain(struct iova_domain *iovad);
-
-#endif
diff --git a/include/linux/iova.h b/include/linux/iova.h
new file mode 100644
index 0000000..d521b5b
--- /dev/null
+++ b/include/linux/iova.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This file is released under the GPLv2.
+ *
+ * Copyright (C) 2006 Anil S Keshavamurthy <[email protected]>
+ *
+ */
+
+#ifndef _IOVA_H_
+#define _IOVA_H_
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/rbtree.h>
+#include <linux/dma-mapping.h>
+
+/* IO virtual address start page frame number */
+#define IOVA_START_PFN (1)
+
+/* iova structure */
+struct iova {
+ struct rb_node node;
+ unsigned long pfn_hi; /* IOMMU dish out addr hi */
+ unsigned long pfn_lo; /* IOMMU dish out addr lo */
+};
+
+/* holds all the iova translations for a domain */
+struct iova_domain {
+ spinlock_t iova_alloc_lock;/* Lock to protect iova allocation */
+ spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */
+ struct rb_root rbroot; /* iova domain rbtree root */
+ struct rb_node *cached32_node; /* Save last alloced node */
+ unsigned long dma_32bit_pfn;
+};
+
+struct iova *alloc_iova_mem(void);
+void free_iova_mem(struct iova *iova);
+void free_iova(struct iova_domain *iovad, unsigned long pfn);
+void __free_iova(struct iova_domain *iovad, struct iova *iova);
+struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
+ unsigned long limit_pfn,
+ bool size_aligned);
+struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
+ unsigned long pfn_hi);
+void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
+void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit);
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
+void put_iova_domain(struct iova_domain *iovad);
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index ba3d104..4107547 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -141,4 +141,7 @@ config HAS_DMA
config CHECK_SIGNATURE
bool
+config IOVA
+ boolean
+
endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 3a0983b..7de33cc 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -66,6 +66,8 @@ obj-$(CONFIG_AUDIT_GENERIC) += audit.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
+obj-$(CONFIG_IOVA) += iova.o
+
lib-$(CONFIG_GENERIC_BUG) += bug.o
hostprogs-y := gen_crc32table
diff --git a/lib/iova.c b/lib/iova.c
new file mode 100644
index 0000000..6e14a3f
--- /dev/null
+++ b/lib/iova.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This file is released under the GPLv2.
+ *
+ * Copyright (C) 2006 Anil S Keshavamurthy <[email protected]>
+ */
+
+#include <linux/iova.h>
+
+void
+init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
+{
+ spin_lock_init(&iovad->iova_alloc_lock);
+ spin_lock_init(&iovad->iova_rbtree_lock);
+ iovad->rbroot = RB_ROOT;
+ iovad->cached32_node = NULL;
+ iovad->dma_32bit_pfn = pfn_32bit;
+}
+
+static struct rb_node *
+__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
+{
+ if ((*limit_pfn != iovad->dma_32bit_pfn) ||
+ (iovad->cached32_node == NULL))
+ return rb_last(&iovad->rbroot);
+ else {
+ struct rb_node *prev_node = rb_prev(iovad->cached32_node);
+ struct iova *curr_iova =
+ container_of(iovad->cached32_node, struct iova, node);
+ *limit_pfn = curr_iova->pfn_lo - 1;
+ return prev_node;
+ }
+}
+
+static void
+__cached_rbnode_insert_update(struct iova_domain *iovad,
+ unsigned long limit_pfn, struct iova *new)
+{
+ if (limit_pfn != iovad->dma_32bit_pfn)
+ return;
+ iovad->cached32_node = &new->node;
+}
+
+static void
+__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
+{
+ struct iova *cached_iova;
+ struct rb_node *curr;
+
+ if (!iovad->cached32_node)
+ return;
+ curr = iovad->cached32_node;
+ cached_iova = container_of(curr, struct iova, node);
+
+ if (free->pfn_lo >= cached_iova->pfn_lo)
+ iovad->cached32_node = rb_next(&free->node);
+}
+
+/* Computes the padding size required, to make the
+ * the start address naturally aligned on its size
+ */
+static int
+iova_get_pad_size(int size, unsigned int limit_pfn)
+{
+ unsigned int pad_size = 0;
+ unsigned int order = ilog2(size);
+
+ if (order)
+ pad_size = (limit_pfn + 1) % (1 << order);
+
+ return pad_size;
+}
+
+static int __alloc_iova_range(struct iova_domain *iovad, unsigned long size,
+ unsigned long limit_pfn, struct iova *new, bool size_aligned)
+{
+ struct rb_node *curr = NULL;
+ unsigned long flags;
+ unsigned long saved_pfn;
+ unsigned int pad_size = 0;
+
+ /* Walk the tree backwards */
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ saved_pfn = limit_pfn;
+ curr = __get_cached_rbnode(iovad, &limit_pfn);
+ while (curr) {
+ struct iova *curr_iova = container_of(curr, struct iova, node);
+ if (limit_pfn < curr_iova->pfn_lo)
+ goto move_left;
+ else if (limit_pfn < curr_iova->pfn_hi)
+ goto adjust_limit_pfn;
+ else {
+ if (size_aligned)
+ pad_size = iova_get_pad_size(size, limit_pfn);
+ if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn)
+ break; /* found a free slot */
+ }
+adjust_limit_pfn:
+ limit_pfn = curr_iova->pfn_lo - 1;
+move_left:
+ curr = rb_prev(curr);
+ }
+
+ if (!curr) {
+ if (size_aligned)
+ pad_size = iova_get_pad_size(size, limit_pfn);
+ if ((IOVA_START_PFN + size + pad_size) > limit_pfn) {
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return -ENOMEM;
+ }
+ }
+
+ /* pfn_lo will point to size aligned address if size_aligned is set */
+ new->pfn_lo = limit_pfn - (size + pad_size) + 1;
+ new->pfn_hi = new->pfn_lo + size - 1;
+
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return 0;
+}
+
+static void
+iova_insert_rbtree(struct rb_root *root, struct iova *iova)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ /* Figure out where to put new node */
+ while (*new) {
+ struct iova *this = container_of(*new, struct iova, node);
+ parent = *new;
+
+ if (iova->pfn_lo < this->pfn_lo)
+ new = &((*new)->rb_left);
+ else if (iova->pfn_lo > this->pfn_lo)
+ new = &((*new)->rb_right);
+ else
+ BUG(); /* this should not happen */
+ }
+ /* Add new node and rebalance tree. */
+ rb_link_node(&iova->node, parent, new);
+ rb_insert_color(&iova->node, root);
+}
+
+/**
+ * alloc_iova - allocates an iova
+ * @iovad - iova domain in question
+ * @size - size of page frames to allocate
+ * @limit_pfn - max limit address
+ * @size_aligned - set if size_aligned address range is required
+ * This function allocates an iova in the range limit_pfn to IOVA_START_PFN
+ * looking from limit_pfn instead from IOVA_START_PFN. If the size_aligned
+ * flag is set then the allocated address iova->pfn_lo will be naturally
+ * aligned on roundup_power_of_two(size).
+ */
+struct iova *
+alloc_iova(struct iova_domain *iovad, unsigned long size,
+ unsigned long limit_pfn,
+ bool size_aligned)
+{
+ unsigned long flags;
+ struct iova *new_iova;
+ int ret;
+
+ new_iova = alloc_iova_mem();
+ if (!new_iova)
+ return NULL;
+
+ /* If size aligned is set then round the size to
+ * to next power of two.
+ */
+ if (size_aligned)
+ size = __roundup_pow_of_two(size);
+
+ spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
+ ret = __alloc_iova_range(iovad, size, limit_pfn, new_iova,
+ size_aligned);
+
+ if (ret) {
+ spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
+ free_iova_mem(new_iova);
+ return NULL;
+ }
+
+ /* Insert the new_iova into domain rbtree by holding writer lock */
+ spin_lock(&iovad->iova_rbtree_lock);
+ iova_insert_rbtree(&iovad->rbroot, new_iova);
+ __cached_rbnode_insert_update(iovad, limit_pfn, new_iova);
+ spin_unlock(&iovad->iova_rbtree_lock);
+
+ spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
+
+ return new_iova;
+}
+
+/**
+ * find_iova - find's an iova for a given pfn
+ * @iovad - iova domain in question.
+ * pfn - page frame number
+ * This function finds and returns an iova belonging to the
+ * given doamin which matches the given pfn.
+ */
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+ unsigned long flags;
+ struct rb_node *node;
+
+ /* Take the lock so that no other thread is manipulating the rbtree */
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ node = iovad->rbroot.rb_node;
+ while (node) {
+ struct iova *iova = container_of(node, struct iova, node);
+
+ /* If pfn falls within iova's range, return iova */
+ if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ /* We are not holding the lock while this iova
+ * is referenced by the caller as the same thread
+ * which called this function also calls __free_iova()
+ * and it is by desing that only one thread can possibly
+ * reference a particular iova and hence no conflict.
+ */
+ return iova;
+ }
+
+ if (pfn < iova->pfn_lo)
+ node = node->rb_left;
+ else if (pfn > iova->pfn_lo)
+ node = node->rb_right;
+ }
+
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return NULL;
+}
+
+/**
+ * __free_iova - frees the given iova
+ * @iovad: iova domain in question.
+ * @iova: iova in question.
+ * Frees the given iova belonging to the giving domain
+ */
+void
+__free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ __cached_rbnode_delete_update(iovad, iova);
+ rb_erase(&iova->node, &iovad->rbroot);
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ free_iova_mem(iova);
+}
+
+/**
+ * free_iova - finds and frees the iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - pfn that is allocated previously
+ * This functions finds an iova for a given pfn and then
+ * frees the iova from that domain.
+ */
+void
+free_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+ struct iova *iova = find_iova(iovad, pfn);
+ if (iova)
+ __free_iova(iovad, iova);
+
+}
+
+/**
+ * put_iova_domain - destroys the iova doamin
+ * @iovad: - iova domain in question.
+ * All the iova's in that domain are destroyed.
+ */
+void put_iova_domain(struct iova_domain *iovad)
+{
+ struct rb_node *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ node = rb_first(&iovad->rbroot);
+ while (node) {
+ struct iova *iova = container_of(node, struct iova, node);
+ rb_erase(node, &iovad->rbroot);
+ free_iova_mem(iova);
+ node = rb_first(&iovad->rbroot);
+ }
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+}
+
+static int
+__is_range_overlap(struct rb_node *node,
+ unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ struct iova *iova = container_of(node, struct iova, node);
+
+ if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
+ return 1;
+ return 0;
+}
+
+static struct iova *
+__insert_new_range(struct iova_domain *iovad,
+ unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ struct iova *iova;
+
+ iova = alloc_iova_mem();
+ if (!iova)
+ return iova;
+
+ iova->pfn_hi = pfn_hi;
+ iova->pfn_lo = pfn_lo;
+ iova_insert_rbtree(&iovad->rbroot, iova);
+ return iova;
+}
+
+static void
+__adjust_overlap_range(struct iova *iova,
+ unsigned long *pfn_lo, unsigned long *pfn_hi)
+{
+ if (*pfn_lo < iova->pfn_lo)
+ iova->pfn_lo = *pfn_lo;
+ if (*pfn_hi > iova->pfn_hi)
+ *pfn_lo = iova->pfn_hi + 1;
+}
+
+/**
+ * reserve_iova - reserves an iova in the given range
+ * @iovad: - iova domain pointer
+ * @pfn_lo: - lower page frame address
+ * @pfn_hi:- higher pfn adderss
+ * This function allocates reserves the address range from pfn_lo to pfn_hi so
+ * that this address is not dished out as part of alloc_iova.
+ */
+struct iova *
+reserve_iova(struct iova_domain *iovad,
+ unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ struct rb_node *node;
+ unsigned long flags;
+ struct iova *iova;
+ unsigned int overlap = 0;
+
+ spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
+ spin_lock(&iovad->iova_rbtree_lock);
+ for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
+ if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
+ iova = container_of(node, struct iova, node);
+ __adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
+ if ((pfn_lo >= iova->pfn_lo) &&
+ (pfn_hi <= iova->pfn_hi))
+ goto finish;
+ overlap = 1;
+
+ } else if (overlap)
+ break;
+ }
+
+ /* We are here either becasue this is the first reserver node
+ * or need to insert remaining non overlap addr range
+ */
+ iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
+finish:
+
+ spin_unlock(&iovad->iova_rbtree_lock);
+ spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
+ return iova;
+}
+
+/**
+ * copy_reserved_iova - copies the reserved between domains
+ * @from: - source doamin from where to copy
+ * @to: - destination domin where to copy
+ * This function copies reserved iova's from one doamin to
+ * other.
+ */
+void
+copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
+{
+ unsigned long flags;
+ struct rb_node *node;
+
+ spin_lock_irqsave(&from->iova_alloc_lock, flags);
+ spin_lock(&from->iova_rbtree_lock);
+ for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
+ struct iova *iova = container_of(node, struct iova, node);
+ struct iova *new_iova;
+ new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
+ if (!new_iova)
+ printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
+ iova->pfn_lo, iova->pfn_lo);
+ }
+ spin_unlock(&from->iova_rbtree_lock);
+ spin_unlock_irqrestore(&from->iova_alloc_lock, flags);
+}
--
1.5.2.4
On Sat, Nov 03, 2007 at 02:05:39AM +0900, FUJITA Tomonori wrote:
> This patchset convert the PPC64 IOMMU to use the iova code for free
> area management.
>
> The IOMMUs ignores low level drivers' restrictions, the maximum
> segment size and segment boundary.
>
> I fixed the former:
>
> http://thread.gmane.org/gmane.linux.scsi/35602
>
> The latter makes the free area management complicated. I'd like to
> convert IOMMUs to use the iova code (that intel-iommu introduced)
> for free area management and enable iova to handle segment boundary
> restrictions, rather than fixing all the IOMMUs' free area
> management,
In general it sounds like a great idea, but have you looked at what
impact this has on the performance of the IO path?
Cheers,
Muli
On Fri, 2 Nov 2007 19:12:27 +0200
Muli Ben-Yehuda <[email protected]> wrote:
> On Sat, Nov 03, 2007 at 02:05:39AM +0900, FUJITA Tomonori wrote:
>
> > This patchset convert the PPC64 IOMMU to use the iova code for free
> > area management.
> >
> > The IOMMUs ignores low level drivers' restrictions, the maximum
> > segment size and segment boundary.
> >
> > I fixed the former:
> >
> > http://thread.gmane.org/gmane.linux.scsi/35602
> >
> > The latter makes the free area management complicated. I'd like to
> > convert IOMMUs to use the iova code (that intel-iommu introduced)
> > for free area management and enable iova to handle segment boundary
> > restrictions, rather than fixing all the IOMMUs' free area
> > management,
>
> In general it sounds like a great idea, but have you looked at what
> impact this has on the performance of the IO path?
Not yet. And I have access to only an entry-class pSeries server so I
would appreciate it if POWERPC people try the patchset with high-end
servers.
The patchset doesn't use the tricks that the current POWERPC IOMMU use
to improve performance (Anton and Olof told me about them, Thanks!)
. So I might need to modify iova for them.
I think that it's difficult to use some tricks used in the IOMMUs with
the iova even if we modify the iova. If the iova degrades performance,
I think that it would be better to have new IOMMU library code to use
simply a bit map that most of the IOMMUs use.
Another possible factor to degrade performance is that we can't
allocate areas for map_sg at one stroke like some of the IOMMUs do.
We need to allocate an area per sg segment to handle segment boundary.
If it's unacceptable, I guess that there is only one way, LLDs
allocate many sg entries and then the IOMMU splits the sg entryes.
On Fri, 2 Nov 2007 19:12:27 +0200
Muli Ben-Yehuda <[email protected]> wrote:
> On Sat, Nov 03, 2007 at 02:05:39AM +0900, FUJITA Tomonori wrote:
>
> > This patchset convert the PPC64 IOMMU to use the iova code for free
> > area management.
> >
> > The IOMMUs ignores low level drivers' restrictions, the maximum
> > segment size and segment boundary.
> >
> > I fixed the former:
> >
> > http://thread.gmane.org/gmane.linux.scsi/35602
> >
> > The latter makes the free area management complicated. I'd like to
> > convert IOMMUs to use the iova code (that intel-iommu introduced)
> > for free area management and enable iova to handle segment boundary
> > restrictions, rather than fixing all the IOMMUs' free area
> > management,
>
> In general it sounds like a great idea, but have you looked at what
> impact this has on the performance of the IO path?
I converted swiotlb to use iova and compared it with the original
algorithm (better than the simple bit map one that most of the IOMMUs
use, I think).
I use 'swiotlb=force' boot option and run netperf with "-m 128 -D"
(lead to tons of dma_map_single).
The original produced 281.8 Mb/s and the iova produced 77.2 Mb/s.
Seems that it would be better to generalization the swiotlb algorithm
(at least for small I/O area)? Or my patch might have bugs.
Here's the patch to convert swiotlb to use iova against my iova
patchset:
http://marc.info/?l=linux-kernel&m=119402340801254&w=2
diff --git a/arch/x86/Kconfig.x86_64 b/arch/x86/Kconfig.x86_64
index c10d3f0..8735822 100644
--- a/arch/x86/Kconfig.x86_64
+++ b/arch/x86/Kconfig.x86_64
@@ -524,6 +524,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
bool
+ select IOVA
help
Support for software bounce buffers used on x86-64 systems
which don't have a hardware IOMMU (e.g. the current generation
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
index aa805b1..8507402 100644
--- a/arch/x86/kernel/pci-dma_64.c
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -325,6 +325,9 @@ static int __init pci_iommu_init(void)
gart_iommu_init();
#endif
+#ifdef CONFIG_SWIOTLB
+ swiotlb_alloc();
+#endif
no_iommu_init();
return 0;
}
diff --git a/include/asm-x86/swiotlb.h b/include/asm-x86/swiotlb.h
index f9c5895..f00d20c 100644
--- a/include/asm-x86/swiotlb.h
+++ b/include/asm-x86/swiotlb.h
@@ -40,6 +40,7 @@ extern void swiotlb_free_coherent (struct device *hwdev, size_t size,
void *vaddr, dma_addr_t dma_handle);
extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
extern void swiotlb_init(void);
+extern void swiotlb_alloc(void);
extern int swiotlb_force;
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 1a8050a..54ecb87 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -24,6 +24,7 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ctype.h>
+#include <linux/iova.h>
#include <asm/io.h>
#include <asm/dma.h>
@@ -103,10 +104,7 @@ static unsigned int io_tlb_index;
*/
static unsigned char **io_tlb_orig_addr;
-/*
- * Protect the above data structures in the map and unmap calls
- */
-static DEFINE_SPINLOCK(io_tlb_lock);
+static struct iova_domain swiotlb_iovad;
static int __init
setup_io_tlb_npages(char *str)
@@ -272,6 +270,19 @@ cleanup1:
return -ENOMEM;
}
+static struct kmem_cache *iova_cachep;
+
+void __init
+swiotlb_alloc(void)
+{
+ if (!swiotlb)
+ return;
+
+ iova_cachep = KMEM_CACHE(iova, 0);
+ init_iova_domain(&swiotlb_iovad, DMA_32BIT_MASK >> IO_TLB_SHIFT,
+ iova_cachep);
+}
+
static int
address_needs_mapping(struct device *hwdev, dma_addr_t addr)
{
@@ -288,70 +299,20 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr)
static void *
map_single(struct device *hwdev, char *buffer, size_t size, int dir)
{
- unsigned long flags;
char *dma_addr;
- unsigned int nslots, stride, index, wrap;
+ unsigned int nslots, index;
int i;
+ struct iova *iova;
- /*
- * For mappings greater than a page, we limit the stride (and
- * hence alignment) to a page size.
- */
nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
- if (size > PAGE_SIZE)
- stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
- else
- stride = 1;
-
BUG_ON(!nslots);
- /*
- * Find suitable number of IO TLB entries size that will fit this
- * request and allocate a buffer from that IO TLB pool.
- */
- spin_lock_irqsave(&io_tlb_lock, flags);
- {
- wrap = index = ALIGN(io_tlb_index, stride);
-
- if (index >= io_tlb_nslabs)
- wrap = index = 0;
-
- do {
- /*
- * If we find a slot that indicates we have 'nslots'
- * number of contiguous buffers, we allocate the
- * buffers from that slot and mark the entries as '0'
- * indicating unavailable.
- */
- if (io_tlb_list[index] >= nslots) {
- int count = 0;
-
- for (i = index; i < (int) (index + nslots); i++)
- io_tlb_list[i] = 0;
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
- dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
-
- /*
- * Update the indices to avoid searching in
- * the next round.
- */
- io_tlb_index = ((index + nslots) < io_tlb_nslabs
- ? (index + nslots) : 0);
-
- goto found;
- }
- index += stride;
- if (index >= io_tlb_nslabs)
- index = 0;
- } while (index != wrap);
-
- spin_unlock_irqrestore(&io_tlb_lock, flags);
+ iova = alloc_iova(&swiotlb_iovad, nslots, io_tlb_nslabs - 1, 1);
+ if (!iova)
return NULL;
- }
- found:
- spin_unlock_irqrestore(&io_tlb_lock, flags);
+ index = iova->pfn_lo;
+ dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
/*
* Save away the mapping from the original address to the DMA address.
* This is needed when we sync the memory. Then we sync the buffer if
@@ -371,8 +332,6 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir)
static void
unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
{
- unsigned long flags;
- int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
char *buffer = io_tlb_orig_addr[index];
@@ -386,30 +345,7 @@ unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
*/
memcpy(buffer, dma_addr, size);
- /*
- * Return the buffer to the free list by setting the corresponding
- * entries to indicate the number of contigous entries available.
- * While returning the entries to the free list, we merge the entries
- * with slots below and above the pool being returned.
- */
- spin_lock_irqsave(&io_tlb_lock, flags);
- {
- count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
- io_tlb_list[index + nslots] : 0);
- /*
- * Step 1: return the slots to the free list, merging the
- * slots with superceeding slots
- */
- for (i = index + nslots - 1; i >= index; i--)
- io_tlb_list[i] = ++count;
- /*
- * Step 2: merge the returned slots with the preceding slots,
- * if available (non zero)
- */
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
- }
- spin_unlock_irqrestore(&io_tlb_lock, flags);
+ free_iova(&swiotlb_iovad, index);
}
static void
--
1.5.3.4