2008-02-11 22:43:32

by mark gross

Subject: [PATCH]intel-iommu batched iotlb flushes

The intel-iommu hardware requires a polling operation to flush IOTLB
PTE's after an unmap operation. Through some TSC instrumentation of a
netperf UDP stream with small packets test case it was seen that the
flush operations were sucking up to 16% of the CPU time doing
iommu_flush_iotlb's.
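
For reference, the kind of TSC instrumentation described above looks
roughly like the sketch below. Only get_cycles() is real kernel API; the
wrapper and the counter variable are illustrative assumptions, not the
actual measurement code.

#include <linux/timex.h>	/* get_cycles(), cycles_t */

static cycles_t flush_cycles;	/* cycles burned in IOTLB flushes */

/* Illustrative wrapper: accumulate the cost of each flush so it can
 * be compared against total cycles over the run (~16% here). */
static int timed_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
				 u64 addr, unsigned int pages)
{
	cycles_t t0 = get_cycles();
	int ret = iommu_flush_iotlb_psi(iommu, did, addr, pages, 0);

	flush_cycles += get_cycles() - t0;
	return ret;
}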

The following patch batches the IOTLB flushes removing most of the
overhead in flushing the IOTLB's. It works by building a list of to be
released IOVA's that is iterated over when a timer goes off or when a
high water mark is reached.

The wrinkle this has is that the memory protection and page fault
warnings from errant DMA operations are somewhat reduced, hence a kernel
parameter is added to revert back to the "strict" page flush / unmap
behavior.

The holes are the following scenarios:
do many map_signal operations, do some unmap_signals, reuse a recently
unmapped page, <errant DMA hardware sneaks through and steps on reused
memory>

Or: you have rouge hardware using DMA's to look at pages: do many
map_signal's, do many unmap_singles, reuse some unmapped pages :
<rouge hardware looks at reused page>

Note: these holes are very hard to get to, as the IOTLB is small and
only the PTE's still in the IOTLB can be accessed through this
mechanism.

It's recommended that strict is used when developing drivers that do DMA
operations to catch bugs early. For production code where performance
is desired, running with the batched IOTLB flushing is a good way to
go.


--mgross


Signed-off-by: <[email protected]>



Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c 2008-02-07 13:03:10.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c 2008-02-11 10:38:49.000000000 -0800
@@ -21,6 +21,7 @@

#include <linux/init.h>
#include <linux/bitmap.h>
+#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
@@ -30,6 +31,7 @@
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
+#include <linux/timer.h>
#include "iova.h"
#include "intel-iommu.h"
#include <asm/proto.h> /* force_iommu in this header in x86-64*/
@@ -50,11 +52,39 @@

#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)

+
+static void flush_unmaps_timeout(unsigned long data);
+
+static struct timer_list unmap_timer =
+ TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
+
+struct unmap_list {
+ struct list_head list;
+ struct dmar_domain *domain;
+ struct iova *iova;
+};
+
+static struct intel_iommu *g_iommus;
+/* bitmap for indexing intel_iommus */
+static unsigned long *g_iommus_to_flush;
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+static int high_watermark;
+
+static struct dentry *intel_iommu_debug, *debug;
+
+
static void domain_remove_dev_info(struct dmar_domain *domain);

static int dmar_disabled;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
+static int intel_iommu_strict;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
@@ -73,9 +103,13 @@
printk(KERN_INFO
"Intel-IOMMU: disable GFX device mapping\n");
} else if (!strncmp(str, "forcedac", 8)) {
- printk (KERN_INFO
+ printk(KERN_INFO
"Intel-IOMMU: Forcing DAC for PCI devices\n");
dmar_forcedac = 1;
+ } else if (!strncmp(str, "strict", 8)) {
+ printk(KERN_INFO
+ "Intel-IOMMU: disable bached IOTLB flush\n");
+ intel_iommu_strict = 1;
}

str += strcspn(str, ",");
@@ -965,17 +999,13 @@
set_bit(0, iommu->domain_ids);
return 0;
}
-
-static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
+ struct dmar_drhd_unit *drhd)
{
- struct intel_iommu *iommu;
int ret;
int map_size;
u32 ver;

- iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
- if (!iommu)
- return NULL;
iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
if (!iommu->reg) {
printk(KERN_ERR "IOMMU: can't map the region\n");
@@ -1396,7 +1426,7 @@
int index;

while (dev) {
- for (index = 0; index < cnt; index ++)
+ for (index = 0; index < cnt; index++)
if (dev == devices[index])
return 1;

@@ -1661,7 +1691,7 @@
struct dmar_rmrr_unit *rmrr;
struct pci_dev *pdev;
struct intel_iommu *iommu;
- int ret, unit = 0;
+ int nlongs, i, ret, unit = 0;

/*
* for each drhd
@@ -1672,7 +1702,29 @@
for_each_drhd_unit(drhd) {
if (drhd->ignored)
continue;
- iommu = alloc_iommu(drhd);
+ g_num_of_iommus++;
+ }
+
+ nlongs = BITS_TO_LONGS(g_num_of_iommus);
+ g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
+ if (!g_iommus_to_flush) {
+ printk(KERN_ERR "Allocating bitmap array failed\n");
+ return -ENOMEM;
+ }
+
+ g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
+ if (!g_iommus) {
+ kfree(g_iommus_to_flush);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ i = 0;
+ for_each_drhd_unit(drhd) {
+ if (drhd->ignored)
+ continue;
+ iommu = alloc_iommu(&g_iommus[i], drhd);
+ i++;
if (!iommu) {
ret = -ENOMEM;
goto error;
@@ -1705,7 +1757,6 @@
* endfor
*/
for_each_rmrr_units(rmrr) {
- int i;
for (i = 0; i < rmrr->devices_cnt; i++) {
pdev = rmrr->devices[i];
/* some BIOS lists non-exist devices in DMAR table */
@@ -1909,6 +1960,54 @@
return 0;
}

+static void flush_unmaps(void)
+{
+ struct iova *node, *n;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&async_umap_flush_lock, flags);
+ timer_on = 0;
+
+ /*just flush them all*/
+ for (i = 0; i < g_num_of_iommus; i++) {
+ if (test_and_clear_bit(i, g_iommus_to_flush))
+ iommu_flush_iotlb_global(&g_iommus[i], 0);
+ }
+
+ list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
+ /* free iova */
+ list_del(&node->list);
+ __free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
+
+ }
+ list_size = 0;
+ spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+ flush_unmaps();
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&async_umap_flush_lock, flags);
+ iova->dmar = dom;
+ list_add(&iova->list, &unmaps_to_do);
+ set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+
+ if (!timer_on) {
+ unmap_timer.expires = jiffies + msecs_to_jiffies(10);
+ mod_timer(&unmap_timer, unmap_timer.expires);
+ timer_on = 1;
+ }
+ list_size++;
+ spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
size_t size, int dir)
{
@@ -1936,13 +2035,21 @@
dma_pte_clear_range(domain, start_addr, start_addr + size);
/* free page tables */
dma_pte_free_pagetable(domain, start_addr, start_addr + size);
-
- if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
- size >> PAGE_SHIFT_4K, 0))
- iommu_flush_write_buffer(domain->iommu);
-
- /* free iova */
- __free_iova(&domain->iovad, iova);
+ if (intel_iommu_strict) {
+ if (iommu_flush_iotlb_psi(domain->iommu,
+ domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+ iommu_flush_write_buffer(domain->iommu);
+ /* free iova */
+ __free_iova(&domain->iovad, iova);
+ } else {
+ add_unmap(domain, iova);
+ /*
+ * queue up the release of the unmap to save the 1/6th of the
+ * cpu used up by the iotlb flush operation...
+ */
+ if (list_size > high_watermark)
+ flush_unmaps();
+ }
}

static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -2266,6 +2373,10 @@
if (dmar_table_init())
return -ENODEV;

+ high_watermark = 250;
+ intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
+ debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
+ intel_iommu_debug, &high_watermark);
iommu_init_mempool();
dmar_init_reserved_ranges();

@@ -2281,6 +2392,7 @@
printk(KERN_INFO
"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

+ init_timer(&unmap_timer);
force_iommu = 1;
dma_ops = &intel_dma_ops;
return 0;
Index: linux-2.6.24-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/iova.h 2008-02-07 13:03:10.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/iova.h 2008-02-07 13:06:28.000000000 -0800
@@ -23,6 +23,8 @@
struct rb_node node;
unsigned long pfn_hi; /* IOMMU dish out addr hi */
unsigned long pfn_lo; /* IOMMU dish out addr lo */
+ struct list_head list;
+ void *dmar;
};

/* holds all the iova translations for a domain */
Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-11 13:44:23.000000000 -0800
+++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-11 14:23:37.000000000 -0800
@@ -822,6 +822,10 @@
than 32 bit addressing. The default is to look
for translation below 32 bit and if not available
then look in the higher range.
+ strict [Default Off]
+ With this option on every umap_signle will
+ result in a hardware IOTLB flush opperation as
+ opposed to batching them for performance.

io_delay= [X86-32,X86-64] I/O delay method
0x80


2008-02-11 23:32:01

by Randy Dunlap

Subject: Re: [PATCH]intel-iommu batched iotlb flushes

On Mon, 11 Feb 2008 14:41:05 -0800 mark gross wrote:

> The holes are the following scenarios:
> do many map_signal operations, do some unmap_signals, reuse a recently
> unmapped page, <errant DMA hardware sneaks through and steps on reused
> memory>
>
> Or: you have rouge hardware using DMA's to look at pages: do many

or rogue hardware?

> map_signal's, do many unmap_singles, reuse some unmapped pages :

signal .................... single

> <rouge hardware looks at reused page>

why rouge?

> Note: these holes are very hard to get to, as the IOTLB is small and
> only the PTE's still in the IOTLB can be accessed through this
> mechanism.
>
> It's recommended that strict is used when developing drivers that do DMA
> operations to catch bugs early. For production code where performance
> is desired, running with the batched IOTLB flushing is a good way to
> go.
>
>
> --mgross
>
>
> Signed-off-by: <[email protected]>
>
>
>
> Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
> ===================================================================
> --- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c 2008-02-07 13:03:10.000000000 -0800
> +++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c 2008-02-11 10:38:49.000000000 -0800

> @@ -50,11 +52,39 @@
>
> #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
>
> +
> +static void flush_unmaps_timeout(unsigned long data);
> +
> +static struct timer_list unmap_timer =
> + TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
> +
> +struct unmap_list {
> + struct list_head list;
> + struct dmar_domain *domain;
> + struct iova *iova;
> +};
> +
> +static struct intel_iommu *g_iommus;
> +/* bitmap for indexing intel_iommus */
> +static unsigned long *g_iommus_to_flush;
> +static int g_num_of_iommus;
> +
> +static DEFINE_SPINLOCK(async_umap_flush_lock);
> +static LIST_HEAD(unmaps_to_do);
> +
> +static int timer_on;
> +static long list_size;
> +static int high_watermark;
> +
> +static struct dentry *intel_iommu_debug, *debug;
> +
> +
> static void domain_remove_dev_info(struct dmar_domain *domain);
>
> static int dmar_disabled;
> static int __initdata dmar_map_gfx = 1;
> static int dmar_forcedac;
> +static int intel_iommu_strict;
>
> #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
> static DEFINE_SPINLOCK(device_domain_lock);
> @@ -73,9 +103,13 @@
> printk(KERN_INFO
> "Intel-IOMMU: disable GFX device mapping\n");
> } else if (!strncmp(str, "forcedac", 8)) {
> - printk (KERN_INFO
> + printk(KERN_INFO
> "Intel-IOMMU: Forcing DAC for PCI devices\n");
> dmar_forcedac = 1;
> + } else if (!strncmp(str, "strict", 8)) {
> + printk(KERN_INFO
> + "Intel-IOMMU: disable bached IOTLB flush\n");

batched

> + intel_iommu_strict = 1;
> }
>
> str += strcspn(str, ",");

> @@ -1672,7 +1702,29 @@
> for_each_drhd_unit(drhd) {
> if (drhd->ignored)
> continue;
> - iommu = alloc_iommu(drhd);
> + g_num_of_iommus++;
> + }
> +
> + nlongs = BITS_TO_LONGS(g_num_of_iommus);
> + g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
> + if (!g_iommus_to_flush) {
> + printk(KERN_ERR "Allocating bitmap array failed\n");

identify: "IOMMU:

> + return -ENOMEM;
> + }
> +
> + g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
> + if (!g_iommus) {
> + kfree(g_iommus_to_flush);
> + ret = -ENOMEM;
> + goto error;
> + }
> +
> + i = 0;
> + for_each_drhd_unit(drhd) {
> + if (drhd->ignored)
> + continue;
> + iommu = alloc_iommu(&g_iommus[i], drhd);
> + i++;
> if (!iommu) {
> ret = -ENOMEM;
> goto error;

> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> ===================================================================
> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-11 13:44:23.000000000 -0800
> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-11 14:23:37.000000000 -0800
> @@ -822,6 +822,10 @@
> than 32 bit addressing. The default is to look
> for translation below 32 bit and if not available
> then look in the higher range.
> + strict [Default Off]
> + With this option on every umap_signle will

on, every unmap_si{ngle,gnal} ??

> + result in a hardware IOTLB flush opperation as
> + opposed to batching them for performance.
>
> io_delay= [X86-32,X86-64] I/O delay method
> 0x80


---
~Randy

2008-02-12 08:53:22

by Muli Ben-Yehuda

Subject: Re: [PATCH]intel-iommu batched iotlb flushes

On Mon, Feb 11, 2008 at 02:41:05PM -0800, mark gross wrote:

> The intel-iommu hardware requires a polling operation to flush IOTLB
> PTE's after an unmap operation. Through some TSC instrumentation of
> a netperf UDP stream with small packets test case it was seen that
> the flush operations were sucking up to 16% of the CPU time doing
> iommu_flush_iotlb's.
>
> The following patch batches the IOTLB flushes removing most of the
> overhead in flushing the IOTLB's. It works by building a list of to
> be released IOVA's that is iterated over when a timer goes off or
> when a high water mark is reached.
>
> The wrinkle this has is that the memory protection and page fault
> warnings from errant DMA operations are somewhat reduced, hence a kernel
> parameter is added to revert back to the "strict" page flush / unmap
> behavior.
>
> The holes are the following scenarios:
> do many map_signal operations, do some unmap_signals, reuse a recently
> unmapped page, <errant DMA hardware sneaks through and steps on reused
> memory>
>
> Or: you have rouge hardware using DMA's to look at pages: do many
> map_signal's, do many unmap_singles, reuse some unmapped pages :
> <rouge hardware looks at reused page>
>
> Note: these holes are very hard to get to, as the IOTLB is small
> and only the PTE's still in the IOTLB can be accessed through this
> mechanism.
>
> It's recommended that strict is used when developing drivers that do
> DMA operations to catch bugs early. For production code where
> performance is desired, running with the batched IOTLB flushing is a
> good way to go.

While I don't disagree with this patch in principle (Calgary does the
same thing due to expensive IOTLB flushes) the right way to fix it
IMHO is to fix the drivers to batch mapping and unmapping operations
or map up-front and unmap when done. The streaming DMA-API was
designed to conserve IOMMU mappings for machines where IOMMU mappings
are a scarce resource, and is a poor fit for a modern IOMMU such as
VT-d with a 64-bit IO address space (or even an IOMMU with a 32-bit
address space such as Calgary) where there are plenty of IOMMU
mappings available.

Cheers,
Muli
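
A minimal sketch of the contrast Muli draws, assuming a hypothetical
receive ring (struct rx_slot, RING_SIZE and BUF_SIZE are made up; the
dma_* calls are the standard DMA-API):

#include <linux/dma-mapping.h>

#define RING_SIZE 256
#define BUF_SIZE  2048

struct rx_slot {
	void *buf;		/* kernel buffer backing the slot */
	dma_addr_t dma;		/* its (possibly long-lived) mapping */
};

/* Streaming style: one map and one unmap per I/O.  On VT-d every
 * unmap implies an IOTLB flush, which is where the overhead goes. */
static void rx_one_streaming(struct device *dev, void *buf)
{
	dma_addr_t dma = dma_map_single(dev, buf, BUF_SIZE,
					DMA_FROM_DEVICE);
	/* ... point the device at dma, wait for the DMA ... */
	dma_unmap_single(dev, dma, BUF_SIZE, DMA_FROM_DEVICE);
}

/* Map-up-front style: map every slot once at init and keep the
 * mapping for the life of the ring. */
static void rx_ring_init(struct device *dev, struct rx_slot *ring)
{
	int i;

	for (i = 0; i < RING_SIZE; i++)
		ring[i].dma = dma_map_single(dev, ring[i].buf,
					     BUF_SIZE, DMA_FROM_DEVICE);
}

/* Per packet only a sync is needed; no map, no unmap, no flush. */
static void rx_one_premapped(struct device *dev, struct rx_slot *slot)
{
	dma_sync_single_for_cpu(dev, slot->dma, BUF_SIZE, DMA_FROM_DEVICE);
	/* ... consume the packet from slot->buf ... */
	dma_sync_single_for_device(dev, slot->dma, BUF_SIZE,
				   DMA_FROM_DEVICE);
}

The price of the second style, as the rest of the thread notes, is that
long-lived mappings give up much of the protection the streaming API
provides.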

2008-02-12 08:59:43

by David Miller

Subject: Re: [PATCH]intel-iommu batched iotlb flushes

From: Muli Ben-Yehuda <[email protected]>
Date: Tue, 12 Feb 2008 10:52:56 +0200

> The streaming DMA-API was designed to conserve IOMMU mappings for
> machines where IOMMU mappings are a scarce resource, and is a poor
> fit for a modern IOMMU such as VT-d with a 64-bit IO address space
> (or even an IOMMU with a 32-bit address space such as Calgary) where
> there are plenty of IOMMU mappings available.

For the 64-bit case what you are suggesting eventually amounts
to mapping all available RAM in the IOMMU.

Although an extreme version of your suggestion, it would be the
most efficient as it would require zero IOMMU flush operations.

But we'd lose things like protection and other benefits.

2008-02-12 09:08:16

by Muli Ben-Yehuda

Subject: Re: [PATCH]intel-iommu batched iotlb flushes

On Tue, Feb 12, 2008 at 01:00:06AM -0800, David Miller wrote:
> From: Muli Ben-Yehuda <[email protected]>
> Date: Tue, 12 Feb 2008 10:52:56 +0200
>
> > The streaming DMA-API was designed to conserve IOMMU mappings for
> > machines where IOMMU mappings are a scarce resource, and is a poor
> > fit for a modern IOMMU such as VT-d with a 64-bit IO address space
> > (or even an IOMMU with a 32-bit address space such as Calgary)
> > where there are plenty of IOMMU mappings available.
>
> For the 64-bit case what you are suggesting eventually amounts to
> mapping all available RAM in the IOMMU.
>
> Although an extreme version of your suggestion, it would be the most
> efficient as it would require zero IOMMU flush operations.
>
> But we'd lose things like protection and other benefits.

For the extreme case you are correct. There's an inherent trade-off
between IOMMU performance and protection guarantees, where one end of
the spectrum is represented by the streaming DMA-API and the other end
is represented by simply mapping all available memory. It's an open
question what is the right point in between. I think that an optimal
strategy might be "keep the mapping around for as long as it is safe",
i.e., keep a mapping to a frame for as long as the frame is owned by
whoever requested the mapping in the first place. Once ownership of
the frame is passed to another entity (e.g., from the driver to the
stack), revoke the original mapping. This implies a way for the kernel
to track frame ownership and communicate frame ownership changes to
the DMA-API layer, which we don't currently have.

Cheers,
Muli
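
A sketch of the shape such an interface might take; nothing below exists
in the DMA-API, it only illustrates the idea of revoking a mapping when
frame ownership changes hands:

#include <linux/dma-mapping.h>

/* Hypothetical: a frame whose IOMMU mapping lives exactly as long as
 * its current owner's claim on it. */
struct dma_owned_frame {
	struct device *owner_dev;	/* device that mapped the frame */
	dma_addr_t dma;			/* live IOMMU mapping */
	size_t size;
};

/* Would be called by whatever (nonexistent) machinery tracks frame
 * ownership, e.g. when a driver hands a frame to the stack. */
static void dma_frame_ownership_transfer(struct dma_owned_frame *f)
{
	/* the frame is leaving its owner's hands: revoke the mapping
	 * so stale IOTLB entries cannot reach the reused memory */
	dma_unmap_single(f->owner_dev, f->dma, f->size,
			 DMA_BIDIRECTIONAL);
	f->dma = 0;
}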

2008-02-12 15:39:41

by mark gross

Subject: Re: [PATCH]intel-iommu batched iotlb flushes

On Tue, Feb 12, 2008 at 10:52:56AM +0200, Muli Ben-Yehuda wrote:
> On Mon, Feb 11, 2008 at 02:41:05PM -0800, mark gross wrote:
>
> > The intel-iommu hardware requires a polling operation to flush IOTLB
> > PTE's after an unmap operation. Through some TSC instrumentation of
> > a netperf UDP stream with small packets test case it was seen that
> > the flush operations were sucking up to 16% of the CPU time doing
> > iommu_flush_iotlb's.
> >
> > The following patch batches the IOTLB flushes removing most of the
> > overhead in flushing the IOTLB's. It works by building a list of to
> > be released IOVA's that is iterated over when a timer goes off or
> > when a high water mark is reached.
> >
> > The wrinkle this has is that the memory protection and page fault
> > warnings from errant DMA operations are somewhat reduced, hence a kernel
> > parameter is added to revert back to the "strict" page flush / unmap
> > behavior.
> >
> > The holes are the following scenarios:
> > do many map_signal operations, do some unmap_signals, reuse a recently
> > unmapped page, <errant DMA hardware sneaks through and steps on reused
> > memory>
> >
> > Or: you have rouge hardware using DMA's to look at pages: do many
> > map_signal's, do many unmap_singles, reuse some unmapped pages :
> > <rouge hardware looks at reused page>
> >
> > Note: these holes are very hard to get to, as the IOTLB is small
> > and only the PTE's still in the IOTLB can be accessed through this
> > mechanism.
> >
> > It's recommended that strict is used when developing drivers that do
> > DMA operations to catch bugs early. For production code where
> > performance is desired, running with the batched IOTLB flushing is a
> > good way to go.
>
> While I don't disagree with this patch in principle (Calgary does the
> same thing due to expensive IOTLB flushes) the right way to fix it
> IMHO is to fix the drivers to batch mapping and unmapping operations
> or map up-front and unmap when done. The streaming DMA-API was
> designed to conserve IOMMU mappings for machines where IOMMU mappings
> are a scarce resource, and is a poor fit for a modern IOMMU such as
> VT-d with a 64-bit IO address space (or even an IOMMU with a 32-bit
> address space such as Calgary) where there are plenty of IOMMU
> mappings available.

Yes, having a DMA pool of DMA addresses to use and re-use in the stack
instead of setting up and tearing down the PTE's is something we need to
look at closely for network and other high-DMA-traffic stacks.

--mgross

2008-02-12 15:52:17

by mark gross

Subject: Re: [PATCH]intel-iommu batched iotlb flushes

On Tue, Feb 12, 2008 at 01:00:06AM -0800, David Miller wrote:
> From: Muli Ben-Yehuda <[email protected]>
> Date: Tue, 12 Feb 2008 10:52:56 +0200
>
> > The streaming DMA-API was designed to conserve IOMMU mappings for
> > machines where IOMMU mappings are a scarce resource, and is a poor
> > fit for a modern IOMMU such as VT-d with a 64-bit IO address space
> > (or even an IOMMU with a 32-bit address space such as Calgary) where
> > there are plenty of IOMMU mappings available.
>
> For the 64-bit case what you are suggesting eventually amounts
> to mapping all available RAM in the IOMMU.

Something could be done:
we could enable drivers to have DMA-pools they manage that get mapped
and are re-used.

I would rather the DMA-pools be tied to PIDs; that way any bad behavior
would be limited to the address space of the process using the device.
I haven't thought about how hard this would be to do but it would be
nice. I think this could be tricky.

Application sets up ring buffer of device DMA memory, passes this to
driver/stack. Need to handle hitting high water marks and application-exit
cleanup sanely...

--mgross

>
> Although an extreme version of your suggestion, it would be the
> most efficient as it would require zero IOMMU flush operations.
>
> But we'd lose things like protection and other benefits.

2008-02-12 16:04:19

by mark gross

Subject: Re: [PATCH]intel-iommu batched iotlb flushes

On Mon, Feb 11, 2008 at 03:27:16PM -0800, Randy Dunlap wrote:
> On Mon, 11 Feb 2008 14:41:05 -0800 mark gross wrote:
>
> > The holes are the following scenarios:
> > do many map_signal operations, do some unmap_signals, reuse a recently
> > unmapped page, <errant DMA hardware sneaks through and steps on reused
> > memory>
> >
> > Or: you have rouge hardware using DMA's to look at pages: do many
>
> or rogue hardware?

yes bad-guy hardware.

>
> > map_signal's, do many unmap_singles, reuse some unmapped pages :
>
> signal .................... single
>
> > <rouge hardware looks at reused page>
>
> why rouge?

because I'm a dumb ass.

>
> > Note: these holes are very hard to get to, as the IOTLB is small and
> > only the PTE's still in the IOTLB can be accessed through this
> > mechanism.
> >
> > It's recommended that strict is used when developing drivers that do DMA
> > operations to catch bugs early. For production code where performance
> > is desired, running with the batched IOTLB flushing is a good way to
> > go.
> >
> >
> > --mgross
> >
> >
> > Signed-off-by: <[email protected]>
> >
> >
> >
> > Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
> > ===================================================================
> > --- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c 2008-02-07 13:03:10.000000000 -0800
> > +++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c 2008-02-11 10:38:49.000000000 -0800
>
> > @@ -50,11 +52,39 @@
> >
> > #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
> >
> > +
> > +static void flush_unmaps_timeout(unsigned long data);
> > +
> > +static struct timer_list unmap_timer =
> > + TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
> > +
> > +struct unmap_list {
> > + struct list_head list;
> > + struct dmar_domain *domain;
> > + struct iova *iova;
> > +};
> > +
> > +static struct intel_iommu *g_iommus;
> > +/* bitmap for indexing intel_iommus */
> > +static unsigned long *g_iommus_to_flush;
> > +static int g_num_of_iommus;
> > +
> > +static DEFINE_SPINLOCK(async_umap_flush_lock);
> > +static LIST_HEAD(unmaps_to_do);
> > +
> > +static int timer_on;
> > +static long list_size;
> > +static int high_watermark;
> > +
> > +static struct dentry *intel_iommu_debug, *debug;
> > +
> > +
> > static void domain_remove_dev_info(struct dmar_domain *domain);
> >
> > static int dmar_disabled;
> > static int __initdata dmar_map_gfx = 1;
> > static int dmar_forcedac;
> > +static int intel_iommu_strict;
> >
> > #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
> > static DEFINE_SPINLOCK(device_domain_lock);
> > @@ -73,9 +103,13 @@
> > printk(KERN_INFO
> > "Intel-IOMMU: disable GFX device mapping\n");
> > } else if (!strncmp(str, "forcedac", 8)) {
> > - printk (KERN_INFO
> > + printk(KERN_INFO
> > "Intel-IOMMU: Forcing DAC for PCI devices\n");
> > dmar_forcedac = 1;
> > + } else if (!strncmp(str, "strict", 8)) {
> > + printk(KERN_INFO
> > + "Intel-IOMMU: disable bached IOTLB flush\n");
>
> batched

fixed.

>
> > + intel_iommu_strict = 1;
> > }
> >
> > str += strcspn(str, ",");
>
> > @@ -1672,7 +1702,29 @@
> > for_each_drhd_unit(drhd) {
> > if (drhd->ignored)
> > continue;
> > - iommu = alloc_iommu(drhd);
> > + g_num_of_iommus++;
> > + }
> > +
> > + nlongs = BITS_TO_LONGS(g_num_of_iommus);
> > + g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
> > + if (!g_iommus_to_flush) {
> > + printk(KERN_ERR "Allocating bitmap array failed\n");
>
> identify: "IOMMU:
>
> > + return -ENOMEM;
> > + }
> > +
> > + g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
> > + if (!g_iommus) {
> > + kfree(g_iommus_to_flush);
> > + ret = -ENOMEM;
> > + goto error;
> > + }
> > +
> > + i = 0;
> > + for_each_drhd_unit(drhd) {
> > + if (drhd->ignored)
> > + continue;
> > + iommu = alloc_iommu(&g_iommus[i], drhd);
> > + i++;
> > if (!iommu) {
> > ret = -ENOMEM;
> > goto error;
>
> > Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> > ===================================================================
> > --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-11 13:44:23.000000000 -0800
> > +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-11 14:23:37.000000000 -0800
> > @@ -822,6 +822,10 @@
> > than 32 bit addressing. The default is to look
> > for translation below 32 bit and if not available
> > then look in the higher range.
> > + strict [Default Off]
> > + With this option on every umap_signle will
>
> on, every unmap_si{ngle,gnal} ??
>

fixed.


> > + result in a hardware IOTLB flush opperation as
> > + opposed to batching them for performance.
> >
> > io_delay= [X86-32,X86-64] I/O delay method
> > 0x80
>
>

thanks!

Signed-off-by: <[email protected]>


Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c 2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c 2008-02-12 07:47:04.000000000 -0800
@@ -21,6 +21,7 @@

#include <linux/init.h>
#include <linux/bitmap.h>
+#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
@@ -30,6 +31,7 @@
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
+#include <linux/timer.h>
#include "iova.h"
#include "intel-iommu.h"
#include <asm/proto.h> /* force_iommu in this header in x86-64*/
@@ -50,11 +52,39 @@

#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)

+
+static void flush_unmaps_timeout(unsigned long data);
+
+static struct timer_list unmap_timer =
+ TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
+
+struct unmap_list {
+ struct list_head list;
+ struct dmar_domain *domain;
+ struct iova *iova;
+};
+
+static struct intel_iommu *g_iommus;
+/* bitmap for indexing intel_iommus */
+static unsigned long *g_iommus_to_flush;
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+static int high_watermark;
+
+static struct dentry *intel_iommu_debug, *debug;
+
+
static void domain_remove_dev_info(struct dmar_domain *domain);

static int dmar_disabled;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
+static int intel_iommu_strict;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
@@ -73,9 +103,13 @@
printk(KERN_INFO
"Intel-IOMMU: disable GFX device mapping\n");
} else if (!strncmp(str, "forcedac", 8)) {
- printk (KERN_INFO
+ printk(KERN_INFO
"Intel-IOMMU: Forcing DAC for PCI devices\n");
dmar_forcedac = 1;
+ } else if (!strncmp(str, "strict", 8)) {
+ printk(KERN_INFO
+ "Intel-IOMMU: disable batched IOTLB flush\n");
+ intel_iommu_strict = 1;
}

str += strcspn(str, ",");
@@ -965,17 +999,13 @@
set_bit(0, iommu->domain_ids);
return 0;
}
-
-static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
+ struct dmar_drhd_unit *drhd)
{
- struct intel_iommu *iommu;
int ret;
int map_size;
u32 ver;

- iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
- if (!iommu)
- return NULL;
iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
if (!iommu->reg) {
printk(KERN_ERR "IOMMU: can't map the region\n");
@@ -1396,7 +1426,7 @@
int index;

while (dev) {
- for (index = 0; index < cnt; index ++)
+ for (index = 0; index < cnt; index++)
if (dev == devices[index])
return 1;

@@ -1661,7 +1691,7 @@
struct dmar_rmrr_unit *rmrr;
struct pci_dev *pdev;
struct intel_iommu *iommu;
- int ret, unit = 0;
+ int nlongs, i, ret, unit = 0;

/*
* for each drhd
@@ -1672,7 +1702,30 @@
for_each_drhd_unit(drhd) {
if (drhd->ignored)
continue;
- iommu = alloc_iommu(drhd);
+ g_num_of_iommus++;
+ }
+
+ nlongs = BITS_TO_LONGS(g_num_of_iommus);
+ g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
+ if (!g_iommus_to_flush) {
+ printk(KERN_ERR "Intel-IOMMU: \
+ Allocating bitmap array failed\n");
+ return -ENOMEM;
+ }
+
+ g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
+ if (!g_iommus) {
+ kfree(g_iommus_to_flush);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ i = 0;
+ for_each_drhd_unit(drhd) {
+ if (drhd->ignored)
+ continue;
+ iommu = alloc_iommu(&g_iommus[i], drhd);
+ i++;
if (!iommu) {
ret = -ENOMEM;
goto error;
@@ -1705,7 +1758,6 @@
* endfor
*/
for_each_rmrr_units(rmrr) {
- int i;
for (i = 0; i < rmrr->devices_cnt; i++) {
pdev = rmrr->devices[i];
/* some BIOS lists non-exist devices in DMAR table */
@@ -1909,6 +1961,54 @@
return 0;
}

+static void flush_unmaps(void)
+{
+ struct iova *node, *n;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&async_umap_flush_lock, flags);
+ timer_on = 0;
+
+ /*just flush them all*/
+ for (i = 0; i < g_num_of_iommus; i++) {
+ if (test_and_clear_bit(i, g_iommus_to_flush))
+ iommu_flush_iotlb_global(&g_iommus[i], 0);
+ }
+
+ list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
+ /* free iova */
+ list_del(&node->list);
+ __free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
+
+ }
+ list_size = 0;
+ spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+ flush_unmaps();
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&async_umap_flush_lock, flags);
+ iova->dmar = dom;
+ list_add(&iova->list, &unmaps_to_do);
+ set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+
+ if (!timer_on) {
+ unmap_timer.expires = jiffies + msecs_to_jiffies(10);
+ mod_timer(&unmap_timer, unmap_timer.expires);
+ timer_on = 1;
+ }
+ list_size++;
+ spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
size_t size, int dir)
{
@@ -1936,13 +2036,21 @@
dma_pte_clear_range(domain, start_addr, start_addr + size);
/* free page tables */
dma_pte_free_pagetable(domain, start_addr, start_addr + size);
-
- if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
- size >> PAGE_SHIFT_4K, 0))
- iommu_flush_write_buffer(domain->iommu);
-
- /* free iova */
- __free_iova(&domain->iovad, iova);
+ if (intel_iommu_strict) {
+ if (iommu_flush_iotlb_psi(domain->iommu,
+ domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+ iommu_flush_write_buffer(domain->iommu);
+ /* free iova */
+ __free_iova(&domain->iovad, iova);
+ } else {
+ add_unmap(domain, iova);
+ /*
+ * queue up the release of the unmap to save the 1/6th of the
+ * cpu used up by the iotlb flush operation...
+ */
+ if (list_size > high_watermark)
+ flush_unmaps();
+ }
}

static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -2266,6 +2374,10 @@
if (dmar_table_init())
return -ENODEV;

+ high_watermark = 250;
+ intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
+ debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
+ intel_iommu_debug, &high_watermark);
iommu_init_mempool();
dmar_init_reserved_ranges();

@@ -2281,6 +2393,7 @@
printk(KERN_INFO
"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

+ init_timer(&unmap_timer);
force_iommu = 1;
dma_ops = &intel_dma_ops;
return 0;
Index: linux-2.6.24-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/iova.h 2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/iova.h 2008-02-12 07:39:53.000000000 -0800
@@ -23,6 +23,8 @@
struct rb_node node;
unsigned long pfn_hi; /* IOMMU dish out addr hi */
unsigned long pfn_lo; /* IOMMU dish out addr lo */
+ struct list_head list;
+ void *dmar;
};

/* holds all the iova translations for a domain */
Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-12 07:47:02.000000000 -0800
@@ -822,6 +822,10 @@
than 32 bit addressing. The default is to look
for translation below 32 bit and if not available
then look in the higher range.
+ strict [Default Off]
+ With this option on every umap_signal opperation will
+ result in a hardware IOTLB flush opperation as opposed
+ to batching them for performance.

io_delay= [X86-32,X86-64] I/O delay method
0x80

2008-02-12 16:39:18

by Randy Dunlap

Subject: Re: [PATCH]intel-iommu batched iotlb flushes

mark gross wrote:
>
> Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
> ===================================================================
> --- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c 2008-02-12 07:12:06.000000000 -0800
> +++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c 2008-02-12 07:47:04.000000000 -0800

> @@ -1672,7 +1702,30 @@
> for_each_drhd_unit(drhd) {
> if (drhd->ignored)
> continue;
> - iommu = alloc_iommu(drhd);
> + g_num_of_iommus++;
> + }
> +
> + nlongs = BITS_TO_LONGS(g_num_of_iommus);
> + g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
> + if (!g_iommus_to_flush) {
> + printk(KERN_ERR "Intel-IOMMU: \
> + Allocating bitmap array failed\n");

I think that will make a string like below:
Intel-IOMMU: Allocating bitmap array failed

> + return -ENOMEM;
> + }
> +
> + g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
> + if (!g_iommus) {
> + kfree(g_iommus_to_flush);
> + ret = -ENOMEM;
> + goto error;
> + }
> +
> + i = 0;
> + for_each_drhd_unit(drhd) {
> + if (drhd->ignored)
> + continue;
> + iommu = alloc_iommu(&g_iommus[i], drhd);
> + i++;
> if (!iommu) {
> ret = -ENOMEM;
> goto error;

> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> ===================================================================
> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-12 07:12:06.000000000 -0800
> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-12 07:47:02.000000000 -0800
> @@ -822,6 +822,10 @@
> than 32 bit addressing. The default is to look
> for translation below 32 bit and if not available
> then look in the higher range.
> + strict [Default Off]
> + With this option on every umap_signal opperation will

unmap_single operation

> + result in a hardware IOTLB flush opperation as opposed
> + to batching them for performance.
>
> io_delay= [X86-32,X86-64] I/O delay method
> 0x80


--
~Randy

2008-02-12 19:55:00

by mark gross

Subject: Re: [PATCH]intel-iommu batched iotlb flushes

On Tue, Feb 12, 2008 at 08:34:39AM -0800, Randy Dunlap wrote:
> mark gross wrote:
>> Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
>> ===================================================================
>> --- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c 2008-02-12
>> 07:12:06.000000000 -0800
>> +++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c 2008-02-12
>> 07:47:04.000000000 -0800
>
>> @@ -1672,7 +1702,30 @@
>> for_each_drhd_unit(drhd) {
>> if (drhd->ignored)
>> continue;
>> - iommu = alloc_iommu(drhd);
>> + g_num_of_iommus++;
>> + }
>> +
>> + nlongs = BITS_TO_LONGS(g_num_of_iommus);
>> + g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
>> + if (!g_iommus_to_flush) {
>> + printk(KERN_ERR "Intel-IOMMU: \
>> + Allocating bitmap array failed\n");
>
> I think that will make a string like below:
> Intel-IOMMU: Allocating bitmap array failed
>

you are right.

>> + return -ENOMEM;
>> + }
>> + strict [Default Off]
>> + With this option on every umap_signal opperation will
>
> unmap_single operation
>
>> + result in a hardware IOTLB flush opperation as opposed

Signed-off-by: <[email protected]>


Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c 2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c 2008-02-12 11:35:42.000000000 -0800
@@ -21,6 +21,7 @@

#include <linux/init.h>
#include <linux/bitmap.h>
+#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
@@ -30,6 +31,7 @@
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
+#include <linux/timer.h>
#include "iova.h"
#include "intel-iommu.h"
#include <asm/proto.h> /* force_iommu in this header in x86-64*/
@@ -50,11 +52,39 @@

#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)

+
+static void flush_unmaps_timeout(unsigned long data);
+
+static struct timer_list unmap_timer =
+ TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
+
+struct unmap_list {
+ struct list_head list;
+ struct dmar_domain *domain;
+ struct iova *iova;
+};
+
+static struct intel_iommu *g_iommus;
+/* bitmap for indexing intel_iommus */
+static unsigned long *g_iommus_to_flush;
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+static int high_watermark;
+
+static struct dentry *intel_iommu_debug, *debug;
+
+
static void domain_remove_dev_info(struct dmar_domain *domain);

static int dmar_disabled;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
+static int intel_iommu_strict;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
@@ -73,9 +103,13 @@
printk(KERN_INFO
"Intel-IOMMU: disable GFX device mapping\n");
} else if (!strncmp(str, "forcedac", 8)) {
- printk (KERN_INFO
+ printk(KERN_INFO
"Intel-IOMMU: Forcing DAC for PCI devices\n");
dmar_forcedac = 1;
+ } else if (!strncmp(str, "strict", 8)) {
+ printk(KERN_INFO
+ "Intel-IOMMU: disable batched IOTLB flush\n");
+ intel_iommu_strict = 1;
}

str += strcspn(str, ",");
@@ -965,17 +999,13 @@
set_bit(0, iommu->domain_ids);
return 0;
}
-
-static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
+ struct dmar_drhd_unit *drhd)
{
- struct intel_iommu *iommu;
int ret;
int map_size;
u32 ver;

- iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
- if (!iommu)
- return NULL;
iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
if (!iommu->reg) {
printk(KERN_ERR "IOMMU: can't map the region\n");
@@ -1396,7 +1426,7 @@
int index;

while (dev) {
- for (index = 0; index < cnt; index ++)
+ for (index = 0; index < cnt; index++)
if (dev == devices[index])
return 1;

@@ -1661,7 +1691,7 @@
struct dmar_rmrr_unit *rmrr;
struct pci_dev *pdev;
struct intel_iommu *iommu;
- int ret, unit = 0;
+ int nlongs, i, ret, unit = 0;

/*
* for each drhd
@@ -1672,7 +1702,30 @@
for_each_drhd_unit(drhd) {
if (drhd->ignored)
continue;
- iommu = alloc_iommu(drhd);
+ g_num_of_iommus++;
+ }
+
+ nlongs = BITS_TO_LONGS(g_num_of_iommus);
+ g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
+ if (!g_iommus_to_flush) {
+ printk(KERN_ERR "Intel-IOMMU: "
+ "Allocating bitmap array failed\n");
+ return -ENOMEM;
+ }
+
+ g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
+ if (!g_iommus) {
+ kfree(g_iommus_to_flush);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ i = 0;
+ for_each_drhd_unit(drhd) {
+ if (drhd->ignored)
+ continue;
+ iommu = alloc_iommu(&g_iommus[i], drhd);
+ i++;
if (!iommu) {
ret = -ENOMEM;
goto error;
@@ -1705,7 +1758,6 @@
* endfor
*/
for_each_rmrr_units(rmrr) {
- int i;
for (i = 0; i < rmrr->devices_cnt; i++) {
pdev = rmrr->devices[i];
/* some BIOS lists non-exist devices in DMAR table */
@@ -1909,6 +1961,54 @@
return 0;
}

+static void flush_unmaps(void)
+{
+ struct iova *node, *n;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&async_umap_flush_lock, flags);
+ timer_on = 0;
+
+ /*just flush them all*/
+ for (i = 0; i < g_num_of_iommus; i++) {
+ if (test_and_clear_bit(i, g_iommus_to_flush))
+ iommu_flush_iotlb_global(&g_iommus[i], 0);
+ }
+
+ list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
+ /* free iova */
+ list_del(&node->list);
+ __free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
+
+ }
+ list_size = 0;
+ spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+ flush_unmaps();
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&async_umap_flush_lock, flags);
+ iova->dmar = dom;
+ list_add(&iova->list, &unmaps_to_do);
+ set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+
+ if (!timer_on) {
+ unmap_timer.expires = jiffies + msecs_to_jiffies(10);
+ mod_timer(&unmap_timer, unmap_timer.expires);
+ timer_on = 1;
+ }
+ list_size++;
+ spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
size_t size, int dir)
{
@@ -1936,13 +2036,21 @@
dma_pte_clear_range(domain, start_addr, start_addr + size);
/* free page tables */
dma_pte_free_pagetable(domain, start_addr, start_addr + size);
-
- if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
- size >> PAGE_SHIFT_4K, 0))
- iommu_flush_write_buffer(domain->iommu);
-
- /* free iova */
- __free_iova(&domain->iovad, iova);
+ if (intel_iommu_strict) {
+ if (iommu_flush_iotlb_psi(domain->iommu,
+ domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+ iommu_flush_write_buffer(domain->iommu);
+ /* free iova */
+ __free_iova(&domain->iovad, iova);
+ } else {
+ add_unmap(domain, iova);
+ /*
+ * queue up the release of the unmap to save the 1/6th of the
+ * cpu used up by the iotlb flush operation...
+ */
+ if (list_size > high_watermark)
+ flush_unmaps();
+ }
}

static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -2266,6 +2374,10 @@
if (dmar_table_init())
return -ENODEV;

+ high_watermark = 250;
+ intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
+ debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
+ intel_iommu_debug, &high_watermark);
iommu_init_mempool();
dmar_init_reserved_ranges();

@@ -2281,6 +2393,7 @@
printk(KERN_INFO
"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

+ init_timer(&unmap_timer);
force_iommu = 1;
dma_ops = &intel_dma_ops;
return 0;
Index: linux-2.6.24-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/iova.h 2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/iova.h 2008-02-12 07:39:53.000000000 -0800
@@ -23,6 +23,8 @@
struct rb_node node;
unsigned long pfn_hi; /* IOMMU dish out addr hi */
unsigned long pfn_lo; /* IOMMU dish out addr lo */
+ struct list_head list;
+ void *dmar;
};

/* holds all the iova translations for a domain */
Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-12 11:36:07.000000000 -0800
@@ -822,6 +822,10 @@
than 32 bit addressing. The default is to look
for translation below 32 bit and if not available
then look in the higher range.
+ strict [Default Off]
+ With this option on every umap_signle operation will
+ result in a hardware IOTLB flush operation as opposed
+ to batching them for performance.

io_delay= [X86-32,X86-64] I/O delay method
0x80

2008-02-12 20:26:29

by Randy Dunlap

Subject: Re: [PATCH]intel-iommu batched iotlb flushes

> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> ===================================================================
> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-12 07:12:06.000000000 -0800
> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-12 11:36:07.000000000 -0800
> @@ -822,6 +822,10 @@
> than 32 bit addressing. The default is to look
> for translation below 32 bit and if not available
> then look in the higher range.
> + strict [Default Off]
> + With this option on every umap_signle operation will

umap_signle ??? (again)

> + result in a hardware IOTLB flush operation as opposed
> + to batching them for performance.
>
> io_delay= [X86-32,X86-64] I/O delay method
> 0x80


--
~Randy

2008-02-12 23:46:14

by David Miller

Subject: Re: [PATCH]intel-iommu batched iotlb flushes

From: mark gross <[email protected]>
Date: Tue, 12 Feb 2008 07:54:48 -0800

> Something could be done:
> we could enable drivers to have DMA-pools they manage that get mapped
> and are re-used.
>
> I would rather the DMA-pools be tied to PIDs; that way any bad behavior
> would be limited to the address space of the process using the device.
> I haven't thought about how hard this would be to do but it would be
> nice. I think this could be tricky.

Yes, this is a good idea especially for networking.

For transmit on 10GB links the IOMMU setup is near the top
of the profiles.

What a driver could do is determine the maximum number of
IOMMU pages it could need to map one maximally sized packet.
So then it allocates enough space for all such entries in
its TX ring.

This eliminates the range allocation from the transmit path.
All that's left is "remap DMA range X to scatterlist Y"

And yes it would be nice to have dma_map_skb() type interfaces
so that we don't walk into the IOMMU code N times per packet.
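
Roughly, the TX-ring idea above might look like the sketch below.
iommu_reserve_range() and iommu_remap_range() are hypothetical stand-ins
for the "remap DMA range X to scatterlist Y" primitive; no such
interface existed.

#include <linux/scatterlist.h>

#define TX_RING_SIZE	256
/* worst case: a 64KB TSO frame in 4KB IOMMU pages, plus one page of
 * slop for misalignment */
#define PAGES_PER_PKT	(65536 / 4096 + 1)

struct tx_slot {
	dma_addr_t iova;	/* IOMMU range reserved at init */
};

static struct tx_slot tx_ring[TX_RING_SIZE];

/* Init path: pay for range allocation once, so the transmit hot path
 * never touches the IOVA allocator. */
static void tx_ring_init(struct device *dev)
{
	int i;

	for (i = 0; i < TX_RING_SIZE; i++)
		tx_ring[i].iova = iommu_reserve_range(dev, PAGES_PER_PKT);
}

/* Hot path: just point the reserved range at this packet's pages. */
static dma_addr_t tx_map_packet(struct device *dev, int slot,
				struct scatterlist *sg, int nents)
{
	iommu_remap_range(dev, tx_ring[slot].iova, sg, nents);
	return tx_ring[slot].iova;
}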

2008-02-13 18:11:28

by mark gross

Subject: Re: [PATCH]intel-iommu batched iotlb flushes

On Tue, Feb 12, 2008 at 12:21:08PM -0800, Randy Dunlap wrote:
>> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
>> ===================================================================
>> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-12
>> 07:12:06.000000000 -0800
>> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-12
>> 11:36:07.000000000 -0800
>> @@ -822,6 +822,10 @@
>> than 32 bit addressing. The default is to look
>> for translation below 32 bit and if not available
>> then look in the higher range.
>> + strict [Default Off]
>> + With this option on every umap_signle operation will
>
> umap_signle ??? (again)
>
>> + result in a hardware IOTLB flush operation as opposed
>> + to batching them for performance.
>> io_delay= [X86-32,X86-64] I/O delay method
>> 0x80
>
>
> --
> ~Randy
>

Signed-off-by: <[email protected]>


Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c 2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c 2008-02-12 11:35:42.000000000 -0800
@@ -21,6 +21,7 @@

#include <linux/init.h>
#include <linux/bitmap.h>
+#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
@@ -30,6 +31,7 @@
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
+#include <linux/timer.h>
#include "iova.h"
#include "intel-iommu.h"
#include <asm/proto.h> /* force_iommu in this header in x86-64*/
@@ -50,11 +52,39 @@

#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)

+
+static void flush_unmaps_timeout(unsigned long data);
+
+static struct timer_list unmap_timer =
+ TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
+
+struct unmap_list {
+ struct list_head list;
+ struct dmar_domain *domain;
+ struct iova *iova;
+};
+
+static struct intel_iommu *g_iommus;
+/* bitmap for indexing intel_iommus */
+static unsigned long *g_iommus_to_flush;
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+static int high_watermark;
+
+static struct dentry *intel_iommu_debug, *debug;
+
+
static void domain_remove_dev_info(struct dmar_domain *domain);

static int dmar_disabled;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
+static int intel_iommu_strict;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
@@ -73,9 +103,13 @@
printk(KERN_INFO
"Intel-IOMMU: disable GFX device mapping\n");
} else if (!strncmp(str, "forcedac", 8)) {
- printk (KERN_INFO
+ printk(KERN_INFO
"Intel-IOMMU: Forcing DAC for PCI devices\n");
dmar_forcedac = 1;
+ } else if (!strncmp(str, "strict", 8)) {
+ printk(KERN_INFO
+ "Intel-IOMMU: disable batched IOTLB flush\n");
+ intel_iommu_strict = 1;
}

str += strcspn(str, ",");
@@ -965,17 +999,13 @@
set_bit(0, iommu->domain_ids);
return 0;
}
-
-static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
+ struct dmar_drhd_unit *drhd)
{
- struct intel_iommu *iommu;
int ret;
int map_size;
u32 ver;

- iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
- if (!iommu)
- return NULL;
iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
if (!iommu->reg) {
printk(KERN_ERR "IOMMU: can't map the region\n");
@@ -1396,7 +1426,7 @@
int index;

while (dev) {
- for (index = 0; index < cnt; index ++)
+ for (index = 0; index < cnt; index++)
if (dev == devices[index])
return 1;

@@ -1661,7 +1691,7 @@
struct dmar_rmrr_unit *rmrr;
struct pci_dev *pdev;
struct intel_iommu *iommu;
- int ret, unit = 0;
+ int nlongs, i, ret, unit = 0;

/*
* for each drhd
@@ -1672,7 +1702,30 @@
for_each_drhd_unit(drhd) {
if (drhd->ignored)
continue;
- iommu = alloc_iommu(drhd);
+ g_num_of_iommus++;
+ }
+
+ nlongs = BITS_TO_LONGS(g_num_of_iommus);
+ g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
+ if (!g_iommus_to_flush) {
+ printk(KERN_ERR "Intel-IOMMU: "
+ "Allocating bitmap array failed\n");
+ return -ENOMEM;
+ }
+
+ g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
+ if (!g_iommus) {
+ kfree(g_iommus_to_flush);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ i = 0;
+ for_each_drhd_unit(drhd) {
+ if (drhd->ignored)
+ continue;
+ iommu = alloc_iommu(&g_iommus[i], drhd);
+ i++;
if (!iommu) {
ret = -ENOMEM;
goto error;
@@ -1705,7 +1758,6 @@
* endfor
*/
for_each_rmrr_units(rmrr) {
- int i;
for (i = 0; i < rmrr->devices_cnt; i++) {
pdev = rmrr->devices[i];
/* some BIOS lists non-exist devices in DMAR table */
@@ -1909,6 +1961,54 @@
return 0;
}

+static void flush_unmaps(void)
+{
+ struct iova *node, *n;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&async_umap_flush_lock, flags);
+ timer_on = 0;
+
+ /*just flush them all*/
+ for (i = 0; i < g_num_of_iommus; i++) {
+ if (test_and_clear_bit(i, g_iommus_to_flush))
+ iommu_flush_iotlb_global(&g_iommus[i], 0);
+ }
+
+ list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
+ /* free iova */
+ list_del(&node->list);
+ __free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
+
+ }
+ list_size = 0;
+ spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+ flush_unmaps();
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&async_umap_flush_lock, flags);
+ iova->dmar = dom;
+ list_add(&iova->list, &unmaps_to_do);
+ set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+
+ if (!timer_on) {
+ unmap_timer.expires = jiffies + msecs_to_jiffies(10);
+ mod_timer(&unmap_timer, unmap_timer.expires);
+ timer_on = 1;
+ }
+ list_size++;
+ spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
size_t size, int dir)
{
@@ -1936,13 +2036,21 @@
dma_pte_clear_range(domain, start_addr, start_addr + size);
/* free page tables */
dma_pte_free_pagetable(domain, start_addr, start_addr + size);
-
- if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
- size >> PAGE_SHIFT_4K, 0))
- iommu_flush_write_buffer(domain->iommu);
-
- /* free iova */
- __free_iova(&domain->iovad, iova);
+ if (intel_iommu_strict) {
+ if (iommu_flush_iotlb_psi(domain->iommu,
+ domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+ iommu_flush_write_buffer(domain->iommu);
+ /* free iova */
+ __free_iova(&domain->iovad, iova);
+ } else {
+ add_unmap(domain, iova);
+ /*
+ * queue up the release of the unmap to save the 1/6th of the
+ * cpu used up by the iotlb flush operation...
+ */
+ if (list_size > high_watermark)
+ flush_unmaps();
+ }
}

static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -2266,6 +2374,10 @@
if (dmar_table_init())
return -ENODEV;

+ high_watermark = 250;
+ intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
+ debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
+ intel_iommu_debug, &high_watermark);
iommu_init_mempool();
dmar_init_reserved_ranges();

@@ -2281,6 +2393,7 @@
printk(KERN_INFO
"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

+ init_timer(&unmap_timer);
force_iommu = 1;
dma_ops = &intel_dma_ops;
return 0;
Index: linux-2.6.24-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/iova.h 2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/iova.h 2008-02-12 07:39:53.000000000 -0800
@@ -23,6 +23,8 @@
struct rb_node node;
unsigned long pfn_hi; /* IOMMU dish out addr hi */
unsigned long pfn_lo; /* IOMMU dish out addr lo */
+ struct list_head list;
+ void *dmar;
};

/* holds all the iova translations for a domain */
Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-12 11:36:07.000000000 -0800
@@ -822,6 +822,10 @@
than 32 bit addressing. The default is to look
for translation below 32 bit and if not available
then look in the higher range.
+ strict [Default Off]
+ With this option on every umap_single operation will
+ result in a hardware IOTLB flush operation as opposed
+ to batching them for performance.

io_delay= [X86-32,X86-64] I/O delay method
0x80

2008-02-13 18:25:30

by Randy Dunlap

Subject: Re: [PATCH]intel-iommu batched iotlb flushes

> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> ===================================================================
> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-12 07:12:06.000000000 -0800
> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-12 11:36:07.000000000 -0800
> @@ -822,6 +822,10 @@
> than 32 bit addressing. The default is to look
> for translation below 32 bit and if not available
> then look in the higher range.
> + strict [Default Off]
> + With this option on every umap_single operation will

so I'll ask one more time :(
Shouldn't this be "unmap_single" ?

> + result in a hardware IOTLB flush operation as opposed
> + to batching them for performance.
>
> io_delay= [X86-32,X86-64] I/O delay method
> 0x80


--
~Randy

2008-02-13 18:36:48

by mark gross

[permalink] [raw]
Subject: Re: [PATCH]intel-iommu batched iotlb flushes

On Tue, Feb 12, 2008 at 07:54:48AM -0800, David Miller wrote:
> > Something could be done:
> > we could enable drivers to have DMA-pools they manage that get mapped
> > and are re-used.
> >
> > I would rather the DMA-pools be tied to PIDs; that way any bad behavior
> > would be limited to the address space of the process using the device.
> > I haven't thought about how hard this would be to do but it would be
> > nice. I think this could be tricky.
>
> Yes, this is a good idea especially for networking.
>
> For transmit on 10GB links the IOMMU setup is near the top
> of the profiles.

true.

> What a driver could do is determine the maximum number of
> IOMMU pages it could need to map one maximally sized packet.
> So then it allocates enough space for all such entries in
> its TX ring.
>
> This eliminates the range allocation from the transmit path.
> All that's left is "remap DMA range X to scatterlist Y"
>
> And yes it would be nice to have dma_map_skb() type interfaces
> so that we don't walk into the IOMMU code N times per packet.


/me starts looking more closely at how this could be done...
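
Roughly, I'd expect the shape to be something like the sketch below.
Both helpers are hypothetical interfaces that would have to be added
(nothing like them exists today, so this won't link as-is):

typedef unsigned long dma_addr_t;	/* userspace stand-in */
struct scatterlist;			/* opaque for this sketch */

/* hypothetical: reserve an IOVA range big enough for 'pages' pages,
 * done once when the TX ring is set up */
extern dma_addr_t iommu_reserve_range(int pages);

/* hypothetical: retarget an already-reserved range at new backing
 * pages, the only per-packet IOMMU work left on the transmit path */
extern int iommu_rebind_range(dma_addr_t base, struct scatterlist *sg,
			      int nents);

#define MAX_PKT_PAGES	18	/* worst case for one maximally sized packet */

struct tx_slot {
	dma_addr_t iova_base;	/* reserved once, reused for ring lifetime */
};

static int tx_ring_init(struct tx_slot *ring, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		ring[i].iova_base = iommu_reserve_range(MAX_PKT_PAGES);
		if (!ring[i].iova_base)
			return -1;	/* sketch: no unwind shown */
	}
	return 0;
}

static int tx_ring_xmit(struct tx_slot *slot, struct scatterlist *sg,
			int nents)
{
	/* no range allocation here, just "remap DMA range X to
	 * scatterlist Y" as described above */
	return iommu_rebind_range(slot->iova_base, sg, nents);
}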

--mgross

2008-02-13 19:37:28

by mark gross

[permalink] [raw]
Subject: Re: [PATCH]intel-iommu batched iotlb flushes

On Wed, Feb 13, 2008 at 10:23:34AM -0800, Randy Dunlap wrote:
>> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
>> ===================================================================
>> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-12
>> 07:12:06.000000000 -0800
>> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-12
>> 11:36:07.000000000 -0800
>> @@ -822,6 +822,10 @@
>> than 32 bit addressing. The default is to look
>> for translation below 32 bit and if not available
>> then look in the higher range.
>> + strict [Default Off]
>> + With this option on every umap_single operation will
>
> so I'll ask one more time :(
> Shouldn't this be "unmap_single" ?
>
>> + result in a hardware IOTLB flush operation as opposed
>> + to batching them for performance.
>> io_delay= [X86-32,X86-64] I/O delay method
>> 0x80
>
>

Crap, I was focused on the "single" part.

Signed-off-by: <[email protected]>


Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c 2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c 2008-02-12 11:35:42.000000000 -0800
@@ -21,6 +21,7 @@

#include <linux/init.h>
#include <linux/bitmap.h>
+#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
@@ -30,6 +31,7 @@
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
+#include <linux/timer.h>
#include "iova.h"
#include "intel-iommu.h"
#include <asm/proto.h> /* force_iommu in this header in x86-64*/
@@ -50,11 +52,39 @@

#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)

+
+static void flush_unmaps_timeout(unsigned long data);
+
+static struct timer_list unmap_timer =
+ TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
+
+struct unmap_list {
+ struct list_head list;
+ struct dmar_domain *domain;
+ struct iova *iova;
+};
+
+static struct intel_iommu *g_iommus;
+/* bitmap for indexing intel_iommus */
+static unsigned long *g_iommus_to_flush;
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+static u32 high_watermark;
+
+static struct dentry *intel_iommu_debug, *debug;
+
+
static void domain_remove_dev_info(struct dmar_domain *domain);

static int dmar_disabled;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
+static int intel_iommu_strict;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
@@ -73,9 +103,13 @@
printk(KERN_INFO
"Intel-IOMMU: disable GFX device mapping\n");
} else if (!strncmp(str, "forcedac", 8)) {
- printk (KERN_INFO
+ printk(KERN_INFO
"Intel-IOMMU: Forcing DAC for PCI devices\n");
dmar_forcedac = 1;
+ } else if (!strncmp(str, "strict", 6)) {
+ printk(KERN_INFO
+ "Intel-IOMMU: disable batched IOTLB flush\n");
+ intel_iommu_strict = 1;
}

str += strcspn(str, ",");
@@ -965,17 +999,13 @@
set_bit(0, iommu->domain_ids);
return 0;
}
-
-static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
+ struct dmar_drhd_unit *drhd)
{
- struct intel_iommu *iommu;
int ret;
int map_size;
u32 ver;

- iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
- if (!iommu)
- return NULL;
iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
if (!iommu->reg) {
printk(KERN_ERR "IOMMU: can't map the region\n");
@@ -1396,7 +1426,7 @@
int index;

while (dev) {
- for (index = 0; index < cnt; index ++)
+ for (index = 0; index < cnt; index++)
if (dev == devices[index])
return 1;

@@ -1661,7 +1691,7 @@
struct dmar_rmrr_unit *rmrr;
struct pci_dev *pdev;
struct intel_iommu *iommu;
- int ret, unit = 0;
+ int nlongs, i, ret, unit = 0;

/*
* for each drhd
@@ -1672,7 +1702,30 @@
for_each_drhd_unit(drhd) {
if (drhd->ignored)
continue;
- iommu = alloc_iommu(drhd);
+ g_num_of_iommus++;
+ }
+
+ nlongs = BITS_TO_LONGS(g_num_of_iommus);
+ g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
+ if (!g_iommus_to_flush) {
+ printk(KERN_ERR "Intel-IOMMU: "
+ "Allocating bitmap array failed\n");
+ return -ENOMEM;
+ }
+
+ g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
+ if (!g_iommus) {
+ kfree(g_iommus_to_flush);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ i = 0;
+ for_each_drhd_unit(drhd) {
+ if (drhd->ignored)
+ continue;
+ iommu = alloc_iommu(&g_iommus[i], drhd);
+ i++;
if (!iommu) {
ret = -ENOMEM;
goto error;
@@ -1705,7 +1758,6 @@
* endfor
*/
for_each_rmrr_units(rmrr) {
- int i;
for (i = 0; i < rmrr->devices_cnt; i++) {
pdev = rmrr->devices[i];
/* some BIOS lists non-exist devices in DMAR table */
@@ -1909,6 +1961,54 @@
return 0;
}

+static void flush_unmaps(void)
+{
+ struct iova *node, *n;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&async_umap_flush_lock, flags);
+ timer_on = 0;
+
+ /* just flush them all */
+ for (i = 0; i < g_num_of_iommus; i++) {
+ if (test_and_clear_bit(i, g_iommus_to_flush))
+ iommu_flush_iotlb_global(&g_iommus[i], 0);
+ }
+
+ list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
+ /* free iova */
+ list_del(&node->list);
+ __free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
+
+ }
+ list_size = 0;
+ spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+ flush_unmaps();
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&async_umap_flush_lock, flags);
+ iova->dmar = dom;
+ list_add(&iova->list, &unmaps_to_do);
+ set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+
+ if (!timer_on) {
+ unmap_timer.expires = jiffies + msecs_to_jiffies(10);
+ mod_timer(&unmap_timer, unmap_timer.expires);
+ timer_on = 1;
+ }
+ list_size++;
+ spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
size_t size, int dir)
{
@@ -1936,13 +2036,21 @@
dma_pte_clear_range(domain, start_addr, start_addr + size);
/* free page tables */
dma_pte_free_pagetable(domain, start_addr, start_addr + size);
-
- if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
- size >> PAGE_SHIFT_4K, 0))
- iommu_flush_write_buffer(domain->iommu);
-
- /* free iova */
- __free_iova(&domain->iovad, iova);
+ if (intel_iommu_strict) {
+ if (iommu_flush_iotlb_psi(domain->iommu,
+ domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+ iommu_flush_write_buffer(domain->iommu);
+ /* free iova */
+ __free_iova(&domain->iovad, iova);
+ } else {
+ add_unmap(domain, iova);
+ /*
+ * Queue up the release of this unmap to amortize the roughly 1/6th
+ * of CPU time otherwise spent doing a per-unmap IOTLB flush.
+ */
+ if (list_size > high_watermark)
+ flush_unmaps();
+ }
}

static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -2266,6 +2374,10 @@
if (dmar_table_init())
return -ENODEV;

+ high_watermark = 250;
+ intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
+ debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
+ intel_iommu_debug, &high_watermark);
iommu_init_mempool();
dmar_init_reserved_ranges();

@@ -2281,6 +2393,7 @@
printk(KERN_INFO
"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

+ init_timer(&unmap_timer);
force_iommu = 1;
dma_ops = &intel_dma_ops;
return 0;
Index: linux-2.6.24-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/iova.h 2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/iova.h 2008-02-12 07:39:53.000000000 -0800
@@ -23,6 +23,8 @@
struct rb_node node;
unsigned long pfn_hi; /* IOMMU dish out addr hi */
unsigned long pfn_lo; /* IOMMU dish out addr lo */
+ struct list_head list;
+ void *dmar;
};

/* holds all the iova translations for a domain */
Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-13 11:17:22.000000000 -0800
@@ -822,6 +822,10 @@
than 32 bit addressing. The default is to look
for translation below 32 bit and if not available
then look in the higher range.
+ strict [Default Off]
+ With this option on, every unmap_single operation will
+ result in a hardware IOTLB flush operation as opposed
+ to batching them for performance.

io_delay= [X86-32,X86-64] I/O delay method
0x80
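
With the corrected patch applied the batched behavior is the default;
the strict per-unmap flushing documented above is requested on the
kernel command line alongside the existing intel_iommu options, e.g.:

	intel_iommu=strict

and it combines with the other flags in the usual comma-separated
form (intel_iommu=strict,forcedac).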