2008-10-01 16:58:10

by Fenghua Yu

[permalink] [raw]
Subject: [PATCH 2/2]Add Variable Page Size and IA64 Support in Intel IOMMU: IA64 Specific Part




Signed-off-by: Fenghua Yu <[email protected]>
Signed-off-by: Tony Luck <[email protected]>

---

Kconfig | 17 ++++
Makefile | 1
configs/generic_defconfig | 2
configs/tiger_defconfig | 2
dig/Makefile | 2
dig/dig_vtd_iommu.c | 59 +++++++++++++++++
dig/machvec_vtd.c | 3
include/asm/cacheflush.h | 2
include/asm/device.h | 3
include/asm/dma-mapping.h | 50 ++++++++++++++
include/asm/iommu.h | 11 +++
include/asm/machvec.h | 2
include/asm/machvec_dig_vtd.h | 38 +++++++++++
include/asm/machvec_init.h | 1
include/asm/pci.h | 3
include/asm/swiotlb.h | 56 ++++++++++++++++
kernel/Makefile | 4 +
kernel/acpi.c | 17 ++++
kernel/msi_ia64.c | 80 +++++++++++++++++++++++
kernel/pci-dma.c | 143 ++++++++++++++++++++++++++++++++++++++++++
kernel/pci-swiotlb.c | 46 +++++++++++++
kernel/setup.c | 42 ++++++++----
lib/flush.S | 55 ++++++++++++++++
23 files changed, 626 insertions(+), 13 deletions(-)

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 48e496f..ae965aa 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -125,6 +125,7 @@ config IA64_GENERIC
select NUMA
select ACPI_NUMA
select SWIOTLB
+ select PCI_MSI
help
This selects the system type of your hardware. A "generic" kernel
will run on any supported IA-64 system. However, if you configure
@@ -132,6 +133,7 @@ config IA64_GENERIC

generic For any supported IA-64 system
DIG-compliant For DIG ("Developer's Interface Guide") compliant systems
+ DIG+Intel+IOMMU For DIG systems with Intel IOMMU
HP-zx1/sx1000 For HP systems
HP-zx1/sx1000+swiotlb For HP systems with (broken) DMA-constrained devices.
SGI-SN2 For SGI Altix systems
@@ -144,6 +146,11 @@ config IA64_DIG
bool "DIG-compliant"
select SWIOTLB

+config IA64_DIG_VTD
+ bool "DIG+Intel+IOMMU"
+ select DMAR
+ select PCI_MSI
+
config IA64_HP_ZX1
bool "HP-zx1/sx1000"
help
@@ -589,6 +596,16 @@ source "drivers/pci/hotplug/Kconfig"

source "drivers/pcmcia/Kconfig"

+config DMAR
+ bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
+ depends on IA64_GENERIC && ACPI && EXPERIMENTAL
+ help
+ DMA remapping (DMAR) devices support enables independent address
+ translations for Direct Memory Access (DMA) from devices.
+ These DMA remapping devices are reported via ACPI tables
+ and include PCI device scope covered by these DMA
+ remapping devices.
+
endmenu

endif
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile
index 905d25b..04cecee 100644
--- a/arch/ia64/Makefile
+++ b/arch/ia64/Makefile
@@ -53,6 +53,7 @@ libs-y += arch/ia64/lib/
core-y += arch/ia64/kernel/ arch/ia64/mm/
core-$(CONFIG_IA32_SUPPORT) += arch/ia64/ia32/
core-$(CONFIG_IA64_DIG) += arch/ia64/dig/
+core-$(CONFIG_IA64_DIG_VTD) += arch/ia64/dig/
core-$(CONFIG_IA64_GENERIC) += arch/ia64/dig/
core-$(CONFIG_IA64_HP_ZX1) += arch/ia64/dig/
core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/
diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig
index 9f48397..e05f9e1 100644
--- a/arch/ia64/configs/generic_defconfig
+++ b/arch/ia64/configs/generic_defconfig
@@ -233,6 +233,8 @@ CONFIG_DMIID=y
CONFIG_BINFMT_ELF=y
CONFIG_BINFMT_MISC=m

+# CONFIG_DMAR is not set
+
#
# Power management and ACPI
#
diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig
index 797acf9..c522edf 100644
--- a/arch/ia64/configs/tiger_defconfig
+++ b/arch/ia64/configs/tiger_defconfig
@@ -172,6 +172,8 @@ CONFIG_DMIID=y
CONFIG_BINFMT_ELF=y
CONFIG_BINFMT_MISC=m

+# CONFIG_DMAR is not set
+
#
# Power management and ACPI
#
diff --git a/arch/ia64/dig/Makefile b/arch/ia64/dig/Makefile
index 971cd78..5b63eec 100644
--- a/arch/ia64/dig/Makefile
+++ b/arch/ia64/dig/Makefile
@@ -7,3 +7,5 @@

obj-y := setup.o
obj-$(CONFIG_IA64_GENERIC) += machvec.o
+obj-$(CONFIG_DMAR) += machvec_vtd.o dig_vtd_iommu.o
+obj-$(CONFIG_IA64_DIG_VTD) += dig_vtd_iommu.o
diff --git a/arch/ia64/dig/dig_vtd_iommu.c b/arch/ia64/dig/dig_vtd_iommu.c
new file mode 100644
index 0000000..3d79f76
--- /dev/null
+++ b/arch/ia64/dig/dig_vtd_iommu.c
@@ -0,0 +1,59 @@
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/intel-iommu.h>
+
+void *
+vtd_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t flags)
+{
+ return intel_alloc_coherent(dev, size, dma_handle, flags);
+}
+EXPORT_SYMBOL_GPL(vtd_alloc_coherent);
+
+void
+vtd_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_handle)
+{
+ intel_free_coherent(dev, size, vaddr, dma_handle);
+}
+EXPORT_SYMBOL_GPL(vtd_free_coherent);
+
+dma_addr_t
+vtd_map_single_attrs(struct device *dev, unsigned long addr, size_t size,
+ int dir, struct dma_attrs *attrs)
+{
+ return intel_map_single(dev, addr, size, dir);
+}
+EXPORT_SYMBOL_GPL(vtd_map_single_attrs);
+
+void
+vtd_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t size,
+ int dir, struct dma_attrs *attrs)
+{
+ intel_unmap_single(dev, iova, size, dir);
+}
+EXPORT_SYMBOL_GPL(vtd_unmap_single_attrs);
+
+int
+vtd_map_sg_attrs(struct device *dev, struct scatterlist *sglist, int nents,
+ int dir, struct dma_attrs *attrs)
+{
+ return intel_map_sg(dev, sglist, nents, dir);
+}
+EXPORT_SYMBOL_GPL(vtd_map_sg_attrs);
+
+void
+vtd_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
+ int nents, int dir, struct dma_attrs *attrs)
+{
+ intel_unmap_sg(dev, sglist, nents, dir);
+}
+EXPORT_SYMBOL_GPL(vtd_unmap_sg_attrs);
+
+int
+vtd_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vtd_dma_mapping_error);
diff --git a/arch/ia64/dig/machvec_vtd.c b/arch/ia64/dig/machvec_vtd.c
new file mode 100644
index 0000000..7cd3eb4
--- /dev/null
+++ b/arch/ia64/dig/machvec_vtd.c
@@ -0,0 +1,3 @@
+#define MACHVEC_PLATFORM_NAME dig_vtd
+#define MACHVEC_PLATFORM_HEADER <asm/machvec_dig_vtd.h>
+#include <asm/machvec_init.h>
diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h
index afcfbda..c8ce271 100644
--- a/arch/ia64/include/asm/cacheflush.h
+++ b/arch/ia64/include/asm/cacheflush.h
@@ -34,6 +34,8 @@ do { \
#define flush_dcache_mmap_unlock(mapping) do { } while (0)

extern void flush_icache_range (unsigned long start, unsigned long end);
+extern void clflush_cache_range(void *addr, int size);
+

#define flush_icache_user_range(vma, page, user_addr, len) \
do { \
diff --git a/arch/ia64/include/asm/device.h b/arch/ia64/include/asm/device.h
index 3db6daf..41ab85d 100644
--- a/arch/ia64/include/asm/device.h
+++ b/arch/ia64/include/asm/device.h
@@ -10,6 +10,9 @@ struct dev_archdata {
#ifdef CONFIG_ACPI
void *acpi_handle;
#endif
+#ifdef CONFIG_DMAR
+ void *iommu; /* hook for IOMMU specific extension */
+#endif
};

#endif /* _ASM_IA64_DEVICE_H */
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index 06ff1ba..bbab7e2 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -7,6 +7,49 @@
*/
#include <asm/machvec.h>
#include <linux/scatterlist.h>
+#include <asm/swiotlb.h>
+
+struct dma_mapping_ops {
+ int (*mapping_error)(struct device *dev,
+ dma_addr_t dma_addr);
+ void* (*alloc_coherent)(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp);
+ void (*free_coherent)(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_handle);
+ dma_addr_t (*map_single)(struct device *hwdev, unsigned long ptr,
+ size_t size, int direction);
+ void (*unmap_single)(struct device *dev, dma_addr_t addr,
+ size_t size, int direction);
+ void (*sync_single_for_cpu)(struct device *hwdev,
+ dma_addr_t dma_handle, size_t size,
+ int direction);
+ void (*sync_single_for_device)(struct device *hwdev,
+ dma_addr_t dma_handle, size_t size,
+ int direction);
+ void (*sync_single_range_for_cpu)(struct device *hwdev,
+ dma_addr_t dma_handle, unsigned long offset,
+ size_t size, int direction);
+ void (*sync_single_range_for_device)(struct device *hwdev,
+ dma_addr_t dma_handle, unsigned long offset,
+ size_t size, int direction);
+ void (*sync_sg_for_cpu)(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int direction);
+ void (*sync_sg_for_device)(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int direction);
+ int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction);
+ void (*unmap_sg)(struct device *hwdev,
+ struct scatterlist *sg, int nents,
+ int direction);
+ int (*dma_supported_op)(struct device *hwdev, u64 mask);
+ int is_phys;
+};
+
+extern struct dma_mapping_ops *dma_ops;
+extern struct ia64_machine_vector ia64_mv;
+extern void set_iommu_machvec(void);

#define dma_alloc_coherent(dev, size, handle, gfp) \
platform_dma_alloc_coherent(dev, size, handle, (gfp) | GFP_DMA)
@@ -96,4 +139,11 @@ dma_cache_sync (struct device *dev, void *vaddr, size_t size,

#define dma_is_consistent(d, h) (1) /* all we do is coherent memory... */

+static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
+{
+ return dma_ops;
+}
+
+
+
#endif /* _ASM_IA64_DMA_MAPPING_H */
diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h
new file mode 100644
index 0000000..7707d2f
--- /dev/null
+++ b/arch/ia64/include/asm/iommu.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_IA64_IOMMU_H
+#define _ASM_IA64_IOMMU_H 1
+
+extern void pci_iommu_shutdown(void);
+extern void no_iommu_init(void);
+extern int force_iommu, no_iommu;
+extern int iommu_detected;
+extern void iommu_dma_init(void);
+extern void machvec_init(const char *name);
+
+#endif
diff --git a/arch/ia64/include/asm/machvec.h b/arch/ia64/include/asm/machvec.h
index 2b850cc..ce9b1d0 100644
--- a/arch/ia64/include/asm/machvec.h
+++ b/arch/ia64/include/asm/machvec.h
@@ -120,6 +120,8 @@ extern void machvec_tlb_migrate_finish (struct mm_struct *);
# include <asm/machvec_hpsim.h>
# elif defined (CONFIG_IA64_DIG)
# include <asm/machvec_dig.h>
+# elif defined(CONFIG_IA64_DIG_VTD)
+# include <asm/machvec_dig_vtd.h>
# elif defined (CONFIG_IA64_HP_ZX1)
# include <asm/machvec_hpzx1.h>
# elif defined (CONFIG_IA64_HP_ZX1_SWIOTLB)
diff --git a/arch/ia64/include/asm/machvec_dig_vtd.h b/arch/ia64/include/asm/machvec_dig_vtd.h
new file mode 100644
index 0000000..3400b56
--- /dev/null
+++ b/arch/ia64/include/asm/machvec_dig_vtd.h
@@ -0,0 +1,38 @@
+#ifndef _ASM_IA64_MACHVEC_DIG_VTD_h
+#define _ASM_IA64_MACHVEC_DIG_VTD_h
+
+extern ia64_mv_setup_t dig_setup;
+extern ia64_mv_dma_alloc_coherent vtd_alloc_coherent;
+extern ia64_mv_dma_free_coherent vtd_free_coherent;
+extern ia64_mv_dma_map_single_attrs vtd_map_single_attrs;
+extern ia64_mv_dma_unmap_single_attrs vtd_unmap_single_attrs;
+extern ia64_mv_dma_map_sg_attrs vtd_map_sg_attrs;
+extern ia64_mv_dma_unmap_sg_attrs vtd_unmap_sg_attrs;
+extern ia64_mv_dma_supported iommu_dma_supported;
+extern ia64_mv_dma_mapping_error vtd_dma_mapping_error;
+extern ia64_mv_dma_init pci_iommu_alloc;
+
+/*
+ * This stuff has dual use!
+ *
+ * For a generic kernel, the macros are used to initialize the
+ * platform's machvec structure. When compiling a non-generic kernel,
+ * the macros are used directly.
+ */
+#define platform_name "dig_vtd"
+#define platform_setup dig_setup
+#define platform_dma_init pci_iommu_alloc
+#define platform_dma_alloc_coherent vtd_alloc_coherent
+#define platform_dma_free_coherent vtd_free_coherent
+#define platform_dma_map_single_attrs vtd_map_single_attrs
+#define platform_dma_unmap_single_attrs vtd_unmap_single_attrs
+#define platform_dma_map_sg_attrs vtd_map_sg_attrs
+#define platform_dma_unmap_sg_attrs vtd_unmap_sg_attrs
+#define platform_dma_sync_single_for_cpu machvec_dma_sync_single
+#define platform_dma_sync_sg_for_cpu machvec_dma_sync_sg
+#define platform_dma_sync_single_for_device machvec_dma_sync_single
+#define platform_dma_sync_sg_for_device machvec_dma_sync_sg
+#define platform_dma_supported iommu_dma_supported
+#define platform_dma_mapping_error vtd_dma_mapping_error
+
+#endif /* _ASM_IA64_MACHVEC_DIG_VTD_h */
diff --git a/arch/ia64/include/asm/machvec_init.h b/arch/ia64/include/asm/machvec_init.h
index 7f21249..ef964b2 100644
--- a/arch/ia64/include/asm/machvec_init.h
+++ b/arch/ia64/include/asm/machvec_init.h
@@ -1,3 +1,4 @@
+#include <asm/iommu.h>
#include <asm/machvec.h>

extern ia64_mv_send_ipi_t ia64_send_ipi;
diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h
index 0149097..645ef06 100644
--- a/arch/ia64/include/asm/pci.h
+++ b/arch/ia64/include/asm/pci.h
@@ -164,4 +164,7 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
return channel ? isa_irq_to_vector(15) : isa_irq_to_vector(14);
}

+#ifdef CONFIG_DMAR
+extern void pci_iommu_alloc(void);
+#endif
#endif /* _ASM_IA64_PCI_H */
diff --git a/arch/ia64/include/asm/swiotlb.h b/arch/ia64/include/asm/swiotlb.h
new file mode 100644
index 0000000..fb79423
--- /dev/null
+++ b/arch/ia64/include/asm/swiotlb.h
@@ -0,0 +1,56 @@
+#ifndef ASM_IA64__SWIOTLB_H
+#define ASM_IA64__SWIOTLB_H
+
+#include <linux/dma-mapping.h>
+
+/* SWIOTLB interface */
+
+extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr,
+ size_t size, int dir);
+extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags);
+extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, int dir);
+extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
+ dma_addr_t dev_addr,
+ size_t size, int dir);
+extern void swiotlb_sync_single_for_device(struct device *hwdev,
+ dma_addr_t dev_addr,
+ size_t size, int dir);
+extern void swiotlb_sync_single_range_for_cpu(struct device *hwdev,
+ dma_addr_t dev_addr,
+ unsigned long offset,
+ size_t size, int dir);
+extern void swiotlb_sync_single_range_for_device(struct device *hwdev,
+ dma_addr_t dev_addr,
+ unsigned long offset,
+ size_t size, int dir);
+extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int dir);
+extern void swiotlb_sync_sg_for_device(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int dir);
+extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction);
+extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction);
+extern int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
+extern void swiotlb_free_coherent(struct device *hwdev, size_t size,
+ void *vaddr, dma_addr_t dma_handle);
+extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
+extern void swiotlb_init(void);
+
+extern int swiotlb_force;
+
+#ifdef CONFIG_SWIOTLB
+extern int swiotlb;
+extern void pci_swiotlb_init(void);
+#else
+#define swiotlb 0
+static inline void pci_swiotlb_init(void)
+{
+}
+#endif
+
+#endif /* ASM_IA64__SWIOTLB_H */
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index 87fea11..af0e750 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -42,6 +42,10 @@ obj-$(CONFIG_IA64_ESI) += esi.o
ifneq ($(CONFIG_IA64_ESI),)
obj-y += esi_stub.o # must be in kernel proper
endif
+obj-$(CONFIG_DMAR) += pci-dma.o
+ifeq ($(CONFIG_DMAR), y)
+obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
+endif

# The gate DSO image is built using a special linker script.
targets += gate.so gate-syms.o
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 5d1eb7e..8cc2f8a 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -91,6 +91,9 @@ acpi_get_sysname(void)
struct acpi_table_rsdp *rsdp;
struct acpi_table_xsdt *xsdt;
struct acpi_table_header *hdr;
+#ifdef CONFIG_DMAR
+ u64 i, nentries;
+#endif

rsdp_phys = acpi_find_rsdp();
if (!rsdp_phys) {
@@ -123,6 +126,18 @@ acpi_get_sysname(void)
return "sn2";
}

+#ifdef CONFIG_DMAR
+ /* Look for Intel IOMMU */
+ nentries = (hdr->length - sizeof(*hdr)) /
+ sizeof(xsdt->table_offset_entry[0]);
+ for (i = 0; i < nentries; i++) {
+ hdr = __va(xsdt->table_offset_entry[i]);
+ if (strncmp(hdr->signature, ACPI_SIG_DMAR,
+ sizeof(ACPI_SIG_DMAR) - 1) == 0)
+ return "dig_vtd";
+ }
+#endif
+
return "dig";
#else
# if defined (CONFIG_IA64_HP_SIM)
@@ -137,6 +152,8 @@ acpi_get_sysname(void)
return "uv";
# elif defined (CONFIG_IA64_DIG)
return "dig";
+# elif defined(CONFIG_IA64_DIG_VTD)
+ return "dig_vtd";
# else
# error Unknown platform. Fix acpi.c.
# endif
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 60c6ef6..702a09c 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -5,6 +5,7 @@
#include <linux/pci.h>
#include <linux/irq.h>
#include <linux/msi.h>
+#include <linux/dmar.h>
#include <asm/smp.h>

/*
@@ -162,3 +163,82 @@ void arch_teardown_msi_irq(unsigned int irq)

return ia64_teardown_msi_irq(irq);
}
+
+#ifdef CONFIG_DMAR
+#ifdef CONFIG_SMP
+static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+{
+ struct irq_cfg *cfg = irq_cfg + irq;
+ struct msi_msg msg;
+ int cpu = first_cpu(mask);
+
+
+ if (!cpu_online(cpu))
+ return;
+
+ if (irq_prepare_move(irq, cpu))
+ return;
+
+ dmar_msi_read(irq, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DESTID_MASK;
+ msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
+
+ dmar_msi_write(irq, &msg);
+ irq_desc[irq].affinity = mask;
+}
+#endif /* CONFIG_SMP */
+
+struct irq_chip dmar_msi_type = {
+ .name = "DMAR_MSI",
+ .unmask = dmar_msi_unmask,
+ .mask = dmar_msi_mask,
+ .ack = ia64_ack_msi_irq,
+#ifdef CONFIG_SMP
+ .set_affinity = dmar_msi_set_affinity,
+#endif
+ .retrigger = ia64_msi_retrigger_irq,
+};
+
+static int
+msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+{
+ struct irq_cfg *cfg = irq_cfg + irq;
+ unsigned dest;
+ cpumask_t mask;
+
+ cpus_and(mask, irq_to_domain(irq), cpu_online_map);
+ dest = cpu_physical_id(first_cpu(mask));
+
+ msg->address_hi = 0;
+ msg->address_lo =
+ MSI_ADDR_HEADER |
+ MSI_ADDR_DESTMODE_PHYS |
+ MSI_ADDR_REDIRECTION_CPU |
+ MSI_ADDR_DESTID_CPU(dest);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ MSI_DATA_DELIVERY_FIXED |
+ MSI_DATA_VECTOR(cfg->vector);
+ return 0;
+}
+
+int arch_setup_dmar_msi(unsigned int irq)
+{
+ int ret;
+ struct msi_msg msg;
+
+ ret = msi_compose_msg(NULL, irq, &msg);
+ if (ret < 0)
+ return ret;
+ dmar_msi_write(irq, &msg);
+ set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
+ "edge");
+ return 0;
+}
+#endif /* CONFIG_DMAR */
+
diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c
new file mode 100644
index 0000000..861ca5c
--- /dev/null
+++ b/arch/ia64/kernel/pci-dma.c
@@ -0,0 +1,143 @@
+/*
+ * Dynamic DMA mapping support.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/dmar.h>
+#include <asm/iommu.h>
+#include <asm/machvec.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/machvec.h>
+#include <asm/system.h>
+
+#ifdef CONFIG_DMAR
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#include <asm/page.h>
+#include <asm/iommu.h>
+
+dma_addr_t bad_dma_address __read_mostly;
+EXPORT_SYMBOL(bad_dma_address);
+
+static int iommu_sac_force __read_mostly;
+
+int no_iommu __read_mostly;
+#ifdef CONFIG_IOMMU_DEBUG
+int force_iommu __read_mostly = 1;
+#else
+int force_iommu __read_mostly;
+#endif
+
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly;
+
+/* Dummy device used for NULL arguments (normally ISA). Better would
+ be probably a smaller DMA mask, but this is bug-to-bug compatible
+ to i386. */
+struct device fallback_dev = {
+ .bus_id = "fallback device",
+ .coherent_dma_mask = DMA_32BIT_MASK,
+ .dma_mask = &fallback_dev.coherent_dma_mask,
+};
+
+static int forbid_dac __read_mostly;
+
+void __init pci_iommu_alloc(void)
+{
+ /*
+ * The order of these functions is important for
+ * fall-back/fail-over reasons
+ */
+ detect_intel_iommu();
+
+#ifdef CONFIG_SWIOTLB
+ pci_swiotlb_init();
+#endif
+}
+
+static int __init pci_iommu_init(void)
+{
+ if (iommu_detected)
+ intel_iommu_init();
+
+ return 0;
+}
+
+void pci_iommu_shutdown(void)
+{
+ return;
+}
+
+void __init
+iommu_dma_init(void)
+{
+ return;
+}
+
+#ifdef CONFIG_PCI
+/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
+
+static __devinit void via_no_dac(struct pci_dev *dev)
+{
+ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+ printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
+ forbid_dac = 1;
+ }
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+#endif
+/* Must execute after PCI subsystem */
+fs_initcall(pci_iommu_init);
+
+struct dma_mapping_ops *dma_ops;
+EXPORT_SYMBOL(dma_ops);
+
+int iommu_dma_supported(struct device *dev, u64 mask)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
+#ifdef CONFIG_PCI
+ if (mask > 0xffffffff && forbid_dac > 0) {
+ dev_info(dev, "PCI: Disallowing DAC for device\n");
+ return 0;
+ }
+#endif
+
+ if (ops->dma_supported_op)
+ return ops->dma_supported_op(dev, mask);
+
+ /* Copied from i386. Doesn't make much sense, because it will
+ only work for pci_alloc_coherent.
+ The caller just has to use GFP_DMA in this case. */
+ if (mask < DMA_24BIT_MASK)
+ return 0;
+
+ /* Tell the device to use SAC when IOMMU force is on. This
+ allows the driver to use cheaper accesses in some cases.
+
+ Problem with this is that if we overflow the IOMMU area and
+ return DAC as fallback address the device may not handle it
+ correctly.
+
+ As a special case some controllers have a 39bit address
+ mode that is as efficient as 32bit (aic79xx). Don't force
+ SAC for these. Assume all masks <= 40 bits are of this
+ type. Normally this doesn't make any difference, but gives
+ more gentle handling of IOMMU overflow. */
+ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
+ dev_info(dev, "Force SAC with mask %lx\n", mask);
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(iommu_dma_supported);
+
+#endif
diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
new file mode 100644
index 0000000..16c5051
--- /dev/null
+++ b/arch/ia64/kernel/pci-swiotlb.c
@@ -0,0 +1,46 @@
+/* Glue code to lib/swiotlb.c */
+
+#include <linux/pci.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+#include <asm/iommu.h>
+#include <asm/machvec.h>
+
+int swiotlb __read_mostly;
+EXPORT_SYMBOL(swiotlb);
+
+struct dma_mapping_ops swiotlb_dma_ops = {
+ .mapping_error = swiotlb_dma_mapping_error,
+ .alloc_coherent = swiotlb_alloc_coherent,
+ .free_coherent = swiotlb_free_coherent,
+ .map_single = swiotlb_map_single,
+ .unmap_single = swiotlb_unmap_single,
+ .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+ .sync_single_for_device = swiotlb_sync_single_for_device,
+ .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
+ .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
+ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+ .sync_sg_for_device = swiotlb_sync_sg_for_device,
+ .map_sg = swiotlb_map_sg,
+ .unmap_sg = swiotlb_unmap_sg,
+ .dma_supported_op = swiotlb_dma_supported,
+};
+
+void __init pci_swiotlb_init(void)
+{
+ if (!iommu_detected) {
+#ifdef CONFIG_IA64_GENERIC
+ swiotlb = 1;
+ printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
+ machvec_init("dig");
+ swiotlb_init();
+ dma_ops = &swiotlb_dma_ops;
+#else
+ panic("Unable to find Intel IOMMU");
+#endif
+ }
+}
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index c27d5b2..218c550 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -116,6 +116,13 @@ unsigned int num_io_spaces;
*/
#define I_CACHE_STRIDE_SHIFT 5 /* Safest way to go: 32 bytes by 32 bytes */
unsigned long ia64_i_cache_stride_shift = ~0;
+/*
+ * "clflush_cache_range()" needs to know what processor dependent stride size to
+ * use when it flushes cache lines including both d-cache and i-cache.
+ */
+/* Safest way to go: 32 bytes by 32 bytes */
+#define CACHE_STRIDE_SHIFT 5
+unsigned long ia64_cache_stride_shift = ~0;

/*
* The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1). This
@@ -845,13 +852,14 @@ setup_per_cpu_areas (void)
}

/*
- * Calculate the max. cache line size.
+ * Do the following calculations:
*
- * In addition, the minimum of the i-cache stride sizes is calculated for
- * "flush_icache_range()".
+ * 1. the max. cache line size.
+ * 2. the minimum of the i-cache stride sizes for "flush_icache_range()".
+ * 3. the minimum of the cache stride sizes for "clflush_cache_range()".
*/
static void __cpuinit
-get_max_cacheline_size (void)
+get_cache_info(void)
{
unsigned long line_size, max = 1;
u64 l, levels, unique_caches;
@@ -865,12 +873,14 @@ get_max_cacheline_size (void)
max = SMP_CACHE_BYTES;
/* Safest setup for "flush_icache_range()" */
ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT;
+ /* Safest setup for "clflush_cache_range()" */
+ ia64_cache_stride_shift = CACHE_STRIDE_SHIFT;
goto out;
}

for (l = 0; l < levels; ++l) {
- status = ia64_pal_cache_config_info(l, /* cache_type (data_or_unified)= */ 2,
- &cci);
+ /* cache_type (data_or_unified)=2 */
+ status = ia64_pal_cache_config_info(l, 2, &cci);
if (status != 0) {
printk(KERN_ERR
"%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n",
@@ -878,15 +888,21 @@ get_max_cacheline_size (void)
max = SMP_CACHE_BYTES;
/* The safest setup for "flush_icache_range()" */
cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
+ /* The safest setup for "clflush_cache_range()" */
+ ia64_cache_stride_shift = CACHE_STRIDE_SHIFT;
cci.pcci_unified = 1;
+ } else {
+ if (cci.pcci_stride < ia64_cache_stride_shift)
+ ia64_cache_stride_shift = cci.pcci_stride;
+
+ line_size = 1 << cci.pcci_line_size;
+ if (line_size > max)
+ max = line_size;
}
- line_size = 1 << cci.pcci_line_size;
- if (line_size > max)
- max = line_size;
+
if (!cci.pcci_unified) {
- status = ia64_pal_cache_config_info(l,
- /* cache_type (instruction)= */ 1,
- &cci);
+ /* cache_type (instruction)=1*/
+ status = ia64_pal_cache_config_info(l, 1, &cci);
if (status != 0) {
printk(KERN_ERR
"%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n",
@@ -940,7 +956,7 @@ cpu_init (void)
}
#endif

- get_max_cacheline_size();
+ get_cache_info();

/*
* We can't pass "local_cpu_data" to identify_cpu() because we haven't called
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S
index 2a0d27f..2c1a948 100644
--- a/arch/ia64/lib/flush.S
+++ b/arch/ia64/lib/flush.S
@@ -60,3 +60,58 @@ GLOBAL_ENTRY(flush_icache_range)
mov ar.lc=r3 // restore ar.lc
br.ret.sptk.many rp
END(flush_icache_range)
+
+ /*
+ * clflush_cache_range(start,size)
+ *
+ * Flush cache lines from start to start+size-1.
+ *
+ * Must deal with range from start to start+size-1 but nothing else
+ * (need to be careful not to touch addresses that may be
+ * unmapped).
+ *
+ * Note: "in0" and "in1" are preserved for debugging purposes.
+ */
+ .section .kprobes.text,"ax"
+GLOBAL_ENTRY(clflush_cache_range)
+
+ .prologue
+ alloc r2=ar.pfs,2,0,0,0
+ movl r3=ia64_cache_stride_shift
+ mov r21=1
+ add r22=in1,in0
+ ;;
+ ld8 r20=[r3] // r20: stride shift
+ sub r22=r22,r0,1 // last byte address
+ ;;
+ shr.u r23=in0,r20 // start / (stride size)
+ shr.u r22=r22,r20 // (last byte address) / (stride size)
+ shl r21=r21,r20 // r21: stride size of the i-cache(s)
+ ;;
+ sub r8=r22,r23 // number of strides - 1
+ shl r24=r23,r20 // r24: addresses for "fc" =
+ // "start" rounded down to stride
+ // boundary
+ .save ar.lc,r3
+ mov r3=ar.lc // save ar.lc
+ ;;
+
+ .body
+ mov ar.lc=r8
+ ;;
+ /*
+ * 32 byte aligned loop, even number of (actually 2) bundles
+ */
+.Loop_fc:
+ fc r24 // issuable on M0 only
+ add r24=r21,r24 // we flush "stride size" bytes per iteration
+ nop.i 0
+ br.cloop.sptk.few .Loop_fc
+ ;;
+ sync.i
+ ;;
+ srlz.i
+ ;;
+ mov ar.lc=r3 // restore ar.lc
+ br.ret.sptk.many rp
+END(clflush_cache_range)


2008-10-02 15:51:23

by Bjorn Helgaas

[permalink] [raw]
Subject: Re: [PATCH 2/2]Add Variable Page Size and IA64 Support in Intel IOMMU: IA64 Specific Part

On Wednesday 01 October 2008 10:57:50 am Fenghua Yu wrote:
> --- a/arch/ia64/include/asm/cacheflush.h
> +++ b/arch/ia64/include/asm/cacheflush.h
> @@ -34,6 +34,8 @@ do { \
> #define flush_dcache_mmap_unlock(mapping) do { } while (0)
>
> extern void flush_icache_range (unsigned long start, unsigned long end);
> +extern void clflush_cache_range(void *addr, int size);

This patch adds clflush_cache_range(), but it's not used anywhere.

If you do need it, it'd be nice if the arguments were the same types
as for flush_icache_range(), and if there were a comment describing
why it is necessary for VT-d. And maybe the name could be more like
the other cache flushing functions.

> +#ifdef CONFIG_PCI
> +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
> +
> +static __devinit void via_no_dac(struct pci_dev *dev)
> +{
> + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
> + printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");

Please use dev_info() here. I see you just copied this from x86, but
we should fix x86, too. Or better, since this doesn't appear to be
arch-specific, maybe this should be moved to drivers/pci/quirks.c
alongside all the other VIA quirks.

> + forbid_dac = 1;

Shouldn't forbid_dac be a per-device or at least a per-bridge
property rather than a global?

> + }
> +}
> +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
> +#endif
> +/* Must execute after PCI subsystem */
> +fs_initcall(pci_iommu_init);
> +
> +struct dma_mapping_ops *dma_ops;
> +EXPORT_SYMBOL(dma_ops);
> +
> +int iommu_dma_supported(struct device *dev, u64 mask)
> +{
> + struct dma_mapping_ops *ops = get_dma_ops(dev);
> +
> +#ifdef CONFIG_PCI
> + if (mask > 0xffffffff && forbid_dac > 0) {
> + dev_info(dev, "PCI: Disallowing DAC for device\n");

The "PCI: " should be removed since dev_info() will add the driver
name and device ID.

Bjorn

2008-10-02 17:46:23

by Fenghua Yu

[permalink] [raw]
Subject: RE: [PATCH 2/2]Add Variable Page Size and IA64 Support in Intel IOMMU: IA64 Specific Part

>> --- a/arch/ia64/include/asm/cacheflush.h
>> +++ b/arch/ia64/include/asm/cacheflush.h
>> @@ -34,6 +34,8 @@ do { \
>> #define flush_dcache_mmap_unlock(mapping) do { } while (0)
>>
>> extern void flush_icache_range (unsigned long start, unsigned long end);
>> +extern void clflush_cache_range(void *addr, int size);
>
>This patch adds clflush_cache_range(), but it's not used anywhere.
Clflush_cache_range() is used in __iommu_flush_cache() in include/linux/intel-iommu.h.

>If you do need it, it'd be nice if the arguments were the same types
>as for flush_icache_range(), and if there were a comment describing
>why it is necessary for VT-d. And maybe the name could be more like
>the other cache flushing functions.
Since clflush_cache_range(start, size) is defined in x86, I just want to keep the same definition. So this patch set won't change __iommu_flush_cache(). Otherwise, the patch set will have #ifdef CONFIG_IA64 in __iommu_flush_cache() which is not desired.

Does this make sense?

>> +#ifdef CONFIG_PCI
>> +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
>> +
>> +static __devinit void via_no_dac(struct pci_dev *dev)
>> +{
>> + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
>> + printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");

>Please use dev_info() here. I see you just copied this from x86, but
>we should fix x86, too. Or better, since this doesn't appear to be
>arch-specific, maybe this should be moved to drivers/pci/quirks.c
>alongside all the other VIA quirks.

I'll move this to drivers/pci/quirks.c and use dev_info().

>> + forbid_dac = 1;
>
>Shouldn't forbid_dac be a per-device or at least a per-bridge
>property rather than a global?
This patch set is mainly for IA64 porting purpose. I will use the same forbid dac code as x86-64 and will send a follow-up patch to deal with forbid dac.

>> + }
>> +}
>> +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
>> +#endif
>> +/* Must execute after PCI subsystem */
>> +fs_initcall(pci_iommu_init);
>> +
>> +struct dma_mapping_ops *dma_ops;
>> +EXPORT_SYMBOL(dma_ops);
>> +
>> +int iommu_dma_supported(struct device *dev, u64 mask)
>> +{
>> + struct dma_mapping_ops *ops = get_dma_ops(dev);
>> +
>> +#ifdef CONFIG_PCI
>> + if (mask > 0xffffffff && forbid_dac > 0) {
>> + dev_info(dev, "PCI: Disallowing DAC for device\n");
>
>The "PCI: " should be removed since dev_info() will add the driver
>name and device ID.
Will change this.

Bjorn

2008-10-03 15:41:57

by Bjorn Helgaas

[permalink] [raw]
Subject: Re: [PATCH 2/2]Add Variable Page Size and IA64 Support in Intel IOMMU: IA64 Specific Part

On Thursday 02 October 2008 11:46:04 am Yu, Fenghua wrote:
> >> --- a/arch/ia64/include/asm/cacheflush.h
> >> +++ b/arch/ia64/include/asm/cacheflush.h
> >> @@ -34,6 +34,8 @@ do { \
> >> #define flush_dcache_mmap_unlock(mapping) do { } while (0)
> >>
> >> extern void flush_icache_range (unsigned long start, unsigned long end);
> >> +extern void clflush_cache_range(void *addr, int size);
> >
> >This patch adds clflush_cache_range(), but it's not used anywhere.
> Clflush_cache_range() is used in __iommu_flush_cache() in include/linux/intel-iommu.h.

Oh, OK. I didn't look hard enough to find __iommu_flush_cache()
(currently in drivers/pci/intel-iommu.c).

Architecturally, I'm surprised that ia64 would need to actually do a
cache flush. I would think the VT-d hardware would do coherent accesses
which would make the cache flush unnecessary.

Bjorn

2008-10-04 00:53:23

by Fenghua Yu

[permalink] [raw]
Subject: RE: [PATCH 2/2]Add Variable Page Size and IA64 Support in Intel IOMMU: IA64 Specific Part

>> >This patch adds clflush_cache_range(), but it's not used anywhere.
>> Clflush_cache_range() is used in __iommu_flush_cache() in include/linux/intel-iommu.h.

>Oh, OK. I didn't look hard enough to find __iommu_flush_cache()
> (currently in drivers/pci/intel-iommu.c).

>Architecturally, I'm surprised that ia64 would need to actually do a
>cache flush. I would think the VT-d hardware would do coherent accesses
>which would make the cache flush unnecessary.

VT-d hardware supports both non cache coherency and cache coherency by bit Coherency in Extended Capabilities Register.

Could you please point me to the doc that explicitly says that architecturally ia64 doesn't need cache flush?

Thanks.

-Fenghua

2008-10-04 06:10:33

by David Woodhouse

[permalink] [raw]
Subject: RE: [PATCH 2/2]Add Variable Page Size and IA64 Support in Intel IOMMU: IA64 Specific Part

On Fri, 2008-10-03 at 17:53 -0700, Yu, Fenghua wrote:
> >Architecturally, I'm surprised that ia64 would need to actually do a
> >cache flush. I would think the VT-d hardware would do coherent
> accesses which would make the cache flush unnecessary.
>
> VT-d hardware supports both non cache coherency and cache coherency by
> bit Coherency in Extended Capabilities Register.

But is the version without the cache coherency actually going to be
_seen_ on IA64?

> Could you please point me to the doc that explicitly says that
> architecturally ia64 doesn't need cache flush?

For safety, we can always make the driver just refuse to initialise on
IA64 if the cache coherency bit isn't set.

--
David Woodhouse Open Source Technology Centre
[email protected] Intel Corporation

2008-10-04 14:18:33

by Fenghua Yu

[permalink] [raw]
Subject: RE: [PATCH 2/2]Add Variable Page Size and IA64 Support in Intel IOMMU: IA64 Specific Part

>> VT-d hardware supports both non cache coherency and cache coherency by
>> bit Coherency in Extended Capabilities Register.
>
>But is the version without the cache coherency actually going to be
>_seen_ on IA64?
>
Currently there is only one IA64 platform CraterLake supporting VT-d. Its BIOS sets cache coherency bit. But since there is no architecture spec saying cache coherency is required for all ia64 platforms, non cache coherency could be seen in future platforms having VT-d. And one VT-d architect explicitly tell me that ia64 Linux IOMMU needs to deal with non cache coherency case and flush cache line for non cache coherency case (just like x86-64 is doing).

>> Could you please point me to the doc that explicitly says that
>> architecturally ia64 doesn't need cache flush?
>
>For safety, we can always make the driver just refuse to initialise on
>IA64 if the cache coherency bit isn't set.

The current patch set works just fine for both cache coherency and non cache coherency. We don't need to abandon non cache coherency support on ia64 unless there is explicit spec claiming that non cache coherency is a requirement on all ia64 platforms.

Thanks.

-Fenghua

2008-10-06 14:55:58

by Bjorn Helgaas

[permalink] [raw]
Subject: Re: [PATCH 2/2]Add Variable Page Size and IA64 Support in Intel IOMMU: IA64 Specific Part

On Friday 03 October 2008 06:53:04 pm Yu, Fenghua wrote:
> >> >This patch adds clflush_cache_range(), but it's not used anywhere.
> >> Clflush_cache_range() is used in __iommu_flush_cache() in include/linux/intel-iommu.h.
>
> >Oh, OK. I didn't look hard enough to find __iommu_flush_cache()
> > (currently in drivers/pci/intel-iommu.c).
>
> >Architecturally, I'm surprised that ia64 would need to actually do a
> >cache flush. I would think the VT-d hardware would do coherent accesses
> >which would make the cache flush unnecessary.
>
> VT-d hardware supports both non cache coherency and cache coherency by bit Coherency in Extended Capabilities Register.
>
> Could you please point me to the doc that explicitly says that architecturally ia64 doesn't need cache flush?

I don't know the details of VT-d, so I'm just asking the question. I
do know the HP IOMMU does not require flushing because it participates
in the coherency domain, so I was just surprised to see this in Intel
chipset support.

The following sections in volume 2 of the SDM mention DMA:

Part 1, Sec 4.4.3, Cacheability and Coherency Attribute:

The processor must ensure that transactions from other I/O agents
(such as DMA) are physically coherent with the instruction and data
cache.

Part 2, Sec 2.5.4, DMA:

Unlike Programmed I/O, which requires intervention from the CPU
to move data from the device to main memory, data movement in DMA
occurs without help from the CPU. A processor based on the Itanium
architecture expects the platform to maintain coherency for DMA
traffic. That is, the platform issues snoop cycles on the bus to
invalidate cacheable pages that a DMA access modifies. These snoop
cycles invalidate the appropriate lines in both instruction and
data caches and thus maintain coherency. This behavior allows an
operating system to page code pages without taking explicit actions
to ensure coherency.

Software must maintain coherency for DMA traffic through explicit
action if the platform does not maintain coherency for this traffic.
Software can provide coherency by using the flush cache instruction,
fc, to invalidate the instruction and data cache lines that a DMA
transfer modifies.

It sounds like the expectation is that DMA will be fully coherent
and no flushes would be required, but there is wiggle room in that
last paragraph for platforms that don't maintain coherency.

Bjorn

2008-10-07 00:02:52

by Fenghua Yu

[permalink] [raw]
Subject: [PATCH V2 2/2] Add Variable Page Size and IA64 Support in Intel IOMMU: IA64 Specific Part


The patch contains Intel IOMMU IA64 specific code. It defines new machvec dig_vtd, hooks for IOMMU, DMAR table detection, cache line flush function, etc.

For a generic kernel with CONFIG_DMAR=y, if Intel IOMMU is detected, dig_vtd is used for machinve vector. Otherwise, kernel falls back to dig machine vector. Kernel parameter "machvec=dig" or "intel_iommu=off" can be used to force kernel to boot dig machine vector.

Signed-off-by: Fenghua Yu <[email protected]>
Signed-off-by: Tony Luck <[email protected]>

---

arch/ia64/Kconfig | 17 ++++
arch/ia64/Makefile | 1
arch/ia64/configs/generic_defconfig | 2
arch/ia64/configs/tiger_defconfig | 2
arch/ia64/dig/Makefile | 5 +
arch/ia64/dig/dig_vtd_iommu.c | 59 ++++++++++++++
arch/ia64/dig/machvec_vtd.c | 3
arch/ia64/include/asm/cacheflush.h | 2
arch/ia64/include/asm/device.h | 3
arch/ia64/include/asm/dma-mapping.h | 50 ++++++++++++
arch/ia64/include/asm/iommu.h | 16 +++
arch/ia64/include/asm/machvec.h | 2
arch/ia64/include/asm/machvec_dig_vtd.h | 38 +++++++++
arch/ia64/include/asm/machvec_init.h | 1
arch/ia64/include/asm/pci.h | 3
arch/ia64/include/asm/swiotlb.h | 56 +++++++++++++
arch/ia64/kernel/Makefile | 4
arch/ia64/kernel/acpi.c | 17 ++++
arch/ia64/kernel/msi_ia64.c | 80 +++++++++++++++++++
arch/ia64/kernel/pci-dma.c | 129 ++++++++++++++++++++++++++++++++
arch/ia64/kernel/pci-swiotlb.c | 46 +++++++++++
arch/ia64/kernel/setup.c | 42 +++++++---
arch/ia64/lib/flush.S | 55 +++++++++++++
23 files changed, 620 insertions(+), 13 deletions(-)


diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 48e496f..ae965aa 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -125,6 +125,7 @@ config IA64_GENERIC
select NUMA
select ACPI_NUMA
select SWIOTLB
+ select PCI_MSI
help
This selects the system type of your hardware. A "generic" kernel
will run on any supported IA-64 system. However, if you configure
@@ -132,6 +133,7 @@ config IA64_GENERIC

generic For any supported IA-64 system
DIG-compliant For DIG ("Developer's Interface Guide") compliant systems
+ DIG+Intel+IOMMU For DIG systems with Intel IOMMU
HP-zx1/sx1000 For HP systems
HP-zx1/sx1000+swiotlb For HP systems with (broken) DMA-constrained devices.
SGI-SN2 For SGI Altix systems
@@ -144,6 +146,11 @@ config IA64_DIG
bool "DIG-compliant"
select SWIOTLB

+config IA64_DIG_VTD
+ bool "DIG+Intel+IOMMU"
+ select DMAR
+ select PCI_MSI
+
config IA64_HP_ZX1
bool "HP-zx1/sx1000"
help
@@ -589,6 +596,16 @@ source "drivers/pci/hotplug/Kconfig"

source "drivers/pcmcia/Kconfig"

+config DMAR
+ bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
+ depends on IA64_GENERIC && ACPI && EXPERIMENTAL
+ help
+ DMA remapping (DMAR) devices support enables independent address
+ translations for Direct Memory Access (DMA) from devices.
+ These DMA remapping devices are reported via ACPI tables
+ and include PCI device scope covered by these DMA
+ remapping devices.
+
endmenu

endif
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile
index 905d25b..04cecee 100644
--- a/arch/ia64/Makefile
+++ b/arch/ia64/Makefile
@@ -53,6 +53,7 @@ libs-y += arch/ia64/lib/
core-y += arch/ia64/kernel/ arch/ia64/mm/
core-$(CONFIG_IA32_SUPPORT) += arch/ia64/ia32/
core-$(CONFIG_IA64_DIG) += arch/ia64/dig/
+core-$(CONFIG_IA64_DIG_VTD) += arch/ia64/dig/
core-$(CONFIG_IA64_GENERIC) += arch/ia64/dig/
core-$(CONFIG_IA64_HP_ZX1) += arch/ia64/dig/
core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/
diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig
index 9f48397..e05f9e1 100644
--- a/arch/ia64/configs/generic_defconfig
+++ b/arch/ia64/configs/generic_defconfig
@@ -233,6 +233,8 @@ CONFIG_DMIID=y
CONFIG_BINFMT_ELF=y
CONFIG_BINFMT_MISC=m

+# CONFIG_DMAR is not set
+
#
# Power management and ACPI
#
diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig
index 797acf9..c522edf 100644
--- a/arch/ia64/configs/tiger_defconfig
+++ b/arch/ia64/configs/tiger_defconfig
@@ -172,6 +172,8 @@ CONFIG_DMIID=y
CONFIG_BINFMT_ELF=y
CONFIG_BINFMT_MISC=m

+# CONFIG_DMAR is not set
+
#
# Power management and ACPI
#
diff --git a/arch/ia64/dig/Makefile b/arch/ia64/dig/Makefile
index 971cd78..62e001c 100644
--- a/arch/ia64/dig/Makefile
+++ b/arch/ia64/dig/Makefile
@@ -6,4 +6,9 @@
#

obj-y := setup.o
+ifeq ($(CONFIG_DMAR), y)
+obj-$(CONFIG_IA64_GENERIC) += machvec.o machvec_vtd.o dig_vtd_iommu.o
+else
obj-$(CONFIG_IA64_GENERIC) += machvec.o
+endif
+obj-$(CONFIG_IA64_DIG_VTD) += dig_vtd_iommu.o
new file mode 100644
index 0000000..289d7e8
--- /dev/null
+++ b/arch/ia64/dig/dig_vtd_iommu.c
@@ -0,0 +1,59 @@
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/intel-iommu.h>
+
+void *
+vtd_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t flags)
+{
+ return intel_alloc_coherent(dev, size, dma_handle, flags);
+}
+EXPORT_SYMBOL_GPL(vtd_alloc_coherent);
+
+void
+vtd_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_handle)
+{
+ intel_free_coherent(dev, size, vaddr, dma_handle);
+}
+EXPORT_SYMBOL_GPL(vtd_free_coherent);
+
+dma_addr_t
+vtd_map_single_attrs(struct device *dev, void *addr, size_t size,
+ int dir, struct dma_attrs *attrs)
+{
+ return intel_map_single(dev, (unsigned long)addr, size, dir);
+}
+EXPORT_SYMBOL_GPL(vtd_map_single_attrs);
+
+void
+vtd_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t size,
+ int dir, struct dma_attrs *attrs)
+{
+ intel_unmap_single(dev, iova, size, dir);
+}
+EXPORT_SYMBOL_GPL(vtd_unmap_single_attrs);
+
+int
+vtd_map_sg_attrs(struct device *dev, struct scatterlist *sglist, int nents,
+ int dir, struct dma_attrs *attrs)
+{
+ return intel_map_sg(dev, sglist, nents, dir);
+}
+EXPORT_SYMBOL_GPL(vtd_map_sg_attrs);
+
+void
+vtd_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
+ int nents, int dir, struct dma_attrs *attrs)
+{
+ intel_unmap_sg(dev, sglist, nents, dir);
+}
+EXPORT_SYMBOL_GPL(vtd_unmap_sg_attrs);
+
+int
+vtd_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vtd_dma_mapping_error);
diff --git a/arch/ia64/dig/machvec_vtd.c b/arch/ia64/dig/machvec_vtd.c
new file mode 100644
index 0000000..7cd3eb4
--- /dev/null
+++ b/arch/ia64/dig/machvec_vtd.c
@@ -0,0 +1,3 @@
+#define MACHVEC_PLATFORM_NAME dig_vtd
+#define MACHVEC_PLATFORM_HEADER <asm/machvec_dig_vtd.h>
+#include <asm/machvec_init.h>
diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h
index afcfbda..c8ce271 100644
--- a/arch/ia64/include/asm/cacheflush.h
+++ b/arch/ia64/include/asm/cacheflush.h
@@ -34,6 +34,8 @@ do { \
#define flush_dcache_mmap_unlock(mapping) do { } while (0)

extern void flush_icache_range (unsigned long start, unsigned long end);
+extern void clflush_cache_range(void *addr, int size);
+

#define flush_icache_user_range(vma, page, user_addr, len) \
do { \
diff --git a/arch/ia64/include/asm/device.h b/arch/ia64/include/asm/device.h
index 3db6daf..41ab85d 100644
--- a/arch/ia64/include/asm/device.h
+++ b/arch/ia64/include/asm/device.h
@@ -10,6 +10,9 @@ struct dev_archdata {
#ifdef CONFIG_ACPI
void *acpi_handle;
#endif
+#ifdef CONFIG_DMAR
+ void *iommu; /* hook for IOMMU specific extension */
+#endif
};

#endif /* _ASM_IA64_DEVICE_H */
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index 06ff1ba..bbab7e2 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -7,6 +7,49 @@
*/
#include <asm/machvec.h>
#include <linux/scatterlist.h>
+#include <asm/swiotlb.h>
+
+struct dma_mapping_ops {
+ int (*mapping_error)(struct device *dev,
+ dma_addr_t dma_addr);
+ void* (*alloc_coherent)(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp);
+ void (*free_coherent)(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_handle);
+ dma_addr_t (*map_single)(struct device *hwdev, unsigned long ptr,
+ size_t size, int direction);
+ void (*unmap_single)(struct device *dev, dma_addr_t addr,
+ size_t size, int direction);
+ void (*sync_single_for_cpu)(struct device *hwdev,
+ dma_addr_t dma_handle, size_t size,
+ int direction);
+ void (*sync_single_for_device)(struct device *hwdev,
+ dma_addr_t dma_handle, size_t size,
+ int direction);
+ void (*sync_single_range_for_cpu)(struct device *hwdev,
+ dma_addr_t dma_handle, unsigned long offset,
+ size_t size, int direction);
+ void (*sync_single_range_for_device)(struct device *hwdev,
+ dma_addr_t dma_handle, unsigned long offset,
+ size_t size, int direction);
+ void (*sync_sg_for_cpu)(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int direction);
+ void (*sync_sg_for_device)(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int direction);
+ int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction);
+ void (*unmap_sg)(struct device *hwdev,
+ struct scatterlist *sg, int nents,
+ int direction);
+ int (*dma_supported_op)(struct device *hwdev, u64 mask);
+ int is_phys;
+};
+
+extern struct dma_mapping_ops *dma_ops;
+extern struct ia64_machine_vector ia64_mv;
+extern void set_iommu_machvec(void);

#define dma_alloc_coherent(dev, size, handle, gfp) \
platform_dma_alloc_coherent(dev, size, handle, (gfp) | GFP_DMA)
@@ -96,4 +139,11 @@ dma_cache_sync (struct device *dev, void *vaddr, size_t size,

#define dma_is_consistent(d, h) (1) /* all we do is coherent memory... */

+static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
+{
+ return dma_ops;
+}
+
+
+
#endif /* _ASM_IA64_DMA_MAPPING_H */
diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h
new file mode 100644
index 0000000..5fb2bb9
--- /dev/null
+++ b/arch/ia64/include/asm/iommu.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_IA64_IOMMU_H
+#define _ASM_IA64_IOMMU_H 1
+
+#define cpu_has_x2apic 0
+/* 10 seconds */
+#define DMAR_OPERATION_TIMEOUT (((cycles_t) local_cpu_data->itc_freq)*10)
+
+extern void pci_iommu_shutdown(void);
+extern void no_iommu_init(void);
+extern int force_iommu, no_iommu;
+extern int iommu_detected;
+extern void iommu_dma_init(void);
+extern void machvec_init(const char *name);
+extern int forbid_dac;
+
+#endif
diff --git a/arch/ia64/include/asm/machvec.h b/arch/ia64/include/asm/machvec.h
index 2b850cc..ce9b1d0 100644
--- a/arch/ia64/include/asm/machvec.h
+++ b/arch/ia64/include/asm/machvec.h
@@ -120,6 +120,8 @@ extern void machvec_tlb_migrate_finish (struct mm_struct *);
# include <asm/machvec_hpsim.h>
# elif defined (CONFIG_IA64_DIG)
# include <asm/machvec_dig.h>
+# elif defined(CONFIG_IA64_DIG_VTD)
+# include <asm/machvec_dig_vtd.h>
# elif defined (CONFIG_IA64_HP_ZX1)
# include <asm/machvec_hpzx1.h>
# elif defined (CONFIG_IA64_HP_ZX1_SWIOTLB)
diff --git a/arch/ia64/include/asm/machvec_dig_vtd.h b/arch/ia64/include/asm/machvec_dig_vtd.h
new file mode 100644
index 0000000..3400b56
--- /dev/null
+++ b/arch/ia64/include/asm/machvec_dig_vtd.h
@@ -0,0 +1,38 @@
+#ifndef _ASM_IA64_MACHVEC_DIG_VTD_h
+#define _ASM_IA64_MACHVEC_DIG_VTD_h
+
+extern ia64_mv_setup_t dig_setup;
+extern ia64_mv_dma_alloc_coherent vtd_alloc_coherent;
+extern ia64_mv_dma_free_coherent vtd_free_coherent;
+extern ia64_mv_dma_map_single_attrs vtd_map_single_attrs;
+extern ia64_mv_dma_unmap_single_attrs vtd_unmap_single_attrs;
+extern ia64_mv_dma_map_sg_attrs vtd_map_sg_attrs;
+extern ia64_mv_dma_unmap_sg_attrs vtd_unmap_sg_attrs;
+extern ia64_mv_dma_supported iommu_dma_supported;
+extern ia64_mv_dma_mapping_error vtd_dma_mapping_error;
+extern ia64_mv_dma_init pci_iommu_alloc;
+
+/*
+ * This stuff has dual use!
+ *
+ * For a generic kernel, the macros are used to initialize the
+ * platform's machvec structure. When compiling a non-generic kernel,
+ * the macros are used directly.
+ */
+#define platform_name "dig_vtd"
+#define platform_setup dig_setup
+#define platform_dma_init pci_iommu_alloc
+#define platform_dma_alloc_coherent vtd_alloc_coherent
+#define platform_dma_free_coherent vtd_free_coherent
+#define platform_dma_map_single_attrs vtd_map_single_attrs
+#define platform_dma_unmap_single_attrs vtd_unmap_single_attrs
+#define platform_dma_map_sg_attrs vtd_map_sg_attrs
+#define platform_dma_unmap_sg_attrs vtd_unmap_sg_attrs
+#define platform_dma_sync_single_for_cpu machvec_dma_sync_single
+#define platform_dma_sync_sg_for_cpu machvec_dma_sync_sg
+#define platform_dma_sync_single_for_device machvec_dma_sync_single
+#define platform_dma_sync_sg_for_device machvec_dma_sync_sg
+#define platform_dma_supported iommu_dma_supported
+#define platform_dma_mapping_error vtd_dma_mapping_error
+
+#endif /* _ASM_IA64_MACHVEC_DIG_VTD_h */
diff --git a/arch/ia64/include/asm/machvec_init.h b/arch/ia64/include/asm/machvec_init.h
index 7f21249..ef964b2 100644
--- a/arch/ia64/include/asm/machvec_init.h
+++ b/arch/ia64/include/asm/machvec_init.h
@@ -1,3 +1,4 @@
+#include <asm/iommu.h>
#include <asm/machvec.h>

extern ia64_mv_send_ipi_t ia64_send_ipi;
diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h
index 0149097..645ef06 100644
--- a/arch/ia64/include/asm/pci.h
+++ b/arch/ia64/include/asm/pci.h
@@ -164,4 +164,7 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
return channel ? isa_irq_to_vector(15) : isa_irq_to_vector(14);
}

+#ifdef CONFIG_DMAR
+extern void pci_iommu_alloc(void);
+#endif
#endif /* _ASM_IA64_PCI_H */
diff --git a/arch/ia64/include/asm/swiotlb.h b/arch/ia64/include/asm/swiotlb.h
new file mode 100644
index 0000000..fb79423
--- /dev/null
+++ b/arch/ia64/include/asm/swiotlb.h
@@ -0,0 +1,56 @@
+#ifndef ASM_IA64__SWIOTLB_H
+#define ASM_IA64__SWIOTLB_H
+
+#include <linux/dma-mapping.h>
+
+/* SWIOTLB interface */
+
+extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr,
+ size_t size, int dir);
+extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags);
+extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, int dir);
+extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
+ dma_addr_t dev_addr,
+ size_t size, int dir);
+extern void swiotlb_sync_single_for_device(struct device *hwdev,
+ dma_addr_t dev_addr,
+ size_t size, int dir);
+extern void swiotlb_sync_single_range_for_cpu(struct device *hwdev,
+ dma_addr_t dev_addr,
+ unsigned long offset,
+ size_t size, int dir);
+extern void swiotlb_sync_single_range_for_device(struct device *hwdev,
+ dma_addr_t dev_addr,
+ unsigned long offset,
+ size_t size, int dir);
+extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int dir);
+extern void swiotlb_sync_sg_for_device(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int dir);
+extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction);
+extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction);
+extern int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
+extern void swiotlb_free_coherent(struct device *hwdev, size_t size,
+ void *vaddr, dma_addr_t dma_handle);
+extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
+extern void swiotlb_init(void);
+
+extern int swiotlb_force;
+
+#ifdef CONFIG_SWIOTLB
+extern int swiotlb;
+extern void pci_swiotlb_init(void);
+#else
+#define swiotlb 0
+static inline void pci_swiotlb_init(void)
+{
+}
+#endif
+
+#endif /* ASM_IA64__SWIOTLB_H */
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index 87fea11..af0e750 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -42,6 +42,10 @@ obj-$(CONFIG_IA64_ESI) += esi.o
ifneq ($(CONFIG_IA64_ESI),)
obj-y += esi_stub.o # must be in kernel proper
endif
+obj-$(CONFIG_DMAR) += pci-dma.o
+ifeq ($(CONFIG_DMAR), y)
+obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
+endif

# The gate DSO image is built using a special linker script.
targets += gate.so gate-syms.o
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 5d1eb7e..8cc2f8a 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -91,6 +91,9 @@ acpi_get_sysname(void)
struct acpi_table_rsdp *rsdp;
struct acpi_table_xsdt *xsdt;
struct acpi_table_header *hdr;
+#ifdef CONFIG_DMAR
+ u64 i, nentries;
+#endif

rsdp_phys = acpi_find_rsdp();
if (!rsdp_phys) {
@@ -123,6 +126,18 @@ acpi_get_sysname(void)
return "sn2";
}

+#ifdef CONFIG_DMAR
+ /* Look for Intel IOMMU */
+ nentries = (hdr->length - sizeof(*hdr)) /
+ sizeof(xsdt->table_offset_entry[0]);
+ for (i = 0; i < nentries; i++) {
+ hdr = __va(xsdt->table_offset_entry[i]);
+ if (strncmp(hdr->signature, ACPI_SIG_DMAR,
+ sizeof(ACPI_SIG_DMAR) - 1) == 0)
+ return "dig_vtd";
+ }
+#endif
+
return "dig";
#else
# if defined (CONFIG_IA64_HP_SIM)
@@ -137,6 +152,8 @@ acpi_get_sysname(void)
return "uv";
# elif defined (CONFIG_IA64_DIG)
return "dig";
+# elif defined(CONFIG_IA64_DIG_VTD)
+ return "dig_vtd";
# else
# error Unknown platform. Fix acpi.c.
# endif
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 60c6ef6..702a09c 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -5,6 +5,7 @@
#include <linux/pci.h>
#include <linux/irq.h>
#include <linux/msi.h>
+#include <linux/dmar.h>
#include <asm/smp.h>

/*
@@ -162,3 +163,82 @@ void arch_teardown_msi_irq(unsigned int irq)

return ia64_teardown_msi_irq(irq);
}
+
+#ifdef CONFIG_DMAR
+#ifdef CONFIG_SMP
+static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+{
+ struct irq_cfg *cfg = irq_cfg + irq;
+ struct msi_msg msg;
+ int cpu = first_cpu(mask);
+
+
+ if (!cpu_online(cpu))
+ return;
+
+ if (irq_prepare_move(irq, cpu))
+ return;
+
+ dmar_msi_read(irq, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DESTID_MASK;
+ msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
+
+ dmar_msi_write(irq, &msg);
+ irq_desc[irq].affinity = mask;
+}
+#endif /* CONFIG_SMP */
+
+struct irq_chip dmar_msi_type = {
+ .name = "DMAR_MSI",
+ .unmask = dmar_msi_unmask,
+ .mask = dmar_msi_mask,
+ .ack = ia64_ack_msi_irq,
+#ifdef CONFIG_SMP
+ .set_affinity = dmar_msi_set_affinity,
+#endif
+ .retrigger = ia64_msi_retrigger_irq,
+};
+
+static int
+msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+{
+ struct irq_cfg *cfg = irq_cfg + irq;
+ unsigned dest;
+ cpumask_t mask;
+
+ cpus_and(mask, irq_to_domain(irq), cpu_online_map);
+ dest = cpu_physical_id(first_cpu(mask));
+
+ msg->address_hi = 0;
+ msg->address_lo =
+ MSI_ADDR_HEADER |
+ MSI_ADDR_DESTMODE_PHYS |
+ MSI_ADDR_REDIRECTION_CPU |
+ MSI_ADDR_DESTID_CPU(dest);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ MSI_DATA_DELIVERY_FIXED |
+ MSI_DATA_VECTOR(cfg->vector);
+ return 0;
+}
+
+int arch_setup_dmar_msi(unsigned int irq)
+{
+ int ret;
+ struct msi_msg msg;
+
+ ret = msi_compose_msg(NULL, irq, &msg);
+ if (ret < 0)
+ return ret;
+ dmar_msi_write(irq, &msg);
+ set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
+ "edge");
+ return 0;
+}
+#endif /* CONFIG_DMAR */
+
diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c
new file mode 100644
index 0000000..10a75b5
--- /dev/null
+++ b/arch/ia64/kernel/pci-dma.c
@@ -0,0 +1,129 @@
+/*
+ * Dynamic DMA mapping support.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/dmar.h>
+#include <asm/iommu.h>
+#include <asm/machvec.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/machvec.h>
+#include <asm/system.h>
+
+#ifdef CONFIG_DMAR
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#include <asm/page.h>
+#include <asm/iommu.h>
+
+dma_addr_t bad_dma_address __read_mostly;
+EXPORT_SYMBOL(bad_dma_address);
+
+static int iommu_sac_force __read_mostly;
+
+int no_iommu __read_mostly;
+#ifdef CONFIG_IOMMU_DEBUG
+int force_iommu __read_mostly = 1;
+#else
+int force_iommu __read_mostly;
+#endif
+
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly;
+
+/* Dummy device used for NULL arguments (normally ISA). Better would
+ be probably a smaller DMA mask, but this is bug-to-bug compatible
+ to i386. */
+struct device fallback_dev = {
+ .bus_id = "fallback device",
+ .coherent_dma_mask = DMA_32BIT_MASK,
+ .dma_mask = &fallback_dev.coherent_dma_mask,
+};
+
+void __init pci_iommu_alloc(void)
+{
+ /*
+ * The order of these functions is important for
+ * fall-back/fail-over reasons
+ */
+ detect_intel_iommu();
+
+#ifdef CONFIG_SWIOTLB
+ pci_swiotlb_init();
+#endif
+}
+
+static int __init pci_iommu_init(void)
+{
+ if (iommu_detected)
+ intel_iommu_init();
+
+ return 0;
+}
+
+/* Must execute after PCI subsystem */
+fs_initcall(pci_iommu_init);
+
+void pci_iommu_shutdown(void)
+{
+ return;
+}
+
+void __init
+iommu_dma_init(void)
+{
+ return;
+}
+
+struct dma_mapping_ops *dma_ops;
+EXPORT_SYMBOL(dma_ops);
+
+int iommu_dma_supported(struct device *dev, u64 mask)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
+#ifdef CONFIG_PCI
+ if (mask > 0xffffffff && forbid_dac > 0) {
+ dev_info(dev, "Disallowing DAC for device\n");
+ return 0;
+ }
+#endif
+
+ if (ops->dma_supported_op)
+ return ops->dma_supported_op(dev, mask);
+
+ /* Copied from i386. Doesn't make much sense, because it will
+ only work for pci_alloc_coherent.
+ The caller just has to use GFP_DMA in this case. */
+ if (mask < DMA_24BIT_MASK)
+ return 0;
+
+ /* Tell the device to use SAC when IOMMU force is on. This
+ allows the driver to use cheaper accesses in some cases.
+
+ Problem with this is that if we overflow the IOMMU area and
+ return DAC as fallback address the device may not handle it
+ correctly.
+
+ As a special case some controllers have a 39bit address
+ mode that is as efficient as 32bit (aic79xx). Don't force
+ SAC for these. Assume all masks <= 40 bits are of this
+ type. Normally this doesn't make any difference, but gives
+ more gentle handling of IOMMU overflow. */
+ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
+ dev_info(dev, "Force SAC with mask %lx\n", mask);
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(iommu_dma_supported);
+
+#endif
diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
new file mode 100644
index 0000000..16c5051
--- /dev/null
+++ b/arch/ia64/kernel/pci-swiotlb.c
@@ -0,0 +1,46 @@
+/* Glue code to lib/swiotlb.c */
+
+#include <linux/pci.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+#include <asm/iommu.h>
+#include <asm/machvec.h>
+
+int swiotlb __read_mostly;
+EXPORT_SYMBOL(swiotlb);
+
+struct dma_mapping_ops swiotlb_dma_ops = {
+ .mapping_error = swiotlb_dma_mapping_error,
+ .alloc_coherent = swiotlb_alloc_coherent,
+ .free_coherent = swiotlb_free_coherent,
+ .map_single = swiotlb_map_single,
+ .unmap_single = swiotlb_unmap_single,
+ .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+ .sync_single_for_device = swiotlb_sync_single_for_device,
+ .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
+ .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
+ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+ .sync_sg_for_device = swiotlb_sync_sg_for_device,
+ .map_sg = swiotlb_map_sg,
+ .unmap_sg = swiotlb_unmap_sg,
+ .dma_supported_op = swiotlb_dma_supported,
+};
+
+void __init pci_swiotlb_init(void)
+{
+ if (!iommu_detected) {
+#ifdef CONFIG_IA64_GENERIC
+ swiotlb = 1;
+ printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
+ machvec_init("dig");
+ swiotlb_init();
+ dma_ops = &swiotlb_dma_ops;
+#else
+ panic("Unable to find Intel IOMMU");
+#endif
+ }
+}
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index c27d5b2..218c550 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -116,6 +116,13 @@ unsigned int num_io_spaces;
*/
#define I_CACHE_STRIDE_SHIFT 5 /* Safest way to go: 32 bytes by 32 bytes */
unsigned long ia64_i_cache_stride_shift = ~0;
+/*
+ * "clflush_cache_range()" needs to know what processor dependent stride size to
+ * use when it flushes cache lines including both d-cache and i-cache.
+ */
+/* Safest way to go: 32 bytes by 32 bytes */
+#define CACHE_STRIDE_SHIFT 5
+unsigned long ia64_cache_stride_shift = ~0;

/*
* The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1). This
@@ -845,13 +852,14 @@ setup_per_cpu_areas (void)
}

/*
- * Calculate the max. cache line size.
+ * Do the following calculations:
*
- * In addition, the minimum of the i-cache stride sizes is calculated for
- * "flush_icache_range()".
+ * 1. the max. cache line size.
+ * 2. the minimum of the i-cache stride sizes for "flush_icache_range()".
+ * 3. the minimum of the cache stride sizes for "clflush_cache_range()".
*/
static void __cpuinit
-get_max_cacheline_size (void)
+get_cache_info(void)
{
unsigned long line_size, max = 1;
u64 l, levels, unique_caches;
@@ -865,12 +873,14 @@ get_max_cacheline_size (void)
max = SMP_CACHE_BYTES;
/* Safest setup for "flush_icache_range()" */
ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT;
+ /* Safest setup for "clflush_cache_range()" */
+ ia64_cache_stride_shift = CACHE_STRIDE_SHIFT;
goto out;
}

for (l = 0; l < levels; ++l) {
- status = ia64_pal_cache_config_info(l, /* cache_type (data_or_unified)= */ 2,
- &cci);
+ /* cache_type (data_or_unified)=2 */
+ status = ia64_pal_cache_config_info(l, 2, &cci);
if (status != 0) {
printk(KERN_ERR
"%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n",
@@ -878,15 +888,21 @@ get_max_cacheline_size (void)
max = SMP_CACHE_BYTES;
/* The safest setup for "flush_icache_range()" */
cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
+ /* The safest setup for "clflush_cache_range()" */
+ ia64_cache_stride_shift = CACHE_STRIDE_SHIFT;
cci.pcci_unified = 1;
+ } else {
+ if (cci.pcci_stride < ia64_cache_stride_shift)
+ ia64_cache_stride_shift = cci.pcci_stride;
+
+ line_size = 1 << cci.pcci_line_size;
+ if (line_size > max)
+ max = line_size;
}
- line_size = 1 << cci.pcci_line_size;
- if (line_size > max)
- max = line_size;
+
if (!cci.pcci_unified) {
- status = ia64_pal_cache_config_info(l,
- /* cache_type (instruction)= */ 1,
- &cci);
+ /* cache_type (instruction)=1*/
+ status = ia64_pal_cache_config_info(l, 1, &cci);
if (status != 0) {
printk(KERN_ERR
"%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n",
@@ -940,7 +956,7 @@ cpu_init (void)
}
#endif

- get_max_cacheline_size();
+ get_cache_info();

/*
* We can't pass "local_cpu_data" to identify_cpu() because we haven't called
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S
index 2a0d27f..2c1a948 100644
--- a/arch/ia64/lib/flush.S
+++ b/arch/ia64/lib/flush.S
@@ -60,3 +60,58 @@ GLOBAL_ENTRY(flush_icache_range)
mov ar.lc=r3 // restore ar.lc
br.ret.sptk.many rp
END(flush_icache_range)
+
+ /*
+ * clflush_cache_range(start,size)
+ *
+ * Flush cache lines from start to start+size-1.
+ *
+ * Must deal with range from start to start+size-1 but nothing else
+ * (need to be careful not to touch addresses that may be
+ * unmapped).
+ *
+ * Note: "in0" and "in1" are preserved for debugging purposes.
+ */
+ .section .kprobes.text,"ax"
+GLOBAL_ENTRY(clflush_cache_range)
+
+ .prologue
+ alloc r2=ar.pfs,2,0,0,0
+ movl r3=ia64_cache_stride_shift
+ mov r21=1
+ add r22=in1,in0
+ ;;
+ ld8 r20=[r3] // r20: stride shift
+ sub r22=r22,r0,1 // last byte address
+ ;;
+ shr.u r23=in0,r20 // start / (stride size)
+ shr.u r22=r22,r20 // (last byte address) / (stride size)
+ shl r21=r21,r20 // r21: stride size of the i-cache(s)
+ ;;
+ sub r8=r22,r23 // number of strides - 1
+ shl r24=r23,r20 // r24: addresses for "fc" =
+ // "start" rounded down to stride
+ // boundary
+ .save ar.lc,r3
+ mov r3=ar.lc // save ar.lc
+ ;;
+
+ .body
+ mov ar.lc=r8
+ ;;
+ /*
+ * 32 byte aligned loop, even number of (actually 2) bundles
+ */
+.Loop_fc:
+ fc r24 // issuable on M0 only
+ add r24=r21,r24 // we flush "stride size" bytes per iteration
+ nop.i 0
+ br.cloop.sptk.few .Loop_fc
+ ;;
+ sync.i
+ ;;
+ srlz.i
+ ;;
+ mov ar.lc=r3 // restore ar.lc
+ br.ret.sptk.many rp
+END(clflush_cache_range)

2008-10-07 00:35:50

by Fenghua Yu

[permalink] [raw]
Subject: Re: [PATCH 2/2]Add Variable Page Size and IA64 Support in Intel IOMMU: IA64 Specific Part

> > Could you please point me to the doc that explicitly says that architecturally ia64 doesn't need cache flush?
>
> The following sections in volume 2 of the SDM mention DMA:
> Part 1, Sec 4.4.3, Cacheability and Coherency Attribute:
> Part 2, Sec 2.5.4, DMA:
>
> It sounds like the expectation is that DMA will be fully coherent
> and no flushes would be required, but there is wiggle room in that
> last paragraph for platforms that don't maintain coherency.

The cache coherency bit in VT-d is for root, context, and page tables which are for DMA management, not DMA data itself. VT-d DMA data should be cache coherent.The Intel IOMMU code doesn't need to deal with non cache coherency in DMA data traffic. But root, context, and page tables could be non cache coherent and this is handled by Intel IOMMU code.

Thanks.

-Fenghua

2008-11-24 19:53:25

by Fenghua Yu

[permalink] [raw]
Subject: [PATCH 1/2] Enable Pass Through Feature in Intel IOMMU


The patch set adds kernel parameter intel_iommu=pt to set up pass through mode in
context mapping entry. This disables DMAR in linux kernel; but KVM still runs on
VT-d. In this mode, kernel uses swiotlb for DMA API functions but other VT-d
functionalities are enabled for KVM. KVM always uses multi level translation
page table in VT-d. By default, pass though mode is disabled in kernel.

This is useful when people don't want to enable VT-d DMAR in kernel for
reasons like kernel iommu performance concern or debug purpose but still want to
use KVM.

Thanks.

-Fenghua


Signed-off-by: Fenghua Yu <[email protected]>
Signed-off-by: Weidong Han <[email protected]>
Signed-off-by: Allen Kay <[email protected]>
Signed-off-by: David Woodhouse <[email protected]>

---

Documentation/kernel-parameters.txt | 5 +++
arch/ia64/include/asm/iommu.h | 1
arch/ia64/kernel/pci-swiotlb.c | 2 -
arch/x86/include/asm/iommu.h | 1
arch/x86/kernel/pci-swiotlb_64.c | 4 ++-
drivers/pci/intel-iommu.c | 47 ++++++++++++++++++++++++++----------
include/linux/dma_remapping.h | 3 ++
include/linux/intel-iommu.h | 3 +-
8 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e0f346d..b966185 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -931,6 +931,11 @@ and is between 256 and 4096 characters. It is defined in the file
With this option on every unmap_single operation will
result in a hardware IOTLB flush operation as opposed
to batching them for performance.
+ pt [Default no Pass Through]
+ This option enables Pass Through in context mapping if
+ Pass Through is supported in hardware. With this option
+ DMAR is disabled in kernel and kernel uses swiotlb, but
+ KVM still uses VT-d hardware.

io_delay= [X86-32,X86-64] I/O delay method
0x80
diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h
index 0490794..37d41ca 100644
--- a/arch/ia64/include/asm/iommu.h
+++ b/arch/ia64/include/asm/iommu.h
@@ -9,6 +9,7 @@ extern void pci_iommu_shutdown(void);
extern void no_iommu_init(void);
extern int force_iommu, no_iommu;
extern int iommu_detected;
+extern int iommu_pass_through;
extern void iommu_dma_init(void);
extern void machvec_init(const char *name);

diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
index 16c5051..69135b0 100644
--- a/arch/ia64/kernel/pci-swiotlb.c
+++ b/arch/ia64/kernel/pci-swiotlb.c
@@ -32,7 +32,7 @@ struct dma_mapping_ops swiotlb_dma_ops = {

void __init pci_swiotlb_init(void)
{
- if (!iommu_detected) {
+ if (!iommu_detected || iommu_pass_through) {
#ifdef CONFIG_IA64_GENERIC
swiotlb = 1;
printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 0b500c5..014e94f 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -6,6 +6,7 @@ extern void no_iommu_init(void);
extern struct dma_mapping_ops nommu_dma_ops;
extern int force_iommu, no_iommu;
extern int iommu_detected;
+extern int iommu_pass_through;

extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len);

diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 3c539d1..4af2425 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -50,8 +50,10 @@ struct dma_mapping_ops swiotlb_dma_ops = {
void __init pci_swiotlb_init(void)
{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
- if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
+ if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
+ iommu_pass_through)
swiotlb = 1;
+
if (swiotlb_force)
swiotlb = 1;
if (swiotlb) {
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index aec60ad..f164a3c 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -120,7 +120,6 @@ struct context_entry {
(c).lo &= (((u64)-1) << 4) | 3; \
(c).lo |= ((val) & 3) << 2; \
} while (0)
-#define CONTEXT_TT_MULTI_LEVEL 0
#define context_set_address_root(c, val) \
do {(c).lo |= (val) & VTD_PAGE_MASK; } while (0)
#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
@@ -203,6 +202,7 @@ static long list_size;
static void domain_remove_dev_info(struct dmar_domain *domain);

int dmar_disabled;
+int iommu_pass_through;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
@@ -231,6 +231,9 @@ static int __init intel_iommu_setup(char *str)
printk(KERN_INFO
"Intel-IOMMU: disable batched IOTLB flush\n");
intel_iommu_strict = 1;
+ } else if (!strncmp(str, "pt", 2)) {
+ iommu_pass_through = 1;
+ printk(KERN_INFO "Intel-IOMMU: Pass Through enabled\n");
}

str += strcspn(str, ",");
@@ -1271,7 +1274,7 @@ static void domain_exit(struct dmar_domain *domain)
}

static int domain_context_mapping_one(struct dmar_domain *domain,
- u8 bus, u8 devfn)
+ u8 bus, u8 devfn, int translation)
{
struct context_entry *context;
struct intel_iommu *iommu = domain->iommu;
@@ -1279,7 +1282,11 @@ static int domain_context_mapping_one(struct dmar_domain *domain,

pr_debug("Set context mapping for %02x:%02x.%d\n",
bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
BUG_ON(!domain->pgd);
+ BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
+ translation != CONTEXT_TT_MULTI_LEVEL);
+
context = device_to_context_entry(iommu, bus, devfn);
if (!context)
return -ENOMEM;
@@ -1292,7 +1299,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
context_set_domain_id(*context, domain->id);
context_set_address_width(*context, domain->agaw);
context_set_address_root(*context, virt_to_phys(domain->pgd));
- context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
+ context_set_translation_type(*context, translation);
context_set_fault_enable(*context);
context_set_present(*context);
__iommu_flush_cache(iommu, context, sizeof(*context));
@@ -1310,13 +1317,14 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
}

static int
-domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
+domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
+ int translation)
{
int ret;
struct pci_dev *tmp, *parent;

ret = domain_context_mapping_one(domain, pdev->bus->number,
- pdev->devfn);
+ pdev->devfn, translation);
if (ret)
return ret;

@@ -1328,17 +1336,17 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
parent = pdev->bus->self;
while (parent != tmp) {
ret = domain_context_mapping_one(domain, parent->bus->number,
- parent->devfn);
+ parent->devfn, translation);
if (ret)
return ret;
parent = parent->bus->self;
}
if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
return domain_context_mapping_one(domain,
- tmp->subordinate->number, 0);
+ tmp->subordinate->number, 0, translation);
else /* this is a legacy PCI bridge */
return domain_context_mapping_one(domain,
- tmp->bus->number, tmp->devfn);
+ tmp->bus->number, tmp->devfn, translation);
}

static int domain_context_mapped(struct dmar_domain *domain,
@@ -1583,6 +1591,8 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev,
unsigned long size;
unsigned long long base;
int ret;
+ int translation = iommu_pass_through ? CONTEXT_TT_PASS_THROUGH :
+ CONTEXT_TT_MULTI_LEVEL;

printk(KERN_INFO
"IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
@@ -1617,7 +1627,7 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev,
goto error;

/* context entry init */
- ret = domain_context_mapping(domain, pdev);
+ ret = domain_context_mapping(domain, pdev, translation);
if (!ret)
return 0;
error:
@@ -1725,6 +1735,7 @@ static int __init init_dmars(void)
struct pci_dev *pdev;
struct intel_iommu *iommu;
int i, ret, unit = 0;
+ int pass_through = 1;

/*
* for each drhd
@@ -1790,7 +1801,14 @@ static int __init init_dmars(void)
printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
"invalidation\n", drhd->reg_base_addr);
}
+ if (!ecap_pass_through(iommu->ecap))
+ pass_through = 0;
}
+ if (iommu_pass_through & pass_through) {
+ iommu_pass_through = 1;
+ printk(KERN_INFO "IOMMU is using Pass Through.\n");
+ } else
+ iommu_pass_through = 0;

/*
* For each rmrr
@@ -1921,6 +1939,8 @@ get_valid_domain_for_dev(struct pci_dev *pdev)
{
struct dmar_domain *domain;
int ret;
+ int translation = iommu_pass_through ? CONTEXT_TT_PASS_THROUGH :
+ CONTEXT_TT_MULTI_LEVEL;

domain = get_domain_for_dev(pdev,
DEFAULT_DOMAIN_ADDRESS_WIDTH);
@@ -1932,7 +1952,7 @@ get_valid_domain_for_dev(struct pci_dev *pdev)

/* make sure context mapping is ok */
if (unlikely(!domain_context_mapped(domain, pdev))) {
- ret = domain_context_mapping(domain, pdev);
+ ret = domain_context_mapping(domain, pdev, translation);
if (ret) {
printk(KERN_ERR
"Domain context map for %s failed",
@@ -2450,7 +2470,8 @@ int __init intel_iommu_init(void)

init_timer(&unmap_timer);
force_iommu = 1;
- dma_ops = &intel_dma_ops;
+ if (!iommu_pass_through)
+ dma_ops = &intel_dma_ops;
return 0;
}

@@ -2511,10 +2532,10 @@ struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);

int intel_iommu_context_mapping(
- struct dmar_domain *domain, struct pci_dev *pdev)
+ struct dmar_domain *domain, struct pci_dev *pdev, int translation)
{
int rc;
- rc = domain_context_mapping(domain, pdev);
+ rc = domain_context_mapping(domain, pdev, translation);
return rc;
}
EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 7799a85..03054a6 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -12,6 +12,9 @@
#define DMA_PTE_READ (1)
#define DMA_PTE_WRITE (2)

+#define CONTEXT_TT_MULTI_LEVEL 0
+#define CONTEXT_TT_PASS_THROUGH 2
+
struct intel_iommu;
struct dmar_domain;
struct root_entry;
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 1bff7bf..229b101 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -120,6 +120,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
(ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16)
#define ecap_coherent(e) ((e) & 0x1)
#define ecap_qis(e) ((e) & 0x2)
+#define ecap_pass_through(e) ((e >> 6) & 0x1)
#define ecap_eim_support(e) ((e >> 4) & 0x1)
#define ecap_ir_support(e) ((e >> 3) & 0x1)
#define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
@@ -332,7 +333,7 @@ extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
void intel_iommu_domain_exit(struct dmar_domain *domain);
struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev);
int intel_iommu_context_mapping(struct dmar_domain *domain,
- struct pci_dev *pdev);
+ struct pci_dev *pdev, int translation);
int intel_iommu_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
u64 hpa, size_t size, int prot);
void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn);