2009-04-16 00:20:14

by Fenghua Yu

[permalink] [raw]
Subject: [PATCH] Intel IOMMU Pass Through Support

The patch adds kernel parameter intel_iommu=pt to set up pass through mode in
context mapping entry. This disables DMAR in linux kernel; but KVM still runs on
VT-d and interrupt remapping still works.

In this mode, kernel uses swiotlb for DMA API functions but other VT-d
functionalities are enabled for KVM. KVM always uses multi level translation
page table in VT-d. By default, pass though mode is disabled in kernel.

This is useful when people don't want to enable VT-d DMAR in kernel but still
want to use KVM and interrupt remapping for reasons like DMAR performance
concern or debug purpose.

Thanks.

-Fenghua

Signed-off-by: Fenghua Yu <[email protected]>

---

Documentation/kernel-parameters.txt | 5
arch/ia64/include/asm/iommu.h | 1
arch/ia64/kernel/pci-swiotlb.c | 2
arch/x86/include/asm/iommu.h | 1
arch/x86/kernel/pci-swiotlb.c | 3
drivers/pci/dmar.c | 9 +
drivers/pci/intel-iommu.c | 187 ++++++++++++++++++++++++++----------
include/linux/dma_remapping.h | 8 +
include/linux/intel-iommu.h | 2
9 files changed, 167 insertions(+), 51 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 6172e43..5594cdb 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -915,6 +915,11 @@ and is between 256 and 4096 characters. It is defined in the file
With this option on every unmap_single operation will
result in a hardware IOTLB flush operation as opposed
to batching them for performance.
+ pt [Default no Pass Through]
+ This option enables Pass Through in context mapping if
+ Pass Through is supported in hardware. With this option
+ DMAR is disabled in kernel and kernel uses swiotlb, but
+ KVM can still uses VT-d IOTLB hardware.

inttest= [IA64]

diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h
index 0490794..37d41ca 100644
--- a/arch/ia64/include/asm/iommu.h
+++ b/arch/ia64/include/asm/iommu.h
@@ -9,6 +9,7 @@ extern void pci_iommu_shutdown(void);
extern void no_iommu_init(void);
extern int force_iommu, no_iommu;
extern int iommu_detected;
+extern int iommu_pass_through;
extern void iommu_dma_init(void);
extern void machvec_init(const char *name);

diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
index 285aae8..223abb1 100644
--- a/arch/ia64/kernel/pci-swiotlb.c
+++ b/arch/ia64/kernel/pci-swiotlb.c
@@ -46,7 +46,7 @@ void __init swiotlb_dma_init(void)

void __init pci_swiotlb_init(void)
{
- if (!iommu_detected) {
+ if (!iommu_detected || iommu_pass_through) {
#ifdef CONFIG_IA64_GENERIC
swiotlb = 1;
printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index af326a2..fd6d21b 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -6,6 +6,7 @@ extern void no_iommu_init(void);
extern struct dma_map_ops nommu_dma_ops;
extern int force_iommu, no_iommu;
extern int iommu_detected;
+extern int iommu_pass_through;

/* 10 seconds */
#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 34f12e9..42a0eb1 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void)
{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
#ifdef CONFIG_X86_64
- if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
+ if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
+ iommu_pass_through)
swiotlb = 1;
#endif
if (swiotlb_force)
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index fa3a113..1ef1a19 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -515,6 +515,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
u32 ver;
static int iommu_allocated = 0;
int agaw = 0;
+ int msagaw = 0;

iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
if (!iommu)
@@ -539,8 +540,16 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
iommu->seq_id);
goto error;
}
+ msagaw = iommu_calculate_max_sagaw(iommu);
+ if (msagaw < 0) {
+ printk(KERN_ERR
+ "Cannot get a valid max agaw for iommu (seq_id = %d)\n",
+ iommu->seq_id);
+ goto error;
+ }
#endif
iommu->agaw = agaw;
+ iommu->msagaw = msagaw;

/* the registers might be more than one page */
map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 001b328..205e4a1 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -53,6 +53,8 @@

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

+#define MAX_AGAW_WIDTH 64
+
#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)

#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
@@ -127,8 +129,6 @@ static inline void context_set_fault_enable(struct context_entry *context)
context->lo &= (((u64)-1) << 2) | 1;
}

-#define CONTEXT_TT_MULTI_LEVEL 0
-
static inline void context_set_translation_type(struct context_entry *context,
unsigned long value)
{
@@ -288,6 +288,7 @@ int dmar_disabled = 1;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
+int iommu_pass_through;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
@@ -318,6 +319,9 @@ static int __init intel_iommu_setup(char *str)
printk(KERN_INFO
"Intel-IOMMU: disable batched IOTLB flush\n");
intel_iommu_strict = 1;
+ } else if (!strncmp(str, "pt", 2)) {
+ iommu_pass_through = 1;
+ printk(KERN_INFO "Intel-IOMMU: Pass Through enabled\n");
}

str += strcspn(str, ",");
@@ -397,17 +401,13 @@ void free_iova_mem(struct iova *iova)

static inline int width_to_agaw(int width);

-/* calculate agaw for each iommu.
- * "SAGAW" may be different across iommus, use a default agaw, and
- * get a supported less agaw for iommus that don't support the default agaw.
- */
-int iommu_calculate_agaw(struct intel_iommu *iommu)
+static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
unsigned long sagaw;
int agaw = -1;

sagaw = cap_sagaw(iommu->cap);
- for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
+ for (agaw = width_to_agaw(max_gaw);
agaw >= 0; agaw--) {
if (test_bit(agaw, &sagaw))
break;
@@ -416,6 +416,24 @@ int iommu_calculate_agaw(struct intel_iommu *iommu)
return agaw;
}

+/*
+ * Calculate max SAGAW for each iommu.
+ */
+int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
+{
+ return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
+}
+
+/*
+ * calculate agaw for each iommu.
+ * "SAGAW" may be different across iommus, use a default agaw, and
+ * get a supported less agaw for iommus that don't support the default agaw.
+ */
+int iommu_calculate_agaw(struct intel_iommu *iommu)
+{
+ return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+}
+
/* in native case, each domain is related to only one iommu */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
@@ -1321,8 +1339,8 @@ static void domain_exit(struct dmar_domain *domain)
free_domain_mem(domain);
}

-static int domain_context_mapping_one(struct dmar_domain *domain,
- int segment, u8 bus, u8 devfn)
+static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
+ u8 bus, u8 devfn, int translation)
{
struct context_entry *context;
unsigned long flags;
@@ -1335,7 +1353,10 @@ static int domain_context_mapping_one(struct dmar_domain *domain,

pr_debug("Set context mapping for %02x:%02x.%d\n",
bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
BUG_ON(!domain->pgd);
+ BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
+ translation != CONTEXT_TT_MULTI_LEVEL);

iommu = device_to_iommu(segment, bus, devfn);
if (!iommu)
@@ -1395,9 +1416,18 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
}

context_set_domain_id(context, id);
- context_set_address_width(context, iommu->agaw);
- context_set_address_root(context, virt_to_phys(pgd));
- context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
+
+ /*
+ * In pass through mode, AW must be programmed to indicate the largest
+ * AGAW value supported by hardware. And ASR is ignored by hardware.
+ */
+ if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) {
+ context_set_address_width(context, iommu->agaw);
+ context_set_address_root(context, virt_to_phys(pgd));
+ } else
+ context_set_address_width(context, iommu->msagaw);
+
+ context_set_translation_type(context, translation);
context_set_fault_enable(context);
context_set_present(context);
domain_flush_cache(domain, context, sizeof(*context));
@@ -1422,13 +1452,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
}

static int
-domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
+domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
+ int translation)
{
int ret;
struct pci_dev *tmp, *parent;

ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
- pdev->bus->number, pdev->devfn);
+ pdev->bus->number, pdev->devfn,
+ translation);
if (ret)
return ret;

@@ -1440,9 +1472,9 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
parent = pdev->bus->self;
while (parent != tmp) {
ret = domain_context_mapping_one(domain,
- pci_domain_nr(parent->bus),
- parent->bus->number,
- parent->devfn);
+ pci_domain_nr(parent->bus),
+ parent->bus->number,
+ parent->devfn, translation);
if (ret)
return ret;
parent = parent->bus->self;
@@ -1450,12 +1482,14 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
return domain_context_mapping_one(domain,
pci_domain_nr(tmp->subordinate),
- tmp->subordinate->number, 0);
+ tmp->subordinate->number, 0,
+ translation);
else /* this is a legacy PCI bridge */
return domain_context_mapping_one(domain,
pci_domain_nr(tmp->bus),
tmp->bus->number,
- tmp->devfn);
+ tmp->devfn,
+ translation);
}

static int domain_context_mapped(struct pci_dev *pdev)
@@ -1752,7 +1786,7 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev,
goto error;

/* context entry init */
- ret = domain_context_mapping(domain, pdev);
+ ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
if (!ret)
return 0;
error:
@@ -1853,6 +1887,23 @@ static inline void iommu_prepare_isa(void)
}
#endif /* !CONFIG_DMAR_FLPY_WA */

+/* Initialize each context entry as pass through.*/
+static int __init init_context_pass_through(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct dmar_domain *domain;
+ int ret;
+
+ for_each_pci_dev(pdev) {
+ domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+ ret = domain_context_mapping(domain, pdev,
+ CONTEXT_TT_PASS_THROUGH);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
static int __init init_dmars(void)
{
struct dmar_drhd_unit *drhd;
@@ -1860,6 +1911,7 @@ static int __init init_dmars(void)
struct pci_dev *pdev;
struct intel_iommu *iommu;
int i, ret;
+ int pass_through = 1;

/*
* for each drhd
@@ -1913,7 +1965,15 @@ static int __init init_dmars(void)
printk(KERN_ERR "IOMMU: allocate root entry failed\n");
goto error;
}
+ if (!ecap_pass_through(iommu->ecap))
+ pass_through = 0;
}
+ if (iommu_pass_through)
+ if (!pass_through) {
+ printk(KERN_INFO
+ "Pass Through is not supported by hardware.\n");
+ iommu_pass_through = 0;
+ }

/*
* Start from the sane iommu hardware state.
@@ -1976,37 +2036,57 @@ static int __init init_dmars(void)
"IOMMU: enable interrupt remapping failed\n");
}
#endif
+ /*
+ * If pass through is set and enabled, context entries of all pci
+ * devices are intialized by pass through translation type.
+ */
+ if (iommu_pass_through) {
+ ret = init_context_pass_through();
+ if (ret) {
+ printk(KERN_ERR "IOMMU: Pass through init failed.\n");
+ iommu_pass_through = 0;
+ }
+ }

/*
- * For each rmrr
- * for each dev attached to rmrr
- * do
- * locate drhd for dev, alloc domain for dev
- * allocate free domain
- * allocate page table entries for rmrr
- * if context not allocated for bus
- * allocate and init context
- * set present in root table for this bus
- * init context with domain, translation etc
- * endfor
- * endfor
+ * If pass through is not set or not enabled, setup context entries for
+ * identity mappings for rmrr, gfx, and isa.
*/
- for_each_rmrr_units(rmrr) {
- for (i = 0; i < rmrr->devices_cnt; i++) {
- pdev = rmrr->devices[i];
- /* some BIOS lists non-exist devices in DMAR table */
- if (!pdev)
- continue;
- ret = iommu_prepare_rmrr_dev(rmrr, pdev);
- if (ret)
- printk(KERN_ERR
+ if (!iommu_pass_through) {
+ /*
+ * For each rmrr
+ * for each dev attached to rmrr
+ * do
+ * locate drhd for dev, alloc domain for dev
+ * allocate free domain
+ * allocate page table entries for rmrr
+ * if context not allocated for bus
+ * allocate and init context
+ * set present in root table for this bus
+ * init context with domain, translation etc
+ * endfor
+ * endfor
+ */
+ for_each_rmrr_units(rmrr) {
+ for (i = 0; i < rmrr->devices_cnt; i++) {
+ pdev = rmrr->devices[i];
+ /*
+ * some BIOS lists non-exist devices in DMAR
+ * table.
+ */
+ if (!pdev)
+ continue;
+ ret = iommu_prepare_rmrr_dev(rmrr, pdev);
+ if (ret)
+ printk(KERN_ERR
"IOMMU: mapping reserved region failed\n");
+ }
}
- }

- iommu_prepare_gfx_mapping();
+ iommu_prepare_gfx_mapping();

- iommu_prepare_isa();
+ iommu_prepare_isa();
+ }

/*
* for each drhd
@@ -2117,7 +2197,8 @@ get_valid_domain_for_dev(struct pci_dev *pdev)

/* make sure context mapping is ok */
if (unlikely(!domain_context_mapped(pdev))) {
- ret = domain_context_mapping(domain, pdev);
+ ret = domain_context_mapping(domain, pdev,
+ CONTEXT_TT_MULTI_LEVEL);
if (ret) {
printk(KERN_ERR
"Domain context map for %s failed",
@@ -2786,7 +2867,7 @@ int __init intel_iommu_init(void)
* Check the need for DMA-remapping initialization now.
* Above initialization will also be used by Interrupt-remapping.
*/
- if (no_iommu || swiotlb || dmar_disabled)
+ if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled)
return -ENODEV;

iommu_init_mempool();
@@ -2806,7 +2887,15 @@ int __init intel_iommu_init(void)

init_timer(&unmap_timer);
force_iommu = 1;
- dma_ops = &intel_dma_ops;
+
+ if (!iommu_pass_through) {
+ printk(KERN_INFO
+ "Multi-level page-table translation for DMAR.\n");
+ dma_ops = &intel_dma_ops;
+ } else
+ printk(KERN_INFO
+ "DMAR: Pass through translation for DMAR.\n");
+
init_iommu_sysfs();

register_iommu(&intel_iommu_ops);
@@ -3146,7 +3235,7 @@ static int intel_iommu_attach_device(struct iommu_domain *domain,
return -EFAULT;
}

- ret = domain_context_mapping(dmar_domain, pdev);
+ ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
if (ret)
return ret;

diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 1a455f1..e0a03af 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -13,6 +13,9 @@
#define DMA_PTE_WRITE (2)
#define DMA_PTE_SNP (1 << 11)

+#define CONTEXT_TT_MULTI_LEVEL 0
+#define CONTEXT_TT_PASS_THROUGH 2
+
struct intel_iommu;
struct dmar_domain;
struct root_entry;
@@ -21,11 +24,16 @@ extern void free_dmar_iommu(struct intel_iommu *iommu);

#ifdef CONFIG_DMAR
extern int iommu_calculate_agaw(struct intel_iommu *iommu);
+extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
#else
static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
{
return 0;
}
+static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
+{
+ return 0;
+}
#endif

extern int dmar_disabled;
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index aa8c531..7246971 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -120,6 +120,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
(ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16)
#define ecap_coherent(e) ((e) & 0x1)
#define ecap_qis(e) ((e) & 0x2)
+#define ecap_pass_through(e) ((e >> 6) & 0x1)
#define ecap_eim_support(e) ((e >> 4) & 0x1)
#define ecap_ir_support(e) ((e >> 3) & 0x1)
#define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
@@ -302,6 +303,7 @@ struct intel_iommu {
spinlock_t register_lock; /* protect register handling */
int seq_id; /* sequence id of the iommu */
int agaw; /* agaw of this iommu */
+ int msagaw; /* max sagaw of this iommu */
unsigned int irq;
unsigned char name[13]; /* Device Name */


2009-04-16 02:14:39

by Weidong Han

[permalink] [raw]
Subject: RE: [PATCH] Intel IOMMU Pass Through Support

Acked-by: Weidong Han <[email protected]>

Yu, Fenghua wrote:
> The patch adds kernel parameter intel_iommu=pt to set up pass through
> mode in
> context mapping entry. This disables DMAR in linux kernel; but KVM
> still runs on
> VT-d and interrupt remapping still works.
>
> In this mode, kernel uses swiotlb for DMA API functions but other VT-d
> functionalities are enabled for KVM. KVM always uses multi level
> translation
> page table in VT-d. By default, pass though mode is disabled in
> kernel.
>
> This is useful when people don't want to enable VT-d DMAR in kernel
> but still
> want to use KVM and interrupt remapping for reasons like DMAR
> performance
> concern or debug purpose.
>
> Thanks.
>
> -Fenghua
>
> Signed-off-by: Fenghua Yu <[email protected]>
>
> ---
>
> Documentation/kernel-parameters.txt | 5
> arch/ia64/include/asm/iommu.h | 1
> arch/ia64/kernel/pci-swiotlb.c | 2
> arch/x86/include/asm/iommu.h | 1
> arch/x86/kernel/pci-swiotlb.c | 3
> drivers/pci/dmar.c | 9 +
> drivers/pci/intel-iommu.c | 187
> ++++++++++++++++++++++++++---------- include/linux/dma_remapping.h
> | 8 + include/linux/intel-iommu.h | 2
> 9 files changed, 167 insertions(+), 51 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt
> b/Documentation/kernel-parameters.txt
> index 6172e43..5594cdb 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -915,6 +915,11 @@ and is between 256 and 4096 characters. It is
> defined in the file With this option on every unmap_single
> operation will result in a hardware IOTLB flush operation as
> opposed to batching them for performance.
> + pt [Default no Pass Through]
> + This option enables Pass Through in context mapping if
> + Pass Through is supported in hardware. With this option
> + DMAR is disabled in kernel and kernel uses swiotlb, but
> + KVM can still uses VT-d IOTLB hardware.
>
> inttest= [IA64]
>
> diff --git a/arch/ia64/include/asm/iommu.h
> b/arch/ia64/include/asm/iommu.h
> index 0490794..37d41ca 100644
> --- a/arch/ia64/include/asm/iommu.h
> +++ b/arch/ia64/include/asm/iommu.h
> @@ -9,6 +9,7 @@ extern void pci_iommu_shutdown(void);
> extern void no_iommu_init(void);
> extern int force_iommu, no_iommu;
> extern int iommu_detected;
> +extern int iommu_pass_through;
> extern void iommu_dma_init(void);
> extern void machvec_init(const char *name);
>
> diff --git a/arch/ia64/kernel/pci-swiotlb.c
> b/arch/ia64/kernel/pci-swiotlb.c
> index 285aae8..223abb1 100644
> --- a/arch/ia64/kernel/pci-swiotlb.c
> +++ b/arch/ia64/kernel/pci-swiotlb.c
> @@ -46,7 +46,7 @@ void __init swiotlb_dma_init(void)
>
> void __init pci_swiotlb_init(void)
> {
> - if (!iommu_detected) {
> + if (!iommu_detected || iommu_pass_through) {
> #ifdef CONFIG_IA64_GENERIC
> swiotlb = 1;
> printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
> diff --git a/arch/x86/include/asm/iommu.h
> b/arch/x86/include/asm/iommu.h
> index af326a2..fd6d21b 100644
> --- a/arch/x86/include/asm/iommu.h
> +++ b/arch/x86/include/asm/iommu.h
> @@ -6,6 +6,7 @@ extern void no_iommu_init(void);
> extern struct dma_map_ops nommu_dma_ops;
> extern int force_iommu, no_iommu;
> extern int iommu_detected;
> +extern int iommu_pass_through;
>
> /* 10 seconds */
> #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
> diff --git a/arch/x86/kernel/pci-swiotlb.c
> b/arch/x86/kernel/pci-swiotlb.c
> index 34f12e9..42a0eb1 100644
> --- a/arch/x86/kernel/pci-swiotlb.c
> +++ b/arch/x86/kernel/pci-swiotlb.c
> @@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void)
> {
> /* don't initialize swiotlb if iommu=off (no_iommu=1) */
> #ifdef CONFIG_X86_64
> - if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
> + if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
> + iommu_pass_through)
> swiotlb = 1;
> #endif
> if (swiotlb_force)
> diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
> index fa3a113..1ef1a19 100644
> --- a/drivers/pci/dmar.c
> +++ b/drivers/pci/dmar.c
> @@ -515,6 +515,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
> u32 ver;
> static int iommu_allocated = 0;
> int agaw = 0;
> + int msagaw = 0;
>
> iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> if (!iommu)
> @@ -539,8 +540,16 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
> iommu->seq_id);
> goto error;
> }
> + msagaw = iommu_calculate_max_sagaw(iommu);
> + if (msagaw < 0) {
> + printk(KERN_ERR
> + "Cannot get a valid max agaw for iommu (seq_id = %d)\n",
> + iommu->seq_id);
> + goto error;
> + }
> #endif
> iommu->agaw = agaw;
> + iommu->msagaw = msagaw;
>
> /* the registers might be more than one page */
> map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
> diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
> index 001b328..205e4a1 100644
> --- a/drivers/pci/intel-iommu.c
> +++ b/drivers/pci/intel-iommu.c
> @@ -53,6 +53,8 @@
>
> #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
>
> +#define MAX_AGAW_WIDTH 64
> +
> #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
>
> #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
> @@ -127,8 +129,6 @@ static inline void
> context_set_fault_enable(struct context_entry *context) context->lo
> &= (((u64)-1) << 2) | 1; }
>
> -#define CONTEXT_TT_MULTI_LEVEL 0
> -
> static inline void context_set_translation_type(struct context_entry
> *context, unsigned long value)
> {
> @@ -288,6 +288,7 @@ int dmar_disabled = 1;
> static int __initdata dmar_map_gfx = 1;
> static int dmar_forcedac;
> static int intel_iommu_strict;
> +int iommu_pass_through;
>
> #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
> static DEFINE_SPINLOCK(device_domain_lock);
> @@ -318,6 +319,9 @@ static int __init intel_iommu_setup(char *str)
> printk(KERN_INFO
> "Intel-IOMMU: disable batched IOTLB flush\n");
> intel_iommu_strict = 1;
> + } else if (!strncmp(str, "pt", 2)) {
> + iommu_pass_through = 1;
> + printk(KERN_INFO "Intel-IOMMU: Pass Through enabled\n");
> }
>
> str += strcspn(str, ",");
> @@ -397,17 +401,13 @@ void free_iova_mem(struct iova *iova)
>
> static inline int width_to_agaw(int width);
>
> -/* calculate agaw for each iommu.
> - * "SAGAW" may be different across iommus, use a default agaw, and
> - * get a supported less agaw for iommus that don't support the
> default agaw.
> - */
> -int iommu_calculate_agaw(struct intel_iommu *iommu)
> +static int __iommu_calculate_agaw(struct intel_iommu *iommu, int
> max_gaw) {
> unsigned long sagaw;
> int agaw = -1;
>
> sagaw = cap_sagaw(iommu->cap);
> - for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
> + for (agaw = width_to_agaw(max_gaw);
> agaw >= 0; agaw--) {
> if (test_bit(agaw, &sagaw))
> break;
> @@ -416,6 +416,24 @@ int iommu_calculate_agaw(struct intel_iommu
> *iommu) return agaw;
> }
>
> +/*
> + * Calculate max SAGAW for each iommu.
> + */
> +int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
> +{
> + return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
> +}
> +
> +/*
> + * calculate agaw for each iommu.
> + * "SAGAW" may be different across iommus, use a default agaw, and
> + * get a supported less agaw for iommus that don't support the
> default agaw. + */
> +int iommu_calculate_agaw(struct intel_iommu *iommu)
> +{
> + return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
> +}
> +
> /* in native case, each domain is related to only one iommu */
> static struct intel_iommu *domain_get_iommu(struct dmar_domain
> *domain) {
> @@ -1321,8 +1339,8 @@ static void domain_exit(struct dmar_domain
> *domain) free_domain_mem(domain);
> }
>
> -static int domain_context_mapping_one(struct dmar_domain *domain,
> - int segment, u8 bus, u8 devfn)
> +static int domain_context_mapping_one(struct dmar_domain *domain,
> int segment, + u8 bus, u8 devfn, int translation)
> {
> struct context_entry *context;
> unsigned long flags;
> @@ -1335,7 +1353,10 @@ static int domain_context_mapping_one(struct
> dmar_domain *domain,
>
> pr_debug("Set context mapping for %02x:%02x.%d\n",
> bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
> +
> BUG_ON(!domain->pgd);
> + BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
> + translation != CONTEXT_TT_MULTI_LEVEL);
>
> iommu = device_to_iommu(segment, bus, devfn);
> if (!iommu)
> @@ -1395,9 +1416,18 @@ static int domain_context_mapping_one(struct
> dmar_domain *domain, }
>
> context_set_domain_id(context, id);
> - context_set_address_width(context, iommu->agaw);
> - context_set_address_root(context, virt_to_phys(pgd));
> - context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
> +
> + /*
> + * In pass through mode, AW must be programmed to indicate the
> largest + * AGAW value supported by hardware. And ASR is ignored by
> hardware. + */
> + if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) {
> + context_set_address_width(context, iommu->agaw);
> + context_set_address_root(context, virt_to_phys(pgd));
> + } else
> + context_set_address_width(context, iommu->msagaw);
> +
> + context_set_translation_type(context, translation);
> context_set_fault_enable(context);
> context_set_present(context);
> domain_flush_cache(domain, context, sizeof(*context));
> @@ -1422,13 +1452,15 @@ static int domain_context_mapping_one(struct
> dmar_domain *domain, }
>
> static int
> -domain_context_mapping(struct dmar_domain *domain, struct pci_dev
> *pdev) +domain_context_mapping(struct dmar_domain *domain, struct
> pci_dev *pdev, + int translation)
> {
> int ret;
> struct pci_dev *tmp, *parent;
>
> ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
> - pdev->bus->number, pdev->devfn);
> + pdev->bus->number, pdev->devfn,
> + translation);
> if (ret)
> return ret;
>
> @@ -1440,9 +1472,9 @@ domain_context_mapping(struct dmar_domain
> *domain, struct pci_dev *pdev) parent = pdev->bus->self;
> while (parent != tmp) {
> ret = domain_context_mapping_one(domain,
> - pci_domain_nr(parent->bus),
> - parent->bus->number,
> - parent->devfn);
> + pci_domain_nr(parent->bus),
> + parent->bus->number,
> + parent->devfn, translation);
> if (ret)
> return ret;
> parent = parent->bus->self;
> @@ -1450,12 +1482,14 @@ domain_context_mapping(struct dmar_domain
> *domain, struct pci_dev *pdev) if (tmp->is_pcie) /* this is a
> PCIE-to-PCI bridge */ return domain_context_mapping_one(domain,
> pci_domain_nr(tmp->subordinate),
> - tmp->subordinate->number, 0);
> + tmp->subordinate->number, 0,
> + translation);
> else /* this is a legacy PCI bridge */
> return domain_context_mapping_one(domain,
> pci_domain_nr(tmp->bus),
> tmp->bus->number,
> - tmp->devfn);
> + tmp->devfn,
> + translation);
> }
>
> static int domain_context_mapped(struct pci_dev *pdev)
> @@ -1752,7 +1786,7 @@ static int iommu_prepare_identity_map(struct
> pci_dev *pdev, goto error;
>
> /* context entry init */
> - ret = domain_context_mapping(domain, pdev);
> + ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
> if (!ret)
> return 0;
> error:
> @@ -1853,6 +1887,23 @@ static inline void iommu_prepare_isa(void)
> }
> #endif /* !CONFIG_DMAR_FLPY_WA */
>
> +/* Initialize each context entry as pass through.*/
> +static int __init init_context_pass_through(void)
> +{
> + struct pci_dev *pdev = NULL;
> + struct dmar_domain *domain;
> + int ret;
> +
> + for_each_pci_dev(pdev) {
> + domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
> + ret = domain_context_mapping(domain, pdev,
> + CONTEXT_TT_PASS_THROUGH);
> + if (ret)
> + return ret;
> + }
> + return 0;
> +}
> +
> static int __init init_dmars(void)
> {
> struct dmar_drhd_unit *drhd;
> @@ -1860,6 +1911,7 @@ static int __init init_dmars(void)
> struct pci_dev *pdev;
> struct intel_iommu *iommu;
> int i, ret;
> + int pass_through = 1;
>
> /*
> * for each drhd
> @@ -1913,7 +1965,15 @@ static int __init init_dmars(void)
> printk(KERN_ERR "IOMMU: allocate root entry failed\n");
> goto error;
> }
> + if (!ecap_pass_through(iommu->ecap))
> + pass_through = 0;
> }
> + if (iommu_pass_through)
> + if (!pass_through) {
> + printk(KERN_INFO
> + "Pass Through is not supported by hardware.\n");
> + iommu_pass_through = 0;
> + }
>
> /*
> * Start from the sane iommu hardware state.
> @@ -1976,37 +2036,57 @@ static int __init init_dmars(void)
> "IOMMU: enable interrupt remapping failed\n");
> }
> #endif
> + /*
> + * If pass through is set and enabled, context entries of all pci
> + * devices are intialized by pass through translation type.
> + */
> + if (iommu_pass_through) {
> + ret = init_context_pass_through();
> + if (ret) {
> + printk(KERN_ERR "IOMMU: Pass through init failed.\n");
> + iommu_pass_through = 0;
> + }
> + }
>
> /*
> - * For each rmrr
> - * for each dev attached to rmrr
> - * do
> - * locate drhd for dev, alloc domain for dev
> - * allocate free domain
> - * allocate page table entries for rmrr
> - * if context not allocated for bus
> - * allocate and init context
> - * set present in root table for this bus
> - * init context with domain, translation etc
> - * endfor
> - * endfor
> + * If pass through is not set or not enabled, setup context entries
> for + * identity mappings for rmrr, gfx, and isa.
> */
> - for_each_rmrr_units(rmrr) {
> - for (i = 0; i < rmrr->devices_cnt; i++) {
> - pdev = rmrr->devices[i];
> - /* some BIOS lists non-exist devices in DMAR table */
> - if (!pdev)
> - continue;
> - ret = iommu_prepare_rmrr_dev(rmrr, pdev);
> - if (ret)
> - printk(KERN_ERR
> + if (!iommu_pass_through) {
> + /*
> + * For each rmrr
> + * for each dev attached to rmrr
> + * do
> + * locate drhd for dev, alloc domain for dev
> + * allocate free domain
> + * allocate page table entries for rmrr
> + * if context not allocated for bus
> + * allocate and init context
> + * set present in root table for this bus
> + * init context with domain, translation etc
> + * endfor
> + * endfor
> + */
> + for_each_rmrr_units(rmrr) {
> + for (i = 0; i < rmrr->devices_cnt; i++) {
> + pdev = rmrr->devices[i];
> + /*
> + * some BIOS lists non-exist devices in DMAR
> + * table.
> + */
> + if (!pdev)
> + continue;
> + ret = iommu_prepare_rmrr_dev(rmrr, pdev);
> + if (ret)
> + printk(KERN_ERR
> "IOMMU: mapping reserved region failed\n");
> + }
> }
> - }
>
> - iommu_prepare_gfx_mapping();
> + iommu_prepare_gfx_mapping();
>
> - iommu_prepare_isa();
> + iommu_prepare_isa();
> + }
>
> /*
> * for each drhd
> @@ -2117,7 +2197,8 @@ get_valid_domain_for_dev(struct pci_dev *pdev)
>
> /* make sure context mapping is ok */
> if (unlikely(!domain_context_mapped(pdev))) {
> - ret = domain_context_mapping(domain, pdev);
> + ret = domain_context_mapping(domain, pdev,
> + CONTEXT_TT_MULTI_LEVEL);
> if (ret) {
> printk(KERN_ERR
> "Domain context map for %s failed",
> @@ -2786,7 +2867,7 @@ int __init intel_iommu_init(void)
> * Check the need for DMA-remapping initialization now.
> * Above initialization will also be used by Interrupt-remapping.
> */
> - if (no_iommu || swiotlb || dmar_disabled)
> + if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled)
> return -ENODEV;
>
> iommu_init_mempool();
> @@ -2806,7 +2887,15 @@ int __init intel_iommu_init(void)
>
> init_timer(&unmap_timer);
> force_iommu = 1;
> - dma_ops = &intel_dma_ops;
> +
> + if (!iommu_pass_through) {
> + printk(KERN_INFO
> + "Multi-level page-table translation for DMAR.\n");
> + dma_ops = &intel_dma_ops;
> + } else
> + printk(KERN_INFO
> + "DMAR: Pass through translation for DMAR.\n");
> +
> init_iommu_sysfs();
>
> register_iommu(&intel_iommu_ops);
> @@ -3146,7 +3235,7 @@ static int intel_iommu_attach_device(struct
> iommu_domain *domain, return -EFAULT;
> }
>
> - ret = domain_context_mapping(dmar_domain, pdev);
> + ret = domain_context_mapping(dmar_domain, pdev,
> CONTEXT_TT_MULTI_LEVEL); if (ret)
> return ret;
>
> diff --git a/include/linux/dma_remapping.h
> b/include/linux/dma_remapping.h
> index 1a455f1..e0a03af 100644
> --- a/include/linux/dma_remapping.h
> +++ b/include/linux/dma_remapping.h
> @@ -13,6 +13,9 @@
> #define DMA_PTE_WRITE (2)
> #define DMA_PTE_SNP (1 << 11)
>
> +#define CONTEXT_TT_MULTI_LEVEL 0
> +#define CONTEXT_TT_PASS_THROUGH 2
> +
> struct intel_iommu;
> struct dmar_domain;
> struct root_entry;
> @@ -21,11 +24,16 @@ extern void free_dmar_iommu(struct intel_iommu
> *iommu);
>
> #ifdef CONFIG_DMAR
> extern int iommu_calculate_agaw(struct intel_iommu *iommu);
> +extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
> #else
> static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
> {
> return 0;
> }
> +static inline int iommu_calculate_max_sagaw(struct intel_iommu
> *iommu) +{
> + return 0;
> +}
> #endif
>
> extern int dmar_disabled;
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index aa8c531..7246971 100644
> --- a/include/linux/intel-iommu.h
> +++ b/include/linux/intel-iommu.h
> @@ -120,6 +120,7 @@ static inline void dmar_writeq(void __iomem
> *addr, u64 val) (ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16)
> #define ecap_coherent(e) ((e) & 0x1)
> #define ecap_qis(e) ((e) & 0x2)
> +#define ecap_pass_through(e) ((e >> 6) & 0x1)
> #define ecap_eim_support(e) ((e >> 4) & 0x1)
> #define ecap_ir_support(e) ((e >> 3) & 0x1)
> #define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
> @@ -302,6 +303,7 @@ struct intel_iommu {
> spinlock_t register_lock; /* protect register handling */
> int seq_id; /* sequence id of the iommu */
> int agaw; /* agaw of this iommu */
> + int msagaw; /* max sagaw of this iommu */
> unsigned int irq;
> unsigned char name[13]; /* Device Name */

2009-04-19 10:05:43

by David Woodhouse

[permalink] [raw]
Subject: Re: [PATCH] Intel IOMMU Pass Through Support

On Wed, 2009-04-15 at 17:19 -0700, Fenghua Yu wrote:
> + if (iommu_pass_through)
> + if (!pass_through) {
> + printk(KERN_INFO
> + "Pass Through is not supported by hardware.\n");
> + iommu_pass_through = 0;
> + }

So if we ask for pass-through and the hardware doesn't support it, we
end up in full-translation mode? Wouldn't it be better to set up 1:1
mappings for each device instead?

(We probably want to fix up the code which sets up 1:1 mappings to
actually share page tables where appropriate, and to use superpages).

Also, did we already have a discussion about whether this should be an
intel_iommu= parameter, or a generic iommu= one?

> ret = domain_context_mapping_one(domain,
> - pci_domain_nr(parent->bus),
> - parent->bus->number,
> - parent->devfn);
> + pci_domain_nr(parent->bus),
> + parent->bus->number,
> + parent->devfn, translation);

Whitespace damage here -- you're modifying continuation lines which used
to be right underneath the 'domain' parameter, and you're making them
not line up properly.

You have the same problem elsewhere for newly-added code too, in fact:

> + ret = domain_context_mapping(domain, pdev,
> + CONTEXT_TT_PASS_THROUGH);


--
David Woodhouse Open Source Technology Centre
[email protected] Intel Corporation

2009-04-20 17:27:36

by Fenghua Yu

[permalink] [raw]
Subject: RE: [PATCH] Intel IOMMU Pass Through Support

>On Wed, 2009-04-15 at 17:19 -0700, Fenghua Yu wrote:
>> + if (iommu_pass_through)
>> + if (!pass_through) {
>> + printk(KERN_INFO
>> + "Pass Through is not supported by
>hardware.\n");
>> + iommu_pass_through = 0;
>> + }
>
>So if we ask for pass-through and the hardware doesn't support it, we
>end up in full-translation mode? Wouldn't it be better to set up 1:1
>mappings for each device instead?
>
>(We probably want to fix up the code which sets up 1:1 mappings to
>actually share page tables where appropriate, and to use superpages).
>
>Also, did we already have a discussion about whether this should be an
>intel_iommu= parameter, or a generic iommu= one?

For this patch, I would just fall back to "normal" translation if without hardware pass-through support.

I'm doing 1:1 mapping now to speed up mapping and unmapping. When the 1:1 mapping is implemented, the fall back from pass-through will go to 1:1 mapping which will be "normal" translation.

BTW, I think most of (or newer) VT-d hardware supports pass through feature now. So it won't fail anyway if pass through is wanted.

So at this point, this should be fine.

>
>> ret = domain_context_mapping_one(domain,
>> - pci_domain_nr(parent-
>>bus),
>> - parent->bus->number,
>> - parent->devfn);
>> + pci_domain_nr(parent-
>>bus),
>> + parent->bus->number,
>> + parent->devfn,
>translation);
>
>Whitespace damage here -- you're modifying continuation lines which used
>to be right underneath the 'domain' parameter, and you're making them
>not line up properly.
>
>You have the same problem elsewhere for newly-added code too, in fact:
>
>> + ret = domain_context_mapping(domain, pdev,
>> + CONTEXT_TT_PASS_THROUGH);
>

OK. I'll change this to align to the upper parameter.

Thanks.

-Fenghuas

2009-04-30 23:33:19

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] Intel IOMMU Pass Through Support

On Wed, 15 Apr 2009 17:19:57 -0700
Fenghua Yu <[email protected]> wrote:

> The patch adds kernel parameter intel_iommu=pt to set up pass through mode in
> context mapping entry. This disables DMAR in linux kernel; but KVM still runs on
> VT-d and interrupt remapping still works.
>
> In this mode, kernel uses swiotlb for DMA API functions but other VT-d
> functionalities are enabled for KVM. KVM always uses multi level translation
> page table in VT-d. By default, pass though mode is disabled in kernel.
>
> This is useful when people don't want to enable VT-d DMAR in kernel but still
> want to use KVM and interrupt remapping for reasons like DMAR performance
> concern or debug purpose.
>

The patch is now in linux-next and broke my build.

arch/x86/built-in.o: In function `iommu_setup':
arch/x86/kernel/pci-dma.c:215: undefined reference to `iommu_pass_through'
arch/x86/built-in.o: In function `pci_swiotlb_init':
arch/x86/kernel/pci-swiotlb.c:74: undefined reference to `iommu_pass_through'

Because iommu_pass_through is defined in drivers/pci/intel-iommu.c and

# CONFIG_DMAR is not set

I'll need to cook up some local hack to work around that.



Also, the patch in linux-next (but not the one which I'm replying to
here does:

: --- a/arch/x86/kernel/pci-dma.c
: +++ b/arch/x86/kernel/pci-dma.c
: @@ -160,6 +160,8 @@ again:
: return page_address(page);
: }
:
: +extern int iommu_pass_through;
: +

Which is wrong and unnecessary - the variable was already declared in a
header file.

scripts/checkpatch.pl would have warned you about this error, but
apparently neither you nor the people who reviewed or merged the patch
bother to use it.

2009-04-30 23:36:41

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH] Intel IOMMU Pass Through Support

Andrew Morton wrote:
> On Wed, 15 Apr 2009 17:19:57 -0700
> Fenghua Yu <[email protected]> wrote:
>
>> The patch adds kernel parameter intel_iommu=pt to set up pass through mode in
>> context mapping entry. This disables DMAR in linux kernel; but KVM still runs on
>> VT-d and interrupt remapping still works.
>>
>> In this mode, kernel uses swiotlb for DMA API functions but other VT-d
>> functionalities are enabled for KVM. KVM always uses multi level translation
>> page table in VT-d. By default, pass though mode is disabled in kernel.
>>
>> This is useful when people don't want to enable VT-d DMAR in kernel but still
>> want to use KVM and interrupt remapping for reasons like DMAR performance
>> concern or debug purpose.
>>
>
> The patch is now in linux-next and broke my build.
>
> arch/x86/built-in.o: In function `iommu_setup':
> arch/x86/kernel/pci-dma.c:215: undefined reference to `iommu_pass_through'
> arch/x86/built-in.o: In function `pci_swiotlb_init':
> arch/x86/kernel/pci-swiotlb.c:74: undefined reference to `iommu_pass_through'
>
> Because iommu_pass_through is defined in drivers/pci/intel-iommu.c and
>
> # CONFIG_DMAR is not set
>
> I'll need to cook up some local hack to work around that.
>

Patch just went to linux-next mailing list (but should have also
gone to lkml):

http://marc.info/?l=linux-next&m=124113213400748&w=2


>
> Also, the patch in linux-next (but not the one which I'm replying to
> here does:
>
> : --- a/arch/x86/kernel/pci-dma.c
> : +++ b/arch/x86/kernel/pci-dma.c
> : @@ -160,6 +160,8 @@ again:
> : return page_address(page);
> : }
> :
> : +extern int iommu_pass_through;
> : +
>
> Which is wrong and unnecessary - the variable was already declared in a
> header file.
>
> scripts/checkpatch.pl would have warned you about this error, but
> apparently neither you nor the people who reviewed or merged the patch
> bother to use it.

2009-05-01 00:23:38

by Fenghua Yu

[permalink] [raw]
Subject: Re: [PATCH] Intel IOMMU Pass Through Support

On Thu, Apr 30, 2009 at 04:37:52PM -0700, Randy Dunlap wrote:
> Andrew Morton wrote:
> > On Wed, 15 Apr 2009 17:19:57 -0700
> > Fenghua Yu <[email protected]> wrote:
> >
> >> The patch adds kernel parameter intel_iommu=pt to set up pass through mode in
> >> context mapping entry. This disables DMAR in linux kernel; but KVM still runs on
> >> VT-d and interrupt remapping still works.
> >>
> >> In this mode, kernel uses swiotlb for DMA API functions but other VT-d
> >> functionalities are enabled for KVM. KVM always uses multi level translation
> >> page table in VT-d. By default, pass though mode is disabled in kernel.
> >>
> >> This is useful when people don't want to enable VT-d DMAR in kernel but still
> >> want to use KVM and interrupt remapping for reasons like DMAR performance
> >> concern or debug purpose.
> >>
> >
> > The patch is now in linux-next and broke my build.
> >
> > arch/x86/built-in.o: In function `iommu_setup':
> > arch/x86/kernel/pci-dma.c:215: undefined reference to `iommu_pass_through'
> > arch/x86/built-in.o: In function `pci_swiotlb_init':
> > arch/x86/kernel/pci-swiotlb.c:74: undefined reference to `iommu_pass_through'
> >
> > Because iommu_pass_through is defined in drivers/pci/intel-iommu.c and
> >
> > # CONFIG_DMAR is not set
> >
> > I'll need to cook up some local hack to work around that.
> >
>
> Patch just went to linux-next mailing list (but should have also
> gone to lkml):
>
> http://marc.info/?l=linux-next&m=124113213400748&w=2

That's right. I just posted this patch to fix the compiling errors.

>
>
> >
> > Also, the patch in linux-next (but not the one which I'm replying to
> > here does:
> >
> > : --- a/arch/x86/kernel/pci-dma.c
> > : +++ b/arch/x86/kernel/pci-dma.c
> > : @@ -160,6 +160,8 @@ again:
> > : return page_address(page);
> > : }
> > :
> > : +extern int iommu_pass_through;
> > : +
> >
> > Which is wrong and unnecessary - the variable was already declared in a
> > header file.
> >
> > scripts/checkpatch.pl would have warned you about this error, but
> > apparently neither you nor the people who reviewed or merged the patch
> > bother to use it.

Actually checkpatch.pl doesn't report anything wrong with this statement.

The following patch removes this statement on the top of linux-next.

Signed-off-by: Fenghua Yu <[email protected]>

---

pci-dma.c | 2 --
1 files changed, 2 deletions(-)

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 8cad0d8..049005e 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -160,8 +162,6 @@ again:
return page_address(page);
}

-extern int iommu_pass_through;
-
/*
* See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
* documentation.

2009-05-01 00:24:05

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] Intel IOMMU Pass Through Support

On Thu, 30 Apr 2009 16:37:52 -0700
Randy Dunlap <[email protected]> wrote:

> >
> > arch/x86/built-in.o: In function `iommu_setup':
> > arch/x86/kernel/pci-dma.c:215: undefined reference to `iommu_pass_through'
> > arch/x86/built-in.o: In function `pci_swiotlb_init':
> > arch/x86/kernel/pci-swiotlb.c:74: undefined reference to `iommu_pass_through'
> >
> > Because iommu_pass_through is defined in drivers/pci/intel-iommu.c and
> >
> > # CONFIG_DMAR is not set
> >
> > I'll need to cook up some local hack to work around that.
> >
>
> Patch just went to linux-next mailing list (but should have also
> gone to lkml):
>
> http://marc.info/?l=linux-next&m=124113213400748&w=2

OK, thanks. My linux-next feed seems to have broken.

That patch modifies

arch/ia64/kernel/dma-mapping.c
arch/x86/kernel/pci-dma.c

but it would be more logical to modify

arch/ia64/kernel/pci-dma.c
arch/x86/kernel/pci-dma.c

is there a reason why that cannot be done?

2009-05-01 00:32:43

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] Intel IOMMU Pass Through Support

On Thu, 30 Apr 2009 17:05:10 -0700
Fenghua Yu <[email protected]> wrote:

> > > : --- a/arch/x86/kernel/pci-dma.c
> > > : +++ b/arch/x86/kernel/pci-dma.c
> > > : @@ -160,6 +160,8 @@ again:
> > > : return page_address(page);
> > > : }
> > > :
> > > : +extern int iommu_pass_through;
> > > : +
> > >
> > > Which is wrong and unnecessary - the variable was already declared in a
> > > header file.
> > >
> > > scripts/checkpatch.pl would have warned you about this error, but
> > > apparently neither you nor the people who reviewed or merged the patch
> > > bother to use it.
>
> Actually checkpatch.pl doesn't report anything wrong with this statement.

Running 4ed0d3e6c64cfd9ba4ceb2099b10d1cf8ece4320 through checkpatch produces
the below output:

WARNING: externs should be avoided in .c files
#83: FILE: arch/x86/kernel/pci-dma.c:163:
+extern int iommu_pass_through;

WARNING: suspect code indent for conditional statements (8, 15)
#108: FILE: arch/x86/kernel/pci-swiotlb.c:74:
+ if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
[...]
swiotlb = 1;

2009-05-01 00:57:22

by Fenghua Yu

[permalink] [raw]
Subject: Re: [PATCH] Intel IOMMU Pass Through Support

On Thu, Apr 30, 2009 at 05:00:34PM -0700, Andrew Morton wrote:
> On Thu, 30 Apr 2009 16:37:52 -0700
> Randy Dunlap <[email protected]> wrote:
>
> > >
> > > arch/x86/built-in.o: In function `iommu_setup':
> > > arch/x86/kernel/pci-dma.c:215: undefined reference to `iommu_pass_through'
> > > arch/x86/built-in.o: In function `pci_swiotlb_init':
> > > arch/x86/kernel/pci-swiotlb.c:74: undefined reference to `iommu_pass_through'
> > >
> > > Because iommu_pass_through is defined in drivers/pci/intel-iommu.c and
> > >
> > > # CONFIG_DMAR is not set
> > >
> > > I'll need to cook up some local hack to work around that.
> > >
> >
> > Patch just went to linux-next mailing list (but should have also
> > gone to lkml):
> >
> > http://marc.info/?l=linux-next&m=124113213400748&w=2
>
> OK, thanks. My linux-next feed seems to have broken.
>
> That patch modifies
>
> arch/ia64/kernel/dma-mapping.c
> arch/x86/kernel/pci-dma.c
>
> but it would be more logical to modify
>
> arch/ia64/kernel/pci-dma.c
> arch/x86/kernel/pci-dma.c
>
> is there a reason why that cannot be done?

No particular reason. I just follow iommu_detected definition which is arch/ia64
kernel/dma-mapping.c.

Now I updated the compiling error fix patch as follows:

This updated patch should fix the compiling errors and remove the extern
iommu_pass_through from drivers/pci/intel-iommu.c file.

Signed-off-by: Fenghua Yu <[email protected]>

---

arch/ia64/kernel/pci-dma.c | 2 ++
arch/x86/kernel/pci-dma.c | 4 ++--
drivers/pci/intel-iommu.c | 1 -
3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c
index eb98738..ecdde25 100644
--- a/arch/ia64/kernel/pci-dma.c
+++ b/arch/ia64/kernel/pci-dma.c
@@ -32,6 +32,8 @@ int force_iommu __read_mostly = 1;
int force_iommu __read_mostly;
#endif

+int iommu_pass_through;
+
/* Dummy device used for NULL arguments (normally ISA). Better would
be probably a smaller DMA mask, but this is bug-to-bug compatible
to i386. */
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 8cad0d8..049005e 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -32,6 +32,8 @@ int no_iommu __read_mostly;
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;

+int iommu_pass_through;
+
dma_addr_t bad_dma_address __read_mostly = 0;
EXPORT_SYMBOL(bad_dma_address);

@@ -160,8 +162,6 @@ again:
return page_address(page);
}

-extern int iommu_pass_through;
-
/*
* See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
* documentation.
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index d3ac5e5..66a9cba 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -288,7 +288,6 @@ int dmar_disabled = 1;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
-int iommu_pass_through;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);