Following LPC discussions, we now report reserved regions through the
iommu-group sysfs reserved_regions attribute file.
Reserved regions are populated through the IOMMU get_resv_region callback
(formerly get_dm_regions), now implemented by amd-iommu, intel-iommu and
arm-smmu.
The intel-iommu driver reports the [FEE0_0000h - FEF0_0000h] MSI window as an
IOMMU_RESV_NOMAP reserved region.
arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and
1MB large) and the PCI host bridge windows.
The series integrates a patch from Robin that has not been formally posted:
"iommu/dma: Allow MSI-only cookies".
This series currently does not address IRQ safety assessment.
Best Regards
Eric
Git: complete series available at
https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3
History:
RFC v2 -> v3:
- switch to an iommu-group sysfs API
- use new dummy allocator provided by Robin
- dummy allocator initialized by vfio-iommu-type1 after enumerating
the reserved regions
- at the moment ARM MSI base address/size is left unchanged compared
to v2
- we currently report reserved regions and not usable IOVA regions as
requested by Alex
RFC v1 -> v2:
- fix intel_add_reserved_regions
- add mutex lock/unlock in vfio_iommu_type1
Eric Auger (10):
iommu/dma: Allow MSI-only cookies
iommu: Rename iommu_dm_regions into iommu_resv_regions
iommu: Add new reserved IOMMU attributes
iommu: iommu_alloc_resv_region
iommu: Do not map reserved regions
iommu: iommu_get_group_resv_regions
iommu: Implement reserved_regions iommu-group sysfs file
iommu/vt-d: Implement reserved region get/put callbacks
iommu/arm-smmu: Implement reserved region get/put callbacks
vfio/type1: Get MSI cookie
drivers/iommu/amd_iommu.c | 20 +++---
drivers/iommu/arm-smmu.c | 52 +++++++++++++++
drivers/iommu/dma-iommu.c | 116 ++++++++++++++++++++++++++-------
drivers/iommu/intel-iommu.c | 50 ++++++++++----
drivers/iommu/iommu.c | 141 ++++++++++++++++++++++++++++++++++++----
drivers/vfio/vfio_iommu_type1.c | 26 ++++++++
include/linux/dma-iommu.h | 7 ++
include/linux/iommu.h | 49 ++++++++++----
8 files changed, 391 insertions(+), 70 deletions(-)
--
1.9.1
IOMMU domain users such as VFIO face a similar problem to DMA API ops
with regard to mapping MSI messages in systems where the MSI write is
subject to IOMMU translation. With the relevant infrastructure now in
place for managed DMA domains, it's actually really simple for other
users to piggyback off that and reap the benefits without giving up
their own IOVA management, and without having to reinvent their own
wheel in the MSI layer.
Allow such users to opt into automatic MSI remapping by dedicating a
region of their IOVA space to a managed cookie, and extend the mapping
routine to implement a trivial linear allocator in such cases, to avoid
the needless overhead of a full-blown IOVA domain.
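For illustration only (not part of this patch), a caller that keeps its own
IOVA management might opt in roughly as follows; the base address is simply
whatever window the caller has chosen to set aside:

  #include <linux/dma-iommu.h>
  #include <linux/iommu.h>

  /* Illustrative values only; the caller picks its own MSI window */
  #define EXAMPLE_MSI_IOVA_BASE    0x8000000
  #define EXAMPLE_MSI_IOVA_LENGTH  0x100000

  static int example_enable_msi_remapping(struct iommu_domain *domain)
  {
          /* Only caller-managed (UNMANAGED) domains may use an MSI cookie */
          if (domain->type != IOMMU_DOMAIN_UNMANAGED)
                  return -EINVAL;

          /*
           * The caller must keep [base, base + length) free in its own
           * IOVA space; the cookie then hands out PAGE_SIZE slots from
           * that window for MSI doorbell mappings.
           */
          return iommu_get_msi_cookie(domain, EXAMPLE_MSI_IOVA_BASE);
  }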
Signed-off-by: Robin Murphy <[email protected]>
Signed-off-by: Eric Auger <[email protected]>
---
[Eric] fixed 2 issues on MSI path
---
drivers/iommu/dma-iommu.c | 116 +++++++++++++++++++++++++++++++++++++---------
include/linux/dma-iommu.h | 7 +++
2 files changed, 101 insertions(+), 22 deletions(-)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index c5ab866..793103f 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -37,10 +37,19 @@ struct iommu_dma_msi_page {
phys_addr_t phys;
};
+enum iommu_dma_cookie_type {
+ IOMMU_DMA_IOVA_COOKIE,
+ IOMMU_DMA_MSI_COOKIE,
+};
+
struct iommu_dma_cookie {
- struct iova_domain iovad;
- struct list_head msi_page_list;
- spinlock_t msi_lock;
+ union {
+ struct iova_domain iovad;
+ dma_addr_t msi_iova;
+ };
+ struct list_head msi_page_list;
+ spinlock_t msi_lock;
+ enum iommu_dma_cookie_type type;
};
static inline struct iova_domain *cookie_iovad(struct iommu_domain *domain)
@@ -53,6 +62,19 @@ int iommu_dma_init(void)
return iova_cache_get();
}
+static struct iommu_dma_cookie *__cookie_alloc(enum iommu_dma_cookie_type type)
+{
+ struct iommu_dma_cookie *cookie;
+
+ cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
+ if (cookie) {
+ spin_lock_init(&cookie->msi_lock);
+ INIT_LIST_HEAD(&cookie->msi_page_list);
+ cookie->type = type;
+ }
+ return cookie;
+}
+
/**
* iommu_get_dma_cookie - Acquire DMA-API resources for a domain
* @domain: IOMMU domain to prepare for DMA-API usage
@@ -62,25 +84,53 @@ int iommu_dma_init(void)
*/
int iommu_get_dma_cookie(struct iommu_domain *domain)
{
+ if (domain->iova_cookie)
+ return -EEXIST;
+
+ domain->iova_cookie = __cookie_alloc(IOMMU_DMA_IOVA_COOKIE);
+ if (!domain->iova_cookie)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(iommu_get_dma_cookie);
+
+/**
+ * iommu_get_msi_cookie - Acquire just MSI remapping resources
+ * @domain: IOMMU domain to prepare
+ * @base: Start address of IOVA region for MSI mappings
+ *
+ * Users who manage their own IOVA allocation and do not want DMA API support,
+ * but would still like to take advantage of automatic MSI remapping, can use
+ * this to initialise their own domain appropriately. Users should reserve a
+ * contiguous IOVA region, starting at @base, large enough to accommodate the
+ * number of PAGE_SIZE mappings necessary to cover every MSI doorbell address
+ * used by the devices attached to @domain.
+ */
+int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
+{
struct iommu_dma_cookie *cookie;
+ if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+ return -EINVAL;
+
if (domain->iova_cookie)
return -EEXIST;
- cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
+ cookie = __cookie_alloc(IOMMU_DMA_MSI_COOKIE);
if (!cookie)
return -ENOMEM;
- spin_lock_init(&cookie->msi_lock);
- INIT_LIST_HEAD(&cookie->msi_page_list);
+ cookie->msi_iova = base;
domain->iova_cookie = cookie;
return 0;
}
-EXPORT_SYMBOL(iommu_get_dma_cookie);
+EXPORT_SYMBOL(iommu_get_msi_cookie);
/**
* iommu_put_dma_cookie - Release a domain's DMA mapping resources
- * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
+ * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() or
+ * iommu_get_msi_cookie()
*
* IOMMU drivers should normally call this from their domain_free callback.
*/
@@ -92,7 +142,7 @@ void iommu_put_dma_cookie(struct iommu_domain *domain)
if (!cookie)
return;
- if (cookie->iovad.granule)
+ if (cookie->type == IOMMU_DMA_IOVA_COOKIE && cookie->iovad.granule)
put_iova_domain(&cookie->iovad);
list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) {
@@ -137,11 +187,12 @@ static void iova_reserve_pci_windows(struct pci_dev *dev,
int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
u64 size, struct device *dev)
{
- struct iova_domain *iovad = cookie_iovad(domain);
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
unsigned long order, base_pfn, end_pfn;
- if (!iovad)
- return -ENODEV;
+ if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE)
+ return -EINVAL;
/* Use the smallest supported page size for IOVA granularity */
order = __ffs(domain->pgsize_bitmap);
@@ -644,11 +695,21 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
{
struct iommu_dma_cookie *cookie = domain->iova_cookie;
struct iommu_dma_msi_page *msi_page;
- struct iova_domain *iovad = &cookie->iovad;
+ struct iova_domain *iovad;
struct iova *iova;
int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+ size_t size;
+
+ if (cookie->type == IOMMU_DMA_IOVA_COOKIE) {
+ iovad = &cookie->iovad;
+ size = iovad->granule;
+ } else {
+ iovad = NULL;
+ size = PAGE_SIZE;
+ }
+
+ msi_addr &= ~(phys_addr_t)(size - 1);
- msi_addr &= ~(phys_addr_t)iova_mask(iovad);
list_for_each_entry(msi_page, &cookie->msi_page_list, list)
if (msi_page->phys == msi_addr)
return msi_page;
@@ -657,13 +718,18 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
if (!msi_page)
return NULL;
- iova = __alloc_iova(domain, iovad->granule, dma_get_mask(dev));
- if (!iova)
- goto out_free_page;
-
msi_page->phys = msi_addr;
- msi_page->iova = iova_dma_addr(iovad, iova);
- if (iommu_map(domain, msi_page->iova, msi_addr, iovad->granule, prot))
+ if (iovad) {
+ iova = __alloc_iova(domain, size, dma_get_mask(dev));
+ if (!iova)
+ goto out_free_page;
+ msi_page->iova = iova_dma_addr(iovad, iova);
+ } else {
+ msi_page->iova = cookie->msi_iova;
+ cookie->msi_iova += size;
+ }
+
+ if (iommu_map(domain, msi_page->iova, msi_addr, size, prot))
goto out_free_iova;
INIT_LIST_HEAD(&msi_page->list);
@@ -671,7 +737,10 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
return msi_page;
out_free_iova:
- __free_iova(iovad, iova);
+ if (iovad)
+ __free_iova(iovad, iova);
+ else
+ cookie->msi_iova -= size;
out_free_page:
kfree(msi_page);
return NULL;
@@ -712,7 +781,10 @@ void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg)
msg->data = ~0U;
} else {
msg->address_hi = upper_32_bits(msi_page->iova);
- msg->address_lo &= iova_mask(&cookie->iovad);
+ if (cookie->type == IOMMU_DMA_IOVA_COOKIE)
+ msg->address_lo &= iova_mask(&cookie->iovad);
+ else
+ msg->address_lo &= (PAGE_SIZE - 1);
msg->address_lo += lower_32_bits(msi_page->iova);
}
}
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 32c5890..e27f2997 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -27,6 +27,7 @@
/* Domain management interface for IOMMU drivers */
int iommu_get_dma_cookie(struct iommu_domain *domain);
+int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
void iommu_put_dma_cookie(struct iommu_domain *domain);
/* Setup call for arch DMA mapping code */
@@ -82,6 +83,12 @@ static inline int iommu_get_dma_cookie(struct iommu_domain *domain)
return -ENODEV;
}
+static inline int iommu_get_msi_cookie(struct iommu_domain *domain,
+ dma_addr_t base)
+{
+ return -ENODEV;
+}
+
static inline void iommu_put_dma_cookie(struct iommu_domain *domain)
{
}
--
1.9.1
We want to extend the callbacks used for dm regions and
use them for reserved regions. Reserved regions can be:
- directly mapped regions
- regions that cannot be IOMMU mapped (PCI host bridge windows, ...)
- MSI regions (because they belong to another address space or because
  they are not translated by the IOMMU and need special handling)
So let's rename the struct and also the callbacks.
Signed-off-by: Eric Auger <[email protected]>
---
drivers/iommu/amd_iommu.c | 20 ++++++++++----------
drivers/iommu/iommu.c | 22 +++++++++++-----------
include/linux/iommu.h | 29 +++++++++++++++--------------
3 files changed, 36 insertions(+), 35 deletions(-)
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 754595e..a6c351d 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -3159,8 +3159,8 @@ static bool amd_iommu_capable(enum iommu_cap cap)
return false;
}
-static void amd_iommu_get_dm_regions(struct device *dev,
- struct list_head *head)
+static void amd_iommu_get_resv_regions(struct device *dev,
+ struct list_head *head)
{
struct unity_map_entry *entry;
int devid;
@@ -3170,7 +3170,7 @@ static void amd_iommu_get_dm_regions(struct device *dev,
return;
list_for_each_entry(entry, &amd_iommu_unity_map, list) {
- struct iommu_dm_region *region;
+ struct iommu_resv_region *region;
if (devid < entry->devid_start || devid > entry->devid_end)
continue;
@@ -3193,18 +3193,18 @@ static void amd_iommu_get_dm_regions(struct device *dev,
}
}
-static void amd_iommu_put_dm_regions(struct device *dev,
+static void amd_iommu_put_resv_regions(struct device *dev,
struct list_head *head)
{
- struct iommu_dm_region *entry, *next;
+ struct iommu_resv_region *entry, *next;
list_for_each_entry_safe(entry, next, head, list)
kfree(entry);
}
-static void amd_iommu_apply_dm_region(struct device *dev,
+static void amd_iommu_apply_resv_region(struct device *dev,
struct iommu_domain *domain,
- struct iommu_dm_region *region)
+ struct iommu_resv_region *region)
{
struct dma_ops_domain *dma_dom = to_dma_ops_domain(to_pdomain(domain));
unsigned long start, end;
@@ -3228,9 +3228,9 @@ static void amd_iommu_apply_dm_region(struct device *dev,
.add_device = amd_iommu_add_device,
.remove_device = amd_iommu_remove_device,
.device_group = amd_iommu_device_group,
- .get_dm_regions = amd_iommu_get_dm_regions,
- .put_dm_regions = amd_iommu_put_dm_regions,
- .apply_dm_region = amd_iommu_apply_dm_region,
+ .get_resv_regions = amd_iommu_get_resv_regions,
+ .put_resv_regions = amd_iommu_put_resv_regions,
+ .apply_resv_region = amd_iommu_apply_resv_region,
.pgsize_bitmap = AMD_IOMMU_PGSIZES,
};
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 9a2f196..c7ed334 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -318,7 +318,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
struct device *dev)
{
struct iommu_domain *domain = group->default_domain;
- struct iommu_dm_region *entry;
+ struct iommu_resv_region *entry;
struct list_head mappings;
unsigned long pg_size;
int ret = 0;
@@ -331,14 +331,14 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
pg_size = 1UL << __ffs(domain->pgsize_bitmap);
INIT_LIST_HEAD(&mappings);
- iommu_get_dm_regions(dev, &mappings);
+ iommu_get_resv_regions(dev, &mappings);
/* We need to consider overlapping regions for different devices */
list_for_each_entry(entry, &mappings, list) {
dma_addr_t start, end, addr;
- if (domain->ops->apply_dm_region)
- domain->ops->apply_dm_region(dev, domain, entry);
+ if (domain->ops->apply_resv_region)
+ domain->ops->apply_resv_region(dev, domain, entry);
start = ALIGN(entry->start, pg_size);
end = ALIGN(entry->start + entry->length, pg_size);
@@ -358,7 +358,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
}
out:
- iommu_put_dm_regions(dev, &mappings);
+ iommu_put_resv_regions(dev, &mappings);
return ret;
}
@@ -1546,20 +1546,20 @@ int iommu_domain_set_attr(struct iommu_domain *domain,
}
EXPORT_SYMBOL_GPL(iommu_domain_set_attr);
-void iommu_get_dm_regions(struct device *dev, struct list_head *list)
+void iommu_get_resv_regions(struct device *dev, struct list_head *list)
{
const struct iommu_ops *ops = dev->bus->iommu_ops;
- if (ops && ops->get_dm_regions)
- ops->get_dm_regions(dev, list);
+ if (ops && ops->get_resv_regions)
+ ops->get_resv_regions(dev, list);
}
-void iommu_put_dm_regions(struct device *dev, struct list_head *list)
+void iommu_put_resv_regions(struct device *dev, struct list_head *list)
{
const struct iommu_ops *ops = dev->bus->iommu_ops;
- if (ops && ops->put_dm_regions)
- ops->put_dm_regions(dev, list);
+ if (ops && ops->put_resv_regions)
+ ops->put_resv_regions(dev, list);
}
/* Request that a device is direct mapped by the IOMMU */
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 436dc21..7f6ebd0 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -118,13 +118,13 @@ enum iommu_attr {
};
/**
- * struct iommu_dm_region - descriptor for a direct mapped memory region
+ * struct iommu_resv_region - descriptor for a reserved memory region
* @list: Linked list pointers
* @start: System physical start address of the region
* @length: Length of the region in bytes
* @prot: IOMMU Protection flags (READ/WRITE/...)
*/
-struct iommu_dm_region {
+struct iommu_resv_region {
struct list_head list;
phys_addr_t start;
size_t length;
@@ -150,9 +150,9 @@ struct iommu_dm_region {
* @device_group: find iommu group for a particular device
* @domain_get_attr: Query domain attributes
* @domain_set_attr: Change domain attributes
- * @get_dm_regions: Request list of direct mapping requirements for a device
- * @put_dm_regions: Free list of direct mapping requirements for a device
- * @apply_dm_region: Temporary helper call-back for iova reserved ranges
+ * @get_resv_regions: Request list of reserved regions for a device
+ * @put_resv_regions: Free list of reserved regions for a device
+ * @apply_resv_region: Temporary helper call-back for iova reserved ranges
* @domain_window_enable: Configure and enable a particular window for a domain
* @domain_window_disable: Disable a particular window for a domain
* @domain_set_windows: Set the number of windows for a domain
@@ -184,11 +184,12 @@ struct iommu_ops {
int (*domain_set_attr)(struct iommu_domain *domain,
enum iommu_attr attr, void *data);
- /* Request/Free a list of direct mapping requirements for a device */
- void (*get_dm_regions)(struct device *dev, struct list_head *list);
- void (*put_dm_regions)(struct device *dev, struct list_head *list);
- void (*apply_dm_region)(struct device *dev, struct iommu_domain *domain,
- struct iommu_dm_region *region);
+ /* Request/Free a list of reserved regions for a device */
+ void (*get_resv_regions)(struct device *dev, struct list_head *list);
+ void (*put_resv_regions)(struct device *dev, struct list_head *list);
+ void (*apply_resv_region)(struct device *dev,
+ struct iommu_domain *domain,
+ struct iommu_resv_region *region);
/* Window handling functions */
int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr,
@@ -233,8 +234,8 @@ extern size_t default_iommu_map_sg(struct iommu_domain *domain, unsigned long io
extern void iommu_set_fault_handler(struct iommu_domain *domain,
iommu_fault_handler_t handler, void *token);
-extern void iommu_get_dm_regions(struct device *dev, struct list_head *list);
-extern void iommu_put_dm_regions(struct device *dev, struct list_head *list);
+extern void iommu_get_resv_regions(struct device *dev, struct list_head *list);
+extern void iommu_put_resv_regions(struct device *dev, struct list_head *list);
extern int iommu_request_dm_for_dev(struct device *dev);
extern int iommu_attach_group(struct iommu_domain *domain,
@@ -439,12 +440,12 @@ static inline void iommu_set_fault_handler(struct iommu_domain *domain,
{
}
-static inline void iommu_get_dm_regions(struct device *dev,
+static inline void iommu_get_resv_regions(struct device *dev,
struct list_head *list)
{
}
-static inline void iommu_put_dm_regions(struct device *dev,
+static inline void iommu_put_resv_regions(struct device *dev,
struct list_head *list)
{
}
--
1.9.1
Introduce a new helper to allocate a reserved region. It will be used
by IOMMU drivers implementing the reserved region callbacks.
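As an illustration only, a driver's get_resv_regions() callback could then
look roughly like this (the address, size and IOMMU_RESV_NOMAP flag below
are example values, not taken from any real platform):

  #include <linux/iommu.h>
  #include <linux/list.h>

  static void example_get_resv_regions(struct device *dev,
                                       struct list_head *head)
  {
          struct iommu_resv_region *region;

          /* hypothetical 1MB window that must not be IOMMU mapped */
          region = iommu_alloc_resv_region(0x40000000, 0x100000,
                                           IOMMU_RESV_NOMAP);
          if (!region)
                  return;
          list_add_tail(&region->list, head);
  }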
Signed-off-by: Eric Auger <[email protected]>
---
drivers/iommu/iommu.c | 16 ++++++++++++++++
include/linux/iommu.h | 8 ++++++++
2 files changed, 24 insertions(+)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index c7ed334..6ee529f 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1562,6 +1562,22 @@ void iommu_put_resv_regions(struct device *dev, struct list_head *list)
ops->put_resv_regions(dev, list);
}
+struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start,
+ size_t length,
+ unsigned int prot)
+{
+ struct iommu_resv_region *region;
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL);
+ if (!region)
+ return NULL;
+
+ region->start = start;
+ region->length = length;
+ region->prot = prot;
+ return region;
+}
+
/* Request that a device is direct mapped by the IOMMU */
int iommu_request_dm_for_dev(struct device *dev)
{
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 02cf565..0aea877 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -241,6 +241,8 @@ extern void iommu_set_fault_handler(struct iommu_domain *domain,
extern void iommu_get_resv_regions(struct device *dev, struct list_head *list);
extern void iommu_put_resv_regions(struct device *dev, struct list_head *list);
extern int iommu_request_dm_for_dev(struct device *dev);
+extern struct iommu_resv_region *
+iommu_alloc_resv_region(phys_addr_t start, size_t length, unsigned int prot);
extern int iommu_attach_group(struct iommu_domain *domain,
struct iommu_group *group);
@@ -454,6 +456,12 @@ static inline void iommu_put_resv_regions(struct device *dev,
{
}
+static inline struct iommu_resv_region *
+iommu_alloc_resv_region(phys_addr_t start, size_t length, unsigned int prot)
+{
+ return NULL;
+}
+
static inline int iommu_request_dm_for_dev(struct device *dev)
{
return -ENODEV;
--
1.9.1
A new iommu-group sysfs attribute file is introduced. It contains
the list of reserved regions for the iommu-group. Each reserved
region is described on a separate line:
- the first field is the start IOVA address,
- the second field is the end IOVA address.
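For instance, reading the file on a hypothetical group could produce
(example values only):

  $ cat /sys/kernel/iommu_groups/0/reserved_regions
  0x0000000008000000 0x00000000080fffff
  0x0000000040000000 0x000000004fffffff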
Signed-off-by: Eric Auger <[email protected]>
---
The file layout is inspired by /sys/bus/pci/devices/BDF/resource.
I have also read Documentation/filesystems/sysfs.txt, so I expect this
to be frowned upon.
---
drivers/iommu/iommu.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 47 insertions(+)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index e0fbcc5..ce9a465 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -32,6 +32,7 @@
#include <linux/pci.h>
#include <linux/bitops.h>
#include <linux/property.h>
+#include <linux/list_sort.h>
#include <trace/events/iommu.h>
static struct kset *iommu_group_kset;
@@ -186,8 +187,47 @@ int iommu_get_group_resv_regions(struct iommu_group *group,
}
EXPORT_SYMBOL_GPL(iommu_get_group_resv_regions);
+static int iommu_resv_region_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
+{
+ struct iommu_resv_region *rega, *regb;
+
+ rega = container_of(a, struct iommu_resv_region, list);
+ regb = container_of(b, struct iommu_resv_region, list);
+
+ if (rega->start + rega->length - 1 < regb->start)
+ return -1;
+ if (regb->start + regb->length - 1 < rega->start)
+ return 1;
+ else
+ return 0;
+}
+
+static ssize_t iommu_group_show_resv_regions(struct iommu_group *group,
+ char *buf)
+{
+ struct iommu_resv_region *region, *next;
+ struct list_head group_resv_regions;
+ char *str = buf;
+
+ INIT_LIST_HEAD(&group_resv_regions);
+ iommu_get_group_resv_regions(group, &group_resv_regions);
+ list_sort(NULL, &group_resv_regions, iommu_resv_region_cmp);
+
+ list_for_each_entry_safe(region, next, &group_resv_regions, list) {
+ str += sprintf(str, "0x%016llx 0x%016llx\n", region->start,
+ region->start + region->length - 1);
+ kfree(region);
+ }
+
+ return (str - buf);
+}
+
static IOMMU_GROUP_ATTR(name, S_IRUGO, iommu_group_show_name, NULL);
+static IOMMU_GROUP_ATTR(reserved_regions, S_IRUGO,
+ iommu_group_show_resv_regions, NULL);
+
static void iommu_group_release(struct kobject *kobj)
{
struct iommu_group *group = to_iommu_group(kobj);
@@ -202,6 +242,8 @@ static void iommu_group_release(struct kobject *kobj)
if (group->default_domain)
iommu_domain_free(group->default_domain);
+ iommu_group_remove_file(group, &iommu_group_attr_reserved_regions);
+
kfree(group->name);
kfree(group);
}
@@ -265,6 +307,11 @@ struct iommu_group *iommu_group_alloc(void)
*/
kobject_put(&group->kobj);
+ ret = iommu_group_create_file(group,
+ &iommu_group_attr_reserved_regions);
+ if (ret)
+ return ERR_PTR(ret);
+
pr_debug("Allocated group %d\n", group->id);
return group;
--
1.9.1
Introduce iommu_get_group_resv_regions, whose role is to enumerate all
the devices of a group and collect their reserved regions, filtering
out duplicates.
Signed-off-by: Eric Auger <[email protected]>
---
- we do not move the list elements from the device list to the group
  list, since iommu_put_resv_regions() could then no longer be called
- at the moment no iommu_put_group_resv_regions() is introduced, since
  it would simply consist in freeing the list entries
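For illustration, a caller is expected to do something along these lines
(sketch only):

  #include <linux/iommu.h>
  #include <linux/list.h>
  #include <linux/slab.h>

  static void example_walk_group_resv_regions(struct iommu_group *group)
  {
          struct iommu_resv_region *region, *next;
          struct list_head resv_regions;

          INIT_LIST_HEAD(&resv_regions);
          iommu_get_group_resv_regions(group, &resv_regions);

          /* ... consume the per-group copies ... */

          /* no put helper yet: the caller frees the copies itself */
          list_for_each_entry_safe(region, next, &resv_regions, list)
                  kfree(region);
  }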
---
drivers/iommu/iommu.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/iommu.h | 8 ++++++++
2 files changed, 61 insertions(+)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index a4530ad..e0fbcc5 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -133,6 +133,59 @@ static ssize_t iommu_group_show_name(struct iommu_group *group, char *buf)
return sprintf(buf, "%s\n", group->name);
}
+static bool iommu_resv_region_present(struct iommu_resv_region *region,
+ struct list_head *head)
+{
+ struct iommu_resv_region *entry;
+
+ list_for_each_entry(entry, head, list) {
+ if ((region->start == entry->start) &&
+ (region->length == entry->length) &&
+ (region->prot == entry->prot))
+ return true;
+ }
+ return false;
+}
+
+static int
+iommu_insert_device_resv_regions(struct list_head *dev_resv_regions,
+ struct list_head *group_resv_regions)
+{
+ struct iommu_resv_region *entry, *region;
+
+ list_for_each_entry(entry, dev_resv_regions, list) {
+ if (iommu_resv_region_present(entry, group_resv_regions))
+ continue;
+ region = iommu_alloc_resv_region(entry->start, entry->length,
+ entry->prot);
+ if (!region)
+ return -ENOMEM;
+
+ list_add_tail(®ion->list, group_resv_regions);
+ }
+ return 0;
+}
+
+int iommu_get_group_resv_regions(struct iommu_group *group,
+ struct list_head *head)
+{
+ struct iommu_device *device;
+ int ret = 0;
+
+ list_for_each_entry(device, &group->devices, list) {
+ struct list_head dev_resv_regions;
+
+ INIT_LIST_HEAD(&dev_resv_regions);
+ iommu_get_resv_regions(device->dev, &dev_resv_regions);
+ ret = iommu_insert_device_resv_regions(&dev_resv_regions, head);
+ iommu_put_resv_regions(device->dev, &dev_resv_regions);
+ if (ret)
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_get_group_resv_regions);
+
static IOMMU_GROUP_ATTR(name, S_IRUGO, iommu_group_show_name, NULL);
static void iommu_group_release(struct kobject *kobj)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 0aea877..0f7ae2c 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -243,6 +243,8 @@ extern void iommu_set_fault_handler(struct iommu_domain *domain,
extern int iommu_request_dm_for_dev(struct device *dev);
extern struct iommu_resv_region *
iommu_alloc_resv_region(phys_addr_t start, size_t length, unsigned int prot);
+extern int iommu_get_group_resv_regions(struct iommu_group *group,
+ struct list_head *head);
extern int iommu_attach_group(struct iommu_domain *domain,
struct iommu_group *group);
@@ -462,6 +464,12 @@ static inline void iommu_put_resv_regions(struct device *dev,
return NULL;
}
+static inline int iommu_get_group_resv_regions(struct iommu_group *group,
+ struct list_head *head)
+{
+ return -ENODEV;
+}
+
static inline int iommu_request_dm_for_dev(struct device *dev)
{
return -ENODEV;
--
1.9.1
When attaching a group to the container, handle the group's
reserved regions, and in particular the IOMMU_RESV_MSI region,
which requires an IOVA allocator to be initialized through the
iommu_get_msi_cookie() API. This allows MSI IOVAs to be
transparently allocated when the MSI controller's compose() runs.
Signed-off-by: Eric Auger <[email protected]>
---
drivers/vfio/vfio_iommu_type1.c | 26 ++++++++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 2ba1942..701d8a8 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -36,6 +36,7 @@
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>
+#include <linux/dma-iommu.h>
#define DRIVER_VERSION "0.2"
#define DRIVER_AUTHOR "Alex Williamson <[email protected]>"
@@ -734,6 +735,27 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
__free_pages(pages, order);
}
+static int vfio_iommu_handle_resv_regions(struct iommu_domain *domain,
+ struct iommu_group *group)
+{
+ struct list_head group_resv_regions;
+ struct iommu_resv_region *region, *next;
+ int ret = 0;
+
+ INIT_LIST_HEAD(&group_resv_regions);
+ iommu_get_group_resv_regions(group, &group_resv_regions);
+ list_for_each_entry(region, &group_resv_regions, list) {
+ if (region->prot & IOMMU_RESV_MSI) {
+ ret = iommu_get_msi_cookie(domain, region->start);
+ if (ret)
+ break;
+ }
+ }
+ list_for_each_entry_safe(region, next, &group_resv_regions, list)
+ kfree(region);
+ return ret;
+}
+
static int vfio_iommu_type1_attach_group(void *iommu_data,
struct iommu_group *iommu_group)
{
@@ -834,6 +856,10 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
if (ret)
goto out_detach;
+ ret = vfio_iommu_handle_resv_regions(domain->domain, iommu_group);
+ if (ret)
+ goto out_detach;
+
list_add(&domain->next, &iommu->domain_list);
mutex_unlock(&iommu->lock);
--
1.9.1
This patch registers the [FEE0_0000h - FEF0_0000h] 1MB MSI range
as a reserved region. This allows that range to be reported in the
iommu-group sysfs.
Signed-off-by: Eric Auger <[email protected]>
---
RFCv2 -> RFCv3:
- use get/put_resv_region callbacks.
RFC v1 -> RFC v2:
- fix intel_iommu_add_reserved_regions name
- use IOAPIC_RANGE_START and IOAPIC_RANGE_END defines
- return if the MSI region is already registered;
---
drivers/iommu/intel-iommu.c | 50 +++++++++++++++++++++++++++++++++------------
1 file changed, 37 insertions(+), 13 deletions(-)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 3965e73..8dada8f 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5183,6 +5183,28 @@ static void intel_iommu_remove_device(struct device *dev)
iommu_device_unlink(iommu->iommu_dev, dev);
}
+static void intel_iommu_get_resv_regions(struct device *device,
+ struct list_head *head)
+{
+ struct iommu_resv_region *reg;
+
+ reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
+ IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
+ IOMMU_RESV_NOMAP);
+ if (!reg)
+ return;
+ list_add_tail(®->list, head);
+}
+
+static void intel_iommu_put_resv_regions(struct device *dev,
+ struct list_head *head)
+{
+ struct iommu_resv_region *entry, *next;
+
+ list_for_each_entry_safe(entry, next, head, list)
+ kfree(entry);
+}
+
#ifdef CONFIG_INTEL_IOMMU_SVM
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
@@ -5292,19 +5314,21 @@ struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
#endif /* CONFIG_INTEL_IOMMU_SVM */
static const struct iommu_ops intel_iommu_ops = {
- .capable = intel_iommu_capable,
- .domain_alloc = intel_iommu_domain_alloc,
- .domain_free = intel_iommu_domain_free,
- .attach_dev = intel_iommu_attach_device,
- .detach_dev = intel_iommu_detach_device,
- .map = intel_iommu_map,
- .unmap = intel_iommu_unmap,
- .map_sg = default_iommu_map_sg,
- .iova_to_phys = intel_iommu_iova_to_phys,
- .add_device = intel_iommu_add_device,
- .remove_device = intel_iommu_remove_device,
- .device_group = pci_device_group,
- .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
+ .capable = intel_iommu_capable,
+ .domain_alloc = intel_iommu_domain_alloc,
+ .domain_free = intel_iommu_domain_free,
+ .attach_dev = intel_iommu_attach_device,
+ .detach_dev = intel_iommu_detach_device,
+ .map = intel_iommu_map,
+ .unmap = intel_iommu_unmap,
+ .map_sg = default_iommu_map_sg,
+ .iova_to_phys = intel_iommu_iova_to_phys,
+ .add_device = intel_iommu_add_device,
+ .remove_device = intel_iommu_remove_device,
+ .get_resv_regions = intel_iommu_get_resv_regions,
+ .put_resv_regions = intel_iommu_put_resv_regions,
+ .device_group = pci_device_group,
+ .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
};
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
--
1.9.1
The get() callback populates the list with the PCI host bridge windows
and the MSI IOVA range.
At the moment an arbitrary MSI IOVA window is set at 0x8000000,
1MB in size. This allows that information to be reported in the
iommu-group sysfs.
Signed-off-by: Eric Auger <[email protected]>
---
RFC v2 -> v3:
- use existing get/put_resv_regions
RFC v1 -> v2:
- use defines for MSI IOVA base and length
---
drivers/iommu/arm-smmu.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 52 insertions(+)
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 8f72814..81f1a83 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -278,6 +278,9 @@ enum arm_smmu_s2cr_privcfg {
#define FSYNR0_WNR (1 << 4)
+#define MSI_IOVA_BASE 0x8000000
+#define MSI_IOVA_LENGTH 0x100000
+
static int force_stage;
module_param(force_stage, int, S_IRUGO);
MODULE_PARM_DESC(force_stage,
@@ -1545,6 +1548,53 @@ static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
return iommu_fwspec_add_ids(dev, &fwid, 1);
}
+static void arm_smmu_get_resv_regions(struct device *dev,
+ struct list_head *head)
+{
+ struct iommu_resv_region *region;
+ struct pci_host_bridge *bridge;
+ struct resource_entry *window;
+
+ /* MSI region */
+ region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
+ IOMMU_RESV_MSI);
+ if (!region)
+ return;
+
+ list_add_tail(®ion->list, head);
+
+ if (!dev_is_pci(dev))
+ return;
+
+ bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
+
+ resource_list_for_each_entry(window, &bridge->windows) {
+ phys_addr_t start;
+ size_t length;
+
+ if (resource_type(window->res) != IORESOURCE_MEM &&
+ resource_type(window->res) != IORESOURCE_IO)
+ continue;
+
+ start = window->res->start - window->offset;
+ length = window->res->end - window->res->start + 1;
+ region = iommu_alloc_resv_region(start, length,
+ IOMMU_RESV_NOMAP);
+ if (!region)
+ return;
+ list_add_tail(®ion->list, head);
+ }
+}
+
+static void arm_smmu_put_resv_regions(struct device *dev,
+ struct list_head *head)
+{
+ struct iommu_resv_region *entry, *next;
+
+ list_for_each_entry_safe(entry, next, head, list)
+ kfree(entry);
+}
+
static struct iommu_ops arm_smmu_ops = {
.capable = arm_smmu_capable,
.domain_alloc = arm_smmu_domain_alloc,
@@ -1560,6 +1610,8 @@ static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
.domain_get_attr = arm_smmu_domain_get_attr,
.domain_set_attr = arm_smmu_domain_set_attr,
.of_xlate = arm_smmu_of_xlate,
+ .get_resv_regions = arm_smmu_get_resv_regions,
+ .put_resv_regions = arm_smmu_put_resv_regions,
.pgsize_bitmap = -1UL, /* Restricted during device attach */
};
--
1.9.1
As we have introduced IOMMU_RESV_NOMAP and IOMMU_RESV_MSI regions,
let's prevent those new region types from being mapped.
Signed-off-by: Eric Auger <[email protected]>
---
drivers/iommu/iommu.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 6ee529f..a4530ad 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -343,6 +343,9 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
start = ALIGN(entry->start, pg_size);
end = ALIGN(entry->start + entry->length, pg_size);
+ if (entry->prot & IOMMU_RESV_MASK)
+ continue;
+
for (addr = start; addr < end; addr += pg_size) {
phys_addr_t phys_addr;
--
1.9.1
IOMMU_RESV_NOMAP is used to tag reserved IOVAs that are not
supposed to be IOMMU mapped. IOMMU_RESV_MSI tags IOVAs
corresponding to MSIs that need to be IOMMU mapped.
IOMMU_RESV_MASK allows checking whether an IOVA is reserved.
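As a usage sketch only (flag values as defined in this patch), a fragment
checking a region's prot field could read:

  /* IOMMU_RESV_MASK == (IOMMU_RESV_NOMAP | IOMMU_RESV_MSI) */
  bool reserved = region->prot & IOMMU_RESV_MASK;   /* do not direct map */
  bool msi_region = region->prot & IOMMU_RESV_MSI;  /* map via an MSI cookie */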
Signed-off-by: Eric Auger <[email protected]>
---
include/linux/iommu.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 7f6ebd0..02cf565 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -32,6 +32,10 @@
#define IOMMU_NOEXEC (1 << 3)
#define IOMMU_MMIO (1 << 4) /* e.g. things like MSI doorbells */
+#define IOMMU_RESV_MASK 0x300 /* Reserved IOVA mask */
+#define IOMMU_RESV_NOMAP (1 << 8) /* IOVA that cannot be mapped */
+#define IOMMU_RESV_MSI (1 << 9) /* MSI region transparently mapped */
+
struct iommu_ops;
struct iommu_group;
struct bus_type;
--
1.9.1
Hi Bharat,
On 18/11/2016 06:34, Bharat Bhushan wrote:
> Hi Eric,
>
> Have you sent out QEMU side patches based on this new approach? In case I missed please point me the patches?
Upstream QEMU works fine for PCIe/MSI passthrough on ARM since the
mach-virt address space does not collide with the fixed MSI region.
Thanks
Eric
>
> Thanks
> -Bharat
>
>> -----Original Message-----
>> From: [email protected] [mailto:iommu-
>> [email protected]] On Behalf Of Eric Auger
>> Sent: Tuesday, November 15, 2016 6:39 PM
>> To: [email protected]; [email protected];
>> [email protected]; [email protected];
>> [email protected]; [email protected];
>> [email protected]; [email protected]; [email protected];
>> [email protected]; [email protected]
>> Cc: [email protected]; [email protected]; [email protected];
>> [email protected]; [email protected];
>> [email protected]
>> Subject: [RFC v3 00/10] KVM PCIe/MSI passthrough on ARM/ARM64 and
>> IOVA reserved regions
>>
>> Following LPC discussions, we now report reserved regions through iommu-
>> group sysfs reserved_regions attribute file.
>>
>> Reserved regions are populated through the IOMMU get_resv_region
>> callback (former get_dm_regions), now implemented by amd-iommu, intel-
>> iommu and arm-smmu.
>>
>> The intel-iommu reports the [FEE0_0000h - FEF0_000h] MSI window as an
>> IOMMU_RESV_NOMAP reserved region.
>>
>> arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and 1MB
>> large) and the PCI host bridge windows.
>>
>> The series integrates a not officially posted patch from Robin:
>> "iommu/dma: Allow MSI-only cookies".
>>
>> This series currently does not address IRQ safety assessment.
>>
>> Best Regards
>>
>> Eric
>>
>> Git: complete series available at
>> https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3
>>
>> History:
>> RFC v2 -> v3:
>> - switch to an iommu-group sysfs API
>> - use new dummy allocator provided by Robin
>> - dummy allocator initialized by vfio-iommu-type1 after enumerating
>> the reserved regions
>> - at the moment ARM MSI base address/size is left unchanged compared
>> to v2
>> - we currently report reserved regions and not usable IOVA regions as
>> requested by Alex
>>
>> RFC v1 -> v2:
>> - fix intel_add_reserved_regions
>> - add mutex lock/unlock in vfio_iommu_type1
>>
>>
>> Eric Auger (10):
>> iommu/dma: Allow MSI-only cookies
>> iommu: Rename iommu_dm_regions into iommu_resv_regions
>> iommu: Add new reserved IOMMU attributes
>> iommu: iommu_alloc_resv_region
>> iommu: Do not map reserved regions
>> iommu: iommu_get_group_resv_regions
>> iommu: Implement reserved_regions iommu-group sysfs file
>> iommu/vt-d: Implement reserved region get/put callbacks
>> iommu/arm-smmu: Implement reserved region get/put callbacks
>> vfio/type1: Get MSI cookie
>>
>> drivers/iommu/amd_iommu.c | 20 +++---
>> drivers/iommu/arm-smmu.c | 52 +++++++++++++++
>> drivers/iommu/dma-iommu.c | 116 ++++++++++++++++++++++++++----
>> ---
>> drivers/iommu/intel-iommu.c | 50 ++++++++++----
>> drivers/iommu/iommu.c | 141
>> ++++++++++++++++++++++++++++++++++++----
>> drivers/vfio/vfio_iommu_type1.c | 26 ++++++++
>> include/linux/dma-iommu.h | 7 ++
>> include/linux/iommu.h | 49 ++++++++++----
>> 8 files changed, 391 insertions(+), 70 deletions(-)
>>
>> --
>> 1.9.1
>>
>> _______________________________________________
>> iommu mailing list
>> [email protected]
>> https://lists.linuxfoundation.org/mailman/listinfo/iommu
>
> _______________________________________________
> linux-arm-kernel mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>
Hi Eric,
Have you sent out the QEMU-side patches based on this new approach? In case I missed them, please point me to the patches.
Thanks
-Bharat
> -----Original Message-----
> From: [email protected] [mailto:iommu-
> [email protected]] On Behalf Of Eric Auger
> Sent: Tuesday, November 15, 2016 6:39 PM
> To: [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]
> Subject: [RFC v3 00/10] KVM PCIe/MSI passthrough on ARM/ARM64 and
> IOVA reserved regions
>
> Following LPC discussions, we now report reserved regions through iommu-
> group sysfs reserved_regions attribute file.
>
> Reserved regions are populated through the IOMMU get_resv_region
> callback (former get_dm_regions), now implemented by amd-iommu, intel-
> iommu and arm-smmu.
>
> The intel-iommu reports the [FEE0_0000h - FEF0_000h] MSI window as an
> IOMMU_RESV_NOMAP reserved region.
>
> arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and 1MB
> large) and the PCI host bridge windows.
>
> The series integrates a not officially posted patch from Robin:
> "iommu/dma: Allow MSI-only cookies".
>
> This series currently does not address IRQ safety assessment.
>
> Best Regards
>
> Eric
>
> Git: complete series available at
> https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3
>
> History:
> RFC v2 -> v3:
> - switch to an iommu-group sysfs API
> - use new dummy allocator provided by Robin
> - dummy allocator initialized by vfio-iommu-type1 after enumerating
> the reserved regions
> - at the moment ARM MSI base address/size is left unchanged compared
> to v2
> - we currently report reserved regions and not usable IOVA regions as
> requested by Alex
>
> RFC v1 -> v2:
> - fix intel_add_reserved_regions
> - add mutex lock/unlock in vfio_iommu_type1
>
>
> Eric Auger (10):
> iommu/dma: Allow MSI-only cookies
> iommu: Rename iommu_dm_regions into iommu_resv_regions
> iommu: Add new reserved IOMMU attributes
> iommu: iommu_alloc_resv_region
> iommu: Do not map reserved regions
> iommu: iommu_get_group_resv_regions
> iommu: Implement reserved_regions iommu-group sysfs file
> iommu/vt-d: Implement reserved region get/put callbacks
> iommu/arm-smmu: Implement reserved region get/put callbacks
> vfio/type1: Get MSI cookie
>
> drivers/iommu/amd_iommu.c | 20 +++---
> drivers/iommu/arm-smmu.c | 52 +++++++++++++++
> drivers/iommu/dma-iommu.c | 116 ++++++++++++++++++++++++++----
> ---
> drivers/iommu/intel-iommu.c | 50 ++++++++++----
> drivers/iommu/iommu.c | 141
> ++++++++++++++++++++++++++++++++++++----
> drivers/vfio/vfio_iommu_type1.c | 26 ++++++++
> include/linux/dma-iommu.h | 7 ++
> include/linux/iommu.h | 49 ++++++++++----
> 8 files changed, 391 insertions(+), 70 deletions(-)
>
> --
> 1.9.1
>
> _______________________________________________
> iommu mailing list
> [email protected]
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
On Tue, Nov 15, 2016 at 01:09:17PM +0000, Eric Auger wrote:
> +static inline struct iommu_resv_region *
> +iommu_alloc_resv_region(phys_addr_t start, size_t length, unsigned int prot)
> +{
> + return NULL;
> +}
> +
Will this function be called outside of iommu code?
Joerg
Hi Joerg,
On 29/11/2016 17:11, Joerg Roedel wrote:
> On Tue, Nov 15, 2016 at 01:09:17PM +0000, Eric Auger wrote:
>> +static inline struct iommu_resv_region *
>> +iommu_alloc_resv_region(phys_addr_t start, size_t length, unsigned int prot)
>> +{
>> + return NULL;
>> +}
>> +
>
> Will this function be called outside of iommu code?
No, the function is not meant to be called outside of the IOMMU code. I
will remove it.
Thanks
Eric
>
>
>
> Joerg
>
>
> _______________________________________________
> linux-arm-kernel mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>
Hi,
On 15/11/2016 14:09, Eric Auger wrote:
> Following LPC discussions, we now report reserved regions through
> iommu-group sysfs reserved_regions attribute file.
>
> Reserved regions are populated through the IOMMU get_resv_region callback
> (former get_dm_regions), now implemented by amd-iommu, intel-iommu and
> arm-smmu.
>
> The intel-iommu reports the [FEE0_0000h - FEF0_000h] MSI window as an
> IOMMU_RESV_NOMAP reserved region.
>
> arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and
> 1MB large) and the PCI host bridge windows.
>
> The series integrates a not officially posted patch from Robin:
> "iommu/dma: Allow MSI-only cookies".
>
> This series currently does not address IRQ safety assessment.
I will respin this series taking Joerg's comment into account. Does
anyone have additional comments or wish to raise conceptual issues with
the current direction and implementation?
As for the IRQ safety assessment, as a first step I would propose to
remove IOMMU_CAP_INTR_REMAP from the arm-smmu drivers and consider the
assignment unsafe. Any objection?
Thanks
Eric
> Best Regards
>
> Eric
>
> Git: complete series available at
> https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3
>
> History:
> RFC v2 -> v3:
> - switch to an iommu-group sysfs API
> - use new dummy allocator provided by Robin
> - dummy allocator initialized by vfio-iommu-type1 after enumerating
> the reserved regions
> - at the moment ARM MSI base address/size is left unchanged compared
> to v2
> - we currently report reserved regions and not usable IOVA regions as
> requested by Alex
>
> RFC v1 -> v2:
> - fix intel_add_reserved_regions
> - add mutex lock/unlock in vfio_iommu_type1
>
>
> Eric Auger (10):
> iommu/dma: Allow MSI-only cookies
> iommu: Rename iommu_dm_regions into iommu_resv_regions
> iommu: Add new reserved IOMMU attributes
> iommu: iommu_alloc_resv_region
> iommu: Do not map reserved regions
> iommu: iommu_get_group_resv_regions
> iommu: Implement reserved_regions iommu-group sysfs file
> iommu/vt-d: Implement reserved region get/put callbacks
> iommu/arm-smmu: Implement reserved region get/put callbacks
> vfio/type1: Get MSI cookie
>
> drivers/iommu/amd_iommu.c | 20 +++---
> drivers/iommu/arm-smmu.c | 52 +++++++++++++++
> drivers/iommu/dma-iommu.c | 116 ++++++++++++++++++++++++++-------
> drivers/iommu/intel-iommu.c | 50 ++++++++++----
> drivers/iommu/iommu.c | 141 ++++++++++++++++++++++++++++++++++++----
> drivers/vfio/vfio_iommu_type1.c | 26 ++++++++
> include/linux/dma-iommu.h | 7 ++
> include/linux/iommu.h | 49 ++++++++++----
> 8 files changed, 391 insertions(+), 70 deletions(-)
>
Hi Eric,
in your repo "https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3"
there is an 11th patch, "pci: Enable overrides for missing ACS capabilities".
Is this patch part of some other series?
thanks
Ganapat
On Wed, Nov 30, 2016 at 3:19 PM, Auger Eric <[email protected]> wrote:
> Hi,
>
> On 15/11/2016 14:09, Eric Auger wrote:
>> Following LPC discussions, we now report reserved regions through
>> iommu-group sysfs reserved_regions attribute file.
>>
>> Reserved regions are populated through the IOMMU get_resv_region callback
>> (former get_dm_regions), now implemented by amd-iommu, intel-iommu and
>> arm-smmu.
>>
>> The intel-iommu reports the [FEE0_0000h - FEF0_000h] MSI window as an
>> IOMMU_RESV_NOMAP reserved region.
>>
>> arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and
>> 1MB large) and the PCI host bridge windows.
>>
>> The series integrates a not officially posted patch from Robin:
>> "iommu/dma: Allow MSI-only cookies".
>>
>> This series currently does not address IRQ safety assessment.
>
> I will respin this series taking into account Joerg's comment. Does
> anyone have additional comments or want to put forward some conceptual
> issues with the current direction and with this implementation?
>
> As for the IRQ safety assessment, in a first step I would propose to
> remove the IOMMU_CAP_INTR_REMAP from arm-smmus and consider the
> assignment as unsafe. Any objection?
>
> Thanks
>
> Eric
>
>
>> Best Regards
>>
>> Eric
>>
>> Git: complete series available at
>> https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3
>>
>> History:
>> RFC v2 -> v3:
>> - switch to an iommu-group sysfs API
>> - use new dummy allocator provided by Robin
>> - dummy allocator initialized by vfio-iommu-type1 after enumerating
>> the reserved regions
>> - at the moment ARM MSI base address/size is left unchanged compared
>> to v2
>> - we currently report reserved regions and not usable IOVA regions as
>> requested by Alex
>>
>> RFC v1 -> v2:
>> - fix intel_add_reserved_regions
>> - add mutex lock/unlock in vfio_iommu_type1
>>
>>
>> Eric Auger (10):
>> iommu/dma: Allow MSI-only cookies
>> iommu: Rename iommu_dm_regions into iommu_resv_regions
>> iommu: Add new reserved IOMMU attributes
>> iommu: iommu_alloc_resv_region
>> iommu: Do not map reserved regions
>> iommu: iommu_get_group_resv_regions
>> iommu: Implement reserved_regions iommu-group sysfs file
>> iommu/vt-d: Implement reserved region get/put callbacks
>> iommu/arm-smmu: Implement reserved region get/put callbacks
>> vfio/type1: Get MSI cookie
>>
>> drivers/iommu/amd_iommu.c | 20 +++---
>> drivers/iommu/arm-smmu.c | 52 +++++++++++++++
>> drivers/iommu/dma-iommu.c | 116 ++++++++++++++++++++++++++-------
>> drivers/iommu/intel-iommu.c | 50 ++++++++++----
>> drivers/iommu/iommu.c | 141 ++++++++++++++++++++++++++++++++++++----
>> drivers/vfio/vfio_iommu_type1.c | 26 ++++++++
>> include/linux/dma-iommu.h | 7 ++
>> include/linux/iommu.h | 49 ++++++++++----
>> 8 files changed, 391 insertions(+), 70 deletions(-)
>>
>
> _______________________________________________
> linux-arm-kernel mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Hi Ganapat,
On 30/11/2016 11:04, Ganapatrao Kulkarni wrote:
> Hi Eric,
>
> in you repo "https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3"
> there is 11th patch "pci: Enable overrides for missing ACS capabilities"
> is this patch part of some other series?
Actually this is a very old patch from Alex aimed at working around the
lack of PCIe ACS support: https://lkml.org/lkml/2013/5/30/513
Thanks
Eric
>
> thanks
> Ganapat
>
> On Wed, Nov 30, 2016 at 3:19 PM, Auger Eric <[email protected]> wrote:
>> Hi,
>>
>> On 15/11/2016 14:09, Eric Auger wrote:
>>> Following LPC discussions, we now report reserved regions through
>>> iommu-group sysfs reserved_regions attribute file.
>>>
>>> Reserved regions are populated through the IOMMU get_resv_region callback
>>> (former get_dm_regions), now implemented by amd-iommu, intel-iommu and
>>> arm-smmu.
>>>
>>> The intel-iommu reports the [FEE0_0000h - FEF0_000h] MSI window as an
>>> IOMMU_RESV_NOMAP reserved region.
>>>
>>> arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and
>>> 1MB large) and the PCI host bridge windows.
>>>
>>> The series integrates a not officially posted patch from Robin:
>>> "iommu/dma: Allow MSI-only cookies".
>>>
>>> This series currently does not address IRQ safety assessment.
>>
>> I will respin this series taking into account Joerg's comment. Does
>> anyone have additional comments or want to put forward some conceptual
>> issues with the current direction and with this implementation?
>>
>> As for the IRQ safety assessment, in a first step I would propose to
>> remove the IOMMU_CAP_INTR_REMAP from arm-smmus and consider the
>> assignment as unsafe. Any objection?
>>
>> Thanks
>>
>> Eric
>>
>>
>>> Best Regards
>>>
>>> Eric
>>>
>>> Git: complete series available at
>>> https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3
>>>
>>> History:
>>> RFC v2 -> v3:
>>> - switch to an iommu-group sysfs API
>>> - use new dummy allocator provided by Robin
>>> - dummy allocator initialized by vfio-iommu-type1 after enumerating
>>> the reserved regions
>>> - at the moment ARM MSI base address/size is left unchanged compared
>>> to v2
>>> - we currently report reserved regions and not usable IOVA regions as
>>> requested by Alex
>>>
>>> RFC v1 -> v2:
>>> - fix intel_add_reserved_regions
>>> - add mutex lock/unlock in vfio_iommu_type1
>>>
>>>
>>> Eric Auger (10):
>>> iommu/dma: Allow MSI-only cookies
>>> iommu: Rename iommu_dm_regions into iommu_resv_regions
>>> iommu: Add new reserved IOMMU attributes
>>> iommu: iommu_alloc_resv_region
>>> iommu: Do not map reserved regions
>>> iommu: iommu_get_group_resv_regions
>>> iommu: Implement reserved_regions iommu-group sysfs file
>>> iommu/vt-d: Implement reserved region get/put callbacks
>>> iommu/arm-smmu: Implement reserved region get/put callbacks
>>> vfio/type1: Get MSI cookie
>>>
>>> drivers/iommu/amd_iommu.c | 20 +++---
>>> drivers/iommu/arm-smmu.c | 52 +++++++++++++++
>>> drivers/iommu/dma-iommu.c | 116 ++++++++++++++++++++++++++-------
>>> drivers/iommu/intel-iommu.c | 50 ++++++++++----
>>> drivers/iommu/iommu.c | 141 ++++++++++++++++++++++++++++++++++++----
>>> drivers/vfio/vfio_iommu_type1.c | 26 ++++++++
>>> include/linux/dma-iommu.h | 7 ++
>>> include/linux/iommu.h | 49 ++++++++++----
>>> 8 files changed, 391 insertions(+), 70 deletions(-)
>>>
>>
>> _______________________________________________
>> linux-arm-kernel mailing list
>> [email protected]
>> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
On Wed, Nov 30, 2016 at 10:49:33AM +0100, Auger Eric wrote:
> On 15/11/2016 14:09, Eric Auger wrote:
> > Following LPC discussions, we now report reserved regions through
> > iommu-group sysfs reserved_regions attribute file.
> >
> > Reserved regions are populated through the IOMMU get_resv_region callback
> > (former get_dm_regions), now implemented by amd-iommu, intel-iommu and
> > arm-smmu.
> >
> > The intel-iommu reports the [FEE0_0000h - FEF0_000h] MSI window as an
> > IOMMU_RESV_NOMAP reserved region.
> >
> > arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and
> > 1MB large) and the PCI host bridge windows.
> >
> > The series integrates a not officially posted patch from Robin:
> > "iommu/dma: Allow MSI-only cookies".
> >
> > This series currently does not address IRQ safety assessment.
>
> I will respin this series taking into account Joerg's comment. Does
> anyone have additional comments or want to put forward some conceptual
> issues with the current direction and with this implementation?
>
> As for the IRQ safety assessment, in a first step I would propose to
> remove the IOMMU_CAP_INTR_REMAP from arm-smmus and consider the
> assignment as unsafe. Any objection?
Well, yeah, because it's perfectly safe with GICv3.
Will
On Wed, Nov 30, 2016 at 3:44 PM, Auger Eric <[email protected]> wrote:
> Hi Ganapat,
>
> On 30/11/2016 11:04, Ganapatrao Kulkarni wrote:
>> Hi Eric,
>>
>> in you repo "https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3"
>> there is 11th patch "pci: Enable overrides for missing ACS capabilities"
>> is this patch part of some other series?
>
> Actually this is a very old patch from Alex aimed at working around lack
> of PCIe ACS support: https://lkml.org/lkml/2013/5/30/513
>
I have tried this patchset on thunderx-83xx for VFIO and it works for me!
I was wondering whether this patch is required? I guess not.
Please cc me when you respin this patchset.
thanks
Ganapat
> Thanks
>
> Eric
>>
>> thanks
>> Ganapat
>>
>> On Wed, Nov 30, 2016 at 3:19 PM, Auger Eric <[email protected]> wrote:
>>> Hi,
>>>
>>> On 15/11/2016 14:09, Eric Auger wrote:
>>>> Following LPC discussions, we now report reserved regions through
>>>> iommu-group sysfs reserved_regions attribute file.
>>>>
>>>> Reserved regions are populated through the IOMMU get_resv_region callback
>>>> (former get_dm_regions), now implemented by amd-iommu, intel-iommu and
>>>> arm-smmu.
>>>>
>>>> The intel-iommu reports the [FEE0_0000h - FEF0_000h] MSI window as an
>>>> IOMMU_RESV_NOMAP reserved region.
>>>>
>>>> arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and
>>>> 1MB large) and the PCI host bridge windows.
>>>>
>>>> The series integrates a not officially posted patch from Robin:
>>>> "iommu/dma: Allow MSI-only cookies".
>>>>
>>>> This series currently does not address IRQ safety assessment.
>>>
>>> I will respin this series taking into account Joerg's comment. Does
>>> anyone have additional comments or want to put forward some conceptual
>>> issues with the current direction and with this implementation?
>>>
>>> As for the IRQ safety assessment, in a first step I would propose to
>>> remove the IOMMU_CAP_INTR_REMAP from arm-smmus and consider the
>>> assignment as unsafe. Any objection?
>>>
>>> Thanks
>>>
>>> Eric
>>>
>>>
>>>> Best Regards
>>>>
>>>> Eric
>>>>
>>>> Git: complete series available at
>>>> https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3
>>>>
>>>> History:
>>>> RFC v2 -> v3:
>>>> - switch to an iommu-group sysfs API
>>>> - use new dummy allocator provided by Robin
>>>> - dummy allocator initialized by vfio-iommu-type1 after enumerating
>>>> the reserved regions
>>>> - at the moment ARM MSI base address/size is left unchanged compared
>>>> to v2
>>>> - we currently report reserved regions and not usable IOVA regions as
>>>> requested by Alex
>>>>
>>>> RFC v1 -> v2:
>>>> - fix intel_add_reserved_regions
>>>> - add mutex lock/unlock in vfio_iommu_type1
>>>>
>>>>
>>>> Eric Auger (10):
>>>> iommu/dma: Allow MSI-only cookies
>>>> iommu: Rename iommu_dm_regions into iommu_resv_regions
>>>> iommu: Add new reserved IOMMU attributes
>>>> iommu: iommu_alloc_resv_region
>>>> iommu: Do not map reserved regions
>>>> iommu: iommu_get_group_resv_regions
>>>> iommu: Implement reserved_regions iommu-group sysfs file
>>>> iommu/vt-d: Implement reserved region get/put callbacks
>>>> iommu/arm-smmu: Implement reserved region get/put callbacks
>>>> vfio/type1: Get MSI cookie
>>>>
>>>> drivers/iommu/amd_iommu.c | 20 +++---
>>>> drivers/iommu/arm-smmu.c | 52 +++++++++++++++
>>>> drivers/iommu/dma-iommu.c | 116 ++++++++++++++++++++++++++-------
>>>> drivers/iommu/intel-iommu.c | 50 ++++++++++----
>>>> drivers/iommu/iommu.c | 141 ++++++++++++++++++++++++++++++++++++----
>>>> drivers/vfio/vfio_iommu_type1.c | 26 ++++++++
>>>> include/linux/dma-iommu.h | 7 ++
>>>> include/linux/iommu.h | 49 ++++++++++----
>>>> 8 files changed, 391 insertions(+), 70 deletions(-)
>>>>
>>>
On 30/11/16 10:52, Ganapatrao Kulkarni wrote:
> On Wed, Nov 30, 2016 at 3:44 PM, Auger Eric <[email protected]> wrote:
>> Hi Ganapat,
>>
>> On 30/11/2016 11:04, Ganapatrao Kulkarni wrote:
>>> Hi Eric,
>>>
>>> in you repo "https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3"
>>> there is 11th patch "pci: Enable overrides for missing ACS capabilities"
>>> is this patch part of some other series?
>>
>> Actually this is a very old patch from Alex aimed at working around lack
>> of PCIe ACS support: https://lkml.org/lkml/2013/5/30/513
>>
>
> i have tried this patchset on thunderx-83xx for vfio and it works for me!
> i was wondering is this patch required? i guess not.
If your system and devices actually support and properly advertise ACS
then there's nothing to override. Conversely, if you're happy assigning
everything behind a single RC to the same guest then ACS doesn't really
matter. It's only the in-between case - when the host still wants to
keep control of one or more devices, but they all get grouped together
due to lack of ACS - that warrants working around.
Robin.
>
> please cc me when you respin this patchset.
>
> thanks
> Ganapat
>
>> Thanks
>>
>> Eric
>>>
>>> thanks
>>> Ganapat
>>>
>>> On Wed, Nov 30, 2016 at 3:19 PM, Auger Eric <[email protected]> wrote:
>>>> Hi,
>>>>
>>>> On 15/11/2016 14:09, Eric Auger wrote:
>>>>> Following LPC discussions, we now report reserved regions through
>>>>> iommu-group sysfs reserved_regions attribute file.
>>>>>
>>>>> Reserved regions are populated through the IOMMU get_resv_region callback
>>>>> (former get_dm_regions), now implemented by amd-iommu, intel-iommu and
>>>>> arm-smmu.
>>>>>
>>>>> The intel-iommu reports the [FEE0_0000h - FEF0_000h] MSI window as an
>>>>> IOMMU_RESV_NOMAP reserved region.
>>>>>
>>>>> arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and
>>>>> 1MB large) and the PCI host bridge windows.
>>>>>
>>>>> The series integrates a not officially posted patch from Robin:
>>>>> "iommu/dma: Allow MSI-only cookies".
>>>>>
>>>>> This series currently does not address IRQ safety assessment.
>>>>
>>>> I will respin this series taking into account Joerg's comment. Does
>>>> anyone have additional comments or want to put forward some conceptual
>>>> issues with the current direction and with this implementation?
>>>>
>>>> As for the IRQ safety assessment, in a first step I would propose to
>>>> remove the IOMMU_CAP_INTR_REMAP from arm-smmus and consider the
>>>> assignment as unsafe. Any objection?
>>>>
>>>> Thanks
>>>>
>>>> Eric
>>>>
>>>>
>>>>> Best Regards
>>>>>
>>>>> Eric
>>>>>
>>>>> Git: complete series available at
>>>>> https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3
>>>>>
>>>>> History:
>>>>> RFC v2 -> v3:
>>>>> - switch to an iommu-group sysfs API
>>>>> - use new dummy allocator provided by Robin
>>>>> - dummy allocator initialized by vfio-iommu-type1 after enumerating
>>>>> the reserved regions
>>>>> - at the moment ARM MSI base address/size is left unchanged compared
>>>>> to v2
>>>>> - we currently report reserved regions and not usable IOVA regions as
>>>>> requested by Alex
>>>>>
>>>>> RFC v1 -> v2:
>>>>> - fix intel_add_reserved_regions
>>>>> - add mutex lock/unlock in vfio_iommu_type1
>>>>>
>>>>>
>>>>> Eric Auger (10):
>>>>> iommu/dma: Allow MSI-only cookies
>>>>> iommu: Rename iommu_dm_regions into iommu_resv_regions
>>>>> iommu: Add new reserved IOMMU attributes
>>>>> iommu: iommu_alloc_resv_region
>>>>> iommu: Do not map reserved regions
>>>>> iommu: iommu_get_group_resv_regions
>>>>> iommu: Implement reserved_regions iommu-group sysfs file
>>>>> iommu/vt-d: Implement reserved region get/put callbacks
>>>>> iommu/arm-smmu: Implement reserved region get/put callbacks
>>>>> vfio/type1: Get MSI cookie
>>>>>
>>>>> drivers/iommu/amd_iommu.c | 20 +++---
>>>>> drivers/iommu/arm-smmu.c | 52 +++++++++++++++
>>>>> drivers/iommu/dma-iommu.c | 116 ++++++++++++++++++++++++++-------
>>>>> drivers/iommu/intel-iommu.c | 50 ++++++++++----
>>>>> drivers/iommu/iommu.c | 141 ++++++++++++++++++++++++++++++++++++----
>>>>> drivers/vfio/vfio_iommu_type1.c | 26 ++++++++
>>>>> include/linux/dma-iommu.h | 7 ++
>>>>> include/linux/iommu.h | 49 ++++++++++----
>>>>> 8 files changed, 391 insertions(+), 70 deletions(-)
>>>>>
>>>>
Hi Will,
On 30/11/2016 11:37, Will Deacon wrote:
> On Wed, Nov 30, 2016 at 10:49:33AM +0100, Auger Eric wrote:
>> On 15/11/2016 14:09, Eric Auger wrote:
>>> Following LPC discussions, we now report reserved regions through
>>> iommu-group sysfs reserved_regions attribute file.
>>>
>>> Reserved regions are populated through the IOMMU get_resv_region callback
>>> (former get_dm_regions), now implemented by amd-iommu, intel-iommu and
>>> arm-smmu.
>>>
>>> The intel-iommu reports the [FEE0_0000h - FEF0_000h] MSI window as an
>>> IOMMU_RESV_NOMAP reserved region.
>>>
>>> arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and
>>> 1MB large) and the PCI host bridge windows.
>>>
>>> The series integrates a not officially posted patch from Robin:
>>> "iommu/dma: Allow MSI-only cookies".
>>>
>>> This series currently does not address IRQ safety assessment.
>>
>> I will respin this series taking into account Joerg's comment. Does
>> anyone have additional comments or want to put forward some conceptual
>> issues with the current direction and with this implementation?
>>
>> As for the IRQ safety assessment, in a first step I would propose to
>> remove the IOMMU_CAP_INTR_REMAP from arm-smmus and consider the
>> assignment as unsafe. Any objection?
>
> Well, yeah, because it's perfectly safe with GICv3.
Well, except if you have an MSI controller in between the device and the
SMMU (typically embedded in the host bridge). Detecting this situation
is not straightforward; hence my proposal.
Thanks
Eric
>
> Will
>
On 30/11/16 14:08, Auger Eric wrote:
> Hi Will,
>
> On 30/11/2016 11:37, Will Deacon wrote:
>> On Wed, Nov 30, 2016 at 10:49:33AM +0100, Auger Eric wrote:
>>> On 15/11/2016 14:09, Eric Auger wrote:
>>>> Following LPC discussions, we now report reserved regions through
>>>> iommu-group sysfs reserved_regions attribute file.
>>>>
>>>> Reserved regions are populated through the IOMMU get_resv_region callback
>>>> (former get_dm_regions), now implemented by amd-iommu, intel-iommu and
>>>> arm-smmu.
>>>>
>>>> The intel-iommu reports the [FEE0_0000h - FEF0_000h] MSI window as an
>>>> IOMMU_RESV_NOMAP reserved region.
>>>>
>>>> arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and
>>>> 1MB large) and the PCI host bridge windows.
>>>>
>>>> The series integrates a not officially posted patch from Robin:
>>>> "iommu/dma: Allow MSI-only cookies".
>>>>
>>>> This series currently does not address IRQ safety assessment.
>>>
>>> I will respin this series taking into account Joerg's comment. Does
>>> anyone have additional comments or want to put forward some conceptual
>>> issues with the current direction and with this implementation?
>>>
>>> As for the IRQ safety assessment, in a first step I would propose to
>>> remove the IOMMU_CAP_INTR_REMAP from arm-smmus and consider the
>>> assignment as unsafe. Any objection?
>>
>> Well, yeah, because it's perfectly safe with GICv3.
>
> Well except if you have an MSI controller in-between the device and the
> sMMU (typically embedded in the host bridge). Detecting this situation
> is not straightforward; hence my proposal.
That's not the GICv3 (ITS) case, though, and either way it's irrelevant
to the "safety" aspect in question; the fact that writes to the ITS
carry sideband signals which disambiguate and isolate MSIs has nothing
to do with whether that write undergoes address translation along the way.
It's also not legacy INTx, and the fact that we have to pretend the SMMU
provides MSI isolation in order to make things work with INTx is an
entirely separate piece of brokenness I've raised several times before.
I more than anyone would love to remove IOMMU_CAP_INTR_REMAP from the
SMMU drivers yesterday, but doing so breaks the existing use-case on ARM
unless we actually fix that aspect of VFIO first (I did look into it
once, but it seemed way beyond my comprehension at the time).
Robin.
On 15/11/16 13:09, Eric Auger wrote:
> IOMMU_RESV_NOMAP is used to tag reserved IOVAs that are not
> supposed to be IOMMU mapped. IOMMU_RESV_MSI tags IOVAs
> corresponding to MSIs that need to be IOMMU mapped.
>
> IOMMU_RESV_MASK allows to check if the IOVA is reserved.
>
> Signed-off-by: Eric Auger <[email protected]>
> ---
> include/linux/iommu.h | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 7f6ebd0..02cf565 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -32,6 +32,10 @@
> #define IOMMU_NOEXEC (1 << 3)
> #define IOMMU_MMIO (1 << 4) /* e.g. things like MSI doorbells */
>
> +#define IOMMU_RESV_MASK 0x300 /* Reserved IOVA mask */
> +#define IOMMU_RESV_NOMAP (1 << 8) /* IOVA that cannot be mapped */
> +#define IOMMU_RESV_MSI (1 << 9) /* MSI region transparently mapped */
It feels a bit grotty encoding these in prot - NOMAP sort of fits, but
MSI really is something else entirely. On reflection I think a separate
iommu_resv_region::type field would be better, particularly as it's
something we might potentially want to expose via the sysfs entry.
Robin.
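For illustration, a rough sketch of what such a type field could look
like (the names below are placeholders rather than a settled API):

enum iommu_resv_type {
        IOMMU_RESV_DIRECT,      /* memory to be identity-mapped */
        IOMMU_RESV_RESERVED,    /* IOVA range that must not be mapped */
        IOMMU_RESV_MSI,         /* MSI doorbell window, handled specially */
};

struct iommu_resv_region {
        struct list_head        list;
        phys_addr_t             start;
        size_t                  length;
        unsigned int            prot;
        enum iommu_resv_type    type;
};

prot would then only carry the usual IOMMU_READ/IOMMU_WRITE-style
attributes, and the type could be printed directly in the sysfs file.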
> +
> struct iommu_ops;
> struct iommu_group;
> struct bus_type;
>
On 15/11/16 13:09, Eric Auger wrote:
> We want to extend the callbacks used for dm regions and
> use them for reserved regions. Reserved regions can be
> - directly mapped regions
> - regions that cannot be iommu mapped (PCI host bridge windows, ...)
> - MSI regions (because they belong to another address space or because
> they are not translated by the IOMMU and need special handling)
>
> So let's rename the struct and also the callbacks.
Acked-by: Robin Murphy <[email protected]>
> Signed-off-by: Eric Auger <[email protected]>
> ---
> drivers/iommu/amd_iommu.c | 20 ++++++++++----------
> drivers/iommu/iommu.c | 22 +++++++++++-----------
> include/linux/iommu.h | 29 +++++++++++++++--------------
> 3 files changed, 36 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
> index 754595e..a6c351d 100644
> --- a/drivers/iommu/amd_iommu.c
> +++ b/drivers/iommu/amd_iommu.c
> @@ -3159,8 +3159,8 @@ static bool amd_iommu_capable(enum iommu_cap cap)
> return false;
> }
>
> -static void amd_iommu_get_dm_regions(struct device *dev,
> - struct list_head *head)
> +static void amd_iommu_get_resv_regions(struct device *dev,
> + struct list_head *head)
> {
> struct unity_map_entry *entry;
> int devid;
> @@ -3170,7 +3170,7 @@ static void amd_iommu_get_dm_regions(struct device *dev,
> return;
>
> list_for_each_entry(entry, &amd_iommu_unity_map, list) {
> - struct iommu_dm_region *region;
> + struct iommu_resv_region *region;
>
> if (devid < entry->devid_start || devid > entry->devid_end)
> continue;
> @@ -3193,18 +3193,18 @@ static void amd_iommu_get_dm_regions(struct device *dev,
> }
> }
>
> -static void amd_iommu_put_dm_regions(struct device *dev,
> +static void amd_iommu_put_resv_regions(struct device *dev,
> struct list_head *head)
> {
> - struct iommu_dm_region *entry, *next;
> + struct iommu_resv_region *entry, *next;
>
> list_for_each_entry_safe(entry, next, head, list)
> kfree(entry);
> }
>
> -static void amd_iommu_apply_dm_region(struct device *dev,
> +static void amd_iommu_apply_resv_region(struct device *dev,
> struct iommu_domain *domain,
> - struct iommu_dm_region *region)
> + struct iommu_resv_region *region)
> {
> struct dma_ops_domain *dma_dom = to_dma_ops_domain(to_pdomain(domain));
> unsigned long start, end;
> @@ -3228,9 +3228,9 @@ static void amd_iommu_apply_dm_region(struct device *dev,
> .add_device = amd_iommu_add_device,
> .remove_device = amd_iommu_remove_device,
> .device_group = amd_iommu_device_group,
> - .get_dm_regions = amd_iommu_get_dm_regions,
> - .put_dm_regions = amd_iommu_put_dm_regions,
> - .apply_dm_region = amd_iommu_apply_dm_region,
> + .get_resv_regions = amd_iommu_get_resv_regions,
> + .put_resv_regions = amd_iommu_put_resv_regions,
> + .apply_resv_region = amd_iommu_apply_resv_region,
> .pgsize_bitmap = AMD_IOMMU_PGSIZES,
> };
>
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 9a2f196..c7ed334 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -318,7 +318,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
> struct device *dev)
> {
> struct iommu_domain *domain = group->default_domain;
> - struct iommu_dm_region *entry;
> + struct iommu_resv_region *entry;
> struct list_head mappings;
> unsigned long pg_size;
> int ret = 0;
> @@ -331,14 +331,14 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
> pg_size = 1UL << __ffs(domain->pgsize_bitmap);
> INIT_LIST_HEAD(&mappings);
>
> - iommu_get_dm_regions(dev, &mappings);
> + iommu_get_resv_regions(dev, &mappings);
>
> /* We need to consider overlapping regions for different devices */
> list_for_each_entry(entry, &mappings, list) {
> dma_addr_t start, end, addr;
>
> - if (domain->ops->apply_dm_region)
> - domain->ops->apply_dm_region(dev, domain, entry);
> + if (domain->ops->apply_resv_region)
> + domain->ops->apply_resv_region(dev, domain, entry);
>
> start = ALIGN(entry->start, pg_size);
> end = ALIGN(entry->start + entry->length, pg_size);
> @@ -358,7 +358,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
> }
>
> out:
> - iommu_put_dm_regions(dev, &mappings);
> + iommu_put_resv_regions(dev, &mappings);
>
> return ret;
> }
> @@ -1546,20 +1546,20 @@ int iommu_domain_set_attr(struct iommu_domain *domain,
> }
> EXPORT_SYMBOL_GPL(iommu_domain_set_attr);
>
> -void iommu_get_dm_regions(struct device *dev, struct list_head *list)
> +void iommu_get_resv_regions(struct device *dev, struct list_head *list)
> {
> const struct iommu_ops *ops = dev->bus->iommu_ops;
>
> - if (ops && ops->get_dm_regions)
> - ops->get_dm_regions(dev, list);
> + if (ops && ops->get_resv_regions)
> + ops->get_resv_regions(dev, list);
> }
>
> -void iommu_put_dm_regions(struct device *dev, struct list_head *list)
> +void iommu_put_resv_regions(struct device *dev, struct list_head *list)
> {
> const struct iommu_ops *ops = dev->bus->iommu_ops;
>
> - if (ops && ops->put_dm_regions)
> - ops->put_dm_regions(dev, list);
> + if (ops && ops->put_resv_regions)
> + ops->put_resv_regions(dev, list);
> }
>
> /* Request that a device is direct mapped by the IOMMU */
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 436dc21..7f6ebd0 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -118,13 +118,13 @@ enum iommu_attr {
> };
>
> /**
> - * struct iommu_dm_region - descriptor for a direct mapped memory region
> + * struct iommu_resv_region - descriptor for a reserved memory region
> * @list: Linked list pointers
> * @start: System physical start address of the region
> * @length: Length of the region in bytes
> * @prot: IOMMU Protection flags (READ/WRITE/...)
> */
> -struct iommu_dm_region {
> +struct iommu_resv_region {
> struct list_head list;
> phys_addr_t start;
> size_t length;
> @@ -150,9 +150,9 @@ struct iommu_dm_region {
> * @device_group: find iommu group for a particular device
> * @domain_get_attr: Query domain attributes
> * @domain_set_attr: Change domain attributes
> - * @get_dm_regions: Request list of direct mapping requirements for a device
> - * @put_dm_regions: Free list of direct mapping requirements for a device
> - * @apply_dm_region: Temporary helper call-back for iova reserved ranges
> + * @get_resv_regions: Request list of reserved regions for a device
> + * @put_resv_regions: Free list of reserved regions for a device
> + * @apply_resv_region: Temporary helper call-back for iova reserved ranges
> * @domain_window_enable: Configure and enable a particular window for a domain
> * @domain_window_disable: Disable a particular window for a domain
> * @domain_set_windows: Set the number of windows for a domain
> @@ -184,11 +184,12 @@ struct iommu_ops {
> int (*domain_set_attr)(struct iommu_domain *domain,
> enum iommu_attr attr, void *data);
>
> - /* Request/Free a list of direct mapping requirements for a device */
> - void (*get_dm_regions)(struct device *dev, struct list_head *list);
> - void (*put_dm_regions)(struct device *dev, struct list_head *list);
> - void (*apply_dm_region)(struct device *dev, struct iommu_domain *domain,
> - struct iommu_dm_region *region);
> + /* Request/Free a list of reserved regions for a device */
> + void (*get_resv_regions)(struct device *dev, struct list_head *list);
> + void (*put_resv_regions)(struct device *dev, struct list_head *list);
> + void (*apply_resv_region)(struct device *dev,
> + struct iommu_domain *domain,
> + struct iommu_resv_region *region);
>
> /* Window handling functions */
> int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr,
> @@ -233,8 +234,8 @@ extern size_t default_iommu_map_sg(struct iommu_domain *domain, unsigned long io
> extern void iommu_set_fault_handler(struct iommu_domain *domain,
> iommu_fault_handler_t handler, void *token);
>
> -extern void iommu_get_dm_regions(struct device *dev, struct list_head *list);
> -extern void iommu_put_dm_regions(struct device *dev, struct list_head *list);
> +extern void iommu_get_resv_regions(struct device *dev, struct list_head *list);
> +extern void iommu_put_resv_regions(struct device *dev, struct list_head *list);
> extern int iommu_request_dm_for_dev(struct device *dev);
>
> extern int iommu_attach_group(struct iommu_domain *domain,
> @@ -439,12 +440,12 @@ static inline void iommu_set_fault_handler(struct iommu_domain *domain,
> {
> }
>
> -static inline void iommu_get_dm_regions(struct device *dev,
> +static inline void iommu_get_resv_regions(struct device *dev,
> struct list_head *list)
> {
> }
>
> -static inline void iommu_put_dm_regions(struct device *dev,
> +static inline void iommu_put_resv_regions(struct device *dev,
> struct list_head *list)
> {
> }
>
On 15/11/16 13:09, Eric Auger wrote:
> Introduce a new helper serving the purpose to allocate a reserved
> region. This will be used in iommu driver implementing reserved
> region callbacks.
>
> Signed-off-by: Eric Auger <[email protected]>
> ---
> drivers/iommu/iommu.c | 16 ++++++++++++++++
> include/linux/iommu.h | 8 ++++++++
> 2 files changed, 24 insertions(+)
>
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index c7ed334..6ee529f 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -1562,6 +1562,22 @@ void iommu_put_resv_regions(struct device *dev, struct list_head *list)
> ops->put_resv_regions(dev, list);
> }
>
> +struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start,
> + size_t length,
> + unsigned int prot)
> +{
> + struct iommu_resv_region *region;
> +
> + region = kzalloc(sizeof(*region), GFP_KERNEL);
> + if (!region)
> + return NULL;
> +
> + region->start = start;
> + region->length = length;
> + region->prot = prot;
> + return region;
> +}
I think you need an INIT_LIST_HEAD() in here as well, or
CONFIG_DEBUG_LIST might get unhappy about using an uninitialised head later.
Robin.
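That is, a sketch of the helper with the list head initialised up front
(only the INIT_LIST_HEAD() line is new):

struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start,
                                                  size_t length,
                                                  unsigned int prot)
{
        struct iommu_resv_region *region;

        region = kzalloc(sizeof(*region), GFP_KERNEL);
        if (!region)
                return NULL;

        INIT_LIST_HEAD(&region->list);  /* keep CONFIG_DEBUG_LIST happy */
        region->start = start;
        region->length = length;
        region->prot = prot;
        return region;
}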
> +
> /* Request that a device is direct mapped by the IOMMU */
> int iommu_request_dm_for_dev(struct device *dev)
> {
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 02cf565..0aea877 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -241,6 +241,8 @@ extern void iommu_set_fault_handler(struct iommu_domain *domain,
> extern void iommu_get_resv_regions(struct device *dev, struct list_head *list);
> extern void iommu_put_resv_regions(struct device *dev, struct list_head *list);
> extern int iommu_request_dm_for_dev(struct device *dev);
> +extern struct iommu_resv_region *
> +iommu_alloc_resv_region(phys_addr_t start, size_t length, unsigned int prot);
>
> extern int iommu_attach_group(struct iommu_domain *domain,
> struct iommu_group *group);
> @@ -454,6 +456,12 @@ static inline void iommu_put_resv_regions(struct device *dev,
> {
> }
>
> +static inline struct iommu_resv_region *
> +iommu_alloc_resv_region(phys_addr_t start, size_t length, unsigned int prot)
> +{
> + return NULL;
> +}
> +
> static inline int iommu_request_dm_for_dev(struct device *dev)
> {
> return -ENODEV;
>
On 15/11/16 13:09, Eric Auger wrote:
> As we introduced IOMMU_RESV_NOMAP and IOMMU_RESV_MSI regions,
> let's prevent those new regions from being mapped.
>
> Signed-off-by: Eric Auger <[email protected]>
> ---
> drivers/iommu/iommu.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 6ee529f..a4530ad 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -343,6 +343,9 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
> start = ALIGN(entry->start, pg_size);
> end = ALIGN(entry->start + entry->length, pg_size);
>
> + if (entry->prot & IOMMU_RESV_MASK)
This seems to be the only place that this mask is used, and frankly I
think it's less clear than simply "(IOMMU_RESV_NOMAP | IOMMU_RESV_MSI)"
would be, at which point we may as well drop the mask and special value
trickery altogether. Plus, per my previous comment, if it were to be "if
(entry->type != <direct mapped type>)" instead, that's about as obvious
as it can get.
Robin.
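Concretely, either of the following would read more directly than the
mask; the second variant assumes the separate type field suggested on
the previous patch, with IOMMU_RESV_DIRECT as a placeholder name:

        /* open-coded flag check, no extra mask needed */
        if (entry->prot & (IOMMU_RESV_NOMAP | IOMMU_RESV_MSI))
                continue;

        /* or, with a dedicated type field */
        if (entry->type != IOMMU_RESV_DIRECT)
                continue;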
> + continue;
> +
> for (addr = start; addr < end; addr += pg_size) {
> phys_addr_t phys_addr;
>
>
On 15/11/16 13:09, Eric Auger wrote:
> Introduce iommu_get_group_resv_regions whose role consists in
> enumerating all devices from the group and collecting their
> reserved regions. It checks duplicates.
>
> Signed-off-by: Eric Auger <[email protected]>
>
> ---
>
> - we do not move list elements from device to group list since
> the iommu_put_resv_regions() could not be called.
> - at the moment I did not introduce any iommu_put_group_resv_regions
> since it simply consists in voiding/freeing the list
> ---
> drivers/iommu/iommu.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/iommu.h | 8 ++++++++
> 2 files changed, 61 insertions(+)
>
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index a4530ad..e0fbcc5 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -133,6 +133,59 @@ static ssize_t iommu_group_show_name(struct iommu_group *group, char *buf)
> return sprintf(buf, "%s\n", group->name);
> }
>
> +static bool iommu_resv_region_present(struct iommu_resv_region *region,
> + struct list_head *head)
> +{
> + struct iommu_resv_region *entry;
> +
> + list_for_each_entry(entry, head, list) {
> + if ((region->start == entry->start) &&
> + (region->length == entry->length) &&
> + (region->prot == entry->prot))
> + return true;
> + }
> + return false;
> +}
> +
> +static int
> +iommu_insert_device_resv_regions(struct list_head *dev_resv_regions,
> + struct list_head *group_resv_regions)
> +{
> + struct iommu_resv_region *entry, *region;
> +
> + list_for_each_entry(entry, dev_resv_regions, list) {
> + if (iommu_resv_region_present(entry, group_resv_regions))
> + continue;
In the case of overlapping regions which _aren't_ an exact match, would
it be better to expand the existing one rather than leave the caller to
sort it out? It seems a bit inconsistent to handle only the one case here.
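For illustration, a variant of the duplicate check which also grows an
existing entry on overlap could look roughly like this (sketch only,
merging entries only when their prot matches):

static bool iommu_resv_region_merge(struct iommu_resv_region *reg,
                                    struct list_head *head)
{
        struct iommu_resv_region *entry;
        phys_addr_t reg_end = reg->start + reg->length;

        list_for_each_entry(entry, head, list) {
                phys_addr_t end = entry->start + entry->length;

                if (entry->prot != reg->prot)
                        continue;
                if (reg->start > end || reg_end < entry->start)
                        continue;       /* disjoint, keep looking */
                /* overlapping or adjacent: grow the existing entry */
                entry->start = min(entry->start, reg->start);
                entry->length = max(end, reg_end) - entry->start;
                return true;
        }
        return false;
}

A complete version would also need to re-coalesce the list after growing
an entry, which is part of why it may not be worth the complexity.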
> + region = iommu_alloc_resv_region(entry->start, entry->length,
> + entry->prot);
> + if (!region)
> + return -ENOMEM;
> +
> + list_add_tail(&region->list, group_resv_regions);
> + }
> + return 0;
> +}
> +
> +int iommu_get_group_resv_regions(struct iommu_group *group,
> + struct list_head *head)
> +{
> + struct iommu_device *device;
> + int ret = 0;
> +
> + list_for_each_entry(device, &group->devices, list) {
Should we not be taking the group mutex around this?
Robin.
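That is, roughly the following (sketch only, assuming the group's own
mutex is the right lock to serialise against device add/remove):

int iommu_get_group_resv_regions(struct iommu_group *group,
                                 struct list_head *head)
{
        struct iommu_device *device;
        int ret = 0;

        mutex_lock(&group->mutex);
        list_for_each_entry(device, &group->devices, list) {
                struct list_head dev_resv_regions;

                INIT_LIST_HEAD(&dev_resv_regions);
                iommu_get_resv_regions(device->dev, &dev_resv_regions);
                ret = iommu_insert_device_resv_regions(&dev_resv_regions, head);
                iommu_put_resv_regions(device->dev, &dev_resv_regions);
                if (ret)
                        break;
        }
        mutex_unlock(&group->mutex);
        return ret;
}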
> + struct list_head dev_resv_regions;
> +
> + INIT_LIST_HEAD(&dev_resv_regions);
> + iommu_get_resv_regions(device->dev, &dev_resv_regions);
> + ret = iommu_insert_device_resv_regions(&dev_resv_regions, head);
> + iommu_put_resv_regions(device->dev, &dev_resv_regions);
> + if (ret)
> + break;
> + }
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_get_group_resv_regions);
> +
> static IOMMU_GROUP_ATTR(name, S_IRUGO, iommu_group_show_name, NULL);
>
> static void iommu_group_release(struct kobject *kobj)
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 0aea877..0f7ae2c 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -243,6 +243,8 @@ extern void iommu_set_fault_handler(struct iommu_domain *domain,
> extern int iommu_request_dm_for_dev(struct device *dev);
> extern struct iommu_resv_region *
> iommu_alloc_resv_region(phys_addr_t start, size_t length, unsigned int prot);
> +extern int iommu_get_group_resv_regions(struct iommu_group *group,
> + struct list_head *head);
>
> extern int iommu_attach_group(struct iommu_domain *domain,
> struct iommu_group *group);
> @@ -462,6 +464,12 @@ static inline void iommu_put_resv_regions(struct device *dev,
> return NULL;
> }
>
> +static inline int iommu_get_group_resv_regions(struct iommu_group *group,
> + struct list_head *head)
> +{
> + return -ENODEV;
> +}
> +
> static inline int iommu_request_dm_for_dev(struct device *dev)
> {
> return -ENODEV;
>
On 15/11/16 13:09, Eric Auger wrote:
> The get() populates the list with the PCI host bridge windows
> and the MSI IOVA range.
>
> At the moment an arbitray MSI IOVA window is set at 0x8000000
> of size 1MB. This will allow to report those info in iommu-group
> sysfs?
>
> Signed-off-by: Eric Auger <[email protected]>
>
> ---
>
> RFC v2 -> v3:
> - use existing get/put_resv_regions
>
> RFC v1 -> v2:
> - use defines for MSI IOVA base and length
> ---
> drivers/iommu/arm-smmu.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 52 insertions(+)
>
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index 8f72814..81f1a83 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -278,6 +278,9 @@ enum arm_smmu_s2cr_privcfg {
>
> #define FSYNR0_WNR (1 << 4)
>
> +#define MSI_IOVA_BASE 0x8000000
> +#define MSI_IOVA_LENGTH 0x100000
> +
> static int force_stage;
> module_param(force_stage, int, S_IRUGO);
> MODULE_PARM_DESC(force_stage,
> @@ -1545,6 +1548,53 @@ static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
> return iommu_fwspec_add_ids(dev, &fwid, 1);
> }
>
> +static void arm_smmu_get_resv_regions(struct device *dev,
> + struct list_head *head)
> +{
> + struct iommu_resv_region *region;
> + struct pci_host_bridge *bridge;
> + struct resource_entry *window;
> +
> + /* MSI region */
> + region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
> + IOMMU_RESV_MSI);
> + if (!region)
> + return;
> +
> + list_add_tail(&region->list, head);
> +
> + if (!dev_is_pci(dev))
> + return;
> +
> + bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
> +
> + resource_list_for_each_entry(window, &bridge->windows) {
> + phys_addr_t start;
> + size_t length;
> +
> + if (resource_type(window->res) != IORESOURCE_MEM &&
> + resource_type(window->res) != IORESOURCE_IO)
As Joerg commented elsewhere, considering anything other than memory
resources isn't right (I appreciate you've merely copied my own mistake
here). We need some other way to handle root complexes where the CPU
MMIO views of PCI windows appear in PCI memory space - using the I/O
address of I/O resources only works by chance on Juno, and it still
doesn't account for config space. I suggest we just leave that out for
the time being to make life easier (does it even apply to anything other
than Juno?) and figure it out later.
> + continue;
> +
> + start = window->res->start - window->offset;
> + length = window->res->end - window->res->start + 1;
> + region = iommu_alloc_resv_region(start, length,
> + IOMMU_RESV_NOMAP);
> + if (!region)
> + return;
> + list_add_tail(&region->list, head);
> + }
> +}
Either way, there's nothing SMMU-specific about PCI windows. The fact
that we'd have to copy-paste all of this into the SMMUv3 driver
unchanged suggests it should go somewhere common (although I would be
inclined to leave the insertion of the fake MSI region to driver-private
wrappers). As I said before, the current iova_reserve_pci_windows()
simply wants splitting into appropriate public callbacks for
get_resv_regions and apply_resv_regions.
Robin.
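For illustration, such a common helper might look roughly like the
following; the name and its eventual home (iommu.c or dma-iommu.c) are
hypothetical, and the IO-resource handling is dropped per the comment
above:

void iommu_get_pci_resv_regions(struct device *dev, struct list_head *head)
{
        struct pci_host_bridge *bridge;
        struct resource_entry *window;

        if (!dev_is_pci(dev))
                return;

        bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);

        resource_list_for_each_entry(window, &bridge->windows) {
                struct iommu_resv_region *region;
                phys_addr_t start;
                size_t length;

                if (resource_type(window->res) != IORESOURCE_MEM)
                        continue;

                start = window->res->start - window->offset;
                length = window->res->end - window->res->start + 1;
                region = iommu_alloc_resv_region(start, length,
                                                 IOMMU_RESV_NOMAP);
                if (!region)
                        return;
                list_add_tail(&region->list, head);
        }
}

Each driver's get_resv_regions() callback would then add its private MSI
region and call this for the PCI windows.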
> +static void arm_smmu_put_resv_regions(struct device *dev,
> + struct list_head *head)
> +{
> + struct iommu_resv_region *entry, *next;
> +
> + list_for_each_entry_safe(entry, next, head, list)
> + kfree(entry);
> +}
> +
> static struct iommu_ops arm_smmu_ops = {
> .capable = arm_smmu_capable,
> .domain_alloc = arm_smmu_domain_alloc,
> @@ -1560,6 +1610,8 @@ static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
> .domain_get_attr = arm_smmu_domain_get_attr,
> .domain_set_attr = arm_smmu_domain_set_attr,
> .of_xlate = arm_smmu_of_xlate,
> + .get_resv_regions = arm_smmu_get_resv_regions,
> + .put_resv_regions = arm_smmu_put_resv_regions,
> .pgsize_bitmap = -1UL, /* Restricted during device attach */
> };
>
>
Hi Robin,
On 06/12/2016 19:55, Robin Murphy wrote:
> On 15/11/16 13:09, Eric Auger wrote:
>> The get() populates the list with the PCI host bridge windows
>> and the MSI IOVA range.
>>
>> At the moment an arbitray MSI IOVA window is set at 0x8000000
>> of size 1MB. This will allow to report those info in iommu-group
>> sysfs?
First, thank you for reviewing the series. This is definitely helpful!
>>
>> Signed-off-by: Eric Auger <[email protected]>
>>
>> ---
>>
>> RFC v2 -> v3:
>> - use existing get/put_resv_regions
>>
>> RFC v1 -> v2:
>> - use defines for MSI IOVA base and length
>> ---
>> drivers/iommu/arm-smmu.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 52 insertions(+)
>>
>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>> index 8f72814..81f1a83 100644
>> --- a/drivers/iommu/arm-smmu.c
>> +++ b/drivers/iommu/arm-smmu.c
>> @@ -278,6 +278,9 @@ enum arm_smmu_s2cr_privcfg {
>>
>> #define FSYNR0_WNR (1 << 4)
>>
>> +#define MSI_IOVA_BASE 0x8000000
>> +#define MSI_IOVA_LENGTH 0x100000
>> +
>> static int force_stage;
>> module_param(force_stage, int, S_IRUGO);
>> MODULE_PARM_DESC(force_stage,
>> @@ -1545,6 +1548,53 @@ static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
>> return iommu_fwspec_add_ids(dev, &fwid, 1);
>> }
>>
>> +static void arm_smmu_get_resv_regions(struct device *dev,
>> + struct list_head *head)
>> +{
>> + struct iommu_resv_region *region;
>> + struct pci_host_bridge *bridge;
>> + struct resource_entry *window;
>> +
>> + /* MSI region */
>> + region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
>> + IOMMU_RESV_MSI);
>> + if (!region)
>> + return;
>> +
>> + list_add_tail(&region->list, head);
>> +
>> + if (!dev_is_pci(dev))
>> + return;
>> +
>> + bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
>> +
>> + resource_list_for_each_entry(window, &bridge->windows) {
>> + phys_addr_t start;
>> + size_t length;
>> +
>> + if (resource_type(window->res) != IORESOURCE_MEM &&
>> + resource_type(window->res) != IORESOURCE_IO)
>
> As Joerg commented elsewhere, considering anything other than memory
> resources isn't right (I appreciate you've merely copied my own mistake
> here). We need some other way to handle root complexes where the CPU
> MMIO views of PCI windows appear in PCI memory space - using the I/O
> address of I/O resources only works by chance on Juno, and it still
> doesn't account for config space. I suggest we just leave that out for
> the time being to make life easier (does it even apply to anything other
> than Juno?) and figure it out later.
OK, so I understand I should remove the IORESOURCE_IO check.
>
>> + continue;
>> +
>> + start = window->res->start - window->offset;
>> + length = window->res->end - window->res->start + 1;
>> + region = iommu_alloc_resv_region(start, length,
>> + IOMMU_RESV_NOMAP);
>> + if (!region)
>> + return;
>> + list_add_tail(&region->list, head);
>> + }
>> +}
>
> Either way, there's nothing SMMU-specific about PCI windows. The fact
> that we'd have to copy-paste all of this into the SMMUv3 driver
> unchanged suggests it should go somewhere common (although I would be
> inclined to leave the insertion of the fake MSI region to driver-private
> wrappers). As I said before, the current iova_reserve_pci_windows()
> simply wants splitting into appropriate public callbacks for
> get_resv_regions and apply_resv_regions.
Do you mean somewhere common in the arm-smmu subsystem (a new file) or in
another subsystem (PCI)?
More generally, the current implementation does not handle the case where
any of those PCIe host bridge windows collides with the MSI window. To me
this is a flaw.
1) Either we take the PCIe windows into account and prevent any
collision when allocating the MSI window,
2) or we do not care about PCIe host bridge windows at kernel level.
With 1) we are back to the original issue of where to put the MSI
window, obviously at a place which might not be QEMU friendly anymore.
What allocation policy shall we use?
The second option - sorry if I look stubborn - which I definitely prefer
and which was also advocated by Alex: we handle PCI host bridge windows
at user level. The MSI window is reported through the iommu-group sysfs.
PCIe host bridge windows can be enumerated through /proc/iomem. Both the
x86 IOMMUs and the ARM SMMU would report an MSI reserved window. The ARM
MSI window would become a de facto reserved window for guests.
Thoughts?
Eric
>
> Robin.
>
>> +static void arm_smmu_put_resv_regions(struct device *dev,
>> + struct list_head *head)
>> +{
>> + struct iommu_resv_region *entry, *next;
>> +
>> + list_for_each_entry_safe(entry, next, head, list)
>> + kfree(entry);
>> +}
>> +
>> static struct iommu_ops arm_smmu_ops = {
>> .capable = arm_smmu_capable,
>> .domain_alloc = arm_smmu_domain_alloc,
>> @@ -1560,6 +1610,8 @@ static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
>> .domain_get_attr = arm_smmu_domain_get_attr,
>> .domain_set_attr = arm_smmu_domain_set_attr,
>> .of_xlate = arm_smmu_of_xlate,
>> + .get_resv_regions = arm_smmu_get_resv_regions,
>> + .put_resv_regions = arm_smmu_put_resv_regions,
>> .pgsize_bitmap = -1UL, /* Restricted during device attach */
>> };
>>
>>
>
>
Hi Robin,
On 06/12/2016 19:13, Robin Murphy wrote:
> On 15/11/16 13:09, Eric Auger wrote:
>> Introduce iommu_get_group_resv_regions whose role consists in
>> enumerating all devices from the group and collecting their
>> reserved regions. It checks duplicates.
>>
>> Signed-off-by: Eric Auger <[email protected]>
>>
>> ---
>>
>> - we do not move list elements from device to group list since
>> the iommu_put_resv_regions() could not be called.
>> - at the moment I did not introduce any iommu_put_group_resv_regions
>> since it simply consists in voiding/freeing the list
>> ---
>> drivers/iommu/iommu.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++
>> include/linux/iommu.h | 8 ++++++++
>> 2 files changed, 61 insertions(+)
>>
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index a4530ad..e0fbcc5 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -133,6 +133,59 @@ static ssize_t iommu_group_show_name(struct iommu_group *group, char *buf)
>> return sprintf(buf, "%s\n", group->name);
>> }
>>
>> +static bool iommu_resv_region_present(struct iommu_resv_region *region,
>> + struct list_head *head)
>> +{
>> + struct iommu_resv_region *entry;
>> +
>> + list_for_each_entry(entry, head, list) {
>> + if ((region->start == entry->start) &&
>> + (region->length == entry->length) &&
>> + (region->prot == entry->prot))
>> + return true;
>> + }
>> + return false;
>> +}
>> +
>> +static int
>> +iommu_insert_device_resv_regions(struct list_head *dev_resv_regions,
>> + struct list_head *group_resv_regions)
>> +{
>> + struct iommu_resv_region *entry, *region;
>> +
>> + list_for_each_entry(entry, dev_resv_regions, list) {
>> + if (iommu_resv_region_present(entry, group_resv_regions))
>> + continue;
>
> In the case of overlapping regions which _aren't_ an exact match, would
> it be better to expand the existing one rather than leave the caller to
> sort it out? It seems a bit inconsistent to handle only the one case here.
Well, this is mostly here to avoid inserting the same PCIe host bridge
windows several times (retrieved from several PCIe endpoints attached to
the same bridge). I don't know whether it is worth making things
over-complicated. Do you have another situation in mind?
>
>> + region = iommu_alloc_resv_region(entry->start, entry->length,
>> + entry->prot);
>> + if (!region)
>> + return -ENOMEM;
>> +
>> + list_add_tail(&region->list, group_resv_regions);
>> + }
>> + return 0;
>> +}
>> +
>> +int iommu_get_group_resv_regions(struct iommu_group *group,
>> + struct list_head *head)
>> +{
>> + struct iommu_device *device;
>> + int ret = 0;
>> +
>> + list_for_each_entry(device, &group->devices, list) {
>
> Should we not be taking the group mutex around this?
Yes you're right.
Thanks
Eric
>
> Robin.
>
>> + struct list_head dev_resv_regions;
>> +
>> + INIT_LIST_HEAD(&dev_resv_regions);
>> + iommu_get_resv_regions(device->dev, &dev_resv_regions);
>> + ret = iommu_insert_device_resv_regions(&dev_resv_regions, head);
>> + iommu_put_resv_regions(device->dev, &dev_resv_regions);
>> + if (ret)
>> + break;
>> + }
>> + return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_get_group_resv_regions);
>> +
>> static IOMMU_GROUP_ATTR(name, S_IRUGO, iommu_group_show_name, NULL);
>>
>> static void iommu_group_release(struct kobject *kobj)
>> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
>> index 0aea877..0f7ae2c 100644
>> --- a/include/linux/iommu.h
>> +++ b/include/linux/iommu.h
>> @@ -243,6 +243,8 @@ extern void iommu_set_fault_handler(struct iommu_domain *domain,
>> extern int iommu_request_dm_for_dev(struct device *dev);
>> extern struct iommu_resv_region *
>> iommu_alloc_resv_region(phys_addr_t start, size_t length, unsigned int prot);
>> +extern int iommu_get_group_resv_regions(struct iommu_group *group,
>> + struct list_head *head);
>>
>> extern int iommu_attach_group(struct iommu_domain *domain,
>> struct iommu_group *group);
>> @@ -462,6 +464,12 @@ static inline void iommu_put_resv_regions(struct device *dev,
>> return NULL;
>> }
>>
>> +static inline int iommu_get_group_resv_regions(struct iommu_group *group,
>> + struct list_head *head)
>> +{
>> + return -ENODEV;
>> +}
>> +
>> static inline int iommu_request_dm_for_dev(struct device *dev)
>> {
>> return -ENODEV;
>>
>
Hi Robin
On 06/12/2016 18:36, Robin Murphy wrote:
> On 15/11/16 13:09, Eric Auger wrote:
>> As we introduced IOMMU_RESV_NOMAP and IOMMU_RESV_MSI regions,
>> let's prevent those new regions from being mapped.
>>
>> Signed-off-by: Eric Auger <[email protected]>
>> ---
>> drivers/iommu/iommu.c | 3 +++
>> 1 file changed, 3 insertions(+)
>>
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index 6ee529f..a4530ad 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -343,6 +343,9 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
>> start = ALIGN(entry->start, pg_size);
>> end = ALIGN(entry->start + entry->length, pg_size);
>>
>> + if (entry->prot & IOMMU_RESV_MASK)
>
> This seems to be the only place that this mask is used, and frankly I
> think it's less clear than simply "(IOMMU_RESV_NOMAP | IOMMU_RESV_MSI)"
> would be, at which point we may as well drop the mask and special value
> trickery altogether. Plus, per my previous comment, if it were to be "if
> (entry->type != <direct mapped type>)" instead, that's about as obvious
> as it can get.
OK, I will add this new type field to the reserved region struct.
thanks
Eric
>
> Robin.
>
>> + continue;
>> +
>> for (addr = start; addr < end; addr += pg_size) {
>> phys_addr_t phys_addr;
>>
>>
>
>
On 07/12/16 15:02, Auger Eric wrote:
> Hi Robin,
> On 06/12/2016 19:55, Robin Murphy wrote:
>> On 15/11/16 13:09, Eric Auger wrote:
>>> The get() populates the list with the PCI host bridge windows
>>> and the MSI IOVA range.
>>>
>>> At the moment an arbitray MSI IOVA window is set at 0x8000000
>>> of size 1MB. This will allow to report those info in iommu-group
>>> sysfs?
>
>
> First thank you for reviewing the series. This is definitively helpful!
>>>
>>> Signed-off-by: Eric Auger <[email protected]>
>>>
>>> ---
>>>
>>> RFC v2 -> v3:
>>> - use existing get/put_resv_regions
>>>
>>> RFC v1 -> v2:
>>> - use defines for MSI IOVA base and length
>>> ---
>>> drivers/iommu/arm-smmu.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
>>> 1 file changed, 52 insertions(+)
>>>
>>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>>> index 8f72814..81f1a83 100644
>>> --- a/drivers/iommu/arm-smmu.c
>>> +++ b/drivers/iommu/arm-smmu.c
>>> @@ -278,6 +278,9 @@ enum arm_smmu_s2cr_privcfg {
>>>
>>> #define FSYNR0_WNR (1 << 4)
>>>
>>> +#define MSI_IOVA_BASE 0x8000000
>>> +#define MSI_IOVA_LENGTH 0x100000
>>> +
>>> static int force_stage;
>>> module_param(force_stage, int, S_IRUGO);
>>> MODULE_PARM_DESC(force_stage,
>>> @@ -1545,6 +1548,53 @@ static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
>>> return iommu_fwspec_add_ids(dev, &fwid, 1);
>>> }
>>>
>>> +static void arm_smmu_get_resv_regions(struct device *dev,
>>> + struct list_head *head)
>>> +{
>>> + struct iommu_resv_region *region;
>>> + struct pci_host_bridge *bridge;
>>> + struct resource_entry *window;
>>> +
>>> + /* MSI region */
>>> + region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
>>> + IOMMU_RESV_MSI);
>>> + if (!region)
>>> + return;
>>> +
>>> + list_add_tail(&region->list, head);
>>> +
>>> + if (!dev_is_pci(dev))
>>> + return;
>>> +
>>> + bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
>>> +
>>> + resource_list_for_each_entry(window, &bridge->windows) {
>>> + phys_addr_t start;
>>> + size_t length;
>>> +
>>> + if (resource_type(window->res) != IORESOURCE_MEM &&
>>> + resource_type(window->res) != IORESOURCE_IO)
>>
>> As Joerg commented elsewhere, considering anything other than memory
>> resources isn't right (I appreciate you've merely copied my own mistake
>> here). We need some other way to handle root complexes where the CPU
>> MMIO views of PCI windows appear in PCI memory space - using the I/O
>> address of I/O resources only works by chance on Juno, and it still
>> doesn't account for config space. I suggest we just leave that out for
>> the time being to make life easier (does it even apply to anything other
>> than Juno?) and figure it out later.
> OK so I understand I should remove IORESOURCE_IO check.
>>
>>> + continue;
>>> +
>>> + start = window->res->start - window->offset;
>>> + length = window->res->end - window->res->start + 1;
>>> + region = iommu_alloc_resv_region(start, length,
>>> + IOMMU_RESV_NOMAP);
>>> + if (!region)
>>> + return;
>>> + list_add_tail(&region->list, head);
>>> + }
>>> +}
>>
>> Either way, there's nothing SMMU-specific about PCI windows. The fact
>> that we'd have to copy-paste all of this into the SMMUv3 driver
>> unchanged suggests it should go somewhere common (although I would be
>> inclined to leave the insertion of the fake MSI region to driver-private
>> wrappers). As I said before, the current iova_reserve_pci_windows()
>> simply wants splitting into appropriate public callbacks for
>> get_resv_regions and apply_resv_regions.
> Do you mean somewhere common in the arm-smmu subsystem (new file) or in
> another subsystem (pci?)
>
> More generally the current implementation does not handle the case where
> any of those PCIe host bridge window collide with the MSI window. To me
> this is a flaw.
> 1) Either we take into account the PCIe windows and prevent any
> collision when allocating the MSI window.
> 2) or we do not care about PCIe host bridge windows at kernel level.
Even more generally, the MSI window also needs to avoid any other
IOMMU-specific reserved regions as well - fortunately I don't think
there's any current intersection between platforms with RMRR-type
reservations and platforms which require MSI mapping - so I think we've
got enough freedom for the moment, but it's certainly an argument in
favour of ultimately expressing PCI windows through the same mechanism
to keep everything in the same place. The other big advantage of
reserved regions is that they will automatically apply to DMA domains as
well.
> If 1) we are back to the original issue of where do we put the MSI
> window. Obviously at a place which might not be QEMU friendly anymore.
> What allocation policy shall we use?
>
> Second option - sorry I may look stubborn - which I definitively prefer
> and which was also advocated by Alex, we handle PCI host bridge windows
> at user level. MSI window is reported through the iommu group sysfs.
> PCIe host bridge windows can be enumerated through /proc/iomem. Both x86
> iommu and arm smmu would report an MSI reserved window. ARM MSI window
> would become a de facto reserved window for guests.
So from the ABI perspective, the sysfs iommu_group/*/reserved_regions
represents a minimum set of regions (MSI, RMRR, etc.) which definitely
*must* be reserved, but offers no guarantee that there aren't also other
regions not represented there. That seems reasonable to start with, and
still leaves us free to expand the scope of reserved regions in future
without breaking anything.
> Thoughts?
I like the second option too - "grep PCI /proc/iomem" already catches
more than enumerating the resources does (i.e. ECAM space) - and neither
does it preclude growing the more extensive version on top over time.
For the sake of moving forward, I'd be happy with just dropping the PCI
stuff from here, and leaving the SMMU drivers exposing the single
hard-coded MSI region directly (to be fair, it'd hardly be the first
function which is identical between the two). We can take a look into
making iommu-dma implement PCI windows as nomap resv_regions properly as
an orthogonal thing (for the sake of DMA domains), after which we should
be in a position to drop the hard-coding and start placing the MSI
window dynamically where appropriate.
Robin.
>>> +static void arm_smmu_put_resv_regions(struct device *dev,
>>> + struct list_head *head)
>>> +{
>>> + struct iommu_resv_region *entry, *next;
>>> +
>>> + list_for_each_entry_safe(entry, next, head, list)
>>> + kfree(entry);
>>> +}
>>> +
>>> static struct iommu_ops arm_smmu_ops = {
>>> .capable = arm_smmu_capable,
>>> .domain_alloc = arm_smmu_domain_alloc,
>>> @@ -1560,6 +1610,8 @@ static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
>>> .domain_get_attr = arm_smmu_domain_get_attr,
>>> .domain_set_attr = arm_smmu_domain_set_attr,
>>> .of_xlate = arm_smmu_of_xlate,
>>> + .get_resv_regions = arm_smmu_get_resv_regions,
>>> + .put_resv_regions = arm_smmu_put_resv_regions,
>>> .pgsize_bitmap = -1UL, /* Restricted during device attach */
>>> };
>>>
>>>
>>
>>
Hi Eric,
Is there any reason why you are not supporting the SMMUv3 driver? Qualcomm
hardware does not support SMMUv2; please add support for
SMMUv3 in the next patch set. I've ported '[RFC v3 09/10] iommu/arm-smmu:
Implement reserved region get/put callbacks' to the SMMUv3 driver and tested
the device-passthrough feature on a Qualcomm server platform without any issue.
Tested-by: Shanker Donthineni <[email protected]>
--
Shanker Donthineni
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
Hi Eric,
I have tested this series on an NXP platform.
Thanks
-Bharat
> -----Original Message-----
> From: [email protected] [mailto:iommu-
> [email protected]] On Behalf Of Eric Auger
> Sent: Tuesday, November 15, 2016 6:39 PM
> To: [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]
> Subject: [RFC v3 00/10] KVM PCIe/MSI passthrough on ARM/ARM64 and
> IOVA reserved regions
>
> Following LPC discussions, we now report reserved regions through iommu-
> group sysfs reserved_regions attribute file.
>
> Reserved regions are populated through the IOMMU get_resv_region
> callback (former get_dm_regions), now implemented by amd-iommu, intel-
> iommu and arm-smmu.
>
> The intel-iommu reports the [FEE0_0000h - FEF0_000h] MSI window as an
> IOMMU_RESV_NOMAP reserved region.
>
> arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and 1MB
> large) and the PCI host bridge windows.
>
> The series integrates a not officially posted patch from Robin:
> "iommu/dma: Allow MSI-only cookies".
>
> This series currently does not address IRQ safety assessment.
>
> Best Regards
>
> Eric
>
> Git: complete series available at
> https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3
>
> History:
> RFC v2 -> v3:
> - switch to an iommu-group sysfs API
> - use new dummy allocator provided by Robin
> - dummy allocator initialized by vfio-iommu-type1 after enumerating
> the reserved regions
> - at the moment ARM MSI base address/size is left unchanged compared
> to v2
> - we currently report reserved regions and not usable IOVA regions as
> requested by Alex
>
> RFC v1 -> v2:
> - fix intel_add_reserved_regions
> - add mutex lock/unlock in vfio_iommu_type1
>
>
> Eric Auger (10):
> iommu/dma: Allow MSI-only cookies
> iommu: Rename iommu_dm_regions into iommu_resv_regions
> iommu: Add new reserved IOMMU attributes
> iommu: iommu_alloc_resv_region
> iommu: Do not map reserved regions
> iommu: iommu_get_group_resv_regions
> iommu: Implement reserved_regions iommu-group sysfs file
> iommu/vt-d: Implement reserved region get/put callbacks
> iommu/arm-smmu: Implement reserved region get/put callbacks
> vfio/type1: Get MSI cookie
>
> drivers/iommu/amd_iommu.c | 20 +++---
> drivers/iommu/arm-smmu.c | 52 +++++++++++++++
> drivers/iommu/dma-iommu.c | 116 ++++++++++++++++++++++++++----
> ---
> drivers/iommu/intel-iommu.c | 50 ++++++++++----
> drivers/iommu/iommu.c | 141
> ++++++++++++++++++++++++++++++++++++----
> drivers/vfio/vfio_iommu_type1.c | 26 ++++++++
> include/linux/dma-iommu.h | 7 ++
> include/linux/iommu.h | 49 ++++++++++----
> 8 files changed, 391 insertions(+), 70 deletions(-)
>
> --
> 1.9.1
>
Hi Shanker,
On 07/12/2016 19:52, Shanker Donthineni wrote:
> Hi Eric,
>
> Is there any reason why you are not supporting SMMUv3 driver? Qualcomm
> hardware doesn't not support SMMUv2 hardware, please add support for
> SMMUv3 in next patch set. I've ported ' RFC,v3,09/10] iommu/arm-smmu:
> Implement reserved region get/put callbacks' to SMMUv3 driver and tested
> device-pass-through feature on Qualcomm server platform without any issue.
>
> Tested-by: Shanker Donthineni <[email protected]>
Thanks!
No reason for not supporting SMMUv3 except that I don't have any HW to test on.
I will add this support in the next version.
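The port itself should be a near-verbatim copy of the SMMUv2 hunk; a
rough, untested sketch for arm-smmu-v3.c, with the PCI window part
dropped as discussed elsewhere in the thread:

#define MSI_IOVA_BASE                   0x8000000
#define MSI_IOVA_LENGTH                 0x100000

static void arm_smmu_get_resv_regions(struct device *dev,
                                      struct list_head *head)
{
        struct iommu_resv_region *region;

        /* Single hard-coded MSI window, as in the SMMUv2 patch */
        region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
                                         IOMMU_RESV_MSI);
        if (!region)
                return;

        list_add_tail(&region->list, head);
}

static void arm_smmu_put_resv_regions(struct device *dev,
                                      struct list_head *head)
{
        struct iommu_resv_region *entry, *next;

        list_for_each_entry_safe(entry, next, head, list)
                kfree(entry);
}

plus the matching .get_resv_regions/.put_resv_regions entries in the
SMMUv3 arm_smmu_ops.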
Thanks
Eric
Hi Robin,
On 07/12/2016 19:24, Robin Murphy wrote:
> On 07/12/16 15:02, Auger Eric wrote:
>> Hi Robin,
>> On 06/12/2016 19:55, Robin Murphy wrote:
>>> On 15/11/16 13:09, Eric Auger wrote:
>>>> The get() populates the list with the PCI host bridge windows
>>>> and the MSI IOVA range.
>>>>
>>>> At the moment an arbitray MSI IOVA window is set at 0x8000000
>>>> of size 1MB. This will allow to report those info in iommu-group
>>>> sysfs?
>>
>>
>> First thank you for reviewing the series. This is definitively helpful!
>>>>
>>>> Signed-off-by: Eric Auger <[email protected]>
>>>>
>>>> ---
>>>>
>>>> RFC v2 -> v3:
>>>> - use existing get/put_resv_regions
>>>>
>>>> RFC v1 -> v2:
>>>> - use defines for MSI IOVA base and length
>>>> ---
>>>> drivers/iommu/arm-smmu.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>> 1 file changed, 52 insertions(+)
>>>>
>>>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>>>> index 8f72814..81f1a83 100644
>>>> --- a/drivers/iommu/arm-smmu.c
>>>> +++ b/drivers/iommu/arm-smmu.c
>>>> @@ -278,6 +278,9 @@ enum arm_smmu_s2cr_privcfg {
>>>>
>>>> #define FSYNR0_WNR (1 << 4)
>>>>
>>>> +#define MSI_IOVA_BASE 0x8000000
>>>> +#define MSI_IOVA_LENGTH 0x100000
>>>> +
>>>> static int force_stage;
>>>> module_param(force_stage, int, S_IRUGO);
>>>> MODULE_PARM_DESC(force_stage,
>>>> @@ -1545,6 +1548,53 @@ static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
>>>> return iommu_fwspec_add_ids(dev, &fwid, 1);
>>>> }
>>>>
>>>> +static void arm_smmu_get_resv_regions(struct device *dev,
>>>> + struct list_head *head)
>>>> +{
>>>> + struct iommu_resv_region *region;
>>>> + struct pci_host_bridge *bridge;
>>>> + struct resource_entry *window;
>>>> +
>>>> + /* MSI region */
>>>> + region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
>>>> + IOMMU_RESV_MSI);
>>>> + if (!region)
>>>> + return;
>>>> +
>>>> + list_add_tail(&region->list, head);
>>>> +
>>>> + if (!dev_is_pci(dev))
>>>> + return;
>>>> +
>>>> + bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
>>>> +
>>>> + resource_list_for_each_entry(window, &bridge->windows) {
>>>> + phys_addr_t start;
>>>> + size_t length;
>>>> +
>>>> + if (resource_type(window->res) != IORESOURCE_MEM &&
>>>> + resource_type(window->res) != IORESOURCE_IO)
>>>
>>> As Joerg commented elsewhere, considering anything other than memory
>>> resources isn't right (I appreciate you've merely copied my own mistake
>>> here). We need some other way to handle root complexes where the CPU
>>> MMIO views of PCI windows appear in PCI memory space - using the I/O
>>> address of I/O resources only works by chance on Juno, and it still
>>> doesn't account for config space. I suggest we just leave that out for
>>> the time being to make life easier (does it even apply to anything other
>>> than Juno?) and figure it out later.
>> OK so I understand I should remove IORESOURCE_IO check.
>>>
>>>> + continue;
>>>> +
>>>> + start = window->res->start - window->offset;
>>>> + length = window->res->end - window->res->start + 1;
>>>> + region = iommu_alloc_resv_region(start, length,
>>>> + IOMMU_RESV_NOMAP);
>>>> + if (!region)
>>>> + return;
>>>> + list_add_tail(&region->list, head);
>>>> + }
>>>> +}
>>>
>>> Either way, there's nothing SMMU-specific about PCI windows. The fact
>>> that we'd have to copy-paste all of this into the SMMUv3 driver
>>> unchanged suggests it should go somewhere common (although I would be
>>> inclined to leave the insertion of the fake MSI region to driver-private
>>> wrappers). As I said before, the current iova_reserve_pci_windows()
>>> simply wants splitting into appropriate public callbacks for
>>> get_resv_regions and apply_resv_regions.
>> Do you mean somewhere common in the arm-smmu subsystem (new file) or in
>> another subsystem (pci?)
>>
>> More generally the current implementation does not handle the case where
>> any of those PCIe host bridge window collide with the MSI window. To me
>> this is a flaw.
>> 1) Either we take into account the PCIe windows and prevent any
>> collision when allocating the MSI window.
>> 2) or we do not care about PCIe host bridge windows at kernel level.
>
> Even more generally, the MSI window also needs to avoid any other
> IOMMU-specific reserved regions as well - fortunately I don't think
> there's any current intersection between platforms with RMRR-type
> reservations and platforms which require MSI mapping - so I think we've
> got enough freedom for the moment, but it's certainly an argument in
> favour of ultimately expressing PCI windows through the same mechanism
> to keep everything in the same place. The other big advantage of
> reserved regions is that they will automatically apply to DMA domains as
> well.
>
>> If 1), we are back to the original issue of where to put the MSI
>> window - obviously at a place which might not be QEMU-friendly anymore.
>> What allocation policy shall we use?
>>
>> Second option - sorry, I may look stubborn - which I definitely prefer
>> and which was also advocated by Alex: we handle PCI host bridge windows
>> at user level. The MSI window is reported through the iommu group sysfs.
>> PCIe host bridge windows can be enumerated through /proc/iomem. Both the
>> x86 IOMMUs and arm-smmu would report an MSI reserved window. The ARM MSI
>> window would become a de facto reserved window for guests.
>
> So from the ABI perspective, the sysfs iommu_group/*/reserved_regions
> represents a minimum set of regions (MSI, RMRR, etc.) which definitely
> *must* be reserved, but offers no guarantee that there aren't also other
> regions not represented there. That seems reasonable to start with, and
> still leaves us free to expand the scope of reserved regions in future
> without breaking anything.
>
>> Thoughts?
>
> I like the second option too - "grep PCI /proc/iomem" already catches
> more than enumerating the resources does (i.e. ECAM space) - and neither
> does it preclude growing the more extensive version on top over time.
>
> For the sake of moving forward, I'd be happy with just dropping the PCI
> stuff from here, and leaving the SMMU drivers exposing the single
> hard-coded MSI region directly (to be fair, it'd hardly be the first
> function which is identical between the two).
OK cool
Thanks
Eric
> We can take a look into
> making iommu-dma implement PCI windows as nomap resv_regions properly as
> an orthogonal thing (for the sake of DMA domains), after which we should
> be in a position to drop the hard-coding and start placing the MSI
> window dynamically where appropriate.
>
> Robin.
>
>>>> +static void arm_smmu_put_resv_regions(struct device *dev,
>>>> + struct list_head *head)
>>>> +{
>>>> + struct iommu_resv_region *entry, *next;
>>>> +
>>>> + list_for_each_entry_safe(entry, next, head, list)
>>>> + kfree(entry);
>>>> +}
>>>> +
>>>> static struct iommu_ops arm_smmu_ops = {
>>>> .capable = arm_smmu_capable,
>>>> .domain_alloc = arm_smmu_domain_alloc,
>>>> @@ -1560,6 +1610,8 @@ static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
>>>> .domain_get_attr = arm_smmu_domain_get_attr,
>>>> .domain_set_attr = arm_smmu_domain_set_attr,
>>>> .of_xlate = arm_smmu_of_xlate,
>>>> + .get_resv_regions = arm_smmu_get_resv_regions,
>>>> + .put_resv_regions = arm_smmu_put_resv_regions,
>>>> .pgsize_bitmap = -1UL, /* Restricted during device attach */
>>>> };
>>>>
>>>>
>>>
>>>
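For reference while respinning, a minimal sketch of what the callback could
reduce to once the PCI window enumeration is dropped, keeping only the
hard-coded MSI region. It reuses the iommu_alloc_resv_region()/IOMMU_RESV_MSI
interfaces from this series; the actual v4 patch may of course differ.

static void arm_smmu_get_resv_regions(struct device *dev,
				      struct list_head *head)
{
	struct iommu_resv_region *region;

	/* Expose only the fixed MSI window; PCI windows are left to userspace */
	region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
					 IOMMU_RESV_MSI);
	if (!region)
		return;

	list_add_tail(&region->list, head);
}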
Hi,
On 15/11/2016 14:09, Eric Auger wrote:
> Following LPC discussions, we now report reserved regions through
> iommu-group sysfs reserved_regions attribute file.
While I am respinning this series into v4, here is a tentative summary
of technical topics for which no consensus was reached at this point.
1) Shall we report the usable IOVA range instead of reserved IOVA
ranges. Not discussed at/after LPC.
x I currently report reserved regions. Alex expressed the need to
report the full usable IOVA range instead (x86 min-max range
minus MSI APIC window). I think this is meaningful for ARM
too where arm-smmu might not support the full 64b range.
x Any objection we report the usable IOVA regions instead?
2) Shall the kernel check collision with MSI window* when userspace
calls VFIO_IOMMU_MAP_DMA?
Joerg/Will No; Alex yes
*for IOVA regions consumed downstream to the IOMMU: everyone says NO
3) RMRR reporting in the iommu group sysfs? Joerg: yes; Don: no
My current series does not expose them in iommu group sysfs.
I understand we can expose the RMRR regions in the iommu group sysfs
without necessarily supporting RMRR requiring device assignment.
We can also add this support later.
Thanks
Eric
On 08/12/16 09:36, Auger Eric wrote:
> Hi,
>
> On 15/11/2016 14:09, Eric Auger wrote:
>> Following LPC discussions, we now report reserved regions through
>> iommu-group sysfs reserved_regions attribute file.
>
>
> While I am respinning this series into v4, here is a tentative summary
> of technical topics for which no consensus was reached at this point.
>
> 1) Shall we report the usable IOVA range instead of reserved IOVA
> ranges. Not discussed at/after LPC.
> x I currently report reserved regions. Alex expressed the need to
> report the full usable IOVA range instead (x86 min-max range
> minus MSI APIC window). I think this is meaningful for ARM
> too where arm-smmu might not support the full 64b range.
> x Any objection we report the usable IOVA regions instead?
The issue with that is that we can't actually report "the usable
regions" at the moment, as that involves pulling together disjoint
properties of arbitrary hardware unrelated to the IOMMU. We'd be
reporting "the not-definitely-unusable regions, which may have some
unusable holes in them still". That seems like an ABI nightmare - I'd
still much rather say "here are some, but not necessarily all, regions
you definitely can't use", because saying "here are some regions which
you might be able to use most of, probably" is what we're already doing
today, via a single implicit region from 0 to ULONG_MAX ;)
The address space limits are definitely useful to know, but I think it
would be better to expose them separately to avoid the ambiguity. At
worst, I guess it would be reasonable to express the limits via an
"out-of-range" reserved region type for 0 to $base and $top to
ULONG_MAX. To *safely* expose usable regions, we'd have to start out
with a very conservative assumption (e.g. only IOVAs matching physical
RAM), and only expand them once we're sure we can detect every possible
bit of problematic hardware in the system - that's just too limiting to
be useful. And if we expose something knowingly inaccurate, we risk
having another "bogoMIPS in /proc/cpuinfo" ABI burden on our hands, and
nobody wants that...
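As an illustration of that fallback, a driver that knows its input address
width could publish the limits through the same interface. This is a sketch
only: IOMMU_RESV_OUT_OF_RANGE is a hypothetical type name, not something
defined in this series.

static void foo_iommu_report_limits(unsigned int ias_bits,
				    struct list_head *head)
{
	struct iommu_resv_region *region;
	u64 top;

	/* Nothing to reserve if the full 64-bit IOVA space is addressable */
	if (ias_bits >= 64)
		return;

	/* First IOVA beyond the input address size */
	top = 1ULL << ias_bits;

	/* Report [top, ULONG_MAX] as unusable */
	region = iommu_alloc_resv_region(top, ULONG_MAX - top + 1,
					 IOMMU_RESV_OUT_OF_RANGE);
	if (region)
		list_add_tail(&region->list, head);
}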
> 2) Shall the kernel check collision with MSI window* when userspace
> calls VFIO_IOMMU_MAP_DMA?
> Joerg/Will No; Alex yes
> *for IOVA regions consumed downstream to the IOMMU: everyone says NO
If we're starting off by having the SMMU drivers expose it as a fake
fixed region, I don't think we need to worry about this yet. We all seem
to agree that as long as we communicate the fixed regions to userspace,
it's then userspace's job to work around them. Let's come back to this
one once we actually get to the point of dynamically sizing and
allocating 'real' MSI remapping region(s).
Ultimately, the kernel *will* police collisions either way, because an
underlying iommu_map() is going to fail if overlapping IOVAs are ever
actually used, so it's really just a question of whether to have a more
user-friendly failure mode.
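On the userspace side, working around the fixed regions it has read back
amounts to a simple interval test before issuing VFIO_IOMMU_MAP_DMA. A sketch
with a made-up resv_range structure standing in for whatever representation
userspace parses out of sysfs:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical userspace view of one reserved region (inclusive bounds) */
struct resv_range {
	uint64_t start;
	uint64_t end;
};

/*
 * Return true if [iova, iova + size) avoids every reported reserved
 * region, i.e. is safe to hand to VFIO_IOMMU_MAP_DMA as far as the
 * reservations are concerned.
 */
static bool iova_range_is_clear(const struct resv_range *resv, size_t nr,
				uint64_t iova, uint64_t size)
{
	uint64_t last = iova + size - 1;
	size_t i;

	for (i = 0; i < nr; i++)
		if (iova <= resv[i].end && last >= resv[i].start)
			return false;	/* overlaps a reserved region */

	return true;
}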
> 3) RMRR reporting in the iommu group sysfs? Joerg: yes; Don: no
> My current series does not expose them in iommu group sysfs.
> I understand we can expose the RMRR regions in the iommu group sysfs
> without necessarily supporting RMRR requiring device assignment.
> We can also add this support later.
As you say, reporting them doesn't necessitate allowing device
assignment, and it's information which can already be easily grovelled
out of dmesg (for intel-iommu at least) - there doesn't seem to be any
need to hide them, but the x86 folks can have the final word on that.
Robin.
Hi Robin,
On 08/12/2016 14:14, Robin Murphy wrote:
> On 08/12/16 09:36, Auger Eric wrote:
>> Hi,
>>
>> On 15/11/2016 14:09, Eric Auger wrote:
>>> Following LPC discussions, we now report reserved regions through
>>> iommu-group sysfs reserved_regions attribute file.
>>
>>
>> While I am respinning this series into v4, here is a tentative summary
>> of technical topics for which no consensus was reached at this point.
>>
>> 1) Shall we report the usable IOVA range instead of reserved IOVA
>> ranges. Not discussed at/after LPC.
>> x I currently report reserved regions. Alex expressed the need to
>> report the full usable IOVA range instead (x86 min-max range
>> minus MSI APIC window). I think this is meaningful for ARM
>> too where arm-smmu might not support the full 64b range.
>> x Any objection we report the usable IOVA regions instead?
>
> The issue with that is that we can't actually report "the usable
> regions" at the moment, as that involves pulling together disjoint
> properties of arbitrary hardware unrelated to the IOMMU. We'd be
> reporting "the not-definitely-unusable regions, which may have some
> unusable holes in them still". That seems like an ABI nightmare - I'd
> still much rather say "here are some, but not necessarily all, regions
> you definitely can't use", because saying "here are some regions which
> you might be able to use most of, probably" is what we're already doing
> today, via a single implicit region from 0 to ULONG_MAX ;)
>
> The address space limits are definitely useful to know, but I think it
> would be better to expose them separately to avoid the ambiguity. At
> worst, I guess it would be reasonable to express the limits via an
> "out-of-range" reserved region type for 0 to $base and $top to
> ULONG_MAX. To *safely* expose usable regions, we'd have to start out
> with a very conservative assumption (e.g. only IOVAs matching physical
> RAM), and only expand them once we're sure we can detect every possible
> bit of problematic hardware in the system - that's just too limiting to
> be useful. And if we expose something knowingly inaccurate, we risk
> having another "bogoMIPS in /proc/cpuinfo" ABI burden on our hands, and
> nobody wants that...
Makes sense to me. "out-of-range reserved region type for 0 to $base and
$top to ULONG_MAX" can be an alternative to fulfill the requirement.
>
>> 2) Shall the kernel check collision with MSI window* when userspace
>> calls VFIO_IOMMU_MAP_DMA?
>> Joerg/Will No; Alex yes
>> *for IOVA regions consumed downstream to the IOMMU: everyone says NO
>
> If we're starting off by having the SMMU drivers expose it as a fake
> fixed region, I don't think we need to worry about this yet. We all seem
> to agree that as long as we communicate the fixed regions to userspace,
> it's then userspace's job to work around them. Let's come back to this
> one once we actually get to the point of dynamically sizing and
> allocating 'real' MSI remapping region(s).
>
> Ultimately, the kernel *will* police collisions either way, because an
> underlying iommu_map() is going to fail if overlapping IOVAs are ever
> actually used, so it's really just a question of whether to have a more
> user-friendly failure mode.
That's true on ARM but not on x86 where the APIC MSI region is not
mapped I think.
>
>> 3) RMRR reporting in the iommu group sysfs? Joerg: yes; Don: no
>> My current series does not expose them in iommu group sysfs.
>> I understand we can expose the RMRR regions in the iommu group sysfs
>> without necessarily supporting RMRR requiring device assignment.
>> We can also add this support later.
>
> As you say, reporting them doesn't necessitate allowing device
> assignment, and it's information which can already be easily grovelled
> out of dmesg (for intel-iommu at least) - there doesn't seem to be any
> need to hide them, but the x86 folks can have the final word on that.
agreed
Thanks
Eric
On 08/12/16 13:36, Auger Eric wrote:
> Hi Robin,
>
> On 08/12/2016 14:14, Robin Murphy wrote:
>> On 08/12/16 09:36, Auger Eric wrote:
>>> Hi,
>>>
>>> On 15/11/2016 14:09, Eric Auger wrote:
>>>> Following LPC discussions, we now report reserved regions through
>>>> iommu-group sysfs reserved_regions attribute file.
>>>
>>>
>>> While I am respinning this series into v4, here is a tentative summary
>>> of technical topics for which no consensus was reached at this point.
>>>
>>> 1) Shall we report the usable IOVA range instead of reserved IOVA
>>> ranges. Not discussed at/after LPC.
>>> x I currently report reserved regions. Alex expressed the need to
>>> report the full usable IOVA range instead (x86 min-max range
>>> minus MSI APIC window). I think this is meaningful for ARM
>>> too where arm-smmu might not support the full 64b range.
>>> x Any objection we report the usable IOVA regions instead?
>>
>> The issue with that is that we can't actually report "the usable
>> regions" at the moment, as that involves pulling together disjoint
>> properties of arbitrary hardware unrelated to the IOMMU. We'd be
>> reporting "the not-definitely-unusable regions, which may have some
>> unusable holes in them still". That seems like an ABI nightmare - I'd
>> still much rather say "here are some, but not necessarily all, regions
>> you definitely can't use", because saying "here are some regions which
>> you might be able to use most of, probably" is what we're already doing
>> today, via a single implicit region from 0 to ULONG_MAX ;)
>>
>> The address space limits are definitely useful to know, but I think it
>> would be better to expose them separately to avoid the ambiguity. At
>> worst, I guess it would be reasonable to express the limits via an
>> "out-of-range" reserved region type for 0 to $base and $top to
>> ULONG_MAX. To *safely* expose usable regions, we'd have to start out
>> with a very conservative assumption (e.g. only IOVAs matching physical
>> RAM), and only expand them once we're sure we can detect every possible
>> bit of problematic hardware in the system - that's just too limiting to
>> be useful. And if we expose something knowingly inaccurate, we risk
>> having another "bogoMIPS in /proc/cpuinfo" ABI burden on our hands, and
>> nobody wants that...
> Makes sense to me. "out-of-range reserved region type for 0 to $base and
> $top to ULONG_MAX" can be an alternative to fulfill the requirement.
>>
>>> 2) Shall the kernel check collision with MSI window* when userspace
>>> calls VFIO_IOMMU_MAP_DMA?
>>> Joerg/Will No; Alex yes
>>> *for IOVA regions consumed downstream to the IOMMU: everyone says NO
>>
>> If we're starting off by having the SMMU drivers expose it as a fake
>> fixed region, I don't think we need to worry about this yet. We all seem
>> to agree that as long as we communicate the fixed regions to userspace,
>> it's then userspace's job to work around them. Let's come back to this
>> one once we actually get to the point of dynamically sizing and
>> allocating 'real' MSI remapping region(s).
>>
>> Ultimately, the kernel *will* police collisions either way, because an
>> underlying iommu_map() is going to fail if overlapping IOVAs are ever
>> actually used, so it's really just a question of whether to have a more
>> user-friendly failure mode.
> That's true on ARM but not on x86 where the APIC MSI region is not
> mapped I think.
Yes, but the APIC interrupt region is fixed, i.e. it falls under "IOVA
regions consumed downstream to the IOMMU" since the DMAR units are
physically incapable of remapping those addresses. I take "MSI window"
to mean specifically the thing we have to set aside and get a magic
remapping cookie for, which is also why it warrants its own special
internal type - we definitely *don't* want VFIO trying to set up a
remapping cookie on x86. We just don't let userspace know about the
difference, yet (if ever).
Robin.
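For what it's worth, a sketch of how vfio-iommu-type1 could apply that
distinction when deciding whether to set up a cookie. Helper and field names
(iommu_get_group_resv_regions(), iommu_get_msi_cookie(), the type field) are
assumed from this series and may differ in detail; freeing the region list
through the driver's put_resv_regions path is omitted.

static int vfio_maybe_get_msi_cookie(struct iommu_domain *domain,
				     struct iommu_group *group)
{
	struct iommu_resv_region *region;
	LIST_HEAD(resv_regions);

	iommu_get_group_resv_regions(group, &resv_regions);

	list_for_each_entry(region, &resv_regions, list) {
		/* Only a remappable MSI window needs a cookie */
		if (region->type == IOMMU_RESV_MSI)
			return iommu_get_msi_cookie(domain, region->start);
		/* IOMMU_RESV_NOMAP (e.g. the x86 APIC window) needs nothing */
	}

	return 0;
}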
On Thu, 8 Dec 2016 13:14:04 +0000
Robin Murphy <[email protected]> wrote:
> On 08/12/16 09:36, Auger Eric wrote:
> > 3) RMRR reporting in the iommu group sysfs? Joerg: yes; Don: no
> > My current series does not expose them in iommu group sysfs.
> > I understand we can expose the RMRR regions in the iommu group sysfs
> > without necessarily supporting RMRR requiring device assignment.
> > We can also add this support later.
>
> As you say, reporting them doesn't necessitate allowing device
> assignment, and it's information which can already be easily grovelled
> out of dmesg (for intel-iommu at least) - there doesn't seem to be any
> need to hide them, but the x86 folks can have the final word on that.
Eric and I talked about this and I don't see the value in identifying
an RMRR as anything other than a reserved range for a device. It's not
userspace's job to maintain an identity mapped range for the device,
and it can't be trusted to do so anyway. It does throw a kink in the
machinery though as an RMRR is a reserved memory range unique to a
device. It doesn't really fit into a monolithic /sys/class/iommu view
of global reserved regions as an RMRR is only relevant to the device
paths affected.
Another kink is that sometimes we know what the RMRR is for, know that
it's irrelevant for our use case, and ignore it. This is true for USB
and Intel graphics use cases of RMRRs.
Also, aside from the above mentioned cases, devices with RMRRs are
currently excluded from participating in the IOMMU API by the
intel-iommu driver and I expect this to continue in the general case
regardless of whether the ranges are more easily exposed to userspace.
ARM may have to deal with mangling a guest memory map due to lack of
any standard layout, de facto or otherwise, but for x86 I don't think
it's worth the migration and hotplug implications. Thanks,
Alex
On 08/12/16 17:01, Alex Williamson wrote:
> On Thu, 8 Dec 2016 13:14:04 +0000
> Robin Murphy <[email protected]> wrote:
>> On 08/12/16 09:36, Auger Eric wrote:
>>> 3) RMRR reporting in the iommu group sysfs? Joerg: yes; Don: no
>>> My current series does not expose them in iommu group sysfs.
>>> I understand we can expose the RMRR regions in the iommu group sysfs
>>> without necessarily supporting RMRR requiring device assignment.
>>> We can also add this support later.
>>
>> As you say, reporting them doesn't necessitate allowing device
>> assignment, and it's information which can already be easily grovelled
>> out of dmesg (for intel-iommu at least) - there doesn't seem to be any
>> need to hide them, but the x86 folks can have the final word on that.
>
> Eric and I talked about this and I don't see the value in identifying
> an RMRR as anything other than a reserved range for a device. It's not
> userspace's job to maintain an identity mapped range for the device,
> and it can't be trusted to do so anyway. It does throw a kink in the
> machinery though as an RMRR is a reserved memory range unique to a
> device. It doesn't really fit into a monolithic /sys/class/iommu view
> of global reserved regions as an RMRR is only relevant to the device
> paths affected.
I think we're in violent agreement then - to clarify, I was thinking in
terms of patch 7 of this series, where everything relevant to a
particular group would be exposed as just an opaque "don't use this
address range" regardless of the internal type.
I'm less convinced the kernel has any need to provide its own 'global'
view of reservations which strictly are always at least per-IOMMU, if
not per-root-complex, even when all the instances do share the same
address by design. The group-based interface fits the reality neatly,
and userspace can easily iterate all the groups if it wants to consider
everything. Plus if it doesn't want to, then it needn't bother reserving
anything which doesn't apply to the group(s) it's going to bind to VFIO.
Robin.
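For instance, a userspace consumer could walk every group and dump the
attribute. A sketch assuming the /sys/kernel/iommu_groups/<N>/reserved_regions
layout used by this series; the per-line format is deliberately left
uninterpreted here since it is still under discussion.

#include <dirent.h>
#include <stdio.h>

int main(void)
{
	const char *base = "/sys/kernel/iommu_groups";
	struct dirent *de;
	DIR *dir = opendir(base);

	if (!dir)
		return 1;

	while ((de = readdir(dir)) != NULL) {
		char path[256], line[128];
		FILE *f;

		if (de->d_name[0] == '.')
			continue;	/* skip . and .. */

		snprintf(path, sizeof(path), "%s/%s/reserved_regions",
			 base, de->d_name);
		f = fopen(path, "r");
		if (!f)
			continue;

		while (fgets(line, sizeof(line), f))
			printf("group %s: %s", de->d_name, line);
		fclose(f);
	}

	closedir(dir);
	return 0;
}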
On 12/08/2016 04:36 AM, Auger Eric wrote:
> Hi,
>
> On 15/11/2016 14:09, Eric Auger wrote:
>> Following LPC discussions, we now report reserved regions through
>> iommu-group sysfs reserved_regions attribute file.
>
>
> While I am respinning this series into v4, here is a tentative summary
> of technical topics for which no consensus was reached at this point.
>
> 1) Shall we report the usable IOVA range instead of reserved IOVA
> ranges. Not discussed at/after LPC.
> x I currently report reserved regions. Alex expressed the need to
> report the full usable IOVA range instead (x86 min-max range
> minus MSI APIC window). I think this is meaningful for ARM
> too where arm-smmu might not support the full 64b range.
> x Any objection we report the usable IOVA regions instead?
>
> 2) Shall the kernel check collision with MSI window* when userspace
> calls VFIO_IOMMU_MAP_DMA?
> Joerg/Will No; Alex yes
> *for IOVA regions consumed downstream to the IOMMU: everyone says NO
>
> 3) RMRR reporting in the iommu group sysfs? Joerg: yes; Don: no
Um, I'm missing context, but the only thing I recall saying no to wrt RMRR
is that _any_ device that has an RMRR cannot be assigned to a guest.
Or, are you saying, RMRRs should be exposed in the guest OS? If so, then
you have my 'no' there.
> My current series does not expose them in iommu group sysfs.
> I understand we can expose the RMRR regions in the iommu group sysfs
> without necessarily supporting RMRR requiring device assignment.
This sentence doesn't make sense to me.
Can you try re-wording it?
I can't tell what RMRR has to do w/device assignment, other than what I said above.
Exposing RMRRs in sysfs is not an issue in general.
> We can also add this support later.
Hi Don,
On 11/12/2016 03:05, Don Dutile wrote:
> On 12/08/2016 04:36 AM, Auger Eric wrote:
>> Hi,
>>
>> On 15/11/2016 14:09, Eric Auger wrote:
>>> Following LPC discussions, we now report reserved regions through
>>> iommu-group sysfs reserved_regions attribute file.
>>
>>
>> While I am respinning this series into v4, here is a tentative summary
>> of technical topics for which no consensus was reached at this point.
>>
>> 1) Shall we report the usable IOVA range instead of reserved IOVA
>> ranges. Not discussed at/after LPC.
>> x I currently report reserved regions. Alex expressed the need to
>> report the full usable IOVA range instead (x86 min-max range
>> minus MSI APIC window). I think this is meaningful for ARM
>> too where arm-smmu might not support the full 64b range.
>> x Any objection we report the usable IOVA regions instead?
>>
>> 2) Shall the kernel check collision with MSI window* when userspace
>> calls VFIO_IOMMU_MAP_DMA?
>> Joerg/Will No; Alex yes
>> *for IOVA regions consumed downstream to the IOMMU: everyone says NO
>>
>> 3) RMRR reporting in the iommu group sysfs? Joerg: yes; Don: no
> Um, I'm missing context, but the only thing I recall saying no to wrt RMRR
> is that _any_ device that has an RMRR cannot be assigned to a guest.
Yes, that was my understanding.
> Or, are you saying, RMRR's should be exposed in the guest os? if so, then
> you have my 'no' there.
>
>> My current series does not expose them in iommu group sysfs.
>> I understand we can expose the RMRR regions in the iommu group sysfs
>> without necessarily supporting RMRR requiring device assignment.
> This sentence doesn't make sense to me.
> Can you try re-wording it?
> I can't tell what RMRR has to do w/device assignment, other than what I
> said above.
> Exposing RMRR's in sysfs is not an issue in general.
Sorry for the confusion. I meant we can expose RMRR regions as part of
the reserved regions through the iommu group sysfs API without
supporting assignment of devices that have RMRRs.
Hope that clarifies things.
Eric
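To make that concrete, the vt-d get_resv_regions callback could report each
RMRR that applies to the device as just another reserved region, without
touching the assignment policy at all. A rough sketch only: rmrr_base/rmrr_end
stand in for the real DMAR unit fields, and IOMMU_RESV_DIRECT is an assumed
name for an identity-mapped region type - neither is taken from this series
as posted.

static void intel_report_rmrr(u64 rmrr_base, u64 rmrr_end,
			      struct list_head *head)
{
	struct iommu_resv_region *region;
	size_t length = rmrr_end - rmrr_base + 1;

	/* Expose the identity-mapped range so userspace knows to avoid it */
	region = iommu_alloc_resv_region(rmrr_base, length,
					 IOMMU_RESV_DIRECT);
	if (region)
		list_add_tail(&region->list, head);
}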