When Shared Virtual Address (SVA) is enabled for a guest OS via
vIOMMU, we need to provide invalidation support at the IOMMU API and
driver level. This patch adds an Intel VT-d specific function to
implement the IOMMU passdown invalidate API for shared virtual address.

The use case is to support caching structure invalidation
of assigned SVM capable devices. The emulated IOMMU exposes queue
invalidation capability and passes down all descriptors from the guest
to the physical IOMMU.

The assumption is that the guest to host device ID mapping is
resolved prior to calling the IOMMU driver. Based on the device handle,
the host IOMMU driver can replace certain fields before submitting to the
invalidation queue.
---
v7 review fixed in v10
---
Signed-off-by: Jacob Pan <[email protected]>
Signed-off-by: Ashok Raj <[email protected]>
Signed-off-by: Liu, Yi L <[email protected]>
---
drivers/iommu/intel-iommu.c | 182 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 182 insertions(+)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index b1477cd423dd..a76afb0fd51a 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5619,6 +5619,187 @@ static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
aux_domain_remove_dev(to_dmar_domain(domain), dev);
}
+/*
+ * 2D array for converting and sanitizing IOMMU generic TLB granularity to
+ * VT-d granularity. Invalidation is typically included in the unmap operation
+ * as a result of a DMA or VFIO unmap. However, for assigned devices the guest
+ * owns the first level page tables. Invalidations of translation caches in the
+ * guest are trapped and passed down to the host.
+ *
+ * vIOMMU in the guest will only expose first level page tables, therefore
+ * we do not include IOTLB granularity for requests without PASID (second level).
+ *
+ * For example, to find the VT-d granularity encoding for IOTLB
+ * type and page selective granularity within PASID:
+ * X: indexed by iommu cache type
+ * Y: indexed by enum iommu_inv_granularity
+ * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
+ *
+ * The inv_type_granu_map array indicates whether a given table entry is
+ * valid. 1: valid, 0: invalid
+ */
+const static int inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
+ /*
+ * PASID based IOTLB invalidation: PASID selective (per PASID),
+ * page selective (address granularity)
+ */
+ {0, 1, 1},
+ /* PASID based dev TLBs, only support all PASIDs or single PASID */
+ {1, 1, 0},
+ /* PASID cache */
+ {1, 1, 0}
+};
+
+const static int inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
+ /* PASID based IOTLB */
+ {0, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
+ /* PASID based dev TLBs */
+ {QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0},
+ /* PASID cache */
+ {QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, 0},
+};
+
+static inline int to_vtd_granularity(int type, int granu, int *vtd_granu)
+{
+ if (type >= IOMMU_CACHE_INV_TYPE_NR || granu >= IOMMU_INV_GRANU_NR ||
+ !inv_type_granu_map[type][granu])
+ return -EINVAL;
+
+ *vtd_granu = inv_type_granu_table[type][granu];
+
+ return 0;
+}
+
+static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
+{
+ u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
+
+ /*
+  * VT-d size is encoded as 2^size of 4K pages, 0 for 4K, 9 for 2MB,
+  * etc. The IOMMU cache invalidate API passes granu_size in bytes and
+  * the number of granules in contiguous memory.
+  */
+ return order_base_2(nr_pages);
+}
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+static int intel_iommu_sva_invalidate(struct iommu_domain *domain,
+ struct device *dev, struct iommu_cache_invalidate_info *inv_info)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct device_domain_info *info;
+ struct intel_iommu *iommu;
+ unsigned long flags;
+ int cache_type;
+ u8 bus, devfn;
+ u16 did, sid;
+ int ret = 0;
+ u64 size = 0;
+
+ if (!inv_info || !dmar_domain ||
+ inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
+ return -EINVAL;
+
+ if (!dev || !dev_is_pci(dev))
+ return -ENODEV;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ spin_lock(&iommu->lock);
+ info = iommu_support_dev_iotlb(dmar_domain, iommu, bus, devfn);
+ if (!info) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ did = dmar_domain->iommu_did[iommu->seq_id];
+ sid = PCI_DEVID(bus, devfn);
+
+ /* Size is only valid in non-PASID selective invalidation */
+ if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
+ size = to_vtd_size(inv_info->addr_info.granule_size,
+ inv_info->addr_info.nb_granules);
+
+ for_each_set_bit(cache_type, (unsigned long *)&inv_info->cache, IOMMU_CACHE_INV_TYPE_NR) {
+ int granu = 0;
+ u64 pasid = 0;
+
+ ret = to_vtd_granularity(cache_type, inv_info->granularity, &granu);
+ if (ret) {
+ pr_err("Invalid cache type and granu combination %d/%d\n", cache_type,
+ inv_info->granularity);
+ break;
+ }
+
+ /* PASID is stored in different locations based on granularity */
+ if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
+ inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)
+ pasid = inv_info->pasid_info.pasid;
+ else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
+ inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)
+ pasid = inv_info->addr_info.pasid;
+ else {
+ pr_err("Cannot find PASID for given cache type and granularity\n");
+ break;
+ }
+
+ switch (BIT(cache_type)) {
+ case IOMMU_CACHE_INV_TYPE_IOTLB:
+ if ((inv_info->granularity != IOMMU_INV_GRANU_PASID) &&
+ size && (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
+ pr_err("Address out of range, 0x%llx, size order %llu\n",
+ inv_info->addr_info.addr, size);
+ ret = -ERANGE;
+ goto out_unlock;
+ }
+
+ qi_flush_piotlb(iommu, did,
+ pasid,
+ mm_to_dma_pfn(inv_info->addr_info.addr),
+ (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
+ inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
+
+ /*
+ * Always flush device IOTLB if ATS is enabled since guest
+ * vIOMMU exposes CM = 1, no device IOTLB flush will be passed
+ * down.
+ */
+ if (info->ats_enabled) {
+ qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
+ pasid, info->ats_qdep,
+ inv_info->addr_info.addr, size,
+ granu);
+ }
+ break;
+ case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
+ if (info->ats_enabled) {
+ qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
+ inv_info->addr_info.pasid, info->ats_qdep,
+ inv_info->addr_info.addr, size,
+ granu);
+ } else
+ pr_warn("Passdown device IOTLB flush w/o ATS!\n");
+
+ break;
+ case IOMMU_CACHE_INV_TYPE_PASID:
+ qi_flush_pasid_cache(iommu, did, granu, inv_info->pasid_info.pasid);
+
+ break;
+ default:
+ dev_err(dev, "Unsupported IOMMU invalidation type %d\n",
+ cache_type);
+ ret = -EINVAL;
+ }
+ }
+out_unlock:
+ spin_unlock(&iommu->lock);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return ret;
+}
+#endif
+
static int intel_iommu_map(struct iommu_domain *domain,
unsigned long iova, phys_addr_t hpa,
size_t size, int iommu_prot, gfp_t gfp)
@@ -6204,6 +6385,7 @@ const struct iommu_ops intel_iommu_ops = {
.is_attach_deferred = intel_iommu_is_attach_deferred,
.pgsize_bitmap = INTEL_IOMMU_PGSIZES,
#ifdef CONFIG_INTEL_IOMMU_SVM
+ .cache_invalidate = intel_iommu_sva_invalidate,
.sva_bind_gpasid = intel_svm_bind_gpasid,
.sva_unbind_gpasid = intel_svm_unbind_gpasid,
#endif
--
2.7.4
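
A minimal user-space sketch (not part of the patch) of the size encoding
implemented by to_vtd_size() above, assuming VTD_PAGE_SHIFT is 12 (4K
pages) as in the kernel, and substituting a simple loop for the kernel's
order_base_2():

	#include <stdio.h>

	#define VTD_PAGE_SHIFT 12 /* 4K pages, as in the kernel */

	/* stand-in for the kernel's order_base_2(): ceiling of log2 */
	static unsigned long long order2(unsigned long long n)
	{
		unsigned long long order = 0;

		while ((1ULL << order) < n)
			order++;
		return order;
	}

	/* mirrors to_vtd_size() from the patch */
	static unsigned long long to_vtd_size(unsigned long long granu_size,
					      unsigned long long nr_granules)
	{
		return order2((granu_size * nr_granules) >> VTD_PAGE_SHIFT);
	}

	int main(void)
	{
		printf("4K x 1   -> order %llu\n", to_vtd_size(4096, 1));    /* 0 */
		printf("4K x 512 -> order %llu\n", to_vtd_size(4096, 512));  /* 9, i.e. 2MB */
		printf("2M x 1   -> order %llu\n", to_vtd_size(2 << 20, 1)); /* also 9 */
		return 0;
	}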
> From: Jacob Pan <[email protected]>
> Sent: Saturday, March 21, 2020 7:28 AM
>
> When Shared Virtual Address (SVA) is enabled for a guest OS via
> vIOMMU, we need to provide invalidation support at IOMMU API and driver
> level. This patch adds Intel VT-d specific function to implement
> iommu passdown invalidate API for shared virtual address.
>
> The use case is for supporting caching structure invalidation
> of assigned SVM capable devices. Emulated IOMMU exposes queue
emulated IOMMU -> vIOMMU, since virtio-iommu could use the
interface as well.
> invalidation capability and passes down all descriptors from the guest
> to the physical IOMMU.
>
> The assumption is that guest to host device ID mapping should be
> resolved prior to calling IOMMU driver. Based on the device handle,
> host IOMMU driver can replace certain fields before submit to the
> invalidation queue.
>
> ---
> v7 review fixed in v10
> ---
>
> Signed-off-by: Jacob Pan <[email protected]>
> Signed-off-by: Ashok Raj <[email protected]>
> Signed-off-by: Liu, Yi L <[email protected]>
> ---
> drivers/iommu/intel-iommu.c | 182 ++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 182 insertions(+)
>
> diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> index b1477cd423dd..a76afb0fd51a 100644
> --- a/drivers/iommu/intel-iommu.c
> +++ b/drivers/iommu/intel-iommu.c
> @@ -5619,6 +5619,187 @@ static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
> aux_domain_remove_dev(to_dmar_domain(domain), dev);
> }
>
> +/*
> + * 2D array for converting and sanitizing IOMMU generic TLB granularity to
> + * VT-d granularity. Invalidation is typically included in the unmap operation
> + * as a result of DMA or VFIO unmap. However, for assigned devices guest
> + * owns the first level page tables. Invalidations of translation caches in the
> + * guest are trapped and passed down to the host.
> + *
> + * vIOMMU in the guest will only expose first level page tables, therefore
> + * we do not include IOTLB granularity for request without PASID (second level).
I would revise above as "We do not support IOTLB granularity for request
without PASID (second level), therefore any vIOMMU implementation that
exposes the SVA capability to the guest should only expose the first level
page tables, implying all invalidation requests from the guest will include
a valid PASID"
> + *
> + * For example, to find the VT-d granularity encoding for IOTLB
> + * type and page selective granularity within PASID:
> + * X: indexed by iommu cache type
> + * Y: indexed by enum iommu_inv_granularity
> + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
> + *
> + * Granu_map array indicates validity of the table. 1: valid, 0: invalid
> + *
> + */
> +const static int inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> + /*
> + * PASID based IOTLB invalidation: PASID selective (per PASID),
> + * page selective (address granularity)
> + */
> + {0, 1, 1},
> + /* PASID based dev TLBs, only support all PASIDs or single PASID */
> + {1, 1, 0},
Is this combination correct? When a single PASID is being specified, it is
essentially a page-selective invalidation since you need to provide Address
and Size.
> + /* PASID cache */
PASID cache is fully managed by the host. Guest PASID cache invalidation
is interpreted by vIOMMU for bind and unbind operations. I don't think
we should accept any PASID cache invalidation from userspace or guest.
> + {1, 1, 0}
> +};
> +
> +const static int inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> + /* PASID based IOTLB */
> + {0, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
> + /* PASID based dev TLBs */
> + {QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0},
> + /* PASID cache */
> + {QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, 0},
> +};
> +
> +static inline int to_vtd_granularity(int type, int granu, int *vtd_granu)
> +{
> + if (type >= IOMMU_CACHE_INV_TYPE_NR || granu >= IOMMU_INV_GRANU_NR ||
> + !inv_type_granu_map[type][granu])
> + return -EINVAL;
> +
> + *vtd_granu = inv_type_granu_table[type][granu];
> +
btw do we really need both map and table here? Can't we just
use one table with unsupported granularity marked as a special
value?
> + return 0;
> +}
> +
> +static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
> +{
> + u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
> +
> + /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
> + * IOMMU cache invalidate API passes granu_size in bytes, and number of
> + * granu size in contiguous memory.
> + */
> + return order_base_2(nr_pages);
> +}
> +
> +#ifdef CONFIG_INTEL_IOMMU_SVM
> +static int intel_iommu_sva_invalidate(struct iommu_domain *domain,
> + struct device *dev, struct iommu_cache_invalidate_info *inv_info)
> +{
> + struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> + struct device_domain_info *info;
> + struct intel_iommu *iommu;
> + unsigned long flags;
> + int cache_type;
> + u8 bus, devfn;
> + u16 did, sid;
> + int ret = 0;
> + u64 size = 0;
> +
> + if (!inv_info || !dmar_domain ||
> + inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
> + return -EINVAL;
> +
> + if (!dev || !dev_is_pci(dev))
> + return -ENODEV;
> +
> + iommu = device_to_iommu(dev, &bus, &devfn);
> + if (!iommu)
> + return -ENODEV;
> +
> + spin_lock_irqsave(&device_domain_lock, flags);
> + spin_lock(&iommu->lock);
> + info = iommu_support_dev_iotlb(dmar_domain, iommu, bus, devfn);
> + if (!info) {
> + ret = -EINVAL;
> + goto out_unlock;
-ENOTSUPP?
> + }
> + did = dmar_domain->iommu_did[iommu->seq_id];
> + sid = PCI_DEVID(bus, devfn);
> +
> + /* Size is only valid in non-PASID selective invalidation */
> + if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
> + size = to_vtd_size(inv_info->addr_info.granule_size,
> + inv_info->addr_info.nb_granules);
> +
> + for_each_set_bit(cache_type, (unsigned long *)&inv_info->cache, IOMMU_CACHE_INV_TYPE_NR) {
> + int granu = 0;
> + u64 pasid = 0;
> +
> + ret = to_vtd_granularity(cache_type, inv_info->granularity, &granu);
> + if (ret) {
> + pr_err("Invalid cache type and granu
> combination %d/%d\n", cache_type,
> + inv_info->granularity);
> + break;
> + }
> +
> + /* PASID is stored in different locations based on granularity */
> + if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
> + inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)
> + pasid = inv_info->pasid_info.pasid;
> + else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
> + inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)
> + pasid = inv_info->addr_info.pasid;
> + else {
> + pr_err("Cannot find PASID for given cache type and
> granularity\n");
> + break;
> + }
> +
> + switch (BIT(cache_type)) {
> + case IOMMU_CACHE_INV_TYPE_IOTLB:
> + if ((inv_info->granularity != IOMMU_INV_GRANU_PASID) &&
granularity == IOMMU_INV_GRANU_ADDR? otherwise it's unclear
why IOMMU_INV_GRANU_DOMAIN also needs size check.
> + size && (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
> + pr_err("Address out of range, 0x%llx, size order %llu\n",
> + inv_info->addr_info.addr, size);
> + ret = -ERANGE;
> + goto out_unlock;
> + }
> +
> + qi_flush_piotlb(iommu, did,
> + pasid,
> + mm_to_dma_pfn(inv_info->addr_info.addr),
> + (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
> + inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
> +
> + /*
> + * Always flush device IOTLB if ATS is enabled since guest
> + * vIOMMU exposes CM = 1, no device IOTLB flush will be passed
> + * down.
> + */
Does VT-d spec mention that no device IOTLB flush is required when CM=1?
> + if (info->ats_enabled) {
> + qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> + pasid, info->ats_qdep,
> + inv_info->addr_info.addr, size,
> + granu);
> + }
> + break;
> + case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
> + if (info->ats_enabled) {
> + qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> + inv_info->addr_info.pasid, info->ats_qdep,
> + inv_info->addr_info.addr, size,
> + granu);
I'm confused here. There are two granularities allowed for devtlb, but here
you only handle one of them?
> + } else
> + pr_warn("Passdown device IOTLB flush w/o
> ATS!\n");
> +
> + break;
> + case IOMMU_CACHE_INV_TYPE_PASID:
> + qi_flush_pasid_cache(iommu, did, granu, inv_info->pasid_info.pasid);
> +
as earlier comment, we shouldn't allow userspace or guest to invalidate
PASID cache
> + break;
> + default:
> + dev_err(dev, "Unsupported IOMMU invalidation type %d\n",
> + cache_type);
> + ret = -EINVAL;
> + }
> + }
> +out_unlock:
> + spin_unlock(&iommu->lock);
> + spin_unlock_irqrestore(&device_domain_lock, flags);
> +
> + return ret;
> +}
> +#endif
> +
> static int intel_iommu_map(struct iommu_domain *domain,
> unsigned long iova, phys_addr_t hpa,
> size_t size, int iommu_prot, gfp_t gfp)
> @@ -6204,6 +6385,7 @@ const struct iommu_ops intel_iommu_ops = {
> .is_attach_deferred = intel_iommu_is_attach_deferred,
> .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
> #ifdef CONFIG_INTEL_IOMMU_SVM
> + .cache_invalidate = intel_iommu_sva_invalidate,
> .sva_bind_gpasid = intel_svm_bind_gpasid,
> .sva_unbind_gpasid = intel_svm_unbind_gpasid,
> #endif
> --
> 2.7.4
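
To make the size-check comment above concrete: a sketch of the tightened
condition inside intel_iommu_sva_invalidate() (an illustration of the
review point against the patch's code, not a tested change). It restricts
the alignment check to address-selective requests, since domain-selective
requests carry no addr_info to validate:

	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && size &&
	    (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
		pr_err("Address out of range, 0x%llx, size order %llu\n",
		       inv_info->addr_info.addr, size);
		ret = -ERANGE;
		goto out_unlock;
	}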
Hi,
On 3/28/20 11:01 AM, Tian, Kevin wrote:
>> +const static int inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
>> + /*
>> + * PASID based IOTLB invalidation: PASID selective (per PASID),
>> + * page selective (address granularity)
>> + */
>> + {0, 1, 1},
>> + /* PASID based dev TLBs, only support all PASIDs or single PASID */
>> + {1, 1, 0},
>
> Is this combination correct? when single PASID is being specified, it is
> essentially a page-selective invalidation since you need provide Address
> and Size.
Isn't it the same when G=1? Still the addr/size is used. Doesn't it
correspond to IOMMU_INV_GRANU_ADDR with IOMMU_INV_ADDR_FLAGS_PASID flag
unset?
so {0, 0, 1}?
Thanks
Eric
On 3/28/20 11:01 AM, Tian, Kevin wrote:
>> + /* PASID cache */
>
> PASID cache is fully managed by the host. Guest PASID cache invalidation
> is interpreted by vIOMMU for bind and unbind operations. I don't think
> we should accept any PASID cache invalidation from userspace or guest.
I tend to agree here.
>
>> + {1, 1, 0}
>> +};
>> +
>> +const static int inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
>> + /* PASID based IOTLB */
>> + {0, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
>> + /* PASID based dev TLBs */
>> + {QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0},
>> + /* PASID cache */
>> + {QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, 0},
>> +};
>> +
>> +static inline int to_vtd_granularity(int type, int granu, int *vtd_granu)
>> +{
>> + if (type >= IOMMU_CACHE_INV_TYPE_NR || granu >= IOMMU_INV_GRANU_NR ||
>> + !inv_type_granu_map[type][granu])
>> + return -EINVAL;
>> +
>> + *vtd_granu = inv_type_granu_table[type][granu];
>> +
>
> btw do we really need both map and table here? Can't we just
> use one table with unsupported granularity marked as a special
> value?
I asked the same question some time ago. If I remember correctly, the
issue is that while a granu can be supported in inv_type_granu_map, the
associated value in inv_type_granu_table can be 0. This typically
matches both values of the G field (0 or 1) in the invalidation cmd. See
other comment below.
>> + case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
>> + if (info->ats_enabled) {
>> + qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
>> + inv_info->addr_info.pasid, info->ats_qdep,
>> + inv_info->addr_info.addr, size,
>> + granu);
>
> I'm confused here. There are two granularities allowed for devtlb, but here
> you only handle one of them?
granu is the result of to_vtd_granularity() so it can take either of the
2 values.
Thanks
Eric
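
To make Eric's last point concrete, a sketch (following the patch's
tables, illustration only) of how both dev TLB granularities resolve via
to_vtd_granularity() and land in the same qi_flush_dev_iotlb_pasid()
call, differing only in the granu argument. The first parameter is the
cache_type bit index as produced by for_each_set_bit() in the patch,
here 1 for the dev TLB entries:

	int granu = 0;

	/* all-PASIDs request from the guest */
	to_vtd_granularity(1, IOMMU_INV_GRANU_DOMAIN, &granu);
	/* -> granu == QI_DEV_IOTLB_GRAN_ALL */

	/* single-PASID request from the guest */
	to_vtd_granularity(1, IOMMU_INV_GRANU_PASID, &granu);
	/* -> granu == QI_DEV_IOTLB_GRAN_PASID_SEL */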
Hi Jacob,
On 3/21/20 12:27 AM, Jacob Pan wrote:
> + /* PASID is stored in different locations based on granularity */
> + if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
> + inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)
> + pasid = inv_info->pasid_info.pasid;
> + else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
> + inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)
> + pasid = inv_info->addr_info.pasid;
> + else {
> + pr_err("Cannot find PASID for given cache type and granularity\n");
I don't get this error msg. In case of domain-selective invalidation,
PASID is not used, so if I am not wrong you will end up here even though
there is no issue.
> + break;
> + }
> +
> + switch (BIT(cache_type)) {
> + case IOMMU_CACHE_INV_TYPE_IOTLB:
> + if ((inv_info->granularity != IOMMU_INV_GRANU_PASID) &&
> + size && (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
> + pr_err("Address out of range, 0x%llx, size order %llu\n",
> + inv_info->addr_info.addr, size);
> + ret = -ERANGE;
> + goto out_unlock;
> + }
> +
> + qi_flush_piotlb(iommu, did,
> + pasid,
> + mm_to_dma_pfn(inv_info->addr_info.addr),
> + (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
> + inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
> +
> + /*
> + * Always flush device IOTLB if ATS is enabled since guest
> + * vIOMMU exposes CM = 1, no device IOTLB flush will be passed
> + * down.
> + */
> + if (info->ats_enabled) {
nit: {} not needed
> + qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> + pasid, info->ats_qdep,
> + inv_info->addr_info.addr, size,
> + granu);
> + }
> + break;
> + case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
> + if (info->ats_enabled) {
nit: {} not needed
> + qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> + inv_info->addr_info.pasid, info->ats_qdep,
> + inv_info->addr_info.addr, size,
> + granu);
> + } else
> + pr_warn("Passdown device IOTLB flush w/o ATS!\n");
> +
nit: extra line
> + break;
> + case IOMMU_CACHE_INV_TYPE_PASID:
> + qi_flush_pasid_cache(iommu, did, granu, inv_info->pasid_info.pasid);
> +
nit: extra line
> + break;
> + default:
> + dev_err(dev, "Unsupported IOMMU invalidation type %d\n",
> + cache_type);
> + ret = -EINVAL;
> + }
> + }
> +out_unlock:
> + spin_unlock(&iommu->lock);
> + spin_unlock_irqrestore(&device_domain_lock, flags);
> +
> + return ret;
> +}
> +#endif
> +
> static int intel_iommu_map(struct iommu_domain *domain,
> unsigned long iova, phys_addr_t hpa,
> size_t size, int iommu_prot, gfp_t gfp)
> @@ -6204,6 +6385,7 @@ const struct iommu_ops intel_iommu_ops = {
> .is_attach_deferred = intel_iommu_is_attach_deferred,
> .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
> #ifdef CONFIG_INTEL_IOMMU_SVM
> + .cache_invalidate = intel_iommu_sva_invalidate,
> .sva_bind_gpasid = intel_svm_bind_gpasid,
> .sva_unbind_gpasid = intel_svm_unbind_gpasid,
> #endif
>
Thanks
Eric
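
One way to address the pr_err concern above, sketched against the
patch's structures (an illustration, not a tested fix): treat a missing
PASID as an error only when the granularity actually carries one, so
domain-selective invalidations no longer hit the error path:

	u64 pasid = 0; /* 0 here simply means "no PASID in this request" */

	if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
	    inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)
		pasid = inv_info->pasid_info.pasid;
	else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
		 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)
		pasid = inv_info->addr_info.pasid;
	/* IOMMU_INV_GRANU_DOMAIN: no PASID in the request, not an error */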
> From: Auger Eric <[email protected]>
> Sent: Sunday, March 29, 2020 11:34 PM
>
> Hi,
>
> On 3/28/20 11:01 AM, Tian, Kevin wrote:
> >> +const static int inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> >> + /*
> >> + * PASID based IOTLB invalidation: PASID selective (per PASID),
> >> + * page selective (address granularity)
> >> + */
> >> + {0, 1, 1},
> >> + /* PASID based dev TLBs, only support all PASIDs or single PASID */
> >> + {1, 1, 0},
> >
> > Is this combination correct? when single PASID is being specified, it is
> > essentially a page-selective invalidation since you need provide Address
> > and Size.
> Isn't it the same when G=1? Still the addr/size is used. Doesn't it
I thought addr/size is not used when G=1, but it might be wrong. I'm
checking with our vt-d spec owner.
> correspond to IOMMU_INV_GRANU_ADDR with IOMMU_INV_ADDR_FLAGS_PASID flag unset?
>
> so {0, 0, 1}?
I have one more open:

How does userspace know which invalidation type/gran is supported?
I didn't see such capability reporting in Yi's VFIO vSVA patch set. Do we
want the user/kernel to assume the same capability set if they are
architectural? However, the kernel could also do some optimization,
e.g. hide the devtlb invalidation capability given that the kernel already
invalidates the devtlb automatically when serving iotlb invalidation...
Thanks
Kevin
> From: Auger Eric <[email protected]>
> Sent: Monday, March 30, 2020 12:05 AM
>
> On 3/28/20 11:01 AM, Tian, Kevin wrote:
> >> From: Jacob Pan <[email protected]>
> >> Sent: Saturday, March 21, 2020 7:28 AM
> >>
> >> When Shared Virtual Address (SVA) is enabled for a guest OS via
> >> vIOMMU, we need to provide invalidation support at IOMMU API and
> driver
> >> level. This patch adds Intel VT-d specific function to implement
> >> iommu passdown invalidate API for shared virtual address.
> >>
> >> The use case is for supporting caching structure invalidation
> >> of assigned SVM capable devices. Emulated IOMMU exposes queue
> >
> > emulated IOMMU -> vIOMMU, since virito-iommu could use the
> > interface as well.
> >
> >> invalidation capability and passes down all descriptors from the guest
> >> to the physical IOMMU.
> >>
> >> The assumption is that guest to host device ID mapping should be
> >> resolved prior to calling IOMMU driver. Based on the device handle,
> >> host IOMMU driver can replace certain fields before submit to the
> >> invalidation queue.
> >>
> >> ---
> >> v7 review fixed in v10
> >> ---
> >>
> >> Signed-off-by: Jacob Pan <[email protected]>
> >> Signed-off-by: Ashok Raj <[email protected]>
> >> Signed-off-by: Liu, Yi L <[email protected]>
> >> ---
> >> drivers/iommu/intel-iommu.c | 182
> >> ++++++++++++++++++++++++++++++++++++++++++++
> >> 1 file changed, 182 insertions(+)
> >>
> >> diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> >> index b1477cd423dd..a76afb0fd51a 100644
> >> --- a/drivers/iommu/intel-iommu.c
> >> +++ b/drivers/iommu/intel-iommu.c
> >> @@ -5619,6 +5619,187 @@ static void
> >> intel_iommu_aux_detach_device(struct iommu_domain *domain,
> >> aux_domain_remove_dev(to_dmar_domain(domain), dev);
> >> }
> >>
> >> +/*
> >> + * 2D array for converting and sanitizing IOMMU generic TLB granularity to
> >> + * VT-d granularity. Invalidation is typically included in the unmap operation
> >> + * as a result of DMA or VFIO unmap. However, for assigned devices guest
> >> + * owns the first level page tables. Invalidations of translation caches in the
> >> + * guest are trapped and passed down to the host.
> >> + *
> >> + * vIOMMU in the guest will only expose first level page tables, therefore
> >> + * we do not include IOTLB granularity for request without PASID (second level).
> >
> > I would revise above as "We do not support IOTLB granularity for request
> > without PASID (second level), therefore any vIOMMU implementation that
> > exposes the SVA capability to the guest should only expose the first level
> > page tables, implying all invalidation requests from the guest will include
> > a valid PASID"
> >
> >> + *
> >> + * For example, to find the VT-d granularity encoding for IOTLB
> >> + * type and page selective granularity within PASID:
> >> + * X: indexed by iommu cache type
> >> + * Y: indexed by enum iommu_inv_granularity
> >> + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
> >> + *
> >> + * Granu_map array indicates validity of the table. 1: valid, 0: invalid
> >> + *
> >> + */
> >> +const static int inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> >> + /*
> >> + * PASID based IOTLB invalidation: PASID selective (per PASID),
> >> + * page selective (address granularity)
> >> + */
> >> + {0, 1, 1},
> >> + /* PASID based dev TLBs, only support all PASIDs or single PASID */
> >> + {1, 1, 0},
> >
> > Is this combination correct? when single PASID is being specified, it is
> > essentially a page-selective invalidation since you need provide Address
> > and Size.
> >
> >> + /* PASID cache */
> >
> > PASID cache is fully managed by the host. Guest PASID cache invalidation
> > is interpreted by vIOMMU for bind and unbind operations. I don't think
> > we should accept any PASID cache invalidation from userspace or guest.
> I tend to agree here.
> >
> >> + {1, 1, 0}
> >> +};
> >> +
> >> +const static int inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> >> +	/* PASID based IOTLB */
> >> +	{0, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
> >> +	/* PASID based dev TLBs */
> >> +	{QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0},
> >> +	/* PASID cache */
> >> +	{QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, 0},
> >> +};
> >> +
> >> +static inline int to_vtd_granularity(int type, int granu, int *vtd_granu)
> >> +{
> >> +	if (type >= IOMMU_CACHE_INV_TYPE_NR || granu >= IOMMU_INV_GRANU_NR ||
> >> +	    !inv_type_granu_map[type][granu])
> >> + return -EINVAL;
> >> +
> >> + *vtd_granu = inv_type_granu_table[type][granu];
> >> +
> >
> > btw do we really need both map and table here? Can't we just
> > use one table with unsupported granularity marked as a special
> > value?
> I asked the same question some time ago. If I remember correctly the
> issue is while a granu can be supported in inv_type_granu_map, the
> associated value in inv_type_granu_table can be 0. This typically
> matches both values of G field (0 or 1) in the invalidation cmd. See
> other comment below.
I didn't fully understand it. Also, what does a value of '0' imply? It's
also interesting to see below in [PATCH 07/11]:
+/* QI Dev-IOTLB inv granu */
+#define QI_DEV_IOTLB_GRAN_ALL 1
+#define QI_DEV_IOTLB_GRAN_PASID_SEL 0
+
> >
> >> + return 0;
> >> +}
> >> +
> >> +static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
> >> +{
> >> + u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
> >> +
> >> + /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
> >> +	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
> >> +	 * granu size in contiguous memory.
> >> +	 */
> >> +	return order_base_2(nr_pages);
> >> +}
> >> +
> >> +#ifdef CONFIG_INTEL_IOMMU_SVM
> >> +static int intel_iommu_sva_invalidate(struct iommu_domain *domain,
> >> +		struct device *dev, struct iommu_cache_invalidate_info *inv_info)
> >> +{
> >> + struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> >> + struct device_domain_info *info;
> >> + struct intel_iommu *iommu;
> >> + unsigned long flags;
> >> + int cache_type;
> >> + u8 bus, devfn;
> >> + u16 did, sid;
> >> + int ret = 0;
> >> + u64 size = 0;
> >> +
> >> +	if (!inv_info || !dmar_domain ||
> >> +	    inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
> >> + return -EINVAL;
> >> +
> >> + if (!dev || !dev_is_pci(dev))
> >> + return -ENODEV;
> >> +
> >> + iommu = device_to_iommu(dev, &bus, &devfn);
> >> + if (!iommu)
> >> + return -ENODEV;
> >> +
> >> + spin_lock_irqsave(&device_domain_lock, flags);
> >> + spin_lock(&iommu->lock);
> >> + info = iommu_support_dev_iotlb(dmar_domain, iommu, bus, devfn);
> >> + if (!info) {
> >> + ret = -EINVAL;
> >> + goto out_unlock;
> >
> > -ENOTSUPP?
> >
> >> + }
> >> + did = dmar_domain->iommu_did[iommu->seq_id];
> >> + sid = PCI_DEVID(bus, devfn);
> >> +
> >> + /* Size is only valid in non-PASID selective invalidation */
> >> + if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
> >> + size = to_vtd_size(inv_info->addr_info.granule_size,
> >> + inv_info->addr_info.nb_granules);
> >> +
> >> +	for_each_set_bit(cache_type, (unsigned long *)&inv_info->cache, IOMMU_CACHE_INV_TYPE_NR) {
> >> +		int granu = 0;
> >> +		u64 pasid = 0;
> >> +
> >> +		ret = to_vtd_granularity(cache_type, inv_info->granularity, &granu);
> >> +		if (ret) {
> >> +			pr_err("Invalid cache type and granu combination %d/%d\n",
> >> +				cache_type, inv_info->granularity);
> >> + break;
> >> + }
> >> +
> >> +		/* PASID is stored in different locations based on granularity */
> >> +		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
> >> +		    inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)
> >> +			pasid = inv_info->pasid_info.pasid;
> >> +		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
> >> +			 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)
> >> +			pasid = inv_info->addr_info.pasid;
> >> +		else {
> >> +			pr_err("Cannot find PASID for given cache type and granularity\n");
> >> +			break;
> >> +		}
> >> +
> >> +		switch (BIT(cache_type)) {
> >> +		case IOMMU_CACHE_INV_TYPE_IOTLB:
> >> +			if ((inv_info->granularity != IOMMU_INV_GRANU_PASID) &&
> >
> > granularity == IOMMU_INV_GRANU_ADDR? otherwise it's unclear
> > why IOMMU_INV_GRANU_DOMAIN also needs size check.
> >
> >> +			    size && (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
> >> +				pr_err("Address out of range, 0x%llx, size order %llu\n",
> >> +					inv_info->addr_info.addr, size);
> >> +				ret = -ERANGE;
> >> +				goto out_unlock;
> >> +			}
> >> +
> >> +			qi_flush_piotlb(iommu, did, pasid,
> >> +					mm_to_dma_pfn(inv_info->addr_info.addr),
> >> +					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
> >> +					inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
> >> +
> >> +			/*
> >> +			 * Always flush device IOTLB if ATS is enabled since guest
> >> +			 * vIOMMU exposes CM = 1, no device IOTLB flush will be passed
> >> +			 * down.
> >> +			 */
> >
> > Does VT-d spec mention that no device IOTLB flush is required when CM=1?
> >
> >> +			if (info->ats_enabled) {
> >> +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> >> +						pasid, info->ats_qdep,
> >> +						inv_info->addr_info.addr, size,
> >> +						granu);
> >> +			}
> >> +			break;
> >> +		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
> >> +			if (info->ats_enabled) {
> >> +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> >> +						inv_info->addr_info.pasid, info->ats_qdep,
> >> +						inv_info->addr_info.addr, size,
> >> +						granu);
> >
> > I'm confused here. There are two granularities allowed for devtlb, but here
> > you only handle one of them?
> granu is the result of to_vtd_granularity() so it can take either of the
> 2 values.
yes, you're right.
>
> Thanks
>
> Eric
> >
> >> +			} else
> >> +				pr_warn("Passdown device IOTLB flush w/o ATS!\n");
> >> +
> >> +			break;
> >> +		case IOMMU_CACHE_INV_TYPE_PASID:
> >> +			qi_flush_pasid_cache(iommu, did, granu, inv_info->pasid_info.pasid);
> >> +
> >
> > as earlier comment, we shouldn't allow userspace or guest to invalidate
> > PASID cache
> >
> >> + break;
> >> +		default:
> >> +			dev_err(dev, "Unsupported IOMMU invalidation type %d\n",
> >> +				cache_type);
> >> + ret = -EINVAL;
> >> + }
> >> + }
> >> +out_unlock:
> >> + spin_unlock(&iommu->lock);
> >> + spin_unlock_irqrestore(&device_domain_lock, flags);
> >> +
> >> + return ret;
> >> +}
> >> +#endif
> >> +
> >> static int intel_iommu_map(struct iommu_domain *domain,
> >> unsigned long iova, phys_addr_t hpa,
> >> size_t size, int iommu_prot, gfp_t gfp)
> >> @@ -6204,6 +6385,7 @@ const struct iommu_ops intel_iommu_ops = {
> >> .is_attach_deferred = intel_iommu_is_attach_deferred,
> >> .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
> >> #ifdef CONFIG_INTEL_IOMMU_SVM
> >> + .cache_invalidate = intel_iommu_sva_invalidate,
> >> .sva_bind_gpasid = intel_svm_bind_gpasid,
> >> .sva_unbind_gpasid = intel_svm_unbind_gpasid,
> >> #endif
> >> --
> >> 2.7.4
> >
On Sat, 28 Mar 2020 10:01:42 +0000
"Tian, Kevin" <[email protected]> wrote:
> > From: Jacob Pan <[email protected]>
> > Sent: Saturday, March 21, 2020 7:28 AM
> >
> > When Shared Virtual Address (SVA) is enabled for a guest OS via
> > vIOMMU, we need to provide invalidation support at IOMMU API and
> > driver level. This patch adds Intel VT-d specific function to
> > implement iommu passdown invalidate API for shared virtual address.
> >
> > The use case is for supporting caching structure invalidation
> > of assigned SVM capable devices. Emulated IOMMU exposes queue
>
> emulated IOMMU -> vIOMMU, since virtio-iommu could use the
> interface as well.
>
True, but it does not invalidate this statement about emulated IOMMU. I
will add another statement saying "the same interface can be used for
virtio-IOMMU as well". OK?
> > invalidation capability and passes down all descriptors from the
> > guest to the physical IOMMU.
> >
> > The assumption is that guest to host device ID mapping should be
> > resolved prior to calling IOMMU driver. Based on the device handle,
> > host IOMMU driver can replace certain fields before submit to the
> > invalidation queue.
> >
> > ---
> > v7 review fixed in v10
> > ---
> >
> > Signed-off-by: Jacob Pan <[email protected]>
> > Signed-off-by: Ashok Raj <[email protected]>
> > Signed-off-by: Liu, Yi L <[email protected]>
> > ---
> > drivers/iommu/intel-iommu.c | 182
> > ++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 182 insertions(+)
> >
> > diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> > index b1477cd423dd..a76afb0fd51a 100644
> > --- a/drivers/iommu/intel-iommu.c
> > +++ b/drivers/iommu/intel-iommu.c
> > @@ -5619,6 +5619,187 @@ static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
> >  	aux_domain_remove_dev(to_dmar_domain(domain), dev);
> >  }
> >
> > +/*
> > + * 2D array for converting and sanitizing IOMMU generic TLB granularity to
> > + * VT-d granularity. Invalidation is typically included in the unmap operation
> > + * as a result of DMA or VFIO unmap. However, for assigned devices guest
> > + * owns the first level page tables. Invalidations of translation caches in the
> > + * guest are trapped and passed down to the host.
> > + *
> > + * vIOMMU in the guest will only expose first level page tables, therefore
> > + * we do not include IOTLB granularity for request without PASID (second level).
>
> I would revise above as "We do not support IOTLB granularity for
> request without PASID (second level), therefore any vIOMMU
> implementation that exposes the SVA capability to the guest should
> only expose the first level page tables, implying all invalidation
> requests from the guest will include a valid PASID"
>
Sounds good.
> > + *
> > + * For example, to find the VT-d granularity encoding for IOTLB
> > + * type and page selective granularity within PASID:
> > + * X: indexed by iommu cache type
> > + * Y: indexed by enum iommu_inv_granularity
> > + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
> > + *
> > + * Granu_map array indicates validity of the table. 1: valid, 0:
> > invalid
> > + *
> > + */
> > +const static int inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > +	/*
> > +	 * PASID based IOTLB invalidation: PASID selective (per PASID),
> > +	 * page selective (address granularity)
> > +	 */
> > +	{0, 1, 1},
> > +	/* PASID based dev TLBs, only support all PASIDs or single PASID */
> > +	{1, 1, 0},
>
> Is this combination correct? when single PASID is being specified, it
> is essentially a page-selective invalidation since you need provide
> Address and Size.
>
This is for translation from the generic UAPI granu to the VT-d granu; it
has nothing to do with address and size.
e.g.
If user passes IOMMU_INV_GRANU_PASID for the single PASID case as you
mentioned, this map table shows it is valid.
Then the lookup result will get VT-d granu:
QI_DEV_IOTLB_GRAN_PASID_SEL, which means G=0.
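To make the two-step lookup concrete, here is a small stand-alone sketch
of exactly this case (constants mirrored from the series, for illustration
only; the real code indexes 2D arrays by cache type as well):

	#include <stdio.h>

	/* Values mirrored from the patch series, illustration only */
	#define QI_DEV_IOTLB_GRAN_ALL		1
	#define QI_DEV_IOTLB_GRAN_PASID_SEL	0

	enum { INV_GRANU_DOMAIN, INV_GRANU_PASID, INV_GRANU_ADDR, INV_GRANU_NR };

	/* devTLB rows of inv_type_granu_map/inv_type_granu_table */
	static const int granu_map[INV_GRANU_NR]   = { 1, 1, 0 };
	static const int granu_table[INV_GRANU_NR] = {
		QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0
	};

	int main(void)
	{
		int granu = INV_GRANU_PASID;	/* user asked for single PASID */

		if (!granu_map[granu])
			return 1;	/* would be -EINVAL in the driver */

		/* valid per the map even though the table entry is 0, i.e. G=0 */
		printf("vtd granu = %d\n", granu_table[granu]);
		return 0;
	}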
> > + /* PASID cache */
>
> PASID cache is fully managed by the host. Guest PASID cache
> invalidation is interpreted by vIOMMU for bind and unbind operations.
> I don't think we should accept any PASID cache invalidation from
> userspace or guest.
>
True for vIOMMU; this is here for completeness. It can be used by virtio
IOMMU, since PASID cache (PC) flush is inclusive (IOTLB, devTLB) and thus
more efficient.
> > + {1, 1, 0}
> > +};
> > +
> > +const static int inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > +	/* PASID based IOTLB */
> > +	{0, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
> > +	/* PASID based dev TLBs */
> > +	{QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0},
> > +	/* PASID cache */
> > +	{QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, 0},
> > +};
> > +
> > +static inline int to_vtd_granularity(int type, int granu, int *vtd_granu)
> > +{
> > +	if (type >= IOMMU_CACHE_INV_TYPE_NR || granu >= IOMMU_INV_GRANU_NR ||
> > +	    !inv_type_granu_map[type][granu])
> > +		return -EINVAL;
> > +
> > + *vtd_granu = inv_type_granu_table[type][granu];
> > +
>
> btw do we really need both map and table here? Can't we just
> use one table with unsupported granularity marked as a special
> value?
>
Yes, for value = 1. e.g. G=0 but still valid.
> > + return 0;
> > +}
> > +
> > +static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
> > +{
> > +	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
> > +
> > +	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
> > +	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
> > +	 * granu size in contiguous memory.
> > +	 */
> > +	return order_base_2(nr_pages);
> > +}
> > +
> > +#ifdef CONFIG_INTEL_IOMMU_SVM
> > +static int intel_iommu_sva_invalidate(struct iommu_domain *domain,
> > +		struct device *dev, struct iommu_cache_invalidate_info *inv_info)
> > +{
> > +	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> > +	struct device_domain_info *info;
> > +	struct intel_iommu *iommu;
> > +	unsigned long flags;
> > +	int cache_type;
> > +	u8 bus, devfn;
> > +	u16 did, sid;
> > +	int ret = 0;
> > +	u64 size = 0;
> > +
> > +	if (!inv_info || !dmar_domain ||
> > +	    inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
> > +		return -EINVAL;
> > +
> > + if (!dev || !dev_is_pci(dev))
> > + return -ENODEV;
> > +
> > + iommu = device_to_iommu(dev, &bus, &devfn);
> > + if (!iommu)
> > + return -ENODEV;
> > +
> > + spin_lock_irqsave(&device_domain_lock, flags);
> > + spin_lock(&iommu->lock);
> > +	info = iommu_support_dev_iotlb(dmar_domain, iommu, bus, devfn);
> > + if (!info) {
> > + ret = -EINVAL;
> > + goto out_unlock;
>
> -ENOTSUPP?
>
I guess it can go either way in that the error is based on invalid
inputs.
> > + }
> > + did = dmar_domain->iommu_did[iommu->seq_id];
> > + sid = PCI_DEVID(bus, devfn);
> > +
> > +	/* Size is only valid in non-PASID selective invalidation */
> > +	if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
> > +		size = to_vtd_size(inv_info->addr_info.granule_size,
> > +				   inv_info->addr_info.nb_granules);
> > +
> > +	for_each_set_bit(cache_type, (unsigned long *)&inv_info->cache, IOMMU_CACHE_INV_TYPE_NR) {
> > +		int granu = 0;
> > +		u64 pasid = 0;
> > +
> > +		ret = to_vtd_granularity(cache_type, inv_info->granularity, &granu);
> > +		if (ret) {
> > +			pr_err("Invalid cache type and granu combination %d/%d\n",
> > +				cache_type, inv_info->granularity);
> > +			break;
> > +		}
> > +
> > +		/* PASID is stored in different locations based on granularity */
> > +		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
> > +		    inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)
> > +			pasid = inv_info->pasid_info.pasid;
> > +		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
> > +			 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)
> > +			pasid = inv_info->addr_info.pasid;
> > +		else {
> > +			pr_err("Cannot find PASID for given cache type and granularity\n");
> > +			break;
> > +		}
> > +
> > + switch (BIT(cache_type)) {
> > + case IOMMU_CACHE_INV_TYPE_IOTLB:
> > +			if ((inv_info->granularity != IOMMU_INV_GRANU_PASID) &&
>
> granularity == IOMMU_INV_GRANU_ADDR? otherwise it's unclear
> why IOMMU_INV_GRANU_DOMAIN also needs size check.
>
Good point! will fix.
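i.e. check the ADDR granularity explicitly, something like:

		case IOMMU_CACHE_INV_TYPE_IOTLB:
			if ((inv_info->granularity == IOMMU_INV_GRANU_ADDR) &&
			    size && (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {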
> > +			    size && (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
> > +				pr_err("Address out of range, 0x%llx, size order %llu\n",
> > +					inv_info->addr_info.addr, size);
> > +				ret = -ERANGE;
> > +				goto out_unlock;
> > +			}
> > +
> > +			qi_flush_piotlb(iommu, did, pasid,
> > +					mm_to_dma_pfn(inv_info->addr_info.addr),
> > +					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
> > +					inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
> > +
> > +			/*
> > +			 * Always flush device IOTLB if ATS is enabled since guest
> > +			 * vIOMMU exposes CM = 1, no device IOTLB flush will be passed
> > +			 * down.
> > +			 */
>
> Does VT-d spec mention that no device IOTLB flush is required when
> CM=1?
>
Not explicitly. Just following the guideline in CH6.1 for efficient
virtualization. Early on, we also had discussion on supporting virtio
where IOTLB flush is inclusive.
Let me rephrase the comment:
/*
* Always flush device IOTLB if ATS is enabled. vIOMMU
* in the guest may assume IOTLB flush is inclusive,
* which is more efficient.
*/
> > +			if (info->ats_enabled) {
> > +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> > +						pasid, info->ats_qdep,
> > +						inv_info->addr_info.addr, size,
> > +						granu);
> > +			}
> > +			break;
> > +		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
> > +			if (info->ats_enabled) {
> > +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> > +						inv_info->addr_info.pasid, info->ats_qdep,
> > +						inv_info->addr_info.addr, size,
> > +						granu);
>
> I'm confused here. There are two granularities allowed for devtlb,
> but here you only handle one of them?
>
granu is passed into the flush function, which can be 1 or 0.
> > +			} else
> > +				pr_warn("Passdown device IOTLB flush w/o ATS!\n");
> > +
> > +			break;
> > +		case IOMMU_CACHE_INV_TYPE_PASID:
> > +			qi_flush_pasid_cache(iommu, did, granu, inv_info->pasid_info.pasid);
> > +
>
> as earlier comment, we shouldn't allow userspace or guest to
> invalidate PASID cache
>
same explanation :)
> > +			break;
> > +		default:
> > +			dev_err(dev, "Unsupported IOMMU invalidation type %d\n",
> > +				cache_type);
> > +			ret = -EINVAL;
> > + }
> > + }
> > +out_unlock:
> > + spin_unlock(&iommu->lock);
> > + spin_unlock_irqrestore(&device_domain_lock, flags);
> > +
> > + return ret;
> > +}
> > +#endif
> > +
> > static int intel_iommu_map(struct iommu_domain *domain,
> > unsigned long iova, phys_addr_t hpa,
> > size_t size, int iommu_prot, gfp_t gfp)
> > @@ -6204,6 +6385,7 @@ const struct iommu_ops intel_iommu_ops = {
> > 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
> > 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
> > #ifdef CONFIG_INTEL_IOMMU_SVM
> > + .cache_invalidate = intel_iommu_sva_invalidate,
> > .sva_bind_gpasid = intel_svm_bind_gpasid,
> > .sva_unbind_gpasid = intel_svm_unbind_gpasid,
> > #endif
> > --
> > 2.7.4
>
[Jacob Pan]
On Tue, 31 Mar 2020 02:49:21 +0000
"Tian, Kevin" <[email protected]> wrote:
> > From: Auger Eric <[email protected]>
> > Sent: Sunday, March 29, 2020 11:34 PM
> >
> > Hi,
> >
> > On 3/28/20 11:01 AM, Tian, Kevin wrote:
> > >> From: Jacob Pan <[email protected]>
> > >> Sent: Saturday, March 21, 2020 7:28 AM
> > >>
> > >> When Shared Virtual Address (SVA) is enabled for a guest OS via
> > >> vIOMMU, we need to provide invalidation support at IOMMU API
> > >> and
> > driver
> > >> level. This patch adds Intel VT-d specific function to implement
> > >> iommu passdown invalidate API for shared virtual address.
> > >>
> > >> The use case is for supporting caching structure invalidation
> > >> of assigned SVM capable devices. Emulated IOMMU exposes queue
> [...]
> [...]
> > to
> > >> + * VT-d granularity. Invalidation is typically included in the
> > >> unmap
> > operation
> > >> + * as a result of DMA or VFIO unmap. However, for assigned
> > >> devices
> > guest
> > >> + * owns the first level page tables. Invalidations of
> > >> translation caches in
> > the
> [...]
> [...]
> [...]
> > >> inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > >> +	/*
> > >> +	 * PASID based IOTLB invalidation: PASID selective (per PASID),
> > >> +	 * page selective (address granularity)
> > >> +	 */
> > >> +	{0, 1, 1},
> > >> +	/* PASID based dev TLBs, only support all PASIDs or single PASID */
> > >> +	{1, 1, 0},
> > >
> > > Is this combination correct? when single PASID is being
> > > specified, it is essentially a page-selective invalidation since
> > > you need provide Address and Size.
> > Isn't it the same when G=1? Still the addr/size is used. Doesn't
> > it
>
> I thought addr/size is not used when G=1, but it might be wrong. I'm
> checking with our vt-d spec owner.
>
> > correspond to IOMMU_INV_GRANU_ADDR with IOMMU_INV_ADDR_FLAGS_PASID flag unset?
> >
> > so {0, 0, 1}?
>
I am not sure I got your logic. The three fields correspond to
IOMMU_INV_GRANU_DOMAIN, /* domain-selective invalidation */
IOMMU_INV_GRANU_PASID, /* PASID-selective invalidation */
IOMMU_INV_GRANU_ADDR, /* page-selective invalidation */
For devTLB, we use domain as global since there is no domain. Then I
came up with {1, 1, 0}, which means we could have global and pasid
granu invalidation for PASID based devTLB.
If the caller also provides addr and the S bit, the flush routine will put
that into the QI descriptor. I know this is a little odd, but from the
granu translation p.o.v. the VT-d spec has no G bit for page selective
invalidation.
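To spell out the devTLB row I have in mind (values as in the patch, the
annotation is just for this discussion):

	/*
	 * devTLB row, indexed by generic granularity:
	 *   IOMMU_INV_GRANU_DOMAIN -> QI_DEV_IOTLB_GRAN_ALL       (G=1, all PASIDs)
	 *   IOMMU_INV_GRANU_PASID  -> QI_DEV_IOTLB_GRAN_PASID_SEL (G=0, one PASID)
	 *   IOMMU_INV_GRANU_ADDR   -> 0, marked invalid in inv_type_granu_map
	 */
	{QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0},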
> I have one more open:
>
> How does userspace know which invalidation type/gran is supported?
> I didn't see such capability reporting in Yi's VFIO vSVA patch set.
> Do we want the user/kernel assume the same capability set if they are
> architectural? However the kernel could also do some optimization
> e.g. hide devtlb invalidation capability given that the kernel
> already invalidate devtlb automatically when serving iotlb
> invalidation...
>
In general, we are trending to use VFIO capability chain to expose iommu
capabilities.
But for architectural features such as type/granu, we have to assume
the same capability between host & guest. Granu and types are not
enumerated on the host IOMMU either.
For devTLB optimization, I agree we need to expose a capability to
the guest stating that implicit devtlb invalidation is supported.
Otherwise, a Linux guest running on host OSes that do not support implicit
devtlb invalidation would be broken.
Right Yi?
> Thanks
> Kevin
>
> >
> > Thanks
> >
> > Eric
> >
> > >
> > >> + /* PASID cache */
> > >
> > > PASID cache is fully managed by the host. Guest PASID cache
> > > invalidation is interpreted by vIOMMU for bind and unbind
> > > operations. I don't think we should accept any PASID cache
> > > invalidation from userspace or guest.
> [...]
> > inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU
> [...]
> > >
> > > btw do we really need both map and table here? Can't we just
> > > use one table with unsupported granularity marked as a special
> > > value?
> > >
> [...]
> > >
> > > -ENOTSUPP?
> > >
> [...]
> > >
> > > granularity == IOMMU_INV_GRANU_ADDR? otherwise it's unclear
> > > why IOMMU_INV_GRANU_DOMAIN also needs size check.
> > >
> [...]
> > >>> addr_info.addr),
> [...]
> [...]
> > >> + if (info->ats_enabled) {
> > >> + qi_flush_dev_iotlb_pasid(iommu,
> > >> sid, info-
> > >>> pfsid,
> [...]
> > >>> pfsid,
> > >> +
> > >> inv_info->addr_info.pasid, info->ats_qdep,
> > >> +
> > >> inv_info->addr_info.addr, size,
> > >> + granu);
> [...]
> [...]
> > >>> pasid_info.pasid);
> > >> +
> > >
> > > as earlier comment, we shouldn't allow userspace or guest to
> > > invalidate PASID cache
> > >
> [...]
> > >
>
[Jacob Pan]
On Tue, 31 Mar 2020 03:34:22 +0000
"Tian, Kevin" <[email protected]> wrote:
> > From: Auger Eric <[email protected]>
> > Sent: Monday, March 30, 2020 12:05 AM
> >
> > On 3/28/20 11:01 AM, Tian, Kevin wrote:
> > >> From: Jacob Pan <[email protected]>
> > >> Sent: Saturday, March 21, 2020 7:28 AM
> > >>
> > >> When Shared Virtual Address (SVA) is enabled for a guest OS via
> > >> vIOMMU, we need to provide invalidation support at IOMMU API
> > >> and
> > driver
> > >> level. This patch adds Intel VT-d specific function to implement
> > >> iommu passdown invalidate API for shared virtual address.
> > >>
> > >> The use case is for supporting caching structure invalidation
> > >> of assigned SVM capable devices. Emulated IOMMU exposes queue
> > >
> > > emulated IOMMU -> vIOMMU, since virtio-iommu could use the
> > > interface as well.
> > >
> > >> invalidation capability and passes down all descriptors from the
> > >> guest to the physical IOMMU.
> > >>
> > >> The assumption is that guest to host device ID mapping should be
> > >> resolved prior to calling IOMMU driver. Based on the device
> > >> handle, host IOMMU driver can replace certain fields before
> > >> submit to the invalidation queue.
> > >>
> > >> ---
> > >> v7 review fixed in v10
> > >> ---
> > >>
> > >> Signed-off-by: Jacob Pan <[email protected]>
> > >> Signed-off-by: Ashok Raj <[email protected]>
> > >> Signed-off-by: Liu, Yi L <[email protected]>
> > >> ---
> > >> drivers/iommu/intel-iommu.c | 182
> > >> ++++++++++++++++++++++++++++++++++++++++++++
> > >> 1 file changed, 182 insertions(+)
> > >>
> > >> diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> > >> index b1477cd423dd..a76afb0fd51a 100644
> > >> --- a/drivers/iommu/intel-iommu.c
> > >> +++ b/drivers/iommu/intel-iommu.c
> > >> @@ -5619,6 +5619,187 @@ static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
> > >>  	aux_domain_remove_dev(to_dmar_domain(domain), dev);
> > >>  }
> > >>
> > >> +/*
> > >> + * 2D array for converting and sanitizing IOMMU generic TLB granularity to
> > >> + * VT-d granularity. Invalidation is typically included in the unmap operation
> > >> + * as a result of DMA or VFIO unmap. However, for assigned devices guest
> > >> + * owns the first level page tables. Invalidations of translation caches in the
> > >> + * guest are trapped and passed down to the host.
> > >> + *
> > >> + * vIOMMU in the guest will only expose first level page tables, therefore
> > >> + * we do not include IOTLB granularity for request without PASID (second level).
> > >
> > > I would revise above as "We do not support IOTLB granularity for
> > > request without PASID (second level), therefore any vIOMMU
> > > implementation that exposes the SVA capability to the guest
> > > should only expose the first level page tables, implying all
> > > invalidation requests from the guest will include a valid PASID"
> > >
> > >> + *
> > >> + * For example, to find the VT-d granularity encoding for IOTLB
> > >> + * type and page selective granularity within PASID:
> > >> + * X: indexed by iommu cache type
> > >> + * Y: indexed by enum iommu_inv_granularity
> > >> + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
> > >> + *
> > >> + * Granu_map array indicates validity of the table. 1: valid, 0: invalid
> > >> + *
> > >> + */
> > >> +const static int inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > >> +	/*
> > >> +	 * PASID based IOTLB invalidation: PASID selective (per PASID),
> > >> +	 * page selective (address granularity)
> > >> +	 */
> > >> +	{0, 1, 1},
> > >> +	/* PASID based dev TLBs, only support all PASIDs or single PASID */
> > >> +	{1, 1, 0},
> > >
> > > Is this combination correct? when single PASID is being
> > > specified, it is essentially a page-selective invalidation since
> > > you need provide Address and Size.
> > >
> > >> + /* PASID cache */
> > >
> > > PASID cache is fully managed by the host. Guest PASID cache
> > > invalidation is interpreted by vIOMMU for bind and unbind
> > > operations. I don't think we should accept any PASID cache
> > > invalidation from userspace or guest.
> > I tend to agree here.
> > >
> > >> + {1, 1, 0}
> > >> +};
> > >> +
> > >> +const static int inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > >> + /* PASID based IOTLB */
> > >> + {0, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
> > >> + /* PASID based dev TLBs */
> > >> + {QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0},
> > >> + /* PASID cache */
> > >> + {QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, 0},
> > >> +};
> > >> +
> > >> +static inline int to_vtd_granularity(int type, int granu, int *vtd_granu)
> > >> +{
> > >> +	if (type >= IOMMU_CACHE_INV_TYPE_NR || granu >= IOMMU_INV_GRANU_NR ||
> > >> +	    !inv_type_granu_map[type][granu])
> > >> +		return -EINVAL;
> > >> +
> > >> + *vtd_granu = inv_type_granu_table[type][granu];
> > >> +
> > >
> > > btw do we really need both map and table here? Can't we just
> > > use one table with unsupported granularity marked as a special
> > > value?
> > I asked the same question some time ago. If I remember correctly the
> > issue is while a granu can be supported in inv_type_granu_map, the
> > associated value in inv_type_granu_table can be 0. This typically
> > matches both values of G field (0 or 1) in the invalidation cmd. See
> > other comment below.
>
> I didn't fully understand it. Also what does a value '0' imply? also
> it's interesting to see below in [PATCH 07/11]:
>
0 in the 2D map array means invalid.
0 in the granu table can be either valid or invalid.
That is why we need the map table to tell the difference.
I will add following comments since this causes lots of confusion.
* Granu_map array indicates validity of the table. 1: valid, 0: invalid
* This is useful when the entry in the granu table has a value of 0,
* which can be a valid or invalid value.
> +/* QI Dev-IOTLB inv granu */
> +#define QI_DEV_IOTLB_GRAN_ALL 1
> +#define QI_DEV_IOTLB_GRAN_PASID_SEL 0
> +
>
Sorry, I didn't get the point. These are the valid VT-d granu values,
per spec CH 6.5.2.6.
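To illustrate: collapsing map and table into one would require a sentinel
distinct from 0, because 0 is a legal encoding here. A hypothetical sketch
(INV_GRANU_INVALID is made up for this example):

	/* Hypothetical single-table alternative */
	#define INV_GRANU_INVALID	(-1)

	const static int inv_type_granu[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
		/* PASID based IOTLB */
		{INV_GRANU_INVALID, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
		/* PASID based dev TLBs */
		{QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, INV_GRANU_INVALID},
		/* PASID cache */
		{QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, INV_GRANU_INVALID},
	};

	static inline int to_vtd_granularity(int type, int granu, int *vtd_granu)
	{
		if (type >= IOMMU_CACHE_INV_TYPE_NR || granu >= IOMMU_INV_GRANU_NR ||
		    inv_type_granu[type][granu] == INV_GRANU_INVALID)
			return -EINVAL;

		*vtd_granu = inv_type_granu[type][granu];
		return 0;
	}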
> > >
> > >> + return 0;
> > >> +}
> > >> +
> > >> +static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
> > >> +{
> > >> +	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
> > >> +
> > >> +	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
> > >> +	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
> > >> +	 * granu size in contiguous memory.
> > >> +	 */
> > >> +	return order_base_2(nr_pages);
> > >> +}
> > >> +
> > >> +#ifdef CONFIG_INTEL_IOMMU_SVM
> > >> +static int intel_iommu_sva_invalidate(struct iommu_domain *domain,
> > >> +		struct device *dev, struct iommu_cache_invalidate_info *inv_info)
> > >> +{
> > >> +	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> > >> + struct device_domain_info *info;
> > >> + struct intel_iommu *iommu;
> > >> + unsigned long flags;
> > >> + int cache_type;
> > >> + u8 bus, devfn;
> > >> + u16 did, sid;
> > >> + int ret = 0;
> > >> + u64 size = 0;
> > >> +
> > >> +	if (!inv_info || !dmar_domain ||
> > >> +	    inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
> > >> +		return -EINVAL;
> > >> +
> > >> + if (!dev || !dev_is_pci(dev))
> > >> + return -ENODEV;
> > >> +
> > >> + iommu = device_to_iommu(dev, &bus, &devfn);
> > >> + if (!iommu)
> > >> + return -ENODEV;
> > >> +
> > >> + spin_lock_irqsave(&device_domain_lock, flags);
> > >> + spin_lock(&iommu->lock);
> > >> +	info = iommu_support_dev_iotlb(dmar_domain, iommu, bus, devfn);
> > >> + if (!info) {
> > >> + ret = -EINVAL;
> > >> + goto out_unlock;
> > >
> > > -ENOTSUPP?
> > >
> > >> + }
> > >> + did = dmar_domain->iommu_did[iommu->seq_id];
> > >> + sid = PCI_DEVID(bus, devfn);
> > >> +
> > >> +	/* Size is only valid in non-PASID selective invalidation */
> > >> +	if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
> > >> +		size = to_vtd_size(inv_info->addr_info.granule_size,
> > >> +				   inv_info->addr_info.nb_granules);
> > >> +
> > >> +	for_each_set_bit(cache_type, (unsigned long *)&inv_info->cache, IOMMU_CACHE_INV_TYPE_NR) {
> > >> +		int granu = 0;
> > >> +		u64 pasid = 0;
> > >> +
> > >> +		ret = to_vtd_granularity(cache_type, inv_info->granularity, &granu);
> > >> +		if (ret) {
> > >> +			pr_err("Invalid cache type and granu combination %d/%d\n",
> > >> +				cache_type, inv_info->granularity);
> > >> +			break;
> > >> +		}
> > >> +
> > >> +		/* PASID is stored in different locations based on granularity */
> > >> +		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
> > >> +		    inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)
> > >> +			pasid = inv_info->pasid_info.pasid;
> > >> +		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
> > >> +			 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)
> > >> +			pasid = inv_info->addr_info.pasid;
> > >> +		else {
> > >> +			pr_err("Cannot find PASID for given cache type and granularity\n");
> > >> +			break;
> > >> +		}
> > >> +
> > >> +		switch (BIT(cache_type)) {
> > >> +		case IOMMU_CACHE_INV_TYPE_IOTLB:
> > >> +			if ((inv_info->granularity != IOMMU_INV_GRANU_PASID) &&
> > >
> > > granularity == IOMMU_INV_GRANU_ADDR? otherwise it's unclear
> > > why IOMMU_INV_GRANU_DOMAIN also needs size check.
> > >
> > >> +			    size && (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
> > >> +				pr_err("Address out of range, 0x%llx, size order %llu\n",
> > >> +					inv_info->addr_info.addr, size);
> > >> +				ret = -ERANGE;
> > >> +				goto out_unlock;
> > >> +			}
> > >> +
> > >> +			qi_flush_piotlb(iommu, did, pasid,
> > >> +					mm_to_dma_pfn(inv_info->addr_info.addr),
> > >> +					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
> > >> +					inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
> > >> +
> > >> +			/*
> > >> +			 * Always flush device IOTLB if ATS is enabled since guest
> > >> +			 * vIOMMU exposes CM = 1, no device IOTLB flush will be passed
> > >> +			 * down.
> > >> +			 */
> > >
> > > Does VT-d spec mention that no device IOTLB flush is required
> > > when CM=1?
> > >> +			if (info->ats_enabled) {
> > >> +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> > >> +						pasid, info->ats_qdep,
> > >> +						inv_info->addr_info.addr, size,
> > >> +						granu);
> > >> +			}
> > >> +			break;
> > >> +		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
> > >> +			if (info->ats_enabled) {
> > >> +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> > >> +						inv_info->addr_info.pasid, info->ats_qdep,
> > >> +						inv_info->addr_info.addr, size,
> > >> +						granu);
> > >
> > > I'm confused here. There are two granularities allowed for
> > > devtlb, but here you only handle one of them?
> > granu is the result of to_vtd_granularity() so it can take either
> > of the 2 values.
>
> yes, you're right.
>
> >
> > Thanks
> >
> > Eric
> > >
> > >> +			} else
> > >> +				pr_warn("Passdown device IOTLB flush w/o ATS!\n");
> > >> +
> > >> +			break;
> > >> +		case IOMMU_CACHE_INV_TYPE_PASID:
> > >> +			qi_flush_pasid_cache(iommu, did, granu, inv_info->pasid_info.pasid);
> > >> +
> > >
> > > as earlier comment, we shouldn't allow userspace or guest to
> > > invalidate PASID cache
> > >
> > >> + break;
> > >> + default:
> > >> + dev_err(dev, "Unsupported IOMMU
> > >> invalidation type %d\n",
> > >> + cache_type);
> > >> + ret = -EINVAL;
> > >> + }
> > >> + }
> > >> +out_unlock:
> > >> + spin_unlock(&iommu->lock);
> > >> + spin_unlock_irqrestore(&device_domain_lock, flags);
> > >> +
> > >> + return ret;
> > >> +}
> > >> +#endif
> > >> +
> > >> static int intel_iommu_map(struct iommu_domain *domain,
> > >> 			   unsigned long iova, phys_addr_t hpa,
> > >> 			   size_t size, int iommu_prot, gfp_t gfp)
> > >> @@ -6204,6 +6385,7 @@ const struct iommu_ops intel_iommu_ops = {
> > >> 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
> > >> 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
> > >> #ifdef CONFIG_INTEL_IOMMU_SVM
> > >> + .cache_invalidate = intel_iommu_sva_invalidate,
> > >> .sva_bind_gpasid = intel_svm_bind_gpasid,
> > >> .sva_unbind_gpasid = intel_svm_unbind_gpasid,
> > >> #endif
> > >> --
> > >> 2.7.4
> > >
>
[Jacob Pan]
On Sun, 29 Mar 2020 18:05:47 +0200
Auger Eric <[email protected]> wrote:
> Hi Jacob,
>
> On 3/21/20 12:27 AM, Jacob Pan wrote:
> > When Shared Virtual Address (SVA) is enabled for a guest OS via
> > vIOMMU, we need to provide invalidation support at IOMMU API and
> > driver level. This patch adds Intel VT-d specific function to
> > implement iommu passdown invalidate API for shared virtual address.
> >
> > The use case is for supporting caching structure invalidation
> > of assigned SVM capable devices. Emulated IOMMU exposes queue
> > invalidation capability and passes down all descriptors from the
> > guest to the physical IOMMU.
> >
> > The assumption is that guest to host device ID mapping should be
> > resolved prior to calling IOMMU driver. Based on the device handle,
> > host IOMMU driver can replace certain fields before submit to the
> > invalidation queue.
> >
> > ---
> > v7 review fixed in v10
> > ---
> >
> > Signed-off-by: Jacob Pan <[email protected]>
> > Signed-off-by: Ashok Raj <[email protected]>
> > Signed-off-by: Liu, Yi L <[email protected]>
> > ---
> >  drivers/iommu/intel-iommu.c | 182 ++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 182 insertions(+)
> >
> > diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> > index b1477cd423dd..a76afb0fd51a 100644
> > --- a/drivers/iommu/intel-iommu.c
> > +++ b/drivers/iommu/intel-iommu.c
> > @@ -5619,6 +5619,187 @@ static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
> >  	aux_domain_remove_dev(to_dmar_domain(domain), dev);
> >  }
> >
> > +/*
> > + * 2D array for converting and sanitizing IOMMU generic TLB granularity to
> > + * VT-d granularity. Invalidation is typically included in the unmap operation
> > + * as a result of DMA or VFIO unmap. However, for assigned devices guest
> > + * owns the first level page tables. Invalidations of translation caches in the
> > + * guest are trapped and passed down to the host.
> > + *
> > + * vIOMMU in the guest will only expose first level page tables, therefore
> > + * we do not include IOTLB granularity for request without PASID (second level).
> > + *
> > + * For example, to find the VT-d granularity encoding for IOTLB
> > + * type and page selective granularity within PASID:
> > + * X: indexed by iommu cache type
> > + * Y: indexed by enum iommu_inv_granularity
> > + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
> > + *
> > + * Granu_map array indicates validity of the table. 1: valid, 0: invalid
> > + *
> > + */
> > +const static int inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > +	/*
> > +	 * PASID based IOTLB invalidation: PASID selective (per PASID),
> > +	 * page selective (address granularity)
> > +	 */
> > +	{0, 1, 1},
> > +	/* PASID based dev TLBs, only support all PASIDs or single PASID */
> > +	{1, 1, 0},
> > +	/* PASID cache */
> > +	{1, 1, 0}
> > +};
> > +
> > +const static int inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > +	/* PASID based IOTLB */
> > +	{0, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
> > +	/* PASID based dev TLBs */
> > +	{QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0},
> > +	/* PASID cache */
> > +	{QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, 0},
> > +};
> > +
> > +static inline int to_vtd_granularity(int type, int granu, int *vtd_granu)
> > +{
> > +	if (type >= IOMMU_CACHE_INV_TYPE_NR || granu >= IOMMU_INV_GRANU_NR ||
> > +	    !inv_type_granu_map[type][granu])
> > +		return -EINVAL;
> > +
> > +	*vtd_granu = inv_type_granu_table[type][granu];
> > +
> > +	return 0;
> > +}
> > +
> > +static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
> > +{
> > +	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
> > +
> > +	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
> > +	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
> > +	 * granu size in contiguous memory.
> > +	 */
> > +	return order_base_2(nr_pages);
> > +}
> > +
> > +#ifdef CONFIG_INTEL_IOMMU_SVM
> > +static int intel_iommu_sva_invalidate(struct iommu_domain *domain,
> > +		struct device *dev, struct iommu_cache_invalidate_info *inv_info)
> > +{
> > +	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> > +	struct device_domain_info *info;
> > +	struct intel_iommu *iommu;
> > +	unsigned long flags;
> > +	int cache_type;
> > +	u8 bus, devfn;
> > +	u16 did, sid;
> > +	int ret = 0;
> > +	u64 size = 0;
> > +
> > +	if (!inv_info || !dmar_domain ||
> > +	    inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
> > +		return -EINVAL;
> > +
> > +	if (!dev || !dev_is_pci(dev))
> > +		return -ENODEV;
> > +
> > +	iommu = device_to_iommu(dev, &bus, &devfn);
> > +	if (!iommu)
> > +		return -ENODEV;
> > +
> > +	spin_lock_irqsave(&device_domain_lock, flags);
> > +	spin_lock(&iommu->lock);
> > +	info = iommu_support_dev_iotlb(dmar_domain, iommu, bus, devfn);
> > +	if (!info) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +	did = dmar_domain->iommu_did[iommu->seq_id];
> > +	sid = PCI_DEVID(bus, devfn);
> > +
> > +	/* Size is only valid in non-PASID selective invalidation */
> > +	if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
> > +		size = to_vtd_size(inv_info->addr_info.granule_size,
> > +				   inv_info->addr_info.nb_granules);
> > +
> > +	for_each_set_bit(cache_type, (unsigned long *)&inv_info->cache, IOMMU_CACHE_INV_TYPE_NR) {
> > +		int granu = 0;
> > +		u64 pasid = 0;
> > +
> > +		ret = to_vtd_granularity(cache_type, inv_info->granularity, &granu);
> > +		if (ret) {
> > +			pr_err("Invalid cache type and granu combination %d/%d\n",
> > +				cache_type, inv_info->granularity);
> > +			break;
> > +		}
> > +
> > +		/* PASID is stored in different locations based on granularity */
> > +		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
> > +		    inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)
> > +			pasid = inv_info->pasid_info.pasid;
> > +		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
> > +			 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)
> > +			pasid = inv_info->addr_info.pasid;
> > +		else {
> > +			pr_err("Cannot find PASID for given cache type and granularity\n");
> I don't get this error msg. In case of domain-selective invalidation,
> PASID is not used so if I am not wrong you will end up here while
> there is no issue.
Right, I will remove the else.
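i.e. something like this, leaving pasid at 0 for domain-selective
invalidation:

		/* PASID is stored in different locations based on granularity */
		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
		    inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)
			pasid = inv_info->pasid_info.pasid;
		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
			 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)
			pasid = inv_info->addr_info.pasid;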
> > + break;
> > + }
> > +
> > +		switch (BIT(cache_type)) {
> > +		case IOMMU_CACHE_INV_TYPE_IOTLB:
> > +			if ((inv_info->granularity != IOMMU_INV_GRANU_PASID) &&
> > +			    size && (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
> > +				pr_err("Address out of range, 0x%llx, size order %llu\n",
> > +					inv_info->addr_info.addr, size);
> > +				ret = -ERANGE;
> > +				goto out_unlock;
> > +			}
> > +
> > +			qi_flush_piotlb(iommu, did, pasid,
> > +					mm_to_dma_pfn(inv_info->addr_info.addr),
> > +					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
> > +					inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
> > +
> > +			/*
> > +			 * Always flush device IOTLB if ATS is enabled since guest
> > +			 * vIOMMU exposes CM = 1, no device IOTLB flush will be passed
> > +			 * down.
> > +			 */
> > + if (info->ats_enabled) {
> nit {} not requested
Will fix. same for the remaining.
Thanks!
Jacob
> > +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> > +						pasid, info->ats_qdep,
> > +						inv_info->addr_info.addr, size,
> > +						granu);
> > +			}
> > + break;
> > + case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
> > + if (info->ats_enabled) {
> nit {} not requested
> > +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> > +						inv_info->addr_info.pasid, info->ats_qdep,
> > +						inv_info->addr_info.addr, size,
> > +						granu);
> > +			} else
> > +				pr_warn("Passdown device IOTLB flush w/o ATS!\n");
> > +
> nit: extra line
> > + break;
> > + case IOMMU_CACHE_INV_TYPE_PASID:
> > +			qi_flush_pasid_cache(iommu, did, granu, inv_info->pasid_info.pasid);
> > +
> nit: extra line
> > + break;
> > + default:
> > + dev_err(dev, "Unsupported IOMMU
> > invalidation type %d\n",
> > + cache_type);
> > + ret = -EINVAL;
> > + }
> > + }
> > +out_unlock:
> > + spin_unlock(&iommu->lock);
> > + spin_unlock_irqrestore(&device_domain_lock, flags);
> > +
> > + return ret;
> > +}
> > +#endif
> > +
> > static int intel_iommu_map(struct iommu_domain *domain,
> > unsigned long iova, phys_addr_t hpa,
> > size_t size, int iommu_prot, gfp_t gfp)
> > @@ -6204,6 +6385,7 @@ const struct iommu_ops intel_iommu_ops = {
> > 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
> > 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
> > #ifdef CONFIG_INTEL_IOMMU_SVM
> > + .cache_invalidate = intel_iommu_sva_invalidate,
> > .sva_bind_gpasid = intel_svm_bind_gpasid,
> > .sva_unbind_gpasid = intel_svm_unbind_gpasid,
> > #endif
> >
> Thanks
>
> Eric
>
[Jacob Pan]
> From: Jacob Pan <[email protected]>
> Sent: Wednesday, April 1, 2020 2:14 AM
>
> On Sat, 28 Mar 2020 10:01:42 +0000
> "Tian, Kevin" <[email protected]> wrote:
>
> > > From: Jacob Pan <[email protected]>
> > > Sent: Saturday, March 21, 2020 7:28 AM
> > >
> > > When Shared Virtual Address (SVA) is enabled for a guest OS via
> > > vIOMMU, we need to provide invalidation support at IOMMU API and
> > > driver level. This patch adds Intel VT-d specific function to
> > > implement iommu passdown invalidate API for shared virtual address.
> > >
> > > The use case is for supporting caching structure invalidation
> > > of assigned SVM capable devices. Emulated IOMMU exposes queue
> >
> > emulated IOMMU -> vIOMMU, since virtio-iommu could use the
> > interface as well.
> >
> True, but it does not invalidate this statement about emulated IOMMU. I
> will add another statement saying "the same interface can be used for
> virtio-IOMMU as well". OK?
sure
>
> > > invalidation capability and passes down all descriptors from the
> > > guest to the physical IOMMU.
> > >
> > > The assumption is that guest to host device ID mapping should be
> > > resolved prior to calling IOMMU driver. Based on the device handle,
> > > host IOMMU driver can replace certain fields before submit to the
> > > invalidation queue.
> > >
> > > ---
> > > v7 review fixed in v10
> > > ---
> > >
> > > Signed-off-by: Jacob Pan <[email protected]>
> > > Signed-off-by: Ashok Raj <[email protected]>
> > > Signed-off-by: Liu, Yi L <[email protected]>
> > > ---
> > > drivers/iommu/intel-iommu.c | 182
> > > ++++++++++++++++++++++++++++++++++++++++++++
> > > 1 file changed, 182 insertions(+)
> > >
> > > diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> > > index b1477cd423dd..a76afb0fd51a 100644
> > > --- a/drivers/iommu/intel-iommu.c
> > > +++ b/drivers/iommu/intel-iommu.c
> > > @@ -5619,6 +5619,187 @@ static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
> > > aux_domain_remove_dev(to_dmar_domain(domain), dev);
> > > }
> > >
> > > +/*
> > > + * 2D array for converting and sanitizing IOMMU generic TLB granularity to
> > > + * VT-d granularity. Invalidation is typically included in the unmap operation
> > > + * as a result of DMA or VFIO unmap. However, for assigned devices guest
> > > + * owns the first level page tables. Invalidations of translation caches in the
> > > + * guest are trapped and passed down to the host.
> > > + *
> > > + * vIOMMU in the guest will only expose first level page tables, therefore
> > > + * we do not include IOTLB granularity for request without PASID (second level).
> >
> > I would revise above as "We do not support IOTLB granularity for
> > request without PASID (second level), therefore any vIOMMU
> > implementation that exposes the SVA capability to the guest should
> > only expose the first level page tables, implying all invalidation
> > requests from the guest will include a valid PASID"
> >
> Sounds good.
>
> > > + *
> > > + * For example, to find the VT-d granularity encoding for IOTLB
> > > + * type and page selective granularity within PASID:
> > > + * X: indexed by iommu cache type
> > > + * Y: indexed by enum iommu_inv_granularity
> > > + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
> > > + *
> > > + * Granu_map array indicates validity of the table. 1: valid, 0:
> > > invalid
> > > + *
> > > + */
> > > +const static int inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > > +	/*
> > > +	 * PASID based IOTLB invalidation: PASID selective (per PASID),
> > > +	 * page selective (address granularity)
> > > +	 */
> > > +	{0, 1, 1},
> > > +	/* PASID based dev TLBs, only support all PASIDs or single PASID */
> > > +	{1, 1, 0},
> >
> > Is this combination correct? when single PASID is being specified, it
> > is essentially a page-selective invalidation since you need provide
> > Address and Size.
> >
> This is for translation between generic UAPI granu to VT-d granu, it
> has nothing to do with address and size.
Generic UAPI defines three granularities: domain, pasid and addr.
from the definition domain applies all entries related to did, pasid
applies to all entries related to pasid, while addr is specific for a
range.
from what we just confirmed internally with VT-d spec owner, our
PASID based dev TLB invalidation always requires addr and size,
while the current uAPI doesn't support multiple-PASID based range
invalidation. It sounds to me that you want to use domain to cover the
multiple PASIDs case (G=1), but it then changes the meaning of
the domain granularity and easily leads to confusion.
I feel Eric's proposal makes more sense. Here we'd better use {0, 0, 1}
to indicate only addr range invalidation is allowed, matching the
spec definition. We may use a special flag in iommu_inv_addr_info
to indicate G=1 case, if necessary.
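For illustration, the devTLB row would then be (the flag name below is
made up, just to show the idea):

	/* PASID based dev TLBs, only page/addr selective within PASID */
	{0, 0, 1},

	/* hypothetical flag in iommu_inv_addr_info to request G=1 (all PASIDs) */
	#define IOMMU_INV_ADDR_FLAGS_ALL_PASIDS	(1 << 3)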
> e.g.
> If user passes IOMMU_INV_GRANU_PASID for the single PASID case as you
> mentioned, this map table shows it is valid.
>
> Then the lookup result will get VT-d granu:
> QI_DEV_IOTLB_GRAN_PASID_SEL, which means G=0.
>
>
> > > + /* PASID cache */
> >
> > PASID cache is fully managed by the host. Guest PASID cache
> > invalidation is interpreted by vIOMMU for bind and unbind operations.
> > I don't think we should accept any PASID cache invalidation from
> > userspace or guest.
> >
>
> True for vIOMMU, this is here for completeness. Can be used by virtio
> IOMMU, since PC flush is inclusive (IOTLB, devTLB), it is more
> efficient.
I think it is not correct in concept. We should not allow the userspace or
guest to request an operation which is beyond its privilege (just because
doing so may bring some performance benefit). You can always introduce
a new cmd for such a purpose.
>
> > > + {1, 1, 0}
> > > +};
> > > +
> > > +const static int inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > > + /* PASID based IOTLB */
> > > + {0, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
> > > + /* PASID based dev TLBs */
> > > + {QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0},
> > > + /* PASID cache */
> > > + {QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, 0},
> > > +};
> > > +
> > > +static inline int to_vtd_granularity(int type, int granu, int *vtd_granu)
> > > +{
> > > +	if (type >= IOMMU_CACHE_INV_TYPE_NR || granu >= IOMMU_INV_GRANU_NR ||
> > > +	    !inv_type_granu_map[type][granu])
> > > +		return -EINVAL;
> > > +
> > > + *vtd_granu = inv_type_granu_table[type][granu];
> > > +
> >
> > btw do we really need both map and table here? Can't we just
> > use one table with unsupported granularity marked as a special
> > value?
> >
> Yes, for value = 1. e.g. G=0 but still valid.
>
> > > + return 0;
> > > +}
> > > +
> > > +static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
> > > +{
> > > +	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
> > > +
> > > +	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
> > > +	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
> > > +	 * granu size in contiguous memory.
> > > +	 */
> > > + return order_base_2(nr_pages);
> > > +}
> > > +
> > > +#ifdef CONFIG_INTEL_IOMMU_SVM
> > > +static int intel_iommu_sva_invalidate(struct iommu_domain *domain,
> > > +		struct device *dev, struct iommu_cache_invalidate_info *inv_info)
> > > +{
> > > +	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> > > +	struct device_domain_info *info;
> > > +	struct intel_iommu *iommu;
> > > +	unsigned long flags;
> > > +	int cache_type;
> > > +	u8 bus, devfn;
> > > +	u16 did, sid;
> > > +	int ret = 0;
> > > +	u64 size = 0;
> > > +
> > > +	if (!inv_info || !dmar_domain ||
> > > +	    inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
> > > +		return -EINVAL;
> > > +
> > > + if (!dev || !dev_is_pci(dev))
> > > + return -ENODEV;
> > > +
> > > + iommu = device_to_iommu(dev, &bus, &devfn);
> > > + if (!iommu)
> > > + return -ENODEV;
> > > +
> > > + spin_lock_irqsave(&device_domain_lock, flags);
> > > + spin_lock(&iommu->lock);
> > > +	info = iommu_support_dev_iotlb(dmar_domain, iommu, bus, devfn);
> > > + if (!info) {
> > > + ret = -EINVAL;
> > > + goto out_unlock;
> >
> > -ENOTSUPP?
> >
> I guess it can go either way in that the error is based on invalid
> inputs.
>
> > > + }
> > > + did = dmar_domain->iommu_did[iommu->seq_id];
> > > + sid = PCI_DEVID(bus, devfn);
> > > +
> > > +	/* Size is only valid in non-PASID selective invalidation */
> > > +	if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
> > > +		size = to_vtd_size(inv_info->addr_info.granule_size,
> > > +				   inv_info->addr_info.nb_granules);
> > > +
> > > +	for_each_set_bit(cache_type, (unsigned long *)&inv_info->cache, IOMMU_CACHE_INV_TYPE_NR) {
> > > +		int granu = 0;
> > > +		u64 pasid = 0;
> > > +
> > > +		ret = to_vtd_granularity(cache_type, inv_info->granularity, &granu);
> > > +		if (ret) {
> > > +			pr_err("Invalid cache type and granu combination %d/%d\n",
> > > +				cache_type, inv_info->granularity);
> > > +			break;
> > > +		}
> > > +
> > > +		/* PASID is stored in different locations based on granularity */
> > > +		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
> > > +		    inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)
> > > +			pasid = inv_info->pasid_info.pasid;
> > > +		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
> > > +			 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)
> > > +			pasid = inv_info->addr_info.pasid;
> > > +		else {
> > > +			pr_err("Cannot find PASID for given cache type and granularity\n");
> > > +			break;
> > > +		}
> > > +
> > > + switch (BIT(cache_type)) {
> > > + case IOMMU_CACHE_INV_TYPE_IOTLB:
> > > +			if ((inv_info->granularity != IOMMU_INV_GRANU_PASID) &&
> >
> > granularity == IOMMU_INV_GRANU_ADDR? otherwise it's unclear
> > why IOMMU_INV_GRANU_DOMAIN also needs size check.
> >
> Good point! will fix.
>
> > > +			    size && (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
> > > +				pr_err("Address out of range, 0x%llx, size order %llu\n",
> > > +				       inv_info->addr_info.addr, size);
> > > +				ret = -ERANGE;
> > > +				goto out_unlock;
> > > +			}
> > > +
> > > +			qi_flush_piotlb(iommu, did, pasid,
> > > +					mm_to_dma_pfn(inv_info->addr_info.addr),
> > > +					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
> > > +					inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
> > > +
> > > +			/*
> > > +			 * Always flush device IOTLB if ATS is enabled since guest
> > > +			 * vIOMMU exposes CM = 1, no device IOTLB flush will be passed
> > > +			 * down.
> > > +			 */
> >
> > Does VT-d spec mention that no device IOTLB flush is required when
> > CM=1?
> >
> Not explicitly. Just following the guideline in CH6.1 for efficient
> virtualization. Early on, we also had discussion on supporting virtio
> where IOTLB flush is inclusive.
> Let me rephrase the comment:
> /*
> * Always flush device IOTLB if ATS is enabled. vIOMMU
> * in the guest may assume IOTLB flush is inclusive,
> * which is more efficient.
> */
this looks better.
>
>
> > > +			if (info->ats_enabled) {
> > > +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> > > +						pasid, info->ats_qdep,
> > > +						inv_info->addr_info.addr, size,
> > > +						granu);
> > > +			}
> > > +			break;
> > > +		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
> > > +			if (info->ats_enabled) {
> > > +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> > > +						inv_info->addr_info.pasid, info->ats_qdep,
> > > +						inv_info->addr_info.addr, size,
> > > +						granu);
> >
> > I'm confused here. There are two granularities allowed for devtlb,
> > but here you only handle one of them?
> >
> granu is passed into the flush function, which can be 1 or 0.
>
> > > +			} else
> > > +				pr_warn("Passdown device IOTLB flush w/o ATS!\n");
> > > +
> > > +			break;
> > > +		case IOMMU_CACHE_INV_TYPE_PASID:
> > > +			qi_flush_pasid_cache(iommu, did, granu,
> > > +					     inv_info->pasid_info.pasid);
> > > +
> >
> > as earlier comment, we shouldn't allow userspace or guest to
> > invalidate PASID cache
> >
> same explanation :)
>
> > > +			break;
> > > +		default:
> > > +			dev_err(dev, "Unsupported IOMMU invalidation type %d\n",
> > > +				cache_type);
> > > +			ret = -EINVAL;
> > > +		}
> > > +	}
> > > +out_unlock:
> > > +	spin_unlock(&iommu->lock);
> > > +	spin_unlock_irqrestore(&device_domain_lock, flags);
> > > +
> > > +	return ret;
> > > +}
> > > +#endif
> > > +
> > > static int intel_iommu_map(struct iommu_domain *domain,
> > > 			   unsigned long iova, phys_addr_t hpa,
> > > 			   size_t size, int iommu_prot, gfp_t gfp)
> > > @@ -6204,6 +6385,7 @@ const struct iommu_ops intel_iommu_ops = {
> > > 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
> > > 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
> > > #ifdef CONFIG_INTEL_IOMMU_SVM
> > > +	.cache_invalidate	= intel_iommu_sva_invalidate,
> > > 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
> > > 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
> > > #endif
> > > --
> > > 2.7.4
> >
>
> [Jacob Pan]
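(For reference, the size encoding in to_vtd_size() above can be sanity-checked
with a small userspace sketch. VTD_PAGE_SHIFT = 12 and the order_base_2()
stand-in are assumptions of this sketch, not taken from the patch.)

#include <stdio.h>

#define VTD_PAGE_SHIFT 12	/* assumed 4KB base page */

/* minimal stand-in for the kernel's order_base_2(): log2, rounded up */
static unsigned long long order_base_2(unsigned long long n)
{
	unsigned long long order = 0;

	while ((1ULL << order) < n)
		order++;
	return order;
}

static unsigned long long to_vtd_size(unsigned long long granu_size,
				      unsigned long long nr_granules)
{
	unsigned long long nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;

	return order_base_2(nr_pages);
}

int main(void)
{
	printf("%llu\n", to_vtd_size(4096, 1));	  /* 0: one 4KB page */
	printf("%llu\n", to_vtd_size(4096, 512)); /* 9: 512 pages = 2MB */
	return 0;
}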
> From: Jacob Pan <[email protected]>
> Sent: Wednesday, April 1, 2020 4:58 AM
>
> On Tue, 31 Mar 2020 02:49:21 +0000
> "Tian, Kevin" <[email protected]> wrote:
>
> > > From: Auger Eric <[email protected]>
> > > Sent: Sunday, March 29, 2020 11:34 PM
> > >
> > > Hi,
> > >
> > > On 3/28/20 11:01 AM, Tian, Kevin wrote:
> > > >> From: Jacob Pan <[email protected]>
> > > >> Sent: Saturday, March 21, 2020 7:28 AM
> > > >>
> > > >> When Shared Virtual Address (SVA) is enabled for a guest OS via
> > > >> vIOMMU, we need to provide invalidation support at IOMMU API
> > > >> and
> > > driver
> > > >> level. This patch adds Intel VT-d specific function to implement
> > > >> iommu passdown invalidate API for shared virtual address.
> > > >>
> > > >> The use case is for supporting caching structure invalidation
> > > >> of assigned SVM capable devices. Emulated IOMMU exposes queue
> > [...]
> > [...]
> > > >> to
> > > >> + * VT-d granularity. Invalidation is typically included in the unmap operation
> > > >> + * as a result of DMA or VFIO unmap. However, for assigned devices guest
> > > >> + * owns the first level page tables. Invalidations of translation caches in the
> > [...]
> > [...]
> > [...]
> > > >> inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > > >> + /*
> > > >> +	 * PASID based IOTLB invalidation: PASID selective (per PASID),
> > > >> +	 * page selective (address granularity)
> > > >> +	 */
> > > >> +	{0, 1, 1},
> > > >> +	/* PASID based dev TLBs, only support all PASIDs or single PASID */
> > > >> +	{1, 1, 0},
> > > >
> > > > Is this combination correct? when single PASID is being
> > > > specified, it is essentially a page-selective invalidation since
> > > > you need provide Address and Size.
> > > Isn't it the same when G=1? Still the addr/size is used. Doesn't
> > > it
> >
> > I thought addr/size is not used when G=1, but it might be wrong. I'm
> > checking with our vt-d spec owner.
> >
>
> > > correspond to IOMMU_INV_GRANU_ADDR with
> > > IOMMU_INV_ADDR_FLAGS_PASID flag
> > > unset?
> > >
> > > so {0, 0, 1}?
> >
> I am not sure I got your logic. The three fields correspond to
> IOMMU_INV_GRANU_DOMAIN,	/* domain-selective invalidation */
> IOMMU_INV_GRANU_PASID,	/* PASID-selective invalidation */
> IOMMU_INV_GRANU_ADDR,	/* page-selective invalidation */
>
> For devTLB, we use domain as global since there is no domain. Then I
> came up with {1, 1, 0}, which means we could have global and pasid
> granu invalidation for PASID based devTLB.
>
> If the caller also provide addr and S bit, the flush routine will put
"also" -> "must", because vt-d requires addr/size must be provided
in devtlb descriptor, that is why Eric suggests {0, 0, 1}.
> that into QI descriptor. I know this is a little odd, but from the
> granu translation p.o.v. VT-d spec has no G bit for page selective
> invalidation.
We don't need such an odd way if we can do it properly. :-)
>
> > I have one more open:
> >
> > How does userspace know which invalidation type/gran is supported?
> > I didn't see such capability reporting in Yi's VFIO vSVA patch set.
> > Do we want the user/kernel assume the same capability set if they are
> > architectural? However the kernel could also do some optimization
> > e.g. hide devtlb invalidation capability given that the kernel
> > already invalidate devtlb automatically when serving iotlb
> > invalidation...
> >
> In general, we are trending to use VFIO capability chain to expose iommu
> capabilities.
>
> But for architectural features such as type/granu, we have to assume
> the same capability between host & guest. Granu and types are not
> enumerated on the host IOMMU either.
>
> For devTLB optimization, I agree we need to expose a capability to
> the guest stating that implicit devtlb invalidation is supported.
> Otherwise, a Linux guest running on other hosts may find that implicit
> devtlb invalidation is not supported.
>
> Right Yi?
Thanks for explanation. So we are assumed to support all operations
defined in spec, so no need to expose them one-by-one. For
optimization, I'm fine to do it later.
>
> > Thanks
> > Kevin
> >
> > >
> > > Thanks
> > >
> > > Eric
> > >
> > > >
> > > >> + /* PASID cache */
> > > >
> > > > PASID cache is fully managed by the host. Guest PASID cache
> > > > invalidation is interpreted by vIOMMU for bind and unbind
> > > > operations. I don't think we should accept any PASID cache
> > > > invalidation from userspace or guest.
> > [...]
> > > >> inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > [...]
> > > >
> > > > btw do we really need both map and table here? Can't we just
> > > > use one table with unsupported granularity marked as a special
> > > > value?
> > > >
> > [...]
> > > >
> > > > -ENOTSUPP?
> > > >
> > [...]
> > > >
> > > > granularity == IOMMU_INV_GRANU_ADDR? otherwise it's unclear
> > > > why IOMMU_INV_GRANU_DOMAIN also needs size check.
> > > >
> > [...]
> > > >>> addr_info.addr),
> > [...]
> > [...]
> > > >> +			if (info->ats_enabled) {
> > > >> +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> > [...]
> > > >> +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> > > >> +						inv_info->addr_info.pasid, info->ats_qdep,
> > > >> +						inv_info->addr_info.addr, size,
> > > >> +						granu);
> > [...]
> > [...]
> > > >>> pasid_info.pasid);
> > > >> +
> > > >
> > > > as earlier comment, we shouldn't allow userspace or guest to
> > > > invalidate PASID cache
> > > >
> > [...]
> > > >
> >
>
> [Jacob Pan]
Thanks
Kevin
> From: Jacob Pan <[email protected]>
> Sent: Wednesday, April 1, 2020 5:08 AM
>
> On Tue, 31 Mar 2020 03:34:22 +0000
> "Tian, Kevin" <[email protected]> wrote:
>
> > > From: Auger Eric <[email protected]>
> > > Sent: Monday, March 30, 2020 12:05 AM
> > >
> > > On 3/28/20 11:01 AM, Tian, Kevin wrote:
> > > >> From: Jacob Pan <[email protected]>
> > > >> Sent: Saturday, March 21, 2020 7:28 AM
> > > >>
> > > >> When Shared Virtual Address (SVA) is enabled for a guest OS via
> > > >> vIOMMU, we need to provide invalidation support at IOMMU API
> > > >> and
> > > driver
> > > >> level. This patch adds Intel VT-d specific function to implement
> > > >> iommu passdown invalidate API for shared virtual address.
> > > >>
> > > >> The use case is for supporting caching structure invalidation
> > > >> of assigned SVM capable devices. Emulated IOMMU exposes queue
> > > >
> > > > emulated IOMMU -> vIOMMU, since virito-iommu could use the
> > > > interface as well.
> > > >
> > > >> invalidation capability and passes down all descriptors from the
> > > >> guest to the physical IOMMU.
> > > >>
> > > >> The assumption is that guest to host device ID mapping should be
> > > >> resolved prior to calling IOMMU driver. Based on the device
> > > >> handle, host IOMMU driver can replace certain fields before
> > > >> submit to the invalidation queue.
> > > >>
> > > >> ---
> > > >> v7 review fixed in v10
> > > >> ---
> > > >>
> > > >> Signed-off-by: Jacob Pan <[email protected]>
> > > >> Signed-off-by: Ashok Raj <[email protected]>
> > > >> Signed-off-by: Liu, Yi L <[email protected]>
> > > >> ---
> > > >> drivers/iommu/intel-iommu.c | 182
> > > >> ++++++++++++++++++++++++++++++++++++++++++++
> > > >> 1 file changed, 182 insertions(+)
> > > >>
> > > >> diff --git a/drivers/iommu/intel-iommu.c
> > > >> b/drivers/iommu/intel-iommu.c index b1477cd423dd..a76afb0fd51a
> > > >> 100644 --- a/drivers/iommu/intel-iommu.c
> > > >> +++ b/drivers/iommu/intel-iommu.c
> > > >> @@ -5619,6 +5619,187 @@ static void
> > > >> intel_iommu_aux_detach_device(struct iommu_domain *domain,
> > > >> aux_domain_remove_dev(to_dmar_domain(domain), dev);
> > > >> }
> > > >>
> > > >> +/*
> > > >> + * 2D array for converting and sanitizing IOMMU generic TLB
> > > >> granularity
> > > to
> > > >> + * VT-d granularity. Invalidation is typically included in the
> > > >> unmap
> > > operation
> > > >> + * as a result of DMA or VFIO unmap. However, for assigned
> > > >> devices
> > > guest
> > > >> + * owns the first level page tables. Invalidations of
> > > >> translation caches in
> > > the
> > > >> + * guest are trapped and passed down to the host.
> > > >> + *
> > > >> + * vIOMMU in the guest will only expose first level page
> > > >> tables, therefore
> > > >> + * we do not include IOTLB granularity for request without
> > > >> PASID (second level).
> > > >
> > > > I would revise above as "We do not support IOTLB granularity for
> > > > request without PASID (second level), therefore any vIOMMU
> > > > implementation that exposes the SVA capability to the guest
> > > > should only expose the first level page tables, implying all
> > > > invalidation requests from the guest will include a valid PASID"
> > > >
> > > >> + *
> > > >> + * For example, to find the VT-d granularity encoding for IOTLB
> > > >> + * type and page selective granularity within PASID:
> > > >> + * X: indexed by iommu cache type
> > > >> + * Y: indexed by enum iommu_inv_granularity
> > > >> + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
> > > >> + *
> > > >> + * Granu_map array indicates validity of the table. 1: valid,
> > > >> 0: invalid
> > > >> + *
> > > >> + */
> > > >> +const static int
> > > >> inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > > >> + /*
> > > >> + * PASID based IOTLB invalidation: PASID selective (per
> > > >> PASID),
> > > >> + * page selective (address granularity)
> > > >> + */
> > > >> + {0, 1, 1},
> > > >> + /* PASID based dev TLBs, only support all PASIDs or
> > > >> single PASID */
> > > >> + {1, 1, 0},
> > > >
> > > > Is this combination correct? when single PASID is being
> > > > specified, it is essentially a page-selective invalidation since
> > > > you need provide Address and Size.
> > > >
> > > >> + /* PASID cache */
> > > >
> > > > PASID cache is fully managed by the host. Guest PASID cache
> > > > invalidation is interpreted by vIOMMU for bind and unbind
> > > > operations. I don't think we should accept any PASID cache
> > > > invalidation from userspace or guest.
> > > I tend to agree here.
> > > >
> > > >> + {1, 1, 0}
> > > >> +};
> > > >> +
> > > >> +const static int
> > > >> inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > > >> + /* PASID based IOTLB */
> > > >> + {0, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
> > > >> + /* PASID based dev TLBs */
> > > >> + {QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0},
> > > >> + /* PASID cache */
> > > >> + {QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, 0},
> > > >> +};
> > > >> +
> > > >> +static inline int to_vtd_granularity(int type, int granu, int
> > > >> *vtd_granu) +{
> > > >> + if (type >= IOMMU_CACHE_INV_TYPE_NR || granu >=
> > > >> IOMMU_INV_GRANU_NR ||
> > > >> + !inv_type_granu_map[type][granu])
> > > >> + return -EINVAL;
> > > >> +
> > > >> + *vtd_granu = inv_type_granu_table[type][granu];
> > > >> +
> > > >
> > > > btw do we really need both map and table here? Can't we just
> > > > use one table with unsupported granularity marked as a special
> > > > value?
> > > I asked the same question some time ago. If I remember correctly the
> > > issue is while a granu can be supported in inv_type_granu_map, the
> > > associated value in inv_type_granu_table can be 0. This typically
> > > matches both values of G field (0 or 1) in the invalidation cmd. See
> > > other comment below.
> >
> > I didn't fully understand it. Also what does a value '0' imply? also
> > it's interesting to see below in [PATCH 07/11]:
> >
> 0 in 2D map array means invalid.
> 0 in granu table can be either valid or invalid
> That is why we need the map table to tell the difference.
> I will add following comments since this causes lots of confusion.
>
> * Granu_map array indicates validity of the table. 1: valid, 0: invalid
> * This is useful when the entry in the granu table has a value of 0,
> * which can be a valid or invalid value.
>
>
> > +/* QI Dev-IOTLB inv granu */
> > +#define QI_DEV_IOTLB_GRAN_ALL 1
> > +#define QI_DEV_IOTLB_GRAN_PASID_SEL 0
> > +
> >
> Sorry I didn't get the point? These are the valid vt-d granu values.
> Per Spec CH 6.5.2.6
well, I thought '0' means invalid then why we define a valid granu
as invalid. But with your latest explanation I think I get the
rationale behind now.
just one more thought, since the element type is int in the
granular table, why not using -1 to represent invalid then you
can still use one table?
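A minimal sketch of that suggestion (not the patch as posted): fold the
validity map into the value table by using -1 for unsupported combinations,
so a legitimate 0 encoding such as QI_DEV_IOTLB_GRAN_PASID_SEL stays
distinguishable. This assumes all valid VT-d encodings are non-negative,
which holds for the values quoted in this thread.

const static int inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
	/* PASID based IOTLB: domain granularity unsupported */
	{-1, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
	/* PASID based dev TLBs: PASID_SEL is 0 yet valid */
	{QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, -1},
	/* PASID cache */
	{QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, -1},
};

static inline int to_vtd_granularity(int type, int granu, int *vtd_granu)
{
	if (type >= IOMMU_CACHE_INV_TYPE_NR || granu >= IOMMU_INV_GRANU_NR ||
	    inv_type_granu_table[type][granu] < 0)
		return -EINVAL;

	*vtd_granu = inv_type_granu_table[type][granu];
	return 0;
}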
>
> > > >
> > > >> + return 0;
> > > >> +}
> > > >> +
> > > >> +static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
> > > >> +{
> > > >> +	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
> > > >> +
> > > >> +	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
> > > >> +	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
> > > >> +	 * granu size in contiguous memory.
> > > >> +	 */
> > > >> + return order_base_2(nr_pages);
> > > >> +}
> > > >> +
> > > >> +#ifdef CONFIG_INTEL_IOMMU_SVM
> > > >> +static int intel_iommu_sva_invalidate(struct iommu_domain
> > > >> *domain,
> > > >> + struct device *dev, struct
> > > >> iommu_cache_invalidate_info *inv_info)
> > > >> +{
> > > >> + struct dmar_domain *dmar_domain =
> > > >> to_dmar_domain(domain);
> > > >> + struct device_domain_info *info;
> > > >> + struct intel_iommu *iommu;
> > > >> + unsigned long flags;
> > > >> + int cache_type;
> > > >> + u8 bus, devfn;
> > > >> + u16 did, sid;
> > > >> + int ret = 0;
> > > >> + u64 size = 0;
> > > >> +
> > > >> + if (!inv_info || !dmar_domain ||
> > > >> + inv_info->version !=
> > > >> IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
> > > >> + return -EINVAL;
> > > >> +
> > > >> + if (!dev || !dev_is_pci(dev))
> > > >> + return -ENODEV;
> > > >> +
> > > >> + iommu = device_to_iommu(dev, &bus, &devfn);
> > > >> + if (!iommu)
> > > >> + return -ENODEV;
> > > >> +
> > > >> + spin_lock_irqsave(&device_domain_lock, flags);
> > > >> + spin_lock(&iommu->lock);
> > > >> + info = iommu_support_dev_iotlb(dmar_domain, iommu, bus,
> > > >> devfn);
> > > >> + if (!info) {
> > > >> + ret = -EINVAL;
> > > >> + goto out_unlock;
> > > >
> > > > -ENOTSUPP?
> > > >
> > > >> + }
> > > >> + did = dmar_domain->iommu_did[iommu->seq_id];
> > > >> + sid = PCI_DEVID(bus, devfn);
> > > >> +
> > > >> +	/* Size is only valid in non-PASID selective invalidation */
> > > >> +	if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
> > > >> +		size = to_vtd_size(inv_info->addr_info.granule_size,
> > > >> +				   inv_info->addr_info.nb_granules);
> > > >> +
> > > >> +	for_each_set_bit(cache_type, (unsigned long *)&inv_info->cache, IOMMU_CACHE_INV_TYPE_NR) {
> > > >> + int granu = 0;
> > > >> + u64 pasid = 0;
> > > >> +
> > > >> + ret = to_vtd_granularity(cache_type,
> > > >> inv_info->granularity, &granu);
> > > >> + if (ret) {
> > > >> + pr_err("Invalid cache type and granu
> > > >> combination %d/%d\n", cache_type,
> > > >> + inv_info->granularity);
> > > >> + break;
> > > >> + }
> > > >> +
> > > >> + /* PASID is stored in different locations based
> > > >> on granularity */
> > > >> + if (inv_info->granularity ==
> > > >> IOMMU_INV_GRANU_PASID &&
> > > >> + inv_info->pasid_info.flags &
> > > >> IOMMU_INV_PASID_FLAGS_PASID)
> > > >> + pasid = inv_info->pasid_info.pasid;
> > > >> + else if (inv_info->granularity ==
> > > >> IOMMU_INV_GRANU_ADDR &&
> > > >> + inv_info->addr_info.flags &
> > > >> IOMMU_INV_ADDR_FLAGS_PASID)
> > > >> + pasid = inv_info->addr_info.pasid;
> > > >> + else {
> > > >> + pr_err("Cannot find PASID for given
> > > >> cache type and granularity\n");
> > > >> + break;
> > > >> + }
> > > >> +
> > > >> + switch (BIT(cache_type)) {
> > > >> + case IOMMU_CACHE_INV_TYPE_IOTLB:
> > > >> + if ((inv_info->granularity !=
> > > >> IOMMU_INV_GRANU_PASID) &&
> > > >
> > > > granularity == IOMMU_INV_GRANU_ADDR? otherwise it's unclear
> > > > why IOMMU_INV_GRANU_DOMAIN also needs size check.
> > > >
> > > >> + size &&
> > > >> (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1)))
> > > >> {
> > > >> + pr_err("Address out of range,
> > > >> 0x%llx, size order %llu\n",
> > > >> +
> > > >> inv_info->addr_info.addr, size);
> > > >> + ret = -ERANGE;
> > > >> + goto out_unlock;
> > > >> + }
> > > >> +
> > > >> + qi_flush_piotlb(iommu, did,
> > > >> + pasid,
> > > >> +					mm_to_dma_pfn(inv_info->addr_info.addr),
> > > >> +					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
> > > >> +					inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
> > > >> +
> > > >> + /*
> > > >> + * Always flush device IOTLB if ATS is
> > > >> enabled since guest
> > > >> + * vIOMMU exposes CM = 1, no device
> > > >> IOTLB flush will be passed
> > > >> + * down.
> > > >> + */
> > > >
> > > > Does VT-d spec mention that no device IOTLB flush is required
> > > > when CM=1?
> > > >> + if (info->ats_enabled) {
> > > >> +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> > > >> +						pasid, info->ats_qdep,
> > > >> +						inv_info->addr_info.addr, size,
> > > >> +						granu);
> > > >> + }
> > > >> + break;
> > > >> + case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
> > > >> + if (info->ats_enabled) {
> > > >> +				qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> > > >> +						inv_info->addr_info.pasid, info->ats_qdep,
> > > >> +						inv_info->addr_info.addr, size,
> > > >> +						granu);
> > > >
> > > > I'm confused here. There are two granularities allowed for
> > > > devtlb, but here you only handle one of them?
> > > granu is the result of to_vtd_granularity() so it can take either
> > > of the 2 values.
> >
> > yes, you're right.
> >
> > >
> > > Thanks
> > >
> > > Eric
> > > >
> > > >> + } else
> > > >> + pr_warn("Passdown device IOTLB
> > > >> flush w/o ATS!\n");
> > > >> +
> > > >> + break;
> > > >> + case IOMMU_CACHE_INV_TYPE_PASID:
> > > >> +			qi_flush_pasid_cache(iommu, did, granu,
> > > >> +					     inv_info->pasid_info.pasid);
> > > >> +
> > > >
> > > > as earlier comment, we shouldn't allow userspace or guest to
> > > > invalidate PASID cache
> > > >
> > > >> + break;
> > > >> + default:
> > > >> + dev_err(dev, "Unsupported IOMMU
> > > >> invalidation type %d\n",
> > > >> + cache_type);
> > > >> + ret = -EINVAL;
> > > >> + }
> > > >> + }
> > > >> +out_unlock:
> > > >> + spin_unlock(&iommu->lock);
> > > >> + spin_unlock_irqrestore(&device_domain_lock, flags);
> > > >> +
> > > >> + return ret;
> > > >> +}
> > > >> +#endif
> > > >> +
> > > >> static int intel_iommu_map(struct iommu_domain *domain,
> > > >> unsigned long iova, phys_addr_t hpa,
> > > >> size_t size, int iommu_prot, gfp_t
> > > >> gfp) @@ -6204,6 +6385,7 @@ const struct iommu_ops
> > > >> intel_iommu_ops = { .is_attach_deferred =
> > > >> intel_iommu_is_attach_deferred, .pgsize_bitmap =
> > > >> INTEL_IOMMU_PGSIZES, #ifdef CONFIG_INTEL_IOMMU_SVM
> > > >> + .cache_invalidate = intel_iommu_sva_invalidate,
> > > >> .sva_bind_gpasid = intel_svm_bind_gpasid,
> > > >> .sva_unbind_gpasid = intel_svm_unbind_gpasid,
> > > >> #endif
> > > >> --
> > > >> 2.7.4
> > > >
> >
>
> [Jacob Pan]
Thanks
Kevin
> From: Tian, Kevin <[email protected]>
> Sent: Wednesday, April 1, 2020 2:24 PM
> To: Jacob Pan <[email protected]>
> Subject: RE: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function
>
> > From: Jacob Pan <[email protected]>
> > Sent: Wednesday, April 1, 2020 2:14 AM
> >
> > On Sat, 28 Mar 2020 10:01:42 +0000
> > "Tian, Kevin" <[email protected]> wrote:
> >
> > > > From: Jacob Pan <[email protected]>
> > > > Sent: Saturday, March 21, 2020 7:28 AM
> > > >
> > > > When Shared Virtual Address (SVA) is enabled for a guest OS via
> > > > vIOMMU, we need to provide invalidation support at IOMMU API and
> > > > driver level. This patch adds Intel VT-d specific function to
> > > > implement iommu passdown invalidate API for shared virtual address.
> > > >
> > > > The use case is for supporting caching structure invalidation
> > > > of assigned SVM capable devices. Emulated IOMMU exposes queue
> > >
> > > emulated IOMMU -> vIOMMU, since virito-iommu could use the
> > > interface as well.
> > >
> > True, but it does not invalidate this statement about emulated IOMMU. I
> > will add another statement saying "the same interface can be used for
> > virtio-IOMMU as well". OK?
>
> sure
>
> >
> > > > invalidation capability and passes down all descriptors from the
> > > > guest to the physical IOMMU.
> > > >
> > > > The assumption is that guest to host device ID mapping should be
> > > > resolved prior to calling IOMMU driver. Based on the device handle,
> > > > host IOMMU driver can replace certain fields before submit to the
> > > > invalidation queue.
> > > >
> > > > ---
> > > > v7 review fixed in v10
> > > > ---
> > > >
> > > > Signed-off-by: Jacob Pan <[email protected]>
> > > > Signed-off-by: Ashok Raj <[email protected]>
> > > > Signed-off-by: Liu, Yi L <[email protected]>
> > > > ---
> > > > drivers/iommu/intel-iommu.c | 182
> > > > ++++++++++++++++++++++++++++++++++++++++++++
> > > > 1 file changed, 182 insertions(+)
> > > >
> > > > diff --git a/drivers/iommu/intel-iommu.c
> > > > b/drivers/iommu/intel-iommu.c index b1477cd423dd..a76afb0fd51a
> > > > 100644 --- a/drivers/iommu/intel-iommu.c
> > > > +++ b/drivers/iommu/intel-iommu.c
> > > > @@ -5619,6 +5619,187 @@ static void
> > > > intel_iommu_aux_detach_device(struct iommu_domain *domain,
> > > > aux_domain_remove_dev(to_dmar_domain(domain), dev);
> > > > }
> > > >
> > > > +/*
> > > > + * 2D array for converting and sanitizing IOMMU generic TLB
> > > > granularity to
> > > > + * VT-d granularity. Invalidation is typically included in the
> > > > unmap operation
> > > > + * as a result of DMA or VFIO unmap. However, for assigned devices
> > > > guest
> > > > + * owns the first level page tables. Invalidations of translation
> > > > caches in the
> > > > + * guest are trapped and passed down to the host.
> > > > + *
> > > > + * vIOMMU in the guest will only expose first level page tables,
> > > > therefore
> > > > + * we do not include IOTLB granularity for request without PASID
> > > > (second level).
> > >
> > > I would revise above as "We do not support IOTLB granularity for
> > > request without PASID (second level), therefore any vIOMMU
> > > implementation that exposes the SVA capability to the guest should
> > > only expose the first level page tables, implying all invalidation
> > > requests from the guest will include a valid PASID"
> > >
> > Sounds good.
> >
> > > > + *
> > > > + * For example, to find the VT-d granularity encoding for IOTLB
> > > > + * type and page selective granularity within PASID:
> > > > + * X: indexed by iommu cache type
> > > > + * Y: indexed by enum iommu_inv_granularity
> > > > + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
> > > > + *
> > > > + * Granu_map array indicates validity of the table. 1: valid, 0:
> > > > invalid
> > > > + *
> > > > + */
> > > > +const static int
> > > > inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > > > + /*
> > > > + * PASID based IOTLB invalidation: PASID selective (per
> > > > PASID),
> > > > + * page selective (address granularity)
> > > > + */
> > > > + {0, 1, 1},
> > > > + /* PASID based dev TLBs, only support all PASIDs or single
> > > > PASID */
> > > > + {1, 1, 0},
> > >
> > > Is this combination correct? when single PASID is being specified, it
> > > is essentially a page-selective invalidation since you need provide
> > > Address and Size.
> > >
> > This is for translation between generic UAPI granu to VT-d granu, it
> > has nothing to do with address and size.
>
> Generic UAPI defines three granularities: domain, pasid and addr.
> from the definition domain applies all entries related to did, pasid
> applies to all entries related to pasid, while addr is specific for a
> range.
>
> from what we just confirmed internally with VT-d spec owner, our
> PASID based dev TLB invalidation always requires addr and size,
> while current uAPI doesn't support multiple PASIDs based range
> invalidation. It sounds to me that you want to use domain to replace
> multiple PASIDs case (G=1), but it then changes the meaning of
> the domain granularity and easily lead to confusion.
>
> I feel Eric's proposal makes more sense. Here we'd better use {0, 0, 1}
> to indicate only addr range invalidation is allowed, matching the
> spec definition. We may use a special flag in iommu_inv_addr_info
> to indicate G=1 case, if necessary.
I agree. The G=1 case should be supported. I think we had a flag for global
since there is a GL bit in p_iotlb_inv_dsc (a.k.a. ext_iotlb_inv_dsc), but it
was dropped when the 3.0 spec dropped the GL bit. Let's add it back for the
DevTLB flush case.
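One possible shape for such a flag, purely as a sketch (the name
IOMMU_INV_ADDR_FLAGS_GLOBAL and its bit position are hypothetical, not part
of the posted UAPI):

/* hypothetical UAPI addition: request a G=1 (all-PASID) dev TLB flush */
#define IOMMU_INV_ADDR_FLAGS_GLOBAL	(1 << 3)

	/* then, in the DEV_IOTLB path of intel_iommu_sva_invalidate(): */
	if (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_GLOBAL)
		granu = QI_DEV_IOTLB_GRAN_ALL;		/* G=1: all PASIDs */
	else
		granu = QI_DEV_IOTLB_GRAN_PASID_SEL;	/* G=0: single PASID */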
> > e.g.
> > If user passes IOMMU_INV_GRANU_PASID for the single PASID case as you
> > mentioned, this map table shows it is valid.
> >
> > Then the lookup result will get VT-d granu:
> > QI_DEV_IOTLB_GRAN_PASID_SEL, which means G=0.
> >
> >
> > > > + /* PASID cache */
> > >
> > > PASID cache is fully managed by the host. Guest PASID cache
> > > invalidation is interpreted by vIOMMU for bind and unbind operations.
> > > I don't think we should accept any PASID cache invalidation from
> > > userspace or guest.
> > >
> >
> > True for vIOMMU, this is here for completeness. Can be used by virtio
> > IOMMU, since PC flush is inclusive (IOTLB, devTLB), it is more
> > efficient.
>
> I think it is not correct in concept. We should not allow the userspace or
> guest to request an operation which is beyond its privilege (just because
> doing so may bring some performance benefit). You can always introduce
> new cmd for such purpose.
I guess it was added for the pasid table binding case? Now, our platform
doesn't support it. So I guess we can just mark it as unsupported in the
2D table.
Regards,
Yi Liu
> From: Tian, Kevin <[email protected]>
> Sent: Wednesday, April 1, 2020 2:30 PM
> To: Jacob Pan <[email protected]>
> Subject: RE: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function
>
> > From: Jacob Pan <[email protected]>
> > Sent: Wednesday, April 1, 2020 4:58 AM
> >
> > On Tue, 31 Mar 2020 02:49:21 +0000
> > "Tian, Kevin" <[email protected]> wrote:
> >
> > > > From: Auger Eric <[email protected]>
> > > > Sent: Sunday, March 29, 2020 11:34 PM
> > > >
> > > > Hi,
> > > >
> > > > On 3/28/20 11:01 AM, Tian, Kevin wrote:
> > > > >> From: Jacob Pan <[email protected]>
> > > > >> Sent: Saturday, March 21, 2020 7:28 AM
> > > > >>
> > > > >> When Shared Virtual Address (SVA) is enabled for a guest OS via
> > > > >> vIOMMU, we need to provide invalidation support at IOMMU API
> > > > >> and
> > > > driver
> > > > >> level. This patch adds Intel VT-d specific function to
> > > > >> implement iommu passdown invalidate API for shared virtual address.
> > > > >>
> > > > >> The use case is for supporting caching structure invalidation
> > > > >> of assigned SVM capable devices. Emulated IOMMU exposes queue
> > > [...]
> > > [...]
> > > > to
> > > > >> + * VT-d granularity. Invalidation is typically included in the
> > > > >> unmap
> > > > operation
> > > > >> + * as a result of DMA or VFIO unmap. However, for assigned
> > > > >> devices
> > > > guest
> > > > >> + * owns the first level page tables. Invalidations of
> > > > >> translation caches in
> > > > the
> > > [...]
> > > [...]
> > > [...]
> > > >
> > > > >> inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > > > >> + /*
> > > > >> + * PASID based IOTLB invalidation: PASID selective (per
> > > > >> PASID),
> > > > >> + * page selective (address granularity)
> > > > >> + */
> > > > >> + {0, 1, 1},
> > > > >> + /* PASID based dev TLBs, only support all PASIDs or
> > > > >> single PASID */
> > > > >> + {1, 1, 0},
> > > > >
> > > > > Is this combination correct? when single PASID is being
> > > > > specified, it is essentially a page-selective invalidation since
> > > > > you need provide Address and Size.
> > > > Isn't it the same when G=1? Still the addr/size is used. Doesn't
> > > > it
> > >
> > > I thought addr/size is not used when G=1, but it might be wrong. I'm
> > > checking with our vt-d spec owner.
> > >
> >
> > > > correspond to IOMMU_INV_GRANU_ADDR with IOMMU_INV_ADDR_FLAGS_PASID
> > > > flag unset?
> > > >
> > > > so {0, 0, 1}?
> > >
> > I am not sure I got your logic. The three fields correspond to
> > IOMMU_INV_GRANU_DOMAIN, /* domain-selective
> > invalidation */
> > IOMMU_INV_GRANU_PASID, /* PASID-selective invalidation */
> > IOMMU_INV_GRANU_ADDR, /* page-selective invalidation *
> >
> > For devTLB, we use domain as global since there is no domain. Then I
> > came up with {1, 1, 0}, which means we could have global and pasid
> > granu invalidation for PASID based devTLB.
> >
> > If the caller also provide addr and S bit, the flush routine will put
>
> "also" -> "must", because vt-d requires addr/size must be provided in
> devtlb
> descriptor, that is why Eric suggests {0, 0, 1}.
I think it should be {0, 0, 1} :-) The addr and S fields are mandatory;
the pasid field depends on the G bit.
I didn't read through all the comments yet, but here is a concern with this
2-D table: the iommu cache types are defined as below, and if
IOMMU_CACHE_INV_TYPE_PASID were used directly as the index, it would go
beyond the 2-D table.
/* IOMMU paging structure cache */
#define IOMMU_CACHE_INV_TYPE_IOTLB (1 << 0) /* IOMMU IOTLB */
#define IOMMU_CACHE_INV_TYPE_DEV_IOTLB (1 << 1) /* Device IOTLB */
#define IOMMU_CACHE_INV_TYPE_PASID (1 << 2) /* PASID cache */
#define IOMMU_CACHE_INV_TYPE_NR (3)
> >
> > > I have one more open:
> > >
> > > How does userspace know which invalidation type/gran is supported?
> > > I didn't see such capability reporting in Yi's VFIO vSVA patch set.
> > > Do we want the user/kernel assume the same capability set if they
> > > are architectural? However the kernel could also do some
> > > optimization e.g. hide devtlb invalidation capability given that the
> > > kernel already invalidate devtlb automatically when serving iotlb
> > > invalidation...
> > >
> > In general, we are trending to use VFIO capability chain to expose
> > iommu capabilities.
> >
> > But for architectural features such as type/granu, we have to assume
> > the same capability between host & guest. Granu and types are not
> > enumerated on the host IOMMU either.
> >
> > For devTLB optimization, I agree we need to expose a capability to the
> > guest stating that implicit devtlb invalidation is supported.
> > Otherwise, a Linux guest running on other hosts may find that implicit
> > devtlb invalidation is not supported.
> >
> > Right Yi?
>
> Thanks for explanation. So we are assumed to support all operations
> defined in spec, so no need to expose them one-by-one. For optimization,
> I'm fine to do it later.
yes. :-)
Regards,
Yi Liu
Hi,
On 4/1/20 9:13 AM, Liu, Yi L wrote:
>> From: Tian, Kevin <[email protected]>
>> Sent: Wednesday, April 1, 2020 2:30 PM
>> To: Jacob Pan <[email protected]>
>> Subject: RE: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function
>>
>>> From: Jacob Pan <[email protected]>
>>> Sent: Wednesday, April 1, 2020 4:58 AM
>>>
>>> On Tue, 31 Mar 2020 02:49:21 +0000
>>> "Tian, Kevin" <[email protected]> wrote:
>>>
>>>>> From: Auger Eric <[email protected]>
>>>>> Sent: Sunday, March 29, 2020 11:34 PM
>>>>>
>>>>> Hi,
>>>>>
>>>>> On 3/28/20 11:01 AM, Tian, Kevin wrote:
>>>>>>> From: Jacob Pan <[email protected]>
>>>>>>> Sent: Saturday, March 21, 2020 7:28 AM
>>>>>>>
>>>>>>> When Shared Virtual Address (SVA) is enabled for a guest OS via
>>>>>>> vIOMMU, we need to provide invalidation support at IOMMU API
>>>>>>> and
>>>>> driver
>>>>>>> level. This patch adds Intel VT-d specific function to
>>>>>>> implement iommu passdown invalidate API for shared virtual address.
>>>>>>>
>>>>>>> The use case is for supporting caching structure invalidation
>>>>>>> of assigned SVM capable devices. Emulated IOMMU exposes queue
>>>> [...]
>>>> [...]
>>>>> to
>>>>>>> + * VT-d granularity. Invalidation is typically included in the
>>>>>>> unmap
>>>>> operation
>>>>>>> + * as a result of DMA or VFIO unmap. However, for assigned
>>>>>>> devices
>>>>> guest
>>>>>>> + * owns the first level page tables. Invalidations of
>>>>>>> translation caches in
>>>>> the
>>>> [...]
>>>> [...]
>>>> [...]
>>>>>
>>>>>>> inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
>>>>>>> + /*
>>>>>>> + * PASID based IOTLB invalidation: PASID selective (per
>>>>>>> PASID),
>>>>>>> + * page selective (address granularity)
>>>>>>> + */
>>>>>>> + {0, 1, 1},
>>>>>>> + /* PASID based dev TLBs, only support all PASIDs or
>>>>>>> single PASID */
>>>>>>> + {1, 1, 0},
>>>>>>
>>>>>> Is this combination correct? when single PASID is being
>>>>>> specified, it is essentially a page-selective invalidation since
>>>>>> you need provide Address and Size.
>>>>> Isn't it the same when G=1? Still the addr/size is used. Doesn't
>>>>> it
>>>>
>>>> I thought addr/size is not used when G=1, but it might be wrong. I'm
>>>> checking with our vt-d spec owner.
>>>>
>>>
>>>>> correspond to IOMMU_INV_GRANU_ADDR with IOMMU_INV_ADDR_FLAGS_PASID
>>>>> flag unset?
>>>>>
>>>>> so {0, 0, 1}?
>>>>
>>> I am not sure I got your logic. The three fields correspond to
>>> IOMMU_INV_GRANU_DOMAIN, /* domain-selective
>>> invalidation */
>>> IOMMU_INV_GRANU_PASID, /* PASID-selective invalidation */
>>> IOMMU_INV_GRANU_ADDR, /* page-selective invalidation *
>>>
>>> For devTLB, we use domain as global since there is no domain. Then I
>>> came up with {1, 1, 0}, which means we could have global and pasid
>>> granu invalidation for PASID based devTLB.
>>>
>>> If the caller also provide addr and S bit, the flush routine will put
>>
>> "also" -> "must", because vt-d requires addr/size must be provided in
>> devtlb
>> descriptor, that is why Eric suggests {0, 0, 1}.
>
> I think it should be {0, 0, 1} :-) The addr and S fields are mandatory;
> the pasid field depends on the G bit.
On my side, I understood from the spec that addr/S are always used
whatever the granularity, hence the above suggestion.
As a comparison, for PASID based IOTLB invalidation, it is clearly
stated that if G matches PASID selective invalidation, address field is
ignored. This is not written that way for PASID-based device TLB inv.
>
> I didn't read through all the comments yet, but here is a concern with this
> 2-D table: the iommu cache types are defined as below, and if
> IOMMU_CACHE_INV_TYPE_PASID were used directly as the index, it would go
> beyond the 2-D table.
>
> /* IOMMU paging structure cache */
> #define IOMMU_CACHE_INV_TYPE_IOTLB (1 << 0) /* IOMMU IOTLB */
> #define IOMMU_CACHE_INV_TYPE_DEV_IOTLB (1 << 1) /* Device IOTLB */
> #define IOMMU_CACHE_INV_TYPE_PASID (1 << 2) /* PASID cache */
> #define IOMMU_CACHE_INV_TYPE_NR (3)
oups indeed
Thanks
Eric
>
>>>
>>>> I have one more open:
>>>>
>>>> How does userspace know which invalidation type/gran is supported?
>>>> I didn't see such capability reporting in Yi's VFIO vSVA patch set.
>>>> Do we want the user/kernel assume the same capability set if they
>>>> are architectural? However the kernel could also do some
>>>> optimization e.g. hide devtlb invalidation capability given that the
>>>> kernel already invalidate devtlb automatically when serving iotlb
>>>> invalidation...
>>>>
>>> In general, we are trending to use VFIO capability chain to expose
>>> iommu capabilities.
>>>
>>> But for architectural features such as type/granu, we have to assume
>>> the same capability between host & guest. Granu and types are not
>>> enumerated on the host IOMMU either.
>>>
>>> For devTLB optimization, I agree we need to expose a capability to the
>>> guest stating that implicit devtlb invalidation is supported.
>>> Otherwise, a Linux guest running on other hosts may find that implicit
>>> devtlb invalidation is not supported.
>>>
>>> Right Yi?
>>
>> Thanks for explanation. So we are assumed to support all operations
>> defined in spec, so no need to expose them one-by-one. For optimization,
>> I'm fine to do it later.
>
> yes. :-)
>
> Regards,
> Yi Liu
>
On Wed, 1 Apr 2020 06:57:42 +0000
"Liu, Yi L" <[email protected]> wrote:
> > From: Tian, Kevin <[email protected]>
> > Sent: Wednesday, April 1, 2020 2:24 PM
> > To: Jacob Pan <[email protected]>
> > Subject: RE: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate
> > function
> > > From: Jacob Pan <[email protected]>
> > > Sent: Wednesday, April 1, 2020 2:14 AM
> > >
> > > On Sat, 28 Mar 2020 10:01:42 +0000
> > > "Tian, Kevin" <[email protected]> wrote:
> > >
> > > > > From: Jacob Pan <[email protected]>
> > > > > Sent: Saturday, March 21, 2020 7:28 AM
> > > > >
> > > > > When Shared Virtual Address (SVA) is enabled for a guest OS
> > > > > via vIOMMU, we need to provide invalidation support at IOMMU
> > > > > API and driver level. This patch adds Intel VT-d specific
> > > > > function to implement iommu passdown invalidate API for
> > > > > shared virtual address.
> > > > >
> > > > > The use case is for supporting caching structure invalidation
> > > > > of assigned SVM capable devices. Emulated IOMMU exposes
> > > > > queue
> > > >
> > > > emulated IOMMU -> vIOMMU, since virito-iommu could use the
> > > > interface as well.
> > > >
> > > True, but it does not invalidate this statement about emulated
> > > IOMMU. I will add another statement saying "the same interface
> > > can be used for virtio-IOMMU as well". OK?
> >
> > sure
> >
> > >
> > > > > invalidation capability and passes down all descriptors from
> > > > > the guest to the physical IOMMU.
> > > > >
> > > > > The assumption is that guest to host device ID mapping should
> > > > > be resolved prior to calling IOMMU driver. Based on the
> > > > > device handle, host IOMMU driver can replace certain fields
> > > > > before submit to the invalidation queue.
> > > > >
> > > > > ---
> > > > > v7 review fixed in v10
> > > > > ---
> > > > >
> > > > > Signed-off-by: Jacob Pan <[email protected]>
> > > > > Signed-off-by: Ashok Raj <[email protected]>
> > > > > Signed-off-by: Liu, Yi L <[email protected]>
> > > > > ---
> > > > > drivers/iommu/intel-iommu.c | 182
> > > > > ++++++++++++++++++++++++++++++++++++++++++++
> > > > > 1 file changed, 182 insertions(+)
> > > > >
> > > > > diff --git a/drivers/iommu/intel-iommu.c
> > > > > b/drivers/iommu/intel-iommu.c index b1477cd423dd..a76afb0fd51a
> > > > > 100644 --- a/drivers/iommu/intel-iommu.c
> > > > > +++ b/drivers/iommu/intel-iommu.c
> > > > > @@ -5619,6 +5619,187 @@ static void
> > > > > intel_iommu_aux_detach_device(struct iommu_domain *domain,
> > > > > aux_domain_remove_dev(to_dmar_domain(domain), dev);
> > > > > }
> > > > >
> > > > > +/*
> > > > > + * 2D array for converting and sanitizing IOMMU generic TLB
> > > > > granularity to
> > > > > + * VT-d granularity. Invalidation is typically included in
> > > > > the unmap operation
> > > > > + * as a result of DMA or VFIO unmap. However, for assigned
> > > > > devices guest
> > > > > + * owns the first level page tables. Invalidations of
> > > > > translation caches in the
> > > > > + * guest are trapped and passed down to the host.
> > > > > + *
> > > > > + * vIOMMU in the guest will only expose first level page
> > > > > tables, therefore
> > > > > + * we do not include IOTLB granularity for request without
> > > > > PASID (second level).
> > > >
> > > > I would revise above as "We do not support IOTLB granularity for
> > > > request without PASID (second level), therefore any vIOMMU
> > > > implementation that exposes the SVA capability to the guest
> > > > should only expose the first level page tables, implying all
> > > > invalidation requests from the guest will include a valid PASID"
> > > >
> > > Sounds good.
> > >
> > > > > + *
> > > > > + * For example, to find the VT-d granularity encoding for
> > > > > IOTLB
> > > > > + * type and page selective granularity within PASID:
> > > > > + * X: indexed by iommu cache type
> > > > > + * Y: indexed by enum iommu_inv_granularity
> > > > > + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
> > > > > + *
> > > > > + * Granu_map array indicates validity of the table. 1:
> > > > > valid, 0: invalid
> > > > > + *
> > > > > + */
> > > > > +const static int
> > > > > inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > > > > + /*
> > > > > + * PASID based IOTLB invalidation: PASID selective
> > > > > (per PASID),
> > > > > + * page selective (address granularity)
> > > > > + */
> > > > > + {0, 1, 1},
> > > > > + /* PASID based dev TLBs, only support all PASIDs or
> > > > > single PASID */
> > > > > + {1, 1, 0},
> > > >
> > > > Is this combination correct? when single PASID is being
> > > > specified, it is essentially a page-selective invalidation
> > > > since you need provide Address and Size.
> > > >
> > > This is for translation between generic UAPI granu to VT-d granu,
> > > it has nothing to do with address and size.
> >
> > Generic UAPI defines three granularities: domain, pasid and addr.
> > from the definition domain applies all entries related to did, pasid
> > applies to all entries related to pasid, while addr is specific for
> > a range.
> >
> > from what we just confirmed internally with VT-d spec owner, our
> > PASID based dev TLB invalidation always requires addr and size,
> > while current uAPI doesn't support multiple PASIDs based range
> > invalidation. It sounds to me that you want to use domain to replace
> > multiple PASIDs case (G=1), but it then changes the meaning of
> > the domain granularity and easily lead to confusion.
> >
> > I feel Eric's proposal makes more sense. Here we'd better use {0,
> > 0, 1} to indicate only addr range invalidation is allowed, matching
> > the spec definition. We may use a special flag in
> > iommu_inv_addr_info to indicate G=1 case, if necessary.
>
> I agree. The G=1 case should be supported. I think we had a flag for
> global since there is a GL bit in p_iotlb_inv_dsc (a.k.a.
> ext_iotlb_inv_dsc), but it was dropped when the 3.0 spec dropped the GL
> bit. Let's add it back for the DevTLB flush case.
>
Makes sense. I will change it to:
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5741,7 +5741,7 @@ const static int inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR]
*/
{0, 1, 1},
/* PASID based dev TLBs, only support all PASIDs or single PASID */
- {1, 1, 0},
+ {0, 0, 1},
/* PASID cache */
{1, 1, 0}
};
@@ -5750,7 +5750,7 @@ const static int inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_N
/* PASID based IOTLB */
{0, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
/* PASID based dev TLBs */
- {QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0},
+ {0, 0, QI_DEV_IOTLB_GRAN_PASID_SEL},
/* PASID cache */
{QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, 0},
};
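To see what the corrected rows give us, a self-contained userspace mock of
the lookup (the enums mirror the UAPI definitions quoted in this thread;
the IOTLB and PASID-cache encodings are placeholders of this sketch):

#include <assert.h>
#include <errno.h>

enum { GRANU_DOMAIN, GRANU_PASID, GRANU_ADDR, GRANU_NR };
enum { T_IOTLB, T_DEV_IOTLB, T_PASID, T_NR };	/* bit positions of the cache types */

#define QI_DEV_IOTLB_GRAN_PASID_SEL 0	/* as quoted earlier in the thread */

static const int granu_map[T_NR][GRANU_NR] = {
	{0, 1, 1},	/* PASID based IOTLB */
	{0, 0, 1},	/* PASID based dev TLBs: addr granularity only */
	{1, 1, 0},	/* PASID cache */
};

static const int granu_table[T_NR][GRANU_NR] = {
	{0, 2, 3},				/* placeholder encodings */
	{0, 0, QI_DEV_IOTLB_GRAN_PASID_SEL},
	{0, 1, 0},				/* placeholder encodings */
};

static int to_vtd_granularity(int type, int granu, int *vtd_granu)
{
	if (type >= T_NR || granu >= GRANU_NR || !granu_map[type][granu])
		return -EINVAL;
	*vtd_granu = granu_table[type][granu];
	return 0;
}

int main(void)
{
	int g;

	/* dev TLB: only the addr granularity passes the sanity check now */
	assert(to_vtd_granularity(T_DEV_IOTLB, GRANU_ADDR, &g) == 0 && g == 0);
	assert(to_vtd_granularity(T_DEV_IOTLB, GRANU_PASID, &g) == -EINVAL);
	assert(to_vtd_granularity(T_DEV_IOTLB, GRANU_DOMAIN, &g) == -EINVAL);
	return 0;
}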
> > > e.g.
> > > If user passes IOMMU_INV_GRANU_PASID for the single PASID case as
> > > you mentioned, this map table shows it is valid.
> > >
> > > Then the lookup result will get VT-d granu:
> > > QI_DEV_IOTLB_GRAN_PASID_SEL, which means G=0.
> > >
> > >
> > > > > + /* PASID cache */
> > > >
> > > > PASID cache is fully managed by the host. Guest PASID cache
> > > > invalidation is interpreted by vIOMMU for bind and unbind
> > > > operations. I don't think we should accept any PASID cache
> > > > invalidation from userspace or guest.
> > > >
> > >
> > > True for vIOMMU, this is here for completeness. Can be used by
> > > virtio IOMMU, since PC flush is inclusive (IOTLB, devTLB), it is
> > > more efficient.
> >
> > I think it is not correct in concept. We should not allow the
> > userspace or guest to request an operation which is beyond its
> > privilege (just because doing so may bring some performance
> > benefit). You can always introduce new cmd for such purpose.
>
> I guess it was added for the pasid table binding case? Now, our
> platform doesn't support it. So I guess we can just mark it as
> unsupported in the 2D table.
Sounds good.
On Wed, 1 Apr 2020 09:32:37 +0200
Auger Eric <[email protected]> wrote:
> >> devtlb
> >> descriptor, that is why Eric suggests {0, 0, 1}.
> >
> > I think it should be {0, 0, 1} :-) The addr and S fields are mandatory;
> > the pasid field depends on the G bit.
>
> On my side, I understood from the spec that addr/S are always used
> whatever the granularity, hence the above suggestion.
>
> As a comparison, for PASID based IOTLB invalidation, it is clearly
> stated that if G matches PASID selective invalidation, address field
> is ignored. This is not written that way for PASID-based device TLB
> inv.
> >
I misread the S bit. It all makes sense now. Thanks for the proposal
and explanation.
Jacob
On Wed, 1 Apr 2020 09:32:37 +0200
Auger Eric <[email protected]> wrote:
> > I didn't read through all the comments yet, but here is a concern with
> > this 2-D table: the iommu cache types are defined as below, and if
> > IOMMU_CACHE_INV_TYPE_PASID were used directly as the index, it would go
> > beyond the 2-D table.
> >
> > /* IOMMU paging structure cache */
> > #define IOMMU_CACHE_INV_TYPE_IOTLB (1 << 0) /* IOMMU IOTLB */
> > #define IOMMU_CACHE_INV_TYPE_DEV_IOTLB (1 << 1) /* Device IOTLB */
> > #define IOMMU_CACHE_INV_TYPE_PASID (1 << 2) /* PASID cache */
> > #define IOMMU_CACHE_INV_TYPE_NR (3)
> oups indeed
I think it is not an issue, since we use the bit position, not the raw cache
type value, as the index into the 2D array. Right?
for_each_set_bit(cache_type, (unsigned long *)&inv_info->cache, IOMMU_CACHE_INV_TYPE_NR)
	ret = to_vtd_granularity(cache_type, inv_info->granularity, &granu);

static inline int to_vtd_granularity(int type, int granu, int *vtd_granu)
{
	*vtd_granu = inv_type_granu_table[type][granu];
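A quick standalone check of that point, with for_each_set_bit() open-coded
for the sketch (the defines are the UAPI values quoted above):

#include <stdio.h>

#define IOMMU_CACHE_INV_TYPE_IOTLB	(1 << 0)	/* IOMMU IOTLB */
#define IOMMU_CACHE_INV_TYPE_DEV_IOTLB	(1 << 1)	/* Device IOTLB */
#define IOMMU_CACHE_INV_TYPE_PASID	(1 << 2)	/* PASID cache */
#define IOMMU_CACHE_INV_TYPE_NR		(3)

int main(void)
{
	unsigned long cache = IOMMU_CACHE_INV_TYPE_IOTLB |
			      IOMMU_CACHE_INV_TYPE_PASID;
	int cache_type;

	/* what for_each_set_bit(cache_type, &cache, NR) hands back */
	for (cache_type = 0; cache_type < IOMMU_CACHE_INV_TYPE_NR; cache_type++)
		if (cache & (1UL << cache_type))
			printf("row index %d\n", cache_type);	/* prints 0 and 2 */
	return 0;
}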