2024-02-08 08:23:23

by Yi Liu

[permalink] [raw]
Subject: [PATCH rc 0/8] Add missing cache flush and dirty tracking set for nested parent domain

This series adds the missing cache flushes and dirty tracking setup for a
nested parent domain, i.e. an s2_domain that is used as the parent of
nested domains. Such a parent has no insight into the devices and DIDs
under its nested domains (a.k.a. s1_domains). As a result, cache flushes
are missed when the parent domain changes, and dirty tracking is only
partially set on the parent domain. There was a discussion about this on
the mailing list [1].

This series adds a s1_domains list in the parent domain to track the nested
domains. Hence, the driver can loop the nested domains and use the DIDs of
the nested domain to flush iotlb. The driver can also loop the nested domains
and nested domain's devices list to flush device iotlb and set the dirty
tracking completely.

This series doesn't touch the pasid iotlb or the pasid devtlb as there is
no support of attaching pasid to nested domain yet. It will be covered when
that feature is enabled.

The complete code can be found at [2]. It has been tested with a hacky
Qemu [3] to exercise unmap on the parent domain and dirty tracking set on
the parent domain. This only verifies the new paths, so Tested-by tags
from regression testing would be appreciated.

This series targets the 6.8 rc cycle, so your special attention is welcome.

[1] https://lore.kernel.org/linux-iommu/[email protected]/
[2] https://github.com/yiliu1765/iommufd/tree/vtd_nesting_fixes
[3] https://github.com/yiliu1765/qemu/tree/tmp/for-testing-unmap-and-dirty-set-on-parent

base commit: 547ab8fc4cb04a1a6b34377dd8fad34cd2c8a8e3

Regards,
Yi Liu

Yi Liu (8):
iommu/vt-d: Track nested domains in parent
iommu/vt-d: Add __iommu_flush_iotlb_psi()
iommu/vt-d: Add missing iotlb flush for parent domain
iommu/vt-d: Update iotlb in nested domain attach
iommu/vt-d: Add missing device iotlb flush for parent domain
iommu/vt-d: Remove @domain parameter from
intel_pasid_setup_dirty_tracking()
iommu/vt-d: Wrap the dirty tracking loop to be a helper
iommu/vt-d: Add missing dirty tracking set for parent domain

drivers/iommu/intel/iommu.c | 213 ++++++++++++++++++++++++++---------
drivers/iommu/intel/iommu.h | 7 ++
drivers/iommu/intel/nested.c | 12 ++
drivers/iommu/intel/pasid.c | 3 +-
drivers/iommu/intel/pasid.h | 1 -
5 files changed, 182 insertions(+), 54 deletions(-)

--
2.34.1



2024-02-08 08:23:46

by Yi Liu

[permalink] [raw]
Subject: [PATCH rc 1/8] iommu/vt-d: Track nested domains in parent

Today the parent domain (s2_domain) is unaware of which DID's are
used by and which devices are attached to nested domains (s1_domain)
nested on it. This leads to a problem that some operations (flush
iotlb/devtlb and enable dirty tracking) on parent domain only apply to
DID's and devices directly tracked in the parent domain hence are
incomplete.

This tracks the nested domains in a list in the parent domain. With this,
operations on the parent domain can loop over the nested domains and refer
to their devices and iommu_array to ensure the operations on the parent
domain take effect on all the affected devices and iommus.

Signed-off-by: Yi Liu <[email protected]>
---
drivers/iommu/intel/iommu.c | 18 ++++++++++++++----
drivers/iommu/intel/iommu.h | 6 ++++++
drivers/iommu/intel/nested.c | 10 ++++++++++
3 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 6fb5f6fceea1..e393c62776f3 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3883,6 +3883,7 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
struct intel_iommu *iommu = info->iommu;
+ struct dmar_domain *dmar_domain;
struct iommu_domain *domain;

/* Must be NESTING domain */
@@ -3908,11 +3909,16 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
if (!domain)
return ERR_PTR(-ENOMEM);

- if (nested_parent)
- to_dmar_domain(domain)->nested_parent = true;
+ dmar_domain = to_dmar_domain(domain);
+
+ if (nested_parent) {
+ dmar_domain->nested_parent = true;
+ INIT_LIST_HEAD(&dmar_domain->s1_domains);
+ spin_lock_init(&dmar_domain->s1_lock);
+ }

if (dirty_tracking) {
- if (to_dmar_domain(domain)->use_first_level) {
+ if (dmar_domain->use_first_level) {
iommu_domain_free(domain);
return ERR_PTR(-EOPNOTSUPP);
}
@@ -3924,8 +3930,12 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags,

static void intel_iommu_domain_free(struct iommu_domain *domain)
{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+
+ WARN_ON(dmar_domain->nested_parent &&
+ !list_empty(&dmar_domain->s1_domains));
if (domain != &si_domain->domain)
- domain_exit(to_dmar_domain(domain));
+ domain_exit(dmar_domain);
}

int prepare_domain_attach_device(struct iommu_domain *domain,
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index d02f916d8e59..9b27edb73aa9 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -627,6 +627,10 @@ struct dmar_domain {
int agaw;
/* maximum mapped address */
u64 max_addr;
+ /* Protect the s1_domains list */
+ spinlock_t s1_lock;
+ /* Track s1_domains nested on this domain */
+ struct list_head s1_domains;
};

/* Nested user domain */
@@ -637,6 +641,8 @@ struct dmar_domain {
unsigned long s1_pgtbl;
/* page table attributes */
struct iommu_hwpt_vtd_s1 s1_cfg;
+ /* link to parent domain siblings */
+ struct list_head s2_link;
};
};

diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
index f26c7f1c46cc..42d4d222e70f 100644
--- a/drivers/iommu/intel/nested.c
+++ b/drivers/iommu/intel/nested.c
@@ -70,6 +70,12 @@ static int intel_nested_attach_dev(struct iommu_domain *domain,

static void intel_nested_domain_free(struct iommu_domain *domain)
{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct dmar_domain *s2_domain = dmar_domain->s2_domain;
+
+ spin_lock(&s2_domain->s1_lock);
+ list_del(&dmar_domain->s2_link);
+ spin_unlock(&s2_domain->s1_lock);
kfree(to_dmar_domain(domain));
}

@@ -201,5 +207,9 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
spin_lock_init(&domain->lock);
xa_init(&domain->iommu_array);

+ spin_lock(&s2_domain->s1_lock);
+ list_add(&domain->s2_link, &s2_domain->s1_domains);
+ spin_unlock(&s2_domain->s1_lock);
+
return &domain->domain;
}
--
2.34.1


2024-02-08 08:23:57

by Yi Liu

[permalink] [raw]
Subject: [PATCH rc 2/8] iommu/vt-d: Add __iommu_flush_iotlb_psi()

Add __iommu_flush_iotlb_psi() to do the psi iotlb flush with a DID input
rather than calculating it within the helper.

This is useful when flushing cache for parent domain which reuses DIDs of
its nested domains.

Signed-off-by: Yi Liu <[email protected]>
---
drivers/iommu/intel/iommu.c | 79 +++++++++++++++++++++----------------
1 file changed, 44 insertions(+), 35 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index e393c62776f3..eef6a187b651 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1368,6 +1368,47 @@ static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
spin_unlock_irqrestore(&domain->lock, flags);
}

+static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
+ unsigned long pfn, unsigned int pages,
+ int ih)
+{
+ unsigned int aligned_pages = __roundup_pow_of_two(pages);
+ unsigned int mask = ilog2(aligned_pages);
+ uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
+ unsigned long bitmask = aligned_pages - 1;
+
+ /*
+ * PSI masks the low order bits of the base address. If the
+ * address isn't aligned to the mask, then compute a mask value
+ * needed to ensure the target range is flushed.
+ */
+ if (unlikely(bitmask & pfn)) {
+ unsigned long end_pfn = pfn + pages - 1, shared_bits;
+
+ /*
+ * Since end_pfn <= pfn + bitmask, the only way bits
+ * higher than bitmask can differ in pfn and end_pfn is
+ * by carrying. This means after masking out bitmask,
+ * high bits starting with the first set bit in
+ * shared_bits are all equal in both pfn and end_pfn.
+ */
+ shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
+ mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
+ }
+
+ /*
+ * Fallback to domain selective flush if no PSI support or
+ * the size is too big.
+ */
+ if (!cap_pgsel_inv(iommu->cap) ||
+ mask > cap_max_amask_val(iommu->cap))
+ iommu->flush.flush_iotlb(iommu, did, 0, 0,
+ DMA_TLB_DSI_FLUSH);
+ else
+ iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
+ DMA_TLB_PSI_FLUSH);
+}
+
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
struct dmar_domain *domain,
unsigned long pfn, unsigned int pages,
@@ -1384,42 +1425,10 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
if (ih)
ih = 1 << 6;

- if (domain->use_first_level) {
+ if (domain->use_first_level)
domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
- } else {
- unsigned long bitmask = aligned_pages - 1;
-
- /*
- * PSI masks the low order bits of the base address. If the
- * address isn't aligned to the mask, then compute a mask value
- * needed to ensure the target range is flushed.
- */
- if (unlikely(bitmask & pfn)) {
- unsigned long end_pfn = pfn + pages - 1, shared_bits;
-
- /*
- * Since end_pfn <= pfn + bitmask, the only way bits
- * higher than bitmask can differ in pfn and end_pfn is
- * by carrying. This means after masking out bitmask,
- * high bits starting with the first set bit in
- * shared_bits are all equal in both pfn and end_pfn.
- */
- shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
- mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
- }
-
- /*
- * Fallback to domain selective flush if no PSI support or
- * the size is too big.
- */
- if (!cap_pgsel_inv(iommu->cap) ||
- mask > cap_max_amask_val(iommu->cap))
- iommu->flush.flush_iotlb(iommu, did, 0, 0,
- DMA_TLB_DSI_FLUSH);
- else
- iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
- DMA_TLB_PSI_FLUSH);
- }
+ else
+ __iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih);

/*
* In caching mode, changes of pages from non-present to present require
--
2.34.1


2024-02-08 08:24:16

by Yi Liu

[permalink] [raw]
Subject: [PATCH rc 3/8] iommu/vt-d: Add missing iotlb flush for parent domain

If a domain is used as the parent in nested translation, its mappings might
be cached using the DID of the nested domain. But the existing code ignores
this fact and only invalidates the iotlb entries tagged by the domain's own
DID.

Loop the s1_domains list, if any, to invalidate all iotlb entries related
to the target s2 address range. According to VT-d spec there is no need for
software to explicitly flush the affected s1 cache. It's implicitly done by
HW when s2 cache is invalidated.

Fixes: b41e38e22539 ("iommu/vt-d: Add nested domain allocation")
Signed-off-by: Yi Liu <[email protected]>
---
drivers/iommu/intel/iommu.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index eef6a187b651..46174ff5ae22 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1452,6 +1452,28 @@ static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *
iommu_flush_write_buffer(iommu);
}

+/*
+ * Flush the relevant caches in nested translation if the domain
+ * also serves as a parent
+ */
+static void parent_domain_flush(struct dmar_domain *domain,
+ unsigned long pfn,
+ unsigned long pages, int ih)
+{
+ struct dmar_domain *s1_domain;
+
+ spin_lock(&domain->s1_lock);
+ list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
+ struct iommu_domain_info *info;
+ unsigned long i;
+
+ xa_for_each(&s1_domain->iommu_array, i, info)
+ __iommu_flush_iotlb_psi(info->iommu, info->did,
+ pfn, pages, ih);
+ }
+ spin_unlock(&domain->s1_lock);
+}
+
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
@@ -1471,6 +1493,9 @@ static void intel_flush_iotlb_all(struct iommu_domain *domain)
if (!cap_caching_mode(iommu->cap))
iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
}
+
+ if (dmar_domain->nested_parent)
+ parent_domain_flush(dmar_domain, 0, -1, 0);
}

static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
@@ -1994,6 +2019,9 @@ static void switch_to_super_page(struct dmar_domain *domain,
iommu_flush_iotlb_psi(info->iommu, domain,
start_pfn, lvl_pages,
0, 0);
+ if (domain->nested_parent)
+ parent_domain_flush(domain, start_pfn,
+ lvl_pages, 0);
}

pte++;
@@ -4126,6 +4154,9 @@ static void intel_iommu_tlb_sync(struct iommu_domain *domain,
start_pfn, nrpages,
list_empty(&gather->freelist), 0);

+ if (dmar_domain->nested_parent)
+ parent_domain_flush(dmar_domain, start_pfn, nrpages,
+ list_empty(&gather->freelist));
put_pages_list(&gather->freelist);
}

--
2.34.1


2024-02-08 08:24:49

by Yi Liu

[permalink] [raw]
Subject: [PATCH rc 5/8] iommu/vt-d: Add missing device iotlb flush for parent domain

ATS-capable devices cache the result of nested translation. This result
relies on the mappings in the s2 domain (a.k.a. parent). When there are
modifications in the s2 domain, the related nested translation caches on
the device should be flushed. This includes the devices that are attached
to the s1 domain. However, the existing code ignores this fact and only
loops over the s2 domain's own devices.

As there is no easy way to identify the exact set of nested translations
affected by a change to the s2 domain, this just flushes the entire
device iotlb on the device.

As above, driver loops the s2 domain's s1_domains list and loops the devices
list of each s1_domain to flush the entire device iotlb on the devices.

Fixes: b41e38e22539 ("iommu/vt-d: Add nested domain allocation")
Signed-off-by: Yi Liu <[email protected]>
---
drivers/iommu/intel/iommu.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index b1ceebe13107..c5a0275697cb 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1462,12 +1462,30 @@ static void parent_domain_flush(struct dmar_domain *domain,

spin_lock(&domain->s1_lock);
list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
+ struct device_domain_info *device_info;
struct iommu_domain_info *info;
+ unsigned long flags;
unsigned long i;

xa_for_each(&s1_domain->iommu_array, i, info)
__iommu_flush_iotlb_psi(info->iommu, info->did,
pfn, pages, ih);
+
+ if (!s1_domain->has_iotlb_device)
+ continue;
+
+ spin_lock_irqsave(&s1_domain->lock, flags);
+ list_for_each_entry(device_info, &s1_domain->devices, link)
+ /*
+ * Address translation cache in device side caches the
+ * result of nested translation. There is no easy way
+ * to identify the exact set of nested translations
+ * affected by a change in S2. So just flush the entire
+ * device cache.
+ */
+ __iommu_flush_dev_iotlb(device_info, 0,
+ MAX_AGAW_PFN_WIDTH);
+ spin_unlock_irqrestore(&s1_domain->lock, flags);
}
spin_unlock(&domain->s1_lock);
}
--
2.34.1


2024-02-08 08:25:48

by Yi Liu

[permalink] [raw]
Subject: [PATCH rc 7/8] iommu/vt-d: Wrap the dirty tracking loop to be a helper

Add device_set_dirty_tracking() to loop all the devices and set the dirty
tracking per the @enable parameter.

Signed-off-by: Yi Liu <[email protected]>
---
drivers/iommu/intel/iommu.c | 32 +++++++++++++++++++++-----------
1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index dae20991e036..7636d3f03905 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4730,23 +4730,35 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
return vtd;
}

+static int
+device_set_dirty_tracking(struct list_head *devices, bool enable)
+{
+ struct device_domain_info *info;
+ int ret = 0;
+
+ list_for_each_entry(info, devices, link) {
+ ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
+ IOMMU_NO_PASID, enable);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
bool enable)
{
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct device_domain_info *info;
int ret;

spin_lock(&dmar_domain->lock);
if (dmar_domain->dirty_tracking == enable)
goto out_unlock;

- list_for_each_entry(info, &dmar_domain->devices, link) {
- ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
- IOMMU_NO_PASID, enable);
- if (ret)
- goto err_unwind;
- }
+ ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
+ if (ret)
+ goto err_unwind;

dmar_domain->dirty_tracking = enable;
out_unlock:
@@ -4755,10 +4767,8 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
return 0;

err_unwind:
- list_for_each_entry(info, &dmar_domain->devices, link)
- intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
- IOMMU_NO_PASID,
- dmar_domain->dirty_tracking);
+ device_set_dirty_tracking(&dmar_domain->devices,
+ dmar_domain->dirty_tracking);
spin_unlock(&dmar_domain->lock);
return ret;
}
--
2.34.1


2024-02-08 08:27:52

by Yi Liu

[permalink] [raw]
Subject: [PATCH rc 4/8] iommu/vt-d: Update iotlb in nested domain attach

The driver should call domain_update_iotlb() to update the has_iotlb_device
flag of the domain after attaching a device to a nested domain. Without it,
this flag is not set properly, which results in missing device TLB flushes.

Fixes: 9838f2bb6b6b ("iommu/vt-d: Set the nested domain to a device")
Signed-off-by: Yi Liu <[email protected]>
---
drivers/iommu/intel/iommu.c | 4 +---
drivers/iommu/intel/iommu.h | 1 +
drivers/iommu/intel/nested.c | 2 ++
3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 46174ff5ae22..b1ceebe13107 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -396,8 +396,6 @@ static int domain_update_device_node(struct dmar_domain *domain)
return nid;
}

-static void domain_update_iotlb(struct dmar_domain *domain);
-
/* Return the super pagesize bitmap if supported. */
static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
{
@@ -1218,7 +1216,7 @@ domain_lookup_dev_info(struct dmar_domain *domain,
return NULL;
}

-static void domain_update_iotlb(struct dmar_domain *domain)
+void domain_update_iotlb(struct dmar_domain *domain)
{
struct dev_pasid_info *dev_pasid;
struct device_domain_info *info;
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 9b27edb73aa9..4145c04cb1c6 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -1066,6 +1066,7 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
*/
#define QI_OPT_WAIT_DRAIN BIT(0)

+void domain_update_iotlb(struct dmar_domain *domain);
int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
void device_block_translation(struct device *dev);
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
index 42d4d222e70f..f068ecf72f5a 100644
--- a/drivers/iommu/intel/nested.c
+++ b/drivers/iommu/intel/nested.c
@@ -65,6 +65,8 @@ static int intel_nested_attach_dev(struct iommu_domain *domain,
list_add(&info->link, &dmar_domain->devices);
spin_unlock_irqrestore(&dmar_domain->lock, flags);

+ domain_update_iotlb(dmar_domain);
+
return 0;
}

--
2.34.1


2024-02-08 08:28:23

by Yi Liu

[permalink] [raw]
Subject: [PATCH rc 6/8] iommu/vt-d: Remove @domain parameter from intel_pasid_setup_dirty_tracking()

The only usage of input @domain is to get the domain id (DID) to flush
cache after setting dirty tracking. However, DID can be obtained from
the pasid entry. So no need to pass in domain. This can make this helper
cleaner when adding the missing dirty tracking for the parent domain,
which needs to use the DID of nested domain.

Signed-off-by: Yi Liu <[email protected]>
---
drivers/iommu/intel/iommu.c | 7 +++----
drivers/iommu/intel/pasid.c | 3 +--
drivers/iommu/intel/pasid.h | 1 -
3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index c5a0275697cb..dae20991e036 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4742,8 +4742,7 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
goto out_unlock;

list_for_each_entry(info, &dmar_domain->devices, link) {
- ret = intel_pasid_setup_dirty_tracking(info->iommu,
- info->domain, info->dev,
+ ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
IOMMU_NO_PASID, enable);
if (ret)
goto err_unwind;
@@ -4757,8 +4756,8 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,

err_unwind:
list_for_each_entry(info, &dmar_domain->devices, link)
- intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
- info->dev, IOMMU_NO_PASID,
+ intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
+ IOMMU_NO_PASID,
dmar_domain->dirty_tracking);
spin_unlock(&dmar_domain->lock);
return ret;
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 3239cefa4c33..a32d7e509842 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -428,7 +428,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
* Set up dirty tracking on a second only or nested translation type.
*/
int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
- struct dmar_domain *domain,
struct device *dev, u32 pasid,
bool enabled)
{
@@ -445,7 +444,7 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
return -ENODEV;
}

- did = domain_id_iommu(domain, iommu);
+ did = pasid_get_domain_id(pte);
pgtt = pasid_pte_get_pgtt(pte);
if (pgtt != PASID_ENTRY_PGTT_SL_ONLY &&
pgtt != PASID_ENTRY_PGTT_NESTED) {
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index 8d40d4c66e31..487ede039bdd 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -307,7 +307,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid);
int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
- struct dmar_domain *domain,
struct device *dev, u32 pasid,
bool enabled);
int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
--
2.34.1


2024-02-08 08:28:56

by Yi Liu

[permalink] [raw]
Subject: [PATCH rc 8/8] iommu/vt-d: Add missing dirty tracking set for parent domain

Setting dirty tracking for an s2 domain requires looping over all the
related devices and setting the dirty tracking enable bit in their PASID
table entries. This includes the devices that are attached to the nested
domains of an s2 domain if this s2 domain is used as a parent. However,
the existing dirty tracking set only loops over the s2 domain's own
devices, so it will miss dirty page logs in the parent domain.

Now, the parent domain tracks the nested domains, so it can loop the
nested domains and the devices attached to the nested domains to ensure
dirty tracking on the parent is set completely.

Fixes: b41e38e22539 ("iommu/vt-d: Add nested domain allocation")
Signed-off-by: Yi Sun <[email protected]>
Signed-off-by: Yi Liu <[email protected]>
---
drivers/iommu/intel/iommu.c | 36 ++++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 7636d3f03905..b4101712a0c3 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4746,6 +4746,36 @@ device_set_dirty_tracking(struct list_head *devices, bool enable)
return ret;
}

+static int
+parent_domain_set_dirty_tracking(struct dmar_domain *domain,
+ bool enable)
+{
+ struct dmar_domain *s1_domain;
+ unsigned long flags;
+ int ret;
+
+ spin_lock(&domain->s1_lock);
+ list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
+ spin_lock_irqsave(&s1_domain->lock, flags);
+ ret = device_set_dirty_tracking(&s1_domain->devices, enable);
+ spin_unlock_irqrestore(&s1_domain->lock, flags);
+ if (ret)
+ goto err_unwind;
+ }
+ spin_unlock(&domain->s1_lock);
+ return 0;
+
+err_unwind:
+ list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
+ spin_lock_irqsave(&s1_domain->lock, flags);
+ device_set_dirty_tracking(&s1_domain->devices,
+ domain->dirty_tracking);
+ spin_unlock_irqrestore(&s1_domain->lock, flags);
+ }
+ spin_unlock(&domain->s1_lock);
+ return ret;
+}
+
static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
bool enable)
{
@@ -4760,6 +4790,12 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
if (ret)
goto err_unwind;

+ if (dmar_domain->nested_parent) {
+ ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
+ if (ret)
+ goto err_unwind;
+ }
+
dmar_domain->dirty_tracking = enable;
out_unlock:
spin_unlock(&dmar_domain->lock);
--
2.34.1


2024-02-08 08:31:05

by Tian, Kevin

[permalink] [raw]
Subject: RE: [PATCH rc 2/8] iommu/vt-d: Add __iommu_flush_iotlb_psi()

> From: Liu, Yi L <[email protected]>
> Sent: Thursday, February 8, 2024 4:23 PM
>
> Add __iommu_flush_iotlb_psi() to do the psi iotlb flush with a DID input
> rather than calculating it within the helper.
>
> This is useful when flushing cache for parent domain which reuses DIDs of
> its nested domains.
>
> Signed-off-by: Yi Liu <[email protected]>

Reviewed-by: Kevin Tian <[email protected]>

2024-02-08 08:31:36

by Tian, Kevin

[permalink] [raw]
Subject: RE: [PATCH rc 1/8] iommu/vt-d: Track nested domains in parent

> From: Liu, Yi L <[email protected]>
> Sent: Thursday, February 8, 2024 4:23 PM
>
> static void intel_nested_domain_free(struct iommu_domain *domain)
> {
> + struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> + struct dmar_domain *s2_domain = dmar_domain->s2_domain;
> +
> + spin_lock(&s2_domain->s1_lock);
> + list_del(&dmar_domain->s2_link);
> + spin_unlock(&s2_domain->s1_lock);
> kfree(to_dmar_domain(domain));

use 'dmar_domain'.

Reviewed-by: Kevin Tian <[email protected]>

2024-02-08 08:39:10

by Tian, Kevin

[permalink] [raw]
Subject: RE: [PATCH rc 3/8] iommu/vt-d: Add missing iotlb flush for parent domain

> From: Liu, Yi L <[email protected]>
> Sent: Thursday, February 8, 2024 4:23 PM
>
> +/*
> + * Flush the relevant caches in nested translation if the domain
> + * also serves as a parent
> + */
> +static void parent_domain_flush(struct dmar_domain *domain,
> + unsigned long pfn,
> + unsigned long pages, int ih)
> +{
> + struct dmar_domain *s1_domain;
> +
> + spin_lock(&domain->s1_lock);
> + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
> + struct iommu_domain_info *info;
> + unsigned long i;
> +
> + xa_for_each(&s1_domain->iommu_array, i, info)
> + __iommu_flush_iotlb_psi(info->iommu, info->did,
> + pfn, pages, ih);
> + }

As Jason suggested before, this xarray lacks proper locking.

But given it's an rc fix I'm fine with it. @Baolu, we do need to fix it soon
so this problem won't expand further, as it does here.

Reviewed-by: Kevin Tian <[email protected]>

2024-02-08 08:40:46

by Tian, Kevin

[permalink] [raw]
Subject: RE: [PATCH rc 4/8] iommu/vt-d: Update iotlb in nested domain attach

> From: Liu, Yi L <[email protected]>
> Sent: Thursday, February 8, 2024 4:23 PM
>
> Should call domain_update_iotlb() to update the has_iotlb_device flag
> of the domain after attaching device to nested domain. Without it, this
> flag is not set properly and would result in missing device TLB flush.
>
> Fixes: 9838f2bb6b6b ("iommu/vt-d: Set the nested domain to a device")
> Signed-off-by: Yi Liu <[email protected]>

Reviewed-by: Kevin Tian <[email protected]>

2024-02-08 08:42:42

by Tian, Kevin

[permalink] [raw]
Subject: RE: [PATCH rc 5/8] iommu/vt-d: Add missing device iotlb flush for parent domain

> From: Liu, Yi L <[email protected]>
> Sent: Thursday, February 8, 2024 4:23 PM
>
> ATS-capable devices cache the result of nested translation. This result
> relies on the mappings in s2 domain (a.k.a. parent). When there are
> modifications in the s2 domain, the related nested translation caches on
> the device should be flushed. This includes the devices that are attached
> to the s1 domain. However, the existing code ignores this fact to only
> loops its own devices.
>
> As there is no easy way to identify the exact set of nested translations
> affected by the change of s2 domain. So, this just flushes the entire
> device iotlb on the device.
>
> As above, driver loops the s2 domain's s1_domains list and loops the devices
> list of each s1_domain to flush the entire device iotlb on the devices.
>
> Fixes: b41e38e22539 ("iommu/vt-d: Add nested domain allocation")
> Signed-off-by: Yi Liu <[email protected]>

Reviewed-by: Kevin Tian <[email protected]>

2024-02-08 08:45:42

by Tian, Kevin

[permalink] [raw]
Subject: RE: [PATCH rc 6/8] iommu/vt-d: Remove @domain parameter from intel_pasid_setup_dirty_tracking()

> From: Liu, Yi L <[email protected]>
> Sent: Thursday, February 8, 2024 4:23 PM
>
> The only usage of input @domain is to get the domain id (DID) to flush
> cache after setting dirty tracking. However, DID can be obtained from
> the pasid entry. So no need to pass in domain. This can make this helper
> cleaner when adding the missing dirty tracking for the parent domain,
> which needs to use the DID of nested domain.
>
> Signed-off-by: Yi Liu <[email protected]>

Reviewed-by: Kevin Tian <[email protected]>

2024-02-08 08:47:21

by Tian, Kevin

[permalink] [raw]
Subject: RE: [PATCH rc 7/8] iommu/vt-d: Wrap the dirty tracking loop to be a helper

> From: Liu, Yi L <[email protected]>
> Sent: Thursday, February 8, 2024 4:23 PM
>
> Add device_set_dirty_tracking() to loop all the devices and set the dirty
> tracking per the @enable parameter.
>
> Signed-off-by: Yi Liu <[email protected]>

Reviewed-by: Kevin Tian <[email protected]>

2024-02-08 08:56:42

by Tian, Kevin

[permalink] [raw]
Subject: RE: [PATCH rc 8/8] iommu/vt-d: Add missing dirty tracking set for parent domain

> From: Liu, Yi L <[email protected]>
> Sent: Thursday, February 8, 2024 4:23 PM
>
> @@ -4760,6 +4790,12 @@ static int intel_iommu_set_dirty_tracking(struct
> iommu_domain *domain,
> if (ret)
> goto err_unwind;
>
> + if (dmar_domain->nested_parent) {
> + ret = parent_domain_set_dirty_tracking(dmar_domain,
> enable);
> + if (ret)
> + goto err_unwind;
> + }
> +

Setting dirty tracking is also missing when a device is attached to the
nested domain while the parent already has dirty tracking enabled.

But it looks like this is missed in the attach path even without nesting.

It could be fixed separately, so for this one:

Reviewed-by: Kevin Tian <[email protected]>

2024-02-08 09:20:31

by Yi Liu

[permalink] [raw]
Subject: Re: [PATCH rc 1/8] iommu/vt-d: Track nested domains in parent



On 2024/2/8 16:28, Tian, Kevin wrote:
>> From: Liu, Yi L <[email protected]>
>> Sent: Thursday, February 8, 2024 4:23 PM
>>
>> static void intel_nested_domain_free(struct iommu_domain *domain)
>> {
>> + struct dmar_domain *dmar_domain = to_dmar_domain(domain);
>> + struct dmar_domain *s2_domain = dmar_domain->s2_domain;
>> +
>> + spin_lock(&s2_domain->s1_lock);
>> + list_del(&dmar_domain->s2_link);
>> + spin_unlock(&s2_domain->s1_lock);
>> kfree(to_dmar_domain(domain));
>
> use 'dmar_domain'.

Oops. yes it is.

> Reviewed-by: Kevin Tian <[email protected]>

--
Regards,
Yi Liu

2024-02-08 09:21:38

by Yi Liu

[permalink] [raw]
Subject: Re: [PATCH rc 8/8] iommu/vt-d: Add missing dirty tracking set for parent domain

On 2024/2/8 16:53, Tian, Kevin wrote:
>> From: Liu, Yi L <[email protected]>
>> Sent: Thursday, February 8, 2024 4:23 PM
>>
>> @@ -4760,6 +4790,12 @@ static int intel_iommu_set_dirty_tracking(struct
>> iommu_domain *domain,
>> if (ret)
>> goto err_unwind;
>>
>> + if (dmar_domain->nested_parent) {
>> + ret = parent_domain_set_dirty_tracking(dmar_domain,
>> enable);
>> + if (ret)
>> + goto err_unwind;
>> + }
>> +
>
> there also lack of setting dirty tracking when a device is attached to the
> nested domain while the parent already has dirty-tracking enabled.
>
> but looks even w/o nesting this is missed in the attach path.
>
> could be fixed separately. so for this one:

it's fixed here.
https://lore.kernel.org/linux-iommu/[email protected]/

> Reviewed-by: Kevin Tian <[email protected]>

--
Regards,
Yi Liu

2024-02-08 10:31:06

by Joao Martins

[permalink] [raw]
Subject: Re: [PATCH rc 7/8] iommu/vt-d: Wrap the dirty tracking loop to be a helper

On 08/02/2024 08:23, Yi Liu wrote:
> Add device_set_dirty_tracking() to loop all the devices and set the dirty
> tracking per the @enable parameter.
>
> Signed-off-by: Yi Liu <[email protected]>

Nice cleanup,

Reviewed-by: Joao Martins <[email protected]>

> ---
> drivers/iommu/intel/iommu.c | 32 +++++++++++++++++++++-----------
> 1 file changed, 21 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index dae20991e036..7636d3f03905 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -4730,23 +4730,35 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
> return vtd;
> }
>
> +static int
> +device_set_dirty_tracking(struct list_head *devices, bool enable)
> +{
> + struct device_domain_info *info;
> + int ret = 0;
> +
> + list_for_each_entry(info, devices, link) {
> + ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
> + IOMMU_NO_PASID, enable);
> + if (ret)
> + break;
> + }
> +
> + return ret;
> +}
> +
> static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
> bool enable)
> {
> struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> - struct device_domain_info *info;
> int ret;
>
> spin_lock(&dmar_domain->lock);
> if (dmar_domain->dirty_tracking == enable)
> goto out_unlock;
>
> - list_for_each_entry(info, &dmar_domain->devices, link) {
> - ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
> - IOMMU_NO_PASID, enable);
> - if (ret)
> - goto err_unwind;
> - }
> + ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
> + if (ret)
> + goto err_unwind;
>
> dmar_domain->dirty_tracking = enable;
> out_unlock:
> @@ -4755,10 +4767,8 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
> return 0;
>
> err_unwind:
> - list_for_each_entry(info, &dmar_domain->devices, link)
> - intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
> - IOMMU_NO_PASID,
> - dmar_domain->dirty_tracking);
> + device_set_dirty_tracking(&dmar_domain->devices,
> + dmar_domain->dirty_tracking);
> spin_unlock(&dmar_domain->lock);
> return ret;
> }


2024-02-08 11:04:27

by Joao Martins

[permalink] [raw]
Subject: Re: [PATCH rc 6/8] iommu/vt-d: Remove @domain parameter from intel_pasid_setup_dirty_tracking()

On 08/02/2024 08:23, Yi Liu wrote:
> The only usage of input @domain is to get the domain id (DID) to flush
> cache after setting dirty tracking. However, DID can be obtained from
> the pasid entry. So no need to pass in domain. This can make this helper
> cleaner when adding the missing dirty tracking for the parent domain,
> which needs to use the DID of nested domain.
>
> Signed-off-by: Yi Liu <[email protected]>

Reviewed-by: Joao Martins <[email protected]>

> ---
> drivers/iommu/intel/iommu.c | 7 +++----
> drivers/iommu/intel/pasid.c | 3 +--
> drivers/iommu/intel/pasid.h | 1 -
> 3 files changed, 4 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index c5a0275697cb..dae20991e036 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -4742,8 +4742,7 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
> goto out_unlock;
>
> list_for_each_entry(info, &dmar_domain->devices, link) {
> - ret = intel_pasid_setup_dirty_tracking(info->iommu,
> - info->domain, info->dev,
> + ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
> IOMMU_NO_PASID, enable);
> if (ret)
> goto err_unwind;
> @@ -4757,8 +4756,8 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
>
> err_unwind:
> list_for_each_entry(info, &dmar_domain->devices, link)
> - intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
> - info->dev, IOMMU_NO_PASID,
> + intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
> + IOMMU_NO_PASID,
> dmar_domain->dirty_tracking);
> spin_unlock(&dmar_domain->lock);
> return ret;
> diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
> index 3239cefa4c33..a32d7e509842 100644
> --- a/drivers/iommu/intel/pasid.c
> +++ b/drivers/iommu/intel/pasid.c
> @@ -428,7 +428,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
> * Set up dirty tracking on a second only or nested translation type.
> */
> int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
> - struct dmar_domain *domain,
> struct device *dev, u32 pasid,
> bool enabled)
> {
> @@ -445,7 +444,7 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
> return -ENODEV;
> }
>
> - did = domain_id_iommu(domain, iommu);
> + did = pasid_get_domain_id(pte);
> pgtt = pasid_pte_get_pgtt(pte);
> if (pgtt != PASID_ENTRY_PGTT_SL_ONLY &&
> pgtt != PASID_ENTRY_PGTT_NESTED) {
> diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
> index 8d40d4c66e31..487ede039bdd 100644
> --- a/drivers/iommu/intel/pasid.h
> +++ b/drivers/iommu/intel/pasid.h
> @@ -307,7 +307,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
> struct dmar_domain *domain,
> struct device *dev, u32 pasid);
> int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
> - struct dmar_domain *domain,
> struct device *dev, u32 pasid,
> bool enabled);
> int intel_pasid_setup_pass_through(struct intel_iommu *iommu,


2024-02-09 02:41:30

by Lu Baolu

[permalink] [raw]
Subject: Re: [PATCH rc 7/8] iommu/vt-d: Wrap the dirty tracking loop to be a helper

On 2024/2/8 16:23, Yi Liu wrote:
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -4730,23 +4730,35 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
> return vtd;
> }
>
> +static int
> +device_set_dirty_tracking(struct list_head *devices, bool enable)
> +{
> + struct device_domain_info *info;
> + int ret = 0;
> +
> + list_for_each_entry(info, devices, link) {
> + ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
> + IOMMU_NO_PASID, enable);
> + if (ret)
> + break;
> + }
> +
> + return ret;
> +}

Let's make this helper specific. Something like below.

/*
* Set dirty tracking for the device list of a domain. The caller must
* hold the domain->lock when calling it.
*/
static int device_list_set_dirty_tracking(...)

Best regards,
baolu

2024-02-09 06:44:04

by Lu Baolu

[permalink] [raw]
Subject: Re: [PATCH rc 3/8] iommu/vt-d: Add missing iotlb flush for parent domain

On 2024/2/8 16:38, Tian, Kevin wrote:
>> From: Liu, Yi L<[email protected]>
>> Sent: Thursday, February 8, 2024 4:23 PM
>>
>> +/*
>> + * Flush the relevant caches in nested translation if the domain
>> + * also serves as a parent
>> + */
>> +static void parent_domain_flush(struct dmar_domain *domain,
>> + unsigned long pfn,
>> + unsigned long pages, int ih)
>> +{
>> + struct dmar_domain *s1_domain;
>> +
>> + spin_lock(&domain->s1_lock);
>> + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
>> + struct iommu_domain_info *info;
>> + unsigned long i;
>> +
>> + xa_for_each(&s1_domain->iommu_array, i, info)
>> + __iommu_flush_iotlb_psi(info->iommu, info->did,
>> + pfn, pages, ih);
>> + }
> As Jason suggested before this xarray lacks of proper locking.
>
> but given it's rc fix I'm fine with it. @Baolu we do need fix it soon so
> this problem won't further expand like here.

Yes. I planned to fix the locking issue in a separate series.

Best regards,
baolu

2024-02-21 15:20:07

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [PATCH rc 3/8] iommu/vt-d: Add missing iotlb flush for parent domain

On Thu, Feb 08, 2024 at 12:23:02AM -0800, Yi Liu wrote:
> If a domain is used as the parent in nested translation its mappings might
> be cached using DID of the nested domain. But the existing code ignores
> this fact to only invalidate the iotlb entries tagged by the domain's own
> DID.

> Loop the s1_domains list, if any, to invalidate all iotlb entries related
> to the target s2 address range. According to VT-d spec there is no need for
> software to explicitly flush the affected s1 cache. It's implicitly done by
> HW when s2 cache is invalidated.

I had to look this up to understand what it means.. The HW caches
per-DID and if you invalidate the DID's S2 then the HW flushes the S1
as well within that DID only.

It doesn't mean that the S2 is globally shared across all the nesting
translations (like ARM does), and you still have to iterate over every
nesting DID.

In light of that this design seems to have gone a bit off..

A domain should have a list of places that need invalidation,
specifically a list of DIDs and ATCs that need an invalidation to be
issued.

Instead we now somehow have 4 different lists in the domain the
invalidation code iterates over?

So I would think this:

struct dmar_domain {
struct xarray iommu_array; /* Attached IOMMU array */
struct list_head devices; /* all devices' list */
struct list_head dev_pasids; /* all attached pasids */
struct list_head s1_domains;

Would make sense to be collapsed into one logical list of attached
devices:

struct intel_iommu_domain_attachment {
unsigned int did;
ioasid_t pasid;
struct device_domain_info *info;
list_head item;
};

When you attach a S1/S2 nest you allocate two of the above structs and
one is linked on the S1 and one is linked on the S2..

Jason

2024-02-22 08:42:04

by Yi Liu

[permalink] [raw]
Subject: Re: [PATCH rc 3/8] iommu/vt-d: Add missing iotlb flush for parent domain

On 2024/2/21 23:19, Jason Gunthorpe wrote:
> On Thu, Feb 08, 2024 at 12:23:02AM -0800, Yi Liu wrote:
>> If a domain is used as the parent in nested translation its mappings might
>> be cached using DID of the nested domain. But the existing code ignores
>> this fact to only invalidate the iotlb entries tagged by the domain's own
>> DID.
>
>> Loop the s1_domains list, if any, to invalidate all iotlb entries related
>> to the target s2 address range. According to VT-d spec there is no need for
>> software to explicitly flush the affected s1 cache. It's implicitly done by
>> HW when s2 cache is invalidated.
>
> I had to look this up to understand what it means.. The HW caches
> per-DID and if you invalidate the DID's S2 then the HW flushes the S1
> as well within that DID only.

yes.

>
> It doesn't mean that the S2 is globally shared across all the nesting
> translations (like ARM does), and you still have to iterate over every
> nesting DID.
>
> In light of that this design seems to have gone a bit off..
>
> A domain should have a list of places that need invalidation,
> specifically a list of DIDs and ATCs that need an invalidation to be
> issued.
>
> Instead we now somehow have 4 different lists in the domain the
> invalidation code iterates over?
>
> So I would think this:
>
> struct dmar_domain {
> struct xarray iommu_array; /* Attached IOMMU array */
> struct list_head devices; /* all devices' list */
> struct list_head dev_pasids; /* all attached pasids */
> struct list_head s1_domains;
>
> Would make sense to be collapsed into one logical list of attached
> devices:
>
> struct intel_iommu_domain_attachment {
> unsigned int did;
> ioasid_t pasid;
> struct device_domain_info *info;
> list_head item;
> };
>
> When you attach a S1/S2 nest you allocate two of the above structs and
> one is linked on the S1 and one is linked on the S2..

yes, this shall work. ATC flushing should be fine. But there can be a
drawback when flushing DIDs. On the VT-d side, if there are multiple devices
behind the same iommu unit, a single DID flush is enough. But
after the above change, the number of DID flushes would increase with
the number of devices. Although there is no functional issue, it submits
duplicated invalidations.

--
Regards,
Yi Liu

2024-02-22 15:17:56

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [PATCH rc 3/8] iommu/vt-d: Add missing iotlb flush for parent domain

On Thu, Feb 22, 2024 at 04:34:10PM +0800, Yi Liu wrote:
> > It doesn't mean that the S2 is globally shared across all the nesting
> > translations (like ARM does), and you still have to iterate over every
> > nesting DID.
> >
> > In light of that this design seems to have gone a bit off..
> >
> > A domain should have a list of places that need invalidation,
> > specifically a list of DIDs and ATCs that need an invalidation to be
> > issued.
> >
> > Instead we now somehow have 4 different lists in the domain the
> > invalidation code iterates over?
> >
> > So I would think this:
> >
> > struct dmar_domain {
> > struct xarray iommu_array; /* Attached IOMMU array */
> > struct list_head devices; /* all devices' list */
> > struct list_head dev_pasids; /* all attached pasids */
> > struct list_head s1_domains;
> >
> > Would make sense to be collapsed into one logical list of attached
> > devices:
> >
> > struct intel_iommu_domain_attachment {
> > unsigned int did;
> > ioasid_t pasid;
> > struct device_domain_info *info;
> > list_head item;
> > };
> >
> > When you attach a S1/S2 nest you allocate two of the above structs and
> > one is linked on the S1 and one is linked on the S2..
>
> yes, this shall work. ATC flushing should be fine. But there can be a
> drawback when flushing DIDs. VT-d side, if there are multiple devices
> behind the same iommu unit, just need one DID flushing is enough. But
> after the above change, the number of DID flushing would increase per
> the number of devices. Although no functional issue, but it submits
> duplicated invalidation.

At least the three server drivers all have this same problem, I would
like to see some core code to help solve it, since it is tricky and
the RCU idea is good..

Collapsing invalidations can be done by sorting, I think this was
Robin's suggestion. But we could also easily maintain multiple list
threads hidden inside the API, or allocate a multi-level list.

Something very approximately like this:

Jason

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d14413916f93a0..7b2de139e7c437 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -38,6 +38,8 @@

#include "iommu-sva.h"

+#include <linux/iommu-alist.h>
+
static struct kset *iommu_group_kset;
static DEFINE_IDA(iommu_group_ida);
static DEFINE_IDA(iommu_global_pasid_ida);
diff --git a/include/linux/iommu-alist.h b/include/linux/iommu-alist.h
new file mode 100644
index 00000000000000..7a9243f8b2b5f8
--- /dev/null
+++ b/include/linux/iommu-alist.h
@@ -0,0 +1,126 @@
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/iommu.h>
+
+/*
+ * Place in an iommu_domain to keep track of what devices have been attached to
+ * it.
+ */
+struct iommu_attach_list {
+ spinlock_t lock;
+ struct list_head all;
+};
+
+/*
+ * Allocate for every time a device is attached to a domain
+ */
+struct iommu_attach_elm {
+ /*
+ * Note that this pointer is accessed under RCU, the driver has to be
+ * careful to rcu free it.
+ */
+ struct iommu_device *iommu_device;
+ ioasid_t pasid;
+ u8 using_atc;
+ struct list_head all_item;
+
+ /* May not be accessed under RCU! */
+ struct device *device;
+};
+
+void iommu_attach_init(struct iommu_attach_list *alist)
+{
+ spin_lock_init(&alist->lock);
+}
+
+int iommu_attach_add(struct iommu_attach_list *alist,
+ struct iommu_attach_elm *elm)
+{
+ struct list_head *insert_after;
+
+ spin_lock(&alist->lock);
+ insert_after = list_find_sorted_insertion(alist, elm, cmp);
+ list_add_rcu(&elm->all_item, insert_after);
+ spin_unlock(&alist->lock);
+}
+
+void iommu_attach_del_free(struct iommu_attach_list *alist, struct iommu_attach_elm *elm)
+{
+ spin_lock(&alist->lock);
+ list_del_rcu(&elm->all_item);
+ spin_unlock(&alist->lock);
+ /* assumes 0 offset */
+ kfree_rcu_mightsleep(elm);
+}
+
+static struct iommu_attach_elm *
+__iommu_attach_next_iommu(struct iommu_attach_list *alist,
+ struct iommu_attach_elm *pos,
+ struct iommu_attach_elm *last)
+{
+ struct iommu_attach_elm *next;
+
+ do {
+ next = list_next_or_null_rcu(&alist->all, &pos->all_item,
+ struct iommu_attach_elm, all_item);
+ if (!next)
+ return NULL;
+ if (!last)
+ return next;
+ } while (pos->iommu_device == next->iommu_device);
+ return next;
+}
+
+/* assumes 0 offset */
+#define iommu_attach_next_iommu(alist, pos, last, member) \
+ container_of(__iommu_attach_next_iommu(alist, &(pos)->member, \
+ &(last)->member), \
+ typeof(*pos), member)
+
+#define iommu_attach_for_each_iommu(pos, last, alist, member) \
+ for (({ \
+ RCU_LOCKDEP_WARN( \
+ !rcu_read_lock_any_held(), \
+ "RCU-list traversed in non-reader section!"); \
+ }), \
+ pos = list_first_or_null_rcu(&(alist)->all, typeof(*pos), \
+ member.all_item), \
+ last = NULL; \
+ pos; pos = iommu_attach_next_iommu(alist, pos, last, member))
+
+/* Example */
+struct driver_domain {
+ struct iommu_domain domain;
+ struct iommu_attach_list alist;
+};
+
+struct driver_attach_elm {
+ struct iommu_attach_elm aelm;
+ unsigned int cache_tag;
+};
+
+static void example(struct driver_domain *domain)
+{
+ struct driver_attach_elm *elm;
+ struct driver_attach_elm *pos, *last;
+ bool need_atc;
+
+ iommu_attach_init(&domain->alist);
+
+ elm = kzalloc(sizeof(*elm), GFP_KERNEL);
+ iommu_attach_add(&domain->alist, &elm->aelm);
+
+ rcu_read_lock();
+ iommu_attach_for_each_iommu(pos, last, &domain->alist, aelm) {
+ invalidate_iommu(elm);
+ need_atc |= elm->aelm.using_atc;
+ }
+ if (need_atc) {
+ list_for_each_entry_rcu(pos, &domain->alist.all,
+ aelm.all_item) {
+ if (pos->aelm.using_atc)
+ invalidate_atc(elm);
+ }
+ }
+ rcu_read_unlock();
+}