2022-09-22 10:09:44

by Niklas Schnelle

Subject: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

Since commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
calls") we can end up with duplicates in the list of devices attached to
a domain. This is inefficient and confusing since only one domain can
actually be in control of the IOMMU translations for a device. Fix this
by detaching the device from the previous domain, if any, on attach.
Add a WARN_ON() in case we still have attached devices on freeing the
domain.

Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls")
Signed-off-by: Niklas Schnelle <[email protected]>
---
Changes since v1:
- WARN_ON() non-empty list in s390_domain_free()
- Drop the found flag and instead WARN_ON() if we're detaching
from a domain that isn't the active domain for the device

drivers/iommu/s390-iommu.c | 81 ++++++++++++++++++++++----------------
1 file changed, 46 insertions(+), 35 deletions(-)

diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index c898bcbbce11..187d2c7ba9ff 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -78,19 +78,48 @@ static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
static void s390_domain_free(struct iommu_domain *domain)
{
struct s390_domain *s390_domain = to_s390_domain(domain);
+ unsigned long flags;

+ spin_lock_irqsave(&s390_domain->list_lock, flags);
+ WARN_ON(!list_empty(&s390_domain->devices));
+ spin_unlock_irqrestore(&s390_domain->list_lock, flags);
dma_cleanup_tables(s390_domain->dma_table);
kfree(s390_domain);
}

+static int __s390_iommu_detach_device(struct s390_domain *s390_domain,
+ struct zpci_dev *zdev)
+{
+ struct s390_domain_device *domain_device, *tmp;
+ unsigned long flags;
+
+ WARN_ON(zdev->s390_domain != s390_domain);
+ spin_lock_irqsave(&s390_domain->list_lock, flags);
+ list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices,
+ list) {
+ if (domain_device->zdev == zdev) {
+ list_del(&domain_device->list);
+ kfree(domain_device);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&s390_domain->list_lock, flags);
+
+ zpci_unregister_ioat(zdev, 0);
+ zdev->s390_domain = NULL;
+ zdev->dma_table = NULL;
+ return 0;
+}
+
static int s390_iommu_attach_device(struct iommu_domain *domain,
struct device *dev)
{
struct s390_domain *s390_domain = to_s390_domain(domain);
struct zpci_dev *zdev = to_zpci_dev(dev);
struct s390_domain_device *domain_device;
+ struct s390_domain *prev_domain = NULL;
unsigned long flags;
- int cc, rc;
+ int cc, rc = 0;

if (!zdev)
return -ENODEV;
@@ -99,16 +128,15 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
if (!domain_device)
return -ENOMEM;

- if (zdev->dma_table && !zdev->s390_domain) {
- cc = zpci_dma_exit_device(zdev);
- if (cc) {
+ if (zdev->s390_domain) {
+ prev_domain = zdev->s390_domain;
+ rc = __s390_iommu_detach_device(zdev->s390_domain, zdev);
+ } else if (zdev->dma_table) {
+ if (zpci_dma_exit_device(zdev))
rc = -EIO;
- goto out_free;
- }
}
-
- if (zdev->s390_domain)
- zpci_unregister_ioat(zdev, 0);
+ if (rc)
+ goto out_free;

zdev->dma_table = s390_domain->dma_table;
cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
@@ -129,7 +157,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
domain->geometry.aperture_end != zdev->end_dma) {
rc = -EINVAL;
spin_unlock_irqrestore(&s390_domain->list_lock, flags);
- goto out_restore;
+ goto out_unregister_restore;
}
domain_device->zdev = zdev;
zdev->s390_domain = s390_domain;
@@ -138,14 +166,15 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,

return 0;

+out_unregister_restore:
+ zpci_unregister_ioat(zdev, 0);
out_restore:
- if (!zdev->s390_domain) {
+ zdev->dma_table = NULL;
+ if (prev_domain)
+ s390_iommu_attach_device(&prev_domain->domain,
+ dev);
+ else
zpci_dma_init_device(zdev);
- } else {
- zdev->dma_table = zdev->s390_domain->dma_table;
- zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
- virt_to_phys(zdev->dma_table));
- }
out_free:
kfree(domain_device);

@@ -157,30 +186,12 @@ static void s390_iommu_detach_device(struct iommu_domain *domain,
{
struct s390_domain *s390_domain = to_s390_domain(domain);
struct zpci_dev *zdev = to_zpci_dev(dev);
- struct s390_domain_device *domain_device, *tmp;
- unsigned long flags;
- int found = 0;

if (!zdev)
return;

- spin_lock_irqsave(&s390_domain->list_lock, flags);
- list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices,
- list) {
- if (domain_device->zdev == zdev) {
- list_del(&domain_device->list);
- kfree(domain_device);
- found = 1;
- break;
- }
- }
- spin_unlock_irqrestore(&s390_domain->list_lock, flags);
-
- if (found && (zdev->s390_domain == s390_domain)) {
- zdev->s390_domain = NULL;
- zpci_unregister_ioat(zdev, 0);
+ if (!__s390_iommu_detach_device(s390_domain, zdev))
zpci_dma_init_device(zdev);
- }
}

static struct iommu_device *s390_iommu_probe_device(struct device *dev)
--
2.34.1


2022-09-22 15:10:13

by Jason Gunthorpe

Subject: Re: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

On Thu, Sep 22, 2022 at 11:52:37AM +0200, Niklas Schnelle wrote:
> Since commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
> calls") we can end up with duplicates in the list of devices attached to
> a domain. This is inefficient and confusing since only one domain can
> actually be in control of the IOMMU translations for a device. Fix this
> by detaching the device from the previous domain, if any, on attach.
> Add a WARN_ON() in case we still have attached devices on freeing the
> domain.
>
> Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls")
> Signed-off-by: Niklas Schnelle <[email protected]>
> ---
> Changes since v1:
> - WARN_ON() non-empty list in s390_domain_free()
> - Drop the found flag and instead WARN_ON() if we're detaching
> from a domain that isn't the active domain for the device
>
> drivers/iommu/s390-iommu.c | 81 ++++++++++++++++++++++----------------
> 1 file changed, 46 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
> index c898bcbbce11..187d2c7ba9ff 100644
> --- a/drivers/iommu/s390-iommu.c
> +++ b/drivers/iommu/s390-iommu.c
> @@ -78,19 +78,48 @@ static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
> static void s390_domain_free(struct iommu_domain *domain)
> {
> struct s390_domain *s390_domain = to_s390_domain(domain);
> + unsigned long flags;
>
> + spin_lock_irqsave(&s390_domain->list_lock, flags);
> + WARN_ON(!list_empty(&s390_domain->devices));
> + spin_unlock_irqrestore(&s390_domain->list_lock, flags);

Minor, but, this is about to free the memory holding the lock, we
don't need to take it to do the WARN_ON.. list_empty() is already
lockless safe.
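
Something like this (just a sketch, using only what is in the hunk
above):

static void s390_domain_free(struct iommu_domain *domain)
{
	struct s390_domain *s390_domain = to_s390_domain(domain);

	/*
	 * No locking needed: nothing can race with the free, and
	 * list_empty() is lockless safe anyway.
	 */
	WARN_ON(!list_empty(&s390_domain->devices));
	dma_cleanup_tables(s390_domain->dma_table);
	kfree(s390_domain);
}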

> static int __s390_iommu_detach_device(struct s390_domain *s390_domain,
> struct zpci_dev *zdev)
> {

This doesn't return a failure code anymore, make it void

> static int s390_iommu_attach_device(struct iommu_domain *domain,
> struct device *dev)
> {
> struct s390_domain *s390_domain = to_s390_domain(domain);
> struct zpci_dev *zdev = to_zpci_dev(dev);
> struct s390_domain_device *domain_device;
> + struct s390_domain *prev_domain = NULL;
> unsigned long flags;
> - int cc, rc;
> + int cc, rc = 0;
>
> if (!zdev)
> return -ENODEV;
> @@ -99,16 +128,15 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
> if (!domain_device)
> return -ENOMEM;
>
> - if (zdev->dma_table && !zdev->s390_domain) {
> - cc = zpci_dma_exit_device(zdev);
> - if (cc) {
> + if (zdev->s390_domain) {
> + prev_domain = zdev->s390_domain;
> + rc = __s390_iommu_detach_device(zdev->s390_domain, zdev);
> + } else if (zdev->dma_table) {
> + if (zpci_dma_exit_device(zdev))
> rc = -EIO;
> - goto out_free;
> - }
> }
> -
> - if (zdev->s390_domain)
> - zpci_unregister_ioat(zdev, 0);
> + if (rc)
> + goto out_free;
>
> zdev->dma_table = s390_domain->dma_table;
> cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
> @@ -129,7 +157,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
> domain->geometry.aperture_end != zdev->end_dma) {
> rc = -EINVAL;
> spin_unlock_irqrestore(&s390_domain->list_lock, flags);
> - goto out_restore;
> + goto out_unregister_restore;
> }
> domain_device->zdev = zdev;
> zdev->s390_domain = s390_domain;
> @@ -138,14 +166,15 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
>
> return 0;
>
> +out_unregister_restore:
> + zpci_unregister_ioat(zdev, 0);
> out_restore:
> - if (!zdev->s390_domain) {
> + zdev->dma_table = NULL;
> + if (prev_domain)
> + s390_iommu_attach_device(&prev_domain->domain,
> + dev);

Huh. That is a surprising thing

I think this function needs some re-ordering to avoid this condition

The checks for aperture should be earlier, and they are not quite
right. The aperture is only allowed to grow. If it starts out as 0 and
then is set to something valid on first attach, a later attach cannot
then shrink it. There could already be mappings in the domain under
the now invalidated aperture and no caller is prepared to deal with
this.

That leaves the only error case as zpci_register_ioat() - which seems
like it is the actual "attach" operation. Since
__s390_iommu_detach_device() is just internal accounting (and can't
fail) it should be moved after

So the logic order should be

1) Attempt to widen the aperture, if this fails the domain is
incompatible bail immediately

2) zpci_register_ioat() to make the new domain current in the HW

3) fixup the internal records to record the now current domain (eg
__s390_iommu_detach_device)

And some similar changing to the non-domain path..

No sketchy error unwind attempting to re-attach a domain..
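
Roughly, as a sketch only (locking and the non-domain path elided,
and treating __s390_iommu_detach_device() as pure accounting per the
above):

static int s390_iommu_attach_device(struct iommu_domain *domain,
				    struct device *dev)
{
	struct s390_domain *s390_domain = to_s390_domain(domain);
	struct zpci_dev *zdev = to_zpci_dev(dev);
	int cc;

	if (!zdev)
		return -ENODEV;

	/* 1) check the aperture, bail if the domain is incompatible */
	...

	/* 2) make the new domain current in the HW */
	cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
				virt_to_phys(s390_domain->dma_table));
	if (cc)
		return -EIO;

	/* 3) fix up the internal records, nothing can fail here */
	if (zdev->s390_domain)
		__s390_iommu_detach_device(zdev->s390_domain, zdev);
	zdev->dma_table = s390_domain->dma_table;
	zdev->s390_domain = s390_domain;
	/* ...and list_add() the device under s390_domain->list_lock */

	return 0;
}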

Jason

2022-09-26 09:21:02

by Niklas Schnelle

Subject: Re: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

On Thu, 2022-09-22 at 11:33 -0300, Jason Gunthorpe wrote:
> On Thu, Sep 22, 2022 at 11:52:37AM +0200, Niklas Schnelle wrote:
> > Since commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
> > calls") we can end up with duplicates in the list of devices attached to
> > a domain. This is inefficient and confusing since only one domain can
> > actually be in control of the IOMMU translations for a device. Fix this
> > by detaching the device from the previous domain, if any, on attach.
> > Add a WARN_ON() in case we still have attached devices on freeing the
> > domain.
> >
> > Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls")
> > Signed-off-by: Niklas Schnelle <[email protected]>
> > ---
> > Changes since v1:
> > - WARN_ON() non-empty list in s390_domain_free()
> > - Drop the found flag and instead WARN_ON() if we're detaching
> > from a domain that isn't the active domain for the device
> >
> > drivers/iommu/s390-iommu.c | 81 ++++++++++++++++++++++----------------
> > 1 file changed, 46 insertions(+), 35 deletions(-)
> >
> > diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
> > index c898bcbbce11..187d2c7ba9ff 100644
> > --- a/drivers/iommu/s390-iommu.c
> > +++ b/drivers/iommu/s390-iommu.c
> > @@ -78,19 +78,48 @@ static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
> > static void s390_domain_free(struct iommu_domain *domain)
> > {
> > struct s390_domain *s390_domain = to_s390_domain(domain);
> > + unsigned long flags;
> >
> > + spin_lock_irqsave(&s390_domain->list_lock, flags);
> > + WARN_ON(!list_empty(&s390_domain->devices));
> > + spin_unlock_irqrestore(&s390_domain->list_lock, flags);
>
> Minor, but, this is about to free the memory holding the lock, we
> don't need to take it to do the WARN_ON.. list_empty() is already
> lockless safe.

Makes sense.

>
> > static int __s390_iommu_detach_device(struct s390_domain *s390_domain,
> > struct zpci_dev *zdev)
> > {
>
> This doesn't return a failure code anymore, make it void
>
> > static int s390_iommu_attach_device(struct iommu_domain *domain,
> > struct device *dev)
> > {
> > struct s390_domain *s390_domain = to_s390_domain(domain);
> > struct zpci_dev *zdev = to_zpci_dev(dev);
> > struct s390_domain_device *domain_device;
> > + struct s390_domain *prev_domain = NULL;
> > unsigned long flags;
> > - int cc, rc;
> > + int cc, rc = 0;
> >
> > if (!zdev)
> > return -ENODEV;
> > @@ -99,16 +128,15 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
> > if (!domain_device)
> > return -ENOMEM;
> >
> > - if (zdev->dma_table && !zdev->s390_domain) {
> > - cc = zpci_dma_exit_device(zdev);
> > - if (cc) {
> > + if (zdev->s390_domain) {
> > + prev_domain = zdev->s390_domain;
> > + rc = __s390_iommu_detach_device(zdev->s390_domain, zdev);
> > + } else if (zdev->dma_table) {
> > + if (zpci_dma_exit_device(zdev))
> > rc = -EIO;
> > - goto out_free;
> > - }
> > }
> > -
> > - if (zdev->s390_domain)
> > - zpci_unregister_ioat(zdev, 0);
> > + if (rc)
> > + goto out_free;
> >
> > zdev->dma_table = s390_domain->dma_table;
> > cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
> > @@ -129,7 +157,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
> > domain->geometry.aperture_end != zdev->end_dma) {
> > rc = -EINVAL;
> > spin_unlock_irqrestore(&s390_domain->list_lock, flags);
> > - goto out_restore;
> > + goto out_unregister_restore;
> > }
> > domain_device->zdev = zdev;
> > zdev->s390_domain = s390_domain;
> > @@ -138,14 +166,15 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
> >
> > return 0;
> >
> > +out_unregister_restore:
> > + zpci_unregister_ioat(zdev, 0);
> > out_restore:
> > - if (!zdev->s390_domain) {
> > + zdev->dma_table = NULL;
> > + if (prev_domain)
> > + s390_iommu_attach_device(&prev_domain->domain,
> > + dev);
>
> Huh. That is a surprising thing
>
> I think this function needs some re-ordering to avoid this condition
>
> The checks for aperture should be earlier, and they are not quite
> right. The aperture is only allowed to grow. If it starts out as 0 and
> then is set to something valid on first attach, a later attach cannot
> then shrink it. There could already be mappings in the domain under
> the now invalidated aperture and no caller is prepared to deal with
> this.

Ohh I think this is indeed broken. Let me rephrase to see if I
understand correctly. You're saying that while we only allow exactly
matching apertures on additional attaches, we do allow shrinking if
there is temporarily no device attached to the domain. That part is
then broken because there could still be mappings outside the new
aperture stored in the translation tables?

>
> That leaves the only error case as zpci_register_ioat() - which seems
> like it is the actual "attach" operation. Since
> __s390_iommu_detach_device() is just internal accounting (and can't
> fail) it should be moved after
>
> So the logic order should be
>
> 1) Attempt to widen the aperture, if this fails the domain is
> incompatible bail immediately

Question. If the widening succeeds but we fail later during the attach
e.g. in 2) then the aperture remains widened or would that be rolled
back? Rolling this back seems bad at least if we can't hold the lock
over the entire process. So to do this properly it sounds to me like we
really want to get rid of the allocation (i.e. patch 3) and hold the
lock over the entire process. If we do that I think it would be okay to
keep enforcing equality for now, as it is only overly strict, not broken
like the shrinking.

>
> 2) zpci_register_ioat() to make the new domain current in the HW
>
> 3) fixup the internal records to record the now current domain (eg
> __s390_iommu_detach_device)
>
> And some similar changing to the non-domain path..
>
> No sketchy error unwind attempting to re-attach a domain..
>
> Jason

You make very good points and this sounds like it could simplify the
process too. We did have the aperture check moved to the beginning of
the function in an earlier version. The problem with that was the
raciness created by having to release and re-take the lock after the
allocation. So I think the best approach would be to roll the struct
s390_domain removal into the fix and do the steps as you describe
them, but with the lock held during the entire process, which we should
be able to do if there is no allocation.

2022-09-26 16:29:31

by Jason Gunthorpe

Subject: Re: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

On Mon, Sep 26, 2022 at 11:00:53AM +0200, Niklas Schnelle wrote:

> > > +out_unregister_restore:
> > > + zpci_unregister_ioat(zdev, 0);
> > > out_restore:
> > > - if (!zdev->s390_domain) {
> > > + zdev->dma_table = NULL;
> > > + if (prev_domain)
> > > + s390_iommu_attach_device(&prev_domain->domain,
> > > + dev);
> >
> > Huh. That is a surprising thing
> >
> > I think this function needs some re-ordering to avoid this condition
> >
> > The checks for aperture should be earlier, and they are not quite
> > right. The aperture is only allowed to grow. If it starts out as 0 and
> > then is set to something valid on first attach, a later attach cannot
> > then shrink it. There could already be mappings in the domain under
> > the now invalidated aperture and no caller is prepared to deal with
> > this.
>
> Ohh I think this is indeed broken. Let me rephrase to see if I
> understand correctly. You're saying that while we only allow exactly
> matching apertures on additional attaches, we do allow shrinking if
> there is temporarily no device attached to the domain. That part is
> then broken because there could still be mappings outside the new
> aperture stored in the translation tables?

Right, go from 0 -> sized aperture on first attach, and then once it
is sized it doesn't change again.

> > That leaves the only error case as zpci_register_ioat() - which seems
> > like it is the actual "attach" operation. Since
> > __s390_iommu_detach_device() is just internal accounting (and can't
> > fail) it should be moved after
> >
> > So the logic order should be
> >
> > 1) Attempt to widen the aperture, if this fails the domain is
> > incompatible bail immediately
>
> Question. If the widening succeeds but we fail later during the attach
> e.g. in 2) then the aperture remains widened or would that be rolled
> back?

I'd leave it widened.

IMHO I don't like this trick of setting the aperture on attach. It is
logically wrong. The aperture is part of the configuration of the page
table itself. The domain should know what page table format and thus
apterture it has the moment it is allocated. Usually this is the
related to the number of levels in the radix tree.

It seems to me that the issue here is trying to use the aperture when
the reserved region is the appropriate tool.

eg I see that s390_domain_alloc calls dma_alloc_cpu_table() which just
allocates a 3 level radix tree. This means it has a specific max
address that can be passed to dma_walk_cpu_trans(). So the aperture
should be fixed based on the radix tree parameters.

The device specific start/end should be represented as a reserved
regions per-device. See patch below..

This is meaningful because it effects when VFIO can share the domains
across devices. If devices have different reserved ranges we can still
share domains, so long as no mapping is placed in the union of the
reserved ranges. However if you vary the aperture, like is currently
happening, then the domains become unsharable.

diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index c898bcbbce118f..ba80325da76cd9 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -51,6 +51,12 @@ static bool s390_iommu_capable(enum iommu_cap cap)
}
}

+/*
+ * dma_alloc_cpu_table() creates a 3 level table, rtx, sx, px, so the address
+ * that can be passed to dma_walk_cpu_trans() must be less than this.
+ */
+#define MAX_DMA_TABLE_ADDR (ZPCI_TABLE_SIZE * ZPCI_TABLE_SIZE * ZPCI_PT_SIZE)
+
static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
{
struct s390_domain *s390_domain;
@@ -68,6 +74,10 @@ static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
return NULL;
}

+ s390_domain->domain.geometry.force_aperture = true;
+ s390_domain->domain.geometry.aperture_start = 0;
+ s390_domain->domain.geometry.aperture_end = MAX_DMA_TABLE_ADDR;
+
spin_lock_init(&s390_domain->dma_table_lock);
spin_lock_init(&s390_domain->list_lock);
INIT_LIST_HEAD(&s390_domain->devices);
@@ -119,18 +129,6 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
}

spin_lock_irqsave(&s390_domain->list_lock, flags);
- /* First device defines the DMA range limits */
- if (list_empty(&s390_domain->devices)) {
- domain->geometry.aperture_start = zdev->start_dma;
- domain->geometry.aperture_end = zdev->end_dma;
- domain->geometry.force_aperture = true;
- /* Allow only devices with identical DMA range limits */
- } else if (domain->geometry.aperture_start != zdev->start_dma ||
- domain->geometry.aperture_end != zdev->end_dma) {
- rc = -EINVAL;
- spin_unlock_irqrestore(&s390_domain->list_lock, flags);
- goto out_restore;
- }
domain_device->zdev = zdev;
zdev->s390_domain = s390_domain;
list_add(&domain_device->list, &s390_domain->devices);
@@ -152,6 +150,30 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
return rc;
}

+static void s390_iommu_get_resv_regions(struct device *dev,
+ struct list_head *list)
+{
+ struct zpci_dev *zdev = to_zpci_dev(dev);
+ struct iommu_resv_region *region;
+
+ if (zdev->start_dma) {
+ region = iommu_alloc_resv_region(0, zdev->start_dma, 0,
+ IOMMU_RESV_RESERVED);
+ if (!region)
+ return;
+ list_add_tail(&region->list, list);
+ }
+
+ if (zdev->end_dma < MAX_DMA_TABLE_ADDR) {
+ region = iommu_alloc_resv_region(
+ zdev->end_dma, MAX_DMA_TABLE_ADDR - zdev->end_dma, 0,
+ IOMMU_RESV_RESERVED);
+ if (!region)
+ return;
+ list_add_tail(&region->list, list);
+ }
+}
+
static void s390_iommu_detach_device(struct iommu_domain *domain,
struct device *dev)
{
@@ -376,6 +398,7 @@ static const struct iommu_ops s390_iommu_ops = {
.release_device = s390_iommu_release_device,
.device_group = generic_device_group,
.pgsize_bitmap = S390_IOMMU_PGSIZES,
+ .get_resv_regions = s390_iommu_get_resv_regions,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = s390_iommu_attach_device,
.detach_dev = s390_iommu_detach_device,

2022-09-26 16:29:37

by Niklas Schnelle

Subject: Re: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

On Thu, 2022-09-22 at 11:33 -0300, Jason Gunthorpe wrote:
> On Thu, Sep 22, 2022 at 11:52:37AM +0200, Niklas Schnelle wrote:
> > Since commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
> > calls") we can end up with duplicates in the list of devices attached to
> > a domain. This is inefficient and confusing since only one domain can
> > actually be in control of the IOMMU translations for a device. Fix this
> > by detaching the device from the previous domain, if any, on attach.
> > Add a WARN_ON() in case we still have attached devices on freeing the
> > domain.
> >
> > Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls")
> > Signed-off-by: Niklas Schnelle <[email protected]>
> > ---
> > Changes since v1:
> > - WARN_ON() non-empty list in s390_domain_free()
> > - Drop the found flag and instead WARN_ON() if we're detaching
> > from a domain that isn't the active domain for the device
> >
> > drivers/iommu/s390-iommu.c | 81 ++++++++++++++++++++++----------------
> > 1 file changed, 46 insertions(+), 35 deletions(-)
> >
> > diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
> > index c898bcbbce11..187d2c7ba9ff 100644
> > --- a/drivers/iommu/s390-iommu.c
> > +++ b/drivers/iommu/s390-iommu.c
> > @@ -78,19 +78,48 @@ static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
> > static void s390_domain_free(struct iommu_domain *domain)
> > {
> > struct s390_domain *s390_domain = to_s390_domain(domain);
> > + unsigned long flags;
> >
> > + spin_lock_irqsave(&s390_domain->list_lock, flags);
> > + WARN_ON(!list_empty(&s390_domain->devices));
> > + spin_unlock_irqrestore(&s390_domain->list_lock, flags);
>
> Minor, but, this is about to free the memory holding the lock, we
> don't need to take it to do the WARN_ON.. list_empty() is already
> lockless safe.
>
> > static int __s390_iommu_detach_device(struct s390_domain *s390_domain,
> > struct zpci_dev *zdev)
> > {
>
> This doesn't return a failure code anymore, make it void
>
> > static int s390_iommu_attach_device(struct iommu_domain *domain,
> > struct device *dev)
> > {
> > struct s390_domain *s390_domain = to_s390_domain(domain);
> > struct zpci_dev *zdev = to_zpci_dev(dev);
> > struct s390_domain_device *domain_device;
> > + struct s390_domain *prev_domain = NULL;
> > unsigned long flags;
> > - int cc, rc;
> > + int cc, rc = 0;
> >
> > if (!zdev)
> > return -ENODEV;
> > @@ -99,16 +128,15 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
> > if (!domain_device)
> > return -ENOMEM;
> >
> > - if (zdev->dma_table && !zdev->s390_domain) {
> > - cc = zpci_dma_exit_device(zdev);
> > - if (cc) {
> > + if (zdev->s390_domain) {
> > + prev_domain = zdev->s390_domain;
> > + rc = __s390_iommu_detach_device(zdev->s390_domain, zdev);
> > + } else if (zdev->dma_table) {
> > + if (zpci_dma_exit_device(zdev))
> > rc = -EIO;
> > - goto out_free;
> > - }
> > }
> > -
> > - if (zdev->s390_domain)
> > - zpci_unregister_ioat(zdev, 0);
> > + if (rc)
> > + goto out_free;
> >
> > zdev->dma_table = s390_domain->dma_table;
> > cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
> > @@ -129,7 +157,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
> > domain->geometry.aperture_end != zdev->end_dma) {
> > rc = -EINVAL;
> > spin_unlock_irqrestore(&s390_domain->list_lock, flags);
> > - goto out_restore;
> > + goto out_unregister_restore;
> > }
> > domain_device->zdev = zdev;
> > zdev->s390_domain = s390_domain;
> > @@ -138,14 +166,15 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
> >
> > return 0;
> >
> > +out_unregister_restore:
> > + zpci_unregister_ioat(zdev, 0);
> > out_restore:
> > - if (!zdev->s390_domain) {
> > + zdev->dma_table = NULL;
> > + if (prev_domain)
> > + s390_iommu_attach_device(&prev_domain->domain,
> > + dev);
>
> Huh. That is a surprising thing
>
> I think this function needs some re-ordering to avoid this condition
>
> The checks for aperture should be earlier, and they are not quite
> right. The aperture is only allowed to grow. If it starts out as 0 and
> then is set to something valid on first attach, a later attach cannot
> then shrink it. There could already be mappings in the domain under
> the now invalidated aperture and no caller is prepared to deal with
> this.
>
> That leaves the only error case as zpci_register_ioat() - which seems
> like it is the actual "attach" operation. Since
> __s390_iommu_detach_device() is just internal accounting (and can't
> fail) it should be moved after

I did miss a problem in my initial answer. While zpci_register_ioat()
is indeed the actual "attach" operation, it assumes that at that point
no DMA address translations are registered. In that state DMA is
blocked of course. With that zpci_register_ioat() needs to come after
the zpci_unregister_ioat() that is part of __s390_iommu_detach_device()
and zpci_dma_exit_device(). If we do call those though we fundamentally
need to restore the previous domain / DMA API state on any subsequent
failure. If we don't restore we would leave the device detached from
any domain with DMA blocked. I wonder if this could be an acceptable
failure state though? It's safe as no DMA is possible and we could get
out of it with a successful attach.
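
To make the ordering concrete (sketch only):

/* the old translation must be gone before a new one is registered */
zpci_unregister_ioat(zdev, 0);	/* part of __s390_iommu_detach_device() */
cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
			virt_to_phys(s390_domain->dma_table));
if (cc) {
	/*
	 * At this point the device is detached with DMA blocked; we
	 * either restore the previous domain / DMA API state or accept
	 * this as the failure state.
	 */
}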

2022-09-26 16:37:31

by Jason Gunthorpe

Subject: Re: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

On Mon, Sep 26, 2022 at 03:29:49PM +0200, Niklas Schnelle wrote:
> I did miss a problem in my initial answer. While zpci_register_ioat()
> is indeed the actual "attach" operation, it assumes that at that point
> no DMA address translations are registered. In that state DMA is
> blocked of course. With that zpci_register_ioat() needs to come after
> the zpci_unregister_ioat() that is part of __s390_iommu_detach_device()
> and zpci_dma_exit_device(). If we do call those though we fundamentally
> need to restore the previous domain / DMA API state on any subsequent
> failure. If we don't restore we would leave the device detached from
> any domain with DMA blocked. I wonder if this could be an acceptable
> failure state though? It's safe as no DMA is possible and we could get
> out of it with a successful attach.

Is this because of that FW call it does?

It seems like an FW API misdesign to not allow an unfailable change of
translation; if the FW forces an unregister first then there are
always error cases you can't correctly recover from.

IMHO if the FW fails an attach you are just busted, there is no reason
to think it would suddenly accept attaching the old domain just
because it has a different physical address, right?

So make it simple, leave it DMA blocked and throw a WARN_ON..

Talk to your FW people about fixing the API?

Jason

2022-09-27 16:41:45

by Niklas Schnelle

Subject: Re: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

On Mon, 2022-09-26 at 10:46 -0300, Jason Gunthorpe wrote:
> On Mon, Sep 26, 2022 at 11:00:53AM +0200, Niklas Schnelle wrote:
>
> > > > +out_unregister_restore:
> > > > + zpci_unregister_ioat(zdev, 0);
> > > > out_restore:
> > > > - if (!zdev->s390_domain) {
> > > > + zdev->dma_table = NULL;
> > > > + if (prev_domain)
> > > > + s390_iommu_attach_device(&prev_domain->domain,
> > > > + dev);
> > >
> > > Huh. That is a surprising thing
> > >
> > > I think this function needs some re-ordering to avoid this condition
> > >
> > > The checks for aperture should be earlier, and they are not quite
> > > right. The aperture is only allowed to grow. If it starts out as 0 and
> > > then is set to something valid on first attach, a later attach cannot
> > > then shrink it. There could already be mappings in the domain under
> > > the now invalidated aperture and no caller is prepared to deal with
> > > this.
> >
> > Ohh I think this is indeed broken. Let me rephrase to see if I
> > understand correctly. You're saying that while we only allow exactly
> > matching apertures on additional attaches, we do allow shrinking if
> > there is temporarily no device attached to the domain. That part is
> > then broken because there could still be mappings outside the new
> > aperture stored in the translation tables?
>
> Right, go from 0 -> sized aperture on first attach, and then once it
> is sized it doesn't change again.

Ok, some background. This is currently largely a theoretical issue as
all native PCI devices that currently exist on s390 have the same
zdev->start_dma (0x100000000) and we will set zdev->end_dma to
(zdev->start_dma + MEMORY_SIZE) in zpci_dma_init_device() to save space for
our IOVA allocation bitmap. So currently no shrinking can occur.

Still I think we should do this properly. Also by the way, there is a
known off-by-one (aperture too small) in the current checks for which I
was about to send a patch as part of the conversion to using
dma-iommu.c.

>
> > > That leaves the only error case as zpci_register_ioat() - which seems
> > > like it is the actual "attach" operation. Since
> > > __s390_iommu_detach_device() is just internal accounting (and can't
> > > fail) it should be moved after
> > >
> > > So the logic order should be
> > >
> > > 1) Attempt to widen the aperture, if this fails the domain is
> > > incompatible bail immediately
> >
> > Question. If the widening succeeds but we fail later during the attach
> > e.g. in 2) then the aperture remains widened or would that be rolled
> > back?
>
> I'd leave it widened.
>
> IMHO I don't like this trick of setting the aperture on attach. It is
> logically wrong. The aperture is part of the configuration of the page
> table itself. The domain should know what page table format and thus
> aperture it has the moment it is allocated. Usually this is related
> to the number of levels in the radix tree.
>
> It seems to me that the issue here is trying to use the aperture when
> the reserved region is the appropriate tool.
>
> eg I see that s390_domain_alloc calls dma_alloc_cpu_table() which just
> allocates a 3 level radix tree. This means it has a specific max
> address that can be passed to dma_walk_cpu_trans(). So the aperture
> should be fixed based on the radix tree parameters.

Yes, we even have the ZPCI_TABLE_SIZE_RT constant already that is the
maximum number of translations.

>
> The device specific start/end should be represented as a reserved
> regions per-device. See patch below..
>
> This is meaningful because it effects when VFIO can share the domains
> across devices. If devices have different reserved ranges we can still
> share domains, so long as no mapping is placed in the union of the
> reserved ranges. However if you vary the aperture, like is currently
> happening, then the domains become unsharable.

Ok, interesting idea.

As we haven't used these reserved regions so far and I'm not familiar
with what they are usually used for, I had a look at my x86_64 test box
(amd_iommu=on) and see the following:

# cat /sys/bus/pci/devices/*/iommu_group/reserved_regions | sort | uniq
0x00000000fee00000 0x00000000feefffff msi
0x000000fd00000000 0x000000ffffffffff reserved

Not sure what the non-MSI reservation is for? It does seem like x86_64
also uses this for quite large ranges.

With your patch and the off by one fixed (see below) on an s390x
machine with 32 GiB memory I then get the following:

# cat /sys/bus/pci/devices/*/iommu_group/reserved_regions | sort | uniq
0x0000000000000000 0x00000000ffffffff reserved
0x0000000900000000 0x000003ffffffffff reserved

As the upper limit is shown inclusive this looks correct to me.

Not all is rosy, though: while this seems to work with the current code
and KVM pass-through, it breaks my conversion for using dma-iommu.

This is because I'm getting a map request for an IOVA in the reserved
region. I pass the zdev->start_dma and zdev->end_dma to
iommu_setup_dma_ops(), but while iommu_dma_init_domain() uses the lower
limit for the base_pfn, it only checks whether the upper limit fits in
the aperture and otherwise ignores it. Looking at iommu_dma_alloc_iova(),
I don't think it avoids the reserved regions either, but it does use the
domain->geometry.aperture_end or dev->bus_dma_limit, whichever is
smaller.

With that knowledge, setting dev->bus_dma_limit to zdev->end_dma makes
my conversion work again. That feels wrong, but at least it confirms
the issue.
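
I.e. the workaround is just this one assignment (shown for illustration
only):

/* clamps IOVA allocation, but duplicates what the reserved region
 * should already express */
dev->bus_dma_limit = zdev->end_dma;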

>
> diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
> index c898bcbbce118f..ba80325da76cd9 100644
> --- a/drivers/iommu/s390-iommu.c
> +++ b/drivers/iommu/s390-iommu.c
> @@ -51,6 +51,12 @@ static bool s390_iommu_capable(enum iommu_cap cap)
> }
> }
>
>
---8<---
>
> +static void s390_iommu_get_resv_regions(struct device *dev,
> + struct list_head *list)
> +{
> + struct zpci_dev *zdev = to_zpci_dev(dev);
> + struct iommu_resv_region *region;
> +
> + if (zdev->start_dma) {
> + region = iommu_alloc_resv_region(0, zdev->start_dma, 0,
> + IOMMU_RESV_RESERVED);
> + if (!region)
> + return;
> + list_add_tail(&region->list, list);
> + }
> +
> + if (zdev->end_dma < MAX_DMA_TABLE_ADDR) {
> + region = iommu_alloc_resv_region(
> + zdev->end_dma, MAX_DMA_TABLE_ADDR - zdev->end_dma, 0,

Not your fault since there is a pre-existing bug in the range check but
I think there is an off-by-one error here as zdev->end_dma is the
highest usable address.

> + IOMMU_RESV_RESERVED);
> + if (!region)
> + return;
> + list_add_tail(&region->list, list);
> + }
> +}
> +
> static void s390_iommu_detach_device(struct iommu_domain *domain,
> struct device *dev)
> {
> @@ -376,6 +398,7 @@ static const struct iommu_ops s390_iommu_ops = {
> .release_device = s390_iommu_release_device,
> .device_group = generic_device_group,
> .pgsize_bitmap = S390_IOMMU_PGSIZES,
> + .get_resv_regions = s390_iommu_get_resv_regions,
> .default_domain_ops = &(const struct iommu_domain_ops) {
> .attach_dev = s390_iommu_attach_device,
> .detach_dev = s390_iommu_detach_device,


2022-09-27 16:47:03

by Jason Gunthorpe

Subject: Re: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

On Tue, Sep 27, 2022 at 06:24:54PM +0200, Niklas Schnelle wrote:
> On Mon, 2022-09-26 at 10:51 -0300, Jason Gunthorpe wrote:
> > On Mon, Sep 26, 2022 at 03:29:49PM +0200, Niklas Schnelle wrote:
> > > I did miss a problem in my initial answer. While zpci_register_ioat()
> > > is indeed the actual "attach" operation, it assumes that at that point
> > > no DMA address translations are registered. In that state DMA is
> > > blocked of course. With that zpci_register_ioat() needs to come after
> > > the zpci_unregister_ioat() that is part of __s390_iommu_detach_device()
> > > and zpci_dma_exit_device(). If we do call those though we fundamentally
> > > need to restore the previous domain / DMA API state on any subsequent
> > > failure. If we don't restore we would leave the device detached from
> > > any domain with DMA blocked. I wonder if this could be an acceptable
> > > failure state though? It's safe as no DMA is possible and we could get
> > > out of it with a successful attach.
> >
> > Is this because of that FW call it does?
> >
> > It seems like an FW API misdesign to not allow an unfailable change of
> > translation, if the FW forces an unregister first then there are
> > always error cases you can't correctly recover from.
> >
> > IMHO if the FW fails an attach you are just busted, there is no reason
> > to think it would suddenly accept attaching the old domain just
> > because it has a different physical address, right?
>
> While I can't come up with a case where an immediate retry would
> actually help, there is at least one error case that one should be able
> to recover from. The attach can fail if a firmware driven PCI device
> recovery is in progress. Now if that happens during a switch between
> domains I think one will have to do the equivalent of
>
> echo 0 > /sys/bus/pci/slots/<dev>/power; echo 1 > /.../power
>
> That of course tears down the whole PCI device so we don't have to
> answer the question if the old or new domain is the active one.
>
> So I think in consequence you're still right: attempting to re-attach
> is a lot of hassle for little or no gain.

I don't know about FW driven PCI device recovery, but if FW can
trigger some behavior that causes the kernel driver to malfunction
(and not having a DMA domain attached is malfunctioning) then that is
a WARN_ON condition, IMHO.

Especially if the FW driver recovery is done co-operatively with a
driver, then it is reasonable to demand no domains change while
recovery is ongoing.

Regardless, I still think this is a bug in the FW - it doesn't make
sense that a domain can be attached when FW device recovery starts,
and that the kernel can't change the domain while the FW device
recovery is ongoing. Presumably FW should block DMA during recovery
and just book-keep what the domain should be post-recovery.

Keep in mind, we already have some WARN_ONs on this path:

void iommu_group_release_dma_owner(struct iommu_group *group)
{
ret = __iommu_group_set_domain(group, group->default_domain);
WARN(ret, "iommu driver failed to attach the default domain");

Attach is really is not something that is able to fail in normal
cases..

> and the device is in a defined and isolated state. Maybe in the future
> we could even make this explicit and attach to the blocking domain on
> failed attach, does that make sense?

This would help somewhat: if the blocking domain is properly recorded
as the current group domain then things like
iommu_device_use_default_domain() will fail, meaning no drivers can be
attached to the device until it is hotplugged in/out.

However, this is hard to do properly because multi-device groups
cannot tolerate devices within the group having different current
domains.

Overall changing to the blocking domain or back to the default domain
should never fail, drivers should be designed with this in mind.

If fixing the FW is not feasible perhaps the better error recovery is
to sleep/retry until it succeeds.

Jason

2022-09-27 16:56:05

by Niklas Schnelle

Subject: Re: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

On Mon, 2022-09-26 at 10:51 -0300, Jason Gunthorpe wrote:
> On Mon, Sep 26, 2022 at 03:29:49PM +0200, Niklas Schnelle wrote:
> > I did miss a problem in my initial answer. While zpci_register_ioat()
> > is indeed the actual "attach" operation, it assumes that at that point
> > no DMA address translations are registered. In that state DMA is
> > blocked of course. With that zpci_register_ioat() needs to come after
> > the zpci_unregister_ioat() that is part of __s390_iommu_detach_device()
> > and zpci_dma_exit_device(). If we do call those though we fundamentally
> > need to restore the previous domain / DMA API state on any subsequent
> > failure. If we don't restore we would leave the device detached from
> > any domain with DMA blocked. I wonder if this could be an acceptable
> > failure state though? It's safe as no DMA is possible and we could get
> > out of it with a successful attach.
>
> Is this because of that FW call it does?
>
> It seems like an FW API misdesign to not allow an unfailable change of
> translation, if the FW forces an unregister first then there are
> always error cases you can't correctly recover from.
>
> IMHO if the FW fails an attach you are just busted, there is no reason
> to think it would suddenly accept attaching the old domain just
> because it has a different physical address, right?

While I can't come up with a case where an immediate retry would
actually help, there is at least one error case that one should be able
to recover from. The attach can fail if a firmware driven PCI device
recovery is in progress. Now if that happens during a switch between
domains I think one will have to do the equivalent of

echo 0 > /sys/bus/pci/slots/<dev>/power; echo 1 > /.../power

That of course tears down the whole PCI device so we don't have to
answer the question if the old or new domain is the active one.

So I think in consequence you're still right: attempting to re-attach
is a lot of hassle for little or no gain.

>
> So make it simple, leave it DMA blocked and throw a WARN_ON..

To me a WARN_ON() isn't appropriate here, as stated above there is at
least one error case that is recoverable and doesn't necessarily
indicate a programming bug. Also, while not recoverable, the device
having been surprise unplugged is another case where the attach failing
is not necessarily a bug. A WARN_ON() would thus cause false panics for
e.g. CI systems with PANIC_ON_WARN.

That said yes I think leaving DMA blocked and the device detached from
any domain is reasonable. That way the above recover scenario will work
and the device is in a defined and isolated state. Maybe in the future
we could even make this explicit and attach to the blocking domain on
failed attach, does that make sense?

2022-09-27 18:25:41

by Jason Gunthorpe

Subject: Re: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

On Tue, Sep 27, 2022 at 06:33:48PM +0200, Niklas Schnelle wrote:

> Not sure what the non-MSI reservation is for? It does seem like x86_64
> also uses this for quite large ranges.

There are lots of things that are unsuitable for DMA on x86 platforms,
unfortunately.. But yeah, I'm not sure either.

> This is because I'm getting a map request for an IOVA in the reserved
> region.

How come? iova_reserve_iommu_regions() reads the reserved regions and
loads them as reserved into the iovad which should cause
iommu_dma_alloc_iova() and alloc_iova_fast() to not return values in
those ranges.

It all looks like it is supposed to work

Did something go wrong in the initialization order perhaps?

Jason

2022-09-28 09:27:35

by Niklas Schnelle

Subject: Re: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

On Tue, 2022-09-27 at 13:56 -0300, Jason Gunthorpe wrote:
> On Tue, Sep 27, 2022 at 06:33:48PM +0200, Niklas Schnelle wrote:
>
> > Not sure what the non-MSI reservation is for? It does seem like x86_64
> > also uses this for quite large ranges.
>
> There are lots of things that are unsuitable for DMA on x86 platforms,
> unfortunately.. But yeah, I'm not sure either.
>
> > This is because I'm getting a map request for an IOVA in the reserved
> > region.
>
> How come? iova_reserve_iommu_regions() reads the reserved regions and
> loads them as reserved into the iovad which should cause
> iommu_dma_alloc_iova() and alloc_iova_fast() to not return values in
> those ranges.
>
> It all looks like it is supposed to work
>
> Did something go wrong in the initialization order perhaps?
>
> Jason

It was of course a classic off-by-one: the table size is a number of
entries, but geometry.aperture_end seems to be the largest allowed IOVA.
So we need:

s390_domain->domain.geometry.force_aperture = true;
s390_domain->domain.geometry.aperture_start = 0;
s390_domain->domain.geometry.aperture_end = ZPCI_TABLE_SIZE_RT - 1;

Otherwise the first IOVA allocated is ZPCI_TABLE_SIZE_RT itself.
Similarly we need the second reserved region if (zdev->end_dma <
ZPCI_TABLE_SIZE_RT - 1). In your patch I think you had the
MAX_DMA_TABLE_ADDR name right, but the value calculated is actually the
number of entries.

On the other hand with the dma-iommu.c conversion it no longer makes
sense to lower zdev->end_dma artificially, so at least on current
machine LPARs we would end up with just a lower reserved region
0x0000000000000000 to 0x00000000ffffffff and can use IOVAs up to
aperture_end.
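
With the off-by-one folded in, the second reserved region would then
look something like this (sketch):

if (zdev->end_dma < ZPCI_TABLE_SIZE_RT - 1) {
	region = iommu_alloc_resv_region(zdev->end_dma + 1,
			ZPCI_TABLE_SIZE_RT - zdev->end_dma - 1,
			0, IOMMU_RESV_RESERVED);
	if (!region)
		return;
	list_add_tail(&region->list, list);
}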

2022-09-28 13:36:40

by Jason Gunthorpe

Subject: Re: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

On Wed, Sep 28, 2022 at 10:58:22AM +0200, Niklas Schnelle wrote:
> On Tue, 2022-09-27 at 13:56 -0300, Jason Gunthorpe wrote:
> > On Tue, Sep 27, 2022 at 06:33:48PM +0200, Niklas Schnelle wrote:
> >
> > > Not sure what the non-MSI reservation is for? It does seem like x86_64
> > > also uses this for quite large ranges.
> >
> > There are lots of things that are unsuitable for DMA on x86 platforms,
> > unfortunately.. But yeah, I'm not sure either.
> >
> > > This is because I'm getting a map request for an IOVA in the reserved
> > > region.
> >
> > How come? iova_reserve_iommu_regions() reads the reserved regions and
> > loads them as reserved into the iovad which should cause
> > iommu_dma_alloc_iova() and alloc_iova_fast() to not return values in
> > those ranges.
> >
> > It all looks like it is supposed to work
> >
> > Did something go wrong in the initialization order perhaps?
> >
> > Jason
>
> It was of course a classic off-by-one, the table size is a number of
> entries but geometry.aperture_end seems to be the largest allowed IOVA.
> So we need:

Right, I dislike this naming; usually 'end' means "start + length" and
'last' means "start + length - 1"

> Otherwise the first IOVA allocated is ZPCI_TABLE_SIZE_RT itself.
> Similarly we need the second reserved region if (zdev->end_dma <
> ZPCI_TABLE_SIZE_RT - 1). In your patch I think you had the
> MAX_DMA_TABLE_ADDR name right but would have also calculated the number
> of entries.

Make sense..

> On the other hand with the dma-iommu.c conversion it no longer makes
> sense to lower zdev->end_dma artificially, so at least on current
> machine LPARs we would end up with just a lower reserved region
> 0x0000000000000000 to 0x00000000ffffffff and can use IOVAs up to
> aperture_end.

So zdev->end_dma == MAX_DMA_TABLE_ADDR?

(and is zdev->end_dma and 'end' or 'last' ?)

Can you include this patch once you are happy with it? It nicely
tidies this series.

Thanks,
Jason

2022-09-29 08:22:37

by Niklas Schnelle

Subject: Re: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

On Wed, 2022-09-28 at 10:32 -0300, Jason Gunthorpe wrote:
> On Wed, Sep 28, 2022 at 10:58:22AM +0200, Niklas Schnelle wrote:
> > On Tue, 2022-09-27 at 13:56 -0300, Jason Gunthorpe wrote:
> > > On Tue, Sep 27, 2022 at 06:33:48PM +0200, Niklas Schnelle wrote:
> > >
> > > > Not sure what the non-MSI reservation is for? It does seem like x86_64
> > > > also uses this for quite large ranges.
> > >
> > > There are lots of things that are unsuitable for DMA on x86 platforms,
> > > unfortunately.. But yeah, I'm not sure either.
> > >
> > > > This is because I'm getting a map request for an IOVA in the reserved
> > > > region.
> > >
> > > How come? iova_reserve_iommu_regions() reads the reserved regions and
> > > loads them as reserved into the iovad which should cause
> > > iommu_dma_alloc_iova() and alloc_iova_fast() to not return values in
> > > those ranges.
> > >
> > > It all looks like it is supposed to work
> > >
> > > Did something go wrong in the initialization order perhaps?
> > >
> > > Jason
> >
> > It was of course a classic off-by-one, the table size is a number of
> > entries but geometry.aperture_end seems to be the largest allowed IOVA.
> > So we need:
>
> Right, I dislike this naming usually 'end' means "start + length" and
> 'last' means "start + length - 1"
>
> > Otherwise the first IOVA allocated is ZPCI_TABLE_SIZE_RT itself.
> > Similarly we need the second reserved region if (zdev->end_dma <
> > ZPCI_TABLE_SIZE_RT - 1). In your patch I think you had the
> > MAX_DMA_TABLE_ADDR name right but would have also calculated the number
> > of entries.
>
> Make sense..
>
> > On the other hand with the dma-iommu.c conversion it no longer makes
> > sense to lower zdev->end_dma artificially, so at least on current
> > machine LPARs we would end up with just a lower reserved region
> > 0x0000000000000000 to 0x00000000ffffffff and can use IOVAs up to
> > aperture_end.
>
> So zdev->end_dma == MAX_DMA_TABLE_ADDR?
>
> (and is zdev->end_dma and 'end' or 'last' ?)

Basically yes, though at least on LPARs the firmware interface that
gives us the initial zdev->end returns an even higher value, but we
clamp it down to the aperture. It is "start + length - 1".

>
> Can you include this patch once you are happy with it? It nicely
> tidies this series.
>
> Thanks,
> Jason

Yes will do. In the meantime I'm now close to sending an RFC version of
the conversion to dma-iommu. So my plan is to send out 3 series of
patches.

1. v3 of this series of IOMMU fixes including your suggestion to use
reserved ranges, the previously mentioned off-by-one fix and another
IOMMU issue I found (pgsize_bitmap is wrong).

2. A series of improvements to the s390 IOMMU code including
implementing map_pages() and lock-free page table updates

3. A series converting s390 to use dma-iommu plus changes against
dma-iommu.c common code to implement an alternative flushing scheme
that brings z/VM and KVM guest PCI performance back to the level of our
existing DMA API implementation.