Hi All,
This is a follow up to Matt's recent series[0] where he tackled a race that
turned out to be outside of the s390 IOMMU driver itself as well as duplicate
device attachments. After an internal discussion we came up with what I believe
is a cleaner fix. Instead of actively checking for duplicates we instead detach
from any previous domain on attach. From my cursory reading of the code this
seems to be what the Intel IOMMU driver is doing as well.
During development of this fix we realized that we can get rid of struct
s390_domain_device entirely if we instead thread the list through the attached
struct zpci_devs. This saves us from having to allocate during attach and gets
rid of one level of indirection during IOMMU operations. Coincidentally
I discovered that a previous list_head in struct zpci_dev is unused so this is
removed and then replaced.
The duplicate entry fix is the first patch of this series and the only one
which carries a Fixes tag. It may be applied alone or together with patches
2 and 3 which are followup clean ups.
Best regards,
Niklas
[0] https://lore.kernel.org/linux-iommu/[email protected]/
Niklas Schnelle (3):
iommu/s390: Fix duplicate domain attachments
s390/pci: remove unused bus_next field from struct zpci_dev
iommu/s390: Get rid of s390_domain_device
arch/s390/include/asm/pci.h | 2 +-
drivers/iommu/s390-iommu.c | 110 +++++++++++++++++++-----------------
2 files changed, 59 insertions(+), 53 deletions(-)
--
2.34.1
When an s390_domain is freed without previously detaching all devices
the corresponding s390_domain_device is leaked. Instead of fixing this
leak by freeing the s390_domain_devices in s390_domain_free() do one
better and just get rid of s390_domain_device entirely by threading the
domain's device list through struct zpci_dev. This also gets rid of
a level of indirection during operations but also the allocation of
the s390_domain_device during attach thus making it more reliable under
memory pressure. Even though this naturally fixes the leak of
s390_domain_device let's still invalidate the list heads of formally
attached struct zpci_devs on s390_domain_free() to aid in debug.
Signed-off-by: Niklas Schnelle <[email protected]>
---
arch/s390/include/asm/pci.h | 1 +
drivers/iommu/s390-iommu.c | 42 ++++++++++++++++---------------------
2 files changed, 19 insertions(+), 24 deletions(-)
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 108e732d7b14..15f8714ca9b7 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -117,6 +117,7 @@ struct zpci_bus {
struct zpci_dev {
struct zpci_bus *zbus;
struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */
+ struct list_head iommu_list;
struct kref kref;
struct hotplug_slot hotplug_slot;
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index de8f76775240..2fec198823a8 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -29,11 +29,6 @@ struct s390_domain {
spinlock_t list_lock;
};
-struct s390_domain_device {
- struct list_head list;
- struct zpci_dev *zdev;
-};
-
static struct s390_domain *to_s390_domain(struct iommu_domain *dom)
{
return container_of(dom, struct s390_domain, domain);
@@ -78,7 +73,15 @@ static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
static void s390_domain_free(struct iommu_domain *domain)
{
struct s390_domain *s390_domain = to_s390_domain(domain);
+ struct zpci_dev *zdev, *tmp;
+ unsigned long flags;
+ spin_lock_irqsave(&s390_domain->list_lock, flags);
+ list_for_each_entry_safe(zdev, tmp,
+ &s390_domain->devices, iommu_list) {
+ list_del(&zdev->iommu_list);
+ }
+ spin_unlock_irqrestore(&s390_domain->list_lock, flags);
dma_cleanup_tables(s390_domain->dma_table);
kfree(s390_domain);
}
@@ -86,16 +89,15 @@ static void s390_domain_free(struct iommu_domain *domain)
static bool __s390_iommu_detach_device(struct s390_domain *s390_domain,
struct zpci_dev *zdev)
{
- struct s390_domain_device *domain_device, *tmp;
+ struct zpci_dev *zdev_iter, *tmp;
unsigned long flags;
bool found = false;
spin_lock_irqsave(&s390_domain->list_lock, flags);
- list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices,
- list) {
- if (domain_device->zdev == zdev) {
- list_del(&domain_device->list);
- kfree(domain_device);
+ list_for_each_entry_safe(zdev_iter, tmp, &s390_domain->devices,
+ iommu_list) {
+ if (zdev_iter == zdev) {
+ list_del(&zdev->iommu_list);
found = true;
break;
}
@@ -114,7 +116,6 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
{
struct s390_domain *s390_domain = to_s390_domain(domain);
struct zpci_dev *zdev = to_zpci_dev(dev);
- struct s390_domain_device *domain_device;
struct s390_domain *prev_domain = NULL;
unsigned long flags;
int cc, rc = 0;
@@ -122,10 +123,6 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
if (!zdev)
return -ENODEV;
- domain_device = kzalloc(sizeof(*domain_device), GFP_KERNEL);
- if (!domain_device)
- return -ENOMEM;
-
if (zdev->s390_domain) {
prev_domain = zdev->s390_domain;
if (!__s390_iommu_detach_device(zdev->s390_domain, zdev))
@@ -135,7 +132,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
rc = -EIO;
}
if (rc)
- goto out_free;
+ return rc;
zdev->dma_table = s390_domain->dma_table;
cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
@@ -158,9 +155,8 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
spin_unlock_irqrestore(&s390_domain->list_lock, flags);
goto out_unregister_restore;
}
- domain_device->zdev = zdev;
zdev->s390_domain = s390_domain;
- list_add(&domain_device->list, &s390_domain->devices);
+ list_add(&zdev->iommu_list, &s390_domain->devices);
spin_unlock_irqrestore(&s390_domain->list_lock, flags);
return 0;
@@ -174,8 +170,6 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
dev);
else
zpci_dma_init_device(zdev);
-out_free:
- kfree(domain_device);
return rc;
}
@@ -229,10 +223,10 @@ static int s390_iommu_update_trans(struct s390_domain *s390_domain,
phys_addr_t pa, dma_addr_t dma_addr,
size_t size, int flags)
{
- struct s390_domain_device *domain_device;
phys_addr_t page_addr = pa & PAGE_MASK;
dma_addr_t start_dma_addr = dma_addr;
unsigned long irq_flags, nr_pages, i;
+ struct zpci_dev *zdev;
unsigned long *entry;
int rc = 0;
@@ -257,8 +251,8 @@ static int s390_iommu_update_trans(struct s390_domain *s390_domain,
}
spin_lock(&s390_domain->list_lock);
- list_for_each_entry(domain_device, &s390_domain->devices, list) {
- rc = zpci_refresh_trans((u64) domain_device->zdev->fh << 32,
+ list_for_each_entry(zdev, &s390_domain->devices, iommu_list) {
+ rc = zpci_refresh_trans((u64)zdev->fh << 32,
start_dma_addr, nr_pages * PAGE_SIZE);
if (rc)
break;
--
2.34.1
This field was added in commit 44510d6fa0c0 ("s390/pci: Handling
multifunctions") but is an unused remnant of an earlier version where
the the devices on the virtual bus were connected in a linked list
instead of a fixed 256 entry array of pointers.
It is also not used for the list of busses as that is threaded through
struct zpci_bus not through struct zpci_dev.
Signed-off-by: Niklas Schnelle <[email protected]>
---
arch/s390/include/asm/pci.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 7b4cdadbc023..108e732d7b14 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -117,7 +117,6 @@ struct zpci_bus {
struct zpci_dev {
struct zpci_bus *zbus;
struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */
- struct list_head bus_next;
struct kref kref;
struct hotplug_slot hotplug_slot;
--
2.34.1
Since commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
calls") we can end up with duplicates in the list of devices attached to
a domain. This is inefficient and confusing since only one domain can
actually be in control of the IOMMU translations for a device. Fix this
by detaching the device from the previous domain, if any, on attach.
This also makes the restore behavior analogous between IOMMU and DMA API
control.
Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls")
Signed-off-by: Niklas Schnelle <[email protected]>
---
drivers/iommu/s390-iommu.c | 82 ++++++++++++++++++++++----------------
1 file changed, 47 insertions(+), 35 deletions(-)
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index c898bcbbce11..de8f76775240 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -83,14 +83,41 @@ static void s390_domain_free(struct iommu_domain *domain)
kfree(s390_domain);
}
+static bool __s390_iommu_detach_device(struct s390_domain *s390_domain,
+ struct zpci_dev *zdev)
+{
+ struct s390_domain_device *domain_device, *tmp;
+ unsigned long flags;
+ bool found = false;
+
+ spin_lock_irqsave(&s390_domain->list_lock, flags);
+ list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices,
+ list) {
+ if (domain_device->zdev == zdev) {
+ list_del(&domain_device->list);
+ kfree(domain_device);
+ found = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&s390_domain->list_lock, flags);
+
+ if (found) {
+ zdev->s390_domain = NULL;
+ zpci_unregister_ioat(zdev, 0);
+ }
+ return found;
+}
+
static int s390_iommu_attach_device(struct iommu_domain *domain,
struct device *dev)
{
struct s390_domain *s390_domain = to_s390_domain(domain);
struct zpci_dev *zdev = to_zpci_dev(dev);
struct s390_domain_device *domain_device;
+ struct s390_domain *prev_domain = NULL;
unsigned long flags;
- int cc, rc;
+ int cc, rc = 0;
if (!zdev)
return -ENODEV;
@@ -99,16 +126,16 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
if (!domain_device)
return -ENOMEM;
- if (zdev->dma_table && !zdev->s390_domain) {
- cc = zpci_dma_exit_device(zdev);
- if (cc) {
+ if (zdev->s390_domain) {
+ prev_domain = zdev->s390_domain;
+ if (!__s390_iommu_detach_device(zdev->s390_domain, zdev))
+ rc = -EIO;
+ } else if (zdev->dma_table) {
+ if (zpci_dma_exit_device(zdev))
rc = -EIO;
- goto out_free;
- }
}
-
- if (zdev->s390_domain)
- zpci_unregister_ioat(zdev, 0);
+ if (rc)
+ goto out_free;
zdev->dma_table = s390_domain->dma_table;
cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
@@ -129,7 +156,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
domain->geometry.aperture_end != zdev->end_dma) {
rc = -EINVAL;
spin_unlock_irqrestore(&s390_domain->list_lock, flags);
- goto out_restore;
+ goto out_unregister_restore;
}
domain_device->zdev = zdev;
zdev->s390_domain = s390_domain;
@@ -138,14 +165,15 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
return 0;
+out_unregister_restore:
+ zpci_unregister_ioat(zdev, 0);
out_restore:
- if (!zdev->s390_domain) {
+ zdev->dma_table = NULL;
+ if (prev_domain)
+ s390_iommu_attach_device(&prev_domain->domain,
+ dev);
+ else
zpci_dma_init_device(zdev);
- } else {
- zdev->dma_table = zdev->s390_domain->dma_table;
- zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
- virt_to_phys(zdev->dma_table));
- }
out_free:
kfree(domain_device);
@@ -157,30 +185,14 @@ static void s390_iommu_detach_device(struct iommu_domain *domain,
{
struct s390_domain *s390_domain = to_s390_domain(domain);
struct zpci_dev *zdev = to_zpci_dev(dev);
- struct s390_domain_device *domain_device, *tmp;
- unsigned long flags;
- int found = 0;
+ bool detached;
if (!zdev)
return;
- spin_lock_irqsave(&s390_domain->list_lock, flags);
- list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices,
- list) {
- if (domain_device->zdev == zdev) {
- list_del(&domain_device->list);
- kfree(domain_device);
- found = 1;
- break;
- }
- }
- spin_unlock_irqrestore(&s390_domain->list_lock, flags);
-
- if (found && (zdev->s390_domain == s390_domain)) {
- zdev->s390_domain = NULL;
- zpci_unregister_ioat(zdev, 0);
+ detached = __s390_iommu_detach_device(s390_domain, zdev);
+ if (detached)
zpci_dma_init_device(zdev);
- }
}
static struct iommu_device *s390_iommu_probe_device(struct device *dev)
--
2.34.1
On Thu, Sep 15, 2022 at 05:14:00PM +0200, Niklas Schnelle wrote:
> Since commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
> calls") we can end up with duplicates in the list of devices attached to
> a domain. This is inefficient and confusing since only one domain can
> actually be in control of the IOMMU translations for a device. Fix this
> by detaching the device from the previous domain, if any, on attach.
> This also makes the restore behavior analogous between IOMMU and DMA API
> control.
>
> Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls")
> Signed-off-by: Niklas Schnelle <[email protected]>
> ---
> drivers/iommu/s390-iommu.c | 82 ++++++++++++++++++++++----------------
> 1 file changed, 47 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
> index c898bcbbce11..de8f76775240 100644
> --- a/drivers/iommu/s390-iommu.c
> +++ b/drivers/iommu/s390-iommu.c
> @@ -83,14 +83,41 @@ static void s390_domain_free(struct iommu_domain *domain)
> kfree(s390_domain);
> }
>
> +static bool __s390_iommu_detach_device(struct s390_domain *s390_domain,
> + struct zpci_dev *zdev)
> +{
> + struct s390_domain_device *domain_device, *tmp;
> + unsigned long flags;
> + bool found = false;
> +
> + spin_lock_irqsave(&s390_domain->list_lock, flags);
> + list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices,
> + list) {
> + if (domain_device->zdev == zdev) {
Why all this searching? The domain argument is only being provided to
help drivers find their data structures, in most cases I would expect
it to be mostly unused.
After patch 3 the struct is gone, so isn't this just
spin_lock_irqsave(&s390_domain->list_lock, flags);
list_del_init(&zdev->iommu_list)
spin_unlock_irqsave(&s390_domain->list_lock, flags);
?
Jason
On Tue, 2022-09-20 at 11:21 -0300, Jason Gunthorpe wrote:
> On Thu, Sep 15, 2022 at 05:14:00PM +0200, Niklas Schnelle wrote:
> > Since commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
> > calls") we can end up with duplicates in the list of devices attached to
> > a domain. This is inefficient and confusing since only one domain can
> > actually be in control of the IOMMU translations for a device. Fix this
> > by detaching the device from the previous domain, if any, on attach.
> > This also makes the restore behavior analogous between IOMMU and DMA API
> > control.
> >
> > Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls")
> > Signed-off-by: Niklas Schnelle <[email protected]>
> > ---
> > drivers/iommu/s390-iommu.c | 82 ++++++++++++++++++++++----------------
> > 1 file changed, 47 insertions(+), 35 deletions(-)
> >
> > diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
> > index c898bcbbce11..de8f76775240 100644
> > --- a/drivers/iommu/s390-iommu.c
> > +++ b/drivers/iommu/s390-iommu.c
> > @@ -83,14 +83,41 @@ static void s390_domain_free(struct iommu_domain *domain)
> > kfree(s390_domain);
> > }
> >
> > +static bool __s390_iommu_detach_device(struct s390_domain *s390_domain,
> > + struct zpci_dev *zdev)
> > +{
> > + struct s390_domain_device *domain_device, *tmp;
> > + unsigned long flags;
> > + bool found = false;
> > +
> > + spin_lock_irqsave(&s390_domain->list_lock, flags);
> > + list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices,
> > + list) {
> > + if (domain_device->zdev == zdev) {
>
> Why all this searching? The domain argument is only being provided to
> help drivers find their data structures, in most cases I would expect
> it to be mostly unused.
Before patch 3 we have no other way besides searching in the list to
get from the struct device to the struct s390_domain_device that we
need to kfree(). But yeah as shown by patch 3 this whole
s390_domain_device thing is not needed anyway.
>
> After patch 3 the struct is gone, so isn't this just
>
> spin_lock_irqsave(&s390_domain->list_lock, flags);
> list_del_init(&zdev->iommu_list)
> spin_unlock_irqsave(&s390_domain->list_lock, flags);
>
> ?
Yes with patch 3 I think you're right, the above should be enough to
get it removed from the list and there really shouldn't be a call to
detach from a domain if it wasn't attached to it, right? Just to be
safe we could also do nothing if (zdev->s390_domain != s390_domain) or
maybe better just BUG_ON()?
One thing that is still a bit of a mismatch is that architecturally
zpci_unregister_ioat() can fail but detach returns void. Now, one
reason for that is a hot unplug of the device has occurred in the
meantime in which case the device doesn't use the DMA translations
anymore anyway.
If anything else happens I don't know what we should do,
zpci_dma_exit_device() in our DMA API implementation in these cases
leaks the DMA tables such that any hardware still accessing them would
get valid tables but I don't think I've ever seen this occurring.