On s390 PCI functions may be hotplugged individually even when they
belong to a multi-function device. In particular on an SR-IOV device VFs
may be removed and later re-added.
In commit a50297cf8235 ("s390/pci: separate zbus creation from
scanning") it was missed however that struct pci_bus and struct
zpci_bus's resource list retained a reference to the PCI functions MMIO
resources even though those resources are released and freed on
hot-unplug. These stale resources may subsequently be claimed when the
PCI function re-appears resulting in use-after-free.
One idea of fixing this use-after-free in s390 specific code that was
investigated was to simply keep resources around from the moment a PCI
function first appeared until the whole virtual PCI bus created for
a multi-function device disappears. The problem with this however is
that due to the requirement of artificial MMIO addreesses (address
cookies) we will then need extra logic and tracking in struct zpci_bus
to keep these compatible for re-use. At the same time the MMIO resources
semantically belong to the PCI function so tying their lifecycle to the
function seems more logical.
Instead a simpler approach is to remove the resources of an individually
hot-unplugged PCI function from the PCI bus's resource list while
keeping the resources of other PCI functions on the PCI bus untouched.
This is done by introducing pci_bus_remove_resource() to remove an
individual resource. Similarly the resource also needs to be removed
from the struct zpci_bus's resource list. It turns out however, that
there is really no need to add the MMIO resources at all and instead we
can simply use the zpci_bar_struct's resource pointer directly.
Fixes: a50297cf8235 ("s390/pci: separate zbus creation from scanning")
Signed-off-by: Niklas Schnelle <[email protected]>
---
arch/s390/pci/pci.c | 16 ++++++++++------
arch/s390/pci/pci_bus.c | 12 +++++-------
arch/s390/pci/pci_bus.h | 3 +--
drivers/pci/bus.c | 23 +++++++++++++++++++++++
include/linux/pci.h | 1 +
5 files changed, 40 insertions(+), 15 deletions(-)
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index ef38b1514c77..e16afacc8fd1 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -544,8 +544,7 @@ static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start,
return r;
}
-int zpci_setup_bus_resources(struct zpci_dev *zdev,
- struct list_head *resources)
+int zpci_setup_bus_resources(struct zpci_dev *zdev)
{
unsigned long addr, size, flags;
struct resource *res;
@@ -581,7 +580,6 @@ int zpci_setup_bus_resources(struct zpci_dev *zdev,
return -ENOMEM;
}
zdev->bars[i].res = res;
- pci_add_resource(resources, res);
}
zdev->has_resources = 1;
@@ -590,17 +588,23 @@ int zpci_setup_bus_resources(struct zpci_dev *zdev,
static void zpci_cleanup_bus_resources(struct zpci_dev *zdev)
{
+ struct resource *res;
int i;
+ pci_lock_rescan_remove();
for (i = 0; i < PCI_STD_NUM_BARS; i++) {
- if (!zdev->bars[i].size || !zdev->bars[i].res)
+ res = zdev->bars[i].res;
+ if (!res)
continue;
+ release_resource(res);
+ pci_bus_remove_resource(zdev->zbus->bus, res);
zpci_free_iomap(zdev, zdev->bars[i].map_idx);
- release_resource(zdev->bars[i].res);
- kfree(zdev->bars[i].res);
+ zdev->bars[i].res = NULL;
+ kfree(res);
}
zdev->has_resources = 0;
+ pci_unlock_rescan_remove();
}
int pcibios_device_add(struct pci_dev *pdev)
diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
index 6a8da1b742ae..a99926af2b69 100644
--- a/arch/s390/pci/pci_bus.c
+++ b/arch/s390/pci/pci_bus.c
@@ -41,9 +41,7 @@ static int zpci_nb_devices;
*/
static int zpci_bus_prepare_device(struct zpci_dev *zdev)
{
- struct resource_entry *window, *n;
- struct resource *res;
- int rc;
+ int rc, i;
if (!zdev_enabled(zdev)) {
rc = zpci_enable_device(zdev);
@@ -57,10 +55,10 @@ static int zpci_bus_prepare_device(struct zpci_dev *zdev)
}
if (!zdev->has_resources) {
- zpci_setup_bus_resources(zdev, &zdev->zbus->resources);
- resource_list_for_each_entry_safe(window, n, &zdev->zbus->resources) {
- res = window->res;
- pci_bus_add_resource(zdev->zbus->bus, res, 0);
+ zpci_setup_bus_resources(zdev);
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+ if (zdev->bars[i].res)
+ pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res, 0);
}
}
diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h
index e96c9860e064..af9f0ac79a1b 100644
--- a/arch/s390/pci/pci_bus.h
+++ b/arch/s390/pci/pci_bus.h
@@ -30,8 +30,7 @@ static inline void zpci_zdev_get(struct zpci_dev *zdev)
int zpci_alloc_domain(int domain);
void zpci_free_domain(int domain);
-int zpci_setup_bus_resources(struct zpci_dev *zdev,
- struct list_head *resources);
+int zpci_setup_bus_resources(struct zpci_dev *zdev);
static inline struct zpci_dev *zdev_from_bus(struct pci_bus *bus,
unsigned int devfn)
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 83ae838ceb5f..f021f1d4af9f 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -76,6 +76,29 @@ struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n)
}
EXPORT_SYMBOL_GPL(pci_bus_resource_n);
+void pci_bus_remove_resource(struct pci_bus *bus, struct resource *res)
+{
+ struct pci_bus_resource *bus_res, *tmp;
+ int i;
+
+ for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
+ if (bus->resource[i] == res) {
+ bus->resource[i] = NULL;
+ return;
+ }
+ }
+
+ list_for_each_entry_safe(bus_res, tmp, &bus->resources, list) {
+ if (bus_res->res == res) {
+ list_del(&bus_res->list);
+ kfree(bus_res);
+ return;
+ }
+ }
+ return;
+
+}
+
void pci_bus_remove_resources(struct pci_bus *bus)
{
int i;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index adffd65e84b4..3b1974e2ec73 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1436,6 +1436,7 @@ void pci_bus_add_resource(struct pci_bus *bus, struct resource *res,
unsigned int flags);
struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n);
void pci_bus_remove_resources(struct pci_bus *bus);
+void pci_bus_remove_resource(struct pci_bus *bus, struct resource *res);
int devm_request_pci_bus_resources(struct device *dev,
struct list_head *resources);
--
2.37.2
On Tue, Feb 14, 2023 at 10:49:10AM +0100, Niklas Schnelle wrote:
> On s390 PCI functions may be hotplugged individually even when they
> belong to a multi-function device. In particular on an SR-IOV device VFs
> may be removed and later re-added.
Is there something special about the SR-IOV/VF case that relates to
this problem? If not, it might be unnecessary distraction to mention
it.
> In commit a50297cf8235 ("s390/pci: separate zbus creation from
> scanning") it was missed however that struct pci_bus and struct
> zpci_bus's resource list retained a reference to the PCI functions MMIO
> resources even though those resources are released and freed on
> hot-unplug. These stale resources may subsequently be claimed when the
> PCI function re-appears resulting in use-after-free.
Lifetimes of all these resources definitely aren't obvious to me.
So I guess the critical thing here is the new
pci_bus_remove_resource() in zpci_cleanup_bus_resources(), which
removes (and kfrees when necessary) the resource from
pci_bus->resources.
I'm not clear on where the zpci_bus resource list comes in. I guess
we kalloc resources in zpci_setup_bus_resources(), and the current
code adds them to zpci_bus->resources and copies them onto the pci_bus
list.
The new code does not add them to zpci_bus->resources at all, and only
adds them to the pci_bus resource list. Right? I guess maybe that's
what the "no need to add the MMIO resources at all" below refers to?
> One idea of fixing this use-after-free in s390 specific code that was
> investigated was to simply keep resources around from the moment a PCI
> function first appeared until the whole virtual PCI bus created for
> a multi-function device disappears. The problem with this however is
> that due to the requirement of artificial MMIO addreesses (address
> cookies) we will then need extra logic and tracking in struct zpci_bus
> to keep these compatible for re-use. At the same time the MMIO resources
> semantically belong to the PCI function so tying their lifecycle to the
> function seems more logical.
>
> Instead a simpler approach is to remove the resources of an individually
> hot-unplugged PCI function from the PCI bus's resource list while
> keeping the resources of other PCI functions on the PCI bus untouched.
Do we currently never kfree the pci_bus resource list until we free
the whole pci_bus via release_pcibus_dev()? Does a remove + add just
allocate more resources that are probably duplicates of what the
pci_bus already had?
> This is done by introducing pci_bus_remove_resource() to remove an
> individual resource. Similarly the resource also needs to be removed
> from the struct zpci_bus's resource list. It turns out however, that
> there is really no need to add the MMIO resources at all and instead we
> can simply use the zpci_bar_struct's resource pointer directly.
>
> Fixes: a50297cf8235 ("s390/pci: separate zbus creation from scanning")
> Signed-off-by: Niklas Schnelle <[email protected]>
Other random questions unrelated to this patch:
- zpci_bus_create_pci_bus() calls pci_bus_add_devices(). Isn't that
pointless? AFAICT, the bus->devices list is empty then.
- What about zpci_bus_scan_device()? Why does it call both
pci_bus_add_device() and pci_bus_add_devices()? The latter will
just call the former, so it looks redundant. And the latter is
locked but not the former?
- Struct zpci_bus has a "resources" list. I guess this contains the
&zbus->bus_resource put there in zpci_bus_alloc(), plus an entry
for every BAR of every device on the bus (I guess you'd never see
an actual PCI-to-PCI bridge on s390?), kalloc'ed in
zpci_setup_bus_resources()?
What happens when zpci_bus_release() calls
pci_free_resource_list() on &zbus->resources? It looks like that
ultimately calls kfree(), which is OK for the
zpci_setup_bus_resources() stuff, but what about the
zbus->bus_resource that was not kalloc'ed?
> ---
> arch/s390/pci/pci.c | 16 ++++++++++------
> arch/s390/pci/pci_bus.c | 12 +++++-------
> arch/s390/pci/pci_bus.h | 3 +--
> drivers/pci/bus.c | 23 +++++++++++++++++++++++
> include/linux/pci.h | 1 +
> 5 files changed, 40 insertions(+), 15 deletions(-)
>
> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
> index ef38b1514c77..e16afacc8fd1 100644
> --- a/arch/s390/pci/pci.c
> +++ b/arch/s390/pci/pci.c
> @@ -544,8 +544,7 @@ static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start,
> return r;
> }
>
> -int zpci_setup_bus_resources(struct zpci_dev *zdev,
> - struct list_head *resources)
> +int zpci_setup_bus_resources(struct zpci_dev *zdev)
> {
> unsigned long addr, size, flags;
> struct resource *res;
> @@ -581,7 +580,6 @@ int zpci_setup_bus_resources(struct zpci_dev *zdev,
> return -ENOMEM;
> }
> zdev->bars[i].res = res;
> - pci_add_resource(resources, res);
> }
> zdev->has_resources = 1;
>
> @@ -590,17 +588,23 @@ int zpci_setup_bus_resources(struct zpci_dev *zdev,
>
> static void zpci_cleanup_bus_resources(struct zpci_dev *zdev)
> {
> + struct resource *res;
> int i;
>
> + pci_lock_rescan_remove();
What exactly is this protecting? This doesn't seem like quite the
right place since we're not adding/removing a pci_dev here. Is this
to protect the bus->resources list in pci_bus_remove_resource()?
> for (i = 0; i < PCI_STD_NUM_BARS; i++) {
> - if (!zdev->bars[i].size || !zdev->bars[i].res)
> + res = zdev->bars[i].res;
> + if (!res)
> continue;
>
> + release_resource(res);
> + pci_bus_remove_resource(zdev->zbus->bus, res);
> zpci_free_iomap(zdev, zdev->bars[i].map_idx);
> - release_resource(zdev->bars[i].res);
> - kfree(zdev->bars[i].res);
> + zdev->bars[i].res = NULL;
> + kfree(res);
> }
> zdev->has_resources = 0;
> + pci_unlock_rescan_remove();
> }
>
> int pcibios_device_add(struct pci_dev *pdev)
> diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
> index 6a8da1b742ae..a99926af2b69 100644
> --- a/arch/s390/pci/pci_bus.c
> +++ b/arch/s390/pci/pci_bus.c
> @@ -41,9 +41,7 @@ static int zpci_nb_devices;
> */
> static int zpci_bus_prepare_device(struct zpci_dev *zdev)
> {
> - struct resource_entry *window, *n;
> - struct resource *res;
> - int rc;
> + int rc, i;
>
> if (!zdev_enabled(zdev)) {
> rc = zpci_enable_device(zdev);
> @@ -57,10 +55,10 @@ static int zpci_bus_prepare_device(struct zpci_dev *zdev)
> }
>
> if (!zdev->has_resources) {
> - zpci_setup_bus_resources(zdev, &zdev->zbus->resources);
> - resource_list_for_each_entry_safe(window, n, &zdev->zbus->resources) {
> - res = window->res;
> - pci_bus_add_resource(zdev->zbus->bus, res, 0);
> + zpci_setup_bus_resources(zdev);
> + for (i = 0; i < PCI_STD_NUM_BARS; i++) {
> + if (zdev->bars[i].res)
> + pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res, 0);
> }
> }
>
> diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h
> index e96c9860e064..af9f0ac79a1b 100644
> --- a/arch/s390/pci/pci_bus.h
> +++ b/arch/s390/pci/pci_bus.h
> @@ -30,8 +30,7 @@ static inline void zpci_zdev_get(struct zpci_dev *zdev)
>
> int zpci_alloc_domain(int domain);
> void zpci_free_domain(int domain);
> -int zpci_setup_bus_resources(struct zpci_dev *zdev,
> - struct list_head *resources);
> +int zpci_setup_bus_resources(struct zpci_dev *zdev);
>
> static inline struct zpci_dev *zdev_from_bus(struct pci_bus *bus,
> unsigned int devfn)
> diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
> index 83ae838ceb5f..f021f1d4af9f 100644
> --- a/drivers/pci/bus.c
> +++ b/drivers/pci/bus.c
> @@ -76,6 +76,29 @@ struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n)
> }
> EXPORT_SYMBOL_GPL(pci_bus_resource_n);
>
> +void pci_bus_remove_resource(struct pci_bus *bus, struct resource *res)
> +{
> + struct pci_bus_resource *bus_res, *tmp;
> + int i;
> +
> + for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
> + if (bus->resource[i] == res) {
> + bus->resource[i] = NULL;
> + return;
> + }
> + }
> +
> + list_for_each_entry_safe(bus_res, tmp, &bus->resources, list) {
> + if (bus_res->res == res) {
> + list_del(&bus_res->list);
> + kfree(bus_res);
> + return;
> + }
> + }
> + return;
> +
Superfluous "return" and blank line.
> +}
> +
> void pci_bus_remove_resources(struct pci_bus *bus)
> {
> int i;
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index adffd65e84b4..3b1974e2ec73 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1436,6 +1436,7 @@ void pci_bus_add_resource(struct pci_bus *bus, struct resource *res,
> unsigned int flags);
> struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n);
> void pci_bus_remove_resources(struct pci_bus *bus);
> +void pci_bus_remove_resource(struct pci_bus *bus, struct resource *res);
> int devm_request_pci_bus_resources(struct device *dev,
> struct list_head *resources);
>
> --
> 2.37.2
>
On Fri, 2023-02-17 at 17:15 -0600, Bjorn Helgaas wrote:
> On Tue, Feb 14, 2023 at 10:49:10AM +0100, Niklas Schnelle wrote:
> > On s390 PCI functions may be hotplugged individually even when they
> > belong to a multi-function device. In particular on an SR-IOV device VFs
> > may be removed and later re-added.
>
> Is there something special about the SR-IOV/VF case that relates to
> this problem? If not, it might be unnecessary distraction to mention
> it.
It's not really special in that the problem could also be triggered by
other hotplug but it is the most common scenario where you'd run into
this problem. Some background. If you simply do a "echo 0 >
/sys/bus/pci/slots/<func>/power" and then power on again the PCI
resources are not removed because the function stays visible to the
system just deconfigured. To trigger removal you'd have to move a
single function to a different system in the machine hypervisor but
then you're less likely to bring it back again so the dangling resource
pointer won't cause problems. When using "../sriov_numvfs" to change
the number of VFs however they are temporarily removed and then re-
appear.
>
> > In commit a50297cf8235 ("s390/pci: separate zbus creation from
> > scanning") it was missed however that struct pci_bus and struct
> > zpci_bus's resource list retained a reference to the PCI functions MMIO
> > resources even though those resources are released and freed on
> > hot-unplug. These stale resources may subsequently be claimed when the
> > PCI function re-appears resulting in use-after-free.
>
> Lifetimes of all these resources definitely aren't obvious to me.
Yes the problem is that the old code did muddy the water here because
it didn't properly separate which resources are tied to a PCI function
and which to the bus as a whole. I tried to fix this in this patch.
But let me first explain lifetimes of the struct zpci_bus and struct
zpci_bus. As our basic architecture works with one struct zpci_dev per
function and they can be hot(un-)plugged in any order but we still want
to support exposing the topology within a multi-function PCI device the
struct zpci_bus representing the PCI bus on which the functions reside
is created when the first PCI function on that bus is discovered and it
exists until the last PCI function on that bus disappears.
>
> So I guess the critical thing here is the new
> pci_bus_remove_resource() in zpci_cleanup_bus_resources(), which
> removes (and kfrees when necessary) the resource from
> pci_bus->resources.
>
> I'm not clear on where the zpci_bus resource list comes in. I guess
> we kalloc resources in zpci_setup_bus_resources(), and the current
> code adds them to zpci_bus->resources and copies them onto the pci_bus
> list.
>
> The new code does not add them to zpci_bus->resources at all, and only
> adds them to the pci_bus resource list. Right? I guess maybe that's
> what the "no need to add the MMIO resources at all" below refers to?
Yes exactly. The problem is that I got confused with how to map our
model where the BAR resources are strictly tied to the PCI function to
the common API which more closely resembles real hardware and where the
BAR resources are tied to the PCI bus. Instead of making sure that the
per-function resources are added and removed together with the PCI
function matching when they are accessible in our architecture I added
them to the struct zpci_bus and didn't properly remove them neither
from sruct zpci_bus nor the common PCI bus though they were freed thus
creating the use-after free in two places at once. I think somehow I
had though that release_resource() somehow removed it from the resource
lists.
>
> > One idea of fixing this use-after-free in s390 specific code that was
> > investigated was to simply keep resources around from the moment a PCI
> > function first appeared until the whole virtual PCI bus created for
> > a multi-function device disappears. The problem with this however is
> > that due to the requirement of artificial MMIO addreesses (address
> > cookies) we will then need extra logic and tracking in struct zpci_bus
> > to keep these compatible for re-use. At the same time the MMIO resources
> > semantically belong to the PCI function so tying their lifecycle to the
> > function seems more logical.
> >
> > Instead a simpler approach is to remove the resources of an individually
> > hot-unplugged PCI function from the PCI bus's resource list while
> > keeping the resources of other PCI functions on the PCI bus untouched.
>
> Do we currently never kfree the pci_bus resource list until we free
> the whole pci_bus via release_pcibus_dev()? Does a remove + add just
> allocate more resources that are probably duplicates of what the
> pci_bus already had?
Yes the current code adds new resources on remove + add while leaving
dangling freed resources in the list creating kind of half duplicates.
It's only due to the order that we end up using the freed dangling
resources instead of the new ones.
>
> > This is done by introducing pci_bus_remove_resource() to remove an
> > individual resource. Similarly the resource also needs to be removed
> > from the struct zpci_bus's resource list. It turns out however, that
> > there is really no need to add the MMIO resources at all and instead we
> > can simply use the zpci_bar_struct's resource pointer directly.
> >
> > Fixes: a50297cf8235 ("s390/pci: separate zbus creation from scanning")
> > Signed-off-by: Niklas Schnelle <[email protected]>
>
> Other random questions unrelated to this patch:
>
> - zpci_bus_create_pci_bus() calls pci_bus_add_devices(). Isn't that
> pointless? AFAICT, the bus->devices list is empty then.
Yes I think you're right it does nothing and can be dropped.
>
> - What about zpci_bus_scan_device()? Why does it call both
> pci_bus_add_device() and pci_bus_add_devices()? The latter will
> just call the former, so it looks redundant. And the latter is
> locked but not the former?
Hmm. great find. This seems to have been weird and redundant since I
first used that pattern in 3047766bc6ec ("s390/pci: fix enabling a
reserved PCI function"). I think maybe then the reason for this was
that prior to 960ac3626487 ("s390/pci: allow zPCI zbus without a
function zero") when the newly enabled is devfn == 0 there could be
functions from the same bus which would not have been added yet. I'm
not sure though. That was definitely the idea behind the
zpci_bus_scan_bus() in zpci_scan_configured_devices() that is also
redundant now as we can now scan each function as it appears.
This will definitely need to be cleaned up.
>
> - Struct zpci_bus has a "resources" list. I guess this contains the
> &zbus->bus_resource put there in zpci_bus_alloc(), plus an entry
> for every BAR of every device on the bus (I guess you'd never see
> an actual PCI-to-PCI bridge on s390?), kalloc'ed in
> zpci_setup_bus_resources()?
Yes that was the situation before this patch. After this patch only the
zpci_bus.bus_resource is in this resources list. After this patch the
BAR resources are not on the list and only referred to via zdev-
>bars[i].res thus tying them to struct zpci_dev.
Currently Linux does not see PCI-to-PCI bridges on s390. We do have
PCIe switches in the hardware in the so called I/O drawers but they are
hidden from us by firmware. There have been some ideas about having
PCI-to-PCI bridges visible to Linux but nothing concrete at the moment.
>
> What happens when zpci_bus_release() calls
> pci_free_resource_list() on &zbus->resources? It looks like that
> ultimately calls kfree(), which is OK for the
> zpci_setup_bus_resources() stuff, but what about the
> zbus->bus_resource that was not kalloc'ed?
As far as I can see pci_free_resource_list() only calls kfree() on the
entry not on entry->res. The resources set up in
zpci_setup_bus_resources() are freed in zpci_cleanup_bus_resources()
explicitly.
>
> > ---
> > arch/s390/pci/pci.c | 16 ++++++++++------
> > arch/s390/pci/pci_bus.c | 12 +++++-------
> > arch/s390/pci/pci_bus.h | 3 +--
> > drivers/pci/bus.c | 23 +++++++++++++++++++++++
> > include/linux/pci.h | 1 +
> > 5 files changed, 40 insertions(+), 15 deletions(-)
> >
> > diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
> > index ef38b1514c77..e16afacc8fd1 100644
> > --- a/arch/s390/pci/pci.c
> > +++ b/arch/s390/pci/pci.c
> > @@ -544,8 +544,7 @@ static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start,
> > return r;
> > }
> >
> > -int zpci_setup_bus_resources(struct zpci_dev *zdev,
> > - struct list_head *resources)
> > +int zpci_setup_bus_resources(struct zpci_dev *zdev)
> > {
> > unsigned long addr, size, flags;
> > struct resource *res;
> > @@ -581,7 +580,6 @@ int zpci_setup_bus_resources(struct zpci_dev *zdev,
> > return -ENOMEM;
> > }
> > zdev->bars[i].res = res;
> > - pci_add_resource(resources, res);
> > }
> > zdev->has_resources = 1;
> >
> > @@ -590,17 +588,23 @@ int zpci_setup_bus_resources(struct zpci_dev *zdev,
> >
> > static void zpci_cleanup_bus_resources(struct zpci_dev *zdev)
> > {
> > + struct resource *res;
> > int i;
> >
> > + pci_lock_rescan_remove();
>
> What exactly is this protecting? This doesn't seem like quite the
> right place since we're not adding/removing a pci_dev here. Is this
> to protect the bus->resources list in pci_bus_remove_resource()?
Yes I did not find a lock that is specifically for bus->resources but
it seemed to me that changes to resources would only affect things
running under the rescan/remove lock.
>
> > for (i = 0; i < PCI_STD_NUM_BARS; i++) {
> > - if (!zdev->bars[i].size || !zdev->bars[i].res)
> > + res = zdev->bars[i].res;
> > + if (!res)
> > continue;
> >
> > + release_resource(res);
> > + pci_bus_remove_resource(zdev->zbus->bus, res);
> > zpci_free_iomap(zdev, zdev->bars[i].map_idx);
> > - release_resource(zdev->bars[i].res);
> > - kfree(zdev->bars[i].res);
> > + zdev->bars[i].res = NULL;
> > + kfree(res);
> > }
> > zdev->has_resources = 0;
> > + pci_unlock_rescan_remove();
> > }
> >
> > int pcibios_device_add(struct pci_dev *pdev)
> > diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
> > index 6a8da1b742ae..a99926af2b69 100644
> > --- a/arch/s390/pci/pci_bus.c
> > +++ b/arch/s390/pci/pci_bus.c
> > @@ -41,9 +41,7 @@ static int zpci_nb_devices;
> > */
> > static int zpci_bus_prepare_device(struct zpci_dev *zdev)
> > {
> > - struct resource_entry *window, *n;
> > - struct resource *res;
> > - int rc;
> > + int rc, i;
> >
> > if (!zdev_enabled(zdev)) {
> > rc = zpci_enable_device(zdev);
> > @@ -57,10 +55,10 @@ static int zpci_bus_prepare_device(struct zpci_dev *zdev)
> > }
> >
> > if (!zdev->has_resources) {
> > - zpci_setup_bus_resources(zdev, &zdev->zbus->resources);
> > - resource_list_for_each_entry_safe(window, n, &zdev->zbus->resources) {
> > - res = window->res;
> > - pci_bus_add_resource(zdev->zbus->bus, res, 0);
> > + zpci_setup_bus_resources(zdev);
> > + for (i = 0; i < PCI_STD_NUM_BARS; i++) {
> > + if (zdev->bars[i].res)
> > + pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res, 0);
> > }
> > }
> >
> > diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h
> > index e96c9860e064..af9f0ac79a1b 100644
> > --- a/arch/s390/pci/pci_bus.h
> > +++ b/arch/s390/pci/pci_bus.h
> > @@ -30,8 +30,7 @@ static inline void zpci_zdev_get(struct zpci_dev *zdev)
> >
> > int zpci_alloc_domain(int domain);
> > void zpci_free_domain(int domain);
> > -int zpci_setup_bus_resources(struct zpci_dev *zdev,
> > - struct list_head *resources);
> > +int zpci_setup_bus_resources(struct zpci_dev *zdev);
> >
> > static inline struct zpci_dev *zdev_from_bus(struct pci_bus *bus,
> > unsigned int devfn)
> > diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
> > index 83ae838ceb5f..f021f1d4af9f 100644
> > --- a/drivers/pci/bus.c
> > +++ b/drivers/pci/bus.c
> > @@ -76,6 +76,29 @@ struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n)
> > }
> > EXPORT_SYMBOL_GPL(pci_bus_resource_n);
> >
> > +void pci_bus_remove_resource(struct pci_bus *bus, struct resource *res)
> > +{
> > + struct pci_bus_resource *bus_res, *tmp;
> > + int i;
> > +
> > + for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
> > + if (bus->resource[i] == res) {
> > + bus->resource[i] = NULL;
> > + return;
> > + }
> > + }
> > +
> > + list_for_each_entry_safe(bus_res, tmp, &bus->resources, list) {
> > + if (bus_res->res == res) {
> > + list_del(&bus_res->list);
> > + kfree(bus_res);
> > + return;
> > + }
> > + }
> > + return;
> > +
>
> Superfluous "return" and blank line.
Will drop.
>
> > +}
> > +
> > void pci_bus_remove_resources(struct pci_bus *bus)
> > {
> > int i;
> > diff --git a/include/linux/pci.h b/include/linux/pci.h
> > index adffd65e84b4..3b1974e2ec73 100644
> > --- a/include/linux/pci.h
> > +++ b/include/linux/pci.h
> > @@ -1436,6 +1436,7 @@ void pci_bus_add_resource(struct pci_bus *bus, struct resource *res,
> > unsigned int flags);
> > struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n);
> > void pci_bus_remove_resources(struct pci_bus *bus);
> > +void pci_bus_remove_resource(struct pci_bus *bus, struct resource *res);
> > int devm_request_pci_bus_resources(struct device *dev,
> > struct list_head *resources);
> >
> > --
> > 2.37.2
> >
On Mon, 2023-02-20 at 13:53 +0100, Niklas Schnelle wrote:
> On Fri, 2023-02-17 at 17:15 -0600, Bjorn Helgaas wrote:
> > On Tue, Feb 14, 2023 at 10:49:10AM +0100, Niklas Schnelle wrote:
> >
> >
---8<---
> > Other random questions unrelated to this patch:
> >
> > - zpci_bus_create_pci_bus() calls pci_bus_add_devices(). Isn't that
> > pointless? AFAICT, the bus->devices list is empty then.
>
> Yes I think you're right it does nothing and can be dropped.
>
> >
> > - What about zpci_bus_scan_device()? Why does it call both
> > pci_bus_add_device() and pci_bus_add_devices()? The latter will
> > just call the former, so it looks redundant. And the latter is
> > locked but not the former?
>
> Hmm. great find. This seems to have been weird and redundant since I
> first used that pattern in 3047766bc6ec ("s390/pci: fix enabling a
> reserved PCI function"). I think maybe then the reason for this was
> that prior to 960ac3626487 ("s390/pci: allow zPCI zbus without a
> function zero") when the newly enabled is devfn == 0 there could be
> functions from the same bus which would not have been added yet. I'm
> not sure though. That was definitely the idea behind the
> zpci_bus_scan_bus() in zpci_scan_configured_devices() that is also
> redundant now as we can now scan each function as it appears.
>
> This will definitely need to be cleaned up.
>
I'm working on cleaning this up but I'm a little confused by what
exactly needs to be under the pci_rescan_remove lock. For example the
pci_bus_add_device(virtfn) at the end of pci_iov_add_virtfn() doesn't
seem to be under the lock while most calls to pci_bus_add_devices()
are, most prominently the one in acpi_pci_root_add() which I assume is
what is used on most x86 systems. Any hints?
Also I think my original thought here might have been a premature worry
about PCI-to-PCI bridges thinking that adding the new device could lead
to more devices appearing. Of course actually thinking about it a bit
more there are quite a few other things that won't work without further
changes if we wanted to add bridges e.g. we would need to create
zpci_dev structs for these somewhere.
On Mon, Feb 20, 2023 at 01:53:34PM +0100, Niklas Schnelle wrote:
> On Fri, 2023-02-17 at 17:15 -0600, Bjorn Helgaas wrote:
> > On Tue, Feb 14, 2023 at 10:49:10AM +0100, Niklas Schnelle wrote:
> > > ...
> > What happens when zpci_bus_release() calls
> > pci_free_resource_list() on &zbus->resources? It looks like that
> > ultimately calls kfree(), which is OK for the
> > zpci_setup_bus_resources() stuff, but what about the
> > zbus->bus_resource that was not kalloc'ed?
>
> As far as I can see pci_free_resource_list() only calls kfree() on the
> entry not on entry->res. The resources set up in
> zpci_setup_bus_resources() are freed in zpci_cleanup_bus_resources()
> explicitly.
So I guess the zbus->resources are allocated in zpci_bus_scan_device()
where zpci_setup_bus_resources() adds a zbus resource for every
zpci_dev BAR, and freed in zpci_bus_release() when the last zpci_dev
is unregistered.
Does that mean that if you add device A, add device B, and remove A,
the zbus retains A's resources even though A is gone? What if you
then add device C whose resources partially overlap A's?
> > > static void zpci_cleanup_bus_resources(struct zpci_dev *zdev)
> > > {
> > > + struct resource *res;
> > > int i;
> > >
> > > + pci_lock_rescan_remove();
> >
> > What exactly is this protecting? This doesn't seem like quite the
> > right place since we're not adding/removing a pci_dev here. Is this
> > to protect the bus->resources list in pci_bus_remove_resource()?
>
> Yes I did not find a lock that is specifically for bus->resources but
> it seemed to me that changes to resources would only affect things
> running under the rescan/remove lock.
Yeah, OK.
> > > for (i = 0; i < PCI_STD_NUM_BARS; i++) {
> > > - if (!zdev->bars[i].size || !zdev->bars[i].res)
> > > + res = zdev->bars[i].res;
> > > + if (!res)
> > > continue;
> > >
> > > + release_resource(res);
> > > + pci_bus_remove_resource(zdev->zbus->bus, res);
> > > zpci_free_iomap(zdev, zdev->bars[i].map_idx);
> > > - release_resource(zdev->bars[i].res);
> > > - kfree(zdev->bars[i].res);
> > > + zdev->bars[i].res = NULL;
> > > + kfree(res);
> > > }
> > > zdev->has_resources = 0;
> > > + pci_unlock_rescan_remove();
On Wed, 2023-02-22 at 16:02 -0600, Bjorn Helgaas wrote:
> On Mon, Feb 20, 2023 at 01:53:34PM +0100, Niklas Schnelle wrote:
> > On Fri, 2023-02-17 at 17:15 -0600, Bjorn Helgaas wrote:
> > > On Tue, Feb 14, 2023 at 10:49:10AM +0100, Niklas Schnelle wrote:
> > > > ...
>
> > > What happens when zpci_bus_release() calls
> > > pci_free_resource_list() on &zbus->resources? It looks like that
> > > ultimately calls kfree(), which is OK for the
> > > zpci_setup_bus_resources() stuff, but what about the
> > > zbus->bus_resource that was not kalloc'ed?
> >
> > As far as I can see pci_free_resource_list() only calls kfree() on the
> > entry not on entry->res. The resources set up in
> > zpci_setup_bus_resources() are freed in zpci_cleanup_bus_resources()
> > explicitly.
>
> So I guess the zbus->resources are allocated in zpci_bus_scan_device()
> where zpci_setup_bus_resources() adds a zbus resource for every
> zpci_dev BAR, and freed in zpci_bus_release() when the last zpci_dev
> is unregistered.
Only the zbus->resources list itself is freed in zpci_bus_release() the
resources for the BARs of each zpci_dev are freed in
zpci_cleanup_bus_resources() when the individual zpci_dev is removed.
>
> Does that mean that if you add device A, add device B, and remove A,
> the zbus retains A's resources even though A is gone? What if you
> then add device C whose resources partially overlap A's?
>
> > >
No. Prior to the proposed patch pci_bus::resources/pci_bus::resource[]
and zpci_bus::resources would retain dangling pointers to the BAR
resources of A while the BAR resource itself was already freed when A
is removed. This is the use-after-free this patch is trying to fix.
The BAR resource freeing happens when zpci_cleanup_bus_resources() is
called in zpci_release_device() once the reference count for the struct
zpci_dev hits 0. With this patch this stays the same but we're a)
removing the resource pointer from pci_bus::resources /
pci_bus::resource[] and never adding it to zpci_bus::resources. Meaning
with this patch zpci_bus::resources doesn't store BAR resources at all
and is only used for the IORESOURCE_BUS the BAR resources are instead
only referenced by zpci_dev::bars[bar_num].res and pci_bus::resources /
pci_bus::resource[i].
I don't think we can get overlapping resources but this way the
resource structs are freed when the device (PCI function) goes away
which is also when they become inaccessible for our PCI load/store
instructions.
[+cc Lukas]
On Wed, Feb 22, 2023 at 05:54:55PM +0100, Niklas Schnelle wrote:
> On Mon, 2023-02-20 at 13:53 +0100, Niklas Schnelle wrote:
> > On Fri, 2023-02-17 at 17:15 -0600, Bjorn Helgaas wrote:
> > > On Tue, Feb 14, 2023 at 10:49:10AM +0100, Niklas Schnelle wrote:
> ---8<---
> > > - What about zpci_bus_scan_device()? Why does it call both
> > > pci_bus_add_device() and pci_bus_add_devices()? The latter will
> > > just call the former, so it looks redundant. And the latter is
> > > locked but not the former?
> >
> > Hmm. great find. This seems to have been weird and redundant since I
> > first used that pattern in 3047766bc6ec ("s390/pci: fix enabling a
> > reserved PCI function"). I think maybe then the reason for this was
> > that prior to 960ac3626487 ("s390/pci: allow zPCI zbus without a
> > function zero") when the newly enabled is devfn == 0 there could be
> > functions from the same bus which would not have been added yet. I'm
> > not sure though. That was definitely the idea behind the
> > zpci_bus_scan_bus() in zpci_scan_configured_devices() that is also
> > redundant now as we can now scan each function as it appears.
>
> I'm working on cleaning this up but I'm a little confused by what
> exactly needs to be under the pci_rescan_remove lock. For example the
> pci_bus_add_device(virtfn) at the end of pci_iov_add_virtfn() doesn't
> seem to be under the lock while most calls to pci_bus_add_devices()
> are, most prominently the one in acpi_pci_root_add() which I assume is
> what is used on most x86 systems. Any hints?
>
> Also I think my original thought here might have been a premature worry
> about PCI-to-PCI bridges thinking that adding the new device could lead
> to more devices appearing. Of course actually thinking about it a bit
> more there are quite a few other things that won't work without further
> changes if we wanted to add bridges e.g. we would need to create
> zpci_dev structs for these somewhere.
Hmm. Good question. Off the top of my head, I can't explain the
difference between pci_rescan_remove_lock and pci_bus_sem, so I'm
confused, too. I added Lukas in case he has a ready explanation.
Bjorn
On Thu, Feb 23, 2023 at 01:53:45PM -0600, Bjorn Helgaas wrote:
> Hmm. Good question. Off the top of my head, I can't explain the
> difference between pci_rescan_remove_lock and pci_bus_sem, so I'm
> confused, too. I added Lukas in case he has a ready explanation.
pci_bus_sem is a global lock which protects the "devices" list of all
pci_bus structs.
We do have a bunch of places left where the "devices" list is accessed
without holding pci_bus_sem, though I've tried to slowly eliminate
them.
pci_rescan_remove_lock is a global "big kernel lock" which serializes
any device addition and removal.
pci_rescan_remove_lock is known to be far too course-grained and thus
deadlock-prone, particularly if hotplug ports are nested (as is the
case with Thunderbolt). It needs to be split up into several smaller
locks which protect e.g. allocation of resources of a bus (bus numbers
or MMIO / IO space) and whatever else needs to be protected. It's just
that nobody has gotten around to identify what exactly needs to be
protected, adding the new locks and removing pci_rescan_remove_lock.
Thanks,
Lukas
On Fri, 2023-02-24 at 05:19 +0100, Lukas Wunner wrote:
> On Thu, Feb 23, 2023 at 01:53:45PM -0600, Bjorn Helgaas wrote:
> > Hmm. Good question. Off the top of my head, I can't explain the
> > difference between pci_rescan_remove_lock and pci_bus_sem, so I'm
> > confused, too. I added Lukas in case he has a ready explanation.
>
> pci_bus_sem is a global lock which protects the "devices" list of all
> pci_bus structs.
>
> We do have a bunch of places left where the "devices" list is accessed
> without holding pci_bus_sem, though I've tried to slowly eliminate
> them.
>
> pci_rescan_remove_lock is a global "big kernel lock" which serializes
> any device addition and removal.
>
> pci_rescan_remove_lock is known to be far too course-grained and thus
> deadlock-prone, particularly if hotplug ports are nested (as is the
> case with Thunderbolt). It needs to be split up into several smaller
> locks which protect e.g. allocation of resources of a bus (bus numbers
> or MMIO / IO space) and whatever else needs to be protected. It's just
> that nobody has gotten around to identify what exactly needs to be
> protected, adding the new locks and removing pci_rescan_remove_lock.
>
> Thanks,
>
> Lukas
Thanks for the insights. So from that description I think it might make
sense to do this fix patch with the pci_rescan_remove_lock so it can be
backported. Then we can take the opportunity to add a lock specific to
the allocation/freeing of resources which would then replace at least
this new directly and clearly resource related use of
pci_rescan_remove_lock and potentially others we find.
What do you think?
Thanks,
Niklas
On Tue, Feb 28, 2023 at 10:08:45AM +0100, Niklas Schnelle wrote:
> On Fri, 2023-02-24 at 05:19 +0100, Lukas Wunner wrote:
> > On Thu, Feb 23, 2023 at 01:53:45PM -0600, Bjorn Helgaas wrote:
> > > Hmm. Good question. Off the top of my head, I can't explain the
> > > difference between pci_rescan_remove_lock and pci_bus_sem, so I'm
> > > confused, too. I added Lukas in case he has a ready explanation.
> >
> > pci_bus_sem is a global lock which protects the "devices" list of all
> > pci_bus structs.
> >
> > We do have a bunch of places left where the "devices" list is accessed
> > without holding pci_bus_sem, though I've tried to slowly eliminate
> > them.
> >
> > pci_rescan_remove_lock is a global "big kernel lock" which serializes
> > any device addition and removal.
> >
> > pci_rescan_remove_lock is known to be far too course-grained and thus
> > deadlock-prone, particularly if hotplug ports are nested (as is the
> > case with Thunderbolt). It needs to be split up into several smaller
> > locks which protect e.g. allocation of resources of a bus (bus numbers
> > or MMIO / IO space) and whatever else needs to be protected. It's just
> > that nobody has gotten around to identify what exactly needs to be
> > protected, adding the new locks and removing pci_rescan_remove_lock.
>
> Thanks for the insights. So from that description I think it might make
> sense to do this fix patch with the pci_rescan_remove_lock so it can be
> backported. Then we can take the opportunity to add a lock specific to
> the allocation/freeing of resources which would then replace at least
> this new directly and clearly resource related use of
> pci_rescan_remove_lock and potentially others we find.
> What do you think?
I don't think Lukas was suggesting that *you* need to split the
locking up, just that it *should* be split up someday. To me, that
looks like a project on its own that is beyond the scope of this
particular fix, so I think the pci_lock_rescan_remove() as you have it
here is fine for now.
When you fix up the superfluous "return", go ahead and add my
Acked-by: Bjorn Helgaas <[email protected]>
to your patch. I assume it's easier for you to shepherd this through
the s390 tree, but let me know if you'd rather that I take it.
Bjorn