The DesignWare PCIe host driver uses a chained interrupt to demultiplex
the downstream MSI interrupts. In some circumstances, the system can get
into a state where the parent interrupt is triggered continuously, even
though reading the PCIe host registers doesn't identify any child MSI
interrupt source. This effectively locks up CPU0, which spends all the
time servicing these interrupts. This behavior has been observed on a
Qualcomm SA8540P Ride, with pcie2a and pcie3a enabled at the same time.
This is a clear example of how bypassing the interrupt core by using
chained interrupts can be very dangerous if the hardware misbehaves.
These issues are particularly hard to investigate, because the system
appears to be completely locked up.
The proposed solution is to use regular interrupts instead of chained
interrupts for the demultiplex handler in the PCI dwc driver. This
allows the interrupt storm detector to kick in and disable the faulty
interrupt. Testing showed that the interrupt storm is mitigated with no
visible impact (other than the specific log message), and the system
continues to run as expected. This is a much more desirable behavior
than a system lockup.
In a different thread [1], further advantages of regular over chained
interrupts were presented. This patch follows the guidelines set out in
that thread, and represents another real-life example of how things can
go really wrong with chained interrupts.
[1] https://lore.kernel.org/all/877csohcll.ffs@tglx/
Radu Rendec (1):
PCI: dwc: Use regular interrupt instead of chained
.../pci/controller/dwc/pcie-designware-host.c | 35 +++++++++----------
1 file changed, 17 insertions(+), 18 deletions(-)
--
2.41.0
The DesignWare PCIe host driver uses a chained interrupt to demultiplex
the downstream MSI interrupts. On Qualcomm SA8540P Ride, enabling both
pcie2a and pcie3a at the same time can create an interrupt storm where
the parent interrupt fires continuously, even though reading the PCIe
host registers doesn't identify any child MSI interrupt source. This
effectively locks up CPU0, which spends all the time servicing these
interrupts.
This is a clear example of how bypassing the interrupt core by using
chained interrupts can be very dangerous if the hardware misbehaves.
Convert the driver to use a regular interrupt for the demultiplex
handler. This allows the interrupt storm detector to detect the faulty
interrupt and disable it, allowing the system to run normally.
Signed-off-by: Radu Rendec <[email protected]>
---
.../pci/controller/dwc/pcie-designware-host.c | 35 +++++++++----------
1 file changed, 17 insertions(+), 18 deletions(-)
diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
index 9952057c8819c..b603796d415d7 100644
--- a/drivers/pci/controller/dwc/pcie-designware-host.c
+++ b/drivers/pci/controller/dwc/pcie-designware-host.c
@@ -83,18 +83,9 @@ irqreturn_t dw_handle_msi_irq(struct dw_pcie_rp *pp)
return ret;
}
-/* Chained MSI interrupt service routine */
-static void dw_chained_msi_isr(struct irq_desc *desc)
+static irqreturn_t dw_pcie_msi_isr(int irq, void *dev_id)
{
- struct irq_chip *chip = irq_desc_get_chip(desc);
- struct dw_pcie_rp *pp;
-
- chained_irq_enter(chip, desc);
-
- pp = irq_desc_get_handler_data(desc);
- dw_handle_msi_irq(pp);
-
- chained_irq_exit(chip, desc);
+ return dw_handle_msi_irq(dev_id);
}
static void dw_pci_setup_msi_msg(struct irq_data *d, struct msi_msg *msg)
@@ -254,20 +245,21 @@ int dw_pcie_allocate_domains(struct dw_pcie_rp *pp)
return 0;
}
-static void dw_pcie_free_msi(struct dw_pcie_rp *pp)
+static void __dw_pcie_free_msi(struct dw_pcie_rp *pp, u32 num_ctrls)
{
u32 ctrl;
- for (ctrl = 0; ctrl < MAX_MSI_CTRLS; ctrl++) {
+ for (ctrl = 0; ctrl < num_ctrls; ctrl++) {
if (pp->msi_irq[ctrl] > 0)
- irq_set_chained_handler_and_data(pp->msi_irq[ctrl],
- NULL, NULL);
+ free_irq(pp->msi_irq[ctrl], pp);
}
irq_domain_remove(pp->msi_domain);
irq_domain_remove(pp->irq_domain);
}
+#define dw_pcie_free_msi(pp) __dw_pcie_free_msi(pp, MAX_MSI_CTRLS)
+
static void dw_pcie_msi_init(struct dw_pcie_rp *pp)
{
struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
@@ -361,9 +353,16 @@ static int dw_pcie_msi_host_init(struct dw_pcie_rp *pp)
return ret;
for (ctrl = 0; ctrl < num_ctrls; ctrl++) {
- if (pp->msi_irq[ctrl] > 0)
- irq_set_chained_handler_and_data(pp->msi_irq[ctrl],
- dw_chained_msi_isr, pp);
+ if (pp->msi_irq[ctrl] > 0) {
+ ret = request_irq(pp->msi_irq[ctrl], dw_pcie_msi_isr, 0,
+ dev_name(dev), pp);
+ if (ret) {
+ dev_err(dev, "Failed to request irq %d: %d\n",
+ pp->msi_irq[ctrl], ret);
+ __dw_pcie_free_msi(pp, ctrl);
+ return ret;
+ }
+ }
}
/*
--
2.41.0
On Thu, Jun 29, 2023 at 02:30:19PM -0400, Radu Rendec wrote:
> The DesignWare PCIe host driver uses a chained interrupt to demultiplex
> the downstream MSI interrupts. On Qualcomm SA8540P Ride, enabling both
> pcie2a and pcie3a at the same time can create an interrupt storm where
> the parent interrupt fires continuously, even though reading the PCIe
> host registers doesn't identify any child MSI interrupt source. This
> effectively locks up CPU0, which spends all the time servicing these
> interrupts.
>
> This is a clear example of how bypassing the interrupt core by using
> chained interrupts can be very dangerous if the hardware misbehaves.
>
> Convert the driver to use a regular interrupt for the demultiplex
> handler. This allows the interrupt storm detector to detect the faulty
> interrupt and disable it, allowing the system to run normally.
There are many other users of irq_set_chained_handler_and_data() in
drivers/pci/controller/. Should they be similarly converted? If not,
how do we decide which need to use irq_set_chained_handler_and_data()
and which do not?
> Signed-off-by: Radu Rendec <[email protected]>
> ---
> .../pci/controller/dwc/pcie-designware-host.c | 35 +++++++++----------
> 1 file changed, 17 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
> index 9952057c8819c..b603796d415d7 100644
> --- a/drivers/pci/controller/dwc/pcie-designware-host.c
> +++ b/drivers/pci/controller/dwc/pcie-designware-host.c
> @@ -83,18 +83,9 @@ irqreturn_t dw_handle_msi_irq(struct dw_pcie_rp *pp)
> return ret;
> }
>
> -/* Chained MSI interrupt service routine */
> -static void dw_chained_msi_isr(struct irq_desc *desc)
> +static irqreturn_t dw_pcie_msi_isr(int irq, void *dev_id)
> {
> - struct irq_chip *chip = irq_desc_get_chip(desc);
> - struct dw_pcie_rp *pp;
> -
> - chained_irq_enter(chip, desc);
> -
> - pp = irq_desc_get_handler_data(desc);
> - dw_handle_msi_irq(pp);
> -
> - chained_irq_exit(chip, desc);
> + return dw_handle_msi_irq(dev_id);
> }
>
> static void dw_pci_setup_msi_msg(struct irq_data *d, struct msi_msg *msg)
> @@ -254,20 +245,21 @@ int dw_pcie_allocate_domains(struct dw_pcie_rp *pp)
> return 0;
> }
>
> -static void dw_pcie_free_msi(struct dw_pcie_rp *pp)
> +static void __dw_pcie_free_msi(struct dw_pcie_rp *pp, u32 num_ctrls)
> {
> u32 ctrl;
>
> - for (ctrl = 0; ctrl < MAX_MSI_CTRLS; ctrl++) {
> + for (ctrl = 0; ctrl < num_ctrls; ctrl++) {
> if (pp->msi_irq[ctrl] > 0)
> - irq_set_chained_handler_and_data(pp->msi_irq[ctrl],
> - NULL, NULL);
> + free_irq(pp->msi_irq[ctrl], pp);
> }
>
> irq_domain_remove(pp->msi_domain);
> irq_domain_remove(pp->irq_domain);
> }
>
> +#define dw_pcie_free_msi(pp) __dw_pcie_free_msi(pp, MAX_MSI_CTRLS)
What is the benefit of the dw_pcie_free_msi() macro?
> static void dw_pcie_msi_init(struct dw_pcie_rp *pp)
> {
> struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
> @@ -361,9 +353,16 @@ static int dw_pcie_msi_host_init(struct dw_pcie_rp *pp)
> return ret;
>
> for (ctrl = 0; ctrl < num_ctrls; ctrl++) {
> - if (pp->msi_irq[ctrl] > 0)
> - irq_set_chained_handler_and_data(pp->msi_irq[ctrl],
> - dw_chained_msi_isr, pp);
> + if (pp->msi_irq[ctrl] > 0) {
> + ret = request_irq(pp->msi_irq[ctrl], dw_pcie_msi_isr, 0,
> + dev_name(dev), pp);
> + if (ret) {
> + dev_err(dev, "Failed to request irq %d: %d\n",
> + pp->msi_irq[ctrl], ret);
> + __dw_pcie_free_msi(pp, ctrl);
> + return ret;
> + }
> + }
> }
>
> /*
> --
> 2.41.0
>
On Thu, 2023-06-29 at 14:57 -0500, Bjorn Helgaas wrote:
> On Thu, Jun 29, 2023 at 02:30:19PM -0400, Radu Rendec wrote:
> > The DesignWare PCIe host driver uses a chained interrupt to demultiplex
> > the downstream MSI interrupts. On Qualcomm SA8540P Ride, enabling both
> > pcie2a and pcie3a at the same time can create an interrupt storm where
> > the parent interrupt fires continuously, even though reading the PCIe
> > host registers doesn't identify any child MSI interrupt source. This
> > effectively locks up CPU0, which spends all the time servicing these
> > interrupts.
> >
> > This is a clear example of how bypassing the interrupt core by using
> > chained interrupts can be very dangerous if the hardware misbehaves.
> >
> > Convert the driver to use a regular interrupt for the demultiplex
> > handler. This allows the interrupt storm detector to detect the faulty
> > interrupt and disable it, allowing the system to run normally.
>
> There are many other users of irq_set_chained_handler_and_data() in
> drivers/pci/controller/. Should they be similarly converted? If not,
> how do we decide which need to use irq_set_chained_handler_and_data()
> and which do not?
According to Thomas Gleixner, yes. Obviously I don't want to put words
in his mouth, but I think that's the gist of what he said in a reply to
an RFC patch that I sent a few weeks ago:
https://lore.kernel.org/all/877csohcll.ffs@tglx/
> > Signed-off-by: Radu Rendec <[email protected]>
> > ---
> > .../pci/controller/dwc/pcie-designware-host.c | 35 +++++++++----------
> > 1 file changed, 17 insertions(+), 18 deletions(-)
> >
> > diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
> > index 9952057c8819c..b603796d415d7 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware-host.c
> > +++ b/drivers/pci/controller/dwc/pcie-designware-host.c
> > @@ -83,18 +83,9 @@ irqreturn_t dw_handle_msi_irq(struct dw_pcie_rp *pp)
> > return ret;
> > }
> >
> > -/* Chained MSI interrupt service routine */
> > -static void dw_chained_msi_isr(struct irq_desc *desc)
> > +static irqreturn_t dw_pcie_msi_isr(int irq, void *dev_id)
> > {
> > - struct irq_chip *chip = irq_desc_get_chip(desc);
> > - struct dw_pcie_rp *pp;
> > -
> > - chained_irq_enter(chip, desc);
> > -
> > - pp = irq_desc_get_handler_data(desc);
> > - dw_handle_msi_irq(pp);
> > -
> > - chained_irq_exit(chip, desc);
> > + return dw_handle_msi_irq(dev_id);
> > }
> >
> > static void dw_pci_setup_msi_msg(struct irq_data *d, struct msi_msg *msg)
> > @@ -254,20 +245,21 @@ int dw_pcie_allocate_domains(struct dw_pcie_rp *pp)
> > return 0;
> > }
> >
> > -static void dw_pcie_free_msi(struct dw_pcie_rp *pp)
> > +static void __dw_pcie_free_msi(struct dw_pcie_rp *pp, u32 num_ctrls)
> > {
> > u32 ctrl;
> >
> > - for (ctrl = 0; ctrl < MAX_MSI_CTRLS; ctrl++) {
> > + for (ctrl = 0; ctrl < num_ctrls; ctrl++) {
> > if (pp->msi_irq[ctrl] > 0)
> > - irq_set_chained_handler_and_data(pp->msi_irq[ctrl],
> > - NULL, NULL);
> > + free_irq(pp->msi_irq[ctrl], pp);
> > }
> >
> > irq_domain_remove(pp->msi_domain);
> > irq_domain_remove(pp->irq_domain);
> > }
> >
> > +#define dw_pcie_free_msi(pp) __dw_pcie_free_msi(pp, MAX_MSI_CTRLS)
>
> What is the benefit of the dw_pcie_free_msi() macro?
It allows me to add the num_ctrls parameter to the corresponding
function (now renamed to __dw_pcie_free_msi()) without forcing all the
existing call sites to pass MAX_MSI_CTRLS explicitly.
I needed that extra parameter to avoid duplicating the teardown code
on the (new) error path in dw_pcie_msi_host_init() - see below.
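For completeness, a static inline wrapper would achieve the same thing
without a macro; a minimal equivalent sketch:

  static inline void dw_pcie_free_msi(struct dw_pcie_rp *pp)
  {
          /* same default the macro hardcodes for existing call sites */
          __dw_pcie_free_msi(pp, MAX_MSI_CTRLS);
  }

Either way, the existing call sites stay unchanged.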
> > static void dw_pcie_msi_init(struct dw_pcie_rp *pp)
> > {
> > struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
> > @@ -361,9 +353,16 @@ static int dw_pcie_msi_host_init(struct dw_pcie_rp *pp)
> > return ret;
> >
> > for (ctrl = 0; ctrl < num_ctrls; ctrl++) {
> > - if (pp->msi_irq[ctrl] > 0)
> > - irq_set_chained_handler_and_data(pp->msi_irq[ctrl],
> > - dw_chained_msi_isr, pp);
> > + if (pp->msi_irq[ctrl] > 0) {
> > + ret = request_irq(pp->msi_irq[ctrl], dw_pcie_msi_isr, 0,
> > + dev_name(dev), pp);
> > + if (ret) {
> > + dev_err(dev, "Failed to request irq %d: %d\n",
> > + pp->msi_irq[ctrl], ret);
> > + __dw_pcie_free_msi(pp, ctrl);
This is where I'm using the extra parameter. If we fail to request an
interrupt, we need to free all the other interrupts that we have
requested so far, to leave everything in a clean state. But we can't
use MAX_MSI_CTRLS with __dw_pcie_free_msi() and rely on the check there
because there may be extra interrupts that we haven't requested *yet*
and we would attempt to free them.
> > + return ret;
> > + }
> > + }
> > }
> >
> > /*
> > --
> > 2.41.0
> >
>
On Thu, Jun 29, 2023 at 04:42:07PM -0400, Radu Rendec wrote:
> On Thu, 2023-06-29 at 14:57 -0500, Bjorn Helgaas wrote:
> > On Thu, Jun 29, 2023 at 02:30:19PM -0400, Radu Rendec wrote:
> > > The DesignWare PCIe host driver uses a chained interrupt to demultiplex
> > > the downstream MSI interrupts. On Qualcomm SA8540P Ride, enabling both
> > > pcie2a and pcie3a at the same time can create an interrupt storm where
> > > the parent interrupt fires continuously, even though reading the PCIe
> > > host registers doesn't identify any child MSI interrupt source. This
> > > effectively locks up CPU0, which spends all the time servicing these
> > > interrupts.
> > >
> > > This is a clear example of how bypassing the interrupt core by using
> > > chained interrupts can be very dangerous if the hardware misbehaves.
> > >
> > > Convert the driver to use a regular interrupt for the demultiplex
> > > handler. This allows the interrupt storm detector to detect the faulty
> > > interrupt and disable it, allowing the system to run normally.
> >
> > There are many other users of irq_set_chained_handler_and_data() in
> > drivers/pci/controller/. Should they be similarly converted? If not,
> > how do we decide which need to use irq_set_chained_handler_and_data()
> > and which do not?
>
> According to Thomas Gleixner, yes. Obviously I don't want to put words
> in his mouth, but I think that's the gist of what he said in a reply to
> an RFC patch that I sent a few weeks ago:
> https://lore.kernel.org/all/877csohcll.ffs@tglx/
If it's a bug in pcie-designware-host.c, and it's also a bug in the
other callers, we should fix them all.
But you do imply that there's some DesignWare hardware issue involved,
too, so I guess it's possible the other drivers don't have an issue
and/or actually require the chained IRQs. That's why I asked how we
should decide.
> > > -static void dw_pcie_free_msi(struct dw_pcie_rp *pp)
> > > +static void __dw_pcie_free_msi(struct dw_pcie_rp *pp, u32 num_ctrls)
> > > {
> > > u32 ctrl;
> > >
> > > - for (ctrl = 0; ctrl < MAX_MSI_CTRLS; ctrl++) {
> > > + for (ctrl = 0; ctrl < num_ctrls; ctrl++) {
> > > if (pp->msi_irq[ctrl] > 0)
> > > - irq_set_chained_handler_and_data(pp->msi_irq[ctrl],
> > > - NULL, NULL);
> > > + free_irq(pp->msi_irq[ctrl], pp);
> > > }
> > >
> > > irq_domain_remove(pp->msi_domain);
> > > irq_domain_remove(pp->irq_domain);
> > > }
> > >
> > > +#define dw_pcie_free_msi(pp) __dw_pcie_free_msi(pp, MAX_MSI_CTRLS)
> >
> > What is the benefit of the dw_pcie_free_msi() macro?
>
> It allows me to add the num_ctrls parameter to the corresponding
> function (now renamed to __dw_pcie_free_msi()) without forcing all the
> existing call sites to pass MAX_MSI_CTRLS explicitly.
>
> I needed that extra parameter to avoid duplicating the teardown code
> on the (new) error path in dw_pcie_msi_host_init() - see below.
>
> > > static void dw_pcie_msi_init(struct dw_pcie_rp *pp)
> > > {
> > > struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
> > > @@ -361,9 +353,16 @@ static int dw_pcie_msi_host_init(struct dw_pcie_rp *pp)
> > > return ret;
> > >
> > > for (ctrl = 0; ctrl < num_ctrls; ctrl++) {
> > > - if (pp->msi_irq[ctrl] > 0)
> > > - irq_set_chained_handler_and_data(pp->msi_irq[ctrl],
> > > - dw_chained_msi_isr, pp);
> > > + if (pp->msi_irq[ctrl] > 0) {
> > > + ret = request_irq(pp->msi_irq[ctrl], dw_pcie_msi_isr, 0,
> > > + dev_name(dev), pp);
> > > + if (ret) {
> > > + dev_err(dev, "Failed to request irq %d: %d\n",
> > > + pp->msi_irq[ctrl], ret);
> > > + __dw_pcie_free_msi(pp, ctrl);
>
> This is where I'm using the extra parameter. If we fail to request an
> interrupt, we need to free all the other interrupts that we have
> requested so far, to leave everything in a clean state. But we can't
> use MAX_MSI_CTRLS with __dw_pcie_free_msi() and rely on the check there
> because there may be extra interrupts that we haven't requested *yet*
> and we would attempt to free them.
Makes sense, thanks.
Bjorn
On Thu, 2023-06-29 at 15:58 -0500, Bjorn Helgaas wrote:
> On Thu, Jun 29, 2023 at 04:42:07PM -0400, Radu Rendec wrote:
> > On Thu, 2023-06-29 at 14:57 -0500, Bjorn Helgaas wrote:
> > > On Thu, Jun 29, 2023 at 02:30:19PM -0400, Radu Rendec wrote:
> > > > The DesignWare PCIe host driver uses a chained interrupt to demultiplex
> > > > the downstream MSI interrupts. On Qualcomm SA8540P Ride, enabling both
> > > > pcie2a and pcie3a at the same time can create an interrupt storm where
> > > > the parent interrupt fires continuously, even though reading the PCIe
> > > > host registers doesn't identify any child MSI interrupt source. This
> > > > effectively locks up CPU0, which spends all the time servicing these
> > > > interrupts.
> > > >
> > > > This is a clear example of how bypassing the interrupt core by using
> > > > chained interrupts can be very dangerous if the hardware misbehaves.
> > > >
> > > > Convert the driver to use a regular interrupt for the demultiplex
> > > > handler. This allows the interrupt storm detector to detect the faulty
> > > > interrupt and disable it, allowing the system to run normally.
> > >
> > > There are many other users of irq_set_chained_handler_and_data() in
> > > drivers/pci/controller/. Should they be similarly converted? If not,
> > > how do we decide which need to use irq_set_chained_handler_and_data()
> > > and which do not?
> >
> > According to Thomas Gleixner, yes. Obviously I don't want to put words
> > in his mouth, but I think that's the gist of what he said in a reply to
> > an RFC patch that I sent a few weeks ago:
> > https://lore.kernel.org/all/877csohcll.ffs@tglx/
>
> If it's a bug in pcie-designware-host.c, and it's also a bug in the
> other callers, we should fix them all.
I wouldn't call it a bug, because the driver works as long as the
hardware behaves. But if the hardware misbehaves and generates an
interrupt storm for whatever reason, you end up with an apparently
frozen system and no clue as to what's going on. By contrast, using
regular interrupts allows the interrupt storm detector to kick in. Then
the system works fine, and there's also a useful log message pointing
to that particular interrupt.
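For reference, that detector is the spurious-IRQ logic in
kernel/irq/spurious.c: once the overwhelming majority of recent
interrupts on a line go unhandled (99,900 out of the last 100,000), the
core disables the line and logs something like the following. The IRQ
number below is made up, and the exact wording can vary between kernel
versions:

  irq 64: nobody cared (try booting with the "irqpoll" option)
  ...
  handlers:
  [<...>] dw_pcie_msi_isr
  Disabling IRQ #64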
> But you do imply that there's some DesignWare hardware issue involved,
> too, so I guess it's possible the other drivers don't have an issue
> and/or actually require the chained IRQs. That's why I asked how we
> should decide.
That makes sense. I don't know if it's a DesignWare hardware issue or
something else down the PCIe bus. Other folks in my team are
investigating the root cause. This thread has the details:
https://lore.kernel.org/all/pmodcoakbs25z2a7mlo5gpuz63zluh35vbgb5itn6k5aqhjnny@jvphbpvahtse/
I want to point out that this patch doesn't *fix* the root problem (the
interrupt storm). It just makes the kernel handle it (much) better if
it occurs.
I can't see why any driver would _require_ chained IRQs. As I see it,
chained interrupts are just a "shortcut" that circumvents the IRQ core
for (debatable) performance reasons. Other than that, they should work
the same as regular interrupts.
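To make that concrete, here is a minimal sketch of the two styles. This
is not the exact driver code; pp and dw_handle_msi_irq() stand in for
the port structure and the demultiplex routine from this driver:

  /* Chained: called straight from the parent's flow handler; the IRQ
   * core never finds out whether anything was actually handled. */
  static void demux_chained(struct irq_desc *desc)
  {
          struct irq_chip *chip = irq_desc_get_chip(desc);

          chained_irq_enter(chip, desc);
          dw_handle_msi_irq(irq_desc_get_handler_data(desc));
          chained_irq_exit(chip, desc);
  }

  /* Regular: the IRQ_NONE/IRQ_HANDLED return value feeds the core's
   * unhandled-interrupt accounting, which is what makes storm
   * detection possible. */
  static irqreturn_t demux_regular(int irq, void *dev_id)
  {
          return dw_handle_msi_irq(dev_id);
  }

The handler bodies are nearly identical; the real difference is whether
the IRQ core sees the return value and can account for it.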
There are other benefits associated with using regular interrupts. One
of them is the ability to control the SMP affinity of the parent
interrupt. But that's a different topic.
Radu
> > > > -static void dw_pcie_free_msi(struct dw_pcie_rp *pp)
> > > > +static void __dw_pcie_free_msi(struct dw_pcie_rp *pp, u32 num_ctrls)
> > > > {
> > > > u32 ctrl;
> > > >
> > > > - for (ctrl = 0; ctrl < MAX_MSI_CTRLS; ctrl++) {
> > > > + for (ctrl = 0; ctrl < num_ctrls; ctrl++) {
> > > > if (pp->msi_irq[ctrl] > 0)
> > > > - irq_set_chained_handler_and_data(pp->msi_irq[ctrl],
> > > > - NULL, NULL);
> > > > + free_irq(pp->msi_irq[ctrl], pp);
> > > > }
> > > >
> > > > irq_domain_remove(pp->msi_domain);
> > > > irq_domain_remove(pp->irq_domain);
> > > > }
> > > >
> > > > +#define dw_pcie_free_msi(pp) __dw_pcie_free_msi(pp, MAX_MSI_CTRLS)
> > >
> > > What is the benefit of the dw_pcie_free_msi() macro?
> >
> > It allows me to add the num_ctrls parameter to the corresponding
> > function (now renamed to __dw_pcie_free_msi()) without forcing all the
> > existing call sites to pass MAX_MSI_CTRLS explicitly.
> >
> > I needed that extra parameter to avoid duplicating the teardown code
> > on the (new) error path in dw_pcie_msi_host_init() - see below.
> >
> > > > static void dw_pcie_msi_init(struct dw_pcie_rp *pp)
> > > > {
> > > > struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
> > > > @@ -361,9 +353,16 @@ static int dw_pcie_msi_host_init(struct dw_pcie_rp *pp)
> > > > return ret;
> > > >
> > > > for (ctrl = 0; ctrl < num_ctrls; ctrl++) {
> > > > - if (pp->msi_irq[ctrl] > 0)
> > > > - irq_set_chained_handler_and_data(pp->msi_irq[ctrl],
> > > > - dw_chained_msi_isr, pp);
> > > > + if (pp->msi_irq[ctrl] > 0) {
> > > > + ret = request_irq(pp->msi_irq[ctrl], dw_pcie_msi_isr, 0,
> > > > + dev_name(dev), pp);
> > > > + if (ret) {
> > > > + dev_err(dev, "Failed to request irq %d: %d\n",
> > > > + pp->msi_irq[ctrl], ret);
> > > > + __dw_pcie_free_msi(pp, ctrl);
> >
> > This is where I'm using the extra parameter. If we fail to request an
> > interrupt, we need to free all the other interrupts that we have
> > requested so far, to leave everything in a clean state. But we can't
> > use MAX_MSI_CTRLS with __dw_pcie_free_msi() and rely on the check there
> > because there may be extra interrupts that we haven't requested *yet*
> > and we would attempt to free them.
>
> Makes sense, thanks.
>
> Bjorn
>
[+cc Pali]
On Thu, Jun 29, 2023 at 05:27:54PM -0400, Radu Rendec wrote:
> On Thu, 2023-06-29 at 15:58 -0500, Bjorn Helgaas wrote:
> > On Thu, Jun 29, 2023 at 04:42:07PM -0400, Radu Rendec wrote:
> > > On Thu, 2023-06-29 at 14:57 -0500, Bjorn Helgaas wrote:
> > > > On Thu, Jun 29, 2023 at 02:30:19PM -0400, Radu Rendec wrote:
> > > > > The DesignWare PCIe host driver uses a chained interrupt to demultiplex
> > > > > the downstream MSI interrupts. On Qualcomm SA8540P Ride, enabling both
> > > > > pcie2a and pcie3a at the same time can create an interrupt storm where
> > > > > the parent interrupt fires continuously, even though reading the PCIe
> > > > > host registers doesn't identify any child MSI interrupt source. This
> > > > > effectively locks up CPU0, which spends all the time servicing these
> > > > > interrupts.
> > > > >
> > > > > This is a clear example of how bypassing the interrupt core by using
> > > > > chained interrupts can be very dangerous if the hardware misbehaves.
> > > > >
> > > > > Convert the driver to use a regular interrupt for the demultiplex
> > > > > handler. This allows the interrupt storm detector to detect the faulty
> > > > > interrupt and disable it, allowing the system to run normally.
> ...
> > But you do imply that there's some DesignWare hardware issue involved,
> > too, so I guess it's possible the other drivers don't have an issue
> > and/or actually require the chained IRQs. That's why I asked how we
> > should decide.
>
> That makes sense. I don't know if it's a DesignWare hardware issue or
> something else down the PCIe bus. Other folks in my team are
> investigating the root cause. This thread has the details:
> https://lore.kernel.org/all/pmodcoakbs25z2a7mlo5gpuz63zluh35vbgb5itn6k5aqhjnny@jvphbpvahtse/
>
> I want to point out that this patch doesn't *fix* the root problem (the
> interrupt storm). It just makes the kernel handle it (much) better if
> it occurs.
>
> I can't see why any driver would _require_ chained IRQs. As I see it,
> chained interrupts are just a "shortcut" that circumvents the IRQ core
> for (debatable) performance reasons. Other than that, they should work
> the same as regular interrupts.
>
> There are other benefits associated with using regular interrupts. One
> of them is the ability to control the SMP affinity of the parent
> interrupt. But that's a different topic.
Thanks for mentioning IRQ affinity because that reminds me of a
long-standing related issue. Pali posted a similar patch for mvebu
[1], but it's been stalled for a long time because apparently there's
some potential breakage of the userspace ABI [2].
Unfortunately I'm not an IRQ expert and haven't followed all the
arguments there. BUT it does seem like Marc's objection to Pali's
patch would also apply to your patch, so maybe this is an opportune
time to re-evaluate the situation.
If converting from chained to normal handlers can be done safely, I
would definitely be in favor of doing it across all of drivers/pci/ at
once so they're all consistent. Otherwise that code just gets copied
to new drivers, so the issue persists and spreads.
Bjorn
[1] https://lore.kernel.org/r/[email protected]
[2] https://lore.kernel.org/r/[email protected]
On Thu, 2023-06-29 at 17:11 -0500, Bjorn Helgaas wrote:
> [+cc Pali]
>
> On Thu, Jun 29, 2023 at 05:27:54PM -0400, Radu Rendec wrote:
> > On Thu, 2023-06-29 at 15:58 -0500, Bjorn Helgaas wrote:
> > > On Thu, Jun 29, 2023 at 04:42:07PM -0400, Radu Rendec wrote:
> > > > On Thu, 2023-06-29 at 14:57 -0500, Bjorn Helgaas wrote:
> > > > > On Thu, Jun 29, 2023 at 02:30:19PM -0400, Radu Rendec wrote:
> > > > > > The DesignWare PCIe host driver uses a chained interrupt to demultiplex
> > > > > > the downstream MSI interrupts. On Qualcomm SA8540P Ride, enabling both
> > > > > > pcie2a and pcie3a at the same time can create an interrupt storm where
> > > > > > the parent interrupt fires continuously, even though reading the PCIe
> > > > > > host registers doesn't identify any child MSI interrupt source. This
> > > > > > effectively locks up CPU0, which spends all the time servicing these
> > > > > > interrupts.
> > > > > >
> > > > > > This is a clear example of how bypassing the interrupt core by using
> > > > > > chained interrupts can be very dangerous if the hardware misbehaves.
> > > > > >
> > > > > > Convert the driver to use a regular interrupt for the demultiplex
> > > > > > handler. This allows the interrupt storm detector to detect the faulty
> > > > > > interrupt and disable it, allowing the system to run normally.
>
> > ...
> > > But you do imply that there's some DesignWare hardware issue involved,
> > > too, so I guess it's possible the other drivers don't have an issue
> > > and/or actually require the chained IRQs. That's why I asked how we
> > > should decide.
> >
> > That makes sense. I don't know if it's a DesignWare hardware issue or
> > something else down the PCIe bus. Other folks in my team are
> > investigating the root cause. This thread has the details:
> > https://lore.kernel.org/all/pmodcoakbs25z2a7mlo5gpuz63zluh35vbgb5itn6k5aqhjnny@jvphbpvahtse/
> >
> > I want to point out that this patch doesn't *fix* the root problem (the
> > interrupt storm). It just makes the kernel handle it (much) better if
> > it occurs.
> >
> > I can't see why any driver would _require_ chained IRQs. As I see it,
> > chained interrupts are just a "shortcut" that circumvents the IRQ core
> > for (debatable) performance reasons. Other than that, they should work
> > the same as regular interrupts.
> >
> > There are other benefits associated with using regular interrupts. One
> > of them is the ability to control the SMP affinity of the parent
> > interrupt. But that's a different topic.
>
> Thanks for mentioning IRQ affinity because that reminds me of a
> long-standing related issue. Pali posted a similar patch for mvebu
> [1], but it's been stalled for a long time because apparently there's
> some potential breakage of the userspace ABI [2].
I'm really glad you brought this up. Ironically, that's exactly where I
started.
> Unfortunately I'm not an IRQ expert and haven't followed all the
> arguments there. BUT it does seem like Marc's objection to Pali's
> patch would also apply to your patch, so maybe this is an opportune
> time to re-evaluate the situation.
I'm no IRQ expert either, but I've been looking into it quite a lot
lately. For the record, there's an even older thread [3] where Marc
explains the userspace ABI breakage you mentioned and he proposes a new
sysfs-based interface to address that problem.
I tried to follow Marc's suggestions and sent an RFC patch series [4]
that attempts to implement the new interface and also some form of
parent-child interrupt tracking. That would be a way of addressing the
affinity issue without having to convert all the drivers. But on the
other hand, it doesn't solve the IRQ storm detection problem.
Anyway, Thomas replied [5] and explained why chained interrupts are a
bad idea altogether. When we (my team and I) came across the IRQ storm
issue, we experienced firsthand the scenario that Thomas describes.
> If converting from chained to normal handlers can be done safely, I
> would definitely be in favor of doing it across all of drivers/pci/ at
> once so they're all consistent. Otherwise that code just gets copied
> to new drivers, so the issue persists and spreads.
I think the conversion can be done safely, meaning that it won't break
the drivers. And by the way, there are other IRQ drivers (outside the
PCI space) that use chained interrupts.
Unfortunately, it seems we are going in circles. Chained interrupts are
bad because they let IRQ storms go unnoticed and lock up the system,
but converting them to regular interrupts is also bad because it breaks
the userspace ABI.
I am willing to help clean up this mess, but I think first we need to
come up with a solution that's acceptable for everybody. I was hoping
Marc and Thomas would chime in, but unfortunately that hasn't happened
yet - other than each of them pointing out (separately) what is wrong
with each approach.
Radu
> [1] https://lore.kernel.org/r/[email protected]
> [2] https://lore.kernel.org/r/[email protected]
[3] https://lore.kernel.org/all/[email protected]/
[4] https://lore.kernel.org/all/[email protected]/
[5] https://lore.kernel.org/all/877csohcll.ffs@tglx/
On Thu, Jun 29, 2023 at 11:04:50PM -0400, Radu Rendec wrote:
> On Thu, 2023-06-29 at 17:11 -0500, Bjorn Helgaas wrote:
> ...
> > If converting from chained to normal handlers can be done safely, I
> > would definitely be in favor of doing it across all of drivers/pci/ at
> > once so they're all consistent. Otherwise that code just gets copied
> > to new drivers, so the issue persists and spreads.
>
> I think the conversion can be done safely, meaning that it won't break
> the drivers. And by the way, there are other IRQ drivers (outside the
> PCI space) that use chained interrupts.
>
> Unfortunately, it seems we are going in circles. Chained interrupts are
> bad because they let IRQ storms go unnoticed and lock up the system,
> but converting them to regular interrupts is also bad because it breaks
> the userspace ABI.
>
> I am willing to help clean up this mess, but I think first we need to
> come up with a solution that's acceptable for everybody. I was hoping
> Marc and Thomas would chime in, but unfortunately that hasn't happened
> yet - other than each of them pointing out (separately) what is wrong
> with each approach.
I don't think Marc or Thomas are going to chime in with a fully-formed
solution. I think to make progress, you (or Pali, or somebody) will
have to try to address Marc and Thomas' comments, make a proposal, and
we can iterate on it.
Bjorn
On Thu, 2023-07-13 at 12:03 -0500, Bjorn Helgaas wrote:
> On Thu, Jun 29, 2023 at 11:04:50PM -0400, Radu Rendec wrote:
> > On Thu, 2023-06-29 at 17:11 -0500, Bjorn Helgaas wrote:
> > ...
>
> > > If converting from chained to normal handlers can be done safely, I
> > > would definitely be in favor of doing it across all of drivers/pci/ at
> > > once so they're all consistent. Otherwise that code just gets copied
> > > to new drivers, so the issue persists and spreads.
> >
> > I think the conversion can be done safely, meaning that it won't break
> > the drivers. And by the way, there are other IRQ drivers (outside the
> > PCI space) that use chained interrupts.
> >
> > Unfortunately, it seems we are going in circles. Chained interrupts are
> > bad because they let IRQ storms go unnoticed and lock up the system,
> > but converting them to regular interrupts is also bad because it breaks
> > the userspace ABI.
> >
> > I am willing to help clean up this mess, but I think first we need to
> > come up with a solution that's acceptable for everybody. I was hoping
> > Marc and Thomas would chime in, but unfortunately that hasn't happened
> > yet - other than each of them pointing out (separately) what is wrong
> > with each approach.
>
> I don't think Marc or Thomas are going to chime in with a fully-formed
> solution. I think to make progress, you (or Pali, or somebody) will
> have to try to address Marc and Thomas' comments, make a proposal, and
> we can iterate on it.
That crossed my mind too. Unfortunately, Marc's and Thomas' comments
are contradictory, or at least that's my interpretation. I don't expect
them to come up with a fully-formed solution, but merely to agree upon
something that the rest of us can follow. Otherwise, I think no matter
what we may come up with, at least one of them will dismiss it. They
made very clear points, and I understand both. I just can't see a
common denominator.
Let me elaborate a bit. Thomas made it very clear that we should get
rid of chained interrupts altogether and suggested using regular
interrupts instead. All regular interrupts are visible in procfs by
default, and so is their affinity control interface. And with
that you can now change the affinity of a parent interrupt and it will
also affect the affinity of all child interrupts. That would break the
promise that the procfs interface currently makes, which is that
setting the affinity on an interrupt will affect *only* that particular
interrupt and nothing else (that is Marc's point).
The only solution that comes to mind is this:
* Add support for tracking parent-child interrupt relationships.
* Modify the existing procfs affinity control interface to reject
changing the affinity of a parent interrupt (i.e. an interrupt that
has at least one child interrupt associated) - see the sketch after this list.
* Convert chained interrupts to regular interrupts as needed.
* Create a new sysfs affinity control interface that allows setting
the affinity of any interrupt, including parent interrupts.
* Expose the parent-child interrupt relationships in sysfs, so any
program that is aware of the new interface can go to the root
interrupt to set the affinity.
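To make the second bullet more concrete, here is a purely hypothetical
sketch; none of these fields or helpers exist in the kernel today, and
all names are made up for illustration:

  /* Hypothetical: assume the IRQ core tracked demultiplexed children. */
  static bool irq_has_demux_children(struct irq_desc *desc)
  {
          return !list_empty(&desc->demux_children); /* made-up field */
  }

  /* In the procfs affinity write path, before applying the change: */
  if (irq_has_demux_children(desc))
          return -EPERM; /* preserve the "affects only this IRQ" promise */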
To be honest, I think this approach would make things even messier and
more confusing than they are today. And I'm not even sure it would not
break the procfs interface's backwards compatibility in a different way.
Of course, any comments or suggestions are welcome and would be
appreciated!
Best regards,
Radu
Hello Thomas,
On Thu, 2023-06-29 at 14:30 -0400, Radu Rendec wrote:
> The DesignWare PCIe host driver uses a chained interrupt to demultiplex
> the downstream MSI interrupts. On Qualcomm SA8540P Ride, enabling both
> pcie2a and pcie3a at the same time can create an interrupt storm where
> the parent interrupt fires continuously, even though reading the PCIe
> host registers doesn't identify any child MSI interrupt source. This
> effectively locks up CPU0, which spends all the time servicing these
> interrupts.
>
> This is a clear example of how bypassing the interrupt core by using
> chained interrupts can be very dangerous if the hardware misbehaves.
>
> Convert the driver to use a regular interrupt for the demultiplex
> handler. This allows the interrupt storm detector to detect the faulty
> interrupt and disable it, allowing the system to run normally.
Kindly bringing the chained interrupts issue back to your attention.
Last week at Plumbers, Brian Masney spoke to you about this, and you
agreed to have me send another reply to this thread to bump it to the
top of your inbox. Here it is.
Essentially, you made it very clear in a different thread [1] that you
wanted to get rid of chained interrupts altogether. However, that would
have the side effect of exposing the parent interrupt affinity control
in procfs, which in Marc's opinion [2] would break the userspace ABI.
I drafted a proposal [3] that tries to solve both problems. But we
really need some maintainer guidance here, to at least agree on a
high-level description of what the solution would look like. Working on
patches that get rejected because the approach is conceptually wrong
is not very practical or productive. Thanks in advance!
Best regards,
Radu
[1] https://lore.kernel.org/all/877csohcll.ffs@tglx/
[2] https://lore.kernel.org/all/[email protected]/
[3] https://lore.kernel.org/all/[email protected]/
> Signed-off-by: Radu Rendec <[email protected]>
> ---
> .../pci/controller/dwc/pcie-designware-host.c | 35 +++++++++----------
> 1 file changed, 17 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
> index 9952057c8819c..b603796d415d7 100644
> --- a/drivers/pci/controller/dwc/pcie-designware-host.c
> +++ b/drivers/pci/controller/dwc/pcie-designware-host.c
> @@ -83,18 +83,9 @@ irqreturn_t dw_handle_msi_irq(struct dw_pcie_rp *pp)
> return ret;
> }
>
> -/* Chained MSI interrupt service routine */
> -static void dw_chained_msi_isr(struct irq_desc *desc)
> +static irqreturn_t dw_pcie_msi_isr(int irq, void *dev_id)
> {
> - struct irq_chip *chip = irq_desc_get_chip(desc);
> - struct dw_pcie_rp *pp;
> -
> - chained_irq_enter(chip, desc);
> -
> - pp = irq_desc_get_handler_data(desc);
> - dw_handle_msi_irq(pp);
> -
> - chained_irq_exit(chip, desc);
> + return dw_handle_msi_irq(dev_id);
> }
>
> static void dw_pci_setup_msi_msg(struct irq_data *d, struct msi_msg *msg)
> @@ -254,20 +245,21 @@ int dw_pcie_allocate_domains(struct dw_pcie_rp *pp)
> return 0;
> }
>
> -static void dw_pcie_free_msi(struct dw_pcie_rp *pp)
> +static void __dw_pcie_free_msi(struct dw_pcie_rp *pp, u32 num_ctrls)
> {
> u32 ctrl;
>
> - for (ctrl = 0; ctrl < MAX_MSI_CTRLS; ctrl++) {
> + for (ctrl = 0; ctrl < num_ctrls; ctrl++) {
> if (pp->msi_irq[ctrl] > 0)
> - irq_set_chained_handler_and_data(pp->msi_irq[ctrl],
> - NULL, NULL);
> + free_irq(pp->msi_irq[ctrl], pp);
> }
>
> irq_domain_remove(pp->msi_domain);
> irq_domain_remove(pp->irq_domain);
> }
>
> +#define dw_pcie_free_msi(pp) __dw_pcie_free_msi(pp, MAX_MSI_CTRLS)
> +
> static void dw_pcie_msi_init(struct dw_pcie_rp *pp)
> {
> struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
> @@ -361,9 +353,16 @@ static int dw_pcie_msi_host_init(struct dw_pcie_rp *pp)
> return ret;
>
> for (ctrl = 0; ctrl < num_ctrls; ctrl++) {
> - if (pp->msi_irq[ctrl] > 0)
> - irq_set_chained_handler_and_data(pp->msi_irq[ctrl],
> - dw_chained_msi_isr, pp);
> + if (pp->msi_irq[ctrl] > 0) {
> + ret = request_irq(pp->msi_irq[ctrl], dw_pcie_msi_isr, 0,
> + dev_name(dev), pp);
> + if (ret) {
> + dev_err(dev, "Failed to request irq %d: %d\n",
> + pp->msi_irq[ctrl], ret);
> + __dw_pcie_free_msi(pp, ctrl);
> + return ret;
> + }
> + }
> }
>
> /*