__pci_write_msi_msg() updates three registers in the device: address
high, address low, and data. On x86 systems, address low contains
CPU targeting info, and data contains the vector. The order of writes
is address, then data.
This is problematic if an interrupt comes in after address has
been written, but before data is updated, and both the SMP affinity
and target vector are being changed. In this case, the interrupt targets
the wrong vector on the new CPU.
This case is pretty easy to stumble into using xhci and CPU hotplugging.
Create a script that repeatedly targets interrupts at a set of cores and
then offlines those cores. Put some stress on USB, and then watch xhci
lose an interrupt and die.
Avoid this by disabling MSIs during the update.
Signed-off-by: Evan Green <[email protected]>
---
Changes in v2:
- Also mask msi-x interrupts during the update
- Restore the enable/mask bit to its previous value, rather than
unconditionally enabling interrupts
Bjorn,
I was unsure whether disabling MSIs temporarily is actually an okay
thing to do. I considered using the mask bit, but got the impression
that not all devices support the mask bit. Let me know if this is going to
cause problems or there's a better way. I can include the repro
script I used to cause mayhem if needed.
---
drivers/pci/msi.c | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 6b43a5455c7af..bb21a7739fa2c 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -311,6 +311,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
struct pci_dev *dev = msi_desc_to_pci_dev(entry);
+ u16 msgctl;
if (dev->current_state != PCI_D0 || pci_dev_is_disconnected(dev)) {
/* Don't touch the hardware now */
@@ -320,15 +321,25 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
if (!base)
goto skip;
+ pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS,
+ &msgctl);
+
+ pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS,
+ msgctl | PCI_MSIX_FLAGS_MASKALL);
+
writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
+ pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS,
+ msgctl);
+
} else {
int pos = dev->msi_cap;
- u16 msgctl;
+ u16 enabled;
pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
- msgctl &= ~PCI_MSI_FLAGS_QSIZE;
+ enabled = msgctl & PCI_MSI_FLAGS_ENABLE;
+ msgctl &= ~(PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
msgctl |= entry->msi_attrib.multiple << 4;
pci_write_config_word(dev, pos + PCI_MSI_FLAGS, msgctl);
@@ -343,6 +354,9 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
msg->data);
}
+
+ msgctl |= enabled;
+ pci_write_config_word(dev, pos + PCI_MSI_FLAGS, msgctl);
}
skip:
--
2.24.1
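For context, the race described above comes down to the address and data
halves of the MSI message being written separately: if the device samples the
pair between the two config space writes, it sends the stale vector to the new
CPU. Below is a minimal user-space model of that window (illustration only;
the struct, helper and register values are invented for the example and are
not taken from the patch):

#include <stdio.h>

/*
 * Toy model of a non-maskable MSI device. On x86 the low address dword
 * selects the destination CPU and the data dword carries the vector, so a
 * message sampled between the two writes combines the new CPU with the old
 * vector.
 */
struct msi_regs {
        unsigned int address_lo;        /* destination CPU (APIC dest ID bits) */
        unsigned int data;              /* vector */
};

static void device_fires(const struct msi_regs *r)
{
        printf("MSI sent: address_lo=%#x data=%#x\n", r->address_lo, r->data);
}

int main(void)
{
        /* Old state: destination ID 1, vector 0x30. */
        struct msi_regs r = { .address_lo = 0xfee01000, .data = 0x30 };

        /* Affinity change to destination ID 2, vector 0x40, performed as two
         * separate writes: address first, then data. */
        r.address_lo = 0xfee02000;
        device_fires(&r);       /* raised in the window: new CPU, stale vector */
        r.data = 0x40;
        device_fires(&r);       /* consistent message from here on */
        return 0;
}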
On Fri, Jan 17, 2020 at 4:26 PM Evan Green <[email protected]> wrote:
>
> __pci_write_msi_msg() updates three registers in the device: address
> high, address low, and data. On x86 systems, address low contains
> CPU targeting info, and data contains the vector. The order of writes
> is address, then data.
>
> This is problematic if an interrupt comes in after address has
> been written, but before data is updated, and both the SMP affinity
> and target vector are being changed. In this case, the interrupt targets
> the wrong vector on the new CPU.
>
> This case is pretty easy to stumble into using xhci and CPU hotplugging.
> Create a script that repeatedly targets interrupts at a set of cores and
> then offlines those cores. Put some stress on USB, and then watch xhci
> lose an interrupt and die.
Do I understand it right that, even with this patch, the driver might
still miss the same interrupt (because we are disabling the interrupt
for that time), and that the improvement this patch brings is that it will
at least not be delivered to the wrong CPU or via a wrong vector?
Thanks,
Rajat
>
> Avoid this by disabling MSIs during the update.
>
> Signed-off-by: Evan Green <[email protected]>
> ---
>
> Changes in v2:
> - Also mask msi-x interrupts during the update
> - Restore the enable/mask bit to its previous value, rather than
> unconditionally enabling interrupts
>
>
> Bjorn,
> I was unsure whether disabling MSIs temporarily is actually an okay
> thing to do. I considered using the mask bit, but got the impression
> that not all devices support the mask bit. Let me know if this is going to
> cause problems or there's a better way. I can include the repro
> script I used to cause mayhem if needed.
>
> ---
> drivers/pci/msi.c | 18 ++++++++++++++++--
> 1 file changed, 16 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index 6b43a5455c7af..bb21a7739fa2c 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -311,6 +311,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
> void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
> {
> struct pci_dev *dev = msi_desc_to_pci_dev(entry);
> + u16 msgctl;
>
> if (dev->current_state != PCI_D0 || pci_dev_is_disconnected(dev)) {
> /* Don't touch the hardware now */
> @@ -320,15 +321,25 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
> if (!base)
> goto skip;
>
> + pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS,
> + &msgctl);
> +
> + pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS,
> + msgctl | PCI_MSIX_FLAGS_MASKALL);
> +
> writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
> writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
> writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
> + pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS,
> + msgctl);
> +
> } else {
> int pos = dev->msi_cap;
> - u16 msgctl;
> + u16 enabled;
>
> pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
> - msgctl &= ~PCI_MSI_FLAGS_QSIZE;
> + enabled = msgctl & PCI_MSI_FLAGS_ENABLE;
> + msgctl &= ~(PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
> msgctl |= entry->msi_attrib.multiple << 4;
> pci_write_config_word(dev, pos + PCI_MSI_FLAGS, msgctl);
>
> @@ -343,6 +354,9 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
> pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
> msg->data);
> }
> +
> + msgctl |= enabled;
> + pci_write_config_word(dev, pos + PCI_MSI_FLAGS, msgctl);
> }
>
> skip:
> --
> 2.24.1
>
On Wed, Jan 22, 2020 at 3:26 AM Rajat Jain <[email protected]> wrote:
>
> On Fri, Jan 17, 2020 at 4:26 PM Evan Green <[email protected]> wrote:
> >
> > __pci_write_msi_msg() updates three registers in the device: address
> > high, address low, and data. On x86 systems, address low contains
> > CPU targeting info, and data contains the vector. The order of writes
> > is address, then data.
> >
> > This is problematic if an interrupt comes in after address has
> > been written, but before data is updated, and both the SMP affinity
> > and target vector are being changed. In this case, the interrupt targets
> > the wrong vector on the new CPU.
> >
> > This case is pretty easy to stumble into using xhci and CPU hotplugging.
> > Create a script that repeatedly targets interrupts at a set of cores and
> > then offlines those cores. Put some stress on USB, and then watch xhci
> > lose an interrupt and die.
>
> Do I understand it right that, even with this patch, the driver might
> still miss the same interrupt (because we are disabling the interrupt
> for that time), and that the improvement this patch brings is that it will
> at least not be delivered to the wrong CPU or via a wrong vector?
In my experiments, the driver no longer misses the interrupt. XHCI is
particularly sensitive to this: if it misses one interrupt, it seems to
completely wedge the driver.
I think in my case the device pends the interrupts until MSIs are
re-enabled, because I don't see anything other than MSI for xhci in
/proc/interrupts. But I'm not sure if other devices may fall back to
line-based interrupts for a moment, and if that's a problem.
Although, I already see we call pci_msi_set_enable(0) whenever we set
up MSIs, presumably for this same reason of avoiding torn MSIs. So my
fix is really just doing the same thing for an additional case. And if
getting stuck in a never-to-be-handled line based interrupt were a
problem, you'd think it would also be a problem in
pci_restore_msi_state(), where the same thing is done.
Maybe my fix is at the wrong level, and should be up in
pci_msi_domain_write_msg() instead? Though I see a lot of callers to
pci_write_msi_msg() that I worry have the same problem.
-Evan
Evan Green <[email protected]> writes:
> In my experiments, the driver no longer misses the interrupt. XHCI is
> particularly sensitive to this: if it misses one interrupt, it seems to
> completely wedge the driver.
That does not make the approach more correct.
> I think in my case the device pends the interrupts until MSIs are
> re-enabled, because I don't see anything other than MSI for xhci in
> /proc/interrupts. But I'm not sure if other devices may fall back to
> line-based interrupts for a moment, and if that's a problem.
Yes they can according to standard and it _IS_ a problem.
> Although, I already see we call pci_msi_set_enable(0) whenever we set
> up MSIs, presumably for this same reason of avoiding torn MSIs.
Please stop making random assumptions. This has absolutely nothing to do
with torn MSIs. The way MSI setup works requires this. And this is
happening on init _before_ any interrupt can be requested on the device.
Different reason, different context.
> So my fix is really just doing the same thing for an additional
> case.
No, it's absolutely not the same. Your device is active and not in
reset/init state.
> And if getting stuck in a never-to-be-handled line based interrupt
> were a problem, you'd think it would also be a problem in
> pci_restore_msi_state(), where the same thing is done.
Again. I told you already it's not the same thing.
> Maybe my fix is at the wrong level, and should be up in
> pci_msi_domain_write_msg() instead? Though I see a lot of callers to
> pci_write_msi_msg() that I worry have the same problem.
This is not yet debugged fully and as this is happening on MSI-X I'm not
really convinced yet that your 'torn write' theory holds.
Thanks,
tglx
Evan,
Thomas Gleixner <[email protected]> writes:
> This is not yet debugged fully and as this is happening on MSI-X I'm not
> really convinced yet that your 'torn write' theory holds.
can you please apply the debug patch below and run your test. When the
failure happens, stop the tracer and collect the trace.
Another question. Did you ever try to change the affinity of that
interrupt without hotplug rapidly while the device makes traffic? If
not, it would be interesting whether this leads to a failure as well.
Thanks
tglx
8<---------------
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -964,6 +964,8 @@ void irq_force_complete_move(struct irq_
if (!vector)
goto unlock;
+ trace_printk("IRQ %u vector %u irq inprogress %u\n", vector,
+ irqd->irq, apicd->move_in_progress);
/*
* This is tricky. If the cleanup of the old vector has not been
* done yet, then the following setaffinity call will fail with
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -244,6 +244,8 @@ u64 arch_irq_stat(void)
desc = __this_cpu_read(vector_irq[vector]);
if (likely(!IS_ERR_OR_NULL(desc))) {
+ trace_printk("Handle vector %u IRQ %u\n", vector,
+ desc->irq_data.irq);
if (IS_ENABLED(CONFIG_X86_32))
handle_irq(desc, regs);
else
@@ -252,10 +254,18 @@ u64 arch_irq_stat(void)
ack_APIC_irq();
if (desc == VECTOR_UNUSED) {
+ trace_printk("Handle unused vector %u\n", vector);
pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
__func__, smp_processor_id(),
vector);
} else {
+ if (desc == VECTOR_SHUTDOWN) {
+ trace_printk("Handle shutdown vector %u\n",
+ vector);
+ } else if (desc == VECTOR_RETRIGGERED) {
+ trace_printk("Handle retriggered vector %u\n",
+ vector);
+ }
__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
}
}
@@ -373,9 +383,14 @@ void fixup_irqs(void)
if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
continue;
+ desc = __this_cpu_read(vector_irq[vector]);
+ trace_printk("FIXUP: %u\n", desc->irq_data.irq);
+
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
if (irr & (1 << (vector % 32))) {
desc = __this_cpu_read(vector_irq[vector]);
+ trace_printk("FIXUP: %u IRR pending\n",
+ desc->irq_data.irq);
raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -122,6 +122,10 @@ static bool migrate_one_irq(struct irq_d
affinity = cpu_online_mask;
brokeaff = true;
}
+
+ trace_printk("IRQ: %d maskchip %d wasmasked %d break %d\n",
+ d->irq, maskchip, irqd_irq_masked(d), brokeaff);
+
/*
* Do not set the force argument of irq_do_set_affinity() as this
* disables the masking of offline CPUs from the supplied affinity
On Thu, Jan 23, 2020 at 12:59 PM Evan Green <[email protected]> wrote:
>
> On Thu, Jan 23, 2020 at 10:17 AM Thomas Gleixner <[email protected]> wrote:
> >
> > Evan,
> >
> > Thomas Gleixner <[email protected]> writes:
> > > This is not yet debugged fully and as this is happening on MSI-X I'm not
> > > really convinced yet that your 'torn write' theory holds.
> >
> > can you please apply the debug patch below and run your test. When the
> > failure happens, stop the tracer and collect the trace.
> >
> > Another question. Did you ever try to change the affinity of that
> > interrupt without hotplug rapidly while the device makes traffic? If
> > not, it would be interesting whether this leads to a failure as well.
>
> Thanks for the patch. Looks pretty familiar :)
> I ran into issues where trace_printks on offlined cores seem to
> disappear. I even made sure the cores were back online when I
> collected the trace. So your logs might not be useful. Known issue
> with the tracer?
>
> I figured I'd share my own debug chicken scratch, in case you could
> glean anything from it. The LOG entries print out timestamps (divide
> by 1000000) that you can match up back to earlier in the log (ie so
> the last XHCI MSI change occurred at 74.032501, the last interrupt
> came in at 74.032405). Forgive the mess.
>
> I also tried changing the affinity rapidly without CPU hotplug, but
> didn't see the issue, at least not in the few minutes I waited
> (normally repros easily within 1 minute). An interesting datapoint.
One additional datapoint. The intel guys suggested enabling
CONFIG_IRQ_REMAP, which does seem to eliminate the issue for me. I'm
still hoping there's a smaller fix so I don't have to add all that in.
-Evan
Evan Green <[email protected]> writes:
> On Thu, Jan 23, 2020 at 12:59 PM Evan Green <[email protected]> wrote:
>>
>> On Thu, Jan 23, 2020 at 10:17 AM Thomas Gleixner <[email protected]> wrote:
>> >
>> > Evan,
>> >
>> > Thomas Gleixner <[email protected]> writes:
>> > > This is not yet debugged fully and as this is happening on MSI-X I'm not
>> > > really convinced yet that your 'torn write' theory holds.
As you pointed out that this is not on MSI-X, I'm considering the torn
write theory to be more likely. :)
>> > can you please apply the debug patch below and run your test. When the
>> > failure happens, stop the tracer and collect the trace.
>> >
>> > Another question. Did you ever try to change the affinity of that
>> > interrupt without hotplug rapidly while the device makes traffic? If
>> > not, it would be interesting whether this leads to a failure as well.
>>
>> Thanks for the patch. Looks pretty familiar :)
>> I ran into issues where trace_printks on offlined cores seem to
>> disappear. I even made sure the cores were back online when I
>> collected the trace. So your logs might not be useful. Known issue
>> with the tracer?
No. I tried the patch myself to verify that it does what I want.
The only information I'm missing right now is the interrupt number to
look for. But I'll stare at it with brain awake tomorrow morning again.
>> I also tried changing the affinity rapidly without CPU hotplug, but
>> didn't see the issue, at least not in the few minutes I waited
>> (normally repros easily within 1 minute). An interesting datapoint.
That's what I expected. The main difference is that the vector
modification happens at a point where a device is not supposed to send
an interrupt. They happen when the interrupt of the device is serviced
before the driver handler is invoked and at that point the device should
not send another one.
> One additional datapoint. The intel guys suggested enabling
> CONFIG_IRQ_REMAP, which does seem to eliminate the issue for me. I'm
> still hoping there's a smaller fix so I don't have to add all that in.
Right, I wanted to ask you that as well and forgot. With interrupt
remapping the migration happens at the remapping unit which does not
have the horrible 'move it while servicing' requirement and it supports
proper masking.
Thanks,
tglx
On Thu, Jan 23, 2020 at 2:59 PM Evan Green <[email protected]> wrote:
>
> On Thu, Jan 23, 2020 at 12:59 PM Evan Green <[email protected]> wrote:
> >
> > On Thu, Jan 23, 2020 at 10:17 AM Thomas Gleixner <[email protected]> wrote:
> > >
> > > Evan,
> > >
> > > Thomas Gleixner <[email protected]> writes:
> > > > This is not yet debugged fully and as this is happening on MSI-X I'm not
> > > > really convinced yet that your 'torn write' theory holds.
> > >
> > > can you please apply the debug patch below and run your test. When the
> > > failure happens, stop the tracer and collect the trace.
> > >
> > > Another question. Did you ever try to change the affinity of that
> > > interrupt without hotplug rapidly while the device makes traffic? If
> > > not, it would be interesting whether this leads to a failure as well.
> >
> > Thanks for the patch. Looks pretty familiar :)
> > I ran into issues where trace_printks on offlined cores seem to
> > disappear. I even made sure the cores were back online when I
> > collected the trace. So your logs might not be useful. Known issue
> > with the tracer?
> >
> > I figured I'd share my own debug chicken scratch, in case you could
> > glean anything from it. The LOG entries print out timestamps (divide
> > by 1000000) that you can match up back to earlier in the log (ie so
> > the last XHCI MSI change occurred at 74.032501, the last interrupt
> > came in at 74.032405). Forgive the mess.
> >
> > I also tried changing the affinity rapidly without CPU hotplug, but
> > didn't see the issue, at least not in the few minutes I waited
> > (normally repros easily within 1 minute). An interesting datapoint.
>
> One additional datapoint. The intel guys suggested enabling
> CONFIG_IRQ_REMAP, which does seem to eliminate the issue for me. I'm
> still hoping there's a smaller fix so I don't have to add all that in.
I did another experiment that I think lends credibility to my torn MSI
hypothesis. I have the following change:
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1f69b12d5bb86..0336d23f9ba9a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1798,6 +1798,7 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
 dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
 {
+        printk("EVAN MACHINE CHECK HC died");
         machine_check_vector(regs, error_code);
 }
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 23a363fd4c59c..31f683da857e3 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -315,6 +315,11 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
         msgctl |= entry->msi_attrib.multiple << 4;
         pci_write_config_word(dev, pos + PCI_MSI_FLAGS, msgctl);
+        if (entry->msi_attrib.is_64) {
+                pci_write_config_word(dev, pos + PCI_MSI_DATA_64, 0x4012);
+        } else {
+                pci_write_config_word(dev, pos + PCI_MSI_DATA_32, 0x4012);
+        }
         pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_LO,
                                msg->address_lo);
         if (entry->msi_attrib.is_64) {
And indeed, I get a machine check, despite the fact that MSI_DATA is
overwritten just after address is updated.
[ 79.937179] smpboot: CPU 1 is now offline
[ 80.001685] smpboot: CPU 3 is now offline
[ 80.025210] smpboot: CPU 5 is now offline
[ 80.049517] smpboot: CPU 7 is now offline
[ 80.094263] x86: Booting SMP configuration:
[ 80.099394] smpboot: Booting Node 0 Processor 1 APIC 0x1
[ 80.136233] smpboot: Booting Node 0 Processor 3 APIC 0x3
[ 80.155732] smpboot: Booting Node 0 Processor 5 APIC 0x5
[ 80.173632] smpboot: Booting Node 0 Processor 7 APIC 0x7
[ 80.297198] smpboot: CPU 1 is now offline
[ 80.331347] EVAN MACHINE CHECK HC died
[ 82.281555] Kernel panic - not syncing: Timeout: Not all CPUs
entered broadcast exception handler
[ 82.295775] Kernel Offset: disabled
[ 82.301740] gsmi: Log Shutdown Reason 0x02
[ 82.313942] Rebooting in 30 seconds..
[ 112.204113] ACPI MEMORY or I/O RESET_REG.
-Evan
Evan,
Evan Green <[email protected]> writes:
> I did another experiment that I think lends credibility to my torn MSI
> hypothesis. I have the following change:
>
> And indeed, I get a machine check, despite the fact that MSI_DATA is
> overwritten just after address is updated.
I don't have to understand why a SoC released in 2019 still has
unmaskable MSI especially as Inhell's own XHCI spec clearly documents
and recommends MSI-X.
While your workaround (disabling MSI) works in this particular case it's
not really a good option:
1) Quite some devices have a bug where the legacy INTX disable does not
work reliably or is outright broken. That means MSI disable will
reroute to INTX.
2) I dug out old debug data which confirms that some silly devices
lose interrupts across MSI disable/reenable if the INTX fallback is
disabled.
And no, it's not a random weird device, it's part of a chipset which
was pretty popular a few years ago. I leave it as an exercise for
the reader to guess the vendor.
Can you please apply the patch below? It enforces an IPI to the new
vector/target CPU when the interrupt is MSI w/o masking. It should
cure the issue. It goes without saying that I'm not proud of it.
Thanks,
tglx
8<--------------
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -498,6 +498,7 @@ extern bool default_check_apicid_used(ph
extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap);
extern int default_cpu_present_to_apicid(int mps_cpu);
extern int default_check_phys_apicid_present(int phys_apicid);
+extern bool apic_hotplug_force_retrigger(struct irq_data *irqd);
#endif /* CONFIG_X86_LOCAL_APIC */
--- a/arch/x86/include/asm/irqdomain.h
+++ b/arch/x86/include/asm/irqdomain.h
@@ -10,6 +10,7 @@ enum {
/* Allocate contiguous CPU vectors */
X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1,
X86_IRQ_ALLOC_LEGACY = 0x2,
+ X86_IRQ_MSI_NOMASK_TRAINWRECK = 0x4,
};
extern struct irq_domain *x86_vector_domain;
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -103,6 +103,14 @@ int pci_msi_prepare(struct irq_domain *d
} else {
arg->type = X86_IRQ_ALLOC_TYPE_MSI;
arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+ /*
+ * If the MSI implementation does not provide masking
+ * enable the workaround for the CPU hotplug forced
+ * migration problem which is caused by the torn write of
+ * the address/data pair.
+ */
+ if (!desc->msi_attrib.maskbit)
+ arg->flags |= X86_IRQ_MSI_NOMASK_TRAINWRECK;
}
return 0;
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -34,7 +34,8 @@ struct apic_chip_data {
unsigned int move_in_progress : 1,
is_managed : 1,
can_reserve : 1,
- has_reserved : 1;
+ has_reserved : 1,
+ force_retrigger : 1;
};
struct irq_domain *x86_vector_domain;
@@ -99,6 +100,18 @@ struct irq_cfg *irq_cfg(unsigned int irq
return irqd_cfg(irq_get_irq_data(irq));
}
+bool apic_hotplug_force_retrigger(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd;
+
+ irqd = __irq_domain_get_irq_data(x86_vector_domain, irqd);
+ if (!irqd)
+ return false;
+
+ apicd = apic_chip_data(irqd);
+ return apicd && apicd->force_retrigger;
+}
+
static struct apic_chip_data *alloc_apic_chip_data(int node)
{
struct apic_chip_data *apicd;
@@ -552,6 +565,8 @@ static int x86_vector_alloc_irqs(struct
}
apicd->irq = virq + i;
+ if (info->flags & X86_IRQ_MSI_NOMASK_TRAINWRECK)
+ apicd->force_retrigger = true;
irqd->chip = &lapic_controller;
irqd->chip_data = apicd;
irqd->hwirq = virq + i;
@@ -624,6 +639,7 @@ static void x86_vector_debug_show(struct
seq_printf(m, "%*scan_reserve: %u\n", ind, "", apicd.can_reserve ? 1 : 0);
seq_printf(m, "%*shas_reserved: %u\n", ind, "", apicd.has_reserved ? 1 : 0);
seq_printf(m, "%*scleanup_pending: %u\n", ind, "", !hlist_unhashed(&apicd.clist));
+ seq_printf(m, "%*sforce_retrigger: %u\n", ind, "", apicd.force_retrigger ? 1 : 0);
}
#endif
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -350,6 +350,7 @@ void fixup_irqs(void)
struct irq_desc *desc;
struct irq_data *data;
struct irq_chip *chip;
+ bool retrigger;
irq_migrate_all_off_this_cpu();
@@ -370,24 +371,29 @@ void fixup_irqs(void)
* nothing else will touch it.
*/
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
+ desc = __this_cpu_read(vector_irq[vector]);
+ if (IS_ERR_OR_NULL(desc))
continue;
+ raw_spin_lock(&desc->lock);
+ data = irq_desc_get_irq_data(desc);
+ retrigger = apic_hotplug_force_retrigger(data);
+
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
- if (irr & (1 << (vector % 32))) {
- desc = __this_cpu_read(vector_irq[vector]);
+ if (irr & (1 << (vector % 32)))
+ retrigger = true;
- raw_spin_lock(&desc->lock);
- data = irq_desc_get_irq_data(desc);
+ if (!retrigger) {
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ } else {
chip = irq_data_get_irq_chip(data);
if (chip->irq_retrigger) {
chip->irq_retrigger(data);
- __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
+ __this_cpu_write(vector_irq[vector],
+ VECTOR_RETRIGGERED);
}
- raw_spin_unlock(&desc->lock);
}
- if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
- __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ raw_spin_unlock(&desc->lock);
}
}
#endif
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -432,6 +432,8 @@ int irq_reserve_ipi(struct irq_domain *d
int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest);
/* V2 interfaces to support hierarchy IRQ domains. */
+struct irq_data *__irq_domain_get_irq_data(struct irq_domain *domain,
+ struct irq_data *irq_data);
extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
unsigned int virq);
extern void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1165,6 +1165,21 @@ static int irq_domain_alloc_irq_data(str
}
/**
+ * __irq_domain_get_irq_data - Get irq_data associated with @domain for @data
+ * @domain: domain to match
+ * @irq_data: initial irq data to start hierarchy search
+ */
+struct irq_data *__irq_domain_get_irq_data(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ for (; irq_data; irq_data = irq_data->parent_data) {
+ if (irq_data->domain == domain)
+ return irq_data;
+ }
+ return NULL;
+}
+
+/**
* irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
* @domain: domain to match
* @virq: IRQ number to get irq_data
@@ -1172,14 +1187,7 @@ static int irq_domain_alloc_irq_data(str
struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
unsigned int virq)
{
- struct irq_data *irq_data;
-
- for (irq_data = irq_get_irq_data(virq); irq_data;
- irq_data = irq_data->parent_data)
- if (irq_data->domain == domain)
- return irq_data;
-
- return NULL;
+ return __irq_domain_get_irq_data(domain, irq_get_irq_data(virq));
}
EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
On Fri, Jan 24, 2020 at 6:34 AM Thomas Gleixner <[email protected]> wrote:
>
> Evan,
>
> Evan Green <[email protected]> writes:
> > I did another experiment that I think lends credibility to my torn MSI
> > hypothesis. I have the following change:
> >
> > And indeed, I get a machine check, despite the fact that MSI_DATA is
> > overwritten just after address is updated.
>
> I don't have to understand why a SoC released in 2019 still has
> unmaskable MSI especially as Inhell's own XHCI spec clearly documents
> and recommends MSI-X.
>
> While your workaround (disabling MSI) works in this particular case it's
> not really a good option:
>
> 1) Quite some devices have a bug where the legacy INTX disable does not
> work reliably or is outright broken. That means MSI disable will
> reroute to INTX.
>
> 2) I dug out old debug data which confirms that some silly devices
> lose interrupts across MSI disable/reenable if the INTX fallback is
> disabled.
>
> And no, it's not a random weird device, it's part of a chipset which
> was pretty popular a few years ago. I leave it as an exercise for
> the reader to guess the vendor.
>
> Can you please apply the patch below? It enforces an IPI to the new
> vector/target CPU when the interrupt is MSI w/o masking. It should
> cure the issue. It goes without saying that I'm not proud of it.
I'll feel just as dirty putting a tested-by on it :)
I don't think this patch is complete. As written, it creates "recovery
interrupts" for MSIs that are not maskable, however through the
pci_msi_domain_write_msg() path, which is the one I seem to use, we
make no effort to mask the MSI while changing affinity. So at the very
least it would need a follow-on patch that attempts to mask the MSI,
for MSIs that are maskable. __pci_restore_msi_state(), called in the
resume path, does have this masking, but for some reason not
pci_msi_domain_write_msg().
I'm also a bit concerned about all the spurious interrupts we'll be
introducing. Not just the retriggering introduced here, but the fact
that we never dealt with the torn interrupt. So in my case, XHCI will
be sending an interrupt on the old vector to the new CPU, which could
be registered to anything. I'm worried that not every driver in the
system is hardened to receiving interrupts it's not prepared for.
Perhaps the driver misbehaves, or perhaps it's a "bad" interrupt like
the MCE interrupt that takes the system down. (I realize the MCE
interrupt itself is not in the device vector region, but some other
bad interrupt then).
Now that you're on board with the torn write theory, what do you think
about my "transit vector" proposal? The idea is this:
- Reserve a single vector number on all CPUs for interrupts in
transit between CPUs.
- Interrupts in transit between CPUs are added to some sort of list,
or maybe the transit vector itself.
- __pci_msi_write_msg() would, after proper abstractions, essentially
be doing this:
pci_write(MSI_DATA, TRANSIT_VECTOR);
pci_write(MSI_ADDRESS, new_affinity);
pci_write(MSI_DATA, new_vector);
- In the rare torn case I've found here, the interrupt will come in
on <new CPU, transit_vector>, or <old CPU, transit_vector>.
- The ISR for TRANSIT_VECTOR would go through and call the ISR for
every IRQ in transit across CPUs. This does still result in a couple
extra ISR calls, since multiple interrupts might be in transit across
CPUs, but at least it's very rare.
- CPU hotplug would keep the same logic it already has, retriggering
TRANSIT_VECTOR if it happened to land on <old CPU, old vector>.
- When the interrupt is confirmed on <new CPU, new vector>, remove
the ISR from the TRANSIT_VECTOR list.
If you think it's a worthwhile idea I can try to code it up.
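A very rough sketch of what that write order might look like for the plain
32-bit MSI case (hypothetical code for illustration only: TRANSIT_VECTOR, the
helper name and the transit-list bookkeeping are assumptions, and the 64-bit
address and MSI-X variants are ignored here):

#include <linux/pci.h>
#include <linux/msi.h>

/* Placeholder; a real implementation would reserve one system vector. */
#define TRANSIT_VECTOR  0x20

static void pci_msi_write_msg_via_transit(struct pci_dev *dev, int pos,
                                          const struct msi_msg *msg)
{
        /* 1) Park the entry on the shared transit vector, so a message sent
         *    mid-update can only combine <some CPU, TRANSIT_VECTOR>. */
        pci_write_config_word(dev, pos + PCI_MSI_DATA_32, TRANSIT_VECTOR);

        /* 2) Retarget the address at the new CPU. */
        pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_LO, msg->address_lo);

        /* 3) Switch to the real new vector. */
        pci_write_config_word(dev, pos + PCI_MSI_DATA_32, msg->data);
}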
I've been running your patch for about 30 minutes, with no repro case.
-Evan
Evan Green <[email protected]> writes:
> On Fri, Jan 24, 2020 at 6:34 AM Thomas Gleixner <[email protected]> wrote:
>> Can you please apply the patch below? It enforces an IPI to the new
>> vector/target CPU when the interrupt is MSI w/o masking. It should
>> cure the issue. It goes without saying that I'm not proud of it.
>
> I'll feel just as dirty putting a tested-by on it :)
Hehehe.
> I don't think this patch is complete. As written, it creates "recovery
> interrupts" for MSIs that are not maskable, however through the
> pci_msi_domain_write_msg() path, which is the one I seem to use, we
> make no effort to mask the MSI while changing affinity. So at the very
> least it would need a follow-on patch that attempts to mask the MSI,
> for MSIs that are maskable. __pci_restore_msi_state(), called in the
> resume path, does have this masking, but for some reason not
> pci_msi_domain_write_msg().
Wrong. The core code does the masking already because that's required
for other things than MSI as well.
For regular affinity changes in the context of the serviced interrupt
it's done in __irq_move_irq() and for the hotplug case it's done in
migrate_one_irq().
You really need to look at the big picture of this and not just at
random bits and pieces of MSI code which are unrelated to this.
> I'm also a bit concerned about all the spurious interrupts we'll be
> introducing. Not just the retriggering introduced here, but the fact
> that we never dealt with the torn interrupt. So in my case, XHCI will
> be sending an interrupt on the old vector to the new CPU, which could
> be registered to anything. I'm worried that not every driver in the
> system is hardened to receiving interrupts it's not prepared for.
> Perhaps the driver misbehaves, or perhaps it's a "bad" interrupt like
> the MCE interrupt that takes the system down. (I realize the MCE
> interrupt itself is not in the device vector region, but some other
> bad interrupt then).
There are no bad or dangerous vectors in the range which can be assigned
to a device.
Drivers which cannot deal with spurious interrupts are broken already
today. Spurious interrupts can happen and do happen for various reasons.
Unhandled spurious interrupts are not a problem as long as there are not
gazillions of them within a split second, which is not the case here.
> Now that you're on board with the torn write theory, what do you think
> about my "transit vector" proposal? The idea is this:
> - Reserve a single vector number on all CPUs for interrupts in
> transit between CPUs.
> - Interrupts in transit between CPUs are added to some sort of list,
> or maybe the transit vector itself.
You need a list or some other form of storage for this because migration
can happen in parallel (not the hotplug one, but the regular ones).
> - __pci_msi_write_msg() would, after proper abstractions, essentially
> be doing this:
> pci_write(MSI_DATA, TRANSIT_VECTOR);
> pci_write(MSI_ADDRESS, new_affinity);
> pci_write(MSI_DATA, new_vector);
That doesn't work. You have to write in the proper order to make all
variants of MSI devices happy. So it's actually two consecutive
full __pci_msi_write_msg() invocations.
> - The ISR for TRANSIT_VECTOR would go through and call the ISR for
> every IRQ in transit across CPUs. This does still result in a couple
> extra ISR calls, since multiple interrupts might be in transit across
> CPUs, but at least it's very rare.
That's not trivial especially from the locking POV. I thought about it
for a while before hacking up that retrigger thing and everything I came
up with resulted in nasty deadlocks at least on the drawing board.
And for the hotplug case it's even less trivial because the target CPU
sits in stop machine with interrupts disabled and waits for the outgoing
CPU to die. So it cannot handle the interrupt before the outgoing one
cleaned up in fixup_irqs() and you cannot query the APIC IRR on the
target from the outgoing one.
Even in a regular migration the same problem exists because the other
CPU might either be unable to service it or service it _before_ the CPU
which does the migration has completed the process.
> If you think it's a worthwhile idea I can try to code it up.
It's worthwhile, but that needs some deep thoughts about locking and
ordering plus the inevitable race conditions this creates. If it would
be trivial, I surely wouldn't have hacked up the retrigger mess.
Thanks,
tglx
It seems we could avoid force_retrigger once IR is turned on. I have tried
this patch with IR turned on, still seeing force_retrigger=1 for MSIs w/o
masking.
Jacob Pan <[email protected]> writes:
> It seems we could avoid force_retrigger once IR is turned on. I have tried
> this patch with IR turned on, still seeing force_retrigger=1 for MSIs w/o
> masking.
Yes. That patch was a POC and surely not a final solution. Aside of that
I'm still looking into a solution to avoid that which does not involve
horrible locking and tons of scary code.
Thanks,
tglx
Evan,
Thomas Gleixner <[email protected]> writes:
> It's worthwhile, but that needs some deep thoughts about locking and
> ordering plus the inevitable race conditions this creates. If it would
> be trivial, I surely wouldn't have hacked up the retrigger mess.
So after staring at it for a while, I came up with the patch below.
Your idea of going through some well defined transition vector is just
not feasible due to locking and life-time issues.
I'm taking a similar but easier to handle approach.
1) Move the interrupt to the new vector on the old (local) CPU
2) Move it to the new CPU
3) Check if the new vector is pending on the local CPU. If yes
retrigger it on the new CPU.
That might give a spurious interrupt if the new vector on the local CPU
is in use. But as I said before this is nothing to worry about. If the
affected device driver fails to handle that spurious interrupt then it
is broken anyway.
In theory we could teach the vector allocation logic to search for an
unused pair of vectors on both CPUs, but the required code for that is
hardly worth the trouble. In the end the situation that no pair is found
has to be handled anyway. So rather than making this the corner case
which is never tested and then leads to hard to debug issues, I prefer
to make it more likely to happen.
The patch is only lightly tested, but so far it survived.
Thanks,
tglx
8<----------------
arch/x86/include/asm/apic.h | 8 +++
arch/x86/kernel/apic/msi.c | 115 ++++++++++++++++++++++++++++++++++++++++++--
include/linux/irq.h | 18 ++++++
include/linux/irqdomain.h | 7 ++
kernel/irq/debugfs.c | 1
kernel/irq/msi.c | 5 +
6 files changed, 150 insertions(+), 4 deletions(-)
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -452,6 +452,14 @@ static inline void ack_APIC_irq(void)
apic_eoi();
}
+
+static inline bool lapic_vector_set_in_irr(unsigned int vector)
+{
+ u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+
+ return !!(irr & (1U << (vector % 32)));
+}
+
static inline unsigned default_get_apic_id(unsigned long x)
{
unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -23,10 +23,8 @@
static struct irq_domain *msi_default_domain;
-static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg)
{
- struct irq_cfg *cfg = irqd_cfg(data);
-
msg->address_hi = MSI_ADDR_BASE_HI;
if (x2apic_enabled())
@@ -47,6 +45,114 @@ static void irq_msi_compose_msg(struct i
MSI_DATA_VECTOR(cfg->vector);
}
+static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg);
+}
+
+static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
+{
+ struct msi_msg msg[2] = { [1] = { }, };
+
+ __irq_msi_compose_msg(cfg, msg);
+ irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
+}
+
+static int
+msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
+{
+ struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
+ struct irq_data *parent = irqd->parent_data;
+ unsigned int cpu;
+ int ret;
+
+ /* Save the current configuration */
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
+ old_cfg = *cfg;
+
+ /* Allocate a new target vector */
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ /*
+ * For non-maskable and non-remapped MSI interrupts the migration
+ * to a different destination CPU and a different vector has to be
+ * done careful to handle the possible stray interrupt which can be
+ * caused by the non-atomic update of the address/data pair.
+ *
+ * Direct update is possible when:
+ * - The MSI is maskable (remapped MSI does not use this code path).
+ * The quirk bit is not set in this case.
+ * - The new vector is the same as the old vector
+ * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
+ * - The new destination CPU is the same as the old destination CPU
+ */
+ if (!irqd_msi_nomask_quirk(irqd) ||
+ cfg->vector == old_cfg.vector ||
+ old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
+ cfg->dest_apicid == old_cfg.dest_apicid) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Paranoia: Validate that the interrupt target is the local
+ * CPU.
+ */
+ if (WARN_ON_ONCE(cpu != smp_processor_id())) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Redirect the interrupt to the new vector on the current CPU
+ * first. This might cause a spurious interrupt on this vector if
+ * the device raises an interrupt right between this update and the
+ * update to the final destination CPU.
+ *
+ * If the vector is in use then the installed device handler will
+ * denote it as spurious which is no harm as this is a rare event
+ * and interrupt handlers have to cope with spurious interrupts
+ * anyway. If the vector is unused, then it is marked so it won't
+ * trigger the 'No irq handler for vector' warning in do_IRQ().
+ *
+ * This requires to hold vector lock to prevent concurrent updates to
+ * the affected vector.
+ */
+ lock_vector_lock();
+
+ /*
+ * Mark the new target vector on the local CPU if it is currently
+ * unused. Reuse the VECTOR_RETRIGGERED state which is also used in
+ * the CPU hotplug path for a similar purpose. This cannot be
+ * undone here as the current CPU has interrupts disabled and
+ * cannot handle the interrupt before the whole set_affinity()
+ * section is done. In the CPU unplug case, the current CPU is
+ * about to vanish and will not handle any interrupts anymore. The
+ * vector is cleaned up when the CPU comes online again.
+ */
+ if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector])))
+ this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED);
+
+ /* Redirect it to the new vector on the local CPU temporarily */
+ old_cfg.vector = cfg->vector;
+ irq_msi_update_msg(irqd, &old_cfg);
+
+ /* Now transition it to the target CPU */
+ irq_msi_update_msg(irqd, cfg);
+
+ /*
+ * All interrupts after this point are now targeted at the new
+ * vector/CPU. Check whether the transition raced with a device
+ * interrupt and is pending in the local APICs IRR.
+ */
+ if (lapic_vector_set_in_irr(cfg->vector))
+ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+ unlock_vector_lock();
+ return ret;
+}
+
/*
* IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
* which implement the MSI or MSI-X Capability Structure.
@@ -58,6 +164,7 @@ static struct irq_chip pci_msi_controlle
.irq_ack = irq_chip_ack_parent,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_set_affinity = msi_set_affinity,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -146,6 +253,8 @@ void __init arch_init_msi_domain(struct
}
if (!msi_default_domain)
pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
+ else
+ msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
}
#ifdef CONFIG_IRQ_REMAP
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -209,6 +209,8 @@ struct irq_data {
* IRQD_SINGLE_TARGET - IRQ allows only a single affinity target
* IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set
* IRQD_CAN_RESERVE - Can use reservation mode
+ * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change
+ * required
*/
enum {
IRQD_TRIGGER_MASK = 0xf,
@@ -231,6 +233,7 @@ enum {
IRQD_SINGLE_TARGET = (1 << 24),
IRQD_DEFAULT_TRIGGER_SET = (1 << 25),
IRQD_CAN_RESERVE = (1 << 26),
+ IRQD_MSI_NOMASK_QUIRK = (1 << 27),
};
#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -390,6 +393,21 @@ static inline bool irqd_can_reserve(stru
return __irqd_to_state(d) & IRQD_CAN_RESERVE;
}
+static inline void irqd_set_msi_nomask_quirk(struct irq_data *d)
+{
+ __irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK;
+}
+
+static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d)
+{
+ __irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK;
+}
+
+static inline bool irqd_msi_nomask_quirk(struct irq_data *d)
+{
+ return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK;
+}
+
#undef __irqd_to_state
static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -207,6 +207,13 @@ enum {
IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5),
/*
+ * Quirk to handle MSI implementations which do not provide
+ * masking. Currently known to affect x86, but partially
+ * handled in core code.
+ */
+ IRQ_DOMAIN_MSI_NOMASK_QUIRK = (1 << 6),
+
+ /*
* Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
* for implementation specific purposes and ignored by the
* core code.
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -114,6 +114,7 @@ static const struct irq_bit_descr irqdat
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
BIT_MASK_DESCR(IRQD_CAN_RESERVE),
+ BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_dom
continue;
irq_data = irq_domain_get_irq_data(domain, desc->irq);
- if (!can_reserve)
+ if (!can_reserve) {
irqd_clr_can_reserve(irq_data);
+ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ irqd_set_msi_nomask_quirk(irq_data);
+ }
ret = irq_domain_activate_irq(irq_data, can_reserve);
if (ret)
goto cleanup;
On Tue, Jan 28, 2020 at 6:38 AM Thomas Gleixner <[email protected]> wrote:
>
> Evan,
>
> Thomas Gleixner <[email protected]> writes:
> > It's worthwhile, but that needs some deep thoughts about locking and
> > ordering plus the inevitable race conditions this creates. If it would
> > be trivial, I surely wouldn't have hacked up the retrigger mess.
>
> So after staring at it for a while, I came up with the patch below.
>
> Your idea of going through some well defined transition vector is just
> not feasible due to locking and life-time issues.
>
> I'm taking a similar but easier to handle approach.
>
> 1) Move the interrupt to the new vector on the old (local) CPU
>
> 2) Move it to the new CPU
>
> 3) Check if the new vector is pending on the local CPU. If yes
> retrigger it on the new CPU.
>
> That might give a spurious interrupt if the new vector on the local CPU
> is in use. But as I said before this is nothing to worry about. If the
> affected device driver fails to handle that spurious interrupt then it
> is broken anyway.
>
> In theory we could teach the vector allocation logic to search for an
> unused pair of vectors on both CPUs, but the required code for that is
> hardly worth the trouble. In the end the situation that no pair is found
> has to be handled anyway. So rather than making this the corner case
> which is never tested and then leads to hard to debug issues, I prefer
> to make it more likely to happen.
>
> The patch is only lightly tested, but so far it survived.
>
Hi Thomas,
Thanks for the patch, I gave it a try. I get the following splat, then a hang:
[ 62.173778] ============================================
[ 62.179723] WARNING: possible recursive locking detected
[ 62.185657] 4.19.96 #2 Not tainted
[ 62.189453] --------------------------------------------
[ 62.195388] migration/1/17 is trying to acquire lock:
[ 62.201031] 000000006885da2d (vector_lock){-.-.}, at:
apic_retrigger_irq+0x31/0x63
[ 62.209508]
[ 62.209508] but task is already holding lock:
[ 62.216026] 000000006885da2d (vector_lock){-.-.}, at:
msi_set_affinity+0x13c/0x27b
[ 62.224498]
[ 62.224498] other info that might help us debug this:
[ 62.231791] Possible unsafe locking scenario:
[ 62.231791]
[ 62.238406] CPU0
[ 62.241135] ----
[ 62.243863] lock(vector_lock);
[ 62.247467] lock(vector_lock);
[ 62.251071]
[ 62.251071] *** DEADLOCK ***
[ 62.251071]
[ 62.257687] May be due to missing lock nesting notation
[ 62.257687]
[ 62.265274] 2 locks held by migration/1/17:
[ 62.269946] #0: 00000000cfa9d8c3 (&irq_desc_lock_class){-.-.}, at:
irq_migrate_all_off_this_cpu+0x44/0x28f
[ 62.280846] #1: 000000006885da2d (vector_lock){-.-.}, at:
msi_set_affinity+0x13c/0x27b
[ 62.289801]
[ 62.289801] stack backtrace:
[ 62.294669] CPU: 1 PID: 17 Comm: migration/1 Not tainted 4.19.96 #2
[ 62.310713] Call Trace:
[ 62.313446] dump_stack+0xac/0x11e
[ 62.317255] __lock_acquire+0x64f/0x19bc
[ 62.321646] ? find_held_lock+0x3d/0xb8
[ 62.325936] ? pci_conf1_write+0x4f/0xdf
[ 62.330320] lock_acquire+0x1b2/0x1fa
[ 62.334413] ? apic_retrigger_irq+0x31/0x63
[ 62.339097] _raw_spin_lock_irqsave+0x51/0x7d
[ 62.343972] ? apic_retrigger_irq+0x31/0x63
[ 62.348646] apic_retrigger_irq+0x31/0x63
[ 62.353124] msi_set_affinity+0x25a/0x27b
[ 62.357606] irq_do_set_affinity+0x37/0xaa
[ 62.362191] irq_migrate_all_off_this_cpu+0x1c1/0x28f
[ 62.367841] fixup_irqs+0x15/0xd2
[ 62.371544] cpu_disable_common+0x20a/0x217
[ 62.376217] native_cpu_disable+0x1f/0x24
[ 62.380696] take_cpu_down+0x41/0x95
[ 62.384691] multi_cpu_stop+0xbd/0x14b
[ 62.388878] ? _raw_spin_unlock_irq+0x2c/0x40
[ 62.393746] ? stop_two_cpus+0x2c5/0x2c5
[ 62.398127] cpu_stopper_thread+0x84/0x100
[ 62.402705] smpboot_thread_fn+0x1a9/0x25f
[ 62.407281] ? cpu_report_death+0x81/0x81
[ 62.411760] kthread+0x146/0x14e
[ 62.415364] ? cpu_report_death+0x81/0x81
[ 62.419846] ? kthread_blkcg+0x31/0x31
[ 62.424042] ret_from_fork+0x24/0x50
-Evan
Evan,
Evan Green <[email protected]> writes:
> On Tue, Jan 28, 2020 at 6:38 AM Thomas Gleixner <[email protected]> wrote:
>> The patch is only lightly tested, but so far it survived.
>>
>
> Hi Thomas,
> Thanks for the patch, I gave it a try. I get the following splat, then a hang:
>
> [ 62.238406] CPU0
> [ 62.241135] ----
> [ 62.243863] lock(vector_lock);
> [ 62.247467] lock(vector_lock);
> [ 62.251071]
> [ 62.251071] *** DEADLOCK ***
> [ 62.251071]
> [ 62.257687] May be due to missing lock nesting notation
> [ 62.257687]
> [ 62.265274] 2 locks held by migration/1/17:
> [ 62.269946] #0: 00000000cfa9d8c3 (&irq_desc_lock_class){-.-.}, at:
> irq_migrate_all_off_this_cpu+0x44/0x28f
> [ 62.280846] #1: 000000006885da2d (vector_lock){-.-.}, at:
> msi_set_affinity+0x13c/0x27b
> [ 62.289801]
> [ 62.289801] stack backtrace:
> [ 62.294669] CPU: 1 PID: 17 Comm: migration/1 Not tainted 4.19.96 #2
> [ 62.310713] Call Trace:
> [ 62.313446] dump_stack+0xac/0x11e
> [ 62.317255] __lock_acquire+0x64f/0x19bc
> [ 62.321646] ? find_held_lock+0x3d/0xb8
> [ 62.325936] ? pci_conf1_write+0x4f/0xdf
> [ 62.330320] lock_acquire+0x1b2/0x1fa
> [ 62.334413] ? apic_retrigger_irq+0x31/0x63
> [ 62.339097] _raw_spin_lock_irqsave+0x51/0x7d
> [ 62.343972] ? apic_retrigger_irq+0x31/0x63
> [ 62.348646] apic_retrigger_irq+0x31/0x63
> [ 62.353124] msi_set_affinity+0x25a/0x27b
Bah. I'm sure I looked at that call chain, noticed the double vector
lock and then forgot. Delta patch below.
Thanks,
tglx
8<--------------
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -64,6 +64,7 @@ msi_set_affinity(struct irq_data *irqd,
struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
struct irq_data *parent = irqd->parent_data;
unsigned int cpu;
+ bool pending;
int ret;
/* Save the current configuration */
@@ -147,9 +148,13 @@ msi_set_affinity(struct irq_data *irqd,
* vector/CPU. Check whether the transition raced with a device
* interrupt and is pending in the local APICs IRR.
*/
- if (lapic_vector_set_in_irr(cfg->vector))
- irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+ pending = lapic_vector_set_in_irr(cfg->vector);
+
unlock_vector_lock();
+
+ if (pending)
+ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+
return ret;
}
On Tue, Jan 28, 2020 at 2:48 PM Thomas Gleixner <[email protected]> wrote:
>
> Evan,
>
> Evan Green <[email protected]> writes:
> > On Tue, Jan 28, 2020 at 6:38 AM Thomas Gleixner <[email protected]> wrote:
> >> The patch is only lightly tested, but so far it survived.
> >>
> >
> > Hi Thomas,
> > Thanks for the patch, I gave it a try. I get the following splat, then a hang:
> >
> > [ 62.238406] CPU0
> > [ 62.241135] ----
> > [ 62.243863] lock(vector_lock);
> > [ 62.247467] lock(vector_lock);
> > [ 62.251071]
> > [ 62.251071] *** DEADLOCK ***
> > [ 62.251071]
> > [ 62.257687] May be due to missing lock nesting notation
> > [ 62.257687]
> > [ 62.265274] 2 locks held by migration/1/17:
> > [ 62.269946] #0: 00000000cfa9d8c3 (&irq_desc_lock_class){-.-.}, at:
> > irq_migrate_all_off_this_cpu+0x44/0x28f
> > [ 62.280846] #1: 000000006885da2d (vector_lock){-.-.}, at:
> > msi_set_affinity+0x13c/0x27b
> > [ 62.289801]
> > [ 62.289801] stack backtrace:
> > [ 62.294669] CPU: 1 PID: 17 Comm: migration/1 Not tainted 4.19.96 #2
> > [ 62.310713] Call Trace:
> > [ 62.313446] dump_stack+0xac/0x11e
> > [ 62.317255] __lock_acquire+0x64f/0x19bc
> > [ 62.321646] ? find_held_lock+0x3d/0xb8
> > [ 62.325936] ? pci_conf1_write+0x4f/0xdf
> > [ 62.330320] lock_acquire+0x1b2/0x1fa
> > [ 62.334413] ? apic_retrigger_irq+0x31/0x63
> > [ 62.339097] _raw_spin_lock_irqsave+0x51/0x7d
> > [ 62.343972] ? apic_retrigger_irq+0x31/0x63
> > [ 62.348646] apic_retrigger_irq+0x31/0x63
> > [ 62.353124] msi_set_affinity+0x25a/0x27b
>
> Bah. I'm sure I looked at that call chain, noticed the double vector
> lock and then forgot. Delta patch below.
It's working well with the delta patch, been running for about an hour
with no issues.
-Evan
Evan,
Evan Green <[email protected]> writes:
> On Tue, Jan 28, 2020 at 2:48 PM Thomas Gleixner <[email protected]> wrote:
>>
>> Bah. I'm sure I looked at that call chain, noticed the double vector
>> lock and then forgot. Delta patch below.
>
> It's working well with the delta patch, been running for about an hour
> with no issues.
thanks for the info and for testing!
Could you please add some instrumentation to see how often this stuff
actually triggers spurious interrupts?
Thanks,
tglx
On Wed, Jan 29, 2020 at 1:01 PM Thomas Gleixner <[email protected]> wrote:
>
> Evan,
>
> Evan Green <[email protected]> writes:
> > On Tue, Jan 28, 2020 at 2:48 PM Thomas Gleixner <[email protected]> wrote:
> >>
> >> Bah. I'm sure I looked at that call chain, noticed the double vector
> >> lock and then forgot. Delta patch below.
> >
> > It's working well with the delta patch, been running for about an hour
> > with no issues.
>
> thanks for the info and for testing!
>
> Could you please add some instrumentation to see how often this stuff
> actually triggers spurious interrupts?
In about 10 minutes of this script running, I got 142 hits. My script
can toggle the HT cpus on and off about twice per second.
Here's my diff (sorry it's mangled by gmail). If you're looking for
something else, let me know, or I can run a patch.
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 90baf2c66bd40..f9c46fc30d658 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -61,6 +61,8 @@ static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
         irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
 }
+int evanpending;
+
 static int
 msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
 {
@@ -155,8 +157,10 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
         unlock_vector_lock();
-        if (pending)
+        if (pending) {
+                printk("EVAN pending %d", ++evanpending);
                 irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+        }
         return ret;
 }
-Evan
Evan,
Evan Green <[email protected]> writes:
> On Wed, Jan 29, 2020 at 1:01 PM Thomas Gleixner <[email protected]> wrote:
>
>> Could you please add some instrumentation to see how often this stuff
>> actually triggers spurious interrupts?
>
> In about 10 minutes of this script running, I got 142 hits. My script
> can toggle the HT cpus on and off about twice per second.
> Here's my diff (sorry it's mangled by gmail). If you're looking for
> something else, let me know, or I can run a patch.
>
No, that's good data. Your testing is hitting the critical path and as
you did not complain about negative side effects it seems to hold up to
expectations. I'm going to convert this to a real patch with a
proper changelog tomorrow.
Thanks for your help!
tglx
On Wed, Jan 29, 2020 at 3:16 PM Thomas Gleixner <[email protected]> wrote:
>
> Evan,
>
> Evan Green <[email protected]> writes:
> > On Wed, Jan 29, 2020 at 1:01 PM Thomas Gleixner <[email protected]> wrote:
> >
> >> Could you please add some instrumentation to see how often this stuff
> >> actually triggers spurious interrupts?
> >
> > In about 10 minutes of this script running, I got 142 hits. My script
> > can toggle the HT cpus on and off about twice per second.
> > Here's my diff (sorry it's mangled by gmail). If you're looking for
> > something else, let me know, or I can run a patch.
> >
> No, that's good data. Your testing is hitting the critical path, and as
> you did not complain about negative side effects it seems to hold up to
> expectations. I'm going to convert this to a real patch with a
> proper changelog tomorrow.
>
> Thanks for your help!
Sounds good, please CC me on it and I'll be sure to test the final
result as well.
-Evan
Evan tracked down a subtle race between the update of the MSI message and
the device raising an interrupt internally on PCI devices which do not
support MSI masking. The update of the MSI message is non-atomic and
consists of either 2 or 3 sequential 32bit wide writes to the PCI config
space.
- Write address low 32bits
- Write address high 32bits (If supported by device)
- Write data
When an interrupt is migrated then both address and data might change, so
the kernel attempts to mask the MSI interrupt first. But MSI masking is
optional, so there exist devices which do not provide it. That means that
if the device raises an interrupt internally between the writes, an MSI
message built from half-updated state is sent.
On x86 this can lead to spurious interrupts on the wrong interrupt
vector when the affinity setting changes both address and data. As a
consequence the device interrupt can be lost causing the device to
become stuck or malfunctioning.
Evan tried to handle that by disabling MSI across an MSI message
update. That's not feasible because disabling MSI has issues of its own:
If MSI is disabled the PCI device is routing an interrupt to the legacy
INTx mechanism. The INTx delivery can be disabled, but the disablement is
not working on all devices.
Some devices lose interrupts when both MSI and INTx delivery are disabled.
Another way to solve this would be to enforce the allocation of the same
vector on all CPUs in the system for this kind of screwed devices. That
could be done, but it would bring back the vector space exhaustion problems
which got solved a few years ago.
Fortunately the high address (if supported by the device) is only relevant
when X2APIC is enabled which implies interrupt remapping. In the interrupt
remapping case the affinity setting is happening at the interrupt remapping
unit and the PCI MSI message is programmed only once when the PCI device is
initialized.
That makes it possible to solve it with a two step update:
1) Target the MSI msg to the new vector on the current target CPU
2) Target the MSI msg to the new vector on the new target CPU
In both cases writing the MSI message is only changing a single 32bit word
which prevents the issue of inconsistency.
After writing the final destination it is necessary to check whether the
device issued an interrupt while the intermediate state #1 (new vector,
current CPU) was in effect.
This is possible because the affinity change is always happening on the
current target CPU. The code runs with interrupts disabled, so the
interrupt can be detected by checking the IRR of the local APIC. If the
vector is pending in the IRR then the interrupt is retriggered on the new
target CPU by sending an IPI for the associated vector on the target CPU.
This can cause spurious interrupts on both the local and the new target
CPU.
1) If the new vector is not in use on the local CPU and the device
affected by the affinity change raised an interrupt during the
transitional state (step #1 above) then interrupt entry code will
ignore that spurious interrupt. The vector is marked so that the
'No irq handler for vector' warning is suppressed once.
2) If the new vector is in use already on the local CPU then the IRR check
might see a pending interrupt from the device which is using this
vector. The IPI to the new target CPU will then invoke the handler of
the device, which got the affinity change, even if that device did not
issue an interrupt.
3) If the new vector is in use already on the local CPU and the device
affected by the affinity change raised an interrupt during the
transitional state (step #1 above) then the handler of the device which
uses that vector on the local CPU will be invoked.
#1 is uninteresting and has no unintended side effects. #2 and #3 might
expose issues in device driver interrupt handlers which are not prepared to
handle a spurious interrupt correctly. This is not a regression, it's just
exposing something which was already broken as spurious interrupts can
happen for a lot of reasons and all driver handlers need to be able to deal
with them.
Reported-by: Evan Green <[email protected]>
Debugged-by: Evan Green <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Rajat Jain <[email protected]>
Cc: Bjorn Helgaas <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: Marc Zyngier <[email protected]>
Cc: [email protected]
---
@stable: The Fixes tag is missing intentionally, as this problem has
been there forever. The rework of the vector allocation logic just made
it more likely to be observable.
---
arch/x86/include/asm/apic.h | 8 ++
arch/x86/kernel/apic/msi.c | 128 ++++++++++++++++++++++++++++++++++++++++++--
include/linux/irq.h | 18 ++++++
include/linux/irqdomain.h | 7 ++
kernel/irq/debugfs.c | 1
kernel/irq/msi.c | 5 +
6 files changed, 163 insertions(+), 4 deletions(-)
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -452,6 +452,14 @@ static inline void ack_APIC_irq(void)
apic_eoi();
}
+
+static inline bool lapic_vector_set_in_irr(unsigned int vector)
+{
+ u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+
+ return !!(irr & (1U << (vector % 32)));
+}
+
static inline unsigned default_get_apic_id(unsigned long x)
{
unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -23,10 +23,8 @@
static struct irq_domain *msi_default_domain;
-static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg)
{
- struct irq_cfg *cfg = irqd_cfg(data);
-
msg->address_hi = MSI_ADDR_BASE_HI;
if (x2apic_enabled())
@@ -47,6 +45,127 @@ static void irq_msi_compose_msg(struct i
MSI_DATA_VECTOR(cfg->vector);
}
+static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg);
+}
+
+static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
+{
+ struct msi_msg msg[2] = { [1] = { }, };
+
+ __irq_msi_compose_msg(cfg, msg);
+ irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
+}
+
+static int
+msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
+{
+ struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
+ struct irq_data *parent = irqd->parent_data;
+ unsigned int cpu;
+ int ret;
+
+ /* Save the current configuration */
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
+ old_cfg = *cfg;
+
+ /* Allocate a new target vector */
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ /*
+ * For non-maskable and non-remapped MSI interrupts the migration
+ * to a different destination CPU and a different vector has to be
+ * done careful to handle the possible stray interrupt which can be
+ * caused by the non-atomic update of the address/data pair.
+ *
+ * Direct update is possible when:
+ * - The MSI is maskable (remapped MSI does not use this code path)).
+ * The quirk bit is not set in this case.
+ * - The new vector is the same as the old vector
+ * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
+ * - The new destination CPU is the same as the old destination CPU
+ */
+ if (!irqd_msi_nomask_quirk(irqd) ||
+ cfg->vector == old_cfg.vector ||
+ old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
+ cfg->dest_apicid == old_cfg.dest_apicid) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Paranoia: Validate that the interrupt target is the local
+ * CPU.
+ */
+ if (WARN_ON_ONCE(cpu != smp_processor_id())) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Redirect the interrupt to the new vector on the current CPU
+ * first. This might cause a spurious interrupt on this vector if
+ * the device raises an interrupt right between this update and the
+ * update to the final destination CPU.
+ *
+ * If the vector is in use then the installed device handler will
+ * denote it as spurious which is no harm as this is a rare event
+ * and interrupt handlers have to cope with spurious interrupts
+ * anyway. If the vector is unused, then it is marked so it won't
+ * trigger the 'No irq handler for vector' warning in do_IRQ().
+ *
+ * This requires to hold vector lock to prevent concurrent updates to
+ * the affected vector.
+ */
+ lock_vector_lock();
+
+ /*
+ * Mark the new target vector on the local CPU if it is currently
+ * unused. Reuse the VECTOR_RETRIGGERED state which is also used in
+ * the CPU hotplug path for a similar purpose. This cannot be
+ * undone here as the current CPU has interrupts disabled and
+ * cannot handle the interrupt before the whole set_affinity()
+ * section is done. In the CPU unplug case, the current CPU is
+ * about to vanish and will not handle any interrupts anymore. The
+ * vector is cleaned up when the CPU comes online again.
+ */
+ if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector])))
+ this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED);
+
+ /* Redirect it to the new vector on the local CPU temporarily */
+ old_cfg.vector = cfg->vector;
+ irq_msi_update_msg(irqd, &old_cfg);
+
+ /* Now transition it to the target CPU */
+ irq_msi_update_msg(irqd, cfg);
+
+ /*
+ * All interrupts after this point are now targeted at the new
+ * vector/CPU.
+ *
+ * Drop vector lock before testing whether the temporary assignment
+ * to the local CPU was hit by an interrupt raised in the device,
+ * because the retrigger function acquires vector lock again.
+ */
+ unlock_vector_lock();
+
+ /*
+ * Check whether the transition raced with a device interrupt and
+ * is pending in the local APICs IRR. It is safe to do this outside
+ * of vector lock as the irq_desc::lock of this interrupt is still
+ * held and interrupts are disabled: The check is not accessing the
+ * underlying vector store. It's just checking the local APIC's
+ * IRR.
+ */
+ if (lapic_vector_set_in_irr(cfg->vector)
+ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+
+ return ret;
+}
+
/*
* IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
* which implement the MSI or MSI-X Capability Structure.
@@ -58,6 +177,7 @@ static struct irq_chip pci_msi_controlle
.irq_ack = irq_chip_ack_parent,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_set_affinity = msi_set_affinity,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -146,6 +266,8 @@ void __init arch_init_msi_domain(struct
}
if (!msi_default_domain)
pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
+ else
+ msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
}
#ifdef CONFIG_IRQ_REMAP
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -209,6 +209,8 @@ struct irq_data {
* IRQD_SINGLE_TARGET - IRQ allows only a single affinity target
* IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set
* IRQD_CAN_RESERVE - Can use reservation mode
+ * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change
+ * required
*/
enum {
IRQD_TRIGGER_MASK = 0xf,
@@ -231,6 +233,7 @@ enum {
IRQD_SINGLE_TARGET = (1 << 24),
IRQD_DEFAULT_TRIGGER_SET = (1 << 25),
IRQD_CAN_RESERVE = (1 << 26),
+ IRQD_MSI_NOMASK_QUIRK = (1 << 27),
};
#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -390,6 +393,21 @@ static inline bool irqd_can_reserve(stru
return __irqd_to_state(d) & IRQD_CAN_RESERVE;
}
+static inline void irqd_set_msi_nomask_quirk(struct irq_data *d)
+{
+ __irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK;
+}
+
+static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d)
+{
+ __irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK;
+}
+
+static inline bool irqd_msi_nomask_quirk(struct irq_data *d)
+{
+ return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK;
+}
+
#undef __irqd_to_state
static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -207,6 +207,13 @@ enum {
IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5),
/*
+ * Quirk to handle MSI implementations which do not provide
+ * masking. Currently known to affect x86, but partially
+ * handled in core code.
+ */
+ IRQ_DOMAIN_MSI_NOMASK_QUIRK = (1 << 6),
+
+ /*
* Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
* for implementation specific purposes and ignored by the
* core code.
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -114,6 +114,7 @@ static const struct irq_bit_descr irqdat
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
BIT_MASK_DESCR(IRQD_CAN_RESERVE),
+ BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_dom
continue;
irq_data = irq_domain_get_irq_data(domain, desc->irq);
- if (!can_reserve)
+ if (!can_reserve) {
irqd_clr_can_reserve(irq_data);
+ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ irqd_set_msi_nomask_quirk(irq_data);
+ }
ret = irq_domain_activate_irq(irq_data, can_reserve);
if (ret)
goto cleanup;
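To make the "single 32bit word per step" property of the two step update above concrete, here is a small standalone sketch. The bit layout (destination APIC ID in bits 19:12 of the low address word, vector in bits 7:0 of the data word) follows the usual non-remapped x86 MSI format; the constants are illustrative and are not the kernel's msidef.h macros.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative x86 MSI layout for the non-remapped case: destination
 * APIC ID in bits 19:12 of the low address word, vector in bits 7:0
 * of the data word.  Sketch-only constants.
 */
#define MSI_ADDR_BASE		0xfee00000u
#define MSI_ADDR_DEST(apicid)	(((uint32_t)(apicid) & 0xff) << 12)
#define MSI_DATA_VEC(vector)	((uint32_t)(vector) & 0xff)

struct msi_words {
	uint32_t addr_lo;
	uint32_t data;
};

static struct msi_words compose(uint8_t apicid, uint8_t vector)
{
	struct msi_words w = {
		.addr_lo = MSI_ADDR_BASE | MSI_ADDR_DEST(apicid),
		.data    = MSI_DATA_VEC(vector),
	};
	return w;
}

int main(void)
{
	struct msi_words cur   = compose(1, 0x41);	/* old vector, old CPU */
	struct msi_words step1 = compose(1, 0x51);	/* new vector, old CPU */
	struct msi_words step2 = compose(5, 0x51);	/* new vector, new CPU */

	/* Step 1 only changes the data word, step 2 only the low address word. */
	printf("step1: addr_lo changed=%d data changed=%d\n",
	       cur.addr_lo != step1.addr_lo, cur.data != step1.data);
	printf("step2: addr_lo changed=%d data changed=%d\n",
	       step1.addr_lo != step2.addr_lo, step1.data != step2.data);
	return 0;
}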
Thomas Gleixner <[email protected]> writes:
Evan tracked down a subtle race between the update of the MSI message and
the device raising an interrupt internally on PCI devices which do not
support MSI masking. The update of the MSI message is non-atomic and
consists of either 2 or 3 sequential 32bit wide writes to the PCI config
space.
- Write address low 32bits
- Write address high 32bits (If supported by device)
- Write data
When an interrupt is migrated then both address and data might change, so
the kernel attempts to mask the MSI interrupt first. But MSI masking is
optional, so there exist devices which do not provide it. That means that
if the device raises an interrupt internally between the writes, an MSI
message built from half-updated state is sent.
On x86 this can lead to spurious interrupts on the wrong interrupt
vector when the affinity setting changes both address and data. As a
consequence the device interrupt can be lost causing the device to
become stuck or malfunctioning.
Evan tried to handle that by disabling MSI across an MSI message
update. That's not feasible because disabling MSI has issues of its own:
If MSI is disabled the PCI device is routing an interrupt to the legacy
INTx mechanism. The INTx delivery can be disabled, but the disablement is
not working on all devices.
Some devices lose interrupts when both MSI and INTx delivery are disabled.
Another way to solve this would be to enforce the allocation of the same
vector on all CPUs in the system for this kind of screwed devices. That
could be done, but it would bring back the vector space exhaustion problems
which got solved a few years ago.
Fortunately the high address (if supported by the device) is only relevant
when X2APIC is enabled which implies interrupt remapping. In the interrupt
remapping case the affinity setting is happening at the interrupt remapping
unit and the PCI MSI message is programmed only once when the PCI device is
initialized.
That makes it possible to solve it with a two step update:
1) Target the MSI msg to the new vector on the current target CPU
2) Target the MSI msg to the new vector on the new target CPU
In both cases writing the MSI message is only changing a single 32bit word
which prevents the issue of inconsistency.
After writing the final destination it is necessary to check whether the
device issued an interrupt while the intermediate state #1 (new vector,
current CPU) was in effect.
This is possible because the affinity change is always happening on the
current target CPU. The code runs with interrupts disabled, so the
interrupt can be detected by checking the IRR of the local APIC. If the
vector is pending in the IRR then the interrupt is retriggered on the new
target CPU by sending an IPI for the associated vector on the target CPU.
This can cause spurious interrupts on both the local and the new target
CPU.
1) If the new vector is not in use on the local CPU and the device
affected by the affinity change raised an interrupt during the
transitional state (step #1 above) then interrupt entry code will
ignore that spurious interrupt. The vector is marked so that the
'No irq handler for vector' warning is suppressed once.
2) If the new vector is in use already on the local CPU then the IRR check
might see a pending interrupt from the device which is using this
vector. The IPI to the new target CPU will then invoke the handler of
the device, which got the affinity change, even if that device did not
issue an interrupt.
3) If the new vector is in use already on the local CPU and the device
affected by the affinity change raised an interrupt during the
transitional state (step #1 above) then the handler of the device which
uses that vector on the local CPU will be invoked.
#1 is uninteresting and has no unintended side effects. #2 and #3 might
expose issues in device driver interrupt handlers which are not prepared to
handle a spurious interrupt correctly. This is not a regression, it's just
exposing something which was already broken as spurious interrupts can
happen for a lot of reasons and all driver handlers need to be able to deal
with them.
Reported-by: Evan Green <[email protected]>
Debugged-by: Evan Green <[email protected]> Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Rajat Jain <[email protected]>
Cc: Bjorn Helgaas <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: Marc Zyngier <[email protected]>
Cc: [email protected]
---
V2: Add the missing bracket ....
@stable: The Fixes tag is missing intentionally, as this problem has
been there forever. The rework of the vector allocation logic just made
it more likely to be observable.
---
arch/x86/include/asm/apic.h | 8 ++
arch/x86/kernel/apic/msi.c | 128 ++++++++++++++++++++++++++++++++++++++++++--
include/linux/irq.h | 18 ++++++
include/linux/irqdomain.h | 7 ++
kernel/irq/debugfs.c | 1
kernel/irq/msi.c | 5 +
6 files changed, 163 insertions(+), 4 deletions(-)
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -452,6 +452,14 @@ static inline void ack_APIC_irq(void)
apic_eoi();
}
+
+static inline bool lapic_vector_set_in_irr(unsigned int vector)
+{
+ u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+
+ return !!(irr & (1U << (vector % 32)));
+}
+
static inline unsigned default_get_apic_id(unsigned long x)
{
unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -23,10 +23,8 @@
static struct irq_domain *msi_default_domain;
-static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg)
{
- struct irq_cfg *cfg = irqd_cfg(data);
-
msg->address_hi = MSI_ADDR_BASE_HI;
if (x2apic_enabled())
@@ -47,6 +45,127 @@ static void irq_msi_compose_msg(struct i
MSI_DATA_VECTOR(cfg->vector);
}
+static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg);
+}
+
+static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
+{
+ struct msi_msg msg[2] = { [1] = { }, };
+
+ __irq_msi_compose_msg(cfg, msg);
+ irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
+}
+
+static int
+msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
+{
+ struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
+ struct irq_data *parent = irqd->parent_data;
+ unsigned int cpu;
+ int ret;
+
+ /* Save the current configuration */
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
+ old_cfg = *cfg;
+
+ /* Allocate a new target vector */
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ /*
+ * For non-maskable and non-remapped MSI interrupts the migration
+ * to a different destination CPU and a different vector has to be
+ * done careful to handle the possible stray interrupt which can be
+ * caused by the non-atomic update of the address/data pair.
+ *
+ * Direct update is possible when:
+ * - The MSI is maskable (remapped MSI does not use this code path)).
+ * The quirk bit is not set in this case.
+ * - The new vector is the same as the old vector
+ * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
+ * - The new destination CPU is the same as the old destination CPU
+ */
+ if (!irqd_msi_nomask_quirk(irqd) ||
+ cfg->vector == old_cfg.vector ||
+ old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
+ cfg->dest_apicid == old_cfg.dest_apicid) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Paranoia: Validate that the interrupt target is the local
+ * CPU.
+ */
+ if (WARN_ON_ONCE(cpu != smp_processor_id())) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Redirect the interrupt to the new vector on the current CPU
+ * first. This might cause a spurious interrupt on this vector if
+ * the device raises an interrupt right between this update and the
+ * update to the final destination CPU.
+ *
+ * If the vector is in use then the installed device handler will
+ * denote it as spurious which is no harm as this is a rare event
+ * and interrupt handlers have to cope with spurious interrupts
+ * anyway. If the vector is unused, then it is marked so it won't
+ * trigger the 'No irq handler for vector' warning in do_IRQ().
+ *
+ * This requires to hold vector lock to prevent concurrent updates to
+ * the affected vector.
+ */
+ lock_vector_lock();
+
+ /*
+ * Mark the new target vector on the local CPU if it is currently
+ * unused. Reuse the VECTOR_RETRIGGERED state which is also used in
+ * the CPU hotplug path for a similar purpose. This cannot be
+ * undone here as the current CPU has interrupts disabled and
+ * cannot handle the interrupt before the whole set_affinity()
+ * section is done. In the CPU unplug case, the current CPU is
+ * about to vanish and will not handle any interrupts anymore. The
+ * vector is cleaned up when the CPU comes online again.
+ */
+ if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector])))
+ this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED);
+
+ /* Redirect it to the new vector on the local CPU temporarily */
+ old_cfg.vector = cfg->vector;
+ irq_msi_update_msg(irqd, &old_cfg);
+
+ /* Now transition it to the target CPU */
+ irq_msi_update_msg(irqd, cfg);
+
+ /*
+ * All interrupts after this point are now targeted at the new
+ * vector/CPU.
+ *
+ * Drop vector lock before testing whether the temporary assignment
+ * to the local CPU was hit by an interrupt raised in the device,
+ * because the retrigger function acquires vector lock again.
+ */
+ unlock_vector_lock();
+
+ /*
+ * Check whether the transition raced with a device interrupt and
+ * is pending in the local APICs IRR. It is safe to do this outside
+ * of vector lock as the irq_desc::lock of this interrupt is still
+ * held and interrupts are disabled: The check is not accessing the
+ * underlying vector store. It's just checking the local APIC's
+ * IRR.
+ */
+ if (lapic_vector_set_in_irr(cfg->vector))
+ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+
+ return ret;
+}
+
/*
* IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
* which implement the MSI or MSI-X Capability Structure.
@@ -58,6 +177,7 @@ static struct irq_chip pci_msi_controlle
.irq_ack = irq_chip_ack_parent,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_set_affinity = msi_set_affinity,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -146,6 +266,8 @@ void __init arch_init_msi_domain(struct
}
if (!msi_default_domain)
pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
+ else
+ msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
}
#ifdef CONFIG_IRQ_REMAP
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -209,6 +209,8 @@ struct irq_data {
* IRQD_SINGLE_TARGET - IRQ allows only a single affinity target
* IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set
* IRQD_CAN_RESERVE - Can use reservation mode
+ * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change
+ * required
*/
enum {
IRQD_TRIGGER_MASK = 0xf,
@@ -231,6 +233,7 @@ enum {
IRQD_SINGLE_TARGET = (1 << 24),
IRQD_DEFAULT_TRIGGER_SET = (1 << 25),
IRQD_CAN_RESERVE = (1 << 26),
+ IRQD_MSI_NOMASK_QUIRK = (1 << 27),
};
#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -390,6 +393,21 @@ static inline bool irqd_can_reserve(stru
return __irqd_to_state(d) & IRQD_CAN_RESERVE;
}
+static inline void irqd_set_msi_nomask_quirk(struct irq_data *d)
+{
+ __irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK;
+}
+
+static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d)
+{
+ __irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK;
+}
+
+static inline bool irqd_msi_nomask_quirk(struct irq_data *d)
+{
+ return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK;
+}
+
#undef __irqd_to_state
static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -207,6 +207,13 @@ enum {
IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5),
/*
+ * Quirk to handle MSI implementations which do not provide
+ * masking. Currently known to affect x86, but partially
+ * handled in core code.
+ */
+ IRQ_DOMAIN_MSI_NOMASK_QUIRK = (1 << 6),
+
+ /*
* Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
* for implementation specific purposes and ignored by the
* core code.
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -114,6 +114,7 @@ static const struct irq_bit_descr irqdat
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
BIT_MASK_DESCR(IRQD_CAN_RESERVE),
+ BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_dom
continue;
irq_data = irq_domain_get_irq_data(domain, desc->irq);
- if (!can_reserve)
+ if (!can_reserve) {
irqd_clr_can_reserve(irq_data);
+ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ irqd_set_msi_nomask_quirk(irq_data);
+ }
ret = irq_domain_activate_irq(irq_data, can_reserve);
if (ret)
goto cleanup;
On Fri, Jan 31, 2020 at 6:27 AM Thomas Gleixner <[email protected]> wrote:
>
> Thomas Gleixner <[email protected]> writes:
>
> Evan tracked down a subtle race between the update of the MSI message and
> the device raising an interrupt internally on PCI devices which do not
> support MSI masking. The update of the MSI message is non-atomic and
> consists of either 2 or 3 sequential 32bit wide writes to the PCI config
> space.
>
> - Write address low 32bits
> - Write address high 32bits (If supported by device)
> - Write data
>
> When an interrupt is migrated then both address and data might change, so
> the kernel attempts to mask the MSI interrupt first. But MSI masking is
> optional, so there exist devices which do not provide it. That means that
> if the device raises an interrupt internally between the writes, an MSI
> message built from half-updated state is sent.
>
> On x86 this can lead to spurious interrupts on the wrong interrupt
> vector when the affinity setting changes both address and data. As a
> consequence the device interrupt can be lost causing the device to
> become stuck or malfunctioning.
>
> Evan tried to handle that by disabling MSI across an MSI message
> update. That's not feasible because disabling MSI has issues of its own:
>
> If MSI is disabled the PCI device is routing an interrupt to the legacy
> INTx mechanism. The INTx delivery can be disabled, but the disablement is
> not working on all devices.
>
> Some devices lose interrupts when both MSI and INTx delivery are disabled.
>
> Another way to solve this would be to enforce the allocation of the same
> vector on all CPUs in the system for this kind of screwed devices. That
> could be done, but it would bring back the vector space exhaustion problems
> which got solved a few years ago.
>
> Fortunately the high address (if supported by the device) is only relevant
> when X2APIC is enabled which implies interrupt remapping. In the interrupt
> remapping case the affinity setting is happening at the interrupt remapping
> unit and the PCI MSI message is programmed only once when the PCI device is
> initialized.
>
> That makes it possible to solve it with a two step update:
>
> 1) Target the MSI msg to the new vector on the current target CPU
>
> 2) Target the MSI msg to the new vector on the new target CPU
>
> In both cases writing the MSI message is only changing a single 32bit word
> which prevents the issue of inconsistency.
>
> After writing the final destination it is necessary to check whether the
> device issued an interrupt while the intermediate state #1 (new vector,
> current CPU) was in effect.
>
> This is possible because the affinity change is always happening on the
> current target CPU. The code runs with interrupts disabled, so the
> interrupt can be detected by checking the IRR of the local APIC. If the
> vector is pending in the IRR then the interrupt is retriggered on the new
> target CPU by sending an IPI for the associated vector on the target CPU.
>
> This can cause spurious interrupts on both the local and the new target
> CPU.
>
> 1) If the new vector is not in use on the local CPU and the device
> affected by the affinity change raised an interrupt during the
> transitional state (step #1 above) then interrupt entry code will
> ignore that spurious interrupt. The vector is marked so that the
> 'No irq handler for vector' warning is suppressed once.
>
> 2) If the new vector is in use already on the local CPU then the IRR check
> might see a pending interrupt from the device which is using this
> vector. The IPI to the new target CPU will then invoke the handler of
> the device, which got the affinity change, even if that device did not
> issue an interrupt.
>
> 3) If the new vector is in use already on the local CPU and the device
> affected by the affinity change raised an interrupt during the
> transitional state (step #1 above) then the handler of the device which
> uses that vector on the local CPU will be invoked.
>
> #1 is uninteresting and has no unintended side effects. #2 and #3 might
> expose issues in device driver interrupt handlers which are not prepared to
> handle a spurious interrupt correctly. This is not a regression, it's just
> exposing something which was already broken as spurious interrupts can
> happen for a lot of reasons and all driver handlers need to be able to deal
> with them.
>
> Reported-by: Evan Green <[email protected]>
> Debugged-by: Evan Green <[email protected]> Signed-off-by: Thomas Gleixner <[email protected]>
Heh, thanks for the credit. Something weird happened on this line with
your signoff, though.
I've been running this on my system for a few hours with no issues
(normal repro in <1 minute). So,
Tested-by: Evan Green <[email protected]>
Evan Green <[email protected]> writes:
> On Fri, Jan 31, 2020 at 6:27 AM Thomas Gleixner <[email protected]> wrote:
>> #1 is uninteresting and has no unintended side effects. #2 and #3 might
>> expose issues in device driver interrupt handlers which are not prepared to
>> handle a spurious interrupt correctly. This not a regression, it's just
>> exposing something which was already broken as spurious interrupts can
>> happen for a lot of reasons and all driver handlers need to be able to deal
>> with them.
>>
>> Reported-by: Evan Green <[email protected]>
>> Debugged-by: Evan Green <[email protected]> Signed-off-by: Thomas Gleixner <[email protected]>
>
> Heh, thanks for the credit.
Thanks for the detective work and the persistence to convince me!
> Something weird happened on this line with your signoff, though.
Yeah. No idea how I fatfingered that.
> I've been running this on my system for a few hours with no issues
> (normal repro in <1 minute). So,
>
> Tested-by: Evan Green <[email protected]>
Thanks for confirmation!
tglx
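Cases #2 and #3 above mean a driver can see its handler invoked although its device never raised an interrupt, so handlers have to check the hardware before acting. A generic sketch of that pattern follows, with a purely hypothetical device register layout (not taken from any real driver):

#include <linux/interrupt.h>
#include <linux/io.h>

/* Hypothetical device: a status register with a "work pending" bit. */
#define DEV_REG_STATUS		0x10
#define DEV_STATUS_PENDING	0x1

struct mydev {
	void __iomem *regs;
};

/* Registered with e.g. request_irq(irq, mydev_irq, 0, "mydev", dev). */
static irqreturn_t mydev_irq(int irq, void *data)
{
	struct mydev *dev = data;
	u32 status = readl(dev->regs + DEV_REG_STATUS);

	/*
	 * A spurious invocation (for instance during an MSI affinity
	 * change) arrives with no work pending; report IRQ_NONE rather
	 * than acting on state that is not there.
	 */
	if (!(status & DEV_STATUS_PENDING))
		return IRQ_NONE;

	/* Acknowledge and do the real work. */
	writel(status, dev->regs + DEV_REG_STATUS);
	return IRQ_HANDLED;
}

Returning IRQ_NONE instead of IRQ_HANDLED when there is no work also keeps the core's spurious interrupt accounting meaningful.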
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 6f1a4891a5928a5969c87fa5a584844c983ec823
Gitweb: https://git.kernel.org/tip/6f1a4891a5928a5969c87fa5a584844c983ec823
Author: Thomas Gleixner <[email protected]>
AuthorDate: Fri, 31 Jan 2020 15:26:52 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Sat, 01 Feb 2020 09:31:47 +01:00
x86/apic/msi: Plug non-maskable MSI affinity race
Evan tracked down a subtle race between the update of the MSI message and
the device raising an interrupt internally on PCI devices which do not
support MSI masking. The update of the MSI message is non-atomic and
consists of either 2 or 3 sequential 32bit wide writes to the PCI config
space.
- Write address low 32bits
- Write address high 32bits (If supported by device)
- Write data
When an interrupt is migrated then both address and data might change, so
the kernel attempts to mask the MSI interrupt first. But MSI masking is
optional, so there exist devices which do not provide it. That means that
if the device raises an interrupt internally between the writes, an MSI
message built from half-updated state is sent.
On x86 this can lead to spurious interrupts on the wrong interrupt
vector when the affinity setting changes both address and data. As a
consequence the device interrupt can be lost causing the device to
become stuck or malfunctioning.
Evan tried to handle that by disabling MSI across an MSI message
update. That's not feasible because disabling MSI has issues of its own:
If MSI is disabled the PCI device is routing an interrupt to the legacy
INTx mechanism. The INTx delivery can be disabled, but the disablement is
not working on all devices.
Some devices lose interrupts when both MSI and INTx delivery are disabled.
Another way to solve this would be to enforce the allocation of the same
vector on all CPUs in the system for this kind of screwed devices. That
could be done, but it would bring back the vector space exhaustion problems
which got solved a few years ago.
Fortunately the high address (if supported by the device) is only relevant
when X2APIC is enabled which implies interrupt remapping. In the interrupt
remapping case the affinity setting is happening at the interrupt remapping
unit and the PCI MSI message is programmed only once when the PCI device is
initialized.
That makes it possible to solve it with a two step update:
1) Target the MSI msg to the new vector on the current target CPU
2) Target the MSI msg to the new vector on the new target CPU
In both cases writing the MSI message is only changing a single 32bit word
which prevents the issue of inconsistency.
After writing the final destination it is necessary to check whether the
device issued an interrupt while the intermediate state #1 (new vector,
current CPU) was in effect.
This is possible because the affinity change is always happening on the
current target CPU. The code runs with interrupts disabled, so the
interrupt can be detected by checking the IRR of the local APIC. If the
vector is pending in the IRR then the interrupt is retriggered on the new
target CPU by sending an IPI for the associated vector on the target CPU.
This can cause spurious interrupts on both the local and the new target
CPU.
1) If the new vector is not in use on the local CPU and the device
affected by the affinity change raised an interrupt during the
transitional state (step #1 above) then interrupt entry code will
ignore that spurious interrupt. The vector is marked so that the
'No irq handler for vector' warning is suppressed once.
2) If the new vector is in use already on the local CPU then the IRR check
might see a pending interrupt from the device which is using this
vector. The IPI to the new target CPU will then invoke the handler of
the device, which got the affinity change, even if that device did not
issue an interrupt.
3) If the new vector is in use already on the local CPU and the device
affected by the affinity change raised an interrupt during the
transitional state (step #1 above) then the handler of the device which
uses that vector on the local CPU will be invoked.
#1 is uninteresting and has no unintended side effects. #2 and #3 might
expose issues in device driver interrupt handlers which are not prepared to
handle a spurious interrupt correctly. This is not a regression, it's just
exposing something which was already broken as spurious interrupts can
happen for a lot of reasons and all driver handlers need to be able to deal
with them.
Reported-by: Evan Green <[email protected]>
Debugged-by: Evan Green <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Tested-by: Evan Green <[email protected]>
Cc: [email protected]
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/include/asm/apic.h | 8 ++-
arch/x86/kernel/apic/msi.c | 128 ++++++++++++++++++++++++++++++++++-
include/linux/irq.h | 18 +++++-
include/linux/irqdomain.h | 7 ++-
kernel/irq/debugfs.c | 1 +-
kernel/irq/msi.c | 5 +-
6 files changed, 163 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index be0b9cf..19e94af 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -454,6 +454,14 @@ static inline void ack_APIC_irq(void)
apic_eoi();
}
+
+static inline bool lapic_vector_set_in_irr(unsigned int vector)
+{
+ u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+
+ return !!(irr & (1U << (vector % 32)));
+}
+
static inline unsigned default_get_apic_id(unsigned long x)
{
unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 7f75334..159bd0c 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -23,10 +23,8 @@
static struct irq_domain *msi_default_domain;
-static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg)
{
- struct irq_cfg *cfg = irqd_cfg(data);
-
msg->address_hi = MSI_ADDR_BASE_HI;
if (x2apic_enabled())
@@ -47,6 +45,127 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
MSI_DATA_VECTOR(cfg->vector);
}
+static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg);
+}
+
+static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
+{
+ struct msi_msg msg[2] = { [1] = { }, };
+
+ __irq_msi_compose_msg(cfg, msg);
+ irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
+}
+
+static int
+msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
+{
+ struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
+ struct irq_data *parent = irqd->parent_data;
+ unsigned int cpu;
+ int ret;
+
+ /* Save the current configuration */
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
+ old_cfg = *cfg;
+
+ /* Allocate a new target vector */
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ /*
+ * For non-maskable and non-remapped MSI interrupts the migration
+ * to a different destination CPU and a different vector has to be
+ * done careful to handle the possible stray interrupt which can be
+ * caused by the non-atomic update of the address/data pair.
+ *
+ * Direct update is possible when:
+ * - The MSI is maskable (remapped MSI does not use this code path)).
+ * The quirk bit is not set in this case.
+ * - The new vector is the same as the old vector
+ * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
+ * - The new destination CPU is the same as the old destination CPU
+ */
+ if (!irqd_msi_nomask_quirk(irqd) ||
+ cfg->vector == old_cfg.vector ||
+ old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
+ cfg->dest_apicid == old_cfg.dest_apicid) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Paranoia: Validate that the interrupt target is the local
+ * CPU.
+ */
+ if (WARN_ON_ONCE(cpu != smp_processor_id())) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Redirect the interrupt to the new vector on the current CPU
+ * first. This might cause a spurious interrupt on this vector if
+ * the device raises an interrupt right between this update and the
+ * update to the final destination CPU.
+ *
+ * If the vector is in use then the installed device handler will
+ * denote it as spurious which is no harm as this is a rare event
+ * and interrupt handlers have to cope with spurious interrupts
+ * anyway. If the vector is unused, then it is marked so it won't
+ * trigger the 'No irq handler for vector' warning in do_IRQ().
+ *
+ * This requires to hold vector lock to prevent concurrent updates to
+ * the affected vector.
+ */
+ lock_vector_lock();
+
+ /*
+ * Mark the new target vector on the local CPU if it is currently
+ * unused. Reuse the VECTOR_RETRIGGERED state which is also used in
+ * the CPU hotplug path for a similar purpose. This cannot be
+ * undone here as the current CPU has interrupts disabled and
+ * cannot handle the interrupt before the whole set_affinity()
+ * section is done. In the CPU unplug case, the current CPU is
+ * about to vanish and will not handle any interrupts anymore. The
+ * vector is cleaned up when the CPU comes online again.
+ */
+ if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector])))
+ this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED);
+
+ /* Redirect it to the new vector on the local CPU temporarily */
+ old_cfg.vector = cfg->vector;
+ irq_msi_update_msg(irqd, &old_cfg);
+
+ /* Now transition it to the target CPU */
+ irq_msi_update_msg(irqd, cfg);
+
+ /*
+ * All interrupts after this point are now targeted at the new
+ * vector/CPU.
+ *
+ * Drop vector lock before testing whether the temporary assignment
+ * to the local CPU was hit by an interrupt raised in the device,
+ * because the retrigger function acquires vector lock again.
+ */
+ unlock_vector_lock();
+
+ /*
+ * Check whether the transition raced with a device interrupt and
+ * is pending in the local APICs IRR. It is safe to do this outside
+ * of vector lock as the irq_desc::lock of this interrupt is still
+ * held and interrupts are disabled: The check is not accessing the
+ * underlying vector store. It's just checking the local APIC's
+ * IRR.
+ */
+ if (lapic_vector_set_in_irr(cfg->vector))
+ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+
+ return ret;
+}
+
/*
* IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
* which implement the MSI or MSI-X Capability Structure.
@@ -58,6 +177,7 @@ static struct irq_chip pci_msi_controller = {
.irq_ack = irq_chip_ack_parent,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_set_affinity = msi_set_affinity,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -146,6 +266,8 @@ void __init arch_init_msi_domain(struct irq_domain *parent)
}
if (!msi_default_domain)
pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
+ else
+ msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
}
#ifdef CONFIG_IRQ_REMAP
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 7853eb9..3ed5a05 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -209,6 +209,8 @@ struct irq_data {
* IRQD_SINGLE_TARGET - IRQ allows only a single affinity target
* IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set
* IRQD_CAN_RESERVE - Can use reservation mode
+ * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change
+ * required
*/
enum {
IRQD_TRIGGER_MASK = 0xf,
@@ -231,6 +233,7 @@ enum {
IRQD_SINGLE_TARGET = (1 << 24),
IRQD_DEFAULT_TRIGGER_SET = (1 << 25),
IRQD_CAN_RESERVE = (1 << 26),
+ IRQD_MSI_NOMASK_QUIRK = (1 << 27),
};
#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -390,6 +393,21 @@ static inline bool irqd_can_reserve(struct irq_data *d)
return __irqd_to_state(d) & IRQD_CAN_RESERVE;
}
+static inline void irqd_set_msi_nomask_quirk(struct irq_data *d)
+{
+ __irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK;
+}
+
+static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d)
+{
+ __irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK;
+}
+
+static inline bool irqd_msi_nomask_quirk(struct irq_data *d)
+{
+ return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK;
+}
+
#undef __irqd_to_state
static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 3c340db..4da8df5 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -207,6 +207,13 @@ enum {
IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5),
/*
+ * Quirk to handle MSI implementations which do not provide
+ * masking. Currently known to affect x86, but partially
+ * handled in core code.
+ */
+ IRQ_DOMAIN_MSI_NOMASK_QUIRK = (1 << 6),
+
+ /*
* Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
* for implementation specific purposes and ignored by the
* core code.
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index c1eccd4..a949bd3 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -114,6 +114,7 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
BIT_MASK_DESCR(IRQD_CAN_RESERVE),
+ BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index ad26fbc..eb95f61 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
continue;
irq_data = irq_domain_get_irq_data(domain, desc->irq);
- if (!can_reserve)
+ if (!can_reserve) {
irqd_clr_can_reserve(irq_data);
+ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ irqd_set_msi_nomask_quirk(irq_data);
+ }
ret = irq_domain_activate_irq(irq_data, can_reserve);
if (ret)
goto cleanup;
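As a footnote on lapic_vector_set_in_irr() above: the local APIC's IRR is a 256 bit register file exposed as eight 32bit registers spaced 0x10 apart, so vector / 32 selects the register and vector % 32 the bit within it. A trivial userspace sketch of the same index math (the example vector is chosen arbitrarily):

#include <stdio.h>

int main(void)
{
	unsigned int vector = 0x51;			/* example vector */
	unsigned int reg_off = vector / 32 * 0x10;	/* offset from APIC_IRR */
	unsigned int bit = vector % 32;

	/* Vector 0x51 (81) lands in APIC_IRR + 0x20, bit 17. */
	printf("APIC_IRR + 0x%x, bit %u\n", reg_off, bit);
	return 0;
}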