Impact: Eliminates an issue that can leave the system in an
unusable state.
This patch addresses an issue where device generated IRQs
are no longer seen by the kernel following IRQ affinity
migration while the device is generating IRQs at a high rate.
I have been able to consistently reproduce the problem on
some of our systems by running the following script (VICTIM_IRQ
specifies the IRQ for the aic94xx device) while a single instance
of the command
# while true; do find / -exec file {} \;; done
is keeping the filesystem activity and IRQ rate reasonably high.
#!/bin/sh
SYS_CPU_DIR=/sys/devices/system/cpu
VICTIM_IRQ=25
IRQ_MASK=f0
iteration=0
while true; do
echo $iteration
echo $IRQ_MASK > /proc/irq/$VICTIM_IRQ/smp_affinity
for cpudir in $SYS_CPU_DIR/cpu[1-9] $SYS_CPU_DIR/cpu??; do
echo 0 > $cpudir/online
done
for cpudir in $SYS_CPU_DIR/cpu[1-9] $SYS_CPU_DIR/cpu??; do
echo 1 > $cpudir/online
done
iteration=`expr $iteration + 1`
done
The root cause is a known issue already addressed for some
code paths [e.g. ack_apic_level() and the now obsolete
migrate_irq_remapped_level_desc()] where the ioapic can
misbehave when the I/O redirection table register is written
while the Remote IRR bit is set.
The proposed fix uses the same avoidance method and much
of the same code that the Interrupt Remapping code previously
used to avoid the same problem.
Successfully tested with Ingo's linux-2.6-tip (32 and 64-bit
builds) on the IBM x460, x3550 M2, x3850, and x3950 M2.
v2: modified to integrate with Yinghai Lu's
"irq: change ->set_affinity() to return status" changes
to intersecting/related code.
Signed-off-by: Gary Hade <[email protected]>
---
arch/x86/kernel/apic/io_apic.c | 68 ++++++++++++++++++++++++++++++-
1 file changed, 67 insertions(+), 1 deletion(-)
Index: linux-2.6-tip/arch/x86/kernel/apic/io_apic.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/apic/io_apic.c 2009-05-14 14:11:00.000000000 -0700
+++ linux-2.6-tip/arch/x86/kernel/apic/io_apic.c 2009-05-14 14:11:32.000000000 -0700
@@ -2296,7 +2296,8 @@ set_desc_affinity(struct irq_desc *desc,
}
static int
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+set_ioapic_irq_affinity_desc(struct irq_desc *desc,
+ const struct cpumask *mask)
{
struct irq_cfg *cfg;
unsigned long flags;
@@ -2320,6 +2321,71 @@ set_ioapic_affinity_irq_desc(struct irq_
return ret;
}
+static void
+delayed_irq_move(struct work_struct *work)
+{
+ unsigned int irq;
+ struct irq_desc *desc;
+
+ for_each_irq_desc(irq, desc) {
+ if (desc->status & IRQ_MOVE_PENDING) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ if (!desc->chip->set_affinity ||
+ !(desc->status & IRQ_MOVE_PENDING)) {
+ desc->status &= ~IRQ_MOVE_PENDING;
+ spin_unlock_irqrestore(&desc->lock, flags);
+ continue;
+ }
+ desc->chip->set_affinity(irq, desc->pending_mask);
+ spin_unlock_irqrestore(&desc->lock, flags);
+ }
+ }
+}
+
+static DECLARE_DELAYED_WORK(delayed_irq_move_work, delayed_irq_move);
+
+static int
+set_ioapic_irq_affinity_level_desc(struct irq_desc *desc)
+{
+ struct irq_cfg *cfg = desc->chip_data;
+ int ret = -1;
+
+ mask_IO_APIC_irq_desc(desc);
+ if (io_apic_level_ack_pending(cfg)) {
+ /*
+ * Interrupt in progress. Migrating irq now will change
+ * the vector information in the IO-APIC RTE which will
+ * confuse the EOI broadcast performed by cpu.
+ * So, we delay the IRQ migration.
+ */
+ schedule_delayed_work(&delayed_irq_move_work, 1);
+ ret = 0;
+ goto unmask;
+ }
+
+ /* Interrupt not in progress. We can change the vector
+ * information in the IO-APIC RTE. */
+ ret = set_ioapic_irq_affinity_desc(desc, desc->pending_mask);
+ desc->status &= ~IRQ_MOVE_PENDING;
+unmask:
+ unmask_IO_APIC_irq_desc(desc);
+ return ret;
+}
+
+static int
+set_ioapic_affinity_irq_desc(struct irq_desc *desc,
+ const struct cpumask *mask)
+{
+ if (desc->status & IRQ_LEVEL) {
+ desc->status |= IRQ_MOVE_PENDING;
+ cpumask_copy(desc->pending_mask, mask);
+ return set_ioapic_irq_affinity_level_desc(desc);
+ }
+ return set_ioapic_irq_affinity_desc(desc, mask);
+}
+
static int
set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
{
Gary Hade wrote:
> Impact: Eliminates an issue that can leave the system in an
> unusable state.
>
> This patch addresses an issue where device generated IRQs
> are no longer seen by the kernel following IRQ affinity
> migration while the device is generating IRQs at a high rate.
>
This patch doesn't seem to apply to current mainline.
What is your base?
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
Gary Hade <[email protected]> writes:
> Impact: Eliminates an issue that can leave the system in an
> unusable state.
>
> This patch addresses an issue where device generated IRQs
> are no longer seen by the kernel following IRQ affinity
> migration while the device is generating IRQs at a high rate.
>
> I have been able to consistently reproduce the problem on
> some of our systems by running the following script (VICTIM_IRQ
> specifies the IRQ for the aic94xx device) while a single instance
> of the command
> # while true; do find / -exec file {} \;; done
> is keeping the filesystem activity and IRQ rate reasonably high.
Nacked-by: "Eric W. Biederman" <[email protected]>
Again you are attempting to work around the fact that fixup_irqs
is broken.
fixup_irqs is what needs to be fixed to call these functions properly.
We have several intense debug sessions by various people including
myself that show that your delayed_irq_move function will simply not
work reliably.
Frankly simply looking at it gives me the screaming heebie jeebies.
The fact you can't reproduce the old failure cases which demonstrated
themselves as lockups in the ioapic state machines gives me no
confidence in your testing of this code.
Eric
Gary Hade <[email protected]> writes:
> Impact: Eliminates an issue that can leave the system in an
> unusable state.
>
> This patch addresses an issue where device generated IRQs
> are no longer seen by the kernel following IRQ affinity
> migration while the device is generating IRQs at a high rate.
>
> I have been able to consistently reproduce the problem on
> some of our systems by running the following script (VICTIM_IRQ
> specifies the IRQ for the aic94xx device) while a single instance
> of the command
> # while true; do find / -exec file {} \;; done
> is keeping the filesystem activity and IRQ rate reasonably high.
To be 100% clear.
If masking and checking to see if the irq was already pending was
enough to safely migrate irqs in process context, then that is how
we would always do it. I have been down that road and done some
extensive testing in the past.
I found hardware bugs in both AMD and Intel IOAPIC that make your
code demonstrably unsafe.
I was challenged by some of the software guys from Intel and eventually
they came back and told me they had talked with their hardware engineers
and I was correct.
So no. This code is totally and severely broken and we should not do
it.
You are introducing complexity and heuristics to avoid the fact that
fixup_irqs is fundamentally broken. Sure you might tweak things
so they work a little more often.
> The root cause is a known issue already addressed for some
> code paths [e.g. ack_apic_level() and the now obsolete
> migrate_irq_remapped_level_desc()] where the ioapic can
> misbehave when the I/O redirection table register is written
> while the Remote IRR bit is set.
No the reason we do this is not because of the IRR. Although
that certainly does not help.
We do this because it is not in general safe to do complicated
reprogramming to the ioapic while the hardware may send an
irq. You can lock up the hardware state machine etc.
If the workaround were as simple as you propose, a delayed-work or
busy-wait-until-the-irq-handler-is-complete variant would have been
written and used long ago.
So my reaction to this horrible afterthought is
NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO
PLEASE NO.
Eric
On Tue, Jun 02, 2009 at 09:51:27PM -0700, H. Peter Anvin wrote:
> Gary Hade wrote:
> > Impact: Eliminates an issue that can leave the system in an
> > unusable state.
> >
> > This patch addresses an issue where device generated IRQs
> > are no longer seen by the kernel following IRQ affinity
> > migration while the device is generating IRQs at a high rate.
> >
>
> This patch doesn't seem to apply to current mainline.
>
> What is your base?
git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git
which Ingo encouraged me to use because it contained some of
Yinghai Lu's recent changes that hadn't reached mainline yet.
Gary
--
Gary Hade
System x Enablement
IBM Linux Technology Center
503-578-4503 IBM T/L: 775-4503
[email protected]
http://www.ibm.com/linux/ltc
On Wed, Jun 03, 2009 at 05:03:24AM -0700, Eric W. Biederman wrote:
> Gary Hade <[email protected]> writes:
>
> > Impact: Eliminates an issue that can leave the system in an
> > unusable state.
> >
> > This patch addresses an issue where device generated IRQs
> > are no longer seen by the kernel following IRQ affinity
> > migration while the device is generating IRQs at a high rate.
> >
> > I have been able to consistently reproduce the problem on
> > some of our systems by running the following script (VICTIM_IRQ
> > specifies the IRQ for the aic94xx device) while a single instance
> > of the command
> > # while true; do find / -exec file {} \;; done
> > is keeping the filesystem activity and IRQ rate reasonably high.
>
> Nacked-by: "Eric W. Biederman" <[email protected]>
>
> Again you are attempting to work around the fact that fixup_irqs
> is broken.
>
> fixup_irqs is what needs to be fixed to call these functions properly.
>
> We have several intense debug sessions by various people including
> myself that show that your delayed_irq_move function will simply not
> work reliably.
>
> Frankly simply looking at it gives me the screaming heebie jeebies.
>
> The fact you can't reproduce the old failure cases which demonstrated
> themselves as lockups in the ioapic state machines gives me no
> confidence in your testing of this code.
Correct, after the fix was applied my testing did _not_ show
the lockups that you are referring to. I wonder if there is a
chance that the root cause of those old failures and the root
cause of issue that my fix addresses are the same?
Can you provide the test case that demonstrated the old failure
cases so I can try it on our systems? Also, do you recall what
mainline version demonstrated the old failure cases?
Thanks,
Gary
--
Gary Hade
System x Enablement
IBM Linux Technology Center
503-578-4503 IBM T/L: 775-4503
[email protected]
http://www.ibm.com/linux/ltc
Gary Hade <[email protected]> writes:
> Correct, after the fix was applied my testing did _not_ show
> the lockups that you are referring to. I wonder if there is a
> chance that the root cause of those old failures and the root
> cause of issue that my fix addresses are the same?
>
> Can you provide the test case that demonstrated the old failure
> cases so I can try it on our systems? Also, do you recall what
> mainline version demonstrated the old failure
The irq migration has already been moved to interrupt context by the
time I started working on it. And I managed to verify that there were
indeed problems with moving it out of interrupt context before my code
merged.
So if you want to reproduce it reduce your irq migration to the essentials.
Set IRQ_MOVE_PCNTXT, and always migrate the irqs from process context
immediately.
Then migrate an irq that fires at a high rate rapidly from one cpu to
another.
Right now you are insulated from most of the failures because you still
don't have IRQ_MOVE_PCNTXT. So you are only really testing your new code
in the cpu hotunplug path.
Now that I look at it in more detail you are doing a double
mask_IO_APIC_irq and unmask_IO_APIC_irq on the fast path and
duplicating the pending irq check. All of which are pretty atrocious
in and of themselves.
Eric
On Wed, Jun 03, 2009 at 02:13:23PM -0700, Eric W. Biederman wrote:
> Gary Hade <[email protected]> writes:
>
> > Correct, after the fix was applied my testing did _not_ show
> > the lockups that you are referring to. I wonder if there is a
> > chance that the root cause of those old failures and the root
> > cause of issue that my fix addresses are the same?
> >
> > Can you provide the test case that demonstrated the old failure
> > cases so I can try it on our systems? Also, do you recall what
> > mainline version demonstrated the old failure
>
> The irq migration has already been moved to interrupt context by the
> time I started working on it. And I managed to verify that there were
> indeed problems with moving it out of interrupt context before my code
> merged.
>
> So if you want to reproduce it reduce your irq migration to the essentials.
> Set IRQ_MOVE_PCNTXT, and always migrate the irqs from process context
> immediately.
>
> Then migrate an irq that fires at a high rate rapidly from one cpu to
> another.
>
> Right now you are insulated from most of the failures because you still
> don't have IRQ_MOVE_PCNTXT. So you are only really testing your new code
> in the cpu hotunplug path.
OK, I'm confused.
It sounds like you want me force IRQ_MOVE_PCNTXT so that I can
test in a configuration that you say is already broken. Why
in the heck would this config, where you expect lockups without
the fix, be a productive environment in which to test the fix?
>
> Now that I look at it in more detail you are doing a double
> mask_IO_APIC_irq and unmask_IO_APIC_irq on the fast path
Good question. I had based my changes on the previous
CONFIG_INTR_REMAP IRQ migration delay code that was doing
the same thing. I had assumed that the CONFIG_INTR_REMAP
code was correct and that the nesting of
mask_IO_APIC_irq_desc/unmask_IO_APIC_irq_desc sequences
was OK. Now that I look at how mask_IO_APIC_irq_desc and
unmask_IO_APIC_irq_desc are implemented I see that there
is e.g. no reference counting to assure that only the last
caller of unmask_IO_APIC_irq_desc wins. This reduces the
range of code that is executed with the IRQ masked.
Do you see any reason why the IRQ needs to be masked
downstream of the unmask_IO_APIC_irq_desc call that the
patch adds?
> and duplicating the pending irq check.
Yes, I was aware of this and had considered removing
the current check but I was trying to minimize the changes
as much as possible, especially those changes affecting
the normal IRQ migration path.
Removal of the current check would simplify the code
and would also eliminate the failed affinity change
requests from user level that can result from the
current check.
Do you think it would be a good idea to remove this
extra check? If so, I would prefer to do it with a
separate patch so that I can do a good job of testing it.
Thanks,
Gary
--
Gary Hade
System x Enablement
IBM Linux Technology Center
503-578-4503 IBM T/L: 775-4503
[email protected]
http://www.ibm.com/linux/ltc
On Thu, Jun 04, 2009 at 01:04:37PM -0700, Gary Hade wrote:
> On Wed, Jun 03, 2009 at 02:13:23PM -0700, Eric W. Biederman wrote:
> > Gary Hade <[email protected]> writes:
> >
> > > Correct, after the fix was applied my testing did _not_ show
> > > the lockups that you are referring to. I wonder if there is a
> > > chance that the root cause of those old failures and the root
> > > cause of issue that my fix addresses are the same?
> > >
> > > Can you provide the test case that demonstrated the old failure
> > > cases so I can try it on our systems? Also, do you recall what
> > > mainline version demonstrated the old failure
> >
> > The irq migration has already been moved to interrupt context by the
> > time I started working on it. And I managed to verify that there were
> > indeed problems with moving it out of interrupt context before my code
> > merged.
> >
> > So if you want to reproduce it reduce your irq migration to the essentials.
> > Set IRQ_MOVE_PCNTXT, and always migrate the irqs from process context
> > immediately.
> >
> > Then migrate an irq that fires at a high rate rapidly from one cpu to
> > another.
> >
> > Right now you are insulated from most of the failures because you still
> > don't have IRQ_MOVE_PCNTXT. So you are only really testing your new code
> > in the cpu hotunplug path.
>
> OK, I'm confused.
>
> It sounds like you want me force IRQ_MOVE_PCNTXT so that I can
> test in a configuration that you say is already broken. Why
> in the heck would this config, where you expect lockups without
> the fix, be a productive environment in which to test the fix?
Sorry, I did not say this well. Trying again:
Why would this config, where you already expect lockups
for reasons that you say are not addressed by the fix, be
a productive environment in which to test the fix?
Gary
--
Gary Hade
System x Enablement
IBM Linux Technology Center
503-578-4503 IBM T/L: 775-4503
[email protected]
http://www.ibm.com/linux/ltc
Gary Hade <[email protected]> writes:
> On Wed, Jun 03, 2009 at 02:13:23PM -0700, Eric W. Biederman wrote:
>> Gary Hade <[email protected]> writes:
>>
>> > Correct, after the fix was applied my testing did _not_ show
>> > the lockups that you are referring to. I wonder if there is a
>> > chance that the root cause of those old failures and the root
>> > cause of issue that my fix addresses are the same?
>> >
>> > Can you provide the test case that demonstrated the old failure
>> > cases so I can try it on our systems? Also, do you recall what
>> > mainline version demonstrated the old failure
>>
>> The irq migration has already been moved to interrupt context by the
>> time I started working on it. And I managed to verify that there were
>> indeed problems with moving it out of interrupt context before my code
>> merged.
>>
>> So if you want to reproduce it reduce your irq migration to the essentials.
>> Set IRQ_MOVE_PCNTXT, and always migrate the irqs from process context
>> immediately.
>>
>> Then migrate an irq that fires at a high rate rapidly from one cpu to
>> another.
>>
>> Right now you are insulated from most of the failures because you still
>> don't have IRQ_MOVE_PCNTXT. So you are only really testing your new code
>> in the cpu hotunplug path.
>
> OK, I'm confused.
>
> It sounds like you want me force IRQ_MOVE_PCNTXT so that I can
> test in a configuration that you say is already broken. Why
> in the heck would this config, where you expect lockups without
> the fix, be a productive environment in which to test the fix?
Because that is what the brokenness of fixup_irqs does. It
effectively forces IRQ_MOVE_PCNTXT on code that has not been written
to deal with it and the failures are because of that forcing.
>> Now that I look at it in more detail you are doing a double
>> mask_IO_APIC_irq and unmask_IO_APIC_irq on the fast path
>
> Good question. I had based my changes on the previous
> CONFIG_INTR_REMAP IRQ migration delay code that was doing
> the same thing. I had assumed that the CONFIG_INTR_REMAP
> code was correct and that the nesting of
> mask_IO_APIC_irq_desc/unmask_IO_APIC_irq_desc sequences
> was OK. Now that I look at how mask_IO_APIC_irq_desc and
> unmask_IO_APIC_irq_desc are implemented I see that there
> is e.g. no reference counting to assure that only the last
> caller of unmask_IO_APIC_irq_desc wins. This reduces the
> range of code that is executed with the IRQ masked.
>
> Do you see any reason why the IRQ needs to be masked
> downstream of the unmask_IO_APIC_irq_desc call that the
> patch adds?
No. However I do see reason why the irq needs to be masked
upstream of where the mask call your patch adds.
>> and duplicating the pending irq check.
>
> Yes, I was aware of this and had considered removing
> the current check but I was trying to minimize the changes
> as much as possible, especially those changes affecting
> the normal IRQ migration path.
>
> Removal of the current check would simplify the code
> and would also eliminate the failed affinity change
> requests from user level that can result from the
> current check.
>
> Do you think it would be a good idea to remove this
> extra check? If so, I would prefer to do it with a
> separate patch so that I can do a good job of testing it.
Well I tell you what.
fixup_irqs already has a bunch of heuristics to make the code
mostly work when migrating irqs from process context.
If you can limit your changes to that path I will be sad because
we have not completely fixed the problem. But at least we
are working on something that has a shot.
Which means you won't destabilize the existing code
by modifying its set_affinity methods.
Something like:
/*
 * Move a single irq's affinity off cpus that are going away.
 *
 * If the irq has no action or its affinity already equals the online
 * mask, nothing is done.  If no online cpu remains in the mask, the
 * affinity is "broken" to cpu_all_mask.  Irqs flagged IRQ_MOVE_PCNTXT
 * are handed to irq_set_affinity() with the descriptor lock dropped;
 * all others are reprogrammed directly through the chip's
 * set_affinity method while masked.
 * NOTE(review): per the surrounding discussion, masking alone may not
 * be sufficient on ioapics while Remote IRR is set -- confirm against
 * the ioapic errata before relying on this path.
 */
static void fixup_irq_affinity(unsigned int irq, struct irq_desc *desc)
{
const struct cpumask *affinity;
int break_affinity = 0;
int set_affinity = 1;
/* interrupts are disabled at this point */
spin_lock(&desc->lock);
affinity = desc->affinity;
/* Nothing to migrate: unused irq, or mask already spans online cpus. */
if (!irq_has_action(irq) ||
cpumask_equal(affinity, cpu_online_mask)) {
spin_unlock(&desc->lock);
return;
}
/* No online cpu left in the requested mask: fall back to all cpus. */
if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
break_affinity = 1;
affinity = cpu_all_mask;
}
if (desc->status & IRQ_MOVE_PCNTXT) {
/* Process-context migration is safe here; call without the lock. */
spin_unlock(&desc->lock);
if (irq_set_affinity(irq, affinity) != 0)
set_affinity = 0;
} else {
/* Quiesce the line while rewriting the routing information. */
if (desc->chip->mask)
desc->chip->mask(irq);
if (desc->chip->set_affinity) {
/* Any additional weird stability hacks can go here */
desc->chip->set_affinity(irq, affinity);
} else
set_affinity = 0;
if (desc->chip->unmask)
desc->chip->unmask(irq);
spin_unlock(&desc->lock);
}
if (break_affinity && set_affinity)
printk("Broke affinity for irq %i\n", irq);
else if (!set_affinity)
printk("Cannot set affinity for irq %i\n", irq);
/* Do something equivalent of sending the cleanup ipi here */
}
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
/*
 * Walks every irq descriptor and re-targets any irq via
 * fixup_irq_affinity().  Runs with interrupts disabled (see the
 * comment in fixup_irq_affinity).
 */
void fixup_irqs(void)
{
unsigned int irq;
struct irq_desc *desc;
for_each_irq_desc(irq, desc) {
/* Sparse descriptor arrays may contain holes. */
if (!desc)
continue;
/* irq 2 is skipped -- presumably the cascade irq; TODO confirm. */
if (irq == 2)
continue;
fixup_irq_affinity(irq, desc);
}
}
Eric