By precomputing old_mask I remove an extra if statement, remove an
indentation level and make the code slightly easier to read.
This comes before my bugfix in the next patch so the patch and the
resulting code stay readable.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/io_apic.c | 13 +++++--------
1 files changed, 5 insertions(+), 8 deletions(-)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 6be6730..d1fcd4b 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -685,6 +685,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+ cpumask_t old_mask = CPU_MASK_NONE;
int old_vector = -1;
int cpu;
@@ -699,11 +700,12 @@ static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
cpus_and(*result, irq_domain[irq], mask);
if (!cpus_empty(*result))
return old_vector;
+ cpus_and(old_mask, irq_domain[irq], cpu_online_map);
}
for_each_cpu_mask(cpu, mask) {
cpumask_t domain, new_mask;
- int new_cpu;
+ int new_cpu, old_cpu;
int vector, offset;
domain = vector_allocation_domain(cpu);
@@ -728,13 +730,8 @@ next:
/* Found one! */
current_vector = vector;
current_offset = offset;
- if (old_vector >= 0) {
- cpumask_t old_mask;
- int old_cpu;
- cpus_and(old_mask, irq_domain[irq], cpu_online_map);
- for_each_cpu_mask(old_cpu, old_mask)
- per_cpu(vector_irq, old_cpu)[old_vector] = -1;
- }
+ for_each_cpu_mask(old_cpu, old_mask)
+ per_cpu(vector_irq, old_cpu)[old_vector] = -1;
for_each_cpu_mask(new_cpu, new_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
irq_vector[irq] = vector;
--
1.4.4.1.g278f
When making the interrupt vectors per cpu I failed to handle a case
during irq migration. If the same interrupt comes in while we are
servicing the irq, but before we migrate it, the pending bit in the
local apic IRR register will be set for that irq.
After migrating the irq to another cpu and/or vector the data
structures will no longer be set up to handle this pending irq. Then as
soon as we return from servicing the irq we just migrated we will get
a nasty: "No irq handler for vector" message.
Since we do not disable irqs for edge triggered interrupts except
in the smallest possible window during migration, I cannot avoid
migrating an irq in the unlikely event that it becomes pending.
This is because by the time the irq could no longer become pending
I have already updated all of my data structures.
Therefore this patch introduces an intermediate state that
exists solely on the cpu where we are handling the irq during
migration. The irq number is changed to negative in the
vector_irq data structure.
Once the migration operation is complete we know we will receive
no more interrupts on this vector so the irq pending state for
this irq will no longer be updated. If the irq is not pending and
we are in the intermediate state we immediately free the vector;
otherwise we free the vector in do_IRQ when the pending irq
arrives.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/io_apic.c | 56 ++++++++++++++++++++++++++++++++++++++---
arch/x86_64/kernel/irq.c | 19 +++++++++++++-
2 files changed, 69 insertions(+), 6 deletions(-)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index d1fcd4b..ffcb5f6 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -730,9 +730,17 @@ next:
/* Found one! */
current_vector = vector;
current_offset = offset;
- for_each_cpu_mask(old_cpu, old_mask)
- per_cpu(vector_irq, old_cpu)[old_vector] = -1;
- for_each_cpu_mask(new_cpu, new_mask)
+ for_each_cpu_mask(old_cpu, old_mask) {
+ int free = -1;
+ /* When migrating we need to preserve the old
+ * vector until we have processed all of the
+ * pending irqs.
+ */
+ if (old_cpu == smp_processor_id())
+ free = -irq;
+ per_cpu(vector_irq, old_cpu)[old_vector] = free;
+ }
+ for_each_cpu_mask(new_cpu, new_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
irq_vector[irq] = vector;
irq_domain[irq] = domain;
@@ -1389,6 +1397,37 @@ static int ioapic_retrigger_irq(unsigned int irq)
return 1;
}
+static unsigned apic_in_service_vector(void)
+{
+ unsigned isr, vector;
+ /* Figure out which vector we are servicing */
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < FIRST_SYSTEM_VECTOR; vector += 32) {
+ isr = apic_read(APIC_ISR + ((vector/32) * 0x10));
+ if (isr)
+ break;
+ }
+ /* Find the low bits of the vector we are servicing */
+ vector += __ffs(isr);
+ return vector;
+}
+
+static void apic_handle_pending_vector(unsigned vector)
+{
+ unsigned irr;
+ int irq;
+
+ irq = __get_cpu_var(vector_irq)[vector];
+ if (irq >= 0)
+ return;
+
+ /* If the irq we are servicing has moved and is not pending
+ * free its vector.
+ */
+ irr = apic_read(APIC_IRR + ((vector/32) * 0x10));
+ if (!(irr & (1 << (vector % 32))))
+ __get_cpu_var(vector_irq)[vector] = -1;
+}
+
/*
* Level and edge triggered IO-APIC interrupts need different handling,
* so we use two separate IRQ descriptors. Edge triggered IRQs can be
@@ -1400,19 +1439,24 @@ static int ioapic_retrigger_irq(unsigned int irq)
static void ack_apic_edge(unsigned int irq)
{
- move_native_irq(irq);
+ if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
+ move_native_irq(irq);
+ apic_handle_pending_vector(apic_in_service_vector());
+ }
ack_APIC_irq();
}
static void ack_apic_level(unsigned int irq)
{
int do_unmask_irq = 0;
+ unsigned int vector = 0;
#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
/* If we are moving the irq we need to mask it */
if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
do_unmask_irq = 1;
mask_IO_APIC_irq(irq);
+ vector = apic_in_service_vector();
}
#endif
@@ -1424,8 +1468,10 @@ static void ack_apic_level(unsigned int irq)
/* Now we can move and re-enable the irq */
move_masked_irq(irq);
- if (unlikely(do_unmask_irq))
+ if (unlikely(do_unmask_irq)) {
+ apic_handle_pending_vector(vector);
unmask_IO_APIC_irq(irq);
+ }
}
static struct irq_chip ioapic_chip __read_mostly = {
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 648055a..0ff5fbc 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -97,6 +97,23 @@ skip:
return 0;
}
+static inline unsigned int irq_from_vector(unsigned int vector)
+{
+ int irq;
+ irq = __get_cpu_var(vector_irq)[vector];
+
+ /* If we changed vectors during migration and we had a pending
+ * irq, we left the irq allocated on this cpu. Now that the
+ * pending irq has arrived get the irq number and free this
+ * vector.
+ */
+ if (irq < -1) {
+ __get_cpu_var(vector_irq)[vector] = -1;
+ irq = -irq;
+ }
+ return irq;
+}
+
/*
* do_IRQ handles all normal device IRQ's (the special
* SMP cross-CPU interrupts have their own specific
@@ -112,7 +129,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
exit_idle();
irq_enter();
- irq = __get_cpu_var(vector_irq)[vector];
+ irq = irq_from_vector(vector);
#ifdef CONFIG_DEBUG_STACKOVERFLOW
stack_overflow_check(regs);
--
1.4.4.1.g278f
On Fri, 02 Feb 2007 17:35:31 -0700
[email protected] (Eric W. Biederman) wrote:
> When making the interrupt vectors per cpu I failed to handle a case
> during irq migration. If the same interrupt comes in while we are
> servicing the irq, but before we migrate it, the pending bit in the
> local apic IRR register will be set for that irq.
>
> After migrating the irq to another cpu and/or vector the data
> structures will no longer be set up to handle this pending irq. Then as
> soon as we return from servicing the irq we just migrated we will get
> a nasty: "No irq handler for vector" message.
>
> Since we do not disable irqs for edge triggered interrupts except
> in the smallest possible window during migration, I cannot avoid
> migrating an irq in the unlikely event that it becomes pending.
> This is because by the time the irq could no longer become pending
> I have already updated all of my data structures.
>
> Therefore this patch introduces an intermediate state that
> exists solely on the cpu where we are handling the irq during
> migration. The irq number is changed to negative in the
> vector_irq data structure.
>
> Once the migration operation is complete we know we will receive
> no more interrupts on this vector so the irq pending state for
> this irq will no longer be updated. If the irq is not pending and
> we are in the intermediate state we immediately free the vector;
> otherwise we free the vector in do_IRQ when the pending irq
> arrives.
So is this a for-2.6.20 thing? The bug was present in 2.6.19, so
I assume it doesn't affect many people?
Andrew Morton <[email protected]> writes:
> So is this a for-2.6.20 thing? The bug was present in 2.6.19, so
> I assume it doesn't affect many people?
If it's not too late, and this patch isn't too scary.
It's a really rare set of circumstances that trigger it, but the
possibility of being hit is pretty widespread: anything with
more than one cpu and more than one irq could see this.
The easiest way to trigger this is to have two level triggered irqs on
two different cpus using the same vector. In that case if one acks
its irq while the other irq is migrating to a different cpu, 2.6.19
gets completely confused and stops handling interrupts properly.
With my previous bug fix (not to drop the ack when we are confused)
the machine will stay up, and that is obviously correct and can't
affect anything else, so it is probably a candidate for the stable tree.
With this fix everything just works.
I don't know how often a legitimate case of the exact same irq
going off twice in a row occurs, but that is a possibility as well,
especially with edge triggered interrupts.
Setting up the test scenario was a pain, but by extremely limiting
my choice of vectors I was able to confirm I survived several hundred
of these events within a couple of minutes, no problem.
Eric
On Fri, 02 Feb 2007 18:39:15 -0700
[email protected] (Eric W. Biederman) wrote:
> Andrew Morton <[email protected]> writes:
>
> > So is this a for-2.6.20 thing? The bug was present in 2.6.19, so
> > I assume it doesn't affect many people?
>
> If it's not too late, and this patch isn't too scary.
>
> It's a really rare set of circumstances that trigger it, but the
> possibility of being hit is pretty widespread: anything with
> more than one cpu and more than one irq could see this.
>
> The easiest way to trigger this is to have two level triggered irqs on
> two different cpus using the same vector. In that case if one acks
> its irq while the other irq is migrating to a different cpu, 2.6.19
> gets completely confused and stops handling interrupts properly.
>
> With my previous bug fix (not to drop the ack when we are confused)
> the machine will stay up, and that is obviously correct and can't
> affect anything else, so it is probably a candidate for the stable tree.
>
> With this fix everything just works.
>
> I don't know how often a legitimate case of the exact same irq
> going off twice in a row occurs, but that is a possibility as well,
> especially with edge triggered interrupts.
>
> Setting up the test scenario was a pain, but by extremely limiting
> my choice of vectors I was able to confirm I survived several hundred
> of these events within a couple of minutes, no problem.
>
OK, thanks. Let's await Andi's feedback.
> > Once the migration operation is complete we know we will receive
> > no more interrupts on this vector so the irq pending state for
> > this irq will no longer be updated. If the irq is not pending and
> > we are in the intermediate state we immediately free the vector;
> > otherwise we free the vector in do_IRQ when the pending irq
> > arrives.
>
> So is this a for-2.6.20 thing? The bug was present in 2.6.19, so
> I assume it doesn't affect many people?
I got a few reports of this; irqbalance may trigger this kernel bug,
it seems... I would suggest considering this for 2.6.20 since it's a
hard-hang case.
Arjan van de Ven <[email protected]> writes:
>> > Once the migration operation is complete we know we will receive
>> > no more interrupts on this vector so the irq pending state for
>> > this irq will no longer be updated. If the irq is not pending and
>> > we are in the intermediate state we immediately free the vector;
>> > otherwise we free the vector in do_IRQ when the pending irq
>> > arrives.
>>
>> So is this a for-2.6.20 thing? The bug was present in 2.6.19, so
>> I assume it doesn't affect many people?
>
> I got a few reports of this; irqbalance may trigger this kernel bug,
> it seems... I would suggest considering this for 2.6.20 since it's a
> hard-hang case.
Yes. The bug I fixed will not happen if you don't migrate irqs.
At the very least we want the patch below (already in -mm)
that makes it not a hard hang case.
Subject: [PATCH] x86_64: Survive having no irq mapping for a vector
Occasionally the kernel has bugs that result in no irq being
found for a given cpu vector. If we acknowledge the irq
the system has a good chance of continuing even though we dropped
and missed an irq message. If we simply continue to print a
message and drop the irq without acknowledging it, the system is
likely to become non-responsive shortly thereafter.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/irq.c | 11 ++++++++---
1 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 0c06af6..648055a 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -120,9 +120,14 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
if (likely(irq < NR_IRQS))
generic_handle_irq(irq);
- else if (printk_ratelimit())
- printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
- __func__, smp_processor_id(), vector);
+ else {
+ if (!disable_apic)
+ ack_APIC_irq();
+
+ if (printk_ratelimit())
+ printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
+ __func__, smp_processor_id(), vector);
+ }
irq_exit();
--
1.4.4.1.g278f
On Saturday 03 February 2007 01:31, Eric W. Biederman wrote:
>
> By precomputing old_mask I remove an extra if statement, remove an
> indentation level and make the code slightly easier to read.
>
> This comes before my bugfix in the next patch so the patch and the
> resulting code stay readable.
Fine for me.
-Andi
> Once the migration operation is complete we know we will receive
> no more interrupts on this vector so the irq pending state for
> this irq will no longer be updated. If the irq is not pending and
> we are in the intermediate state we immediately free the vector;
> otherwise we free the vector in do_IRQ when the pending irq
> arrives.
Ok for me, although the magic numbers are a little nasty.
What about i386?
-Andi
Andi Kleen <[email protected]> writes:
>> Once the migration operation is complete we know we will receive
>> no more interrupts on this vector so the irq pending state for
>> this irq will no longer be updated. If the irq is not pending and
>> we are in the intermediate state we immediately free the vector;
>> otherwise we free the vector in do_IRQ when the pending irq
>> arrives.
>
> Ok for me, although the magic numbers are a little nasty.
You must be talking about (vector/32) * 0x10.
The 32 is the number of bits per register and 0x10 is the gap between apic
registers. I couldn't think of a better form. If someone
can think of a better way it probably warrants a cleanup patch
at some point.
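If someone does pick that up, tiny helpers would do it; purely
illustrative:

/* Each IRR/ISR register covers 32 vectors, and consecutive
 * registers sit 0x10 apart in the local APIC register map.
 */
static inline unsigned apic_isr_reg(unsigned vector)
{
	return APIC_ISR + (vector / 32) * 0x10;
}

static inline unsigned apic_irr_reg(unsigned vector)
{
	return APIC_IRR + (vector / 32) * 0x10;
}

static inline unsigned apic_vector_mask(unsigned vector)
{
	return 1 << (vector % 32);
}

The reads then become apic_read(apic_isr_reg(vector)) and
apic_read(apic_irr_reg(vector)) & apic_vector_mask(vector).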
> What about i386?
i386 does not handle this case, but since it is still globally
allocating all of its vectors and never changes its vectors during
migration, it is totally harmless when an irq comes in on a cpu other
than the one we are expecting it on.
Eric
On Saturday 03 February 2007 11:22, Eric W. Biederman wrote:
> Andi Kleen <[email protected]> writes:
>
> >> Once the migration operation is complete we know we will receive
> >> no more interrupts on this vector so the irq pending state for
> >> this irq will no longer be updated. If the irq is not pending and
> >> we are in the intermediate state we immediately free the vector;
> >> otherwise we free the vector in do_IRQ when the pending irq
> >> arrives.
> >
> > Ok for me, although the magic numbers are a little nasty.
>
> You must be talking about (vector/32) * 0x10.
No, I meant the -1s.
-Andi
As I reported when I tested this patch, it works, but I could see an
abnormally high load average while triggering the error message. Anyway, it
is better to have a load average three or four times higher than what
you would expect than a crash/reboot, isn't it? :)
Luigi Genoni
p.s.
will test the other definitive patch on Monday on both the 8 and 16 CPU
systems.
* Eric W. Biederman <[email protected]> wrote:
> When making the interrupt vectors per cpu I failed to handle a case
> during irq migration. If the same interrupt comes in while we are
> servicing the irq, but before we migrate it, the pending bit in the
> local apic IRR register will be set for that irq.
hm. I think this needs more work. For example this part of the fix looks
quite ugly to me:
> +static unsigned apic_in_service_vector(void)
> +{
> + unsigned isr, vector;
> + /* Figure out which vector we are servicing */
> + for (vector = FIRST_EXTERNAL_VECTOR; vector < FIRST_SYSTEM_VECTOR; vector += 32) {
> + isr = apic_read(APIC_ISR + ((vector/32) * 0x10));
> + if (isr)
> + break;
> + }
> + /* Find the low bits of the vector we are servicing */
> + vector += __ffs(isr);
> + return vector;
so we read the hardware to figure out what the hell we are doing. The
problem is - why doesn't the kernel know at this point what it is doing?
It knows the CPU and it should know all the vector numbers. It also has
an irq number.
> + /* If the irq we are servicing has moved and is not pending
> + * free its vector.
> + */
> + irr = apic_read(APIC_IRR + ((vector/32) * 0x10));
the IRR is quite fragile. It's a rarely used hardware field and it has
errata attached to it. Again, it seems like this function too tries to
recover information it /should/ already have.
> @@ -1400,19 +1439,24 @@ static int ioapic_retrigger_irq(unsigned int irq)
>
> static void ack_apic_edge(unsigned int irq)
> {
> - move_native_irq(irq);
> + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
> + move_native_irq(irq);
> + apic_handle_pending_vector(apic_in_service_vector());
> + }
> ack_APIC_irq();
this looks a bit ugly and a bit fragile. We had a simple
'move_native_irq()' call for IRQ balancing, which is straightforward,
but now we've got this complex condition open-coded.
If so, this should be done in some sort of helper - but even then, the
whole approach looks a bit erroneous.
To me the cleanest way to migrate an IRQ between two different vectors
on two different CPUs would be to first mask the IRQ source in the PIC,
then to do the migration /atomically/ (no intermediary state), and then
unmask. If the PIC loses edges while masked then that's a problem of the
irq chip implementation of the PIC: its ->set_affinity() method should
refuse to migrate edge-triggered IRQs if it can lose edges while
masked!
Ingo
Ingo, thanks for the review.
Ingo Molnar <[email protected]> writes:
> * Eric W. Biederman <[email protected]> wrote:
>
>> When making the interrupt vectors per cpu I failed to handle a case
>> during irq migration. If the same interrupt comes in while we are
>> servicing the irq, but before we migrate it, the pending bit in the
>> local apic IRR register will be set for that irq.
>
> hm. I think this needs more work. For example this part of the fix looks
> quite ugly to me:
I'm not at all certain I can make an ugly reality look beautiful.
>> +static unsigned apic_in_service_vector(void)
>> +{
>> + unsigned isr, vector;
>> + /* Figure out which vector we are servicing */
>> + for (vector = FIRST_EXTERNAL_VECTOR; vector < FIRST_SYSTEM_VECTOR; vector += 32) {
>> + isr = apic_read(APIC_ISR + ((vector/32) * 0x10));
>> + if (isr)
>> + break;
>> + }
>> + /* Find the low bits of the vector we are servicing */
>> + vector += __ffs(isr);
>> + return vector;
>
> so we read the hardware to figure out what the hell we are doing. The
> problem is - why doesn't the kernel know at this point what it is doing?
> It knows the CPU and it should know all the vector numbers. It also has
> an irq number.
Yes. And by adding a percpu global I can do this. I figured since this
should be a rare case it would be equally simple to read this from
the hardware, as it already has the information stored in roughly the
form I would need to store it in. If there are errata in this area and I
am likely to hit them, then it is probably a good idea to look at a
different implementation.
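For reference, the percpu variant would be something like this (just a
sketch; the variable is invented and would have to be maintained from
do_IRQ):

static DEFINE_PER_CPU(unsigned int, in_service_vector);

/* do_IRQ() would record the vector on entry, before handling the irq:
 *	__get_cpu_var(in_service_vector) = vector;
 * and apic_in_service_vector() would become a simple lookup:
 */
static unsigned apic_in_service_vector(void)
{
	return __get_cpu_var(in_service_vector);
}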
>> + /* If the irq we are servicing has moved and is not pending
>> + * free its vector.
>> + */
>> + irr = apic_read(APIC_IRR + ((vector/32) * 0x10));
>
> the IRR is quite fragile. It's a rarely used hardware field and it has
> errata attached to it. Again, it seems like this function too tries to
> recover information it /should/ already have.
If I am servicing an interrupt and that same interrupt comes in again
before I acknowledge it, how /should/ I know that?
The only way I know to find that information is to ask the hardware.
>> @@ -1400,19 +1439,24 @@ static int ioapic_retrigger_irq(unsigned int irq)
>>
>> static void ack_apic_edge(unsigned int irq)
>> {
>> - move_native_irq(irq);
>> + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
>> + move_native_irq(irq);
>> + apic_handle_pending_vector(apic_in_service_vector());
>> + }
>> ack_APIC_irq();
>
> this looks a bit ugly and a bit fragile. We had a simple
> 'move_native_irq()' call for IRQ balancing, which is straightforward,
> but now we've got this complex condition open-coded.
Well the condition is testing a single bit so I don't think it is that
complex. Maybe taking it out of line will help, or maybe that will obscure
things. I'm inclined to believe hiding the irq migration logic will
obscure things and make it harder to debug.
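If we do pull it out of line it is a trivial helper (the name is
invented):

static void irq_complete_move(unsigned int irq)
{
	/* The same single-bit test, just factored out */
	if (likely(!(irq_desc[irq].status & IRQ_MOVE_PENDING)))
		return;
	move_native_irq(irq);
	apic_handle_pending_vector(apic_in_service_vector());
}

and ack_apic_edge() collapses back to irq_complete_move(irq) followed
by ack_APIC_irq().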
Now part of the reason I did it this way is I have at least 3
set_affinity implementations and this issue really has nothing to do
with the external interrupt controller but everything to do with the
cpu local interrupt controller, so this did not seem like something
that could reasonably be buried in set_affinity.
> If so, this should be done in some sort of helper - but even then, the
> whole approach looks a bit erroneous.
>
> To me the cleanest way to migrate an IRQ between two different vectors
> on two different CPUs would be to first mask the IRQ source in the PIC,
> then to do the migration /atomically/ (no intermediary state), and then
> unmask. If the PIC loses edges while masked then that's a problem of the
> irq chip implementation of the PIC: its ->set_affinity() method should
> refuse to migrate edge-triggered IRQs if it can lose edges while
> masked!
Ingo, I believe what you have described is essentially what we are
doing before my patches, or what we were doing in even older versions
that had other races and problems.
To some extent I have inherited the current design that mostly works. The
only known reliable way to block an edge triggered irq is to be servicing it.
The practical problem there is that when we sit on an irq the irq can come
in again and queue up in irr, which means that once we have updated the
data structures, acked the irq, and returned, the irq will come in again
at the old location because the io_apic has a 1 deep queue for each
irq. Of course for the irq controllers that can be masked safely, your
proposed change to disable the irq is unfortunate.
You appear to be lobbying for disabling the irq asynchronously to all
of the irq reception activity. That would seem to me to require
running on the cpu where the irq is currently programmed to be
delivered, disabling local interrupts, as well as disabling the irq in
the interrupt controller, before reprogramming it. For the irqs that
applies to, there does seem to be some merit in that.
Of course the irq will queue in irr so it can be delivered when the
irqs are enabled again and therefore we have to be very careful about
data structure changes and not removing them if we have the specified
irq pending on the current cpu. So I don't see how this changes the
solution domain in any meaningful way, except by reducing the set
of interrupt controllers we can support with it, because it requires
masking.
The core question: How do I know when all interrupt messages that
have been sent have been serviced and acknowledged?
If you have the guarantee that the irq is disabled and queuing in the
interrupt controller, and all interrupt messages that have been sent
have been serviced and acknowledged, then the reprogramming problem
is simple. Otherwise we have to cope with interrupts in the cpu local
queue.
Dropping support for the more difficult cases, when we know how to
migrate them and they are in fact some of our most common interrupts
like the timer interrupt, seems an irresponsible thing to do. Now
if you can make the case that we cannot migrate them safely, that
is another issue.
I'm open to suggestions, and with my simple patch we at least won't
hang the kernel when this happens.
Eric
"Luigi Genoni" <[email protected]> writes:
btw, I tested it on the 8 CPU system and there was no way to reproduce the
bug (no messages appeared and the load average was about as high as expected).
Going to test on the 16 CPU system (the same that triggered the bug at the
beginning) immediately when it becomes free.
Thanks. Hopefully that will confirm I have properly tracked this bug.
Eric
Ingo would it be reasonable to get a wait queue so I can wait for an
irq that needs the delayed disable action to actually become masked?
There are a lot of practical reasons why I cannot reprogram an
unmasked irq. However if we can wait we should be able to get all of
the convoluted irq migration logic out of interrupt context, and into
process context.
If I know interrupts have been enabled on the cpus where an irq could be
delivered after the irq has been masked, I know that the irq is not
currently in progress.
Therefore I know the irq will not be in progress again until the irq
is unmasked.
Once I know the irq will not be received again it is safe to modify
the irq handling data structures and the interrupt controller.
I think the only generic piece missing from this equation is a wait
queue so I can wait for the irq to become masked. Say a generic
sleeping mask_irq() call?
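Roughly this, where wait_for_mask would be a new wait_queue_head_t in
struct irq_desc, woken from the irq path once the delayed disable
actually masks the line (all names hypothetical):

static int mask_irq(unsigned int irq)
{
	struct irq_desc *desc = irq_desc + irq;

	disable_irq_nosync(irq);	/* may only mark it for delayed disable */
	return wait_event_interruptible(desc->wait_for_mask,
					desc->status & IRQ_MASKED);
}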
Eric
* Eric W. Biederman <[email protected]> wrote:
> Ingo would it be reasonable to get a wait queue so I can wait for an
> irq that needs the delayed disable action to actually become masked?
that might make sense, but what will do the wakeup - incidental IRQ
arriving on the new CPU? Isn't that a bit risky - maybe the device won't
generate IRQs for a really long time.
Ingo
Ingo Molnar <[email protected]> writes:
> * Eric W. Biederman <[email protected]> wrote:
>
>> Ingo would it be reasonable to get a wait queue so I can wait for an
>> irq that needs the delayed disable action to actually become masked?
>
> that might make sense, but what will do the wakeup - incidental IRQ
> arriving on the new CPU?
That is what I was thinking.
> Isn't that a bit risky - maybe the device won't
> generate IRQs for a really long time.
Well this is in a user space context, called from user space, and it
exactly matches the semantics we have now. If we make it an
interruptible sleep the user space process shouldn't block.
I guess the other thing to do is do it in a non-blocking fashion
and just call schedule_work from the interrupt context when the
irq is disabled. For i386 with its in-kernel irq scheduler
that might be better.
I think the nasty case is probably what to do when it is
the timer interrupt we are dealing with.
Hmm. I think I should look up what the rules are for
calling local_irq_enable when in interrupt context. That
might be another way to satisfy this problem.
If local irqs are enabled I don't have to worry about the irr
register.
You've got me brainstorming now.
Eric
Ingo Molnar <[email protected]> writes:
> * Eric W. Biederman <[email protected]> wrote:
>
>> Ingo would it be reasonable to get a wait queue so I can wait for an
>> irq that needs the delayed disable action to actually become masked?
>
> that might make sense, but what will do the wakeup - incidental IRQ
> arriving on the new CPU? Isn't that a bit risky - maybe the device won't
> generate IRQs for a really long time.
I still need to test this, but I believe I have found a simpler
way to avoid irr problems during migration, and I believe the code
works equally well with either edge or level triggered interrupts.
The idea is this: Instead of trying to test for and handle the case
where an irq is pending in irr, simply enable local interrupts after
disabling and acknowledging the irq so that anything pending will be
processed before we perform the migration operation.
I don't think the edge case cares about the mask/ack order, but
masking before acking appears important for the level triggered
case, so we might as well use that order for both.
Does this look like a sane way to handle this?
static void ack_apic(unsigned int irq)
{
	struct irq_desc *desc = irq_desc + irq;
	int do_unmask_irq = 0;

	if (unlikely((irq_desc[irq].status & IRQ_MOVE_PENDING) && !in_irq())) {
		do_unmask_irq = 1;
		desc->chip->mask(irq);
	}
	ack_APIC_irq();
	if (unlikely(do_unmask_irq)) {
		/* Don't let pending irqs accumulate */
		local_irq_enable();
		synchronize_irq(irq);
		move_masked_irq(irq);
		local_irq_disable();
		desc->chip->unmask(irq);
	}
}
Eric
[email protected] (Eric W. Biederman) writes:
> Ingo Molnar <[email protected]> writes:
>
>> * Eric W. Biederman <[email protected]> wrote:
>>
>>> Ingo would it be reasonable to get a wait queue so I can wait for an
>>> irq that needs the delayed disable action to actually become masked?
>>
>> that might make sense, but what will do the wakeup - incidental IRQ
>> arriving on the new CPU? Isn't that a bit risky - maybe the device won't
>> generate IRQs for a really long time.
>
> I still need to test this, but I believe I have found a simpler
> way to avoid irr problems during migration, and I believe the code
> works equally well with either edge or level triggered interrupts.
>
> The idea is this: Instead of trying to test for and handle the case
> where an irq is pending in irr, simply enable local interrupts after
> disabling and acknowledging the irq so that anything pending will be
> processed before we perform the migration operation.
>
> I don't think the edge case cares about the mask/ack order, but
> masking before acking appears important for the level triggered
> case, so we might as well use that order for both.
>
> Does this look like a sane way to handle this?
The version I wound up testing is below, and it doesn't work.
I still get "No irq handler for vector" warnings as well as
a couple of complaints from lock/irq debugging. The debugging
doesn't worry me. The fact that I don't have a good way to ensure
I have no more irqs in flight does.
So unless someone can find a sure way to drain the irqs in flight,
I can't migrate an irq from process context, and looking at irr and
handling a pending irq appears required.
Eric
static void ack_apic(unsigned int irq)
{
#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
	if (unlikely((irq_desc[irq].status & IRQ_MOVE_PENDING) &&
		     (hardirq_count() == HARDIRQ_OFFSET))) {
		struct irq_desc *desc = irq_desc + irq;

		desc->chip->mask(irq);
		ack_APIC_irq();
		/* Ensure all of the irq handlers for this irq have completed
		 * before we migrate it.
		 */
		raw_local_irq_enable();
		cpu_relax();
		raw_local_irq_disable();
		synchronize_irq(irq);
		move_masked_irq(irq);
		desc->chip->unmask(irq);
		return;
	}
#endif
	ack_APIC_irq();
}
>
> The version I wound up testing is below, and it doesn't work.
> I still get "No irq handler for vector" warnings as well as
> a couple of complaints from lock/irq debugging. The debugging
> doesn't worry me. The fact that I don't have a good way to ensure
> I have no more irqs in flight does.
>
> So unless someone can find a sure way to drain the irqs in flight,
> I can't migrate an irq from process context, and looking at irr and
> handling a pending irq appears required.
Bah. I had not taken into account that the local apic despite
being tightly coupled with the cpu is for programming purposes
an asynchronous device. If I want to give it time to react to something
I need to read from it.
The routine below actually works.
My remaining practical question is whether this can be done cleanly.
Ingo's lock debugging dislikes this routine.
By using raw_local_irq_enable I have avoided all but a message on
the irq return path; I haven't quite worked out where it comes from.
But at least this version feels like it could be done better
(less inline? different helpers?) somehow.
For interrupts coming through a sane interrupt controller, moving
this into process context would certainly simplify things. For edge
triggered interrupts coming through an io_apic I'm not at all certain
what makes sense.
When the routine below is used to ack an edge triggered interrupt
it runs before the edge triggered interrupt handler, so losing an
edge shouldn't happen (we haven't acknowledged the hardware yet),
and even if we do the device driver gets to run at least once.
So doing migration in the irq handler still looks like the best
solution even if it is ugly. As long as the little bit of
stack overhead isn't a problem I think enabling interrupts to
clear out any pending irqs certainly looks simpler.
In another vein, I went and looked through all of Intel's and
AMD's public errata that I could find and there weren't any associated
with irr or isr, so I think my previous version of the code is still
sane, and not likely to break.
I can improve it a little by getting the vector as:
"vector = ~ get_irq_regs()->orig_rax;" instead of reading
ISR. That still leaves reading the pending bit in IRR and the
other funny tricks.
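I.e. (untested):

/* The x86_64 entry code pushes ~vector into orig_rax, so the vector
 * being serviced can come from the saved registers instead of a scan
 * of APIC_ISR.
 */
static unsigned apic_in_service_vector(void)
{
	return ~get_irq_regs()->orig_rax;
}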
I'm conflicted between the two approaches a little because playing
games with enabling interrupts in an interrupt handler seems to
have some weird corner cases.
static void ack_apic(unsigned int irq)
{
#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
	struct irq_desc *desc;

	desc = irq_desc + irq;
	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
		goto simple;
	if (hardirq_count() != HARDIRQ_OFFSET)
		goto simple;

	desc->chip->mask(irq);
	ack_APIC_irq();
	/* Ensure all of the irq handlers for this irq have completed
	 * before we migrate it.
	 */
	spin_unlock(&desc->lock);
	raw_local_irq_enable();
	apic_read(APIC_ID);
	raw_local_irq_disable();
	spin_lock(&desc->lock);

	move_masked_irq(irq);
	desc->chip->unmask(irq);
	return;
simple:
#endif
	ack_APIC_irq();
}
BUG: at /home/eric/projects/linux/linux-2.6-devel/kernel/lockdep.c:1860 trace_hardirqs_on()
Call Trace:
<IRQ> [<ffffffff8048562f>] trace_hardirqs_on_thunk+0x35/0x37
[<ffffffff80290401>] generic_delete_inode+0x0/0x13e
[<ffffffff8020a0fc>] restore_args+0x0/0x30
[<ffffffff80290401>] generic_delete_inode+0x0/0x13e
[<ffffffff8021648d>] ack_apic+0x63/0x99
[<ffffffff80216485>] ack_apic+0x5b/0x99
[<ffffffff8025881e>] handle_fasteoi_irq+0xc1/0xd1
[<ffffffff80290401>] generic_delete_inode+0x0/0x13e
[<ffffffff8020c0de>] do_IRQ+0x89/0xf3
[<ffffffff80208ce8>] default_idle+0x35/0x51
[<ffffffff80208cb3>] default_idle+0x0/0x51
[<ffffffff8020a0a6>] ret_from_intr+0x0/0xf
<EOI> [<ffffffff80290401>] generic_delete_inode+0x0/0x13e
[<ffffffff80208cb3>] default_idle+0x0/0x51
[<ffffffff80208ce8>] default_idle+0x35/0x51
[<ffffffff80208cea>] default_idle+0x37/0x51
[<ffffffff80208ce8>] default_idle+0x35/0x51
[<ffffffff80208d5a>] cpu_idle+0x56/0x75
[<ffffffff808b9a69>] start_secondary+0x481/0x490
I have recently been investigating why we reprogram ioapic irqs in the
interrupt handler, because it significantly complicates the code, and
makes things more fragile. Eventually I found the commit with the
justification, see below.
There are not enough details in the justification to really understand
the issue so I'm asking to see if someone has some more details.
The description makes the assertion that reprogramming the ioapic
when an interrupt is pending is the only safe way to handle this.
Since edge triggered interrupts cannot be pending at the ioapic I know
it is not talking about level triggered interrupts.
However it is not possible to fully reprogram a level triggered
interrupt when the interrupt is pending as the ioapic will not
receive the interrupt acknowledgement. So it turns out I have
broken this change for several kernel releases without people
screaming at me about io_apic problems.
Currently I am disabling the irq on the ioapic before reprogramming
it so I do not run into issues. Does that solve the concerns that
were patched around by only reprogramming the interrupt redirection
table entry in interrupt handlers?
If it does I can solve this and simplify the code by moving it all back
into process context.
commit 54d5d42404e7705cf3804593189e963350d470e5
Author: Ashok Raj <[email protected]>
Date: Tue Sep 6 15:16:15 2005 -0700
[PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity
When handling writes to /proc/irq, current code is re-programming rte
entries directly. This is not recommended and could potentially cause
chipsets to lock up, or cause missing interrupts.
CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the
interrupt is pending. The same needs to be done for /proc/irq handling as well.
Otherwise user space irq balancers are really not doing the right thing.
- Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for
lack of a generic name.
- added move_irq out of IRQ_BALANCE, and added this same to X86_64
- Added new proc handler for write, so we can do deferred write at irq
handling time.
- Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead
it now shows only active cpu masks, or exactly what was set.
- Provided a common move_irq implementation, instead of duplicating
when using generic irq framework.
Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off.
Tested UP builds as well.
MSI testing: tbd: I have cards, need to look for a x-over cable, although I
did test an earlier version of this patch. Will test in a couple days.
Signed-off-by: Ashok Raj <[email protected]>
Acked-by: Zwane Mwaikambo <[email protected]>
Grudgingly-acked-by: Andi Kleen <[email protected]>
Signed-off-by: Coywolf Qi Hunt <[email protected]>
Signed-off-by: Ashok Raj <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
On Sat, 10 Feb 2007, Eric W. Biederman wrote:
> There are not enough details in the justification to really understand
> the issue so I'm asking to see if someone has some more details.
>
> The description makes the assertion that reprogramming the ioapic
> when an interrupt is pending is the only safe way to handle this.
> Since edge triggered interrupts cannot be pending at the ioapic I know
> it is not talking about level triggered interrupts.
>
> However it is not possible to fully reprogram a level triggered
> interrupt when the interrupt is pending as the ioapic will not
> receive the interrupt acknowledgement. So it turns out I have
> broken this change for several kernel releases without people
> screaming at me about io_apic problems.
>
> Currently I am disabling the irq on the ioapic before reprogramming
> it so I do not run into issues. Does that solve the concerns that
> were patched around by only reprogramming the interrupt redirection
> table entry in interrupt handlers?
Hi Eric,
Could you outline in pseudocode where you're issuing the mask? If
it's done whilst an irq is pending some (intel 7500 based) chipsets will
not actually mask it but treat it as a 'legacy' IRQ and deliver it
anyway. Using the masked whilst pending logic avoids all of that.
Cheers,
Zwane
Zwane Mwaikambo <[email protected]> writes:
> On Sat, 10 Feb 2007, Eric W. Biederman wrote:
>
>> There are not enough details in the justification to really understand
>> the issue so I'm asking to see if someone has some more details.
>>
>> The description makes the assertion that reprogramming the ioapic
>> when an interrupt is pending is the only safe way to handle this.
>> Since edge triggered interrupts cannot be pending at the ioapic I know
>> it is not talking about level triggered interrupts.
>>
>> However it is not possible to fully reprogram a level triggered
>> interrupt when the interrupt is pending as the ioapic will not
>> receive the interrupt acknowledgement. So it turns out I have
>> broken this change for several kernel releases without people
>> screaming at me about io_apic problems.
>>
>> Currently I am disabling the irq on the ioapic before reprogramming
>> it so I do not run into issues. Does that solve the concerns that
>> were patched around by only reprogramming the interrupt redirection
>> table entry in interrupt handlers?
>
> Hi Eric,
> Could you outline in pseudocode where you're issuing the mask? If
> it's done whilst an irq is pending some (intel 7500 based) chipsets will
> not actually mask it but treat it as a 'legacy' IRQ and deliver it
> anyway. Using the masked whilst pending logic avoids all of that.
The code currently in the kernel does:
pending
mask
read io_apic
ack
reprogram vector and destination
unmask
So I guess it does retain the bug fix.
What I am looking at doing is:
mask
read io_apic
-- Past this point no more irqs are expected from the io_apic
-- Now I work to drain any inflight/pending instances of the irq
send ipi to all irq destination cpus and wait for it to return
read lapic
disable local irqs
take irq lock
-- Now no more irqs are expected to arrive
reprogram vector and destination
enable local irqs
unmask
What I need to ensure is that I have a point where I will not receive any
new messages from an ioapic about a particular irq anymore. Even if
everything is working perfectly setting the disable bit is not enough
because there could be an irq message in flight. So I need to give any
in flight irqs a chance to complete.
With a little luck that logic will cover your 7500 disable race as
well. If not, and there is a reasonable workaround, we should look at
that. This is not a speed critical path so we can afford to do a
little more work.
The version of this that I am currently testing is below.
Eric
/*
 * Synchronize the local APIC and the CPU by doing
 * a dummy read from the local APIC
 */
static inline void lapic_sync(void)
{
	apic_read(APIC_ID);
}

static void affinity_noop(void *info)
{
	return;
}
static void mask_get_irq(unsigned int irq)
{
	struct irq_desc *desc = irq_desc + irq;
	int cpu;

	spin_lock(&vector_lock);

	/*
	 * Mask the irq so it will no longer occur
	 */
	desc->chip->mask(irq);

	/* If I can run a lower priority vector on another cpu
	 * then obviously the irq has completed on that cpu. SMP call
	 * function is lower priority than all of the hardware
	 * irqs.
	 */
	for_each_cpu_mask(cpu, desc->affinity)
		smp_call_function_single(cpu, affinity_noop, NULL, 0, 1);

	/*
	 * Ensure irqs have cleared the local cpu
	 */
	lapic_sync();
	local_irq_disable();
	lapic_sync();
	spin_lock(&desc->lock);
}
static void unmask_put_irq(unsigned int irq)
{
	struct irq_desc *desc = irq_desc + irq;

	spin_unlock(&desc->lock);
	local_irq_enable();
	desc->chip->unmask(irq);
	spin_unlock(&vector_lock);
}
static void set_ioapic_affinity_level_irq(unsigned int irq, cpumask_t mask)
{
	unsigned int dest;
	int vector;

	/*
	 * Ensure all of the irq handlers for this irq have completed.
	 * i.e. drain all pending irqs
	 */
	mask_get_irq(irq);

	cpus_and(mask, mask, cpu_online_map);
	if (cpus_empty(mask))
		goto out;

	vector = __assign_irq_vector(irq, mask, &mask);
	if (vector < 0)
		goto out;

	dest = cpu_mask_to_apicid(mask);

	/*
	 * Only the high 8 bits are valid
	 */
	dest = SET_APIC_LOGICAL_ID(dest);

	spin_lock(&ioapic_lock);
	__target_IO_APIC_irq(irq, dest, vector);
	spin_unlock(&ioapic_lock);
	set_native_irq_info(irq, mask);
out:
	unmask_put_irq(irq);
}
On Sun, 11 Feb 2007, Eric W. Biederman wrote:
> What I am looking at doing is:
>
> mask
> read io_apic
> -- Past this point no more irqs are expected from the io_apic
> -- Now I work to drain any inflight/pending instances of the irq
> send ipi to all irq destination cpus and wait for it to return
> read lapic
> disable local irqs
> take irq lock
> -- Now no more irqs are expected to arrive
> reprogram vector and destination
> enable local irqs
> unmask
>
> What I need to ensure is that I have a point where I will not receive any
> new messages from an ioapic about a particular irq anymore. Even if
> everything is working perfectly setting the disable bit is not enough
> because there could be an irq message in flight. So I need to give any
> in flight irqs a chance to complete.
>
> With a little luck that logic will cover your 7500 disable race as
> well. If not, and there is a reasonable workaround, we should look at
> that. This is not a speed critical path so we can afford to do a
> little more work.
The 7500 issue isn't actually a race but a disease: if you mask a pending
irq in its RTE, the PCI hub generates an INTx message corresponding to
that irq. This apparently was done to support booting OSes without APIC
support. So the following would occur:
- irqN pending on IOAPIC
- mask
=> INTx message for irqN
Unfortunately it appears the below code would also be affected by this as
well; the appropriate reference is:
2.15.2 PCI Express* Legacy INTx Support and Boot Interrupt
http://download.intel.com/design/chipsets/datashts/30262802.pdf
> static void mask_get_irq(unsigned int irq)
> {
> struct irq_desc *desc = irq_desc + irq;
> int cpu;
>
> spin_lock(&vector_lock);
>
> /*
> * Mask the irq so it will no longer occur
> */
> desc->chip->mask(irq);
>
> /* If I can run a lower priority vector on another cpu
> * then obviously the irq has completed on that cpu. SMP call
> * function is lower priority than all of the hardware
> * irqs.
> */
> for_each_cpu_mask(cpu, desc->affinity)
> smp_call_function_single(cpu, affinity_noop, NULL, 0, 1);
>
> /*
> * Ensure irqs have cleared the local cpu
> */
> lapic_sync();
> local_irq_disable();
> lapic_sync();
> spin_lock(&desc->lock);
> }
"Natalie Protasevich" <[email protected]> writes:
> On 2/11/07, Eric W. Biederman <[email protected]> wrote:
>
> The code currently in the kernel does:
>
> pending
> mask
> read io_apic
> ack
> reprogram vector and destination
> unmask
>
> So I guess it does retain the bug fix.
>
> What I am looking at doing is:
>
> mask
> read io_apic
> -- Past this point no more irqs are expected from the io_apic
> -- Now I work to drain any inflight/pending instances of the irq
> send ipi to all irq destination cpus and wait for it to return
> read lapic
> disable local irqs
> take irq lock
> -- Now no more irqs are expected to arrive
> reprogram vector and destination
> enable local irqs
> unmask
>
> What I need to ensure is that I have a point where I will not receive any
> new messages from an ioapic about a particular irq anymore. Even if
> everything is working perfectly setting the disable bit is not enough
> because there could be an irq message in flight. So I need to give any
> in flight irqs a chance to complete.
>
>
> It is probably safer to have "overlapping" time in having the second rte
> programmed, then both will have the same vector and receive the eoi. And if
> you have interrupts disabled for a little while before you dismantle the old
> entry it should ensure that all the in-flight ones got serviced and acked...
> --Natalie
I am assuming you mean interrupts disabled at the ioapic. If that
is true, I mostly agree with your assertion.
If you look above, the mask is where I disable that interrupt source on
the ioapic. I then send an ipi to all of the other possible cpus and
read the local apic to ensure that the window is over in which I need
to handle interrupts at the old destination.
If I don't know when the window ends I can never dismantle the data
structures for receiving the irq at the old destination. Ouch!
If I know how long I have to wait until I can clean up the data
structure at the old destination I don't have to worry about
overlapping interrupts.
Therefore it doesn't buy me anything but complexity to set up the data
structures on the new cpu before the window for receiving the irqs
has completed.
Make sense?
Eric
Zwane Mwaikambo <[email protected]> writes:
>
> The 7500 issue isn't actually a race but a disease: if you mask a pending
> irq in its RTE, the PCI hub generates an INTx message corresponding to
> that irq. This apparently was done to support booting OSes without APIC
> support. So the following would occur;
>
> - irqN pending on IOAPIC
> - mask
> => INTx message for irqN
>
> Unfortunately it appears the below code would also be affected by this as
> well; the appropriate reference is:
>
> 2.15.2 PCI Express* Legacy INTx Support and Boot Interrupt
> http://download.intel.com/design/chipsets/datashts/30262802.pdf
Ouch. And this kind of thing isn't exactly uncommon.
However if we have the irqs also disabled in the i8259 we should
be safe from actually receiving this interrupt (even if it generates
bus traffic), and when we enable the irq since it is level triggered
we should still get an interrupt message.
It isn't immediately obvious where the i8259 irq enable/disable
happens, so I'm having trouble auditing that bit of code.
Plus we can get very strange things like the irq number changing
and the sharing rules being different when going through the i8259.
So irqN may be irqM when going through the i8259.
As long as we aren't using anything on the i8259 including the timer
in ExtINT mode we can disable every interrupt pin and not worry about
interrupts from that source.
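For reference, masking every pin is just two port writes (illustrative
only; the kernel goes through the i8259 irq chip under i8259A_lock):

static void mask_all_8259_pins(void)
{
	outb(0xff, 0x21);	/* OCW1: mask IRQ0-7 on the master 8259 */
	outb(0xff, 0xa1);	/* OCW1: mask IRQ8-15 on the slave 8259 */
}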
Eric
On Sun, 11 Feb 2007, Eric W. Biederman wrote:
> > 2.15.2 PCI Express* Legacy INTx Support and Boot Interrupt
> > http://download.intel.com/design/chipsets/datashts/30262802.pdf
>
> Ouch. And this kind of thing isn't exactly uncommon.
>
> However if we have the irqs also disabled in the i8259 we should
> be safe from actually receiving this interrupt (even if it generates
> bus traffic), and when we enable the irq since it is level triggered
> we should still get an interrupt message.
>
> It isn't immediately obvious where the i8259 irq enable/disable
> happens, so I'm having trouble auditing that bit of code.
>
> Plus we can get very strange things like the irq number changing
> and the sharing rules being different when going through the i8259.
> So irqN may be irqM when going through the i8259.
>
> As long as we aren't using anything on the i8259 including the timer
> in ExtINT mode we can disable every interrupt pin and not worry about
> interrupts from that source.
We do the 8259 mask in setup_IO_APIC_irq. Does anyone have access to an
E7520/E7320 system for testing?
Cheers,
Zwane
Zwane Mwaikambo <[email protected]> writes:
> On Sun, 11 Feb 2007, Eric W. Biederman wrote:
>
>> > 2.15.2 PCI Express* Legacy INTx Support and Boot Interrupt
>> > http://download.intel.com/design/chipsets/datashts/30262802.pdf
>>
>> Ouch. And this kind of thing isn't exactly uncommon.
>>
>> However if we have the irqs also disabled in the i8259 we should
>> be safe from actually receiving this interrupt (even if it generates
>> bus traffic), and when we enable the irq since it is level triggered
>> we should still get an interrupt message.
>>
>> It isn't immediately obvious where the i8259 irq enable/disable
>> happens. So i"m having trouble auditing that bit of code.
>>
>> Plus we can get very strange things like the irq number changing
>> and the sharing rules being different when going through the i8259.
>> So irqN may be irqM when going through the i8259.
>>
>> As long as we aren't using anything on the i8259 including the timer
>> in ExtINT mode we can disable every interrupt pin and not worry about
>> interrupts from that source.
>
> We do the 8259 mask in setup_IO_APIC_irq. Does anyone have access to an
> E7520/E7320 system for testing?
I think I do; I need to double check.
The thing is, this logic is different in that it uses INTx instead of pins
but otherwise is quite standard for chipsets and their IOAPICs. I'm not
at all certain this behavior is what the original concern was about.
The description is different enough that you may have found a completely
different set of behavior we have to worry about.
Since the legacy/non-legacy behavior is commonly invoked by the ioapic
mask bit, working with just about any recent chipset should get a taste
of that. But I will still try and dig up an E7520 and see what
happens.
Eric
Ok. This is just an email to summarize my findings after investigating
the ioapic programming.
The ioapics on the E75xx chipset do have issues if you attempt to
reprogram them outside of the irq handler. I have on several
instances caused the state machine to get stuck such that an
individual ioapic entry was no longer capable of delivering
interrupts. I suspect the remote IRR bit was stuck on, such that
switching the irq to edge triggered and back to level triggered would not
clear it, but I did not confirm this. I just know that I was switching
the irq between level and edge triggered with the irq masked
and the irq did not fire.
The ioapics on the AMD 8xxx chipset do have issues if you attempt
to reprogram them outside of the irq handler. I wound up with
remote IRR set and never clearing. But by temporarily switching
the irq to edge triggered while it was masked I could clear
this condition.
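The toggle was essentially this (a sketch, locking omitted;
ioapic_read_entry/ioapic_write_entry stand for io_apic.c's
routing-entry helpers):

static void clear_stuck_remote_irr(int apic, int pin)
{
	struct IO_APIC_route_entry entry;

	entry = ioapic_read_entry(apic, pin);
	entry.mask = 1;		/* keep the pin masked throughout */
	entry.trigger = 0;	/* flip to edge triggered ... */
	ioapic_write_entry(apic, pin, entry);
	entry.trigger = 1;	/* ... and back to level triggered */
	ioapic_write_entry(apic, pin, entry);
}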
I could not hit verifiable bugs in the ioapics on the Nforce4
chipset. It's amazing: one part of that chipset that I can't find
issues with.
I did find an algorithm that will work successfully for migrating
IRQs in process context if you have an ioapic that will follow pci
ordering rules. In particular, the properties that the algorithm
depends on are reads guaranteeing that outstanding writes are flushed,
and in this context irqs in flight are considered writes. I have
assumed that to devices outside of the cpu asic, the cpu and the local
apic appear as the same device.
The algorithm was:
- Be running with interrupts enabled in process context.
- Mask the ioapic.
- Read the ioapic to flush outstanding reads to the local apic.
- Read the local apic to flush outstanding irqs to be sent to the cpu.
- Now that all of the irqs have been delivered and the irq is masked
that irq is finally quiescent.
- With the irq quiescent it is safe to reprogram interrupt controller
and the irq reception data structures.
There were a lot more details but that was the essence.
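A minimal sketch of that sequence, assuming an ioapic that follows pci
ordering rules (the helper names here are stand-ins for whatever
accessors are at hand, not a proposed interface, and locking is omitted):

	static void quiesce_ioapic_irq(int apic, int pin, unsigned int irq)
	{
		/* 1. Called from process context with interrupts enabled. */

		/* 2. Mask the pin so no new irq messages are generated. */
		mask_IO_APIC_irq(irq);

		/* 3. Read the ioapic; on a pci-ordered part this flushes
		 * any in-flight irq message out to the local apic.
		 */
		(void)io_apic_read(apic, 0x10 + 2 * pin);

		/* 4. Read the local apic so any irqs it holds pending are
		 * delivered to the cpu.
		 */
		(void)apic_read(APIC_ID);

		/* 5. The irq is now quiescent; it is safe to reprogram the
		 * ioapic entry and the irq reception data structures.
		 */
	}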
What I discovered was that except on the nforce chipset masking the
ioapic and then issuing a read did not behave as if the interrupts were
flushed to the local apic.
I did not look closely enough to tell if local apics suffered from this
issue. With local apics at least a read was necessary before you
could guarantee the local apic would deliver pending irqs. A
workaround on the local apics is to simply issue a low priority interrupt
as an IPI and wait for it to be processed. This guarantees that all
higher priority interrupts have been flushed from the apic, and that
the local apic has processed interrupts.
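As a sketch (LOW_PRIORITY_VECTOR and the completion flag are
hypothetical here; the real machinery appears later in this series as
the IRQ_MOVE_CLEANUP vector):

	static volatile int low_priority_seen;	/* set by the handler */

	/* Fire a low priority vector at ourselves and spin until its
	 * handler runs; once it has, every higher priority interrupt
	 * that was pending in the local apic has been serviced.
	 */
	low_priority_seen = 0;
	send_IPI_self(LOW_PRIORITY_VECTOR);	/* hypothetical vector */
	while (!low_priority_seen)
		cpu_relax();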
For ioapics, because they cannot be stimulated to send any irq by
stimulation from the cpu side, no similar workaround was possible.
** Conclusions.
* IRQs must be reprogrammed in interrupt context.
The result of this investigation is that I am convinced we need
to perform the irq migration activities in interrupt context, although
I am not convinced it is completely safe. I suspect multiple irqs
firing closely enough to each other may hit the same issues as
migrating irqs from process context. However the odds are on our
side when we are in irq context.
The reasoning for this is simply that:
- Before a level triggered irq can safely be reprogrammed its remote
IRR bit must be cleared by the irq being acknowledged.
- There is no generally effective way short of receiving an additional
irq to ensure that the irq handler has run. Polling the ioapic's
remote IRR bit does not work.
* The CPU hotplug is currently very buggy.
Irq migration in the cpu hotplug case is a serious problem. If we can
only safely migrate irqs from interrupt context and we cannot control
when those interrupts fire, then we cannot bound the amount of time it
will take to migrate the irqs away from a cpu. The cpu hotplug code
currently calls chip->set_affinity directly, which is wrong: it does
not take the necessary locks, and it does not attempt to delay
execution until we are in process context.
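Simplified, the shape of the problem (not the literal hotplug code;
set_pending_irq() is the deferred pattern the rest of the kernel uses):

	/* Roughly what the hotplug path does today: */
	desc->chip->set_affinity(irq, mask);	/* no desc->lock held */

	/* Versus deferring, so move_native_irq() can finish the move
	 * from the irq handler (set_pending_irq(), simplified):
	 */
	spin_lock_irqsave(&desc->lock, flags);
	desc->status |= IRQ_MOVE_PENDING;
	desc->pending_mask = mask;
	spin_unlock_irqrestore(&desc->lock, flags);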
* Only an additional irq can signal the completion of an irq movement.
The attempt to rebuild the irq migration code from first principles
did bear some fruit. I asked the question: "When is it safe to tear
down the data structures for irq movement?". The only answer I have
is when I have received an irq provably from after the irq was
reprogrammed. This is because the only way I can reliably synchronize
with irq delivery from an apic is to receive an additional irq.
Currently this is a problem both for cpu hotplug on x86_64 and i386
and for general irq migration on x86_64.
Patches to follow shortly.
Eric
My recent ioapic investigation has been painful and has taken me all
over the place. One of my conclusions is that our code for dealing with
ioapics on i386 and x86_64 is a mess that has been growing for years.
I saw tremendous amounts of fodder for cleanup patches. Ugh.
When deciding how much to include in the patchset for addressing the irq
migration problems that have cropped up in the x86_64 code I really
wanted to keep the patchset small and minimal. But I needed to make
some substantial changes so I would have a low priority IPI I could use
for ensuring irq migration was complete. When trying to throw out
patches I kept finding things that were either bug fixes or relevant
cleanups to the problem I was addressing.
So this patchset is bigger than I would like for addressing the irq
migration issue, but there is nothing in it that isn't a simple cleanup
or a real bug fix. So I think we need it all one way or another.
I have lost track of who has what in which tree or who it makes sense
to send this to. So this patchset is against Linus's latest git tree.
And hopefully we can include this in 2.6.21.
Eric
By precomputing old_mask I remove an extra if statement, remove an
indentation level and make the code slightly easier to read.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/io_apic.c | 13 +++++--------
1 files changed, 5 insertions(+), 8 deletions(-)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 950682f..357b354 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -685,6 +685,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+ cpumask_t old_mask = CPU_MASK_NONE;
int old_vector = -1;
int cpu;
@@ -699,11 +700,12 @@ static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
cpus_and(*result, irq_domain[irq], mask);
if (!cpus_empty(*result))
return old_vector;
+ cpus_and(old_mask, irq_domain[irq], cpu_online_map);
}
for_each_cpu_mask(cpu, mask) {
cpumask_t domain, new_mask;
- int new_cpu;
+ int new_cpu, old_cpu;
int vector, offset;
domain = vector_allocation_domain(cpu);
@@ -728,13 +730,8 @@ next:
/* Found one! */
current_vector = vector;
current_offset = offset;
- if (old_vector >= 0) {
- cpumask_t old_mask;
- int old_cpu;
- cpus_and(old_mask, irq_domain[irq], cpu_online_map);
- for_each_cpu_mask(old_cpu, old_mask)
- per_cpu(vector_irq, old_cpu)[old_vector] = -1;
- }
+ for_each_cpu_mask(old_cpu, old_mask)
+ per_cpu(vector_irq, old_cpu)[old_vector] = -1;
for_each_cpu_mask(new_cpu, new_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
irq_vector[irq] = vector;
--
1.5.0.g53756
This patch replaces all instances of "set_native_irq_info(irq, mask)"
with "irq_desc[irq].affinity = mask". The latter form is clearer,
uses fewer abstractions, and makes access to this field uniform
across different architectures.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/i386/kernel/io_apic.c | 10 +++++-----
arch/ia64/kernel/msi_ia64.c | 2 +-
arch/ia64/sn/kernel/msi_sn.c | 2 +-
arch/x86_64/kernel/io_apic.c | 10 +++++-----
include/linux/irq.h | 11 -----------
5 files changed, 12 insertions(+), 23 deletions(-)
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 4ccebd4..6fec4da 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -343,7 +343,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
break;
entry = irq_2_pin + entry->next;
}
- set_native_irq_info(irq, cpumask);
+ irq_desc[irq].affinity = cpumask;
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -1354,7 +1354,7 @@ static void __init setup_IO_APIC_irqs(void)
}
spin_lock_irqsave(&ioapic_lock, flags);
__ioapic_write_entry(apic, pin, entry);
- set_native_irq_info(irq, TARGET_CPUS);
+ irq_desc[irq].affinity = TARGET_CPUS;
spin_unlock_irqrestore(&ioapic_lock, flags);
}
}
@@ -2585,7 +2585,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
write_msi_msg(irq, &msg);
- set_native_irq_info(irq, mask);
+ irq_desc[irq].affinity = mask;
}
#endif /* CONFIG_SMP */
@@ -2669,7 +2669,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
dest = cpu_mask_to_apicid(mask);
target_ht_irq(irq, dest);
- set_native_irq_info(irq, mask);
+ irq_desc[irq].affinity = mask;
}
#endif
@@ -2875,7 +2875,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
spin_lock_irqsave(&ioapic_lock, flags);
__ioapic_write_entry(ioapic, pin, entry);
- set_native_irq_info(irq, TARGET_CPUS);
+ irq_desc[irq].affinity = TARGET_CPUS;
spin_unlock_irqrestore(&ioapic_lock, flags);
return 0;
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 0d05450..e722090 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -60,7 +60,7 @@ static void ia64_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
msg.address_lo = addr;
write_msi_msg(irq, &msg);
- set_native_irq_info(irq, cpu_mask);
+ irq_desc[irq].affinity = cpu_mask;
}
#endif /* CONFIG_SMP */
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index ea3dc38..49873aa 100644
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -204,7 +204,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
msg.address_lo = (u32)(bus_addr & 0x00000000ffffffff);
write_msi_msg(irq, &msg);
- set_native_irq_info(irq, cpu_mask);
+ irq_desc[irq].affinity = cpu_mask;
}
#endif /* CONFIG_SMP */
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 357b354..1847213 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -261,7 +261,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
spin_lock_irqsave(&ioapic_lock, flags);
__target_IO_APIC_irq(irq, dest, vector);
- set_native_irq_info(irq, mask);
+ irq_desc[irq].affinity = mask;
spin_unlock_irqrestore(&ioapic_lock, flags);
}
#endif
@@ -857,7 +857,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
ioapic_write_entry(apic, pin, entry);
spin_lock_irqsave(&ioapic_lock, flags);
- set_native_irq_info(irq, TARGET_CPUS);
+ irq_desc[irq].affinity = TARGET_CPUS;
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -1930,7 +1930,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
write_msi_msg(irq, &msg);
- set_native_irq_info(irq, mask);
+ irq_desc[irq].affinity = mask;
}
#endif /* CONFIG_SMP */
@@ -2018,7 +2018,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
dest = cpu_mask_to_apicid(tmp);
target_ht_irq(irq, dest, vector);
- set_native_irq_info(irq, mask);
+ irq_desc[irq].affinity = mask;
}
#endif
@@ -2143,7 +2143,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
ioapic_write_entry(ioapic, pin, entry);
spin_lock_irqsave(&ioapic_lock, flags);
- set_native_irq_info(irq, TARGET_CPUS);
+ irq_desc[irq].affinity = TARGET_CPUS;
spin_unlock_irqrestore(&ioapic_lock, flags);
return 0;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index aa5b3e6..b0a44b8 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -200,17 +200,6 @@ extern int setup_irq(unsigned int irq, struct irqaction *new);
#endif
#ifdef CONFIG_SMP
-static inline void set_native_irq_info(int irq, cpumask_t mask)
-{
- irq_desc[irq].affinity = mask;
-}
-#else
-static inline void set_native_irq_info(int irq, cpumask_t mask)
-{
-}
-#endif
-
-#ifdef CONFIG_SMP
#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
--
1.5.0.g53756
It's dead, Jim.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/io_apic.c | 2 --
1 files changed, 0 insertions(+), 2 deletions(-)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 1847213..2d154e1 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -793,8 +793,6 @@ void __setup_vector_irq(int cpu)
}
-extern void (*interrupt[NR_IRQS])(void);
-
static struct irq_chip ioapic_chip;
#define IOAPIC_AUTO -1
--
1.5.0.g53756
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/io_apic.c | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 2d154e1..a69c38b 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -799,7 +799,7 @@ static struct irq_chip ioapic_chip;
#define IOAPIC_EDGE 0
#define IOAPIC_LEVEL 1
-static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
+static void ioapic_register_intr(int irq, unsigned long trigger)
{
if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
trigger == IOAPIC_LEVEL)
@@ -847,7 +847,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
entry.dest = cpu_mask_to_apicid(mask);
entry.vector = vector;
- ioapic_register_intr(irq, vector, IOAPIC_AUTO);
+ ioapic_register_intr(irq, IOAPIC_AUTO);
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
@@ -2133,7 +2133,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
triggering, polarity);
- ioapic_register_intr(irq, entry.vector, triggering);
+ ioapic_register_intr(irq, triggering);
if (!ioapic && (irq < 16))
disable_8259A_irq(irq);
--
1.5.0.g53756
Currently we have two routines that do practically the same thing:
setup_IO_APIC_irq and io_apic_set_pci_routing. This patch makes
setup_IO_APIC_irq the common factor of these two previous routines.
For setup_IO_APIC_irq all that was needed was to pass the trigger
and polarity to make the code a proper subset of io_apic_set_pci_routing.
Hopefully consolidating these two routines will improve maintenance;
there were several differences that simply appear to be one routine
or the other getting it wrong.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/io_apic.c | 126 +++++++++++------------------------------
1 files changed, 34 insertions(+), 92 deletions(-)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index a69c38b..e064838 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -615,22 +615,6 @@ static int pin_2_irq(int idx, int apic, int pin)
return irq;
}
-static inline int IO_APIC_irq_trigger(int irq)
-{
- int apic, idx, pin;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- idx = find_irq_entry(apic,pin,mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
- return irq_trigger(idx);
- }
- }
- /*
- * nonexistent IRQs are edge default
- */
- return 0;
-}
/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = {
@@ -795,26 +779,36 @@ void __setup_vector_irq(int cpu)
static struct irq_chip ioapic_chip;
-#define IOAPIC_AUTO -1
-#define IOAPIC_EDGE 0
-#define IOAPIC_LEVEL 1
-
static void ioapic_register_intr(int irq, unsigned long trigger)
{
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL)
+ if (trigger)
set_irq_chip_and_handler_name(irq, &ioapic_chip,
handle_fasteoi_irq, "fasteoi");
else
set_irq_chip_and_handler_name(irq, &ioapic_chip,
handle_edge_irq, "edge");
}
-static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
+
+static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+ int trigger, int polarity)
{
struct IO_APIC_route_entry entry;
+ cpumask_t mask;
int vector;
unsigned long flags;
+ if (!IO_APIC_IRQ(irq))
+ return;
+
+ vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
+ if (vector < 0)
+ return;
+
+ apic_printk(APIC_VERBOSE,KERN_DEBUG
+ "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
+ "IRQ %d Mode:%i Active:%i)\n",
+ apic, mp_ioapics[apic].mpc_apicid, pin, vector,
+ irq, trigger, polarity);
/*
* add it to the IO-APIC irq-routing table:
@@ -823,41 +817,27 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
+ entry.dest = cpu_mask_to_apicid(mask);
entry.mask = 0; /* enable IRQ */
- entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
-
- entry.trigger = irq_trigger(idx);
- entry.polarity = irq_polarity(idx);
+ entry.trigger = trigger;
+ entry.polarity = polarity;
+ entry.vector = vector;
- if (irq_trigger(idx)) {
- entry.trigger = 1;
+ /* Mask level triggered irqs.
+ * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+ */
+ if (trigger)
entry.mask = 1;
- entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
- }
-
- if (!apic && !IO_APIC_IRQ(irq))
- return;
- if (IO_APIC_IRQ(irq)) {
- cpumask_t mask;
- vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
- if (vector < 0)
- return;
-
- entry.dest = cpu_mask_to_apicid(mask);
- entry.vector = vector;
-
- ioapic_register_intr(irq, IOAPIC_AUTO);
- if (!apic && (irq < 16))
- disable_8259A_irq(irq);
- }
+ ioapic_register_intr(irq, trigger);
+ if (irq < 16)
+ disable_8259A_irq(irq);
ioapic_write_entry(apic, pin, entry);
spin_lock_irqsave(&ioapic_lock, flags);
irq_desc[irq].affinity = TARGET_CPUS;
spin_unlock_irqrestore(&ioapic_lock, flags);
-
}
static void __init setup_IO_APIC_irqs(void)
@@ -882,8 +862,8 @@ static void __init setup_IO_APIC_irqs(void)
irq = pin_2_irq(idx, apic, pin);
add_pin_to_irq(irq, apic, pin);
- setup_IO_APIC_irq(apic, pin, idx, irq);
-
+ setup_IO_APIC_irq(apic, pin, irq,
+ irq_trigger(idx), irq_polarity(idx));
}
}
@@ -2090,11 +2070,6 @@ int __init io_apic_get_redir_entries (int ioapic)
int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
{
- struct IO_APIC_route_entry entry;
- unsigned long flags;
- int vector;
- cpumask_t mask;
-
if (!IO_APIC_IRQ(irq)) {
apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
ioapic);
@@ -2107,42 +2082,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
if (irq >= 16)
add_pin_to_irq(irq, ioapic, pin);
-
- vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
- if (vector < 0)
- return vector;
-
- /*
- * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
- * Note that we mask (disable) IRQs now -- these get enabled when the
- * corresponding device driver registers for this IRQ.
- */
-
- memset(&entry,0,sizeof(entry));
-
- entry.delivery_mode = INT_DELIVERY_MODE;
- entry.dest_mode = INT_DEST_MODE;
- entry.dest = cpu_mask_to_apicid(mask);
- entry.trigger = triggering;
- entry.polarity = polarity;
- entry.mask = 1; /* Disabled (masked) */
- entry.vector = vector & 0xff;
-
- apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
- "IRQ %d Mode:%i Active:%i)\n", ioapic,
- mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
- triggering, polarity);
-
- ioapic_register_intr(irq, triggering);
-
- if (!ioapic && (irq < 16))
- disable_8259A_irq(irq);
-
- ioapic_write_entry(ioapic, pin, entry);
-
- spin_lock_irqsave(&ioapic_lock, flags);
- irq_desc[irq].affinity = TARGET_CPUS;
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
return 0;
}
@@ -2175,7 +2115,9 @@ void __init setup_ioapic_dest(void)
* cpu is online.
*/
if(!irq_vector[irq])
- setup_IO_APIC_irq(ioapic, pin, irq_entry, irq);
+ setup_IO_APIC_irq(ioapic, pin, irq,
+ irq_trigger(irq_entry),
+ irq_polarity(irq_entry));
else
set_ioapic_affinity_irq(irq, TARGET_CPUS);
}
--
1.5.0.g53756
For some reason the code has been picking TARGET_CPUS when asked to
set the affinity to an empty set of cpus. That is just silly; it's
extra work. Instead, if there are no cpus to set the affinity to we
should just give up immediately. That is simpler and a little more
intuitive.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/io_apic.c | 12 +++---------
1 files changed, 3 insertions(+), 9 deletions(-)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index e064838..74671de 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -244,9 +244,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
- tmp = TARGET_CPUS;
-
- cpus_and(mask, tmp, CPU_MASK_ALL);
+ return;
vector = assign_irq_vector(irq, mask, &tmp);
if (vector < 0)
@@ -1890,9 +1888,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
- tmp = TARGET_CPUS;
-
- cpus_and(mask, tmp, CPU_MASK_ALL);
+ return;
vector = assign_irq_vector(irq, mask, &tmp);
if (vector < 0)
@@ -1985,9 +1981,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
- tmp = TARGET_CPUS;
-
- cpus_and(mask, tmp, CPU_MASK_ALL);
+ return;
vector = assign_irq_vector(irq, mask, &tmp);
if (vector < 0)
--
1.5.0.g53756
If we have an irq that comes from multiple io_apic pins, the FINAL action
(which is io_apic_sync or nothing) needs to be called for every entry, or
else if the two pins come from different io_apics we may not wait until
after the action happens on the io_apic.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/io_apic.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 74671de..52fc19b 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -149,11 +149,11 @@ static inline void io_apic_sync(unsigned int apic)
reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
reg ACTION; \
io_apic_modify(entry->apic, reg); \
+ FINAL; \
if (!entry->next) \
break; \
entry = irq_2_pin + entry->next; \
} \
- FINAL; \
}
union entry_union {
--
1.5.0.g53756
NR_IRQ_VECTORS is currently a compatibility define set to NR_IRQS.
This patch updates the users of NR_IRQ_VECTORS to use NR_IRQS instead
so that NR_IRQ_VECTORS can be removed.
There is still shared code with arch/i386 that uses NR_IRQ_VECTORS
so we can't remove the #define just yet :(
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/io_apic.c | 10 +++++-----
1 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 52fc19b..65d7218 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -74,7 +74,7 @@ int nr_ioapic_registers[MAX_IO_APICS];
* Rough estimation of how many shared IRQs there are, can
* be changed anytime.
*/
-#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
+#define MAX_PLUS_SHARED_IRQS NR_IRQS
#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
/*
@@ -615,7 +615,7 @@ static int pin_2_irq(int idx, int apic, int pin)
/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = {
+static u8 irq_vector[NR_IRQS] __read_mostly = {
[0] = FIRST_EXTERNAL_VECTOR + 0,
[1] = FIRST_EXTERNAL_VECTOR + 1,
[2] = FIRST_EXTERNAL_VECTOR + 2,
@@ -634,7 +634,7 @@ static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = {
[15] = FIRST_EXTERNAL_VECTOR + 15,
};
-static cpumask_t irq_domain[NR_IRQ_VECTORS] __read_mostly = {
+static cpumask_t irq_domain[NR_IRQS] __read_mostly = {
[0] = CPU_MASK_ALL,
[1] = CPU_MASK_ALL,
[2] = CPU_MASK_ALL,
@@ -671,7 +671,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
int old_vector = -1;
int cpu;
- BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
+ BUG_ON((unsigned)irq >= NR_IRQS);
/* Only try and allocate irqs on cpus that are present */
cpus_and(mask, mask, cpu_online_map);
@@ -758,7 +758,7 @@ void __setup_vector_irq(int cpu)
int irq, vector;
/* Mark the inuse vectors */
- for (irq = 0; irq < NR_IRQ_VECTORS; ++irq) {
+ for (irq = 0; irq < NR_IRQS; ++irq) {
if (!cpu_isset(cpu, irq_domain[irq]))
continue;
vector = irq_vector[irq];
--
1.5.0.g53756
Currently io_apic.c has several parallel arrays for different
kinds of data that can be known about an irq. The parallel arrays
make the code harder to maintain and make it difficult to remove
the static limits on the number of irqs.
This patch pushes irq_data and irq_vector into a irq_cfg array and
updates the code to use it.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/io_apic.c | 111 +++++++++++++++++++-----------------------
1 files changed, 50 insertions(+), 61 deletions(-)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 65d7218..dd6580c 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -47,6 +47,31 @@
#include <asm/msidef.h>
#include <asm/hypertransport.h>
+struct irq_cfg {
+ cpumask_t domain;
+ u8 vector;
+};
+
+/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
+ [0] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 0 },
+ [1] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 1 },
+ [2] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 2 },
+ [3] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 3 },
+ [4] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 4 },
+ [5] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 5 },
+ [6] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 6 },
+ [7] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 7 },
+ [8] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 8 },
+ [9] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 9 },
+ [10] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 10 },
+ [11] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 11 },
+ [12] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 12 },
+ [13] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 13 },
+ [14] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 14 },
+ [15] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 15 },
+};
+
static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result);
#define __apicdebuginit __init
@@ -613,46 +638,6 @@ static int pin_2_irq(int idx, int apic, int pin)
return irq;
}
-
-/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-static u8 irq_vector[NR_IRQS] __read_mostly = {
- [0] = FIRST_EXTERNAL_VECTOR + 0,
- [1] = FIRST_EXTERNAL_VECTOR + 1,
- [2] = FIRST_EXTERNAL_VECTOR + 2,
- [3] = FIRST_EXTERNAL_VECTOR + 3,
- [4] = FIRST_EXTERNAL_VECTOR + 4,
- [5] = FIRST_EXTERNAL_VECTOR + 5,
- [6] = FIRST_EXTERNAL_VECTOR + 6,
- [7] = FIRST_EXTERNAL_VECTOR + 7,
- [8] = FIRST_EXTERNAL_VECTOR + 8,
- [9] = FIRST_EXTERNAL_VECTOR + 9,
- [10] = FIRST_EXTERNAL_VECTOR + 10,
- [11] = FIRST_EXTERNAL_VECTOR + 11,
- [12] = FIRST_EXTERNAL_VECTOR + 12,
- [13] = FIRST_EXTERNAL_VECTOR + 13,
- [14] = FIRST_EXTERNAL_VECTOR + 14,
- [15] = FIRST_EXTERNAL_VECTOR + 15,
-};
-
-static cpumask_t irq_domain[NR_IRQS] __read_mostly = {
- [0] = CPU_MASK_ALL,
- [1] = CPU_MASK_ALL,
- [2] = CPU_MASK_ALL,
- [3] = CPU_MASK_ALL,
- [4] = CPU_MASK_ALL,
- [5] = CPU_MASK_ALL,
- [6] = CPU_MASK_ALL,
- [7] = CPU_MASK_ALL,
- [8] = CPU_MASK_ALL,
- [9] = CPU_MASK_ALL,
- [10] = CPU_MASK_ALL,
- [11] = CPU_MASK_ALL,
- [12] = CPU_MASK_ALL,
- [13] = CPU_MASK_ALL,
- [14] = CPU_MASK_ALL,
- [15] = CPU_MASK_ALL,
-};
-
static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
{
/*
@@ -670,19 +655,21 @@ static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
cpumask_t old_mask = CPU_MASK_NONE;
int old_vector = -1;
int cpu;
+ struct irq_cfg *cfg;
BUG_ON((unsigned)irq >= NR_IRQS);
+ cfg = &irq_cfg[irq];
/* Only try and allocate irqs on cpus that are present */
cpus_and(mask, mask, cpu_online_map);
- if (irq_vector[irq] > 0)
- old_vector = irq_vector[irq];
+ if (cfg->vector > 0)
+ old_vector = cfg->vector;
if (old_vector > 0) {
- cpus_and(*result, irq_domain[irq], mask);
+ cpus_and(*result, cfg->domain, mask);
if (!cpus_empty(*result))
return old_vector;
- cpus_and(old_mask, irq_domain[irq], cpu_online_map);
+ cpus_and(old_mask, cfg->domain, cpu_online_map);
}
for_each_cpu_mask(cpu, mask) {
@@ -716,8 +703,8 @@ next:
per_cpu(vector_irq, old_cpu)[old_vector] = -1;
for_each_cpu_mask(new_cpu, new_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
- irq_vector[irq] = vector;
- irq_domain[irq] = domain;
+ cfg->vector = vector;
+ cfg->domain = domain;
cpus_and(*result, domain, mask);
return vector;
}
@@ -737,18 +724,21 @@ static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
static void __clear_irq_vector(int irq)
{
+ struct irq_cfg *cfg;
cpumask_t mask;
int cpu, vector;
- BUG_ON(!irq_vector[irq]);
+ BUG_ON((unsigned)irq >= NR_IRQS);
+ cfg = &irq_cfg[irq];
+ BUG_ON(!cfg->vector);
- vector = irq_vector[irq];
- cpus_and(mask, irq_domain[irq], cpu_online_map);
+ vector = cfg->vector;
+ cpus_and(mask, cfg->domain, cpu_online_map);
for_each_cpu_mask(cpu, mask)
per_cpu(vector_irq, cpu)[vector] = -1;
- irq_vector[irq] = 0;
- irq_domain[irq] = CPU_MASK_NONE;
+ cfg->vector = 0;
+ cfg->domain = CPU_MASK_NONE;
}
void __setup_vector_irq(int cpu)
@@ -759,9 +749,9 @@ void __setup_vector_irq(int cpu)
/* Mark the inuse vectors */
for (irq = 0; irq < NR_IRQS; ++irq) {
- if (!cpu_isset(cpu, irq_domain[irq]))
+ if (!cpu_isset(cpu, irq_cfg[irq].domain))
continue;
- vector = irq_vector[irq];
+ vector = irq_cfg[irq].vector;
per_cpu(vector_irq, cpu)[vector] = irq;
}
/* Mark the free vectors */
@@ -769,7 +759,7 @@ void __setup_vector_irq(int cpu)
irq = per_cpu(vector_irq, cpu)[vector];
if (irq < 0)
continue;
- if (!cpu_isset(cpu, irq_domain[irq]))
+ if (!cpu_isset(cpu, irq_cfg[irq].domain))
per_cpu(vector_irq, cpu)[vector] = -1;
}
}
@@ -1346,16 +1336,15 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
static int ioapic_retrigger_irq(unsigned int irq)
{
+ struct irq_cfg *cfg = &irq_cfg[irq];
cpumask_t mask;
- unsigned vector;
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- vector = irq_vector[irq];
cpus_clear(mask);
- cpu_set(first_cpu(irq_domain[irq]), mask);
+ cpu_set(first_cpu(cfg->domain), mask);
- send_IPI_mask(mask, vector);
+ send_IPI_mask(mask, cfg->vector);
spin_unlock_irqrestore(&vector_lock, flags);
return 1;
@@ -1430,7 +1419,7 @@ static inline void init_IO_APIC_traps(void)
*/
for (irq = 0; irq < NR_IRQS ; irq++) {
int tmp = irq;
- if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
+ if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
@@ -1816,7 +1805,7 @@ int create_irq(void)
for (new = (NR_IRQS - 1); new >= 0; new--) {
if (platform_legacy_irq(new))
continue;
- if (irq_vector[new] != 0)
+ if (irq_cfg[new].vector != 0)
continue;
vector = __assign_irq_vector(new, TARGET_CPUS, &mask);
if (likely(vector > 0))
@@ -2108,7 +2097,7 @@ void __init setup_ioapic_dest(void)
* when you have too many devices, because at that time only boot
* cpu is online.
*/
- if(!irq_vector[irq])
+ if (!irq_cfg[irq].vector)
setup_IO_APIC_irq(ioapic, pin, irq,
irq_trigger(irq_entry),
irq_polarity(irq_entry));
--
1.5.0.g53756
Currently assign_irq_vector works mostly by side effect and returns
the results of its changes to the caller, which makes for a lot of
arguments to pass/return and confusion as to what to do if you need
the status but you aren't calling assign_irq_vector.
This patch stops returning values from assign_irq_vector that can be
retrieved just as easily by examining irq_cfg, and modifies the
callers to retrieve those values from irq_cfg when they need them.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/io_apic.c | 109 +++++++++++++++++++++--------------------
1 files changed, 56 insertions(+), 53 deletions(-)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index dd6580c..79fa7b2 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -72,7 +72,7 @@ struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
[15] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 15 },
};
-static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result);
+static int assign_irq_vector(int irq, cpumask_t mask);
#define __apicdebuginit __init
@@ -262,19 +262,19 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
{
+ struct irq_cfg *cfg = irq_cfg + irq;
unsigned long flags;
unsigned int dest;
cpumask_t tmp;
- int vector;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
return;
- vector = assign_irq_vector(irq, mask, &tmp);
- if (vector < 0)
+ if (assign_irq_vector(irq, mask))
return;
+ cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
/*
@@ -283,7 +283,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
dest = SET_APIC_LOGICAL_ID(dest);
spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, vector);
+ __target_IO_APIC_irq(irq, dest, cfg->vector);
irq_desc[irq].affinity = mask;
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -638,7 +638,7 @@ static int pin_2_irq(int idx, int apic, int pin)
return irq;
}
-static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
+static int __assign_irq_vector(int irq, cpumask_t mask)
{
/*
* NOTE! The local APIC isn't very good at handling
@@ -653,7 +653,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
*/
static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
cpumask_t old_mask = CPU_MASK_NONE;
- int old_vector = -1;
+ unsigned int old_vector;
int cpu;
struct irq_cfg *cfg;
@@ -663,12 +663,12 @@ static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
/* Only try and allocate irqs on cpus that are present */
cpus_and(mask, mask, cpu_online_map);
- if (cfg->vector > 0)
- old_vector = cfg->vector;
- if (old_vector > 0) {
- cpus_and(*result, cfg->domain, mask);
- if (!cpus_empty(*result))
- return old_vector;
+ old_vector = cfg->vector;
+ if (old_vector) {
+ cpumask_t tmp;
+ cpus_and(tmp, cfg->domain, mask);
+ if (!cpus_empty(tmp))
+ return 0;
cpus_and(old_mask, cfg->domain, cpu_online_map);
}
@@ -705,21 +705,20 @@ next:
per_cpu(vector_irq, new_cpu)[vector] = irq;
cfg->vector = vector;
cfg->domain = domain;
- cpus_and(*result, domain, mask);
- return vector;
+ return 0;
}
return -ENOSPC;
}
-static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
+static int assign_irq_vector(int irq, cpumask_t mask)
{
- int vector;
+ int err;
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- vector = __assign_irq_vector(irq, mask, result);
+ err = __assign_irq_vector(irq, mask);
spin_unlock_irqrestore(&vector_lock, flags);
- return vector;
+ return err;
}
static void __clear_irq_vector(int irq)
@@ -780,22 +779,24 @@ static void ioapic_register_intr(int irq, unsigned long trigger)
static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
int trigger, int polarity)
{
+ struct irq_cfg *cfg = irq_cfg + irq;
struct IO_APIC_route_entry entry;
cpumask_t mask;
- int vector;
unsigned long flags;
if (!IO_APIC_IRQ(irq))
return;
- vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
- if (vector < 0)
+ mask = TARGET_CPUS;
+ if (assign_irq_vector(irq, mask))
return;
+ cpus_and(mask, cfg->domain, mask);
+
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
"IRQ %d Mode:%i Active:%i)\n",
- apic, mp_ioapics[apic].mpc_apicid, pin, vector,
+ apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
irq, trigger, polarity);
/*
@@ -809,7 +810,7 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
entry.mask = 0; /* enable IRQ */
entry.trigger = trigger;
entry.polarity = polarity;
- entry.vector = vector;
+ entry.vector = cfg->vector;
/* Mask level triggered irqs.
* Use IRQ_DELAYED_DISABLE for edge triggered irqs.
@@ -1558,15 +1559,14 @@ static inline void unlock_ExtINT_logic(void)
*/
static inline void check_timer(void)
{
+ struct irq_cfg *cfg = irq_cfg + 0;
int apic1, pin1, apic2, pin2;
- int vector;
- cpumask_t mask;
/*
* get/set the timer IRQ vector:
*/
disable_8259A_irq(0);
- vector = assign_irq_vector(0, TARGET_CPUS, &mask);
+ assign_irq_vector(0, TARGET_CPUS);
/*
* Subtle, code in do_timer_interrupt() expects an AEOI
@@ -1586,7 +1586,7 @@ static inline void check_timer(void)
apic2 = ioapic_i8259.apic;
apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
- vector, apic1, pin1, apic2, pin2);
+ cfg->vector, apic1, pin1, apic2, pin2);
if (pin1 != -1) {
/*
@@ -1617,7 +1617,7 @@ static inline void check_timer(void)
/*
* legacy devices should be connected to IO APIC #0
*/
- setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
+ setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
if (timer_irq_works()) {
apic_printk(APIC_VERBOSE," works.\n");
nmi_watchdog_default();
@@ -1642,14 +1642,14 @@ static inline void check_timer(void)
disable_8259A_irq(0);
irq_desc[0].chip = &lapic_irq_type;
- apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
+ apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
enable_8259A_irq(0);
if (timer_irq_works()) {
apic_printk(APIC_VERBOSE," works.\n");
return;
}
- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_VERBOSE," failed.\n");
apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
@@ -1796,9 +1796,7 @@ int create_irq(void)
/* Allocate an unused irq */
int irq;
int new;
- int vector = 0;
unsigned long flags;
- cpumask_t mask;
irq = -ENOSPC;
spin_lock_irqsave(&vector_lock, flags);
@@ -1807,8 +1805,7 @@ int create_irq(void)
continue;
if (irq_cfg[new].vector != 0)
continue;
- vector = __assign_irq_vector(new, TARGET_CPUS, &mask);
- if (likely(vector > 0))
+ if (__assign_irq_vector(new, TARGET_CPUS) == 0)
irq = new;
break;
}
@@ -1837,12 +1834,15 @@ void destroy_irq(unsigned int irq)
#ifdef CONFIG_PCI_MSI
static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
{
- int vector;
+ struct irq_cfg *cfg = irq_cfg + irq;
+ int err;
unsigned dest;
cpumask_t tmp;
- vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);
- if (vector >= 0) {
+ tmp = TARGET_CPUS;
+ err = assign_irq_vector(irq, tmp);
+ if (!err) {
+ cpus_and(tmp, cfg->domain, tmp);
dest = cpu_mask_to_apicid(tmp);
msg->address_hi = MSI_ADDR_BASE_HI;
@@ -1862,33 +1862,33 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
((INT_DELIVERY_MODE != dest_LowestPrio) ?
MSI_DATA_DELIVERY_FIXED:
MSI_DATA_DELIVERY_LOWPRI) |
- MSI_DATA_VECTOR(vector);
+ MSI_DATA_VECTOR(cfg->vector);
}
- return vector;
+ return err;
}
#ifdef CONFIG_SMP
static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
{
+ struct irq_cfg *cfg = irq_cfg + irq;
struct msi_msg msg;
unsigned int dest;
cpumask_t tmp;
- int vector;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
return;
- vector = assign_irq_vector(irq, mask, &tmp);
- if (vector < 0)
+ if (assign_irq_vector(irq, mask))
return;
+ cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
read_msi_msg(irq, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(vector);
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
@@ -1964,21 +1964,21 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
{
+ struct irq_cfg *cfg = irq_cfg + irq;
unsigned int dest;
cpumask_t tmp;
- int vector;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
return;
- vector = assign_irq_vector(irq, mask, &tmp);
- if (vector < 0)
+ if (assign_irq_vector(irq, mask))
return;
+ cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
- target_ht_irq(irq, dest, vector);
+ target_ht_irq(irq, dest, cfg->vector);
irq_desc[irq].affinity = mask;
}
#endif
@@ -1996,14 +1996,17 @@ static struct irq_chip ht_irq_chip = {
int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
{
- int vector;
+ struct irq_cfg *cfg = irq_cfg + irq;
+ int err;
cpumask_t tmp;
- vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);
- if (vector >= 0) {
+ tmp = TARGET_CPUS;
+ err = assign_irq_vector(irq, tmp);
+ if (!err) {
struct ht_irq_msg msg;
unsigned dest;
+ cpus_and(tmp, cfg->domain, tmp);
dest = cpu_mask_to_apicid(tmp);
msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
@@ -2011,7 +2014,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
msg.address_lo =
HT_IRQ_LOW_BASE |
HT_IRQ_LOW_DEST_ID(dest) |
- HT_IRQ_LOW_VECTOR(vector) |
+ HT_IRQ_LOW_VECTOR(cfg->vector) |
((INT_DEST_MODE == 0) ?
HT_IRQ_LOW_DM_PHYSICAL :
HT_IRQ_LOW_DM_LOGICAL) |
@@ -2026,7 +2029,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
set_irq_chip_and_handler_name(irq, &ht_irq_chip,
handle_edge_irq, "edge");
}
- return vector;
+ return err;
}
#endif /* CONFIG_HT_IRQ */
--
1.5.0.g53756
The code in io_apic.c and in i8259.c currently hardcodes the same
vector for the timer interrupt, so there is no reason for a special
assignment for the timer; the setup for the i8259 already takes care
of this.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/i8259.c | 6 ------
1 files changed, 0 insertions(+), 6 deletions(-)
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 01e2cf0..103517d 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -520,12 +520,6 @@ void __init init_IRQ(void)
#ifdef CONFIG_SMP
/*
- * IRQ0 must be given a fixed assignment and initialized,
- * because it's used before the IO-APIC is set up.
- */
- __get_cpu_var(vector_irq)[FIRST_DEVICE_VECTOR] = 0;
-
- /*
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
* IPI, driven by wakeup.
*/
--
1.5.0.g53756
For the ISA irqs we reserve 16 vectors. This patch adds constants for
those vectors and modifies the code to use them, making the code a
little clearer and making it possible to move these vectors in the future.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/i8259.c | 40 ++++++++++++++++++++--------------------
arch/x86_64/kernel/io_apic.c | 32 ++++++++++++++++----------------
include/asm-x86_64/hw_irq.h | 18 +++++++++++++++++-
3 files changed, 53 insertions(+), 37 deletions(-)
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 103517d..45d8563 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -299,7 +299,7 @@ void init_8259A(int auto_eoi)
* outb_p - this has to work on a wide range of PC hardware.
*/
outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */
- outb_p(0x20 + 0, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
+ outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */
if (auto_eoi)
outb_p(0x03, 0x21); /* master does Auto EOI */
@@ -307,7 +307,7 @@ void init_8259A(int auto_eoi)
outb_p(0x01, 0x21); /* master expects normal EOI */
outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */
- outb_p(0x20 + 8, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
+ outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */
outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode
is to be investigated) */
@@ -398,24 +398,24 @@ device_initcall(i8259A_init_sysfs);
static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... FIRST_EXTERNAL_VECTOR - 1] = -1,
- [FIRST_EXTERNAL_VECTOR + 0] = 0,
- [FIRST_EXTERNAL_VECTOR + 1] = 1,
- [FIRST_EXTERNAL_VECTOR + 2] = 2,
- [FIRST_EXTERNAL_VECTOR + 3] = 3,
- [FIRST_EXTERNAL_VECTOR + 4] = 4,
- [FIRST_EXTERNAL_VECTOR + 5] = 5,
- [FIRST_EXTERNAL_VECTOR + 6] = 6,
- [FIRST_EXTERNAL_VECTOR + 7] = 7,
- [FIRST_EXTERNAL_VECTOR + 8] = 8,
- [FIRST_EXTERNAL_VECTOR + 9] = 9,
- [FIRST_EXTERNAL_VECTOR + 10] = 10,
- [FIRST_EXTERNAL_VECTOR + 11] = 11,
- [FIRST_EXTERNAL_VECTOR + 12] = 12,
- [FIRST_EXTERNAL_VECTOR + 13] = 13,
- [FIRST_EXTERNAL_VECTOR + 14] = 14,
- [FIRST_EXTERNAL_VECTOR + 15] = 15,
- [FIRST_EXTERNAL_VECTOR + 16 ... NR_VECTORS - 1] = -1
+ [0 ... IRQ0_VECTOR - 1] = -1,
+ [IRQ0_VECTOR] = 0,
+ [IRQ1_VECTOR] = 1,
+ [IRQ2_VECTOR] = 2,
+ [IRQ3_VECTOR] = 3,
+ [IRQ4_VECTOR] = 4,
+ [IRQ5_VECTOR] = 5,
+ [IRQ6_VECTOR] = 6,
+ [IRQ7_VECTOR] = 7,
+ [IRQ8_VECTOR] = 8,
+ [IRQ9_VECTOR] = 9,
+ [IRQ10_VECTOR] = 10,
+ [IRQ11_VECTOR] = 11,
+ [IRQ12_VECTOR] = 12,
+ [IRQ13_VECTOR] = 13,
+ [IRQ14_VECTOR] = 14,
+ [IRQ15_VECTOR] = 15,
+ [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
};
void __init init_ISA_irqs (void)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 79fa7b2..8dede0b 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -54,22 +54,22 @@ struct irq_cfg {
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
- [0] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 0 },
- [1] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 1 },
- [2] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 2 },
- [3] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 3 },
- [4] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 4 },
- [5] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 5 },
- [6] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 6 },
- [7] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 7 },
- [8] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 8 },
- [9] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 9 },
- [10] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 10 },
- [11] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 11 },
- [12] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 12 },
- [13] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 13 },
- [14] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 14 },
- [15] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 15 },
+ [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
+ [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
+ [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
+ [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
+ [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
+ [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
+ [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
+ [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
+ [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
+ [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
+ [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+ [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+ [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+ [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+ [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+ [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
};
static int assign_irq_vector(int irq, cpumask_t mask);
diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h
index 552df5f..dc395ed 100644
--- a/include/asm-x86_64/hw_irq.h
+++ b/include/asm-x86_64/hw_irq.h
@@ -35,6 +35,22 @@
/*
* Vectors 0x20-0x2f are used for ISA interrupts.
*/
+#define IRQ0_VECTOR FIRST_EXTERNAL_VECTOR
+#define IRQ1_VECTOR IRQ0_VECTOR + 1
+#define IRQ2_VECTOR IRQ0_VECTOR + 2
+#define IRQ3_VECTOR IRQ0_VECTOR + 3
+#define IRQ4_VECTOR IRQ0_VECTOR + 4
+#define IRQ5_VECTOR IRQ0_VECTOR + 5
+#define IRQ6_VECTOR IRQ0_VECTOR + 6
+#define IRQ7_VECTOR IRQ0_VECTOR + 7
+#define IRQ8_VECTOR IRQ0_VECTOR + 8
+#define IRQ9_VECTOR IRQ0_VECTOR + 9
+#define IRQ10_VECTOR IRQ0_VECTOR + 10
+#define IRQ11_VECTOR IRQ0_VECTOR + 11
+#define IRQ12_VECTOR IRQ0_VECTOR + 12
+#define IRQ13_VECTOR IRQ0_VECTOR + 13
+#define IRQ14_VECTOR IRQ0_VECTOR + 14
+#define IRQ15_VECTOR IRQ0_VECTOR + 15
/*
* Special IRQ vectors used by the SMP architecture, 0xf0-0xff
@@ -69,7 +85,7 @@
* we start at 0x31 to spread out vectors evenly between priority
* levels. (0x80 is the syscall vector)
*/
-#define FIRST_DEVICE_VECTOR 0x31
+#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
#define FIRST_SYSTEM_VECTOR 0xef /* duplicated in irq.h */
--
1.5.0.g53756
The problem: After moving an interrupt, when is it safe to tear down
the data structures for receiving the interrupt at the old location?
With a normal pci device it is possible to issue a read to a device
to flush all posted writes. This does not work for the oldest ioapics
because they are on a 3-wire apic bus which is a completely different
data path. For some more modern ioapics when everything is using
front side bus delivery you can flush interrupts by simply issuing a
read to the ioapic. For other modern ioapics empirical testing has
shown that this does not work.
So it appears the only reliable way to know the last of the irqs from an
ioapic have been received from before the ioapic was reprogrammed is to
receive the first irq from the ioapic from after it was reprogrammed.
Once we know the last irq message has been received from an ioapic
into a local apic we then need to know that irq message has been
processed through the local apics.
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/x86_64/kernel/entry.S | 3 ++
arch/x86_64/kernel/i8259.c | 6 +++-
arch/x86_64/kernel/io_apic.c | 78 +++++++++++++++++++++++++++++++++++++++---
include/asm-x86_64/hw_irq.h | 9 ++++-
4 files changed, 88 insertions(+), 8 deletions(-)
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 9f5dac6..ed4350c 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -675,6 +675,9 @@ END(invalidate_interrupt\num)
ENTRY(call_function_interrupt)
apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
END(call_function_interrupt)
+ENTRY(irq_move_cleanup_interrupt)
+ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
+END(irq_move_cleanup_interrupt)
#endif
ENTRY(apic_timer_interrupt)
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 45d8563..21d95b7 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -450,6 +450,7 @@ void spurious_interrupt(void);
void error_interrupt(void);
void reschedule_interrupt(void);
void call_function_interrupt(void);
+void irq_move_cleanup_interrupt(void);
void invalidate_interrupt0(void);
void invalidate_interrupt1(void);
void invalidate_interrupt2(void);
@@ -537,7 +538,10 @@ void __init init_IRQ(void)
/* IPI for generic function call */
set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-#endif
+
+ /* Low priority IPI to cleanup after moving an irq */
+ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+#endif
set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 8dede0b..48593f6 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -36,6 +36,7 @@
#include <acpi/acpi_bus.h>
#endif
+#include <asm/idle.h>
#include <asm/io.h>
#include <asm/smp.h>
#include <asm/desc.h>
@@ -49,7 +50,10 @@
struct irq_cfg {
cpumask_t domain;
+ cpumask_t old_domain;
+ unsigned move_cleanup_count;
u8 vector;
+ u8 move_in_progress : 1;
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -652,7 +656,6 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
- cpumask_t old_mask = CPU_MASK_NONE;
unsigned int old_vector;
int cpu;
struct irq_cfg *cfg;
@@ -663,18 +666,20 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
/* Only try and allocate irqs on cpus that are present */
cpus_and(mask, mask, cpu_online_map);
+ if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+ return -EBUSY;
+
old_vector = cfg->vector;
if (old_vector) {
cpumask_t tmp;
cpus_and(tmp, cfg->domain, mask);
if (!cpus_empty(tmp))
return 0;
- cpus_and(old_mask, cfg->domain, cpu_online_map);
}
for_each_cpu_mask(cpu, mask) {
cpumask_t domain, new_mask;
- int new_cpu, old_cpu;
+ int new_cpu;
int vector, offset;
domain = vector_allocation_domain(cpu);
@@ -699,8 +704,10 @@ next:
/* Found one! */
current_vector = vector;
current_offset = offset;
- for_each_cpu_mask(old_cpu, old_mask)
- per_cpu(vector_irq, old_cpu)[old_vector] = -1;
+ if (old_vector) {
+ cfg->move_in_progress = 1;
+ cfg->old_domain = cfg->domain;
+ }
for_each_cpu_mask(new_cpu, new_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
cfg->vector = vector;
@@ -1360,8 +1367,68 @@ static int ioapic_retrigger_irq(unsigned int irq)
* races.
*/
+#ifdef CONFIG_SMP
+asmlinkage void smp_irq_move_cleanup_interrupt(void)
+{
+ unsigned vector, me;
+ ack_APIC_irq();
+ exit_idle();
+ irq_enter();
+
+ me = smp_processor_id();
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+ unsigned int irq;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ irq = __get_cpu_var(vector_irq)[vector];
+ if (irq >= NR_IRQS)
+ continue;
+
+ desc = irq_desc + irq;
+ cfg = irq_cfg + irq;
+ spin_lock(&desc->lock);
+ if (!cfg->move_cleanup_count)
+ goto unlock;
+
+ if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
+ goto unlock;
+
+ __get_cpu_var(vector_irq)[vector] = -1;
+ cfg->move_cleanup_count--;
+unlock:
+ spin_unlock(&desc->lock);
+ }
+
+ irq_exit();
+}
+
+static void irq_complete_move(unsigned int irq)
+{
+ struct irq_cfg *cfg = irq_cfg + irq;
+ unsigned vector, me;
+
+ if (likely(!cfg->move_in_progress))
+ return;
+
+ vector = ~get_irq_regs()->orig_rax;
+ me = smp_processor_id();
+ if ((vector == cfg->vector) &&
+ cpu_isset(smp_processor_id(), cfg->domain)) {
+ cpumask_t cleanup_mask;
+
+ cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+ cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+ send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ cfg->move_in_progress = 0;
+ }
+}
+#else
+static inline void irq_complete_move(unsigned int irq) {}
+#endif
+
static void ack_apic_edge(unsigned int irq)
{
+ irq_complete_move(irq);
move_native_irq(irq);
ack_APIC_irq();
}
@@ -1370,6 +1437,7 @@ static void ack_apic_level(unsigned int irq)
{
int do_unmask_irq = 0;
+ irq_complete_move(irq);
#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
/* If we are moving the irq we need to mask it */
if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h
index dc395ed..2e4b7a5 100644
--- a/include/asm-x86_64/hw_irq.h
+++ b/include/asm-x86_64/hw_irq.h
@@ -32,10 +32,15 @@
#define IA32_SYSCALL_VECTOR 0x80
+/* Reserve the lowest usable priority level 0x20 - 0x2f for triggering
+ * cleanup after irq migration.
+ */
+#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
+
/*
* Vectors 0x20-0x2f are used for ISA interrupts.
*/
-#define IRQ0_VECTOR FIRST_EXTERNAL_VECTOR
+#define IRQ0_VECTOR FIRST_EXTERNAL_VECTOR + 0x10
#define IRQ1_VECTOR IRQ0_VECTOR + 1
#define IRQ2_VECTOR IRQ0_VECTOR + 2
#define IRQ3_VECTOR IRQ0_VECTOR + 3
@@ -82,7 +87,7 @@
/*
* First APIC vector available to drivers: (vectors 0x30-0xee)
- * we start at 0x31 to spread out vectors evenly between priority
+ * we start at 0x41 to spread out vectors evenly between priority
* levels. (0x80 is the syscall vector)
*/
#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
--
1.5.0.g53756
move_native_irq tries to do the right thing when migrating irqs
by disabling them. However disabling them is a software logical
thing, not a hardware thing. This has always been a little flaky
and after Ingo's latest round of changes it is guaranteed to not
mask the apic.
So this patch fixes move_native_irq to directly call the mask and
unmask chip methods to guarantee that we mask the irq when we
are migrating it. We must do this as it is required by
all code that calls into the path.
Since we don't know the masked status when IRQ_DISABLED is
set, we will not be able to restore it. The patch makes the code
just give up and try again the next time this routine is called.
Signed-off-by: Eric W. Biederman <[email protected]>
---
kernel/irq/migration.c | 9 ++++-----
1 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 4baa3bb..77b7acc 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -65,12 +65,11 @@ void move_native_irq(int irq)
if (likely(!(desc->status & IRQ_MOVE_PENDING)))
return;
- if (likely(!(desc->status & IRQ_DISABLED)))
- desc->chip->disable(irq);
+ if (unlikely(desc->status & IRQ_DISABLED))
+ return;
+ desc->chip->mask(irq);
move_masked_irq(irq);
-
- if (likely(!(desc->status & IRQ_DISABLED)))
- desc->chip->enable(irq);
+ desc->chip->unmask(irq);
}
--
1.5.0.g53756
After writing this up and sending out the email it occurred to me this
information should be kept someplace a little more permanent, so the
next person who cares won't have to get a huge pile of test machines
and test to understand what doesn't work.
A bunch of this is in my other changelog entries in the patches I
just posted but not all of it.
Signed-off-by: Eric W. Biederman <[email protected]>
---
Documentation/x86_64/IO-APIC-what-works.txt | 109 +++++++++++++++++++++++++++
1 files changed, 109 insertions(+), 0 deletions(-)
create mode 100644 Documentation/x86_64/IO-APIC-what-works.txt
diff --git a/Documentation/x86_64/IO-APIC-what-works.txt b/Documentation/x86_64/IO-APIC-what-works.txt
new file mode 100644
index 0000000..40fa61f
--- /dev/null
+++ b/Documentation/x86_64/IO-APIC-what-works.txt
@@ -0,0 +1,109 @@
+23 Feb 2007
+
+Ok. This is just an email to summarize my findings after investigating
+the ioapic programming.
+
+The ioapics on the E75xx chipset do have issues if you attempt to
+reprogram them outside of the irq handler. I have on several
+instances caused the state machine to get stuck such that an
+individual ioapic entry was no longer capable of delivering
+interrupts. I suspect the remote IRR bit was stuck on, such that
+switching the irq to edge triggered and back to level triggered
+would not clear it, but I did not confirm this. I just know that I
+was switching the irq between level and edge triggered with the irq
+masked and the irq did not fire.
+
+
+The ioapics on the AMD 8xxx chipset do have issues if you attempt
+to reprogram them outside of the irq handler. I wound up with
+remote IRR set and never clearing. But by temporarily switching
+the irq to edge triggered while it was masked I could clear
+this condition.
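+
+In rough pseudo-C that workaround looks something like this (the
+entry accessors here are illustrative names, not the real kernel
+interfaces; only the trigger and mask bits of the redirection
+entry are touched):
+
+	struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin);
+	entry.mask = 1;			/* keep the pin masked throughout */
+	ioapic_write_entry(apic, pin, entry);
+	entry.trigger = 0;		/* edge triggered: drops remote IRR */
+	ioapic_write_entry(apic, pin, entry);
+	entry.trigger = 1;		/* back to level triggered */
+	ioapic_write_entry(apic, pin, entry);
+	entry.mask = 0;			/* unmask the pin again */
+	ioapic_write_entry(apic, pin, entry);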
+
+I could not hit verifiable bugs in the ioapics on the Nforce4
+chipset. It's amazing: one part of that chipset that I can't find
+issues with.
+
+
+
+I did find an algorithm that will work successfully for migrating
+IRQs in process context if you have an ioapic that will follow pci
+ordering rules. In particular, the properties that the algorithm
+depends on are reads guaranteeing that outstanding writes are
+flushed, and in this context irqs in flight are considered writes.
+I have assumed that to devices outside of the cpu asic the cpu and
+the local apic appear as the same device.
+
+The algorithm was:
+- Be running with interrupts enabled in process context.
+- Mask the ioapic.
+- Read the ioapic to flush outstanding writes to the local apic.
+- Read the local apic to flush outstanding irqs to be sent to the cpu.
+
+- Now that all of the irqs have been delivered and the irq is masked,
+  that irq is finally quiescent.
+
+- With the irq quiescent it is safe to reprogram the interrupt
+  controller and the irq reception data structures.
+
+There were a lot more details but that was the essence.
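+
+A minimal sketch of that sequence in pseudo-C, assuming an ioapic
+that follows pci ordering rules (all helper names here are
+illustrative, not the real kernel interfaces):
+
+	/* process context, irqs enabled */
+	mask_ioapic_pin(apic, pin);
+	(void)ioapic_read(apic, pin);	/* flush irqs posted to the local apic */
+	(void)apic_read(APIC_ID);	/* flush irqs posted to the cpu */
+	/* the pin is now quiescent; safe to rewrite the redirection
+	   entry and the per cpu vector tables */
+	reprogram_irq(irq, new_cpu, new_vector);
+	unmask_ioapic_pin(apic, pin);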
+
+What I discovered was that except on the nforce chipset masking the
+ioapic and then issuing a read did not behave as if the interrupts
+were flushed to the local apic.
+
+I did not look closely enough to tell if local apics suffered from
+this issue. With local apics at least a read was necessary before
+you could guarantee the local apic would deliver pending irqs. A
+work around on the local apics is to simply issue a low priority
+interrupt as an IPI and wait for it to be processed. This
+guarantees that all higher priority interrupts have been flushed
+from the apic, and that the local apic has processed interrupts.
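+
+A sketch of that work around, again with illustrative names (the
+flush vector and the completion flag are assumptions made for the
+example, not existing kernel symbols):
+
+	static volatile int flush_seen;	/* set by the FLUSH_VECTOR handler */
+
+	apic_wait_icr_idle();
+	apic_write(APIC_ICR, APIC_DEST_SELF | APIC_DM_FIXED | FLUSH_VECTOR);
+	while (!flush_seen)		/* once this fires, every higher */
+		cpu_relax();		/* priority pending irq has run */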
+
+For ioapics, because they cannot be stimulated to send any irq by
+stimulation from the cpu side, no similar work around was possible.
+
+
+
+** Conclusions.
+
+*IRQs must be reprogrammed in interrupt context.
+
+The result of this investigation is that I am convinced we need
+to perform the irq migration activities in interrupt context although
+I am not convinced it is completely safe. I suspect multiple irqs
+firing closely enough to each other may hit the same issues as
+migrating irqs from process context. However the odds are on our
+side when we are in irq context.
+
+The reasoning for this is simply that:
+- Before we can safely reprogram a level triggered irq its remote
+  irr bit must be cleared by the irq being acknowledged.
+
+- There is no generally effective way short of receiving an
+  additional irq to ensure that the irq handler has run. Polling
+  the ioapic's remote irr bit does not work.
+
+
+* The CPU hotplug code is currently very buggy.
+
+Irq migration in the cpu hotplug case is a serious problem. If we can
+only safely migrate irqs from interrupt context and we cannot control
+when those interrupts fire, then we cannot bound the amount of time it
+will take to migrate the irqs away from a cpu. The cpu hotplug code
+currently calls chip->set_affinity directly, which is wrong, as it
+does not take the necessary locks and it does not attempt to delay
+execution until we are in process context.
+
+* Only an additional irq can signal the completion of an irq movement.
+
+The attempt to rebuild the irq migration code from first principles
+did bear some fruit. I asked the question: "When is it safe to tear
+down the data structures for irq movement?". The only answer I have
+is when I have received an irq that provably arrived after the irq
+was reprogrammed. This is because the only way I can reliably
+synchronize with irq delivery from an apic is to receive an
+additional irq.
+
+Currently this is a problem both for cpu hotplug on x86_64 and i386
+and for general irq migration on x86_64.
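+
+A rough sketch of the receiving side of that idea, matching the
+IRQ_MOVE_CLEANUP_VECTOR IPI and the irq_cfg fields from the earlier
+patch (condensed: the handler name is for illustration and the
+descriptor locking and bounds checks are elided):
+
+	asmlinkage void smp_irq_move_cleanup_interrupt(void)
+	{
+		unsigned vector, me = smp_processor_id();
+
+		ack_APIC_irq();
+		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+			int irq = __get_cpu_var(vector_irq)[vector];
+			struct irq_cfg *cfg;
+
+			if (irq < 0)
+				continue;
+			cfg = irq_cfg + irq;
+			if (!cfg->move_cleanup_count)
+				continue;
+			/* leave the vector the irq now lives on alone */
+			if (vector == cfg->vector && cpu_isset(me, cfg->domain))
+				continue;
+			__get_cpu_var(vector_irq)[vector] = -1;
+			cfg->move_cleanup_count--;
+		}
+	}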
--
1.5.0.g53756
Eric W. Biederman wrote:
>
>
>** Conclusions.
>
>*IRQs must be reprogrammed in interrupt context.
>
>The result of this investigation is that I am convinced we need
>to perform the irq migration activities in interrupt context although
>I am not convinced it is completely safe. I suspect multiple irqs
>firing closely enough to each other may hit the same issues as
>migrating irqs from process context. However the odds are on our
>side when we are in irq context.
>
>
>
In my older days of programming 82489DX chipsets (the AMD APIC
versions resemble the 82489DX more closely than Intel's newer
incarnations), you had to EOI the apic early if you wanted to
migrate interrupt assignments. I had to do the following steps
to move an IRQ:
1. Mask the LOCAL APIC
2. EOI the interrupt
3. Leave the interrupt entry masked until the ISR completed.
4. Reprogram the interrupt.
5. Unmask as the ISR exits
In other words, EOI early in all cases to clear the local and IOAPIC state.
Jeff
"Jeff V. Merkey" <[email protected]> writes:
> In my older days of programming 82489DX chipsets (the AMD APIC
> versions resemble the 82489DX more closely than Intel's newer
> incarnations), you had to EOI the apic early if you wanted to
> migrate interrupt assignments. I had to do the following steps
> to move an IRQ:
>
> 1. Mask the LOCAL APIC
> 2. EOI the interrupt
> 3. Leave the interrupt entry masked until the ISR completed.
> 4. Reprogram the interrupt.
> 5. Unmask as the ISR exits
>
> In other words, EOI early in all cases to clear the local and IOAPIC state.
Thanks.
That is the essence of what I am doing with level triggered interrupts.
- Mask
- ACK
- Reprogram
- Unmask.
Which essentially comes from the definition of how level triggered
interrupts operate in the ioapics.
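A minimal sketch of that sequence against the generic irq_chip
methods (locking and vector bookkeeping omitted; new_dest and the
direct set_affinity call are just for illustration):

	struct irq_desc *desc = irq_desc + irq;

	desc->chip->mask(irq);				/* 1. mask */
	ack_APIC_irq();					/* 2. ack clears remote IRR */
	desc->chip->set_affinity(irq, new_dest);	/* 3. reprogram */
	desc->chip->unmask(irq);			/* 4. unmask */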
Having to run the EOI before the ISR routine runs, or having to keep
the irq masked while the ISR runs, isn't something that I have
encountered. And if it was a problem there is enough volume that I
expect someone would have gotten a bug report about it by now.
Eric
Here's the include file that goes with it.
Jeff
#include "types.h"
#include "emit.h"
#define _82489APIC_MASK 0x000000F0
#define APIC_IO_REG 0x00000000
#define APIC_IO_DATA 0x00000004
// APIC registers are 128 bit aligned. accesses are offset * 4
#define APIC_TASKPRI (4 * 0x00000008)
#define APIC_ID (4 * 0x00000002)
#define APIC_VERS (4 * 0x00000003)
#define APIC_LDEST (4 * 0x0000000D)
#define APIC_EOI (4 * 0x0000000B)
#define APIC_DESTFMT (4 * 0x0000000E)
#define APIC_SPUR (4 * 0x0000000F)
#define APIC_IRR0 (4 * 0x00000020)
#define APIC_ICMD (4 * 0x00000030)
#define APIC_ICMD2 (4 * 0x00000031)
#define APIC_LVT_TIMER (4 * 0x00000032)
#define APIC_LVT_I0 (4 * 0x00000035)
#define APIC_LVT_I1 (4 * 0x00000036)
#define APIC_ICOUNT (4 * 0x00000038)
#define APIC_CCOUNT (4 * 0x00000039)
// APIc command values
#define APIC_REG_ID 0x00000000
#define APIC_REG_RDT 0x00000010
#define APIC_REG_RDT2 0x00000011
#define APIC_VALUE_MASK 0x00010000
#define APIC_VALUE_TOALL 0x7FFFFFFF
#define APIC_LOGDEST(c) (0x40000000 >> (c))
#define APIC_VALUE_IM_OFF 0x80000000
#define APIC_VALUE_FIXED 0x00000000
#define APIC_VALUE_LOPRI 0x00000100
#define APIC_VALUE_NMI 0x00000400
#define APIC_VALUE_RESET 0x00000500
#define APIC_VALUE_STARTUP 0x00000600
#define APIC_VALUE_EXTINT 0x00000700
#define APIC_VALUE_PENDING 0x00001000
#define APIC_VALUE_PDEST 0x00000000
#define APIC_VALUE_LDEST 0x00000800
#define APIC_VALUE_POLOW 0x00002000
#define APIC_VALUE_POHIGH 0x00000000
#define APIC_VALUE_ASSERT 0x00004000
#define APIC_VALUE_DEASSERT 0x00000000
#define APIC_VALUE_EDGE 0x00000000
#define APIC_VALUE_LEVEL 0x00008000
#define APIC_VALUE_XTOSELF 0x00040000
#define APIC_VALUE_XTOALL 0x00080000
// APIC timer init values
#define HERTZ 100
#define NANOSECOND_PULSE_RATE 90
#define APIC_CLKNUM ((1000000000/NANOSECOND_PULSE_RATE)/HERTZ)
#define APIC_ID_MASK 0x0F000000
#define APIC_ID_SHIFT 24
#define TIMER_VECTOR 0x00000028
// IOAPIC interrupt delivery modes
#define DELIVERY_MODE_MASK 0x00000700
#define DELIVER_FIXED 0x00000000
#define DELIVER_LOW_PRIORITY 0x00000100
#define DELIVER_SMI 0x00000200
#define DELIVER_REMOTE_READ 0x00000300
#define DELIVER_NMI 0x00000400
#define DELIVER_INIT 0x00000500
#define DELIVER_INIT_REASSERT_ALL 0x00088500
#define DELIVER_INIT_REASSERT_SELF 0x00048500
#define DELIVER_RESET_HOLD_ALL 0x0008C500
#define DELIVER_EXTINT 0x00000700
// APIC addressing mode values
#define PHYSICAL_DESTINATION 0x00000000
#define LOGICAL_DESTINATION 0x00000800
#define DELIVERY_PENDING 0x00001000
#define ACTIVE_LOW 0x00002000
#define REMOTE_IRR 0x00004000
#define LEVEL_TRIGGERED 0x00008000
#define INTERRUPT_MASKED 0x00010000
#define INTERRUPT_UNMASKED 0x00000000
#define PERIODIC_TIMER 0x00020000
#define TIMER_BASE_CLOCK 0x00000000
#define TIMER_BASE_TMBASE 0x00040000
#define TIMER_BASE_DIVIDER 0x00080000
#define ICR_LEVEL_ASSERTED 0x00004000
#define ICR_RR_STATUS_MASK 0x00030000
#define ICR_RR_INVALID 0x00000000
#define ICR_RR_IN_PROGRESS 0x00010000
#define ICR_RR_VALID 0x00020000
#define ICR_SHORTHAND_MASK 0x000C0000
#define ICR_USE_DEST_FIELD 0x00000000
#define ICR_SELF 0x00040000
#define ICR_ALL_INCL_SELF 0x00080000
#define ICR_ALL_EXCL_SELF 0x000C0000
#define LU_DEST_FORMAT_MASK 0xF0000000
#define LU_DEST_FORMAT_FLAT 0xFFFFFFFF
#define LU_UNIT_ENABLED 0x00000100
//
// 8259 PIC registers
//
#define MAX_PICS 3
#define PIC_0 0x20
#define PIC_1 0xA0
#define PIC_2 0x30
#define MASK_0 0x21
#define MASK_1 0xA1
#define MASK_2 0x31
#define PIC_EOI 0x20
#define MAX_INTS 64
#define CHAIN_LENGTH 16
//
// EISA polarity registers
//
#define EISA_POLARITY_REG 0x00000C0E
#define PIC1_ELCR_PORT 0x000004D0
#define PIC2_ELCR_PORT 0x000004D1
#define ELCR_EDGE 0
#define ELCR_LEVEL 1
/* Definitions for Intel PC+MP Platform Specification */
/* Misc. */
#define EBDA_BASE 0x0009FC00 /* base of EBDA default location 639k */
#define EBDA_PTR 0x0000040E /* pointer to base of EBDA segment */
#define BMEM_PTR 0x00000413 /* pointer to installed base memory in kbyte*/
#define CPQ_RESET_VECT 0x00000467 /* reset vector location */
/* PC+MP Interrupt Mode Control Registers */
#define IMCR_ADDR 0x22
#define IMCR_DATA 0x23
#define CMOSCTL 0x8F /* CMOS warm-boot addr */
#define CMOSWARM 0x0A /* CMOS warm-boot flag */
#define CMOS_ADDR 0x70
#define CMOS_DATA 0x71
#define MP_IMCRP 0x80 /* IMCR present */
#define MP_IMCRA 0x0 /* IMCR absent */
#define MP_DEF_TYPE 6 /* default table to use */
#define MP_DEF_IMCR MP_IMCRA /* default IMCRP to use */
/* types of entries (stored in bytes[0]) */
#define MP_ET_PROC 0 /* processor */
#define MP_ET_BUS 1 /* bus */
#define MP_ET_IOAPIC 2 /* i/o apic */
#define MP_ET_I_INTR 3 /* interrupt assignment -> i/o apic */
#define MP_ET_L_INTR 4 /* interrupt assignment -> local apic */
/* flag values for intr */
#define MP_INTR_POVAL 1
#define MP_INTR_POLOW 2
#define MP_INTR_ELVAL 4
#define MP_INTR_ELLEVEL 8
#define MAX_BUSES 16
#define MAX_IOAPICS 16
struct pcmp_fptr { /* PC+MP floating pointer structure */
BYTE sig[4]; /* signature "_MP_" */
LONG *paddr; /* physical address pointer to MP table */
BYTE len;
BYTE rev; /* table length in 16 byte; revision # */
BYTE checksum; /* checksum */
BYTE mp_byte[5]; /* MP feature byte 1: default system type */
}; /* MP feature byte 2: bit 7: IMCRP */
struct mpchdr {
BYTE sig[4]; /* signature "MPAT" */
WORD tbl_len; /* length of table */
BYTE spec_rev; /* MP+AT specification revision no. */
BYTE checksum;
BYTE oem_id[8];
BYTE product_id[12];
LONG *oem_ptr; /* pointer to oem table (optional) */
WORD oem_len; /* length of above table */
WORD num_entry; /* number of 'entry's to follow */
LONG loc_apic_adr; /* local apic physical address */
LONG reserved;
};
struct mpe_proc {
BYTE entry_type;
BYTE apic_id;
BYTE apic_vers;
BYTE cpu_flags;
LONG cpu_signature;
LONG features;
LONG reserved[2];
};
struct mpe_bus {
BYTE entry_type;
BYTE bus_id;
BYTE name[6];
};
struct mpe_ioapic {
BYTE entry_type;
BYTE apic_id;
BYTE apic_vers;
BYTE ioapic_flags;
LONG io_apic_adr;
};
struct mpe_intr {
BYTE entry_type;
BYTE intr_type;
WORD intr_flags;
BYTE src_bus;
BYTE src_irq;
BYTE dest_apicid;
BYTE dest_line;
};
struct mpe_local {
BYTE entry_type;
BYTE intr_type;
WORD intr_flags;
BYTE src_bus;
BYTE src_irq;
BYTE dest_apicid;
BYTE dest_line;
};
union mpcentry {
BYTE bytes[20];
struct mpe_proc p;
struct mpe_bus b;
struct mpe_ioapic a;
struct mpe_intr i;
struct mpe_local l;
};
struct mpconfig {
struct mpchdr hdr;
union mpcentry entry[1];
};
struct bus_data {
BYTE bus_id;
BYTE bus_type;
};
struct io_apic_state_array {
BYTE line_state[24];
};
struct intr_table {
LONG line;
LONG io_apicid;
LONG dev;
LONG bus;
LONG use;
};
typedef struct _IOAPIC_IDS {
LONG address;
LONG lnum;
} IOAPIC_IDS;
//
// Intel mps apic external data
//
// MPS 1.0 standard tables
extern LONG mps_size[];
extern BYTE mps_default_table_1[];
extern BYTE mps_default_table_2[];
extern BYTE mps_default_table_3[];
extern BYTE mps_default_table_4[];
extern BYTE mps_default_table_5[];
extern BYTE mps_default_table_6[];
extern BYTE mps_default_table_7[];
extern BYTE mps_default_table_8[];
extern LONG apic_defaults[];
extern BYTE *bus_strings[];
extern BYTE *bus_display_strings[];
extern BYTE vector_table[];
extern struct intr_table int_table[];
extern BYTE irq_control[];
extern BYTE irq_mask[];
extern BYTE mask_value[];
extern LONG elcr_flags;
extern LONG mps_present;
extern LONG pcmp_fib1;
extern LONG pcmp_fib2;
extern struct pcmp_fptr *mps_fp;
extern struct mpchdr *vendor_table;
extern LONG num_buses;
extern LONG num_procs;
extern LONG num_ioapics;
extern struct mpe_ioapic *io_apic_addr[];
extern struct io_apic_state_array io_apic_state[];
extern IOAPIC_IDS io_apic_ids[];
extern LONG io_apic_entry_num[];
extern LONG io_apic_nlines[];
extern struct bus_data bus_info[];
extern struct mpe_proc *proc_id[];
extern LONG local_apic_address;
extern LONG processor_mask;
extern LONG warm_reset_vector;
//
// Intel mps apic functions
//
extern LONG mps_find_fp(LONG begin, LONG end);
extern LONG mps_locate(void);
extern void mps_ints(void);
extern LONG mps_ioapics(void);
extern LONG mps_procs(void);
extern void mps_buses(void);
extern LONG MPSDetect(void);
extern void apic_eoi(LONG intr);
extern LONG apic_xcall(LONG proc, LONG command, LONG type);
extern LONG apic_init(LONG proc);
extern void apic_close(LONG proc);
extern void write_boot_vector(LONG addr);
extern void apic_timer_start(void);
extern void apic_timer_stop(void);
extern void disable_ioapic_ints(void);
extern void enable_ioapic_ints(void);
extern LONG apic_activate_processor(LONG proc, LONG addr);
extern void dump_int_table(SCREEN *);
extern void dump_ioapic(SCREEN *, LONG num);
extern void dump_local_apic(SCREEN *);
extern void dump_remote_apic(SCREEN *, LONG proc);
extern LONG get_apic_id(void);
extern void apic_mask_timer(void);
extern void apic_unmask_timer(void);
extern void configure_eisa_el(LONG intr, LONG mode);
extern LONG apic_set_int(LONG intr, LONG proc, LONG mode, LONG share);
extern LONG apic_clear_int(LONG intr);
extern LONG apic_mask_int(LONG intr);
extern LONG apic_unmask_int(LONG intr);
extern LONG apic_directed_nmi(LONG proc);
extern void program_8254(void);
//
// internal 8259 PIC HAL section
//
extern void unmask_pic(LONG intr);
extern void mask_pic(LONG intr);
extern void mask_pic_timer(void);
extern void unmask_pic_timer(void);
extern void mask_system(void);
extern void unmask_system(void);
extern LONG pic_set_int(LONG intr, LONG proc, LONG mode, LONG share);
extern LONG pic_clear_int(LONG intr);
extern void pic_eoi(LONG intr);
Eric,
Please find attached the APIC code I used in Gadugi. It's code for plain
vanilla APICs, but does just this. This code not only allows
interrupts to be migrated, but processors to be stopped and restarted on
the fly without system interruption. You may find some useful
ideas in it.
Jeff
#include "stdarg.h"
#include "stdio.h"
#include "stdlib.h"
#include "ctype.h"
#include "string.h"
#include "kernel.h"
#include "keyboard.h"
#include "screen.h"
#include "types.h"
#include "os.h"
#include "mps.h"
#include "event.h"
#define MPS_VERBOSE 0
// MPS 1.0 standard tables
BYTE mps_default_table_1[] = {
'P','C','M','P', /* signature */
0,0, 1, 0, /* length, spec_rev, checksum */
0,0,0,0, 0,0,0,0, /* oem id string */
0,0,0,0, 0,0,0,0, 0,0,0,0, /* product id string */
0,0,0,0, /* oem table pointer */
0,0, 2+1+1+16+2,0, /* entry count */
0x00,0x00,0xe0,0xfe, /* address of local apic */
0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
0, /* apic id == 0 */
1, /* apic type - 82489DX */
1, /* enable */
0,0,0,0, /* stepping, model, family, type */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
1, /* apic id == 1 */
1, /* apic type - 82489DX */
1, /* enable */
0,0,0,0, /* stepping, model, family, type */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_BUS, /* bus */
0, /* bus id */
'I','S','A',' ',
' ',' ',
MP_ET_IOAPIC, /* i/o apic */
2, /* apic id */
1, /* apic type - 82489DX (not used) */
1, /* enabled */
0x00,0x00,0xc0,0xfe, /* address of i/o apic */
MP_ET_I_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all i/o apics, line 0 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,1,2,1, /* src(0,1) -> i/o apic with id=2, line 1 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,2,2,1, /* src(0,2) -> i/o apic with id=2, line 2 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,3,2,3, /* src(0,3) -> i/o apic with id=2, line 3 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,4,2,4, /* src(0,4) -> i/o apic with id=2, line 4 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,5,2,5, /* src(0,5) -> i/o apic with id=2, line 5 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,6,2,6, /* src(0,6) -> i/o apic with id=2, line 6 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,7,2,7, /* src(0,7) -> i/o apic with id=2, line 7 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,8,2,8, /* src(0,8) -> i/o apic with id=2, line 8 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,9,2,9, /* src(0,9) -> i/o apic with id=2, line 9 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,10,2,10, /* src(0,10) -> i/o apic with id=2, line 10 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,11,2,11, /* src(0,11) -> i/o apic with id=2, line 11 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,12,2,12, /* src(0,12) -> i/o apic with id=2, line 12 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,13,2,12, /* src(0,13) -> i/o apic with id=2, line 13 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,14,2,14, /* src(0,14) -> i/o apic with id=2, line 14 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,15,2,15, /* src(0,15) -> i/o apic with id=2, line 15 */
MP_ET_L_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all local apics, line 0 */
MP_ET_L_INTR,1,0,0, /* NMI */
0,0,0xff,1, /* src(0,0) -> all local apics, line 1 */
};
BYTE mps_default_table_2[] = {
'P','C','M','P', /* signature */
0,0, 1, 0, /* length, spec_rev, checksum */
0,0,0,0, 0,0,0,0, /* oem id string */
0,0,0,0, 0,0,0,0, 0,0,0,0, /* product id string */
0,0,0,0, /* oem table pointer */
0,0, 2+1+1+14+2,0, /* entry count */
0x00,0x00,0xe0,0xfe, /* address of local apic */
0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
0, /* apic id == 0 */
1, /* apic type - 82489DX */
1, /* enable */
0,0,0,0, /* stepping, model, family, type */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
1, /* apic id == 1 */
1, /* apic type - 82489DX */
1, /* enable */
0,0,0,0, /* stepping, model, family, type */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_BUS, /* bus */
0, /* bus id */
'E','I','S','A',
' ',' ',
MP_ET_IOAPIC, /* i/o apic */
2, /* apic id */
1, /* apic type - 82489DX (not used) */
1, /* enabled */
0x00,0x00,0xc0,0xfe, /* address of i/o apic */
MP_ET_I_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all i/o apics, line 0 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,1,2,1, /* src(0,1) -> i/o apic with id=2, line 1 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,3,2,3, /* src(0,3) -> i/o apic with id=2, line 3 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,4,2,4, /* src(0,4) -> i/o apic with id=2, line 4 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,5,2,5, /* src(0,5) -> i/o apic with id=2, line 5 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,6,2,6, /* src(0,6) -> i/o apic with id=2, line 6 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,7,2,7, /* src(0,7) -> i/o apic with id=2, line 7 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,8,2,8, /* src(0,8) -> i/o apic with id=2, line 8 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,9,2,9, /* src(0,9) -> i/o apic with id=2, line 9 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,10,2,10, /* src(0,10) -> i/o apic with id=2, line 10 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,11,2,11, /* src(0,11) -> i/o apic with id=2, line 11 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,12,2,12, /* src(0,12) -> i/o apic with id=2, line 12 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,14,2,14, /* src(0,14) -> i/o apic with id=2, line 14 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,15,2,15, /* src(0,15) -> i/o apic with id=2, line 15 */
MP_ET_L_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all local apics, line 0 */
MP_ET_L_INTR,1,0,0, /* NMI */
0,0,0xff,1, /* src(0,0) -> all local apics, line 1 */
};
BYTE mps_default_table_3[] = {
'P','C','M','P', /* signature */
0,0, 1, 0, /* length, spec_rev, checksum */
0,0,0,0, 0,0,0,0, /* oem id string */
0,0,0,0, 0,0,0,0, 0,0,0,0, /* product id string */
0,0,0,0, /* oem table pointer */
0,0, 2+1+1+16+2,0, /* entry count */
0x00,0x00,0xe0,0xfe, /* address of local apic */
0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
0, /* apic id == 0 */
1, /* apic type - 82489DX */
1, /* enable */
0,0,0,0, /* stepping, model, family, type */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
1, /* apic id == 1 */
1, /* apic type - 82489DX */
1, /* enable */
0,0,0,0, /* stepping, model, family, type */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_BUS, /* bus */
0, /* bus id */
'E','I','S','A',
' ',' ',
MP_ET_IOAPIC, /* i/o apic */
2, /* apic id */
1, /* apic type - 82489DX (not used) */
1, /* enabled */
0x00,0x00,0xc0,0xfe, /* address of i/o apic */
MP_ET_I_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all i/o apics, line 0 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,1,2,1, /* src(0,1) -> i/o apic with id=2, line 1 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,2,2,1, /* src(0,2) -> i/o apic with id=2, line 2 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,3,2,3, /* src(0,3) -> i/o apic with id=2, line 3 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,4,2,4, /* src(0,4) -> i/o apic with id=2, line 4 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,5,2,5, /* src(0,5) -> i/o apic with id=2, line 5 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,6,2,6, /* src(0,6) -> i/o apic with id=2, line 6 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,7,2,7, /* src(0,7) -> i/o apic with id=2, line 7 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,8,2,8, /* src(0,8) -> i/o apic with id=2, line 8 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,9,2,9, /* src(0,9) -> i/o apic with id=2, line 9 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,10,2,10, /* src(0,10) -> i/o apic with id=2, line 10 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,11,2,11, /* src(0,11) -> i/o apic with id=2, line 11 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,12,2,12, /* src(0,12) -> i/o apic with id=2, line 12 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,13,2,12, /* src(0,13) -> i/o apic with id=2, line 13 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,14,2,14, /* src(0,14) -> i/o apic with id=2, line 14 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,15,2,15, /* src(0,15) -> i/o apic with id=2, line 15 */
MP_ET_L_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all local apics, line 0 */
MP_ET_L_INTR,1,0,0, /* NMI */
0,0,0xff,1, /* src(0,0) -> all local apics, line 1 */
};
BYTE mps_default_table_4[] = {
'P','C','M','P', /* signature */
0,0, 1, 0, /* length, spec_rev, checksum */
0,0,0,0, 0,0,0,0, /* oem id string */
0,0,0,0, 0,0,0,0, 0,0,0,0, /* product id string */
0,0,0,0, /* oem table pointer */
0,0, 2+1+1+16+2,0, /* entry count */
0x00,0x00,0xe0,0xfe, /* address of local apic */
0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
0, /* apic id == 0 */
1, /* apic type - 82489DX */
1, /* enable */
0,0,0,0, /* stepping, model, family, type */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
1, /* apic id == 1 */
1, /* apic type - 82489DX */
1, /* enable */
0,0,0,0, /* stepping, model, family, type */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_BUS, /* bus */
0, /* bus id */
'M','C','A',' ',
' ',' ',
MP_ET_IOAPIC, /* i/o apic */
2, /* apic id */
1, /* apic type - 82489DX (not used) */
1, /* enabled */
0x00,0x00,0xc0,0xfe, /* address of i/o apic */
MP_ET_I_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all i/o apics, line 0 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,1,2,1, /* src(0,1) -> i/o apic with id=2, line 1 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,2,2,1, /* src(0,2) -> i/o apic with id=2, line 2 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,3,2,3, /* src(0,3) -> i/o apic with id=2, line 3 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,4,2,4, /* src(0,4) -> i/o apic with id=2, line 4 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,5,2,5, /* src(0,5) -> i/o apic with id=2, line 5 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,6,2,6, /* src(0,6) -> i/o apic with id=2, line 6 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,7,2,7, /* src(0,7) -> i/o apic with id=2, line 7 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,8,2,8, /* src(0,8) -> i/o apic with id=2, line 8 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,9,2,9, /* src(0,9) -> i/o apic with id=2, line 9 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,10,2,10, /* src(0,10) -> i/o apic with id=2, line 10 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,11,2,11, /* src(0,11) -> i/o apic with id=2, line 11 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,12,2,12, /* src(0,12) -> i/o apic with id=2, line 12 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,13,2,12, /* src(0,13) -> i/o apic with id=2, line 13 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,14,2,14, /* src(0,14) -> i/o apic with id=2, line 14 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,15,2,15, /* src(0,15) -> i/o apic with id=2, line 15 */
MP_ET_L_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all local apics, line 0 */
MP_ET_L_INTR,1,0,0, /* NMI */
0,0,0xff,1, /* src(0,0) -> all local apics, line 1 */
};
BYTE mps_default_table_5[] = {
'P','C','M','P', /* signature */
0,0, 1, 0, /* length, spec_rev, checksum */
0,0,0,0, 0,0,0,0, /* oem id string */
0,0,0,0, 0,0,0,0, 0,0,0,0, /* product id string */
0,0,0,0, /* oem table pointer */
0,0, 2+2+1+16+2,0, /* entry count */
0x00,0x00,0xe0,0xfe, /* address of local apic */
0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
0, /* apic id == 0 */
0x10, /* apic type - Integrated */
1, /* enable */
0,0,0,0, /* stepping, model, family, type */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
1, /* apic id == 1 */
0x10, /* apic type - Integrated */
1, /* enable */
0,0x20,0,0, /* stepping, model, family, type=CM */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_BUS, /* bus */
0, /* bus id */
'I','S','A',' ',
' ',' ',
MP_ET_BUS, /* bus */
0, /* bus id */
'P','C','I',' ',
' ',' ',
MP_ET_IOAPIC, /* i/o apic */
2, /* apic id */
0x10, /* apic type - Integrated (not used) */
1, /* enabled */
0x00,0x00,0xc0,0xfe, /* address of i/o apic */
MP_ET_I_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all i/o apics, line 0 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,1,2,1, /* src(0,1) -> i/o apic with id=2, line 1 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,2,2,1, /* src(0,2) -> i/o apic with id=2, line 2 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,3,2,3, /* src(0,3) -> i/o apic with id=2, line 3 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,4,2,4, /* src(0,4) -> i/o apic with id=2, line 4 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,5,2,5, /* src(0,5) -> i/o apic with id=2, line 5 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,6,2,6, /* src(0,6) -> i/o apic with id=2, line 6 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,7,2,7, /* src(0,7) -> i/o apic with id=2, line 7 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,8,2,8, /* src(0,8) -> i/o apic with id=2, line 8 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,9,2,9, /* src(0,9) -> i/o apic with id=2, line 9 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,10,2,10, /* src(0,10) -> i/o apic with id=2, line 10 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,11,2,11, /* src(0,11) -> i/o apic with id=2, line 11 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,12,2,12, /* src(0,12) -> i/o apic with id=2, line 12 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,13,2,12, /* src(0,13) -> i/o apic with id=2, line 13 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,14,2,14, /* src(0,14) -> i/o apic with id=2, line 14 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,15,2,15, /* src(0,15) -> i/o apic with id=2, line 15 */
MP_ET_L_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all local apics, line 0 */
MP_ET_L_INTR,1,0,0, /* NMI */
0,0,0xff,1, /* src(0,0) -> all local apics, line 1 */
};
BYTE mps_default_table_6[] = {
'P','C','M','P', /* signature */
0,0, 1, 0, /* length, spec_rev, checksum */
0,0,0,0, 0,0,0,0, /* oem id string */
0,0,0,0, 0,0,0,0, 0,0,0,0, /* product id string */
0,0,0,0, /* oem table pointer */
0,0, 2+2+1+16+2,0, /* entry count */
0x00,0x00,0xe0,0xfe, /* address of local apic */
0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
0, /* apic id == 0 */
0x10, /* apic type - Integrated */
1, /* enable */
0,0,0,0, /* stepping, model, family, type */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
1, /* apic id == 1 */
0x10, /* apic type - Integrated */
1, /* enable */
0,0x20,0,0, /* stepping, model, family, type=CM */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_BUS, /* bus */
0, /* bus id */
'E','I','S','A',
' ',' ',
MP_ET_BUS, /* bus */
0, /* bus id */
'P','C','I',' ',
' ',' ',
MP_ET_IOAPIC, /* i/o apic */
2, /* apic id */
0x10, /* apic type - Integrated (not used) */
1, /* enabled */
0x00,0x00,0xc0,0xfe, /* address of i/o apic */
MP_ET_I_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all i/o apics, line 0 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,1,2,1, /* src(0,1) -> i/o apic with id=2, line 1 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,2,2,1, /* src(0,2) -> i/o apic with id=2, line 2 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,3,2,3, /* src(0,3) -> i/o apic with id=2, line 3 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,4,2,4, /* src(0,4) -> i/o apic with id=2, line 4 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,5,2,5, /* src(0,5) -> i/o apic with id=2, line 5 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,6,2,6, /* src(0,6) -> i/o apic with id=2, line 6 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,7,2,7, /* src(0,7) -> i/o apic with id=2, line 7 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,8,2,8, /* src(0,8) -> i/o apic with id=2, line 8 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,9,2,9, /* src(0,9) -> i/o apic with id=2, line 9 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,10,2,10, /* src(0,10) -> i/o apic with id=2, line 10 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,11,2,11, /* src(0,11) -> i/o apic with id=2, line 11 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,12,2,12, /* src(0,12) -> i/o apic with id=2, line 12 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,13,2,12, /* src(0,13) -> i/o apic with id=2, line 13 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,14,2,14, /* src(0,14) -> i/o apic with id=2, line 14 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,15,2,15, /* src(0,15) -> i/o apic with id=2, line 15 */
MP_ET_L_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all local apics, line 0 */
MP_ET_L_INTR,1,0,0, /* NMI */
0,0,0xff,1, /* src(0,0) -> all local apics, line 1 */
};
BYTE mps_default_table_7[] = {
'P','C','M','P', /* signature */
0,0, 1, 0, /* length, spec_rev, checksum */
0,0,0,0, 0,0,0,0, /* oem id string */
0,0,0,0, 0,0,0,0, 0,0,0,0, /* product id string */
0,0,0,0, /* oem table pointer */
0,0, 2+2+1+15+2,0, /* entry count */
0x00,0x00,0xe0,0xfe, /* address of local apic */
0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
0, /* apic id == 0 */
0x10, /* apic type - Integrated */
1, /* enable */
0,0,0,0, /* stepping, model, family, type */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
1, /* apic id == 1 */
0x10, /* apic type - Integrated */
1, /* enable */
0,0x20,0,0, /* stepping, model, family, type */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_BUS, /* bus */
0, /* bus id */
'M','C','A',' ',
' ',' ',
MP_ET_BUS, /* bus */
0, /* bus id */
'P','C','I',' ',
' ',' ',
MP_ET_IOAPIC, /* i/o apic */
2, /* apic id */
0x10, /* apic type - Integrated (not used) */
1, /* enabled */
0x00,0x00,0xc0,0xfe, /* address of i/o apic */
MP_ET_I_INTR,0,0,0, /* INTR */
0,1,2,1, /* src(0,1) -> i/o apic with id=2, line 1 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,2,2,1, /* src(0,2) -> i/o apic with id=2, line 2 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,3,2,3, /* src(0,3) -> i/o apic with id=2, line 3 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,4,2,4, /* src(0,4) -> i/o apic with id=2, line 4 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,5,2,5, /* src(0,5) -> i/o apic with id=2, line 5 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,6,2,6, /* src(0,6) -> i/o apic with id=2, line 6 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,7,2,7, /* src(0,7) -> i/o apic with id=2, line 7 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,8,2,8, /* src(0,8) -> i/o apic with id=2, line 8 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,9,2,9, /* src(0,9) -> i/o apic with id=2, line 9 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,10,2,10, /* src(0,10) -> i/o apic with id=2, line 10 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,11,2,11, /* src(0,11) -> i/o apic with id=2, line 11 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,12,2,12, /* src(0,12) -> i/o apic with id=2, line 12 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,13,2,12, /* src(0,13) -> i/o apic with id=2, line 13 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,14,2,14, /* src(0,14) -> i/o apic with id=2, line 14 */
MP_ET_I_INTR,0,0,0, /* INTR */
0,15,2,15, /* src(0,15) -> i/o apic with id=2, line 15 */
MP_ET_L_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all local apics, line 0 */
MP_ET_L_INTR,1,0,0, /* NMI */
0,0,0xff,1, /* src(0,0) -> all local apics, line 1 */
};
/* NO_IO_APIC_OPTION */
BYTE mps_default_table_8[] = {
'P','C','M','P', /* signature */
0,0, 1, 0, /* length, spec_rev, checksum */
0,0,0,0, 0,0,0,0, /* oem id string */
0,0,0,0, 0,0,0,0, 0,0,0,0, /* product id string */
0,0,0,0, /* oem table pointer */
0,0, 2+1+0+0+2,0, /* entry count */
0x00,0x00,0xe0,0xfe, /* address of local apic */
0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
0, /* apic id == 0 */
0x10, /* apic type - Integrated */
1, /* enable */
0,0,0,0, /* stepping, model, family, type */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_PROC, /* processor */
1, /* apic id == 1 */
0x10, /* apic type - Integrated */
1, /* enable */
0,0x20,0,0, /* stepping, model, family, type=CM */
0,0,0,0, /* feature flags - not used */
0,0,0,0, 0,0,0,0, /* res. */
MP_ET_BUS, /* bus */
0, /* bus id */
'I','S','A',' ',
' ',' ',
MP_ET_L_INTR,3,0,0, /* PIC */
0,0,0xff,0, /* src(0,0) -> all local apics, line 0 */
MP_ET_L_INTR,1,0,0, /* NMI */
0,0,0xff,1, /* src(0,0) -> all local apics, line 1 */
};
LONG apic_defaults[] = {
0,
(LONG)mps_default_table_1,
(LONG)mps_default_table_2,
(LONG)mps_default_table_3,
(LONG)mps_default_table_4,
(LONG)mps_default_table_5,
(LONG)mps_default_table_6,
(LONG)mps_default_table_7,
(LONG)mps_default_table_8,
0
};
BYTE *bus_strings[]={
"ISA ",
"EISA ",
"MCA ",
"PCI ",
"CBUS ",
"CBUSII",
"FUTURE",
"INTERN",
"MBI ",
"MBII ",
"MPI ",
"MPSA ",
"NUBUS ",
"PCMCIA",
"TC ",
"VL ",
"VME ",
"XPRESS"
};
BYTE *bus_display_strings[]={
"ISA",
"EISA",
"MCA",
"PCI",
"CBUS",
"CBUSII",
"FUTURE",
"INTERN",
"MBI",
"MBII",
"MPI",
"MPSA",
"NUBUS",
"PCMCIA",
"TC",
"VL",
"VME",
"XPRESS"
};
BYTE vector_table[]={
0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, // IRQ 0-7
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, // IRQ 8-15
0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, // intr 16-23
0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, 0x90, 0x91, // intr 24-31
0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, // intr 32-39
0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, // intr 40-47
0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, // intr 48-55
0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, // intr 56-63
0xB2, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 // intr 64-65
};
LONG mps_size[]={
sizeof(struct mpe_proc),
sizeof(struct mpe_bus),
sizeof(struct mpe_ioapic),
sizeof(struct mpe_intr),
sizeof(struct mpe_local)
};
struct intr_table int_table[]={
-1, 0, 0, 0, 1, -1, 0, 0, 0, 1, -1, 0, 0, 0, 1, -1, 0, 0, 0, 1,
-1, 0, 0, 0, 1, -1, 0, 0, 0, 1, -1, 0, 0, 0, 1, -1, 0, 0, 0, 1,
-1, 0, 0, 0, 1, -1, 0, 0, 0, 1, -1, 0, 0, 0, 1, -1, 0, 0, 0, 1,
-1, 0, 0, 0, 1, -1, 0, 0, 0, 1, -1, 0, 0, 0, 1, -1, 0, 0, 0, 1,
-1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0,
-1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0,
-1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0,
-1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0,
-1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0,
-1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0,
-1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0,
-1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0,
-1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0,
-1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0,
-1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0,
-1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0,
};
LONG elcr_flags;
LONG mps_present;
LONG pcmp_fib1;
LONG pcmp_fib2;
struct pcmp_fptr *mps_fp;
struct mpchdr *vendor_table;
LONG num_buses;
LONG num_procs;
LONG num_ioapics;
LONG ioapic_locks[MAX_IOAPICS] = { 0, 0, 0, 0, 0, 0, 0, 0 };
struct mpe_ioapic *io_apic_addr[MAX_IOAPICS];
struct io_apic_state_array io_apic_state[MAX_IOAPICS];
IOAPIC_IDS io_apic_ids[MAX_IOAPICS];
LONG io_apic_entry_num[MAX_IOAPICS];
LONG io_apic_nlines[MAX_IOAPICS];
struct bus_data bus_info[MAX_IOAPICS];
struct mpe_proc *proc_id[MAX_IOAPICS];
LONG local_apic_address;
LONG processor_mask;
LONG warm_reset_vector;
BYTE pic_assign[MAX_INTS];
extern char *strncpy(char *dst, const char *src, LONG n);
LONG mps_find_fp(LONG begin, LONG end)
{
struct pcmp_fptr *fp, *endp;
BYTE *cp, sum = 0;
register int i;
endp = (struct pcmp_fptr *) (begin + (end - begin + 1) - sizeof(*fp));
for (fp = (struct pcmp_fptr *)begin; fp <= (struct pcmp_fptr *) endp; fp++)
{
if (fp->sig[0] == '_' && fp->sig[1] == 'M' &&
fp->sig[2] == 'P' && fp->sig[3] == '_')
{
cp = (BYTE *) fp;
for (i=0; i < sizeof(*fp); i++)
sum += *cp++;
if (sum == 0)
{
mps_present = 1;
vendor_table = (struct mpchdr *) fp->paddr;
pcmp_fib1 = (LONG) fp->mp_byte[0];
pcmp_fib2 = (LONG) fp->mp_byte[1];
mps_fp = fp;
return (LONG) (1);
}
}
}
return (LONG) (0);
}
LONG mps_locate(void)
{
LONG *ebda_addr, *bmem_addr;
LONG ebda_base, bmem_base;
mps_present = 0;
pcmp_fib1 = 0;
pcmp_fib2 = 0;
vendor_table = 0;
mps_fp = 0;
ebda_addr = (LONG *) EBDA_PTR; // Extended BIOS Data Area
ebda_base = *ebda_addr;
ebda_base = (ebda_base << 4) & 0xFFFFF;
if (ebda_base > 0x000A0000 || ebda_base < 0x0007F800)
ebda_base = 0;
bmem_addr = (LONG *) BMEM_PTR; // Base Memory
bmem_base = *bmem_addr;
bmem_base = (bmem_base * 1024) & 0xFFFFF;
if (bmem_base > 0x000A0000 || bmem_base < 0x0007F800)
bmem_base = 0;
if (ebda_base)
{
if (!mps_find_fp(ebda_base, ebda_base + 1023))
{
if (!mps_find_fp(0xF0000, 0xFFFFF))
{
printf("System does not support Intel MPS\n");
return (LONG) (0);
}
}
}
else
{
if (!bmem_base || !mps_find_fp(bmem_base, bmem_base + 1023))
{
if(!mps_find_fp(0xF0000, 0xFFFFF))
{
printf("System does not support Intel MPS\n");
return (LONG) (0);
}
}
}
if (mps_present)
{
if (vendor_table)
{
printf("Intel MPS version 1.%d, table type %d\n", mps_fp->rev, pcmp_fib1);
return 1;
}
if (pcmp_fib1)
{
vendor_table = (struct mpchdr *) apic_defaults[pcmp_fib1];
if (pcmp_fib1)
printf("Intel MPS version 1.%d, default table %d\n", mps_fp->rev,
pcmp_fib1);
else
printf("Intel MPS version 1.%d\n", mps_fp->rev);
return 1;
}
if (!pcmp_fib1)
{
mps_present = 1;
pcmp_fib1 = MP_DEF_TYPE;
pcmp_fib2 = MP_DEF_IMCR;
vendor_table = (struct mpchdr *) apic_defaults[MP_DEF_TYPE];
printf("Use default table %d\n", pcmp_fib1);
return 1;
}
}
mps_present = 1;
pcmp_fib1 = MP_DEF_TYPE;
pcmp_fib2 = MP_DEF_IMCR;
vendor_table = (struct mpchdr *) apic_defaults[MP_DEF_TYPE];
printf("Use default table %d\n", pcmp_fib1);
return 1;
}
void mps_ints(void)
{
union mpcentry *entry;
int num_entries, i, type, intr, r;
struct mpconfig *config;
config = (struct mpconfig *) vendor_table;
entry = config->entry;
num_entries = config->hdr.num_entry;
for (i=0; i < num_entries; i++)
{
type = entry->bytes[0];
if ((type == MP_ET_I_INTR) || (type == MP_ET_L_INTR))
{
if (!entry->i.intr_type)
{
if (bus_info[entry->i.src_bus & 0xF].bus_type == 3)
{
intr = -1;
for (r=0; r < 64; r++)
{
if (!int_table[r].use)
{
int_table[r].use = 1;
intr = r;
break;
}
}
if (intr == -1)
{
printf("MPS Table error -- FATAL\n");
return;
}
int_table[intr].io_apicid = entry->i.dest_apicid;
int_table[intr].line = entry->i.dest_line;
int_table[intr].dev = entry->i.src_irq;
int_table[intr].bus = entry->i.src_bus;
}
else
{
intr = (entry->i.src_irq & 0xF);
int_table[intr].io_apicid = entry->i.dest_apicid;
int_table[intr].line = entry->i.dest_line;
int_table[intr].dev = entry->i.src_irq;
int_table[intr].bus = entry->i.src_bus;
}
}
}
entry = (union mpcentry *)((LONG)entry + (LONG)mps_size[type]);
}
}
LONG mps_ioapics(void)
{
register int num_entries, num, i, type;
struct mpconfig *config;
union mpcentry *entry;
LONG *ioapic;
config = (struct mpconfig *) vendor_table;
entry = config->entry;
num_entries = config->hdr.num_entry;
num = 0;
for (i=0; i < num_entries; i++)
{
type = entry->bytes[0];
if (type >= MP_ET_IOAPIC)
{
if (type != MP_ET_IOAPIC)
break;
if (entry->a.ioapic_flags & 1)
{
io_apic_addr[num] = (struct mpe_ioapic *) entry;
io_apic_ids[io_apic_addr[num]->apic_id & 0xF].address =
io_apic_addr[num]->io_apic_adr;
io_apic_ids[io_apic_addr[num]->apic_id & 0xF].lnum = num;
num++;
if (num == 16)
break;
}
}
entry = (union mpcentry *) ((LONG)entry + (LONG)mps_size[type]);
}
if (!num)
printf("no I/O apics found\n");
for (i=0; i < num; i++)
{
// see if IOAPICs are 16 or 24 line
ioapic = (LONG *) io_apic_addr[i]->io_apic_adr;
// map ioapic address into page tables
map_address((LONG)ioapic >> 12, (LONG)ioapic >> 12);
map_address(((LONG)ioapic + 4096) >> 12, ((LONG)ioapic + 4096) >> 12);
ioapic[APIC_IO_REG] = 1;
io_apic_entry_num[i] = ioapic[APIC_IO_DATA];
io_apic_entry_num[i] = (io_apic_entry_num[i] >> 16) & 0x000000FF;
io_apic_nlines[i] = io_apic_entry_num[i];
#if MPS_VERBOSE
printf("io_apic addr: 0x%08X id: %02X lines: %02X (%s)\n",
io_apic_addr[i]->io_apic_adr,
io_apic_addr[i]->apic_id,
io_apic_nlines[i],
((io_apic_addr[i]->apic_vers >> 4) & 0x01) ? "Embedded" : "82489DX");
#endif
}
return (LONG) (num);
}
LONG mps_procs(void)
{
int num_entries, num, i, type, id;
struct mpconfig *config;
union mpcentry *entry;
config = (struct mpconfig *) vendor_table;
entry = config->entry;
num_entries = config->hdr.num_entry;
local_apic_address = (LONG) config->hdr.loc_apic_adr;
// map local apic address into page tables
map_address(local_apic_address >> 12, local_apic_address >> 12);
map_address(((LONG)local_apic_address + 4096) >> 12,
((LONG)local_apic_address + 4096) >> 12);
num = 0;
for (i=0; i < num_entries; i++)
{
type = entry->bytes[0];
if (type != MP_ET_PROC)
break;
if (entry->p.cpu_flags & 1)
{
proc_id[num] = (struct mpe_proc *) entry;
num++;
}
entry = (union mpcentry *)((LONG)entry + (LONG)mps_size[type]);
}
#if MPS_VERBOSE
for (i=0; i < num; i++)
printf("processor: %02X apic: %02X addr: 0x%08X (%s)\n", i,
proc_id[i]->apic_id,
local_apic_address,
((proc_id[i]->apic_vers >> 4) & 0x01) ? "Embedded" : "82489DX");
#endif
return num;
}
void mps_buses(void)
{
int num_entries, i, r, type;
struct mpconfig *config;
union mpcentry *entry;
config = (struct mpconfig *) vendor_table;
entry = config->entry;
num_entries = config->hdr.num_entry;
num_buses = 0;
for (i=0; i < num_entries; i++)
{
type = entry->bytes[0];
if (type == MP_ET_BUS)
{
for (r=0; r < 18; r++)
{
if (!strncmp(entry->b.name, bus_strings[r], 6))
{
num_buses++;
bus_info[(entry->b.bus_id & 0xF)].bus_type = r;
bus_info[(entry->b.bus_id & 0xF)].bus_id = entry->b.bus_id;
#if MPS_VERBOSE
printf("bus (%s) bus id: %02X\n", bus_display_strings[r], entry->b.bus_id);
#endif
}
}
}
entry = (union mpcentry *)((LONG)entry + (LONG)mps_size[type]);
}
}
LONG MPSDetect(void)
{
if (!mps_locate())
return 0;
mps_buses();
num_procs = mps_procs();
num_ioapics = mps_ioapics();
mps_ints();
RegisterEventNotification(EVENT_ENTER_REAL_MODE,
(void (*)(LONG))disable_ioapic_ints);
RegisterEventNotification(EVENT_RETURN_REAL_MODE,
(void (*)(LONG))enable_ioapic_ints);
return 1;
}
void apic_eoi(LONG intr)
{
LONG *local, val;
if (intr == 2)
intr = 9;
if (pic_assign[intr] || (intr < 16 && int_table[intr].line == (LONG) -1))
{
pic_eoi(intr);
}
else
{
val = 0;
local = (LONG *) local_apic_address;
local[APIC_EOI] = val;
}
return;
}
//
// returns 0 - command sent to other processor successfully
// -3 - command register busy
// -4 - type command error
//
//
// type is one of the following:
// 0 - 32 bit APIC command
// 1 - 64 bit logical destination command (proc = processor_number)
// 2 - 64 bit physical destination command (proc = apic_id)
LONG apic_xcall(LONG proc, LONG command, LONG type)
{
LONG *local;
LONG val, i;
register LONG flags;
flags = get_flags();
local = (LONG *) local_apic_address;
val = APIC_VALUE_PENDING;
switch (type)
{
case 0:
// if a command was still pending, then wait until clear
for (i=0; i < 0xFFFFF && local[APIC_ICMD] & val; i++) {};
if (i >= 0xFFFFF)
return (LONG) -3;
local[APIC_ICMD] = command; // send the command
break;
case 1:
// if a command was still pending, then wait until clear
for (i=0; i < 0xFFFFF && local[APIC_ICMD] & val; i++) {};
if (i >= 0xFFFFF)
return (LONG) -3;
local[APIC_ICMD2] = proc_id[proc]->apic_id << APIC_ID_SHIFT;
local[APIC_ICMD] = command;
break;
case 2:
// if a command was still pending, then wait until clear
for (i=0; i < 0xFFFFF && local[APIC_ICMD] & val; i++) {};
if (i >= 0xFFFFF)
return (LONG) -3;
local[APIC_ICMD2] = proc << APIC_ID_SHIFT;
local[APIC_ICMD] = command;
break;
default:
return -4;
}
set_flags(flags);
return 0;
}
LONG apic_directed_nmi(LONG proc)
{
register LONG retCode = 0;
retCode |= apic_xcall(proc, 0x0000C400, 1);
retCode |= apic_xcall(proc, 0x00008400, 1);
return retCode;
}
LONG apic_init(LONG proc)
{
LONG *local, *ioapic, val, vw;
register int i, r;
register LONG flags;
flags = get_flags();
local = (LONG *) local_apic_address;
if (!proc) // processor (0) is special case for system init
{
// if the apic is masked for LINT0, then
// virtual wire mode 'A' should be programmed,
// else assume virtual wire mode 'B', and leave LINT0 and LINT1
// alone. these entries are masked on processors other than (0)
spin_lock(&ioapic_locks[0]);
ioapic = (LONG *) io_apic_addr[0]->io_apic_adr;
ioapic[APIC_IO_REG] = APIC_REG_RDT + 2 * 0; // select INTIN_0 reg
val = ioapic[APIC_IO_DATA]; // get intr 0 vector info on ioapic 0
spin_unlock(&ioapic_locks[0]);
if ((val & APIC_VALUE_MASK) == APIC_VALUE_MASK)
{
// program virtual wire mode 'A'
local[APIC_SPUR] = LU_UNIT_ENABLED;
val = local[APIC_LVT_I0];
val &= 0xFFFE58FF;
val |= 0x700;
local[APIC_LVT_I0] = val; // program LINT0 to service ExtINT (vw 'A')
val = local[APIC_LVT_I1];
val &= 0xFFFE58FF;
val |= 0xFFFF0400;
val &= ~APIC_VALUE_MASK;
local[APIC_LVT_I1] = val; // program LINT1 to service NMI requests
vw = 0;
printf("apic virtual wire mode '%c' selected\n", vw ? 'B' : 'A');
}
else
{
// virtual wire mode 'B' pre-programmed
vw = 1;
printf("apic virtual wire mode %c selected\n", vw ? 'B' : 'A');
}
// If IMCR is present, then switch to virtual wire mode 'A' or 'B'
// IMCR should be set after and not before APIC config.
if (pcmp_fib2 & 0x80)
{
outb(IMCR_ADDR, 0x70);
outb(IMCR_DATA, 0x01);
}
local[APIC_TASKPRI] = 0xFF; // raise to highest priority (disable all interrupts)
local[APIC_LVT_TIMER] |= APIC_VALUE_MASK; // mask the timer
local[APIC_DESTFMT] = LU_DEST_FORMAT_FLAT; // select logical destination mode
local[APIC_LDEST] = 1 << APIC_ID_SHIFT; // set processor (0) to entry (0)
local[APIC_SPUR] = LU_UNIT_ENABLED | 0x4F; // program the spurious vector
apic_xcall(proc, DELIVER_INIT_REASSERT_ALL, 0); // tell the apic to re-sync
for (r=0; r < num_ioapics; r++)
{
spin_lock(&ioapic_locks[r]);
ioapic = (LONG *) io_apic_addr[r]->io_apic_adr;
for (i=0; i < io_apic_nlines[r] + 1; i++)
{
if (!r && i < 2)
continue;
ioapic[APIC_IO_REG] = APIC_REG_RDT + (2 * i);
val = APIC_VALUE_MASK;
ioapic[APIC_IO_DATA] |= val;
}
spin_unlock(&ioapic_locks[r]);
}
// clear any previous processor val values from the APIC_VALUE_LOPRI
// ioapic registers
for (r=0; r < num_ioapics; r++)
{
spin_lock(&ioapic_locks[r]);
ioapic = (LONG *) io_apic_addr[r]->io_apic_adr;
for (i=0; i < io_apic_nlines[r] + 1; i++)
{
if (!r && i < 2)
continue;
ioapic[APIC_IO_REG] = APIC_REG_RDT2 + (2 * i);
ioapic[APIC_IO_DATA] = 0;
}
spin_unlock(&ioapic_locks[r]);
}
// EOI any pending interrupts left over from previous
// shutdown/startup sequences
apic_eoi(0);
// reset ioapics configured for APIC_VALUE_LOPRI to current processor mask
processor_mask |= 1 << (proc + APIC_ID_SHIFT);
for (r=0; r < num_ioapics; r++)
{
spin_lock(&ioapic_locks[r]);
ioapic = (LONG *) io_apic_addr[r]->io_apic_adr;
for (i=0; i < io_apic_nlines[r] + 1; i++)
{
if (!r && i < 2)
continue;
ioapic[APIC_IO_REG] = APIC_REG_RDT2 + (2 * i);
ioapic[APIC_IO_DATA] |= processor_mask;
}
spin_unlock(&ioapic_locks[r]);
}
local[APIC_TASKPRI] = 0; // set lowest priority (enable all interrupts)
set_flags(flags);
return 0;
}
// if we got here, we are assumed on a processor other than (0)
local[APIC_LVT_TIMER] |= (LONG) APIC_VALUE_MASK; // mask the timer
local[APIC_LVT_I0] |= (LONG) APIC_VALUE_MASK; // mask LINT0
local[APIC_LVT_I1] |= (LONG) APIC_VALUE_MASK; // mask LINT1
local[APIC_DESTFMT] = (LONG) LU_DEST_FORMAT_FLAT; // select logical destination mode
local[APIC_LDEST] = 1 << (proc + APIC_ID_SHIFT); // set processor entry to (P# << p + APIC_ID_SHIFT)
local[APIC_SPUR] = LU_UNIT_ENABLED | 0x4F; // set the spurious vector
apic_xcall(proc, DELIVER_INIT_REASSERT_ALL, 0); // re-sync with other apics
// reset ioapics configured for APIC_VALUE_LOPRI to current processor mask
processor_mask |= 1 << (proc + APIC_ID_SHIFT);
for (r=0; r < num_ioapics; r++)
{
spin_lock(&ioapic_locks[r]);
ioapic = (LONG *) io_apic_addr[r]->io_apic_adr;
for (i=0; i < io_apic_nlines[r] + 1; i++)
{
if (!r && i < 2)
continue;
ioapic[APIC_IO_REG] = APIC_REG_RDT2 + (2 * i);
ioapic[APIC_IO_DATA] |= processor_mask;
}
spin_unlock(&ioapic_locks[r]);
}
local[APIC_TASKPRI] = 0; // set lowest priority (enable all interrupts)
set_flags(flags);
return 0;
}
void apic_close(LONG proc)
{
register int i, r;
LONG val;
LONG *local, *ioapic;
register LONG flags;
flags = get_flags();
local = (LONG *) local_apic_address;
local[APIC_TASKPRI] = 0xFF; // disable all interrupts
local[APIC_ICOUNT] = 0; // clear timer count register
val = APIC_VALUE_MASK; // mask the local timer
local[APIC_LVT_TIMER] |= val;
processor_mask &= ~(1 << (proc + APIC_ID_SHIFT)); // remove the processor
for (r = 0; r < num_ioapics; r++)
{
spin_lock(&ioapic_locks[r]);
ioapic = (LONG *) io_apic_addr[r]->io_apic_adr;
for (i=0; i < io_apic_nlines[r] + 1; i++)
{
if (!r && i < 2)
continue;
ioapic[APIC_IO_REG] = APIC_REG_RDT + 2 * i;
if (ioapic[APIC_IO_DATA] & DELIVER_LOW_PRIORITY)
{
ioapic[APIC_IO_REG] = APIC_REG_RDT2 + 2 * i;
val = ~processor_mask;
ioapic[APIC_IO_DATA] &= val;
}
// if we are on processor (0) then mask as well
if (!proc)
{
ioapic[APIC_IO_REG] = APIC_REG_RDT + (2 * i);
val = APIC_VALUE_MASK;
ioapic[APIC_IO_DATA] |= val;
}
}
spin_unlock(&ioapic_locks[r]);
}
// lower the task prority (enable interrupts)
local[APIC_TASKPRI] = 0;
// allow pending interrupts to occur
sti();
for (i=0; i < 1000; i++) i=i;
// raise the task priority to highest level (disable interrupts)
cli();
local[APIC_TASKPRI] = 0xFF;
set_flags(flags);
return;
}
void write_boot_vector(LONG addr)
{
register int i;
BYTE *dest, *src;
struct rm_addr
{
WORD offset;
WORD segment;
} rm;
register LONG flags;
flags = get_flags();
src = (BYTE *) CPQ_RESET_VECT; // save warm boot vector
dest = (BYTE *) &warm_reset_vector;
for (i=0; i < 4; i++)
*dest++ = *src++;
rm.offset = addr & 0xF;
rm.segment = (addr >> 4) & 0xFFFF;
dest = (BYTE *) CPQ_RESET_VECT; // warm boot vector address
src = (BYTE *) &rm;
for (i=0; i < 4; i++)
*dest++ = *src++;
// setup warm boot vector in CMOS
outb(CMOS_ADDR, CMOSCTL);
outb(CMOS_DATA, CMOSWARM);
set_flags(flags);
return;
}
void apic_timer_start(void)
{
LONG *local;
LONG vector;
register LONG flags;
flags = get_flags();
local = (LONG *) local_apic_address;
local[APIC_TASKPRI] = 0;
local[APIC_ICOUNT] = (APIC_CLKNUM * 8 * 2); // 50 ms intervals
vector = TIMER_VECTOR;
vector &= 0xFFFF;
vector |= 0x60000;
vector &= ~APIC_VALUE_MASK;
local[APIC_LVT_TIMER] = vector;
set_flags(flags);
return;
}
void apic_timer_stop(void)
{
LONG *local;
LONG val;
register LONG flags;
flags = get_flags();
local = (LONG *) local_apic_address;
local[APIC_ICOUNT] = 0; // zero count register
val = APIC_VALUE_MASK;
local[APIC_LVT_TIMER] |= val; // mask timer
set_flags(flags);
return;
}
void disable_ioapic_ints(void)
{
register int i, r;
LONG val;
LONG *ioapic;
LONG *local;
register LONG flags;
flags = get_flags();
local = (LONG *) local_apic_address;
local[APIC_TASKPRI] = 0xFF;
for (r=0; r < num_ioapics; r++)
{
spin_lock(&ioapic_locks[r]);
ioapic = (LONG *) io_apic_addr[r]->io_apic_adr;
for (i=0; i < io_apic_nlines[r]+1; i++)
{
if (!r && i < 2)
continue;
ioapic[APIC_IO_REG] = APIC_REG_RDT + (2 * i);
val = APIC_VALUE_MASK;
ioapic[APIC_IO_DATA] |= val;
}
spin_unlock(&ioapic_locks[r]);
}
set_flags(flags);
return;
}
void enable_ioapic_ints(void)
{
register int i, r;
LONG val;
LONG *ioapic;
LONG *local;
register LONG flags;
flags = get_flags();
for (r=0; r < num_ioapics; r++)
{
spin_lock(&ioapic_locks[r]);
ioapic = (LONG *) io_apic_addr[r]->io_apic_adr;
for (i=0; i < io_apic_nlines[r]+1; i++)
{
if (!r && i < 2)
continue;
if (io_apic_state[r].line_state[i])
{
ioapic[APIC_IO_REG] = APIC_REG_RDT + (2 * i);
val = ioapic[APIC_IO_DATA];
val &= ~APIC_VALUE_MASK;
ioapic[APIC_IO_DATA] = val;
}
}
spin_unlock(&ioapic_locks[r]);
}
local = (LONG *) local_apic_address;
local[APIC_TASKPRI] = 0;
set_flags(flags);
return;
}
LONG apic_activate_processor(LONG proc, LONG addr)
{
LONG i;
// if the processor does not exist, then return
if (!proc_id[proc])
return 0;
// delay loops are required to prevent bus hang
for (i=0; i < 0xFFFFF; i++) { i = i; };
// note: addr must be on a page boundry for startup IPI. this is
// not a requirement for assert/deassert calls for 82489DX devices
write_boot_vector(addr);
// if the APIC is 82489DX, then use ASSERT/DEASSERT
// otherwise, use the startup IPI command
if (!((proc_id[proc]->apic_vers) & 0xF0))
{
apic_xcall(proc_id[proc]->apic_id, APIC_VALUE_RESET | APIC_VALUE_LEVEL |
APIC_VALUE_ASSERT, 2);
for (i=0; i < 0xFFFFF; i++)
{ i = i; };
apic_xcall(proc_id[proc]->apic_id, APIC_VALUE_RESET | APIC_VALUE_LEVEL |
APIC_VALUE_DEASSERT, 2);
for (i=0; i < 0xFFFFF; i++)
{ i = i; };
}
else
{
apic_xcall(proc_id[proc]->apic_id, APIC_VALUE_RESET | APIC_VALUE_LEVEL |
APIC_VALUE_ASSERT, 2);
for (i=0; i < 0xFFFFF; i++)
{ i = i; };
apic_xcall(proc_id[proc]->apic_id, APIC_VALUE_RESET | APIC_VALUE_LEVEL |
APIC_VALUE_DEASSERT, 2);
for (i=0; i < 0xFFFFF; i++)
{ i = i; };
apic_xcall(proc_id[proc]->apic_id, APIC_VALUE_STARTUP | APIC_VALUE_EDGE
| ((addr >> 12) & 0xFF), 2);
for (i=0; i < 0xFFFFF; i++)
{ i = i; };
apic_xcall(proc_id[proc]->apic_id, APIC_VALUE_STARTUP | APIC_VALUE_EDGE
| ((addr >> 12) & 0xFF), 2);
for (i=0; i < 0xFFFFF; i++)
{ i = i; };
}
return 0;
}
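A minimal usage sketch, assuming a hypothetical trampoline address;
TRAMPOLINE_ADDR and the processor index are illustrative only, not part
of the posted code:
// The startup code must be a page-aligned real-mode entry point below
// 1MB, since the startup IPI encodes it as ((addr >> 12) & 0xFF).
#define TRAMPOLINE_ADDR 0x8000
void start_secondary_processor(void)
{
	apic_activate_processor(1, TRAMPOLINE_ADDR);
}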
void dump_int_table(SCREEN *screen)
{
register int i;
SetPauseMode(screen, screen->nLines - 3);
printfScreenWithAttribute(screen, BRITEWHITE, "mps interrupt table at
0x%08X\n", &int_table[0]);
for (i=0; i < 64; i++)
{
printfScreenWithAttribute(screen, LTCYAN,
"(%02i): dev: %08X bus: %02X ioapic: %02X vect: %02X line: %02X use:
%02X\n",
i,
int_table[i].dev,
int_table[i].bus,
int_table[i].io_apicid,
vector_table[i],
int_table[i].line,
int_table[i].use);
}
ClearPauseMode(screen);
}
void dump_ioapic(SCREEN *screen, LONG num)
{
LONG *p;
LONG i, val;
p = (LONG *) io_apic_addr[num]->io_apic_adr;
printfScreenWithAttribute(screen, BRITEWHITE, "io_apic registers
[0x%08X]\n", p);
for (i = 0; i <= 0x2F; i++)
{
if ((i & 3) == 0)
printfScreenWithAttribute(screen, LTCYAN, "%08X: ", i);
*p = i;
val = p[4];
printfScreenWithAttribute(screen, LTCYAN, "%08X ", val);
if ((i & 3) == 3)
printfScreenWithAttribute(screen, LTCYAN, "\n");
}
}
void dump_local_apic(SCREEN *screen)
{
LONG *p;
LONG i, val;
p = (LONG *) local_apic_address;
printfScreenWithAttribute(screen, BRITEWHITE, "local apic registers
[0x%08X]\n", p);
printfScreenWithAttribute(screen, LTCYAN, "apic_id : %08X\n", p[APIC_ID]);
printfScreenWithAttribute(screen, LTCYAN, "apic_vers : %08X\n",
p[APIC_VERS]);
printfScreenWithAttribute(screen, LTCYAN, "apic_taskpri : %08X\n",
p[APIC_TASKPRI]);
printfScreenWithAttribute(screen, LTCYAN, "apic_ldest : %08X\n",
p[APIC_LDEST]);
printfScreenWithAttribute(screen, LTCYAN, "apic_destfmt : %08X\n",
p[APIC_DESTFMT]);
printfScreenWithAttribute(screen, LTCYAN, "apic_spur : %08X\n",
p[APIC_SPUR]);
printfScreenWithAttribute(screen, LTCYAN, "apic_irr0 : %08X\n",
p[APIC_IRR0]);
printfScreenWithAttribute(screen, LTCYAN, "apic_icmd : %08X\n",
p[APIC_ICMD]);
printfScreenWithAttribute(screen, LTCYAN, "apic_icmd2 : %08X\n",
p[APIC_ICMD2]);
printfScreenWithAttribute(screen, LTCYAN, "apic_ltimer : %08X\n",
p[APIC_LVT_TIMER]);
printfScreenWithAttribute(screen, LTCYAN, "apic_lvt_i0 : %08X\n",
p[APIC_LVT_I0]);
printfScreenWithAttribute(screen, LTCYAN, "apic_lvt_i1 : %08X\n",
p[APIC_LVT_I1]);
printfScreenWithAttribute(screen, LTCYAN, "apic_icount : %08X\n",
p[APIC_ICOUNT]);
printfScreenWithAttribute(screen, LTCYAN, "apic_ccount : %08X\n\n",
p[APIC_CCOUNT]);
for (i = 0; i <= 0x3F; i++)
{
if ((i & 3) == 0)
printfScreenWithAttribute(screen, LTCYAN, "%08X: ", i);
val = p[i * 4];
printfScreenWithAttribute(screen, LTCYAN, "%08X ", val);
if ((i & 3) == 3)
printfScreenWithAttribute(screen, LTCYAN, "\n");
}
}
void dump_remote_apic(SCREEN *screen, LONG proc)
{
LONG *p;
LONG i, val;
p = (LONG *) local_apic_address;
printfScreenWithAttribute(screen, BRITEWHITE, "remote apic registers
processor(%d) [0x%08X]\n", proc, p);
for (i=0; i <= 0x3F; i++)
{
if ((i & 3) == 0)
printfScreenWithAttribute(screen, LTCYAN, "%08X: ", i);
apic_xcall(proc, i | 0x00000B00, 1);
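// poll the ICR remote-read status until the read is no longer in progress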
while ((p[4 * 0x30] & ICR_RR_STATUS_MASK) == ICR_RR_IN_PROGRESS)
{ p = p; };
if ((p[4 * 0x30] & ICR_RR_STATUS_MASK) == ICR_RR_VALID)
val = p[0xc * 4];
else
val = 0xDEADBEEF;
printfScreenWithAttribute(screen, LTCYAN, "%08X ", val);
if ((i & 3) == 3)
printfScreenWithAttribute(screen, LTCYAN, "\n");
}
}
LONG get_apic_id(void)
{
register int i;
LONG *local, val;
local = (LONG *) local_apic_address;
val = local[APIC_LDEST];
val >>= APIC_ID_SHIFT;
for (i=0; i < num_procs; i++)
if ((val >> i) & 1)
return (i);
return -1;
}
void apic_mask_timer(void)
{
LONG *local, val;
register LONG flags;
flags = get_flags();
local = (LONG *) local_apic_address;
val = local[APIC_LVT_TIMER];
val |= APIC_VALUE_MASK;
local[APIC_LVT_TIMER] = val;
set_flags(flags);
return;
}
void apic_unmask_timer(void)
{
LONG *local, val;
register LONG flags;
flags = get_flags();
local = (LONG *) local_apic_address;
val = local[APIC_LVT_TIMER];
val &= ~APIC_VALUE_MASK;
local[APIC_LVT_TIMER] = val;
set_flags(flags);
return;
}
void configure_eisa_el(LONG intr, LONG mode)
{
unsigned int cascade, low_int, high_int;
if (intr == 0 || intr == 1 || intr == 2 || intr == 8 || intr == 13)
mode = ELCR_EDGE;
cascade = inb(PIC2_ELCR_PORT);
low_int = inb(PIC1_ELCR_PORT);
high_int = (cascade << 8) | low_int;
high_int &= ~(1 << intr);
high_int |= (mode << intr);
outb(PIC1_ELCR_PORT, (high_int & 0xFF));
outb(PIC2_ELCR_PORT, ((high_int >> 8) & 0xFF));
elcr_flags |= high_int;
return;
}
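For illustration, one hedged example call; the IRQ number is arbitrary,
and note the routine forces IRQs 0, 1, 2, 8 and 13 back to edge
regardless of the mode argument:
configure_eisa_el(9, ELCR_LEVEL); // let ISA IRQ 9 be level-triggered/shareable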
//
// 0 - use PIC on P0
// 1 - bind to processor
// 2 - set to APIC_VALUE_LOPRI
LONG apic_set_int(LONG intr, LONG proc, LONG mode, LONG share)
{
LONG *ioapic;
LONG val, mval = 0, vector;
LONG line, lnum;
WORD elcr_reg;
register LONG flags;
if (intr == 2)
intr = 9;
if (share)
{
mval = APIC_VALUE_LEVEL;
configure_eisa_el(intr, ELCR_LEVEL);
}
else
configure_eisa_el(intr, ELCR_EDGE);
if (intr > 15)
mval = APIC_VALUE_LEVEL;
ioapic = (LONG *) io_apic_ids[int_table[intr].io_apicid].address;
lnum = io_apic_ids[int_table[intr].io_apicid].lnum;
line = int_table[intr].line;
if (line == (LONG) -1) // if no redir entry, return error
return -1;
flags = get_flags();
switch (mode)
{
case 0:
if (intr <= 15)
unmask_pic(intr);
pic_assign[intr] = 1;
break;
case 1:
vector = vector_table[intr];
spin_lock(&ioapic_locks[lnum]);
io_apic_state[lnum].line_state[line] = TRUE;
ioapic[APIC_IO_REG] = APIC_REG_RDT2 + 2 * line;
ioapic[APIC_IO_DATA] = 1 << (proc + APIC_ID_SHIFT);
ioapic[APIC_IO_REG] = APIC_REG_RDT + 2 * line;
ioapic[APIC_IO_DATA] = (vector | APIC_VALUE_LDEST | APIC_VALUE_FIXED |
mval);
spin_unlock(&ioapic_locks[lnum]);
if (intr <= 15)
{
if (elcr_flags & (1 << intr))
{
elcr_reg = inw(EISA_POLARITY_REG);
elcr_reg ^= (1 << line);
outw(EISA_POLARITY_REG, elcr_reg);
}
}
mask_pic(intr);
break;
case 2:
vector = vector_table[intr];
spin_lock(&ioapic_locks[lnum]);
io_apic_state[lnum].line_state[line] = TRUE;
ioapic[APIC_IO_REG] = APIC_REG_RDT2 + 2 * line;
ioapic[APIC_IO_DATA] = processor_mask;
ioapic[APIC_IO_REG] = APIC_REG_RDT + 2 * line;
ioapic[APIC_IO_DATA] = (vector | APIC_VALUE_LDEST | APIC_VALUE_LOPRI |
mval);
spin_unlock(&ioapic_locks[lnum]);
if (intr <= 15)
{
if (elcr_flags & (1 << intr))
{
elcr_reg = inw(EISA_POLARITY_REG);
elcr_reg ^= (1 << line);
outw(EISA_POLARITY_REG, elcr_reg);
}
}
mask_pic(intr);
break;
default:
spin_lock(&ioapic_locks[lnum]);
io_apic_state[lnum].line_state[line] = 0;
ioapic[APIC_IO_REG] = APIC_REG_RDT2 + 2 * line;
ioapic[APIC_IO_DATA] = APIC_VALUE_MASK;
spin_unlock(&ioapic_locks[lnum]);
if (intr <= 15)
{
if (elcr_flags & (1 << intr))
{
elcr_reg = inw(EISA_POLARITY_REG);
elcr_reg ^= (1 << line);
outw(EISA_POLARITY_REG, elcr_reg);
}
}
mask_pic(intr);
break;
}
set_flags(flags);
return 0;
}
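Hedged usage sketches for the three modes listed above; the IRQ and
processor numbers are arbitrary examples:
apic_set_int(14, 0, 0, 0); // mode 0: leave IRQ 14 on the PIC on P0
apic_set_int(14, 2, 1, 0); // mode 1: bind IRQ 14 to processor 2
apic_set_int(10, 0, 2, 1); // mode 2: lowest-priority delivery to the
                           // processor mask (proc is ignored), shared/level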
LONG apic_clear_int(LONG intr)
{
LONG *ioapic, line, lnum;
register LONG flags;
flags = get_flags();
if (intr <= 15)
mask_pic(intr);
pic_assign[intr] = 0;
ioapic = (LONG *) io_apic_ids[int_table[intr].io_apicid].address;
lnum = io_apic_ids[int_table[intr].io_apicid].lnum;
line = int_table[intr].line;
spin_lock(&ioapic_locks[lnum]);
ioapic[APIC_IO_REG] = APIC_REG_RDT + 2 * line;
ioapic[APIC_IO_DATA] = APIC_VALUE_MASK;
spin_unlock(&ioapic_locks[lnum]);
set_flags(flags);
return 0;
}
LONG apic_mask_int(LONG intr)
{
LONG *ioapic, line, val, lnum;
register LONG flags;
flags = get_flags();
if (intr <= 15)
mask_pic(intr);
ioapic = (LONG *) io_apic_ids[int_table[intr].io_apicid].address;
lnum = io_apic_ids[int_table[intr].io_apicid].lnum;
line = int_table[intr].line;
spin_lock(&ioapic_locks[lnum]);
ioapic[APIC_IO_REG] = APIC_REG_RDT + 2 * line;
val = ioapic[APIC_IO_DATA];
val |= APIC_VALUE_MASK;
ioapic[APIC_IO_DATA] = val;
spin_unlock(&ioapic_locks[lnum]);
set_flags(flags);
return 0;
}
LONG apic_unmask_int(LONG intr)
{
LONG *ioapic, line, val, lnum;
register LONG flags;
flags = get_flags();
if (intr <= 15)
mask_pic(intr);
ioapic = (LONG *) io_apic_ids[int_table[intr].io_apicid].address;
lnum = io_apic_ids[int_table[intr].io_apicid].lnum;
line = int_table[intr].line;
spin_lock(&ioapic_locks[lnum]);
ioapic[APIC_IO_REG] = APIC_REG_RDT + 2 * line;
val = ioapic[APIC_IO_DATA];
val &= ~APIC_VALUE_MASK;
ioapic[APIC_IO_DATA] = val;
spin_unlock(&ioapic_locks[lnum]);
set_flags(flags);
return 0;
}
void program_8254(void)
{
// program 8254 Timer Chip to 1/18 second interval
outb(0x43, 0x24);
outb(0x40, 0);
outb(0x40, 0);
return;
}
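For reference, the 1/18-second figure falls out of the usual PIT
arithmetic; a standalone sketch, with the constants being the standard
PC values rather than anything from the code above:
#include <stdio.h>
int main(void)
{
	double pit_hz  = 1193182.0; // standard PC PIT input clock
	double divisor = 65536.0;   // a count of 0 selects the maximum divisor
	printf("tick rate = %.2f Hz\n", pit_hz / divisor); // ~18.2 Hz
	return 0;
}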
Andrew, This is a bug fix and must go in before 2.6.21.
Acked-by: Suresh Siddha <[email protected]>
On Fri, Feb 23, 2007 at 04:46:20AM -0700, Eric W. Biederman wrote:
>
> move_native_irqs tries to do the right thing when migrating irqs
> by disabling them. However, disabling them is a software logical
> thing, not a hardware thing. This has always been a little flaky
> and after Ingo's latest round of changes it is guaranteed to not
> mask the apic.
>
> So this patch fixes move_native_irq to directly call the mask and
> unmask chip methods to guarantee that we mask the irq when we
> are migrating it. We must do this as it is required by
> all code that calls into this path.
>
> Since we don't know the masked status when IRQ_DISABLED is
> set, we will not be able to restore it. The patch makes the code
> just give up and try again the next time this routine is called.
>
> Signed-off-by: Eric W. Biederman <[email protected]>
> ---
> kernel/irq/migration.c | 9 ++++-----
> 1 files changed, 4 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
> index 4baa3bb..77b7acc 100644
> --- a/kernel/irq/migration.c
> +++ b/kernel/irq/migration.c
> @@ -65,12 +65,11 @@ void move_native_irq(int irq)
> if (likely(!(desc->status & IRQ_MOVE_PENDING)))
> return;
>
> - if (likely(!(desc->status & IRQ_DISABLED)))
> - desc->chip->disable(irq);
> + if (unlikely(desc->status & IRQ_DISABLED))
> + return;
>
> + desc->chip->mask(irq);
> move_masked_irq(irq);
> -
> - if (likely(!(desc->status & IRQ_DISABLED)))
> - desc->chip->enable(irq);
> + desc->chip->unmask(irq);
> }
>
> --
> 1.5.0.g53756
"Jeff V. Merkey" <[email protected]> writes:
> Eric,
>
> Please find attached the APIC code I used in Gadugi. It's code for plain vanilla
> APICs, but does just this. This code not only allows
> interrupts to be migrated, but processors to be stopped and restarted on the fly
> without system interruption. You may find some useful
> ideas in it.
Just for clarification: who owns the code you posted? What is the
copyright license on it? Or do you own the copyright and place it
into the public domain?
Eric
Eric W. Biederman wrote:
>"Jeff V. Merkey" <[email protected]> writes:
>
>>Eric,
>>
>>Please find attached the APIC code I used in Gadugi. It's code for plain vanilla
>>APICs, but does just this. This code not only allows
>>interrupts to be migrated, but processors to be stopped and restarted on the fly
>>without system interruption. You may find some useful
>>ideas in it.
>>
>>
>
>Just for clarification: who owns the code you posted? What is the
>copyright license on it? Or do you own the copyright and place it
>into the public domain?
>
>Eric
>
That particular code module was released under the GPL in 1999.
Jeff
Hi!
> For the ISA irqs we reserve 16 vectors. This patch adds constants for
> those vectors and modifies the code to use them. Making the code a
> little clearer and making it possible to move these vectors in the future.
> /*
> * Vectors 0x20-0x2f are used for ISA interrupts.
> */
> +#define IRQ0_VECTOR FIRST_EXTERNAL_VECTOR
> +#define IRQ1_VECTOR IRQ0_VECTOR + 1
Maybe IRQ_VECTOR(1) would be similarly readable, and would avoid
> +#define IRQ2_VECTOR IRQ0_VECTOR + 2
> +#define IRQ3_VECTOR IRQ0_VECTOR + 3
> +#define IRQ4_VECTOR IRQ0_VECTOR + 4
> +#define IRQ5_VECTOR IRQ0_VECTOR + 5
> +#define IRQ6_VECTOR IRQ0_VECTOR + 6
> +#define IRQ7_VECTOR IRQ0_VECTOR + 7
> +#define IRQ8_VECTOR IRQ0_VECTOR + 8
> +#define IRQ9_VECTOR IRQ0_VECTOR + 9
> +#define IRQ10_VECTOR IRQ0_VECTOR + 10
> +#define IRQ11_VECTOR IRQ0_VECTOR + 11
> +#define IRQ12_VECTOR IRQ0_VECTOR + 12
> +#define IRQ13_VECTOR IRQ0_VECTOR + 13
> +#define IRQ14_VECTOR IRQ0_VECTOR + 14
> +#define IRQ15_VECTOR IRQ0_VECTOR + 15
...these defines?
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
Pavel Machek <[email protected]> writes:
> Hi!
>
>> For the ISA irqs we reserve 16 vectors. This patch adds constants for
>> those vectors and modifies the code to use them. Making the code a
>> little clearer and making it possible to move these vectors in the future.
>
>
>> /*
>> * Vectors 0x20-0x2f are used for ISA interrupts.
>> */
>> +#define IRQ0_VECTOR FIRST_EXTERNAL_VECTOR
>> +#define IRQ1_VECTOR IRQ0_VECTOR + 1
>
> Maybe IRQ_VECTOR(1) would be similarly readable, and would avoid
>
>> +#define IRQ2_VECTOR IRQ0_VECTOR + 2
>> +#define IRQ3_VECTOR IRQ0_VECTOR + 3
>> +#define IRQ4_VECTOR IRQ0_VECTOR + 4
>> +#define IRQ5_VECTOR IRQ0_VECTOR + 5
>> +#define IRQ6_VECTOR IRQ0_VECTOR + 6
>> +#define IRQ7_VECTOR IRQ0_VECTOR + 7
>> +#define IRQ8_VECTOR IRQ0_VECTOR + 8
>> +#define IRQ9_VECTOR IRQ0_VECTOR + 9
>> +#define IRQ10_VECTOR IRQ0_VECTOR + 10
>> +#define IRQ11_VECTOR IRQ0_VECTOR + 11
>> +#define IRQ12_VECTOR IRQ0_VECTOR + 12
>> +#define IRQ13_VECTOR IRQ0_VECTOR + 13
>> +#define IRQ14_VECTOR IRQ0_VECTOR + 14
>> +#define IRQ15_VECTOR IRQ0_VECTOR + 15
>
> ...these defines?
It would, and mostly I like it. However, of the 1500+ possible irqs
exactly 16 have a reserved vector. IRQ_VECTOR(N) does not convey
that; in fact, it conveys the opposite impression.
So I think the code is more maintainable with the distinct
definitions.
Eric
Eric W. Biederman wrote:
> * Vectors 0x20-0x2f are used for ISA interrupts.
> */
> -#define IRQ0_VECTOR FIRST_EXTERNAL_VECTOR
> +#define IRQ0_VECTOR FIRST_EXTERNAL_VECTOR + 0x10
> #define IRQ1_VECTOR IRQ0_VECTOR + 1
> #define IRQ2_VECTOR IRQ0_VECTOR + 2
> #define IRQ3_VECTOR IRQ0_VECTOR + 3
> @@ -82,7 +87,7 @@
>
I think we have a dependency in i8259.c that irq0 is mapped to vector 0x20.
--Mika
Mika Penttilä <[email protected]> writes:
> Eric W. Biederman wrote:
>> * Vectors 0x20-0x2f are used for ISA interrupts.
>> */
>> -#define IRQ0_VECTOR FIRST_EXTERNAL_VECTOR
>> +#define IRQ0_VECTOR FIRST_EXTERNAL_VECTOR + 0x10
>> #define IRQ1_VECTOR IRQ0_VECTOR + 1
>> #define IRQ2_VECTOR IRQ0_VECTOR + 2
>> #define IRQ3_VECTOR IRQ0_VECTOR + 3
>> @@ -82,7 +87,7 @@
>>
> I think we have a dependency in i8259.c that irq0 is mapped to vector 0x20.
We did, but I'm pretty certain I removed it when I introduced the
IRQ[0-15]_VECTOR defines.
One of my test machines seems to be delivering irq0 in ExtINT mode
with this patch applied (a bug of another flavor) so I don't think
there is a real issue here.
Eric
On Sun 2007-02-25 04:15:30, Eric W. Biederman wrote:
> Pavel Machek <[email protected]> writes:
>
> > Hi!
> >
> >> For the ISA irqs we reserve 16 vectors. This patch adds constants for
> >> those vectors and modifies the code to use them. Making the code a
> >> little clearer and making it possible to move these vectors in the future.
> >
> >
> >> /*
> >> * Vectors 0x20-0x2f are used for ISA interrupts.
> >> */
> >> +#define IRQ0_VECTOR FIRST_EXTERNAL_VECTOR
> >> +#define IRQ1_VECTOR IRQ0_VECTOR + 1
> >
> > Maybe IRQ_VECTOR(1) would be similarly readable, and would avoid
> >
> >> +#define IRQ2_VECTOR IRQ0_VECTOR + 2
> >> +#define IRQ3_VECTOR IRQ0_VECTOR + 3
> >> +#define IRQ4_VECTOR IRQ0_VECTOR + 4
> >> +#define IRQ5_VECTOR IRQ0_VECTOR + 5
> >> +#define IRQ6_VECTOR IRQ0_VECTOR + 6
> >> +#define IRQ7_VECTOR IRQ0_VECTOR + 7
> >> +#define IRQ8_VECTOR IRQ0_VECTOR + 8
> >> +#define IRQ9_VECTOR IRQ0_VECTOR + 9
> >> +#define IRQ10_VECTOR IRQ0_VECTOR + 10
> >> +#define IRQ11_VECTOR IRQ0_VECTOR + 11
> >> +#define IRQ12_VECTOR IRQ0_VECTOR + 12
> >> +#define IRQ13_VECTOR IRQ0_VECTOR + 13
> >> +#define IRQ14_VECTOR IRQ0_VECTOR + 14
> >> +#define IRQ15_VECTOR IRQ0_VECTOR + 15
> >
> > ...these defines?
>
> It would, and mostly I like it. However, of the 1500+ possible irqs
> exactly 16 have a reserved vector. IRQ_VECTOR(N) does not convey
> that; in fact, it conveys the opposite impression.
>
> So I think the code is more maintainable with the distinct
> definitions.
BUILD_BUG_ON(x > 15) ? ;-).
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
Pavel Machek <[email protected]> writes:
> BUILD_BUG_ON(x > 15) ? ;-).
That might work. Can you use that in a constant expression?
Still, it is extra complexity for a handful of lines.
I would prefer to go with something simple at this point.
Eric
Hi!
> Pavel Machek <[email protected]> writes:
>
> > BUILD_BUG_ON(x > 15) ? ;-).
>
> That might work. Can you use that in a constant expression?
> Still, it is extra complexity for a handful of lines.
>
> I would prefer to go with something simple at this point.
I was thinking something like
#define IRQ(n) ({ BUILD_BUG_ON(n > 15); n; })
...but I did not really test it, so...
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
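For what it's worth, a self-contained sketch of Pavel's idea and of the
constant-expression limitation Eric asks about; BUILD_BUG_ON is spelled
out the way the kernel defines it, and the IRQ0_VECTOR value here is an
assumption for illustration:
#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))
#define IRQ0_VECTOR 0x20 /* assumed value, for illustration only */
/* The statement-expression form works inside a function body... */
#define IRQ_VECTOR(n) ({ BUILD_BUG_ON((n) > 15); IRQ0_VECTOR + (n); })
int lookup_vector(void)
{
	return IRQ_VECTOR(3); /* compiles; IRQ_VECTOR(16) would not */
}
/* ...but gcc rejects statement expressions outside a function, so the
 * macro cannot be used in a file-scope initializer or a case label:
 * static int v = IRQ_VECTOR(3);
 */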
> On Fri, 23 Feb 2007 18:06:55 -0800 "Siddha, Suresh B" <[email protected]> wrote:
> >
> > move_native_irqs tries to do the right thing when migrating irqs
> > by disabling them. However, disabling them is a software logical
> > thing, not a hardware thing. This has always been a little flaky
> > and after Ingo's latest round of changes it is guaranteed to not
> > mask the apic.
> >
> > So this patch fixes move_native_irq to directly call the mask and
> > unmask chip methods to guarantee that we mask the irq when we
> > are migrating it. We must do this as it is required by
> > all code that calls into this path.
> >
> > Since we don't know the masked status when IRQ_DISABLED is
> > set, we will not be able to restore it. The patch makes the code
> > just give up and try again the next time this routine is called.
> >
> > Signed-off-by: Eric W. Biederman <[email protected]>
> > ---
> > kernel/irq/migration.c | 9 ++++-----
> > 1 files changed, 4 insertions(+), 5 deletions(-)
> >
> > diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
> > index 4baa3bb..77b7acc 100644
> > --- a/kernel/irq/migration.c
> > +++ b/kernel/irq/migration.c
> > @@ -65,12 +65,11 @@ void move_native_irq(int irq)
> > if (likely(!(desc->status & IRQ_MOVE_PENDING)))
> > return;
> >
> > - if (likely(!(desc->status & IRQ_DISABLED)))
> > - desc->chip->disable(irq);
> > + if (unlikely(desc->status & IRQ_DISABLED))
> > + return;
> >
> > + desc->chip->mask(irq);
> > move_masked_irq(irq);
> > -
> > - if (likely(!(desc->status & IRQ_DISABLED)))
> > - desc->chip->enable(irq);
> > + desc->chip->unmask(irq);
> > }
>
arghg. Please don't top-post. Ever. At all. Don't even think about
thinking about it.
<edit, edit>
> Andrew, This is a bug fix and must go in before 2.6.21.
>
> Acked-by: Suresh Siddha <[email protected]>
OK. But the rest of the patch series was a bit of a trainwreck against
Andi's pending tree so I ducked it all.
If it's really needed for 2.6.21 then we have some work to do. Presumably
carefully reviewing and testing this new work, then repairing Andi's tree.
Andrew Morton <[email protected]> writes:
>>
>
> arghg. Please don't top-post. Ever. At all. Don't even think about
> thinking about it.
>
> <edit, edit>
>
>> Andrew, This is a bug fix and must go in before 2.6.21.
>>
>> Acked-by: Suresh Siddha <[email protected]>
>
> OK. But the rest of the patch series was a bit of a trainwreck against
> Andi's pending tree so I ducked it all.
>
> If it's really needed for 2.6.21 then we have some work to do. Presumably
> carefully reviewing and testing this new work, then repairing Andi's tree.
I guess I should take a look at Andi's tree and see what I have wrecked.
Linus has merged the changes.
For myself, the careful review and testing is why it took me so long to
generate the darn thing.
I will be happy to take all of the third-party review I can get, though.
Eric
-----Original Message-----
From: [email protected] [mailto:[email protected]]
Sent: Friday, February 23, 2007 3:33 AM
>+struct irq_cfg {
>+ cpumask_t domain;
>+ u8 vector;
>+};
>+
>+/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
>+struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
>+ [0] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 0 },
>+ [1] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 1 },
>+ [2] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 2 },
>+ [3] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 3 },
>+ [4] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 4 },
>+ [5] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 5 },
>+ [6] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 6 },
>+ [7] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 7 },
>+ [8] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 8 },
>+ [9] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 9 },
>+ [10] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 10 },
>+ [11] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 11 },
>+ [12] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 12 },
>+ [13] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 13 },
>+ [14] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 14 },
>+ [15] = { .domain = CPU_MASK_ALL, .vector = FIRST_EXTERNAL_VECTOR + 15 },
>+};
>+
> static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result);
Why not use
IRQ0_VECTOR... IRQ15_VECTOR here.
YH
"Lu, Yinghai" <[email protected]> writes:
>
> Why not use
>
> IRQ0_VECTOR... IRQ15_VECTOR here.
I do by the end of the patch series; it was a patch ordering issue.
Eric
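For reference, the shape that table takes once the series switches it
over to the defines; a sketch of the end state, not the actual patch
text:
static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR },
	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR },
	/* ... [2] through [14] follow the same pattern ... */
	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR },
};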