2012-05-18 10:26:51

by Alexander Gordeev

Subject: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

Currently x2APIC in logical destination mode delivers interrupts to a
single CPU, no matter how many CPUs were specified in the destination
cpumask.

This fix enables delivery of interrupts to multiple CPUs by bit-ORing
Logical IDs of destination CPUs that have matching Cluster ID.

Because only one cluster could be specified in a message destination
address, the destination cpumask is tried for a cluster that contains
maximum number of CPUs matching this cpumask. The CPUs in this cluster
are selected to receive the interrupts while all other CPUs (in the
cpumask) are ignored.
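
For example, if the destination cpumask covers four CPUs in one cluster and
two CPUs in another, the four CPUs of the first cluster become the
destination and the remaining two CPUs are left out.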

Signed-off-by: Alexander Gordeev <[email protected]>
---
arch/x86/include/asm/x2apic.h | 9 --
arch/x86/kernel/apic/x2apic_cluster.c | 140 +++++++++++++++++++++++++++++----
arch/x86/kernel/apic/x2apic_phys.c | 9 ++-
3 files changed, 131 insertions(+), 27 deletions(-)

diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
index 92e54ab..7a5a832 100644
--- a/arch/x86/include/asm/x2apic.h
+++ b/arch/x86/include/asm/x2apic.h
@@ -28,15 +28,6 @@ static int x2apic_apic_id_registered(void)
 	return 1;
 }
 
-/*
- * For now each logical cpu is in its own vector allocation domain.
- */
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	cpumask_clear(retmask);
-	cpumask_set_cpu(cpu, retmask);
-}
-
 static void
 __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
 {
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8f012b2..f8fa4c4 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -96,36 +96,142 @@ static void x2apic_send_IPI_all(int vector)
 	__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
 }
 
+static inline unsigned int
+__x2apic_cluster_to_apicid(int cpu_in_cluster, const struct cpumask *cpumask)
+{
+	unsigned int apicid = 0;
+	int cpu;
+
+	for_each_cpu_and(cpu, per_cpu(cpus_in_cluster, cpu_in_cluster), cpumask)
+		apicid |= per_cpu(x86_cpu_to_logical_apicid, cpu);
+
+	return apicid;
+}
+
+static int
+__x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid)
+{
+	int ret = 0;
+	int cpu, heaviest;
+	unsigned int weight, max_weight;
+	cpumask_var_t target_cpus, cluster_cpus;
+
+	if (unlikely(!alloc_cpumask_var(&target_cpus, GFP_ATOMIC))) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	if (unlikely(!alloc_cpumask_var(&cluster_cpus, GFP_ATOMIC))) {
+		ret = -ENOMEM;
+		goto out_free_target_cpus;
+	}
+
+	cpumask_and(target_cpus, cpumask, cpu_online_mask);
+	max_weight = 0;
+
+	for_each_cpu(cpu, target_cpus) {
+		cpumask_and(cluster_cpus, per_cpu(cpus_in_cluster, cpu), cpumask);
+
+		weight = cpumask_weight(cluster_cpus);
+		if (weight > max_weight) {
+			max_weight = weight;
+			heaviest = cpu;
+		}
+
+		cpumask_andnot(target_cpus, target_cpus, cluster_cpus);
+	}
+
+	if (!max_weight) {
+		ret = -EINVAL;
+		goto out_free_cluster_cpus;
+	}
+
+	*apicid = __x2apic_cluster_to_apicid(heaviest, cpumask);
+
+out_free_cluster_cpus:
+	free_cpumask_var(cluster_cpus);
+out_free_target_cpus:
+	free_cpumask_var(target_cpus);
+out:
+	return ret;
+}
+
 static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
-	/*
-	 * We're using fixed IRQ delivery, can only return one logical APIC ID.
-	 * May as well be the first.
-	 */
-	int cpu = cpumask_first(cpumask);
+	int err;
+	int cpu;
+	unsigned int apicid;
 
-	if ((unsigned)cpu < nr_cpu_ids)
-		return per_cpu(x86_cpu_to_logical_apicid, cpu);
-	else
-		return BAD_APICID;
+	err = __x2apic_cpu_mask_to_apicid(cpumask, &apicid);
+	WARN_ON(err);
+
+	if (!err)
+		return apicid;
+
+	if (err == -ENOMEM) {
+		for_each_cpu(cpu, cpumask) {
+			if (cpumask_test_cpu(cpu, cpu_online_mask))
+				break;
+		}
+		if (cpu < nr_cpu_ids)
+			return __x2apic_cluster_to_apicid(cpu, cpumask);
+	}
+
+	return BAD_APICID;
 }
 
 static unsigned int
 x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 			      const struct cpumask *andmask)
 {
-	int cpu;
+	int err;
+	int cpu, first_cpu;
+	unsigned int apicid;
+	cpumask_var_t target_cpus;
+
+	if (likely(alloc_cpumask_var(&target_cpus, GFP_ATOMIC))) {
+		cpumask_and(target_cpus, cpumask, andmask);
+
+		err = __x2apic_cpu_mask_to_apicid(target_cpus, &apicid);
+
+		free_cpumask_var(target_cpus);
+
+		if (!err)
+			return apicid;
+	} else {
+		err = -ENOMEM;
+	}
+
+	WARN_ON(err);
+
+	if (err != -ENOMEM)
+		return 0;
+
+	apicid = 0;
+	first_cpu = nr_cpu_ids;
 
-	/*
-	 * We're using fixed IRQ delivery, can only return one logical APIC ID.
-	 * May as well be the first.
-	 */
 	for_each_cpu_and(cpu, cpumask, andmask) {
-		if (cpumask_test_cpu(cpu, cpu_online_mask))
+		if (cpumask_test_cpu(cpu, cpu_online_mask)) {
+			first_cpu = cpu;
 			break;
+		}
+	}
+
+	if (first_cpu < nr_cpu_ids) {
+		for_each_cpu_and(cpu, per_cpu(cpus_in_cluster, first_cpu),
+				 cpumask) {
+			if (!cpumask_test_cpu(cpu, andmask))
+				continue;
+			apicid |= per_cpu(x86_cpu_to_logical_apicid, cpu);
+		}
 	}
 
-	return per_cpu(x86_cpu_to_logical_apicid, cpu);
+	return apicid;
+}
+
+static void
+x2apic_cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+	cpumask_copy(retmask, cpu_possible_mask);
 }

static void init_x2apic_ldr(void)
@@ -225,7 +331,7 @@ static struct apic apic_x2apic_cluster = {
 	.check_apicid_used		= NULL,
 	.check_apicid_present		= NULL,
 
-	.vector_allocation_domain	= x2apic_vector_allocation_domain,
+	.vector_allocation_domain	= x2apic_cluster_vector_allocation_domain,
 	.init_apic_ldr			= init_x2apic_ldr,
 
 	.ioapic_phys_id_map		= NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 991e315..f0ee4a4 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -108,6 +108,13 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 	return per_cpu(x86_cpu_to_apicid, cpu);
 }
 
+static void
+x2apic_phys_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+	cpumask_clear(retmask);
+	cpumask_set_cpu(cpu, retmask);
+}
+
 static void init_x2apic_ldr(void)
 {
 }
@@ -137,7 +144,7 @@ static struct apic apic_x2apic_phys = {
 	.check_apicid_used		= NULL,
 	.check_apicid_present		= NULL,
 
-	.vector_allocation_domain	= x2apic_vector_allocation_domain,
+	.vector_allocation_domain	= x2apic_phys_vector_allocation_domain,
 	.init_apic_ldr			= init_x2apic_ldr,
 
 	.ioapic_phys_id_map		= NULL,
--
1.7.6.5

--
Regards,
Alexander Gordeev
[email protected]


2012-05-18 14:41:59

by Cyrill Gorcunov

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Fri, May 18, 2012 at 12:26:41PM +0200, Alexander Gordeev wrote:
> Currently x2APIC in logical destination mode delivers interrupts to a
> single CPU, no matter how many CPUs were specified in the destination
> cpumask.
>
> This fix enables delivery of interrupts to multiple CPUs by bit-ORing
> Logical IDs of destination CPUs that have matching Cluster ID.
>
> Because only one cluster could be specified in a message destination
> address, the destination cpumask is tried for a cluster that contains
> maximum number of CPUs matching this cpumask. The CPUs in this cluster
> are selected to receive the interrupts while all other CPUs (in the
> cpumask) are ignored.

Hi Alexander,

I'm a bit confused, we do compose destination id from target cpumask
and send one ipi per _cluster_ with all cpus belonging to this cluster
OR'ed. So if my memory doesn't betray me all specified cpus in cluster
should obtain this message. Thus I'm wondering where do you find that
only one apic obtains ipi? (note i don't have the real physical
machine with x2apic enabled handy to test, so please share
the experience).

Cyrill

2012-05-18 15:42:48

by Alexander Gordeev

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Fri, May 18, 2012 at 06:41:53PM +0400, Cyrill Gorcunov wrote:
> On Fri, May 18, 2012 at 12:26:41PM +0200, Alexander Gordeev wrote:
> > Currently x2APIC in logical destination mode delivers interrupts to a
> > single CPU, no matter how many CPUs were specified in the destination
> > cpumask.
> >
> > This fix enables delivery of interrupts to multiple CPUs by bit-ORing
> > Logical IDs of destination CPUs that have matching Cluster ID.
> >
> > Because only one cluster could be specified in a message destination
> > address, the destination cpumask is tried for a cluster that contains
> > maximum number of CPUs matching this cpumask. The CPUs in this cluster
> > are selected to receive the interrupts while all other CPUs (in the
> > cpumask) are ignored.
>
> Hi Alexander,
>
> I'm a bit confused, we do compose destination id from target cpumask
> and send one ipi per _cluster_ with all cpus belonging to this cluster
> OR'ed. So if my memory doesn't betray me all specified cpus in cluster
> should obtain this message. Thus I'm wondering where do you find that
> only one apic obtains ipi? (note i don't have the real physical
> machine with x2apic enabled handy to test, so please share
> the experience).

Cyrill,

This patchset is not about IPIs at all. It is about interrupts coming from
IO-APICs.

I.e. if you check /proc/interrupts for some IR-IO-APIC you will notice that just
a single (first found) CPU receives the interrupts, even though the
corresponding /proc/irq/<irq#>/smp_affinity would specify multiple CPUs.

Given the current design it is unavoidable in physical destination mode (well,
as long as we do not broadcast, which I did not try yet). But it is well
avoidable in clustered logical addressing mode + lowest priority delivery mode.

I am attaching my debugging patches so you could check the actual values of IRTEs.

>
> Cyrill

--
Regards,
Alexander Gordeev
[email protected]


Attachments:
(No filename) (1.88 kB)
0001-x86-io-apic-Dump-IO-APICs-to-sys-kernel-ioapic.patch (6.54 kB)
0002-x86-intremap-Fix-get_irte-NULL-pointer-assignment.patch (816.00 B)
0003-x86-intremap-Dump-IRTEs-to-sys-kernel-intremap.patch (2.63 kB)

2012-05-18 15:51:25

by Cyrill Gorcunov

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Fri, May 18, 2012 at 05:42:37PM +0200, Alexander Gordeev wrote:
> > Hi Alexander,
> >
> > I'm a bit confused, we do compose destination id from target cpumask
> > and send one ipi per _cluster_ with all cpus belonging to this cluster
> > OR'ed. So if my memory doesn't betray me all specified cpus in cluster
> > should obtain this message. Thus I'm wondering where do you find that
> > only one apic obtains ipi? (note i don't have the real physical
> > machine with x2apic enabled handy to test, so please share
> > the experience).
>
> Cyrill,
>
> This patchset is not about IPIs at all. It is about interrupts coming from
> IO-APICs.

Ah, you mean io-apic here. I'll try to find some time tonight/tomorrow-morning
for review. Thanks!

Cyrill

2012-05-19 10:47:31

by Cyrill Gorcunov

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Fri, May 18, 2012 at 07:51:19PM +0400, Cyrill Gorcunov wrote:
> On Fri, May 18, 2012 at 05:42:37PM +0200, Alexander Gordeev wrote:
> > > Hi Alexander,
> > >
> > > I'm a bit confused, we do compose destination id from target cpumask
> > > and send one ipi per _cluster_ with all cpus belonging to this cluster
> > > OR'ed. So if my memory doesn't betray me all specified cpus in cluster
> > > should obtain this message. Thus I'm wondering where do you find that
> > > only one apic obtains ipi? (note i don't have the real physical
> > > machine with x2apic enabled handy to test, so please share
> > > the experience).
> >
> > This patchset is not about IPIs at all. It is about interrupts coming from
> > IO-APICs.
>
> Ah, you mean io-apic here. I'll try to find some time tonight/tomorrow-morning
> for review. Thanks!

Sorry for delay, Alexander. I can only review the code (I've no x2apic
testing machine at the moment) and it looks good for me.

Reviewed-by: Cyrill Gorcunov <[email protected]>

p.s. Hope Suresh and Yinghai will take a look too. The thing which bothers
me is that we use LowPrio delivery mode, while there is no lowprio in
x2apic ipi messages and I assume the same should apply to ioapic generated
messages.

Cyrill

2012-05-19 20:53:40

by Yinghai Lu

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Fri, May 18, 2012 at 3:26 AM, Alexander Gordeev <[email protected]> wrote:
> Currently x2APIC in logical destination mode delivers interrupts to a
> single CPU, no matter how many CPUs were specified in the destination
> cpumask.
>
> This fix enables delivery of interrupts to multiple CPUs by bit-ORing
> Logical IDs of destination CPUs that have matching Cluster ID.
>
> Because only one cluster could be specified in a message destination
> address, the destination cpumask is tried for a cluster that contains
> maximum number of CPUs matching this cpumask. The CPUs in this cluster
> are selected to receive the interrupts while all other CPUs (in the
> cpumask) are ignored.
>
> Signed-off-by: Alexander Gordeev <[email protected]>
> ---
> [...]
> +
> +static void
> +x2apic_cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
> +{
> +	cpumask_copy(retmask, cpu_possible_mask);

why not using per_cpu(cpus_in_cluster, cpu) instead?

also you may add one per cpu var like x86_cpu_to_logical_cluster_apicid.


Yinghai

2012-05-21 07:11:48

by Alexander Gordeev

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Sat, May 19, 2012 at 02:47:25PM +0400, Cyrill Gorcunov wrote:
> On Fri, May 18, 2012 at 07:51:19PM +0400, Cyrill Gorcunov wrote:

> > Ah, you mean io-apic here. I'll try to find some time tonight/tomorrow-morning
> > for review. Thanks!
>
> Sorry for delay, Alexander. I can only review the code (I've no x2apic
> testing machine at the moment) and it looks good for me.
>
> Reviewed-by: Cyrill Gorcunov <[email protected]>

Thank you, Cyrill.

> p.s. Hope Suresh and Yinghai will take a look too. The thing which bothers
> me is that we use LowPrio delivery mode, while there is no lowprio in
> x2apic ipi messages and I assume the same should apply to ioapic generated
> messages.

My understanding of the specification is that LowestPrio is not supported only
for IPIs on x2APIC. I can not imagine why IO-APICs would need to be limited
here. But let's hear what the guys think.


--
Regards,
Alexander Gordeev
[email protected]

2012-05-21 08:13:41

by Alexander Gordeev

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Sat, May 19, 2012 at 01:53:36PM -0700, Yinghai Lu wrote:
> > +static void
> > +x2apic_cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
> > +{
> > + ? ? ? cpumask_copy(retmask, cpu_possible_mask);
>
> why not using per_cpu(cpus_in_cluster, cpu) instead?

Because it would lead to suboptimal results when updating IRQ affinity:

int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
			  unsigned int *dest_id)
{
	struct irq_cfg *cfg = data->chip_data;

	if (!cpumask_intersects(mask, cpu_online_mask))
		return -1;

	if (assign_irq_vector(data->irq, data->chip_data, mask))
		return -1;

This call ^^^ will update cfg->domain with the value returned by the call to
apic->vector_allocation_domain(). If per_cpu(cpus_in_cluster, cpu) is returned
as cfg->domain here then all other clusters contained in the 'mask' will not
be taken into consideration by the apic->cpu_mask_to_apicid_and() call below.

	cpumask_copy(data->affinity, mask);

	*dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);

So we really need to submit all possible CPUs here ^^^ to be able to find the
best/heaviest cluster out of the 'mask'.

	return 0;
}


> also you may add one per cpu var like x86_cpu_to_logical_cluster_apicid.

Both cpu_mask_to_apicid() and cpu_mask_to_apicid_and() take a cpumask to
derive the apicid from. Even though we could cache the value of apicid in
'x86_cpu_to_logical_cluster_apicid' variable, we still would have to unset
CPUs which are not in the requested cpumask. That means scanning through the
cpumask etc -- exactly what the patch does now.

Or I am missing your point here..

> Yinghai

--
Regards,
Alexander Gordeev
[email protected]

2012-05-21 08:22:47

by Ingo Molnar

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode


* Alexander Gordeev <[email protected]> wrote:

> Currently x2APIC in logical destination mode delivers
> interrupts to a single CPU, no matter how many CPUs were
> specified in the destination cpumask.
>
> This fix enables delivery of interrupts to multiple CPUs by
> bit-ORing Logical IDs of destination CPUs that have matching
> Cluster ID.
>
> Because only one cluster could be specified in a message
> destination address, the destination cpumask is tried for a
> cluster that contains maximum number of CPUs matching this
> cpumask. The CPUs in this cluster are selected to receive the
> interrupts while all other CPUs (in the cpumask) are ignored.

I'm wondering how you tested this. AFAICS current irqbalanced
will create masks but on x2apic only the first CPU is targeted
by the kernel.

So, in theory, prior the patch you should be seeing irqs go to
only one CPU, while after the patch they are spread out amongst
the CPU. If it's using LowestPrio delivery then we depend on the
hardware doing this for us - how does this work out in practice,
are the target CPUs round-robin-ed, with a new CPU for every new
IRQ delivered?

Thanks,

Ingo

2012-05-21 09:37:09

by Alexander Gordeev

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, May 21, 2012 at 10:22:40AM +0200, Ingo Molnar wrote:
>
> * Alexander Gordeev <[email protected]> wrote:
>
> > Currently x2APIC in logical destination mode delivers
> > interrupts to a single CPU, no matter how many CPUs were
> > specified in the destination cpumask.
> >
> > This fix enables delivery of interrupts to multiple CPUs by
> > bit-ORing Logical IDs of destination CPUs that have matching
> > Cluster ID.
> >
> > Because only one cluster could be specified in a message
> > destination address, the destination cpumask is tried for a
> > cluster that contains maximum number of CPUs matching this
> > cpumask. The CPUs in this cluster are selected to receive the
> > interrupts while all other CPUs (in the cpumask) are ignored.
>
> I'm wondering how you tested this. AFAICS current irqbalanced
> will create masks but on x2apic only the first CPU is targeted
> by the kernel.

Right, that is what this patch is intended to change. So I use:
'hwclock --test' to generate rtc interrupts
/proc/interrupts to check where/how many interrupts were delivered
/proc/irq/8/smp_affinity to check how clusters are chosen
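
E.g. a session along these lines (the mask value is just for illustration):

  # echo 3f > /proc/irq/8/smp_affinity
  # hwclock --test >/dev/null
  # grep rtc /proc/interrupts

Before the patch the rtc counter grows on a single CPU only; with the patch
the counts spread across the CPUs of the chosen cluster.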

> So, in theory, prior the patch you should be seeing irqs go to
> only one CPU, while after the patch they are spread out amongst
> the CPU. If it's using LowestPrio delivery then we depend on the
> hardware doing this for us - how does this work out in practice,
> are the target CPUs round-robin-ed, with a new CPU for every new
> IRQ delivered?


That is exactly what I can observe.

As of 'target CPUs round-robin-ed' and 'with a new CPU for every new IRQ
delivered' -- that is something we can not control as you noted. Nor do we
care to my understanding.

I can not commit on every h/w out there obviously, but on my PowerEdge M910
with some half-dozen clusters with six CPU per each, the interrupts are
perfectly balanced among those ones present in IRTEs.


> Thanks,
>
> Ingo

--
Regards,
Alexander Gordeev
[email protected]

2012-05-21 09:46:08

by Cyrill Gorcunov

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, May 21, 2012 at 09:11:36AM +0200, Alexander Gordeev wrote:
>
> > p.s. Hope Suresh and Yinghai will take a look too. The thing which bothers
> > me is that we use LowPrio delivery mode, while there is no lowprio in
> > x2apic ipi messages and I assume the same should apply to ioapic generated
> > messages.
>
> My understanding of the specification is LowestPrio is not supported only for
> IPIs on x2APIC. Can not imagine why IO-APICs need to be limited here. But let's
> hear what the guys think.

True. This limitation applies to IPIs only.

Cyrill

2012-05-21 12:40:32

by Ingo Molnar

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode


* Alexander Gordeev <[email protected]> wrote:

> > So, in theory, prior the patch you should be seeing irqs go
> > to only one CPU, while after the patch they are spread out
> > amongst the CPU. If it's using LowestPrio delivery then we
> > depend on the hardware doing this for us - how does this
> > work out in practice, are the target CPUs round-robin-ed,
> > with a new CPU for every new IRQ delivered?
>
> That is exactly what I can observe.
>
> As of 'target CPUs round-robin-ed' and 'with a new CPU for
> every new IRQ delivered' -- that is something we can not
> control as you noted. Nor do we care to my understanding.
>
> I can not commit on every h/w out there obviously, but on my
> PowerEdge M910 with some half-dozen clusters with six CPU per
> each, the interrupts are perfectly balanced among those ones
> present in IRTEs.

But that is not 'perfectly balanced' in many cases.

When the hardware round-robins the interrupts then each
interrupt will go to a 'cache cold' CPU in essence. This is
pretty much the worst possible thing to do in most cases:
while it's "perfectly balanced" in the sense of distributing
cycles evenly between CPUs, each interrupt handler execution
will generate an avalanche of cache misses, for cachelines that
were modified in the previous invocation of the irq.

One notable exception is when the CPUs are SMT/Hyperthreading
siblings, in that case they are sharing even the L1 cache, so
there's very little cost to round-robining the IRQs within the
CPU mask.

But AFAICS irqbalanced will spread irqs on wider masks than SMT
sibling boundaries, exposing us to the above performance
problem.

So I think we need to tread carefully here.

Thanks,

Ingo

2012-05-21 14:48:38

by Alexander Gordeev

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, May 21, 2012 at 02:40:26PM +0200, Ingo Molnar wrote:
> But that is not 'perfectly balanced' in many cases.
>
> When the hardware round-robins the interrupts then each
> interrupt will go to a 'cache cold' CPU in essence. This is
> pretty much the worst possible thing to do in most cases:
> while it's "perfectly balanced" in the sense of distributing
> cycles evenly between CPUs, each interrupt handler execution
> will generate an avalanche of cache misses, for cachelines that
> were modified in the previous invocation of the irq.

Absolutely.

There are at least two more offenders :) exercising lowest priority + logical
addressing similarly. So in this regard the patch is nothing new:


static inline unsigned int
default_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
	return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
}

static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
	unsigned int round = 0;
	int cpu, apicid = 0;

	/*
	 * The cpus in the mask must all be on the apic cluster.
	 */
	for_each_cpu(cpu, cpumask) {
		int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);

		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
			printk("%s: Not a valid mask!\n", __func__);
			return BAD_APICID;
		}
		apicid |= new_apicid;
		round++;
	}
	return apicid;
}

> One notable exception is when the CPUs are SMT/Hyperthreading
> siblings, in that case they are sharing even the L1 cache, so
> there's very little cost to round-robining the IRQs within the
> CPU mask.
>
> But AFAICS irqbalanced will spread irqs on wider masks than SMT
> sibling boundaries, exposing us to the above performance
> problem.

I would speculate it is irqbalanced that should be (in case of x86) cluster-
agnostic and ask for a mask, while the apic layer just executes, or at least
reports, what was set. But that is a different topic.

Considering the bigger picture, it appears strange to me that the apic is the
layer to decide whether to make a CPU a target or not. It is especially true
when one means a particular cpumask, wants it to be set, but still is not able
to do that due to the current limitation.

> So I think we need to tread carefully here.

Kernel parameter? IRQ line flag? Totally opposed? :)

> Thanks,

>
> Ingo

--
Regards,
Alexander Gordeev
[email protected]

2012-05-21 14:59:12

by Ingo Molnar

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode


* Alexander Gordeev <[email protected]> wrote:

> > So I think we need to tread carefully here.
>
> Kernel parameter? IRQ line flag? Totally opposed? :)

Undecided and sceptical, because: -ENONUMBERS :-)

The kernel actually tries to have as much locality as we can
hide from the user.

For example we don't execute tasks for 100 usecs on one CPU,
then jump to another CPU and execute 100 usecs there, then to
yet another CPU to create an 'absolutely balanced use of CPU
resources'. Why? Because the cache-misses would be killing us.

When a device is generating ten thousand irqs a second then
round-robining the IRQs is equivalent to switching a CPU every
100 usecs.

There's another cost: tasks tend to try to migrate to sources of
wakeups. So if an IRQ wakes up a task then we will try to
execute the task locally, to have locality of access between the
data that the IRQ handler has touched, and the task that makes
use of it. Round-robining the IRQ in that situations means that
the task either follows the IRQ and round-robins (bad), or that
it stays remote to the IRQ data (not good either).

So, I'm sceptical :-/

Your other two patches looked sensible - will they apply fine
after each other, without the round-robin patch which is still
under discussion, or is there a dependency?

Thanks,

Ingo

2012-05-21 15:22:28

by Alexander Gordeev

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, May 21, 2012 at 04:59:04PM +0200, Ingo Molnar wrote:
>
> * Alexander Gordeev <[email protected]> wrote:
> Your other two patches looked sensible - will they apply fine
> after each other, without the round-robin patch which is still
> under discussion, or is there a dependency?

1/3 will apply cleanly; not worth worrying though -- whitespace oneliner
3/3 depends on 2/3 round-robin patch, and doubtful :)

> Thanks,
>
> Ingo

--
Regards,
Alexander Gordeev
[email protected]

2012-05-21 15:34:36

by Cyrill Gorcunov

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, May 21, 2012 at 04:59:04PM +0200, Ingo Molnar wrote:
>
> When a device is generating ten thousand irqs a second then
> round-robining the IRQs is equivalent to switching a CPU every
> 100 usecs.
>

As far as I remember it's even more complex: with lowprio it
might not be a completely fair round-robin, since if a focused
processor is found then it'll be assigned to handle the incoming
irq again.

Cyrill

2012-05-21 15:36:32

by Linus Torvalds

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, May 21, 2012 at 7:59 AM, Ingo Molnar <[email protected]> wrote:
>
> For example we don't execute tasks for 100 usecs on one CPU,
> then jump to another CPU and execute 100 usecs there, then to
> yet another CPU to create an 'absolutely balanced use of CPU
> resources'. Why? Because the cache-misses would be killing us.

That is likely generally not true within a single socket, though.

Interrupt handlers will basically never hit in the L1 anyway (*maybe*
it happens if the CPU is totally idle, but quite frankly, I doubt it).
Even the L2 is likely not large enough to have much cache across irqs,
unless it's one of the big Core 2 L2's that are largely shared per
socket anyway.

So it may well make perfect sense to allow a mask of CPU's for
interrupt delivery, but just make sure that the mask all points to
CPU's on the same socket. That would give the hardware some leeway in
choosing the actual core - it's very possible that hardware could
avoid cores that are running with irq's disabled (possibly improving
latency) or even more likely - avoid cores that are in deeper
powersaving modes.

Avoiding waking up CPU's that are in C6 would not only help latency,
it would help power use. I don't know how well the irq handling
actually works on a hw level, but that's exactly the kind of thing I
would expect HW to do well (and sw would do badly, because the
latencies for things like CPU power states are low enough that trying
to do SW irq balancing at that level is entirely and completely
idiotic).

So I do think that we should aim for *allowing* hardware to do these
kinds of choices for us. Limiting irq delivery to a particular core is
very limiting for very little gain (almost no cache benefits), but
limiting it to a particular socket could certainly be a valid thing.
You might want to limit it to a particular socket anyway, just because
the hardware itself may well be closer to one socket (coming off the
PCIe lanes of that particular socket) than anything else.

Linus

2012-05-21 18:08:44

by Suresh Siddha

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, 2012-05-21 at 08:36 -0700, Linus Torvalds wrote:
> On Mon, May 21, 2012 at 7:59 AM, Ingo Molnar <[email protected]> wrote:
> >
> > For example we don't execute tasks for 100 usecs on one CPU,
> > then jump to another CPU and execute 100 usecs there, then to
> > yet another CPU to create an 'absolutely balanced use of CPU
> > resources'. Why? Because the cache-misses would be killing us.
>
> That is likely generally not true within a single socket, though.
>
> Interrupt handlers will basically never hit in the L1 anyway (*maybe*
> it happens if the CPU is totally idle, but quite frankly, I doubt it).
> Even the L2 is likely not large enough to have much cache across irqs,
> unless it's one of the big Core 2 L2's that are largely shared per
> socket anyway.
>
> So it may well make perfect sense to allow a mask of CPU's for
> interrupt delivery, but just make sure that the mask all points to
> CPU's on the same socket.

All the cluster members of a given x2apic cluster belong to the same
package. These x2apic cluster id's are setup by the HW and not by the
SW. And only one cluster (with one or multiple members of that cluster
set) can be specified in the interrupt destination field of the routing
table entry.
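
(For reference, the SDM derives the logical ID from the x2APIC ID roughly as

	ldr = ((x2apic_id >> 4) << 16) | (1 << (x2apic_id & 0xf));

i.e. bits 31:16 carry the cluster id and bits 15:0 one bit per cluster
member, which is why a single destination can address at most 16 CPUs of
one cluster.)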

> That would give the hardware some leeway in
> choosing the actual core - it's very possible that hardware could
> avoid cores that are running with irq's disabled (possibly improving
> latecy) or even more likely - avoid cores that are in deeper
> powersaving modes.

Power aware interrupt routing in IVB does this. And the policy of
whether you want the interrupt to be routed to the busy core (to save
power) or an idle core (for minimizing the interruptions on the busy
core) can be selected by the SW (using IA32_ENERGY_PERF_BIAS MSR).

thanks,
suresh

2012-05-21 18:18:27

by Linus Torvalds

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, May 21, 2012 at 11:07 AM, Suresh Siddha
<[email protected]> wrote:
>
> All the cluster members of a given x2apic cluster belong to the same
> package. These x2apic cluster id's are setup by the HW and not by the
> SW. And only one cluster (with one or multiple members of that cluster
> set) can be specified in the interrupt destination field of the routing
> table entry.

Ok, then the main question ends up being if there are enough cache or
power domains within a cluster to still worry about it.

For example, you say "package", but that can sometimes mean multiple
dies, or even just split caches that are big enough to matter
(although I can't think of any such right now on the x86 side - Core2
Duo had huge L2's, but they were shared, not split).

> Power aware interrupt routing in IVB does this. And the policy of
> whether you want the interrupt to be routed to the busy core (to save
> power) or an idle core (for minimizing the interruptions on the busy
> core) can be selected by the SW (using IA32_ENERGY_PERF_BIAS MSR).

Sounds like we definitely would want to support this at least in the
IVB timeframe then.

But I do agree with Ingo that it would be really good to actually see
numbers (and no, I don't mean "look here, now the irq's are nicely
spread out", but power and/or performance numbers showing that it
actually helps something).

Linus

2012-05-21 18:38:53

by Suresh Siddha

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, 2012-05-21 at 11:18 -0700, Linus Torvalds wrote:
> On Mon, May 21, 2012 at 11:07 AM, Suresh Siddha
> <[email protected]> wrote:
> >
> > All the cluster members of a given x2apic cluster belong to the same
> > package. These x2apic cluster id's are setup by the HW and not by the
> > SW. And only one cluster (with one or multiple members of that cluster
> > set) can be specified in the interrupt destination field of the routing
> > table entry.
>
> Ok, then the main question ends up being if there are enough cache or
> power domains within a cluster to still worry about it.

There are 16 members within an x2apic cluster. With two HT siblings,
that will still leave 8 cores.

>
> For example, you say "package", but that can sometimes mean multiple
> dies, or even just split caches that are big enough to matter
> (although I can't think of any such right now on the x86 side - Core2
> Duo had huge L2's, but they were shared, not split).

Most likely multiple dies or split caches will have different
cluster-id's. I don't know of any upcoming implementations that will
have such an implementation supporting x2apic, but will keep an eye.

>
> > Power aware interrupt routing in IVB does this. And the policy of
> > whether you want the interrupt to be routed to the busy core (to save
> > power) or an idle core (for minimizing the interruptions on the busy
> > core) can be selected by the SW (using IA32_ENERGY_PERF_BIAS MSR).
>
> Sounds like we definitely would want to support this at least in the
> IVB timeframe then.
>
> But I do agree with Ingo that it would be really good to actually see
> numbers (and no, I don't mean "look here, now the irq's are nicely
> spread out", but power and/or performance numbers showing that it
> actually helps something).

I agree. This is the reason why I held up posting these patches before.
I can come up with micro-benchmarks that can show some difference but
the key is to find a good workload/benchmark that can show a measurable
difference. Any suggestions?

thanks,
suresh

2012-05-21 19:15:54

by Ingo Molnar

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode


* Linus Torvalds <[email protected]> wrote:

> On Mon, May 21, 2012 at 7:59 AM, Ingo Molnar <[email protected]> wrote:
> >
> > For example we don't execute tasks for 100 usecs on one CPU,
> > then jump to another CPU and execute 100 usecs there, then
> > to yet another CPU to create an 'absolutely balanced use of
> > CPU resources'. Why? Because the cache-misses would be
> > killing us.
>
> That is likely generally not true within a single socket,
> though.
>
> Interrupt handlers will basically never hit in the L1 anyway
> (*maybe* it happens if the CPU is totally idle, but quite
> frankly, I doubt it). Even the L2 is likely not large enough
> to have much cache across irqs, unless it's one of the big
> Core 2 L2's that are largely shared per socket anyway.

Indeed, you are right as far as the L1 cache is concerned:

For networking we have about 180 L1 misses per IRQ handler
invocation, if the L1 cache is cold - so most IRQ handlers
easily fit into the L1 cache. I have measured this using a
constant rate of networking IRQs and doing:

perf stat -a -C 0 --repeat 10 -e L1-dcache-load-misses:k sleep 1

It only takes another 180 cache misses for these lines to get
evicted, and this happens very easily:

On the same testsystem a parallel kernel build will evict about
25 million L1 cachelines/sec/CPU. That means that an IRQ
handler's working set in the L1 cache is indeed gone in less
than 8 microseconds.

When the workload is RAM-bound then the L1 working set of an IRQ
handler is gone in about 80 usecs. That corresponds to an IRQ
rate of about 12,500/sec/CPU: if IRQs are coming in faster than
this then they can still see some of the previous execution's
footprint in the cache.
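
(Back of the envelope: 180 lines / 25M evictions/sec/CPU = 7.2 usecs in the
kernel-build case, and with the eviction rate roughly ten times lower in the
RAM-bound case you get the ~80 usecs, i.e. 1 sec / 80 usecs = 12,500
irqs/sec.)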

Wrt. the L2 cache the numbers come in much more in favor of not
moving IRQ handlers across L2 cache domains - this means that
allowing the hardware to distribute them per socket is a pretty
sensible default.

> So it may well make perfect sense to allow a mask of CPU's for
> interrupt delivery, but just make sure that the mask all
> points to CPU's on the same socket. That would give the
> hardware some leeway in choosing the actual core - it's very
> possible that hardware could avoid cores that are running with
> irq's disabled (possibly improving latecy) or even more likely
> - avoid cores that are in deeper powersaving modes.

Indeed, and that's an important argument.

The one negative effect I mentioned, affine wakeups done by the
scheduler, could still bite us - this has to be measured and
affine wakeups have to be made less prominent if IRQ handlers
start jumping around. We definitely don't want tasks to follow
round-robin IRQs around.

> Avoiding waking up CPU's that are in C6 would not only help
> latency, it would help power use. I don't know how well the
> irq handling actually works on a hw level, but that's exactly
> the kind of thing I would expect HW to do well (and sw would
> do badly, because the latencies for things like CPU power
> states are low enough that trying to do SW irq balancing at
> that level is entirely and completely idiotic).
>
> So I do think that we should aim for *allowing* hardware to do
> these kinds of choices for us. Limiting irq delivery to a
> particular core is very limiting for very little gain (almost
> no cache benefits), but limiting it to a particular socket
> could certainly be a valid thing. You might want to limit it
> to a particular socket anyway, just because the hardware
> itself may well be closer to one socket (coming off the PCIe
> lanes of that particular socket) than anything else.

Ok, I'm convinced.

Limiting to a socket is I suspect an important constraint: we
shouldn't just feed the hardware whatever mask user-space sends
us, user-space might not be aware of (or confused about) socket
boundaries.

Thanks,

Ingo

2012-05-21 19:30:58

by Ingo Molnar

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode


* Suresh Siddha <[email protected]> wrote:

> > But I do agree with Ingo that it would be really good to
> > actually see numbers (and no, I don't mean "look here, now
> > the irq's are nicely spread out", but power and/or
> > performance numbers showing that it actually helps
> > something).
>
> I agree. This is the reason why I held up posting these
> patches before. I can come up with micro-benchmarks that can
> show some difference but the key is to find good
> workload/benchmark that can show measurable difference. Any
> suggestions?

It's rather difficult to measure this reliably. The main
complication is the inherent noise of cache stats on SMP/NUMA
systems, which all modern multi-socket systems are ...

But, since you asked, if you can generate a *very* precise
incoming external IRQ rate, it's possible:

Generate say 10,000 irqs/sec of a workload directed at a single
CPU - something like multiple copies of ping -i 0.001 -q
executed on a nearby system might do.

Then run a user-space cycle soaker, nice -19 running NOPs on all
CPUs. It's important that it is *only* a user-space infinite loop,
with no kernel instructions executed at all - see later.
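
The soaker can be as dumb as this (a sketch - the volatile counter just
keeps the compiler from optimizing the loop away):

/* soak.c: pure user-space spin, no kernel entry after startup */
int main(void)
{
	volatile unsigned long n = 0;

	for (;;)
		n++;
}

started with something like:

  $ gcc -O2 -o soak soak.c
  $ for c in $(seq 0 $(($(nproc) - 1))); do nice -n 19 taskset -c $c ./soak & done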

Then play around with variants of:

perf stat -a --repeat 10 -e cycles:u -e instructions:u sleep 1

this will tell you the number of user-space cycles and
instructions executed, per second. The ':u' attribute to limit
to user-space cycles filters apart the IRQ handler overhead from
your user-space cycle soaker.

This number of 'available user-space performance' should not get
worse when you switch from single-CPU APIC target to a
hardware-round-robin target mask. You can switch the mask using
/proc/irq/nr/smp_affinity with very low overhead, while all the
above measurements are running - this allows you to see how
user-space throughput reacts to the IRQ details.

Double check that the irq rate is constant, via 'vmstat 1'.

Thanks,

Ingo

2012-05-21 19:58:27

by Suresh Siddha

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, 2012-05-21 at 21:15 +0200, Ingo Molnar wrote:
> The one negative effect I mentioned, affine wakeups done by the
> scheduler, could still bite us - this has to be measured and
> affine wakeups have to be made less prominent if IRQ handlers
> start jumping around. We definitely don't want tasks to follow
> round-robin IRQs around.

Actually this (ping-ponging around idle cpu's in a socket) happens
already today, as the affine wakeups use select_idle_sibling() to wake
up the task on an idle sibling (HT or core sibling) if the cpu on which
the task is woken-up is busy with something else.

But I agree, somehow all these need to work together. For example to
save power, what we want is the idle core to stay idle with the least
number of interruptions, with interrupts routed to busy cores, scheduler
not aggressively waking the tasks up on idle cores etc.

One quick thought is to use the cpufreq governor decisions and if all
the cpu's in the package are in lower p-states, then we can default for
power-saving decisions, otherwise if any cpu is at P0, switch to a
performance policy (in terms of irq routing, scheduler wake-up decisions
etc) dynamically etc.

thanks,
suresh


2012-05-21 23:02:10

by Yinghai Lu

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, May 21, 2012 at 1:13 AM, Alexander Gordeev <[email protected]> wrote:
> On Sat, May 19, 2012 at 01:53:36PM -0700, Yinghai Lu wrote:
>> > +static void
>> > +x2apic_cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
>> > +{
>> > +	cpumask_copy(retmask, cpu_possible_mask);
>>
>> why not using per_cpu(cpus_in_cluster, cpu) instead?
>
> Because it would lead to suboptimal results when updating IRQ affinity:
>
> int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
> 			  unsigned int *dest_id)
> {
> 	struct irq_cfg *cfg = data->chip_data;
>
> 	if (!cpumask_intersects(mask, cpu_online_mask))
> 		return -1;
>
> 	if (assign_irq_vector(data->irq, data->chip_data, mask))
> 		return -1;
>
> This call ^^^ will update cfg->domain with the value returned by the call to
> apic->vector_allocation_domain(). If per_cpu(cpus_in_cluster, cpu) is returned
> as cfg->domain here then all other clusters contained in the 'mask' will not
> be taken into consideration by the apic->cpu_mask_to_apicid_and() call below.
>
> 	cpumask_copy(data->affinity, mask);
>
> 	*dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
>
> So we really need to submit all possible CPUs here ^^^ to be able to find the
> best/heaviest cluster out of the 'mask'.

ok.

>
> ? ? ? ?return 0;
> }
>
>
>> also you may add one per cpu var like x86_cpu_to_logical_cluster_apicid.
>
> Both cpu_mask_to_apicid() and cpu_mask_to_apicid_and() take a cpumask to
> derive the apicid from. Even though we could cache the value of apicid in
> 'x86_cpu_to_logical_cluster_apicid' variable, we still would have to unset
> CPUs which are not in the requested cpumask. That means scanning through the
> cpumask etc -- exactly what the patch does now.

ok, i got it. thanks for the explanation.

I was thinking: pick one cpu from the cpumask that the user wants, and then
just use all cpus in that cluster as dest cpus.

Yinghai

2012-05-21 23:33:06

by Yinghai Lu

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, May 21, 2012 at 4:02 PM, Yinghai Lu <[email protected]> wrote:
> On Mon, May 21, 2012 at 1:13 AM, Alexander Gordeev <[email protected]> wrote:
>> On Sat, May 19, 2012 at 01:53:36PM -0700, Yinghai Lu wrote:
>>> > +static void
>>> > +x2apic_cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
>>> > +{
>>> > +	cpumask_copy(retmask, cpu_possible_mask);
>>>
>>> why not using per_cpu(cpus_in_cluster, cpu) instead?
>>
>> Because it would lead to suboptimal results when updating IRQ affinity:
>>
>> [...]
>
> ok.

Maybe you can not simply change that to cpu_possible_mask.

Otherwise assign_irq_vector() will try to get the same vector on all cpus
for one irq.

That will make x2apic cluster mode support only 224 irqs in total (the 256
vectors per cpu minus the 32 reserved ones), instead.

So you need to make vector_allocation_domain() return a smaller cpu set.

Yinghai

2012-05-21 23:46:18

by Suresh Siddha

Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, 2012-05-21 at 10:13 +0200, Alexander Gordeev wrote:
> On Sat, May 19, 2012 at 01:53:36PM -0700, Yinghai Lu wrote:
> > > +static void
> > > +x2apic_cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
> > > +{
> > > +	cpumask_copy(retmask, cpu_possible_mask);
> >
> > why not using per_cpu(cpus_in_cluster, cpu) instead?
>
> Because it would lead to suboptimal results when updating IRQ affinity:
>
> int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
> 			  unsigned int *dest_id)
> {
> 	struct irq_cfg *cfg = data->chip_data;
>
> 	if (!cpumask_intersects(mask, cpu_online_mask))
> 		return -1;
>
> 	if (assign_irq_vector(data->irq, data->chip_data, mask))
> 		return -1;
>
> This call ^^^ will update cfg->domain with the value returned by the call to
> apic->vector_allocation_domain(). If per_cpu(cpus_in_cluster, cpu) is returned
> as cfg->domain here then all other clusters contained in the 'mask' will not
> be taken into consideration by the apic->cpu_mask_to_apicid_and() call below.
>
> 	cpumask_copy(data->affinity, mask);
>
> 	*dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
>
> So we really need to submit all possible CPUs here ^^^ to be able to find the
> best/heaviest cluster out of the 'mask'.
>

I don't think we need to do anything fancy, like selecting the heaviest
cluster out of the mask. Something like (cluster membership of first cpu
in the mask) AND (mask) should be enough.
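
I.e. roughly (untested sketch):

	int cpu = cpumask_first(mask);

	cpumask_and(retmask, per_cpu(cpus_in_cluster, cpu), mask);

so the resulting domain never spans more than one cluster.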

Also you need some changes in the assign_irq_vector(). Will post my
previously developed patches as a reference to this thread, while I
collect some data for this.

thanks,
suresh

2012-05-21 23:59:53

by Suresh Siddha

Subject: [PATCH 1/2] x86, irq: update irq_cfg domain unless the new affinity is a subset of the current domain

Until now, irq_cfg domain is mostly static. Either all cpu's (used by flat
mode) or one cpu (first cpu in the irq affinity mask) to which irq is being
migrated (this is used by the rest of apic modes).

Upcoming x2apic cluster mode optimization patch allows the irq to be sent
to any cpu in the x2apic cluster (if supported by the HW). So irq_cfg
domain changes on the fly (depending on which cpu in the x2apic cluster
is online).

Instead of checking for any intersection between the new irq affinity
mask and the current irq_cfg domain, check if the new irq affinity mask
is a subset of the current irq_cfg domain. Otherwise proceed with
updating the irq_cfg domain as well as assigning vectors on all the cpus
specified in the new mask.

This also cleans up a workaround in updating irq_cfg domain for legacy irq's
that are handled by the IO-APIC.

Signed-off-by: Suresh Siddha <[email protected]>
---
arch/x86/kernel/apic/io_apic.c | 15 ++++++---------
1 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ffdc152..bbf8c43 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1137,8 +1137,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 	old_vector = cfg->vector;
 	if (old_vector) {
 		cpumask_and(tmp_mask, mask, cpu_online_mask);
-		cpumask_and(tmp_mask, cfg->domain, tmp_mask);
-		if (!cpumask_empty(tmp_mask)) {
+		if (cpumask_subset(tmp_mask, cfg->domain)) {
 			free_cpumask_var(tmp_mask);
 			return 0;
 		}
@@ -1152,6 +1151,11 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 
 		apic->vector_allocation_domain(cpu, tmp_mask);
 
+		if (cpumask_subset(tmp_mask, cfg->domain)) {
+			free_cpumask_var(tmp_mask);
+			return 0;
+		}
+
 		vector = current_vector;
 		offset = current_offset;
 next:
@@ -1357,13 +1361,6 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
 
 	if (!IO_APIC_IRQ(irq))
 		return;
-	/*
-	 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
-	 * controllers like 8259. Now that IO-APIC can handle this irq, update
-	 * the cfg->domain.
-	 */
-	if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
-		apic->vector_allocation_domain(0, cfg->domain);
 
 	if (assign_irq_vector(irq, cfg, apic->target_cpus()))
 		return;
--
1.7.6.5

2012-05-21 23:59:59

by Suresh Siddha

[permalink] [raw]
Subject: [PATCH 2/2] x2apic, cluster: use all the members of one cluster specified in the smp_affinity mask for the interrupt destination

If the HW implements round-robin interrupt delivery, this enables multiple
cpu's (which are part of the user specified interrupt smp_affinity mask
and belong to the same x2apic cluster) to service the interrupt.

Also if the platform supports Power Aware Interrupt Routing, then this
enables the interrupt to be routed to an idle cpu or a busy cpu depending on
the perf/power bias tunable.

We are now grouping all the cpu's in a cluster to one vector domain. So that
will limit the total number of interrupt sources handled by Linux. Previously
we supported "cpu-count * available-vectors-per-cpu" interrupt sources but this
will now be reduced to "cpu-count/16 * available-vectors-per-cpu".
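
As a rough illustration with made-up numbers: on a 64-cpu system with about
200 externally usable vectors per cpu, per-cpu domains allow roughly
64 * 200 = 12800 interrupt sources, while 16-member cluster domains allow
only 64/16 * 200 = 800.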

Signed-off-by: Suresh Siddha <[email protected]>
---
arch/x86/include/asm/x2apic.h | 9 -----
arch/x86/kernel/apic/x2apic_cluster.c | 56 +++++++++++++++++++++++----------
arch/x86/kernel/apic/x2apic_phys.c | 9 +++++
3 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
index 92e54ab..7a5a832 100644
--- a/arch/x86/include/asm/x2apic.h
+++ b/arch/x86/include/asm/x2apic.h
@@ -28,15 +28,6 @@ static int x2apic_apic_id_registered(void)
return 1;
}

-/*
- * For now each logical cpu is in its own vector allocation domain.
- */
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
static void
__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
{
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index ff35cff..90d999c 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -98,34 +98,47 @@ static void x2apic_send_IPI_all(int vector)

static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
- /*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
- */
int cpu = cpumask_first(cpumask);
+ u32 dest = 0;
+ int i;

- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
- else
+ if (cpu > nr_cpu_ids)
return BAD_APICID;
+
+ for_each_cpu_and(i, cpumask, per_cpu(cpus_in_cluster, cpu))
+ dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+
+ return dest;
}

static unsigned int
x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
const struct cpumask *andmask)
{
- int cpu;
+ u32 dest = 0;
+ u16 cluster;
+ int i;

- /*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
+ for_each_cpu_and(i, cpumask, andmask) {
+ if (!cpumask_test_cpu(i, cpu_online_mask))
+ continue;
+ dest = per_cpu(x86_cpu_to_logical_apicid, i);
+ cluster = x2apic_cluster(i);
+ break;
}

- return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ if (!dest)
+ return BAD_APICID;
+
+ for_each_cpu_and(i, cpumask, andmask) {
+ if (!cpumask_test_cpu(i, cpu_online_mask))
+ continue;
+ if (cluster != x2apic_cluster(i))
+ continue;
+ dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+ }
+
+ return dest;
}

static void init_x2apic_ldr(void)
@@ -208,6 +221,15 @@ static int x2apic_cluster_probe(void)
return 0;
}

+/*
+ * Each x2apic cluster is an allocation domain.
+ */
+static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu));
+}
+
static struct apic apic_x2apic_cluster = {

.name = "cluster x2apic",
@@ -225,7 +247,7 @@ static struct apic apic_x2apic_cluster = {
.check_apicid_used = NULL,
.check_apicid_present = NULL,

- .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .vector_allocation_domain = cluster_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,

.ioapic_phys_id_map = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index c17e982..93b2570 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -120,6 +120,15 @@ static int x2apic_phys_probe(void)
return apic == &apic_x2apic_phys;
}

+/*
+ * Each logical cpu is in its own vector allocation domain.
+ */
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
static struct apic apic_x2apic_phys = {

.name = "physical x2apic",
--
1.7.6.5

2012-05-22 07:04:35

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 2/2] x2apic, cluster: use all the members of one cluster specified in the smp_affinity mask for the interrupt destination


* Suresh Siddha <[email protected]> wrote:

> If the HW implements round-robin interrupt delivery, this
> enables multiple cpu's (which are part of the user specified
> interrupt smp_affinity mask and belong to the same x2apic
> cluster) to service the interrupt.

Could/should we do something similar for regular APICs as well?
They too support masks and LowestPrio delivery - and doing that
will increase test coverage rather significantly.

Thanks,

Ingo

2012-05-22 07:34:33

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [PATCH 2/2] x2apic, cluster: use all the members of one cluster specified in the smp_affinity mask for the interrupt destination

On Tue, May 22, 2012 at 09:04:29AM +0200, Ingo Molnar wrote:
>
> * Suresh Siddha <[email protected]> wrote:
>
> > If the HW implements round-robin interrupt delivery, this
> > enables multiple cpu's (which are part of the user specified
> > interrupt smp_affinity mask and belong to the same x2apic
> > cluster) to service the interrupt.
>
> Could/should we do something similar for regular APICs as well?
> They too support masks and LowestPrio delivery - and doing that
> will increase test coverage rather significantly.

It seems only es7000_cpu_mask_to_apicid could be tuned up
to produce a per-cluster mask instead of yielding BAD_APICID?

Cyrill

2012-05-22 09:36:18

by Alexander Gordeev

[permalink] [raw]
Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, May 21, 2012 at 04:33:03PM -0700, Yinghai Lu wrote:
> >>        *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
> >>
> >> So we really need to submit all possible CPUs here ^^^ to be able to find the
> >> best/heaviest cluster out of the 'mask'.
> >
> > ok.
>
> Maybe you cannot simply change that to possible_cpu_mask.
>
> otherwise assign_irq_vector will try to get the same vector on all cpus
> for one irq.
>
> then x2apic cluster mode will only support 224 irqs in total.

I am curious whether we have ever hit this threshold since the current design
was introduced in 2006? Not without opposition from some notable maintainers ;)

> so you need to make vector_allocation_domain() use a smaller cpu set.

Yeah, reserving a vector for dozens of CPUs while up to eight will actually
use it seems like overkill. On the other hand, vector_allocation_domain() has
been either all-CPUs or single-CPU so far.

Maybe it is time to bring some intelligence here?

I.e. if we move cluster selection from cpu_mask_to_apicid{,_and}() to
vector_allocation_domain() ...

void vector_allocation_domain(int cpu,
const struct cpumask *cpumask,
struct cpumask *retmask);

... then it seems to fit nicely into __ioapic_set_affinity(). I still have to
verify that, though.
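
For the cluster driver, a sketch of that three-argument callback (reusing the
existing per-cpu cpus_in_cluster mask; only an illustration of the idea, not a
posted patch) could be:

	static void cluster_vector_allocation_domain(int cpu,
						     const struct cpumask *cpumask,
						     struct cpumask *retmask)
	{
		/* reserve the vector only on the requested CPUs of cpu's cluster */
		cpumask_and(retmask, cpumask, per_cpu(cpus_in_cluster, cpu));
	}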

>
> Yinghai

--
Regards,
Alexander Gordeev
[email protected]

2012-05-22 10:12:21

by Alexander Gordeev

[permalink] [raw]
Subject: Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

On Mon, May 21, 2012 at 04:44:39PM -0700, Suresh Siddha wrote:
> On Mon, 2012-05-21 at 10:13 +0200, Alexander Gordeev wrote:
> > So we really need to submit all possible CPUs here ^^^ to be able to find the
> > best/heaviest cluster out of the 'mask'.
> >
>
> I don't think we need to do anything fancy, like selecting the heaviest
> cluster out of the mask. Something like (cluster membership of first cpu
> in the mask) AND (mask) should be enough.

Hmm... selecting a cluster is cheap. It is called from setup code that does
not need to be fast. Why not keep it?.. Well, in case the data shows it is
worth it.

What does seem worthwhile, though, is sticking to the cluster which is
currently in use as long as at least one of its CPUs is still in the mask.
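
A sketch of that heuristic (a hypothetical helper, assuming cfg->domain holds
the cluster currently in use; not part of the posted patches):

	/*
	 * Keep the cache-hot cluster if the new affinity mask still
	 * covers at least one of its online CPUs.
	 */
	static bool keep_current_cluster(struct irq_cfg *cfg,
					 const struct cpumask *newmask)
	{
		cpumask_var_t tmp;
		bool ret = false;

		if (alloc_cpumask_var(&tmp, GFP_ATOMIC)) {
			cpumask_and(tmp, newmask, cpu_online_mask);
			ret = cpumask_intersects(tmp, cfg->domain);
			free_cpumask_var(tmp);
		}
		return ret;
	}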

> thanks,
> suresh

--
Regards,
Alexander Gordeev
[email protected]

2012-05-22 17:22:51

by Suresh Siddha

[permalink] [raw]
Subject: Re: [PATCH 2/2] x2apic, cluster: use all the members of one cluster specified in the smp_affinity mask for the interrupt destination

On Tue, 2012-05-22 at 09:04 +0200, Ingo Molnar wrote:
> * Suresh Siddha <[email protected]> wrote:
>
> > If the HW implements round-robin interrupt delivery, this
> > enables multiple cpu's (which are part of the user specified
> > interrupt smp_affinity mask and belong to the same x2apic
> > cluster) to service the interrupt.
>
> Could/should we do something similar for regular APICs as well?
> They too support masks and LowestPrio delivery - and doing that
> will increase test coverage rather significantly.

Existing logical flat xapic mode already takes advantage of this today.
And that apic driver allows multiple cpu's to be set in the destination
field, allowing round-robin/power-aware interrupt delivery etc., depending
on the platform capabilities.

So most of the laptops with 8 or fewer logical cpu's should take
advantage of this today.

For bigger platforms, we use physical xapic mode. Some older kernels
used xapic cluster mode, which allows 4 members in a cluster. With two
HT siblings, that will leave room for only 2 cores. So the benefit will
be limited. And on the multi-socket platforms, x2apic/vt-d will be
available and used for various other reasons too (virtualization etc).

x2apic is available on desktop/laptop models too.

So for legacy xapic, we can use logical flat mode to take advantage of
these HW modes.

thanks,
suresh


2012-05-22 17:39:29

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [PATCH 2/2] x2apic, cluster: use all the members of one cluster specified in the smp_affinity mask for the interrupt destination

On Tue, May 22, 2012 at 10:21:15AM -0700, Suresh Siddha wrote:
> On Tue, 2012-05-22 at 09:04 +0200, Ingo Molnar wrote:
> > * Suresh Siddha <[email protected]> wrote:
> >
> > > If the HW implements round-robin interrupt delivery, this
> > > enables multiple cpu's (which are part of the user specified
> > > interrupt smp_affinity mask and belong to the same x2apic
> > > cluster) to service the interrupt.
> >
> > Could/should we do something similar for regular APICs as well?
> > They too support masks and LowestPrio delivery - and doing that
> > will increase test coverage rather significantly.
>
> Existing logical flat xapic mode already takes advantage of this today.

Suresh, shouldn't we tune up the es7000_cpu_mask_to_apicid code to return
an apicid with the cluster number and the ORed logical apicid part, as done
for x2apic, or was the way it returns the apicid now done on purpose?

Cyrill

2012-05-22 17:44:33

by Suresh Siddha

[permalink] [raw]
Subject: Re: [PATCH 2/2] x2apic, cluster: use all the members of one cluster specified in the smp_affinity mask for the interrupt destination

On Tue, 2012-05-22 at 21:39 +0400, Cyrill Gorcunov wrote:
> Suresh, shouldn't we tune up the es7000_cpu_mask_to_apicid code to return
> an apicid with the cluster number and the ORed logical apicid part, as done
> for x2apic, or was the way it returns the apicid now done on purpose?

Don't know. Those are very old platforms. Not worth tuning them IMHO.

thanks,
suresh

2012-05-22 17:45:38

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [PATCH 2/2] x2apic, cluster: use all the members of one cluster specified in the smp_affinity mask for the interrupt destination

On Tue, May 22, 2012 at 10:42:57AM -0700, Suresh Siddha wrote:
> > x2apic, or was the way it returns the apicid now done on purpose?
>
> Don't know. Those are very old platforms. Not worth tuning them IMHO.

I see, thanks.

Cyrill

2012-05-22 20:03:57

by Yinghai Lu

[permalink] [raw]
Subject: Re: [PATCH 2/2] x2apic, cluster: use all the members of one cluster specified in the smp_affinity mask for the interrupt destination

On Mon, May 21, 2012 at 4:58 PM, Suresh Siddha
<[email protected]> wrote:
> We are now grouping all the cpu's in a cluster to one vector domain. So that
> will limit the total number of interrupt sources handled by Linux. Previously
> we supported "cpu-count * available-vectors-per-cpu" interrupt sources but this
> will now be reduced to "cpu-count/16 * available-vectors-per-cpu".

Maybe we could add a boot parameter to control the number of cpus in the
same domain, to be used instead of 16.
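
A sketch of what such a knob could look like (the parameter name and the
clamping to the x2apic cluster size of 16 are made up for illustration):

	static unsigned int x2apic_max_domain_cpus __read_mostly = 16;

	static int __init parse_x2apic_domain_cpus(char *arg)
	{
		unsigned int val;

		if (!arg || kstrtouint(arg, 0, &val) || !val)
			return -EINVAL;
		/* an x2apic cluster has at most 16 members */
		x2apic_max_domain_cpus = min(val, 16U);
		return 0;
	}
	early_param("x2apic_domain_cpus", parse_x2apic_domain_cpus);

cluster_vector_allocation_domain() would then put at most
x2apic_max_domain_cpus members of the cluster into retmask.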

Thanks

Yinghai

2012-06-06 15:04:20

by Suresh Siddha

[permalink] [raw]
Subject: [tip:x86/apic] x86/irq: Update irq_cfg domain unless the new affinity is a subset of the current domain

Commit-ID: 332afa656e76458ee9cf0f0d123016a0658539e4
Gitweb: http://git.kernel.org/tip/332afa656e76458ee9cf0f0d123016a0658539e4
Author: Suresh Siddha <[email protected]>
AuthorDate: Mon, 21 May 2012 16:58:01 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 6 Jun 2012 09:51:22 +0200

x86/irq: Update irq_cfg domain unless the new affinity is a subset of the current domain

Until now, irq_cfg domain is mostly static. Either all CPU's
(used by flat mode) or one CPU (first CPU in the irq affinity
mask) to which irq is being migrated (this is used by the rest
of apic modes).

Upcoming x2apic cluster mode optimization patch allows the irq
to be sent to any CPU in the x2apic cluster (if supported by the
HW). So irq_cfg domain changes on the fly (depending on which
CPU in the x2apic cluster is online).

Instead of checking for any intersection between the new irq
affinity mask and the current irq_cfg domain, check if the new
irq affinity mask is a subset of the current irq_cfg domain.
Otherwise proceed with updating the irq_cfg domain as well as
assigning vectors on all the CPUs specified in the new mask.

This also cleans up a workaround in updating irq_cfg domain for
legacy irq's that are handled by the IO-APIC.

Signed-off-by: Suresh Siddha <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/kernel/apic/io_apic.c | 15 ++++++---------
1 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ac96561..910a311 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1126,8 +1126,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
old_vector = cfg->vector;
if (old_vector) {
cpumask_and(tmp_mask, mask, cpu_online_mask);
- cpumask_and(tmp_mask, cfg->domain, tmp_mask);
- if (!cpumask_empty(tmp_mask)) {
+ if (cpumask_subset(tmp_mask, cfg->domain)) {
free_cpumask_var(tmp_mask);
return 0;
}
@@ -1141,6 +1140,11 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)

apic->vector_allocation_domain(cpu, tmp_mask);

+ if (cpumask_subset(tmp_mask, cfg->domain)) {
+ free_cpumask_var(tmp_mask);
+ return 0;
+ }
+
vector = current_vector;
offset = current_offset;
next:
@@ -1346,13 +1350,6 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,

if (!IO_APIC_IRQ(irq))
return;
- /*
- * For legacy irqs, cfg->domain starts with cpu 0 for legacy
- * controllers like 8259. Now that IO-APIC can handle this irq, update
- * the cfg->domain.
- */
- if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
- apic->vector_allocation_domain(0, cfg->domain);

if (assign_irq_vector(irq, cfg, apic->target_cpus()))
return;

2012-06-06 15:05:11

by Suresh Siddha

[permalink] [raw]
Subject: [tip:x86/apic] x86/x2apic/cluster: Use all the members of one cluster specified in the smp_affinity mask for the interrupt destination

Commit-ID: 0b8255e660a0c229ebfe8f9fde12a8d4d34c50e0
Gitweb: http://git.kernel.org/tip/0b8255e660a0c229ebfe8f9fde12a8d4d34c50e0
Author: Suresh Siddha <[email protected]>
AuthorDate: Mon, 21 May 2012 16:58:02 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 6 Jun 2012 09:51:22 +0200

x86/x2apic/cluster: Use all the members of one cluster specified in the smp_affinity mask for the interrupt destination

If the HW implements round-robin interrupt delivery, this
enables multiple cpu's (which are part of the user specified
interrupt smp_affinity mask and belong to the same x2apic
cluster) to service the interrupt.

Also if the platform supports Power Aware Interrupt Routing,
then this enables the interrupt to be routed to an idle cpu or a
busy cpu depending on the perf/power bias tunable.

We are now grouping all the cpu's in a cluster to one vector
domain. So that will limit the total number of interrupt sources
handled by Linux. Previously we supported "cpu-count *
available-vectors-per-cpu" interrupt sources but this will now
be reduced to "cpu-count/16 * available-vectors-per-cpu".

Signed-off-by: Suresh Siddha <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/x2apic.h | 9 -----
arch/x86/kernel/apic/x2apic_cluster.c | 56 +++++++++++++++++++++++----------
arch/x86/kernel/apic/x2apic_phys.c | 9 +++++
3 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
index 92e54ab..7a5a832 100644
--- a/arch/x86/include/asm/x2apic.h
+++ b/arch/x86/include/asm/x2apic.h
@@ -28,15 +28,6 @@ static int x2apic_apic_id_registered(void)
return 1;
}

-/*
- * For now each logical cpu is in its own vector allocation domain.
- */
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
static void
__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
{
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index ff35cff..90d999c 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -98,34 +98,47 @@ static void x2apic_send_IPI_all(int vector)

static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
- /*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
- */
int cpu = cpumask_first(cpumask);
+ u32 dest = 0;
+ int i;

- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
- else
+ if (cpu > nr_cpu_ids)
return BAD_APICID;
+
+ for_each_cpu_and(i, cpumask, per_cpu(cpus_in_cluster, cpu))
+ dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+
+ return dest;
}

static unsigned int
x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
const struct cpumask *andmask)
{
- int cpu;
+ u32 dest = 0;
+ u16 cluster;
+ int i;

- /*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
+ for_each_cpu_and(i, cpumask, andmask) {
+ if (!cpumask_test_cpu(i, cpu_online_mask))
+ continue;
+ dest = per_cpu(x86_cpu_to_logical_apicid, i);
+ cluster = x2apic_cluster(i);
+ break;
}

- return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ if (!dest)
+ return BAD_APICID;
+
+ for_each_cpu_and(i, cpumask, andmask) {
+ if (!cpumask_test_cpu(i, cpu_online_mask))
+ continue;
+ if (cluster != x2apic_cluster(i))
+ continue;
+ dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+ }
+
+ return dest;
}

static void init_x2apic_ldr(void)
@@ -208,6 +221,15 @@ static int x2apic_cluster_probe(void)
return 0;
}

+/*
+ * Each x2apic cluster is an allocation domain.
+ */
+static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu));
+}
+
static struct apic apic_x2apic_cluster = {

.name = "cluster x2apic",
@@ -225,7 +247,7 @@ static struct apic apic_x2apic_cluster = {
.check_apicid_used = NULL,
.check_apicid_present = NULL,

- .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .vector_allocation_domain = cluster_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,

.ioapic_phys_id_map = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index c17e982..93b2570 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -120,6 +120,15 @@ static int x2apic_phys_probe(void)
return apic == &apic_x2apic_phys;
}

+/*
+ * Each logical cpu is in its own vector allocation domain.
+ */
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
static struct apic apic_x2apic_phys = {

.name = "physical x2apic",

2012-06-06 18:07:49

by Alexander Gordeev

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86, irq: update irq_cfg domain unless the new affinity is a subset of the current domain

On Mon, May 21, 2012 at 04:58:01PM -0700, Suresh Siddha wrote:
> Until now, irq_cfg domain is mostly static. Either all cpu's (used by flat
> mode) or one cpu (first cpu in the irq affinity mask) to which irq is being
> migrated (this is used by the rest of apic modes).
>
> Upcoming x2apic cluster mode optimization patch allows the irq to be sent
> to any cpu in the x2apic cluster (if supported by the HW). So irq_cfg
> domain changes on the fly (depending on which cpu in the x2apic cluster
> is online).
>
> Instead of checking for any intersection between the new irq affinity
> mask and the current irq_cfg domain, check if the new irq affinity mask
> is a subset of the current irq_cfg domain. Otherwise proceed with
> updating the irq_cfg domain as well as assigning vectors on all the cpu's
> specified in the new mask.
>
> This also cleans up a workaround in updating irq_cfg domain for legacy irq's
> that are handled by the IO-APIC.

Suresh,

I thought you had posted these patches for reference, so I held off with my
comments while you were collecting the data. But since Ingo picked the
patches up, I will voice my concerns in this thread.

>
> Signed-off-by: Suresh Siddha <[email protected]>
> ---
> arch/x86/kernel/apic/io_apic.c | 15 ++++++---------
> 1 files changed, 6 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index ffdc152..bbf8c43 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -1137,8 +1137,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> old_vector = cfg->vector;
> if (old_vector) {
> cpumask_and(tmp_mask, mask, cpu_online_mask);
> - cpumask_and(tmp_mask, cfg->domain, tmp_mask);
> - if (!cpumask_empty(tmp_mask)) {
> + if (cpumask_subset(tmp_mask, cfg->domain)) {

Imagine that the passed mask intersects with cfg->domain but also contains at
least one online CPU from a different cluster (e.g. cfg->domain is the cluster
{0..15} and the mask is {0,16}). Since domains are always one cluster wide,
this condition ^^^ will fail and we go further.

> free_cpumask_var(tmp_mask);
> return 0;
> }
> @@ -1152,6 +1151,11 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
>
> apic->vector_allocation_domain(cpu, tmp_mask);
>
> + if (cpumask_subset(tmp_mask, cfg->domain)) {

Because the mask intersects with cfg->domain this condition ^^^ may succeed and
we could return with no change from here.

That raises a few concerns for me:

- The first check is not perfect, because it fails to recognize the
intersection right away. Instead, we possibly waste multiple loop iterations
through the mask before realizing that no change is needed at all. Therefore...

- It would be better to recognize the intersection even before entering the
loop. But that is exactly what the removed code was doing before.

- Depending on the passed mask, we could equally likely have selected another
cluster and switched to it, even though the current cfg->domain is contained
within the requested mask. Besides just not being nice, we would also be
switching away from a cache-hot cluster. If you suggest that it is enough to
pick the first found cluster (rather than select the best possible one), then
there is even less reason to switch away from cfg->domain here.

> + free_cpumask_var(tmp_mask);
> + return 0;
> + }
> +
> vector = current_vector;
> offset = current_offset;
> next:
> @@ -1357,13 +1361,6 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
>
> if (!IO_APIC_IRQ(irq))
> return;
> - /*
> - * For legacy irqs, cfg->domain starts with cpu 0 for legacy
> - * controllers like 8259. Now that IO-APIC can handle this irq, update
> - * the cfg->domain.
> - */
> - if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
> - apic->vector_allocation_domain(0, cfg->domain);


This hunk reverts your 69c89ef commit. Regression?

>
> if (assign_irq_vector(irq, cfg, apic->target_cpus()))
> return;
> --
> 1.7.6.5
>

--
Regards,
Alexander Gordeev
[email protected]

2012-06-06 22:21:49

by Yinghai Lu

[permalink] [raw]
Subject: Re: [tip:x86/apic] x86/x2apic/cluster: Use all the members of one cluster specified in the smp_affinity mask for the interrupt destination

On Wed, Jun 6, 2012 at 8:04 AM, tip-bot for Suresh Siddha
<[email protected]> wrote:
> > Commit-ID: 0b8255e660a0c229ebfe8f9fde12a8d4d34c50e0
> > Gitweb: http://git.kernel.org/tip/0b8255e660a0c229ebfe8f9fde12a8d4d34c50e0
> > Author: Suresh Siddha <[email protected]>
> > AuthorDate: Mon, 21 May 2012 16:58:02 -0700
> > Committer: Ingo Molnar <[email protected]>
> > CommitDate: Wed, 6 Jun 2012 09:51:22 +0200
>
> x86/x2apic/cluster: Use all the members of one cluster specified in the smp_affinity mask for the interrupt destination
>
> If the HW implements round-robin interrupt delivery, this
> enables multiple cpu's (which are part of the user specified
> interrupt smp_affinity mask and belong to the same x2apic
> cluster) to service the interrupt.
>
> Also if the platform supports Power Aware Interrupt Routing,
> then this enables the interrupt to be routed to an idle cpu or a
> busy cpu depending on the perf/power bias tunable.
>
> We are now grouping all the cpu's in a cluster to one vector
> domain. So that will limit the total number of interrupt sources
> > handled by Linux. Previously we supported "cpu-count *
> > available-vectors-per-cpu" interrupt sources but this will now
> > be reduced to "cpu-count/16 * available-vectors-per-cpu".

with this patch, we will waste/hide several irq vectors on some cpus.

for example:
at the very beginning after boot, one irq will have its vector reserved on
all 16 cpus of the same cluster.
later, if the user changes the irq affinity to one cpu only,
cfg->domain will not be changed, but the ioapic or irte will be confined to
one cpu, because mask is used for the new dest_id:
*dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);

the vectors on the other 15 cpus are wasted or hidden.

Yinghai

2012-06-06 23:02:48

by Suresh Siddha

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86, irq: update irq_cfg domain unless the new affinity is a subset of the current domain

On Wed, 2012-06-06 at 19:20 +0200, Alexander Gordeev wrote:
> On Mon, May 21, 2012 at 04:58:01PM -0700, Suresh Siddha wrote:
> > Until now, irq_cfg domain is mostly static. Either all cpu's (used by flat
> > mode) or one cpu (first cpu in the irq affinity mask) to which irq is being
> > migrated (this is used by the rest of apic modes).
> >
> > Upcoming x2apic cluster mode optimization patch allows the irq to be sent
> > to any cpu in the x2apic cluster (if supported by the HW). So irq_cfg
> > domain changes on the fly (depending on which cpu in the x2apic cluster
> > is online).
> >
> > Instead of checking for any intersection between the new irq affinity
> > mask and the current irq_cfg domain, check if the new irq affinity mask
> > is a subset of the current irq_cfg domain. Otherwise proceed with
> > updating the irq_cfg domain as well as assigning vectors on all the cpu's
> > specified in the new mask.
> >
> > This also cleans up a workaround in updating irq_cfg domain for legacy irq's
> > that are handled by the IO-APIC.
>
> Suresh,
>
> I thought you had posted these patches for reference, so I held off with my
> comments while you were collecting the data. But since Ingo picked the
> patches up, I will voice my concerns in this thread.

These are tested patches and I am ok with Ingo picking them up to get
further baked in -tip. About the data collection, I have to find the
right system/bios to run the tests for power-aware/round-robin interrupt
routing. Anyway, logical xapic mode already has this capability and we
are adding the capability for x2apic cluster mode here. And also,
irqbalance ultimately has to take advantage of this by specifying
multiple cpu's when migrating an interrupt.
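
For example, userspace would express such a multi-cpu affinity by writing a
hex mask like ffff (cpus 0-15, one full x2apic cluster) to the usual
/proc/irq/<N>/smp_affinity interface for the irq in question.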

The only concern I have with this patchset is what I already mentioned in
the changelog of the second patch, i.e., it reduces the number of IRQs
that the platform can handle, as we reduce the available number of
vectors by a factor of 16.

If this indeed becomes a problem, then there are a few options: either
reserve the vectors based on the irq destination mask (rather than
reserving on all the cluster members), or reduce the grouping from 16
to a smaller number, etc. I can post another patch for this shortly.

> >
> > Signed-off-by: Suresh Siddha <[email protected]>
> > ---
> > arch/x86/kernel/apic/io_apic.c | 15 ++++++---------
> > 1 files changed, 6 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> > index ffdc152..bbf8c43 100644
> > --- a/arch/x86/kernel/apic/io_apic.c
> > +++ b/arch/x86/kernel/apic/io_apic.c
> > @@ -1137,8 +1137,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> > old_vector = cfg->vector;
> > if (old_vector) {
> > cpumask_and(tmp_mask, mask, cpu_online_mask);
> > - cpumask_and(tmp_mask, cfg->domain, tmp_mask);
> > - if (!cpumask_empty(tmp_mask)) {
> > + if (cpumask_subset(tmp_mask, cfg->domain)) {
>
> Imagine that the passed mask intersects with cfg->domain but also contains at
> least one online CPU from a different cluster (e.g. cfg->domain is the cluster
> {0..15} and the mask is {0,16}). Since domains are always one cluster wide,
> this condition ^^^ will fail and we go further.
>
> > free_cpumask_var(tmp_mask);
> > return 0;
> > }
> > @@ -1152,6 +1151,11 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> >
> > apic->vector_allocation_domain(cpu, tmp_mask);
> >
> > + if (cpumask_subset(tmp_mask, cfg->domain)) {
>
> Because the mask intersects with cfg->domain this condition ^^^ may succeed and
> we could return with no change from here.
>
> That raises a few concerns for me:
> - The first check is not perfect, because it fails to recognize the
> intersection right away. Instead, we possibly waste multiple loop iterations
> through the mask before realizing that no change is needed at all. Therefore...
>
> - It would be better to recognize the intersection even before entering the
> loop. But that is exactly what the removed code was doing before.
>
> - Depending on the passed mask, we could equally likely have selected another
> cluster and switched to it, even though the current cfg->domain is contained
> within the requested mask. Besides just not being nice, we would also be
> switching away from a cache-hot cluster. If you suggest that it is enough to
> pick the first found cluster (rather than select the best possible one), then
> there is even less reason to switch away from cfg->domain here.

A few things to keep in perspective.

This is the generic portion of the vector handling code and has to work
across different apic drivers and their cfg domains. Also, most of the
intelligence lies in irqbalance, which specifies the irq destination mask.
Traditionally the kernel code selected the first possible destination, not
the best destination among the specified mask.

Anyway, the above hunks are trying to address a scenario like the
following: During boot, all the IO-APIC interrupts (legacy/non-legacy) are
routed to cpu-0 with only cpu-0 in their cfg->domain (as we don't know
which other cpu's fall into the same x2apic cluster, we can't pre-set
them in the cfg->domain). Consider a single socket system. After the SMP
bringup of the other siblings, those io-apic irq's affinity is modified to
all cpu's in setup_ioapic_dest(). And with the current code,
assign_irq_vector() will bail immediately without reserving the
corresponding vector on all the cluster members that are now online. So
the interrupt ends up going only to cpu-0, and this will not get corrected
as long as cpu-0 is in the specified interrupt destination mask.

> > + free_cpumask_var(tmp_mask);
> > + return 0;
> > + }
> > +
> > vector = current_vector;
> > offset = current_offset;
> > next:
> > @@ -1357,13 +1361,6 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
> >
> > if (!IO_APIC_IRQ(irq))
> > return;
> > - /*
> > - * For legacy irqs, cfg->domain starts with cpu 0 for legacy
> > - * controllers like 8259. Now that IO-APIC can handle this irq, update
> > - * the cfg->domain.
> > - */
> > - if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
> > - apic->vector_allocation_domain(0, cfg->domain);
>
>
> This hunk reverts your 69c89ef commit. Regression?
>

As I mentioned in the changelog, this patch removes the need for that
hacky workaround. Commit 69c89ef didn't really fix the underlying
problem (and hence we re-encountered a similar issue, mentioned above,
in the context of the x2apic cluster). The clean fix is to address the
issue in assign_irq_vector(), which is what this patch does.

thanks,
suresh

2012-06-06 23:15:15

by Suresh Siddha

[permalink] [raw]
Subject: Re: [tip:x86/apic] x86/x2apic/cluster: Use all the members of one cluster specified in the smp_affinity mask for the interrupt destination

On Wed, 2012-06-06 at 15:21 -0700, Yinghai Lu wrote:
> On Wed, Jun 6, 2012 at 8:04 AM, tip-bot for Suresh Siddha
> <[email protected]> wrote:
> > Commit-ID: 0b8255e660a0c229ebfe8f9fde12a8d4d34c50e0
> > Gitweb: http://git.kernel.org/tip/0b8255e660a0c229ebfe8f9fde12a8d4d34c50e0
> > Author: Suresh Siddha <[email protected]>
> > AuthorDate: Mon, 21 May 2012 16:58:02 -0700
> > Committer: Ingo Molnar <[email protected]>
> > CommitDate: Wed, 6 Jun 2012 09:51:22 +0200
> >
> > x86/x2apic/cluster: Use all the members of one cluster specified in the smp_affinity mask for the interrupt destination
> >
> > If the HW implements round-robin interrupt delivery, this
> > enables multiple cpu's (which are part of the user specified
> > interrupt smp_affinity mask and belong to the same x2apic
> > cluster) to service the interrupt.
> >
> > Also if the platform supports Power Aware Interrupt Routing,
> > then this enables the interrupt to be routed to an idle cpu or a
> > busy cpu depending on the perf/power bias tunable.
> >
> > We are now grouping all the cpu's in a cluster to one vector
> > domain. So that will limit the total number of interrupt sources
> > handled by Linux. Previously we support "cpu-count *
> > available-vectors-per-cpu" interrupt sources but this will now
> > reduce to "cpu-count/16 * available-vectors-per-cpu".
>
> with this patch, we will waste/hide several irq vectors on some cpus.
>
> for example:
> at the very beginning after boot, one irq will have its vector reserved on
> all 16 cpus of the same cluster.
> later, if the user changes the irq affinity to one cpu only,
> cfg->domain will not be changed, but the ioapic or irte will be confined to
> one cpu, because mask is used for the new dest_id:
> *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
>
> the vectors on the other 15 cpus are wasted or hidden.

Yinghai, As I mentioned in the other thread, we should be able to
restrict the vector allocation to only the cpu's specified in the mask
and clean up the allocation for the others in the same cluster domain. We
didn't do this for legacy xapic in the past mainly because of some
platform bugs which sent interrupts to different logical cpu's, ignoring
what was specified in the IO-APIC RTE. But x2apic mode doesn't have
those HW errata, so we should be able to address this.

Will get back with a patch which does this.

thanks,
suresh

2012-06-16 00:25:20

by Suresh Siddha

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86, irq: update irq_cfg domain unless the new affinity is a subset of the current domain

On Wed, 2012-06-06 at 16:02 -0700, Suresh Siddha wrote:
> On Wed, 2012-06-06 at 19:20 +0200, Alexander Gordeev wrote:
> >
> > Suresh,
> >
> > I thought you had posted these patches for reference, so I held off with my
> > comments while you were collecting the data. But since Ingo picked the
> > patches up, I will voice my concerns in this thread.
>
> These are tested patches and I am ok with Ingo picking them up to get
> further baked in -tip. About the data collection, I have to find the
> right system/bios to run the tests for power-aware/round-robin interrupt
> routing. Anyway, logical xapic mode already has this capability and we
> are adding the capability for x2apic cluster mode here. And also,
> irqbalance ultimately has to take advantage of this by specifying
> multiple cpu's when migrating an interrupt.
>
> The only concern I have with this patchset is what I already mentioned in
> the changelog of the second patch, i.e., it reduces the number of IRQs
> that the platform can handle, as we reduce the available number of
> vectors by a factor of 16.
>
> If this indeed becomes a problem, then there are a few options: either
> reserve the vectors based on the irq destination mask (rather than
> reserving on all the cluster members), or reduce the grouping from 16
> to a smaller number, etc. I can post another patch for this shortly.
>

Ok, here is a quick version of the patch doing the first option I
mentioned above. Reserve the vectors based on the irq destination mask
and the corresponding vector domain, rather than reserving the vector on
all the cpu's for the theoretical domain (which is an x2apic cluster).

I will do some more testing and post the patch with a detailed changelog on
Monday. For now, I have appended the patch for quick reference. Thanks.
---

Signed-off-by: Suresh Siddha <[email protected]>
---
arch/x86/include/asm/apic.h | 10 +++++++---
arch/x86/kernel/apic/apic_noop.c | 3 ++-
arch/x86/kernel/apic/io_apic.c | 28 ++++++++++++++++++++++++++--
arch/x86/kernel/apic/x2apic_cluster.c | 7 ++++---
4 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 8619a87..af9ec10 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -303,10 +303,12 @@ struct apic {
int disable_esr;

int dest_logical;
+ bool sub_domain;
unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
unsigned long (*check_apicid_present)(int apicid);

- bool (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
+ bool (*vector_allocation_domain)(int cpu, struct cpumask *retmask,
+ const struct cpumask *mask);
void (*init_apic_ldr)(void);

void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
@@ -615,7 +617,8 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
unsigned int *apicid);

static inline bool
-flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
+flat_vector_allocation_domain(int cpu, struct cpumask *retmask,
+ const struct cpumask *mask)
{
/* Careful. Some cpus do not strictly honor the set of cpus
* specified in the interrupt destination when using lowest
@@ -631,7 +634,8 @@ flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
}

static inline bool
-default_vector_allocation_domain(int cpu, struct cpumask *retmask)
+default_vector_allocation_domain(int cpu, struct cpumask *retmask,
+ const struct cpumask *mask)
{
cpumask_copy(retmask, cpumask_of(cpu));
return true;
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 65c07fc..ebdd349 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -100,7 +100,8 @@ static unsigned long noop_check_apicid_present(int bit)
return physid_isset(bit, phys_cpu_present_map);
}

-static bool noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static bool noop_vector_allocation_domain(int cpu, struct cpumask *retmask,
+ struct cpumask *mask)
{
if (cpu != 0)
pr_warning("APIC: Vector allocated for non-BSP cpu\n");
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 99a794d..3d1c37c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1126,7 +1126,18 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
old_vector = cfg->vector;
if (old_vector) {
cpumask_and(tmp_mask, mask, cpu_online_mask);
- if (cpumask_subset(tmp_mask, cfg->domain)) {
+ /*
+ * If the APIC driver says it can't reliably route the
+ * interrupt to some subset of the domain members and the
+ * incoming affinity mask is already a subset of the domain,
+ * then we are done. We will keep the vector assigned in all
+ * the domain members.
+ *
+ * Or if the new mask is same as the existing domain, then
+ * also nothing more to do here.
+ */
+ if ((!apic->sub_domain && cpumask_subset(tmp_mask, cfg->domain))
+ || (cpumask_equal(tmp_mask, cfg->domain))) {
free_cpumask_var(tmp_mask);
return 0;
}
@@ -1139,9 +1150,22 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
int vector, offset;
bool more_domains;

- more_domains = apic->vector_allocation_domain(cpu, tmp_mask);
+ more_domains =
+ apic->vector_allocation_domain(cpu, tmp_mask, mask);

if (cpumask_subset(tmp_mask, cfg->domain)) {
+ /*
+ * If the APIC driver can route the interrupt
+ * to some subset of the domain, then we can
+ * cleanup the vector allocation for other members.
+ */
+ if (apic->sub_domain &&
+ !cpumask_equal(tmp_mask, cfg->domain)) {
+ cpumask_andnot(cfg->old_domain, cfg->domain,
+ tmp_mask);
+ cfg->move_in_progress = 1;
+ cpumask_and(cfg->domain, cfg->domain, tmp_mask);
+ }
free_cpumask_var(tmp_mask);
return 0;
}
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 943d03f..46cbbe1 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -212,10 +212,10 @@ static int x2apic_cluster_probe(void)
/*
* Each x2apic cluster is an allocation domain.
*/
-static bool cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static bool cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
+ const struct cpumask *mask)
{
- cpumask_clear(retmask);
- cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu));
+ cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
return true;
}

@@ -233,6 +233,7 @@ static struct apic apic_x2apic_cluster = {
.target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
+ .sub_domain = true,
.check_apicid_used = NULL,
.check_apicid_present = NULL,


2012-06-18 09:17:51

by Alexander Gordeev

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86, irq: update irq_cfg domain unless the new affinity is a subset of the current domain

On Fri, Jun 15, 2012 at 05:25:20PM -0700, Suresh Siddha wrote:
> On Wed, 2012-06-06 at 16:02 -0700, Suresh Siddha wrote:
> Ok, here is a quick version of the patch doing the first option I
> mentioned above. Reserve the vectors based on the irq destination mask
> and the corresponding vector domain, rather than reserving the vector on
> all the cpu's for the theoretical domain (which is an x2apic cluster).

Suresh,

I would suggest going further: generalize your idea and introduce something
like compare_domains() instead of the sub_domain flag. This would help us keep
__assign_irq_vector() more generic and hide the domain handling logic in the
drivers.

Also, this would fix a subtle problem introduced with the previous
commit 332afa6 ("x86/irq: Update irq_cfg domain unless the new affinity is a
subset of the current domain"), where a change of the affinity mask could
trigger an unnecessary move of the vector in case the new and old masks
intersect but the new mask is not a subset of the old one.

The patch is just to clarify the idea, neither tested nor even compiled.


diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index eec240e..fcca995 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -306,7 +306,10 @@ struct apic {
unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
unsigned long (*check_apicid_present)(int apicid);

- bool (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
+ bool (*vector_allocation_domain)(int cpu,
+ const struct cpumask *mask,
+ struct cpumask *retmask);
+ int (*compare_domains)(const struct cpumask *, const struct cpumask *);
void (*init_apic_ldr)(void);

void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
@@ -615,7 +618,9 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
unsigned int *apicid);

static inline bool
-flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
+flat_vector_allocation_domain(int cpu,
+ const struct cpumask *mask,
+ struct cpumask *retmask)
{
/* Careful. Some cpus do not strictly honor the set of cpus
* specified in the interrupt destination when using lowest
@@ -630,13 +635,33 @@ flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
return false;
}

+static inline int
+flat_compare_domains(const struct cpumask *domain1,
+ const struct cpumask *domain2)
+{
+ if (cpumask_subset(domain1, domain2))
+ return 0;
+ return 1;
+}
+
static inline bool
-default_vector_allocation_domain(int cpu, struct cpumask *retmask)
+default_vector_allocation_domain(int cpu,
+ const struct cpumask *mask,
+ struct cpumask *retmask)
{
cpumask_copy(retmask, cpumask_of(cpu));
return true;
}

+static inline int
+default_compare_domains(const struct cpumask *domain1,
+ const struct cpumask *domain2)
+{
+ if (cpumask_intersects(domain1, domain2))
+ return 0;
+ return 1;
+}
+
static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
{
return physid_isset(apicid, *map);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 0540f08..88cafc8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1126,7 +1126,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
old_vector = cfg->vector;
if (old_vector) {
cpumask_and(tmp_mask, mask, cpu_online_mask);
- if (cpumask_subset(tmp_mask, cfg->domain)) {
+ if (!apic->compare_domains(tmp_mask, cfg->domain)) {
free_cpumask_var(tmp_mask);
return 0;
}
@@ -1138,47 +1138,50 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
int new_cpu;
int vector, offset;
bool more_domains;
+ int cmp_domains;

- more_domains = apic->vector_allocation_domain(cpu, tmp_mask);
+ more_domains = apic->vector_allocation_domain(cpu, mask, tmp_mask);
+ cmp_domains = apic->compare_domains(tmp_mask, cfg->domain);

- if (cpumask_subset(tmp_mask, cfg->domain)) {
- free_cpumask_var(tmp_mask);
- return 0;
- }
-
- vector = current_vector;
- offset = current_offset;
+ if (cmp_domains < 0) {
+ cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
+ cfg->move_in_progress = 1;
+ cpumask_and(cfg->domain, cfg->domain, tmp_mask);
+ } else if (cmp_domains > 0) {
+ vector = current_vector;
+ offset = current_offset;
next:
- vector += 16;
- if (vector >= first_system_vector) {
- offset = (offset + 1) % 16;
- vector = FIRST_EXTERNAL_VECTOR + offset;
- }
-
- if (unlikely(current_vector == vector)) {
- if (more_domains)
- continue;
- else
- break;
- }
+ vector += 16;
+ if (vector >= first_system_vector) {
+ offset = (offset + 1) % 16;
+ vector = FIRST_EXTERNAL_VECTOR + offset;
+ }

- if (test_bit(vector, used_vectors))
- goto next;
+ if (unlikely(current_vector == vector)) {
+ if (more_domains)
+ continue;
+ else
+ break;
+ }

- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
- if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+ if (test_bit(vector, used_vectors))
goto next;
- /* Found one! */
- current_vector = vector;
- current_offset = offset;
- if (old_vector) {
- cfg->move_in_progress = 1;
- cpumask_copy(cfg->old_domain, cfg->domain);
+
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+ if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+ goto next;
+ /* Found one! */
+ current_vector = vector;
+ current_offset = offset;
+ if (old_vector) {
+ cfg->move_in_progress = 1;
+ cpumask_copy(cfg->old_domain, cfg->domain);
+ }
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+ per_cpu(vector_irq, new_cpu)[vector] = irq;
+ cfg->vector = vector;
+ cpumask_copy(cfg->domain, tmp_mask);
}
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
- per_cpu(vector_irq, new_cpu)[vector] = irq;
- cfg->vector = vector;
- cpumask_copy(cfg->domain, tmp_mask);
err = 0;
break;
}
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 943d03f..090c69e 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -212,13 +212,24 @@ static int x2apic_cluster_probe(void)
/*
* Each x2apic cluster is an allocation domain.
*/
-static bool cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static bool cluster_vector_allocation_domain(int cpu,
+ const struct cpumask *mask,
+ struct cpumask *retmask)
{
- cpumask_clear(retmask);
- cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu));
+ cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
return true;
}

+static int cluster_compare_domains(const struct cpumask *domain1,
+ const struct cpumask *domain2)
+{
+ if (cpumask_subset(domain1, domain2))
+ return -1;
+ if (cpumask_equal(domain1, domain2))
+ return 0;
+ return 1;
+}
+
static struct apic apic_x2apic_cluster = {

.name = "cluster x2apic",
@@ -237,6 +248,7 @@ static struct apic apic_x2apic_cluster = {
.check_apicid_present = NULL,

.vector_allocation_domain = cluster_vector_allocation_domain,
+ .compare_domains = cluster_compare_domains,
.init_apic_ldr = init_x2apic_ldr,

.ioapic_phys_id_map = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index e03a1e1..24511a9 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -106,6 +106,7 @@ static struct apic apic_x2apic_phys = {
.check_apicid_present = NULL,

.vector_allocation_domain = default_vector_allocation_domain,
+ .compare_domains = default_compare_domains,
.init_apic_ldr = init_x2apic_ldr,

.ioapic_phys_id_map = NULL,

--
Regards,
Alexander Gordeev
[email protected]

2012-06-19 00:51:23

by Suresh Siddha

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86, irq: update irq_cfg domain unless the new affinity is a subset of the current domain

On Mon, 2012-06-18 at 11:17 +0200, Alexander Gordeev wrote:
> I would suggest going further: generalize your idea and introduce something
> like compare_domains() instead of the sub_domain flag. This would help us keep
> __assign_irq_vector() more generic and hide the domain handling logic in the
> drivers.

That sounds good.

> @@ -1138,47 +1138,50 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> int new_cpu;
> int vector, offset;
> bool more_domains;
> + int cmp_domains;
>
> - more_domains = apic->vector_allocation_domain(cpu, tmp_mask);
> + more_domains = apic->vector_allocation_domain(cpu, mask, tmp_mask);
> + cmp_domains = apic->compare_domains(tmp_mask, cfg->domain);

I think we should be able to consolidate both of them into one
apic-driver-specific routine. Also I think we should be able to use the
tmp_mask in each round and minimize the number of unnecessary
for_each_cpu iterations. And that should clean up the more_domains bool
logic we did earlier.

Will post the complete patch in a day.

thanks,
suresh


2012-06-19 23:43:24

by Suresh Siddha

[permalink] [raw]
Subject: [PATCH 1/2] x86, apic: optimize cpu traversal in __assign_irq_vector() using domain membership

Currently __assign_irq_vector() goes through each cpu in the specified mask
until it finds a free vector in all the cpu's that are part of the same
interrupt domain. We visit all the interrupt domain sibling cpus to reserve
the free vector. So, when we fail to find a free vector in an interrupt
domain, it is safe to continue our search with a cpu belonging to a new
interrupt domain. There is no need to go through each cpu if the domain
containing that cpu has already been visited.

Use the irq_cfg's old_domain to track the visited domains and optimize
the cpu traversal while finding a free vector in the given cpumask.

NOTE: We can also optimize the search by using for_each_cpu() and skipping
the current cpu, if it is not the first cpu in the mask returned by the
vector_allocation_domain(). But re-using the cfg->old_domain to track
the visited domains will be slightly faster.

Signed-off-by: Suresh Siddha <[email protected]>
---
arch/x86/include/asm/apic.h | 8 +++-----
arch/x86/kernel/apic/apic_noop.c | 3 +--
arch/x86/kernel/apic/io_apic.c | 15 ++++++++-------
arch/x86/kernel/apic/x2apic_cluster.c | 3 +--
4 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 8619a87..b37fa12 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -306,7 +306,7 @@ struct apic {
unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
unsigned long (*check_apicid_present)(int apicid);

- bool (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
+ void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
void (*init_apic_ldr)(void);

void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
@@ -614,7 +614,7 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
const struct cpumask *andmask,
unsigned int *apicid);

-static inline bool
+static inline void
flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
/* Careful. Some cpus do not strictly honor the set of cpus
@@ -627,14 +627,12 @@ flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
*/
cpumask_clear(retmask);
cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
- return false;
}

-static inline bool
+static inline void
default_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
cpumask_copy(retmask, cpumask_of(cpu));
- return true;
}

static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 65c07fc..08c337b 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -100,12 +100,11 @@ static unsigned long noop_check_apicid_present(int bit)
return physid_isset(bit, phys_cpu_present_map);
}

-static bool noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
if (cpu != 0)
pr_warning("APIC: Vector allocated for non-BSP cpu\n");
cpumask_copy(retmask, cpumask_of(cpu));
- return true;
}

static u32 noop_apic_read(u32 reg)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 99a794d..7a945f8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1134,12 +1134,13 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)

/* Only try and allocate irqs on cpus that are present */
err = -ENOSPC;
- for_each_cpu_and(cpu, mask, cpu_online_mask) {
+ cpumask_clear(cfg->old_domain);
+ cpu = cpumask_first_and(mask, cpu_online_mask);
+ while (cpu < nr_cpu_ids) {
int new_cpu;
int vector, offset;
- bool more_domains;

- more_domains = apic->vector_allocation_domain(cpu, tmp_mask);
+ apic->vector_allocation_domain(cpu, tmp_mask);

if (cpumask_subset(tmp_mask, cfg->domain)) {
free_cpumask_var(tmp_mask);
@@ -1156,10 +1157,10 @@ next:
}

if (unlikely(current_vector == vector)) {
- if (more_domains)
- continue;
- else
- break;
+ cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
+ cpumask_andnot(tmp_mask, mask, cfg->old_domain);
+ cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
+ continue;
}

if (test_bit(vector, used_vectors))
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 943d03f..b5d889b 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -212,11 +212,10 @@ static int x2apic_cluster_probe(void)
/*
* Each x2apic cluster is an allocation domain.
*/
-static bool cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
cpumask_clear(retmask);
cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu));
- return true;
}

static struct apic apic_x2apic_cluster = {
--
1.7.6.5

2012-06-19 23:43:29

by Suresh Siddha

[permalink] [raw]
Subject: [PATCH 2/2] x86, x2apic: limit the vector reservation to the user specified mask

For the x2apic cluster mode, vector for an interrupt is currently reserved on
all the cpu's that are part of the x2apic cluster. But the interrupts will
be routed only to the cluster (derived from the first cpu in the mask) members
specified in the mask. So there is no need to reserve the vector in the unused
cluster members.
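
To make the effect concrete (an illustration, not code from this patch): with
cpus 0-15 forming one x2apic cluster and an irq affinity mask of {2,3}, the
old behavior reserved the vector on all 16 cluster members, while the new rule
reserves it only on cpus 2 and 3. A hypothetical helper capturing the rule:

/*
 * Hypothetical helper, for illustration only: the set of cpus on which
 * the vector actually needs to be reserved.
 */
static void reservation_cpus(const struct cpumask *affinity, int cpu,
			     struct cpumask *out)
{
	/* Old rule: the whole cluster containing 'cpu'. */
	/* cpumask_copy(out, per_cpu(cpus_in_cluster, cpu)); */

	/* New rule: only the cluster members named in the affinity mask. */
	cpumask_and(out, affinity, per_cpu(cpus_in_cluster, cpu));
}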

Modify __assign_irq_vector() to reserve the vectors based on the user
specified irq destination mask and the corresponding vector domain specified
by the apic driver.

Signed-off-by: Suresh Siddha <[email protected]>
---
arch/x86/include/asm/apic.h | 14 +++++++++---
arch/x86/kernel/apic/apic_noop.c | 5 +++-
arch/x86/kernel/apic/io_apic.c | 36 ++++++++++++++++++++++-----------
arch/x86/kernel/apic/x2apic_cluster.c | 8 ++++--
4 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index b37fa12..7746acb 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -306,7 +306,9 @@ struct apic {
unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
unsigned long (*check_apicid_present)(int apicid);

- void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
+ void (*vector_allocation_domain)(const struct cpumask *mask,
+ struct cpumask *retmask,
+ const struct cpumask *prevmask);
void (*init_apic_ldr)(void);

void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
@@ -615,7 +617,9 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
unsigned int *apicid);

static inline void
-flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
+flat_vector_allocation_domain(const struct cpumask *mask,
+ struct cpumask *retmask,
+ const struct cpumask *prevmask)
{
/* Careful. Some cpus do not strictly honor the set of cpus
* specified in the interrupt destination when using lowest
@@ -630,9 +634,11 @@ flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
}

static inline void
-default_vector_allocation_domain(int cpu, struct cpumask *retmask)
+default_vector_allocation_domain(const struct cpumask *mask,
+ struct cpumask *retmask,
+ const struct cpumask *prevmask)
{
- cpumask_copy(retmask, cpumask_of(cpu));
+ cpumask_copy(retmask, cpumask_of(cpumask_first(retmask)));
}

static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 08c337b..847ece1 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -100,8 +100,11 @@ static unsigned long noop_check_apicid_present(int bit)
return physid_isset(bit, phys_cpu_present_map);
}

-static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static void noop_vector_allocation_domain(const struct cpumask *mask,
+ struct cpumask *retmask,
+ const struct cpumask *prevmask)
{
+ int cpu = cpumask_first(retmask);
if (cpu != 0)
pr_warning("APIC: Vector allocated for non-BSP cpu\n");
cpumask_copy(retmask, cpumask_of(cpu));
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7a945f8..0a55eb8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1097,6 +1097,22 @@ void unlock_vector_lock(void)
raw_spin_unlock(&vector_lock);
}

+/*
+ * New cpu mask using the vector is a subset of the current in use mask.
+ * So cleanup the vector allocation for the members that are not used anymore.
+ */
+static inline int
+cleanup_unused_subset(struct cpumask *mask, struct irq_cfg *cfg)
+{
+ if (!cpumask_equal(mask, cfg->domain)) {
+ cpumask_andnot(cfg->old_domain, cfg->domain, mask);
+ cfg->move_in_progress = 1;
+ cpumask_and(cfg->domain, cfg->domain, mask);
+ }
+ free_cpumask_var(mask);
+ return 0;
+}
+
static int
__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
@@ -1126,10 +1142,9 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
old_vector = cfg->vector;
if (old_vector) {
cpumask_and(tmp_mask, mask, cpu_online_mask);
- if (cpumask_subset(tmp_mask, cfg->domain)) {
- free_cpumask_var(tmp_mask);
- return 0;
- }
+ apic->vector_allocation_domain(mask, tmp_mask, cfg->domain);
+ if (cpumask_subset(tmp_mask, cfg->domain))
+ return cleanup_unused_subset(tmp_mask, cfg);
}

/* Only try and allocate irqs on cpus that are present */
@@ -1137,15 +1152,12 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
cpumask_clear(cfg->old_domain);
cpu = cpumask_first_and(mask, cpu_online_mask);
while (cpu < nr_cpu_ids) {
- int new_cpu;
- int vector, offset;
+ int new_cpu, vector, offset;

- apic->vector_allocation_domain(cpu, tmp_mask);
-
- if (cpumask_subset(tmp_mask, cfg->domain)) {
- free_cpumask_var(tmp_mask);
- return 0;
- }
+ cpumask_copy(tmp_mask, cpumask_of(cpu));
+ apic->vector_allocation_domain(mask, tmp_mask, cfg->domain);
+ if (cpumask_subset(tmp_mask, cfg->domain))
+ return cleanup_unused_subset(tmp_mask, cfg);

vector = current_vector;
offset = current_offset;
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index b5d889b..4d49512 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -212,10 +212,12 @@ static int x2apic_cluster_probe(void)
/*
* Each x2apic cluster is an allocation domain.
*/
-static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static void cluster_vector_allocation_domain(const struct cpumask *mask,
+ struct cpumask *retmask,
+ const struct cpumask *prevmask)
{
- cpumask_clear(retmask);
- cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu));
+ int cpu = cpumask_first(retmask);
+ cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
}

static struct apic apic_x2apic_cluster = {
--
1.7.6.5

2012-06-20 00:18:26

by Suresh Siddha

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86, irq: update irq_cfg domain unless the new affinity is a subset of the current domain

On Mon, 2012-06-18 at 17:51 -0700, Suresh Siddha wrote:
> > - more_domains = apic->vector_allocation_domain(cpu, tmp_mask);
> > + more = apic->vector_allocation_domain(cpu, mask, tmp_mask);
> > + cmp = apic->compare_domains(tmp_mask, cfg->domain);
>
> I think we should be able to consolidate both of them into one
> apic_driver specific routine. Also I think we should be able to use the
> tmp_mask in each round and minimize the number of unnecessary
> for_each_cpu iterations. And that should clean up the more_domains bool
> logic we did earlier.

Just posted a couple of patches which did the above. Please review and
ack.

On Mon, 2012-06-18 at 11:17 +0200, Alexander Gordeev wrote:
> Also, this would fix a subtle condition introduced with the previous
> commit 332afa6 ("x86/irq: Update irq_cfg domain unless the new affinity is a
> subset of the current domain"), where a change of the affinity mask could
> trigger an unnecessary vector move in case the new and old masks intersect
> but the new mask is not a subset of the old one.

So, I gave some thought to this. And I don't like/didn't address it for
the reasons I mentioned earlier. I don't want to add intelligence to the
apic drivers. That belongs to a higher level entity like 'irqbalance'.
Also when irqbalance asks for a specific mask, I don't want its behavior
to depend on the previous configuration. And I want the behavior to be
somewhat consistent across different apic drivers whose underlying HW
behavior is similar. For example, logical-flat with no HW round-robin
will route the interrupt to the first cpu specified in the user-mask.

Also, this is not a common case worth caring about.

BTW, there is still one open issue that I would like to address: how to handle
the vector pressure during boot etc. (as the default vector assignment
specifies all online cpus) when there are a lot of interrupt sources but
fewer x2apic clusters (like the one or two socket server case).

We should be able to do something like the appended. Any better
suggestions? I don't want to add boot parameters to limit the x2apic
cluster membership etc (to fewer than 16 logical cpu's) if possible.
---

arch/x86/kernel/apic/x2apic_cluster.c | 15 ++++++++++++++-
1 files changed, 14 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 4d49512..0e3d659 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -217,7 +217,20 @@ static void cluster_vector_allocation_domain(const struct cpumask *mask,
const struct cpumask *prevmask)
{
int cpu = cpumask_first(retmask);
- cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
+
+ /*
+ * By default during boot, device bringup etc interrupt
+ * will be routed to a specific cpu.
+ *
+ * On irq migration requests coming from irqbalance etc,
+ * interrupts will be routed to the x2apic cluster (cluster-id
+ * derived from the first cpu in the mask) members specified
+ * in the mask.
+ */
+ if (mask == cpu_online_mask)
+ cpumask_copy(retmask, cpumask_of(cpu));
+ else
+ cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
}

static struct apic apic_x2apic_cluster = {


thanks,
suresh

2012-06-20 05:53:20

by Yinghai Lu

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86, apic: optimize cpu traversal in __assign_irq_vector() using domain membership

On Tue, Jun 19, 2012 at 4:43 PM, Suresh Siddha
<[email protected]> wrote:
> Currently __assign_irq_vector() goes through each cpu in the specified mask
> until it finds a free vector in all the cpu's that are part of the same
> interrupt domain. We visit all the interrupt domain sibling cpus to reserve
> the free vector. So, when we fail to find a free vector in an interrupt
> domain, it is safe to continue our search with a cpu belonging to a new
> interrupt domain. No need to go through each cpu, if the domain
> containing that cpu is already visited.
>
> Use the irq_cfg's old_domain to track the visited domains and optimize
> the cpu traversal while finding a free vector in the given cpumask.
>
> NOTE: We can also optimize the search by using for_each_cpu and skip the
> current cpu, if it is not the first cpu in the mask returned by the
> vector_allocation_domain(). But re-using the cfg->old_domain to track
> the visited domains will be slightly faster.
>
> Signed-off-by: Suresh Siddha <[email protected]>

Acked-by: Yinghai Lu <[email protected]>

2012-06-20 05:56:31

by Yinghai Lu

[permalink] [raw]
Subject: Re: [PATCH 2/2] x86, x2apic: limit the vector reservation to the user specified mask

On Tue, Jun 19, 2012 at 4:43 PM, Suresh Siddha
<[email protected]> wrote:
> For the x2apic cluster mode, vector for an interrupt is currently reserved on
> all the cpu's that are part of the x2apic cluster. But the interrupts will
> be routed only to the cluster (derived from the first cpu in the mask) members
> specified in the mask. So there is no need to reserve the vector in the unused
> cluster members.
>
> Modify __assign_irq_vector() to reserve the vectors based on the user
> specified irq destination mask and the corresponding vector domain specified
> by the apic driver.
>
> Signed-off-by: Suresh Siddha <[email protected]>
> ---
> arch/x86/include/asm/apic.h | 14 +++++++++---
> arch/x86/kernel/apic/apic_noop.c | 5 +++-
> arch/x86/kernel/apic/io_apic.c | 36 ++++++++++++++++++++++-----------
> arch/x86/kernel/apic/x2apic_cluster.c | 8 ++++--
> 4 files changed, 43 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
> index b37fa12..7746acb 100644
> --- a/arch/x86/include/asm/apic.h
> +++ b/arch/x86/include/asm/apic.h
> @@ -306,7 +306,9 @@ struct apic {
> unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
> unsigned long (*check_apicid_present)(int apicid);
>
> - void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
> + void (*vector_allocation_domain)(const struct cpumask *mask,
> + struct cpumask *retmask,
> + const struct cpumask *prevmask);
> void (*init_apic_ldr)(void);
>
> void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
> @@ -615,7 +617,9 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
> unsigned int *apicid);
>
> static inline void
> -flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
> +flat_vector_allocation_domain(const struct cpumask *mask,
> + struct cpumask *retmask,
> + const struct cpumask *prevmask)
> {
> /* Careful. Some cpus do not strictly honor the set of cpus
> * specified in the interrupt destination when using lowest
> @@ -630,9 +634,11 @@ flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
> }
>
> static inline void
> -default_vector_allocation_domain(int cpu, struct cpumask *retmask)
> +default_vector_allocation_domain(const struct cpumask *mask,
> + struct cpumask *retmask,
> + const struct cpumask *prevmask)
> {
> - cpumask_copy(retmask, cpumask_of(cpu));
> + cpumask_copy(retmask, cpumask_of(cpumask_first(retmask)));
> }
>
> static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
> diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
> index 08c337b..847ece1 100644
> --- a/arch/x86/kernel/apic/apic_noop.c
> +++ b/arch/x86/kernel/apic/apic_noop.c
> @@ -100,8 +100,11 @@ static unsigned long noop_check_apicid_present(int bit)
> return physid_isset(bit, phys_cpu_present_map);
> }
>
> -static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
> +static void noop_vector_allocation_domain(const struct cpumask *mask,
> + struct cpumask *retmask,
> + const struct cpumask *prevmask)
> {
> + int cpu = cpumask_first(retmask);
> if (cpu != 0)
> pr_warning("APIC: Vector allocated for non-BSP cpu\n");
> cpumask_copy(retmask, cpumask_of(cpu));
> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index 7a945f8..0a55eb8 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -1097,6 +1097,22 @@ void unlock_vector_lock(void)
> raw_spin_unlock(&vector_lock);
> }
>
> +/*
> + * New cpu mask using the vector is a subset of the current in use mask.
> + * So cleanup the vector allocation for the members that are not used anymore.
> + */
> +static inline int
> +cleanup_unused_subset(struct cpumask *mask, struct irq_cfg *cfg)
> +{
> + if (!cpumask_equal(mask, cfg->domain)) {
> + cpumask_andnot(cfg->old_domain, cfg->domain, mask);
> + cfg->move_in_progress = 1;
> + cpumask_and(cfg->domain, cfg->domain, mask);
> + }
> + free_cpumask_var(mask);
> + return 0;
> +}
> +
> static int
> __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> {
> @@ -1126,10 +1142,9 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> old_vector = cfg->vector;
> if (old_vector) {
> cpumask_and(tmp_mask, mask, cpu_online_mask);
> - if (cpumask_subset(tmp_mask, cfg->domain)) {
> - free_cpumask_var(tmp_mask);
> - return 0;
> - }
> + apic->vector_allocation_domain(mask, tmp_mask, cfg->domain);
> + if (cpumask_subset(tmp_mask, cfg->domain))
> + return cleanup_unused_subset(tmp_mask, cfg);
> }

This whole check could be removed.

>
> /* Only try and allocate irqs on cpus that are present */
> @@ -1137,15 +1152,12 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> cpumask_clear(cfg->old_domain);
> cpu = cpumask_first_and(mask, cpu_online_mask);
> while (cpu < nr_cpu_ids) {
> - int new_cpu;
> - int vector, offset;
> + int new_cpu, vector, offset;
>
> - apic->vector_allocation_domain(cpu, tmp_mask);
> -
> - if (cpumask_subset(tmp_mask, cfg->domain)) {
> - free_cpumask_var(tmp_mask);
> - return 0;
> - }
> + cpumask_copy(tmp_mask, cpumask_of(cpu));
> + apic->vector_allocation_domain(mask, tmp_mask, cfg->domain);
> + if (cpumask_subset(tmp_mask, cfg->domain))
> + return cleanup_unused_subset(tmp_mask, cfg);
>
> vector = current_vector;
> offset = current_offset;
> diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
> index b5d889b..4d49512 100644
> --- a/arch/x86/kernel/apic/x2apic_cluster.c
> +++ b/arch/x86/kernel/apic/x2apic_cluster.c
> @@ -212,10 +212,12 @@ static int x2apic_cluster_probe(void)
> /*
> * Each x2apic cluster is an allocation domain.
> */
> -static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
> +static void cluster_vector_allocation_domain(const struct cpumask *mask,
> + struct cpumask *retmask,
> + const struct cpumask *prevmask)
> {
> - cpumask_clear(retmask);
> - cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu));
> + int cpu = cpumask_first(retmask);
> + cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));

prevmask is not used. could be removed.

Other than those two,

Acked-by: Yinghai Lu <[email protected]>

2012-06-21 08:38:30

by Alexander Gordeev

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86, apic: optimize cpu traversal in __assign_irq_vector() using domain membership

On Tue, Jun 19, 2012 at 04:43:30PM -0700, Suresh Siddha wrote:
> Use the irq_cfg's old_domain to track the visited domains and optimize
> the cpu traversal while finding a free vector in the given cpumask.
>
> NOTE: We can also optimize the search by using for_each_cpu and skip the
> current cpu, if it is not the first cpu in the mask returned by the
> vector_allocation_domain(). But re-using the cfg->old_domain to track
> the visited domains will be slightly faster.

Sorry for the delay, Suresh.

You also need to kick out vector_allocation_domain's return value in all other
apic drivers.

--
Regards,
Alexander Gordeev
[email protected]

2012-06-21 09:06:28

by Alexander Gordeev

[permalink] [raw]
Subject: Re: [PATCH 2/2] x86, x2apic: limit the vector reservation to the user specified mask

On Tue, Jun 19, 2012 at 04:43:31PM -0700, Suresh Siddha wrote:
> For the x2apic cluster mode, vector for an interrupt is currently reserved on
> all the cpu's that are part of the x2apic cluster. But the interrupts will
> be routed only to the cluster (derived from the first cpu in the mask) members
> specified in the mask. So there is no need to reserve the vector in the unused
> cluster members.
>
> Modify __assign_irq_vector() to reserve the vectors based on the user
> specified irq destination mask and the corresponding vector domain specified
> by the apic driver.
>
> Signed-off-by: Suresh Siddha <[email protected]>
> ---
> arch/x86/include/asm/apic.h | 14 +++++++++---
> arch/x86/kernel/apic/apic_noop.c | 5 +++-
> arch/x86/kernel/apic/io_apic.c | 36 ++++++++++++++++++++++-----------
> arch/x86/kernel/apic/x2apic_cluster.c | 8 ++++--
> 4 files changed, 43 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
> index b37fa12..7746acb 100644
> --- a/arch/x86/include/asm/apic.h
> +++ b/arch/x86/include/asm/apic.h
> @@ -306,7 +306,9 @@ struct apic {
> unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
> unsigned long (*check_apicid_present)(int apicid);
>
> - void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
> + void (*vector_allocation_domain)(const struct cpumask *mask,
> + struct cpumask *retmask,
> + const struct cpumask *prevmask);
> void (*init_apic_ldr)(void);
>
> void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
> @@ -615,7 +617,9 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
> unsigned int *apicid);
>
> static inline void
> -flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
> +flat_vector_allocation_domain(const struct cpumask *mask,
> + struct cpumask *retmask,
> + const struct cpumask *prevmask)
> {
> /* Careful. Some cpus do not strictly honor the set of cpus
> * specified in the interrupt destination when using lowest
> @@ -630,9 +634,11 @@ flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
> }
>
> static inline void
> -default_vector_allocation_domain(int cpu, struct cpumask *retmask)
> +default_vector_allocation_domain(const struct cpumask *mask,
> + struct cpumask *retmask,
> + const struct cpumask *prevmask)
> {
> - cpumask_copy(retmask, cpumask_of(cpu));
> + cpumask_copy(retmask, cpumask_of(cpumask_first(retmask)));
> }
>
> static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
> diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
> index 08c337b..847ece1 100644
> --- a/arch/x86/kernel/apic/apic_noop.c
> +++ b/arch/x86/kernel/apic/apic_noop.c
> @@ -100,8 +100,11 @@ static unsigned long noop_check_apicid_present(int bit)
> return physid_isset(bit, phys_cpu_present_map);
> }
>
> -static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
> +static void noop_vector_allocation_domain(const struct cpumask *mask,
> + struct cpumask *retmask,
> + const struct cpumask *prevmask)
> {
> + int cpu = cpumask_first(retmask);
> if (cpu != 0)
> pr_warning("APIC: Vector allocated for non-BSP cpu\n");
> cpumask_copy(retmask, cpumask_of(cpu));
> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index 7a945f8..0a55eb8 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -1097,6 +1097,22 @@ void unlock_vector_lock(void)
> raw_spin_unlock(&vector_lock);
> }
>
> +/*
> + * New cpu mask using the vector is a subset of the current in use mask.
> + * So cleanup the vector allocation for the members that are not used anymore.
> + */
> +static inline int
> +cleanup_unused_subset(struct cpumask *mask, struct irq_cfg *cfg)
> +{
> + if (!cpumask_equal(mask, cfg->domain)) {
> + cpumask_andnot(cfg->old_domain, cfg->domain, mask);
> + cfg->move_in_progress = 1;
> + cpumask_and(cfg->domain, cfg->domain, mask);
> + }
> + free_cpumask_var(mask);
> + return 0;
> +}
> +

Adding to Yinghai's first comment, this function does not appear to be needed at all...

> static int
> __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> {
> @@ -1126,10 +1142,9 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> old_vector = cfg->vector;
> if (old_vector) {
> cpumask_and(tmp_mask, mask, cpu_online_mask);
> - if (cpumask_subset(tmp_mask, cfg->domain)) {
> - free_cpumask_var(tmp_mask);
> - return 0;
> - }
> + apic->vector_allocation_domain(mask, tmp_mask, cfg->domain);
> + if (cpumask_subset(tmp_mask, cfg->domain))
> + return cleanup_unused_subset(tmp_mask, cfg);

...but if you decide to keep cleanup_unused_subset(), then the cpumask_subset()
check is better moved inside the function, while the free_cpumask_var() moves
out.

> }
>
> /* Only try and allocate irqs on cpus that are present */
> @@ -1137,15 +1152,12 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> cpumask_clear(cfg->old_domain);
> cpu = cpumask_first_and(mask, cpu_online_mask);
> while (cpu < nr_cpu_ids) {
> - int new_cpu;
> - int vector, offset;
> + int new_cpu, vector, offset;
>
> - apic->vector_allocation_domain(cpu, tmp_mask);
> -
> - if (cpumask_subset(tmp_mask, cfg->domain)) {
> - free_cpumask_var(tmp_mask);
> - return 0;
> - }
> + cpumask_copy(tmp_mask, cpumask_of(cpu));
> + apic->vector_allocation_domain(mask, tmp_mask, cfg->domain);

I might be missing the point... in the two lines above you copy a cpumask to
tmp_mask, then scan it in vector_allocation_domain() just to find the cpu
which you already know. Why not just pass cpu rather than cpumask_of(cpu)?
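
For illustration, the alternative being suggested would look roughly like this
(a sketch only, not code from this thread; the eventual upstream signature may
differ):

static void cluster_vector_allocation_domain(int cpu,
					     const struct cpumask *mask,
					     struct cpumask *retmask)
{
	/*
	 * The caller passes the starting cpu directly, so there is no
	 * need to rederive it with cpumask_first(retmask).
	 */
	cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
}

with the call site reduced to apic->vector_allocation_domain(cpu, mask, tmp_mask).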

> + if (cpumask_subset(tmp_mask, cfg->domain))
> + return cleanup_unused_subset(tmp_mask, cfg);
>
> vector = current_vector;
> offset = current_offset;
> diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
> index b5d889b..4d49512 100644
> --- a/arch/x86/kernel/apic/x2apic_cluster.c
> +++ b/arch/x86/kernel/apic/x2apic_cluster.c
> @@ -212,10 +212,12 @@ static int x2apic_cluster_probe(void)
> /*
> * Each x2apic cluster is an allocation domain.
> */
> -static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
> +static void cluster_vector_allocation_domain(const struct cpumask *mask,
> + struct cpumask *retmask,
> + const struct cpumask *prevmask)
> {
> - cpumask_clear(retmask);
> - cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu));
> + int cpu = cpumask_first(retmask);
> + cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
> }
>
> static struct apic apic_x2apic_cluster = {
> --
> 1.7.6.5
>

--
Regards,
Alexander Gordeev
[email protected]

2012-06-21 11:02:35

by Alexander Gordeev

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86, irq: update irq_cfg domain unless the new affinity is a subset of the current domain

On Tue, Jun 19, 2012 at 05:18:42PM -0700, Suresh Siddha wrote:
> On Mon, 2012-06-18 at 17:51 -0700, Suresh Siddha wrote:
> BTW, there is still one open issue that I would like to address: how to handle
> the vector pressure during boot etc. (as the default vector assignment
> specifies all online cpus) when there are a lot of interrupt sources but
> fewer x2apic clusters (like the one or two socket server case).
>
> We should be able to do something like the appended. Any better
> suggestions? I don't want to add boot parameters to limit the x2apic
> cluster membership etc (to fewer than 16 logical cpu's) if possible.

This cpu_online_mask approach should work IMO, although it looks a little bit
hacky to me. Maybe we could start with default_vector_allocation_domain()
and explicitly switch to cluster_vector_allocation_domain() once booted?
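
A rough sketch of that idea (the flag and the initcall hook here are
assumptions, not code from this thread):

/* Keep single-cpu domains until boot is done, then widen to clusters. */
static bool x2apic_use_cluster_domains;

static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
	if (x2apic_use_cluster_domains)
		cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu));
	else
		cpumask_copy(retmask, cpumask_of(cpu));	/* default domain */
}

static int __init x2apic_widen_domains(void)
{
	x2apic_use_cluster_domains = true;
	return 0;
}
late_initcall(x2apic_widen_domains);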

As for boot parameters, I can think of a multi-pass walk through a cpumask to find a
free cluster => core => sibling. In the worst case I can imagine a vector space
defragmenter. But nothing really small enough to avoid reshaking the current code.

Also, how heavy is the vector pressure actually?

--
Regards,
Alexander Gordeev
[email protected]

2012-06-21 21:51:16

by Suresh Siddha

[permalink] [raw]
Subject: Re: [PATCH 2/2] x86, x2apic: limit the vector reservation to the user specified mask

On Thu, 2012-06-21 at 11:04 +0200, Alexander Gordeev wrote:
> > static int
> > __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> > {
> > @@ -1126,10 +1142,9 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> > old_vector = cfg->vector;
> > if (old_vector) {
> > cpumask_and(tmp_mask, mask, cpu_online_mask);
> > - if (cpumask_subset(tmp_mask, cfg->domain)) {
> > - free_cpumask_var(tmp_mask);
> > - return 0;
> > - }
> > + apic->vector_allocation_domain(mask, tmp_mask, cfg->domain);
> > + if (cpumask_subset(tmp_mask, cfg->domain))
> > + return cleanup_unused_subset(tmp_mask, cfg);
>
> ...but if you decide to leave cleanup_unused_subset() then cpumask_subset()
> check is better to move inside the function while free_cpumask_var() move
> out.

Removed the function and the above hunk completely in the next version.

>
> > }
> >
> > /* Only try and allocate irqs on cpus that are present */
> > @@ -1137,15 +1152,12 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> > cpumask_clear(cfg->old_domain);
> > cpu = cpumask_first_and(mask, cpu_online_mask);
> > while (cpu < nr_cpu_ids) {
> > - int new_cpu;
> > - int vector, offset;
> > + int new_cpu, vector, offset;
> >
> > - apic->vector_allocation_domain(cpu, tmp_mask);
> > -
> > - if (cpumask_subset(tmp_mask, cfg->domain)) {
> > - free_cpumask_var(tmp_mask);
> > - return 0;
> > - }
> > + cpumask_copy(tmp_mask, cpumask_of(cpu));
> > + apic->vector_allocation_domain(mask, tmp_mask, cfg->domain);
>
> I might be missing the point... in the two lines above you copy a cpumask to
> tmp_mask, then scan it in vector_allocation_domain() just to find the cpu
> which you already know. Why not just pass cpu rather than cpumask_of(cpu)?

One of my earlier experiments was using cfg->domain internally in
vector_allocation_domain() and I didn't want to use too many arguments.
Also, the 'cpu' argument for the first hunk above was not making sense.

Anyhow, this is all cleaned up now in the next version of the patchset.

thanks,
suresh

2012-06-21 21:52:54

by Suresh Siddha

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86, apic: optimize cpu traversal in __assign_irq_vector() using domain membership

On Thu, 2012-06-21 at 10:31 +0200, Alexander Gordeev wrote:
> On Tue, Jun 19, 2012 at 04:43:30PM -0700, Suresh Siddha wrote:
> > Use the irq_cfg's old_domain to track the visited domains and optimize
> > the cpu traversal while finding a free vector in the given cpumask.
> >
> > NOTE: We can also optimize the search by using for_each_cpu and skip the
> > current cpu, if it is not the first cpu in the mask returned by the
> > vector_allocation_domain(). But re-using the cfg->old_domain to track
> > the visited domains will be slightly faster.
>
> Sorry for the dealy, Suresh.
>
> You also need to kick out vector_allocation_domain's return value in all other
> apic drivers.

this patch is on top of -tip, which already has your patch that
consolidates the different vector_allocation_domains to relatively few. I
double checked and don't seem to have left out any other apic driver.

thanks,
suresh

2012-06-21 21:58:38

by Suresh Siddha

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86, irq: update irq_cfg domain unless the new affinity is a subset of the current domain

On Thu, 2012-06-21 at 13:00 +0200, Alexander Gordeev wrote:
> On Tue, Jun 19, 2012 at 05:18:42PM -0700, Suresh Siddha wrote:
> > On Mon, 2012-06-18 at 17:51 -0700, Suresh Siddha wrote:
> > BTW, there is still one open issue that I would like to address: how to handle
> > the vector pressure during boot etc. (as the default vector assignment
> > specifies all online cpus) when there are a lot of interrupt sources but
> > fewer x2apic clusters (like the one or two socket server case).
> >
> > We should be able to do something like the appended. Any better
> > suggestions? I don't want to add boot parameters to limit the x2apic
> > cluster membership etc (to fewer than 16 logical cpu's) if possible.
>
> This cpu_online_mask approach should work IMO, although it looks a little bit
> hacky to me. Maybe we could start with default_vector_allocation_domain()
> and explicitly switch to cluster_vector_allocation_domain() once booted?

It is not just during boot. Module load/unload will also go through
these paths.

> As for boot parameters, I can think of a multi-pass walk through a cpumask to find a
> free cluster => core => sibling. In the worst case I can imagine a vector space
> defragmenter. But nothing really small enough to avoid reshaking the current code.

We can probably go with something simple as I sent earlier (third patch
in the new version does this). Depending on the need/future use-cases,
we can further enhance this later.

> Also, how heavy is the vector pressure actually?

I don't know, but I suspect a one-socket server, where there can be one
x2apic cluster (for example 8-core with HT), will see some pressure on
some platforms.

thanks,
suresh

Subject: Re: [tip:x86/apic] x86/irq: Update irq_cfg domain unless the new affinity is a subset of the current domain

On 06.06.12 08:03:58, tip-bot for Suresh Siddha wrote:
> Commit-ID: 332afa656e76458ee9cf0f0d123016a0658539e4
> Gitweb: http://git.kernel.org/tip/332afa656e76458ee9cf0f0d123016a0658539e4
> Author: Suresh Siddha <[email protected]>
> AuthorDate: Mon, 21 May 2012 16:58:01 -0700
> Committer: Ingo Molnar <[email protected]>
> CommitDate: Wed, 6 Jun 2012 09:51:22 +0200
>
> x86/irq: Update irq_cfg domain unless the new affinity is a subset of the current domain

This commit causes a sata error and thus a boot failure:

ACPI: Invalid Power Resource to register!ata1: lost interrupt (Status 0x50)
ata1.00: exception Emask 0x0 SAct 0x0 SErr 0x40000000 action 0x6 frozen
ata1: SError: { }
ata1.00: failed command: READ DMA

Reverting it as follows helped:

$ git revert d872818dbbeed1bccf58c7f8c7db432154c802f9
$ git revert 1ac322d0b169c95ce34d55b3ed6d40ce1a5f3a02
$ git revert 332afa656e76458ee9cf0f0d123016a0658539e4

-Robert

--
Advanced Micro Devices, Inc.
Operating System Research Center

2012-08-07 15:41:38

by Borislav Petkov

[permalink] [raw]
Subject: do_IRQ: 1.55 No irq handler for vector (irq -1)

On Tue, Aug 07, 2012 at 05:31:49PM +0200, Robert Richter wrote:
> On 06.06.12 08:03:58, tip-bot for Suresh Siddha wrote:
> > Commit-ID: 332afa656e76458ee9cf0f0d123016a0658539e4
> > Gitweb: http://git.kernel.org/tip/332afa656e76458ee9cf0f0d123016a0658539e4
> > Author: Suresh Siddha <[email protected]>
> > AuthorDate: Mon, 21 May 2012 16:58:01 -0700
> > Committer: Ingo Molnar <[email protected]>
> > CommitDate: Wed, 6 Jun 2012 09:51:22 +0200
> >
> > x86/irq: Update irq_cfg domain unless the new affinity is a subset of the current domain
>
> This commit causes a sata error and thus a boot failure:
>
> ACPI: Invalid Power Resource to register!ata1: lost interrupt (Status 0x50)
> ata1.00: exception Emask 0x0 SAct 0x0 SErr 0x40000000 action 0x6 frozen
> ata1: SError: { }
> ata1.00: failed command: READ DMA
>
> Reverting it as follows helped:
>
> $ git revert d872818dbbeed1bccf58c7f8c7db432154c802f9
> $ git revert 1ac322d0b169c95ce34d55b3ed6d40ce1a5f3a02
> $ git revert 332afa656e76458ee9cf0f0d123016a0658539e4

Right,

and it is a good thing Robert and I were talking about his issue and I
mentioned seeing funny do_IRQ messages during 3.6-rc1 boot:

[ 0.170256] AMD PMU driver.
[ 0.170451] ... version: 0
[ 0.170683] ... bit width: 48
[ 0.170906] ... generic registers: 6
[ 0.171125] ... value mask: 0000ffffffffffff
[ 0.171399] ... max period: 00007fffffffffff
[ 0.171673] ... fixed-purpose events: 0
[ 0.171902] ... event mask: 000000000000003f
[ 0.172687] MCE: In-kernel MCE decoding enabled.
[ 0.184214] [Firmware Info]: CPU: Re-enabling disabled Topology Extensions Support
[ 0.186687] do_IRQ: 1.55 No irq handler for vector (irq -1) <---
[ 0.198126] [Firmware Info]: CPU: Re-enabling disabled Topology Extensions Support
[ 0.200579] do_IRQ: 2.55 No irq handler for vector (irq -1) <---
[ 0.173040] smpboot: Booting Node 0, Processors #1 #2 #3 OK
[ 0.212083] [Firmware Info]: CPU: Re-enabling disabled Topology Extensions Support
[ 0.214538] do_IRQ: 3.55 No irq handler for vector (irq -1) <---
[ 0.214864] Brought up 4 CPUs

of it not having an IRQ handler for vector 55.

And guess what: reverting those three above make the message go away
too.

Thanks.

--
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

2012-08-07 16:27:29

by Suresh Siddha

[permalink] [raw]
Subject: Re: do_IRQ: 1.55 No irq handler for vector (irq -1)

On Tue, 2012-08-07 at 17:41 +0200, Borislav Petkov wrote:
> On Tue, Aug 07, 2012 at 05:31:49PM +0200, Robert Richter wrote:
> > On 06.06.12 08:03:58, tip-bot for Suresh Siddha wrote:
> > > Commit-ID: 332afa656e76458ee9cf0f0d123016a0658539e4
> > > Gitweb: http://git.kernel.org/tip/332afa656e76458ee9cf0f0d123016a0658539e4
> > > Author: Suresh Siddha <[email protected]>
> > > AuthorDate: Mon, 21 May 2012 16:58:01 -0700
> > > Committer: Ingo Molnar <[email protected]>
> > > CommitDate: Wed, 6 Jun 2012 09:51:22 +0200
> > >
> > > x86/irq: Update irq_cfg domain unless the new affinity is a subset of the current domain
> >
> > This commit causes a sata error and thus a boot failure:
> >
> > ACPI: Invalid Power Resource to register!ata1: lost interrupt (Status 0x50)
> > ata1.00: exception Emask 0x0 SAct 0x0 SErr 0x40000000 action 0x6 frozen
> > ata1: SError: { }
> > ata1.00: failed command: READ DMA
> >
> > Reverting it as follows helped:
> >
> > $ git revert d872818dbbeed1bccf58c7f8c7db432154c802f9
> > $ git revert 1ac322d0b169c95ce34d55b3ed6d40ce1a5f3a02
> > $ git revert 332afa656e76458ee9cf0f0d123016a0658539e4
>
> Right,
>
> and it is a good thing Robert and I were talking about his issue and I
> mentioned seeing funny do_IRQ messages during 3.6-rc1 boot:
>
> [ 0.170256] AMD PMU driver.
> [ 0.170451] ... version: 0
> [ 0.170683] ... bit width: 48
> [ 0.170906] ... generic registers: 6
> [ 0.171125] ... value mask: 0000ffffffffffff
> [ 0.171399] ... max period: 00007fffffffffff
> [ 0.171673] ... fixed-purpose events: 0
> [ 0.171902] ... event mask: 000000000000003f
> [ 0.172687] MCE: In-kernel MCE decoding enabled.
> [ 0.184214] [Firmware Info]: CPU: Re-enabling disabled Topology Extensions Support
> [ 0.186687] do_IRQ: 1.55 No irq handler for vector (irq -1) <---
> [ 0.198126] [Firmware Info]: CPU: Re-enabling disabled Topology Extensions Support
> [ 0.200579] do_IRQ: 2.55 No irq handler for vector (irq -1) <---
> [ 0.173040] smpboot: Booting Node 0, Processors #1 #2 #3 OK
> [ 0.212083] [Firmware Info]: CPU: Re-enabling disabled Topology Extensions Support
> [ 0.214538] do_IRQ: 3.55 No irq handler for vector (irq -1) <---
> [ 0.214864] Brought up 4 CPUs
>
> of it not having an IRQ handler for vector 55.
>
> And guess what: reverting those three above make the message go away
> too.
>

Boris, Robert, Can you please send me the complete dmesg
and /proc/interrupts on a successful boot?

thanks,
suresh

Subject: Re: do_IRQ: 1.55 No irq handler for vector (irq -1)

On 07.08.12 09:24:21, Suresh Siddha wrote:
> Boris, Robert, Can you please send me the complete dmesg
> and /proc/interrupts on a successful boot?

Sent to you in private mail.

What information are you looking for specifically? Maybe we can
provide something here on the ml.

Thanks,

-Robert

--
Advanced Micro Devices, Inc.
Operating System Research Center

2012-08-07 17:45:50

by Eric W. Biederman

[permalink] [raw]
Subject: Re: do_IRQ: 1.55 No irq handler for vector (irq -1)

Suresh Siddha <[email protected]> writes:

> On Tue, 2012-08-07 at 17:41 +0200, Borislav Petkov wrote:
>> On Tue, Aug 07, 2012 at 05:31:49PM +0200, Robert Richter wrote:
>> > On 06.06.12 08:03:58, tip-bot for Suresh Siddha wrote:
>> > > Commit-ID: 332afa656e76458ee9cf0f0d123016a0658539e4
>> > > Gitweb: http://git.kernel.org/tip/332afa656e76458ee9cf0f0d123016a0658539e4
>> > > Author: Suresh Siddha <[email protected]>
>> > > AuthorDate: Mon, 21 May 2012 16:58:01 -0700
>> > > Committer: Ingo Molnar <[email protected]>
>> > > CommitDate: Wed, 6 Jun 2012 09:51:22 +0200
>> > >
>> > > x86/irq: Update irq_cfg domain unless the new affinity is a subset of the current domain
>> >
>> > This commit causes a sata error and thus a boot failure:
>> >
>> > ACPI: Invalid Power Resource to register!ata1: lost interrupt (Status 0x50)
>> > ata1.00: exception Emask 0x0 SAct 0x0 SErr 0x40000000 action 0x6 frozen
>> > ata1: SError: { }
>> > ata1.00: failed command: READ DMA
>> >
>> > Reverting it as follows helped:
>> >
>> > $ git revert d872818dbbeed1bccf58c7f8c7db432154c802f9
>> > $ git revert 1ac322d0b169c95ce34d55b3ed6d40ce1a5f3a02
>> > $ git revert 332afa656e76458ee9cf0f0d123016a0658539e4
>>
>> Right,
>>
>> and it is a good thing Robert and I were talking about his issue and I
>> mentioned seeing funny do_IRQ messages during 3.6-rc1 boot:
>>
>> [ 0.170256] AMD PMU driver.
>> [ 0.170451] ... version: 0
>> [ 0.170683] ... bit width: 48
>> [ 0.170906] ... generic registers: 6
>> [ 0.171125] ... value mask: 0000ffffffffffff
>> [ 0.171399] ... max period: 00007fffffffffff
>> [ 0.171673] ... fixed-purpose events: 0
>> [ 0.171902] ... event mask: 000000000000003f
>> [ 0.172687] MCE: In-kernel MCE decoding enabled.
>> [ 0.184214] [Firmware Info]: CPU: Re-enabling disabled Topology Extensions Support
>> [ 0.186687] do_IRQ: 1.55 No irq handler for vector (irq -1) <---
>> [ 0.198126] [Firmware Info]: CPU: Re-enabling disabled Topology Extensions Support
>> [ 0.200579] do_IRQ: 2.55 No irq handler for vector (irq -1) <---
>> [ 0.173040] smpboot: Booting Node 0, Processors #1 #2 #3 OK
>> [ 0.212083] [Firmware Info]: CPU: Re-enabling disabled Topology Extensions Support
>> [ 0.214538] do_IRQ: 3.55 No irq handler for vector (irq -1) <---
>> [ 0.214864] Brought up 4 CPUs
>>
>> of it not having an IRQ handler for vector 55.
>>
>> And guess what: reverting those three above make the message go away
>> too.
>>
>
> Boris, Robert, Can you please send me the complete dmesg
> and /proc/interrupts on a successful boot?

Hmm. I wonder if this is one of those cases where the apics don't honor
the masks in lowest priority delivery mode and simply deliver to some
cpu in the same die.

Certainly outside of x2apic mode I have seen that happen, and that is why
the reservation in lowest priority delivery mode was for the same vector
across all cpus.
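
In sketch form, that policy is what the flat driver's allocation domain
encodes (the body is taken from the flat_vector_allocation_domain() quoted
earlier in this thread, with the pre-patch signature):

static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
	/*
	 * Some cpus do not strictly honor the destination mask in
	 * lowest priority mode, so reserve the same vector on all
	 * eight logical-flat cpus to be safe.
	 */
	cpumask_clear(retmask);
	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
}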

This certainly looks like we have one irq going across multiple cpus
and the software simply appears unprepared for the irq to show up where
the irq is showing up.

Eric

2012-08-07 17:50:48

by Suresh Siddha

[permalink] [raw]
Subject: Re: do_IRQ: 1.55 No irq handler for vector (irq -1)

On Tue, 2012-08-07 at 19:28 +0200, Robert Richter wrote:
> On 07.08.12 09:24:21, Suresh Siddha wrote:
> > Boris, Robert, Can you please send me the complete dmesg
> > and /proc/interrupts on a successful boot?
>
> Sent to you in private mail.

Thanks.

>
> What information are you looking for specifically? Maybe we can
> provide something here on the ml.

I was looking for what APIC mode was in use and whether it was a PIC/IO-APIC
interrupt.

So it looks like it is using logical flat mode and the interrupt was
from the IO-APIC.

So most likely on your system, after a successful boot, if you manually
set the affinity of one of those SATA/PATA interrupts to a specific
logical cpu, say 0, I think the interrupt still gets routed to other
logical cpu's. Can you confirm?

thanks,
suresh

2012-08-07 20:57:23

by Borislav Petkov

[permalink] [raw]
Subject: Re: do_IRQ: 1.55 No irq handler for vector (irq -1)

On Tue, Aug 07, 2012 at 10:45:30AM -0700, Eric W. Biederman wrote:
> >> [ 0.170256] AMD PMU driver.
> >> [ 0.170451] ... version: 0
> >> [ 0.170683] ... bit width: 48
> >> [ 0.170906] ... generic registers: 6
> >> [ 0.171125] ... value mask: 0000ffffffffffff
> >> [ 0.171399] ... max period: 00007fffffffffff
> >> [ 0.171673] ... fixed-purpose events: 0
> >> [ 0.171902] ... event mask: 000000000000003f
> >> [ 0.172687] MCE: In-kernel MCE decoding enabled.
> >> [ 0.184214] [Firmware Info]: CPU: Re-enabling disabled Topology Extensions Support
> >> [ 0.186687] do_IRQ: 1.55 No irq handler for vector (irq -1) <---
> >> [ 0.198126] [Firmware Info]: CPU: Re-enabling disabled Topology Extensions Support
> >> [ 0.200579] do_IRQ: 2.55 No irq handler for vector (irq -1) <---
> >> [ 0.173040] smpboot: Booting Node 0, Processors #1 #2 #3 OK
> >> [ 0.212083] [Firmware Info]: CPU: Re-enabling disabled Topology Extensions Support
> >> [ 0.214538] do_IRQ: 3.55 No irq handler for vector (irq -1) <---
> >> [ 0.214864] Brought up 4 CPUs
> >>
> >> of it not having an IRQ handler for vector 55.
> >>
> >> And guess what: reverting those three above make the message go away
> >> too.
> >>
> >
> > Boris, Robert, Can you please send me the complete dmesg
> > and /proc/interrupts on a successful boot?
>
> Hmm. I wonder if this is one of those cases where the apics don't honor
> the masks in lowest priority delivery mode and simply deliver to some
> cpu in the same die.

The funny thing is, they deliver to all CPUs except the BSP.

Or maybe the BSP gets that IRQ too but it actually has a handler
registered?

Btw, I'm stabbing in the dark here - I have been purposefully and
willfully keeping away from all the APIC debacle until now. I guess that
carefree time is over :(.

> Certainly outside of x2apic mode I have seen that happen, and that is why
> the reservation in lowest priority delivery mode was for the same vector
> across all cpus.
>
> This certainly looks like we have one irq going across multiple cpus
> and the software simply appears unprepared for the irq to show up where
> the irq is showing up.

The interesting thing is that this happens once per core early during
boot and then never again. I dropped the printk_ratelimit() in do_IRQ and
still got those lines only once in dmesg.

The other funny thing is, irq 55 is not in /proc/interrupts:

CPU0 CPU1 CPU2 CPU3
0: 44 0 0 0 IO-APIC-edge timer
1: 2 1 2 4 IO-APIC-edge i8042
8: 6 7 6 6 IO-APIC-edge rtc0
9: 22 25 24 21 IO-APIC-fasteoi acpi
12: 31 23 30 30 IO-APIC-edge i8042
16: 82 82 81 117 IO-APIC-fasteoi snd_hda_intel
17: 0 1 1 0 IO-APIC-fasteoi ehci_hcd:usb1, ehci_hcd:usb2
18: 3 6 8 8 IO-APIC-fasteoi ohci_hcd:usb3, ohci_hcd:usb4, ohci_hcd:usb5
40: 0 0 0 0 PCI-MSI-edge PCIe PME
41: 0 0 0 0 PCI-MSI-edge PCIe PME
42: 0 0 0 0 PCI-MSI-edge PCIe PME
43: 0 0 0 0 PCI-MSI-edge PCIe PME
44: 675 662 676 690 PCI-MSI-edge ahci
45: 41 44 38 41 PCI-MSI-edge snd_hda_intel
46: 13484 13499 13501 13536 PCI-MSI-edge eth0
NMI: 0 0 0 0 Non-maskable interrupts
LOC: 20719 21487 18015 16445 Local timer interrupts
SPU: 0 0 0 0 Spurious interrupts
PMI: 0 0 0 0 Performance monitoring interrupts
IWI: 0 0 0 0 IRQ work interrupts
RTR: 0 0 0 0 APIC ICR read retries
RES: 13744 12640 13425 12334 Rescheduling interrupts
CAL: 571 790 539 801 Function call interrupts
TLB: 0 0 0 0 TLB shootdowns
TRM: 0 0 0 0 Thermal event interrupts
THR: 0 0 0 0 Threshold APIC interrupts
MCE: 0 0 0 0 Machine check exceptions
MCP: 66 66 66 66 Machine check polls
ERR: 0
MIS: 0

so what is that thing?

I'll read up on lowest prio delivery mode tomorrow.

Thanks.

--
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

2012-08-07 22:42:22

by Suresh Siddha

[permalink] [raw]
Subject: Re: do_IRQ: 1.55 No irq handler for vector (irq -1)

On Tue, 2012-08-07 at 22:57 +0200, Borislav Petkov wrote:
> The funny thing is, they deliver to all CPUs except the BSP.

Looking at your /proc/interrupts below, it is probably using some sort
of round-robin.

> Or maybe the BSP gets that IRQ too but it actually has a handler
> registered?

From the /proc/interrupts you sent, the BSP is also getting those.

>
> Btw, I'm stabbing in the dark here - I have been purposefully and
> willfully keeping away from all the APIC debacle until now. I guess that
> carefree time is over :(.
>
> > Certainly outside of x2apic mode I have seen that happen, and that is why
> > the reservation in lowest priority delivery mode was for the same vector
> > across all cpus.
> >
> > This certainly looks like we have one irq going across multiple cpus
> > and the software simply appears unprepared for the irq to show up where
> > the irq is showing up.
>
> The interesting thing is that this happens once per core early during
> boot and not anymore. I dropped the printk_ratelimit() in do_IRQ and
> still got those lines only once in dmesg.

What it says is that the interrupts are arriving at the offline cpu's as well.
In pre-3.6-rc1 kernels the vectors assigned to the legacy irq's are
fixed (IRQ0_VECTOR, ...).

For 3.6-rc1, we allow the vector to change when the IO-APIC starts to
handle the irq, and that is probably a bad idea given that this platform is
spraying interrupts (mostly timer?) onto the offline cpu's as well. Pre
3.6 we handle those interrupts when we come online. Now in the new
kernels, as the vector has changed by the time that irq is handled in
io-apic mode, we get a spurious "no irq handler for vector" message.
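
For reference, the message comes from do_IRQ()'s per-cpu vector lookup;
roughly (an abridged paraphrase of the 3.6-era code from memory, not a
verbatim quote):

unsigned int do_IRQ(struct pt_regs *regs)
{
	/* irq_enter()/irq_exit() and old-regs handling trimmed. */
	unsigned int vector = ~regs->orig_ax;
	int irq = __this_cpu_read(vector_irq[vector]);

	if (!handle_irq(irq, regs)) {
		ack_APIC_irq();
		/* An unassigned slot holds -1, hence "(irq -1)" above. */
		pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
			 __func__, smp_processor_id(), vector, irq);
	}
	return 1;
}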

> The other funny thing is, irq 55 is not in /proc/interrupts:

'55' is the vector number. You have to add some debug code in the kernel
to identify what irq it used to belong to.
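
A minimal example of such debug code (illustrative; the helper name is made
up):

static void debug_dump_vector_irq(unsigned int vector)
{
	int cpu;

	/* Show which irq, if any, each cpu maps this vector to. */
	for_each_online_cpu(cpu)
		pr_info("cpu %d: vector %u -> irq %d\n",
			cpu, vector, per_cpu(vector_irq, cpu)[vector]);
}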

>
> CPU0 CPU1 CPU2 CPU3
> 0: 44 0 0 0 IO-APIC-edge timer
> 1: 2 1 2 4 IO-APIC-edge i8042
> 8: 6 7 6 6 IO-APIC-edge rtc0
> 9: 22 25 24 21 IO-APIC-fasteoi acpi
> 12: 31 23 30 30 IO-APIC-edge i8042
> 16: 82 82 81 117 IO-APIC-fasteoi snd_hda_intel
> 17: 0 1 1 0 IO-APIC-fasteoi ehci_hcd:usb1, ehci_hcd:usb2
> 18: 3 6 8 8 IO-APIC-fasteoi ohci_hcd:usb3, ohci_hcd:usb4, ohci_hcd:usb5
> 40: 0 0 0 0 PCI-MSI-edge PCIe PME
> 41: 0 0 0 0 PCI-MSI-edge PCIe PME
> 42: 0 0 0 0 PCI-MSI-edge PCIe PME
> 43: 0 0 0 0 PCI-MSI-edge PCIe PME
> 44: 675 662 676 690 PCI-MSI-edge ahci
> 45: 41 44 38 41 PCI-MSI-edge snd_hda_intel
> 46: 13484 13499 13501 13536 PCI-MSI-edge eth0
> NMI: 0 0 0 0 Non-maskable interrupts
> LOC: 20719 21487 18015 16445 Local timer interrupts
> SPU: 0 0 0 0 Spurious interrupts
> PMI: 0 0 0 0 Performance monitoring interrupts
> IWI: 0 0 0 0 IRQ work interrupts
> RTR: 0 0 0 0 APIC ICR read retries
> RES: 13744 12640 13425 12334 Rescheduling interrupts
> CAL: 571 790 539 801 Function call interrupts
> TLB: 0 0 0 0 TLB shootdowns
> TRM: 0 0 0 0 Thermal event interrupts
> THR: 0 0 0 0 Threshold APIC interrupts
> MCE: 0 0 0 0 Machine check exceptions
> MCP: 66 66 66 66 Machine check polls
> ERR: 0
> MIS: 0
>
> so what is that thing?

And in Robert's SATA hang case, as we modify the vector when the
irq is handled by the io-apic, cfg->move_in_progress gets set during
setup_ioapic_irq(), and later, when we do setup_ioapic_dest() to update
the SMP affinity, we fail to update the RTE's because
cfg->move_in_progress is still set (it gets cleared only after the first
interrupt arrives).

And in the case of Robert's system, all the interrupts go only to the last
cpu (cpu-7). As we fail to update the RTE's with the SMP affinity in
setup_ioapic_dest(), the RTE is still pointing to cpu-0 (but the
vector_to_irq mapping is set on all the cpu's), and most likely Robert's
platform for some reason doesn't like that (though we don't see "no irq
handler" messages on Robert's platform).
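
The guard responsible is easy to see in simplified form (a paraphrase wrapped
in a hypothetical helper, not the verbatim kernel source):

static int vector_move_pending(struct irq_cfg *cfg)
{
	/* __assign_irq_vector() refuses new work while a move is pending. */
	if (cfg->move_in_progress)
		return -EBUSY;	/* cleared after the first irq arrives */
	return 0;
}

Since setup_ioapic_dest() runs before any interrupt has fired, the pending
move means the affinity update never reaches the RTE.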

Boris, Robert, can you check if the below patch makes both of your
systems happy again (essentially not allowing the vector to change for
legacy irq's, which also allows the RTE to be set correctly in the smp
case etc)? Based on your results and some more thinking, I will send a
detailed patch with changelog tomorrow.

arch/x86/kernel/apic/io_apic.c | 9 +++++++++
1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a6c64aa..4b98610 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1356,6 +1356,15 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
if (!IO_APIC_IRQ(irq))
return;

+ /*
+ * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC
+ * can handle this irq and the apic driver is finalized at this point,
+ * update the cfg->domain.
+ */
+ if (irq < legacy_pic->nr_legacy_irqs &&
+ cpumask_equal(cfg->domain, cpumask_of(0)))
+ apic->vector_allocation_domain(0, cfg->domain, cpu_online_mask);
+
if (assign_irq_vector(irq, cfg, apic->target_cpus()))
return;


Subject: Re: do_IRQ: 1.55 No irq handler for vector (irq -1)

On 07.08.12 15:39:07, Suresh Siddha wrote:
> Boris, Robert, can you check if the below patch makes both of your
> systems happy again (essentially not allowing the vector to change for
> legacy irq's, which also allows the RTE to be set correctly in the smp
> case etc)? Based on your results and some more thinking, I will send a
> detailed patch with changelog tomorrow.
>
> arch/x86/kernel/apic/io_apic.c | 9 +++++++++
> 1 files changed, 9 insertions(+), 0 deletions(-)

Suresh,

with your patch applied the sata device works fine and the system
boots, no issues seen.

Thanks,

-Robert

>
> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index a6c64aa..4b98610 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -1356,6 +1356,15 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
> if (!IO_APIC_IRQ(irq))
> return;
>
> + /*
> + * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC
> + * can handle this irq and the apic driver is finalized at this point,
> + * update the cfg->domain.
> + */
> + if (irq < legacy_pic->nr_legacy_irqs &&
> + cpumask_equal(cfg->domain, cpumask_of(0)))
> + apic->vector_allocation_domain(0, cfg->domain, cpu_online_mask);
> +
> if (assign_irq_vector(irq, cfg, apic->target_cpus()))
> return;
>
>
>
>

--
Advanced Micro Devices, Inc.
Operating System Research Center

2012-08-08 11:04:21

by Borislav Petkov

[permalink] [raw]
Subject: Re: do_IRQ: 1.55 No irq handler for vector (irq -1)

On Wed, Aug 08, 2012 at 10:58:37AM +0200, Robert Richter wrote:
> On 07.08.12 15:39:07, Suresh Siddha wrote:
> > Boris, Robert, can you check if the below patch makes both of your
> > systems happy again (essentially not allowing the vector to change for
> > legacy irq's, which also allows the RTE to be set correctly in the smp
> > case etc)? Based on your results and some more thinking, I will send a
> > detailed patch with changelog tomorrow.
> >
> > arch/x86/kernel/apic/io_apic.c | 9 +++++++++
> > 1 files changed, 9 insertions(+), 0 deletions(-)
>
> Suresh,
>
> with your patch applied the sata device works fine and the system
> boots, no issues seen.

Ditto,

the do_IRQ issue of missing an irq handler for vector 55 is gone too on
my box.

I'm pretty sure you can add our Tested-by:'s to the official patch.

Thanks.

--
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

2012-08-08 19:19:57

by Suresh Siddha

[permalink] [raw]
Subject: Re: do_IRQ: 1.55 No irq handler for vector (irq -1)

On Wed, 2012-08-08 at 13:04 +0200, Borislav Petkov wrote:
> On Wed, Aug 08, 2012 at 10:58:37AM +0200, Robert Richter wrote:
> > On 07.08.12 15:39:07, Suresh Siddha wrote:
> > > Boris, Robert, can you check if the below patch makes both of your
> > > systems happy again (essentially not allowing the vector to change for
> > > legacy irq's, which also allows the RTE to be set correctly in the smp
> > > case etc)? Based on your results and some more thinking, I will send a
> > > detailed patch with changelog tomorrow.
> > >
> > > arch/x86/kernel/apic/io_apic.c | 9 +++++++++
> > > 1 files changed, 9 insertions(+), 0 deletions(-)
> >
> > Suresh,
> >
> > with your patch applied the sata device works fine and the system
> > boots, no issues seen.
>
> Ditto,
>
> the do_IRQ issue of missing an irq handler for vector 55 is gone too on
> my box.
>
> I'm pretty sure you can add our Tested-by:'s to the official patch.
>

Ok. Thanks Robert, Boris.

I have appended the patch with the updated changelog. Ingo/Peter, can
you please queue this for v3.6? I have a plan to clean this all up for
v3.7. I will work with Robert, Boris offline and post a cleaner fix for
v3.7 shortly. Thanks.
---

From: Suresh Siddha <[email protected]>
Subject: x86, apic: fix broken legacy interrupts in the logical apic mode

Recent commit 332afa656e76458ee9cf0f0d123016a0658539e4 cleaned up
a workaround that updates irq_cfg domain for legacy irq's that
are handled by the IO-APIC. This was assuming that the recent
changes in assign_irq_vector() were sufficient to remove the workaround.

But this broke a couple of AMD platforms. One of them seems to be
sending interrupts to the offline cpu's, resulting in spurious
"No irq handler for vector xx (irq -1)" messages when those cpu's come online.
And the other platform seems to always send the interrupt to the last logical
CPU (cpu-7). The recent changes had an unintended side effect of using only
logical cpu-0 in the IO-APIC RTE (during boot for the legacy interrupts),
and this stopped the legacy interrupts from getting routed to cpu-7 on the
AMD platform, resulting in a boot hang.

For now, reintroduce the removed workaround (essentially not allowing the
vector to change for legacy irq's when the io-apic starts to handle the irq,
which also addresses the unintended side effect of specifying just cpu-0 in
the IO-APIC RTE for those irq's during boot).

Reported-and-tested-by: Robert Richter <[email protected]>
Reported-and-tested-by: Borislav Petkov <[email protected]>
Signed-off-by: Suresh Siddha <[email protected]>
---
arch/x86/kernel/apic/io_apic.c | 10 ++++++++++
1 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a6c64aa..c265593 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1356,6 +1356,16 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
if (!IO_APIC_IRQ(irq))
return;

+ /*
+ * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC
+ * can handle this irq and the apic driver is finalized at this point,
+ * update the cfg->domain.
+ */
+ if (irq < legacy_pic->nr_legacy_irqs &&
+ cpumask_equal(cfg->domain, cpumask_of(0)))
+ apic->vector_allocation_domain(0, cfg->domain,
+ apic->target_cpus());
+
if (assign_irq_vector(irq, cfg, apic->target_cpus()))
return;
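
The guard works because vector_allocation_domain() pre-fills cfg->domain
with every CPU the driver can target, so the subsequent assign_irq_vector()
finds the legacy vector already valid for the whole domain and leaves it
(and therefore the boot-time RTE) alone. A minimal sketch of the two
flavours of this callback, modelled on the 3.6-era drivers (signatures and
APIC_ALL_CPUS as in that tree; treat the details as illustrative rather
than authoritative):

/* Physical-delivery drivers: a vector domain of exactly one cpu. */
static void default_vector_allocation_domain(int cpu, struct cpumask *retmask,
					     const struct cpumask *mask)
{
	cpumask_copy(retmask, cpumask_of(cpu));
}

/*
 * Logical flat delivery: all eight logical IDs share one vector domain,
 * so a vector allocated for the domain is valid on every cpu.
 */
static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask,
					  const struct cpumask *mask)
{
	cpumask_clear(retmask);
	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;	/* 0xFF: cpus 0-7 */
}

With cfg->domain widened by the flat variant before assign_irq_vector()
runs, the allocator sees that the legacy vector already serves every cpu in
the domain, so neither the vector nor the RTE programmed during boot needs
to change.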


2012-08-14 17:02:34

by Suresh Siddha

[permalink] [raw]
Subject: [tip:x86/urgent] x86, apic: fix broken legacy interrupts in the logical apic mode

Commit-ID: f1c6300183dbf5b9da25988e13f6f25a9e27151b
Gitweb: http://git.kernel.org/tip/f1c6300183dbf5b9da25988e13f6f25a9e27151b
Author: Suresh Siddha <[email protected]>
AuthorDate: Wed, 8 Aug 2012 12:16:52 -0700
Committer: H. Peter Anvin <[email protected]>
CommitDate: Tue, 14 Aug 2012 09:52:20 -0700

x86, apic: fix broken legacy interrupts in the logical apic mode

Recent commit 332afa656e76458ee9cf0f0d123016a0658539e4 cleaned up
a workaround that updates the irq_cfg domain for legacy irqs that
are handled by the IO-APIC. It assumed that the recent
changes in assign_irq_vector() were sufficient to remove the workaround.

But this broke a couple of AMD platforms. One of them seems to be
sending interrupts to offline cpus, resulting in spurious
"No irq handler for vector xx (irq -1)" messages when those cpus come online.
The other platform seems to always send the interrupt to the last logical
CPU (cpu-7). The recent changes had the unintended side effect of using only
logical cpu-0 in the IO-APIC RTE (during boot for the legacy interrupts),
which kept the legacy interrupts from being routed to cpu-7 on that AMD
platform, resulting in a boot hang.

For now, reintroduce the removed workaround (essentially not allowing the
vector to change for legacy irqs once the IO-APIC starts to handle the irq,
which also addresses the unintended side effect of specifying only cpu-0 in
the IO-APIC RTE for those irqs during boot).

Reported-and-tested-by: Robert Richter <[email protected]>
Reported-and-tested-by: Borislav Petkov <[email protected]>
Signed-off-by: Suresh Siddha <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: H. Peter Anvin <[email protected]>
---
arch/x86/kernel/apic/io_apic.c | 10 ++++++++++
1 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a6c64aa..c265593 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1356,6 +1356,16 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
if (!IO_APIC_IRQ(irq))
return;

+ /*
+ * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC
+ * can handle this irq and the apic driver is finalized at this point,
+ * update the cfg->domain.
+ */
+ if (irq < legacy_pic->nr_legacy_irqs &&
+ cpumask_equal(cfg->domain, cpumask_of(0)))
+ apic->vector_allocation_domain(0, cfg->domain,
+ apic->target_cpus());
+
if (assign_irq_vector(irq, cfg, apic->target_cpus()))
return;
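
To make the failure mode concrete: in logical flat mode the RTE destination
byte is derived directly from the bits of cfg->domain (cpu N maps to logical
ID 1 << N), so a domain containing only cpu-0 programs a destination of 0x01
and the interrupt can never reach cpu-7. A hedged sketch of that fold, in
the style of the flat driver's cpu_mask_to_apicid (the helper name here is
hypothetical):

#include <linux/cpumask.h>

#define ALL_FLAT_CPUS 0xFFu	/* mirrors APIC_ALL_CPUS in <asm/apicdef.h> */

/*
 * Flat logical mode: the destination is the low byte of the mask.
 * cfg->domain == cpumask_of(0)  ->  0x01 (only cpu-0, the bug above);
 * cfg->domain == all cpus       ->  0xFF (any of cpus 0-7 may be picked
 * by lowest-priority delivery, restoring routing to cpu-7).
 */
static unsigned int flat_dest_from_domain(const struct cpumask *domain)
{
	return cpumask_bits(domain)[0] & ALL_FLAT_CPUS;
}

After the workaround above widens cfg->domain, this fold yields 0xFF instead
of 0x01, which is exactly what un-breaks the AMD box that delivers legacy
interrupts to cpu-7.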