Several drivers (e.g., mlx4_core) do something similar to:
err = pci_enable_msix(pdev, entries, num_possible_cpus());
which takes us down this code path:
pci_enable_msix
native_setup_msi_irqs
create_irq_nr
__assign_irq_vector
__assign_irq_vector() preferentially uses vectors from low-numbered
CPUs. On a system with a large number (>256) of CPUs this can result in
a CPU running out of vectors, and subsequent attempts to assign an
interrupt to that CPU will fail.
The following patch prefers vectors from the node associated with the
device (if the device is associated with a node). This should make it
far less likely that a single CPU's vectors will be exhausted.
Signed-off-by: Arthur Kepner <[email protected]>
---
io_apic.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 77 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7cc0a72..af5f9d8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1117,6 +1117,49 @@ next:
return err;
}
+static int
+__assign_irq_vector_node(int irq, struct irq_cfg *cfg,
+ const struct cpumask *mask, int node)
+{
+ int err = -EAGAIN;
+ int cpu, best_cpu = -1, min_vector_count = NR_VECTORS;
+
+ for_each_cpu_and(cpu, cpumask_of_node(node), mask) {
+ /* find the 'best' CPU to take this vector -
+ * the one with the fewest assigned vectors is
+ * considered 'best' */
+ int i, vector_count = 0;
+
+ if (!cpu_online(cpu))
+ continue;
+
+ for (i = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
+ i < NR_VECTORS ; i++)
+ if (per_cpu(vector_irq, cpu)[i] != -1)
+ vector_count++;
+
+ if (vector_count < min_vector_count) {
+ min_vector_count = vector_count;
+ best_cpu = cpu;
+ }
+ }
+
+ if (best_cpu >= 0) {
+ cpumask_var_t tmp_mask;
+
+ if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+ return -ENOMEM;
+
+ cpumask_clear(tmp_mask);
+ cpumask_set_cpu(best_cpu, tmp_mask);
+ err = __assign_irq_vector(irq, cfg, tmp_mask);
+
+ free_cpumask_var(tmp_mask);
+ }
+
+ return err;
+}
+
int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
int err;
@@ -1128,6 +1171,39 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
return err;
}
+static int
+assign_irq_vector_node(int irq, struct irq_cfg *cfg,
+ const struct cpumask *mask, int node)
+{
+ int err;
+ unsigned long flags;
+
+ if (node == NUMA_NO_NODE)
+ return assign_irq_vector(irq, cfg, mask);
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ err = __assign_irq_vector_node(irq, cfg, mask, node);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ if (err != 0)
+ /* uh oh - try again w/o specifying a node */
+ return assign_irq_vector(irq, cfg, mask);
+ else {
+ /* and set the affinity mask so that only
+ * CPUs on 'node' will be used */
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ cpumask_and(desc->irq_data.affinity, cpu_online_mask,
+ cpumask_of_node(node));
+ desc->status |= IRQ_AFFINITY_SET;
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ }
+
+ return err;
+}
+
static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
{
int cpu, vector;
@@ -3057,7 +3133,6 @@ device_initcall(ioapic_init_sysfs);
unsigned int create_irq_nr(unsigned int from, int node)
{
struct irq_cfg *cfg;
- unsigned long flags;
unsigned int ret = 0;
int irq;
@@ -3073,10 +3148,8 @@ unsigned int create_irq_nr(unsigned int from, int node)
return 0;
}
- raw_spin_lock_irqsave(&vector_lock, flags);
- if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
+ if (!assign_irq_vector_node(irq, cfg, apic->target_cpus(), node))
ret = irq;
- raw_spin_unlock_irqrestore(&vector_lock, flags);
if (ret) {
set_irq_chip_data(irq, cfg);
On Fri, 3 Dec 2010, Arthur Kepner wrote:
>
> Several drivers (e.g., mlx4_core) do something similar to:
>
> err = pci_enable_msix(pdev, entries, num_possible_cpus());
Which is the main problem and this patch just addresses the
symptoms. As Eric pointed out earlier, this needs to be solved
differently.
There is absolutely no point assigning 4096 interrupts to a single
node in the first place. Especially not, if we end up using only a few
of them in the end. And those you use are not necessarily on that very
node.
I understand, that you want to work around your problem at hand, but
I'm pushing back on it, as it's a crappy solution which just ignores
the underlying problem. You don't even mention in your changelog
that this is a workaround and not a solution.
No, you just ignore that fact and the requests to look at the
underlying problem.
> which takes us down this code path:
>
> pci_enable_msix
> native_setup_msi_irqs
> create_irq_nr
> __assign_irq_vector
>
> __assign_irq_vector() preferentially uses vectors from low-numbered
> CPUs. On a system with a large number (>256) of CPUs this can result in
> a CPU running out of vectors, and subsequent attempts to assign an
> interrupt to that CPU will fail.
I agree, that __assign_irq_vector() should honour the request for a
node, but I don't agree, that we should magically spread stuff on
whatever node we find, when the node ran out of vectors.
There is probably a reason, why you want an interrupt on a specific
node and just silently pushing it somewhere else feels very wrong.
This needs to be solved from ground up with proper rules about failure
modes and fallback decisions.
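To make that concrete, purely as an illustration (every name below is
invented here, this is not existing kernel API): "proper rules" could
mean that the caller states the fallback policy explicitly instead of
silently getting one:

/* hypothetical: caller decides what happens when 'node' is exhausted */
enum irq_vector_fallback {
	IRQ_VECTOR_NODE_STRICT,	/* fail with -ENOSPC, caller handles it */
	IRQ_VECTOR_NODE_PREFER,	/* may spill over to other nodes */
};

int assign_irq_vector_on_node(int irq, struct irq_cfg *cfg, int node,
			      enum irq_vector_fallback fallback);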
> +static int
> +__assign_irq_vector_node(int irq, struct irq_cfg *cfg,
> + const struct cpumask *mask, int node)
> +{
> + int err = -EAGAIN;
> + int cpu, best_cpu = -1, min_vector_count = NR_VECTORS;
> +
> + for_each_cpu_and(cpu, cpumask_of_node(node), mask) {
> + /* find the 'best' CPU to take this vector -
> + * the one with the fewest assigned vectors is
> + * considered 'best' */
> + int i, vector_count = 0;
> +
> + if (!cpu_online(cpu))
> + continue;
> +
> + for (i = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
> + i < NR_VECTORS ; i++)
> + if (per_cpu(vector_irq, cpu)[i] != -1)
> + vector_count++;
Instead of having proper bookkeeping of vectors, we loop through nvec
* ncpus to figure that out? And of course we run that loop with
interrupts disabled and vector lock held.
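A minimal sketch of what such bookkeeping might look like (the per-cpu
counter below is hypothetical, not existing code; it would be updated
wherever vectors are assigned and released, under vector_lock):

/* hypothetical counter of vectors in use on each cpu */
static DEFINE_PER_CPU(unsigned int, vectors_in_use);

/* in __assign_irq_vector(), once a vector is claimed for 'cpu': */
	per_cpu(vectors_in_use, cpu)++;

/* in __clear_irq_vector(), when the vector is released: */
	per_cpu(vectors_in_use, cpu)--;

/* the scan over NR_VECTORS then collapses to a single lookup: */
	vector_count = per_cpu(vectors_in_use, cpu);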
> + if (vector_count < min_vector_count) {
> + min_vector_count = vector_count;
> + best_cpu = cpu;
> + }
> + }
> +
> + if (best_cpu >= 0) {
> + cpumask_var_t tmp_mask;
> +
> + if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
Bah. I made the create_irq_nr() atomic region small with lots of effort
and got rid of all GFP_ATOMIC allocations.
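The usual way around that - a sketch, assuming the mask can be set up
before the atomic region is entered:

	cpumask_var_t tmp_mask;

	/* sleeping GFP_KERNEL allocation, before taking vector_lock */
	if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL))
		return -ENOMEM;

	raw_spin_lock_irqsave(&vector_lock, flags);
	/* ... pick best_cpu and use tmp_mask under the lock ... */
	raw_spin_unlock_irqrestore(&vector_lock, flags);

	free_cpumask_var(tmp_mask);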
> + return -ENOMEM;
> +
> + cpumask_clear(tmp_mask);
> + cpumask_set_cpu(best_cpu, tmp_mask);
> + err = __assign_irq_vector(irq, cfg, tmp_mask);
> +
> + free_cpumask_var(tmp_mask);
> + }
> +
> + return err;
> +}
> +
> int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> {
> int err;
> @@ -1128,6 +1171,39 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> return err;
> }
>
> +static int
> +assign_irq_vector_node(int irq, struct irq_cfg *cfg,
> + const struct cpumask *mask, int node)
> +{
> + int err;
> + unsigned long flags;
> +
> + if (node == NUMA_NO_NODE)
> + return assign_irq_vector(irq, cfg, mask);
> +
> + raw_spin_lock_irqsave(&vector_lock, flags);
> + err = __assign_irq_vector_node(irq, cfg, mask, node);
> + raw_spin_unlock_irqrestore(&vector_lock, flags);
> +
> + if (err != 0)
> + /* uh oh - try again w/o specifying a node */
> + return assign_irq_vector(irq, cfg, mask);
> + else {
> + /* and set the affinity mask so that only
> + * CPUs on 'node' will be used */
> + struct irq_desc *desc = irq_to_desc(irq);
> + unsigned long flags;
> +
> + raw_spin_lock_irqsave(&desc->lock, flags);
> + cpumask_and(desc->irq_data.affinity, cpu_online_mask,
> + cpumask_of_node(node));
Which leaves us with an empty affinity mask, when the last cpu of that
node just went offline before locking desc->lock. Brilliant.
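For illustration only, a defensive variant which at least never
publishes an empty mask (it does not address the layering problem
below):

	cpumask_and(desc->irq_data.affinity, cpu_online_mask,
		    cpumask_of_node(node));
	if (cpumask_empty(desc->irq_data.affinity))
		/* node went fully offline under us - fall back */
		cpumask_copy(desc->irq_data.affinity, cpu_online_mask);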
> + desc->status |= IRQ_AFFINITY_SET;
> + raw_spin_unlock_irqrestore(&desc->lock, flags);
Aside from that, low level arch code is not supposed to fiddle in
irq_desc at will, exactly for such reasons.
As you might have noticed, I'm working on removing access to irq_desc
from random places, and I spent quite some effort to clean up the whole
irq mess. No way to put crap like this in again, leaving me to twist my
brain around next week how to clean it up.
Thanks,
tglx
On Fri, Dec 10, 2010 at 11:55:12AM +0100, Thomas Gleixner wrote:
> On Fri, 3 Dec 2010, Arthur Kepner wrote:
> >
> > Several drivers (e.g., mlx4_core) do something similar to:
> >
> > err = pci_enable_msix(pdev, entries, num_possible_cpus());
>
> Which is the main problem and this patch just addresses the
> symptoms. As Eric pointed out earlier, this needs to be solved
> differently.
>
> There is absolutely no point assigning 4096 interrupts to a single
> node in the first place.
I'm not arguing that it's a good idea for a driver to request so
many resources. But some do. And what we do now is even worse than
assigning all the interrupts to a single node.
> Especially not, if we end up using only a few
> of them in the end. And those you use are not necessarily on that very
> node.
OK. Eric mentioned in a related thread that vector allocation
might be deferred until request_irq(). That sounds like a good
idea. But unless we change the way initial vector assignment is
done, it just defers the problem (assuming that request_irq()
is done for every irq allocated in create_irq_nr()).
Using the numa_node of the device to do the initial vector
assignment seems like a reasonable default choice, no?
>
> I understand, that you want to work around your problem at hand, but
> I'm pushing back on it, as it's a crappy solution which just ignores
> the underlying problem. You don't even mention in your changelog
> that this is a workaround and not a solution.
>
> No, you just ignore that fact and the requests to look at the
> underlying problem.
>
> > which takes us down this code path:
> >
> > pci_enable_msix
> > native_setup_msi_irqs
> > create_irq_nr
> > __assign_irq_vector
> >
> > __assign_irq_vector() preferentially uses vectors from low-numbered
> > CPUs. On a system with a large number (>256) of CPUs this can result in
> > a CPU running out of vectors, and subsequent attempts to assign an
> > interrupt to that CPU will fail.
>
> I agree, that __assign_irq_vector() should honour the request for a
> node, but I don't agree, that we should magically spread stuff on
> whatever node we find, when the node ran out of vectors.
>
> There is probably a reason, why you want an interrupt on a specific
> node and just silently pushing it somewhere else feels very wrong.
>
> This needs to be solved from ground up with proper rules about failure
> modes and fallback decisions.
>
> > +static int
> > +__assign_irq_vector_node(int irq, struct irq_cfg *cfg,
> > + const struct cpumask *mask, int node)
> > +{
> > + int err = -EAGAIN;
> > + int cpu, best_cpu = -1, min_vector_count = NR_VECTORS;
> > +
> > + for_each_cpu_and(cpu, cpumask_of_node(node), mask) {
> > + /* find the 'best' CPU to take this vector -
> > + * the one with the fewest assigned vectors is
> > + * considered 'best' */
> > + int i, vector_count = 0;
> > +
> > + if (!cpu_online(cpu))
> > + continue;
> > +
> > + for (i = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
> > + i < NR_VECTORS ; i++)
> > + if (per_cpu(vector_irq, cpu)[i] != -1)
> > + vector_count++;
>
> Instead of having proper bookkeeping of vectors, we loop through nvec
> * ncpus to figure that out? And of course we run that loop with
> interrupts disabled and vector lock held.
>
> > + if (vector_count < min_vector_count) {
> > + min_vector_count = vector_count;
> > + best_cpu = cpu;
> > + }
> > + }
> > +
> > + if (best_cpu >= 0) {
> > + cpumask_var_t tmp_mask;
> > +
> > + if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
>
> Bah. I made the create_irq_nr() atomic region small with lots of effort
> and got rid of all GFP_ATOMIC allocations.
>
> > + return -ENOMEM;
> > +
> > + cpumask_clear(tmp_mask);
> > + cpumask_set_cpu(best_cpu, tmp_mask);
> > + err = __assign_irq_vector(irq, cfg, tmp_mask);
> > +
> > + free_cpumask_var(tmp_mask);
> > + }
> > +
> > + return err;
> > +}
> > +
> > int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> > {
> > int err;
> > @@ -1128,6 +1171,39 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
> > return err;
> > }
> >
> > +static int
> > +assign_irq_vector_node(int irq, struct irq_cfg *cfg,
> > + const struct cpumask *mask, int node)
> > +{
> > + int err;
> > + unsigned long flags;
> > +
> > + if (node == NUMA_NO_NODE)
> > + return assign_irq_vector(irq, cfg, mask);
> > +
> > + raw_spin_lock_irqsave(&vector_lock, flags);
> > + err = __assign_irq_vector_node(irq, cfg, mask, node);
> > + raw_spin_unlock_irqrestore(&vector_lock, flags);
> > +
> > + if (err != 0)
> > + /* uh oh - try again w/o specifying a node */
> > + return assign_irq_vector(irq, cfg, mask);
> > + else {
> > + /* and set the affinity mask so that only
> > + * CPUs on 'node' will be used */
> > + struct irq_desc *desc = irq_to_desc(irq);
> > + unsigned long flags;
> > +
> > + raw_spin_lock_irqsave(&desc->lock, flags);
> > + cpumask_and(desc->irq_data.affinity, cpu_online_mask,
> > + cpumask_of_node(node));
>
> Which leaves us with an empty affinity mask, when the last cpu of that
> node just went offline before locking desc->lock. Brilliant.
>
> > + desc->status |= IRQ_AFFINITY_SET;
> > + raw_spin_unlock_irqrestore(&desc->lock, flags);
>
> Aside from that, low level arch code is not supposed to fiddle in
> irq_desc at will, exactly for such reasons.
>
> As you might have noticed, I'm working on removing access to irq_desc
> from random places, and I spent quite some effort to clean up the whole
> irq mess. No way to put crap like this in again, leaving me to twist my
> brain around next week how to clean it up.
>
> Thanks,
>
> tglx
--
Arthur
On Thu, 16 Dec 2010, Arthur Kepner wrote:
> On Fri, Dec 10, 2010 at 11:55:12AM +0100, Thomas Gleixner wrote:
> > On Fri, 3 Dec 2010, Arthur Kepner wrote:
> > >
> > > Several drivers (e.g., mlx4_core) do something similar to:
> > >
> > > err = pci_enable_msix(pdev, entries, num_possible_cpus());
> >
> > Which is the main problem and this patch just addresses the
> > symptoms. As Eric pointed out earlier, this needs to be solved
> > differently.
> >
> > There is absolutely no point assigning 4096 interrupts to a single
> > node in the first place.
>
> I'm not arguing that it's a good idea for a driver to request so
> many resources. But some do. ...
Right, some do and we now make efforts to let them do that nonsense no
matter what?
> .... And what we do now is even worse than
> assigning all the interrupts to a single node.
We assign it to a single core, which is wrong. But what the heck is
the difference between assigning 4k irqs to a single core or to a
single node?
Nothing. And that's the whole problem.
I agree that the per node assignment needs to be resolved, but not in
the way that we just ignore the underlying problems and solve them at
the wrong place. You did not answer my comment further down in the
patch:
> > I agree, that __assign_irq_vector() should honour the request for a
> > node, but I don't agree, that we should magically spread stuff on
> > whatever node we find, when the node ran out of vectors.
> >
> > There is probably a reason, why you want an interrupt on a specific
> > node and just silently pushing it somewhere else feels very wrong.
> >
> > This needs to be solved from ground up with proper rules about failure
> > modes and fallback decisions.
Where is the answer to this ?
> > Especially not, if we end up using only a few
> > of them in the end. And those you use are not necessarily on that very
> > node.
>
> OK. Eric mentioned in a related thread that vector allocation
> might be deferred until request_irq(). That sounds like a good
> idea. But unless we change the way initial vector assignment is
It does NOT sound like a good idea. Again, I agree that we need to fix
the per node assignment, but we need to discuss the problem of
anything which goes beyond the node.
> done, it just defers the problem (assuming that request_irq()
> is done for every irq allocated in create_irq_nr()).
There is neither a reason for a driver to create so many interrupts in
the first place nor a reason to request them right away. This is
just horrible crap, nothing else. Why the hell would a driver need to
start up 4k interrupts just to load itself w/o a single user?
We do _NOT_ work around such insanity somewhere else even if we can.
> Using the numa_node of the device to do the initial vector
> assignment seems like a reasonable default choice, no?
I still agree that we should honour the node request, but that has a
totally different scope than what your patch is trying to do.
Thanks,
tglx
On Fri, Dec 17, 2010 at 10:04:48AM +0100, Thomas Gleixner wrote:
> On Thu, 16 Dec 2010, Arthur Kepner wrote:
>
> > On Fri, Dec 10, 2010 at 11:55:12AM +0100, Thomas Gleixner wrote:
> > > .....
> > > There is absolutely no point assigning 4096 interrupts to a single
> > > node in the first place.
> >
> > I'm not arguing that it's a good idea for a driver to request so
> > many resources. But some do. ...
>
> Right, some do and we now make efforts to let them do that nonsense no
> matter what?
To me, it seems better (and easier) to let drivers do nonsensical
things than to determine what is sensible for a particular driver.
How would you choose a limit on interrupt allocations?
>
> > .... And what we do now is even worse than
> > assigning all the interrupts to a single node.
>
> We assign it to a single core, which is wrong. But what the heck is
> the difference between assigning 4k irqs to a single core or to a
> single node?
>
> Nothing. And that's the whole problem.
>
> I agree that the per node assignment needs to be resolved, but not in
> the way that we just ignore the underlying problems and solve them at
> the wrong place. You did not answer my comment further down in the
> patch:
>
> > > I agree, that __assign_irq_vector() should honour the request for a
> > > node, but I don't agree, that we should magically spread stuff on
> > > whatever node we find, when the node ran out of vectors.
> > >
> > > There is probably a reason, why you want an interrupt on a specific
> > > node and just silently pushing it somewhere else feels very wrong.
> > >
> > > This needs to be solved from ground up with proper rules about failure
> > > modes and fallback decisions.
>
> Where is the answer to this ?
The patch I sent made a 'best effort' to assign an interrupt to the
node where the device is installed. If no vectors were available on
that node, it tried elsewhere.
>
> > > Especially not, if we end up using only a few
> > > of them in the end. And those you use are not necessarily on that very
> > > node.
> >
> > OK. Eric mentioned in a related thread that vector allocation
> > might be deferred until request_irq(). That sounds like a good
> > idea. But unless we change the way initial vector assignment is
>
> It does NOT sound like a good idea.
This seems at odds with what you said here:
http://marc.info/?l=linux-kernel&m=128566136604152&w=2
(specifically this bit:
8<---------------------------- [snip] ----------------------------
..... So the solution would be:
create_irq allocates an irq number + irq descriptor, nothing else
chip->startup() will setup the vector and chip->shutdown releases
it. ......
8<---------------------------- [snip] ----------------------------
)
Can you clarify?
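Here is how I read that proposal, as a rough sketch (the callback
names and wiring are hypothetical, and I'm assuming a locked wrapper
around __clear_irq_vector() for the release side):

static unsigned int msix_irq_startup(unsigned int irq)
{
	struct irq_cfg *cfg = get_irq_chip_data(irq);

	/* vector gets assigned only now, i.e. at request_irq() time */
	if (assign_irq_vector(irq, cfg, apic->target_cpus()))
		return 1;	/* no vector free - startup fails */
	/* ... program the MSI message for the chosen vector ... */
	return 0;
}

static void msix_irq_shutdown(unsigned int irq)
{
	struct irq_cfg *cfg = get_irq_chip_data(irq);

	/* give the vector back when the last user is gone */
	clear_irq_vector(irq, cfg);
}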
> ... Again, I agree that we need to fix
> the per node assignment, but we need to discuss the problem of
> anything which goes beyond the node.
Suppose that an interrupt is initially assigned to a different
node than the device. Is that such a big deal? It can be
redistributed later by a user-space irq balancer.
>
> > done, it just defers the problem (assuming that request_irq()
> > is done for every irq allocated in create_irq_nr()).
>
> There is neither a reason for a driver to create so many interrupts in
> the first place nor a reason to request them right away. This is
> just horrible crap, nothing else. Why the hell would a driver need to
> start up 4k interrupts just to load itself w/o a single user?
>
> We do _NOT_ work around such insanity somewhere else even if we can.
>
> > Using the numa_node of the device to do the initial vector
> > assignment seems like a reasonable default choice, no?
>
> I still agree that we should honour the node request, but that has a
> totally different scope than what your patch is trying to do.
>
--
Arthur