This patchset adds a new CPU mask for SMP systems to the irq_desc
struct. It also exposes an API for underlying device drivers to
assist irqbalance in making smarter decisions when balancing, especially
in a NUMA environment. For example, an ethernet driver with MSI-X may
wish to limit the CPUs an interrupt can be balanced across so that it
stays on a single NUMA node. Current irqbalance operation can move the
interrupt off that node, resulting in cross-node memory accesses and
lock contention.

The API consists of get/set calls within the kernel, along with a /proc
entry for each interrupt.
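
For illustration only (not part of this patch), a multiqueue driver
could publish its per-vector hints roughly as sketched below. The
example_adapter/example_q_vector types and example_set_node_hints()
are hypothetical, not taken from any real driver:

	#include <linux/interrupt.h>
	#include <linux/topology.h>

	/* Minimal illustrative types; a real driver has its own. */
	struct example_q_vector {
		unsigned int irq;	/* MSI-X vector for this queue pair */
	};

	struct example_adapter {
		int node;		/* home NUMA node of the device */
		int num_q_vectors;
		struct example_q_vector q_vector[16];
	};

	/* Hint that every queue vector should stay on the device's node. */
	static void example_set_node_hints(struct example_adapter *adapter)
	{
		const struct cpumask *mask = cpumask_of_node(adapter->node);
		int i;

		for (i = 0; i < adapter->num_q_vectors; i++)
			irq_set_node_affinity(adapter->q_vector[i].irq, mask);
	}
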
Signed-off-by: Peter P Waskiewicz Jr <[email protected]>
---
include/linux/interrupt.h | 8 ++++++
include/linux/irq.h | 8 ++++++
kernel/irq/manage.c | 32 +++++++++++++++++++++++++
kernel/irq/proc.c | 57 +++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 105 insertions(+), 0 deletions(-)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 75f3f00..9fd08aa 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -208,6 +208,8 @@ extern cpumask_var_t irq_default_affinity;
extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
extern int irq_can_set_affinity(unsigned int irq);
extern int irq_select_affinity(unsigned int irq);
+extern int irq_set_node_affinity(unsigned int irq,
+ const struct cpumask *cpumask);
#else /* CONFIG_SMP */
@@ -223,6 +225,12 @@ static inline int irq_can_set_affinity(unsigned int irq)
static inline int irq_select_affinity(unsigned int irq) { return 0; }
+static inline int irq_set_node_affinity(unsigned int irq,
+ const struct cpumask *m)
+{
+ return -EINVAL;
+}
+
#endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */
#ifdef CONFIG_GENERIC_HARDIRQS
diff --git a/include/linux/irq.h b/include/linux/irq.h
index ae9653d..819cda0 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -166,6 +166,7 @@ struct irq_2_iommu;
* @lock: locking for SMP
* @affinity: IRQ affinity on SMP
* @node: node index useful for balancing
+ * @node_affinity: irq mask hints for irqbalance
* @pending_mask: pending rebalanced interrupts
* @threads_active: number of irqaction threads currently running
* @wait_for_threads: wait queue for sync_irq to wait for threaded handlers
@@ -196,6 +197,7 @@ struct irq_desc {
#ifdef CONFIG_SMP
cpumask_var_t affinity;
unsigned int node;
+ cpumask_var_t node_affinity;
#ifdef CONFIG_GENERIC_PENDING_IRQ
cpumask_var_t pending_mask;
#endif
@@ -445,9 +447,15 @@ static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
return false;
+ if (!alloc_cpumask_var_node(&desc->node_affinity, gfp, node)) {
+ free_cpumask_var(desc->affinity);
+ return false;
+ }
+
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
free_cpumask_var(desc->affinity);
+ free_cpumask_var(desc->node_affinity);
return false;
}
#endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7305b29..9e80783 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,38 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
return 0;
}
+/**
+ * irq_set_node_affinity - Set the preferred CPU mask hint for an interrupt
+ * @irq: Interrupt to modify
+ * @cpumask: CPU mask to assign as the hint
+ *
+ * Stores a balancing hint for irqbalance and similar tools; it does not
+ * itself change the interrupt's current affinity.
+ */
+int irq_set_node_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ cpumask_copy(desc->node_affinity, cpumask);
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(irq_set_node_affinity);
+
+/**
+ * irq_get_node_affinity - Get the preferred CPU mask hint for an interrupt
+ * @irq: Interrupt to retrieve the hint mask for
+ *
+ * Returns the mask previously set with irq_set_node_affinity().
+ */
+struct cpumask *irq_get_node_affinity(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return desc->node_affinity;
+}
+EXPORT_SYMBOL(irq_get_node_affinity);
+
#ifndef CONFIG_AUTO_IRQ_AFFINITY
/*
* Generic version of the affinity autoselector.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 0832145..192e3fb 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -31,6 +31,16 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
return 0;
}
+static int irq_node_affinity_proc_show(struct seq_file *m, void *v)
+{
+ struct irq_desc *desc = irq_to_desc((long)m->private);
+ const struct cpumask *mask = desc->node_affinity;
+
+ seq_cpumask(m, mask);
+ seq_putc(m, '\n');
+ return 0;
+}
+
#ifndef is_affinity_mask_valid
#define is_affinity_mask_valid(val) 1
#endif
@@ -78,11 +88,46 @@ free_cpumask:
return err;
}
+static ssize_t irq_node_affinity_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
+ cpumask_var_t new_value;
+ int err;
+
+ if (no_irq_affinity || irq_balancing_disabled(irq))
+ return -EIO;
+
+ if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = cpumask_parse_user(buffer, count, new_value);
+ if (err)
+ goto free_cpumask;
+
+ if (!is_affinity_mask_valid(new_value)) {
+ err = -EINVAL;
+ goto free_cpumask;
+ }
+
+ irq_set_node_affinity(irq, new_value);
+ err = count;
+
+free_cpumask:
+ free_cpumask_var(new_value);
+ return err;
+}
+
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
}
+static int irq_node_affinity_proc_open(struct inode *inode, struct file *f)
+{
+ return single_open(f, irq_node_affinity_proc_show, PDE(inode)->data);
+}
+
static const struct file_operations irq_affinity_proc_fops = {
.open = irq_affinity_proc_open,
.read = seq_read,
@@ -91,6 +136,14 @@ static const struct file_operations irq_affinity_proc_fops = {
.write = irq_affinity_proc_write,
};
+static const struct file_operations irq_node_affinity_proc_fops = {
+ .open = irq_node_affinity_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = irq_node_affinity_proc_write,
+};
+
static int default_affinity_show(struct seq_file *m, void *v)
{
seq_cpumask(m, irq_default_affinity);
@@ -230,6 +283,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
/* create /proc/irq/<irq>/smp_affinity */
proc_create_data("smp_affinity", 0600, desc->dir,
&irq_affinity_proc_fops, (void *)(long)irq);
+
+ /* create /proc/irq/<irq>/node_affinity */
+ proc_create_data("node_affinity", 0600, desc->dir,
+ &irq_node_affinity_proc_fops, (void *)(long)irq);
#endif
proc_create_data("spurious", 0444, desc->dir,
On Tue, 24 Nov 2009, Peter P Waskiewicz Jr wrote:
> This patchset adds a new CPU mask for SMP systems to the irq_desc
> struct. It also exposes an API for underlying device drivers to
> assist irqbalance in making smarter decisions when balancing, especially
> in a NUMA environment. For example, an ethernet driver with MSI-X may
> wish to limit the CPUs that an interrupt can be balanced within to
> stay on a single NUMA node. Current irqbalance operation can move the
> interrupt off the node, resulting in cross-node memory accesses and
> locks.
>
> The API is a get/set API within the kernel, along with a /proc entry
> for the interrupt.
And what does the kernel do with this information and why are we not
using the existing device/numa_node information ?
> +extern int irq_set_node_affinity(unsigned int irq,
> + const struct cpumask *cpumask);
A node can be described with a single integer, right ?
> +static int irq_node_affinity_proc_show(struct seq_file *m, void *v)
> +{
> + struct irq_desc *desc = irq_to_desc((long)m->private);
> + const struct cpumask *mask = desc->node_affinity;
> +
> + seq_cpumask(m, mask);
> + seq_putc(m, '\n');
> + return 0;
> +}
> +
> #ifndef is_affinity_mask_valid
> #define is_affinity_mask_valid(val) 1
> #endif
> @@ -78,11 +88,46 @@ free_cpumask:
> return err;
> }
>
> +static ssize_t irq_node_affinity_proc_write(struct file *file,
> + const char __user *buffer, size_t count, loff_t *pos)
> +{
> + unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
> + cpumask_var_t new_value;
> + int err;
> +
> + if (no_irq_affinity || irq_balancing_disabled(irq))
> + return -EIO;
Yikes. Why should user space be allowed to write to that file ? And
the whole business is what for ? Storing that value in the irq_desc
data structure for user space to read out again ?
Cool design. We provide storage space for user space applications in
the kernel now ?
See also my earlier reply in the thread. This patch is just adding
code and memory bloat while not solving anything at all.
Again, this is going nowhere else than into /dev/null.
Thanks,
tglx
From: Thomas Gleixner <[email protected]>
Date: Tue, 24 Nov 2009 12:07:35 +0100 (CET)
> And what does the kernel do with this information and why are we not
> using the existing device/numa_node information ?
It's a different problem space, Thomas.
If the device lives on NUMA node X, we still end up wanting to
allocate memory resources (RX ring buffers) on other NUMA nodes on a
per-queue basis.
Otherwise a network card's forwarding performance is limited by the
memory bandwidth of a single NUMA node, and on multiqueue cards we
therefore fare much better by allocating each device RX queue's memory
resources on a different NUMA node.
It is this NUMA usage that PJ is trying to export somehow to userspace
so that irqbalanced and friends can choose the IRQ cpu masks more
intelligently.
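
For illustration of the allocation pattern being described (a sketch,
not code from any particular driver): each RX queue's ring memory can be
spread across nodes with vmalloc_node(); the structures and round-robin
policy below are hypothetical. A driver doing this would then hint each
queue's IRQ toward cpumask_of_node(ring->node) using the proposed
irq_set_node_affinity().

	#include <linux/vmalloc.h>
	#include <linux/nodemask.h>
	#include <linux/errno.h>

	struct example_rx_ring {
		void *desc;		/* descriptor/buffer memory for the queue */
		unsigned long size;	/* ring size in bytes */
		int node;		/* node this ring was allocated on */
	};

	/* Spread ring memory round-robin over the online NUMA nodes. */
	static int example_alloc_rx_rings(struct example_rx_ring *rings, int count)
	{
		int i;

		for (i = 0; i < count; i++) {
			rings[i].node = i % num_online_nodes();
			rings[i].desc = vmalloc_node(rings[i].size, rings[i].node);
			if (!rings[i].desc)
				return -ENOMEM;
		}
		return 0;
	}
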
On Tue, 24 Nov 2009, David Miller wrote:
> From: Thomas Gleixner <[email protected]>
> Date: Tue, 24 Nov 2009 12:07:35 +0100 (CET)
>
> > And what does the kernel do with this information and why are we not
> > using the existing device/numa_node information ?
>
> It's a different problem space Thomas.
>
> If the device lives on NUMA node X, we still end up wanting to
> allocate memory resources (RX ring buffers) on other NUMA nodes on a
> per-queue basis.
>
> Otherwise a network card's forwarding performance is limited by the
> memory bandwidth of a single NUMA node, and on a multiqueue cards we
> therefore fare much better by allocating each device RX queue's memory
> resources on a different NUMA node.
>
> It is this NUMA usage that PJ is trying to export somehow to userspace
> so that irqbalanced and friends can choose the IRQ cpu masks more
> intelligently.
So you need preferred irq mask information on a per-IRQ basis, and
that mask is not restricted to the CPUs of a single NUMA node, right ?
Thanks,
tglx
On Tue, 2009-11-24 at 13:56 -0800, Thomas Gleixner wrote:
> On Tue, 24 Nov 2009, David Miller wrote:
>
> > From: Thomas Gleixner <[email protected]>
> > Date: Tue, 24 Nov 2009 12:07:35 +0100 (CET)
> >
> > > And what does the kernel do with this information and why are we not
> > > using the existing device/numa_node information ?
> >
> > It's a different problem space Thomas.
> >
> > If the device lives on NUMA node X, we still end up wanting to
> > allocate memory resources (RX ring buffers) on other NUMA nodes on a
> > per-queue basis.
> >
> > Otherwise a network card's forwarding performance is limited by the
> > memory bandwidth of a single NUMA node, and on a multiqueue cards we
> > therefore fare much better by allocating each device RX queue's memory
> > resources on a different NUMA node.
> >
> > It is this NUMA usage that PJ is trying to export somehow to userspace
> > so that irqbalanced and friends can choose the IRQ cpu masks more
> > intelligently.
>
> So you need a preferred irq mask information on a per IRQ basis and
> that mask is not restricted to the CPUs of a single NUMA node, right ?
>
Just to clarify, I need a preferred CPU mask on a per IRQ basis. And
yes, that mask may not be restricted to the CPUs of a single NUMA node.
But in the normal case, the mask will be restricted to CPUs of a single
node.
Cheers,
-PJ
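
(Editorial sketch of the intended consumer, not part of the patch: an
irqbalance-like tool could read the hint and fold it into smp_affinity.
In this patch both proc files use the same hex cpumask format, so the
string can be written back directly; the helper below and its minimal
error handling are hypothetical, and a real balancer would intersect the
hint with its own placement policy rather than copy it verbatim.)

	#include <stdio.h>

	/* Apply /proc/irq/<irq>/node_affinity as the IRQ's smp_affinity. */
	static int apply_node_hint(unsigned int irq)
	{
		char path[64], mask[1024];
		FILE *f;

		snprintf(path, sizeof(path), "/proc/irq/%u/node_affinity", irq);
		f = fopen(path, "r");
		if (!f)
			return -1;
		if (!fgets(mask, sizeof(mask), f)) {
			fclose(f);
			return -1;
		}
		fclose(f);

		snprintf(path, sizeof(path), "/proc/irq/%u/smp_affinity", irq);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fputs(mask, f);
		fclose(f);
		return 0;
	}
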
On Tue, 24 Nov 2009, Peter P Waskiewicz Jr wrote:
> On Tue, 2009-11-24 at 13:56 -0800, Thomas Gleixner wrote:
> > On Tue, 24 Nov 2009, David Miller wrote:
> >
> > > From: Thomas Gleixner <[email protected]>
> > > Date: Tue, 24 Nov 2009 12:07:35 +0100 (CET)
> > >
> > > > And what does the kernel do with this information and why are we not
> > > > using the existing device/numa_node information ?
> > >
> > > It's a different problem space Thomas.
> > >
> > > If the device lives on NUMA node X, we still end up wanting to
> > > allocate memory resources (RX ring buffers) on other NUMA nodes on a
> > > per-queue basis.
> > >
> > > Otherwise a network card's forwarding performance is limited by the
> > > memory bandwidth of a single NUMA node, and on a multiqueue cards we
> > > therefore fare much better by allocating each device RX queue's memory
> > > resources on a different NUMA node.
> > >
> > > It is this NUMA usage that PJ is trying to export somehow to userspace
> > > so that irqbalanced and friends can choose the IRQ cpu masks more
> > > intelligently.
> >
> > So you need a preferred irq mask information on a per IRQ basis and
> > that mask is not restricted to the CPUs of a single NUMA node, right ?
> >
> Just to clarify, I need a preferred CPU mask on a per IRQ basis. And
> yes, that mask may not be restricted to the CPUs of a single NUMA node.
> But in the normal case, the mask will be restricted to CPUs of a single
> node.
Right, but the normal case does not help much if we also need to handle
the special case where multiple nodes are affected, which requires
another cpumask in irq_desc. That's what I really want to avoid.
I at least understand the exact problem you guys want to solve. Will
think more about it.
Thanks,
tglx
On Tue, 2009-11-24 at 14:23 -0800, Thomas Gleixner wrote:
> On Tue, 24 Nov 2009, Peter P Waskiewicz Jr wrote:
> > On Tue, 2009-11-24 at 13:56 -0800, Thomas Gleixner wrote:
> > > On Tue, 24 Nov 2009, David Miller wrote:
> > >
> > > > From: Thomas Gleixner <[email protected]>
> > > > Date: Tue, 24 Nov 2009 12:07:35 +0100 (CET)
> > > >
> > > > > And what does the kernel do with this information and why are we not
> > > > > using the existing device/numa_node information ?
> > > >
> > > > It's a different problem space Thomas.
> > > >
> > > > If the device lives on NUMA node X, we still end up wanting to
> > > > allocate memory resources (RX ring buffers) on other NUMA nodes on a
> > > > per-queue basis.
> > > >
> > > > Otherwise a network card's forwarding performance is limited by the
> > > > memory bandwidth of a single NUMA node, and on a multiqueue cards we
> > > > therefore fare much better by allocating each device RX queue's memory
> > > > resources on a different NUMA node.
> > > >
> > > > It is this NUMA usage that PJ is trying to export somehow to userspace
> > > > so that irqbalanced and friends can choose the IRQ cpu masks more
> > > > intelligently.
> > >
> > > So you need a preferred irq mask information on a per IRQ basis and
> > > that mask is not restricted to the CPUs of a single NUMA node, right ?
> > >
> > Just to clarify, I need a preferred CPU mask on a per IRQ basis. And
> > yes, that mask may not be restricted to the CPUs of a single NUMA node.
> > But in the normal case, the mask will be restricted to CPUs of a single
> > node.
>
> Right, but the normal case does not help much if we need to consider
> the special case of multiple nodes affected which requires another
> cpumask in irq_desc. That's what I really want to avoid.
>
> I at least understand the exact problem you guys want to solve. Will
> think more about it.
>
Just a friendly ping, Thomas. Any progress on your thinking about this
proposal?
Cheers,
-PJ