This patch adds a cpumask affinity hint to the irq_desc
structure, along with a registration function and a read-only
proc entry for each interrupt.
This affinity_hint handle for each interrupt can be used by
underlying drivers that need a better mechanism to control
interrupt affinity. The underlying driver can register a
cpumask for the interrupt, which will allow the driver to
provide the CPU mask for the interrupt to anything that
requests it. The intent is to extend the userspace daemon,
irqbalance, to help hint to it a preferred CPU mask to balance
the interrupt into.
Signed-off-by: Peter P Waskiewicz Jr <[email protected]>
---
include/linux/interrupt.h | 13 +++++++++++++
include/linux/irq.h | 1 +
kernel/irq/manage.c | 28 ++++++++++++++++++++++++++++
kernel/irq/proc.c | 33 +++++++++++++++++++++++++++++++++
4 files changed, 75 insertions(+), 0 deletions(-)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 75f3f00..9c9ea2a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -209,6 +209,9 @@ extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
extern int irq_can_set_affinity(unsigned int irq);
extern int irq_select_affinity(unsigned int irq);
+extern int irq_register_affinity_hint(unsigned int irq,
+ const struct cpumask *m);
+extern int irq_unregister_affinity_hint(unsigned int irq);
#else /* CONFIG_SMP */
static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
@@ -223,6 +226,16 @@ static inline int irq_can_set_affinity(unsigned int irq)
static inline int irq_select_affinity(unsigned int irq) { return 0; }
+static inline int irq_register_affinity_hint(unsigned int irq,
+ const struct cpumask *m)
+{
+ return -EINVAL;
+}
+
+static inline int irq_unregister_affinity_hint(unsigned int irq);
+{
+ return -EINVAL;
+}
#endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */
#ifdef CONFIG_GENERIC_HARDIRQS
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 707ab12..83b16d7 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -206,6 +206,7 @@ struct irq_desc {
struct proc_dir_entry *dir;
#endif
const char *name;
+ struct cpumask *affinity_hint;
} ____cacheline_internodealigned_in_smp;
extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 704e488..bce7e38 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,31 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
return 0;
}
+int irq_register_affinity_hint(unsigned int irq, const struct cpumask *m)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ desc->affinity_hint = m;
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(irq_register_affinity_hint);
+
+int irq_unregister_affinity_hint(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ desc->affinity_hint = NULL;
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(irq_unregister_affinity_hint);
#ifndef CONFIG_AUTO_IRQ_AFFINITY
/*
* Generic version of the affinity autoselector.
@@ -916,6 +941,9 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
desc->chip->disable(irq);
}
+ /* make sure affinity_hint is cleaned up */
+ desc->affinity_hint = NULL;
+
raw_spin_unlock_irqrestore(&desc->lock, flags);
unregister_handler_proc(irq, action);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7a6eb04..8b85f77 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -32,6 +32,23 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
return 0;
}
+static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
+{
+ struct irq_desc *desc = irq_to_desc((long)m->private);
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ if (desc->affinity_hint) {
+ seq_cpumask(m, desc->affinity_hint);
+ seq_putc(m, '\n');
+ ret = 0;
+ }
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return ret;
+}
+
#ifndef is_affinity_mask_valid
#define is_affinity_mask_valid(val) 1
#endif
@@ -84,6 +101,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file)
return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
}
+static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
+}
+
static const struct file_operations irq_affinity_proc_fops = {
.open = irq_affinity_proc_open,
.read = seq_read,
@@ -92,6 +114,13 @@ static const struct file_operations irq_affinity_proc_fops = {
.write = irq_affinity_proc_write,
};
+static const struct file_operations irq_affinity_hint_proc_fops = {
+ .open = irq_affinity_hint_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static int default_affinity_show(struct seq_file *m, void *v)
{
seq_cpumask(m, irq_default_affinity);
@@ -231,6 +260,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
/* create /proc/irq/<irq>/smp_affinity */
proc_create_data("smp_affinity", 0600, desc->dir,
&irq_affinity_proc_fops, (void *)(long)irq);
+
+ /* create /proc/irq/<irq>/affinity_hint */
+ proc_create_data("affinity_hint", 0400, desc->dir,
+ &irq_affinity_hint_proc_fops, (void *)(long)irq);
#endif
proc_create_data("spurious", 0444, desc->dir,
This patch uses the new IRQ affinity_hint callback mechanism.
It serves purely as an example of how a low-level driver can
utilize this new interface.
An official ixgbe patch will be pushed through netdev once the
IRQ patches have been accepted and merged.
Signed-off-by: Peter P Waskiewicz Jr <[email protected]>
---
drivers/net/ixgbe/ixgbe.h | 2 ++
drivers/net/ixgbe/ixgbe_main.c | 20 +++++++++++++++++++-
2 files changed, 21 insertions(+), 1 deletions(-)
diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index 79c35ae..c220b9f 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -32,6 +32,7 @@
#include <linux/pci.h>
#include <linux/netdevice.h>
#include <linux/aer.h>
+#include <linux/cpumask.h>
#include "ixgbe_type.h"
#include "ixgbe_common.h"
@@ -236,6 +237,7 @@ struct ixgbe_q_vector {
u8 tx_itr;
u8 rx_itr;
u32 eitr;
+ cpumask_var_t affinity_mask;
};
/* Helper macros to switch between ints/sec and what the register uses.
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 1b1419c..28f9d6b 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -1083,6 +1083,16 @@ static void ixgbe_configure_msix(struct ixgbe_adapter *adapter)
q_vector->eitr = adapter->rx_eitr_param;
ixgbe_write_eitr(q_vector);
+
+ /*
+ * Allocate the affinity_hint cpumask, assign the mask for
+ * this vector, and register our affinity_hint for this irq.
+ */
+ if (!alloc_cpumask_var(&q_vector->affinity_mask, GFP_KERNEL))
+ return;
+ cpumask_set_cpu(v_idx, q_vector->affinity_mask);
+ irq_register_affinity_hint(adapter->msix_entries[v_idx].vector,
+ q_vector->affinity_mask);
}
if (adapter->hw.mac.type == ixgbe_mac_82598EB)
@@ -3218,7 +3228,7 @@ void ixgbe_down(struct ixgbe_adapter *adapter)
struct ixgbe_hw *hw = &adapter->hw;
u32 rxctrl;
u32 txdctl;
- int i, j;
+ int i, j, num_q_vectors = adapter->num_msix_vectors - NON_Q_VECTORS;
/* signal that we are down to the interrupt handler */
set_bit(__IXGBE_DOWN, &adapter->state);
@@ -3251,6 +3261,14 @@ void ixgbe_down(struct ixgbe_adapter *adapter)
ixgbe_napi_disable_all(adapter);
+ for (i = 0; i < num_q_vectors; i++) {
+ struct ixgbe_q_vector *q_vector = adapter->q_vector[i];
+ /* unregister the affinity_hint */
+ irq_unregister_affinity_hint(adapter->msix_entries[i].vector);
+ /* release the CPU mask memory */
+ free_cpumask_var(q_vector->affinity_mask);
+ }
+
clear_bit(__IXGBE_SFP_MODULE_NOT_FOUND, &adapter->state);
del_timer_sync(&adapter->sfp_timer);
del_timer_sync(&adapter->watchdog_timer);
On Fri, 30 Apr 2010, Peter P Waskiewicz Jr wrote:
> This patch adds a cpumask affinity hint to the irq_desc
> structure, along with a registration function and a read-only
> proc entry for each interrupt.
>
> This affinity_hint handle for each interrupt can be used by
> underlying drivers that need a better mechanism to control
> interrupt affinity. The underlying driver can register a
> cpumask for the interrupt, which will allow the driver to
> provide the CPU mask for the interrupt to anything that
> requests it. The intent is to extend the userspace daemon,
> irqbalance, to help hint to it a preferred CPU mask to balance
> the interrupt into.
>
> Signed-off-by: Peter P Waskiewicz Jr <[email protected]>
> ---
>
> include/linux/interrupt.h | 13 +++++++++++++
> include/linux/irq.h | 1 +
> kernel/irq/manage.c | 28 ++++++++++++++++++++++++++++
> kernel/irq/proc.c | 33 +++++++++++++++++++++++++++++++++
> 4 files changed, 75 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index 75f3f00..9c9ea2a 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -209,6 +209,9 @@ extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
> extern int irq_can_set_affinity(unsigned int irq);
> extern int irq_select_affinity(unsigned int irq);
>
> +extern int irq_register_affinity_hint(unsigned int irq,
> + const struct cpumask *m);
I think we can do with a single funtion irq_set_affinity_hint() and
let the caller set the pointer to NULL.
>
> +int irq_register_affinity_hint(unsigned int irq, const struct cpumask *m)
> +{
> + struct irq_desc *desc = irq_to_desc(irq);
> + unsigned long flags;
desc needs to be checked. It might be NULL !
> +
> + raw_spin_lock_irqsave(&desc->lock, flags);
> + desc->affinity_hint = m;
> + raw_spin_unlock_irqrestore(&desc->lock, flags);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL(irq_register_affinity_hint);
EXPORT_SYMBOL_GPL please
> +static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
> +{
> + struct irq_desc *desc = irq_to_desc((long)m->private);
> + unsigned long flags;
> + int ret = -EINVAL;
> +
> + raw_spin_lock_irqsave(&desc->lock, flags);
> + if (desc->affinity_hint) {
> + seq_cpumask(m, desc->affinity_hint);
Please make a local copy under desc->mask and do the seq_cpumask()
stuff on the local copy outside of desc->lock
Thanks,
tglx
On Fri, 30 Apr 2010, Thomas Gleixner wrote:
> On Fri, 30 Apr 2010, Peter P Waskiewicz Jr wrote:
>
>> This patch adds a cpumask affinity hint to the irq_desc
>> structure, along with a registration function and a read-only
>> proc entry for each interrupt.
>>
>> This affinity_hint handle for each interrupt can be used by
>> underlying drivers that need a better mechanism to control
>> interrupt affinity. The underlying driver can register a
>> cpumask for the interrupt, which will allow the driver to
>> provide the CPU mask for the interrupt to anything that
>> requests it. The intent is to extend the userspace daemon,
>> irqbalance, to help hint to it a preferred CPU mask to balance
>> the interrupt into.
>>
>> Signed-off-by: Peter P Waskiewicz Jr <[email protected]>
>> ---
>>
>> include/linux/interrupt.h | 13 +++++++++++++
>> include/linux/irq.h | 1 +
>> kernel/irq/manage.c | 28 ++++++++++++++++++++++++++++
>> kernel/irq/proc.c | 33 +++++++++++++++++++++++++++++++++
>> 4 files changed, 75 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
>> index 75f3f00..9c9ea2a 100644
>> --- a/include/linux/interrupt.h
>> +++ b/include/linux/interrupt.h
>> @@ -209,6 +209,9 @@ extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
>> extern int irq_can_set_affinity(unsigned int irq);
>> extern int irq_select_affinity(unsigned int irq);
>>
>> +extern int irq_register_affinity_hint(unsigned int irq,
>> + const struct cpumask *m);
>
> I think we can do with a single funtion irq_set_affinity_hint() and
> let the caller set the pointer to NULL.
Ok, I've been running into some issues. If CONFIG_CPUMASK_OFFSTACK is not
set, then cpumask_var_t structs are single-element arrays that cannot be
NULL'd out. I'm pretty sure I need to keep the unregister part of the
API. Thoughts?
>> + raw_spin_lock_irqsave(&desc->lock, flags);
>> + if (desc->affinity_hint) {
>> + seq_cpumask(m, desc->affinity_hint);
>
> Please make a local copy under desc->mask and do the seq_cpumask()
> stuff on the local copy outside of desc->lock
I just looked at the original show_affinity function, and it does not grab
desc->lock before copying mask out of desc. Should I follow that model,
or should I fix that function to honor desc->lock?
-PJ
On Fri, 30 Apr 2010, Thomas Gleixner wrote:
> On Fri, 30 Apr 2010, Peter P Waskiewicz Jr wrote:
>
>> This patch adds a cpumask affinity hint to the irq_desc
>> structure, along with a registration function and a read-only
>> proc entry for each interrupt.
>>
>> This affinity_hint handle for each interrupt can be used by
>> underlying drivers that need a better mechanism to control
>> interrupt affinity. The underlying driver can register a
>> cpumask for the interrupt, which will allow the driver to
>> provide the CPU mask for the interrupt to anything that
>> requests it. The intent is to extend the userspace daemon,
>> irqbalance, to help hint to it a preferred CPU mask to balance
>> the interrupt into.
>>
>> Signed-off-by: Peter P Waskiewicz Jr <[email protected]>
>> ---
>>
>> include/linux/interrupt.h | 13 +++++++++++++
>> include/linux/irq.h | 1 +
>> kernel/irq/manage.c | 28 ++++++++++++++++++++++++++++
>> kernel/irq/proc.c | 33 +++++++++++++++++++++++++++++++++
>> 4 files changed, 75 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
>> index 75f3f00..9c9ea2a 100644
>> --- a/include/linux/interrupt.h
>> +++ b/include/linux/interrupt.h
>> @@ -209,6 +209,9 @@ extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
>> extern int irq_can_set_affinity(unsigned int irq);
>> extern int irq_select_affinity(unsigned int irq);
>>
>> +extern int irq_register_affinity_hint(unsigned int irq,
>> + const struct cpumask *m);
>
> I think we can do with a single funtion irq_set_affinity_hint() and
> let the caller set the pointer to NULL.
That works too. I like it. :-)
>
>>
>> +int irq_register_affinity_hint(unsigned int irq, const struct cpumask *m)
>> +{
>> + struct irq_desc *desc = irq_to_desc(irq);
>> + unsigned long flags;
>
> desc needs to be checked. It might be NULL !
Doh, good point!
>
>> +
>> + raw_spin_lock_irqsave(&desc->lock, flags);
>> + desc->affinity_hint = m;
>> + raw_spin_unlock_irqrestore(&desc->lock, flags);
>> +
>> + return 0;
>> +}
>> +EXPORT_SYMBOL(irq_register_affinity_hint);
>
> EXPORT_SYMBOL_GPL please
Will do.
>
>> +static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
>> +{
>> + struct irq_desc *desc = irq_to_desc((long)m->private);
>> + unsigned long flags;
>> + int ret = -EINVAL;
>> +
>> + raw_spin_lock_irqsave(&desc->lock, flags);
>> + if (desc->affinity_hint) {
>> + seq_cpumask(m, desc->affinity_hint);
>
> Please make a local copy under desc->mask and do the seq_cpumask()
> stuff on the local copy outside of desc->lock
Will do.
Cheers,
-PJ
On Fri, 30 Apr 2010, Peter P Waskiewicz Jr wrote:
> On Fri, 30 Apr 2010, Thomas Gleixner wrote:
> > > +extern int irq_register_affinity_hint(unsigned int irq,
> > > + const struct cpumask *m);
> >
> > I think we can do with a single funtion irq_set_affinity_hint() and
> > let the caller set the pointer to NULL.
>
> Ok, I've been running into some issues. If CONFIG_CPUMASK_OFFSTACK is not
> set, then cpumask_var_t structs are single-element arrays that cannot be
> NULL'd out. I'm pretty sure I need to keep the unregister part of the API.
> Thoughts?
extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
So why should calling irq_set_affinity_hint(irqnr, NULL) not work ?
> I just looked at the original show_affinity function, and it does not grab
> desc->lock before copying mask out of desc. Should I follow that model, or
> should I fix that function to honor desc->lock?
desc->affinity can only race against something changing the affinity
bits, so that just might return some random data.
In the hint case the irq could be shut down and the affinity hint
could be freed while you are accessing it. Not a good idea :)
Thanks,
tglx
On Fri, 30 Apr 2010, Thomas Gleixner wrote:
> On Fri, 30 Apr 2010, Peter P Waskiewicz Jr wrote:
>> On Fri, 30 Apr 2010, Thomas Gleixner wrote:
>>>> +extern int irq_register_affinity_hint(unsigned int irq,
>>>> + const struct cpumask *m);
>>>
>>> I think we can do with a single funtion irq_set_affinity_hint() and
>>> let the caller set the pointer to NULL.
>>
>> Ok, I've been running into some issues. If CONFIG_CPUMASK_OFFSTACK is not
>> set, then cpumask_var_t structs are single-element arrays that cannot be
>> NULL'd out. I'm pretty sure I need to keep the unregister part of the API.
>> Thoughts?
>
> extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
>
> So why should calling irq_set_affinity_hint(irqnr, NULL) not work ?
What was that you said about coffee and brain cells? :-)
>
>> I just looked at the original show_affinity function, and it does not grab
>> desc->lock before copying mask out of desc. Should I follow that model, or
>> should I fix that function to honor desc->lock?
>
> desc->affinity can only race against something changing the affinity
> bits, so that just might return some random data.
>
> In the hint case the irq could be shut down and the affinity hint
> could be freed while you are accessing it. Not a good idea :)
Good point.
Latest spin coming shortly. Thanks for the quick feedback!
-PJ