2009-11-20 21:11:37

by Dimitri Sivanich

[permalink] [raw]
Subject: [PATCH v6] x86/apic: limit irq affinity

This patch allows for hard numa restrictions to irq affinity on x86 systems.

Affinity is masked to allow only those cpus which the subarchitecture
deems accessible by the given irq.

On some UV systems, this domain will be limited to the nodes accessible
to the irq's node. Initially other X86 systems will not mask off any cpus
so non-UV systems will remain unaffected.

Signed-off-by: Dimitri Sivanich <[email protected]>

---

Added apic functions for getting numa irq cpumasks. Systems other than
UV now simply return the mask passed in. Added irq_desc_node for
uniprocessor compatibility.

arch/x86/Kconfig | 1
arch/x86/include/asm/apic.h | 13 +++
arch/x86/include/asm/uv/uv_irq.h | 3
arch/x86/include/asm/uv/uv_mmrs.h | 25 +++++++
arch/x86/kernel/apic/apic_flat_64.c | 4 +
arch/x86/kernel/apic/apic_noop.c | 2
arch/x86/kernel/apic/bigsmp_32.c | 2
arch/x86/kernel/apic/es7000_32.c | 4 +
arch/x86/kernel/apic/io_apic.c | 102 +++++++++++++++++++++++-------
arch/x86/kernel/apic/numaq_32.c | 2
arch/x86/kernel/apic/probe_32.c | 2
arch/x86/kernel/apic/summit_32.c | 2
arch/x86/kernel/apic/x2apic_cluster.c | 2
arch/x86/kernel/apic/x2apic_phys.c | 2
arch/x86/kernel/apic/x2apic_uv_x.c | 4 +
arch/x86/kernel/uv_irq.c | 66 +++++++++++++++++++
16 files changed, 214 insertions(+), 22 deletions(-)

Index: linux/arch/x86/kernel/apic/io_apic.c
===================================================================
--- linux.orig/arch/x86/kernel/apic/io_apic.c 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/kernel/apic/io_apic.c 2009-11-20 14:57:28.000000000 -0600
@@ -352,6 +352,18 @@ struct irq_cfg *irq_cfg(unsigned int irq

#endif

+#ifdef CONFIG_SMP
+static int irq_desc_node(struct irq_desc *desc)
+{
+ return desc->node;
+}
+#else
+static int irq_desc_node(struct irq_desc *desc)
+{
+ return 0;
+}
+#endif
+
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -1427,6 +1439,7 @@ static void setup_IO_APIC_irq(int apic_i
{
struct irq_cfg *cfg;
struct IO_APIC_route_entry entry;
+ struct cpumask *tmp_mask;
unsigned int dest;

if (!IO_APIC_IRQ(irq))
@@ -1434,10 +1447,15 @@ static void setup_IO_APIC_irq(int apic_i

cfg = desc->chip_data;

- if (assign_irq_vector(irq, cfg, apic->target_cpus()))
+ tmp_mask = apic->get_restricted_mask(apic->target_cpus(),
+ irq_desc_node(desc));
+ if (!tmp_mask)
return;

- dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
+ if (assign_irq_vector(irq, cfg, tmp_mask))
+ goto error;
+
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, tmp_mask);

apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@ -1451,7 +1469,7 @@ static void setup_IO_APIC_irq(int apic_i
printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
mp_ioapics[apic_id].apicid, pin);
__clear_irq_vector(irq, cfg);
- return;
+ goto error;
}

ioapic_register_intr(irq, desc, trigger);
@@ -1459,6 +1477,8 @@ static void setup_IO_APIC_irq(int apic_i
disable_8259A_irq(irq);

ioapic_write_entry(apic_id, pin, entry);
+error:
+ apic->free_restricted_mask(tmp_mask);
}

static struct {
@@ -2278,18 +2298,30 @@ set_desc_affinity(struct irq_desc *desc,
{
struct irq_cfg *cfg;
unsigned int irq;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return BAD_APICID;
+ struct cpumask *tmp_mask;

irq = desc->irq;
cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
+
+ tmp_mask = apic->get_restricted_mask(mask, irq_desc_node(desc));
+ if (!tmp_mask)
return BAD_APICID;

- cpumask_copy(desc->affinity, mask);
+ if (!cpumask_intersects(tmp_mask, cpu_online_mask))
+ goto error;
+
+ if (assign_irq_vector(irq, cfg, tmp_mask))
+ goto error;
+
+ cpumask_copy(desc->affinity, tmp_mask);
+
+ apic->free_restricted_mask(tmp_mask);

return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+
+error:
+ apic->free_restricted_mask(tmp_mask);
+ return BAD_APICID;
}

static int
@@ -2345,22 +2377,29 @@ migrate_ioapic_irq_desc(struct irq_desc
{
struct irq_cfg *cfg;
struct irte irte;
+ struct cpumask *tmp_mask;
unsigned int dest;
unsigned int irq;
int ret = -1;

- if (!cpumask_intersects(mask, cpu_online_mask))
+ tmp_mask = apic->get_restricted_mask(mask, irq_desc_node(desc));
+ if (!tmp_mask)
return ret;

+ if (!cpumask_intersects(tmp_mask, cpu_online_mask))
+ goto error;
+
irq = desc->irq;
if (get_irte(irq, &irte))
- return ret;
+ goto error;

cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
- return ret;
+ if (assign_irq_vector(irq, cfg, tmp_mask))
+ goto error;
+
+ ret = 0;

- dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, tmp_mask);

irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -2373,9 +2412,10 @@ migrate_ioapic_irq_desc(struct irq_desc
if (cfg->move_in_progress)
send_cleanup_vector(cfg);

- cpumask_copy(desc->affinity, mask);
-
- return 0;
+ cpumask_copy(desc->affinity, tmp_mask);
+error:
+ apic->free_restricted_mask(tmp_mask);
+ return ret;
}

/*
@@ -3244,18 +3284,27 @@ static int msi_compose_msg(struct pci_de
struct msi_msg *msg, u8 hpet_id)
{
struct irq_cfg *cfg;
+ struct irq_desc *desc;
int err;
unsigned dest;
+ struct cpumask *tmp_mask;

if (disable_apic)
return -ENXIO;

cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, apic->target_cpus());
+ desc = irq_to_desc(irq);
+
+ tmp_mask = apic->get_restricted_mask(apic->target_cpus(),
+ irq_desc_node(desc));
+ if (!tmp_mask)
+ return -ENOMEM;
+
+ err = assign_irq_vector(irq, cfg, tmp_mask);
if (err)
- return err;
+ goto error;

- dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, tmp_mask);

if (irq_remapped(irq)) {
struct irte irte;
@@ -3313,6 +3362,8 @@ static int msi_compose_msg(struct pci_de
MSI_DATA_DELIVERY_LOWPRI) |
MSI_DATA_VECTOR(cfg->vector);
}
+error:
+ apic->free_restricted_mask(tmp_mask);
return err;
}

@@ -3730,19 +3781,25 @@ static struct irq_chip ht_irq_chip = {
int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
{
struct irq_cfg *cfg;
+ struct cpumask *tmp_mask;
int err;

if (disable_apic)
return -ENXIO;

cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, apic->target_cpus());
+
+ tmp_mask = apic->get_restricted_mask(apic->target_cpus(),
+ dev_to_node(&dev->dev));
+ if (!tmp_mask)
+ return -ENOMEM;
+
+ err = assign_irq_vector(irq, cfg, tmp_mask);
if (!err) {
struct ht_irq_msg msg;
unsigned dest;

- dest = apic->cpu_mask_to_apicid_and(cfg->domain,
- apic->target_cpus());
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, tmp_mask);

msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);

@@ -3766,6 +3823,7 @@ int arch_setup_ht_irq(unsigned int irq,

dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
}
+ apic->free_restricted_mask(tmp_mask);
return err;
}
#endif /* CONFIG_HT_IRQ */
Index: linux/arch/x86/include/asm/uv/uv_mmrs.h
===================================================================
--- linux.orig/arch/x86/include/asm/uv/uv_mmrs.h 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/include/asm/uv/uv_mmrs.h 2009-11-20 13:25:39.000000000 -0600
@@ -823,6 +823,31 @@ union uvh_lb_mcast_aoerr0_rpt_enable_u {
};

/* ========================================================================= */
+/* UVH_LB_SOCKET_DESTINATION_TABLE */
+/* ========================================================================= */
+#define UVH_LB_SOCKET_DESTINATION_TABLE 0x321000UL
+#define UVH_LB_SOCKET_DESTINATION_TABLE_32 0x1800
+#define UVH_LB_SOCKET_DESTINATION_TABLE_DEPTH 128
+
+#define UVH_LB_SOCKET_DESTINATION_TABLE_NODE_ID_SHFT 1
+#define UVH_LB_SOCKET_DESTINATION_TABLE_NODE_ID_MASK 0x0000000000007ffeUL
+#define UVH_LB_SOCKET_DESTINATION_TABLE_CHIP_ID_SHFT 15
+#define UVH_LB_SOCKET_DESTINATION_TABLE_CHIP_ID_MASK 0x0000000000008000UL
+#define UVH_LB_SOCKET_DESTINATION_TABLE_PARITY_SHFT 16
+#define UVH_LB_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000000000010000UL
+
+union uvh_lb_socket_destination_table_u {
+ unsigned long v;
+ struct uvh_lb_socket_destination_table_s {
+ unsigned long rsvd_0 : 1; /* */
+ unsigned long node_id : 14; /* RW */
+ unsigned long chip_id : 1; /* RW */
+ unsigned long parity : 1; /* RW */
+ unsigned long rsvd_17_63: 47; /* */
+ } s;
+};
+
+/* ========================================================================= */
/* UVH_LOCAL_INT0_CONFIG */
/* ========================================================================= */
#define UVH_LOCAL_INT0_CONFIG 0x61000UL
Index: linux/arch/x86/Kconfig
===================================================================
--- linux.orig/arch/x86/Kconfig 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/Kconfig 2009-11-20 14:57:26.000000000 -0600
@@ -365,6 +365,7 @@ config X86_UV
depends on X86_EXTENDED_PLATFORM
depends on NUMA
depends on X86_X2APIC
+ depends on NUMA_IRQ_DESC
---help---
This option is needed in order to support SGI Ultraviolet systems.
If you don't have one of these, you should say N here.
Index: linux/arch/x86/kernel/uv_irq.c
===================================================================
--- linux.orig/arch/x86/kernel/uv_irq.c 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/kernel/uv_irq.c 2009-11-20 14:38:32.000000000 -0600
@@ -242,6 +242,72 @@ static int uv_set_irq_affinity(unsigned
return 0;
}

+static cpumask_var_t *uv_irq_cpus_allowed;
+
+struct cpumask *uv_get_restricted_mask(const struct cpumask *mask, int node)
+{
+ cpumask_var_t tmp_mask;
+ int bid;
+
+ if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+ return NULL;
+
+ if (!uv_irq_cpus_allowed || node < 0) {
+ cpumask_copy(tmp_mask, mask);
+ return tmp_mask;
+ }
+
+ bid = uv_node_to_blade_id(node);
+
+ cpumask_and(tmp_mask, mask, uv_irq_cpus_allowed[bid]);
+
+ return tmp_mask;
+}
+
+void uv_free_restricted_mask(struct cpumask *mask)
+{
+ free_cpumask_var(mask);
+}
+
+void arch_init_uv_cfg_cpus_allowed(void)
+{
+ int bid;
+
+ uv_irq_cpus_allowed = kzalloc(uv_num_possible_blades() *
+ sizeof(cpumask_var_t *), GFP_KERNEL);
+
+ if (uv_irq_cpus_allowed == NULL) {
+ printk(KERN_EMERG "Out of memory");
+ return;
+ }
+
+ for_each_possible_blade(bid) {
+ unsigned long *pa;
+ int i;
+
+ if (!zalloc_cpumask_var_node(&uv_irq_cpus_allowed[bid],
+ GFP_KERNEL, uv_blade_to_memory_nid(bid))) {
+ printk(KERN_EMERG "Out of memory on blade %d", bid);
+ return;
+ }
+
+ pa = uv_global_mmr64_address(uv_blade_to_pnode(bid),
+ UVH_LB_SOCKET_DESTINATION_TABLE);
+
+ for (i = 0; i < UVH_LB_SOCKET_DESTINATION_TABLE_DEPTH; pa++,
+ i++) {
+ int cpu;
+ int pnode = UV_NASID_TO_PNODE(*pa &
+ UVH_LB_SOCKET_DESTINATION_TABLE_NODE_ID_MASK);
+
+ for_each_possible_cpu(cpu)
+ if (uv_cpu_to_pnode(cpu) == pnode)
+ cpumask_set_cpu(cpu,
+ uv_irq_cpus_allowed[bid]);
+ }
+ }
+}
+
/*
* Set up a mapping of an available irq and vector, and enable the specified
* MMR that defines the MSI that is to be sent to the specified CPU when an
Index: linux/arch/x86/kernel/apic/x2apic_uv_x.c
===================================================================
--- linux.orig/arch/x86/kernel/apic/x2apic_uv_x.c 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/kernel/apic/x2apic_uv_x.c 2009-11-20 14:57:27.000000000 -0600
@@ -23,6 +23,7 @@

#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_irq.h>
#include <asm/current.h>
#include <asm/pgtable.h>
#include <asm/uv/bios.h>
@@ -264,6 +265,8 @@ struct apic __refdata apic_x2apic_uv_x =
.irq_dest_mode = 0, /* physical */

.target_cpus = uv_target_cpus,
+ .get_restricted_mask = uv_get_restricted_mask,
+ .free_restricted_mask = uv_free_restricted_mask,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
@@ -659,5 +662,6 @@ void __init uv_system_init(void)

uv_cpu_init();
uv_scir_register_cpu_notifier();
+ arch_init_uv_cfg_cpus_allowed();
proc_mkdir("sgi_uv", NULL);
}
Index: linux/arch/x86/include/asm/uv/uv_irq.h
===================================================================
--- linux.orig/arch/x86/include/asm/uv/uv_irq.h 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/include/asm/uv/uv_irq.h 2009-11-20 13:25:39.000000000 -0600
@@ -31,6 +31,9 @@ enum {
UV_AFFINITY_CPU
};

+extern struct cpumask *uv_get_restricted_mask(const struct cpumask *, int);
+extern void uv_free_restricted_mask(struct cpumask *);
+extern void arch_init_uv_cfg_cpus_allowed(void);
extern int uv_irq_2_mmr_info(int, unsigned long *, int *);
extern int uv_setup_irq(char *, int, int, unsigned long, int);
extern void uv_teardown_irq(unsigned int);
Index: linux/arch/x86/include/asm/apic.h
===================================================================
--- linux.orig/arch/x86/include/asm/apic.h 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/include/asm/apic.h 2009-11-20 13:25:39.000000000 -0600
@@ -293,6 +293,9 @@ struct apic {
u32 irq_dest_mode;

const struct cpumask *(*target_cpus)(void);
+ struct cpumask *(*get_restricted_mask)(const struct cpumask *mask,
+ int node);
+ void (*free_restricted_mask)(struct cpumask *mask);

int disable_esr;

@@ -474,6 +477,16 @@ static inline const struct cpumask *defa
#endif
}

+static inline struct cpumask *
+default_get_restricted_mask(const struct cpumask *mask, int node)
+{
+ return (struct cpumask *)mask;
+}
+
+static inline void default_free_restricted_mask(struct cpumask *mask)
+{
+}
+
DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);


Index: linux/arch/x86/kernel/apic/apic_flat_64.c
===================================================================
--- linux.orig/arch/x86/kernel/apic/apic_flat_64.c 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/kernel/apic/apic_flat_64.c 2009-11-20 13:25:39.000000000 -0600
@@ -174,6 +174,8 @@ struct apic apic_flat = {
.irq_dest_mode = 1, /* logical */

.target_cpus = flat_target_cpus,
+ .get_restricted_mask = default_get_restricted_mask,
+ .free_restricted_mask = default_free_restricted_mask,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
@@ -323,6 +325,8 @@ struct apic apic_physflat = {
.irq_dest_mode = 0, /* physical */

.target_cpus = physflat_target_cpus,
+ .get_restricted_mask = default_get_restricted_mask,
+ .free_restricted_mask = default_free_restricted_mask,
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
Index: linux/arch/x86/kernel/apic/apic_noop.c
===================================================================
--- linux.orig/arch/x86/kernel/apic/apic_noop.c 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/kernel/apic/apic_noop.c 2009-11-20 13:25:39.000000000 -0600
@@ -142,6 +142,8 @@ struct apic apic_noop = {
.irq_dest_mode = 1,

.target_cpus = noop_target_cpus,
+ .get_restricted_mask = default_get_restricted_mask,
+ .free_restricted_mask = default_free_restricted_mask,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = noop_check_apicid_used,
Index: linux/arch/x86/kernel/apic/bigsmp_32.c
===================================================================
--- linux.orig/arch/x86/kernel/apic/bigsmp_32.c 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/kernel/apic/bigsmp_32.c 2009-11-20 13:25:39.000000000 -0600
@@ -211,6 +211,8 @@ struct apic apic_bigsmp = {
.irq_dest_mode = 0,

.target_cpus = bigsmp_target_cpus,
+ .get_restricted_mask = default_get_restricted_mask,
+ .free_restricted_mask = default_free_restricted_mask,
.disable_esr = 1,
.dest_logical = 0,
.check_apicid_used = bigsmp_check_apicid_used,
Index: linux/arch/x86/kernel/apic/es7000_32.c
===================================================================
--- linux.orig/arch/x86/kernel/apic/es7000_32.c 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/kernel/apic/es7000_32.c 2009-11-20 13:25:39.000000000 -0600
@@ -661,6 +661,8 @@ struct apic __refdata apic_es7000_cluste
.irq_dest_mode = 1,

.target_cpus = target_cpus_cluster,
+ .get_restricted_mask = default_get_restricted_mask,
+ .free_restricted_mask = default_free_restricted_mask,
.disable_esr = 1,
.dest_logical = 0,
.check_apicid_used = es7000_check_apicid_used,
@@ -726,6 +728,8 @@ struct apic __refdata apic_es7000 = {
.irq_dest_mode = 0,

.target_cpus = es7000_target_cpus,
+ .get_restricted_mask = default_get_restricted_mask,
+ .free_restricted_mask = default_free_restricted_mask,
.disable_esr = 1,
.dest_logical = 0,
.check_apicid_used = es7000_check_apicid_used,
Index: linux/arch/x86/kernel/apic/numaq_32.c
===================================================================
--- linux.orig/arch/x86/kernel/apic/numaq_32.c 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/kernel/apic/numaq_32.c 2009-11-20 13:25:39.000000000 -0600
@@ -500,6 +500,8 @@ struct apic __refdata apic_numaq = {
.irq_dest_mode = 0,

.target_cpus = numaq_target_cpus,
+ .get_restricted_mask = default_get_restricted_mask,
+ .free_restricted_mask = default_free_restricted_mask,
.disable_esr = 1,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = numaq_check_apicid_used,
Index: linux/arch/x86/kernel/apic/probe_32.c
===================================================================
--- linux.orig/arch/x86/kernel/apic/probe_32.c 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/kernel/apic/probe_32.c 2009-11-20 13:25:39.000000000 -0600
@@ -94,6 +94,8 @@ struct apic apic_default = {
.irq_dest_mode = 1,

.target_cpus = default_target_cpus,
+ .get_restricted_mask = default_get_restricted_mask,
+ .free_restricted_mask = default_free_restricted_mask,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = default_check_apicid_used,
Index: linux/arch/x86/kernel/apic/summit_32.c
===================================================================
--- linux.orig/arch/x86/kernel/apic/summit_32.c 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/kernel/apic/summit_32.c 2009-11-20 13:25:39.000000000 -0600
@@ -517,6 +517,8 @@ struct apic apic_summit = {
.irq_dest_mode = 1,

.target_cpus = summit_target_cpus,
+ .get_restricted_mask = default_get_restricted_mask,
+ .free_restricted_mask = default_free_restricted_mask,
.disable_esr = 1,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = summit_check_apicid_used,
Index: linux/arch/x86/kernel/apic/x2apic_cluster.c
===================================================================
--- linux.orig/arch/x86/kernel/apic/x2apic_cluster.c 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/kernel/apic/x2apic_cluster.c 2009-11-20 13:25:39.000000000 -0600
@@ -198,6 +198,8 @@ struct apic apic_x2apic_cluster = {
.irq_dest_mode = 1, /* logical */

.target_cpus = x2apic_target_cpus,
+ .get_restricted_mask = default_get_restricted_mask,
+ .free_restricted_mask = default_free_restricted_mask,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
Index: linux/arch/x86/kernel/apic/x2apic_phys.c
===================================================================
--- linux.orig/arch/x86/kernel/apic/x2apic_phys.c 2009-11-20 13:25:30.000000000 -0600
+++ linux/arch/x86/kernel/apic/x2apic_phys.c 2009-11-20 13:25:39.000000000 -0600
@@ -187,6 +187,8 @@ struct apic apic_x2apic_phys = {
.irq_dest_mode = 0, /* physical */

.target_cpus = x2apic_target_cpus,
+ .get_restricted_mask = default_get_restricted_mask,
+ .free_restricted_mask = default_free_restricted_mask,
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
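
The calling convention the patch introduces, repeated across the io_apic.c
hunks above, condenses to roughly the following. This is only an illustrative
sketch: route_irq_example() is an invented wrapper, while the apic callbacks,
irq_desc_node() and assign_irq_vector() are the ones used in the patch.

static unsigned int route_irq_example(unsigned int irq, struct irq_desc *desc,
                                      struct irq_cfg *cfg)
{
        struct cpumask *tmp_mask;
        unsigned int dest;

        /*
         * Restrict the usual target cpus to what the hardware can reach
         * from this irq's node.  UV returns a freshly allocated mask;
         * every other apic hands the input mask straight back.
         */
        tmp_mask = apic->get_restricted_mask(apic->target_cpus(),
                                             irq_desc_node(desc));
        if (!tmp_mask)
                return BAD_APICID;

        if (assign_irq_vector(irq, cfg, tmp_mask)) {
                apic->free_restricted_mask(tmp_mask);
                return BAD_APICID;
        }

        dest = apic->cpu_mask_to_apicid_and(cfg->domain, tmp_mask);

        /* a no-op for the default apics, free_cpumask_var() on UV */
        apic->free_restricted_mask(tmp_mask);

        return dest;
}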


2009-11-21 18:49:50

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

Dimitri Sivanich <[email protected]> writes:

> This patch allows for hard numa restrictions to irq affinity on x86 systems.
>
> Affinity is masked to allow only those cpus which the subarchitecture
> deems accessible by the given irq.
>
> On some UV systems, this domain will be limited to the nodes accessible
> to the irq's node. Initially other X86 systems will not mask off any cpus
> so non-UV systems will remain unaffected.

Is this a hardware restriction you are trying to model?
If not this seems wrong.

Eric

2009-11-22 01:14:56

by Dimitri Sivanich

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Sat, Nov 21, 2009 at 10:49:50AM -0800, Eric W. Biederman wrote:
> Dimitri Sivanich <[email protected]> writes:
>
> > This patch allows for hard numa restrictions to irq affinity on x86 systems.
> >
> > Affinity is masked to allow only those cpus which the subarchitecture
> > deems accessible by the given irq.
> >
> > On some UV systems, this domain will be limited to the nodes accessible
> > to the irq's node. Initially other X86 systems will not mask off any cpus
> > so non-UV systems will remain unaffected.
>
> Is this a hardware restriction you are trying to model?
> If not this seems wrong.

Yes, it's a hardware restriction.

2009-11-24 13:21:34

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Sat, 21 Nov 2009, Dimitri Sivanich wrote:

> On Sat, Nov 21, 2009 at 10:49:50AM -0800, Eric W. Biederman wrote:
> > Dimitri Sivanich <[email protected]> writes:
> >
> > > This patch allows for hard numa restrictions to irq affinity on x86 systems.
> > >
> > > Affinity is masked to allow only those cpus which the subarchitecture
> > > deems accessible by the given irq.
> > >
> > > On some UV systems, this domain will be limited to the nodes accessible
> > > to the irq's node. Initially other X86 systems will not mask off any cpus
> > > so non-UV systems will remain unaffected.
> >
> > Is this a hardware restriction you are trying to model?
> > If not this seems wrong.
>
> Yes, it's a hardware restriction.

Nevertheless I think that this is the wrong approach.

What we really want is a notion in the irq descriptor which tells us:
this interrupt is restricted to numa node N.

The solution in this patch is just restricted to x86 and hides that
information deep in the arch code.

Further the patch adds code which should be in the generic interrupt
management code as it is useful for other purposes as well:

Driver folks are looking for a way to restrict irq balancing to a
given numa node when they have all the driver data allocated on that
node. That's not a hardware restriction as in the UV case but requires
a similar infrastructure.

One possible solution would be to have a new flag:
IRQF_NODE_BOUND - irq is bound to desc->node

When an interrupt is set up we would query with a new irq_chip
function chip->get_node_affinity(irq) which would default to an empty
implementation returning -1. The arch code can provide its own
function to return the numa affinity which would express the hardware
restriction.

The core code would restrict affinity settings to the cpumask of that
node without any need for the arch code to check it further.

That same infrastructure could be used for the software restriction of
interrupts to a node on which the device is bound.
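
As a rough sketch of that idea (IRQF_NODE_BOUND and the get_node_affinity()
chip hook are the proposal above and do not exist yet; the helper name and
everything else here is made up for illustration):

static int irq_restrict_to_node(struct irq_desc *desc, struct cpumask *mask)
{
        int node = -1;

        /* proposed chip hook; the default implementation would return -1 */
        if (desc->chip->get_node_affinity)
                node = desc->chip->get_node_affinity(desc->irq);

        if (node < 0)
                return 0;               /* no hardware restriction */

        /* clamp the requested affinity to the cpus of that node */
        cpumask_and(mask, mask, cpumask_of_node(node));

        return cpumask_empty(mask) ? -EINVAL : 0;
}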

Having it in the core code also allows us to expose this information
to user space so that the irq balancer knows about it and does not try
to randomly move the affinity to cpus which are not in the allowed set
of the node.

Thanks,

tglx

2009-11-24 13:40:39

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Tue, 2009-11-24 at 14:20 +0100, Thomas Gleixner wrote:
> On Sat, 21 Nov 2009, Dimitri Sivanich wrote:
>
> > On Sat, Nov 21, 2009 at 10:49:50AM -0800, Eric W. Biederman wrote:
> > > Dimitri Sivanich <[email protected]> writes:
> > >
> > > > This patch allows for hard numa restrictions to irq affinity on x86 systems.
> > > >
> > > > Affinity is masked to allow only those cpus which the subarchitecture
> > > > deems accessible by the given irq.
> > > >
> > > > On some UV systems, this domain will be limited to the nodes accessible
> > > > to the irq's node. Initially other X86 systems will not mask off any cpus
> > > > so non-UV systems will remain unaffected.
> > >
> > > Is this a hardware restriction you are trying to model?
> > > If not this seems wrong.
> >
> > Yes, it's a hardware restriction.
>
> Nevertheless I think that this is the wrong approach.
>
> What we really want is a notion in the irq descriptor which tells us:
> this interrupt is restricted to numa node N.
>
> The solution in this patch is just restricted to x86 and hides that
> information deep in the arch code.
>
> Further the patch adds code which should be in the generic interrupt
> management code as it is useful for other purposes as well:
>
> Driver folks are looking for a way to restrict irq balancing to a
> given numa node when they have all the driver data allocated on that
> node. That's not a hardware restriction as in the UV case but requires
> a similar infrastructure.
>
> One possible solution would be to have a new flag:
> IRQF_NODE_BOUND - irq is bound to desc->node
>
> When an interrupt is set up we would query with a new irq_chip
> function chip->get_node_affinity(irq) which would default to an empty
> implementation returning -1. The arch code can provide its own
> function to return the numa affinity which would express the hardware
> restriction.
>
> The core code would restrict affinity settings to the cpumask of that
> node without any need for the arch code to check it further.
>
> That same infrastructure could be used for the software restriction of
> interrupts to a node on which the device is bound.
>
> Having it in the core code also allows us to expose this information
> to user space so that the irq balancer knows about it and does not try
> to randomly move the affinity to cpus which are not in the allowed set
> of the node.

I think we should not combine these two cases.

Node-bound devices simply prefer the IRQ to be routed to a cpu 'near'
that node, hard-limiting them to that node is policy and is not
something we should do.

Defaulting to the node-mask is debatable, but is, I think, something we
could do. But I think we should allow user-space to write any mask as
long as the hardware can indeed route the IRQ that way, even when
clearly stupid.

Which is where the UV case comes in, they cannot route IRQs to every
CPU, so it makes sense to limit the possible masks being written. I do
however fully agree that that should be done in generic code, as I can
quite imagine more hardware than UV having limitations in this regard.

Furthermore, the /sysfs topology information should include IRQ routing
data in this case.

2009-11-24 13:56:22

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Tue, 24 Nov 2009, Peter Zijlstra wrote:
> On Tue, 2009-11-24 at 14:20 +0100, Thomas Gleixner wrote:
> > On Sat, 21 Nov 2009, Dimitri Sivanich wrote:
> >
> > > On Sat, Nov 21, 2009 at 10:49:50AM -0800, Eric W. Biederman wrote:
> > > > Dimitri Sivanich <[email protected]> writes:
> > > >
> > > > > This patch allows for hard numa restrictions to irq affinity on x86 systems.
> > > > >
> > > > > Affinity is masked to allow only those cpus which the subarchitecture
> > > > > deems accessible by the given irq.
> > > > >
> > > > > On some UV systems, this domain will be limited to the nodes accessible
> > > > > to the irq's node. Initially other X86 systems will not mask off any cpus
> > > > > so non-UV systems will remain unaffected.
> > > >
> > > > Is this a hardware restriction you are trying to model?
> > > > If not this seems wrong.
> > >
> > > Yes, it's a hardware restriction.
> >
> > Nevertheless I think that this is the wrong approach.
> >
> > What we really want is a notion in the irq descriptor which tells us:
> > this interrupt is restricted to numa node N.
> >
> > The solution in this patch is just restricted to x86 and hides that
> > information deep in the arch code.
> >
> > Further the patch adds code which should be in the generic interrupt
> > management code as it is useful for other purposes as well:
> >
> > Driver folks are looking for a way to restrict irq balancing to a
> > given numa node when they have all the driver data allocated on that
> > node. That's not a hardware restriction as in the UV case but requires
> > a similar infrastructure.
> >
> > One possible solution would be to have a new flag:
> > IRQF_NODE_BOUND - irq is bound to desc->node
> >
> > When an interrupt is set up we would query with a new irq_chip
> > function chip->get_node_affinity(irq) which would default to an empty
> > implementation returning -1. The arch code can provide its own
> > function to return the numa affinity which would express the hardware
> > restriction.
> >
> > The core code would restrict affinity settings to the cpumask of that
> > node without any need for the arch code to check it further.
> >
> > That same infrastructure could be used for the software restriction of
> > interrupts to a node on which the device is bound.
> >
> > Having it in the core code also allows us to expose this information
> > to user space so that the irq balancer knows about it and does not try
> > to randomly move the affinity to cpus which are not in the allowed set
> > of the node.
>
> I think we should not combine these two cases.
>
> Node-bound devices simply prefer the IRQ to be routed to a cpu 'near'
> that node, hard-limiting them to that node is policy and is not
> something we should do.
>
> Defaulting to the node-mask is debatable, but is, I think, something we
> could do. But I think we should allow user-space to write any mask as
> long as the hardware can indeed route the IRQ that way, even when
> clearly stupid.

Fair enough, but I can imagine that we want a tunable knob which
prevents that. I'm not against giving sys admins enough rope to hang
themselves, but at least we want to give them a helping hand to fight
off crappy user space applications which do not care about stupidity
at all.

> Which is where the UV case comes in, they cannot route IRQs to every
> CPU, so it makes sense to limit the possible masks being written. I do
> however fully agree that that should be done in generic code, as I can
> quite imagine more hardware than UV having limitations in this regard.

That's why I want to see it in the generic code.

> Furthermore, the /sysfs topology information should include IRQ routing
> data in this case.

Hmm, not sure about that. You'd need to scan through all the nodes to
find the set of CPUs where an irq can be routed to. I prefer to have
the information exposed by the irq enumeration (which is currently in
/proc/irq though).

Thanks,

tglx

2009-11-24 14:48:56

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Tue, 24 Nov 2009 14:55:15 +0100 (CET)
> > Furthermore, the /sysfs topology information should include IRQ
> > routing data in this case.
>
> Hmm, not sure about that. You'd need to scan through all the nodes to
> find the set of CPUs where an irq can be routed to. I prefer to have
> the information exposed by the irq enumeration (which is currently in
> /proc/irq though).

yes please.

one device can have multiple irqs
one irq can be servicing multiple devices

expressing that in sysfs is a nightmare, while
sticking it in /proc/irq *where the rest of the info is* is
much nicer for apps like irqbalance


--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2009-11-24 17:41:33

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

Arjan van de Ven <[email protected]> writes:

> On Tue, 24 Nov 2009 14:55:15 +0100 (CET)
>> > Furthermore, the /sysfs topology information should include IRQ
>> > routing data in this case.
>>
>> Hmm, not sure about that. You'd need to scan through all the nodes to
>> find the set of CPUs where an irq can be routed to. I prefer to have
>> the information exposed by the irq enumeration (which is currently in
>> /proc/irq though).
>
> yes please.
>
> one device can have multiple irqs
> one irq can be servicing multiple devices
>
> expressing that in sysfs is a nightmare, while
> sticking it in /proc/irq *where the rest of the info is* is
> much nicer for apps like irqbalance

Oii.

I don't think it is bad to export information to applications like irqbalance.

I think it pretty horrible that one of the standard ways I have heard
to improve performance on 10G nics is to kill irqbalance.

Guys. Migrating an irq from one cpu to another while the device is
running without dropping interrupts is hard.

At the point we start talking about limiting what a process with
CAP_SYS_ADMIN can do because it makes bad decisions I think something
is really broken.

Currently the irq code treats /proc/irq/N/smp_affinity as a strong hint
on where we would like interrupts to be delivered, and we don't have good
feedback from there to architecture specific code that knows what we really
can do. It is going to take some effort and some work to make that happen.
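
(For reference, that hint is nothing more than a hex cpumask written by user
space; the helper below, with error handling trimmed and an invented name, is
essentially all that irqbalance or an administrator does today.)

#include <stdio.h>

static int write_smp_affinity(unsigned int irq, unsigned long mask)
{
        char path[64];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/irq/%u/smp_affinity", irq);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%lx\n", mask);      /* e.g. "4" asks for CPU 2 only */
        return fclose(f);
}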

I think the irq scheduler is the only scheduler (except for batch jobs) that we
don't put in the kernel. It seems to me that if we are going to go to all of the
trouble to rewrite the generic code to better support irqbalance because we
are having serious irqbalance problems, it will be less effort to suck irqbalance
into the kernel along with everything else.

I really think irqbalancing belongs in the kernel. It is hard to
export all of the information we need to user space and the
information that we need to export keeps changing. Until we master
this new trend of exponentially increasing core counts that
information is going to keep changing. Today we barely know how to balance
flows across cpus. So because of the huge communication problem and
the fact that there appears to be no benefit in keeping irqbalance in
user space (there is no config file) if we are going to rework all of the
interfaces let's pull irqbalance into the kernel.


As for the UV code, what we are looking at is a fundamental irq
routing property. Those irqs cannot be routed to some cpus. That is
something the code that sets up the routes needs to be aware of.
Dimitri, could you put your extra code in assign_irq_vector instead
of in the callers of assign_irq_vector? Since the problem is not
likely to stay unique, we probably want to put the information you base
things on in struct irq_desc, but the logic seems to live best
in assign_irq_vector.

Eric

2009-11-24 18:00:51

by Waskiewicz Jr, Peter P

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Tue, 2009-11-24 at 09:41 -0800, Eric W. Biederman wrote:
> Arjan van de Ven <[email protected]> writes:
>
> > On Tue, 24 Nov 2009 14:55:15 +0100 (CET)
> >> > Furthermore, the /sysfs topology information should include IRQ
> >> > routing data in this case.
> >>
> >> Hmm, not sure about that. You'd need to scan through all the nodes to
> >> find the set of CPUs where an irq can be routed to. I prefer to have
> >> the information exposed by the irq enumeration (which is currently in
> >> /proc/irq though).
> >
> > yes please.
> >
> > one device can have multiple irqs
> > one irq can be servicing multiple devices
> >
> > expressing that in sysfs is a nightmare, while
> > sticking it in /proc/irq *where the rest of the info is* is
> > much nicer for apps like irqbalance
>
> Oii.
>
> I don't think it is bad to export information to applications like irqbalance.
>
> I think it pretty horrible that one of the standard ways I have heard
> to improve performance on 10G nics is to kill irqbalance.
>

This is something I'm actively trying to fix (see thread "irq: Add
node_affinity CPU masks for smarter irqbalance hints"). That patch may
not be the final answer, but whatever comes of it will fix the
recommendation of "killall irqbalance" for performance boosts.

Cheers,
-PJ

2009-11-24 18:22:13

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity


* Eric W. Biederman <[email protected]> wrote:

> Currently the irq code treats /proc/irq/N/smp_affinity as a strong
> hint on where we would like interrupts to be delivered, and we don't
> have good feedback from there to architecture specific code that knows
> what we really can do. It is going to take some effort and some work
> to make that happen.
>
> I think the irq scheduler is the only scheduler (except for batch
> jobs) that we don't put in the kernel. It seems to me that if we are
> going to go to all of the trouble to rewrite the generic code to
> better support irqbalance because we are having serious irqbalance
> problems, it will be less effort to suck irqbalance into the kernel
> along with everything else.
>
> I really think irqbalancing belongs in the kernel. [...]

Interesting. I've yet to see a solution that is maintainable and works
well, without putting too much policy into the kernel. Our previous
solutions didn't really work all that well.

What would your model be, and can it be implemented reasonably?

Ingo

2009-11-24 18:29:42

by Yinghai Lu

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

Ingo Molnar wrote:
> * Eric W. Biederman <[email protected]> wrote:
>
>> Currently the irq code treats /proc/irq/N/smp_affinity as a strong
>> hint on where we would like interrupts to be delivered, and we don't
>> have good feedback from there to architecture specific code that knows
>> what we really can do. It is going to take some effort and some work
>> to make that happen.
>>
>> I think the irq scheduler is the only scheduler (except for batch
>> jobs) that we don't put in the kernel. It seems to me that if we are
>> going to go to all of the trouble to rewrite the generic code to
>> better support irqbalance because we are having serious irqbalance
>> problems, it will be less effort to suck irqbalance into the kernel
>> along with everything else.
>>
>> I really think irqbalancing belongs in the kernel. [...]
>
> Interesting. I've yet to see a solution that is maintainable and works
> well, without putting too much policy into the kernel. Our previous
> solutions didn't really work all that well.
>
> What would your model be, and can it be implemented reasonably?

we already have the dev numa node, so we could just make irqbalance smart enough
to use the device's node for each irq that is bound to that device.

YH

2009-11-24 18:33:04

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Tue, 2009-11-24 at 10:27 -0800, Yinghai Lu wrote:

> we already have the dev numa node, so we could just make irqbalance smart enough
> to use the device's node for each irq that is bound to that device.

The argument is that a single device can have multiple IRQs and
therefore numa nodes.

2009-11-24 19:03:04

by Yinghai Lu

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

Peter Zijlstra wrote:
> On Tue, 2009-11-24 at 10:27 -0800, Yinghai Lu wrote:
>
>> we already have the dev numa node, so we could just make irqbalance smart enough
>> to use the device's node for each irq that is bound to that device.
>
> The argument is that a single device can have multiple IRQs and
> therefore numa nodes.

one node could have 8 cores or 16 threads, so that should be all right.

The other case: the new Intel 4-socket systems will have two IOHs, one for
socket 0 and socket 1, and one for socket 2 and socket 3.

The ACPI root bus _PXM can only return one node, so I wonder if the ACPI
spec needs to be extended to support two nodes there.

After that, we would need to extend numa_node to support that.

YH

2009-11-24 21:41:21

by Dimitri Sivanich

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Tue, Nov 24, 2009 at 09:41:18AM -0800, Eric W. Biederman wrote:
> As for the UV code, what we are looking at is a fundamental irq
> routing property. Those irqs cannot be routed to some cpus. That is
> something the code that sets up the routes needs to be aware of.

Correct. We can't allow an interrupt to be routed to an invalid node.

> Dimitri, could you put your extra code in assign_irq_vector instead
> of in the callers of assign_irq_vector? Since the problem is not
> likely to stay unique, we probably want to put the information you base
> things on in struct irq_desc, but the logic seems to live best
> in assign_irq_vector.

So you're saying continue to use the node value in irq_desc, or add a cpumask there (which will add some size to that structure)? I'll have to take another look at assign_irq_vector, but as things are currently structured, we don't return any sort of valid cpumask that we'd need for further processing in the caller functions. One would need to pass that back or store that cpumask someplace, like irq_desc?

2009-11-24 21:54:43

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Tue, 24 Nov 2009, Dimitri Sivanich wrote:

> On Tue, Nov 24, 2009 at 09:41:18AM -0800, Eric W. Biederman wrote:
> > As for the UV code, what we are looking at is a fundamental irq
> > routing property. Those irqs cannot be routed to some cpus. That is
> > something the code that sets up the routes needs to be aware of.
>
> Correct. We can't allow an interrupt to be routed to an invalid node.
>
> > Dimitri, could you put your extra code in assign_irq_vector instead
> > of in the callers of assign_irq_vector? Since the problem is not
> > likely to stay unique, we probably want to put the information you base
> > things on in struct irq_desc, but the logic seems to live best
> > in assign_irq_vector.
>
> So you're saying continue to use the node value in irq_desc, or add
> a cpumask there (which will add some size to that structure)? I'll
> have to take another look at assign_irq_vector, but as things are
> currently structured, we don't return any sort of valid cpumask that
> we'd need for further processing in the caller functions. One would
> need to pass that back or store that cpumask someplace, like
> irq_desc?

Please do not put anything complex into x86 code at all. Such designs
are likely to happen on other architectures and as I said before we
want to have

1) the decision function what's valid and not in the generic code

2) a way to expose that information as part of the irq interface to
user space.

So what's wrong with a per irq_chip function which returns the cpumask
which is valid for irq N ?

That function would be called to check the affinity mask in
set_irq_affinity and to dump the mask to /proc/irq/N/possible_cpus or
whatever name we agree on.
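
A minimal sketch of how the generic code could use such a hook (the
get_possible_cpus() name is invented here, and /proc/irq/N/possible_cpus is
only the tentative name from above; none of this exists yet):

int irq_check_affinity(unsigned int irq, const struct cpumask *requested)
{
        struct irq_desc *desc = irq_to_desc(irq);
        const struct cpumask *possible;

        /* chips without the hook impose no restriction */
        if (!desc->chip->get_possible_cpus)
                return 0;

        possible = desc->chip->get_possible_cpus(irq);

        /*
         * set_irq_affinity() would reject masks outside the valid set;
         * the same mask would be dumped to /proc/irq/N/possible_cpus.
         */
        return cpumask_subset(requested, possible) ? 0 : -EINVAL;
}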

That way we don't have to worry about where in the x86 code the
decision should reside as you simply would always get valid masks from
the core code.

That just works and is neither restricted to UV nor to x86.

Thanks,

tglx

2009-11-24 22:42:20

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

Dimitri Sivanich <[email protected]> writes:

> On Tue, Nov 24, 2009 at 09:41:18AM -0800, Eric W. Biederman wrote:
>> As for the UV code, what we are looking at is a fundamental irq
>> routing property. Those irqs cannot be routed to some cpus. That is
>> something the code that sets up the routes needs to be aware of.
>
> Correct. We can't allow an interrupt to be routed to an invalid node.
>
>> Dimitri, could you put your extra code in assign_irq_vector instead
>> of in the callers of assign_irq_vector? Since the problem is not
>> likely to stay unique, we probably want to put the information you base
>> things on in struct irq_desc, but the logic seems to live best
>> in assign_irq_vector.
>
> So you're saying continue to use the node value in irq_desc, or add
> a cpumask there (which will add some size to that structure)? I'll
> have to take another look at assign_irq_vector, but as things are
> currently structured, we don't return any sort of valid cpumask that
> we'd need for further processing in the caller functions. One would
> need to pass that back or store that cpumask someplace, like
> irq_desc?

We have cfg->domain which is the set of cpus the irq was setup to work on.
We don't need an extra return value.

All that is needed is an extra line or two up at the very top.
Something like:
cpumask_and(requested_mask, mask, irq_possible_cpus_mask);
And then use requested_mask where we use cpumask_and today.

Simple and it doesn't require extra code all over the place.
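
Folding that suggestion into a helper makes the shape clearer. This is a
sketch only: irq_possible_cpus_mask comes from the line above, and storing it
in struct irq_cfg (as well as the helper name) is an assumption, not what the
posted patch does.

static void clamp_requested_mask(struct irq_cfg *cfg,
                                 const struct cpumask *mask,
                                 struct cpumask *requested_mask)
{
        /*
         * Fold the hardware routing restriction into the caller's mask
         * once, up front; the existing vector-search loop in
         * __assign_irq_vector() would then iterate requested_mask
         * everywhere it iterates mask today.
         */
        cpumask_and(requested_mask, mask, cfg->irq_possible_cpus_mask);
}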

Eric

2009-11-24 23:06:36

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

Thomas Gleixner <[email protected]> writes:

> Please do not put anything complex into x86 code at all. Such designs
> are likely to happen on other architectures and as I said before we
> want to have
>
> 1) the decision function what's valid and not in the generic code

For the UV problem I don't have an issue. assign_irq_vector
enforces some rules that I don't see being able to expose
to user space.

> 2) a way to expose that information as part of the irq interface to
> user space.

-EINVAL?

> So what's wrong with a per irq_chip function which returns the cpumask
> which is valid for irq N ?

I have no problems with a generic function to do that.

> That function would be called to check the affinity mask in
> set_irq_affinity and to dump the mask to /proc/irq/N/possible_cpus or
> whatever name we agree on.
>
> That way we don't have to worry about where in the x86 code the
> decision should reside as you simply would always get valid masks from
> the core code.

Impossible. assign_irq_vector is the only function that can tell you
if a mask is valid or not. Currently we support roughly 240 irq
vectors per cpu, yet we support more than 240 irqs in total. I don't
see how you can enforce that limit.

Furthermore irq migration on x86 is a very non-trivial exercise.
We must wait until we get a new irq at the new location before
we cleanup the irq state at the old location, to ensure that the
state change has gone through. At which point again we can not
know.

So, Thomas, the logical conclusion of what you are requesting, an
architecture specific interface for migrating irqs that does not
need to return error codes because the generic code has enough
information to avoid all problem cases, is not going to happen.
It is totally unreasonable.

> That just works and is neither restricted to UV nor to x86.

Doing it all in the core totally fails as it gets the initial irq
assignment wrong.

Last I looked set_irq_affinity was a horribly broken interface. We
can not return error codes to user space when they ask us to do the
impossible. Right now irq->affinity is a hint that occasionally we
ignore when what it requests is impossible.

....

Thomas my apologies for ranting but I am extremely sensitive about
people placing demands on the irq code that would be very convenient
and simple for the rest of the world, except that the hardware does not
work the way people envision it should work. The worst offender is
the cpu hotunplug logic that requests we perform the impossible when
it comes to irq migration. In the case of UV I expect cpu hotplug is
going to request we migrate irqs to another node.

Right now a lot of the generic irq code is living in a deluded fantasy and
I really don't want to see more impossible requests from the irq code
added to the pile.

...

The architecture specific function setup_irq_vector has all of the
information available to it to make the decision. We use it
consistently everywhere. For the case of UV it needs to know about
another possible hardware limitation, to do its job. I am happy
if that information comes from an architecture agnostic source but
making the decision somewhere else is just a guarantee that we will
have more subtle breakage that occasionally fails for people, but at
too low a rate for people to care enough to fix.

Eric

2009-11-25 01:34:30

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

Eric,

On Tue, 24 Nov 2009, Eric W. Biederman wrote:
> Thomas Gleixner <[email protected]> writes:
>
> > Please do not put anything complex into x86 code at all. Such designs
> > are likely to happen on other architectures and as I said before we
> > want to have
> >
> > 1) the decision function what's valid and not in the generic code
>
> For the UV problem I don't have an issue. assign_irq_vector
> enforces some rules that I don't see being able to expose
> to user space.

How is that related to user space ? All I'm asking for is to move the
decision to the generic code and not add more stuff to x86/*.

This UV problem is currently unique, but it's going to pop up all over
the place.

> > 2) a way to expose that information as part of the irq interface to
> > user space.
>
> -EINVAL?

Err ? What's your problem with telling user space that an irq is
restricted to a certain set of CPUs ?

> > So what's wrong with a per irq_chip function which returns the cpumask
> > which is valid for irq N ?
>
> I have no problems with a generic function to do that.
>
> > That function would be called to check the affinity mask in
> > set_irq_affinity and to dump the mask to /proc/irq/N/possible_cpus or
> > whatever name we agree on.
> >
> > That way we don't have to worry about where in the x86 code the
> > decision should reside as you simply would always get valid masks from
> > the core code.
>
> Impossible. assign_irq_vector is the only function that can tell you
> if a mask is valid or not. Currently we support roughly 240 irq
> vectors per cpu, yet we support more than 240 irqs in total.

Interesting.

> I don't see how you can enforce that limit.

-ENOPARSE. Please excuse my limited ability to decode crypt_eric

> Furthermore irq migration on x86 is a very non-trivial exercise.

Thanks for the education, that's really new information to me.

> We must wait until we get a new irq at the new location before
> we cleanup the irq state at the old location, to ensure that the
> state change has gone through. At which point again we can not
> know.

Know what?

> So, Thomas, the logical conclusion of what you are requesting, an
> architecture specific interface for migrating irqs that does not
> need to return error codes because the generic code has enough
> information to avoid all problem cases, is not going to happen.
> It is totally unreasonable.

I did never request that at all. All I'm asking for is that we think
more about having decision functions which are valuable for more than
one esoteric use case like UV moved into generic code instead of
hacking them into some architecture space.

> > That just works and is neither restricted to UV nor to x86.
>
> Doing it all in the core totally fails as it gets the initial irq
> assignment wrong.

That's fixable. And I did not say that these decision functions can
only be used by the generic code, they can be used to do final or
initial decisions in the low level code as well. I'm just fighting
that you folks think that your problems are so unique that they need
to be solved in special ways.

> Last I looked set_irq_affinity was a horribly broken interface. We
> can not return error codes to user space when they ask us to do the
> impossible. Right now irq->affinity is a hint that occasionally we
> ignore when what it requests is impossible.

And that is a perfect excuse to refuse to think about improvements or
a reasonable replacement for that interface ?

> Thomas my apologies for ranting but I am extremely sensitive about
> people placing demands on the irq code that would be very convenient
> and simple for the rest of the world, except that the hardware does not
> work the way people envision it should work. The worst offender is
> the cpu hotunplug logic that requests we perform the impossible when
> it comes to irq migration. In the case of UV I expect cpu hotplug is
> going to request we migrate irqs to another node.

Sure, and the proper way to fix that is the arch/x86 code, right ?

> Right now a lot of the generic irq code is living in a deluded fantasy and
> I really don't want to see more impossible requests from the irq code
> added to the pile.

Sorry dude. _You_ are living in the delusion that your problems are
unique and can only be solved by adding special crafted hackery to
everything. You simply refuse to go the hard way of proving that it is
impossible to solve for the benefit of everyone just on the base of
handwaving and claiming that you are the only expert who understands
all the details of interrupt handling.

> The architecture specific function setup_irq_vector has all of the
> information available to it to make the decision. We use it
> consistently everywhere. For the case of UV it needs to know about
> another possible hardware limitation, to do its job. I am happy
> if that information comes from an architecture agnostic source but
> making the decision somewhere else is just a guarantee that we will
> have more subtle breakage that occasionally fails for people, but at
> too low a rate for people to care enough to fix.

You are completely ignoring the fact that the deluded people who
started to work on a generic irq subsystem actually had and still have
very good reasons to do so and do have a not so deluded insight into
the general and also the x86 specific problem space.

I'm well aware that every generic interface has short comings, but we
are in the kernel space where we can fix things if we _want_ and work
together on doing so.

It's just impossible to do that with people who insist on their
particular world view and I'm not talking about you alone. The same
applies to the embedded crowd and driver folks who permanently insist
on implementing "workarounds" instead of talking to the people who are
responsible and willing to fix things.

Thanks,

tglx

2009-11-25 15:38:56

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Tue, 24 Nov 2009 09:41:18 -0800
[email protected] (Eric W. Biederman) wrote:
> Oii.
>
> I don't think it is bad to export information to applications like
> irqbalance.
>
> I think it pretty horrible that one of the standard ways I have heard
> to improve performance on 10G nics is to kill irqbalance.

irqbalance does not move networking irqs; if it does there's something
evil going on in the system. But thanks for the bugreport ;)


> Guys. Migrating an irq from one cpu to another while the device is
> running without dropping interrupts is hard.

no it isn't; we've been doing it for years.


> I think the irq scheduler is the only scheduler (except for batch
> jobs) that we don't put in the kernel. It seems to me that if we are
> going to go to all of the trouble to rewrite the generic code to
> better support irqbalance because we are having serious irqbalance
> problems, it will be less effort to suck irqbalance into the kernel
> along with everything else.

we had that; it didn't work.
what I'm asking for is for the kernel to expose the numa information;
right now that is the piece that is missing.

--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2009-12-03 16:50:06

by Dimitri Sivanich

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Wed, Nov 25, 2009 at 07:40:33AM -0800, Arjan van de Ven wrote:
> On Tue, 24 Nov 2009 09:41:18 -0800
> [email protected] (Eric W. Biederman) wrote:
> > Oii.
> >
> > I don't think it is bad to export information to applications like
> > irqbalance.
> >
> > I think it pretty horrible that one of the standard ways I have heard
> > to improve performance on 10G nics is to kill irqbalance.
>
> irqbalance does not move networking irqs; if it does there's something
> evil going on in the system. But thanks for the bugreport ;)

It does move networking irqs.

>
> we had that; it didn't work.
> what I'm asking for is for the kernel to expose the numa information;
> right now that is the piece that is missing.
>

I'm wondering if we should expose that numa information in the form of a node or the set of allowed cpus, or both?

I'm guessing 'both' is the correct answer, so that apps like irqbalance can make a qualitative decision based on the node (affinity to cpus on this node is better), but an absolute decision based on allowed cpus (I cannot change affinity to anything but this set of cpus).
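
Exposing both could follow the pattern of the existing smp_affinity handler
in kernel/irq/proc.c. A sketch (the proc file names and the desc->allowed_cpus
field are hypothetical; only desc->node exists, and only on SMP builds):

static int irq_node_proc_show(struct seq_file *m, void *v)
{
        struct irq_desc *desc = irq_to_desc((long)m->private);

        /* qualitative hint: the node this irq "belongs" to */
        seq_printf(m, "%d\n", desc->node);
        return 0;
}

static int irq_allowed_cpus_proc_show(struct seq_file *m, void *v)
{
        struct irq_desc *desc = irq_to_desc((long)m->private);

        /* hard restriction: the cpus the irq can actually be routed to */
        seq_cpumask(m, desc->allowed_cpus);
        seq_putc(m, '\n');
        return 0;
}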

2009-12-03 16:53:19

by Waskiewicz Jr, Peter P

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Thu, 3 Dec 2009, Dimitri Sivanich wrote:

> On Wed, Nov 25, 2009 at 07:40:33AM -0800, Arjan van de Ven wrote:
> > On Tue, 24 Nov 2009 09:41:18 -0800
> > [email protected] (Eric W. Biederman) wrote:
> > > Oii.
> > >
> > > I don't think it is bad to export information to applications like
> > > irqbalance.
> > >
> > > I think it pretty horrible that one of the standard ways I have heard
> > > to improve performance on 10G nics is to kill irqbalance.
> >
> > irqbalance does not move networking irqs; if it does there's something
> > evil going on in the system. But thanks for the bugreport ;)
>
> It does move networking irqs.
>
> >
> > we had that; it didn't work.
> > what I'm asking for is for the kernel to expose the numa information;
> > right now that is the piece that is missing.
> >
>
> I'm wondering if we should expose that numa information in the form of a node or the set of allowed cpus, or both?
>
> I'm guessing 'both' is the correct answer, so that apps like irqbalance can make a qualitative decision based on the node (affinity to cpus on this node is better), but an absolute decision based on allowed cpus (I cannot change affinity to anything but this set of cpus).

That's exactly what my patch in the thread "irq: Add node_affinity CPU
masks for smarter irqbalance hints" is doing. I've also done the
irqbalance changes based on that kernel patch, and Arjan currently has
that patch.

Cheers,
-PJ

2009-12-03 17:01:45

by Dimitri Sivanich

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Thu, Dec 03, 2009 at 08:53:23AM -0800, Waskiewicz Jr, Peter P wrote:
> On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
>
> > On Wed, Nov 25, 2009 at 07:40:33AM -0800, Arjan van de Ven wrote:
> > > On Tue, 24 Nov 2009 09:41:18 -0800
> > > [email protected] (Eric W. Biederman) wrote:
> > > > Oii.
> > > >
> > > > I don't think it is bad to export information to applications like
> > > > irqbalance.
> > > >
> > > > I think it pretty horrible that one of the standard ways I have heard
> > > > to improve performance on 10G nics is to kill irqbalance.
> > >
> > > irqbalance does not move networking irqs; if it does there's something
> > > evil going on in the system. But thanks for the bugreport ;)
> >
> > It does move networking irqs.
> >
> > >
> > > we had that; it didn't work.
> > > what I'm asking for is for the kernel to expose the numa information;
> > > right now that is the piece that is missing.
> > >
> >
> > I'm wondering if we should expose that numa information in the form of a node or the set of allowed cpus, or both?
> >
> > I'm guessing 'both' is the correct answer, so that apps like irqbalance can make a qualitative decision based on the node (affinity to cpus on this node is better), but an absolute decision based on allowed cpus (I cannot change affinity to anything but this set of cpus).
>
> That's exactly what my patch in the thread "irq: Add node_affinity CPU
> masks for smarter irqbalance hints" is doing. I've also done the
> irqbalance changes based on that kernel patch, and Arjan currently has
> that patch.

So if I understand correctly, your patch takes care of the qualitative portion of it (we prefer to set affinity to these cpus, which may be on more than one node), but not the restrictive part of it (we cannot change affinity to anything but these cpus)?

2009-12-03 17:07:18

by Waskiewicz Jr, Peter P

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Thu, 3 Dec 2009, Dimitri Sivanich wrote:

> On Thu, Dec 03, 2009 at 08:53:23AM -0800, Waskiewicz Jr, Peter P wrote:
> > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> >
> > > On Wed, Nov 25, 2009 at 07:40:33AM -0800, Arjan van de Ven wrote:
> > > > On Tue, 24 Nov 2009 09:41:18 -0800
> > > > [email protected] (Eric W. Biederman) wrote:
> > > > > Oii.
> > > > >
> > > > > I don't think it is bad to export information to applications like
> > > > > irqbalance.
> > > > >
> > > > > I think it pretty horrible that one of the standard ways I have heard
> > > > > to improve performance on 10G nics is to kill irqbalance.
> > > >
> > > > irqbalance does not move networking irqs; if it does there's something
> > > > evil going on in the system. But thanks for the bugreport ;)
> > >
> > > It does move networking irqs.
> > >
> > > >
> > > > we had that; it didn't work.
> > > > what I'm asking for is for the kernel to expose the numa information;
> > > > right now that is the piece that is missing.
> > > >
> > >
> > > I'm wondering if we should expose that numa information in the form of a node or the set of allowed cpus, or both?
> > >
> > > I'm guessing 'both' is the correct answer, so that apps like irqbalance can make a qualitative decision based on the node (affinity to cpus on this node is better), but an absolute decision based on allowed cpus (I cannot change affinity to anything but this set of cpus).
> >
> > That's exactly what my patch in the thread "irq: Add node_affinity CPU
> > masks for smarter irqbalance hints" is doing. I've also done the
> > irqbalance changes based on that kernel patch, and Arjan currently has
> > that patch.
>
> So if I understand correctly, you're patch takes care of the qualitative portion of it (we prefer to set affinity to these cpus, which may be on more than one node), but not the restrictive part of it (we cannot change affinity to anything but these cpus)?

That is correct. The patch provides an interface to both the kernel
(functions) and /proc for userspace to set a CPU mask. That is the
preferred mask for the interrupt to be balanced on. Then irqbalance will
make decisions on how to balance within that provided mask, if it in fact
has been provided.
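
For illustration only, here is a minimal userspace sketch of writing such a preferred mask, assuming the proc file is called "node_affinity" and takes the same hex bitmask format as the existing smp_affinity file (both the file name and the format are assumptions, not taken from the actual patch):

#include <stdio.h>

/*
 * Hypothetical example: suggest CPUs 0-3 (mask 0xf) for IRQ 42.
 * "/proc/irq/42/node_affinity" is a made-up path modeled on the
 * existing /proc/irq/<N>/smp_affinity interface.
 */
int main(void)
{
	FILE *f = fopen("/proc/irq/42/node_affinity", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%x\n", 0xf);
	fclose(f);
	return 0;
}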

Cheers,
-PJ

2009-12-03 17:19:43

by Dimitri Sivanich

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Thu, Dec 03, 2009 at 09:07:21AM -0800, Waskiewicz Jr, Peter P wrote:
> On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
>
> > On Thu, Dec 03, 2009 at 08:53:23AM -0800, Waskiewicz Jr, Peter P wrote:
> > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > >
> > > > On Wed, Nov 25, 2009 at 07:40:33AM -0800, Arjan van de Ven wrote:
> > > > > On Tue, 24 Nov 2009 09:41:18 -0800
> > > > > [email protected] (Eric W. Biederman) wrote:
> > > > > > Oii.
> > > > > >
> > > > > > I don't think it is bad to export information to applications like
> > > > > > irqbalance.
> > > > > >
> > > > > > I think it pretty horrible that one of the standard ways I have heard
> > > > > > to improve performance on 10G nics is to kill irqbalance.
> > > > >
> > > > > irqbalance does not move networking irqs; if it does there's something
> > > > > evil going on in the system. But thanks for the bugreport ;)
> > > >
> > > > It does move networking irqs.
> > > >
> > > > >
> > > > > we had that; it didn't work.
> > > > > what I'm asking for is for the kernel to expose the numa information;
> > > > > right now that is the piece that is missing.
> > > > >
> > > >
> > > > I'm wondering if we should expose that numa information in the form of a node or the set of allowed cpus, or both?
> > > >
> > > > I'm guessing 'both' is the correct answer, so that apps like irqbalance can make a qualitative decision based on the node (affinity to cpus on this node is better), but an absolute decision based on allowed cpus (I cannot change affinity to anything but this set of cpus).
> > >
> > > That's exactly what my patch in the thread "irq: Add node_affinity CPU
> > > masks for smarter irqbalance hints" is doing. I've also done the
> > > irqbalance changes based on that kernel patch, and Arjan currently has
> > > that patch.
> >
> > So if I understand correctly, you're patch takes care of the qualitative portion of it (we prefer to set affinity to these cpus, which may be on more than one node), but not the restrictive part of it (we cannot change affinity to anything but these cpus)?
>
> That is correct. The patch provides an interface to both the kernel
> (functions) and /proc for userspace to set a CPU mask. That is the
> preferred mask for the interrupt to be balanced on. Then irqbalance will
> make decisions on how to balance within that provided mask, if it in fact
> has been provided.

What if it's not provided? Will irqbalance make decisions based on the numa_node of that irq (I would hope)?

Also, can we add a restricted mask as I mention above into this scheme? If we can't send an IRQ to some node, we don't want to bother attempting to change affinity to cpus on that node (hopefully code in the kernel will eventually restrict this).

As a matter of fact, drivers allocating rings, buffers, and queues on other nodes should optimally be made aware of the restriction.

2009-12-03 18:50:41

by Waskiewicz Jr, Peter P

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Thu, 3 Dec 2009, Dimitri Sivanich wrote:

> On Thu, Dec 03, 2009 at 09:07:21AM -0800, Waskiewicz Jr, Peter P wrote:
> > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> >
> > > On Thu, Dec 03, 2009 at 08:53:23AM -0800, Waskiewicz Jr, Peter P wrote:
> > > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > > >
> > > > > On Wed, Nov 25, 2009 at 07:40:33AM -0800, Arjan van de Ven wrote:
> > > > > > On Tue, 24 Nov 2009 09:41:18 -0800
> > > > > > [email protected] (Eric W. Biederman) wrote:
> > > > > > > Oii.
> > > > > > >
> > > > > > > I don't think it is bad to export information to applications like
> > > > > > > irqbalance.
> > > > > > >
> > > > > > > I think it pretty horrible that one of the standard ways I have heard
> > > > > > > to improve performance on 10G nics is to kill irqbalance.
> > > > > >
> > > > > > irqbalance does not move networking irqs; if it does there's something
> > > > > > evil going on in the system. But thanks for the bugreport ;)
> > > > >
> > > > > It does move networking irqs.
> > > > >
> > > > > >
> > > > > > we had that; it didn't work.
> > > > > > what I'm asking for is for the kernel to expose the numa information;
> > > > > > right now that is the piece that is missing.
> > > > > >
> > > > >
> > > > > I'm wondering if we should expose that numa information in the form of a node or the set of allowed cpus, or both?
> > > > >
> > > > > I'm guessing 'both' is the correct answer, so that apps like irqbalance can make a qualitative decision based on the node (affinity to cpus on this node is better), but an absolute decision based on allowed cpus (I cannot change affinity to anything but this set of cpus).
> > > >
> > > > That's exactly what my patch in the thread "irq: Add node_affinity CPU
> > > > masks for smarter irqbalance hints" is doing. I've also done the
> > > > irqbalance changes based on that kernel patch, and Arjan currently has
> > > > that patch.
> > >
> > > So if I understand correctly, you're patch takes care of the qualitative portion of it (we prefer to set affinity to these cpus, which may be on more than one node), but not the restrictive part of it (we cannot change affinity to anything but these cpus)?
> >
> > That is correct. The patch provides an interface to both the kernel
> > (functions) and /proc for userspace to set a CPU mask. That is the
> > preferred mask for the interrupt to be balanced on. Then irqbalance will
> > make decisions on how to balance within that provided mask, if it in fact
> > has been provided.
>
> What if it's not provided? Will irqbalance make decisions based on the numa_node of that irq (I would hope)?

If it's not provided, then irqbalance will continue to do what it does
today. No changes.

>
> Also, can we add a restricted mask as I mention above into this scheme? If we can't send an IRQ to some node, we don't want to bother attempting to change affinity to cpus on that node (hopefully code in the kernel will eventually restrict this).
>

The interface allows you to put in any CPU mask. The way it's written
now, whatever mask you put in, irqbalance *only* balances within that
mask. It won't ever try and go outside that mask.

> As a matter of fact, driver's allocating rings, buffers, queues on other nodes should optimally be made aware of the restriction.

The idea is that the driver will do its memory allocations for everything
across nodes. When it does that, it will use the kernel interface
(function call) to set the corresponding mask it wants for those queue
resources. That is my end-goal for this code.

Cheers,
-PJ

2009-12-04 16:42:30

by Dimitri Sivanich

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Thu, Dec 03, 2009 at 10:50:47AM -0800, Waskiewicz Jr, Peter P wrote:
> On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
>
> > On Thu, Dec 03, 2009 at 09:07:21AM -0800, Waskiewicz Jr, Peter P wrote:
> > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > >
> > > > On Thu, Dec 03, 2009 at 08:53:23AM -0800, Waskiewicz Jr, Peter P wrote:
> > > > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > > > >
> > > > > > On Wed, Nov 25, 2009 at 07:40:33AM -0800, Arjan van de Ven wrote:
> > > > > > > On Tue, 24 Nov 2009 09:41:18 -0800
> > > > > > > [email protected] (Eric W. Biederman) wrote:
> > > > > > > > Oii.
> > > > > > > >
> > > > > > > > I don't think it is bad to export information to applications like
> > > > > > > > irqbalance.
> > > > > > > >
> > > > > > > > I think it pretty horrible that one of the standard ways I have heard
> > > > > > > > to improve performance on 10G nics is to kill irqbalance.
> > > > > > >
> > > > > > > irqbalance does not move networking irqs; if it does there's something
> > > > > > > evil going on in the system. But thanks for the bugreport ;)
> > > > > >
> > > > > > It does move networking irqs.
> > > > > >
> > > > > > >
> > > > > > > we had that; it didn't work.
> > > > > > > what I'm asking for is for the kernel to expose the numa information;
> > > > > > > right now that is the piece that is missing.
> > > > > > >
> > > > > >
> > > > > > I'm wondering if we should expose that numa information in the form of a node or the set of allowed cpus, or both?
> > > > > >
> > > > > > I'm guessing 'both' is the correct answer, so that apps like irqbalance can make a qualitative decision based on the node (affinity to cpus on this node is better), but an absolute decision based on allowed cpus (I cannot change affinity to anything but this set of cpus).
> > > > >
> > > > > That's exactly what my patch in the thread "irq: Add node_affinity CPU
> > > > > masks for smarter irqbalance hints" is doing. I've also done the
> > > > > irqbalance changes based on that kernel patch, and Arjan currently has
> > > > > that patch.
> > > >
> > > > So if I understand correctly, you're patch takes care of the qualitative portion of it (we prefer to set affinity to these cpus, which may be on more than one node), but not the restrictive part of it (we cannot change affinity to anything but these cpus)?
> > >
> > > That is correct. The patch provides an interface to both the kernel
> > > (functions) and /proc for userspace to set a CPU mask. That is the
> > > preferred mask for the interrupt to be balanced on. Then irqbalance will
> > > make decisions on how to balance within that provided mask, if it in fact
> > > has been provided.
> >
> > What if it's not provided? Will irqbalance make decisions based on the numa_node of that irq (I would hope)?
>
> If it's not provided, then irqbalance will continue to do what it does
> today. No changes.

So no numa awareness for ethernet irqs.

I'm wondering what needs to be exposed in proc/irq at this point.

Do we expose separate cpu masks for everything? There are really 3 possible pieces of affinity information: your node_affinity (optionally selected by the driver according to allocations), my restricted_affinity (set by the specific arch), and numa_node affinity (the 'home' node of the device). Do we show cpumasks for all of these, or maybe show numa_node in place of a cpumask for the 'home' node of the device?

With all of that information apps like irqbalance should be able to make some good decisions, but things can get confusing if there are too many masks.

Also, if I manually change affinity, irqbalance can change that affinity from under me, correct? That's fine as long as it's stated that that's how things will work (turn off irqbalance or run oneshot mode for manual setting).

>
> >
> > Also, can we add a restricted mask as I mention above into this scheme? If we can't send an IRQ to some node, we don't want to bother attempting to change affinity to cpus on that node (hopefully code in the kernel will eventually restrict this).
> >
>
> The interface allows you to put in any CPU mask. The way it's written
> now, whatever mask you put in, irqbalance *only* balances within that
> mask. It won't ever try and go outside that mask.

OK. Given that, it might be nice to combine the restricted cpus that I'm describing with your node_affinity mask, but we could expose them as separate masks (node_affinity and restricted_affinity, as I describe above).

>
> > As a matter of fact, driver's allocating rings, buffers, queues on other nodes should optimally be made aware of the restriction.
>
> The idea is that the driver will do its memory allocations for everything
> across nodes. When it does that, it will use the kernel interface
> (function call) to set the corresponding mask it wants for those queue
> resources. That is my end-goal for this code.
>

OK, but we will eventually have to reject any irqbalance attempts to send irqs to restricted nodes.

2009-12-04 21:17:55

by Waskiewicz Jr, Peter P

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Fri, 2009-12-04 at 09:42 -0700, Dimitri Sivanich wrote:
> On Thu, Dec 03, 2009 at 10:50:47AM -0800, Waskiewicz Jr, Peter P wrote:
> > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> >
> > > On Thu, Dec 03, 2009 at 09:07:21AM -0800, Waskiewicz Jr, Peter P wrote:
> > > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > > >
> > > > > On Thu, Dec 03, 2009 at 08:53:23AM -0800, Waskiewicz Jr, Peter P wrote:
> > > > > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > > > > >
> > > > > > > On Wed, Nov 25, 2009 at 07:40:33AM -0800, Arjan van de Ven wrote:
> > > > > > > > On Tue, 24 Nov 2009 09:41:18 -0800
> > > > > > > > [email protected] (Eric W. Biederman) wrote:
> > > > > > > > > Oii.
> > > > > > > > >
> > > > > > > > > I don't think it is bad to export information to applications like
> > > > > > > > > irqbalance.
> > > > > > > > >
> > > > > > > > > I think it pretty horrible that one of the standard ways I have heard
> > > > > > > > > to improve performance on 10G nics is to kill irqbalance.
> > > > > > > >
> > > > > > > > irqbalance does not move networking irqs; if it does there's something
> > > > > > > > evil going on in the system. But thanks for the bugreport ;)
> > > > > > >
> > > > > > > It does move networking irqs.
> > > > > > >
> > > > > > > >
> > > > > > > > we had that; it didn't work.
> > > > > > > > what I'm asking for is for the kernel to expose the numa information;
> > > > > > > > right now that is the piece that is missing.
> > > > > > > >
> > > > > > >
> > > > > > > I'm wondering if we should expose that numa information in the form of a node or the set of allowed cpus, or both?
> > > > > > >
> > > > > > > I'm guessing 'both' is the correct answer, so that apps like irqbalance can make a qualitative decision based on the node (affinity to cpus on this node is better), but an absolute decision based on allowed cpus (I cannot change affinity to anything but this set of cpus).
> > > > > >
> > > > > > That's exactly what my patch in the thread "irq: Add node_affinity CPU
> > > > > > masks for smarter irqbalance hints" is doing. I've also done the
> > > > > > irqbalance changes based on that kernel patch, and Arjan currently has
> > > > > > that patch.
> > > > >
> > > > > So if I understand correctly, you're patch takes care of the qualitative portion of it (we prefer to set affinity to these cpus, which may be on more than one node), but not the restrictive part of it (we cannot change affinity to anything but these cpus)?
> > > >
> > > > That is correct. The patch provides an interface to both the kernel
> > > > (functions) and /proc for userspace to set a CPU mask. That is the
> > > > preferred mask for the interrupt to be balanced on. Then irqbalance will
> > > > make decisions on how to balance within that provided mask, if it in fact
> > > > has been provided.
> > >
> > > What if it's not provided? Will irqbalance make decisions based on the numa_node of that irq (I would hope)?
> >
> > If it's not provided, then irqbalance will continue to do what it does
> > today. No changes.
>
> So no numa awareness for ethernet irqs.
>

Nothing that makes sense, no.

> I'm wondering what needs to be exposed in proc/irq at this point.
>
> Do we expose separate cpu masks for everything? There are really 3 possible pieces of affinity information: your node_affinity (optionally selected by the driver according to allocations), my restricted_affinity (set by the specific arch), and numa_node affinity (the 'home' node of the device). Do we show cpumasks for all of these, or maybe show numa_node in place of a cpumask for the 'home' node of the device?
>
> With all of that information apps like irqbalance should be able to make some good decisions, but things can get confusing if there are too many masks.
>
> Also, if I manually change affinity, irqbalance can change that affinity from under me, correct? That's fine as long as it's stated that that's how things will work (turn off irqbalance or run oneshot mode for manual setting).
>

If you change /proc/irq/<num>/smp_affinity, yes, irqbalance can change
that after the fact.

> >
> > >
> > > Also, can we add a restricted mask as I mention above into this scheme? If we can't send an IRQ to some node, we don't want to bother attempting to change affinity to cpus on that node (hopefully code in the kernel will eventually restrict this).
> > >
> >
> > The interface allows you to put in any CPU mask. The way it's written
> > now, whatever mask you put in, irqbalance *only* balances within that
> > mask. It won't ever try and go outside that mask.
>
> OK. Given that, it might be nice to combine the restricted cpus that I'm describing with your node_affinity mask, but we could expose them as separate masks (node_affinity and restricted_affinity, as I describe above).
>

I think this might be getting too complicated. The only thing
irqbalance is lacking today, in my mind, is the feedback mechanism,
telling it what subset of CPU masks to balance within. There is an
allowed_mask, but that is used for a different purpose, which is why I
added another. But I think your needs can be met 100% with what I have
already, and we can come up with a different name that's more generic.
The flows would be something like this:

Driver:
- Driver comes online, allocates memory in a sensible NUMA fashion
- Driver requests kernel for interrupts, ties them into handlers
- Driver now sets a NUMA-friendly affinity for each interrupt, to match
with its initial memory allocation
- irqbalance balances interrupts within their new "hinted" affinities.

Other:
- System comes online
- In your case, interrupts must be kept away from certain CPUs.
- Some mechanism in your architecture init can set the "hinted" affinity
mask for each interrupt.
- irqbalance will not move interrupts to the CPUs you left out of the
"hinted" affinity.

Does this make more sense?
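
As a rough sketch of the driver flow above (struct example_queue, RING_BYTES and irq_set_node_affinity() are made-up names standing in for whatever the patch actually exports; kzalloc_node() and cpumask_of_node() are existing kernel APIs):

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/topology.h>

#define RING_BYTES	4096

struct example_queue {
	void *ring;
};

/* Hypothetical hinting interface; not an existing kernel function. */
extern int irq_set_node_affinity(unsigned int irq, const struct cpumask *mask);

static int example_setup_queue(struct example_queue *q,
			       unsigned int irq, int node)
{
	/* Allocate the descriptor ring on the node this queue will serve. */
	q->ring = kzalloc_node(RING_BYTES, GFP_KERNEL, node);
	if (!q->ring)
		return -ENOMEM;

	/* Hint the matching CPUs so irqbalance keeps the irq near the ring. */
	return irq_set_node_affinity(irq, cpumask_of_node(node));
}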

> >
> > > As a matter of fact, driver's allocating rings, buffers, queues on other nodes should optimally be made aware of the restriction.
> >
> > The idea is that the driver will do its memory allocations for everything
> > across nodes. When it does that, it will use the kernel interface
> > (function call) to set the corresponding mask it wants for those queue
> > resources. That is my end-goal for this code.
> >
>
> OK, but we will eventually have to reject any irqbalance attempts to send irqs to restricted nodes.

See above.

Cheers,
-PJ

2009-12-04 23:12:22

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

Peter P Waskiewicz Jr <[email protected]> writes:

>
>> >
>> > >
>> > > Also, can we add a restricted mask as I mention above into this scheme? If we can't send an IRQ to some node, we don't want to bother attempting to change affinity to cpus on that node (hopefully code in the kernel will eventually restrict this).
>> > >
>> >
>> > The interface allows you to put in any CPU mask. The way it's written
>> > now, whatever mask you put in, irqbalance *only* balances within that
>> > mask. It won't ever try and go outside that mask.
>>
>> OK. Given that, it might be nice to combine the restricted cpus that I'm describing with your node_affinity mask, but we could expose them as separate masks (node_affinity and restricted_affinity, as I describe above).
>>
>
> I think this might be getting too complicated. The only thing
> irqbalance is lacking today, in my mind, is the feedback mechanism,
> telling it what subset of CPU masks to balance within.

You mean besides knowing that devices can have more than one irq?
You mean besides making good on its promise not to move networking
irqs? A policy of BALANCE_CORE sure doesn't look like a policy of
don't touch.
You mean besides realizing that irqs can only be directed at one cpu on
x86? At least when you have more than 8 logical cores in the system, the
cases that matter.

> There is a
> allowed_mask, but that is used for a different purpose. Hence why I
> added another. But I think your needs can be met 100% with what I have
> already, and we can come up with a different name that's more generic.
> The flows would be something like this:

Two masks? You are asking the kernel to move irqs for you then?

> Driver:
> - Driver comes online, allocates memory in a sensible NUMA fashion
> - Driver requests kernel for interrupts, ties them into handlers
> - Driver now sets a NUMA-friendly affinity for each interrupt, to match
> with its initial memory allocation
> - irqbalance balances interrupts within their new "hinted" affinities.
>
> Other:
> - System comes online
> - In your case, interrupts must be kept away from certain CPUs.
> - Some mechanism in your architecture init can set the "hinted" affinity
> mask for each interrupt.
> - irqbalance will not move interrupts to the CPUs you left out of the
> "hinted" affinity.
>
> Does this make more sense?


>> > > As a matter of fact, driver's allocating rings, buffers, queues on other nodes should optimally be made aware of the restriction.
>> >
>> > The idea is that the driver will do its memory allocations for everything
>> > across nodes. When it does that, it will use the kernel interface
>> > (function call) to set the corresponding mask it wants for those queue
>> > resources. That is my end-goal for this code.
>> >
>>
>> OK, but we will eventually have to reject any irqbalance attempts to send irqs to restricted nodes.
>
> See above.

Either I am parsing this conversation wrong or there is a strong
reality distortion field in place. It appears you are asking that we
depend on a user space application to not attempt the physically
impossible, when we could just as easily ignore the request or report -EINVAL.

We really have two separate problems here.
- How to avoid the impossible.
- How to deal with NUMA affinity.

Eric

2009-12-05 10:38:14

by Waskiewicz Jr, Peter P

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Fri, 2009-12-04 at 15:12 -0800, Eric W. Biederman wrote:
> Peter P Waskiewicz Jr <[email protected]> writes:
>
> >
> >> >
> >> > >
> >> > > Also, can we add a restricted mask as I mention above into this scheme? If we can't send an IRQ to some node, we don't want to bother attempting to change affinity to cpus on that node (hopefully code in the kernel will eventually restrict this).
> >> > >
> >> >
> >> > The interface allows you to put in any CPU mask. The way it's written
> >> > now, whatever mask you put in, irqbalance *only* balances within that
> >> > mask. It won't ever try and go outside that mask.
> >>
> >> OK. Given that, it might be nice to combine the restricted cpus that I'm describing with your node_affinity mask, but we could expose them as separate masks (node_affinity and restricted_affinity, as I describe above).
> >>
> >
> > I think this might be getting too complicated. The only thing
> > irqbalance is lacking today, in my mind, is the feedback mechanism,
> > telling it what subset of CPU masks to balance within.
>
> You mean besides knowing that devices can have more than one irq?

Why does it matter if it does or doesn't? The interrupts have to go
somewhere.

> You mean besides making good on it's promise not to move networking
> irqs? A policy of BALANCE_CORE sure doesn't look like a policy of
> don't touch.

Not moving network irqs is something Arjan said could be a bug, and he'd
be happy to either look into it, or welcome a patch if it really is
broken. As for BALANCE_CORE, I have no idea what you're talking about.

> You mean besides realizing that irqs can only be directed at one cpu on
> x86? At least when you have more than 8 logical cores in the system, the
> cases that matter.
>

Huh? I can have all of my interrupts directed to a single CPU on x86.
Can you give me an example here?

> > There is a
> > allowed_mask, but that is used for a different purpose. Hence why I
> > added another. But I think your needs can be met 100% with what I have
> > already, and we can come up with a different name that's more generic.
> > The flows would be something like this:
>
> Two masks? You are asking the kernel to move irqs for you then?

Absolutely not. Were you not following this thread earlier when this
was being discussed with Thomas?

> > Driver:
> > - Driver comes online, allocates memory in a sensible NUMA fashion
> > - Driver requests kernel for interrupts, ties them into handlers
> > - Driver now sets a NUMA-friendly affinity for each interrupt, to match
> > with its initial memory allocation
> > - irqbalance balances interrupts within their new "hinted" affinities.
> >
> > Other:
> > - System comes online
> > - In your case, interrupts must be kept away from certain CPUs.
> > - Some mechanism in your architecture init can set the "hinted" affinity
> > mask for each interrupt.
> > - irqbalance will not move interrupts to the CPUs you left out of the
> > "hinted" affinity.
> >
> > Does this make more sense?
>
>
> >> > > As a matter of fact, driver's allocating rings, buffers, queues on other nodes should optimally be made aware of the restriction.
> >> >
> >> > The idea is that the driver will do its memory allocations for everything
> >> > across nodes. When it does that, it will use the kernel interface
> >> > (function call) to set the corresponding mask it wants for those queue
> >> > resources. That is my end-goal for this code.
> >> >
> >>
> >> OK, but we will eventually have to reject any irqbalance attempts to send irqs to restricted nodes.
> >
> > See above.
>
> Either I am parsing this conversation wrong or there is a strong
> reality distortion field in place. It appears you are asking that we
> depend on a user space application to not attempt the physically
> impossible, when we could just as easily ignore or report -EINVAL to.
>

You are parsing this conversation incorrectly. I also don't understand
why you always have a very negative view of how impossible everything
is. Do you think we get no work done in the kernel? We deal with
countless issues across the kernel that are hard. Being hard doesn't
mean they're impossible, it just means we may have to try something new
and unknown.

What I'm asking is we make some mechanism for drivers to manage their
interrupt affinities. Today drivers have no influence or control where
their interrupts land. This is a limitation, plain and simple. We need
a mechanism to allow a driver to say "hey, this interrupt needs to run
only on these CPUs. Going elsewhere can severely impact performance of
your network." Whatever listens and acts on that mechanism is
irrelevant.

> We really have two separate problems hear.
> - How to avoid the impossible.

Really man, this type of view is neither helpful nor useful. Either help
people solve problems, or keep your negative views on proposed solutions
to problems to yourself.

> - How to deal with NUMA affinity.

More generally, how to deal with a device's preferred affinity. That is
the real issue I'm trying to solve.

Cheers,
-PJ

2009-12-07 13:39:33

by Dimitri Sivanich

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Fri, Dec 04, 2009 at 01:17:57PM -0800, Peter P Waskiewicz Jr wrote:
> On Fri, 2009-12-04 at 09:42 -0700, Dimitri Sivanich wrote:
> > On Thu, Dec 03, 2009 at 10:50:47AM -0800, Waskiewicz Jr, Peter P wrote:
> > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > >
> > > > On Thu, Dec 03, 2009 at 09:07:21AM -0800, Waskiewicz Jr, Peter P wrote:
> > > > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > > > >
> > > > > > On Thu, Dec 03, 2009 at 08:53:23AM -0800, Waskiewicz Jr, Peter P wrote:
> > > > > > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > > > > > >
> > > > > > > > On Wed, Nov 25, 2009 at 07:40:33AM -0800, Arjan van de Ven wrote:
> > > > > > > > > On Tue, 24 Nov 2009 09:41:18 -0800
> > > > > > > > > [email protected] (Eric W. Biederman) wrote:
> > > > > > > > > > Oii.
> > > > > > > > > >
> > > > > > > > > > I don't think it is bad to export information to applications like
> > > > > > > > > > irqbalance.
> > > > > > > > > >
> > > > > > > > > > I think it pretty horrible that one of the standard ways I have heard
> > > > > > > > > > to improve performance on 10G nics is to kill irqbalance.
> > > > > > > > >
> > > > > > > > > irqbalance does not move networking irqs; if it does there's something
> > > > > > > > > evil going on in the system. But thanks for the bugreport ;)
> > > > > > > >
> > > > > > > > It does move networking irqs.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > we had that; it didn't work.
> > > > > > > > > what I'm asking for is for the kernel to expose the numa information;
> > > > > > > > > right now that is the piece that is missing.
> > > > > > > > >
> > > > > > > >
> > > > > > > > I'm wondering if we should expose that numa information in the form of a node or the set of allowed cpus, or both?
> > > > > > > >
> > > > > > > > I'm guessing 'both' is the correct answer, so that apps like irqbalance can make a qualitative decision based on the node (affinity to cpus on this node is better), but an absolute decision based on allowed cpus (I cannot change affinity to anything but this set of cpus).
> > > > > > >
> > > > > > > That's exactly what my patch in the thread "irq: Add node_affinity CPU
> > > > > > > masks for smarter irqbalance hints" is doing. I've also done the
> > > > > > > irqbalance changes based on that kernel patch, and Arjan currently has
> > > > > > > that patch.
> > > > > >
> > > > > > So if I understand correctly, you're patch takes care of the qualitative portion of it (we prefer to set affinity to these cpus, which may be on more than one node), but not the restrictive part of it (we cannot change affinity to anything but these cpus)?
> > > > >
> > > > > That is correct. The patch provides an interface to both the kernel
> > > > > (functions) and /proc for userspace to set a CPU mask. That is the
> > > > > preferred mask for the interrupt to be balanced on. Then irqbalance will
> > > > > make decisions on how to balance within that provided mask, if it in fact
> > > > > has been provided.
> > > >
> > > > What if it's not provided? Will irqbalance make decisions based on the numa_node of that irq (I would hope)?
> > >
> > > If it's not provided, then irqbalance will continue to do what it does
> > > today. No changes.
> >
> > So no numa awareness for ethernet irqs.
> >
>
> Nothing that makes sense, no.

OK. See my note below concerning exposing the irq_desc 'node'.

>
> > I'm wondering what needs to be exposed in proc/irq at this point.
> >
> > Do we expose separate cpu masks for everything? There are really 3 possible pieces of affinity information: your node_affinity (optionally selected by the driver according to allocations), my restricted_affinity (set by the specific arch), and numa_node affinity (the 'home' node of the device). Do we show cpumasks for all of these, or maybe show numa_node in place of a cpumask for the 'home' node of the device?
> >
> > With all of that information apps like irqbalance should be able to make some good decisions, but things can get confusing if there are too many masks.
> >
> > Also, if I manually change affinity, irqbalance can change that affinity from under me, correct? That's fine as long as it's stated that that's how things will work (turn off irqbalance or run oneshot mode for manual setting).
> >
>
> If you change /proc/irq/<num>/smp_affinity, yes, irqbalance can change
> that after the fact.

OK.

>
> > >
> > > >
> > > > Also, can we add a restricted mask as I mention above into this scheme? If we can't send an IRQ to some node, we don't want to bother attempting to change affinity to cpus on that node (hopefully code in the kernel will eventually restrict this).
> > > >
> > >
> > > The interface allows you to put in any CPU mask. The way it's written
> > > now, whatever mask you put in, irqbalance *only* balances within that
> > > mask. It won't ever try and go outside that mask.
> >
> > OK. Given that, it might be nice to combine the restricted cpus that I'm describing with your node_affinity mask, but we could expose them as separate masks (node_affinity and restricted_affinity, as I describe above).
> >
>
> I think this might be getting too complicated. The only thing
> irqbalance is lacking today, in my mind, is the feedback mechanism,
> telling it what subset of CPU masks to balance within. There is a
> allowed_mask, but that is used for a different purpose. Hence why I
> added another. But I think your needs can be met 100% with what I have
> already, and we can come up with a different name that's more generic.
> The flows would be something like this:
>
> Driver:
> - Driver comes online, allocates memory in a sensible NUMA fashion
> - Driver requests kernel for interrupts, ties them into handlers
> - Driver now sets a NUMA-friendly affinity for each interrupt, to match
> with its initial memory allocation
> - irqbalance balances interrupts within their new "hinted" affinities.
>
> Other:
> - System comes online
> - In your case, interrupts must be kept away from certain CPUs.
> - Some mechanism in your architecture init can set the "hinted" affinity
> mask for each interrupt.
> - irqbalance will not move interrupts to the CPUs you left out of the
> "hinted" affinity.
>
> Does this make more sense?


I agree that this was getting too complicated. What you have makes sense,
but it -might- still be missing a couple of small things.

Here is a list of issues that I think we're trying to resolve here:

- Enforce hardware affinity restrictions in the kernel (don't allow irqs to
be directed to where they absolutely cannot go). This will silently fail
at the userspace level.
This is the issue I have been trying to resolve in this thread.

- Give userspace apps (irqbalance) better knowledge of where to redirect irqs.
- Driver provides direction (locality of queue/buffer allocations, etc..),
which is the issue that you are attempting to resolve.

- I/O device provides direction (hint that hardware is on some node).
I think we should expose the irq_desc->node (numa_node of the device).
This is one of the two things we're missing.
Whether irqbalance uses it in the cases where the driver has not supplied
a "hinted" affinity mask is up to you, but I think that information should
be provided to userspace. Maybe we add this as a subsequent patch.

- provide hardware affinity restriction information to apps (irqbalance).
This seems optional, although I think you've covered it in the "Other" flow
above, but see my question below.

- provide hardware affinity restriction information to the driver for better
decision making in allocation of resources.
This is the other thing we're missing.
This also seems optional, but -might- be desirable for what would eventually
be a more complete solution to what you are trying to resolve. If the
driver knows where irqs absolutely cannot be sent, it can better decide
where to allocate its resources.

I guess this leads to one more question: What happens if our hardware sets
the "hinted" affinity mask, then a driver comes along and does the same thing?
We need to make sure drivers and arches keep from stomping on each other.
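
Going back to the irq_desc->node point above, a sketch of what a /proc/irq/<N>/node file could look like, using the usual seq_file plumbing (this assumes desc->node is available, i.e. CONFIG_SMP or a wrapper like the irq_desc_node() helper in the patch; it is only an illustration, not the eventual implementation):

#include <linux/irq.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int irq_node_proc_show(struct seq_file *m, void *v)
{
	struct irq_desc *desc = irq_to_desc((long)m->private);

	seq_printf(m, "%d\n", desc->node);
	return 0;
}

static int irq_node_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, irq_node_proc_show, PDE(inode)->data);
}

static const struct file_operations irq_node_proc_fops = {
	.open		= irq_node_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

/*
 * Hooked up from register_irq_proc(), roughly:
 *	proc_create_data("node", 0444, desc->dir,
 *			 &irq_node_proc_fops, (void *)(long)irq);
 */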

2009-12-07 13:44:48

by Dimitri Sivanich

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Fri, Dec 04, 2009 at 03:12:14PM -0800, Eric W. Biederman wrote:
> Peter P Waskiewicz Jr <[email protected]> writes:
>
> >> > > As a matter of fact, driver's allocating rings, buffers, queues on other nodes should optimally be made aware of the restriction.
> >> >
> >> > The idea is that the driver will do its memory allocations for everything
> >> > across nodes. When it does that, it will use the kernel interface
> >> > (function call) to set the corresponding mask it wants for those queue
> >> > resources. That is my end-goal for this code.
> >> >
> >>
> >> OK, but we will eventually have to reject any irqbalance attempts to send irqs to restricted nodes.
> >
> > See above.
>
> Either I am parsing this conversation wrong or there is a strong
> reality distortion field in place. It appears you are asking that we
> depend on a user space application to not attempt the physically
> impossible, when we could just as easily ignore or report -EINVAL to.
>
> We really have two separate problems hear.
> - How to avoid the impossible.

The kernel does need to restrict attempts at the impossible. However I see
nothing wrong with providing apps with information as to what the impossible
actually is.
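
Purely as a sketch, kernel-side enforcement on an affinity write might look like this, where "restricted" stands in for whatever mask the architecture reports and the proc plumbing around it is elided (the real write path may treat partially-offline masks differently):

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/interrupt.h>

static ssize_t example_write_affinity(unsigned int irq,
				      const char __user *buffer, size_t count,
				      const struct cpumask *restricted)
{
	cpumask_var_t new_value;
	int err;

	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
		return -ENOMEM;

	err = cpumask_parse_user(buffer, count, new_value);
	if (err)
		goto out;

	/* A physically impossible request gets an error, not a silent drop. */
	if (!cpumask_intersects(new_value, restricted)) {
		err = -EINVAL;
		goto out;
	}

	err = irq_set_affinity(irq, new_value);
out:
	free_cpumask_var(new_value);
	return err ? err : count;
}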

> - How to deal with NUMA affinity.

2009-12-07 23:28:28

by Waskiewicz Jr, Peter P

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Mon, 2009-12-07 at 05:39 -0800, Dimitri Sivanich wrote:
> On Fri, Dec 04, 2009 at 01:17:57PM -0800, Peter P Waskiewicz Jr wrote:
> > On Fri, 2009-12-04 at 09:42 -0700, Dimitri Sivanich wrote:
> > > On Thu, Dec 03, 2009 at 10:50:47AM -0800, Waskiewicz Jr, Peter P wrote:
> > > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > > >
> > > > > On Thu, Dec 03, 2009 at 09:07:21AM -0800, Waskiewicz Jr, Peter P wrote:
> > > > > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > > > > >
> > > > > > > On Thu, Dec 03, 2009 at 08:53:23AM -0800, Waskiewicz Jr, Peter P wrote:
> > > > > > > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > > > > > > >
> > > > > > > > > On Wed, Nov 25, 2009 at 07:40:33AM -0800, Arjan van de Ven wrote:
> > > > > > > > > > On Tue, 24 Nov 2009 09:41:18 -0800
> > > > > > > > > > [email protected] (Eric W. Biederman) wrote:
> > > > > > > > > > > Oii.
> > > > > > > > > > >
> > > > > > > > > > > I don't think it is bad to export information to applications like
> > > > > > > > > > > irqbalance.
> > > > > > > > > > >
> > > > > > > > > > > I think it pretty horrible that one of the standard ways I have heard
> > > > > > > > > > > to improve performance on 10G nics is to kill irqbalance.
> > > > > > > > > >
> > > > > > > > > > irqbalance does not move networking irqs; if it does there's something
> > > > > > > > > > evil going on in the system. But thanks for the bugreport ;)
> > > > > > > > >
> > > > > > > > > It does move networking irqs.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > we had that; it didn't work.
> > > > > > > > > > what I'm asking for is for the kernel to expose the numa information;
> > > > > > > > > > right now that is the piece that is missing.
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > I'm wondering if we should expose that numa information in the form of a node or the set of allowed cpus, or both?
> > > > > > > > >
> > > > > > > > > I'm guessing 'both' is the correct answer, so that apps like irqbalance can make a qualitative decision based on the node (affinity to cpus on this node is better), but an absolute decision based on allowed cpus (I cannot change affinity to anything but this set of cpus).
> > > > > > > >
> > > > > > > > That's exactly what my patch in the thread "irq: Add node_affinity CPU
> > > > > > > > masks for smarter irqbalance hints" is doing. I've also done the
> > > > > > > > irqbalance changes based on that kernel patch, and Arjan currently has
> > > > > > > > that patch.
> > > > > > >
> > > > > > > So if I understand correctly, you're patch takes care of the qualitative portion of it (we prefer to set affinity to these cpus, which may be on more than one node), but not the restrictive part of it (we cannot change affinity to anything but these cpus)?
> > > > > >
> > > > > > That is correct. The patch provides an interface to both the kernel
> > > > > > (functions) and /proc for userspace to set a CPU mask. That is the
> > > > > > preferred mask for the interrupt to be balanced on. Then irqbalance will
> > > > > > make decisions on how to balance within that provided mask, if it in fact
> > > > > > has been provided.
> > > > >
> > > > > What if it's not provided? Will irqbalance make decisions based on the numa_node of that irq (I would hope)?
> > > >
> > > > If it's not provided, then irqbalance will continue to do what it does
> > > > today. No changes.
> > >
> > > So no numa awareness for ethernet irqs.
> > >
> >
> > Nothing that makes sense, no.
>
> OK. See my note below concerning exposing the irq_desc 'node'.
>
> >
> > > I'm wondering what needs to be exposed in proc/irq at this point.
> > >
> > > Do we expose separate cpu masks for everything? There are really 3 possible pieces of affinity information: your node_affinity (optionally selected by the driver according to allocations), my restricted_affinity (set by the specific arch), and numa_node affinity (the 'home' node of the device). Do we show cpumasks for all of these, or maybe show numa_node in place of a cpumask for the 'home' node of the device?
> > >
> > > With all of that information apps like irqbalance should be able to make some good decisions, but things can get confusing if there are too many masks.
> > >
> > > Also, if I manually change affinity, irqbalance can change that affinity from under me, correct? That's fine as long as it's stated that that's how things will work (turn off irqbalance or run oneshot mode for manual setting).
> > >
> >
> > If you change /proc/irq/<num>/smp_affinity, yes, irqbalance can change
> > that after the fact.
>
> OK.
>
> >
> > > >
> > > > >
> > > > > Also, can we add a restricted mask as I mention above into this scheme? If we can't send an IRQ to some node, we don't want to bother attempting to change affinity to cpus on that node (hopefully code in the kernel will eventually restrict this).
> > > > >
> > > >
> > > > The interface allows you to put in any CPU mask. The way it's written
> > > > now, whatever mask you put in, irqbalance *only* balances within that
> > > > mask. It won't ever try and go outside that mask.
> > >
> > > OK. Given that, it might be nice to combine the restricted cpus that I'm describing with your node_affinity mask, but we could expose them as separate masks (node_affinity and restricted_affinity, as I describe above).
> > >
> >
> > I think this might be getting too complicated. The only thing
> > irqbalance is lacking today, in my mind, is the feedback mechanism,
> > telling it what subset of CPU masks to balance within. There is a
> > allowed_mask, but that is used for a different purpose. Hence why I
> > added another. But I think your needs can be met 100% with what I have
> > already, and we can come up with a different name that's more generic.
> > The flows would be something like this:
> >
> > Driver:
> > - Driver comes online, allocates memory in a sensible NUMA fashion
> > - Driver requests kernel for interrupts, ties them into handlers
> > - Driver now sets a NUMA-friendly affinity for each interrupt, to match
> > with its initial memory allocation
> > - irqbalance balances interrupts within their new "hinted" affinities.
> >
> > Other:
> > - System comes online
> > - In your case, interrupts must be kept away from certain CPUs.
> > - Some mechanism in your architecture init can set the "hinted" affinity
> > mask for each interrupt.
> > - irqbalance will not move interrupts to the CPUs you left out of the
> > "hinted" affinity.
> >
> > Does this make more sense?
>
>
> I agree that this was getting too complicated. What you have makes sense,
> but it -might- still be missing a couple of small things.
>
> Here is a list of issues that I think we're trying to resolve here:
>
> - Enforce hardware affinity restrictions in the kernel (don't allow irqs to
> be directed to where they absolutely cannot go). This will silently fail
> at the userspace level.
> This is the issue I have been trying to resolve in this thread.
>
> - Give userspace apps (irqbalance) better knowledge of where to redirect irqs.
> - Driver provides direction (locality of queue/buffer allocations, etc..),
> which is the issue that you are attempting to resolve.

Yes.

> - I/O device provides direction (hint that hardware is on some node).
> I think we should expose the irq_desc->node (numa_node of the device).
> This is one of the two things we're missing.
> Whether irqbalance uses it in the cases where the driver has not supplied
> a "hinted" affinity mask is up to you, but I think that information should
> be provided to userspace. Maybe we add this as a subsequent patch.

Yes, we can either enforce that through the hinted mask, or we can give it the actual node.
If we give it the node, irqbalance will take it a step further and balance it down to
the cache domain within that topology. Either way you'll get the same result.

> - provide hardware affinity restriction information to apps (irqbalance).
> This seems optional, although I think you've covered it in the "Other" flow
> above, but see my question below.
>
> - provide hardware affinity restriction information to the driver for better
> decision making in allocation of resources.
> This is the other thing we're missing.
> This also seems optional, but -might- be desirable for what would eventually
> be a more complete solution to what you are trying to resolve. If the
> driver knows where irqs absolutely cannot be sent, it can better decide
> where to allocate it's resources.
>
> I guess this leads to one more question: What happens if our hardware sets
> the "hinted" affinity mask, then a driver comes along and does the same thing?
> We need to make sure drivers and arches keep from stomping on each other.

Agreed. We can add a bit or something in the irq_desc that the
architecture code can set, indicating a driver shouldn't override the
hinted mask. The burden will still be on the driver to check that value
and honor it.
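
One possible shape for that, purely as a sketch with made-up names (the flag, the struct and both helpers are hypothetical; only the cpumask operations are real kernel APIs):

#include <linux/cpumask.h>

#define IRQ_NODE_AFFINITY_LOCKED	0x1

struct irq_affinity_hint {
	cpumask_var_t	mask;
	unsigned int	flags;
};

/* Arch init: set the hint and forbid drivers from widening it. */
static void arch_lock_irq_hint(struct irq_affinity_hint *hint,
			       const struct cpumask *allowed)
{
	cpumask_copy(hint->mask, allowed);
	hint->flags |= IRQ_NODE_AFFINITY_LOCKED;
}

/* Driver: may only narrow a locked hint, never widen it. */
static void driver_set_irq_hint(struct irq_affinity_hint *hint,
				const struct cpumask *preferred)
{
	if (hint->flags & IRQ_NODE_AFFINITY_LOCKED)
		cpumask_and(hint->mask, hint->mask, preferred);
	else
		cpumask_copy(hint->mask, preferred);
}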

I think we're on the same page. I hope Thomas can weigh in on this
either way so I know where to go from here with my patchset. If I can
lump other solutions into mine, I'm game for doing that work.

Cheers,
-PJ

2009-12-08 15:04:05

by Dimitri Sivanich

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Mon, Dec 07, 2009 at 03:28:34PM -0800, Peter P Waskiewicz Jr wrote:
> On Mon, 2009-12-07 at 05:39 -0800, Dimitri Sivanich wrote:
> > On Fri, Dec 04, 2009 at 01:17:57PM -0800, Peter P Waskiewicz Jr wrote:
> > > On Fri, 2009-12-04 at 09:42 -0700, Dimitri Sivanich wrote:
> > > > On Thu, Dec 03, 2009 at 10:50:47AM -0800, Waskiewicz Jr, Peter P wrote:
> > > > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > > > >
> > > > > > On Thu, Dec 03, 2009 at 09:07:21AM -0800, Waskiewicz Jr, Peter P wrote:
> > > > > > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > > > > > >
> > > > > > > > On Thu, Dec 03, 2009 at 08:53:23AM -0800, Waskiewicz Jr, Peter P wrote:
> > > > > > > > > On Thu, 3 Dec 2009, Dimitri Sivanich wrote:
> > > > > > > > >
> > > > > > > > > > On Wed, Nov 25, 2009 at 07:40:33AM -0800, Arjan van de Ven wrote:
> > > > > > > > > > > On Tue, 24 Nov 2009 09:41:18 -0800
> > > > > > > > > > > [email protected] (Eric W. Biederman) wrote:
> > > > > > > > > > > > Oii.
> > > > > > > > > > > >
> > > > > > > > > > > > I don't think it is bad to export information to applications like
> > > > > > > > > > > > irqbalance.
> > > > > > > > > > > >
> > > > > > > > > > > > I think it pretty horrible that one of the standard ways I have heard
> > > > > > > > > > > > to improve performance on 10G nics is to kill irqbalance.
> > > > > > > > > > >
> > > > > > > > > > > irqbalance does not move networking irqs; if it does there's something
> > > > > > > > > > > evil going on in the system. But thanks for the bugreport ;)
> > > > > > > > > >
> > > > > > > > > > It does move networking irqs.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > we had that; it didn't work.
> > > > > > > > > > > what I'm asking for is for the kernel to expose the numa information;
> > > > > > > > > > > right now that is the piece that is missing.
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > I'm wondering if we should expose that numa information in the form of a node or the set of allowed cpus, or both?
> > > > > > > > > >
> > > > > > > > > > I'm guessing 'both' is the correct answer, so that apps like irqbalance can make a qualitative decision based on the node (affinity to cpus on this node is better), but an absolute decision based on allowed cpus (I cannot change affinity to anything but this set of cpus).
> > > > > > > > >
> > > > > > > > > That's exactly what my patch in the thread "irq: Add node_affinity CPU
> > > > > > > > > masks for smarter irqbalance hints" is doing. I've also done the
> > > > > > > > > irqbalance changes based on that kernel patch, and Arjan currently has
> > > > > > > > > that patch.
> > > > > > > >
> > > > > > > > So if I understand correctly, you're patch takes care of the qualitative portion of it (we prefer to set affinity to these cpus, which may be on more than one node), but not the restrictive part of it (we cannot change affinity to anything but these cpus)?
> > > > > > >
> > > > > > > That is correct. The patch provides an interface to both the kernel
> > > > > > > (functions) and /proc for userspace to set a CPU mask. That is the
> > > > > > > preferred mask for the interrupt to be balanced on. Then irqbalance will
> > > > > > > make decisions on how to balance within that provided mask, if it in fact
> > > > > > > has been provided.
> > > > > >
> > > > > > What if it's not provided? Will irqbalance make decisions based on the numa_node of that irq (I would hope)?
> > > > >
> > > > > If it's not provided, then irqbalance will continue to do what it does
> > > > > today. No changes.
> > > >
> > > > So no numa awareness for ethernet irqs.
> > > >
> > >
> > > Nothing that makes sense, no.
> >
> > OK. See my note below concerning exposing the irq_desc 'node'.
> >
> > >
> > > > I'm wondering what needs to be exposed in proc/irq at this point.
> > > >
> > > > Do we expose separate cpu masks for everything? There are really 3 possible pieces of affinity information: your node_affinity (optionally selected by the driver according to allocations), my restricted_affinity (set by the specific arch), and numa_node affinity (the 'home' node of the device). Do we show cpumasks for all of these, or maybe show numa_node in place of a cpumask for the 'home' node of the device?
> > > >
> > > > With all of that information apps like irqbalance should be able to make some good decisions, but things can get confusing if there are too many masks.
> > > >
> > > > Also, if I manually change affinity, irqbalance can change that affinity from under me, correct? That's fine as long as it's stated that that's how things will work (turn off irqbalance or run oneshot mode for manual setting).
> > > >
> > >
> > > If you change /proc/irq/<num>/smp_affinity, yes, irqbalance can change
> > > that after the fact.
> >
> > OK.
> >
> > >
> > > > >
> > > > > >
> > > > > > Also, can we add a restricted mask as I mention above into this scheme? If we can't send an IRQ to some node, we don't want to bother attempting to change affinity to cpus on that node (hopefully code in the kernel will eventually restrict this).
> > > > > >
> > > > >
> > > > > The interface allows you to put in any CPU mask. The way it's written
> > > > > now, whatever mask you put in, irqbalance *only* balances within that
> > > > > mask. It won't ever try and go outside that mask.
> > > >
> > > > OK. Given that, it might be nice to combine the restricted cpus that I'm describing with your node_affinity mask, but we could expose them as separate masks (node_affinity and restricted_affinity, as I describe above).
> > > >
> > >
> > > I think this might be getting too complicated. The only thing
> > > irqbalance is lacking today, in my mind, is the feedback mechanism,
> > > telling it what subset of CPU masks to balance within. There is a
> > > allowed_mask, but that is used for a different purpose. Hence why I
> > > added another. But I think your needs can be met 100% with what I have
> > > already, and we can come up with a different name that's more generic.
> > > The flows would be something like this:
> > >
> > > Driver:
> > > - Driver comes online, allocates memory in a sensible NUMA fashion
> > > - Driver requests kernel for interrupts, ties them into handlers
> > > - Driver now sets a NUMA-friendly affinity for each interrupt, to match
> > > with its initial memory allocation
> > > - irqbalance balances interrupts within their new "hinted" affinities.
> > >
> > > Other:
> > > - System comes online
> > > - In your case, interrupts must be kept away from certain CPUs.
> > > - Some mechanism in your architecture init can set the "hinted" affinity
> > > mask for each interrupt.
> > > - irqbalance will not move interrupts to the CPUs you left out of the
> > > "hinted" affinity.
> > >
> > > Does this make more sense?
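
To make the "Driver" flow above concrete, here is a sketch. The hint call
irq_set_hint_mask() and the foo_* names are placeholders for whatever the
final interface and driver end up looking like; the rest is existing kernel
API.

#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/topology.h>

#define FOO_RING_BYTES	4096		/* illustrative ring size */

struct foo_queue {
	void *ring;
};

static irqreturn_t foo_irq_handler(int irq, void *dev)
{
	return IRQ_HANDLED;
}

static int foo_setup_queue(struct foo_queue *q, unsigned int irq, int node)
{
	int err;

	/* 1. Allocate the queue's memory on its home node. */
	q->ring = kmalloc_node(FOO_RING_BYTES, GFP_KERNEL, node);
	if (!q->ring)
		return -ENOMEM;

	/* 2. Tie the interrupt into its handler. */
	err = request_irq(irq, foo_irq_handler, 0, "foo", q);
	if (err) {
		kfree(q->ring);
		return err;
	}

	/* 3. Hint that this irq belongs with the memory just allocated;
	 *    irqbalance then only balances it within that node. */
	irq_set_hint_mask(irq, cpumask_of_node(node));	/* placeholder */
	return 0;
}

The "Other" flow is the same last step, done from architecture init code with
the arch's restricted mask instead of cpumask_of_node().
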
> >
> >
> > I agree that this was getting too complicated. What you have makes sense,
> > but it -might- still be missing a couple of small things.
> >
> > Here is a list of issues that I think we're trying to resolve here:
> >
> > - Enforce hardware affinity restrictions in the kernel (don't allow irqs to
> > be directed to where they absolutely cannot go). Attempts from userspace to
> > set a disallowed affinity will silently fail.
> > This is the issue I have been trying to resolve in this thread.
> >
> > - Give userspace apps (irqbalance) better knowledge of where to redirect irqs.
> > - Driver provides direction (locality of queue/buffer allocations, etc..),
> > which is the issue that you are attempting to resolve.
>
> Yes.
>
> > - I/O device provides direction (hint that hardware is on some node).
> > I think we should expose the irq_desc->node (numa_node of the device).
> > This is one of the two things we're missing.
> > Whether irqbalance uses it in the cases where the driver has not supplied
> > a "hinted" affinity mask is up to you, but I think that information should
> > be provided to userspace. Maybe we add this as a subsequent patch.
>
> Yes, we can either enforce that through the hinted mask, or we can give it the actual node.
> If we give it the node, irqbalance will take it a step further and balance it down to
> the cache domain within that topology. Either way you'll get the same result.

So provide the irq_desc->node via /proc/irq/*/node for each irq, but also enforce through the hinted mask if so desired, right?
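
As a minimal sketch, exposing irq_desc->node could be a read-only entry next
to smp_affinity in kernel/irq/proc.c. The names below are illustrative, not
from the posted patch, and desc->node is only meaningful on SMP/NUMA builds:

#include <linux/fs.h>
#include <linux/irq.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int irq_node_proc_show(struct seq_file *m, void *v)
{
	struct irq_desc *desc = m->private;

	seq_printf(m, "%d\n", desc->node);	/* device's home node */
	return 0;
}

static int irq_node_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, irq_node_proc_show, PDE(inode)->data);
}

static const struct file_operations irq_node_proc_fops = {
	.open		= irq_node_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

/* and, where the per-irq proc directory is created:
 *	proc_create_data("node", 0444, desc->dir, &irq_node_proc_fops, desc);
 */
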

>
> > - provide hardware affinity restriction information to apps (irqbalance).
> > This seems optional, although I think you've covered it in the "Other" flow
> > above, but see my question below.
> >
> > - provide hardware affinity restriction information to the driver for better
> > decision making in allocation of resources.
> > This is the other thing we're missing.
> > This also seems optional, but -might- be desirable for what would eventually
> > be a more complete solution to what you are trying to resolve. If the
> > driver knows where irqs absolutely cannot be sent, it can better decide
> > where to allocate its resources.
> >
> > I guess this leads to one more question: What happens if our hardware sets
> > the "hinted" affinity mask, then a driver comes along and does the same thing?
> > We need to make sure drivers and arches keep from stomping on each other.
>
> Agreed. We can add a bit or something in the irq_desc that the
> architecture code can set, indicating a driver shouldn't override the
> hinted mask. The burden will still be on the driver to check that value
> and honor it.

OK. The driver could restrict the mask further (according to its allocation), but if the flag was set it could not use cpus that were not already in the mask.
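
In code, that rule might look roughly like the following. The names
irq_hint_locked_by_arch(), irq_get_hint_mask() and irq_set_hint_mask() are
hypothetical stand-ins for the flag and accessors discussed above:

#include <linux/cpumask.h>
#include <linux/slab.h>

static void foo_hint_irq(unsigned int irq, const struct cpumask *wanted)
{
	cpumask_var_t mask;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return;

	cpumask_copy(mask, wanted);

	/* If the architecture locked the hint, only narrow it further;
	 * never re-add cpus the arch masked off. */
	if (irq_hint_locked_by_arch(irq))			/* hypothetical */
		cpumask_and(mask, mask, irq_get_hint_mask(irq));/* hypothetical */

	if (!cpumask_empty(mask))
		irq_set_hint_mask(irq, mask);			/* hypothetical */

	free_cpumask_var(mask);
}
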

>
> I think we're on the same page. I hope Thomas can weigh in on this
> either way so I know where to go from here with my patchset. If I can
> lump other solutions into mine, I'm game for doing that work.
>
> Cheers,
> -PJ

2009-12-11 03:18:05

by David Lang

[permalink] [raw]
Subject: Re: [PATCH v6] x86/apic: limit irq affinity

On Wed, 25 Nov 2009, Arjan van de Ven wrote:

>> I think the irq scheduler is the only scheduler (except for batch
>> jobs) that we don't put in the kernel. It seems to me that if we are
>> going to go to all of the trouble to rewrite the generic code to
>> better support irqbalance because we are having serious irqbalance
>> problems, it will be less effort to suck irqbalance into the kernel
>> along with everything else.
>
> we had that; it didn't work.
> what I'm asking for is for the kernel to expose the numa information;
> right now that is the piece that is missing.

but if there is only one userspace app, and that app has no user
accessible configuration, what is the difference between telling everyone
they must use that app and having the logic from that app in the kernel?

David Lang