I'd like to thank Michael Ellerman for his feedback. This is a much
better patchset than it used to be.
Changes since the previous patchset:
- Dumped the variable renaming as it made the patchset harder to read.
- Moved the addition of the 'multiple' field from the first patch to the
second.
- Changed the calling convention of arch_teardown_msi_irqs back to
upstream.
- Fixed the docbook for pci_enable_msi_block
- Limited powerpc's MSI allocation to 1 in arch_msi_check_device, not
  arch_setup_msi_irqs
- Rewrote AHCI interrupt claiming code to handle failures better and to
claim fewer interrupts if possible.
Outstanding things I'd like reviewed:
- The x86-64 patch has received no comments so far.
- The AHCI patch needs to be re-reviewed. Sorry, Jeff. It's a bit
more complex than I'd like, but I don't see a clearer way to write
it, given the nasty behaviour that the AHCI spec permits.
- I've just added the MSI-HOWTO patch to this set. New reviewers are
welcome.
Patchset available as a git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/willy/misc.git multiple-msi
(multiple-msi-20080710 is this patch set in particular)
Patches will be sent in reply to this message.
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
AHCI controllers can support up to 16 interrupts, one per port. This
saves us a readl() in the interrupt path to determine which port has
generated the interrupt.
Signed-off-by: Matthew Wilcox <[email protected]>
---
drivers/ata/ahci.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 122 insertions(+), 5 deletions(-)
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 5e6468a..12562fc 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -102,6 +102,7 @@ enum {
/* HOST_CTL bits */
HOST_RESET = (1 << 0), /* reset controller; self-clear */
HOST_IRQ_EN = (1 << 1), /* global IRQ enable */
+ HOST_MSI_RSM = (1 << 2), /* Revert to Single Message */
HOST_AHCI_EN = (1 << 31), /* AHCI enabled */
/* HOST_CAP bits */
@@ -1771,6 +1772,15 @@ static void ahci_port_intr(struct ata_port *ap)
}
}
+static irqreturn_t ahci_msi_interrupt(int irq, void *dev_instance)
+{
+ struct ata_port *ap = dev_instance;
+ spin_lock(&ap->host->lock);
+ ahci_port_intr(ap);
+ spin_unlock(&ap->host->lock);
+ return IRQ_HANDLED;
+}
+
static irqreturn_t ahci_interrupt(int irq, void *dev_instance)
{
struct ata_host *host = dev_instance;
@@ -2220,6 +2230,107 @@ static void ahci_p5wdh_workaround(struct ata_host *host)
}
}
+static int ahci_request_irqs(struct pci_dev *pdev, struct ata_host *host,
+			     int n_irqs)
+{
+	int i, rc;
+
+	if (n_irqs > host->n_ports) {
+		n_irqs = host->n_ports;
+	} else if (n_irqs < host->n_ports || n_irqs == 1) {
+		n_irqs--;
+		rc = devm_request_irq(host->dev, pdev->irq + n_irqs,
+				ahci_interrupt,
+				IRQF_SHARED | (n_irqs ? IRQF_DISABLED : 0),
+				dev_driver_string(host->dev), host);
+		if (rc)
+			return rc;
+	}
+
+	for (i = 0; i < n_irqs; i++) {
+		rc = devm_request_irq(host->dev, pdev->irq + i,
+				ahci_msi_interrupt, IRQF_DISABLED,
+				dev_driver_string(host->dev), host->ports[i]);
+		if (rc)
+			goto free_irqs;
+	}
+
+	return 0;
+
+ free_irqs:
+	if (n_irqs < host->n_ports)
+		devm_free_irq(host->dev, pdev->irq + n_irqs, host);
+	while (--i >= 0)
+		devm_free_irq(host->dev, pdev->irq + i, host->ports[i]);
+	return rc;
+}
+
+static int ahci_setup_irq_block(struct pci_dev *pdev, struct ata_host *host,
+ int n_irqs)
+{
+ int rc;
+
+ rc = pci_enable_msi_block(pdev, n_irqs);
+ if (rc)
+ return rc;
+ if (n_irqs > 1) {
+ void __iomem *mmio = host->iomap[AHCI_PCI_BAR];
+ u32 host_ctl = readl(mmio + HOST_CTL);
+ if (host_ctl & HOST_MSI_RSM)
+ goto try_different;
+ }
+
+ rc = ahci_request_irqs(pdev, host, n_irqs);
+ if (!rc)
+ return 0;
+
+ try_different:
+ pci_disable_msi(pdev);
+ pci_intx(pdev, 1);
+ return (n_irqs == 1) ? rc : 1;
+}
+
+static int ahci_setup_irqs(struct pci_dev *pdev, struct ata_host *host)
+{
+ struct ahci_host_priv *hpriv = host->private_data;
+ int n_irqs, pos, rc;
+ u16 control;
+
+ if (hpriv->flags & AHCI_HFLAG_NO_MSI)
+ goto no_msi;
+
+ /*
+ * We only need one interrupt per port (plus one for CCC which
+ * isn't supported yet), but some AHCI controllers refuse to use
+ * multiple MSIs unless they get the maximum number of interrupts
+ */
+
+ rc = ahci_setup_irq_block(pdev, host, host->n_ports);
+ if (rc == 0)
+ return 0;
+ if (rc < 0)
+ goto no_msi;
+
+ /* Find out how many it might want */
+ pos = pci_find_capability(pdev, PCI_CAP_ID_MSI);
+ pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &control);
+ n_irqs = 1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1);
+
+ for (;;) {
+ rc = ahci_setup_irq_block(pdev, host, n_irqs);
+ if (rc == 0)
+ return 0;
+ if (rc < 0)
+ goto no_msi;
+ n_irqs = rc;
+ }
+
+ return 0;
+
+ no_msi:
+ return ahci_request_irqs(pdev, host, 1);
+}
+
static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
static int printed_version;
@@ -2278,9 +2389,6 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
(pdev->revision == 0xa1 || pdev->revision == 0xa2))
hpriv->flags |= AHCI_HFLAG_NO_MSI;
- if ((hpriv->flags & AHCI_HFLAG_NO_MSI) || pci_enable_msi(pdev))
- pci_intx(pdev, 1);
-
/* save initial config */
ahci_save_initial_config(pdev, hpriv);
@@ -2335,8 +2443,17 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
ahci_print_info(host);
pci_set_master(pdev);
- return ata_host_activate(host, pdev->irq, ahci_interrupt, IRQF_SHARED,
- &ahci_sht);
+
+ rc = ata_host_start(host);
+ if (rc)
+ return rc;
+
+ rc = ahci_setup_irqs(pdev, host);
+ if (rc)
+ return rc;
+
+ rc = ata_host_register(host, &ahci_sht);
+ return rc;
}
static int __init ahci_init(void)
--
1.5.5.4
By changing from a 5-bit field to a 1-bit field, we free up some bits
that can be used by a later patch. Also rearrange the fields for better
packing on 64-bit platforms (reducing the size of msi_desc from 72 bytes
to 64 bytes).
Signed-off-by: Matthew Wilcox <[email protected]>
---
drivers/pci/msi.c | 100 ++++++++++++++++----------------------------------
include/linux/msi.h | 4 +-
2 files changed, 34 insertions(+), 70 deletions(-)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 8c61304..104f297 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -106,20 +106,10 @@ static void msix_flush_writes(unsigned int irq)
entry = get_irq_msi(irq);
BUG_ON(!entry || !entry->dev);
- switch (entry->msi_attrib.type) {
- case PCI_CAP_ID_MSI:
- /* nothing to do */
- break;
- case PCI_CAP_ID_MSIX:
- {
+ if (entry->msi_attrib.is_msix) {
int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
readl(entry->mask_base + offset);
- break;
- }
- default:
- BUG();
- break;
}
}
@@ -129,8 +119,12 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
entry = get_irq_msi(irq);
BUG_ON(!entry || !entry->dev);
- switch (entry->msi_attrib.type) {
- case PCI_CAP_ID_MSI:
+ if (entry->msi_attrib.is_msix) {
+ int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+ PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+ writel(flag, entry->mask_base + offset);
+ readl(entry->mask_base + offset);
+ } else {
if (entry->msi_attrib.maskbit) {
int pos;
u32 mask_bits;
@@ -143,18 +137,6 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
} else {
msi_set_enable(entry->dev, !flag);
}
- break;
- case PCI_CAP_ID_MSIX:
- {
- int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
- PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
- writel(flag, entry->mask_base + offset);
- readl(entry->mask_base + offset);
- break;
- }
- default:
- BUG();
- break;
}
entry->msi_attrib.masked = !!flag;
}
@@ -162,9 +144,14 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
void read_msi_msg(unsigned int irq, struct msi_msg *msg)
{
struct msi_desc *entry = get_irq_msi(irq);
- switch(entry->msi_attrib.type) {
- case PCI_CAP_ID_MSI:
- {
+ if (entry->msi_attrib.is_msix) {
+ void __iomem *base = entry->mask_base +
+ entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+
+ msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+ msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+ msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
+ } else {
struct pci_dev *dev = entry->dev;
int pos = entry->msi_attrib.pos;
u16 data;
@@ -180,30 +167,23 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
pci_read_config_word(dev, msi_data_reg(pos, 0), &data);
}
msg->data = data;
- break;
- }
- case PCI_CAP_ID_MSIX:
- {
- void __iomem *base;
- base = entry->mask_base +
- entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
-
- msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
- msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
- msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
- break;
- }
- default:
- BUG();
}
}
void write_msi_msg(unsigned int irq, struct msi_msg *msg)
{
struct msi_desc *entry = get_irq_msi(irq);
- switch (entry->msi_attrib.type) {
- case PCI_CAP_ID_MSI:
- {
+ if (entry->msi_attrib.is_msix) {
+ void __iomem *base;
+ base = entry->mask_base +
+ entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+
+ writel(msg->address_lo,
+ base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+ writel(msg->address_hi,
+ base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+ writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
+ } else {
struct pci_dev *dev = entry->dev;
int pos = entry->msi_attrib.pos;
@@ -218,23 +198,6 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
pci_write_config_word(dev, msi_data_reg(pos, 0),
msg->data);
}
- break;
- }
- case PCI_CAP_ID_MSIX:
- {
- void __iomem *base;
- base = entry->mask_base +
- entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
-
- writel(msg->address_lo,
- base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
- writel(msg->address_hi,
- base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
- writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
- break;
- }
- default:
- BUG();
}
entry->msg = *msg;
}
@@ -359,7 +322,7 @@ static int msi_capability_init(struct pci_dev *dev)
if (!entry)
return -ENOMEM;
- entry->msi_attrib.type = PCI_CAP_ID_MSI;
+ entry->msi_attrib.is_msix = 0;
entry->msi_attrib.is_64 = is_64bit_address(control);
entry->msi_attrib.entry_nr = 0;
entry->msi_attrib.maskbit = is_mask_bit_support(control);
@@ -446,7 +409,7 @@ static int msix_capability_init(struct pci_dev *dev,
break;
j = entries[i].entry;
- entry->msi_attrib.type = PCI_CAP_ID_MSIX;
+ entry->msi_attrib.is_msix = 1;
entry->msi_attrib.is_64 = 1;
entry->msi_attrib.entry_nr = j;
entry->msi_attrib.maskbit = 1;
@@ -589,12 +552,13 @@ void pci_msi_shutdown(struct pci_dev* dev)
u32 mask = entry->msi_attrib.maskbits_mask;
msi_set_mask_bits(dev->irq, mask, ~mask);
}
- if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
+ if (!entry->dev || entry->msi_attrib.is_msix)
return;
/* Restore dev->irq to its default pin-assertion irq */
dev->irq = entry->msi_attrib.default_irq;
}
+
void pci_disable_msi(struct pci_dev* dev)
{
struct msi_desc *entry;
@@ -605,7 +569,7 @@ void pci_disable_msi(struct pci_dev* dev)
pci_msi_shutdown(dev);
entry = list_entry(dev->msi_list.next, struct msi_desc, list);
- if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
+ if (!entry->dev || entry->msi_attrib.is_msix)
return;
msi_free_irqs(dev);
@@ -624,7 +588,7 @@ static int msi_free_irqs(struct pci_dev* dev)
arch_teardown_msi_irqs(dev);
list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) {
- if (entry->msi_attrib.type == PCI_CAP_ID_MSIX) {
+ if (entry->msi_attrib.is_msix) {
writel(1, entry->mask_base + entry->msi_attrib.entry_nr
* PCI_MSIX_ENTRY_SIZE
+ PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 8f29392..13a8a1b 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -17,13 +17,13 @@ extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
struct msi_desc {
struct {
- __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */
+ __u8 is_msix : 1;
__u8 maskbit : 1; /* mask-pending bit supported ? */
__u8 masked : 1;
__u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */
__u8 pos; /* Location of the msi capability */
- __u32 maskbits_mask; /* mask bits mask */
__u16 entry_nr; /* specific enabled entry */
+ __u32 maskbits_mask; /* mask bits mask */
unsigned default_irq; /* default pre-assigned irq */
}msi_attrib;
--
1.5.5.4
Implement the arch_setup_msi_irqs() interface. Extend create_irq()
into create_irq_block() and reimplement create_irq as a wrapper around
it. Create assign_irq_vector_block() based closely on
assign_irq_vector(). Teach set_msi_irq_affinity() how to handle
multiple MSIs.
Signed-off-by: Matthew Wilcox <[email protected]>
---
arch/x86/kernel/io_apic_64.c | 237 ++++++++++++++++++++++++++++++++++++------
1 files changed, 205 insertions(+), 32 deletions(-)
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8df..6a00dca 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -61,7 +61,7 @@ struct irq_cfg {
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
+static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
[0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
[1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
[2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
@@ -683,6 +683,8 @@ static int pin_2_irq(int idx, int apic, int pin)
return irq;
}
+static int current_vector = FIRST_DEVICE_VECTOR;
+
static int __assign_irq_vector(int irq, cpumask_t mask)
{
/*
@@ -696,7 +698,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+ static int current_offset = 0;
unsigned int old_vector;
int cpu;
struct irq_cfg *cfg;
@@ -769,6 +771,97 @@ static int assign_irq_vector(int irq, cpumask_t mask)
return err;
}
+static int __assign_irq_vector_block(int irq, int count, cpumask_t mask)
+{
+ unsigned int old_vector;
+ int i, cpu;
+ struct irq_cfg *cfg;
+
+ /*
+ * We've got to be careful not to trash gate 0x80,
+ * because int 0x80 is hm, kind of importantish. ;)
+ */
+ BUG_ON((unsigned)irq + count > NR_IRQS);
+
+ /* Only try and allocate irqs on cpus that are present */
+ cpus_and(mask, mask, cpu_online_map);
+
+ for (i = 0; i < count; i++) {
+ cfg = &irq_cfg[irq + i];
+ if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+ return -EBUSY;
+ }
+
+ cfg = &irq_cfg[irq];
+ old_vector = cfg->vector;
+ if (old_vector) {
+ cpumask_t tmp;
+ cpus_and(tmp, cfg->domain, mask);
+ if (!cpus_empty(tmp))
+ return 0;
+ }
+
+ for_each_cpu_mask(cpu, mask) {
+ cpumask_t domain, new_mask;
+ int new_cpu;
+ int vector;
+
+ domain = vector_allocation_domain(cpu);
+ cpus_and(new_mask, domain, cpu_online_map);
+
+ vector = current_vector & ~(count - 1);
+ next:
+ vector += count;
+ if (vector + count >= FIRST_SYSTEM_VECTOR) {
+ vector = FIRST_DEVICE_VECTOR & ~(count - 1);
+ if (vector < FIRST_DEVICE_VECTOR)
+ vector += count;
+ }
+ if (unlikely(vector == (current_vector & ~(count - 1))))
+ continue;
+ if ((IA32_SYSCALL_VECTOR >= vector) &&
+ (IA32_SYSCALL_VECTOR < vector + count))
+ goto next;
+ for_each_cpu_mask(new_cpu, new_mask) {
+ for (i = 0; i < count; i++) {
+ if (per_cpu(vector_irq, new_cpu)[vector + i]
+ != -1)
+ goto next;
+ }
+ }
+ /* Found one! */
+ current_vector = vector + count - 1;
+ for (i = 0; i < count; i++) {
+ cfg = &irq_cfg[irq + i];
+ if (old_vector) {
+ cfg->move_in_progress = 1;
+ cfg->old_domain = cfg->domain;
+ }
+ for_each_cpu_mask(new_cpu, new_mask) {
+ per_cpu(vector_irq, new_cpu)[vector + i] =
+ irq + i;
+ }
+ cfg->vector = vector;
+ cfg->domain = domain;
+ }
+ return 0;
+ }
+ return -ENOSPC;
+}
+
+/* Assumes that count is a power of two and aligns to that power of two */
+static int assign_irq_vector_block(int irq, int count, cpumask_t mask)
+{
+ int result;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vector_lock, flags);
+ result = __assign_irq_vector_block(irq, count, mask);
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ return result;
+}
+
static void __clear_irq_vector(int irq)
{
struct irq_cfg *cfg;
@@ -788,6 +881,14 @@ static void __clear_irq_vector(int irq)
cpus_clear(cfg->domain);
}
+static void __clear_irq_vector_block(int irq, int count)
+{
+ while (count > 0) {
+ count--;
+ __clear_irq_vector(irq + count);
+ }
+}
+
void __setup_vector_irq(int cpu)
{
/* Initialize vector_irq on a new cpu */
@@ -1895,30 +1996,56 @@ device_initcall(ioapic_init_sysfs);
/*
* Dynamic irq allocate and deallocation
*/
-int create_irq(void)
+
+/*
+ * On success, returns the interrupt number of the lowest numbered irq
+ * in the block. If it can't find a block of the right size, it returns
+ * -1 - (length of the longest run).
+ */
+static int create_irq_block(int count)
{
- /* Allocate an unused irq */
- int irq;
- int new;
+ /* Allocate 'count' consecutive unused irqs */
+ int i, new, longest;
unsigned long flags;
- irq = -ENOSPC;
+ longest = 0;
spin_lock_irqsave(&vector_lock, flags);
for (new = (NR_IRQS - 1); new >= 0; new--) {
if (platform_legacy_irq(new))
- continue;
+ goto clear;
if (irq_cfg[new].vector != 0)
+ goto clear;
+ longest++;
+ if (longest < count)
continue;
- if (__assign_irq_vector(new, TARGET_CPUS) == 0)
- irq = new;
+
+ while (__assign_irq_vector_block(new, longest, TARGET_CPUS))
+ longest /= 2;
+ if (longest < count)
+ __clear_irq_vector_block(new, longest);
break;
+ clear:
+ __clear_irq_vector_block(new + 1, longest);
+ longest = 0;
}
spin_unlock_irqrestore(&vector_lock, flags);
- if (irq >= 0) {
- dynamic_irq_init(irq);
+ if (longest < count)
+ return -1 - longest;
+
+ for (i = 0; i < count; i++) {
+ dynamic_irq_init(new + i);
}
- return irq;
+
+ return new;
+}
+
+int create_irq(void)
+{
+ int ret = create_irq_block(1);
+ if (ret < 0)
+ return -ENOSPC;
+ return ret;
}
void destroy_irq(unsigned int irq)
@@ -1936,7 +2063,8 @@ void destroy_irq(unsigned int irq)
* MSI message composition
*/
#ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
+ unsigned int count, struct msi_msg *msg)
{
struct irq_cfg *cfg = irq_cfg + irq;
int err;
@@ -1944,7 +2072,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
cpumask_t tmp;
tmp = TARGET_CPUS;
- err = assign_irq_vector(irq, tmp);
+ if (count == 1)
+ err = assign_irq_vector(irq, tmp);
+ else
+ err = assign_irq_vector_block(irq, count, tmp);
if (!err) {
cpus_and(tmp, cfg->domain, tmp);
dest = cpu_mask_to_apicid(tmp);
@@ -1975,6 +2106,8 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
{
struct irq_cfg *cfg = irq_cfg + irq;
+ struct msi_desc *desc = get_irq_msi(irq);
+ int i, count = 1 << desc->msi_attrib.multiple;
struct msi_msg msg;
unsigned int dest;
cpumask_t tmp;
@@ -1983,8 +2116,15 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
if (cpus_empty(tmp))
return;
- if (assign_irq_vector(irq, mask))
- return;
+ if (count > 1) {
+ /* Multiple MSIs all go to the same destination */
+ irq = desc->irq;
+ if (assign_irq_vector_block(irq, count, mask))
+ return;
+ } else {
+ if (assign_irq_vector(irq, mask))
+ return;
+ }
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
@@ -1997,7 +2137,9 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
write_msi_msg(irq, &msg);
- irq_desc[irq].affinity = mask;
+
+ for (i = 0; i < count; i++)
+ irq_desc[irq + i].affinity = mask;
}
#endif /* CONFIG_SMP */
@@ -2016,28 +2158,59 @@ static struct irq_chip msi_chip = {
.retrigger = ioapic_retrigger_irq,
};
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+static int x86_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc, int count)
{
struct msi_msg msg;
- int irq, ret;
- irq = create_irq();
- if (irq < 0)
- return irq;
-
- ret = msi_compose_msg(dev, irq, &msg);
- if (ret < 0) {
- destroy_irq(irq);
- return ret;
+ int i, ret, base_irq, alloc;
+
+ /* MSI can only allocate a power-of-two */
+ alloc = roundup_pow_of_two(count);
+
+ base_irq = create_irq_block(alloc);
+ if (base_irq < 0) {
+ if (alloc == 1)
+ return -ENOSPC;
+ return rounddown_pow_of_two(-base_irq - 1);
}
- set_irq_msi(irq, desc);
- write_msi_msg(irq, &msg);
+ ret = msi_compose_msg(pdev, base_irq, alloc, &msg);
+ if (ret)
+ return ret;
+
+ desc->msi_attrib.multiple = order_base_2(alloc);
- set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+ /* Do loop in reverse so set_irq_msi ends up setting
+ * desc->irq to base_irq
+ */
+ for (i = count - 1; i >= 0; i--) {
+ set_irq_msi(base_irq + i, desc);
+ set_irq_chip_and_handler_name(base_irq + i, &msi_chip,
+ handle_edge_irq, "edge");
+ }
+ write_msi_msg(base_irq, &msg);
return 0;
}
+int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+{
+	struct msi_desc *desc;
+	int ret = 0;
+
+	if (type == PCI_CAP_ID_MSI) {
+		desc = list_first_entry(&pdev->msi_list, struct msi_desc, list);
+		ret = x86_setup_msi_irq(pdev, desc, nvec);
+	} else {
+		list_for_each_entry(desc, &pdev->msi_list, list) {
+			ret = x86_setup_msi_irq(pdev, desc, 1);
+			if (ret)
+				break;
+		}
+	}
+
+	return ret;
+}
+
void arch_teardown_msi_irq(unsigned int irq)
{
destroy_irq(irq);
@@ -2090,7 +2263,7 @@ int arch_setup_dmar_msi(unsigned int irq)
int ret;
struct msi_msg msg;
- ret = msi_compose_msg(NULL, irq, &msg);
+ ret = msi_compose_msg(NULL, irq, 1, &msg);
if (ret < 0)
return ret;
dmar_msi_write(irq, &msg);
--
1.5.5.4
I didn't find the previous version very useful, so I rewrote it.
Also move it to the PCI subdirectory of Documentation.
Signed-off-by: Matthew Wilcox <[email protected]>
---
Documentation/MSI-HOWTO.txt | 511 ---------------------------------------
Documentation/PCI/MSI-HOWTO.txt | 318 ++++++++++++++++++++++++
2 files changed, 318 insertions(+), 511 deletions(-)
delete mode 100644 Documentation/MSI-HOWTO.txt
create mode 100644 Documentation/PCI/MSI-HOWTO.txt
diff --git a/Documentation/MSI-HOWTO.txt b/Documentation/MSI-HOWTO.txt
deleted file mode 100644
index a51f693..0000000
--- a/Documentation/MSI-HOWTO.txt
+++ /dev/null
@@ -1,511 +0,0 @@
- The MSI Driver Guide HOWTO
- Tom L Nguyen [email protected]
- 10/03/2003
- Revised Feb 12, 2004 by Martine Silbermann
- email: [email protected]
- Revised Jun 25, 2004 by Tom L Nguyen
-
-1. About this guide
-
-This guide describes the basics of Message Signaled Interrupts (MSI),
-the advantages of using MSI over traditional interrupt mechanisms,
-and how to enable your driver to use MSI or MSI-X. Also included is
-a Frequently Asked Questions (FAQ) section.
-
-1.1 Terminology
-
-PCI devices can be single-function or multi-function. In either case,
-when this text talks about enabling or disabling MSI on a "device
-function," it is referring to one specific PCI device and function and
-not to all functions on a PCI device (unless the PCI device has only
-one function).
-
-2. Copyright 2003 Intel Corporation
-
-3. What is MSI/MSI-X?
-
-Message Signaled Interrupt (MSI), as described in the PCI Local Bus
-Specification Revision 2.3 or later, is an optional feature, and a
-required feature for PCI Express devices. MSI enables a device function
-to request service by sending an Inbound Memory Write on its PCI bus to
-the FSB as a Message Signal Interrupt transaction. Because MSI is
-generated in the form of a Memory Write, all transaction conditions,
-such as a Retry, Master-Abort, Target-Abort or normal completion, are
-supported.
-
-A PCI device that supports MSI must also support pin IRQ assertion
-interrupt mechanism to provide backward compatibility for systems that
-do not support MSI. In systems which support MSI, the bus driver is
-responsible for initializing the message address and message data of
-the device function's MSI/MSI-X capability structure during device
-initial configuration.
-
-An MSI capable device function indicates MSI support by implementing
-the MSI/MSI-X capability structure in its PCI capability list. The
-device function may implement both the MSI capability structure and
-the MSI-X capability structure; however, the bus driver should not
-enable both.
-
-The MSI capability structure contains Message Control register,
-Message Address register and Message Data register. These registers
-provide the bus driver control over MSI. The Message Control register
-indicates the MSI capability supported by the device. The Message
-Address register specifies the target address and the Message Data
-register specifies the characteristics of the message. To request
-service, the device function writes the content of the Message Data
-register to the target address. The device and its software driver
-are prohibited from writing to these registers.
-
-The MSI-X capability structure is an optional extension to MSI. It
-uses an independent and separate capability structure. There are
-some key advantages to implementing the MSI-X capability structure
-over the MSI capability structure as described below.
-
- - Support a larger maximum number of vectors per function.
-
- - Provide the ability for system software to configure
- each vector with an independent message address and message
- data, specified by a table that resides in Memory Space.
-
- - MSI and MSI-X both support per-vector masking. Per-vector
- masking is an optional extension of MSI but a required
- feature for MSI-X. Per-vector masking provides the kernel the
- ability to mask/unmask a single MSI while running its
- interrupt service routine. If per-vector masking is
- not supported, then the device driver should provide the
- hardware/software synchronization to ensure that the device
- generates MSI when the driver wants it to do so.
-
-4. Why use MSI?
-
-As a benefit to the simplification of board design, MSI allows board
-designers to remove out-of-band interrupt routing. MSI is another
-step towards a legacy-free environment.
-
-Due to increasing pressure on chipset and processor packages to
-reduce pin count, the need for interrupt pins is expected to
-diminish over time. Devices, due to pin constraints, may implement
-messages to increase performance.
-
-PCI Express endpoints uses INTx emulation (in-band messages) instead
-of IRQ pin assertion. Using INTx emulation requires interrupt
-sharing among devices connected to the same node (PCI bridge) while
-MSI is unique (non-shared) and does not require BIOS configuration
-support. As a result, the PCI Express technology requires MSI
-support for better interrupt performance.
-
-Using MSI enables the device functions to support two or more
-vectors, which can be configured to target different CPUs to
-increase scalability.
-
-5. Configuring a driver to use MSI/MSI-X
-
-By default, the kernel will not enable MSI/MSI-X on all devices that
-support this capability. The CONFIG_PCI_MSI kernel option
-must be selected to enable MSI/MSI-X support.
-
-5.1 Including MSI/MSI-X support into the kernel
-
-To allow MSI/MSI-X capable device drivers to selectively enable
-MSI/MSI-X (using pci_enable_msi()/pci_enable_msix() as described
-below), the VECTOR based scheme needs to be enabled by setting
-CONFIG_PCI_MSI during kernel config.
-
-Since the target of the inbound message is the local APIC, providing
-CONFIG_X86_LOCAL_APIC must be enabled as well as CONFIG_PCI_MSI.
-
-5.2 Configuring for MSI support
-
-Due to the non-contiguous fashion in vector assignment of the
-existing Linux kernel, this version does not support multiple
-messages regardless of a device function is capable of supporting
-more than one vector. To enable MSI on a device function's MSI
-capability structure requires a device driver to call the function
-pci_enable_msi() explicitly.
-
-5.2.1 API pci_enable_msi
-
-int pci_enable_msi(struct pci_dev *dev)
-
-With this new API, a device driver that wants to have MSI
-enabled on its device function must call this API to enable MSI.
-A successful call will initialize the MSI capability structure
-with ONE vector, regardless of whether a device function is
-capable of supporting multiple messages. This vector replaces the
-pre-assigned dev->irq with a new MSI vector. To avoid a conflict
-of the new assigned vector with existing pre-assigned vector requires
-a device driver to call this API before calling request_irq().
-
-5.2.2 API pci_disable_msi
-
-void pci_disable_msi(struct pci_dev *dev)
-
-This API should always be used to undo the effect of pci_enable_msi()
-when a device driver is unloading. This API restores dev->irq with
-the pre-assigned IOAPIC vector and switches a device's interrupt
-mode to PCI pin-irq assertion/INTx emulation mode.
-
-Note that a device driver should always call free_irq() on the MSI vector
-that it has done request_irq() on before calling this API. Failure to do
-so results in a BUG_ON() and a device will be left with MSI enabled and
-leaks its vector.
-
-5.2.3 MSI mode vs. legacy mode diagram
-
-The below diagram shows the events which switch the interrupt
-mode on the MSI-capable device function between MSI mode and
-PIN-IRQ assertion mode.
-
- ------------ pci_enable_msi ------------------------
- | | <=============== | |
- | MSI MODE | | PIN-IRQ ASSERTION MODE |
- | | ===============> | |
- ------------ pci_disable_msi ------------------------
-
-
-Figure 1. MSI Mode vs. Legacy Mode
-
-In Figure 1, a device operates by default in legacy mode. Legacy
-in this context means PCI pin-irq assertion or PCI-Express INTx
-emulation. A successful MSI request (using pci_enable_msi()) switches
-a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector
-stored in dev->irq will be saved by the PCI subsystem and a new
-assigned MSI vector will replace dev->irq.
-
-To return back to its default mode, a device driver should always call
-pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a
-device driver should always call free_irq() on the MSI vector it has
-done request_irq() on before calling pci_disable_msi(). Failure to do
-so results in a BUG_ON() and a device will be left with MSI enabled and
-leaks its vector. Otherwise, the PCI subsystem restores a device's
-dev->irq with a pre-assigned IOAPIC vector and marks the released
-MSI vector as unused.
-
-Once being marked as unused, there is no guarantee that the PCI
-subsystem will reserve this MSI vector for a device. Depending on
-the availability of current PCI vector resources and the number of
-MSI/MSI-X requests from other drivers, this MSI may be re-assigned.
-
-For the case where the PCI subsystem re-assigns this MSI vector to
-another driver, a request to switch back to MSI mode may result
-in being assigned a different MSI vector or a failure if no more
-vectors are available.
-
-5.3 Configuring for MSI-X support
-
-Due to the ability of the system software to configure each vector of
-the MSI-X capability structure with an independent message address
-and message data, the non-contiguous fashion in vector assignment of
-the existing Linux kernel has no impact on supporting multiple
-messages on an MSI-X capable device functions. To enable MSI-X on
-a device function's MSI-X capability structure requires its device
-driver to call the function pci_enable_msix() explicitly.
-
-The function pci_enable_msix(), once invoked, enables either
-all or nothing, depending on the current availability of PCI vector
-resources. If the PCI vector resources are available for the number
-of vectors requested by a device driver, this function will configure
-the MSI-X table of the MSI-X capability structure of a device with
-requested messages. To emphasize this reason, for example, a device
-may be capable for supporting the maximum of 32 vectors while its
-software driver usually may request 4 vectors. It is recommended
-that the device driver should call this function once during the
-initialization phase of the device driver.
-
-Unlike the function pci_enable_msi(), the function pci_enable_msix()
-does not replace the pre-assigned IOAPIC dev->irq with a new MSI
-vector because the PCI subsystem writes the 1:1 vector-to-entry mapping
-into the field vector of each element contained in a second argument.
-Note that the pre-assigned IOAPIC dev->irq is valid only if the device
-operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt at
-using dev->irq by the device driver to request for interrupt service
-may result in unpredictable behavior.
-
-For each MSI-X vector granted, a device driver is responsible for calling
-other functions like request_irq(), enable_irq(), etc. to enable
-this vector with its corresponding interrupt service handler. It is
-a device driver's choice to assign all vectors with the same
-interrupt service handler or each vector with a unique interrupt
-service handler.
-
-5.3.1 Handling MMIO address space of MSI-X Table
-
-The PCI 3.0 specification has implementation notes that MMIO address
-space for a device's MSI-X structure should be isolated so that the
-software system can set different pages for controlling accesses to the
-MSI-X structure. The implementation of MSI support requires the PCI
-subsystem, not a device driver, to maintain full control of the MSI-X
-table/MSI-X PBA (Pending Bit Array) and MMIO address space of the MSI-X
-table/MSI-X PBA. A device driver is prohibited from requesting the MMIO
-address space of the MSI-X table/MSI-X PBA. Otherwise, the PCI subsystem
-will fail enabling MSI-X on its hardware device when it calls the function
-pci_enable_msix().
-
-5.3.2 API pci_enable_msix
-
-int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
-
-This API enables a device driver to request the PCI subsystem
-to enable MSI-X messages on its hardware device. Depending on
-the availability of PCI vectors resources, the PCI subsystem enables
-either all or none of the requested vectors.
-
-Argument 'dev' points to the device (pci_dev) structure.
-
-Argument 'entries' is a pointer to an array of msix_entry structs.
-The number of entries is indicated in argument 'nvec'.
-struct msix_entry is defined in /driver/pci/msi.h:
-
-struct msix_entry {
- u16 vector; /* kernel uses to write alloc vector */
- u16 entry; /* driver uses to specify entry */
-};
-
-A device driver is responsible for initializing the field 'entry' of
-each element with a unique entry supported by MSI-X table. Otherwise,
--EINVAL will be returned as a result. A successful return of zero
-indicates the PCI subsystem completed initializing each of the requested
-entries of the MSI-X table with message address and message data.
-Last but not least, the PCI subsystem will write the 1:1
-vector-to-entry mapping into the field 'vector' of each element. A
-device driver is responsible for keeping track of allocated MSI-X
-vectors in its internal data structure.
-
-A return of zero indicates that the number of MSI-X vectors was
-successfully allocated. A return of greater than zero indicates
-MSI-X vector shortage. Or a return of less than zero indicates
-a failure. This failure may be a result of duplicate entries
-specified in second argument, or a result of no available vector,
-or a result of failing to initialize MSI-X table entries.
-
-5.3.3 API pci_disable_msix
-
-void pci_disable_msix(struct pci_dev *dev)
-
-This API should always be used to undo the effect of pci_enable_msix()
-when a device driver is unloading. Note that a device driver should
-always call free_irq() on all MSI-X vectors it has done request_irq()
-on before calling this API. Failure to do so results in a BUG_ON() and
-a device will be left with MSI-X enabled and leaks its vectors.
-
-5.3.4 MSI-X mode vs. legacy mode diagram
-
-The below diagram shows the events which switch the interrupt
-mode on the MSI-X capable device function between MSI-X mode and
-PIN-IRQ assertion mode (legacy).
-
- ------------ pci_enable_msix(,,n) ------------------------
- | | <=============== | |
- | MSI-X MODE | | PIN-IRQ ASSERTION MODE |
- | | ===============> | |
- ------------ pci_disable_msix ------------------------
-
-Figure 2. MSI-X Mode vs. Legacy Mode
-
-In Figure 2, a device operates by default in legacy mode. A
-successful MSI-X request (using pci_enable_msix()) switches a
-device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector
-stored in dev->irq will be saved by the PCI subsystem; however,
-unlike MSI mode, the PCI subsystem will not replace dev->irq with
-assigned MSI-X vector because the PCI subsystem already writes the 1:1
-vector-to-entry mapping into the field 'vector' of each element
-specified in second argument.
-
-To return back to its default mode, a device driver should always call
-pci_disable_msix() to undo the effect of pci_enable_msix(). Note that
-a device driver should always call free_irq() on all MSI-X vectors it
-has done request_irq() on before calling pci_disable_msix(). Failure
-to do so results in a BUG_ON() and a device will be left with MSI-X
-enabled and leaks its vectors. Otherwise, the PCI subsystem switches a
-device function's interrupt mode from MSI-X mode to legacy mode and
-marks all allocated MSI-X vectors as unused.
-
-Once being marked as unused, there is no guarantee that the PCI
-subsystem will reserve these MSI-X vectors for a device. Depending on
-the availability of current PCI vector resources and the number of
-MSI/MSI-X requests from other drivers, these MSI-X vectors may be
-re-assigned.
-
-For the case where the PCI subsystem re-assigned these MSI-X vectors
-to other drivers, a request to switch back to MSI-X mode may result
-being assigned with another set of MSI-X vectors or a failure if no
-more vectors are available.
-
-5.4 Handling function implementing both MSI and MSI-X capabilities
-
-For the case where a function implements both MSI and MSI-X
-capabilities, the PCI subsystem enables a device to run either in MSI
-mode or MSI-X mode but not both. A device driver determines whether it
-wants MSI or MSI-X enabled on its hardware device. Once a device
-driver requests for MSI, for example, it is prohibited from requesting
-MSI-X; in other words, a device driver is not permitted to ping-pong
-between MSI mod MSI-X mode during a run-time.
-
-5.5 Hardware requirements for MSI/MSI-X support
-
-MSI/MSI-X support requires support from both system hardware and
-individual hardware device functions.
-
-5.5.1 Required x86 hardware support
-
-Since the target of MSI address is the local APIC CPU, enabling
-MSI/MSI-X support in the Linux kernel is dependent on whether existing
-system hardware supports local APIC. Users should verify that their
-system supports local APIC operation by testing that it runs when
-CONFIG_X86_LOCAL_APIC=y.
-
-In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set;
-however, in UP environment, users must manually set
-CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting
-CONFIG_PCI_MSI enables the VECTOR based scheme and the option for
-MSI-capable device drivers to selectively enable MSI/MSI-X.
-
-Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X
-vector is allocated new during runtime and MSI/MSI-X support does not
-depend on BIOS support. This key independency enables MSI/MSI-X
-support on future IOxAPIC free platforms.
-
-5.5.2 Device hardware support
-
-The hardware device function supports MSI by indicating the
-MSI/MSI-X capability structure on its PCI capability list. By
-default, this capability structure will not be initialized by
-the kernel to enable MSI during the system boot. In other words,
-the device function is running on its default pin assertion mode.
-Note that in many cases the hardware supporting MSI have bugs,
-which may result in system hangs. The software driver of specific
-MSI-capable hardware is responsible for deciding whether to call
-pci_enable_msi or not. A return of zero indicates the kernel
-successfully initialized the MSI/MSI-X capability structure of the
-device function. The device function is now running on MSI/MSI-X mode.
-
-5.6 How to tell whether MSI/MSI-X is enabled on device function
-
-At the driver level, a return of zero from the function call of
-pci_enable_msi()/pci_enable_msix() indicates to a device driver that
-its device function is initialized successfully and ready to run in
-MSI/MSI-X mode.
-
-At the user level, users can use the command 'cat /proc/interrupts'
-to display the vectors allocated for devices and their interrupt
-MSI/MSI-X modes ("PCI-MSI"/"PCI-MSI-X"). Below shows MSI mode is
-enabled on a SCSI Adaptec 39320D Ultra320 controller.
-
- CPU0 CPU1
- 0: 324639 0 IO-APIC-edge timer
- 1: 1186 0 IO-APIC-edge i8042
- 2: 0 0 XT-PIC cascade
- 12: 2797 0 IO-APIC-edge i8042
- 14: 6543 0 IO-APIC-edge ide0
- 15: 1 0 IO-APIC-edge ide1
-169: 0 0 IO-APIC-level uhci-hcd
-185: 0 0 IO-APIC-level uhci-hcd
-193: 138 10 PCI-MSI aic79xx
-201: 30 0 PCI-MSI aic79xx
-225: 30 0 IO-APIC-level aic7xxx
-233: 30 0 IO-APIC-level aic7xxx
-NMI: 0 0
-LOC: 324553 325068
-ERR: 0
-MIS: 0
-
-6. MSI quirks
-
-Several PCI chipsets or devices are known to not support MSI.
-The PCI stack provides 3 possible levels of MSI disabling:
-* on a single device
-* on all devices behind a specific bridge
-* globally
-
-6.1. Disabling MSI on a single device
-
-Under some circumstances it might be required to disable MSI on a
-single device. This may be achieved by either not calling pci_enable_msi()
-or all, or setting the pci_dev->no_msi flag before (most of the time
-in a quirk).
-
-6.2. Disabling MSI below a bridge
-
-The vast majority of MSI quirks are required by PCI bridges not
-being able to route MSI between busses. In this case, MSI have to be
-disabled on all devices behind this bridge. It is achieves by setting
-the PCI_BUS_FLAGS_NO_MSI flag in the pci_bus->bus_flags of the bridge
-subordinate bus. There is no need to set the same flag on bridges that
-are below the broken bridge. When pci_enable_msi() is called to enable
-MSI on a device, pci_msi_supported() takes care of checking the NO_MSI
-flag in all parent busses of the device.
-
-Some bridges actually support dynamic MSI support enabling/disabling
-by changing some bits in their PCI configuration space (especially
-the Hypertransport chipsets such as the nVidia nForce and Serverworks
-HT2000). It may then be required to update the NO_MSI flag on the
-corresponding devices in the sysfs hierarchy. To enable MSI support
-on device "0000:00:0e", do:
-
- echo 1 > /sys/bus/pci/devices/0000:00:0e/msi_bus
-
-To disable MSI support, echo 0 instead of 1. Note that it should be
-used with caution since changing this value might break interrupts.
-
-6.3. Disabling MSI globally
-
-Some extreme cases may require to disable MSI globally on the system.
-For now, the only known case is a Serverworks PCI-X chipsets (MSI are
-not supported on several busses that are not all connected to the
-chipset in the Linux PCI hierarchy). In the vast majority of other
-cases, disabling only behind a specific bridge is enough.
-
-For debugging purpose, the user may also pass pci=nomsi on the kernel
-command-line to explicitly disable MSI globally. But, once the appro-
-priate quirks are added to the kernel, this option should not be
-required anymore.
-
-6.4. Finding why MSI cannot be enabled on a device
-
-Assuming that MSI are not enabled on a device, you should look at
-dmesg to find messages that quirks may output when disabling MSI
-on some devices, some bridges or even globally.
-Then, lspci -t gives the list of bridges above a device. Reading
-/sys/bus/pci/devices/0000:00:0e/msi_bus will tell you whether MSI
-are enabled (1) or disabled (0). In 0 is found in a single bridge
-msi_bus file above the device, MSI cannot be enabled.
-
-7. FAQ
-
-Q1. Are there any limitations on using the MSI?
-
-A1. If the PCI device supports MSI and conforms to the
-specification and the platform supports the APIC local bus,
-then using MSI should work.
-
-Q2. Will it work on all the Pentium processors (P3, P4, Xeon,
-AMD processors)? In P3 IPI's are transmitted on the APIC local
-bus and in P4 and Xeon they are transmitted on the system
-bus. Are there any implications with this?
-
-A2. MSI support enables a PCI device sending an inbound
-memory write (0xfeexxxxx as target address) on its PCI bus
-directly to the FSB. Since the message address has a
-redirection hint bit cleared, it should work.
-
-Q3. The target address 0xfeexxxxx will be translated by the
-Host Bridge into an interrupt message. Are there any
-limitations on the chipsets such as Intel 8xx, Intel e7xxx,
-or VIA?
-
-A3. If these chipsets support an inbound memory write with
-target address set as 0xfeexxxxx, as conformed to PCI
-specification 2.3 or latest, then it should work.
-
-Q4. From the driver point of view, if the MSI is lost because
-of errors occurring during inbound memory write, then it may
-wait forever. Is there a mechanism for it to recover?
-
-A4. Since the target of the transaction is an inbound memory
-write, all transaction termination conditions (Retry,
-Master-Abort, Target-Abort, or normal completion) are
-supported. A device sending an MSI must abide by all the PCI
-rules and conditions regarding that inbound memory write. So,
-if a retry is signaled it must retry, etc... We believe that
-the recommendation for Abort is also a retry (refer to PCI
-specification 2.3 or latest).
diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
new file mode 100644
index 0000000..6eb9b6a
--- /dev/null
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -0,0 +1,318 @@
+ The MSI Driver Guide HOWTO
+ Tom L Nguyen [email protected]
+ 10/03/2003
+ Revised Feb 12, 2004 by Martine Silbermann
+ email: [email protected]
+ Revised Jun 25, 2004 by Tom L Nguyen
+ Revised Jul 9, 2008 by Matthew Wilcox <[email protected]>
+ Copyright 2003, 2008 Intel Corporation
+
+1. About this guide
+
+This guide describes the basics of Message Signaled Interrupts (MSIs),
+the advantages of using MSI over traditional interrupt mechanisms,
+and how to change your driver to use MSI or MSI-X.
+
+
+2. What are MSIs?
+
+Message Signaled Interrupt (MSI) is an optional feature for devices
+which implement the PCI Local Bus Specification Revision 2.2 and later.
+MSI enables a device to generate an interrupt by sending a normal write
+to a special address in the host chipset that is translated into a CPU
+interrupt. MSI-X (introduced in PCI 3.0) is a more flexible scheme
+than MSI. It allows for greater control over what interrupts can be
+generated and supports a greater number of interrupts.
+
+A device indicates MSI support by implementing the MSI or the MSI-X
+capability in its PCI configuration space. It may implement both the
+MSI capability structure and the MSI-X capability structure, but only
+one may be enabled.
+
+
+3. Why use MSIs?
+
+Pin-based PCI interrupts are often shared amongst several devices.
+To support this, the kernel must call each interrupt handler associated
+with an interrupt which leads to increased latency for the interrupt
+handlers which are registered last.
+
+When a device performs DMA to memory and raises a pin-based interrupt, it
+is possible that the interrupt may arrive before all the data has arrived
+in memory (this becomes more likely with devices behind PCI-PCI bridges).
+In order to assure that all DMA has arrived in memory, the interrupt
+handler must read a register on the device which raised the interrupt.
+PCI ordering rules require that the writes be flushed to memory before
+the value can be returned from the register. MSI avoids this problem
+as the interrupt-generating write cannot pass the DMA writes, so by the
+time the interrupt is raised, the driver knows that the DMA has completed.
+
+Using MSI enables the device to support more interrupts, allowing
+each interrupt to be specialised to a different purpose. This allows
+infrequent conditions (such as errors) to be given their own interrupt and
+not have to check for errors during the normal interrupt handling path.
+
+
+4. How to use MSIs
+
+PCI devices are initialised to use pin-based interrupts. The device
+driver has to set up the device to use MSI or MSI-X. Not all machines
+support MSIs correctly, and for those machines, the APIs described below
+will simply fail and the device will continue to use pin-based interrupts.
+
+4.1 Include kernel support for MSIs
+
+To support MSI or MSI-X, the kernel must be built with the CONFIG_PCI_MSI
+option enabled. This option is only available on some architectures,
+and it may depend on some other options also being set. For example,
+on x86, you must also enable X86_UP_APIC or SMP in order to see the
+CONFIG_PCI_MSI option.
+
+4.2 Using MSI
+
+Most of the hard work is done for the driver in the PCI layer. It simply
+has to request that the PCI layer set up the MSI capability for this
+device.
+
+4.2.1 pci_enable_msi
+
+int pci_enable_msi(struct pci_dev *dev)
+
+A successful call will allocate ONE interrupt to the device, regardless
+of how many MSIs the device supports. The device will be switched from
+pin-based interrupt mode to MSI mode. The dev->irq number is changed
+to a new number which represents the message signaled interrupt.
+This function should be called before the driver calls request_irq()
+since enabling MSIs disables the pin-based IRQ and the driver will not
+receive interrupts on the old interrupt.
+
+4.2.2 pci_enable_msi_block
+
+int pci_enable_msi_block(struct pci_dev *pdev, int count)
+
+This variation allows a device driver to request multiple MSIs. The MSI
+specification only allows interrupts to be allocated in powers of two,
+up to a maximum of 2^5 (32).
+
+If this function returns 0, it has succeeded in allocating as many
+interrupts as the driver requested (it may have allocated more in order
+to satisfy the power-of-two requirement). In this case, the function
+enables MSI on this device and updates pdev->irq to be the lowest of
+the new interrupts assigned to it. The other interrupts assigned to
+the device are in the range pdev->irq to pdev->irq + count - 1.
+
+If this function returns a negative number, it indicates an error and
+the driver should not attempt to request any more MSI interrupts for
+this device. If this function returns a positive number, it will be
+less than 'count' and indicate the number of interrupts that could have
+been allocated. In neither case will the irq value have been
+updated, nor will the device have been switched into MSI mode.
+
+The device driver must decide what action to take if
+pci_enable_msi_block() returns a value less than the number asked for.
+Some devices can make use of fewer interrupts than the maximum they
+request; in this case the driver should call pci_enable_msi_block()
+again. Note that it is not guaranteed to succeed, even when the
+'count' has been reduced to the value returned from a previous call to
+pci_enable_msi_block(). This is because there are multiple constraints
+on the number of vectors that can be allocated; pci_enable_msi_block()
+will return as soon as it finds any constraint that doesn't allow the
+call to succeed.
+
+4.2.3 pci_disable_msi
+
+void pci_disable_msi(struct pci_dev *dev)
+
+This API should be used to undo the effect of pci_enable_msi() or
+pci_enable_msi_block(). This API restores dev->irq to the pin-based
+interrupt number and frees the previously allocated message signaled
+interrupt(s). The interrupt may subsequently be assigned to another
+device, so drivers should not cache the value of pdev->irq.
+
+A device driver must always call free_irq() on the interrupt(s)
+for which it has called request_irq() before calling this function.
+Failure to do so will result in a BUG_ON(), the device will be left with
+MSI enabled and will leak its vector.
+
+4.3 Using MSI-X
+
+The MSI-X capability is much more flexible than the MSI capability.
+It supports up to 2048 interrupts, each of which can be separately
+assigned. To support this flexibility, drivers must use an array of
+`struct msix_entry':
+
+struct msix_entry {
+ u16 vector; /* kernel uses to write alloc vector */
+ u16 entry; /* driver uses to specify entry */
+};
+
+This allows for the device to use these interrupts in a sparse fashion;
+for example it could use interrupts 3 and 1027 and allocate only a
+two-element array. The driver is expected to fill in the 'entry' value
+in each element of the array to indicate the entries for which it wants
+the kernel to assign interrupts. It is invalid to fill in two entries
+with the same number.
+
+4.3.1 pci_enable_msix
+
+int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
+
+Calling this function asks the PCI subsystem to allocate 'nvec' MSIs.
+The 'entries' argument is a pointer to an array of msix_entry structs
+which should be at least 'nvec' entries in size. On success, the
+function will return 0 and the device will have been switched into
+MSI-X interrupt mode. The 'vector' elements in each entry will have
+been filled in with the interrupt number.
+
+If this function returns a negative number, it indicates an error and
+the driver should not attempt to allocate any more MSI-X interrupts for
+this device. If it returns a positive number, it indicates the maximum
+number of interrupt vectors that could have been allocated.
+
+This function, in contrast with pci_enable_msi(), does not adjust
+pdev->irq. The device will not generate interrupts for this interrupt
+number once MSI-X is enabled. The device driver is responsible for
+keeping track of the interrupts assigned to the MSI-X vectors so it can
+free them again later.
+
+Device drivers should normally call this function once per device
+during the initialization phase.
+
+4.3.2 pci_disable_msix
+
+void pci_disable_msix(struct pci_dev *dev)
+
+This API should be used to undo the effect of pci_enable_msix(). It frees
+the previously allocated message signaled interrupts. The interrupts may
+subsequently be assigned to another device, so drivers should not cache
+the value of the 'vector' elements over a call to pci_disable_msix().
+
+A device driver must always call free_irq() on the interrupt(s)
+for which it has called request_irq() before calling this function.
+Failure to do so will result in a BUG_ON(), the device will be left with
+MSI-X enabled and will leak its vectors.
+
+4.3.3 The MSI-X Table
+
+The MSI-X capability specifies a BAR and offset within that BAR for the
+MSI-X Table. This address is mapped by the PCI subsystem, and should be
+considered to be off limits to the device driver. If the driver wishes to
+mask or unmask an interrupt, it should call disable_irq() / enable_irq().
+
+4.4 Handling devices implementing both MSI and MSI-X capabilities
+
+If a device implements both MSI and MSI-X capabilities, it can
+run in either MSI mode or MSI-X mode but not both simultaneously.
+This is a requirement of the PCI spec, and it is enforced by the
+PCI layer. Calling pci_enable_msi() when MSI-X is already enabled or
+pci_enable_msix() when MSI is already enabled will result in an error.
+If a device driver wishes to switch between MSI and MSI-X at runtime,
+it must first quiesce the device, then switch it back to pin-interrupt
+mode, before calling pci_enable_msi() or pci_enable_msix() and resuming
+operation. This is not expected to be a common operation but may be
+useful for debugging or testing during development.
+
+4.5 Considerations when using MSIs
+
+4.5.1 Choosing between MSI-X and MSI
+
+If your device supports both MSI-X and MSI capabilities, you should use
+the MSI-X facilities in preference to the MSI facilities. As mentioned
+above, MSI-X supports any number of interrupts between 1 and 2048.
+In contrast, MSI is restricted to a maximum of 32 interrupts (and
+must be a power of two). In addition, the MSI interrupt vectors must
+be allocated consecutively, so the system may not be able to allocate
+as many vectors for MSI as it could for MSI-X. On some platforms, MSI
+interrupts must all be targeted at the same set of CPUs whereas MSI-X
+interrupts can all be targeted at different CPUs.
+
+4.5.2 Spinlocks
+
+Most device drivers have a per-device spinlock which is taken in the
+interrupt handler. With pin-based interrupts or a single MSI, it is not
+necessary to disable interrupts (Linux guarantees the same interrupt will
+not be re-entered). If a device uses multiple interrupts, the driver
+must disable interrupts while the lock is held. If the device sends
+a different interrupt, the driver will deadlock trying to recursively
+acquire the spinlock.
+
+There are two solutions. The first is to take the
+lock with spin_lock_irqsave() or spin_lock_irq() (see
+Documentation/DocBook/kernel-locking). The second is to specify
+IRQF_DISABLED to request_irq() so that the kernel runs the entire
+interrupt routine with interrupts disabled.
+
+If your MSI interrupt routine does not hold the lock for the whole time
+it is running, the first solution may be best. The second solution is
+normally preferred as it avoids making two transitions from interrupt
+disabled to enabled and back again.
+
+4.6 How to tell whether MSI/MSI-X is enabled on a device
+
+Using lspci -v (as root) will show some devices with "Message Signalled
+Interrupts" and others with "MSI-X". Each of these capabilities have an
+'Enable' flag which will be followed with either "+" (enabled) or "-"
+(disabled).
+
+
+5. MSI quirks
+
+Several PCI chipsets or devices are known not to support MSI.
+The PCI stack provides three possible ways to disable MSIs:
+* on a single device
+* on all devices behind a specific bridge
+* globally
+
+5.1. Disabling MSI on a single device
+
+Under some circumstances it might be required to disable MSI on a single
+device. This may be achieved by either not calling pci_enable_msi()
+for this device, or setting the pci_dev->no_msi flag before (most of
+the time in a quirk).
+
+5.2. Disabling MSI below a bridge
+
+The vast majority of MSI quirks are required by PCI bridges not
+being able to route MSI between busses. In this case, MSI have to be
+disabled on all devices behind this bridge. This is achieved by setting
+the PCI_BUS_FLAGS_NO_MSI flag in the pci_bus->bus_flags of the bridge
+subordinate bus. There is no need to set the same flag on bridges that
+are below the broken bridge. When pci_enable_msi() is called to enable
+MSI on a device, pci_msi_supported() takes care of checking the NO_MSI
+flag in all parent busses of the device.
+
+Some bridges actually support dynamic MSI support enabling/disabling
+by changing some bits in their PCI configuration space (especially
+the Hypertransport chipsets such as the nVidia nForce and Serverworks
+HT2000). It may then be required to update the NO_MSI flag on the
+corresponding devices in the sysfs hierarchy. To enable MSI support
+on device "0000:00:0e", do:
+
+ echo 1 > /sys/bus/pci/devices/0000:00:0e/msi_bus
+
+To disable MSI support, echo 0 instead of 1. It should be
+used with caution since changing this value might break interrupts.
+
+5.3. Disabling MSI globally
+
+Some extreme cases may require to disable MSI globally on the system.
+For now, the only known case is a Serverworks PCI-X chipsets (MSI are
+not supported on several busses that are not all connected to the
+chipset in the Linux PCI hierarchy). In the vast majority of other
+cases, disabling only behind a specific bridge is enough.
+
+For debugging purpose, the user may also pass pci=nomsi on the kernel
+command-line to explicitly disable MSI globally. But, once the appro-
+priate quirks are added to the kernel, this option should not be
+required anymore.
+
+5.4. Finding why MSI cannot be enabled on a device
+
+Assuming that MSI are not enabled on a device, you should look at
+dmesg to find messages that quirks may output when disabling MSI
+on some devices, some bridges or even globally.
+Then, lspci -t gives the list of bridges above a device. Reading
+/sys/bus/pci/devices/0000:00:0e/msi_bus will tell you whether MSI
+are enabled (1) or disabled (0). If 0 is found in a single bridge
+msi_bus file above the device, MSI cannot be enabled.
+
--
1.5.5.4
Add the new API pci_enable_msi_block() to allow drivers to
request multiple MSI and reimplement pci_enable_msi in terms of
pci_enable_msi_block. Ensure that the architecture back ends don't
have to know about multiple MSI.
Signed-off-by: Matthew Wilcox <[email protected]>
---
arch/powerpc/kernel/msi.c | 4 ++
drivers/pci/msi.c | 79 +++++++++++++++++++++++++++++++-------------
drivers/pci/msi.h | 4 --
include/linux/msi.h | 1 +
include/linux/pci.h | 6 ++-
5 files changed, 64 insertions(+), 30 deletions(-)
diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c
index c62d101..5bc8dda 100644
--- a/arch/powerpc/kernel/msi.c
+++ b/arch/powerpc/kernel/msi.c
@@ -19,6 +19,10 @@ int arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
return -ENOSYS;
}
+ /* PowerPC doesn't support multiple MSI yet */
+ if (type == PCI_CAP_ID_MSI && nvec > 1)
+ return 1;
+
if (ppc_md.msi_check_device) {
pr_debug("msi: Using platform check routine.\n");
return ppc_md.msi_check_device(dev, nvec, type);
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 104f297..bc2e888 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -45,6 +45,13 @@ arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
struct msi_desc *entry;
int ret;
+ /*
+ * If an architecture wants to support multiple MSI, it needs to
+ * override arch_setup_msi_irqs()
+ */
+ if (type == PCI_CAP_ID_MSI && nvec > 1)
+ return 1;
+
list_for_each_entry(entry, &dev->msi_list, list) {
ret = arch_setup_msi_irq(dev, entry);
if (ret)
@@ -65,8 +72,12 @@ arch_teardown_msi_irqs(struct pci_dev *dev)
struct msi_desc *entry;
list_for_each_entry(entry, &dev->msi_list, list) {
- if (entry->irq != 0)
- arch_teardown_msi_irq(entry->irq);
+ int i, nvec;
+ if (entry->irq == 0)
+ continue;
+ nvec = 1 << entry->msi_attrib.multiple;
+ for (i = 0; i < nvec; i++)
+ arch_teardown_msi_irq(entry->irq + i);
}
}
@@ -186,6 +197,12 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
} else {
struct pci_dev *dev = entry->dev;
int pos = entry->msi_attrib.pos;
+ u16 msgctl;
+
+ pci_read_config_word(dev, msi_control_reg(pos), &msgctl);
+ msgctl &= ~PCI_MSI_FLAGS_QSIZE;
+ msgctl |= entry->msi_attrib.multiple << 4;
+ pci_write_config_word(dev, msi_control_reg(pos), msgctl);
pci_write_config_dword(dev, msi_lower_address_reg(pos),
msg->address_lo);
@@ -301,13 +318,15 @@ EXPORT_SYMBOL_GPL(pci_restore_msi_state);
/**
* msi_capability_init - configure device's MSI capability structure
* @dev: pointer to the pci_dev data structure of MSI device function
+ * @nvec: number of interrupts to allocate
*
- * Setup the MSI capability structure of device function with a single
- * MSI irq, regardless of device function is capable of handling
- * multiple messages. A return of zero indicates the successful setup
- * of an entry zero with the new MSI irq or non-zero for otherwise.
- **/
-static int msi_capability_init(struct pci_dev *dev)
+ * Setup the MSI capability structure of the device with the requested
+ * number of interrupts. A return value of zero indicates the successful
+ * setup of an entry with the new MSI irq. A negative return value indicates
+ * an error, and a positive return value indicates the number of interrupts
+ * which could have been allocated.
+ */
+static int msi_capability_init(struct pci_dev *dev, int nvec)
{
struct msi_desc *entry;
int pos, ret;
@@ -351,7 +370,7 @@ static int msi_capability_init(struct pci_dev *dev)
list_add_tail(&entry->list, &dev->msi_list);
/* Configure MSI capability structure */
- ret = arch_setup_msi_irqs(dev, 1, PCI_CAP_ID_MSI);
+ ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI);
if (ret) {
msi_free_irqs(dev);
return ret;
@@ -503,36 +522,44 @@ static int pci_msi_check_device(struct pci_dev* dev, int nvec, int type)
}
/**
- * pci_enable_msi - configure device's MSI capability structure
- * @dev: pointer to the pci_dev data structure of MSI device function
+ * pci_enable_msi_block - configure device's MSI capability structure
+ * @dev: device to configure
+ * @nvec: number of interrupts to configure
*
- * Setup the MSI capability structure of device function with
- * a single MSI irq upon its software driver call to request for
- * MSI mode enabled on its hardware device function. A return of zero
- * indicates the successful setup of an entry zero with the new MSI
- * irq or non-zero for otherwise.
- **/
-int pci_enable_msi(struct pci_dev* dev)
+ * Allocate IRQs for a device with the MSI capability.
+ * This function returns a negative errno if an error occurs. If it
+ * is unable to allocate the number of interrupts requested, it returns
+ * the number of interrupts it might be able to allocate. If it successfully
+ * allocates at least the number of interrupts requested, it returns 0 and
+ * updates the @dev's irq member to the lowest new interrupt number; the
+ * other interrupt numbers allocated to this device are consecutive.
+ */
+int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec)
{
int status;
- status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI);
+ /* MSI only supports up to 32 interrupts */
+ if (nvec > 32)
+ return 32;
+
+ status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSI);
if (status)
return status;
WARN_ON(!!dev->msi_enabled);
- /* Check whether driver already requested for MSI-X irqs */
+ /* Check whether driver already requested MSI-X irqs */
if (dev->msix_enabled) {
printk(KERN_INFO "PCI: %s: Can't enable MSI. "
"Device already has MSI-X enabled\n",
pci_name(dev));
return -EINVAL;
}
- status = msi_capability_init(dev);
+
+ status = msi_capability_init(dev, nvec);
return status;
}
-EXPORT_SYMBOL(pci_enable_msi);
+EXPORT_SYMBOL(pci_enable_msi_block);
void pci_msi_shutdown(struct pci_dev* dev)
{
@@ -581,8 +608,12 @@ static int msi_free_irqs(struct pci_dev* dev)
struct msi_desc *entry, *tmp;
list_for_each_entry(entry, &dev->msi_list, list) {
- if (entry->irq)
- BUG_ON(irq_has_action(entry->irq));
+ int i, nvec;
+ if (!entry->irq)
+ continue;
+ nvec = 1 << entry->msi_attrib.multiple;
+ for (i = 0; i < nvec; i++)
+ BUG_ON(irq_has_action(entry->irq + i));
}
arch_teardown_msi_irqs(dev);
diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h
index 3898f52..b72e0bd 100644
--- a/drivers/pci/msi.h
+++ b/drivers/pci/msi.h
@@ -22,12 +22,8 @@
#define msi_disable(control) control &= ~PCI_MSI_FLAGS_ENABLE
#define multi_msi_capable(control) \
(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
-#define multi_msi_enable(control, num) \
- control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
#define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT))
#define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT))
-#define msi_enable(control, num) multi_msi_enable(control, num); \
- control |= PCI_MSI_FLAGS_ENABLE
#define msix_table_offset_reg(base) (base + 0x04)
#define msix_pba_offset_reg(base) (base + 0x08)
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 13a8a1b..70fcebc 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -18,6 +18,7 @@ extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
struct msi_desc {
struct {
__u8 is_msix : 1;
+ __u8 multiple: 3; /* log2 number of messages */
__u8 maskbit : 1; /* mask-pending bit supported ? */
__u8 masked : 1;
__u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d18b1dd..16df5f9 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -699,7 +699,7 @@ struct msix_entry {
#ifndef CONFIG_PCI_MSI
-static inline int pci_enable_msi(struct pci_dev *dev)
+static inline int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec)
{
return -1;
}
@@ -726,7 +726,7 @@ static inline void msi_remove_pci_irq_vectors(struct pci_dev *dev)
static inline void pci_restore_msi_state(struct pci_dev *dev)
{ }
#else
-extern int pci_enable_msi(struct pci_dev *dev);
+extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec);
extern void pci_msi_shutdown(struct pci_dev *dev);
extern void pci_disable_msi(struct pci_dev *dev);
extern int pci_enable_msix(struct pci_dev *dev,
@@ -737,6 +737,8 @@ extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
extern void pci_restore_msi_state(struct pci_dev *dev);
#endif
+#define pci_enable_msi(pdev) pci_enable_msi_block(pdev, 1)
+
#ifdef CONFIG_HT_IRQ
/* The functions a driver should call */
int ht_create_irq(struct pci_dev *dev, int idx);
--
1.5.5.4
Hi,
I'm very sorry for very delayed comment, but I have a concern
about irq affinity code. Since I didn't have enough time to
look at your patch, I might be misunderstanding something...
In my understanding, when irq affinity is changed on one of
the IRQs corresponding to MSI, it will not be completed until
interrupts occur on all the IRQs. Attempts to change irq
affinity before previous affinity change is finished will be
ignored. Here, suppose that device uses three MSI irqs. In
this case, your patch manages four irqs, but only three
interrupts are used. If irq affinity changed on this MSI
interrupts, will it be completed?
Thanks,
Kenji Kaneshige
Matthew Wilcox wrote:
> Implement the arch_setup_msi_irqs() interface. Extend create_irq()
> into create_irq_block() and reimplement create_irq as a wrapper around
> it. Create assign_irq_vector_block() based closely on
> assign_irq_vector(). Teach set_msi_irq_affinity() how to handle
> multiple MSIs.
>
> Signed-off-by: Matthew Wilcox <[email protected]>
> ---
> arch/x86/kernel/io_apic_64.c | 237 ++++++++++++++++++++++++++++++++++++------
> 1 files changed, 205 insertions(+), 32 deletions(-)
>
> diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
> index ef1a8df..6a00dca 100644
> --- a/arch/x86/kernel/io_apic_64.c
> +++ b/arch/x86/kernel/io_apic_64.c
> @@ -61,7 +61,7 @@ struct irq_cfg {
> };
>
> /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
> -struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
> +static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
> [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
> [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
> [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
> @@ -683,6 +683,8 @@ static int pin_2_irq(int idx, int apic, int pin)
> return irq;
> }
>
> +static int current_vector = FIRST_DEVICE_VECTOR;
> +
> static int __assign_irq_vector(int irq, cpumask_t mask)
> {
> /*
> @@ -696,7 +698,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
> * Also, we've got to be careful not to trash gate
> * 0x80, because int 0x80 is hm, kind of importantish. ;)
> */
> - static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
> + static int current_offset = 0;
> unsigned int old_vector;
> int cpu;
> struct irq_cfg *cfg;
> @@ -769,6 +771,97 @@ static int assign_irq_vector(int irq, cpumask_t mask)
> return err;
> }
>
> +static int __assign_irq_vector_block(int irq, int count, cpumask_t mask)
> +{
> + unsigned int old_vector;
> + int i, cpu;
> + struct irq_cfg *cfg;
> +
> + /*
> + * We've got to be careful not to trash gate 0x80,
> + * because int 0x80 is hm, kind of importantish. ;)
> + */
> + BUG_ON((unsigned)irq + count > NR_IRQS);
> +
> + /* Only try and allocate irqs on cpus that are present */
> + cpus_and(mask, mask, cpu_online_map);
> +
> + for (i = 0; i < count; i++) {
> + cfg = &irq_cfg[irq + i];
> + if ((cfg->move_in_progress) || cfg->move_cleanup_count)
> + return -EBUSY;
> + }
> +
> + cfg = &irq_cfg[irq];
> + old_vector = cfg->vector;
> + if (old_vector) {
> + cpumask_t tmp;
> + cpus_and(tmp, cfg->domain, mask);
> + if (!cpus_empty(tmp))
> + return 0;
> + }
> +
> + for_each_cpu_mask(cpu, mask) {
> + cpumask_t domain, new_mask;
> + int new_cpu;
> + int vector;
> +
> + domain = vector_allocation_domain(cpu);
> + cpus_and(new_mask, domain, cpu_online_map);
> +
> + vector = current_vector & ~(count - 1);
> + next:
> + vector += count;
> + if (vector + count >= FIRST_SYSTEM_VECTOR) {
> + vector = FIRST_DEVICE_VECTOR & ~(count - 1);
> + if (vector < FIRST_DEVICE_VECTOR)
> + vector += count;
> + }
> + if (unlikely(vector == (current_vector & ~(count - 1))))
> + continue;
> + if ((IA32_SYSCALL_VECTOR >= vector) &&
> + (IA32_SYSCALL_VECTOR < vector + count))
> + goto next;
> + for_each_cpu_mask(new_cpu, new_mask) {
> + for (i = 0; i < count; i++) {
> + if (per_cpu(vector_irq, new_cpu)[vector + i]
> + != -1)
> + goto next;
> + }
> + }
> + /* Found one! */
> + current_vector = vector + count - 1;
> + for (i = 0; i < count; i++) {
> + cfg = &irq_cfg[irq + i];
> + if (old_vector) {
> + cfg->move_in_progress = 1;
> + cfg->old_domain = cfg->domain;
> + }
> + for_each_cpu_mask(new_cpu, new_mask) {
> + per_cpu(vector_irq, new_cpu)[vector + i] =
> + irq + i;
> + }
> + cfg->vector = vector;
> + cfg->domain = domain;
> + }
> + return 0;
> + }
> + return -ENOSPC;
> +}
> +
> +/* Assumes that count is a power of two and aligns to that power of two */
> +static int assign_irq_vector_block(int irq, int count, cpumask_t mask)
> +{
> + int result;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&vector_lock, flags);
> + result = __assign_irq_vector_block(irq, count, mask);
> + spin_unlock_irqrestore(&vector_lock, flags);
> +
> + return result;
> +}
> +
> static void __clear_irq_vector(int irq)
> {
> struct irq_cfg *cfg;
> @@ -788,6 +881,14 @@ static void __clear_irq_vector(int irq)
> cpus_clear(cfg->domain);
> }
>
> +static void __clear_irq_vector_block(int irq, int count)
> +{
> + while (count > 0) {
> + count--;
> + __clear_irq_vector(irq + count);
> + }
> +}
> +
> void __setup_vector_irq(int cpu)
> {
> /* Initialize vector_irq on a new cpu */
> @@ -1895,30 +1996,56 @@ device_initcall(ioapic_init_sysfs);
> /*
> * Dynamic irq allocate and deallocation
> */
> -int create_irq(void)
> +
> +/*
> + * On success, returns the interrupt number of the lowest numbered irq
> + * in the block. If it can't find a block of the right size, it returns
> + * -1 - (length of the longest run).
> + */
> +static int create_irq_block(int count)
> {
> - /* Allocate an unused irq */
> - int irq;
> - int new;
> + /* Allocate 'count' consecutive unused irqs */
> + int i, new, longest;
> unsigned long flags;
>
> - irq = -ENOSPC;
> + longest = 0;
> spin_lock_irqsave(&vector_lock, flags);
> for (new = (NR_IRQS - 1); new >= 0; new--) {
> if (platform_legacy_irq(new))
> - continue;
> + goto clear;
> if (irq_cfg[new].vector != 0)
> + goto clear;
> + longest++;
> + if (longest < count)
> continue;
> - if (__assign_irq_vector(new, TARGET_CPUS) == 0)
> - irq = new;
> +
> + while (__assign_irq_vector_block(new, longest, TARGET_CPUS))
> + longest /= 2;
> + if (longest < count)
> + __clear_irq_vector_block(new, longest);
> break;
> + clear:
> + __clear_irq_vector_block(new + 1, longest);
> + longest = 0;
> }
> spin_unlock_irqrestore(&vector_lock, flags);
>
> - if (irq >= 0) {
> - dynamic_irq_init(irq);
> + if (longest < count)
> + return -1 - longest;
> +
> + for (i = 0; i < count; i++) {
> + dynamic_irq_init(new + i);
> }
> - return irq;
> +
> + return new;
> +}
> +
> +int create_irq(void)
> +{
> + int ret = create_irq_block(1);
> + if (ret < 0)
> + return -ENOSPC;
> + return ret;
> }
>
> void destroy_irq(unsigned int irq)
> @@ -1936,7 +2063,8 @@ void destroy_irq(unsigned int irq)
> * MSI message composition
> */
> #ifdef CONFIG_PCI_MSI
> -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
> +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
> + unsigned int count, struct msi_msg *msg)
> {
> struct irq_cfg *cfg = irq_cfg + irq;
> int err;
> @@ -1944,7 +2072,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
> cpumask_t tmp;
>
> tmp = TARGET_CPUS;
> - err = assign_irq_vector(irq, tmp);
> + if (count == 1)
> + err = assign_irq_vector(irq, tmp);
> + else
> + err = assign_irq_vector_block(irq, count, tmp);
> if (!err) {
> cpus_and(tmp, cfg->domain, tmp);
> dest = cpu_mask_to_apicid(tmp);
> @@ -1975,6 +2106,8 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
> static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
> {
> struct irq_cfg *cfg = irq_cfg + irq;
> + struct msi_desc *desc = get_irq_msi(irq);
> + int i, count = 1 << desc->msi_attrib.multiple;
> struct msi_msg msg;
> unsigned int dest;
> cpumask_t tmp;
> @@ -1983,8 +2116,15 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
> if (cpus_empty(tmp))
> return;
>
> - if (assign_irq_vector(irq, mask))
> - return;
> + if (count > 1) {
> + /* Multiple MSIs all go to the same destination */
> + irq = desc->irq;
> + if (assign_irq_vector_block(irq, count, mask))
> + return;
> + } else {
> + if (assign_irq_vector(irq, mask))
> + return;
> + }
>
> cpus_and(tmp, cfg->domain, mask);
> dest = cpu_mask_to_apicid(tmp);
> @@ -1997,7 +2137,9 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
> msg.address_lo |= MSI_ADDR_DEST_ID(dest);
>
> write_msi_msg(irq, &msg);
> - irq_desc[irq].affinity = mask;
> +
> + for (i = 0; i < count; i++)
> + irq_desc[irq + i].affinity = mask;
> }
> #endif /* CONFIG_SMP */
>
> @@ -2016,28 +2158,59 @@ static struct irq_chip msi_chip = {
> .retrigger = ioapic_retrigger_irq,
> };
>
> -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
> +static int x86_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc, int count)
> {
> struct msi_msg msg;
> - int irq, ret;
> - irq = create_irq();
> - if (irq < 0)
> - return irq;
> -
> - ret = msi_compose_msg(dev, irq, &msg);
> - if (ret < 0) {
> - destroy_irq(irq);
> - return ret;
> + int i, ret, base_irq, alloc;
> +
> + /* MSI can only allocate a power-of-two */
> + alloc = roundup_pow_of_two(count);
> +
> + base_irq = create_irq_block(alloc);
> + if (base_irq < 0) {
> + if (alloc == 1)
> + return -ENOSPC;
> + return rounddown_pow_of_two(-base_irq - 1);
> }
>
> - set_irq_msi(irq, desc);
> - write_msi_msg(irq, &msg);
> + ret = msi_compose_msg(pdev, base_irq, alloc, &msg);
> + if (ret)
> + return ret;
> +
> + desc->msi_attrib.multiple = order_base_2(alloc);
>
> - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
> + /* Do loop in reverse so set_irq_msi ends up setting
> + * desc->irq to base_irq
> + */
> + for (i = count - 1; i >= 0; i--) {
> + set_irq_msi(base_irq + i, desc);
> + set_irq_chip_and_handler_name(base_irq + i, &msi_chip,
> + handle_edge_irq, "edge");
> + }
> + write_msi_msg(base_irq, &msg);
>
> return 0;
> }
>
> +int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
> +{
> + struct msi_desc *desc;
> + int ret;
> +
> + if (type == PCI_CAP_ID_MSI) {
> + desc = list_first_entry(&pdev->msi_list, struct msi_desc, list);
> + ret = x86_setup_msi_irq(pdev, desc, nvec);
> + } else {
> + list_for_each_entry(desc, &pdev->msi_list, list) {
> + ret = x86_setup_msi_irq(pdev, desc, 1);
> + if (ret)
> + break;
> + }
> + }
> +
> + return ret;
> +}
> +
> void arch_teardown_msi_irq(unsigned int irq)
> {
> destroy_irq(irq);
> @@ -2090,7 +2263,7 @@ int arch_setup_dmar_msi(unsigned int irq)
> int ret;
> struct msi_msg msg;
>
> - ret = msi_compose_msg(NULL, irq, &msg);
> + ret = msi_compose_msg(NULL, irq, 1, &msg);
> if (ret < 0)
> return ret;
> dmar_msi_write(irq, &msg);
Hi,
First of all, it seems that mask/unmask of MSI has problems.
- Per-vector masking is optional for MSI, so I think that allocating
multiple messages for a function without masking capability would not be
a good idea, since all vectors in the block will be masked/unmasked
at once without any agreement.
- Even if the function supports per-vector masking, current
mask/unmask_msi_irq() functions assume that MSI uses only one vector,
therefore they only set/unset the first bit of the maskbits, which is
for the first vector of the block. The bits for other vectors are
initialized as 'masked' but no one unmask them.
Matthew Wilcox wrote:
> + * Allocate IRQs for a device with the MSI capability.
> + * This function returns a negative errno if an error occurs. If it
> + * is unable to allocate the number of interrupts requested, it returns
> + * the number of interrupts it might be able to allocate. If it successfully
> + * allocates at least the number of interrupts requested, it returns 0 and
> + * updates the @dev's irq member to the lowest new interrupt number; the
> + * other interrupt numbers allocated to this device are consecutive.
> + */
> +int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec)
> {
> int status;
>
> - status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI);
> + /* MSI only supports up to 32 interrupts */
> + if (nvec > 32)
> + return 32;
I think we should return -EINVAL here.
No one guarantees that 32 interrupts can be allocated at this time.
And also I think -EINVAL should be returned if nvec is greater than
the number encoded in the function's "Multiple Message Capable", but
I could not find any mention of handling such an over-capability request
in PCI Bus Spec. 3.0.
> +
> + status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSI);
> if (status)
> return status;
>
> WARN_ON(!!dev->msi_enabled);
>
> - /* Check whether driver already requested for MSI-X irqs */
> + /* Check whether driver already requested MSI-X irqs */
> if (dev->msix_enabled) {
> printk(KERN_INFO "PCI: %s: Can't enable MSI. "
> "Device already has MSI-X enabled\n",
> pci_name(dev));
> return -EINVAL;
> }
> - status = msi_capability_init(dev);
> +
> + status = msi_capability_init(dev, nvec);
> return status;
> }
> -EXPORT_SYMBOL(pci_enable_msi);
> +EXPORT_SYMBOL(pci_enable_msi_block);
Thanks,
H.Seto
On Fri, Jul 11, 2008 at 01:50:23PM +0900, Kenji Kaneshige wrote:
> I'm very sorry for very delayed comment, but I have a concern
> about irq affinity code. Since I didn't have enough time to
> look at your patch, I might be misunderstanding something...
>
> In my understanding, when irq affinity is changed on one of
> the IRQs corresponding to MSI, it will not completed until
> interrupts occur on all the IRQs. Attempts to change irq
> affinity before previous affinity change is finished will be
> ignored. Here, suppose that device uses three MSI irqs. In
> this case, your patch manages four irqs, but only three
> interrupts are used. If irq affinity changed on this MSI
> interrupts, will it be completed?
I have tested the affinity code with an ICH9 AHCI:
495: 117233 117966 118033 117797 PCI-MSI-edge ahci
496: 29860 29106 30191 28705 PCI-MSI-edge ahci
497: 0 0 0 0 PCI-MSI-edge ahci
498: 0 0 0 0 PCI-MSI-edge ahci
499: 0 0 0 0 PCI-MSI-edge ahci
500: 0 0 0 0 PCI-MSI-edge ahci
This chip requires 16 MSIs to be registered, and it has 6 ports.
Only ports 0 and 1 have a device attached. If I change the mask of
an active irq (eg 495 or 496), it takes effect on both of them. If I
change the mask of an inactive irq (497-500), nothing happens. But I
can subsequently change the mask on 495 or 496 successfully.
I can't tell you why this works this way; I haven't looked in enough
detail at the irq affinity code, but this is my observation.
Thanks for your comment.
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
On Fri, Jul 11, 2008 at 05:28:28PM +0900, Hidetoshi Seto wrote:
> Hi,
>
> First of all, it seems that mask/unmask of MSI has problems.
> - Per-vector masking is optional for MSI, so I think that allocating
> multiple messages for a function without masking capability would be
> not good idea, since all vector in the block will be masked/unmasked
> at once without any agreement.
> - Even if the function supports per-vector masking, current
> mask/unmask_msi_irq() functions assume that MSI uses only one vector,
> therefore they only set/unset the first bit of the maskbits which
> for the first vector of the block. The bits for other vectors are
> initialized as 'masked' but no one unmask them.
Thank you for pointing out the problems with masking. The device I am
testing with does not support per-vector masking, so I have not paid
attention to this.
To your first point, if the function does not support per-vector
masking, I think it's OK to mask/unmask all vectors at once. But we
must be careful to manage this correctly in software; if we disable IRQ
496, disable IRQ 497, then enable IRQ 497, we must not enable IRQ 496 at
that time. I think we can solve this problem, but I must think about it
some more.
The second point is a simple bug that should be easy to fix. Thank you
for pointing it out.
> Matthew Wilcox wrote:
> > + * Allocate IRQs for a device with the MSI capability.
> > + * This function returns a negative errno if an error occurs. If it
> > + * is unable to allocate the number of interrupts requested, it returns
> > + * the number of interrupts it might be able to allocate. If it successfully
> > + * allocates at least the number of interrupts requested, it returns 0 and
> > + * updates the @dev's irq member to the lowest new interrupt number; the
> > + * other interrupt numbers allocated to this device are consecutive.
> > + */
> > +int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec)
> > {
> > int status;
> >
> > - status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI);
> > + /* MSI only supports up to 32 interrupts */
> > + if (nvec > 32)
> > + return 32;
>
> I think we should return -EINVAL here.
> No one guarantee that 32 interrupts is able to be allocate at this time.
>
> And also I think -EINVAL should be returned if nvec is greater than
> the number of encoded in the function's "Multiple Message Capable", but
> I could not find any mention about handling of such over-capability request
> in PCI Bus Spec. 3.0.
It would be outside the scope of the PCI Bus Specification. I think
you're right that we should check the MMC bits; but I think we should
tell the driver to request a lower number, not return -EINVAL.
Thanks for your comments.
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
Matthew Wilcox <[email protected]> writes:
> I'd like to thank Michael Ellerman for his feedback. This is a much
> better patchset than it used to be.
There is a reason we don't have an API to support this. Linux can not
reasonably support this, especially not on current X86. The designers
of the AHCI were idiots and should have used MSI-X.
Attempting to support multiple irqs in an MSI capability breaks
every interesting use of an irq.
mask/unmask will likely break because the mask bit is optional
and when it is not present we disable the msi capability.
We can not set the affinity individually so we can not allow
different queues to be processed on different cores.
So in general it seems something that we have to jump through a million
hurdles and the result is someones twisted parody of a multiple working
irqs, that even Intel's IOMMU can't cure.
So unless the performance of the AHCI is better by a huge amount I don't
see the point, and even then I am extremely sceptical.
Eric
On Fri, Jul 11, 2008 at 03:06:33AM -0700, Eric W. Biederman wrote:
> Matthew Wilcox <[email protected]> writes:
>
> > I'd like to thank Michael Ellerman for his feedback. This is a much
> > better patchset than it used to be.
>
> There is a reason we don't have an API to support this. Linux can not
> reasonably support this, especially not on current X86. The designers
> of the of the AHCI were idiots and should have used MSI-X.
Thank you for your constructive feedback, Eric. Unfortunately, we have
to deal with the world as it is, not how we would like it to be. Since
I have it running, I'd like to know what is unreasonable about the
implementation.
> mask/unmask is will likely break because the mask bit is optional
> and when it is not present we disable the msi capability.
I believe this is fixable. I will attempt to do so.
> We can not set the affinity individually so we can not allow
> different queues to be processed on different cores.
This is true, and yet, it is still useful. Just not as useful as one
would want.
> So in general it seems something that we have to jump through a million
> hurdles and the result is someones twisted parody of a multiple working
> irqs, that even Intel's IOMMU can't cure.
More constructive feedback ...
> So unless the performance of the AHCI is better by a huge amount I don't
> see the point, and even then I am extremely sceptical.
I don't have performance numbers yet, but surely you can see that
avoiding a register read in the interrupt path is a large win?
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
From: Matthew Wilcox <[email protected]>
Date: Fri, 11 Jul 2008 04:23:26 -0600
> On Fri, Jul 11, 2008 at 03:06:33AM -0700, Eric W. Biederman wrote:
> > So unless the performance of the AHCI is better by a huge amount I don't
> > see the point, and even then I am extremely sceptical.
>
> I don't have performance numbers yet, but surely you can see that
> avoiding a register read in the interrupt path is a large win?
Such overhead is going to be amortized.
AHCI is not like networking where we have lots of very small
transactions to deal with, and therefore the per-IRQ overhead can
begin to dominate.
Therefore, like Eric, I think working on multiple MSI is a very dubious
usage of one's time. But, it's your time, so use it how you wish :)
On Fri, Jul 11, 2008 at 03:32:42AM -0700, David Miller wrote:
> From: Matthew Wilcox <[email protected]>
> Date: Fri, 11 Jul 2008 04:23:26 -0600
>
> > On Fri, Jul 11, 2008 at 03:06:33AM -0700, Eric W. Biederman wrote:
> > > So unless the performance of the AHCI is better by a huge amount I don't
> > > see the point, and even then I am extremely sceptical.
> >
> > I don't have performance numbers yet, but surely you can see that
> > avoiding a register read in the interrupt path is a large win?
>
> Such overhead is going to be amortized.
>
> AHCI is not like networking where we have lots of very small
> transactions to deal with, and therefore the per-IRQ overhead can
> begin to dominate.
>
> Therefore, like Eric, I think workng on multiple MSI is a very dubious
> usage of one's time. But, it's your time, so use it how you wish :)
I didn't start hacking without performance numbers ;-) With an
I/O-heavy workload, the two biggest consumers of CPU time in the profile
were ahci_interrupt and ahci_qc_issue. I got 10% more bandwidth by
removing the flushing readl() from ahci_qc_issue. I'm hoping for a
similar improvement by removing this readl() too.
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
Matthew Wilcox <[email protected]> writes:
> On Fri, Jul 11, 2008 at 03:06:33AM -0700, Eric W. Biederman wrote:
>> Matthew Wilcox <[email protected]> writes:
>>
>> > I'd like to thank Michael Ellerman for his feedback. This is a much
>> > better patchset than it used to be.
>>
>> There is a reason we don't have an API to support this. Linux can not
>> reasonably support this, especially not on current X86. The designers
>> of the of the AHCI were idiots and should have used MSI-X.
>
> Thank you for your constructive feedback, Eric. Unfortunately, we have
> to deal with the world as it is, not how we would like it to be. Since
> I have it running, I'd like to know what is unreasonable about the
> implementation.
At the very least it is setting all kinds of expectations that it
doesn't meet.
In addition the MSI-X spec predates the AHCI device by a long shot.
In general my experience has been that the hardware designers who
really care and have done their homework and can actually take
advantage of multiple irqs have implemented MSI-X.
>> mask/unmask is will likely break because the mask bit is optional
>> and when it is not present we disable the msi capability.
>
> I believe this is fixable. I will attempt to do so.
Assuming AHCI implements the mask bits. In the general case this
is not fixable. I know of several devices that do not implement
the optional mask bits.
>> We can not set the affinity individually so we can not allow
>> different queues to be processed on different cores.
>
> This is true, and yet, it is still useful. Just not as useful as one
> would want.
Also a case of mismatched expectations. The linux irq API allows
irqs to be bound to different cpus individually. Multi msi does
not meet that contract.
>> So unless the performance of the AHCI is better by a huge amount I don't
>> see the point, and even then I am extremely sceptical.
>
> I don't have performance numbers yet, but surely you can see that
> avoiding a register read in the interrupt path is a large win?
No. It is not obvious to me.
Eric
There is one idea that seems to model this cleanly
without breaking all kinds of expectations.
That is an irq with a very small data payload.
In that case we wire all of the vectors up to a single
irq handler that computes the payload as:
payload = vector - base-vector.
And then we figure out how to pass that to the handler in irqaction.
To most of the system it is a single irq so it avoids breaking
expectations about what an irq is.
To everything else it is a little odd, and has it's own unique
set of rules (which is good as well).
Eric
On Fri, Jul 11, 2008 at 04:34:19AM -0700, Eric W. Biederman wrote:
> There is one idea that seems to model this cleanly
> without breaking all kinds of expectations.
>
> That is an irq with a very small data payload.
>
> In that case we wire all of the vectors up to a single
> irq handler that computes the payload as:
> payload = vector - base-vector.
>
> And then we figure out how to pass that to the handler in irqaction.
>
> To most of the system it is a single irq so it avoids breaking
> expectations about what an irq is.
>
> To everything else it is a little odd, and has it's own unique
> set of rules (which is good as well).
OK, I'm willing to play this scenario through and see where it takes us.
- The affinity now matches reality. Good.
- For devices without individual masking, the masking API matches
reality. Good.
- For devices with individual masking, we'll want a new API. Adequate.
- We'll still need to allocate an aligned block of vectors on x86-64.
No change.
I think rather than passing the 'vector - base_vector' integer, the
request_irq_block() should pass in an array of pointers as large as nvec
and irqaction passes the appropriate pointer to the handler.
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
On Fri, Jul 11, 2008 at 06:17:44AM -0600, Matthew Wilcox wrote:
> On Fri, Jul 11, 2008 at 04:34:19AM -0700, Eric W. Biederman wrote:
> > There is one idea that seems to model this cleanly
> > without breaking all kinds of expectations.
> >
> > That is an irq with a very small data payload.
> >
> > In that case we wire all of the vectors up to a single
> > irq handler that computes the payload as:
> > payload = vector - base-vector.
> >
> > And then we figure out how to pass that to the handler in irqaction.
> >
> > To most of the system it is a single irq so it avoids breaking
> > expectations about what an irq is.
> >
> > To everything else it is a little odd, and has it's own unique
> > set of rules (which is good as well).
Having implemented this, I wish to retract my earlier statement:
> I think rather than passing the 'vector - base_vector' integer, the
> request_irq_block() should pass in an array of pointers as large as nvec
> and irqaction passes the appropriate pointer to the handler.
That idea requires a lot of mucking with generic_IRQ_handle, which I'm
not willing to do. Anyone have any problems with the below patch?
I've not implemented the corresponding bit in io_apic_64.c yet, so
completely untested.
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 3aac154..a5d79e9 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -173,7 +173,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
stack_overflow_check(regs);
#endif
- if (likely(irq < NR_IRQS))
+ if (likely((irq & ARCH_IRQ_DATA_MASK) < NR_IRQS))
generic_handle_irq(irq);
else {
if (!disable_apic)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 552e0ec..19c679c 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -282,6 +282,19 @@ extern unsigned int __do_IRQ(unsigned int irq);
#endif
/*
+ * For multiple MSI, we allow a small amount of data to be passed to
+ * the interrupt handler to determine which vector was called. MSI
+ * only allows 32 interrupts, so we default to requiring 5 bits of
+ * information to be passed. If your arch has NR_IRQS set higher than
+ * 2^27, you have some extremely large arrays and probably want to consider
+ * defining ARCH_IRQ_DATA_SHIFT and ARCH_IRQ_DATA_MASK.
+ */
+#ifndef ARCH_IRQ_DATA_SHIFT
+#define ARCH_IRQ_DATA_SHIFT 27
+#define ARCH_IRQ_DATA_MASK ((1 << ARCH_IRQ_DATA_SHIFT) - 1)
+#endif
+
+/*
* Architectures call this to let the generic IRQ layer
* handle an interrupt. If the descriptor is attached to an
* irqchip-style controller then we call the ->handle_irq() handler,
@@ -289,7 +302,7 @@ extern unsigned int __do_IRQ(unsigned int irq);
*/
static inline void generic_handle_irq(unsigned int irq)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_desc + (irq & ARCH_IRQ_DATA_MASK);
#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
desc->handle_irq(irq, desc);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 5fa6198..86ff1ab 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -126,7 +126,7 @@ irqreturn_t no_action(int cpl, void *dev_id)
*
* Handles the action chain of an irq event
*/
-irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
+irqreturn_t handle_IRQ_event(unsigned int irq_plus_data, struct irqaction *action)
{
irqreturn_t ret, retval = IRQ_NONE;
unsigned int status = 0;
@@ -137,7 +137,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
local_irq_enable_in_hardirq();
do {
- ret = action->handler(irq, action->dev_id);
+ ret = action->handler(irq_plus_data, action->dev_id);
if (ret == IRQ_HANDLED)
status |= action->flags;
retval |= ret;
@@ -163,15 +163,16 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
* This is the original x86 implementation which is used for every
* interrupt type.
*/
-unsigned int __do_IRQ(unsigned int irq)
+unsigned int __do_IRQ(unsigned int irq_plus_data)
{
+ unsigned int irq = irq_plus_data & ARCH_IRQ_DATA_MASK;
struct irq_desc *desc = irq_desc + irq;
struct irqaction *action;
unsigned int status;
kstat_this_cpu.irqs[irq]++;
if (CHECK_IRQ_PER_CPU(desc->status)) {
- irqreturn_t action_ret;
+ irqreturn_t ret;
/*
* No locking required for CPU-local interrupts:
@@ -179,9 +180,9 @@ unsigned int __do_IRQ(unsigned int irq)
if (desc->chip->ack)
desc->chip->ack(irq);
if (likely(!(desc->status & IRQ_DISABLED))) {
- action_ret = handle_IRQ_event(irq, desc->action);
+ ret = handle_IRQ_event(irq_plus_data, desc->action);
if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
+ note_interrupt(irq, desc, ret);
}
desc->chip->end(irq);
return 1;
@@ -233,7 +234,7 @@ unsigned int __do_IRQ(unsigned int irq)
spin_unlock(&desc->lock);
- action_ret = handle_IRQ_event(irq, action);
+ action_ret = handle_IRQ_event(irq_plus_data, action);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
On Fri, Jul 11, 2008 at 03:06:33AM -0700, Eric W. Biederman wrote:
> Matthew Wilcox <[email protected]> writes:
>
> > I'd like to thank Michael Ellerman for his feedback. This is a much
> > better patchset than it used to be.
>
> There is a reason we don't have an API to support this. Linux can not
> reasonably support this, especially not on current X86. The designers
> of the AHCI were idiots and should have used MSI-X.
>
> Attempting to support multiple irqs in an MSI capability breaks
> every interesting use of an irq.
>
> mask/unmask will likely break because the mask bit is optional
> and when it is not present we disable the msi capability.
>
> We can not set the affinity individually so we can not allow
> different queues to be processed on different cores.
>
> So in general it seems something that we have to jump through a million
> hurdles and the result is someones twisted parody of a multiple working
> irqs, that even Intel's IOMMU can't cure.
With interrupt-remapping, we can program the individual interrupt
remapping table entries to point to different cpu's etc. All we have
to take care is, do the IRTE allocation in a consecutive block and
program the starting index to the MSI registers.
Just curious Eric, why do you think that won't work?
thanks,
suresh
> So unless the performance of the AHCI is better by a huge amount I don't
> see the point, and even then I am extremely sceptical.
>
> Eric
Suresh Siddha <[email protected]> writes:
> With interrupt-remapping, we can program the individual interrupt
> remapping table entries to point to different cpu's etc. All we have
> to take care is, do the IRTE allocation in a consecutive block and
> program the starting index to the MSI registers.
>
> Just curious Eric, why do you think that won't work?
Working mask/unmask. With MSI-X as specced if I mask an irq and then unmask
it, an msi message will fire if something happened while the irq was masked
and not taken care of before the irq was unmasked. That is the correct
behavior for an irq and a mmu won't let me get that.
The best I can do with an iommu is to run delayed disable and set the
interrupt remapping slot in the iommu to reject the traffic. Which
is almost but not quite what I want.
Overall introducing a new concept into the linux irq model seems a lot
cleaner and more portable, and even there we are likely to be a lot
more fragile because of the difficulty in obtaining contiguous
vectors.
Speaking of. How many interrupt targets does the dmar iommu have
for interrupts? 16K?
Eric
On Fri, Jul 11, 2008 at 03:59:59PM -0700, Eric W. Biederman wrote:
> Suresh Siddha <[email protected]> writes:
>
> > With interrupt-remapping, we can program the individual interrupt
> > remapping table entries to point to different cpu's etc. All we have
> > to take care is, do the IRTE allocation in a consecutive block and
> > program the starting index to the MSI registers.
> >
> > Just curious Eric, why do you think that won't work?
>
> Working mask/unmask. With MSI-X as specced if I mask an irq and then unmask
> it, an msi message will fire if something happened while the irq was masked
> and not taken care of before the irq was unmasked. That is the correct
> behavior for an irq and a mmu won't let me get that.
And why do we need to mask/unmask the device in the interrupt-remapping case?
>
> The best I can do with an iommu is to run delayed disable and set the
> interrupt remapping slot in the iommu to reject the traffic. Which
> is almost but not quite what I want.
>
> Overall introducing a new concept into the linux irq model seems a lot
> cleaner and more portable, and even there we are likely to be a lot
> more fragile because of the difficulty in obtaining contiguous
> vectors.
>
> Speaking of. How many interrupt targets does the dmar iommu have
> for interrupts? 16K?
There can be multiple interrupt-remapping units in the platform and
each table in the remapping unit has max 64K entries.
thanks,
suresh
Suresh Siddha <[email protected]> writes:
> And why do we need to mask/unmask the device in the interrupt-remapping case?
Why do we ever? It is part of the linux irq API and people wind up using
it in all kinds of strange ways.
One of the more surprising uses I have seen is for the real time kernel
people mask the irq, wake up a kernel thread (to handle the irq), then
ack the irq and get out of the interrupt handler. It looked like
people working on UIO were starting to implement something similar.
>> Speaking of. How many interrupt targets does the dmar iommu have
>> for interrupts? 16K?
>
> There can be multiple interrupt-remapping units in the platform and
> each table in the remapping unit has max 64K entries.
Thanks. That should last us for a little while. :)
Eric
On Fri, 2008-07-11 at 17:28 +0900, Hidetoshi Seto wrote:
> Hi,
>
> First of all, it seems that mask/unmask of MSI has problems.
> - Per-vector masking is optional for MSI, so I think that allocating
> multiple messages for a function without masking capability would be
> not good idea, since all vector in the block will be masked/unmasked
> at once without any agreement.
> - Even if the function supports per-vector masking, current
> mask/unmask_msi_irq() functions assume that MSI uses only one vector,
> therefore they only set/unset the first bit of the maskbits which
> for the first vector of the block. The bits for other vectors are
> initialized as 'masked' but no one unmask them.
I tend to think we should just do soft-masking anyway for MSI... better
than whacking config space.
Ben
On Fri, 2008-07-11 at 15:59 -0700, Eric W. Biederman wrote:
> Suresh Siddha <[email protected]> writes:
>
> > With interrupt-remapping, we can program the individual interrupt
> > remapping table entries to point to different cpu's etc. All we have
> > to take care is, do the IRTE allocation in a consecutive block and
> > program the starting index to the MSI registers.
> >
> > Just curious Eric, why do you think that won't work?
>
> Working mask/unmask. With MSI-X as specced if I mask an irq and then unmask
> it, an msi message will fire if something happened while the irq was masked
> and not taken care of before the irq was unmasked. That is the correct
> behavior for an irq and a mmu won't let me get that.
And ? It's just a message, we can ignore it if masked, ie, do
software-masking. Not a big deal... no ?
Ben.
Benjamin Herrenschmidt <[email protected]> writes:
> On Fri, 2008-07-11 at 15:59 -0700, Eric W. Biederman wrote:
>>
>> Working mask/unmask. With MSI-X as specced if I mask an irq and then unmask
>> it, an msi message will fire if something happened while the irq was masked
>> and not taken care of before the irq was unmasked. That is the correct
>> behavior for an irq and a mmu won't let me get that.
>
> And ? It's just a message, we can ignore it if masked, ie, do
> software-masking. Not a big deal... no ?
It is edge triggered so it won't refire when unmasked (especially if we don't know).
So it is easy to wind up in a state where the device is waiting for the software
and the software is waiting for the device because an irq gets dropped.
There are enough places that have problems that we have a fairly standard work around
to the problem (listed above) by just taking the first irq (after we have disabled the
irq) and setting it pending in software and then actually masking it in hardware.
That works, but it still isn't quite correct. Because we can run the
interrupt handler once too often. For interrupts that are never shared and
always in order with the DMA, generally don't require reading a status
register on the card, and are otherwise highly optimized that might actually
be a problem.
Which is why I said that it doesn't look like even using an iommu can
fix all of the issues with treating msi multi message mode messages
as individual irqs. We can get very close but not quite there.
Eric
> It is edge triggered so it won't refire when unmasked (especially if we don't know).
> So it is easy to wind up in a state where the device is waiting for the software
> and the software is waiting for the device because an irq gets dropped.
Well, we are smarter than that. soft-masking is a known well-solved
problem. We just latch that something happened while masked and refire
when unmasked. Not terribly hard. We already do that in various
situations to mask edge interrupts.
> There are enough places that have problems that we have a fairly standard work around
> to the problem (listed above) by just taking the first irq (after we have disabled the
> irq) and setting it pending in software and then actually masking it in hardware.
Masking in HW is totally optional. I don't mask in HW on cell for
example, the HW just can't.
> That works, but it still isn't quite correct. Because we can run the
> interrupt handler once too often.
We only re-fire if it actually occurred while "masked", that should take
care that we never fire once too much, no ?
> For interrupts that are never shared and
> always in order with the DMA, generally don't require reading a status
> register on the card, and are otherwise highly optimized that might actually
> be a problem.
There must be some way of knowing what work is to do (ie, whether a DMA
q entry is completed, some kind of done bit, etc...). There generally is
at least, so that even in that case, spurious MSIs are mostly a non
issue, but I don't think we have them here.
> Which is why I said that it doesn't look like even using an iommu can
> fix all of the issues with treating msi multi message mode messages
> as individual irqs. We can get very close but not quite there.
I still mostly dislike the new approach, I prefer Matthew's original one
with SW masking of the MSIs. For example, if you have the MSIs be 'one'
interrupt, then you hit all of the logic in the IRQ core to make sure
only one happens at once. Might not be what you want, and -will- cause
some to be dropped... not nice.
Ben.
Benjamin Herrenschmidt <[email protected]> writes:
> I still mostly dislike the new approach, I prefer Matthew's original one
> with SW masking of the MSIs. For example, if you have the MSIs be 'one'
> interrupt, then you hit all of the logic in the IRQ core to make sure
> only one happens at once. Might not be what you want, and -will- cause
> some to be dropped... not nice.
You are correct. Using the existing irq handling logic will cause us
to drop irqs and to lose information if two messages are sent close
to each other. Which is very nasty.
With a little care we can avoid that problem by having a 32 bit bitmap
of which sub irqs have fired so we can make all of them pending
without losing information. That does require a new handle_irq method.
One of the primary purposes of masking irqs in hardware is to prevent
them from screaming. Unlikely with edge triggered irqs but not a capability
I would like to give up.
Multi-msi has the problem that cpu affinity can not be changed on a
per message basis without an iommu. Which is a portability problem
and a problem on common architectures.
Therefore to support multi-msi it must be handled as a special case, we can not
treat the individual messages like normal irqs.
Eric
On Sun, 2008-07-13 at 15:30 -0700, Eric W. Biederman wrote:
> You are correct. Using the existing irq handling logic will cause us
> to drop irqs and to lose information if two messages are sent close
> to each other. Which is very nasty.
>
> With a little care we can avoid that problem by having a 32 bit bitmap
> of which sub irqs have fired so we can make all of them pending
> without losing information. That does require a new handle_irq
> method.
And we end up re-doing the logic that the core already provides ... for
individual irqs :-)
> One of the primary purposes of masking irqs in hardware is to prevent
> them from screaming. Unlikely with edge triggered irqs but not a
> capability I would like to give up.
Have you seen screaming MSIs yet ? We -could- if necessary add a hook
to irq_chip for that case if it really ever happens...
> Multi-msi has the problem that cpu affinity can not be changed on a
> per message basis without an iommu. Which is a portability problem
> and a problem on common architectures.
It's a minor problem I believe. It's mostly an API issue in fact, and
I'm happy to live with it rather than the other approach which sounds
just ... gross. Sorry but you are reproducing locally within an IRQ what
the whole IRQ is about to differentiate them in the first place.
It also makes it harder to remove the "irq" argument to handlers which
is something Jeff started looking into and that I quite like.
> Therefore to support multi-msi it must be handled as a special case,
> we can not treat the individual messages like normal irqs.
We can, that's what they are. They just are IRQs with -some-
restrictions on -some- platforms (mostly affinity on x86 and need to
soft-mask, which are fairly minor in my book).
Ben.
Ben. Multi-MSI is a crap hardware design. Why do you think we have
MSI-X? MSI-X as specced is a properly operating irq controller that
we don't need kludges to support. Multi-MSI with a full set of
kludges almost works but does not quite fit the linux irq model.
Any hardware designer who choose to implement Multi-MSI instead of
MSI-X was not really concerned about having a high performance device.
If we can find a way to model the portable capabilities of Multi-MSI
cleanly then we can support it, and our drivers and our users and our
intermediate layers won't get surprised.
So far we have two close fits but neither model really works.
Further this is all about driver optimization, so none of this is
necessary to have working hardware. Which makes kludges much less
appropriate. Modelling Multi-MSI irqs as normal irqs requires a lot
of nasty kludges.
One of the kludges is allocating a contiguous chunk of irq targets,
and the resulting fragmentation issues that you get when you start
allowing different sized allocations.
Overall if Multi-MSI was to become common I think we would really
regret it.
Eric
On Sun, 2008-07-13 at 16:29 -0700, Eric W. Biederman wrote:
> Ben. Multi-MSI is a crap hardware design. Why do you think we have
> MSI-X?
I know and I agree. Which is why I'd rather keep the SW crap totally
local to the MSI support code and not add new concepts to the generic
IRQ API such as sub-channels, for which it's really not ready for imho.
They -are- separate IRQs, just badly implemented. Besides, a large part
of the problem is purely due to the typical x86 implementation of them,
since for example, on most PowerPC's (and possibly other archs), they
tend to land in the PIC as normal sources, and as such benefit from all
the "features" of such interrupts like HW masking, affinity control,
etc... at the PIC level.
In any case, SW masking will work just fine, and affinity can be dealt
with without too much harm either.
> MSI-X as specced is a properly operating irq controller that
> we don't need kludges to support. Multi-MSI with a full set of
> kludges almost work but not quite fits the linux irq model.
And you want to change the model for it ? I say no :-) Just make them
fit with a hammer. In fact, it's not even a big hammer. The only thing
that doesn't really fit is the affinity bits, which can be dealt with.
> Any hardware designer who choose to implement Multi-MSI instead of
> MSI-X was not really concerned about having a high performance device.
Agreed.
> If we can find a way to model the portable capabilities of Multi-MSI
> cleanly then we can support it, and our drivers and our users and our
> intermediate layers won't get surprised.
Our existing model works just fine imho.
> So far we have two close fits but neither model really works.
I think Willy's initial model can work with SW masking. All of the
latching & re-emission stuff is already in the core. The only problem is
affinity which can be hacked around, by either requiring all irqs to
change affinity or bouncing them all on one change, or only exposing
affinity control on the first one, whatever.
> Further this is all about driver optimization, so none of this is
> necessary to have working hardware. Which makes kludges much less
> appropriate. Modelling Multi-MSI irqs as normal irqs requires a lot
> of nasty kludges.
No. Not a lot. And those kludge are mostly local to the MSI support core
and don't expose new semantics to the existing IRQ model.
> One of the kludges is allocating a contiguous chunk of irq targets,
> and the resulting fragmentation issues that you get when you start
> allowing different sized allocations.
This is purely a backend issue and isn't a big deal imho.
> Overall if Multi-MSI was to become common I think we would really
> regret it.
I agree. But in the meantime, if we want to support it, I prefer the
option of making it fit in the existing model.
Cheers,
Ben.
From: Benjamin Herrenschmidt <[email protected]>
Date: Mon, 14 Jul 2008 10:17:39 +1000
> On Sun, 2008-07-13 at 16:29 -0700, Eric W. Biederman wrote:
> > Ben. Multi-MSI is a crap hardware design. Why do you think we have
> > MSI-X?
>
> I know and I agree. Which is why I'd rather keep the SW crap totally
> local to the MSI support code and not add new concepts to the generic
> IRQ API such as sub-channels, for which it's really not ready for imho.
>
> They -are- separate IRQs, just badly implemented. Besides, a large part
> of the problem is purely due to the typical x86 implementation of them,
> since for example, on most PowerPC's (and possibly other archs), they
> tend to land in the PIC as normal sources, and as such benefit from all
> the "features" of such interrupts like HW masking, affinity control,
> etc... at the PIC level.
This is how it works on sparc64 too.
The x86 system designers decided to implement multi-MSI in an
inconvenient way, it is not a "crap hardware design", merely
some (unfortunately common) implementations of it happen to be.
Matthew Wilcox wrote:
> On Fri, Jul 11, 2008 at 01:50:23PM +0900, Kenji Kaneshige wrote:
>> I'm very sorry for very delayed comment, but I have a concern
>> about irq affinity code. Since I didn't have enough time to
>> look at your patch, I might be misunderstanding something...
>>
>> In my understanding, when irq affinity is changed on one of
>> the IRQs corresponding to MSI, it will not completed until
>> interrupts occur on all the IRQs. Attempts to change irq
>> affinity before previous affinity change is finished will be
>> ignored. Here, suppose that device uses three MSI irqs. In
>> this case, your patch manages four irqs, but only three
>> interrupts are used. If irq affinity changed on this MSI
>> interrupts, will it be completed?
>
> I have tested the affinity code with an ICH9 AHCI:
>
> 495: 117233 117966 118033 117797 PCI-MSI-edge ahci
> 496: 29860 29106 30191 28705 PCI-MSI-edge ahci
> 497: 0 0 0 0 PCI-MSI-edge ahci
> 498: 0 0 0 0 PCI-MSI-edge ahci
> 499: 0 0 0 0 PCI-MSI-edge ahci
> 500: 0 0 0 0 PCI-MSI-edge ahci
>
> This chip requires 16 MSIs to be registered, and it has 6 ports.
> Only ports 0 and 1 have a device attached. If I change the mask of
> an active irq (eg 495 or 496), it takes effect on both of them. If I
> change the mask of an inactive irq (497-500), nothing happens. But I
> can subsequently change the mask on 495 or 496 successfully.
>
> I can't tell you why this works this way; I haven't looked in enough
> detail at the irq affinity code, but this is my observation.
>
What I was worrying about was around irq_cfg->move_in_progress.
Since your environment has less than 8 cpus according to your
/proc/interrupts, I think your environment seems to use "apic_flat"
mode for interrupt handling. In this case, irq_cfg->move_in_progress
is not used for irq migration. I think this is why your environment
didn't encounter the problem I worried about. We should note that
vector allocation logic varies depending on the environment.
BTW, I looked at your take 4 patch. The problem I worried about
seems not to exist in this version (as a good side effect).
Thanks,
Kenji Kaneshige
David Miller <[email protected]> writes:
> The x86 system designers decided to implement multi-MSI in an
> inconvenient way, it is not a "crap hardware design", merely
> some (unfortunately common) implementations of it happen to be.
To be clear I was referring to the PCI spec that describes multi-MSI
as a crap hardware design.
At the very least you are left with the problem of allocating multiple
contiguous destinations. Which has the potential to create
fragmentation on all supported platforms.
Optional mask bits are also nasty.
My honest opinion is that they should have deprecated multi-msi after the
introduction of the msi-x specification.
Eric
From: [email protected] (Eric W. Biederman)
Date: Sun, 13 Jul 2008 19:03:52 -0700
> At the very least you are left with the problem of allocating multiple
> contiguous destinations. Which has the potential to create
> fragmentation on all supported platforms.
I don't think this is a very real concern. With 256 MSI slots
available per domain, at least on sparc64, no real case can cause
problems.
And even so, drivers can and should fall back when allocations fail
anyways.
That's what drivers, at least the ones I have written, do even for
MSI-X. For example, the NIU driver scales back the number of
MSI-X vectors it asks for if the original request cannot be
satisfied.
> My honest opinion is that they should have deprecated multi-msi after
> the introduction of the msi-x specification.
I don't think this would have materially influenced what happened with
AHCI at all.