From: Matthew Wilcox <matthew@wil.cx>
To: linux-pci@vger.kernel.org
Cc: kaneshige.kenji@jp.fujitsu.com, mingo@elte.hu, tglx@linutronix.de,
       davem@davemloft.net, dan.j.williams@intel.com,
       Martine.Silbermann@hp.com, benh@kernel.crashing.org,
       michael@ellerman.id.au, linux-kernel@vger.kernel.org,
       Matthew Wilcox <matthew@wil.cx>, Matthew Wilcox <willy@linux.intel.com>
Subject: [PATCH 4/4] x86-64: Support for multiple MSIs
Date: Sat,  5 Jul 2008 09:34:15 -0400
Message-Id: <1215264855-4372-4-git-send-email-matthew@wil.cx>
In-Reply-To: <20080705132728.GO14894@parisc-linux.org>
References: <20080705132728.GO14894@parisc-linux.org>
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 8468
Lines: 317

Implement the arch_setup_msi_block() interface.  Rewrite create_irq()
into create_irq_block() and call create_irq_block() from create_irq().
Implement __assign_irq_vector_block() based closely on __assign_irq_vector().

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
 arch/x86/kernel/io_apic_64.c |  199 ++++++++++++++++++++++++++++++++++++++----
 1 files changed, 183 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8df..44e942a 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -683,6 +683,8 @@ static int pin_2_irq(int idx, int apic, int pin)
 	return irq;
 }
 
+static int current_vector = FIRST_DEVICE_VECTOR;
+
 static int __assign_irq_vector(int irq, cpumask_t mask)
 {
 	/*
@@ -696,7 +698,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+	static int current_offset = 0;
 	unsigned int old_vector;
 	int cpu;
 	struct irq_cfg *cfg;
@@ -769,6 +771,97 @@ static int assign_irq_vector(int irq, cpumask_t mask)
 	return err;
 }
 
+static int __assign_irq_vector_block(int irq, int count, cpumask_t mask)
+{
+	unsigned int old_vector;
+	int i, cpu;
+	struct irq_cfg *cfg;
+
+	/*
+	 * Find 'count' consecutive free vectors, aligned to 'count', for
+	 * irqs irq..irq+count-1.  Never hand out gate 0x80 (int 0x80).
+	 */
+	BUG_ON((unsigned)irq + count > NR_IRQS);
+
+	/* Only try and allocate irqs on cpus that are online */
+	cpus_and(mask, mask, cpu_online_map);
+
+	for (i = 0; i < count; i++) {
+		cfg = &irq_cfg[irq + i];
+		if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+			return -EBUSY;
+	}
+
+	cfg = &irq_cfg[irq];
+	old_vector = cfg->vector;
+	if (old_vector) {
+		cpumask_t tmp;
+		cpus_and(tmp, cfg->domain, mask);
+		if (!cpus_empty(tmp))
+			return 0;
+	}
+
+	for_each_cpu_mask(cpu, mask) {
+		cpumask_t domain, new_mask;
+		int new_cpu;
+		int vector;
+
+		domain = vector_allocation_domain(cpu);
+		cpus_and(new_mask, domain, cpu_online_map);
+
+		vector = current_vector & ~(count - 1);
+ next:
+		vector += count;
+		if (vector + count >= FIRST_SYSTEM_VECTOR) {
+			vector = FIRST_DEVICE_VECTOR & ~(count - 1);
+			if (vector < FIRST_DEVICE_VECTOR)
+				vector += count;
+		}
+		if (unlikely(vector == (current_vector & ~(count - 1))))
+			continue;	/* wrapped around: try the next cpu */
+		if ((IA32_SYSCALL_VECTOR >= vector) &&
+		    (IA32_SYSCALL_VECTOR < vector + count))
+			goto next;
+		for_each_cpu_mask(new_cpu, new_mask) {
+			for (i = 0; i < count; i++) {
+				if (per_cpu(vector_irq, new_cpu)[vector + i]
+									!= -1)
+					goto next;
+			}
+		}
+		/* Found one!  Resume the next search after this block */
+		current_vector = vector + count - 1;
+		for (i = 0; i < count; i++) {
+			cfg = &irq_cfg[irq + i];
+			if (old_vector) {
+				cfg->move_in_progress = 1;
+				cfg->old_domain = cfg->domain;
+			}
+			for_each_cpu_mask(new_cpu, new_mask) {
+				per_cpu(vector_irq, new_cpu)[vector + i] =
+					irq + i;
+			}
+			cfg->vector = vector + i;	/* per-irq vector */
+			cfg->domain = domain;
+		}
+		return 0;
+	}
+	return -ENOSPC;
+}
+
+/* Locked entry point; 'count' must be a power of two (block is so aligned) */
+static int assign_irq_vector_block(int irq, int count, cpumask_t mask)
+{
+	unsigned long flags;
+	int err;
+
+	/* vector_lock is held around the unlocked worker */
+	spin_lock_irqsave(&vector_lock, flags);
+	err = __assign_irq_vector_block(irq, count, mask);
+	spin_unlock_irqrestore(&vector_lock, flags);
+	return err;
+}
+
 static void __clear_irq_vector(int irq)
 {
 	struct irq_cfg *cfg;
@@ -788,6 +881,14 @@ static void __clear_irq_vector(int irq)
 	cpus_clear(cfg->domain);
 }
 
+static void __clear_irq_vector_block(int irq, int count)
+{
+	int i;	/* walks the block from its highest irq down to 'irq' */
+
+	for (i = count; i > 0; i--)
+		__clear_irq_vector(irq + i - 1);
+}
+
 void __setup_vector_irq(int cpu)
 {
 	/* Initialize vector_irq on a new cpu */
@@ -1895,30 +1996,56 @@ device_initcall(ioapic_init_sysfs);
 /*
  * Dynamic irq allocate and deallocation
  */
-int create_irq(void)
+
+/*
+ * On success, returns the interrupt number of the lowest numbered irq
+ * in the block.  If it can't allocate a block of the right size, it
+ * returns -1 - (length of the run it was left with, possibly 0).
+ */
+static int create_irq_block(int count)
 {
-	/* Allocate an unused irq */
-	int irq;
-	int new;
+	/* Allocate 'count' consecutive unused irqs */
+	int i, new, longest = 0;
 	unsigned long flags;
 
-	irq = -ENOSPC;
 	spin_lock_irqsave(&vector_lock, flags);
 	for (new = (NR_IRQS - 1); new >= 0; new--) {
 		if (platform_legacy_irq(new))
-			continue;
+			goto clear;
 		if (irq_cfg[new].vector != 0)
+			goto clear;
+		longest++;
+		if (longest < count)
 			continue;
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
-			irq = new;
+		/* Halve on failure, but stop at 0 or this spins forever */
+		while (longest > 0 &&
+		       __assign_irq_vector_block(new, longest, TARGET_CPUS))
+			longest /= 2;
+		if (longest < count)
+			__clear_irq_vector_block(new, longest);
 		break;
+ clear:
+		__clear_irq_vector_block(new + 1, longest);
+		longest = 0;
 	}
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	if (irq >= 0) {
-		dynamic_irq_init(irq);
+	if (longest < count)
+		return -1 - longest;
+
+	for (i = 0; i < count; i++) {
+		dynamic_irq_init(new + i);
 	}
-	return irq;
+
+	return new;
+}
+
+int create_irq(void)
+{
+	int irq = create_irq_block(1);
+
+	/* Callers only expect -ENOSPC, not the block allocator's -1 - n */
+	return (irq < 0) ? -ENOSPC : irq;
 }
 
 void destroy_irq(unsigned int irq)
@@ -1936,7 +2063,8 @@ void destroy_irq(unsigned int irq)
  * MSI message composition
  */
 #ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
+				unsigned int count, struct msi_msg *msg)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
 	int err;
@@ -1944,7 +2072,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 	cpumask_t tmp;
 
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	if (count == 1)
+		err = assign_irq_vector(irq, tmp);
+	else
+		err = assign_irq_vector_block(irq, count, tmp);
 	if (!err) {
 		cpus_and(tmp, cfg->domain, tmp);
 		dest = cpu_mask_to_apicid(tmp);
@@ -1975,6 +2106,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
+	struct msi_desc *desc = get_irq_msi(irq);
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
@@ -1983,6 +2115,10 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	if (cpus_empty(tmp))
 		return;
 
+	/* XXX: Figure out how to do CPU affinity for multiple MSIs */
+	if (desc->msi_attrib.multiple)
+		return;
+
 	if (assign_irq_vector(irq, mask))
 		return;
 
@@ -2024,7 +2160,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
 	if (irq < 0)
 		return irq;
 
-	ret = msi_compose_msg(dev, irq, &msg);
+	ret = msi_compose_msg(dev, irq, 1, &msg);
 	if (ret < 0) {
 		destroy_irq(irq);
 		return ret;
@@ -2038,6 +2174,37 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
 	return 0;
 }
 
+int arch_setup_msi_block(struct pci_dev *pdev, struct msi_desc *desc, int count)
+{
+	struct msi_msg msg;
+	int i, ret, base_irq, alloc;
+
+	/* MSI can only allocate a power-of-two */
+	alloc = roundup_pow_of_two(count);
+	base_irq = create_irq_block(alloc);
+	if (base_irq == -1)	/* empty run: rounddown of 0 is undefined */
+		return -ENOSPC;
+	if (base_irq < 0)
+		return rounddown_pow_of_two(-base_irq - 1);
+
+	ret = msi_compose_msg(pdev, base_irq, alloc, &msg);
+	if (ret) {	/* release the block, as arch_setup_msi_irq() does */
+		for (i = 0; i < alloc; i++)
+			destroy_irq(base_irq + i);
+		return ret;
+	}
+	desc->msi_attrib.multiple = order_base_2(alloc);
+
+	/* Go in reverse so set_irq_msi() leaves desc->irq == base_irq */
+	for (i = count - 1; i >= 0; i--) {
+		set_irq_msi(base_irq + i, desc);
+		set_irq_chip_and_handler_name(base_irq + i, &msi_chip,
+						handle_edge_irq, "edge");
+	}
+	write_msi_msg(base_irq, &msg);
+	return 0;
+}
+
 void arch_teardown_msi_irq(unsigned int irq)
 {
 	destroy_irq(irq);
@@ -2090,7 +2257,7 @@ int arch_setup_dmar_msi(unsigned int irq)
 	int ret;
 	struct msi_msg msg;
 
-	ret = msi_compose_msg(NULL, irq, &msg);
+	ret = msi_compose_msg(NULL, irq, 1, &msg);
 	if (ret < 0)
 		return ret;
 	dmar_msi_write(irq, &msg);
-- 
1.5.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/