While I was messing with the seq_file document, I went ahead and hacked up
an implementation of /proc/interrupts. This will be a first pass; if
nothing else, it breaks every architecture except i386. Fixing the others
should not be hard, though I can't test them. I've also misplaced my
100-CPU system somewhere, so I can't verify that it solves the initial
problem. But it should.
This version should scale to something over 300 processors, after which it
will not be possible to fit even a single line of /proc/interrupts output
into one page. At that point, if this output format is even remotely
useful, some sort of iterator which tracks interrupt and CPU numbers will
be needed.
This is against -test9, but it should apply just as well against current
BK.
jon
diff -urN -X dontdiff test9-vanilla/arch/i386/kernel/irq.c test9/arch/i386/kernel/irq.c
--- test9-vanilla/arch/i386/kernel/irq.c Tue Oct 28 01:47:40 2003
+++ test9/arch/i386/kernel/irq.c Fri Nov 14 00:37:28 2003
@@ -136,62 +136,106 @@
* Generic, controller-independent functions:
*/
-int show_interrupts(struct seq_file *p, void *v)
+
+/*
+ * Seq_file /proc/interrupts implementation.
+ */
+static void *int_seq_start(struct seq_file *f, loff_t *pos)
+{
+ return (*pos <= NR_IRQS) ? pos : NULL;
+}
+
+static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
- int i, j;
- struct irqaction * action;
+ (*pos)++;
+ if (*pos > NR_IRQS)
+ return NULL;
+ return pos;
+}
+
+static void int_seq_stop(struct seq_file *f, void *v)
+{
+ /* Nothing to do */
+}
+
+static int int_seq_show(struct seq_file *f, void *v)
+{
+ int irq = *(int *) v, cpu;
unsigned long flags;
+ struct irqaction *action;
+
+ /* Put in a header before IRQ 0 */
+ if (irq == 0) {
+ seq_printf(f, " ");
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ if (cpu_online(cpu))
+ seq_printf(f, "CPU%d ", cpu);
+ seq_putc(f, '\n');
+ }
- seq_printf(p, " ");
- for (j=0; j<NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "CPU%d ",j);
- seq_putc(p, '\n');
-
- for (i = 0 ; i < NR_IRQS ; i++) {
- spin_lock_irqsave(&irq_desc[i].lock, flags);
- action = irq_desc[i].action;
- if (!action)
- goto skip;
- seq_printf(p, "%3d: ",i);
+ if (irq < NR_IRQS) {
+ spin_lock_irqsave(&irq_desc[irq].lock, flags);
+ action = irq_desc[irq].action;
+ if (action) {
+ seq_printf(f, "%3d: ", irq);
#ifndef CONFIG_SMP
- seq_printf(p, "%10u ", kstat_irqs(i));
+ seq_printf(f, "%10u ", kstat_irqs(irq));
#else
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ if (cpu_online(cpu))
+ seq_printf(f, "%10u ", kstat_cpu(cpu).irqs[irq]);
#endif
- seq_printf(p, " %14s", irq_desc[i].handler->typename);
- seq_printf(p, " %s", action->name);
+ seq_printf(f, " %14s", irq_desc[irq].handler->typename);
+ seq_printf(f, " %s", action->name);
- for (action=action->next; action; action = action->next)
- seq_printf(p, ", %s", action->name);
+ for (action = action->next; action; action = action->next)
+ seq_printf(f, ", %s", action->name);
- seq_putc(p, '\n');
-skip:
- spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+ seq_putc(f, '\n');
+ }
+ spin_unlock_irqrestore(&irq_desc[irq].lock, flags);
+ return 0;
}
- seq_printf(p, "NMI: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", nmi_count(j));
- seq_putc(p, '\n');
+
+ /* One last "slot" at the end for misc info */
+ if (irq == NR_IRQS) {
+ seq_printf(f, "NMI: ");
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ if (cpu_online(cpu))
+ seq_printf(f, "%10u ", nmi_count(cpu));
+ seq_putc(f, '\n');
#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "LOC: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", irq_stat[j].apic_timer_irqs);
- seq_putc(p, '\n');
+ seq_printf(f, "LOC: ");
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ if (cpu_online(cpu))
+ seq_printf(f, "%10u ", irq_stat[cpu].apic_timer_irqs);
+ seq_putc(f, '\n');
#endif
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+ seq_printf(f, "ERR: %10u\n", atomic_read(&irq_err_count));
#ifdef CONFIG_X86_IO_APIC
#ifdef APIC_MISMATCH_DEBUG
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+ seq_printf(f, "MIS: %10u\n", atomic_read(&irq_mis_count));
#endif
#endif
+ }
+ else
+ return -EINVAL; /* "should never happen" */
return 0;
}
+static struct seq_operations int_seq_ops = {
+ .start = int_seq_start,
+ .next = int_seq_next,
+ .stop = int_seq_stop,
+ .show = int_seq_show
+};
+
+int interrupts_open(struct inode *inode, struct file *filp)
+{
+ return seq_open(filp, &int_seq_ops);
+}
+
+
#ifdef CONFIG_SMP
inline void synchronize_irq(unsigned int irq)
{
diff -urN -X dontdiff test9-vanilla/fs/proc/proc_misc.c test9/fs/proc/proc_misc.c
--- test9-vanilla/fs/proc/proc_misc.c Tue Sep 30 00:49:20 2003
+++ test9/fs/proc/proc_misc.c Fri Nov 14 00:38:00 2003
@@ -473,30 +473,12 @@
return proc_calc_metrics(page, start, off, count, eof, len);
}
-extern int show_interrupts(struct seq_file *p, void *v);
-static int interrupts_open(struct inode *inode, struct file *file)
-{
- unsigned size = 4096 * (1 + num_online_cpus() / 8);
- char *buf = kmalloc(size, GFP_KERNEL);
- struct seq_file *m;
- int res;
-
- if (!buf)
- return -ENOMEM;
- res = single_open(file, show_interrupts, NULL);
- if (!res) {
- m = file->private_data;
- m->buf = buf;
- m->size = size;
- } else
- kfree(buf);
- return res;
-}
+extern int interrupts_open(struct inode *, struct file *); /* In arch code */
static struct file_operations proc_interrupts_operations = {
.open = interrupts_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = seq_release,
};
static int filesystems_read_proc(char *page, char **start, off_t off,
Binary files test9-vanilla/scripts/bin2c and test9/scripts/bin2c differ
On Thu, Nov 13, 2003 at 10:36:26AM -0700, Jonathan Corbet wrote:
> This version should scale to something over 300 processors, after which it
> will not be possible to fit even a single line of /proc/interrupts output
> into one page. At that point, if this output format is even remotely
> useful, some sort of iterator which tracks interrupt and CPU numbers will
> be needed.
I assume you're talking about overflowing a 4k page? If so, we might be
able to limp along with the small 300p limit for a while on ia64, since
most people running large systems use at least a 16k page size.
Thanks,
Jesse
On Thu, Nov 13, 2003 at 10:36:26AM -0700, Jonathan Corbet wrote:
> While I was messing with the seq_file document, I went ahead and hacked up
> an implementation of /proc/interrupts. This will be a first pass; if
> nothing else, it breaks every architecture except i386. Fixing the others
> should not be hard, though I can't test them. I've also misplaced my
> 100-CPU system somewhere, so I can't verify that it solves the initial
> problem. But it should.
>
> This version should scale to something over 300 processors, after which it
> will not be possible to fit even a single line of /proc/interrupts output
> into one page. At that point, if this output format is even remotely
> useful, some sort of iterator which tracks interrupt and CPU numbers will
> be needed.
What the hell? You *do* realize that seq_read() will increase the buffer
size if it can't fit the single entry into the current buffer, don't you?
Guys, there is no 4Kb limit. At all. If you get longer entries - fine, the
thing will work. It will grow the buffer large enough to hold the longest
entry, though.
You don't *have* to preallocate a buffer - it makes sense to do so if you know
that one page will be too tight anyway, but it's not required.
You obviously want to keep entries reasonably small - exactly because users
can open the file and start reading from it. Which will allocate (besides
the things normally allocated for any opened file) a buffer for said entries.
As long as it stays within several pages, there's no problem - after all,
you can always open a pipe and write to it / open a pair of AF_UNIX sockets
and send yourself datagrams / etc. It's not as if that situation were unusual.
If you get a buffer much bigger than that, you are asking for a DoS, obviously.
> What the hell? You *do* realize that seq_read() will increase the buffer
> size if it can't fit the single entry into the current buffer, don't you?
Why, of course I do...now... Obviously I had missed that before. That's
cool, no ~300-processor limit. Just make sure you get a really wide screen
with that monster system.
> You don't *have* to preallocate buffer - it makes sense to do if you know
> that one page will be too tight anyway, but it's not required.
My patch doesn't do any preallocation; it didn't seem worthwhile. A 500
processor system will have the resources to do that performance-critical
/proc/interrupts service twice anyway :).
Here, anyway, is a better version of the patch. It's less intrusive,
forgoes some "cleanups" I indulged in the first time, and makes it easier
to update other architectures. I did x86_64, ia64 and ppc64 just for the
heck of it, but I can't test them.
jon
diff -urN -X dontdiff test9-vanilla/arch/i386/kernel/irq.c test9/arch/i386/kernel/irq.c
--- test9-vanilla/arch/i386/kernel/irq.c Tue Oct 28 01:47:40 2003
+++ test9/arch/i386/kernel/irq.c Fri Nov 14 05:11:29 2003
@@ -138,17 +138,19 @@
int show_interrupts(struct seq_file *p, void *v)
{
- int i, j;
+ int i = *(int *) v, j;
struct irqaction * action;
unsigned long flags;
- seq_printf(p, " ");
- for (j=0; j<NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "CPU%d ",j);
- seq_putc(p, '\n');
+ if (i == 0) {
+ seq_printf(p, " ");
+ for (j=0; j<NR_CPUS; j++)
+ if (cpu_online(j))
+ seq_printf(p, "CPU%d ",j);
+ seq_putc(p, '\n');
+ }
- for (i = 0 ; i < NR_IRQS ; i++) {
+ if (i < NR_IRQS) {
spin_lock_irqsave(&irq_desc[i].lock, flags);
action = irq_desc[i].action;
if (!action)
@@ -170,28 +172,32 @@
seq_putc(p, '\n');
skip:
spin_unlock_irqrestore(&irq_desc[i].lock, flags);
- }
- seq_printf(p, "NMI: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", nmi_count(j));
- seq_putc(p, '\n');
+ } else if (i == NR_IRQS) {
+ seq_printf(p, "NMI: ");
+ for (j = 0; j < NR_CPUS; j++)
+ if (cpu_online(j))
+ seq_printf(p, "%10u ", nmi_count(j));
+ seq_putc(p, '\n');
#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "LOC: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", irq_stat[j].apic_timer_irqs);
- seq_putc(p, '\n');
+ seq_printf(p, "LOC: ");
+ for (j = 0; j < NR_CPUS; j++)
+ if (cpu_online(j))
+ seq_printf(p, "%10u ", irq_stat[j].apic_timer_irqs);
+ seq_putc(p, '\n');
#endif
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
#ifdef CONFIG_X86_IO_APIC
#ifdef APIC_MISMATCH_DEBUG
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
#endif
#endif
+ }
return 0;
}
+
+
+
#ifdef CONFIG_SMP
inline void synchronize_irq(unsigned int irq)
{
diff -urN -X dontdiff test9-vanilla/arch/ia64/kernel/irq.c test9/arch/ia64/kernel/irq.c
--- test9-vanilla/arch/ia64/kernel/irq.c Tue Oct 28 01:47:40 2003
+++ test9/arch/ia64/kernel/irq.c Fri Nov 14 05:04:21 2003
@@ -160,18 +160,20 @@
int show_interrupts(struct seq_file *p, void *v)
{
- int i, j;
+ int j, i = *(int *) v;
struct irqaction * action;
irq_desc_t *idesc;
unsigned long flags;
- seq_puts(p, " ");
- for (j=0; j<NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "CPU%d ",j);
- seq_putc(p, '\n');
+ if (i == 0) {
+ seq_puts(p, " ");
+ for (j=0; j<NR_CPUS; j++)
+ if (cpu_online(j))
+ seq_printf(p, "CPU%d ",j);
+ seq_putc(p, '\n');
+ }
- for (i = 0 ; i < NR_IRQS ; i++) {
+ if (i < NR_IRQS) {
idesc = irq_descp(i);
spin_lock_irqsave(&idesc->lock, flags);
action = idesc->action;
@@ -194,25 +196,26 @@
seq_putc(p, '\n');
skip:
spin_unlock_irqrestore(&idesc->lock, flags);
- }
- seq_puts(p, "NMI: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", nmi_count(j));
- seq_putc(p, '\n');
+ } else if (i == NR_IRQS) {
+ seq_puts(p, "NMI: ");
+ for (j = 0; j < NR_CPUS; j++)
+ if (cpu_online(j))
+ seq_printf(p, "%10u ", nmi_count(j));
+ seq_putc(p, '\n');
#ifdef CONFIG_X86_LOCAL_APIC
- seq_puts(p, "LOC: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", irq_stat[j].apic_timer_irqs);
- seq_putc(p, '\n');
+ seq_puts(p, "LOC: ");
+ for (j = 0; j < NR_CPUS; j++)
+ if (cpu_online(j))
+ seq_printf(p, "%10u ", irq_stat[j].apic_timer_irqs);
+ seq_putc(p, '\n');
#endif
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
#ifdef CONFIG_X86_IO_APIC
#ifdef APIC_MISMATCH_DEBUG
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
#endif
#endif
+ }
return 0;
}
diff -urN -X dontdiff test9-vanilla/arch/ppc64/kernel/irq.c test9/arch/ppc64/kernel/irq.c
--- test9-vanilla/arch/ppc64/kernel/irq.c Tue Oct 21 04:40:13 2003
+++ test9/arch/ppc64/kernel/irq.c Fri Nov 14 05:10:02 2003
@@ -323,18 +323,20 @@
int show_interrupts(struct seq_file *p, void *v)
{
- int i, j;
+ int i = *(int *) v, j;
struct irqaction * action;
unsigned long flags;
- seq_printf(p, " ");
- for (j=0; j<NR_CPUS; j++) {
- if (cpu_online(j))
- seq_printf(p, "CPU%d ",j);
+ if (i == 0) {
+ seq_printf(p, " ");
+ for (j=0; j<NR_CPUS; j++) {
+ if (cpu_online(j))
+ seq_printf(p, "CPU%d ",j);
+ }
+ seq_putc(p, '\n');
}
- seq_putc(p, '\n');
-
- for (i = 0 ; i < NR_IRQS ; i++) {
+
+ if (i < NR_IRQS) {
spin_lock_irqsave(&irq_desc[i].lock, flags);
action = irq_desc[i].action;
if (!action || !action->handler)
@@ -359,8 +361,8 @@
seq_putc(p, '\n');
skip:
spin_unlock_irqrestore(&irq_desc[i].lock, flags);
- }
- seq_printf(p, "BAD: %10u\n", ppc_spurious_interrupts);
+ } else if (i == NR_IRQS)
+ seq_printf(p, "BAD: %10u\n", ppc_spurious_interrupts);
return 0;
}
diff -urN -X dontdiff test9-vanilla/arch/x86_64/kernel/irq.c test9/arch/x86_64/kernel/irq.c
--- test9-vanilla/arch/x86_64/kernel/irq.c Sat Oct 11 06:00:32 2003
+++ test9/arch/x86_64/kernel/irq.c Fri Nov 14 05:08:41 2003
@@ -138,17 +138,19 @@
int show_interrupts(struct seq_file *p, void *v)
{
- int i, j;
+ int i = *(int *) v, j;
struct irqaction * action;
unsigned long flags;
- seq_printf(p, " ");
- for (j=0; j<NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "CPU%d ",j);
- seq_putc(p, '\n');
+ if (i == 0) {
+ seq_printf(p, " ");
+ for (j=0; j<NR_CPUS; j++)
+ if (cpu_online(j))
+ seq_printf(p, "CPU%d ",j);
+ seq_putc(p, '\n');
+ }
- for (i = 0 ; i < NR_IRQS ; i++) {
+ if (i < NR_IRQS) {
spin_lock_irqsave(&irq_desc[i].lock, flags);
action = irq_desc[i].action;
if (!action)
@@ -170,25 +172,26 @@
seq_putc(p, '\n');
skip:
spin_unlock_irqrestore(&irq_desc[i].lock, flags);
- }
- seq_printf(p, "NMI: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", cpu_pda[j].__nmi_count);
- seq_putc(p, '\n');
+ } else if (i == NR_IRQS) {
+ seq_printf(p, "NMI: ");
+ for (j = 0; j < NR_CPUS; j++)
+ if (cpu_online(j))
+ seq_printf(p, "%10u ", cpu_pda[j].__nmi_count);
+ seq_putc(p, '\n');
#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "LOC: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs);
- seq_putc(p, '\n');
+ seq_printf(p, "LOC: ");
+ for (j = 0; j < NR_CPUS; j++)
+ if (cpu_online(j))
+ seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs);
+ seq_putc(p, '\n');
#endif
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
#ifdef CONFIG_X86_IO_APIC
#ifdef APIC_MISMATCH_DEBUG
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
#endif
#endif
+ }
return 0;
}
diff -urN -X dontdiff test9-vanilla/fs/proc/proc_misc.c test9/fs/proc/proc_misc.c
--- test9-vanilla/fs/proc/proc_misc.c Tue Sep 30 00:49:20 2003
+++ test9/fs/proc/proc_misc.c Fri Nov 14 04:55:35 2003
@@ -473,30 +473,46 @@
return proc_calc_metrics(page, start, off, count, eof, len);
}
-extern int show_interrupts(struct seq_file *p, void *v);
-static int interrupts_open(struct inode *inode, struct file *file)
+/*
+ * /proc/interrupts
+ */
+static void *int_seq_start(struct seq_file *f, loff_t *pos)
{
- unsigned size = 4096 * (1 + num_online_cpus() / 8);
- char *buf = kmalloc(size, GFP_KERNEL);
- struct seq_file *m;
- int res;
-
- if (!buf)
- return -ENOMEM;
- res = single_open(file, show_interrupts, NULL);
- if (!res) {
- m = file->private_data;
- m->buf = buf;
- m->size = size;
- } else
- kfree(buf);
- return res;
+ return (*pos <= NR_IRQS) ? pos : NULL;
}
+
+static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
+{
+ (*pos)++;
+ if (*pos > NR_IRQS)
+ return NULL;
+ return pos;
+}
+
+static void int_seq_stop(struct seq_file *f, void *v)
+{
+ /* Nothing to do */
+}
+
+
+extern int show_interrupts(struct seq_file *f, void *v); /* In arch code */
+static struct seq_operations int_seq_ops = {
+ .start = int_seq_start,
+ .next = int_seq_next,
+ .stop = int_seq_stop,
+ .show = show_interrupts
+};
+
+int interrupts_open(struct inode *inode, struct file *filp)
+{
+ return seq_open(filp, &int_seq_ops);
+}
+
static struct file_operations proc_interrupts_operations = {
.open = interrupts_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = seq_release,
};
static int filesystems_read_proc(char *page, char **start, off_t off,
> Here, anyway, is a better version of the patch. It's less intrusive,
> forgoes some "cleanups" I indulged in the first time, and makes it easier
> to update other architectures. I did x86-64, ia_64 and ppc64 just for the
> heck of it, but I can't test them.
I tested your changes on a small ia64 Altix here. It worked well. I'll try
it out on a 512p system when I can get a time slot on it.
Thanks for doing this.
Erik
Hi there. I'm sorry it took me so long to test this. I was able to
get some time on our 512p system this morning. I ran the test and your
fix does solve this problem.
Actually, it was 511 processors at the time.
I was going to include the output but I decided most people wouldn't want
to stretch their windows that wide. The output isn't pretty on a system
with this many processors - but it isn't breaking and that is the main
concern.
Thanks again for looking into this. Much appreciated.
Erik
> > Here, anyway, is a better version of the patch. It's less intrusive,
> > forgoes some "cleanups" I indulged in the first time, and makes it easier
> > to update other architectures. I did x86-64, ia_64 and ppc64 just for the
> > heck of it, but I can't test them.
>
> I tested your changes on a small ia64 Altix here. It worked well. I'll try
> it out on a 512p system when I can get a time slot on it.
>
> Thanks for doing this.
>
> Erik
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
--
Erik Jacobson - Linux System Software - Silicon Graphics - Eagan, Minnesota