As many of you will be aware, we've been working on infrastructure for
user-mode PCI and other drivers. The first step is to be able to
handle interrupts from user space. Subsequent patches add
infrastructure for setting up DMA for PCI devices.
The user-level interrupt code doesn't depend on the other patches, and
is probably the most mature of this patchset.
This patch adds a new file to /proc/irq/<nnn>/ called irq. Suitably
privileged processes can open this file. Reading the file returns the
number of interrupts (if any) that have occurred since the last read.
If the file is opened in blocking mode, reading it blocks until
an interrupt occurs. poll(2) and select(2) work as one would expect, to
allow interrupts to be one of many events to wait for.
(If you didn't like the file, one could have a special system call to
return the file descriptor).
Interrupts are usually masked; while a thread is in poll(2) or read(2) on the
file they are unmasked.
All architectures that use CONFIG_GENERIC_HARDIRQ are supported by
this patch.
A low latency user level interrupt handler would do something like
this, on a CONFIG_PREEMPT kernel:
int irqfd;
int n_ints;
struct sched_param sched_param;
irqfd = open("/proc/irq/513/irq", O_RDONLY);
mlockall(MCL_CURRENT | MCL_FUTURE);
sched_param.sched_priority = sched_get_priority_max(SCHED_FIFO) - 10;
sched_setscheduler(0, SCHED_FIFO, &sched_param);
while (read(irqfd, &n_ints, sizeof n_ints) == sizeof n_ints) {
... talk to device to handle interrupt
}
If you don't care about latency, then forget about the mlockall() and
setting the priority, and you don't need CONFIG_PREEMPT.
Signed-off-by: Peter Chubb <[email protected]>
kernel/irq/proc.c | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++----
1 files changed, 153 insertions(+), 10 deletions(-)
Index: linux-2.6.11-usrdrivers/kernel/irq/proc.c
===================================================================
--- linux-2.6.11-usrdrivers.orig/kernel/irq/proc.c 2005-03-11 10:30:57.875619102 +1100
+++ linux-2.6.11-usrdrivers/kernel/irq/proc.c 2005-03-11 10:45:07.146928168 +1100
@@ -9,6 +9,8 @@
#include <linux/irq.h>
#include <linux/proc_fs.h>
#include <linux/interrupt.h>
+#include <linux/poll.h>
+#include "internals.h"
static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
@@ -90,27 +92,168 @@
action->dir = proc_mkdir(name, irq_dir[irq]);
}
+struct irq_proc {	/* per-open state kept in filp->private_data for /proc/irq/<nnn>/irq */
+ unsigned long irq;	/* IRQ number, read from the proc entry's ->data in irq_proc_open() */
+ wait_queue_head_t q;	/* read()/poll() waiters sleep here; woken by irq_proc_irq_handler() */
+ atomic_t count;	/* incremented once per interrupt; irq_proc_read() subtracts what it reports */
+ char devname[TASK_COMM_LEN];	/* current->comm at open time, handed to request_irq() as the name */
+};
+/*
+ * Kernel-side handler for an interrupt that a user-space process is
+ * waiting on through /proc/irq/<nnn>/irq.
+ *
+ * irq:  the interrupt number being delivered.
+ * vidp: the struct irq_proc that irq_proc_open() registered with
+ *       request_irq().
+ * regs: not used by this handler.
+ *
+ * Masks the line with disable_irq_nosync(), records the interrupt in
+ * idp->count and wakes anything sleeping on idp->q.  irq_proc_read() and
+ * irq_proc_poll() call enable_irq() again when they find the count at
+ * zero and the descriptor marked IRQ_DISABLED.
+ *
+ * Always returns IRQ_HANDLED.
+ */
+static irqreturn_t irq_proc_irq_handler(int irq, void *vidp, struct pt_regs *regs)
+{
+ struct irq_proc *idp = (struct irq_proc *)vidp;
+
+ BUG_ON(idp->irq != irq);	/* the cookie must belong to the line that fired */
+ disable_irq_nosync(irq);	/* mask first; user space has not serviced the device yet */
+ atomic_inc(&idp->count);
+ wake_up(&idp->q);
+ return IRQ_HANDLED;
+}
+
+
+/*
+ * Signal to userspace an interrupt has occurred.
+ *
+ * read() handler for /proc/irq/<nnn>/irq.  On success one int, holding the
+ * number of interrupts counted in ip->count at the time of the snapshot,
+ * is copied to bufp and that amount is subtracted from the counter.
+ *
+ * If no interrupt is counted yet, the line is re-enabled when its
+ * descriptor is marked IRQ_DISABLED; then the call either fails at once
+ * (O_NONBLOCK) or sleeps interruptibly on ip->q until the count is
+ * non-zero.
+ *
+ * Returns sizeof(int) on success, or:
+ *   -EINVAL       len is smaller than sizeof(int)
+ *   -EWOULDBLOCK  nothing counted and the file is O_NONBLOCK
+ *   -ERESTARTSYS  a signal is pending after a wait pass
+ *   -EFAULT       copy_to_user() failed
+ */
+static ssize_t irq_proc_read(struct file *filp, char __user *bufp, size_t len, loff_t *ppos)
+{
+ struct irq_proc *ip = (struct irq_proc *)filp->private_data;
+ irq_desc_t *idp = irq_desc + ip->irq;
+ int pending;
+
+ DEFINE_WAIT(wait);
+
+ if (len < sizeof(int))
+ return -EINVAL;
+
+ pending = atomic_read(&ip->count);
+ if (pending == 0) {
+ if (idp->status & IRQ_DISABLED)	/* unmask so a new interrupt can arrive */
+ enable_irq(ip->irq);
+ if (filp->f_flags & O_NONBLOCK)
+ return -EWOULDBLOCK;
+ }
+
+ while (pending == 0) {
+ prepare_to_wait(&ip->q, &wait, TASK_INTERRUPTIBLE);
+ pending = atomic_read(&ip->count);	/* re-check after queueing so a wakeup in between is not missed */
+ if (pending == 0)
+ schedule();
+ finish_wait(&ip->q, &wait);
+ if (signal_pending(current))	/* checked on every pass; ip->count is left untouched */
+ return -ERESTARTSYS;
+ }
+
+ if (copy_to_user(bufp, &pending, sizeof pending))
+ return -EFAULT;
+
+ *ppos += sizeof pending;
+
+ atomic_sub(pending, &ip->count);	/* subtract the snapshot, keeping anything counted since */
+ return sizeof pending;
+}
+
+/*
+ * open() handler for /proc/irq/<nnn>/irq.
+ *
+ * Allocates a zeroed struct irq_proc, fills it in (caller's comm as the
+ * device name, empty wait queue, count of zero, IRQ number from the proc
+ * entry's ->data) and installs irq_proc_irq_handler() on that IRQ with
+ * the structure as the dev_id cookie.  The structure is stored in
+ * filp->private_data for read/poll/release.
+ *
+ * WARNING: only SA_INTERRUPT is passed to request_irq(); no sharing flag
+ * is set, and none should be added.  NOTE(review): per the patch author
+ * this is what makes the open fail on a line that is already claimed,
+ * and the handler's disable_irq_nosync() would mask every other device
+ * on a shared line -- confirm before changing the flags.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, or the negative
+ * error from request_irq().
+ */
+static int irq_proc_open(struct inode *inop, struct file *filp)
+{
+ struct irq_proc *ip;
+ struct proc_dir_entry *ent = PDE(inop);
+ int error;
+
+ ip = kmalloc(sizeof *ip, GFP_KERNEL);
+ if (ip == NULL)
+ return -ENOMEM;
+
+ memset(ip, 0, sizeof(*ip));
+ strcpy(ip->devname, current->comm);	/* devname is TASK_COMM_LEN bytes; assumes comm is no longer -- TODO confirm */
+ init_waitqueue_head(&ip->q);
+ atomic_set(&ip->count, 0);
+ ip->irq = (unsigned long)ent->data;	/* set by register_irq_proc() when the entry was created */
+
+ error = request_irq(ip->irq,
+ irq_proc_irq_handler,
+ SA_INTERRUPT,
+ ip->devname,
+ ip);
+ if (error < 0) {
+ kfree(ip);	/* handler was never installed, so nothing else references ip */
+ return error;
+ }
+ filp->private_data = (void *)ip;
+
+ return 0;
+}
+/*
+ * release() handler for /proc/irq/<nnn>/irq.
+ *
+ * Removes the handler installed by irq_proc_open() and frees the per-open
+ * struct irq_proc.  inop is unused.  Always returns 0.
+ */
+static int irq_proc_release(struct inode *inop, struct file *filp)
+{
+ struct irq_proc *ip = (struct irq_proc *)filp->private_data;
+ (void)inop;
+ free_irq(ip->irq, ip);	/* unregister before freeing the cookie the handler dereferences */
+ filp->private_data = NULL;
+ kfree(ip);
+ return 0;
+}
+/*
+ * poll()/select() handler for /proc/irq/<nnn>/irq.
+ *
+ * Returns POLLIN | POLLRDNORM when at least one interrupt is counted in
+ * ip->count, otherwise 0.  When nothing is counted, the line is
+ * re-enabled if its descriptor is marked IRQ_DISABLED and the caller is
+ * registered on ip->q so the handler's wake_up() reaches it.
+ */
+static unsigned int irq_proc_poll(struct file *filp, struct poll_table_struct *wait)
+{
+ struct irq_proc *ip = (struct irq_proc *)filp->private_data;
+ irq_desc_t *idp = irq_desc + ip->irq;
+
+ if (atomic_read(&ip->count) > 0)
+ return POLLIN | POLLRDNORM; /* readable */
+
+ /* if interrupts disabled and we don't have one to process... */
+ if (idp->status & IRQ_DISABLED)
+ enable_irq(ip->irq);
+
+ poll_wait(filp, &ip->q, wait);
+
+ if (atomic_read(&ip->count) > 0)	/* re-check: an interrupt may have arrived since the line was unmasked */
+ return POLLIN | POLLRDNORM; /* readable */
+
+ return 0;
+}
+/* File operations attached to the "irq" proc entry by register_irq_proc(). */
+static struct file_operations irq_proc_file_operations = {
+ .read = irq_proc_read,
+ .open = irq_proc_open,
+ .release = irq_proc_release,
+ .poll = irq_proc_poll,
+};
+
#undef MAX_NAMELEN
#define MAX_NAMELEN 10
void register_irq_proc(unsigned int irq)
{
+ struct proc_dir_entry *entry;
char name [MAX_NAMELEN];
- if (!root_irq_dir ||
- (irq_desc[irq].handler == &no_irq_type) ||
- irq_dir[irq])
+ if (!root_irq_dir)
return;
-
- memset(name, 0, MAX_NAMELEN);
- sprintf(name, "%d", irq);
-
- /* create /proc/irq/1234 */
- irq_dir[irq] = proc_mkdir(name, root_irq_dir);
+
+ if (!irq_dir[irq]) {
+ memset(name, 0, MAX_NAMELEN);
+ sprintf(name, "%d", irq);
+
+ /* create /proc/irq/1234 */
+ irq_dir[irq] = proc_mkdir(name, root_irq_dir);
+
+ /*
+ * Create handles for user-mode interrupt handlers
+ * if the kernel hasn't already grabbed the IRQ
+ */
+ entry = create_proc_entry("irq", 0600, irq_dir[irq]);
+ if (entry) {
+ entry->data = (void *)(unsigned long)irq;
+ entry->read_proc = NULL;
+ entry->write_proc = NULL;
+ entry->proc_fops = &irq_proc_file_operations;
+ }
+ }
#ifdef CONFIG_SMP
- {
+ if (!smp_affinity_entry[irq]) {
struct proc_dir_entry *entry;
/* create /proc/irq/<irq>/smp_affinity */
Hi!
> As many of you will be aware, we've been working on infrastructure for
> user-mode PCI and other drivers. The first step is to be able to
> handle interrupts from user space. Subsequent patches add
> infrastructure for setting up DMA for PCI devices.
>
> The user-level interrupt code doesn't depend on the other patches, and
> is probably the most mature of this patchset.
Okay, I like it; it means way easier PCI driver development.
But... how do you handle shared PCI interrupts?
> This patch adds a new file to /proc/irq/<nnn>/ called irq. Suitably
> privileged processes can open this file. Reading the file returns the
> number of interrupts (if any) that have occurred since the last read.
> If the file is opened in blocking mode, reading it blocks until
> an interrupt occurs. poll(2) and select(2) work as one would expect, to
> allow interrupts to be one of many events to wait for.
> (If you didn't like the file, one could have a special system call to
> return the file descriptor).
This should go into Documentation/ somewhere.
Pavel
--
People were complaining that M$ turns users into beta-testers...
...jr ghea gurz vagb qrirybcref, naq gurl frrz gb yvxr vg gung jnl!
I have many users asking for something like this. Peter's approach is
simple and it appears to solve the problem for many situations.
With that in mind though, for a more complicated but higher performing
approach please take a look at the User Level Interrupt (ULI) project at
http://oss.sgi.com/projects/uli/. It requires some per-arch assembly but
with some recent changes we're seeing ~1.5us latency from hardware event to
user space function start. I'm hoping to add an IA32 port out soon.
Thanks,
Michael
On Fri, Mar 11, 2005 at 02:36:10PM +1100, Peter Chubb wrote:
>
> As many of you will be aware, we've been working on infrastructure for
> user-mode PCI and other drivers. The first step is to be able to
> handle interrupts from user space. Subsequent patches add
> infrastructure for setting up DMA for PCI devices.
--
Michael A. Raymond Office: (651) 683-3434
Core OS Group Real-Time System Software
On Fri, Mar 11, 2005 at 07:50:32AM -0600, Michael Raymond wrote:
> I have many users asking for something like this.
Why would a "user" care about this?
Now hardware companies that want to write closed drivers is another
thing :)
thanks,
greg k-h
We have some customers doing high performance raw I/O from various PCI &
VME cards. They can already mmap() and do DMA from user space to the cards.
Allowing them to do interrupt processing in user space allows them to keep
everything in one tight package. The ULI web site talks about this a little
more.
Thanks,
Michael
On Fri, Mar 11, 2005 at 09:25:14AM -0800, Greg KH wrote:
> On Fri, Mar 11, 2005 at 07:50:32AM -0600, Michael Raymond wrote:
> > I have many users asking for something like this.
>
> Why would a "user" care about this?
>
> Now hardware companies that want to write closed drivers is another
> thing :)
>
> thanks,
>
> greg k-h
--
Michael A. Raymond Office: (651) 683-3434
Core OS Group Real-Time System Software
On Gwe, 2005-03-11 at 03:36, Peter Chubb wrote:
> +static irqreturn_t irq_proc_irq_handler(int irq, void *vidp, struct pt_regs *regs)
> +{
> + struct irq_proc *idp = (struct irq_proc *)vidp;
> +
> + BUG_ON(idp->irq != irq);
> + disable_irq_nosync(irq);
> + atomic_inc(&idp->count);
> + wake_up(&idp->q);
> + return IRQ_HANDLED;
You just deadlocked the machine in many configurations. You can't use
disable_irq for this trick; you have to tell the kernel how to handle it.
I posted a proposal for this sometime ago because X has some uses for
it. The idea being you'd pass a struct that describes
1. What tells you an IRQ occurred on this device
2. How to clear it
3. How to enable/disable it.
Something like
struct {
u8 type; /* 8, 16, 32 I/O or MMIO */
u8 bar; /* PCI bar to use */
u32 offset; /* Into bar */
u32 mask; /* Bits to touch/compare */
u32 value; /* Value to check against/set */
}
On Fri, 11 Mar 2005 14:36:10 +1100, Peter Chubb
<[email protected]> wrote:
>
> As many of you will be aware, we've been working on infrastructure for
> user-mode PCI and other drivers. The first step is to be able to
> handle interrupts from user space. Subsequent patches add
> infrastructure for setting up DMA for PCI devices.
I've tried implementing this before and could not get around the
interrupt problem. Most interrupts on the x86 architecture are shared.
Disabling the IRQ at the PIC blocks all of the shared IRQs. This works
(hope your userspace handler is last on the shared handler list) until
you have a problem in userspace.
Once you have a problem in userspace there is no way to acknowledge
the interrupt anymore. I tried to address that by maintaining a timer
and suspending the hardware through the D0 state to reset it. That had
some success. Not acknowledging the interrupt results in an interrupt
loop and reboot.
The problem can be mitigated by choosing which slot to put your
hardware in. This can reduce the number of shared interrupts.
If you can get exclusive use of the interrupt this method will work.
If I were designing a new bus I would make interrupt acknowledge part
of PCI config space in order to allow a single piece of code to
acknowledge them. Since we can't change the bus the only safe way to
do this is to build a hardware specific driver for each device to
acknowledge the interrupt.
Bottom line is that I could find no reliable solution for handling interrupts.
--
Jon Smirl
[email protected]
On Fri, 11 Mar 2005 11:29:20 +0100, Pavel Machek <[email protected]> wrote:
> Hi!
>
> > As many of you will be aware, we've been working on infrastructure for
> > user-mode PCI and other drivers. The first step is to be able to
> > handle interrupts from user space. Subsequent patches add
> > infrastructure for setting up DMA for PCI devices.
> >
> > The user-level interrupt code doesn't depend on the other patches, and
> > is probably the most mature of this patchset.
>
> Okay, I like it; it means way easier PCI driver development.
It won't help with PCI driver development. I tried implementing this
for UML. If your driver has any bugs it won't get the interrupts
acknowledged correctly and you'll end up rebooting.
Xen just posted patches for using kgdb between two instances but I
don't see how they get out of the interrupt acknowledge problem
either.
>
> But... how do you handle shared PCI interrupts?
>
> > This patch adds a new file to /proc/irq/<nnn>/ called irq. Suitably
> > privileged processes can open this file. Reading the file returns the
> > number of interrupts (if any) that have occurred since the last read.
> > If the file is opened in blocking mode, reading it blocks until
> > an interrupt occurs. poll(2) and select(2) work as one would expect, to
> > allow interrupts to be one of many events to wait for.
> > (If you didn't like the file, one could have a special system call to
> > return the file descriptor).
>
> This should go into Documentation/ somewhere.
> Pavel
>
> --
> People were complaining that M$ turns users into beta-testers...
> ...jr ghea gurz vagb qrirybcref, naq gurl frrz gb yvxr vg gung jnl!
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
--
Jon Smirl
[email protected]
On Sat, 12 Mar 2005, Jon Smirl wrote:
> On Fri, 11 Mar 2005 14:36:10 +1100, Peter Chubb
> <[email protected]> wrote:
> >
> > As many of you will be aware, we've been working on infrastructure for
> > user-mode PCI and other drivers. The first step is to be able to
> > handle interrupts from user space. Subsequent patches add
> > infrastructure for setting up DMA for PCI devices.
>
> I've tried implementing this before and could not get around the
> interrupt problem. Most interrupts on the x86 architecture are shared.
> Disabling the IRQ at the PIC blocks all of the shared IRQs. This works
> (hope your userspace handler is last on the shared handler list) until
> you have a problem in userspace.
>
> Once you have a problem in userspace there is no way to acknowledge
> the interrupt anymore. I tried to address that by maintaining a timer
> and suspending the hardware through the D0 state to reset it. That had
> some success. Not acknowledging the interrupt results in an interrupt
> loop and reboot.
>
> The problem can be mitigated by choosing what slot your hardware to
> put your hardware in. This can reduce the number of shared interrupts.
> If you can get exclusive use of the interrupt this method will work.
>
> If I were designing a new bus I would make interrupt acknowledge part
> of PCI config space in order to allow a single piece of code to
> acknowledge them. Since we can't change the bus the only safe way to
> do this is to build a hardware specific driver for each device to
> acknowledge the interrupt.
>
> Bottom line is that I could find no reliable solution for handing
> interrupts.
Alan's proposal sounds very plausible and additionally if we find that
we have an irq line screaming we could use the same supplied information
to disable userspace interrupt handled devices first.
Zwane
On Sat, 12 Mar 2005 11:27:25 -0500, Jon Smirl <[email protected]> wrote:
> Xen just posted patches for using kgdb between two instances but I
> don't see how they get out of the interrupt acknowledge problem
> either.
I just talked to the Xen people. They don't have a solution either.
They did point out that this is not a problem with MSI on PCI Express.
--
Jon Smirl
[email protected]
On Fri, 11 Mar 2005 19:14:13 +0000, Alan Cox <[email protected]> wrote:
> I posted a proposal for this sometime ago because X has some uses for
> it. The idea being you'd pass a struct that describes
>
> 1. What tells you an IRQ occurred on this device
> 2. How to clear it
> 3. How to enable/disable it.
>
> Something like
>
> struct {
> u8 type; /* 8, 16, 32 I/O or MMIO */
> u8 bar; /* PCI bar to use */
> u32 offset; /* Into bar */
> u32 mask; /* Bits to touch/compare */
> u32 value; /* Value to check against/set */
> }
>
It might be useful to add this to the main kernel API, and then over time
modify all of the drivers to use it. If a driver does this it would be
safe to transparently move it to user space like in UML or xen. I've
been told that PCI Express and MSI do not have this problem.
--
Jon Smirl
[email protected]
On Gwe, 2005-03-11 at 03:36, Peter Chubb wrote:
> +static irqreturn_t irq_proc_irq_handler(int irq, void *vidp, struct pt_regs *regs)
> +{
> + struct irq_proc *idp = (struct irq_proc *)vidp;
> +
> + BUG_ON(idp->irq != irq);
> + disable_irq_nosync(irq);
> + atomic_inc(&idp->count);
> + wake_up(&idp->q);
> + return IRQ_HANDLED;
Alan> You just deadlocked the machine in many configurations. You can't use
Alan> disable_irq for this trick you have to tell the kernel how to handle it.
Can you elaborate, please? In particular, why doesn't essentially the
same action (disabling an interrupt before the EOI) in
note_interrupt() lock up the machine?
I can see there'd be problems if the code allowed shared interrupts,
but it doesn't.
--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
The technical we do immediately, the political takes *forever*
>>>>> "Jon" == Jon Smirl <[email protected]> writes:
Jon> On Fri, 11 Mar 2005 14:36:10 +1100, Peter Chubb
Jon> <[email protected]> wrote:
>> As many of you will be aware, we've been working on infrastructure
>> for user-mode PCI and other drivers. The first step is to be able
>> to handle interrupts from user space. Subsequent patches add
>> infrastructure for setting up DMA for PCI devices.
Jon> I've tried implementing this before and could not get around the
Jon> interrupt problem. Most interrupts on the x86 architecture are
Jon> shared. Disabling the IRQ at the PIC blocks all of the shared
Fortunately, most interrupts on IA64, ARM, etc., are unshared. And
with PCI-Express, the problem will go away. Even on X86, things
aren't all bad: one can usually find a PCI slot which doesn't share
interrupts with anything you care about.
The scenario I'm thinking about with these patches are things like
low-latency user-level networking between nodes in a cluster, where
for good performance even with a kernel driver you don't want to share
your interrupt line with anything else.
--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
The technical we do immediately, the political takes *forever*
>>>>> "Jon" == Jon Smirl <[email protected]> writes:
Jon> On Fri, 11 Mar 2005 11:29:20 +0100, Pavel Machek <[email protected]>
Jon> wrote:
>> Hi!
>>
>> > As many of you will be aware, we've been working on
>> infrastructure for > user-mode PCI and other drivers. The first
>> step is to be able to > handle interrupts from user
>> space. Subsequent patches add > infrastructure for setting up DMA
>> for PCI devices.
>> >
>> > The user-level interrupt code doesn't depend on the other
>> patches, and > is probably the most mature of this patchset.
>>
>> Okay, I like it; it means way easier PCI driver development.
Jon> It won't help with PCI driver development. I tried implementing
Jon> this for UML. If your driver has any bugs it won't get the
Jon> interrupts acknowledged correctly and you'll end up rebooting.
That's not actually true, at least when we developed drivers here.
The only times we had to reboot were the times we mucked up the dma
register settings, and dma'd all over the kernel by mistake...
--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
The technical we do immediately, the political takes *forever*
>>>>> "Jon" == Jon Smirl <[email protected]> writes:
>> The scenario I'm thinking about with these patches are things like
>> low-latency user-level networking between nodes in a cluster, where
>> for good performance even with a kernel driver you don't want to
>> share your interrupt line with anything else.
Jon> The code needs to refuse to install if the IRQ line is shared.
It does. The request_irq() call explicitly does not include SA_SHARED
in its flags, so if the line is shared, it'll return an error to user
space when the driver tries to open the file representing the interrupt.
Jon> Also what about SMP, if you shut the IRQ off on one CPU isn't it
Jon> still enabled on all of the others?
Nope. disable_irq_nosync() talks to the interrupt controller, which
is common to all the processors. The main problem is that it's slow,
because it has to go off-chip.
--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
The technical we do immediately, the political takes *forever*
On Mon, 14 Mar 2005 11:39:00 +1100, Peter Chubb
<[email protected]> wrote:
> >>>>> "Jon" == Jon Smirl <[email protected]> writes:
>
> Jon> On Fri, 11 Mar 2005 11:29:20 +0100, Pavel Machek <[email protected]>
> Jon> wrote:
> >> Hi!
> >>
> >> > As many of you will be aware, we've been working on
> >> infrastructure for > user-mode PCI and other drivers. The first
> >> step is to be able to > handle interrupts from user
> >> space. Subsequent patches add > infrastructure for setting up DMA
> >> for PCI devices.
> >> >
> >> > The user-level interrupt code doesn't depend on the other
> >> patches, and > is probably the most mature of this patchset.
> >>
> >> Okay, I like it; it means way easier PCI driver development.
>
> Jon> It won't help with PCI driver development. I tried implementing
> Jon> this for UML. If your driver has any bugs it won't get the
> Jon> interrupts acknowledged correctly and you'll end up rebooting.
>
> That's not actually true, at least when we developed drivers here.
> The only times we had to reboot were the times we mucked up the dma
> register settings, and dma'd all over the kernel by mistake...
The way you can avoid reboot is to leave the interrupt turned off at
the PIC. The side effect is that everything else using that interrupt
is also turned off.
I did experiment with catching the process exit from the user space
app on abort. Then I used the power control registers to turn off the
card if it supported being turned off. That would then safely let me
reenable the PIC.
This code needs to refuse to attach to a shared IRQ until problems
with them are fixed. Most IRQs are shared on x86 desktops. Every
machine I have around here has no free IRQ's available.
>
> --
> Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
> The technical we do immediately, the political takes *forever*
>
--
Jon Smirl
[email protected]
>>>>> "Jon" == Jon Smirl <[email protected]> writes:
Jon> On Sat, 12 Mar 2005 10:11:18 -0700 (MST), Zwane Mwaikambo
Jon> <[email protected]> wrote:
>> Alan's proposal sounds very plausible and additionally if we find
>> that we have an irq line screaming we could use the same supplied
>> information to disable userspace interrupt handled devices first.
Jon> I like it too and it would help Xen. Now we just need to modify
Jon> 800 device drivers to use it.
It's incomplete. But you probably knew that...
The main problem I see is that even with the proposed interface, you'd
need to disable the interrupt in the interrupt controller, because
merely acknowledging an interrupt to a device doesn't stop it from
interrupting. And you really want the device to stop asserting the
interrupt before doing an EOI, unless you're going to mask the
interrupt. So you'd need to have an interface that not only
acknowledged the current interrupt but also prevented the device from
interrupting. That typically means reading a status register (slow!)
and then setting one or more bits in one or more control registers.
Also for a user level driver you really want to do the EOI before
invoking user space. Otherwise, depending on the interrupt
controller, lower numbered interrupts could be masked until the user
space returns --- which might be a long time off.
Reading the status register is typically one of the slowest
single parts of a device driver (latency can be > 2 usec), so you don't
really want to have to read it again within the driver... so you'd
probably want to pass it as part of the interrupt arguments to the
driver.
--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
The technical we do immediately, the political takes *forever*
>>>>> "Jon" == Jon Smirl <[email protected]> writes:
Jon> On Mon, 14 Mar 2005 12:42:27 +1100, Peter Chubb
Jon> <[email protected]> wrote:
>> >>>>> "Jon" == Jon Smirl <[email protected]> writes:
>>
>> >> The scenario I'm thinking about with these patches are things
>> like >> low-latency user-level networking between nodes in a
>> cluster, where >> for good performance even with a kernel driver
>> you don't want to >> share your interrupt line with anything else.
Jon> Instead of making up a new API what about making a library of
Jon> calls that emulates the common entry points used by device
Jon> drivers. The version I did for UML could take the same driver and
Jon> run it in user space or the kernel without changing source
Jon> code. I found this very useful.
The in-kernel device drivers interface is very large --- I want to
start with something a bit simpler. We do have a compatibility
library, as yet unreleased, that allows the same drivers to run
in-kernel or in user space.
--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
The technical we do immediately, the political takes *forever*
On Sat, 12 Mar 2005 10:11:18 -0700 (MST), Zwane Mwaikambo
<[email protected]> wrote:
> Alan's proposal sounds very plausible and additionally if we find that
> we have an irq line screaming we could use the same supplied information
> to disable userspace interrupt handled devices first.
I like it too and it would help Xen. Now we just need to modify 800
device drivers to use it.
> Zwane
>
--
Jon Smirl
[email protected]
On Mon, 14 Mar 2005 12:42:27 +1100, Peter Chubb
<[email protected]> wrote:
> >>>>> "Jon" == Jon Smirl <[email protected]> writes:
>
> >> The scenario I'm thinking about with these patches are things like
> >> low-latency user-level networking between nodes in a cluster, where
> >> for good performance even with a kernel driver you don't want to
> >> share your interrupt line with anything else.
Instead of making up a new API what about making a library of calls
that emulates the common entry points used by device drivers. The
version I did for UML could take the same driver and run it in user
space or the kernel without changing source code. I found this very
useful.
--
Jon Smirl
[email protected]
On Llu, 2005-03-14 at 00:02, Peter Chubb wrote:
> I can see there'd be problems if the code allowed shared interrupts,
> but it doesn't.
If you don't allow shared IRQs it's useless; if you do allow shared
IRQs it deadlocks. Take your pick 8)
As to your comment about needing to do a few more I/O operations I
agree. However if your need is for speed then you might want to just
write a small IRQ helper module for the kernel or extend the syntax I
proposed a little (it's conveniently trivial to generate native code from
this).
There isn't much you can do about the status read without MWI on most
chip designs (some get it right by posting status to system memory but
not many)
Alan
On Mon, 14 Mar 2005 13:33:31 +0000, Alan Cox <[email protected]> wrote:
> On Llu, 2005-03-14 at 00:02, Peter Chubb wrote:
> > I can see there'd be problems if the code allowed shared interrupts,
> > but it doesn't.
>
> If you don't allow shared IRQ's its useless, if you do allow shared
> IRQ's it deadlocks. Take your pick 8)
>
> As to your comment about needing to do a few more I/O operations I
> agree. However if your need is for speed then you might want to just
> write a small IRQ helper module for the kernel or extend the syntax I
> proposed a little (its conveniently trivial to generate native code from
> this).
The concept of passing in a little structure telling how to
acknowledge an interrupt is a very good one. I'd like to see it added
as a kernel feature so that drivers could start being converted to it.
This is a big deal for Xen since Xen has the same problem with
forwarded IRQs. Xen would pass the little structure from the domain to
the supervisor so that the supervisor could cut off the IRQ if the
domain fails.
>
> There isn't much you can do about the status read without MWI on most
> chip designs (some get it right by posting status to system memory but
> not many)
>
> Alan
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
--
Jon Smirl
[email protected]
On Mon, 14 Mar 2005 12:42:27 +1100, Peter Chubb
<[email protected]> wrote:
> >>>>> "Jon" == Jon Smirl <[email protected]> writes:
>
> >> The scenario I'm thinking about with these patches are things like
> >> low-latency user-level networking between nodes in a cluster, where
> >> for good performance even with a kernel driver you don't want to
> >> share your interrupt line with anything else.
>
> Jon> The code needs to refuse to install if the IRQ line is shared.
>
> It does. The request_irq() call explicitly does not include SA_SHARED
> in its flags, so if the line is shared, it'll return an error to user
> space when the driver tries to open the file representing the interrupt.
Please put some big comments warning people about adding SA_SHARED. I
can easily see someone thinking that they are fixing a bug by adding
it. I'd probably even write a paragraph about what will happen if
SA_SHARED is added.
>
> Jon> Also what about SMP, if you shut the IRQ off on one CPU isn't it
> Jon> still enabled on all of the others?
>
> Nope. disable_irq_nosync() talks to the interrupt controller, which
> is common to all the processors. The main problem is that it's slow,
> because it has to go off-chip.
>
> --
> Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
> The technical we do immediately, the political takes *forever*
>
--
Jon Smirl
[email protected]
>>>>> "Jon" == Jon Smirl <[email protected]> writes:
Jon> On Mon, 14 Mar 2005 12:42:27 +1100, Peter Chubb
Jon> <[email protected]> wrote:
>> >>>>> "Jon" == Jon Smirl <[email protected]> writes:
>>
>> >> The scenario I'm thinking about with these patches are things
>> like >> low-latency user-level networking between nodes in a
>> cluster, where >> for good performance even with a kernel driver
>> you don't want to >> share your interrupt line with anything else.
>>
Jon> The code needs to refuse to install if the IRQ line is shared.
>> It does. The request_irq() call explicitly does not include
>> SA_SHARED in its flags, so if the line is shared, it'll return an
>> error to user space when the driver tries to open the file
>> representing the interrupt.
Jon> Please put some big comments warning people about adding
Jon> SA_SHARED. I can easily see someone thinking that they are fixing
Jon> a bug by adding it. I'd probably even write a paragraph about
Jon> what will happen if SA_SHARED is added.
Will do. The main problem here is X86, as other architectures either
don't care, or have enough interrupt lines. And the people who are
paying me for this kind of thing all run IA64....
What I really want to do is deprivilege the driver code as much as
possible. Whatever a driver does, the rest of the system should keep
going. That way malicious or buggy drivers can only affect the
processes that are trying to use the device they manage. Moreover, it
should be possible to kill -9 a driver, then restart it, without the
rest of the system noticing more than a hiccup. To do this,
step one is to run the driver in user space, so that it's subject to
the same resource management control as any other process. Step two,
which is a lot harder, is to connect the driver back into the kernel
so that it can be shared. Tun/Tap can be used for network devices,
but it's really too slow -- you need zero-copy and shared notification.
--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
The technical we do immediately, the political takes *forever*
On Tue, 15 Mar 2005 14:47:42 +1100, Peter Chubb
<[email protected]> wrote:
> What I really want to do is deprivilege the driver code as much as
> possible. Whatever a driver does, the rest of the system should keep
> going. That way malicious or buggy drivers can only affect the
> processes that are trying to use the device they manage. Moreover, it
> should be possible to kill -9 a driver, then restart it, without the
> rest of the system noticing more than a hiccup. To do this,
> step one is to run the driver in user space, so that it's subject to
> the same resource management control as any other process. Step two,
> which is a lot harder, is to connect the driver back into the kernel
> so that it can be shared. Tun/Tap can be used for network devices,
> but it's really too slow -- you need zero-copy and shared notification.
Have you considered running the drivers in a domain under Xen?
>
> --
> Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
> The technical we do immediately, the political takes *forever*
>
--
Jon Smirl
[email protected]
>>>>> "Jon" == Jon Smirl <[email protected]> writes:
Jon> On Tue, 15 Mar 2005 14:47:42 +1100, Peter Chubb
Jon> <[email protected]> wrote:
>> What I really want to do is deprivilege the driver code as much as
>> possible. Whatever a driver does, the rest of the system should
>> keep going. That way malicious or buggy drivers can only affect
>> the processes that are trying to use the device they manage.
>> Moreover, it should be possible to kill -9 a driver, then restart
>> it, without the rest of the system noticing more than a hiccup. To
>> do this, step one is to run the driver in user space, so that it's
>> subject to the same resource management control as any other
>> process. Step two, which is a lot harder, is to connect the driver
>> back into the kernel so that it can be shared. Tun/Tap can be used
>> for network devices, but it's really too slow -- you need zero-copy
>> and shared notification.
Jon> Have you considered running the drivers in a domain under Xen?
See the paper presented by Karlsruhe at OSDI:
Joshua LeVasseur, Volkmar Uhlig, Jan Stoess, and Stefan Götz:
Unmodified Device Driver Reuse and Improved System Dependability via
Virtual Machines. OSDI '04.
They're using L4, rather than Xen as the paravirtualisation layer.
--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
The technical we do immediately, the political takes *forever*
On Sat, 2005-03-12 at 21:03 -0500, Jon Smirl wrote:
> On Fri, 11 Mar 2005 19:14:13 +0000, Alan Cox <[email protected]> wrote:
> > I posted a proposal for this sometime ago because X has some uses for
> > it. The idea being you'd pass a struct that describes
> >
> > 1. What tells you an IRQ occurred on this device
> > 2. How to clear it
> > 3. How to enable/disable it.
> >
> > Something like
> >
> > struct {
> > u8 type; /* 8, 16, 32 I/O or MMIO */
> > u8 bar; /* PCI bar to use */
> > u32 offset; /* Into bar */
> > u32 mask; /* Bits to touch/compare */
> > u32 value; /* Value to check against/set */
> > }
> >
>
> It might be useful to add this to the main kernel API, and then over time
> modify all of the drivers to use it. If a driver does this it would be
> safe to transparently move it to user space like in UML or xen. I've
> been told that PCI Express and MSI does not have this problem.
>
This seems sufficient for the simplest devices, that just have an
IRQ_PENDING and an IRQ_ACK register. But what about a device like the
emu10k1 where you have a half loop and loop interrupt for each of 64
channels, plus about 10 other interrupt sources? The IPR just tells you
there's a channel loop interrupt pending, in order to properly ACK it
you need to set a bit in one of 4 registers depending on whether it's a
loop or half loop interrupt, and whether the channel is 0-31 or 32-63.
Lee
On Maw, 2005-03-15 at 04:32, Lee Revell wrote:
> This seems sufficient for the simplest devices, that just have an
> IRQ_PENDING and an IRQ_ACK register. But what about a device like the
> emu10k1 where you have a half loop and loop interrupt for each of 64
> channels, plus about 10 other interrupt sources? The IPR just tells you
> there's a channel loop interrupt pending, in order to properly ACK it
> you need to set a bit in one of 4 registers depending on whether it's a
> loop or half loop interrupt, and whether the channel is 0-31 or 32-63.
Do we need to solve it for all such devices in one go, and can we write
custom code for the hard cases? Peter solved the simple unshared-IRQ
case. I'd like to solve the simple shared IRQ cases too (because X can
use this).
I'm wondering where the right line is?
From: [email protected]
[mailto:[email protected]] On Behalf Of Peter Chubb
> >>>>> "Jon" == Jon Smirl <[email protected]> writes:
>
> >> The scenario I'm thinking about with these patches are things like
> >> low-latency user-level networking between nodes in a cluster, where
> >> for good performance even with a kernel driver you don't want to
> >> share your interrupt line with anything else.
>
> Jon> The code needs to refuse to install if the IRQ line is shared.
>
> It does. The request_irq() call explicitly does not include SA_SHARED
> in its flags, so if the line is shared, it'll return an error to user
> space when the driver tries to open the file representing the
> interrupt.
I actually have a simple case of user-mode handling of shared IRQs
working, based off the Gelato work from maybe 6 months ago.
What I did was add a new IOCTL to tell the kernel code how to clear the
interrupt status (or disable the interrupt output on the card, as
appropriate).
This basically took the form of telling the kernel to map a specified
subsection of a specified BAR into kernel space for register access,
then execute a small configurable read-modify-write sequence for the
actual interrupt disable (configurable read/write address, optionally no
read, configurable and/or/shift of the read value (or initial constant)
before write etc.)
This would probably cover most situations.
Once the interrupt is disabled/cleared, user-space can handle it just
like a non-shared IRQ, and finally re-enable it if the kernel mode
twiddling was a disable rather than a status clear.
This is probably pretty generic, and at least will open the code up to
handling a lot more devices, even if not everything.
Also, the kernel detects a release on the file-handle, and can execute a
different configurable register access sequence to permanently disable
interrupts and/or reset the device, for safety once the user-space app
quits.
--
Stephen Warren, Software Engineer, NVIDIA, Fort Collins, CO
[email protected] http://www.nvidia.com/
[email protected] http://www.wwwdotorg.org/pgp.html