diff -ruN linux-2.5.41/fs/aio.c linux-2.5.41AIO/fs/aio.c
--- linux-2.5.41/fs/aio.c Mon Oct 7 11:24:13 2002
+++ linux-2.5.41AIO/fs/aio.c Mon Oct 14 12:27:05 2002
@@ -59,6 +59,8 @@
static spinlock_t fput_lock = SPIN_LOCK_UNLOCKED;
LIST_HEAD(fput_head);
+int async_poll(struct kiocb *iocb, int events);
+
/* aio_setup
* Creates the slab caches used by the aio routines, panic on
* failure as this is done early during the boot sequence.
@@ -893,6 +895,19 @@
return -EINVAL;
}
+ssize_t generic_aio_poll(struct file *file, struct kiocb *req, struct iocb *iocb)
+{
+ unsigned events = iocb->aio_buf;
+
+ /* Did the user set any bits they weren't supposed to? (The
+ * above is actually a cast.)
+ */
+ if (unlikely(events != iocb->aio_buf))
+ return -EINVAL;
+
+ return async_poll(req, events);
+}
+
static int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
struct iocb *iocb));
static int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
@@ -978,12 +993,15 @@
if (file->f_op->aio_fsync)
ret = file->f_op->aio_fsync(req, 0);
break;
+ case IOCB_CMD_POLL:
+ ret = generic_aio_poll(file, req, iocb);
+ break;
default:
dprintk("EINVAL: io_submit: no operation provided\n");
ret = -EINVAL;
}
- if (likely(EIOCBQUEUED == ret))
+ if (likely(-EIOCBQUEUED == ret))
return 0;
if (ret >= 0) {
aio_complete(req, ret, 0);
diff -ruN linux-2.5.41/fs/select.c linux-2.5.41AIO/fs/select.c
--- linux-2.5.41/fs/select.c Mon Oct 7 11:23:21 2002
+++ linux-2.5.41AIO/fs/select.c Mon Oct 14 13:39:58 2002
@@ -20,6 +20,8 @@
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/aio.h>
+#include <linux/init.h>
#include <asm/uaccess.h>
@@ -27,19 +29,34 @@
#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
struct poll_table_entry {
- struct file * filp;
wait_queue_t wait;
wait_queue_head_t * wait_address;
+ struct file * filp;
+ poll_table *p;
};
struct poll_table_page {
+ unsigned long size;
struct poll_table_page * next;
struct poll_table_entry * entry;
struct poll_table_entry entries[0];
};
#define POLL_TABLE_FULL(table) \
- ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
+ ((unsigned long)((table)->entry+1) > \
+ (table)->size + (unsigned long)(table))
+
+/* async poll uses only one entry per poll table as it is linked to an iocb */
+typedef struct async_poll_table_struct {
+ poll_table pt;
+ int events; /* event mask for async poll */
+ int wake;
+ long sync;
+ struct poll_table_page pt_page; /* one poll table page hdr */
+ struct poll_table_entry entries[1]; /* space for a single entry */
+} async_poll_table;
+
+static kmem_cache_t *async_poll_table_cache;
/*
* Ok, Peter made a complicated, but straightforward multiple_wait() function.
@@ -53,8 +70,7 @@
* as all select/poll functions have to call it to add an entry to the
* poll table.
*/
-
-void poll_freewait(poll_table* pt)
+void __poll_freewait(poll_table* pt, wait_queue_t *wait)
{
struct poll_table_page * p = pt->table;
while (p) {
@@ -62,15 +78,141 @@
struct poll_table_page *old;
entry = p->entry;
+ if (entry == p->entries) /* may happen with async poll */
+ break;
do {
entry--;
- remove_wait_queue(entry->wait_address,&entry->wait);
+ if (wait != &entry->wait)
+ remove_wait_queue(entry->wait_address,&entry->wait);
+ else
+ __remove_wait_queue(entry->wait_address,&entry->wait);
fput(entry->filp);
} while (entry > p->entries);
old = p;
p = p->next;
- free_page((unsigned long) old);
+ if (old->size == PAGE_SIZE)
+ free_page((unsigned long) old);
}
+ if (pt->iocb)
+ kmem_cache_free(async_poll_table_cache, pt);
+}
+
+void poll_freewait(poll_table* pt)
+{
+ __poll_freewait(pt, NULL);
+}
+
+void async_poll_complete(void *data)
+{
+ async_poll_table *pasync = data;
+ poll_table *p = data;
+ struct kiocb *iocb = p->iocb;
+ unsigned int mask;
+
+ pasync->wake = 0;
+ wmb();
+ do {
+ mask = iocb->ki_filp->f_op->poll(iocb->ki_filp, p);
+ mask &= pasync->events | POLLERR | POLLHUP;
+ if (mask) {
+ poll_table *p2 = xchg(&iocb->ki_data, NULL);
+ if (p2) {
+ poll_freewait(p2);
+ aio_complete(iocb, mask, 0);
+ }
+ return;
+ }
+ pasync->sync = 0;
+ wmb();
+ } while (pasync->wake);
+}
+
+static int async_poll_waiter(wait_queue_t *wait, unsigned mode, int sync)
+{
+ struct poll_table_entry *entry = (struct poll_table_entry *)wait;
+ async_poll_table *pasync = (async_poll_table *)(entry->p);
+ struct kiocb *iocb;
+ unsigned int mask;
+
+ iocb = pasync->pt.iocb;
+ mask = iocb->ki_filp->f_op->poll(iocb->ki_filp, NULL);
+ mask &= pasync->events | POLLERR | POLLHUP;
+ if (mask) {
+ poll_table *p2 = xchg(&iocb->ki_data, NULL);
+ if (p2) {
+ __poll_freewait(p2, wait);
+ aio_complete(iocb, mask, 0);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int async_poll_cancel(struct kiocb *iocb, struct io_event *res)
+{
+ poll_table *p;
+
+ /* FIXME: almost right */
+ p = xchg(&iocb->ki_data, NULL);
+ if (p) {
+ poll_freewait(p);
+ aio_complete(iocb, 0, 0);
+ aio_put_req(iocb);
+ return 0;
+ }
+ aio_put_req(iocb);
+ return -EAGAIN;
+}
+
+int async_poll(struct kiocb *iocb, int events)
+{
+ unsigned int mask;
+ async_poll_table *pasync;
+ poll_table *p;
+
+ /* Fast path */
+ if (iocb->ki_filp->f_op && iocb->ki_filp->f_op->poll) {
+ mask = iocb->ki_filp->f_op->poll(iocb->ki_filp, NULL);
+ mask &= events | POLLERR | POLLHUP;
+ if (mask & events)
+ return events;
+ }
+
+ pasync = kmem_cache_alloc(async_poll_table_cache, SLAB_KERNEL);
+ if (!pasync)
+ return -ENOMEM;
+
+ p = (poll_table *)pasync;
+ poll_initwait(p);
+ p->iocb = iocb;
+ pasync->wake = 0;
+ pasync->sync = 0;
+ pasync->events = events;
+ pasync->pt_page.entry = pasync->pt_page.entries;
+ pasync->pt_page.size = sizeof(pasync->pt_page) + sizeof(pasync->entries);
+ pasync->pt_page.next = 0;
+ p->table = &pasync->pt_page;
+
+ iocb->ki_data = p;
+ wmb();
+ iocb->ki_cancel = async_poll_cancel;
+
+ mask = DEFAULT_POLLMASK;
+#warning broken
+ iocb->ki_users ++;
+ if (iocb->ki_filp->f_op && iocb->ki_filp->f_op->poll)
+ mask = iocb->ki_filp->f_op->poll(iocb->ki_filp, p);
+ mask &= events | POLLERR | POLLHUP;
+ if (mask && !test_and_set_bit(0, &pasync->sync))
+ aio_complete(iocb, mask, 0);
+
+ if (aio_put_req(iocb))
+ /* Must be freed after aio_complete to synchronise with
+ * cancellation of the request.
+ */
+ poll_freewait(p);
+
+ return -EIOCBQUEUED;
}
void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
@@ -86,6 +228,7 @@
__set_current_state(TASK_RUNNING);
return;
}
+ new_table->size = PAGE_SIZE;
new_table->entry = new_table->entries;
new_table->next = table;
p->table = new_table;
@@ -99,7 +242,11 @@
get_file(filp);
entry->filp = filp;
entry->wait_address = wait_address;
- init_waitqueue_entry(&entry->wait, current);
+ entry->p = p;
+ if (p->iocb) /* async poll */
+ init_waitqueue_func_entry(&entry->wait, async_poll_waiter);
+ else
+ init_waitqueue_entry(&entry->wait, current);
add_wait_queue(wait_address,&entry->wait);
}
}
@@ -495,3 +642,14 @@
poll_freewait(&table);
return err;
}
+
+static int __init async_poll_init(void)
+{
+ async_poll_table_cache = kmem_cache_create("async poll table",
+ sizeof(async_poll_table), 0, 0, NULL, NULL);
+ if (!async_poll_table_cache)
+ panic("unable to alloc poll_table_cache");
+ return 0;
+}
+
+module_init(async_poll_init);
diff -ruN linux-2.5.41/include/linux/aio_abi.h linux-2.5.41AIO/include/linux/aio_abi.h
--- linux-2.5.41/include/linux/aio_abi.h Mon Oct 7 11:23:28 2002
+++ linux-2.5.41AIO/include/linux/aio_abi.h Mon Oct 7 16:33:36 2002
@@ -40,6 +40,7 @@
* IOCB_CMD_PREADX = 4,
* IOCB_CMD_POLL = 5,
*/
+ IOCB_CMD_POLL = 5,
IOCB_CMD_NOOP = 6,
};
diff -ruN linux-2.5.41/include/linux/poll.h linux-2.5.41AIO/include/linux/poll.h
--- linux-2.5.41/include/linux/poll.h Mon Oct 7 11:24:12 2002
+++ linux-2.5.41AIO/include/linux/poll.h Tue Oct 8 12:06:44 2002
@@ -9,12 +9,14 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <asm/uaccess.h>
+#include <linux/workqueue.h>
struct poll_table_page;
typedef struct poll_table_struct {
- int error;
- struct poll_table_page * table;
+ int error;
+ struct poll_table_page *table;
+ struct kiocb *iocb; /* iocb for async poll */
} poll_table;
extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p);
@@ -29,6 +31,7 @@
{
pt->error = 0;
pt->table = NULL;
+ pt->iocb = NULL;
}
extern void poll_freewait(poll_table* pt);
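For reference, user space would drive IOCB_CMD_POLL through the normal AIO
submission path. Below is a minimal sketch using libaio-style wrappers;
io_prep_poll() and the use of ev.res for the returned event mask are
assumptions here, so treat it as illustrative rather than a tested client.
#include <libaio.h>
#include <poll.h>
#include <stdio.h>

int wait_for_pollin(int fd)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	int ret = -1;

	if (io_setup(1, &ctx) < 0)
		return -1;

	io_prep_poll(&cb, fd, POLLIN);		/* event mask travels in aio_buf */
	if (io_submit(ctx, 1, cbs) != 1)
		goto out;

	/* the completion comes back through the ordinary aio event ring */
	if (io_getevents(ctx, 1, 1, &ev, NULL) == 1) {
		printf("fd %d ready, revents=0x%lx\n", fd, (unsigned long)ev.res);
		ret = 0;
	}
out:
	io_destroy(ctx);
	return ret;
}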
Shailabh Nagar wrote:
> The patch has been tested on 2.5.41 using simple poll tests.
Your patch has numerous races, a kiocb leak, and incorrectly calls
aio_complete() upon cancellation. Please see the patch I sent to
[email protected] on September 29.
On Mon, Oct 14, 2002 at 06:36:45PM -0400, Shailabh Nagar wrote:
> As of today, there is no scalable alternative to poll/select in the 2.5
> kernel even though the topic has been discussed a number of times
> before. The case for a scalable poll has been made often so I won't
> get into that.
Have you bothered addressing the fact that async poll scales worse than
/dev/epoll? That was the original reason for dropping it.
-ben
Benjamin LaHaise wrote:
>
> On Tue, Oct 15, 2002 at 10:06:22AM -0700, Dan Kegel wrote:
> > Doesn't the F_SETSIG/F_SETOWN/SIGIO stuff qualify as a scalable
> > alternative?
>
> No.
What's the worst part about it? The use of the signal queue?
- Dan
Benjamin LaHaise wrote:
>
> On Mon, Oct 14, 2002 at 06:36:45PM -0400, Shailabh Nagar wrote:
> > As of today, there is no scalable alternative to poll/select in the 2.5
> > kernel even though the topic has been discussed a number of times
> > before. The case for a scalable poll has been made often so I won't
> > get into that.
>
> Have you bothered addressing the fact that async poll scales worse than
> /dev/epoll? That was the original reason for dropping it.
Doesn't the F_SETSIG/F_SETOWN/SIGIO stuff qualify as a scalable
alternative?
It's in 2.5 as far as I know. It does suffer from using the signal queue,
but it's in production use on servers that handle many thousands of
concurrent connections, so it's pretty scalable.
- Dan
On Tue, Oct 15, 2002 at 10:06:22AM -0700, Dan Kegel wrote:
> Doesn't the F_SETSIG/F_SETOWN/SIGIO stuff qualify as a scalable
> alternative?
No.
-ben
On Tue, Oct 15, 2002 at 01:38:53PM -0400, Shailabh Nagar wrote:
> So I guess the question would now be: whats keeping /dev/epoll from
> being included in the kernel given the time left before the feature freeze ?
We don't need yet another event reporting mechanism as /dev/epoll presents.
I was thinking it should just be its own syscall but report its events in
the same way as aio.
-ben
--
"Do you seek knowledge in time travel?"
Benjamin LaHaise wrote:
>On Mon, Oct 14, 2002 at 06:36:45PM -0400, Shailabh Nagar wrote:
>
>>As of today, there is no scalable alternative to poll/select in the 2.5
>>kernel even though the topic has been discussed a number of times
>>before. The case for a scalable poll has been made often so I won't
>>get into that.
>>
>
>Have you bothered addressing the fact that async poll scales worse than
>/dev/epoll? That was the original reason for dropping it.
>
> -ben
>
Hi Ben,
I didn't address async poll's scalability vs. /dev/epoll because
/dev/epoll isn't in the kernel either. I wasn't sure whether you had a
better async poll in the making.
Either solution (/dev/epoll or async poll) would be a considerable
improvement over what we have today.
So I guess the question would now be: what's keeping /dev/epoll from
being included in the kernel given the time left before the feature freeze ?
-- Shailabh
Davide Libenzi wrote:
> On Tue, 15 Oct 2002, Benjamin LaHaise wrote:
>
>
>>On Tue, Oct 15, 2002 at 01:38:53PM -0400, Shailabh Nagar wrote:
>>
>>>So I guess the question would now be: whats keeping /dev/epoll from
>>>being included in the kernel given the time left before the feature freeze ?
>>
>>We don't need yet another event reporting mechanism as /dev/epoll presents.
>>I was thinking it should just be its own syscall but report its events in
>>the same way as aio.
>
>
> Yes, Linus ( like myself ) hates magic inodes inside /dev. At that time it
> was the fastest way to have a kernel interface exposed w/out having to beg
> for a syscall. I'm all for a new syscall obviously, and IMHO /dev/epoll
> might be a nice complement to AIO for specific applications.
So what would the syscall look like ? Could you give a few more details on the interface ?
- Shailabh
Dan Kegel wrote:
>Benjamin LaHaise wrote:
>
>>On Mon, Oct 14, 2002 at 06:36:45PM -0400, Shailabh Nagar wrote:
>>
>>>As of today, there is no scalable alternative to poll/select in the 2.5
>>>kernel even though the topic has been discussed a number of times
>>>before. The case for a scalable poll has been made often so I won't
>>>get into that.
>>>
>>Have you bothered addressing the fact that async poll scales worse than
>>/dev/epoll? That was the original reason for dropping it.
>>
>
>Doesn't the F_SETSIG/F_SETOWN/SIGIO stuff qualify as a scalable
>alternative?
>It's in 2.5 as far as I know. It does suffer from using the signal
>queue,
>but it's in production use on servers that handle many thousands of
>concurrent connections, so it's pretty scalable.
>
>- Dan
>
Dan,
Are there any performance numbers for F_SETSIG/F_SETOWN/SIGIO on 2.5 ?
Does it scale with the number of active connections too ?
Signal-per-fd seems to be a decent alternative (from the measurements on
Davide's /dev/epoll page) but Vitaly Luban's patch for that isn't available
for 2.5 so I'm not sure what other issues it might have.
-- Shailabh
On Tue, 15 Oct 2002, Benjamin LaHaise wrote:
> On Tue, Oct 15, 2002 at 01:38:53PM -0400, Shailabh Nagar wrote:
> > So I guess the question would now be: whats keeping /dev/epoll from
> > being included in the kernel given the time left before the feature freeze ?
>
> We don't need yet another event reporting mechanism as /dev/epoll presents.
> I was thinking it should just be its own syscall but report its events in
> the same way as aio.
Yes, Linus ( like myself ) hates magic inodes inside /dev. At that time it
was the fastest way to have a kernel interface exposed w/out having to beg
for a syscall. I'm all for a new syscall obviously, and IMHO /dev/epoll
might be a nice complement to AIO for specific applications.
- Davide
Shailabh Nagar wrote:
> Are there any performance numbers for F_SETSIG/F_SETOWN/SIGIO on 2.5 ?
I don't recall any. (I suppose I should get off my butt and start
running 2.5.)
> Does it scale with the number of active connections too ?
The overhead is probably mostly linear with activity, I don't know.
> Signal-per-fd seems to be a decent alternative (from the measurements on
> Davide's /dev/epoll page) but Vitaly Luban's patch for that isn't available
> for 2.5 so I'm not sure what other issues it might have.
Seems like the thing to do is to move /dev/epoll over to use
Ben's event system rather than worry about the old /dev/epoll interface.
But like signal-per-fd, we will want to collapse readiness events,
which is something Ben's event system might not do naturally.
(I wouldn't know -- I haven't actually looked at Ben's code.)
- Dan
On Tue, 15 Oct 2002, Shailabh Nagar wrote:
> Davide Libenzi wrote:
> > On Tue, 15 Oct 2002, Benjamin LaHaise wrote:
> >
> >
> >>On Tue, Oct 15, 2002 at 01:38:53PM -0400, Shailabh Nagar wrote:
> >>
> >>>So I guess the question would now be: whats keeping /dev/epoll from
> >>>being included in the kernel given the time left before the feature freeze ?
> >>
> >>We don't need yet another event reporting mechanism as /dev/epoll presents.
> >>I was thinking it should just be its own syscall but report its events in
> >>the same way as aio.
> >
> >
> > Yes, Linus ( like myself ) hates magic inodes inside /dev. At that time it
> > was the fastest way to have a kernel interface exposed w/out having to beg
> > for a syscall. I'm all for a new syscall obviously, and IMHO /dev/epoll
> > might be a nice complement to AIO for specific applications.
>
>
> So what would the syscall look like ? Could you give a few more details on the interface ?
Something like this might work :
int sys_epoll_create(int maxfds);
void sys_epoll_close(int epd);
int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
where sys_epoll_wait() returns the number of events available, 0 for
timeout, -1 for error.
- Davide
On Tue, Oct 15, 2002 at 11:53:48AM -0700, Dan Kegel wrote:
> Seems like the thing to do is to move /dev/epoll over to use
> Ben's event system rather than worry about the old /dev/epoll interface.
> But like signal-per-fd, we will want to collapse readiness events,
> which is something Ben's event system might not do naturally.
> (I wouldn't know -- I haven't actually looked at Ben's code.)
If you look at how /dev/epoll does it, the collapsing of readiness
events is very elegant: a given fd is only allowed to report a change
in its state once per run through the event loop. The ioctl that swaps
event buffers acts as a barrier between the two possible reports.
As to how this would interact with the aio event loops, I thought the
"barrier" syscall could be the point at which aio event slots are reserved
and freed. Interest registration would be the other syscall (which would
naturally have to reserve an event for the descriptor in the current set
of readiness notifications). Anyways, just a few thoughts...
-ben
--
"Do you seek knowledge in time travel?"
On Tue, 15 Oct 2002, Shailabh Nagar wrote:
> Davide Libenzi wrote:
> > On Tue, 15 Oct 2002, Benjamin LaHaise wrote:
> >
> >
> >>On Tue, Oct 15, 2002 at 01:38:53PM -0400, Shailabh Nagar wrote:
> >>
> >>>So I guess the question would now be: whats keeping /dev/epoll from
> >>>being included in the kernel given the time left before the feature freeze ?
> >>
> >>We don't need yet another event reporting mechanism as /dev/epoll presents.
> >>I was thinking it should just be its own syscall but report its events in
> >>the same way as aio.
> >
> >
> > Yes, Linus ( like myself ) hates magic inodes inside /dev. At that time it
> > was the fastest way to have a kernel interface exposed w/out having to beg
> > for a syscall. I'm all for a new syscall obviously, and IMHO /dev/epoll
> > might be a nice complement to AIO for specific applications.
>
>
> So what would the syscall look like ? Could you give a few more details on the interface ?
Since I guess that w/out the ability to add fds to the monitored set
the API would be useless ...
int sys_epoll_addfd(int epd, int fd);
- Davide
On Tue, Oct 15, 2002 at 12:00:30PM -0700, Davide Libenzi wrote:
> Something like this might work :
>
> int sys_epoll_create(int maxfds);
> void sys_epoll_close(int epd);
> int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
>
> where sys_epoll_wait() return the number of events available, 0 for
> timeout, -1 for error.
There's no reason to make epoll_wait a new syscall -- poll events can
easily be returned via the aio_complete mechanism (with the existing
aio_poll experiment as a possible means for doing so).
-ben
On Tue, Oct 15, 2002 at 12:16:39PM -0700, Davide Libenzi wrote:
> Ben, one of the reasons of the /dev/epoll speed is how it returns events
> and how it collapses them. A memory mapped array is divided by two and
> while the user consumes events in one set, the kernel fill the other one.
> The next wait() will switch the pointers. There is no copy from kernel to
> user space. Doing :
>
> int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
>
> the only data the kernel has to copy to userspace is the 4(8) bytes for
> the "pevts" pointer.
Erm, the aio interface has support for the event ringbuffer being accessed
by userspace (it lives in user memory and the kernel acts as a writer, with
userspace as a reader), that's one of its advantages -- completion events
are directly accessible from userspace after being written to by an
interrupt. Ideally this is to be wrapped in a vsyscall, but we don't have
support for that yet on x86, although much of the code written for x86-64
should be reusable.
-ben
On Tue, 15 Oct 2002, Benjamin LaHaise wrote:
> On Tue, Oct 15, 2002 at 12:00:30PM -0700, Davide Libenzi wrote:
> > Something like this might work :
> >
> > int sys_epoll_create(int maxfds);
> > void sys_epoll_close(int epd);
> > int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
> >
> > where sys_epoll_wait() return the number of events available, 0 for
> > timeout, -1 for error.
>
> There's no reason to make epoll_wait a new syscall -- poll events can
> easily be returned via the aio_complete mechanism (with the existing
> aio_poll experiment as a possible means for doing so).
Ben, one of the reasons for /dev/epoll's speed is how it returns events
and how it collapses them. A memory mapped array is divided in two and
while the user consumes events in one set, the kernel fills the other one.
The next wait() will switch the pointers. There is no copy from kernel to
user space. Doing :
int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
the only data the kernel has to copy to userspace is the 4(8) bytes for
the "pevts" pointer.
- Davide
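A conceptual sketch of that double-buffer scheme follows. This is not the
real /dev/epoll code or ABI; it only illustrates the swap on each wait()
and the once-per-pass collapsing described above (no locking or bounds
checking, for brevity).
#include <poll.h>

#define EP_MAXFDS 1024

struct ep_item { int fd; unsigned pass_reported; };

struct ep_buf {
	struct pollfd half[2][EP_MAXFDS];
	int count[2];
	int fill;		/* half currently being filled by the producer */
	unsigned pass;		/* must start at 1; bumped on every swap */
};

/* producer side: queue one readiness report, collapsed to one per pass */
static void ep_report(struct ep_buf *b, struct ep_item *it, short revents)
{
	struct pollfd *slot;

	if (it->pass_reported == b->pass)
		return;				/* already reported this pass */
	it->pass_reported = b->pass;
	slot = &b->half[b->fill][b->count[b->fill]++];
	slot->fd = it->fd;
	slot->revents = revents;
}

/* consumer side: swap halves; the caller reads the half just filled */
static int ep_swap(struct ep_buf *b, struct pollfd **pevts)
{
	int done = b->fill;

	b->fill = 1 - done;
	b->count[b->fill] = 0;
	b->pass++;
	*pevts = b->half[done];
	return b->count[done];
}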
Benjamin LaHaise wrote:
> On Tue, Oct 15, 2002 at 12:00:30PM -0700, Davide Libenzi wrote:
>
>>Something like this might work :
>>
>>int sys_epoll_create(int maxfds);
>>void sys_epoll_close(int epd);
>>int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
>>
>>where sys_epoll_wait() return the number of events available, 0 for
>>timeout, -1 for error.
>
>
> There's no reason to make epoll_wait a new syscall -- poll events can
> easily be returned via the aio_complete mechanism (with the existing
> aio_poll experiment as a possible means for doing so).
So a user would set up an ioctx and use io_getevents to retrieve events on
an interest set of fds created and manipulated through the new system calls ?
-- Shailabh
On Tue, 15 Oct 2002, Benjamin LaHaise wrote:
> On Tue, Oct 15, 2002 at 12:16:39PM -0700, Davide Libenzi wrote:
> > Ben, one of the reasons of the /dev/epoll speed is how it returns events
> > and how it collapses them. A memory mapped array is divided by two and
> > while the user consumes events in one set, the kernel fill the other one.
> > The next wait() will switch the pointers. There is no copy from kernel to
> > user space. Doing :
> >
> > int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
> >
> > the only data the kernel has to copy to userspace is the 4(8) bytes for
> > the "pevts" pointer.
>
> Erm, the aio interface has support for the event ringbuffer being accessed
> by userspace (it lives in user memory and the kernel acts as a writer, with
> userspace as a reader), that's one of its advantages -- completion events
> are directly accessible from userspace after being written to by an
> interrupt. Ideally this is to be wrapped in a vsyscall, but we don't have
> support for that yet on x86, although much of the code written for x86-64
> should be reusable.
In general I would like to have a "common" interface to retrieve IO
events, but IMHO the two solutions should be benchmarked before adopting
the one or the other.
- Davide
Davide Libenzi wrote:
>
> On Tue, 15 Oct 2002, Benjamin LaHaise wrote:
>
> > On Tue, Oct 15, 2002 at 12:16:39PM -0700, Davide Libenzi wrote:
> > > Ben, one of the reasons of the /dev/epoll speed is how it returns events
> > > and how it collapses them. A memory mapped array is divided by two and
> > > while the user consumes events in one set, the kernel fill the other one.
> > > The next wait() will switch the pointers. There is no copy from kernel to
> > > user space. Doing :
> > >
> > > int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
> > >
> > > the only data the kernel has to copy to userspace is the 4(8) bytes for
> > > the "pevts" pointer.
> >
> > Erm, the aio interface has support for the event ringbuffer being accessed
> > by userspace (it lives in user memory and the kernel acts as a writer, with
> > userspace as a reader), that's one of its advantages -- completion events
> > are directly accessible from userspace after being written to by an
> > interrupt. Ideally this is to be wrapped in a vsyscall, but we don't have
> > support for that yet on x86, although much of the code written for x86-64
> > should be reusable.
>
> In general I would like to have a "common" interface to retrieve IO
> events, but IMHO the two solutions should be benchmarked before adopting
> the one or the other.
Seems like /dev/epoll uses a double-buffering scheme rather than
a ring buffer, and this is not just a trivial difference; it's
related to how redundant events are collapsed, right?
- Dan
On Tue, 15 Oct 2002, Dan Kegel wrote:
> Davide Libenzi wrote:
> >
> > On Tue, 15 Oct 2002, Benjamin LaHaise wrote:
> >
> > > On Tue, Oct 15, 2002 at 12:16:39PM -0700, Davide Libenzi wrote:
> > > > Ben, one of the reasons of the /dev/epoll speed is how it returns events
> > > > and how it collapses them. A memory mapped array is divided by two and
> > > > while the user consumes events in one set, the kernel fill the other one.
> > > > The next wait() will switch the pointers. There is no copy from kernel to
> > > > user space. Doing :
> > > >
> > > > int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
> > > >
> > > > the only data the kernel has to copy to userspace is the 4(8) bytes for
> > > > the "pevts" pointer.
> > >
> > > Erm, the aio interface has support for the event ringbuffer being accessed
> > > by userspace (it lives in user memory and the kernel acts as a writer, with
> > > userspace as a reader), that's one of its advantages -- completion events
> > > are directly accessible from userspace after being written to by an
> > > interrupt. Ideally this is to be wrapped in a vsyscall, but we don't have
> > > support for that yet on x86, although much of the code written for x86-64
> > > should be reusable.
> >
> > In general I would like to have a "common" interface to retrieve IO
> > events, but IMHO the two solutions should be benchmarked before adopting
> > the one or the other.
>
> Seems like /dev/epoll uses a double-buffering scheme rather than
> a ring buffer, and this is not just a trivial difference; it's
> related to how redundant events are collapsed, right?
It's just a matter of implementation. With a double buffer you clearly have
two distinct working zones, one is the user zone and the other is
the kernel zone. With a ring buffer you have to mark the area that
is currently returned to the user as the event set and keep the kernel from
overflowing into that area. The double buffer is probably faster and easier to
implement ( for event collapsing ).
- Davide
Benjamin LaHaise wrote:
>If you look at how /dev/epoll does it, the collapsing of readiness
>events is very elegant: a given fd is only allowed to report a change
>in its state once per run through the event loop.
>
And the way /dev/epoll does it has a key flaw: it only works with single
threaded callers. If you have multiple threads simultaneously trying to
get events, then race conditions abound.
>The ioctl that swaps
>event buffers acts as a barrier between the two possible reports.
>
Which assumes there are only single threaded callers. To work correctly
with multithreaded callers, there needs to be a more explicit mechanism
for a caller to indicate it has completed handling an event and wants to
rearm its interest.
There are also additional interactions with cancellation. How does the
cancellation interface report and handle the case where an associated
event is being delivered or handled by another thread? What happens
when that thread then tries to rearm the canceled interest?
I certainly hope /dev/epoll itself doesn't get accepted into the kernel,
the interface is error prone. Registering interest in a condition when
the condition is already true should immediately generate an event, the
epoll interface did not do that last time I saw it discussed. This
deficiency in the interface requires callers to include more complex
workaround code and is likely to result in subtle, hard to diagnose bugs.
On Tue, Oct 15, 2002 at 01:36:38PM -0700, John Gardiner Myers wrote:
> Benjamin LaHaise wrote:
>
> >Erm, the aio interface has support for the event ringbuffer being accessed
> >by userspace
> >
> Making the event ringbuffer visible to userspace conflicts with being
> able to support event priorities. To support event priorities, the
> ringbuffer would need to be replaced with some other data structure.
No it does not. Event priorities are easily accomplished via separate
event queues for events of different priorities. Most hardware implements
event priorities in this fashion.
-ben
--
"Do you seek knowledge in time travel?"
Benjamin LaHaise wrote:
>Erm, the aio interface has support for the event ringbuffer being accessed
>by userspace
>
Making the event ringbuffer visible to userspace conflicts with being
able to support event priorities. To support event priorities, the
ringbuffer would need to be replaced with some other data structure.
John Gardiner Myers wrote:
>
> Benjamin LaHaise wrote:
>
> >If you look at how /dev/epoll does it, the collapsing of readiness
> >events is very elegant: a given fd is only allowed to report a change
> >in its state once per run through the event loop.
> >
> And the way /dev/epoll does it has a key flaw: it only works with single
> threaded callers. If you have multiple threads simultaneously trying to
> get events, then race conditions abound.
Delaying the "get next batch of readiness events" call as long as possible
increases the amount of event collapsing possible, which is important because
the network stack seems to generate lots of spurious events. Thus I suspect
you don't want multiple threads all calling the "get next batch of events"
entry point frequently.
The most effective way to use something like /dev/epoll in a multithreaded
program might be to have one thread call "get next batch of events",
then divvy up the events across multiple threads.
Thus I disagree that the way /dev/epoll does it is flawed.
> I certainly hope /dev/epoll itself doesn't get accepted into the kernel,
> the interface is error prone. Registering interest in a condition when
> the condition is already true should immediately generate an event, the
> epoll interface did not do that last time I saw it discussed. This
> deficiency in the interface requires callers to include more complex
> workaround code and is likely to result in subtle, hard to diagnose bugs.
With queued readiness notification schemes like SIGIO and /dev/epoll,
it's safest to allow readiness notifications from the kernel to be wrong
sometimes; this happens at least in the case of accept readiness, and
possibly other places. Once you allow that, it's easy to handle the
condition you're worried about by generating a spurious readiness
indication when registering a fd. That's what I do in my wrapper library.
Also, because /dev/epoll and friends are single-shot notifications of
*changes* in readiness, there is little reason to register interest in
this or that event, and change that interest over time; instead,
apps should simply register interest in any event they might ever
be interested in. The number of extra events they then have to ignore is
very small, since if you take no action on a 'read ready' event, no more
of those events will occur.
So I pretty much disagree all around :-) but I do understand where you're
coming from. I used to feel similarly until I figured out the 'right' way
to use one-shot readiness notification systems (sometime last week :-)
- Dan
On Tue, 15 Oct 2002, John Gardiner Myers wrote:
> Benjamin LaHaise wrote:
>
> >If you look at how /dev/epoll does it, the collapsing of readiness
> >events is very elegant: a given fd is only allowed to report a change
> >in its state once per run through the event loop.
> >
> And the way /dev/epoll does it has a key flaw: it only works with single
> threaded callers. If you have multiple threads simultaneously trying to
> get events, then race conditions abound.
>
> >The ioctl that swaps
> >event buffers acts as a barrier between the two possible reports.
> >
> Which assumes there are only single threaded callers. To work correctly
> with multithreaded callers, there needs to be a more explicit mechanism
> for a caller to indicate it has completed handling an event and wants to
> rearm its interest.
>
> There are also additional interactions with cancellation. How does the
> cancellation interface report and handle the case where an associated
> event is being delivered or handled by another thread? What happens
> when that thread then tries to rearm the canceled interest?
>
Why would you need to use threads with a multiplex-like interface like
/dev/epoll ? The reason of these ( poll()/select()//dev/epoll//dev/poll )
interfaces is to be able to handle more file descriptors inside a _single_
task.
> I certainly hope /dev/epoll itself doesn't get accepted into the kernel,
> the interface is error prone. Registering interest in a condition when
> the condition is already true should immediately generate an event, the
> epoll interface did not do that last time I saw it discussed. This
> deficiency in the interface requires callers to include more complex
> workaround code and is likely to result in subtle, hard to diagnose bugs.
It works exactly like rt-signals and all you have to do is to change your
code from :
int myread(...) {
	if (wait(POLLIN))
		read();
}
to :
int myread(...) {
	while (read() == EAGAIN)
		wait(POLLIN);
}
- Davide
Dan Kegel wrote:
>The most effective way to use something like /dev/epoll in a
>multithreaded
>program might be to have one thread call "get next batch of events",
>then divvy up the events across multiple threads.
>
That is a tautology. As /dev/epoll is inherently a single threaded
interface, of course the most effective way for a multithreaded program
to use it is to have one thread call it then divvy up the events.
That's the *only* way a multithreaded program can deal with a single
threaded interface.
The cost to divvy up the events can be substantial, not to mention the
cost of funneling them all through a single CPU. This cost can easily
be greater than what one saves by combining events. io_getevents() is a
great model for divvying events across multiple threads; the poll
facility should work with that model.
The solution for optimizing the amount of event collapsing is to
implement concurrency control.
>Once you allow that, it's easy to handle the
>condition you're worried about by generating a spurious readiness
>indication when registering a fd. That's what I do in my wrapper
>library.
>
This is a workaround. You are adding additional code and complexity to
the caller in order to deal with a deficiency in the interface. This
has several problems:
Someone writing to the /dev/epoll interface needs to know that they need
to write such workaround code. If they don't know to write the code or
if they make an error in the workaround code, it is still likely that
the program will pass testing. What will result are intermittent, hard
to diagnose failures in production.
User space does not have the information to address this as well as the
kernel can. As a result, the workaround requires more processing in the
common, non racy case.
The kernel can clearly handle this situation better than user space. It
can do the necessary checks while it is still holding the necessary
locks. It is less error prone to handle the situation in the kernel.
The logic clearly belongs in the kernel.
>Also, because /dev/epoll and friends are single-shot notifications of
>*changes* in readiness, there is little reason to register interest in
>this or that event, and change that interest over time; instead,
>apps should simply register interest in any event they might ever
>be interested in.
>
You're making assumptions about the structure and flow of an
application. Sometimes when an application stops having interest in
this or that event, it needs to free/invalidate the context associated
with that event.
So that's fine as a strategy for amortizing the cost of
registration/deregistration, but it isn't universally applicable.
Davide Libenzi wrote:
>Why would you need to use threads with a multiplex-like interface like
>/dev/epoll ?
>
Because in some applications processing an event can cause the thread to
block, potentially for a long time. Multiple threads are needed to
isolate that block to the context associated with the event.
> while (read() == EAGAIN)
> wait(POLLIN);
>
>
Assuming registration of interest is inside wait(), this has a race. If
the file becomes readable between the time that read() returns and the
time that wait() can register interest, the connection will hang.
On Tue, 15 Oct 2002, John Gardiner Myers wrote:
> Davide Libenzi wrote:
>
> >Why would you need to use threads with a multiplex-like interface like
> >/dev/epoll ?
> >
> Because in some applications processing an event can cause the thread to
> block, potentially for a long time. Multiple threads are needed to
> isolate that block to the context associated with the event.
I don't want this to become the latest pro/against threads but if your
processing thread blocks for a long time you should consider handling the
blocking condition asynchronously. If your processing thread blocks, your
application model should very likely be redesigned, or you just go with
threads ( and you do not need any multiplex interface ).
> > while (read() == EAGAIN)
> > wait(POLLIN);
> >
> >
> Assuming registration of interest is inside wait(), this has a race. If
> the file becomes readable between the time that read() returns and the
> time that wait() can register interest, the connection will hang.
Your assumption is wrong, the registration is done as soon as the fd is
"born" ( socket() or accept() for example ) and is typically removed when
it dies.
- Davide
On Tue, 15 Oct 2002, John Myers wrote:
> Dan Kegel wrote:
>
> >The most effective way to use something like /dev/epoll in a
> >multithreaded
> >program might be to have one thread call "get next batch of events",
> >then divvy up the events across multiple threads.
> >
> That is a tautology. As /dev/epoll is inherently a single threaded
> interface, of course the most effective way for a multithreaded program
> to use it is to have one thread call it then divvy up the events.
> That's the *only* way a multithreaded program can deal with a single
> threaded interface.
>
> The cost to divvy up the events can be substantial, not the mention the
> cost of funneling them all through a single CPU. This cost can easily
> be greater than what one saves by combining events. io_getevents() is a
> great model for divvying events across multiple threads, the poll
> facility should work with that model.
>
> The solution for optimizing the amount of event collapsing is to
> implement concurrency control.
There're many ways to have /dev/epoll working in a threaded environment if
you think about it, and no you don't need to have a single thread fetching
events. You can have, if you really like threads, N fetching threads (
working on N private /dev/epoll fds ), feeding M queues, polled by P
service threads. If you like it ...
- Davide
Davide Libenzi wrote:
>I don't want this to become the latest pro/against threads but if your
>processing thread block for a long time you should consider handling the
>blocking condition asynchronously. If your procesing thread blocks, your
>application model should very likely be redesigned, or you just go with
>threads ( and you do not need any multiplex interface ).
>
Rewriting the code to handle the blocking condition asynchronously can
be inordinately expensive and time consuming. This is particularly true
when using third party code (such as the system DNS resolver) which only
has blocking interfaces.
A much more cost effective and timely methodology is to only
asynchronously code the most important conditions, leaving threads to
handle the rest.
>Your assumption is wrong, the registration is done as soon as the fd
>"born" ( socket() or accept() for example ) and is typically removed when
>it dies.
>
Nonetheless, the requirement for user space to test the condition after
the registration, not before, is subtle. A program which does these in
the wrong order is still likely to pass QA and will fail in production
in a way that will be difficult to diagnose. There is no rational
reason for the kernel to not test the condition upon registration.
On Tue, Oct 15, 2002 at 03:36:09PM -0700, John Gardiner Myers wrote:
> Nonetheless, the requirement for user space to test the condition after
> the registration, not before, is subtle. A program which does these in
> the wrong order is still likely to pass QA and will fail in production
> in a way that will be difficult to diagnose. There is no rational
> reason for the kernel to not test the condition upon registration.
I suppose one way of getting the async poll code up to snuff would be to
cache the poll registration in the file descriptor. Alternatively, the
iocb could simply persist until it is cancelled or a refire is permitted
(so that the event queue does not get overrun). Would you care to try
crunching the numbers with polltest on 2.5?
-ben
--
"Do you seek knowledge in time travel?"
Benjamin LaHaise wrote:
>I suppose one way of getting the async poll code up to snuff would be to
>cache the poll registration in the file descriptor. Alternatively, the
>iocb could simply persist until it is cancelled or a refire is permitted
>(so that the event queue does not get overrun).
>
First, can you confirm that your only problem with the async poll code
is the fact that it doesn't amortize the registration/deregistration
cost across multiple events?
I would say that the fact that the async poll code doesn't do this
amortization is not sufficient reason to hold up the patch. A
non-amortizing async poll is sufficiently useful to warrant inclusion.
It is conceivable that some applications will not want to amortize some
of their poll requests. The non-amortizing interface can later be
extended to an amortizing one by defining an "amortize me" bit to the
events request mask.
I don't think caching the registration in the file descriptor is a good
idea--there can be multiple registrations against a given fd. The
registration should be cached close to the iocb--either in the iocb or
in some structure that directly references the iocb.
The model for multiple-events-per-iocb I was thinking of is as follows:
Add a concept of a "partial completion". Unlike a normal completion,
when a partial completion event fires the iocb is not freed. Instead
the iocb goes into a "fired" state.
When an iocb is in the fired state, it will not generate any more
completion events until it is "rearmed" by user space. The method by
which user space rearms an iocb is to be determined. Upon rearming, the
iocb is once again able to generate either a partial or normal
completion. As with submission, rearming can generate this completion
immediately if the situation warrants.
Canceling an iocb in the rearmed state is the same as canceling an iocb
that has never generated a completion event. Canceling an iocb in the
fired state returns a normal completion through the cancellation
interface and returns a distinct error code (not 0 or -EAGAIN) to inform
the caller that there is an outstanding event to be synchronized with.
Attempting to rearm a canceled iocb returns an error, probably -EAGAIN
to be consistent with cancellation.
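A rough sketch of the state machine this describes is below; every name in
it is invented here purely to make the transitions concrete, and nothing
like it exists in the kernel.
#include <errno.h>

enum poll_iocb_state { IOCB_ARMED, IOCB_FIRED, IOCB_CANCELLED };

struct poll_iocb {
	enum poll_iocb_state state;
	int events;				/* interest mask */
};

/* event source: deliver a partial completion, then go quiet until rearmed */
static int poll_iocb_fire(struct poll_iocb *p, int revents)
{
	if (p->state != IOCB_ARMED)
		return 0;			/* fired or cancelled: no new event */
	p->state = IOCB_FIRED;
	return revents & p->events;		/* queued as a partial completion */
}

/* user-driven rearm; may complete immediately if the condition still holds */
static int poll_iocb_rearm(struct poll_iocb *p, int current_revents)
{
	if (p->state == IOCB_CANCELLED)
		return -EAGAIN;			/* consistent with cancellation */
	p->state = IOCB_ARMED;
	return poll_iocb_fire(p, current_revents);
}

/* cancel: a distinct return value flags an event still being handled */
static int poll_iocb_cancel(struct poll_iocb *p)
{
	int outstanding = (p->state == IOCB_FIRED);

	p->state = IOCB_CANCELLED;
	return outstanding ? -EBUSY : 0;	/* -EBUSY is only a stand-in code */
}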
Davide Libenzi wrote:
>There're many ways to have /dev/epoll working in a threaded environment if
>you think about it, and no you don't need to have a single thread fetching
>events. You can have, if you really like threads, N fetching threads (
>working on N private /dev/epoll fds ), feeding M queues
>
In such models, you still have to pay the cost of divvying up the events
after you receive them. You also have to worry about keeping load
distributed evenly enough.
On Tue, 15 Oct 2002, John Gardiner Myers wrote:
> Davide Libenzi wrote:
>
> >There're many ways to have /dev/epoll working in a threaded environment if
> >you think about it, and no you don't need to have a single thread fetching
> >events. You can have, if you really like threads, N fetching threads (
> >working on N private /dev/epoll fds ), feeding M queues
> >
> In such models, you still have to pay the cost of divvying up the events
> after you receive them. You also have to worry about keeping load
> distributed evenly enough.
That's exactly the reason why you don't want to use many threads. Typical
applications that use multiplex interfaces use only one task ( possibly
one for each CPU ) that handles many connections. And again, you
can also use threads, if you design your application correctly. It is not
that expensive having service threads popping from an array. Yes, you have
a lock to be acquired but this is a price you have to pay as soon as you
choose threads.
- Davide
On Tue, 15 Oct 2002, John Gardiner Myers wrote:
> Nonetheless, the requirement for user space to test the condition after
> the registration, not before, is subtle. A program which does these in
> the wrong order is still likely to pass QA and will fail in production
> in a way that will be difficult to diagnose. There is no rational
> reason for the kernel to not test the condition upon registration.
All APIs have their own specifications and if you do not follow them, or
you're using a different interface just because the name looks similar
to other APIs, you're going to have problems. The problem is not inside
the API but inside the user ...
- Davide
Davide Libenzi wrote:
>All APIs have their own specifications and if you do not follow them, or
>you're using a different interfacing just because the name looks similar
>to other APIs, you're going to have problems. The problem it's not inside
>the API but inside the user ...
>
>
The epoll API is deficient--it is subtly error prone and it forces work
on user space that is better done in the kernel. That the API is
specified in a deficient way does not make it any less deficient.
Again, there is no rational justification for the kernel to not test the
condition upon registration. There is ample justification for it to do so.
On Tue, 15 Oct 2002, John Gardiner Myers wrote:
> The epoll API is deficient--it is subtly error prone and it forces work
> on user space that is better done in the kernel. That the API is
> specified in a deficient way does not make it any less deficient.
Just a simple question : Have you ever used the RT-Signal API ? Is it the API
"deficient" or is it you who does not understand it ? Do you know the
difference between level triggered ( poll() - select() - /dev/poll ) and
edge triggered ( /dev/epoll - RT-Signal ) interfaces ?
- Davide
Davide Libenzi wrote:
>Just a simple question : Have you ever used RT-Signal API ? Is it the API
>"deficent" [...] ?
>
No. Yes. The (fixed) size of the signal queue is far too small. One
either gets catastrophic failure on overload or one has to pay to do
redundant accounting of interest.
>Do you know the
>difference between level triggered ( poll() - select() - /dev/poll ) and
>edge triggered ( /dev/epoll - RT-Signal ) interfaces ?
>
>
Yes. The registration of interest can itself be considered an edge
condition.
At 10:18 AM 15/10/2002 -0700, Dan Kegel wrote:
>Benjamin LaHaise wrote:
> >
> > On Tue, Oct 15, 2002 at 10:06:22AM -0700, Dan Kegel wrote:
> > > Doesn't the F_SETSIG/F_SETOWN/SIGIO stuff qualify as a scalable
> > > alternative?
> >
> > No.
>
>What's the worst part about it? The use of the signal queue?
there are four things that really suck about sigio. in order of
most-significant-suckage to least-significant-suckage, i see them as:
[1] signals are very heavy. thousands of signals/second does not scale on
SMP due to the serialization of them in the kernel (just look at the code
path for delivering a signal). signals also resulted in 128 u32's being
transferred from kernel to userspace for every signal. that's a lot of
memory i/o bandwidth consumed at 1000's of concurrent sockets and
tens-of-thousands of events/sec happening.
[2] SIGIO only exposes half of the POLL_OUT semantics. with poll(), you can
use POLL_OUT to indicate if there is free buffer space to write into or not.
with SIGIO, for most applications, you can only find out by issuing a
write() and getting back -EWOULDBLOCK to indicate !POLL_OUT. (perhaps that
has been addressed in the last 12 months or so; but i doubt it)
[3] SIGIO had no easy recovery path if you hit the maximum-queue-limit for
the number of signals queued to userspace. (ok, you *could* do a poll() and
start again, but it couldn't be done in a 100% race-free manner)
[4] you couldn't enable SIGIO on an incoming socket accept()ed without
there being a race window where something happened to that socket between
accept() and enable-SIGIO. [sort-of related to (3); you could work around
it by doing a poll() on the socket after enable-SIGIO, but it makes a clean
interface a horrible interface]
other miscellaneous things that make SIGIO less usable in the real world:
- you can only get one event at a time -- that means tens-of-thousands to
hundreds-of-thousands of system calls / second just to get event status,
when it'd probably make more sense to poll for multiple signals at the
same time
- SIGIO only addressed "event notification". it did nothing to address the
other large scalability problem that you typically hit when writing
high-performance i/o systems: the overhead of memory copies between
userspace and kernelspace. various zerocopy mechanisms help address that
side of things, but if you're comparing aio to SIGIO, aio *is* addressing
a much larger problem than just SIGIO on its own
cheers,
lincoln.
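For reference, the setup being criticized looks roughly like the sketch
below; the trailing poll() is the workaround items [3] and [4] force on the
caller, since readiness that arrives before the fcntl()s produces no signal
and a bare SIGIO indicates the RT signal queue overflowed.
#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <poll.h>
#include <unistd.h>

/* arm one fd for queued RT-signal notification, then poll once by hand
 * to close the accept()-to-arm race described in [4] above */
static int arm_sigio(int fd, int rtsig)
{
	struct pollfd p;

	p.fd = fd;
	p.events = POLLIN | POLLOUT;
	p.revents = 0;

	fcntl(fd, F_SETOWN, getpid());
	fcntl(fd, F_SETSIG, rtsig);
	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC | O_NONBLOCK);

	return poll(&p, 1, 0) > 0 ? p.revents : 0;
}

/* dequeue one event at a time; a plain SIGIO means the RT queue
 * overflowed and the whole interest set has to be re-polled ([3]) */
static void sigio_loop(int rtsig)
{
	sigset_t set;
	siginfo_t si;

	sigemptyset(&set);
	sigaddset(&set, rtsig);
	sigaddset(&set, SIGIO);
	sigprocmask(SIG_BLOCK, &set, NULL);

	for (;;) {
		if (sigwaitinfo(&set, &si) < 0)
			continue;
		if (si.si_signo == SIGIO)
			;	/* overflow: fall back to a full poll() */
		else
			;	/* si.si_fd is ready, si.si_band has the poll bits */
	}
}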
John Gardiner Myers <[email protected]> writes:
> The epoll API is deficient--it is subtly error prone and it forces
> work on user space that is better done in the kernel. That the API
> is specified in a deficient way does not make it any less deficient.
You can argue that any API is subtly error prone. The whole sockets
API is that way. That's why the W. Richard Stevens network
programming books are such gems. That's why having access to kernel
source is invaluable. You have to pay attention to details to avoid
errors.
With /dev/epoll, it is perfectly feasible to write user level
wrapper libraries that help avoid the potential pitfalls.
I think it was Dan Kegel who has already mentioned one.
I've written one myself, and I'm very confident in it. I've written a
traffic generator application on top of my library that stresses the
Linux kernel protocol stack to the extreme. It generates the
proverbial 10k cps, saturates gigabit networks, etc.
It has no problem running over /dev/epoll.
IMHO, the code inside my wrapper library for the epoll case is
significantly easier to understand than the code for the case that
uses the legacy poll() interface.
If /dev/epoll were so error prone as you say it is, I think I would
have noticed it.
-- Buck
ps If anybody cares. I can give them a pointer to my code.
> Again, there is no rational justification for the kernel to not test
> the condition upon registration. There is ample justification for
> it to do so.
On Tue, 15 Oct 2002, John Myers wrote:
> Davide Libenzi wrote:
>
> >Just a simple question : Have you ever used RT-Signal API ? Is it the API
> >"deficent" [...] ?
> >
> No. Yes. The (fixed) size of the signal queue is far too small. One
> either gets catastrophic failure on overload or one has to pay to do
> redundant accounting of interest.
>
> >Do you know the
> >difference between level triggered ( poll() - select() - /dev/poll ) and
> >edge triggered ( /dev/epoll - RT-Signal ) interfaces ?
> >
> >
> Yes. The registration of interest can itself be considered an edge
> condition.
I knew you were going there, aka you do not understand how edge triggered
APIs have to be used. Even if the API will deliver an event at registration
time you still cannot use this code scheme :
int my_io(...) {
	if (event_wait(...))
		do_io(...);
}
You CAN NOT. And look, it is not an API problem, it's your problem that
you want to use the API as if it were a poll()-like API. The code scheme
for an edge triggered API is :
int my_io(...) {
	while (do_io(...) == EAGAIN)
		event_wait(...);
}
This is because you have to consume the I/O space to push the level to 0 so
that a 0->1 transition can happen and you can happily receive your
events.
- Davide
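Spelled out for a nonblocking fd, the scheme amounts to the sketch below;
handle_data() is a placeholder, and the point is simply that the fd is
drained all the way to EAGAIN before waiting again.
#include <errno.h>
#include <unistd.h>

void handle_data(const char *buf, ssize_t n);	/* hypothetical consumer */

/* drain the fd completely; only when read() reports EAGAIN is the level
 * back at 0 and a future 0->1 transition guaranteed to produce an event */
static int drain_readable(int fd)
{
	char buf[4096];

	for (;;) {
		ssize_t n = read(fd, buf, sizeof(buf));

		if (n > 0) {
			handle_data(buf, n);	/* placeholder for real work */
			continue;
		}
		if (n == 0)
			return 0;		/* peer closed */
		if (errno == EAGAIN)
			return 1;		/* safe to event_wait() again */
		if (errno == EINTR)
			continue;
		return -1;			/* real error */
	}
}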
On 15 Oct 2002, Charles 'Buck' Krasic wrote:
>
> John Gardiner Myers <[email protected]> writes:
>
> > The epoll API is deficient--it is subtly error prone and it forces
> > work on user space that is better done in the kernel. That the API
> > is specified in a deficient way does not make it any less deficient.
>
> You can argue that any API is subtly error prone. The whole sockets
> API is that way. That's why the W. Richard Stevens network
> programming books are such gems. That's why having access to kernel
> source is invaluable. You have to pay attention to details to avoid
> errors.
>
> With /dev/epoll, it is perfectly feasible to write user level
> wrapper libraries that help avoid the potential pitfalls.
>
> I think it was Dan Kegel who has already mentioned one.
>
> I've written one myself, and I'm very confident in it. I've written a
> traffic generator application on top of my library that stresses the
> Linux kernel protocol stack to the extreme. It generates the
> proverbial 10k cps, saturates gigabit networks, etc.
>
> It has no problem running over /dev/epoll.
>
> IMHO, the code inside my wrapper library for the epoll case is
> significantly easier to understand than the code for the case that
> uses the legacy poll() interface.
>
> If /dev/epoll were so error prone as you say it is, I think I would
> have noticed it.
The /dev/epoll usage is IMHO very simple. Once the I/O fd is created you
register it with POLLIN|POLLOUT and you leave it inside the monitored set
for as long as it is needed ( mainly until you close() it ). It is not
necessary to continuously switch the event mask between POLLIN and POLLOUT.
A hypothetical syscall API should look like :
int sys_epoll_create(int maxfds);
void sys_epoll_close(int epd);
int sys_epoll_addfd(int epd, int fd, int evtmask);
int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
with the option, as Ben suggested ( if benchmarks give positive results ),
of using the AIO event collector instead of sys_epoll_wait().
- Davide
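To make the proposal concrete, a caller's event loop might look like the
sketch below. None of these syscalls exist yet; the prototypes are simply
restated from the proposal and the loop is hypothetical.
#include <poll.h>

int sys_epoll_create(int maxfds);
void sys_epoll_close(int epd);
int sys_epoll_addfd(int epd, int fd, int evtmask);
int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);

void event_loop(int listen_fd)
{
	int epd = sys_epoll_create(1024);
	struct pollfd *evts;
	int i, n;

	/* register once, with the full mask, for the fd's whole lifetime */
	sys_epoll_addfd(epd, listen_fd, POLLIN | POLLOUT);

	for (;;) {
		/* the kernel hands back a pointer into its half of the buffer */
		n = sys_epoll_wait(epd, &evts, -1);
		for (i = 0; i < n; i++) {
			/* drain evts[i].fd until EAGAIN, as in the edge
			 * triggered scheme discussed above */
		}
	}
}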
Davide Libenzi wrote:
>I knew you were going there, aka you do not understand how edge triggered
>API have to be used.
>
Nonsense.
>Even if the API will drop an event at registration
>time you still cannot use this code scheme :
>
>int my_io(...) {
>
> if (event_wait(...))
> do_io(...);
>
>}
>
>You CAN NOT. And look, it is not an API problem, it's your problem that
>you want to use the API like a poll()-like API.
>
You have insufficient basis upon which to claim I would write code as
broken as above.
>This because you have to consume the I/O space to push the level to 0 so
>that a transaction 0->1 can happen and you can happily receive your
>events.
>
>
Of course you have to consume the I/O space to push the level to 0.
What do you think I am, stupid?
This is done with something like:
for (;;) {
	fd = event_wait(...);
	while (do_io(fd) != EAGAIN);
}
Trying to do at once as much work as one can on a given fd helps keep
that fd's context information in cache. If one needs to have the fd
yield the CPU in order to reduce system latency, one generates a
user-mode event.
Charles 'Buck' Krasic wrote:
>You can argue that any API is subtly error prone.
>
You can also argue that the earth is flat. It's just that some
arguments have more basis than others.
>With /dev/epoll, it is perfectly feasible to write user level
>wrapper libraries that help avoid the potential pitfalls.
>
In other words, you don't deny the problem. Instead, you work around it
in user space.
Better to fix the API. The kernel has more information than user space
and can do a better job. In the kernel, the problem can be fixed once
and for all, not over and over again in each different wrapper library.
It's not even as if the change would break programs correctly written
to the old API, not that we particularly care about programs written to
the old API.
Davide Libenzi wrote:
>Typical
>applications that uses multiplex interfaces uses only one task (
>eventually for each CPU ) that handles many connections.
>
As I mentioned before, this only works if you have the luxury of being
able to write your application and its supporting libraries from
scratch. If you don't have that luxury, you need additional threads in
order to isolate the latency caused by blocking operations.
>Yes, you have
>a lock to acquire, but this is a price you have to pay as soon as you
>choose threads.
>
>
You have to pay for the lock needed to receive the events. What I am
objecting to is having to pay again for a second lock (and condition
variable) to redistribute those events from the thread they were
received on to the thread they will be processed on. An interface that
supports multithreading well will permit events to be delivered directly
to the threads that need to process them.
io_getevents() is an example of an interface that supports multithreaded
callers well. Since the divvying up is done in the kernel, the
implementation has the information it needs to be refined later into
something more intelligent about when to deliver which events to which
threads. For example, the interface could later implement CPU affinity of
events and concurrency control.
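A sketch of the kind of direct delivery described above: several worker
threads block in io_getevents() on the same completion context, so the
kernel hands each completion straight to a waiting thread without a
user-space redistribution lock. This assumes libaio; process_completion()
is a placeholder and error handling is omitted:

#include <libaio.h>
#include <pthread.h>

static io_context_t ctx;

static void *worker(void *arg)
{
        struct io_event ev[16];
        int i, n;

        for (;;) {
                /* each thread pulls completions directly from the kernel */
                n = io_getevents(ctx, 1, 16, ev, NULL);
                for (i = 0; i < n; i++)
                        process_completion(&ev[i]);
        }
        return NULL;
}

static int start_workers(int nthreads)
{
        pthread_t tid;
        int i;

        if (io_setup(4096, &ctx) < 0)
                return -1;
        for (i = 0; i < nthreads; i++)
                pthread_create(&tid, NULL, worker, NULL);
        return 0;
}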
On Wed, 16 Oct 2002, John Gardiner Myers wrote:
> Davide Libenzi wrote:
>
> >I knew you were going there, aka you do not understand how edge-triggered
> >APIs have to be used.
> >
> Nonsense.
>
> >Even if the API drops an event at registration
> >time, you still cannot use this code scheme:
> >
> >int my_io(...) {
> >
> >        if (event_wait(...))
> >                do_io(...);
> >
> >}
> >
> >You CANNOT. And look, it is not an API problem; the problem is that
> >you want to use the API as if it were a poll()-like API.
> >
> You have insufficient basis upon which to claim I would write code as
> broken as above.
Yes I have, look down 15 lines ...
> >This is because you have to consume the I/O space to push the level to 0,
> >so that a 0->1 transition can happen and you can happily receive your
> >events.
> >
> >
> Of course you have to consume the I/O space to push the level to 0.
> What do you think I am, stupid?
>
> This is done with something like:
>
> for (;;) {
>         fd = event_wait(...);
>         while (do_io(fd) != EAGAIN);
> }
I told you that you did not understand the API: this code won't work for
edge-triggered APIs. Please consider investigating a little bit more before
shooting at perfectly working APIs.
- Davide
John Gardiner Myers wrote:
> Nonetheless, the requirement for user space to test the condition after
> the registration, not before, is subtle. A program which does these in
> the wrong order is still likely to pass QA and will fail in production
> in a way that will be difficult to diagnose. There is no rational
> reason for the kernel to not test the condition upon registration.
As long as we agree that the kernel may provide spurious readiness
notifications on occasion, I agree. Then /dev/epoll can easily fulfill
this by signaling readiness on everything at registration; more
accurate notifications could be added later as an optimization.
- Dan
John Gardiner Myers wrote:
>> while (read() == EAGAIN)
>>         wait(POLLIN);
>>
> Assuming registration of interest is inside wait(), this has a race. If
> the file becomes readable between the time that read() returns and the
> time that wait() can register interest, the connection will hang.
Shouldn't the interest be rearmed inside read() when it returns EAGAIN?
That's how I do it in my wrapper library these days.
No reason to have a race.
- Dan
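A sketch of the wrapper Dan describes: the read path itself re-arms
interest when it sees EAGAIN, so there is no window between noticing that
the socket ran dry and registering for the next notification. my_rearm()
stands in for whatever registration call the wrapper library uses:

#include <unistd.h>
#include <errno.h>
#include <sys/poll.h>

/* Returns bytes read, 0 on EOF, or -1 with errno set; re-arms read
 * interest whenever the socket runs dry. */
static ssize_t my_read(int fd, void *buf, size_t len)
{
        ssize_t n = read(fd, buf, len);

        if (n == -1 && errno == EAGAIN)
                my_rearm(fd, POLLIN);   /* hypothetical re-registration */
        return n;
}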
On Wed, Oct 16, 2002 at 11:15:50AM -0700, John Gardiner Myers wrote:
> Davide Libenzi wrote:
> >This is because you have to consume the I/O space to push the level to 0,
> >so that a 0->1 transition can happen and you can happily receive your
> >events.
> This is done with something like:
> for (;;) {
>         fd = event_wait(...);
>         while (do_io(fd) != EAGAIN);
> }
> Trying to do as much work as one can on a given fd at once helps keep
> that fd's context information in cache. If one needs to have the fd
> yield the CPU in order to reduce system latency, one generates a
> user-mode event.
Not to enter into any of the other discussions on this issue, I wouldn't
usually do what you suggest above. Sure, for operations like accept() that
are inherently inefficient, I would loop until EAGAIN, but if I did a
recv() or read() of 2K and I only received 1K, there is no reason to
invoke another system call on a resource that likely will not have any
data ready.
mark
John Gardiner Myers <[email protected]> writes:
> Charles 'Buck' Krasic wrote:
>
> In other words, you don't deny the problem. Instead, you work around
> it in user space.
Not exactly. I'm saying that the context in which /dev/epoll is used
(at least originally) is non-blocking socket IO. Anybody who has
worked with that API can tell you there are subtleties which, if
ignored, will certainly lead to pitfalls. These are not the
fault of the /dev/epoll interface.
> Better to fix the API.
> The kernel has more information than user space and can do a better
> job.
I think we're talking at cross purposes.
I know this is the AIO list, but I think epoll has value independent
of AIO. They're complementary, not mutually exclusive.
> In the kernel, the problem can be fixed once and for all, not
> over and over again in each different wrapper library. It's not even
> as if the change would break programs correctly written to the old
> API, not that we particularly care about programs written to the old
> API.
I agree if you are talking about AIO as a whole. But epoll is more
limited in its scope; it really relates only to poll()/select(), not the
whole I/O API.
-- Buck
Hi Davide,
On thinking about this a bit, I wonder if the evtmask isn't superfluous
in sys_epoll_addfd? (And in the existing epoll interface where the
application writes to /dev/epoll).
As you say, the normal usage will be to register for all events
anyway. My wrapper library does exactly that. As you say, not having
to continuously switch the mask is the simpler way to go. If
registering for all events is the only sensible approach, the argument
isn't needed at all.
What do you think? It's a minor detail, I know.
Taking the idea further, I would prefer that ALL non-blocking sockets
are automatically added to the epoll interest set if the application
has already called epoll_create(). Maybe that behaviour could be an
option to epoll_create().
BTW, I'm not clear on another aspect of the API below, is there still
an mmap() for the pollfd buffers?
-- Buck
Davide Libenzi <[email protected]> writes:
> The /dev/epoll usage is IMHO very simple. Once the I/O fd is created you
> register it with POLLIN|POLLOUT and you leave it inside the monitor set
> for as long as it is needed ( mainly until you close() it ). It is not
> necessary to continuously switch the event mask between POLLIN and POLLOUT.
> A hypothetical syscall API would look like:
>         int sys_epoll_create(int maxfds);
>         void sys_epoll_close(int epd);
>         int sys_epoll_addfd(int epd, int fd, int evtmask);
>         int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
> with the option ( if benchmarks give positive results ), as Ben
> suggested, of using the AIO event collector instead of sys_epoll_wait().
> - Davide
On 17 Oct 2002, Charles 'Buck' Krasic wrote:
>
> Hi Davide,
>
> On thinking about this a bit, I wonder if the evtmask isn't superfluous
> in sys_epoll_addfd? (And in the existing epoll interface where the
> application writes to /dev/epoll).
>
> As you say, the normal usage will be to register for all events
> anyway. My wrapper library does exactly that. As you say, not having
> to continuously switch the mask is the simpler way to go. If
> registering for all events is the only sensible approach, the argument
> isn't needed at all.
>
> What do you think? It's a minor detail, I know.
Even if it is the fastest way to use the API, I would still prefer such
behaviour to be encoded in wrapper libraries instead of inside the API
itself. Having a choice is usually better than not having it, if the cost
of having a choice is not too high ( and in this particular case it is not ).
> Taking the idea further, I would prefer that ALL non-blocking sockets
> are automatically added to the epoll interest set if the application
> has already called epoll_create(). Maybe that behaviour could be an
> option to epoll_create().
Same thing: I would leave this task to your my_socket() and my_accept().
I think what is really missing for /dev/epoll is an easy-to-use
interface library, so that users confused by the presence of "poll" inside
its name do not try to use it like select()/poll().
> BTW, I'm not clear on another aspect of the API below, is there still
> an mmap() for the pollfd buffers?
Yes, it creates a mapping shared between the kernel and the user space.
- Davide
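A sketch of the my_socket()/my_accept() idea: the wrapper makes the new fd
non-blocking and registers it once, so application code never has to touch
the event mask again. sys_epoll_addfd() is still the hypothetical call from
earlier in the thread, and epd is assumed to be an already-created event
descriptor:

#include <sys/socket.h>
#include <sys/poll.h>
#include <fcntl.h>

static int my_accept(int epd, int listen_fd)
{
        int fd = accept(listen_fd, NULL, NULL);

        if (fd < 0)
                return -1;
        fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);
        sys_epoll_addfd(epd, fd, POLLIN | POLLOUT);     /* register once */
        return fd;
}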
Charles 'Buck' Krasic wrote:
> On thinking about this a bit, I wonder if the evtmask isn't superfluous
> in sys_epoll_addfd? ... As you say, the normal usage will be to
> register for all events anyway.
I agree... but we might eventually have events that apps aren't
interested in. No harm in letting the app specify an interest mask once.
> Taking the idea further, I would prefer that ALL non-blocking sockets
> are automatically added to the epoll interest set if the application
> has already called epoll_create().
That would prevent apps from having more than one I/O readiness
notification event source. This is a problem for modular
software, where you try to combine libraries in a multithreaded
program.
- Dan
On Mon, Oct 21, 2002 at 05:58:17PM +0100, Alan Cox wrote:
> I think a chunk of the poll scaling problem is better addressed by
> futexes. If I can say "this futex list for this fd for events X Y and Z"
> I can construct almost all the efficient stuff I need out of the futex
> interfaces, much like doing it with SIGIO setting flags, but in a lot
> fewer clocks.
I've structured the aio userland structure so that this is possible; it's
just not implemented yet. There are fields for compatible and incompatible
features, as well as the length of the header. This way, the library can
implement a faster getevents call when futex support is added, but it
always has the option of falling back to the syscall should it not understand
any changes we make to the data structure.
-ben
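A sketch of the fallback logic Ben describes: the library inspects the
header of the mapped event ring and only takes a user-space fast path when
it recognises every incompatible feature, otherwise it drops back to the
io_getevents() syscall. The field names below follow the aio ring header
of that era and are meant as an illustration, not a stable ABI:

struct aio_ring_hdr {
        unsigned id;
        unsigned nr;                    /* number of io_events in the ring */
        unsigned head;
        unsigned tail;
        unsigned magic;
        unsigned compat_features;
        unsigned incompat_features;
        unsigned header_length;         /* size of this header */
};

#define KNOWN_INCOMPAT  0               /* features this library understands */

static int can_use_fast_path(struct aio_ring_hdr *ring)
{
        /* any unrecognised incompatible feature forces the syscall path */
        return (ring->incompat_features & ~KNOWN_INCOMPAT) == 0;
}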
On Wed, 2002-10-16 at 19:29, John Gardiner Myers wrote:
> Better to fix the API. The kernel has more information than user space
> and can do a better job. In the kernel, the problem can be fixed once
> and for all, not over and over again in each different wrapper library.
> It's not even as if the change would break programs correctly written
> to the old API, not that we particularly care about programs written to
> the old API.
I think a chunk of the poll scaling problem is better addressed by
futexes. If I can say "this futex list for this fd for events X Y and Z"
I can construct almost all the efficient stuff I need out of the futex
interfaces, much like doing it with SIGIO setting flags, but in a lot
fewer clocks.
Alan
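The kernel side Alan describes does not exist, but the user-space half can
be sketched: a per-fd counter that the proposed mechanism would bump and
FUTEX_WAKE whenever a registered event fires, with the application sleeping
in FUTEX_WAIT until the counter moves. Everything about the counter is an
assumption; only the futex wait itself is a real interface:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>

static int wait_for_event(unsigned int *seq, unsigned int last_seen)
{
        /* sleep only while nothing new has arrived since last_seen */
        while (*seq == last_seen) {
                if (syscall(SYS_futex, seq, FUTEX_WAIT,
                            last_seen, NULL, NULL, 0) == -1 &&
                    errno != EAGAIN && errno != EINTR)
                        return -1;
        }
        return 0;
}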