From: "Paton J. Lewis" <[email protected]>
Enhanced epoll_ctl to support EPOLL_CTL_DISABLE, which disables an epoll item.
If epoll_ctl doesn't return -EBUSY in this case, it is then safe to delete the
epoll item in a multi-threaded environment. Also added a new test_epoll self-
test app to both demonstrate the need for this feature and test it.
Signed-off-by: Paton J. Lewis <[email protected]>
---
fs/eventpoll.c | 38 ++-
include/linux/eventpoll.h | 1 +
tools/testing/selftests/Makefile | 2 +-
tools/testing/selftests/epoll/Makefile | 11 +
tools/testing/selftests/epoll/test_epoll.c | 344 ++++++++++++++++++++++++++++
5 files changed, 392 insertions(+), 4 deletions(-)
create mode 100644 tools/testing/selftests/epoll/Makefile
create mode 100644 tools/testing/selftests/epoll/test_epoll.c
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 739b098..1c47bbd 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -339,7 +339,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
- return op != EPOLL_CTL_DEL;
+ return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
}
/* Initialize the poll safe wake up structure */
@@ -664,6 +664,34 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
return 0;
}
+/*
+ * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
+ * had no event flags set, indicating that another thread may be currently
+ * handling that item's events (in the case that EPOLLONESHOT was being
+ * used). Otherwise a zero result indicates that the item has been disabled
+ * from receiving events. A disabled item may be re-enabled via
+ * EPOLL_CTL_MOD. Must be called with "mtx" held.
+ */
+static int ep_disable(struct eventpoll *ep, struct epitem *epi)
+{
+ int result = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ep->lock, flags);
+ if (epi->event.events & ~EP_PRIVATE_BITS) {
+ if (ep_is_linked(&epi->rdllink))
+ list_del_init(&epi->rdllink);
+ /* Ensure ep_poll_callback will not add epi back onto ready
+ list: */
+ epi->event.events &= EP_PRIVATE_BITS;
+ }
+ else
+ result = -EBUSY;
+ spin_unlock_irqrestore(&ep->lock, flags);
+
+ return result;
+}
+
static void ep_free(struct eventpoll *ep)
{
struct rb_node *rbp;
@@ -996,8 +1024,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
rb_insert_color(&epi->rbn, &ep->rbr);
}
-
-
#define PATH_ARR_SIZE 5
/*
* These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1701,6 +1727,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
} else
error = -ENOENT;
break;
+ case EPOLL_CTL_DISABLE:
+ if (epi)
+ error = ep_disable(ep, epi);
+ else
+ error = -ENOENT;
+ break;
}
mutex_unlock(&ep->mtx);
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 657ab55..e91f7e3 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -25,6 +25,7 @@
#define EPOLL_CTL_ADD 1
#define EPOLL_CTL_DEL 2
#define EPOLL_CTL_MOD 3
+#define EPOLL_CTL_DISABLE 4
/* Set the One Shot behaviour for the target file descriptor */
#define EPOLLONESHOT (1 << 30)
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 28bc57e..4cf477f 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -1,4 +1,4 @@
-TARGETS = breakpoints vm
+TARGETS = breakpoints epoll vm
all:
for TARGET in $(TARGETS); do \
diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile
new file mode 100644
index 0000000..19806ed
--- /dev/null
+++ b/tools/testing/selftests/epoll/Makefile
@@ -0,0 +1,11 @@
+# Makefile for epoll selftests
+
+all: test_epoll
+%: %.c
+ gcc -pthread -g -o $@ $^
+
+run_tests: all
+ ./test_epoll
+
+clean:
+ $(RM) test_epoll
diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c
new file mode 100644
index 0000000..cbd8f8b
--- /dev/null
+++ b/tools/testing/selftests/epoll/test_epoll.c
@@ -0,0 +1,344 @@
+/*
+ * tools/testing/selftests/epoll/test_epoll.c
+ *
+ * Copyright 2012 Adobe Systems Incorporated
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Paton J. Lewis <[email protected]>
+ *
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+
+/*
+ * A pointer to an epoll_item_private structure will be stored in the epoll
+ * item's event structure so that we can get access to the epoll_item_private
+ * data after calling epoll_wait:
+ */
+struct epoll_item_private {
+ int index; /* Position of this struct within the epoll_items array. */
+ int fd;
+ uint32_t events;
+ pthread_mutex_t mutex; /* Guards the following variables... */
+ int stop;
+ int status; /* Stores any error encountered while handling item. */
+ /* The following variable allows us to test whether we have encountered
+ a problem while attempting to cancel and delete the associated
+ event. When the test program exits, 'deleted' should be exactly
+ one. If it is greater than one, then the failed test reflects a real
+ world situation where we would have tried to access the epoll item's
+ private data after deleting it: */
+ int deleted;
+};
+
+struct epoll_item_private *epoll_items;
+
+/*
+ * Delete the specified item from the epoll set. In a real-world scenario this
+ * is where we would free the associated data structure, but in this testing
+ * environment we retain the structure so that we can test for double-deletion:
+ */
+void delete_item(int index)
+{
+ __sync_fetch_and_add(&epoll_items[index].deleted, 1);
+}
+
+/*
+ * A pointer to a read_thread_data structure will be passed as the argument to
+ * each read thread:
+ */
+struct read_thread_data {
+ int stop;
+ int status; /* Indicates any error encountered by the read thread. */
+ int epoll_set;
+};
+
+/*
+ * The function executed by the read threads:
+ */
+void *read_thread_function(void *function_data)
+{
+ struct read_thread_data *thread_data =
+ (struct read_thread_data *)function_data;
+ struct epoll_event event_data;
+ struct epoll_item_private *item_data;
+ char socket_data;
+
+ /* Handle events until we encounter an error or this thread's 'stop'
+ condition is set: */
+ while (1) {
+ int result = epoll_wait(thread_data->epoll_set,
+ &event_data,
+ 1, /* Number of desired events */
+ 1000); /* Timeout in ms */
+ if (result < 0) {
+ /* Breakpoints signal all threads. Ignore that while
+ debugging: */
+ if (errno == EINTR)
+ continue;
+ thread_data->status = errno;
+ return 0;
+ } else if (thread_data->stop)
+ return 0;
+ else if (result == 0) /* Timeout */
+ continue;
+
+ /* We need the mutex here because checking for the stop
+ condition and re-enabling the epoll item need to be done
+ together as one atomic operation when EPOLL_CTL_DISABLE is
+ available: */
+ item_data = (struct epoll_item_private *)event_data.data.ptr;
+ pthread_mutex_lock(&item_data->mutex);
+
+ /* Remove the item from the epoll set if we want to stop
+ handling that event: */
+ if (item_data->stop)
+ delete_item(item_data->index);
+ else {
+ /* Clear the data that was written to the other end of
+ our non-blocking socket: */
+ do {
+ if (read(item_data->fd, &socket_data, 1) < 1) {
+ if ((errno == EAGAIN) ||
+ (errno == EWOULDBLOCK))
+ break;
+ else
+ goto error_unlock;
+ }
+ } while (item_data->events & EPOLLET);
+
+ /* The item was one-shot, so re-enable it: */
+ event_data.events = item_data->events;
+ if (epoll_ctl(thread_data->epoll_set,
+ EPOLL_CTL_MOD,
+ item_data->fd,
+ &event_data) < 0)
+ goto error_unlock;
+ }
+
+ pthread_mutex_unlock(&item_data->mutex);
+ }
+
+error_unlock:
+ thread_data->status = item_data->status = errno;
+ pthread_mutex_unlock(&item_data->mutex);
+ return 0;
+}
+
+/*
+ * A pointer to a write_thread_data structure will be passed as the argument to
+ * the write thread:
+ */
+struct write_thread_data {
+ int stop;
+ int status; /* Indicates any error encountered by the write thread. */
+ int n_fds;
+ int *fds;
+};
+
+/*
+ * The function executed by the write thread. It writes a single byte to each
+ * socket in turn until the stop condition for this thread is set. If writing to
+ * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for
+ * the moment and just move on to the next socket in the list. We don't care
+ * about the order in which we deliver events to the epoll set. In fact we don't
+ * care about the data we're writing to the pipes at all; we just want to
+ * trigger epoll events:
+ */
+void *write_thread_function(void *function_data)
+{
+	const char data = 'X';
+	int index;
+	struct write_thread_data *thread_data =
+		(struct write_thread_data *)function_data;
+	/* Loop over all sockets until this thread's stop flag is set: */
+	while (!thread_data->stop)
+		for (index = 0;
+		     !thread_data->stop && (index < thread_data->n_fds);
+		     ++index)
+			if ((write(thread_data->fds[index], &data, 1) < 1) &&
+				(errno != EAGAIN) &&
+				(errno != EWOULDBLOCK)) {
+				thread_data->status = errno;
+				return 0;
+			}
+	return 0;
+}
+
+/*
+ * Arguments are currently ignored:
+ */
+int main(int argc, char **argv)
+{
+ const int n_read_threads = 100;
+ const int n_epoll_items = 500;
+ int index;
+ int epoll_set = epoll_create1(0);
+ struct write_thread_data write_thread_data = {
+ 0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int))
+ };
+ struct read_thread_data *read_thread_data =
+ malloc(n_read_threads * sizeof(struct read_thread_data));
+ pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t));
+ pthread_t write_thread;
+
+ printf("-----------------\n");
+ printf("Runing test_epoll\n");
+ printf("-----------------\n");
+
+ epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private));
+
+ if (epoll_set < 0 || epoll_items == 0 || write_thread_data.fds == 0 ||
+ read_thread_data == 0 || read_threads == 0)
+ goto error;
+
+ if (sysconf(_SC_NPROCESSORS_ONLN) < 2) {
+ printf("Error: please run this test on a multi-core system.\n");
+ goto error;
+ }
+
+ /* Create the socket pairs and epoll items: */
+ for (index = 0; index < n_epoll_items; ++index) {
+ int socket_pair[2];
+ struct epoll_event event_data;
+ if (socketpair(AF_UNIX,
+ SOCK_STREAM | SOCK_NONBLOCK,
+ 0,
+ socket_pair) < 0)
+ goto error;
+ write_thread_data.fds[index] = socket_pair[0];
+ epoll_items[index].index = index;
+ epoll_items[index].fd = socket_pair[1];
+ if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0)
+ goto error;
+ /* We always use EPOLLONESHOT because this test is currently
+ structured to demonstrate the need for EPOLL_CTL_DISABLE,
+ which only produces useful information in the EPOLLONESHOT
+ case (without EPOLLONESHOT, calling epoll_ctl with
+ EPOLL_CTL_DISABLE will never return EBUSY). If support for
+ testing events without EPOLLONESHOT is desired, it should
+ probably be implemented in a separate unit test. */
+ epoll_items[index].events = EPOLLIN | EPOLLONESHOT;
+ if (index < n_epoll_items / 2)
+ epoll_items[index].events |= EPOLLET;
+ epoll_items[index].stop = 0;
+ epoll_items[index].status = 0;
+ epoll_items[index].deleted = 0;
+ event_data.events = epoll_items[index].events;
+ event_data.data.ptr = &epoll_items[index];
+ if (epoll_ctl(epoll_set,
+ EPOLL_CTL_ADD,
+ epoll_items[index].fd,
+ &event_data) < 0)
+ goto error;
+ }
+
+ /* Create and start the read threads: */
+ for (index = 0; index < n_read_threads; ++index) {
+ read_thread_data[index].stop = 0;
+ read_thread_data[index].status = 0;
+ read_thread_data[index].epoll_set = epoll_set;
+ if (pthread_create(&read_threads[index],
+ NULL,
+ read_thread_function,
+ &read_thread_data[index]) != 0)
+ goto error;
+ }
+
+ if (pthread_create(&write_thread,
+ NULL,
+ write_thread_function,
+ &write_thread_data) != 0)
+ goto error;
+
+ /* Cancel all event pollers: */
+#ifdef EPOLL_CTL_DISABLE
+ for (index = 0; index < n_epoll_items; ++index) {
+ pthread_mutex_lock(&epoll_items[index].mutex);
+ ++epoll_items[index].stop;
+ if (epoll_ctl(epoll_set,
+ EPOLL_CTL_DISABLE,
+ epoll_items[index].fd,
+ NULL) == 0)
+ delete_item(index);
+ else if (errno != EBUSY) {
+ pthread_mutex_unlock(&epoll_items[index].mutex);
+ goto error;
+ }
+ /* EBUSY means events were being handled; allow the other thread
+ to delete the item. */
+ pthread_mutex_unlock(&epoll_items[index].mutex);
+ }
+#else
+ for (index = 0; index < n_epoll_items; ++index) {
+ pthread_mutex_lock(&epoll_items[index].mutex);
+ ++epoll_items[index].stop;
+ pthread_mutex_unlock(&epoll_items[index].mutex);
+ /* Wait in case a thread running read_thread_function is
+ currently executing code between epoll_wait and
+ pthread_mutex_lock with this item. Note that a longer delay
+ would make double-deletion less likely (at the expense of
+ performance), but there is no guarantee that any delay would
+ ever be sufficient. Note also that we delete all event
+ pollers at once for testing purposes, but in a real-world
+ environment we are likely to want to be able to cancel event
+ pollers at arbitrary times. Therefore we can't improve this
+ situation by just splitting this loop into two loops
+ (i.e. signal 'stop' for all items, sleep, and then delete all
+ items). We also can't fix the problem via EPOLL_CTL_DEL
+ because that command can't prevent the case where some other
+ thread is executing read_thread_function within the region
+ mentioned above: */
+ usleep(1);
+ pthread_mutex_lock(&epoll_items[index].mutex);
+ if (!epoll_items[index].deleted)
+ delete_item(index);
+ pthread_mutex_unlock(&epoll_items[index].mutex);
+ }
+#endif
+
+ /* Shut down the read threads: */
+ for (index = 0; index < n_read_threads; ++index)
+ __sync_fetch_and_add(&read_thread_data[index].stop, 1);
+ for (index = 0; index < n_read_threads; ++index) {
+ if (pthread_join(read_threads[index], NULL) != 0)
+ goto error;
+ if (read_thread_data[index].status)
+ goto error;
+ }
+
+ /* Shut down the write thread: */
+ __sync_fetch_and_add(&write_thread_data.stop, 1);
+ if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status)
+ goto error;
+
+ /* Check for final error conditions: */
+ for (index = 0; index < n_epoll_items; ++index) {
+ if (epoll_items[index].status != 0)
+ goto error;
+ if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0)
+ goto error;
+ }
+ for (index = 0; index < n_epoll_items; ++index)
+ if (epoll_items[index].deleted != 1) {
+ printf("Error: item data deleted %1d times.\n",
+ epoll_items[index].deleted);
+ goto error;
+ }
+
+ printf("[PASS]\n");
+ return 0;
+
+ error:
+ printf("[FAIL]\n");
+ return errno;
+}
--
1.7.9.5
[CC += linux-api@]
Hello Paton,
On Thu, Aug 23, 2012 at 11:15 PM, Paton J. Lewis <[email protected]> wrote:
> From: "Paton J. Lewis" <[email protected]>
>
> Enhanced epoll_ctl to support EPOLL_CTL_DISABLE, which disables an epoll item.
> If epoll_ctl doesn't return -EBUSY in this case, it is then safe to delete the
> epoll item in a multi-threaded environment. Also added a new test_epoll self-
> test app to both demonstrate the need for this feature and test it.
(There's a lot of background missing from this version of the patch
that was included in the previous version
[http://thread.gmane.org/gmane.linux.kernel/1311457]. It helps to
include the full rationale with a revised patch--best not to assume
that someone has the context of past mails when reading a revised
patch.)
I've taken a look at this patch as it currently stands in 3.7-rc1, and
done a bit of testing. (By the way, the test program
tools/testing/selftests/epoll/test_epoll.c does not compile...)
There are one or two places where the behavior seems a little strange,
so I have a question or two at the end of this mail. But other than
that, I want to check my understanding so that the interface can be
correctly documented.
Just to go though my understanding, the problem is the following
scenario in a multithreaded application:
1. Multiple threads are performing epoll_wait() operations,
and maintaining a user-space cache that contains information
corresponding to each file descriptor being monitored by
epoll_wait().
2. At some point, a thread wants to delete (EPOLL_CTL_DEL)
a file descriptor from the epoll interest list, and
delete the corresponding record from the user-space cache.
3. The problem with (2) is that some other thread may have
previously done an epoll_wait() that retrieved information
about the fd in question, and may be in the middle of using
information in the cache that relates to that fd. Thus,
there is a potential race.
4. The race can't be solved purely in user space, because doing
so would require applying a mutex across the epoll_wait()
call, which would of course blow thread concurrency.
Right?
Your solution is the EPOLL_CTL_DISABLE operation. I want to
confirm my understanding about how to use this flag, since
the description that has accompanied the patches so far
has been a bit sparse.
0. In the scenario you're concerned about, deleting a file
descriptor means (safely) doing the following:
(a) Deleting the file descriptor from the epoll interest list
using EPOLL_CTL_DEL
(b) Deleting the corresponding record in the user-space cache
1. It's only meaningful to use this EPOLL_CTL_DISABLE in
conjunction with EPOLLONESHOT.
2. Using EPOLL_CTL_DISABLE without using EPOLLONESHOT in
conjunction is a logical error.
3. The correct way to code multithreaded applications using
EPOLL_CTL_DISABLE and EPOLLONESHOT is as follows:
a. All EPOLL_CTL_ADD and EPOLL_CTL_MOD operations should
specify EPOLLONESHOT.
b. When a thread wants to delete a file descriptor, it
should do the following:
[1] Call epoll_ctl(EPOLL_CTL_DISABLE)
[2] If the return status from epoll_ctl(EPOLL_CTL_DISABLE)
was zero, then the file descriptor can be safely
deleted by the thread that made this call.
[3] If the epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY,
then the descriptor is in use. In this case, the calling
thread should set a flag in the user-space cache to
indicate that the thread that is using the descriptor
should perform the deletion operation.
Is all of the above correct?
The implementation depends on checking on whether
(events & ~EP_PRIVATE_BITS) == 0
This relies on the fact that EPOLL_CTL_ADD and EPOLL_CTL_MOD always
set EPOLLHUP and EPOLLERR in the 'events' mask, and EPOLLONESHOT
causes those flags (as well as all others in ~EP_PRIVATE_BITS) to be
cleared.
A corollary to the previous paragraph is that using EPOLL_CTL_DISABLE
is only useful in conjunction with EPOLLONESHOT. However, as things
stand, one can use EPOLL_CTL_DISABLE on a file descriptor that does
not have EPOLLONESHOT set in 'events'. This results in the following
(slightly surprising) behavior:
(a) The first call to epoll_ctl(EPOLL_CTL_DISABLE) returns 0
(the indicator that the file descriptor can be safely deleted).
(b) The next call to epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY.
This doesn't seem particularly useful, and in fact is probably an
indication that the user made a logic error: they should only be using
epoll_ctl(EPOLL_CTL_DISABLE) on a file descriptor for which
EPOLLONESHOT was set in 'events'. If that is correct, then would it
not make sense to return an error to user space for this case?
Cheers,
Michael
> Signed-off-by: Paton J. Lewis <[email protected]>
> ---
> fs/eventpoll.c | 38 ++-
> include/linux/eventpoll.h | 1 +
> tools/testing/selftests/Makefile | 2 +-
> tools/testing/selftests/epoll/Makefile | 11 +
> tools/testing/selftests/epoll/test_epoll.c | 344 ++++++++++++++++++++++++++++
> 5 files changed, 392 insertions(+), 4 deletions(-)
> create mode 100644 tools/testing/selftests/epoll/Makefile
> create mode 100644 tools/testing/selftests/epoll/test_epoll.c
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 739b098..1c47bbd 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -339,7 +339,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
> /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
> static inline int ep_op_has_event(int op)
> {
> - return op != EPOLL_CTL_DEL;
> + return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
> }
>
> /* Initialize the poll safe wake up structure */
> @@ -664,6 +664,34 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
> return 0;
> }
>
> +/*
> + * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
> + * had no event flags set, indicating that another thread may be currently
> + * handling that item's events (in the case that EPOLLONESHOT was being
> + * used). Otherwise a zero result indicates that the item has been disabled
> + * from receiving events. A disabled item may be re-enabled via
> + * EPOLL_CTL_MOD. Must be called with "mtx" held.
> + */
> +static int ep_disable(struct eventpoll *ep, struct epitem *epi)
> +{
> + int result = 0;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&ep->lock, flags);
> + if (epi->event.events & ~EP_PRIVATE_BITS) {
> + if (ep_is_linked(&epi->rdllink))
> + list_del_init(&epi->rdllink);
> + /* Ensure ep_poll_callback will not add epi back onto ready
> + list: */
> + epi->event.events &= EP_PRIVATE_BITS;
> + }
> + else
> + result = -EBUSY;
> + spin_unlock_irqrestore(&ep->lock, flags);
> +
> + return result;
> +}
> +
> static void ep_free(struct eventpoll *ep)
> {
> struct rb_node *rbp;
> @@ -996,8 +1024,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
> rb_insert_color(&epi->rbn, &ep->rbr);
> }
>
> -
> -
> #define PATH_ARR_SIZE 5
> /*
> * These are the number paths of length 1 to 5, that we are allowing to emanate
> @@ -1701,6 +1727,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
> } else
> error = -ENOENT;
> break;
> + case EPOLL_CTL_DISABLE:
> + if (epi)
> + error = ep_disable(ep, epi);
> + else
> + error = -ENOENT;
> + break;
> }
> mutex_unlock(&ep->mtx);
>
> diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
> index 657ab55..e91f7e3 100644
> --- a/include/linux/eventpoll.h
> +++ b/include/linux/eventpoll.h
> @@ -25,6 +25,7 @@
> #define EPOLL_CTL_ADD 1
> #define EPOLL_CTL_DEL 2
> #define EPOLL_CTL_MOD 3
> +#define EPOLL_CTL_DISABLE 4
>
> /* Set the One Shot behaviour for the target file descriptor */
> #define EPOLLONESHOT (1 << 30)
> diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
> index 28bc57e..4cf477f 100644
> --- a/tools/testing/selftests/Makefile
> +++ b/tools/testing/selftests/Makefile
> @@ -1,4 +1,4 @@
> -TARGETS = breakpoints vm
> +TARGETS = breakpoints epoll vm
>
> all:
> for TARGET in $(TARGETS); do \
> diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile
> new file mode 100644
> index 0000000..19806ed
> --- /dev/null
> +++ b/tools/testing/selftests/epoll/Makefile
> @@ -0,0 +1,11 @@
> +# Makefile for epoll selftests
> +
> +all: test_epoll
> +%: %.c
> + gcc -pthread -g -o $@ $^
> +
> +run_tests: all
> + ./test_epoll
> +
> +clean:
> + $(RM) test_epoll
> diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c
> new file mode 100644
> index 0000000..cbd8f8b
> --- /dev/null
> +++ b/tools/testing/selftests/epoll/test_epoll.c
> @@ -0,0 +1,344 @@
> +/*
> + * tools/testing/selftests/epoll/test_epoll.c
> + *
> + * Copyright 2012 Adobe Systems Incorporated
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * Paton J. Lewis <[email protected]>
> + *
> + */
> +
> +#include <errno.h>
> +#include <fcntl.h>
> +#include <pthread.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/epoll.h>
> +#include <sys/socket.h>
> +
> +/*
> + * A pointer to an epoll_item_private structure will be stored in the epoll
> + * item's event structure so that we can get access to the epoll_item_private
> + * data after calling epoll_wait:
> + */
> +struct epoll_item_private {
> + int index; /* Position of this struct within the epoll_items array. */
> + int fd;
> + uint32_t events;
> + pthread_mutex_t mutex; /* Guards the following variables... */
> + int stop;
> + int status; /* Stores any error encountered while handling item. */
> + /* The following variable allows us to test whether we have encountered
> + a problem while attempting to cancel and delete the associated
> + event. When the test program exits, 'deleted' should be exactly
> + one. If it is greater than one, then the failed test reflects a real
> + world situation where we would have tried to access the epoll item's
> + private data after deleting it: */
> + int deleted;
> +};
> +
> +struct epoll_item_private *epoll_items;
> +
> +/*
> + * Delete the specified item from the epoll set. In a real-world secneario this
> + * is where we would free the associated data structure, but in this testing
> + * environment we retain the structure so that we can test for double-deletion:
> + */
> +void delete_item(int index)
> +{
> + __sync_fetch_and_add(&epoll_items[index].deleted, 1);
> +}
> +
> +/*
> + * A pointer to a read_thread_data structure will be passed as the argument to
> + * each read thread:
> + */
> +struct read_thread_data {
> + int stop;
> + int status; /* Indicates any error encountered by the read thread. */
> + int epoll_set;
> +};
> +
> +/*
> + * The function executed by the read threads:
> + */
> +void *read_thread_function(void *function_data)
> +{
> + struct read_thread_data *thread_data =
> + (struct read_thread_data *)function_data;
> + struct epoll_event event_data;
> + struct epoll_item_private *item_data;
> + char socket_data;
> +
> + /* Handle events until we encounter an error or this thread's 'stop'
> + condition is set: */
> + while (1) {
> + int result = epoll_wait(thread_data->epoll_set,
> + &event_data,
> + 1, /* Number of desired events */
> + 1000); /* Timeout in ms */
> + if (result < 0) {
> + /* Breakpoints signal all threads. Ignore that while
> + debugging: */
> + if (errno == EINTR)
> + continue;
> + thread_data->status = errno;
> + return 0;
> + } else if (thread_data->stop)
> + return 0;
> + else if (result == 0) /* Timeout */
> + continue;
> +
> + /* We need the mutex here because checking for the stop
> + condition and re-enabling the epoll item need to be done
> + together as one atomic operation when EPOLL_CTL_DISABLE is
> + available: */
> + item_data = (struct epoll_item_private *)event_data.data.ptr;
> + pthread_mutex_lock(&item_data->mutex);
> +
> + /* Remove the item from the epoll set if we want to stop
> + handling that event: */
> + if (item_data->stop)
> + delete_item(item_data->index);
> + else {
> + /* Clear the data that was written to the other end of
> + our non-blocking socket: */
> + do {
> + if (read(item_data->fd, &socket_data, 1) < 1) {
> + if ((errno == EAGAIN) ||
> + (errno == EWOULDBLOCK))
> + break;
> + else
> + goto error_unlock;
> + }
> + } while (item_data->events & EPOLLET);
> +
> + /* The item was one-shot, so re-enable it: */
> + event_data.events = item_data->events;
> + if (epoll_ctl(thread_data->epoll_set,
> + EPOLL_CTL_MOD,
> + item_data->fd,
> + &event_data) < 0)
> + goto error_unlock;
> + }
> +
> + pthread_mutex_unlock(&item_data->mutex);
> + }
> +
> +error_unlock:
> + thread_data->status = item_data->status = errno;
> + pthread_mutex_unlock(&item_data->mutex);
> + return 0;
> +}
> +
> +/*
> + * A pointer to a write_thread_data structure will be passed as the argument to
> + * the write thread:
> + */
> +struct write_thread_data {
> + int stop;
> + int status; /* Indicates any error encountered by the write thread. */
> + int n_fds;
> + int *fds;
> +};
> +
> +/*
> + * The function executed by the write thread. It writes a single byte to each
> + * socket in turn until the stop condition for this thread is set. If writing to
> + * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for
> + * the moment and just move on to the next socket in the list. We don't care
> + * about the order in which we deliver events to the epoll set. In fact we don't
> + * care about the data we're writing to the pipes at all; we just want to
> + * trigger epoll events:
> + */
> +void *write_thread_function(void *function_data)
> +{
> + const char data = 'X';
> + int index;
> + struct write_thread_data *thread_data =
> + (struct write_thread_data *)function_data;
> + while (!write_thread_data->stop)
> + for (index = 0;
> + !thread_data->stop && (index < thread_data->n_fds);
> + ++index)
> + if ((write(thread_data->fds[index], &data, 1) < 1) &&
> + (errno != EAGAIN) &&
> + (errno != EWOULDBLOCK)) {
> + write_thread_data->status = errno;
> + return;
> + }
> +}
> +
> +/*
> + * Arguments are currently ignored:
> + */
> +int main(int argc, char **argv)
> +{
> + const int n_read_threads = 100;
> + const int n_epoll_items = 500;
> + int index;
> + int epoll_set = epoll_create1(0);
> + struct write_thread_data write_thread_data = {
> + 0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int))
> + };
> + struct read_thread_data *read_thread_data =
> + malloc(n_read_threads * sizeof(struct read_thread_data));
> + pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t));
> + pthread_t write_thread;
> +
> + printf("-----------------\n");
> + printf("Runing test_epoll\n");
> + printf("-----------------\n");
> +
> + epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private));
> +
> + if (epoll_set < 0 || epoll_items == 0 || write_thread_data.fds == 0 ||
> + read_thread_data == 0 || read_threads == 0)
> + goto error;
> +
> + if (sysconf(_SC_NPROCESSORS_ONLN) < 2) {
> + printf("Error: please run this test on a multi-core system.\n");
> + goto error;
> + }
> +
> + /* Create the socket pairs and epoll items: */
> + for (index = 0; index < n_epoll_items; ++index) {
> + int socket_pair[2];
> + struct epoll_event event_data;
> + if (socketpair(AF_UNIX,
> + SOCK_STREAM | SOCK_NONBLOCK,
> + 0,
> + socket_pair) < 0)
> + goto error;
> + write_thread_data.fds[index] = socket_pair[0];
> + epoll_items[index].index = index;
> + epoll_items[index].fd = socket_pair[1];
> + if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0)
> + goto error;
> + /* We always use EPOLLONESHOT because this test is currently
> + structured to demonstrate the need for EPOLL_CTL_DISABLE,
> + which only produces useful information in the EPOLLONESHOT
> + case (without EPOLLONESHOT, calling epoll_ctl with
> + EPOLL_CTL_DISABLE will never return EBUSY). If support for
> + testing events without EPOLLONESHOT is desired, it should
> + probably be implemented in a separate unit test. */
> + epoll_items[index].events = EPOLLIN | EPOLLONESHOT;
> + if (index < n_epoll_items / 2)
> + epoll_items[index].events |= EPOLLET;
> + epoll_items[index].stop = 0;
> + epoll_items[index].status = 0;
> + epoll_items[index].deleted = 0;
> + event_data.events = epoll_items[index].events;
> + event_data.data.ptr = &epoll_items[index];
> + if (epoll_ctl(epoll_set,
> + EPOLL_CTL_ADD,
> + epoll_items[index].fd,
> + &event_data) < 0)
> + goto error;
> + }
> +
> + /* Create and start the read threads: */
> + for (index = 0; index < n_read_threads; ++index) {
> + read_thread_data[index].stop = 0;
> + read_thread_data[index].status = 0;
> + read_thread_data[index].epoll_set = epoll_set;
> + if (pthread_create(&read_threads[index],
> + NULL,
> + read_thread_function,
> + &read_thread_data[index]) != 0)
> + goto error;
> + }
> +
> + if (pthread_create(&write_thread,
> + NULL,
> + write_thread_function,
> + &write_thread_data) != 0)
> + goto error;
> +
> + /* Cancel all event pollers: */
> +#ifdef EPOLL_CTL_DISABLE
> + for (index = 0; index < n_epoll_items; ++index) {
> + pthread_mutex_lock(&epoll_items[index].mutex);
> + ++epoll_items[index].stop;
> + if (epoll_ctl(epoll_set,
> + EPOLL_CTL_DISABLE,
> + epoll_items[index].fd,
> + NULL) == 0)
> + delete_item(index);
> + else if (errno != EBUSY) {
> + pthread_mutex_unlock(&epoll_items[index].mutex);
> + goto error;
> + }
> + /* EBUSY means events were being handled; allow the other thread
> + to delete the item. */
> + pthread_mutex_unlock(&epoll_items[index].mutex);
> + }
> +#else
> + for (index = 0; index < n_epoll_items; ++index) {
> + pthread_mutex_lock(&epoll_items[index].mutex);
> + ++epoll_items[index].stop;
> + pthread_mutex_unlock(&epoll_items[index].mutex);
> + /* Wait in case a thread running read_thread_function is
> + currently executing code between epoll_wait and
> + pthread_mutex_lock with this item. Note that a longer delay
> + would make double-deletion less likely (at the expense of
> + performance), but there is no guarantee that any delay would
> + ever be sufficient. Note also that we delete all event
> + pollers at once for testing purposes, but in a real-world
> + environment we are likely to want to be able to cancel event
> + pollers at arbitrary times. Therefore we can't improve this
> + situation by just splitting this loop into two loops
> + (i.e. signal 'stop' for all items, sleep, and then delete all
> + items). We also can't fix the problem via EPOLL_CTL_DEL
> + because that command can't prevent the case where some other
> + thread is executing read_thread_function within the region
> + mentioned above: */
> + usleep(1);
> + pthread_mutex_lock(&epoll_items[index].mutex);
> + if (!epoll_items[index].deleted)
> + delete_item(index);
> + pthread_mutex_unlock(&epoll_items[index].mutex);
> + }
> +#endif
> +
> + /* Shut down the read threads: */
> + for (index = 0; index < n_read_threads; ++index)
> + __sync_fetch_and_add(&read_thread_data[index].stop, 1);
> + for (index = 0; index < n_read_threads; ++index) {
> + if (pthread_join(read_threads[index], NULL) != 0)
> + goto error;
> + if (read_thread_data[index].status)
> + goto error;
> + }
> +
> + /* Shut down the write thread: */
> + __sync_fetch_and_add(&write_thread_data.stop, 1);
> + if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status)
> + goto error;
> +
> + /* Check for final error conditions: */
> + for (index = 0; index < n_epoll_items; ++index) {
> + if (epoll_items[index].status != 0)
> + goto error;
> + if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0)
> + goto error;
> + }
> + for (index = 0; index < n_epoll_items; ++index)
> + if (epoll_items[index].deleted != 1) {
> + printf("Error: item data deleted %1d times.\n",
> + epoll_items[index].deleted);
> + goto error;
> + }
> +
> + printf("[PASS]\n");
> + return 0;
> +
> + error:
> + printf("[FAIL]\n");
> + return errno;
> +}
> --
> 1.7.9.5
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface"; http://man7.org/tlpi/
On Tue, 16 Oct 2012 17:12:57 +0200
"Michael Kerrisk (man-pages)" <[email protected]> wrote:
> On Thu, Aug 23, 2012 at 11:15 PM, Paton J. Lewis <[email protected]> wrote:
> > From: "Paton J. Lewis" <[email protected]>
> >
> > Enhanced epoll_ctl to support EPOLL_CTL_DISABLE, which disables an epoll item.
> > If epoll_ctl doesn't return -EBUSY in this case, it is then safe to delete the
> > epoll item in a multi-threaded environment. Also added a new test_epoll self-
> > test app to both demonstrate the need for this feature and test it.
>
> (There's a lot of background missing from this version of the patch
> that was included in the previous version
> [http://thread.gmane.org/gmane.linux.kernel/1311457]. It helps to
> include the full rationale with a revised patch--best not to assume
> that someone has the context of past mails when reading a revised
> patch.)
>
> I've taken a look at this patch as it currently stands in 3.7-rc1, and
> done a bit of testing. (By the way, the test program
> tools/testing/selftests/epoll/test_epoll.c does not compile...)
Thanks for this. You raise significant issues. If we can't get these
fully resolved over the next month or so, we should revert the patch so
this new API doesn't get released in 3.7. I have queued a patch to do
this and shall maintain it while I watch developments...
[cc Paul McKenney, who is probably the leading expert on these things]
On 10/17/2012 04:30 PM, Andrew Morton wrote:
> On Tue, 16 Oct 2012 17:12:57 +0200
> "Michael Kerrisk (man-pages)" <[email protected]> wrote:
>
>> On Thu, Aug 23, 2012 at 11:15 PM, Paton J. Lewis <[email protected]> wrote:
>>> From: "Paton J. Lewis" <[email protected]>
>>>
>>> Enhanced epoll_ctl to support EPOLL_CTL_DISABLE, which disables an epoll item.
>>> If epoll_ctl doesn't return -EBUSY in this case, it is then safe to delete the
>>> epoll item in a multi-threaded environment. Also added a new test_epoll self-
>>> test app to both demonstrate the need for this feature and test it.
>>
>> (There's a lot of background missing from this version of the patch
>> that was included in the previous version
>> [http://thread.gmane.org/gmane.linux.kernel/1311457]. It helps to
>> include the full rationale with a revised patch--best not to assume
>> that someone has the context of past mails when reading a revised
>> patch.)
>>
>> I've taken a look at this patch as it currently stands in 3.7-rc1, and
>> done a bit of testing. (By the way, the test program
>> tools/testing/selftests/epoll/test_epoll.c does not compile...)
>
> Thanks for this. You raise significant issues. If we can't get these
> fully resolved over the next month or so, we should revert the patch so
> this new API doesn't get released in 3.7. I have queued a patch to do
> this and shall maintain it while I watch developments...
I can't shake the feeling that EPOLL_CTL_DISABLE is solving a
non-problem, or, more precisely, that there should be a perfectly good
userspace solution with no kernel changes.
Unless something is rather buggy in kernel land (and I don't think it
is), once EPOLL_CTL_DEL has returned, no call to epoll_wait that starts
*after* EPOLL_CTL_DEL finishes will return that object. This suggests
an RCU-like approach: once EPOLL_CTL_DEL has returned and every thread
has returned from an epoll_wait call that started after the
EPOLL_CTL_DEL returns, then the data structure can be safely freed.
In pseudocode:
delete(fd, pdata) {
pdata->dead = true;
EPOLL_CTL_DEL(fd);
rcu_call(delete pdata);
}
wait() {
epoll_wait;
for each event pdata {
if (pdata->gone) continue;
process the event;
}
rcu_this_is_a_grace_period();
}
Of course, these are not normal grace periods and would need to be
tracked separately. (The optimal data structure to do this without
killing scalability is not obvious. urcu presumably implements such a
thing.)
Am I right?
--Andy
Il 18/10/2012 20:05, Andy Lutomirski ha scritto:
>
> Unless something is rather buggy in kernel land (and I don't think it
> is), once EPOLL_CTL_DEL has returned, no call to epoll_wait that starts
> *after* EPOLL_CTL_DEL finishes will return that object. This suggests
> an RCU-like approach: once EPOLL_CTL_DEL has returned and every thread
> has returned from an epoll_wait call that started after the
> EPOLL_CTL_DEL returns, then the data structure can be safely freed.
>
> In pseudocode:
>
> delete(fd, pdata) {
> pdata->dead = true;
> EPOLL_CTL_DEL(fd);
> rcu_call(delete pdata);
> }
>
> wait() {
> epoll_wait;
> for each event pdata {
> if (pdata->gone) continue;
> process the event;
> }
>
> rcu_this_is_a_grace_period();
> }
>
> Of course, these are not normal grace periods and would need to be
> tracked separately. (The optimal data structure to do this without
> killing scalability is not obvious. urcu presumably implements such a
> thing.)
>
> Am I right?
Equip each thread with a) an id or something else that lets each thread
refer to "the next" thread; b) a lists of "items waiting to be deleted".
Then the deleting thread adds the item being deleted to the first
thread's list. Before executing epoll_wait, thread K empties its list
and passes the buck, appending the old contents of its list to that of
thread K+1. This is an O(1) operation no matter how many items are
being deleted; only Thread N, being the last thread, actually has to go
through the list and delete the items.
The lists need to be protected by a mutex, but contention should really
be rare since there are just two writers. Note that each thread only
needs to hold one mutex at a time, and the deletion loop does not need
to happen with the mutex held at all, so there's no worries of
"cascading" waits on the mutexes.
Compared to http://thread.gmane.org/gmane.linux.kernel/1311457, you get
rid of the per-item mutex and the operations that have to be done with
the (now per-thread) mutex held remain pretty trivial.
Paolo
On 10/19/12 6:03 AM, "Paolo Bonzini" <[email protected]> wrote:
>Il 18/10/2012 20:05, Andy Lutomirski ha scritto:
>>
>> Unless something is rather buggy in kernel land (and I don't think it
>> is), once EPOLL_CTL_DEL has returned, no call to epoll_wait that starts
>> *after* EPOLL_CTL_DEL finishes will return that object. This suggests
>> an RCU-like approach: once EPOLL_CTL_DEL has returned and every thread
>> has returned from an epoll_wait call that started after the
>> EPOLL_CTL_DEL returns, then the data structure can be safely freed.
>>
>> In pseudocode:
>>
>> delete(fd, pdata) {
>> pdata->dead = true;
>> EPOLL_CTL_DEL(fd);
>> rcu_call(delete pdata);
>> }
>>
>> wait() {
>> epoll_wait;
>> for each event pdata {
>> if (pdata->gone) continue;
>> process the event;
>> }
>>
>> rcu_this_is_a_grace_period();
>> }
>>
>> Of course, these are not normal grace periods and would need to be
>> tracked separately. (The optimal data structure to do this without
>> killing scalability is not obvious. urcu presumably implements such a
>> thing.)
>>
>> Am I right?
>
>Equip each thread with a) an id or something else that lets each thread
>refer to "the next" thread; b) a lists of "items waiting to be deleted".
> Then the deleting thread adds the item being deleted to the first
>thread's list. Before executing epoll_wait, thread K empties its list
>and passes the buck, appending the old contents of its list to that of
>thread K+1. This is an O(1) operation no matter how many items are
>being deleted; only Thread N, being the last thread, actually has to go
>through the list and delete the items.
>
>The lists need to be protected by a mutex, but contention should really
>be rare since there are just two writers. Note that each thread only
>needs to hold one mutex at a time, and the deletion loop does not need
>to happen with the mutex held at all, so there's no worries of
>"cascading" waits on the mutexes.
>
>Compared to http://thread.gmane.org/gmane.linux.kernel/1311457, you get
>rid of the per-item mutex and the operations that have to be done with
>the (now per-thread) mutex held remain pretty trivial.
>
>Paolo
A disadvantage of solutions in this direction, which was not present in
Paton's patch, is that all calls to epoll_wait would need to specify some
timeout value (!= -1) to guarantee that they each come out of epoll_wait
and execute the "pass the buck" or "grace_period" logic. So you would
then have contention between designs that want highly responsive "delete"
operations (those would require very short timeout values to epoll_wait)
and those that want low execution overhead (those would want larger
timeout values).
>
Il 19/10/2012 15:29, Paul Holland ha scritto:
> A disadvantage of solutions in this direction, which was not present in
> Paton's patch, is that all calls to epoll_wait would need to specify some
> timeout value (!= -1) to guarantee that they each come out of epoll_wait
> and execute the "pass the buck" or "grace_period" logic. So you would
> then have contention between designs that want highly responsive "delete"
> operations (those would require very short timeout values to epoll_wait)
> and those that want low execution overhead (those would want larger
> timeout values).
Is this really a problem? If your thread pool risks getting oversized,
you might need some kind of timeout anyway to expire threads. If your
thread pool is busy, the timeout will never be reached.
I'm not against EPOLL_CTL_DISABLE, just couldn't resist replying to "The
optimal data structure to do this without killing scalability is not
obvious". :)
Paolo
Hello Paton,
PLEASE use a properly quoting mail client! It's very hard now for
third parties to see what I wrote versus your replies.
On Tue, Oct 23, 2012 at 1:06 AM, Paton J. Lewis <[email protected]> wrote:
>
> On 10/16/12 8:12 AM, Michael Kerrisk (man-pages) wrote:
>
> [CC += linux-api@]
>
> Thank you; is this sufficient to coordinate the required changes to the
> glibc version of epoll.h?
I'm not sure. With a bit of luck, someone at glibc might monitor that list.
> Hello Paton,
>
> On Thu, Aug 23, 2012 at 11:15 PM, Paton J. Lewis <[email protected]> wrote:
>
> From: "Paton J. Lewis" <[email protected]>
>
> Enhanced epoll_ctl to support EPOLL_CTL_DISABLE, which disables an epoll
> item.
> If epoll_ctl doesn't return -EBUSY in this case, it is then safe to delete
> the
> epoll item in a multi-threaded environment. Also added a new test_epoll
> self-
> test app to both demonstrate the need for this feature and test it.
>
> (There's a lot of background missing from this version of the patch
> that was included in the previous version
> [http://thread.gmane.org/gmane.linux.kernel/1311457]. It helps to
> include the full rationale with a revised patch--best not to assume
> that someone has the context of past mails when reading a revised
> patch.)
>
> I was trying to present the same information in a more concise manner. I was
> hoping that the test application would provide a clearer description of the
> problem and the solution. However, I can include some of my original prose
> from the v1 email in the next revision's email if you think that would be
> helpful.
Yes, it would be a good idea.
> I've taken a look at this patch as it currently stands in 3.7-rc1, and
> done a bit of testing. (By the way, the test program
> tools/testing/selftests/epoll/test_epoll.c does not compile...)
>
> Sorry, the test program compiled against commit
> ecca5c3acc0d0933d89abc44e60afb0cc8170e35 from
> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git, which I
> believe was the current head commit when I emailed the v2 patch in August. I
> thought that git's format-patch command would include the necessary
> information so people could see which commit the diff was created from.
> Should I be pulling from a different repository or working on a different
> branch? Since this is my first patch submission and I have essentially zero
> experience with git, I would appreciate any guidance you could provide here.
I'm not expert enough to provide guidance, unfortunately. I'd just
make sure that the program gets updated with any future patch
revision.
I should have added that the changes to fix the program were fairly
trivial (but they involved a misnamed variable--it looks like you did
a scan renaming a variable but didn't catch all instances, so I'm not
sure how the test program could have ever compiled in the form that
was committed), and I did get it going.
> There are one or two places where the behavior seems a little strange,
> so I have a question or two at the end of this mail. But other than
> that, I want to check my understanding so that the interface can be
> correctly documented.
>
> Just to go through my understanding, the problem is the following
> scenario in a multithreaded application:
>
> 1. Multiple threads are performing epoll_wait() operations,
> and maintaining a user-space cache that contains information
> corresponding to each file descriptor being monitored by
> epoll_wait().
>
> 2. At some point, a thread wants to delete (EPOLL_CTL_DEL)
> a file descriptor from the epoll interest list, and
> delete the corresponding record from the user-space cache.
>
> 3. The problem with (2) is that some other thread may have
> previously done an epoll_wait() that retrieved information
> about the fd in question, and may be in the middle of using
> information in the cache that relates to that fd. Thus,
> there is a potential race.
>
> 4. The race can't be solved purely in user space, because doing
> so would require applying a mutex across the epoll_wait()
> call, which would of course blow thread concurrency.
>
> Right?
>
> Yes, that is an accurate description of the problem.
Okay.
> Your solution is the EPOLL_CTL_DISABLE operation. I want to
> confirm my understanding about how to use this flag, since
> the description that has accompanied the patches so far
> has been a bit sparse
>
> 0. In the scenario you're concerned about, deleting a file
> descriptor means (safely) doing the following:
> (a) Deleting the file descriptor from the epoll interest list
> using EPOLL_CTL_DEL
> (b) Deleting the corresponding record in the user-space cache
>
> 1. It's only meaningful to use this EPOLL_CTL_DISABLE in
> conjunction with EPOLLONESHOT.
>
> 2. Using EPOLL_CTL_DISABLE without using EPOLLONESHOT in
> conjunction is a logical error.
>
> 3. The correct way to code multithreaded applications using
> EPOLL_CTL_DISABLE and EPOLLONESHOT is as follows:
>
> a. All EPOLL_CTL_ADD and EPOLL_CTL_MOD operations should
> specify EPOLLONESHOT.
>
> b. When a thread wants to delete a file descriptor, it
> should do the following:
>
> [1] Call epoll_ctl(EPOLL_CTL_DISABLE)
> [2] If the return status from epoll_ctl(EPOLL_CTL_DISABLE)
> was zero, then the file descriptor can be safely
> deleted by the thread that made this call.
> [3] If the epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY,
> then the descriptor is in use. In this case, the calling
> thread should set a flag in the user-space cache to
> indicate that the thread that is using the descriptor
> should perform the deletion operation.
>
> Is all of the above correct?
>
> Yes, that is correct.
Okay
> The implementation depends on checking on whether
> (events & ~EP_PRIVATE_BITS) == 0
> This relies on the fact that EPOLL_CTL_ADD and EPOLL_CTL_MOD always
> set EPOLLHUP and EPOLLERR in the 'events' mask, and EPOLLONESHOT
> causes those flags (as well as all others in ~EP_PRIVATE_BITS) to be
> cleared.
>
> A corollary to the previous paragraph is that using EPOLL_CTL_DISABLE
> is only useful in conjunction with EPOLLONESHOT. However, as things
> stand, one can use EPOLL_CTL_DISABLE on a file descriptor that does
> not have EPOLLONESHOT set in 'events'. This results in the following
> (slightly surprising) behavior:
>
> (a) The first call to epoll_ctl(EPOLL_CTL_DISABLE) returns 0
> (the indicator that the file descriptor can be safely deleted).
> (b) The next call to epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY.
>
> This doesn't seem particularly useful, and in fact is probably an
> indication that the user made a logic error: they should only be using
> epoll_ctl(EPOLL_CTL_DISABLE) on a file descriptor for which
> EPOLLONESHOT was set in 'events'. If that is correct, then would it
> not make sense to return an error to user space for this case?
>
> Good point. I've implemented and tested your suggested change, and I've
> updated the test application to cover that case. I will soon submit a
> revised patch.
I suppose that I have a concern that goes in the other direction. Is
there not some other solution possible that doesn't require the use of
EPOLLONESHOT? It seems overly restrictive to require that the caller
must employ this flag, and imposes the burden that the caller must
re-enable monitoring after each event.
Does a solution like the following (with no requirement for EPOLLONESHOT) work?
0. Implement an epoll_ctl() operation EPOLL_CTL_XXX
where the name XXX might be chosen based on the decision
in 4(a).
1. EPOLL_CTL_XXX employs a private flag, EPOLLUSED, in the
per-fd events mask in the ready list. By default,
that flag is off.
2. epoll_wait() always clears the EPOLLUSED flag if a
file descriptor is found to be ready.
3. If an epoll_ctl(EPOLL_CTL_XXX) discovers that the EPOLLUSED
flag is NOT set, then
a) it sets the EPOLLUSED flag
b) It disables I/O events (as per EPOLL_CTL_DISABLE)
> (I'm not 100% sure if this is necessary).
c) it returns EBUSY to the caller
4. If an epoll_ctl(EPOLL_CTL_XXX) discovers that the EPOLLUSED
flag IS set, then it
a) either deletes the fd or disables events for the fd
(the choice here is a matter of design taste, I think;
deletion has the virtue of simplicity; disabling provides
the option to re-enable the fd later, if desired)
b) returns 0 to the caller.
All of the above with suitable locking around the user-space cache.
Cheers,
Michael
[Re-sending without HTML formatting; all vger.kernel.org destination
addresses bounced my original response.]
On 10/16/12 8:12 AM, Michael Kerrisk (man-pages) wrote:
> [CC += linux-api@]
Thank you; is this sufficient to coordinate the required changes to the
glibc version of epoll.h?
> Hello Paton,
>
> On Thu, Aug 23, 2012 at 11:15 PM, Paton J. Lewis <[email protected]> wrote:
>> From: "Paton J. Lewis" <[email protected]>
>>
>> Enhanced epoll_ctl to support EPOLL_CTL_DISABLE, which disables an epoll item.
>> If epoll_ctl doesn't return -EBUSY in this case, it is then safe to delete the
>> epoll item in a multi-threaded environment. Also added a new test_epoll self-
>> test app to both demonstrate the need for this feature and test it.
>
> (There's a lot of background missing from this version of the patch
> that was included in the previous version
> [http://thread.gmane.org/gmane.linux.kernel/1311457]. It helps to
> include the full rationale with a revised patch--best not to assume
> that someone has the context of past mails when reading a revised
> patch.)
I was trying to present the same information in a more concise manner. I
was hoping that the test application would provide a clearer description
of the problem and the solution. However, I can include some of my
original prose from the v1 email in the next revision's email if you
think that would be helpful.
> I've taken a look at this patch as it currently stands in 3.7-rc1, and
> done a bit of testing. (By the way, the test program
> tools/testing/selftests/epoll/test_epoll.c does not compile...)
Sorry, the test program compiled against commit
ecca5c3acc0d0933d89abc44e60afb0cc8170e35 from
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git, which
I believe was the current head commit when I emailed the v2 patch in
August. I thought that git's format-patch command would include the
necessary information so people could see which commit the diff was
created from. Should I be pulling from a different repository or working
on a different branch? Since this is my first patch submission and I
have essentially zero experience with git, I would appreciate any
guidance you could provide here.
> There are one or two places where the behavior seems a little strange,
> so I have a question or two at the end of this mail. But other than
> that, I want to check my understanding so that the interface can be
> correctly documented.
>
> Just to go through my understanding, the problem is the following
> scenario in a multithreaded application:
>
> 1. Multiple threads are performing epoll_wait() operations,
> and maintaining a user-space cache that contains information
> corresponding to each file descriptor being monitored by
> epoll_wait().
>
> 2. At some point, a thread wants to delete (EPOLL_CTL_DEL)
> a file descriptor from the epoll interest list, and
> delete the corresponding record from the user-space cache.
>
> 3. The problem with (2) is that some other thread may have
> previously done an epoll_wait() that retrieved information
> about the fd in question, and may be in the middle of using
> information in the cache that relates to that fd. Thus,
> there is a potential race.
>
> 4. The race can't be solved purely in user space, because doing
> so would require applying a mutex across the epoll_wait()
> call, which would of course blow thread concurrency.
>
> Right?
Yes, that is an accurate description of the problem.
> Your solution is the EPOLL_CTL_DISABLE operation. I want to
> confirm my understanding about how to use this flag, since
> the description that has accompanied the patches so far
> has been a bit sparse
>
> 0. In the scenario you're concerned about, deleting a file
> descriptor means (safely) doing the following:
> (a) Deleting the file descriptor from the epoll interest list
> using EPOLL_CTL_DEL
> (b) Deleting the corresponding record in the user-space cache
>
> 1. It's only meaningful to use this EPOLL_CTL_DISABLE in
> conjunction with EPOLLONESHOT.
>
> 2. Using EPOLL_CTL_DISABLE without using EPOLLONESHOT in
> conjunction is a logical error.
>
> 3. The correct way to code multithreaded applications using
> EPOLL_CTL_DISABLE and EPOLLONESHOT is as follows:
>
> a. All EPOLL_CTL_ADD and EPOLL_CTL_MOD operations should
> specify EPOLLONESHOT.
>
> b. When a thread wants to delete a file descriptor, it
> should do the following:
>
> [1] Call epoll_ctl(EPOLL_CTL_DISABLE)
> [2] If the return status from epoll_ctl(EPOLL_CTL_DISABLE)
> was zero, then the file descriptor can be safely
> deleted by the thread that made this call.
> [3] If the epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY,
> then the descriptor is in use. In this case, the calling
> thread should set a flag in the user-space cache to
> indicate that the thread that is using the descriptor
> should perform the deletion operation.
>
> Is all of the above correct?
Yes, that is correct.
> The implementation depends on checking on whether
> (events & ~EP_PRIVATE_BITS) == 0
> This relies on the fact that EPOLL_CTL_ADD and EPOLL_CTL_MOD always
> set EPOLLHUP and EPOLLERR in the 'events' mask, and EPOLLONESHOT
> causes those flags (as well as all others in ~EP_PRIVATE_BITS) to be
> cleared.
>
> A corollary to the previous paragraph is that using EPOLL_CTL_DISABLE
> is only useful in conjunction with EPOLLONESHOT. However, as things
> stand, one can use EPOLL_CTL_DISABLE on a file descriptor that does
> not have EPOLLONESHOT set in 'events'. This results in the following
> (slightly surprising) behavior:
>
> (a) The first call to epoll_ctl(EPOLL_CTL_DISABLE) returns 0
> (the indicator that the file descriptor can be safely deleted).
> (b) The next call to epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY.
>
> This doesn't seem particularly useful, and in fact is probably an
> indication that the user made a logic error: they should only be using
> epoll_ctl(EPOLL_CTL_DISABLE) on a file descriptor for which
> EPOLLONESHOT was set in 'events'. If that is correct, then would it
> not make sense to return an error to user space for this case?
Good point. I've implemented and tested your suggested change, and I've
updated the test application to cover that case. I will soon submit a
revised patch.
Thank you,
Pat
> Cheers,
>
> Michael
>
>
>> Signed-off-by: Paton J. Lewis <[email protected]>
>> ---
>> fs/eventpoll.c | 38 ++-
>> include/linux/eventpoll.h | 1 +
>> tools/testing/selftests/Makefile | 2 +-
>> tools/testing/selftests/epoll/Makefile | 11 +
>> tools/testing/selftests/epoll/test_epoll.c | 344 ++++++++++++++++++++++++++++
>> 5 files changed, 392 insertions(+), 4 deletions(-)
>> create mode 100644 tools/testing/selftests/epoll/Makefile
>> create mode 100644 tools/testing/selftests/epoll/test_epoll.c
>>
>> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
>> index 739b098..1c47bbd 100644
>> --- a/fs/eventpoll.c
>> +++ b/fs/eventpoll.c
>> @@ -339,7 +339,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
>> /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
>> static inline int ep_op_has_event(int op)
>> {
>> - return op != EPOLL_CTL_DEL;
>> + return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
>> }
>>
>> /* Initialize the poll safe wake up structure */
>> @@ -664,6 +664,34 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
>> return 0;
>> }
>>
>> +/*
>> + * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
>> + * had no event flags set, indicating that another thread may be currently
>> + * handling that item's events (in the case that EPOLLONESHOT was being
>> + * used). Otherwise a zero result indicates that the item has been disabled
>> + * from receiving events. A disabled item may be re-enabled via
>> + * EPOLL_CTL_MOD. Must be called with "mtx" held.
>> + */
>> +static int ep_disable(struct eventpoll *ep, struct epitem *epi)
>> +{
>> + int result = 0;
>> + unsigned long flags;
>> +
>> + spin_lock_irqsave(&ep->lock, flags);
>> + if (epi->event.events & ~EP_PRIVATE_BITS) {
>> + if (ep_is_linked(&epi->rdllink))
>> + list_del_init(&epi->rdllink);
>> + /* Ensure ep_poll_callback will not add epi back onto ready
>> + list: */
>> + epi->event.events &= EP_PRIVATE_BITS;
>> + }
>> + else
>> + result = -EBUSY;
>> + spin_unlock_irqrestore(&ep->lock, flags);
>> +
>> + return result;
>> +}
>> +
>> static void ep_free(struct eventpoll *ep)
>> {
>> struct rb_node *rbp;
>> @@ -996,8 +1024,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
>> rb_insert_color(&epi->rbn, &ep->rbr);
>> }
>>
>> -
>> -
>> #define PATH_ARR_SIZE 5
>> /*
>> * These are the number paths of length 1 to 5, that we are allowing to emanate
>> @@ -1701,6 +1727,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
>> } else
>> error = -ENOENT;
>> break;
>> + case EPOLL_CTL_DISABLE:
>> + if (epi)
>> + error = ep_disable(ep, epi);
>> + else
>> + error = -ENOENT;
>> + break;
>> }
>> mutex_unlock(&ep->mtx);
>>
>> diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
>> index 657ab55..e91f7e3 100644
>> --- a/include/linux/eventpoll.h
>> +++ b/include/linux/eventpoll.h
>> @@ -25,6 +25,7 @@
>> #define EPOLL_CTL_ADD 1
>> #define EPOLL_CTL_DEL 2
>> #define EPOLL_CTL_MOD 3
>> +#define EPOLL_CTL_DISABLE 4
>>
>> /* Set the One Shot behaviour for the target file descriptor */
>> #define EPOLLONESHOT (1 << 30)
>> diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
>> index 28bc57e..4cf477f 100644
>> --- a/tools/testing/selftests/Makefile
>> +++ b/tools/testing/selftests/Makefile
>> @@ -1,4 +1,4 @@
>> -TARGETS = breakpoints vm
>> +TARGETS = breakpoints epoll vm
>>
>> all:
>> for TARGET in $(TARGETS); do \
>> diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile
>> new file mode 100644
>> index 0000000..19806ed
>> --- /dev/null
>> +++ b/tools/testing/selftests/epoll/Makefile
>> @@ -0,0 +1,11 @@
>> +# Makefile for epoll selftests
>> +
>> +all: test_epoll
>> +%: %.c
>> + gcc -pthread -g -o $@ $^
>> +
>> +run_tests: all
>> + ./test_epoll
>> +
>> +clean:
>> + $(RM) test_epoll
>> diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c
>> new file mode 100644
>> index 0000000..cbd8f8b
>> --- /dev/null
>> +++ b/tools/testing/selftests/epoll/test_epoll.c
>> @@ -0,0 +1,344 @@
>> +/*
>> + * tools/testing/selftests/epoll/test_epoll.c
>> + *
>> + * Copyright 2012 Adobe Systems Incorporated
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * Paton J. Lewis <[email protected]>
>> + *
>> + */
>> +
>> +#include <errno.h>
>> +#include <fcntl.h>
>> +#include <pthread.h>
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <unistd.h>
>> +#include <sys/epoll.h>
>> +#include <sys/socket.h>
>> +
>> +/*
>> + * A pointer to an epoll_item_private structure will be stored in the epoll
>> + * item's event structure so that we can get access to the epoll_item_private
>> + * data after calling epoll_wait:
>> + */
>> +struct epoll_item_private {
>> + int index; /* Position of this struct within the epoll_items array. */
>> + int fd;
>> + uint32_t events;
>> + pthread_mutex_t mutex; /* Guards the following variables... */
>> + int stop;
>> + int status; /* Stores any error encountered while handling item. */
>> + /* The following variable allows us to test whether we have encountered
>> + a problem while attempting to cancel and delete the associated
>> + event. When the test program exits, 'deleted' should be exactly
>> + one. If it is greater than one, then the failed test reflects a real
>> + world situation where we would have tried to access the epoll item's
>> + private data after deleting it: */
>> + int deleted;
>> +};
>> +
>> +struct epoll_item_private *epoll_items;
>> +
>> +/*
>> + * Delete the specified item from the epoll set. In a real-world scenario this
>> + * is where we would free the associated data structure, but in this testing
>> + * environment we retain the structure so that we can test for double-deletion:
>> + */
>> +void delete_item(int index)
>> +{
>> + __sync_fetch_and_add(&epoll_items[index].deleted, 1);
>> +}
>> +
>> +/*
>> + * A pointer to a read_thread_data structure will be passed as the argument to
>> + * each read thread:
>> + */
>> +struct read_thread_data {
>> + int stop;
>> + int status; /* Indicates any error encountered by the read thread. */
>> + int epoll_set;
>> +};
>> +
>> +/*
>> + * The function executed by the read threads:
>> + */
>> +void *read_thread_function(void *function_data)
>> +{
>> + struct read_thread_data *thread_data =
>> + (struct read_thread_data *)function_data;
>> + struct epoll_event event_data;
>> + struct epoll_item_private *item_data;
>> + char socket_data;
>> +
>> + /* Handle events until we encounter an error or this thread's 'stop'
>> + condition is set: */
>> + while (1) {
>> + int result = epoll_wait(thread_data->epoll_set,
>> + &event_data,
>> + 1, /* Number of desired events */
>> + 1000); /* Timeout in ms */
>> + if (result < 0) {
>> + /* Breakpoints signal all threads. Ignore that while
>> + debugging: */
>> + if (errno == EINTR)
>> + continue;
>> + thread_data->status = errno;
>> + return 0;
>> + } else if (thread_data->stop)
>> + return 0;
>> + else if (result == 0) /* Timeout */
>> + continue;
>> +
>> + /* We need the mutex here because checking for the stop
>> + condition and re-enabling the epoll item need to be done
>> + together as one atomic operation when EPOLL_CTL_DISABLE is
>> + available: */
>> + item_data = (struct epoll_item_private *)event_data.data.ptr;
>> + pthread_mutex_lock(&item_data->mutex);
>> +
>> + /* Remove the item from the epoll set if we want to stop
>> + handling that event: */
>> + if (item_data->stop)
>> + delete_item(item_data->index);
>> + else {
>> + /* Clear the data that was written to the other end of
>> + our non-blocking socket: */
>> + do {
>> + if (read(item_data->fd, &socket_data, 1) < 1) {
>> + if ((errno == EAGAIN) ||
>> + (errno == EWOULDBLOCK))
>> + break;
>> + else
>> + goto error_unlock;
>> + }
>> + } while (item_data->events & EPOLLET);
>> +
>> + /* The item was one-shot, so re-enable it: */
>> + event_data.events = item_data->events;
>> + if (epoll_ctl(thread_data->epoll_set,
>> + EPOLL_CTL_MOD,
>> + item_data->fd,
>> + &event_data) < 0)
>> + goto error_unlock;
>> + }
>> +
>> + pthread_mutex_unlock(&item_data->mutex);
>> + }
>> +
>> +error_unlock:
>> + thread_data->status = item_data->status = errno;
>> + pthread_mutex_unlock(&item_data->mutex);
>> + return 0;
>> +}
>> +
>> +/*
>> + * A pointer to a write_thread_data structure will be passed as the argument to
>> + * the write thread:
>> + */
>> +struct write_thread_data {
>> + int stop;
>> + int status; /* Indicates any error encountered by the write thread. */
>> + int n_fds;
>> + int *fds;
>> +};
>> +
>> +/*
>> + * The function executed by the write thread. It writes a single byte to each
>> + * socket in turn until the stop condition for this thread is set. If writing to
>> + * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for
>> + * the moment and just move on to the next socket in the list. We don't care
>> + * about the order in which we deliver events to the epoll set. In fact we don't
>> + * care about the data we're writing to the pipes at all; we just want to
>> + * trigger epoll events:
>> + */
>> +void *write_thread_function(void *function_data)
>> +{
>> + const char data = 'X';
>> + int index;
>> + struct write_thread_data *thread_data =
>> + (struct write_thread_data *)function_data;
>> +	while (!thread_data->stop)
>> +		for (index = 0;
>> +		     !thread_data->stop && (index < thread_data->n_fds);
>> +		     ++index)
>> +			if ((write(thread_data->fds[index], &data, 1) < 1) &&
>> +			    (errno != EAGAIN) &&
>> +			    (errno != EWOULDBLOCK)) {
>> +				thread_data->status = errno;
>> +				return 0;
>> +			}
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Arguments are currently ignored:
>> + */
>> +int main(int argc, char **argv)
>> +{
>> + const int n_read_threads = 100;
>> + const int n_epoll_items = 500;
>> + int index;
>> + int epoll_set = epoll_create1(0);
>> + struct write_thread_data write_thread_data = {
>> + 0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int))
>> + };
>> + struct read_thread_data *read_thread_data =
>> + malloc(n_read_threads * sizeof(struct read_thread_data));
>> + pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t));
>> + pthread_t write_thread;
>> +
>> +	printf("------------------\n");
>> +	printf("Running test_epoll\n");
>> +	printf("------------------\n");
>> +
>> + epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private));
>> +
>> + if (epoll_set < 0 || epoll_items == 0 || write_thread_data.fds == 0 ||
>> + read_thread_data == 0 || read_threads == 0)
>> + goto error;
>> +
>> + if (sysconf(_SC_NPROCESSORS_ONLN) < 2) {
>> + printf("Error: please run this test on a multi-core system.\n");
>> + goto error;
>> + }
>> +
>> + /* Create the socket pairs and epoll items: */
>> + for (index = 0; index < n_epoll_items; ++index) {
>> + int socket_pair[2];
>> + struct epoll_event event_data;
>> + if (socketpair(AF_UNIX,
>> + SOCK_STREAM | SOCK_NONBLOCK,
>> + 0,
>> + socket_pair) < 0)
>> + goto error;
>> + write_thread_data.fds[index] = socket_pair[0];
>> + epoll_items[index].index = index;
>> + epoll_items[index].fd = socket_pair[1];
>> + if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0)
>> + goto error;
>> + /* We always use EPOLLONESHOT because this test is currently
>> + structured to demonstrate the need for EPOLL_CTL_DISABLE,
>> + which only produces useful information in the EPOLLONESHOT
>> + case (without EPOLLONESHOT, calling epoll_ctl with
>> + EPOLL_CTL_DISABLE will never return EBUSY). If support for
>> + testing events without EPOLLONESHOT is desired, it should
>> + probably be implemented in a separate unit test. */
>> + epoll_items[index].events = EPOLLIN | EPOLLONESHOT;
>> + if (index < n_epoll_items / 2)
>> + epoll_items[index].events |= EPOLLET;
>> + epoll_items[index].stop = 0;
>> + epoll_items[index].status = 0;
>> + epoll_items[index].deleted = 0;
>> + event_data.events = epoll_items[index].events;
>> + event_data.data.ptr = &epoll_items[index];
>> + if (epoll_ctl(epoll_set,
>> + EPOLL_CTL_ADD,
>> + epoll_items[index].fd,
>> + &event_data) < 0)
>> + goto error;
>> + }
>> +
>> + /* Create and start the read threads: */
>> + for (index = 0; index < n_read_threads; ++index) {
>> + read_thread_data[index].stop = 0;
>> + read_thread_data[index].status = 0;
>> + read_thread_data[index].epoll_set = epoll_set;
>> + if (pthread_create(&read_threads[index],
>> + NULL,
>> + read_thread_function,
>> + &read_thread_data[index]) != 0)
>> + goto error;
>> + }
>> +
>> + if (pthread_create(&write_thread,
>> + NULL,
>> + write_thread_function,
>> + &write_thread_data) != 0)
>> + goto error;
>> +
>> + /* Cancel all event pollers: */
>> +#ifdef EPOLL_CTL_DISABLE
>> + for (index = 0; index < n_epoll_items; ++index) {
>> + pthread_mutex_lock(&epoll_items[index].mutex);
>> + ++epoll_items[index].stop;
>> + if (epoll_ctl(epoll_set,
>> + EPOLL_CTL_DISABLE,
>> + epoll_items[index].fd,
>> + NULL) == 0)
>> + delete_item(index);
>> + else if (errno != EBUSY) {
>> + pthread_mutex_unlock(&epoll_items[index].mutex);
>> + goto error;
>> + }
>> + /* EBUSY means events were being handled; allow the other thread
>> + to delete the item. */
>> + pthread_mutex_unlock(&epoll_items[index].mutex);
>> + }
>> +#else
>> + for (index = 0; index < n_epoll_items; ++index) {
>> + pthread_mutex_lock(&epoll_items[index].mutex);
>> + ++epoll_items[index].stop;
>> + pthread_mutex_unlock(&epoll_items[index].mutex);
>> + /* Wait in case a thread running read_thread_function is
>> + currently executing code between epoll_wait and
>> + pthread_mutex_lock with this item. Note that a longer delay
>> + would make double-deletion less likely (at the expense of
>> + performance), but there is no guarantee that any delay would
>> + ever be sufficient. Note also that we delete all event
>> + pollers at once for testing purposes, but in a real-world
>> + environment we are likely to want to be able to cancel event
>> + pollers at arbitrary times. Therefore we can't improve this
>> + situation by just splitting this loop into two loops
>> + (i.e. signal 'stop' for all items, sleep, and then delete all
>> + items). We also can't fix the problem via EPOLL_CTL_DEL
>> + because that command can't prevent the case where some other
>> + thread is executing read_thread_function within the region
>> + mentioned above: */
>> + usleep(1);
>> + pthread_mutex_lock(&epoll_items[index].mutex);
>> + if (!epoll_items[index].deleted)
>> + delete_item(index);
>> + pthread_mutex_unlock(&epoll_items[index].mutex);
>> + }
>> +#endif
>> +
>> + /* Shut down the read threads: */
>> + for (index = 0; index < n_read_threads; ++index)
>> + __sync_fetch_and_add(&read_thread_data[index].stop, 1);
>> + for (index = 0; index < n_read_threads; ++index) {
>> + if (pthread_join(read_threads[index], NULL) != 0)
>> + goto error;
>> + if (read_thread_data[index].status)
>> + goto error;
>> + }
>> +
>> + /* Shut down the write thread: */
>> + __sync_fetch_and_add(&write_thread_data.stop, 1);
>> + if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status)
>> + goto error;
>> +
>> + /* Check for final error conditions: */
>> + for (index = 0; index < n_epoll_items; ++index) {
>> + if (epoll_items[index].status != 0)
>> + goto error;
>> + if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0)
>> + goto error;
>> + }
>> + for (index = 0; index < n_epoll_items; ++index)
>> + if (epoll_items[index].deleted != 1) {
>> + printf("Error: item data deleted %1d times.\n",
>> + epoll_items[index].deleted);
>> + goto error;
>> + }
>> +
>> + printf("[PASS]\n");
>> + return 0;
>> +
>> + error:
>> + printf("[FAIL]\n");
>> + return errno;
>> +}
>> --
>> 1.7.9.5
>
> --
> Michael Kerrisk
> Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> Author of "The Linux Programming Interface"; http://man7.org/tlpi/
> .
>
On 10/23/2012 07:23 PM, Paton J. Lewis wrote:
> [Re-sending without HTML formatting; all vger.kernel.org destination
> addresses bounced my original response.]
>
> On 10/16/12 8:12 AM, Michael Kerrisk (man-pages) wrote:
>> [CC += linux-api@]
>
> Thank you; is this sufficient to coordinate the required changes to the
> glibc version of epoll.h?
Paton, we normally review the diffs between kernel versions but noticing
us via libc-alpha is great.
So, you ask to get this added to <sys/epoll.h>?
#define EPOLL_CTL_DISABLE 4
Once 3.7 is out and contains it, we will add it. A friendly reminder
once the patch is in would be nice so that we don't miss it during the
review.
thanks,
Andreas
--
Andreas Jaeger aj@{suse.com,opensuse.org} Twitter/Identica: jaegerandi
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn,Jennifer Guild,Felix Imendörffer,HRB16746 (AG Nürnberg)
GPG fingerprint = 93A3 365E CE47 B889 DF7F FED1 389A 563C C272 A126
On 10/23/12 6:26 AM, Michael Kerrisk (man-pages) wrote:
> On 10/23/12 10:23 AM, Paton J. Lewis wrote:
>> [Re-sending without HTML formatting; all vger.kernel.org destination
>> addresses bounced my original response.]
>>
>> On 10/16/12 8:12 AM, Michael Kerrisk (man-pages) wrote:
>>> [CC += linux-api@]
>>
>> Thank you; is this sufficient to coordinate the required changes to the
>> glibc version of epoll.h?
>
> I'm not sure. With a bit of luck, someone at glibc might monitor that list.
>
>>> Hello Paton,
>>>
>>> On Thu, Aug 23, 2012 at 11:15 PM, Paton J. Lewis <[email protected]>
>>> wrote:
>>>> From: "Paton J. Lewis" <[email protected]>
>>>>
>>>> Enhanced epoll_ctl to support EPOLL_CTL_DISABLE, which disables an
>>>> epoll item.
>>>> If epoll_ctl doesn't return -EBUSY in this case, it is then safe to
>>>> delete the
>>>> epoll item in a multi-threaded environment. Also added a new
>>>> test_epoll self-
>>>> test app to both demonstrate the need for this feature and test it.
>>>
>>> (There's a lot of background missing from this version of the patch
>>> that was included in the previous version
>>> [http://thread.gmane.org/gmane.linux.kernel/1311457]. It helps to
>>> include the full rationale with a revised patch--best not to assume
>>> that someone has the context of past mails when reading a revised
>>> patch.)
>>
>> I was trying to present the same information in a more concise manner. I
>> was hoping that the test application would provide a clearer description
>> of the problem and the solution. However, I can include some of my
>> original prose from the v1 email in the next revision's email if you
>> think that would be helpful.
>
> Yes, it would be a good idea.
>
>>> I've taken a look at this patch as it currently stands in 3.7-rc1, and
>>> done a bit of testing. (By the way, the test program
>>> tools/testing/selftests/epoll/test_epoll.c does not compile...)
>>
>> Sorry, the test program compiled against commit
>> ecca5c3acc0d0933d89abc44e60afb0cc8170e35 from
>> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git, which
>> I believe was the current head commit when I emailed the v2 patch in
>> August. I thought that git's format-patch command would include the
>> necessary information so people could see which commit the diff was
>> created from. Should I be pulling from a different repository or working
>> on a different branch? Since this is my first patch submission and I
>> have essentially zero experience with git, I would appreciate any
>> guidance you could provide here.
>
> I'm not expert enough to provide guidance, unfortunately. I'd just
> make sure that the program gets updated with any future patch
> revision.
>
> I should have added that the changes to fix the program were fairly
> trivial (but they involved a misnamed variable--it looks like you did
> a scan renaming a variable but didn't catch all instances, so I'm not
> sure how the test program could have ever compiled in the form that
> was committed), and I did get it going.
Sorry, that error arose from my lack of experience with git and its
format-patch command. I created the patch without first committing the
final fix to my branch.
>>> There are one or two places where the behavior seems a little strange,
>>> so I have a question or two at the end of this mail. But other than
>>> that, I want to check my understanding so that the interface can be
>>> correctly documented.
>>>
>>> Just to go though my understanding, the problem is the following
>>> scenario in a multithreaded application:
>>>
>>> 1. Multiple threads are performing epoll_wait() operations,
>>> and maintaining a user-space cache that contains information
>>> corresponding to each file descriptor being monitored by
>>> epoll_wait().
>>>
>>> 2. At some point, a thread wants to delete (EPOLL_CTL_DEL)
>>> a file descriptor from the epoll interest list, and
>>> delete the corresponding record from the user-space cache.
>>>
>>> 3. The problem with (2) is that some other thread may have
>>> previously done an epoll_wait() that retrieved information
>>> about the fd in question, and may be in the middle of using
>>> information in the cache that relates to that fd. Thus,
>>> there is a potential race.
>>>
>>> 4. The race can't solved purely in user space, because doing
>>> so would require applying a mutex across the epoll_wait()
>>> call, which would of course blow thread concurrency.
>>>
>>> Right?
>>
>> Yes, that is an accurate description of the problem.
>
> Okay.
>
>>> Your solution is the EPOLL_CTL_DISABLE operation. I want to
>>> confirm my understanding about how to use this flag, since
>>> the description that has accompanied the patches so far
>>> has been a bit sparse
>>>
>>> 0. In the scenario you're concerned about, deleting a file
>>> descriptor means (safely) doing the following:
>>> (a) Deleting the file descriptor from the epoll interest list
>>> using EPOLL_CTL_DEL
>>> (b) Deleting the corresponding record in the user-space cache
>>>
>>> 1. It's only meaningful to use this EPOLL_CTL_DISABLE in
>>> conjunction with EPOLLONESHOT.
>>>
>>> 2. Using EPOLL_CTL_DISABLE without using EPOLLONESHOT in
>>> conjunction is a logical error.
>>>
>>> 3. The correct way to code multithreaded applications using
>>> EPOLL_CTL_DISABLE and EPOLLONESHOT is as follows:
>>>
>>> a. All EPOLL_CTL_ADD and EPOLL_CTL_MOD operations should
>>> specify EPOLLONESHOT.
>>>
>>> b. When a thread wants to delete a file descriptor, it
>>> should do the following:
>>>
>>> [1] Call epoll_ctl(EPOLL_CTL_DISABLE)
>>> [2] If the return status from epoll_ctl(EPOLL_CTL_DISABLE)
>>> was zero, then the file descriptor can be safely
>>> deleted by the thread that made this call.
>>> [3] If the epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY,
>>> then the descriptor is in use. In this case, the calling
>>> thread should set a flag in the user-space cache to
>>> indicate that the thread that is using the descriptor
>>> should perform the deletion operation.
>>>
>>> Is all of the above correct?
>>
>> Yes, that is correct.
>
> Okay
>
>>
>>> The implementation depends on checking on whether
>>> (events & ~EP_PRIVATE_BITS) == 0
>>> This relies on the fact that EPOLL_CTL_ADD and EPOLL_CTL_MOD always
>>> set EPOLLHUP and EPOLLERR in the 'events' mask, and EPOLLONESHOT
>>> causes those flags (as well as all others in ~EP_PRIVATE_BITS) to be
>>> cleared.
>>>
>>> A corollary to the previous paragraph is that using EPOLL_CTL_DISABLE
>>> is only useful in conjunction with EPOLLONESHOT. However, as things
>>> stand, one can use EPOLL_CTL_DISABLE on a file descriptor that does
>>> not have EPOLLONESHOT set in 'events' This results in the following
>>> (slightly surprising) behavior:
>>>
>>> (a) The first call to epoll_ctl(EPOLL_CTL_DISABLE) returns 0
>>> (the indicator that the file descriptor can be safely deleted).
>>> (b) The next call to epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY.
>>>
>>> This doesn't seem particularly useful, and in fact is probably an
>>> indication that the user made a logic error: they should only be using
>>> epoll_ctl(EPOLL_CTL_DISABLE) on a file descriptor for which
>>> EPOLLONESHOT was set in 'events'. If that is correct, then would it
>>> not make sense to return an error to user space for this case?
>>
>> Good point. I've implemented and tested your suggested change, and I've
>> updated the test application to cover that case. I will soon submit a
>> revised patch.
>
> I suppose that I have a concern that goes in the other direction. Is
> there not some other solution possible that doesn't require the use of
> EPOLLONESHOT? It seems overly restrictive to require that the caller
> must employ this flag, and imposes the burden that the caller must
> re-enable monitoring after each event.
>
> Does a solution like the following (with no requirement for EPOLLONESHOT) work?
>
> 0. Implement an epoll_ctl() operation EPOLL_CTL_XXX
> where the name XXX might be chosen based on the decision
> in 4(a).
> 1. EPOLL_CTL_XXX employs a private flag, EPOLLUSED, in the
> per-fd events mask in the ready list. By default,
> that flag is off.
> 2. epoll_wait() always clears the EPOLLUSED flag if a
> file descriptor is found to be ready.
> 3. If an epoll_ctl(EPOLL_CTL_XXX) discovers that the EPOLLUSED
> flag is NOT set, then
> a) it sets the EPOLLUSED flag
> b) It disables I/O events (as per EPOLL_CTL_DISABLE)
> (I'm not 100% sure if this is necessary).
> c) it returns EBUSY to the caller
> 4. If an epoll_ctl(EPOLL_CTL_XXX) discovers that the EPOLLUSED
> flag IS set, then it
> a) either deletes the fd or disables events for the fd
> (the choice here is a matter of design taste, I think;
> deletion has the virtue of simplicity; disabling provides
> the option to re-enable the fd later, if desired)
> b) returns 0 to the caller.
>
> All of the above with suitable locking around the user-space cache.
>
> Cheers,
>
> Michael
I don't believe that proposal will solve the problem. Consider the case
where a worker thread has just executed epoll_wait and is about to
execute the next line of code (which will access the data associated
with the fd receiving the event). If the deletion thread manages to call
epoll_ctl(EPOLL_CTL_XXX) for that fd twice in a row before the worker
thread is able to execute the next statement, then the deletion thread
will mistakenly conclude that it is safe to destroy the data that the
worker thread is about to access.
Pat
Hi Pat,
>> I suppose that I have a concern that goes in the other direction. Is
>> there not some other solution possible that doesn't require the use of
>> EPOLLONESHOT? It seems overly restrictive to require that the caller
>> must employ this flag, and imposes the burden that the caller must
>> re-enable monitoring after each event.
>>
>> Does a solution like the following (with no requirement for EPOLLONESHOT)
>> work?
>>
>> 0. Implement an epoll_ctl() operation EPOLL_CTL_XXX
>> where the name XXX might be chosen based on the decision
>> in 4(a).
>> 1. EPOLL_CTL_XXX employs a private flag, EPOLLUSED, in the
>> per-fd events mask in the ready list. By default,
>> that flag is off.
>> 2. epoll_wait() always clears the EPOLLUSED flag if a
>> file descriptor is found to be ready.
>> 3. If an epoll_ctl(EPOLL_CTL_XXX) discovers that the EPOLLUSED
>> flag is NOT set, then
>> a) it sets the EPOLLUSED flag
>> b) It disables I/O events (as per EPOLL_CTL_DISABLE)
>> (I'm not 100% sure if this is necessary).
>> c) it returns EBUSY to the caller
>> 4. If an epoll_ctl(EPOLL_CTL_XXX) discovers that the EPOLLUSED
>> flag IS set, then it
>> a) either deletes the fd or disables events for the fd
>> (the choice here is a matter of design taste, I think;
>> deletion has the virtue of simplicity; disabling provides
>> the option to re-enable the fd later, if desired)
>> b) returns 0 to the caller.
>>
>> All of the above with suitable locking around the user-space cache.
>>
>> Cheers,
>>
>> Michael
>
>
> I don't believe that proposal will solve the problem. Consider the case
> where a worker thread has just executed epoll_wait and is about to execute
> the next line of code (which will access the data associated with the fd
> receiving the event). If the deletion thread manages to call
> epoll_ctl(EPOLL_CTL_XXX) for that fd twice in a row before the worker thread
> is able to execute the next statement, then the deletion thread will
> mistakenly conclude that it is safe to destroy the data that the worker
> thread is about to access.
Okay -- I had the idea there might be a hole in my proposal ;-).
By the way, have you been reading the comments in the two LWN articles
on EPOLL_CTL_DISABLE?
https://lwn.net/Articles/520012/
http://lwn.net/SubscriberLink/520198/fd81ba0ecb1858a2/
There's some interesting proposals there--some suggesting that an
entirely user-space solution might be possible. I haven't looked
deeply into the ideas though.
Cheers,
Michael
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface"; http://man7.org/tlpi/
On 10/25/12 3:23 AM, Michael Kerrisk (man-pages) wrote:
> Hi Pat,
>
>
>>> I suppose that I have a concern that goes in the other direction. Is
>>> there not some other solution possible that doesn't require the use of
>>> EPOLLONESHOT? It seems overly restrictive to require that the caller
>>> must employ this flag, and imposes the burden that the caller must
>>> re-enable monitoring after each event.
>>>
>>> Does a solution like the following (with no requirement for EPOLLONESHOT)
>>> work?
>>>
>>> 0. Implement an epoll_ctl() operation EPOLL_CTL_XXX
>>> where the name XXX might be chosen based on the decision
>>> in 4(a).
>>> 1. EPOLL_CTL_XXX employs a private flag, EPOLLUSED, in the
>>> per-fd events mask in the ready list. By default,
>>> that flag is off.
>>> 2. epoll_wait() always clears the EPOLLUSED flag if a
>>> file descriptor is found to be ready.
>>> 3. If an epoll_ctl(EPOLL_CTL_XXX) discovers that the EPOLLUSED
>>> flag is NOT set, then
>>> a) it sets the EPOLLUSED flag
>>> b) It disables I/O events (as per EPOLL_CTL_DISABLE)
>>> (I'm not 100% sure if this is necessary).
>>> c) it returns EBUSY to the caller
>>> 4. If an epoll_ctl(EPOLL_CTL_XXX) discovers that the EPOLLUSED
>>> flag IS set, then it
>>> a) either deletes the fd or disables events for the fd
>>> (the choice here is a matter of design taste, I think;
>>> deletion has the virtue of simplicity; disabling provides
>>> the option to re-enable the fd later, if desired)
>>> b) returns 0 to the caller.
>>>
>>> All of the above with suitable locking around the user-space cache.
>>>
>>> Cheers,
>>>
>>> Michael
>>
>>
>> I don't believe that proposal will solve the problem. Consider the case
>> where a worker thread has just executed epoll_wait and is about to execute
>> the next line of code (which will access the data associated with the fd
>> receiving the event). If the deletion thread manages to call
>> epoll_ctl(EPOLL_CTL_XXX) for that fd twice in a row before the worker thread
>> is able to execute the next statement, then the deletion thread will
>> mistakenly conclude that it is safe to destroy the data that the worker
>> thread is about to access.
>
> Okay -- I had the idea there might be a hole in my proposal ;-).
>
> By the way, have you been reading the comments in the two LWN articles
> on EPOLL_CTL_DISABLE?
> https://lwn.net/Articles/520012/
> http://lwn.net/SubscriberLink/520198/fd81ba0ecb1858a2/
>
> There's some interesting proposals there--some suggesting that an
> entirely user-space solution might be possible. I haven't looked
> deeply into the ideas though.
Yes, thanks, I read through the article and comments. I believe all of
the objections raised there were either addressed by responses there, or
they were also voiced here on the kernel.org mailing lists and addressed
by either my or Paul Holland's responses here. If there is another
objection to the original proposal that you feel I have overlooked or
which has not been properly addressed, please let me know.
Pat
On 10/23/12 12:15 PM, Andreas Jaeger wrote:
> On 10/23/2012 07:23 PM, Paton J. Lewis wrote:
>> [Re-sending without HTML formatting; all vger.kernel.org destination
>> addresses bounced my original response.]
>>
>> On 10/16/12 8:12 AM, Michael Kerrisk (man-pages) wrote:
>>> [CC += linux-api@]
>>
>> Thank you; is this sufficient to coordinate the required changes to the
>> glibc version of epoll.h?
>
> Paton, we normally review the diffs between kernel versions but noticing
> us via libc-alpha is great.
>
> So, you ask to get this added to <sys/epoll.h>?
>
> #define EPOLL_CTL_DISABLE 4
Yes, that's correct (assuming that v3 the patch proposal is approved).
> Once 3.7 is out and contains it, we will add it. A friendly reminder
> once the patch is in would be nice so that we don't miss it during the
> review.
>
> thanks,
> Andreas
>
Thank you,
Pat
On Thu, Oct 25, 2012 at 12:23:24PM +0200, Michael Kerrisk (man-pages) wrote:
> Hi Pat,
>
>
> >> I suppose that I have a concern that goes in the other direction. Is
> >> there not some other solution possible that doesn't require the use of
> >> EPOLLONESHOT? It seems overly restrictive to require that the caller
> >> must employ this flag, and imposes the burden that the caller must
> >> re-enable monitoring after each event.
> >>
> >> Does a solution like the following (with no requirement for EPOLLONESHOT)
> >> work?
> >>
> >> 0. Implement an epoll_ctl() operation EPOLL_CTL_XXX
> >> where the name XXX might be chosen based on the decision
> >> in 4(a).
> >> 1. EPOLL_CTL_XXX employs a private flag, EPOLLUSED, in the
> >> per-fd events mask in the ready list. By default,
> >> that flag is off.
> >> 2. epoll_wait() always clears the EPOLLUSED flag if a
> >> file descriptor is found to be ready.
> >> 3. If an epoll_ctl(EPOLL_CTL_XXX) discovers that the EPOLLUSED
> >> flag is NOT set, then
> >> a) it sets the EPOLLUSED flag
> >> b) It disables I/O events (as per EPOLL_CTL_DISABLE)
> >> (I'm not 100% sure if this is necessary).
> >> c) it returns EBUSY to the caller
> >> 4. If an epoll_ctl(EPOLL_CTL_XXX) discovers that the EPOLLUSED
> >> flag IS set, then it
> >> a) either deletes the fd or disables events for the fd
> >> (the choice here is a matter of design taste, I think;
> >> deletion has the virtue of simplicity; disabling provides
> >> the option to re-enable the fd later, if desired)
> >> b) returns 0 to the caller.
> >>
> >> All of the above with suitable locking around the user-space cache.
> >>
> >> Cheers,
> >>
> >> Michael
> >
> >
> > I don't believe that proposal will solve the problem. Consider the case
> > where a worker thread has just executed epoll_wait and is about to execute
> > the next line of code (which will access the data associated with the fd
> > receiving the event). If the deletion thread manages to call
> > epoll_ctl(EPOLL_CTL_XXX) for that fd twice in a row before the worker thread
> > is able to execute the next statement, then the deletion thread will
> > mistakenly conclude that it is safe to destroy the data that the worker
> > thread is about to access.
>
> Okay -- I had the idea there might be a hole in my proposal ;-).
>
> By the way, have you been reading the comments in the two LWN articles
> on EPOLL_CTL_DISABLE?
> https://lwn.net/Articles/520012/
> http://lwn.net/SubscriberLink/520198/fd81ba0ecb1858a2/
>
> There's some interesting proposals there--some suggesting that an
> entirely user-space solution might be possible. I haven't looked
> deeply into the ideas though.
Yeah, I became quite interested so I wrote a crude epoll + urcu test.
Since it's RCU, a review to ensure I've not made any serious mistakes could
be quite helpful:
#define _LGPL_SOURCE 1
#define _GNU_SOURCE 1
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <errno.h>
#include <fcntl.h>
#include <time.h>
#include <sys/epoll.h>
/*
* Locking Voodoo:
*
 * The globals prefixed by _ require special care because they will be
 * accessed from multiple threads.
 *
 * The precise locking scheme we use varies depending on whether
 * READERS_USE_MUTEX is defined.
* When we're using userspace RCU the mutex only gets acquired for writes
* to _-prefixed globals. Reads are done inside RCU read side critical
* sections.
* Otherwise the epmutex covers reads and writes to them all and the test
* is not very scalable.
*/
/* Writer-side lock; also the reader-side lock when READERS_USE_MUTEX is
 * defined (see the locking note in the file header). */
static pthread_mutex_t epmutex = PTHREAD_MUTEX_INITIALIZER;
static int _p[2]; /* Send dummy data from one thread to another */
static int _epfd; /* Threads wait to read/write on epfd */
static int _nepitems = 0; /* Number of items currently in the epoll set */

#ifdef READERS_USE_MUTEX
/* Baseline configuration: readers serialize on the single global mutex,
 * so the read-side primitives map directly onto pthread mutex calls and
 * the per-thread RCU hooks are no-ops. */
#define init_lock() do {} while(0)
#define init_thread() do {} while(0)
#define read_lock pthread_mutex_lock
#define read_unlock pthread_mutex_unlock
#define fini_thread() do {} while(0)
/* Because readers use the mutex synchronize_rcu() is a no-op */
#define synchronize_rcu() do {} while(0)
#else
/* Userspace-RCU configuration: readers enter RCU read-side critical
 * sections instead of taking the mutex (the mutex argument is ignored),
 * and each thread must register/unregister itself with liburcu. */
#include <urcu.h>
#define init_lock rcu_init
#define init_thread rcu_register_thread
#define read_lock(m) rcu_read_lock()
#define read_unlock(m) rcu_read_unlock()
#define fini_thread() do { rcu_unregister_thread(); } while(0)
#endif

/* Writers take the global mutex in both configurations. */
#define write_lock pthread_mutex_lock
#define write_unlock pthread_mutex_unlock

/* We send this data through the pipe. */
static const char *data = "test";
const size_t dlen = 5; /* four data bytes plus the terminating NUL */
/*
 * Return non-zero when errno holds a transient condition that a polling
 * loop should simply retry rather than treat as a real failure.
 */
static inline int harmless_errno(void)
{
	const int e = errno;

	return e == EWOULDBLOCK || e == EAGAIN || e == EINTR;
}
/*
 * Worker thread body: repeatedly epoll_wait() on the shared epoll set and
 * service whichever pipe end became ready. Each epoll_wait()+read()/write()
 * pair runs inside one read-side critical section so that main() cannot
 * close the fds out from under us (it only closes them after removing the
 * epoll items under the write lock and then calling synchronize_rcu()).
 * Returns the number of completed iterations, smuggled through the
 * pthread exit value.
 */
static void* thread_main(void *thread_nr)
{
	struct epoll_event ev;
	int rc = 0;
	char buffer[dlen];
	/* Local despite the _ prefix; published only via the return value. */
	unsigned long long _niterations = 0;

	init_thread();
	while (!rc) {
		read_lock(&epmutex);
		if (_nepitems < 1) {
			/* Interest set has been emptied -- time to quit. */
			read_unlock(&epmutex);
			break;
		}
		/* 1ms timeout so we periodically re-check _nepitems */
		rc = epoll_wait(_epfd, &ev, 1, 1);
		if (rc < 1) {
			read_unlock(&epmutex);
			if (rc == 0) /* timeout: just retry */
				continue;
			if (harmless_errno()) {
				rc = 0;
				continue;
			}
			break; /* real error: leave loop with rc < 0 */
		}
		if (ev.events & EPOLLOUT) {
			/*
			 * Note the ordering: the fd access happens before
			 * read_unlock(), which is what keeps it valid.
			 */
			rc = write(_p[1], data, dlen);
			read_unlock(&epmutex);
			if (rc < 0) {
				if (harmless_errno()) {
					rc = 0;
					continue;
				}
				break;
			}
			rc = 0; /* write() returned a byte count; clear for loop test */
		} else if (ev.events & EPOLLIN) {
			rc = read(_p[0], buffer, dlen);
			read_unlock(&epmutex);
			if (rc < 0) {
				if (harmless_errno()) {
					rc = 0;
					continue;
				}
				break;
			}
			rc = 0; /* likewise: discard the byte count */
		} else
			/* e.g. EPOLLERR/EPOLLHUP: nothing to do this round */
			read_unlock(&epmutex);
		_niterations++;
	}
	fini_thread();
	/*
	 * NOTE(review): casting the counter to void * truncates on targets
	 * where pointers are narrower than unsigned long long (32-bit);
	 * acceptable for this throwaway test.
	 */
	return (void *)_niterations;
}
/* Some sample numbers from varying MAX_THREADS on my laptop:
 * With a global mutex:
 * 1 core for the main thread
 * 1 core for epoll_wait()'ing threads
 * The mutex doesn't scale -- increasing the number of threads despite
 * having more real cores just causes performance to go down.
 * 7 threads, 213432.128160 iterations per second
 * 3 threads, 606560.183997 iterations per second
 * 2 threads, 1346006.413404 iterations per second
 * 1 thread , 2148936.348793 iterations per second
 *
 * With URCU:
 * 1 core for the main thread which spins reading niterations.
 * N-1 cores for the epoll_wait()'ing threads.
 * "Hyperthreading" doesn't help here -- I've got 4 cores:
 * 7 threads, 1537304.965009 iterations per second
 * 4 threads, 1912846.753203 iterations per second
 * 3 threads, 2278639.336464 iterations per second
 * 2 threads, 1928805.899146 iterations per second
 * 1 thread , 2007198.066327 iterations per second
 */
#define MAX_THREADS 3 /* Number of epoll_wait()'ing worker threads to spawn */
/*
 * Driver: build a non-blocking pipe watched by one epoll set, let
 * MAX_THREADS workers ping-pong data through it for 30 seconds, then tear
 * the epoll items down under the write lock, synchronize_rcu(), close the
 * fds and report the aggregate iteration rate.
 *
 * Fixes vs. the original posting:
 * - req.tv_nsec was never initialized (UB read; nanosleep() could fail
 *   with EINVAL or sleep a garbage nanosecond count).
 * - pthread_create()/pthread_join() return positive error numbers, so the
 *   old "rc < 0" checks could never detect a failure.
 * - pthread_join() was handed (void *)&ull, type-punning a void * store
 *   into an unsigned long long object; use a real void * out-param.
 */
int main (int argc, char **argv)
{
	struct timespec before, req, after;
	unsigned long long niterations = 0;
	pthread_t threads[MAX_THREADS];
	struct epoll_event ev;
	int nthreads = 0, rc;

	init_lock();

	/* Since we haven't made the threads yet we can safely use _ globals */
	rc = pipe2(_p, O_NONBLOCK);
	if (rc < 0)
		goto error;

	_epfd = epoll_create1(EPOLL_CLOEXEC);
	if (_epfd < 0)
		goto error;

	/* Monitor both pipe ends via epoll */
	ev.events = EPOLLIN;
	ev.data.u32 = 0; /* index in _p[] */
	rc = epoll_ctl(_epfd, EPOLL_CTL_ADD, _p[0], &ev);
	if (rc < 0)
		goto error;
	_nepitems++;
	printf("Added fd %d to epoll set %d\n", _p[0], _epfd);
	ev.events = EPOLLOUT;
	ev.data.u32 = 1;
	rc = epoll_ctl(_epfd, EPOLL_CTL_ADD, _p[1], &ev);
	if (rc < 0)
		goto error;
	_nepitems++;
	printf("Added fd %d to epoll set %d\n", _p[1], _epfd);
	fflush(stdout);

	/*
	 * After the first pthread_create() we can't safely use _ globals
	 * without adhering to the locking scheme. pthread_create() should
	 * also imply some thorough memory barriers so all our previous
	 * modifications to the _ globals should be visible after this point.
	 */
	for (; nthreads < MAX_THREADS; nthreads++) {
		rc = pthread_create(&threads[nthreads], NULL, &thread_main,
				    (void *)(long)nthreads);
		if (rc) /* pthreads return positive error numbers */
			goto error;
	}

	/* Wait for our child threads to do some "work" */
	req.tv_sec = 30;
	req.tv_nsec = 0; /* must be initialized: nanosleep() reads both fields */
	rc = clock_gettime(CLOCK_MONOTONIC_RAW, &before);
	rc = nanosleep(&req, NULL);
	rc = clock_gettime(CLOCK_MONOTONIC_RAW, &after);

	/*
	 * Modify the epoll interest set. This can leave stale
	 * data in other threads because they may have done an
	 * epoll_wait() with RCU read lock held instead of the
	 * epmutex.
	 */
	write_lock(&epmutex);
	rc = epoll_ctl(_epfd, EPOLL_CTL_DEL, _p[0], &ev);
	if (rc == 0) {
		_nepitems--;
		printf("Removed fd %d from epoll set %d\n", _p[0], _epfd);
		rc = epoll_ctl(_epfd, EPOLL_CTL_DEL, _p[1], &ev);
		if (rc == 0) {
			printf("Removed fd %d from epoll set %d\n", _p[1], _epfd);
			_nepitems--;
		}
	}
	write_unlock(&epmutex);
	if (rc < 0)
		goto error;

	/*
	 * Wait until the stale data are no longer in use.
	 * We could use call_rcu() here too, but let's keep the test simple.
	 */
	printf("synchronize_rcu()\n");
	fflush(stdout);
	synchronize_rcu();

	printf("closing fds\n");
	fflush(stdout);

	/* Clean up the stale data */
	close(_p[0]);
	close(_p[1]);
	close(_epfd);

	printf("closed fds (%d, %d, %d)\n", _p[0], _p[1], _epfd);
	fflush(stdout);

	/*
	 * Test is done. Join all the threads so that we give time for
	 * races to show up.
	 */
	niterations = 0;
	for (; nthreads > 0; nthreads--) {
		void *thread_result;

		rc = pthread_join(threads[nthreads - 1], &thread_result);
		if (rc) /* positive error number on failure */
			goto error;
		/* thread_main() packed its iteration count into the pointer */
		niterations += (unsigned long long)(uintptr_t)thread_result;
	}

	/* Elapsed time = after - before, normalizing the nanosecond field */
	after.tv_sec -= before.tv_sec;
	after.tv_nsec -= before.tv_nsec;
	if (after.tv_nsec < 0) {
		--after.tv_sec;
		after.tv_nsec += 1000000000;
	}
	printf("%f iterations per second\n", (double)niterations/((double)after.tv_sec + (double)after.tv_nsec/1000000000.0));
	exit(EXIT_SUCCESS);
error:
	/* This is trashy testcase code -- it doesn't do full cleanup! */
	for (; nthreads > 0; nthreads--)
		rc = pthread_cancel(threads[nthreads - 1]);
	exit(EXIT_FAILURE);
}
On 10/27/2012 05:52 AM, Matt Helsley wrote:
> On Thu, Oct 25, 2012 at 12:23:24PM +0200, Michael Kerrisk (man-pages) wrote:
>> Hi Pat,
>>
>>
>>>> I suppose that I have a concern that goes in the other direction. Is
>>>> there not some other solution possible that doesn't require the use of
>>>> EPOLLONESHOT? It seems overly restrictive to require that the caller
>>>> must employ this flag, and imposes the burden that the caller must
>>>> re-enable monitoring after each event.
>>>>
>>>> Does a solution like the following (with no requirement for EPOLLONESHOT)
>>>> work?
>>>>
>>>> 0. Implement an epoll_ctl() operation EPOLL_CTL_XXX
>>>> where the name XXX might be chosen based on the decision
>>>> in 4(a).
>>>> 1. EPOLL_CTL_XXX employs a private flag, EPOLLUSED, in the
>>>> per-fd events mask in the ready list. By default,
>>>> that flag is off.
>>>> 2. epoll_wait() always clears the EPOLLUSED flag if a
>>>> file descriptor is found to be ready.
>>>> 3. If an epoll_ctl(EPOLL_CTL_XXX) discovers that the EPOLLUSED
>>>> flag is NOT set, then
>>>> a) it sets the EPOLLUSED flag
>>>> b) It disables I/O events (as per EPOLL_CTL_DISABLE)
>>>> (I'm not 100% sure if this is necesary).
>>>> c) it returns EBUSY to the caller
>>>> 4. If an epoll_ctl(EPOLL_CTL_XXX) discovers that the EPOLLUSED
>>>> flag IS set, then it
>>>> a) either deletes the fd or disables events for the fd
>>>> (the choice here is a matter of design taste, I think;
>>>> deletion has the virtue of simplicity; disabling provides
>>>> the option to re-enable the fd later, if desired)
>>>> b) returns 0 to the caller.
>>>>
>>>> All of the above with suitable locking around the user-space cache.
>>>>
>>>> Cheers,
>>>>
>>>> Michael
>>>
>>>
>>> I don't believe that proposal will solve the problem. Consider the case
>>> where a worker thread has just executed epoll_wait and is about to execute
>>> the next line of code (which will access the data associated with the fd
>>> receiving the event). If the deletion thread manages to call
>>> epoll_ctl(EPOLL_CTL_XXX) for that fd twice in a row before the worker thread
>>> is able to execute the next statement, then the deletion thread will
>>> mistakenly conclude that it is safe to destroy the data that the worker
>>> thread is about to access.
>>
>> Okay -- I had the idea there might be a hole in my proposal ;-).
>>
>> By the way, have you been reading the comments in the two LWN articles
>> on EPOLL_CTL_DISABLE?
>> https://lwn.net/Articles/520012/
>> http://lwn.net/SubscriberLink/520198/fd81ba0ecb1858a2/
>>
>> There's some interesting proposals there--some suggesting that an
>> entirely user-space solution might be possible. I haven't looked
>> deeply into the ideas though.
>
> Yeah, I became quite interested so I wrote a crude epoll + urcu test.
I still think we could use this idea in the kernel, and actually implement
the RCU lock mechanism inside epoll_wait().
Since the epoll_wait() invoked by different threads, we could easily
track the status of epi usage by a thread, and DISABLE should return
0 only when all the threads who referenced the epi invoked epoll_wait()
again.
Its model would look like this:
delete thread:
1. set fd stop flag(user flag) //tell worker don't use fd any more
2. epoll_ctl(DISABLE)
3. if return BUSY, try later //rcu_syn
3. else, do delete
worker thread:
1. invoke epoll_wait()
2. if fd stop flag set, epoll_wait() again
3. else, do job on fd
in epoll_wait():
1. epi event arrived
2. if epi stop flag set(kernel flag), don't return it
3. else, record epi usage in current thread and add
it's ref count //rcu_lock
4. dec the epi ref count when thread invoke
epoll_wait() again //rcu_unlock
in epoll_ctl(DISABLE):
1. set epi stop flag(kernel flag)
2. if epi ref count is not 0, return BUSY
Please let me know if I missed something ;-)
Regards,
Michael Wang
> Since it's RCU review to ensure I've not made any serious mistakes could
> be quite helpful:
>
> #define _LGPL_SOURCE 1
> #define _GNU_SOURCE 1
>
> #include <stdlib.h>
> #include <stdio.h>
> #include <string.h>
> #include <unistd.h>
> #include <pthread.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <time.h>
>
> #include <sys/epoll.h>
>
> /*
> * Locking Voodoo:
> *
> * The globabls prefixed by _ require special care because they will be
> * accessed from multiple threads.
> *
> * The precise locking scheme we use varies whether READERS_USE_MUTEX is defined
> * When we're using userspace RCU the mutex only gets acquired for writes
> * to _-prefixed globals. Reads are done inside RCU read side critical
> * sections.
> * Otherwise the epmutex covers reads and writes to them all and the test
> * is not very scalable.
> */
> static pthread_mutex_t epmutex = PTHREAD_MUTEX_INITIALIZER;
> static int _p[2]; /* Send dummy data from one thread to another */
> static int _epfd; /* Threads wait to read/write on epfd */
> static int _nepitems = 0;
>
> #ifdef READERS_USE_MUTEX
> #define init_lock() do {} while(0)
> #define init_thread() do {} while(0)
> #define read_lock pthread_mutex_lock
> #define read_unlock pthread_mutex_unlock
> #define fini_thread() do {} while(0)
> /* Because readers use the mutex synchronize_rcu() is a no-op */
> #define synchronize_rcu() do {} while(0)
> #else
> #include <urcu.h>
> #define init_lock rcu_init
> #define init_thread rcu_register_thread
> #define read_lock(m) rcu_read_lock()
> #define read_unlock(m) rcu_read_unlock()
> #define fini_thread() do { rcu_unregister_thread(); } while(0)
> #endif
> #define write_lock pthread_mutex_lock
> #define write_unlock pthread_mutex_unlock
>
> /* We send this data through the pipe. */
> static const char *data = "test";
> const size_t dlen = 5;
>
> static inline int harmless_errno(void)
> {
> return ((errno == EWOULDBLOCK) || (errno == EAGAIN) || (errno == EINTR));
> }
>
> static void* thread_main(void *thread_nr)
> {
> struct epoll_event ev;
> int rc = 0;
> char buffer[dlen];
> unsigned long long _niterations = 0;
>
> init_thread();
> while (!rc) {
> read_lock(&epmutex);
> if (_nepitems < 1) {
> read_unlock(&epmutex);
> break;
> }
> rc = epoll_wait(_epfd, &ev, 1, 1);
> if (rc < 1) {
> read_unlock(&epmutex);
> if (rc == 0)
> continue;
> if (harmless_errno()) {
> rc = 0;
> continue;
> }
> break;
> }
>
> if (ev.events & EPOLLOUT) {
> rc = write(_p[1], data, dlen);
> read_unlock(&epmutex);
> if (rc < 0) {
> if (harmless_errno()) {
> rc = 0;
> continue;
> }
> break;
> }
> rc = 0;
> } else if (ev.events & EPOLLIN) {
> rc = read(_p[0], buffer, dlen);
> read_unlock(&epmutex);
> if (rc < 0) {
> if (harmless_errno()) {
> rc = 0;
> continue;
> }
> break;
> }
> rc = 0;
> } else
> read_unlock(&epmutex);
> _niterations++;
> }
> fini_thread();
> return (void *)_niterations;
> }
>
> /* Some sample numbers from varying MAX_THREADS on my laptop:
> * With a global mutex:
> * 1 core for the main thread
> * 1 core for epoll_wait()'ing threads
> * The mutex doesn't scale -- increasing the number of threads despite
> * having more real cores just causes performance to go down.
> * 7 threads, 213432.128160 iterations per second
> * 3 threads, 606560.183997 iterations per second
> * 2 threads, 1346006.413404 iterations per second
> * 1 thread , 2148936.348793 iterations per second
> *
> * With URCU:
> * 1 core for the main thread which spins reading niterations.
> * N-1 cores for the epoll_wait()'ing threads.
> * "Hyperthreading" doesn't help here -- I've got 4 cores:
> * 7 threads, 1537304.965009 iterations per second
> * 4 threads, 1912846.753203 iterations per second
> * 3 threads, 2278639.336464 iterations per second
> * 2 threads, 1928805.899146 iterations per second
> * 1 thread , 2007198.066327 iterations per second
> */
> #define MAX_THREADS 3
>
> int main (int argc, char **argv)
> {
> struct timespec before, req, after;
> unsigned long long niterations = 0;
> pthread_t threads[MAX_THREADS];
> struct epoll_event ev;
> int nthreads = 0, rc;
>
> init_lock();
>
> /* Since we haven't made the threads yet we can safely use _ globals */
> rc = pipe2(_p, O_NONBLOCK);
> if (rc < 0)
> goto error;
>
> _epfd = epoll_create1(EPOLL_CLOEXEC);
> if (_epfd < 0)
> goto error;
>
> /* Monitor the pipe via epoll */
> ev.events = EPOLLIN;
> ev.data.u32 = 0; /* index in _p[] */
> rc = epoll_ctl(_epfd, EPOLL_CTL_ADD, _p[0], &ev);
> if (rc < 0)
> goto error;
> _nepitems++;
> printf("Added fd %d to epoll set %d\n", _p[0], _epfd);
> ev.events = EPOLLOUT;
> ev.data.u32 = 1;
> rc = epoll_ctl(_epfd, EPOLL_CTL_ADD, _p[1], &ev);
> if (rc < 0)
> goto error;
> _nepitems++;
> printf("Added fd %d to epoll set %d\n", _p[1], _epfd);
> fflush(stdout);
>
> /*
> * After the first pthread_create() we can't safely use _ globals
> * without adhering to the locking scheme. pthread_create() should
> * also imply some thorough memory barriers so all our previous
> * modifications to the _ globals should be visible after this point.
> */
> for (rc = 0; nthreads < MAX_THREADS; nthreads++) {
> rc = pthread_create(&threads[nthreads], NULL, &thread_main,
> (void *)(long)nthreads);
> if (rc < 0)
> goto error;
> }
>
> /* Wait for our child threads to do some "work" */
> req.tv_sec = 30;
> rc = clock_gettime(CLOCK_MONOTONIC_RAW, &before);
> rc = nanosleep(&req, NULL);
> rc = clock_gettime(CLOCK_MONOTONIC_RAW, &after);
>
> /*
> * Modify the epoll interest set. This can leave stale
> * data in other threads because they may have done an
> * epoll_wait() with RCU read lock held instead of the
> * epmutex.
> */
> write_lock(&epmutex);
> rc = epoll_ctl(_epfd, EPOLL_CTL_DEL, _p[0], &ev);
> if (rc == 0) {
> _nepitems--;
> printf("Removed fd %d from epoll set %d\n", _p[0], _epfd);
> rc = epoll_ctl(_epfd, EPOLL_CTL_DEL, _p[1], &ev);
> if (rc == 0) {
> printf("Removed fd %d from epoll set %d\n", _p[1], _epfd);
> _nepitems--;
> }
> }
> write_unlock(&epmutex);
> if (rc < 0)
> goto error;
>
> /*
> * Wait until the stale data are no longer in use.
> * We could use call_rcu() here too, but let's keep the test simple.
> */
> printf("synchronize_rcu()\n");
> fflush(stdout);
> synchronize_rcu();
>
> printf("closing fds\n");
> fflush(stdout);
>
> /* Clean up the stale data */
> close(_p[0]);
> close(_p[1]);
> close(_epfd);
>
> printf("closed fds (%d, %d, %d)\n", _p[0], _p[1], _epfd);
> fflush(stdout);
>
> /*
> * Test is done. Join all the threads so that we give time for
> * races to show up.
> */
> niterations = 0;
> for (; nthreads > 0; nthreads--) {
> unsigned long long thread_iterations;
>
> rc = pthread_join(threads[nthreads - 1],
> (void *)&thread_iterations);
> niterations += thread_iterations;
> }
>
> after.tv_sec -= before.tv_sec;
> after.tv_nsec -= before.tv_nsec;
> if (after.tv_nsec < 0) {
> --after.tv_sec;
> after.tv_nsec += 1000000000;
> }
> printf("%f iterations per second\n", (double)niterations/((double)after.tv_sec + (double)after.tv_nsec/1000000000.0));
> exit(EXIT_SUCCESS);
> error:
> /* This is trashy testcase code -- it doesn't do full cleanup! */
> for (; nthreads > 0; nthreads--)
> rc = pthread_cancel(threads[nthreads - 1]);
> exit(EXIT_FAILURE);
> }
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>