Hi everyone,
This patch series adds support for read(), write(), and ioctl() operations
on eventpolls as well as an associated userspace structure to format the
eventpoll entries delivered via read()/write() buffers. The new structure,
struct epoll, differs from struct epoll_event mainly in that it also holds
the associated file descriptor. Using the normal I/O interface to manipulate
eventpolls is much neater than using epoll-specific syscalls while also
allowing for greater flexibility (theoretically, pipes could be used to
filter access). Specifically, write() creates, modifies, and/or removes event
entries stored in the supplied buffer, using the userspace identifier to
check whether an entry exists and removing it if no events are set to trigger
it, while read() simply waits for enough events to fill the provided buffer.
As timeout control is essential for polling to be practical, ioctl() is used
to configure an optional timeout, which is infinite by default.
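To make the intended usage concrete, here is a minimal userspace sketch
(assuming this series is applied and the updated uapi header exports struct
epoll and the EPIOC_* macros; the fd, identifier, and timeout values are
arbitrary):

#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/epoll.h>

static int watch_stdin(void)
{
	int epfd = epoll_create1(0);
	struct epoll epe = {
		.ep_fildes = 0,		/* watch stdin */
		.ep_events = EPOLLIN,	/* trigger on readability */
		.ep_ident  = 42,	/* opaque userspace identifier */
	};
	struct epoll ready[8];
	int timeout = 5000;		/* milliseconds */
	ssize_t n;

	if (epfd < 0)
		return -1;

	/* Register (or later modify) the entry by writing it in. */
	if (write(epfd, &epe, sizeof(epe)) != sizeof(epe))
		return -1;

	/* read() blocks indefinitely by default; bound it via ioctl(). */
	if (ioctl(epfd, EPIOC_SETTIMEOUT, &timeout) < 0)
		return -1;

	/* Each returned struct epoll names the fd, events, and identifier. */
	n = read(epfd, ready, sizeof(ready));
	if (n >= (ssize_t)sizeof(struct epoll))
		printf("fd %d ready (ident %lld)\n",
		       ready[0].ep_fildes, ready[0].ep_ident);

	/* Writing the entry back with no events set removes it. */
	epe.ep_events = 0;
	write(epfd, &epe, sizeof(epe));
	close(epfd);
	return 0;
}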
Documentation/ioctl/ioctl-number.txt | 1 +
fs/eventpoll.c | 534 ++++++++++++++++++++++++-----------
include/uapi/linux/eventpoll.h | 10 +
3 files changed, 384 insertions(+), 161 deletions(-)
Reserve a small ioctl() command space for eventpolls, of which only two
commands are currently used.
Signed-off-by: Nathaniel Yazdani <[email protected]>
---
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index d7e43fa..3c6f8ac 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -81,6 +81,7 @@ Code Seq#(hex) Include File Comments
0x22 all scsi/sg.h
'#' 00-3F IEEE 1394 Subsystem Block for the entire subsystem
'$' 00-0F linux/perf_counter.h, linux/perf_event.h
+'$' 10-1F include/uapi/linux/eventpoll.h
'&' 00-07 drivers/firewire/nosy-user.h
'1' 00-1F <linux/timepps.h> PPS kit from Ulrich Windl
<ftp://ftp.de.kernel.org/pub/linux/daemons/ntp/PPS/>
Add a new 'struct epoll' to the userspace eventpoll interface. Buffers
supplied to read() & write() calls on eventpolls are interpreted as
arrays of this structure. The new structure's only functional difference
from epoll_event is that it also holds the associated file descriptor (needed
for write() to create events properly, but useful information in general).
Also define the ioctl() command macros to set & get the timeout of an
eventpoll.
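As an illustration of the new commands, a userspace caller might adjust the
read() timeout like this (sketch only; epfd is assumed to be an eventpoll
descriptor and the updated header is assumed to define EPIOC_GETTIMEOUT and
EPIOC_SETTIMEOUT):

#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/epoll.h>

/* Bound read() on an eventpoll to 'ms' milliseconds; returns 0 on success. */
static int set_epoll_timeout(int epfd, int ms)
{
	int old;

	if (ioctl(epfd, EPIOC_GETTIMEOUT, &old) < 0)
		return -1;
	printf("previous timeout: %d ms (-1 means infinite, the default)\n",
	       old);

	return ioctl(epfd, EPIOC_SETTIMEOUT, &ms);
}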
Signed-off-by: Nathaniel Yazdani <[email protected]>
---
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index bc81fb2..73f817c 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -56,11 +56,21 @@
#define EPOLL_PACKED
#endif
+/* ioctl() requests */
+#define EPIOC_GETTIMEOUT _IOR('$', 0x10, int)
+#define EPIOC_SETTIMEOUT _IOW('$', 0x11, int)
+
struct epoll_event {
__u32 events;
__u64 data;
} EPOLL_PACKED;
+struct epoll {
+ int ep_fildes; /* file descriptor */
+ int ep_events; /* triggering events */
+ long long ep_ident; /* entry ID (cf. epoll_event->data) */
+} EPOLL_PACKED; /* A.K.A. "epe" for "eventpoll entry" */
+
#ifdef CONFIG_PM_SLEEP
static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
{
The eventpoll implementation is largely interface-agnostic, aside from the
userspace structure format and epoll_ctl(). In particular, because each field
of the structure is handled independently, replacing the internal use of
epoll_event was straightforward and somewhat clarifies the code. As for
epoll_ctl(), its functionality has moved into the new ep_eventpoll_write()
function, and epoll_ctl() now simply hands its work off to it. The
ep_eventpoll_read() function closely parallels epoll_wait(), which remains an
independent entry point but shares the vast majority of its code path to keep
redundancy minimal. Finally, ep_eventpoll_ioctl() provides a simple interface
for configuring a default timeout for read() operations on the given
eventpoll.
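For reference, the old control operations map onto the new write() interface
roughly as follows (a sketch of the semantics described above; epfd, fd, and
cookie are placeholders, and error handling is omitted):

#include <unistd.h>
#include <sys/epoll.h>

static void ctl_by_write(int epfd, int fd, long long cookie)
{
	struct epoll epe = { .ep_fildes = fd };

	/* EPOLL_CTL_ADD / EPOLL_CTL_MOD: a nonzero event mask inserts the
	 * entry if it is absent, or updates it if it already exists. */
	epe.ep_events = EPOLLIN | EPOLLOUT;
	epe.ep_ident = cookie;
	write(epfd, &epe, sizeof(epe));

	/* EPOLL_CTL_DEL: clearing the event mask removes the entry. */
	epe.ep_events = 0;
	write(epfd, &epe, sizeof(epe));
}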
Signed-off-by: Nathaniel Yazdani <[email protected]>
---
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index af90312..7f0ce59 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -168,8 +168,11 @@ struct epitem {
/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source __rcu *ws;
- /* The structure that describe the interested events and the source fd */
- struct epoll_event event;
+ /* Interested events */
+ int events;
+
+ /* The userspace identifier for this entry */
+ long long ident;
};
/*
@@ -216,6 +219,9 @@ struct eventpoll {
struct file *file;
+ /* Default timeout */
+ int timeout;
+
/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;
@@ -251,6 +257,13 @@ struct ep_send_events_data {
struct epoll_event __user *events;
};
+/* ep_scan_ready_list() callback data for ep_send_epes() */
+struct ep_send_epes_data {
+ int max;
+ struct epoll __user *epes;
+};
+
/*
* Configuration options available inside /proc/sys/fs/epoll/
*/
@@ -795,9 +808,9 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
{
- pt->_key = epi->event.events;
+ pt->_key = epi->events;
- return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
+ return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->events;
}
static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
@@ -881,8 +894,8 @@ static int ep_show_fdinfo(struct seq_file *m, struct file *f)
struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
- epi->ffd.fd, epi->event.events,
- (long long)epi->event.data);
+ epi->ffd.fd, epi->events,
+ (long long)epi->ident);
if (ret)
break;
}
@@ -892,6 +905,15 @@ static int ep_show_fdinfo(struct seq_file *m, struct file *f)
}
#endif
+static ssize_t ep_eventpoll_write(struct file *file, const char __user *buf,
+ size_t bufsz, loff_t *pos);
+
+static ssize_t ep_eventpoll_read(struct file *file, char __user *buf,
+ size_t bufsz, loff_t *pos);
+
+static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg);
+
/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
@@ -899,6 +921,9 @@ static const struct file_operations eventpoll_fops = {
#endif
.release = ep_eventpoll_release,
.poll = ep_eventpoll_poll,
+ .read = ep_eventpoll_read,
+ .write = ep_eventpoll_write,
+ .unlocked_ioctl = ep_eventpoll_ioctl,
.llseek = noop_llseek,
};
@@ -1025,7 +1050,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
* EPOLLONESHOT bit that disables the descriptor when an event is received,
* until the next EPOLL_CTL_MOD will be issued.
*/
- if (!(epi->event.events & ~EP_PRIVATE_BITS))
+ if (!(epi->events & ~EP_PRIVATE_BITS))
goto out_unlock;
/*
@@ -1034,7 +1059,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
* callback. We need to be able to handle both cases here, hence the
* test for "key" != NULL before the event match test.
*/
- if (key && !((unsigned long) key & epi->event.events))
+ if (key && !((unsigned long) key & epi->events))
goto out_unlock;
/*
@@ -1264,7 +1289,7 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
/*
* Must be called with "mtx" held.
*/
-static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+static int ep_insert(struct eventpoll *ep, long long ident, int events,
struct file *tfile, int fd, int full_check)
{
int error, revents, pwake = 0;
@@ -1285,10 +1310,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
- epi->event = *event;
+ epi->ident = ident;
+ epi->events = events;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
- if (epi->event.events & EPOLLWAKEUP) {
+ if (epi->events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error)
goto error_create_wakeup_source;
@@ -1338,7 +1364,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
spin_lock_irqsave(&ep->lock, flags);
/* If the file is already "ready" we drop it inside the ready list */
- if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
+ if ((revents & events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
@@ -1392,7 +1418,7 @@ error_create_wakeup_source:
* Modify the interest event mask by dropping an event if the new mask
* has a match in the current file status. Must be called with "mtx" held.
*/
-static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
+static int ep_modify(struct eventpoll *ep, struct epitem *epi, long long ident, int events)
{
int pwake = 0;
unsigned int revents;
@@ -1405,9 +1431,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* otherwise we might miss an event that happens between the
* f_op->poll() call and the new event set registering.
*/
- epi->event.events = event->events; /* need barrier below */
- epi->event.data = event->data; /* protected by mtx */
- if (epi->event.events & EPOLLWAKEUP) {
+ epi->events = events; /* need barrier below */
+ epi->ident = ident; /* protected by mtx */
+ if (epi->events & EPOLLWAKEUP) {
if (!ep_has_wakeup_source(epi))
ep_create_wakeup_source(epi);
} else if (ep_has_wakeup_source(epi)) {
@@ -1444,7 +1470,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* If the item is "hot" and it is not registered inside the ready
* list, push it inside.
*/
- if (revents & event->events) {
+ if (revents & events) {
spin_lock_irq(&ep->lock);
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1516,16 +1542,16 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
*/
if (revents) {
if (__put_user(revents, &uevent->events) ||
- __put_user(epi->event.data, &uevent->data)) {
+ __put_user(epi->ident, &uevent->data)) {
list_add(&epi->rdllink, head);
ep_pm_stay_awake(epi);
return eventcnt ? eventcnt : -EFAULT;
}
eventcnt++;
uevent++;
- if (epi->event.events & EPOLLONESHOT)
- epi->event.events &= EP_PRIVATE_BITS;
- else if (!(epi->event.events & EPOLLET)) {
+ if (epi->events & EPOLLONESHOT)
+ epi->events &= EP_PRIVATE_BITS;
+ else if (!(epi->events & EPOLLET)) {
/*
* If this file has been added with Level
* Trigger mode, we need to insert back inside
@@ -1546,17 +1572,103 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
return eventcnt;
}
-static int ep_send_events(struct eventpoll *ep,
- struct epoll_event __user *events, int maxevents)
+static int ep_send_events(struct eventpoll *ep, void __user *buf, size_t bufsz)
{
struct ep_send_events_data esed;
- esed.maxevents = maxevents;
- esed.events = events;
+ esed.maxevents = bufsz / sizeof(struct epoll_event);
+ esed.events = buf;
return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
}
+/*
+ * Mostly boilerplate code from ep_send_events_proc(), but much cleaner to put
+ * in a separate function.
+ */
+static int ep_send_epes_proc(struct eventpoll *ep, struct list_head *head,
+ void *priv)
+{
+ struct ep_send_epes_data *esed = priv;
+ unsigned int revents, i;
+ struct epitem *epi;
+ struct wakeup_source *ws;
+ poll_table pt;
+
+ init_poll_funcptr(&pt, NULL);
+
+ /*
+ * We can loop without lock because we are passed a task private list.
+ * Items cannot vanish during the loop because ep_scan_ready_list() is
+ * holding "mtx" during this call.
+ */
+ for (i = 0; !list_empty(head) && i < esed->max; ++i) {
+ epi = list_first_entry(head, struct epitem, rdllink);
+
+ /*
+ * Activate ep->ws before deactivating epi->ws to prevent
+ * triggering auto-suspend here (in case we reactivate epi->ws
+ * below).
+ *
+ * This could be rearranged to delay the deactivation of epi->ws
+ * instead, but then epi->ws would temporarily be out of sync
+ * with ep_is_linked().
+ */
+ ws = ep_wakeup_source(epi);
+ if (ws) {
+ if (ws->active)
+ __pm_stay_awake(ep->ws);
+ __pm_relax(ws);
+ }
+
+ list_del_init(&epi->rdllink);
+
+ revents = ep_item_poll(epi, &pt);
+
+ /*
+ * If the event mask intersect the caller-requested one,
+ * deliver the event to userspace. Again, ep_scan_ready_list()
+ * is holding "mtx", so no operations coming from userspace
+ * can change the item.
+ */
+ if (revents) {
+ if (__put_user(revents, &esed->epes[i].ep_events) ||
+ __put_user(epi->ident, &esed->epes[i].ep_ident) ||
+ __put_user(epi->ffd.fd, &esed->epes[i].ep_fildes)) {
+ list_add(&epi->rdllink, head);
+ ep_pm_stay_awake(epi);
+ return i ? i : -EFAULT;
+ }
+ if (epi->events & EPOLLONESHOT)
+ epi->events &= EP_PRIVATE_BITS;
+ else if (!(epi->events & EPOLLET)) {
+ /*
+ * If this file has been added with Level
+ * Trigger mode, we need to insert back inside
+ * the ready list, so that the next call to
+ * epoll_wait() will check again the events
+ * availability. At this point, no one can insert
+ * into ep->rdllist besides us. The epoll_ctl()
+ * callers are locked out by
+ * ep_scan_ready_list() holding "mtx" and the
+ * poll callback will queue them in ep->ovflist.
+ */
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake(epi);
+ }
+ }
+ }
+
+ return i;
+}
+
+static int ep_send_epes(struct eventpoll *ep, void __user *buf, size_t bufsz)
+{
+ struct ep_send_epes_data esed = { .max = bufsz / sizeof(struct epoll),
+ .epes = buf };
+ return ep_scan_ready_list(ep, ep_send_epes_proc, &esed, 0, false);
+}
+
static inline struct timespec ep_set_mstimeout(long ms)
{
struct timespec now, ts = {
@@ -1581,12 +1693,14 @@ static inline struct timespec ep_set_mstimeout(long ms)
* while if the @timeout is less than zero, the function will block
* until at least one event has been retrieved (or an error
* occurred).
+ * @sender: Function to call to send ready events to userspace.
*
* Returns: Returns the number of ready events which have been fetched, or an
* error code, in case of error.
*/
-static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
- int maxevents, long timeout)
+static int ep_poll(struct eventpoll *ep, void __user *buffer, size_t length,
+ long timeout, int (*sender)(struct eventpoll *,
+ void __user *, size_t))
{
int res = 0, eavail, timed_out = 0;
unsigned long flags;
@@ -1658,7 +1772,7 @@ check_events:
* more luck.
*/
if (!res && eavail &&
- !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
+ !(res = sender(ep, buffer, length)) && !timed_out)
goto fetch_events;
return res;
@@ -1761,6 +1875,213 @@ static void clear_tfile_check_list(void)
INIT_LIST_HEAD(&tfile_check_list);
}
+/**
+ *
+ * ep_eventpoll_write - Create, remove, or modify events to poll for. The epoll
+ * file distinguishes between events by file descriptor,
+ * but it will also store a user-defined identifier along
+ * with it. To modify an existing event, simply set
+ * ->ep_fildes to the target file descriptor and set
+ * ->ep_ident and ->ep_events to whatever values you wish
+ * to change them to. To remove an event, set ->ep_fildes
+ * to the relevant file descriptor and clear ->ep_events.
+ *
+ * @file: The epoll file being acted upon.
+ * @buf: Array of 'struct epoll' entries, to be inserted, modified, or removed
+ * from the epoll file depending on their contents.
+ * @bufsz: Size of the buffer in bytes, i.e. the number of 'struct epoll'
+ * entries times the size of the structure (any excess is ignored).
+ * @pos: Ignored, epoll files behave like character devices.
+ *
+ * Returns: The number of bytes from the userspace buffer successfully processed,
+ * always a multiple of sizeof(struct epoll), or an error code if the
+ * buffer is ill-aligned or inaccessible (nothing will have been
+ * processed).
+ */
+static ssize_t ep_eventpoll_write(struct file *file, const char __user *buf,
+ size_t bufsz, loff_t *pos)
+{
+ struct eventpoll *ep = file->private_data, *tep = NULL;
+ struct epitem *epi;
+ struct file *target;
+ const struct epoll __user *epes = (const struct epoll __user *)buf;
+ struct epoll epe;
+ bool full_check = false;
+ size_t num = bufsz / sizeof(struct epoll); /* Ignore any excess */
+ int i;
+
+ if (!access_ok(VERIFY_READ, buf, bufsz))
+ return -EFAULT;
+
+ for (i = 0; i < num; ++i) {
+
+ if (copy_from_user(&epe, &epes[i], sizeof(struct epoll)))
+ goto out;
+
+ target = fget(epe.ep_fildes);
+ if (!target)
+ goto out;
+
+ /* The target file descriptor must support poll */
+ if (!target->f_op || !target->f_op->poll)
+ goto out_fput;
+
+ /* Check if EPOLLWAKEUP is allowed */
+ if ((epe.ep_events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
+ epe.ep_events &= ~EPOLLWAKEUP;
+
+ /* We do not permit adding an epoll file descriptor inside itself. */
+ if (target == file)
+ goto out_fput;
+
+ mutex_lock_nested(&ep->mtx, 0);
+
+ /* Try to lookup the file inside our RB tree */
+ epi = ep_find(ep, target, epe.ep_fildes);
+
+ /*
+ * When we insert an epoll file descriptor, inside another epoll
+ * file descriptor, there is the chance of creating closed loops,
+ * which are better handled here than in more critical paths.
+ * While we are checking for loops we also determine the list of
+ * files reachable and hang them on the tfile_check_list, so we
+ * can check that we haven't created too many possible wakeup
+ * paths.
+ *
+ * We do not need to take the global 'epumutex' to ep_insert()
+ * when the epoll file descriptor is attaching directly to a
+ * wakeup source, unless the epoll file descriptor is nested.
+ * The purpose of taking the 'epmutex' on add is to prevent
+ * complex topologies such as loops and deep wakeup paths from
+ * forming in parallel through multiple ep_insert() operations.
+ */
+
+ if (epe.ep_events && !epi) {
+ /* add this epoll entry */
+ if (!list_empty(&file->f_ep_links) ||
+ is_file_epoll(target)) {
+ full_check = true;
+ mutex_unlock(&ep->mtx);
+ mutex_lock(&epmutex);
+ if (is_file_epoll(target) &&
+ ep_loop_check(ep, target) != 0) {
+ clear_tfile_check_list();
+ goto out_fput;
+ } else if (!is_file_epoll(target)) {
+ list_add(&target->f_tfile_llink,
+ &tfile_check_list);
+ }
+ mutex_lock_nested(&ep->mtx, 0);
+ if (is_file_epoll(target)) {
+ tep = target->private_data;
+ mutex_lock_nested(&tep->mtx, 1);
+ }
+ }
+ epe.ep_events |= POLLERR | POLLHUP;
+ if (ep_insert(ep, epe.ep_ident, epe.ep_events, target,
+ epe.ep_fildes, full_check))
+ goto out_unlock;
+ if (full_check)
+ clear_tfile_check_list();
+ } else if (epe.ep_events && epi) {
+ /* modify this epoll entry */
+ epe.ep_events |= POLLERR | POLLHUP;
+ if (ep_modify(ep, epi, epe.ep_ident, epe.ep_events))
+ goto out_unlock;
+ } else if (!epe.ep_events && epi) {
+ /* delete this epoll entry */
+ if (is_file_epoll(target)) {
+ tep = target->private_data;
+ mutex_lock_nested(&tep->mtx, 1);
+ }
+ if (ep_remove(ep, epi))
+ goto out_unlock;
+ }
+
+ if (tep)
+ mutex_unlock(&tep->mtx);
+ tep = NULL;
+ mutex_unlock(&ep->mtx);
+ if (full_check)
+ mutex_unlock(&epmutex);
+ fput(target);
+ }
+ goto out;
+
+out_unlock:
+ if (tep)
+ mutex_unlock(&tep->mtx);
+ mutex_unlock(&ep->mtx);
+ if (full_check)
+ mutex_unlock(&epmutex);
+out_fput:
+ fput(target);
+out:
+ return i * sizeof(struct epoll);
+}
+
+/**
+ *
+ * ep_eventpoll_read - Read triggered events from an epoll file, delivered to
+ * userspace in 'struct epoll' packets. At most as many
+ * events as wholly fit within the buffer are returned;
+ * fewer may be returned if the read times out.
+ *
+ * @file: The epoll file to retrieve events from.
+ * @buf: Preallocated buffer into which the kernel will store epoll entries.
+ * @bufsz: Size of the buffer, which ought to be a multiple of the epoll entry
+ * structure's size. If not, the kernel will store as many structs as
+ * wholly fit within the provided buffer, not exceeding EP_MAX_EVENTS.
+ * @pos: Ignored, epoll behaves like a character device.
+ *
+ * Returns: The number of triggered epoll entries multiplied by the size of the
+ * epoll entry structure.
+ */
+static ssize_t ep_eventpoll_read(struct file *file, char __user *buf, size_t bufsz,
+ loff_t *pos)
+{
+ struct eventpoll *ep = file->private_data;
+ int tmp;
+
+ /* The event buffer must be of a reasonable size */
+ if (bufsz / sizeof(struct epoll) == 0 ||
+ bufsz / sizeof(struct epoll) > EP_MAX_EVENTS)
+ return -EINVAL;
+
+ /* Verify that the area passed by the user is writeable */
+ if (!access_ok(VERIFY_WRITE, buf, bufsz))
+ return -EFAULT;
+
+ /* Time to fish for events ... */
+ tmp = ep_poll(file->private_data, buf, bufsz, ep->timeout,
+ ep_send_epes);
+ return tmp < 0 ? tmp : (ssize_t)tmp * sizeof(struct epoll);
+}
+
+/*
+ * ep_eventpoll_ioctl - configure an eventpoll's behavior.
+ *
+ * @cmd: An EPIOC_* control command.
+ * @arg: A pointer whose type depends on @cmd (usually int).
+ *
+ * Returns: 0 on success or an errno code.
+ */
+static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct eventpoll *ep = file->private_data;
+ switch (cmd) {
+ case EPIOC_GETTIMEOUT:
+ return put_user(ep->timeout, (int __user *)arg);
+ case EPIOC_SETTIMEOUT:
+ return get_user(ep->timeout, (int __user *)arg);
+ default:
+ return -EINVAL;
+ }
+}
+
/*
* Open an eventpoll file descriptor.
*/
@@ -1775,6 +2096,8 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
if (flags & ~EPOLL_CLOEXEC)
return -EINVAL;
+ flags |= O_RDWR;
+
/*
* Create the internal data structure ("struct eventpoll").
*/
@@ -1785,19 +2108,19 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
- fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
+ fd = get_unused_fd_flags(flags);
if (fd < 0) {
error = fd;
goto out_free_ep;
}
- file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
- O_RDWR | (flags & O_CLOEXEC));
+ file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, flags);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out_free_fd;
}
ep->file = file;
fd_install(fd, file);
+ ep->timeout = -1; /* infinite (i.e., no) timeout by default */
return fd;
out_free_fd:
@@ -1823,137 +2146,27 @@ SYSCALL_DEFINE1(epoll_create, int, size)
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
- int error;
- int full_check = 0;
- struct fd f, tf;
- struct eventpoll *ep;
- struct epitem *epi;
- struct epoll_event epds;
- struct eventpoll *tep = NULL;
-
- error = -EFAULT;
- if (ep_op_has_event(op) &&
- copy_from_user(&epds, event, sizeof(struct epoll_event)))
- goto error_return;
-
- error = -EBADF;
- f = fdget(epfd);
- if (!f.file)
- goto error_return;
-
- /* Get the "struct file *" for the target file */
- tf = fdget(fd);
- if (!tf.file)
- goto error_fput;
-
- /* The target file descriptor must support poll */
- error = -EPERM;
- if (!tf.file->f_op->poll)
- goto error_tgt_fput;
-
- /* Check if EPOLLWAKEUP is allowed */
- ep_take_care_of_epollwakeup(&epds);
-
- /*
- * We have to check that the file structure underneath the file descriptor
- * the user passed to us _is_ an eventpoll file. And also we do not permit
- * adding an epoll file descriptor inside itself.
- */
- error = -EINVAL;
- if (f.file == tf.file || !is_file_epoll(f.file))
- goto error_tgt_fput;
-
- /*
- * At this point it is safe to assume that the "private_data" contains
- * our own data structure.
- */
- ep = f.file->private_data;
-
- /*
- * When we insert an epoll file descriptor, inside another epoll file
- * descriptor, there is the change of creating closed loops, which are
- * better be handled here, than in more critical paths. While we are
- * checking for loops we also determine the list of files reachable
- * and hang them on the tfile_check_list, so we can check that we
- * haven't created too many possible wakeup paths.
- *
- * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
- * the epoll file descriptor is attaching directly to a wakeup source,
- * unless the epoll file descriptor is nested. The purpose of taking the
- * 'epmutex' on add is to prevent complex toplogies such as loops and
- * deep wakeup paths from forming in parallel through multiple
- * EPOLL_CTL_ADD operations.
- */
- mutex_lock_nested(&ep->mtx, 0);
- if (op == EPOLL_CTL_ADD) {
- if (!list_empty(&f.file->f_ep_links) ||
- is_file_epoll(tf.file)) {
- full_check = 1;
- mutex_unlock(&ep->mtx);
- mutex_lock(&epmutex);
- if (is_file_epoll(tf.file)) {
- error = -ELOOP;
- if (ep_loop_check(ep, tf.file) != 0) {
- clear_tfile_check_list();
- goto error_tgt_fput;
- }
- } else
- list_add(&tf.file->f_tfile_llink,
- &tfile_check_list);
- mutex_lock_nested(&ep->mtx, 0);
- if (is_file_epoll(tf.file)) {
- tep = tf.file->private_data;
- mutex_lock_nested(&tep->mtx, 1);
- }
- }
- }
+ struct epoll epe = { .ep_fildes = fd };
+ struct file *file = fget(epfd);
+ int err;
- /*
- * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
- * above, we can be sure to be able to use the item looked up by
- * ep_find() till we release the mutex.
- */
- epi = ep_find(ep, tf.file, fd);
+ err = -EBADF;
+ if (!file || !is_file_epoll(file))
+ goto out;
- error = -EINVAL;
- switch (op) {
- case EPOLL_CTL_ADD:
- if (!epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_insert(ep, &epds, tf.file, fd, full_check);
- } else
- error = -EEXIST;
- if (full_check)
- clear_tfile_check_list();
- break;
- case EPOLL_CTL_DEL:
- if (epi)
- error = ep_remove(ep, epi);
- else
- error = -ENOENT;
- break;
- case EPOLL_CTL_MOD:
- if (epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_modify(ep, epi, &epds);
- } else
- error = -ENOENT;
- break;
- }
- if (tep != NULL)
- mutex_unlock(&tep->mtx);
- mutex_unlock(&ep->mtx);
-
-error_tgt_fput:
- if (full_check)
- mutex_unlock(&epmutex);
-
- fdput(tf);
-error_fput:
- fdput(f);
-error_return:
-
- return error;
+ err = -EFAULT;
+ if (ep_op_has_event(op) &&
+ (get_user(epe.ep_events, (int *)&event->events) ||
+ get_user(epe.ep_ident, (long long *)&event->data)))
+ goto out;
+
+ err = ep_eventpoll_write(file, (const char *)&epe,
+ sizeof(struct epoll), NULL);
+ if (!err)
+ err = -EBADF;
+out:
+ fput(file);
+ return err < 0 ? err : 0;
}
/*
@@ -1995,7 +2208,8 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
ep = f.file->private_data;
/* Time to fish for events ... */
- error = ep_poll(ep, events, maxevents, timeout);
+ error = ep_poll(ep, events, maxevents * sizeof(struct epoll_event),
+ timeout, ep_send_events);
error_fput:
fdput(f);
Nathaniel Yazdani wrote:
> Using the normal I/O interface to manipulate eventpolls is much neater
> than using epoll-specific syscalls
But it introduces a _second_ API, which is epoll-specific too, and does
not use the standard semantics either.
> while also allowing for greater flexibility (theoretically, pipes could
> be used to filter access).
I do not understand this.
> read() simply waits for enough events to fill the provided buffer.
The usual semantics of read() are to return a partially filled buffer if
it would block otherwise, i.e., blocking is done only if the returned
buffer would have been empty.
> As timeout control is essential for polling to be practical, ioctl() is
> used to configure an optional timeout
This is what the timeout parameter of poll() and friends is for.
Regards,
Clemens
On 02/02/2014 06:17 PM, Nathaniel Yazdani wrote:
> Hi everyone,
>
> This patch series adds support for read(), write(), and ioctl() operations
> on eventpolls as well as an associated userspace structure to format the
> eventpoll entries delivered via read()/write() buffers. The new structure,
> struct epoll, differs from struct epoll_event mainly in that it also holds
> the associated file descriptor. Using the normal I/O interface to manipulate
> eventpolls is much neater than using epoll-specific syscalls while also
> allowing for greater flexibility (theoretically, pipes could be used to
> filter access). Specifically, write() creates, modifies, and/or removes event
> entries stored in the supplied buffer, using the userspace identifier to
> check whether an entry exists and removing it if no events are set to trigger
> it, while read() simply waits for enough events to fill the provided buffer.
> As timeout control is essential for polling to be practical, ioctl() is used
> to configure an optional timeout, which is infinite by default.
If major changes are made to the epoll API, I want a way to do a bunch
of EPOLL_CTL_MODs and a wait, all in one syscall. Even better: allow
more flexible timeouts (CLOCK_MONOTONIC, CLOCK_REALTIME, etc) at the
same time.
Since this can't do that, I'm not terribly inspired.
--Andy
>
> Documentation/ioctl/ioctl-number.txt | 1 +
> fs/eventpoll.c | 534 ++++++++++++++++++++++++-----------
> include/uapi/linux/eventpoll.h | 10 +
> 3 files changed, 384 insertions(+), 161 deletions(-)
On 2/3/14, Clemens Ladisch <[email protected]> wrote:
> Nathaniel Yazdani wrote:
>> Using the normal I/O interface to manipulate eventpolls is much neater
>> than using epoll-specific syscalls
>
> But it introduces a _second_ API, which is epoll-specific too, and does
> not use the standard semantics either.
>
>> while also allowing for greater flexibility (theoretically, pipes could
>> be used to filter access).
>
> I do not understand this.
The idea here was that if epoll is controlled by read()/write(), then
a program could be written so that it expects the epoll to be dup()ed
to a second file descriptor, using one exclusively for writing & the
other exclusively for reading. That way, if an application is in
debug mode, for example, it could start up a thread to replace
those two file descriptors with pipes, so that thread would then
be able to tee, preprocess, or do whatever else to the epoll
streams.
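Roughly, the setup I'm imagining looks like this (ep_in/ep_out are just
illustrative names):

#include <unistd.h>
#include <sys/epoll.h>

/* Normal (non-debug) case: both descriptors refer to the same eventpoll.
 * A debug harness could instead hand the program two pipe ends and shuttle
 * the struct epoll traffic to/from the real eventpoll in a helper thread. */
static void split_epoll_fds(int *ep_in, int *ep_out)
{
	int epfd = epoll_create1(0);

	*ep_out = dup(epfd);	/* used exclusively for write() (registration) */
	*ep_in  = dup(epfd);	/* used exclusively for read()  (ready events) */
}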
>> read() simply waits for enough events to fill the provided buffer.
>
> The usual semantics of read() are to return a partially filled buffer if
> it would block otherwise, i.e., blocking is done only if the returned
> buffer would have been empty.
>
>> As timeout control is essential for polling to be practical, ioctl() is
>> used to configure an optional timeout
>
> This is what the timeout parameter of poll() and friends is for.
I admit that part of this approach isn't the best.
Either way I appreciate your feedback,
Nathaniel Yazdani
On 2/3/14, Andy Lutomirski <[email protected]> wrote:
> On 02/02/2014 06:17 PM, Nathaniel Yazdani wrote:
>> Hi everyone,
>>
>> This patch series adds support for read(), write(), and ioctl()
>> operations
>> on eventpolls as well as an associated userspace structure to format the
>> eventpoll entries delivered via read()/write() buffers. The new
>> structure,
>> struct epoll, differs from struct epoll_event mainly in that it also
>> holds
>> the associated file descriptor. Using the normal I/O interface to
>> manipulate
>> eventpolls is much neater than using epoll-specific syscalls while also
>> allowing for greater flexibility (theoretically, pipes could be used to
>> filter access). Specifically, write() creates, modifies, and/or removes
>> event
>> entries stored in the supplied buffer, using the userspace identifier to
>> check whether an entry exists and removing it if no events are set to
>> trigger
>> it, while read() simply waits for enough events to fill the provided
>> buffer.
>> As timeout control is essential for polling to be practical, ioctl() is
>> used
>> to configure an optional timeout, which is infinite by default.
>
> If major changes are made to the epoll API, I want a way to do a bunch
> of EPOLL_CTL_MODs and a wait, all in one syscall. Even better: allow
> more flexible timeouts (CLOCK_MONOTONIC, CLOCK_REALTIME, etc) at the
> same time.
>
> Since this can't do that, I'm not terribly inspired.
>
> --Andy
So are you saying that those features you mentioned are specifically sought
after for the kernel? If so I'd like to take a crack at some of them,
may as well
get some use out of my new knowledge of epoll internals :)
Thanks for your input,
Nate Yazdani
On Mon, Feb 3, 2014 at 11:42 AM, Nathaniel Yazdani
<[email protected]> wrote:
> On 2/3/14, Andy Lutomirski <[email protected]> wrote:
>> On 02/02/2014 06:17 PM, Nathaniel Yazdani wrote:
>>> Hi everyone,
>>>
>>> This patch series adds support for read(), write(), and ioctl()
>>> operations
>>> on eventpolls as well as an associated userspace structure to format the
>>> eventpoll entries delivered via read()/write() buffers. The new
>>> structure,
>>> struct epoll, differs from struct epoll_event mainly in that it also
>>> holds
>>> the associated file descriptor. Using the normal I/O interface to
>>> manipulate
>>> eventpolls is much neater than using epoll-specific syscalls while also
>>> allowing for greater flexibility (theoretically, pipes could be used to
>>> filter access). Specifically, write() creates, modifies, and/or removes
>>> event
>>> entries stored in the supplied buffer, using the userspace identifier to
>>> check whether an entry exists and removing it if no events are set to
>>> trigger
>>> it, while read() simply waits for enough events to fill the provided
>>> buffer.
>>> As timeout control is essential for polling to be practical, ioctl() is
>>> used
>>> to configure an optional timeout, which is infinite by default.
>>
>> If major changes are made to the epoll API, I want a way to do a bunch
>> of EPOLL_CTL_MODs and a wait, all in one syscall. Even better: allow
>> more flexible timeouts (CLOCK_MONOTONIC, CLOCK_REALTIME, etc) at the
>> same time.
>>
>> Since this can't do that, I'm not terribly inspired.
>>
>> --Andy
>
> So are you saying that those features you mentioned are specifically sought
> after for the kernel? If so I'd like to take a crack at some of them,
> may as well
> get some use out of my new knowledge of epoll internals :)
If by "sought after", you mean "is there at least one epoll user who
wants them", then yes :)
I think that EPOLLET and EPOLLONESHOT are giant hacks, and that what
everyone really wants is the ability to very efficiently toggle events
on and off. The ability to do it simultaneously and inexpensively
with epoll_wait would make it happen.
--Andy
>
> Thanks for your input,
> Nate Yazdani
--
Andy Lutomirski
AMA Capital Management, LLC
Andy Lutomirski <[email protected]> wrote:
> >> On 02/02/2014 06:17 PM, Nathaniel Yazdani wrote:
> > So are you saying that those features you mentioned are specifically sought
> > after for the kernel? If so I'd like to take a crack at some of them,
> > may as well
> > get some use out of my new knowledge of epoll internals :)
>
> If by "sought after", you mean "is there at least one epoll user who
> wants them", then yes :)
>
> I think that EPOLLET and EPOLLONESHOT are giant hacks, and that what
> everyone really wants is the ability to very efficiently toggle events
> on and off. The ability to do it simultaneously and inexpensively
> with epoll_wait would make it happen.
Everybody using single-threaded epoll, you mean? I suppose there's
quite a few of those.
I've pondered an epoll_xchg syscall which would behave like *BSD kevent
to satisfy single-threaded users, but never got around to it. All my
epoll uses are multithreaded w/ oneshot nowadays, so xchg would only
save one syscall per thread.
On Mon, Feb 3, 2014 at 1:51 PM, Eric Wong <[email protected]> wrote:
> Andy Lutomirski <[email protected]> wrote:
>> >> On 02/02/2014 06:17 PM, Nathaniel Yazdani wrote:
>> > So are you saying that those features you mentioned are specifically sought
>> > after for the kernel? If so I'd like to take a crack at some of them,
>> > may as well
>> > get some use out of my new knowledge of epoll internals :)
>>
>> If by "sought after", you mean "is there at least one epoll user who
>> wants them", then yes :)
>>
>> I think that EPOLLET and EPOLLONESHOT are giant hacks, and that what
>> everyone really wants is the ability to very efficiently toggle events
>> on and off. The ability to do it simultaneously and inexpensively
>> with epoll_wait would make it happen.
>
> Everybody using single-threaded epoll, you mean? I suppose there's
> quite a few of those.
>
> I've pondered an epoll_xchg syscall which would behave like *BSD kevent
> to satisfy single-threaded users, but never got around to it. All my
> epoll uses are multithreaded w/ oneshot nowadays, so xchg would only
> save one syscall per thread.
Even for multithreaded, the ability to rearm EPOLLONESHOT entries
without extra syscalls would probably be useful.
--Andy