2011-02-06 01:09:55

by Nelson Elhage

Subject: [PATCH] epoll: Prevent deadlock through unsafe ->f_op->poll() calls.

In several places, an epoll fd can call another file's ->f_op->poll() method
with ep->mtx held. This is in general unsafe, because that other file could
itself be an epoll fd that contains the original epoll fd.

The code defends against this possibility in its own ->poll() method using
ep_call_nested, but there are several other unsafe calls to ->poll elsewhere
that can be made to deadlock. For example, the following simple program causes
the call in ep_insert to recursively call the original fd's ->poll, leading to
deadlock:

------------------------------8<------------------------------
#include <unistd.h>
#include <sys/epoll.h>

int main(void) {
        int e1, e2, p[2];
        struct epoll_event evt = {
                .events = EPOLLIN
        };

        e1 = epoll_create(1);
        e2 = epoll_create(2);
        pipe(p);

        epoll_ctl(e2, EPOLL_CTL_ADD, e1, &evt);
        epoll_ctl(e1, EPOLL_CTL_ADD, p[0], &evt);
        write(p[1], p, sizeof p);
        epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);

        return 0;
}
------------------------------8<------------------------------
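
[For readers following along, here is a simplified, single-threaded user-space
sketch of the idea behind ep_call_nested(): keep a chain of the cookies
currently being visited, and refuse the call when a cookie repeats (a loop) or
the chain grows too deep. This is illustrative only; the real helper in
fs/eventpoll.c also records the calling task and takes a spinlock, since
several tasks can nest concurrently.]

------------------------------8<------------------------------
#define EP_MAX_NESTS 4

struct nested_call_node {
        void *cookie;                   /* object visited at this level */
        struct nested_call_node *next;
};

static struct nested_call_node *call_list;

/*
 * Invoke nproc(priv, cookie, nests), refusing with -1 if this cookie is
 * already on the call chain (loop) or the chain is deeper than allowed.
 */
static int call_nested(int (*nproc)(void *, void *, int),
                       void *priv, void *cookie)
{
        int nests = 0, error;
        struct nested_call_node node, *cur;

        for (cur = call_list; cur; cur = cur->next)
                if (cur->cookie == cookie || ++nests > EP_MAX_NESTS)
                        return -1;      /* loop detected or too deep */

        node.cookie = cookie;           /* push this level onto the chain */
        node.next = call_list;
        call_list = &node;

        error = nproc(priv, cookie, nests);

        call_list = node.next;          /* pop it on the way out */
        return error;
}
------------------------------8<------------------------------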

This patch fixes the problem by wrapping all such calls in ep_call_nested, using
the same nested_calls instance as the ->poll method. It's possible there are
more elegant solutions, but this works and is minimally invasive.

Signed-off-by: Nelson Elhage <[email protected]>
---
fs/eventpoll.c | 37 ++++++++++++++++++++++++++++++++++---
1 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8cf07242..4ca27cf5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -213,6 +213,12 @@ struct ep_send_events_data {
struct epoll_event __user *events;
};

+/* Used by the ep_poll_file() function as callback private data */
+struct ep_poll_file_data {
+ struct file *file;
+ struct poll_table_struct *pt;
+};
+
/*
* Configuration options available inside /proc/sys/fs/epoll/
*/
@@ -668,6 +674,31 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
return pollflags != -1 ? pollflags : 0;
}

+static int ep_poll_file_proc(void *priv, void *cookie, int call_nests)
+{
+ struct ep_poll_file_data *data = priv;
+ return data->file->f_op->poll(data->file, data->pt);
+}
+
+static unsigned int ep_poll_file(struct eventpoll *ep, struct file *file,
+ struct poll_table_struct *pt)
+{
+ int pollflags;
+ struct ep_poll_file_data data;
+ /*
+ * This is merely a call to file->f_op->poll() under
+ * ep_call_nested. This shares a nested_calls struct with
+ * ep_eventpoll_poll in order to prevent other sites that call
+ * ->f_op->poll from recursing back into this file and deadlocking.
+ */
+ data.file = file;
+ data.pt = pt;
+ pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
+ ep_poll_file_proc, &data, ep, current);
+
+ return pollflags != -1 ? pollflags : 0;
+}
+
/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
.release = ep_eventpoll_release,
@@ -928,7 +959,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
* this operation completes, the poll callback can start hitting
* the new item.
*/
- revents = tfile->f_op->poll(tfile, &epq.pt);
+ revents = ep_poll_file(ep, tfile, &epq.pt);

/*
* We have to check if something went wrong during the poll wait queue
@@ -1014,7 +1045,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* Get current event bits. We can safely use the file* here because
* its usage count has been increased by the caller of this function.
*/
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
+ revents = ep_poll_file(ep, epi->ffd.file, NULL);

/*
* If the item is "hot" and it is not registered inside the ready
@@ -1061,7 +1092,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,

list_del_init(&epi->rdllink);

- revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+ revents = ep_poll_file(ep, epi->ffd.file, NULL) &
epi->event.events;

/*
--
1.7.2.43.g68ef4


2011-02-06 03:22:31

by Davide Libenzi

Subject: Re: [PATCH] epoll: Prevent deadlock through unsafe ->f_op->poll() calls.

On Sat, 5 Feb 2011, Nelson Elhage wrote:

> In several places, an epoll fd can call another file's ->f_op->poll() method
> with ep->mtx held. This is in general unsafe, because that other file could
> itself be an epoll fd that contains the original epoll fd.
>
> The code defends against this possibility in its own ->poll() method using
> ep_call_nested, but there are several other unsafe calls to ->poll elsewhere
> that can be made to deadlock. For example, the following simple program causes
> the call in ep_insert to recursively call the original fd's ->poll, leading to
> deadlock:
>
> ------------------------------8<------------------------------
> #include <unistd.h>
> #include <sys/epoll.h>
>
> int main(void) {
>         int e1, e2, p[2];
>         struct epoll_event evt = {
>                 .events = EPOLLIN
>         };
>
>         e1 = epoll_create(1);
>         e2 = epoll_create(2);
>         pipe(p);
>
>         epoll_ctl(e2, EPOLL_CTL_ADD, e1, &evt);
>         epoll_ctl(e1, EPOLL_CTL_ADD, p[0], &evt);
>         write(p[1], p, sizeof p);
>         epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);
>
>         return 0;
> }
> ------------------------------8<------------------------------

Yuck, true :|
The nested-call function is heavy, and the patch below invokes it only if the
file descriptor is an epoll one.
But that does not fix the problem WRT the send-events proc, even if we
call the new ep_poll_file().
At this point, we had better remove the heavy checks from the fast path, and
perform the check at insertion time, if the inserted fd is an epoll one.
That way, the price for the check is paid once, and only if the inserted
fd is an epoll one.



- Davide


---
fs/eventpoll.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 107 insertions(+), 20 deletions(-)

Index: linux-2.6.mod/fs/eventpoll.c
===================================================================
--- linux-2.6.mod.orig/fs/eventpoll.c 2011-02-05 17:39:48.000000000 -0800
+++ linux-2.6.mod/fs/eventpoll.c 2011-02-05 19:04:46.000000000 -0800
@@ -214,6 +214,25 @@
};

/*
+ * Structure used to channel f_op->poll() data through the ep_call_nested()
+ * when calling f_op->poll() on an epoll descriptor.
+ */
+struct ep_poll_file_data {
+ struct file *file;
+ struct poll_table_struct *pt;
+};
+
+static int ep_eventpoll_release(struct inode *inode, struct file *file);
+static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait);
+
+/* File callbacks that implement the eventpoll file behaviour */
+static const struct file_operations eventpoll_fops = {
+ .release = ep_eventpoll_release,
+ .poll = ep_eventpoll_poll,
+ .llseek = noop_llseek,
+};
+
+/*
* Configuration options available inside /proc/sys/fs/epoll/
*/
/* Maximum number of epoll watched descriptors, per user */
@@ -257,6 +276,11 @@
};
#endif /* CONFIG_SYSCTL */

+/* Fast test to see if the file is an eventpoll file */
+static inline int is_file_epoll(struct file *f)
+{
+ return f->f_op == &eventpoll_fops;
+}

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -414,6 +438,82 @@
put_cpu();
}

+/**
+ * ep_poll_file_proc - Callback passed to @ep_call_nested() when calling
+ * @f_op->poll() on an epoll file descriptor.
+ *
+ * @priv: Pointer to an @ep_poll_file_data structure.
+ * @cookie: Cookie passed to the @ep_call_nested() API.
+ * @call_nests: Current value of nested calls, during the @ep_call_nested()
+ * processing.
+ *
+ * Returns: Returns the value of the wrapped @f_op->poll() call.
+ */
+static int ep_poll_file_proc(void *priv, void *cookie, int call_nests)
+{
+ struct ep_poll_file_data *epfd = priv;
+
+ return epfd->file->f_op->poll(epfd->file, epfd->pt);
+}
+
+/**
+ * ep_nested_poll_file - Uses the @ep_call_nested() API to call a file
+ * pointer @f_op->poll() function.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @file: Pointer to the file of which we need to call the @f_op->poll()
+ * function.
+ * @pt: Pointer to the @poll_table_struct structure to be passed to
+ * the @f_op->poll() call.
+ *
+ * Returns: Returns the ready flags returned by the file's @f_op->poll() API.
+ */
+static unsigned int ep_nested_poll_file(struct eventpoll *ep, struct file *file,
+ struct poll_table_struct *pt)
+{
+ int events;
+ struct ep_poll_file_data epfd;
+
+ /*
+ * This is merely a call to file->f_op->poll() under
+ * ep_call_nested. This shares a nested_calls struct with
+ * ep_eventpoll_poll in order to prevent other sites that call
+ * ->f_op->poll from recursing back into this file and deadlocking.
+ */
+ epfd.file = file;
+ epfd.pt = pt;
+ events = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
+ ep_poll_file_proc, &epfd, ep, current);
+
+ return events != -1 ? events : 0;
+}
+
+/**
+ * ep_poll_file - Calls the file's @f_op->poll() callback, possibly
+ * using the @ep_call_nested() API if the target file is
+ * an epoll file.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @file: Pointer to the file of which we need to call the @f_op->poll()
+ * function.
+ * @pt: Pointer to the @poll_table_struct structure to be passed to
+ * the @f_op->poll() call.
+ *
+ * Returns: Returns the ready flags returned by the file's @f_op->poll() API.
+ */
+static inline unsigned int ep_poll_file(struct eventpoll *ep, struct file *file,
+ struct poll_table_struct *pt)
+{
+ /*
+ * Optimize for the common path. Most of the target files are not epoll
+ * file descriptors.
+ */
+ if (unlikely(is_file_epoll(file)))
+ return ep_nested_poll_file(ep, file, pt);
+
+ return file->f_op->poll(file, pt);
+}
+
/*
* This function unregisters poll callbacks from the associated file
* descriptor. Must be called with "mtx" held (or "epmutex" if called from
@@ -652,7 +752,7 @@

static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
- int pollflags;
+ int events;
struct eventpoll *ep = file->private_data;

/* Insert inside our poll wait queue */
@@ -664,23 +764,10 @@
* supervision, since the call to f_op->poll() done on listed files
* could re-enter here.
*/
- pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
- ep_poll_readyevents_proc, ep, ep, current);
-
- return pollflags != -1 ? pollflags : 0;
-}
+ events = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
+ ep_poll_readyevents_proc, ep, ep, current);

-/* File callbacks that implement the eventpoll file behaviour */
-static const struct file_operations eventpoll_fops = {
- .release = ep_eventpoll_release,
- .poll = ep_eventpoll_poll,
- .llseek = noop_llseek,
-};
-
-/* Fast test to see if the file is an evenpoll file */
-static inline int is_file_epoll(struct file *f)
-{
- return f->f_op == &eventpoll_fops;
+ return events != -1 ? events : 0;
}

/*
@@ -931,7 +1018,7 @@
* this operation completes, the poll callback can start hitting
* the new item.
*/
- revents = tfile->f_op->poll(tfile, &epq.pt);
+ revents = ep_poll_file(ep, tfile, &epq.pt);

/*
* We have to check if something went wrong during the poll wait queue
@@ -1017,7 +1104,7 @@
* Get current event bits. We can safely use the file* here because
* its usage count has been increased by the caller of this function.
*/
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
+ revents = ep_poll_file(ep, epi->ffd.file, NULL);

/*
* If the item is "hot" and it is not registered inside the ready
@@ -1064,7 +1151,7 @@

list_del_init(&epi->rdllink);

- revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+ revents = ep_poll_file(ep, epi->ffd.file, NULL) &
epi->event.events;

/*

2011-02-06 20:57:53

by Davide Libenzi

Subject: Re: [PATCH] epoll: Prevent deadlock through unsafe ->f_op->poll() calls.

On Sat, 5 Feb 2011, Davide Libenzi wrote:

> On Sat, 5 Feb 2011, Nelson Elhage wrote:
>
> > In several places, an epoll fd can call another file's ->f_op->poll() method
> > with ep->mtx held. This is in general unsafe, because that other file could
> > itself be an epoll fd that contains the original epoll fd.
> >
> > The code defends against this possibility in its own ->poll() method using
> > ep_call_nested, but there are several other unsafe calls to ->poll elsewhere
> > that can be made to deadlock. For example, the following simple program causes
> > the call in ep_insert to recursively call the original fd's ->poll, leading to
> > deadlock:
> >
> > ------------------------------8<------------------------------
> > #include <unistd.h>
> > #include <sys/epoll.h>
> >
> > int main(void) {
> >         int e1, e2, p[2];
> >         struct epoll_event evt = {
> >                 .events = EPOLLIN
> >         };
> >
> >         e1 = epoll_create(1);
> >         e2 = epoll_create(2);
> >         pipe(p);
> >
> >         epoll_ctl(e2, EPOLL_CTL_ADD, e1, &evt);
> >         epoll_ctl(e1, EPOLL_CTL_ADD, p[0], &evt);
> >         write(p[1], p, sizeof p);
> >         epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);
> >
> >         return 0;
> > }
> > ------------------------------8<------------------------------
>
> Yuck, true :|
> The nested-call function is heavy, and the patch below invokes it only if the
> file descriptor is an epoll one.
> But that does not fix the problem WRT the send-events proc, even if we
> call the new ep_poll_file().
> At this point, we had better remove the heavy checks from the fast path, and
> perform the check at insertion time, if the inserted fd is an epoll one.
> That way, the price for the check is paid once, and only if the inserted
> fd is an epoll one.

Like the one below. I haven't tested it yet, but it should catch attempts to
create closed loops at insertion time (and only for epoll fds), and it removes
the heavy calls from the fast path.



- Davide


---
fs/eventpoll.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 74 insertions(+)

Index: linux-2.6.mod/fs/eventpoll.c
===================================================================
--- linux-2.6.mod.orig/fs/eventpoll.c 2011-02-06 11:42:38.000000000 -0800
+++ linux-2.6.mod/fs/eventpoll.c 2011-02-06 12:51:27.000000000 -0800
@@ -224,6 +224,9 @@
*/
static DEFINE_MUTEX(epmutex);

+/* Used to check for epoll file descriptor inclusion loops */
+static struct nested_calls poll_loop_ncalls;
+
/* Used for safe wake up implementation */
static struct nested_calls poll_safewake_ncalls;

@@ -1198,6 +1201,62 @@
return res;
}

+/**
+ * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
+ * API, to verify that adding an epoll file inside another
+ * epoll structure, does not violate the constraints, in
+ * terms of closed loops, or too deep chains (which can
+ * result in excessive stack usage).
+ *
+ * @priv: Pointer to the epoll file to be currently checked.
+ * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
+ * data structure pointer.
+ * @call_nests: Current depth of the @ep_call_nested() call stack.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ * structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
+{
+ int error = 0;
+ struct file *file = priv;
+ struct eventpoll *ep = file->private_data;
+ struct rb_node *rbp;
+ struct epitem *epi;
+
+ mutex_lock(&ep->mtx);
+ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ epi = rb_entry(rbp, struct epitem, rbn);
+ if (unlikely(is_file_epoll(epi->ffd.file))) {
+ error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ep_loop_check_proc, epi->ffd.file,
+ ep, current);
+ if (error != 0)
+ break;
+ }
+ }
+ mutex_unlock(&ep->mtx);
+
+ return error;
+}
+
+/**
+ * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
+ * inside another epoll file (represented by @ep) does not create
+ * closed loops or too deep chains.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @file: Pointer to the epoll file to be checked.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ * structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check(struct eventpoll *ep, struct file *file)
+{
+ return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ep_loop_check_proc, file, ep, current);
+}
+
/*
* Open an eventpoll file descriptor.
*/
@@ -1287,6 +1346,15 @@
*/
ep = file->private_data;

+ /*
+ * When we insert an epoll file descriptor inside another epoll file
+ * descriptor, there is the chance of creating closed loops, which are
+ * better handled here than in more critical paths.
+ */
+ if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD &&
+ ep_loop_check(ep, tfile) != 0))
+ goto error_tgt_fput;
+
mutex_lock(&ep->mtx);

/*
@@ -1441,6 +1509,12 @@
EP_ITEM_COST;
BUG_ON(max_user_watches < 0);

+ /*
+ * Initialize the structure used to perform epoll file descriptor
+ * inclusion loops checks.
+ */
+ ep_nested_calls_init(&poll_loop_ncalls);
+
/* Initialize the structure used to perform safe poll wait head wake ups */
ep_nested_calls_init(&poll_safewake_ncalls);

2011-02-06 22:45:14

by Nelson Elhage

Subject: Re: [PATCH] epoll: Prevent deadlock through unsafe ->f_op->poll() calls.

Suppose thread A does epoll_ctl(e1, EPOLL_CTL_ADD, e2, evt) concurrently with
thread B doing epoll_ctl(e2, EPOLL_CTL_ADD, e1, evt).

Since you do the recursive scan before grabbing ep->mtx, I think there is
nothing to prevent the two scans from both succeeding, followed by the two
inserts, so you end up with a cycle anyway.

Of course, if you move the scan after acquiring ep->mtx, then the threads would
be grabbing them in opposite orders, and you'd have an AB/BA deadlock.
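
[To make the race concrete, here is a minimal sketch of the two competing
inserts: a hypothetical pthread test harness, not part of any patch in this
thread, and whether the cycle actually forms depends on scheduling.]

------------------------------8<------------------------------
#include <pthread.h>
#include <sys/epoll.h>

/* Each thread adds fds[1] into the epoll set fds[0]. */
static void *add_one(void *arg)
{
        int *fds = arg;
        struct epoll_event evt = { .events = EPOLLIN };

        epoll_ctl(fds[0], EPOLL_CTL_ADD, fds[1], &evt);
        return 0;
}

int main(void)
{
        pthread_t a, b;
        int e1 = epoll_create(1);
        int e2 = epoll_create(1);
        int ab[2] = { e1, e2 }, ba[2] = { e2, e1 };

        /*
         * If both loop scans run before either insert takes place, both
         * epoll_ctl() calls can succeed and e1/e2 end up containing each
         * other: a cycle the unserialized check failed to prevent.
         */
        pthread_create(&a, 0, add_one, ab);
        pthread_create(&b, 0, add_one, ba);
        pthread_join(a, 0);
        pthread_join(b, 0);
        return 0;
}
------------------------------8<------------------------------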

One possibility is grabbing epmutex, or another global mutex, for both the scan
and the duration of inserting one epoll fd onto another. That's really
heavyweight, but maybe inserting an epoll fd onto another epoll fd is rare
enough that it's Ok.

- Nelson

On Sun, Feb 06, 2011 at 12:56:17PM -0800, Davide Libenzi wrote:
> On Sat, 5 Feb 2011, Davide Libenzi wrote:
>
> > On Sat, 5 Feb 2011, Nelson Elhage wrote:
> >
> > > In several places, an epoll fd can call another file's ->f_op->poll() method
> > > with ep->mtx held. This is in general unsafe, because that other file could
> > > itself be an epoll fd that contains the original epoll fd.
> > >
> > > The code defends against this possibility in its own ->poll() method using
> > > ep_call_nested, but there are several other unsafe calls to ->poll elsewhere
> > > that can be made to deadlock. For example, the following simple program causes
> > > the call in ep_insert to recursively call the original fd's ->poll, leading to
> > > deadlock:
> > >
> > > ------------------------------8<------------------------------
> > > #include <unistd.h>
> > > #include <sys/epoll.h>
> > >
> > > int main(void) {
> > >         int e1, e2, p[2];
> > >         struct epoll_event evt = {
> > >                 .events = EPOLLIN
> > >         };
> > >
> > >         e1 = epoll_create(1);
> > >         e2 = epoll_create(2);
> > >         pipe(p);
> > >
> > >         epoll_ctl(e2, EPOLL_CTL_ADD, e1, &evt);
> > >         epoll_ctl(e1, EPOLL_CTL_ADD, p[0], &evt);
> > >         write(p[1], p, sizeof p);
> > >         epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);
> > >
> > >         return 0;
> > > }
> > > ------------------------------8<------------------------------
> >
> > Yuck, true :|
> > The nested-call function is heavy, and the patch below invokes it only if the
> > file descriptor is an epoll one.
> > But that does not fix the problem WRT the send-events proc, even if we
> > call the new ep_poll_file().
> > At this point, we had better remove the heavy checks from the fast path, and
> > perform the check at insertion time, if the inserted fd is an epoll one.
> > That way, the price for the check is paid once, and only if the inserted
> > fd is an epoll one.
>
> Like the one below. I haven't tested it yet, but it should catch attempts to
> create closed loops at insertion time (and only for epoll fds), and it removes
> the heavy calls from the fast path.
>
>
>
> - Davide
>
>
> ---
> fs/eventpoll.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 74 insertions(+)
>
> Index: linux-2.6.mod/fs/eventpoll.c
> ===================================================================
> --- linux-2.6.mod.orig/fs/eventpoll.c 2011-02-06 11:42:38.000000000 -0800
> +++ linux-2.6.mod/fs/eventpoll.c 2011-02-06 12:51:27.000000000 -0800
> @@ -224,6 +224,9 @@
> */
> static DEFINE_MUTEX(epmutex);
>
> +/* Used to check for epoll file descriptor inclusion loops */
> +static struct nested_calls poll_loop_ncalls;
> +
> /* Used for safe wake up implementation */
> static struct nested_calls poll_safewake_ncalls;
>
> @@ -1198,6 +1201,62 @@
> return res;
> }
>
> +/**
> + * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
> + * API, to verify that adding an epoll file inside another
> + * epoll structure, does not violate the constraints, in
> + * terms of closed loops, or too deep chains (which can
> + * result in excessive stack usage).
> + *
> + * @priv: Pointer to the epoll file to be currently checked.
> + * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
> + * data structure pointer.
> + * @call_nests: Current depth of the @ep_call_nested() call stack.
> + *
> + * Returns: Returns zero if adding the epoll @file inside current epoll
> + * structure @ep does not violate the constraints, or -1 otherwise.
> + */
> +static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
> +{
> + int error = 0;
> + struct file *file = priv;
> + struct eventpoll *ep = file->private_data;
> + struct rb_node *rbp;
> + struct epitem *epi;
> +
> + mutex_lock(&ep->mtx);
> + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> + epi = rb_entry(rbp, struct epitem, rbn);
> + if (unlikely(is_file_epoll(epi->ffd.file))) {
> + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
> + ep_loop_check_proc, epi->ffd.file,
> + ep, current);
> + if (error != 0)
> + break;
> + }
> + }
> + mutex_unlock(&ep->mtx);
> +
> + return error;
> +}
> +
> +/**
> + * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
> + * inside another epoll file (represented by @ep) does not create
> + * closed loops or too deep chains.
> + *
> + * @ep: Pointer to the epoll private data structure.
> + * @file: Pointer to the epoll file to be checked.
> + *
> + * Returns: Returns zero if adding the epoll @file inside current epoll
> + * structure @ep does not violate the constraints, or -1 otherwise.
> + */
> +static int ep_loop_check(struct eventpoll *ep, struct file *file)
> +{
> + return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
> + ep_loop_check_proc, file, ep, current);
> +}
> +
> /*
> * Open an eventpoll file descriptor.
> */
> @@ -1287,6 +1346,15 @@
> */
> ep = file->private_data;
>
> + /*
> + * When we insert an epoll file descriptor inside another epoll file
> + * descriptor, there is the chance of creating closed loops, which are
> + * better handled here than in more critical paths.
> + */
> + if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD &&
> + ep_loop_check(ep, tfile) != 0))
> + goto error_tgt_fput;
> +
> mutex_lock(&ep->mtx);
>
> /*
> @@ -1441,6 +1509,12 @@
> EP_ITEM_COST;
> BUG_ON(max_user_watches < 0);
>
> + /*
> + * Initialize the structure used to perform epoll file descriptor
> + * inclusion loops checks.
> + */
> + ep_nested_calls_init(&poll_loop_ncalls);
> +
> /* Initialize the structure used to perform safe poll wait head wake ups */
> ep_nested_calls_init(&poll_safewake_ncalls);
>

2011-02-06 23:25:22

by Davide Libenzi

Subject: Re: [PATCH] epoll: Prevent deadlock through unsafe ->f_op->poll() calls.

On Sun, 6 Feb 2011, Nelson Elhage wrote:

> Suppose thread A does epoll_ctl(e1, EPOLL_CTL_ADD, e2, evt) concurrently with
> thread B doing epoll_ctl(e2, EPOLL_CTL_ADD, e1, evt).
>
> Since you do the recursive scan before grabbing ep->mtx, I think there is
> nothing to prevent the two scans from both succeeding, followed by the two
> inserts, so you end up with a cycle anyway.

Good point!


> One possibility is grabbing epmutex, or another global mutex, for both the scan
> and the duration of inserting one epoll fd onto another. That's really
> heavyweight, but maybe inserting an epoll fd onto another epoll fd is rare
> enough that it's Ok.

Yes, that'd be fine, since that is not a fast path.


Signed-off-by: Davide Libenzi <[email protected]>


- Davide


---
fs/eventpoll.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 94 insertions(+), 2 deletions(-)

Index: linux-2.6.mod/fs/eventpoll.c
===================================================================
--- linux-2.6.mod.orig/fs/eventpoll.c 2011-02-06 11:42:38.000000000 -0800
+++ linux-2.6.mod/fs/eventpoll.c 2011-02-06 15:17:25.000000000 -0800
@@ -224,6 +224,9 @@
*/
static DEFINE_MUTEX(epmutex);

+/* Used to check for epoll file descriptor inclusion loops */
+static struct nested_calls poll_loop_ncalls;
+
/* Used for safe wake up implementation */
static struct nested_calls poll_safewake_ncalls;

@@ -592,7 +595,6 @@
*/
for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
-
ep_unregister_pollwait(ep, epi);
}

@@ -711,7 +713,6 @@

while (!list_empty(lsthead)) {
epi = list_first_entry(lsthead, struct epitem, fllink);
-
ep = epi->ep;
list_del_init(&epi->fllink);
mutex_lock(&ep->mtx);
@@ -1198,6 +1199,82 @@
return res;
}

+/**
+ * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
+ * API, to verify that adding an epoll file inside another
+ * epoll structure, does not violate the constraints, in
+ * terms of closed loops, or too deep chains (which can
+ * result in excessive stack usage).
+ *
+ * @priv: Pointer to the epoll file to be currently checked.
+ * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
+ * data structure pointer.
+ * @call_nests: Current depth of the @ep_call_nested() call stack.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ * structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
+{
+ int error = 0;
+ struct file *file = priv;
+ struct eventpoll *ep = file->private_data;
+ struct rb_node *rbp;
+ struct epitem *epi;
+
+ mutex_lock(&ep->mtx);
+ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ epi = rb_entry(rbp, struct epitem, rbn);
+ if (unlikely(is_file_epoll(epi->ffd.file))) {
+ error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ep_loop_check_proc, epi->ffd.file,
+ ep, current);
+ if (error != 0)
+ break;
+ }
+ }
+ mutex_unlock(&ep->mtx);
+
+ return error;
+}
+
+/**
+ * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
+ * inside another epoll file (represented by @ep) does not create
+ * closed loops or too deep chains.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @file: Pointer to the epoll file to be checked.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ * structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check(struct eventpoll *ep, struct file *file)
+{
+ int error;
+
+ /*
+ * We need to grab the @epmutex lock before doing this, because two
+ * concurrent @ep_loop_check() calls might be going on, resulting in
+ * a closed loop due to both @ep_loop_check() invocations succeeding:
+ *
+ * e1 = epoll_create();
+ * e2 = epoll_create();
+ *
+ * THREAD-0 THREAD-1
+ *
+ * epoll_ctl(e1, EPOLL_CTL_ADD, e2) epoll_ctl(e2, EPOLL_CTL_ADD, e1)
+ *
+ */
+ mutex_lock(&epmutex);
+ error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ep_loop_check_proc, file, ep, current);
+ mutex_unlock(&epmutex);
+
+ return error;
+}
+
/*
* Open an eventpoll file descriptor.
*/
@@ -1287,6 +1364,15 @@
*/
ep = file->private_data;

+ /*
+ * When we insert an epoll file descriptor inside another epoll file
+ * descriptor, there is the chance of creating closed loops, which are
+ * better handled here than in more critical paths.
+ */
+ if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD &&
+ ep_loop_check(ep, tfile) != 0))
+ goto error_tgt_fput;
+
mutex_lock(&ep->mtx);

/*
@@ -1441,6 +1527,12 @@
EP_ITEM_COST;
BUG_ON(max_user_watches < 0);

+ /*
+ * Initialize the structure used to perform epoll file descriptor
+ * inclusion loops checks.
+ */
+ ep_nested_calls_init(&poll_loop_ncalls);
+
/* Initialize the structure used to perform safe poll wait head wake ups */
ep_nested_calls_init(&poll_safewake_ncalls);

2011-02-07 23:15:47

by Nelson Elhage

Subject: Re: [PATCH] epoll: Prevent deadlock through unsafe ->f_op->poll() calls.

The point of grabbing epmutex is to make sure that the loop-check and the insert
happen atomically with respect to any other such pair of operations, so you need
to hold epmutex across both the check and the insert. Something like the
following. It's not particularly pretty, but it seems to work.

I also had to change the "cookie" value in ep_loop_check_proc from 'ep' to
'epi->ffd.file->private_data'; I think this is correct -- you want to avoid
visiting the same _containing_ 'struct eventpoll' more than once. Without that
fix, it doesn't detect the loop in my test case.

This is minimally tested.

---
fs/eventpoll.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 88 insertions(+), 0 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cc8a9b7..8a05f33 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -224,6 +224,9 @@ static long max_user_watches __read_mostly;
*/
static DEFINE_MUTEX(epmutex);

+/* Used to check for epoll file descriptor inclusion loops */
+static struct nested_calls poll_loop_ncalls;
+
/* Used for safe wake up implementation */
static struct nested_calls poll_safewake_ncalls;

@@ -1188,6 +1191,62 @@ retry:
return res;
}

+/**
+ * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
+ * API, to verify that adding an epoll file inside another
+ * epoll structure, does not violate the constraints, in
+ * terms of closed loops, or too deep chains (which can
+ * result in excessive stack usage).
+ *
+ * @priv: Pointer to the epoll file to be currently checked.
+ * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
+ * data structure pointer.
+ * @call_nests: Current depth of the @ep_call_nested() call stack.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ * structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
+{
+ int error = 0;
+ struct file *file = priv;
+ struct eventpoll *ep = file->private_data;
+ struct rb_node *rbp;
+ struct epitem *epi;
+
+ mutex_lock(&ep->mtx);
+ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ epi = rb_entry(rbp, struct epitem, rbn);
+ if (unlikely(is_file_epoll(epi->ffd.file))) {
+ error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ep_loop_check_proc, epi->ffd.file,
+ epi->ffd.file->private_data, current);
+ if (error != 0)
+ break;
+ }
+ }
+ mutex_unlock(&ep->mtx);
+
+ return error;
+}
+
+/**
+ * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
+ * inside another epoll file (represented by @ep) does not create
+ * closed loops or too deep chains.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @file: Pointer to the epoll file to be checked.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ * structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check(struct eventpoll *ep, struct file *file)
+{
+ return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ep_loop_check_proc, file, ep, current);
+}
+
/*
* Open an eventpoll file descriptor.
*/
@@ -1236,6 +1295,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
int error;
+ int did_lock_epmutex = 0;
struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *epi;
@@ -1277,6 +1337,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
*/
ep = file->private_data;

+ /*
+ * When we insert an epoll file descriptor inside another epoll file
+ * descriptor, there is the chance of creating closed loops, which are
+ * better handled here than in more critical paths.
+ *
+ * We hold epmutex across the loop check and the insert in this case, in
+ * order to prevent two separate inserts from racing and each doing the
+ * insert "at the same time" such that ep_loop_check passes on both
+ * before either one does the insert, thereby creating a cycle.
+ */
+ if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+ mutex_lock(&epmutex);
+ did_lock_epmutex = 1;
+ error = -ELOOP;
+ if (ep_loop_check(ep, tfile) != 0)
+ goto error_tgt_fput;
+ }
+
+
mutex_lock(&ep->mtx);

/*
@@ -1312,6 +1391,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
mutex_unlock(&ep->mtx);

error_tgt_fput:
+ if (did_lock_epmutex)
+ mutex_unlock(&epmutex);
+
fput(tfile);
error_fput:
fput(file);
@@ -1431,6 +1513,12 @@ static int __init eventpoll_init(void)
EP_ITEM_COST;
BUG_ON(max_user_watches < 0);

+ /*
+ * Initialize the structure used to perform epoll file descriptor
+ * inclusion loops checks.
+ */
+ ep_nested_calls_init(&poll_loop_ncalls);
+
/* Initialize the structure used to perform safe poll wait head wake ups */
ep_nested_calls_init(&poll_safewake_ncalls);

--
1.7.2.43.g68ef4

2011-02-11 01:32:02

by Davide Libenzi

Subject: Re: [PATCH] epoll: Prevent deadlock through unsafe ->f_op->poll() calls.

On Mon, 7 Feb 2011, Nelson Elhage wrote:

> The point of grabbing epmutex is to make sure that the loop-check and the insert
> happen atomically with respect to any other such pair of operations, so you need
> to hold epmutex across both the check and the insert. Something like the
> following. It's not particularly pretty, but it seems to work.

Right, we need to hold epmutex, and this is getting uglier than I thought :|


> I also had to change the "cookie" value in ep_loop_check_proc from 'ep' to
> 'epi->ffd.file->private_data'; I think this is correct -- you want to avoid
> visiting the same _containing_ 'struct eventpoll' more than once. Without that
> fix, it doesn't detect the loop in my test case.

Also, an 'unlikely' on that 'if (did_lock_epmutex)' is due.
It probably deserves a few lines in the top comment, where we
describe the locking.
Do you mind posting a revised patch, while I go buy a brown bag to help
me get over these hacks?
Having said NO to the 'epoll inside epoll' thing would have saved us a lot
of headaches.



> This is minimally tested.
>
> ---
> fs/eventpoll.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 files changed, 88 insertions(+), 0 deletions(-)
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index cc8a9b7..8a05f33 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -224,6 +224,9 @@ static long max_user_watches __read_mostly;
> */
> static DEFINE_MUTEX(epmutex);
>
> +/* Used to check for epoll file descriptor inclusion loops */
> +static struct nested_calls poll_loop_ncalls;
> +
> /* Used for safe wake up implementation */
> static struct nested_calls poll_safewake_ncalls;
>
> @@ -1188,6 +1191,62 @@ retry:
> return res;
> }
>
> +/**
> + * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
> + * API, to verify that adding an epoll file inside another
> + * epoll structure, does not violate the constraints, in
> + * terms of closed loops, or too deep chains (which can
> + * result in excessive stack usage).
> + *
> + * @priv: Pointer to the epoll file to be currently checked.
> + * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
> + * data structure pointer.
> + * @call_nests: Current depth of the @ep_call_nested() call stack.
> + *
> + * Returns: Returns zero if adding the epoll @file inside current epoll
> + * structure @ep does not violate the constraints, or -1 otherwise.
> + */
> +static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
> +{
> + int error = 0;
> + struct file *file = priv;
> + struct eventpoll *ep = file->private_data;
> + struct rb_node *rbp;
> + struct epitem *epi;
> +
> + mutex_lock(&ep->mtx);
> + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> + epi = rb_entry(rbp, struct epitem, rbn);
> + if (unlikely(is_file_epoll(epi->ffd.file))) {
> + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
> + ep_loop_check_proc, epi->ffd.file,
> + epi->ffd.file->private_data, current);
> + if (error != 0)
> + break;
> + }
> + }
> + mutex_unlock(&ep->mtx);
> +
> + return error;
> +}
> +
> +/**
> + * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
> + * inside another epoll file (represented by @ep) does not create
> + * closed loops or too deep chains.
> + *
> + * @ep: Pointer to the epoll private data structure.
> + * @file: Pointer to the epoll file to be checked.
> + *
> + * Returns: Returns zero if adding the epoll @file inside current epoll
> + * structure @ep does not violate the constraints, or -1 otherwise.
> + */
> +static int ep_loop_check(struct eventpoll *ep, struct file *file)
> +{
> + return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
> + ep_loop_check_proc, file, ep, current);
> +}
> +
> /*
> * Open an eventpoll file descriptor.
> */
> @@ -1236,6 +1295,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
> struct epoll_event __user *, event)
> {
> int error;
> + int did_lock_epmutex = 0;
> struct file *file, *tfile;
> struct eventpoll *ep;
> struct epitem *epi;
> @@ -1277,6 +1337,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
> */
> ep = file->private_data;
>
> + /*
> + * When we insert an epoll file descriptor inside another epoll file
> + * descriptor, there is the chance of creating closed loops, which are
> + * better handled here than in more critical paths.
> + *
> + * We hold epmutex across the loop check and the insert in this case, in
> + * order to prevent two separate inserts from racing and each doing the
> + * insert "at the same time" such that ep_loop_check passes on both
> + * before either one does the insert, thereby creating a cycle.
> + */
> + if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
> + mutex_lock(&epmutex);
> + did_lock_epmutex = 1;
> + error = -ELOOP;
> + if (ep_loop_check(ep, tfile) != 0)
> + goto error_tgt_fput;
> + }
> +
> +
> mutex_lock(&ep->mtx);
>
> /*
> @@ -1312,6 +1391,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
> mutex_unlock(&ep->mtx);
>
> error_tgt_fput:
> + if (did_lock_epmutex)
> + mutex_unlock(&epmutex);
> +
> fput(tfile);
> error_fput:
> fput(file);
> @@ -1431,6 +1513,12 @@ static int __init eventpoll_init(void)
> EP_ITEM_COST;
> BUG_ON(max_user_watches < 0);
>
> + /*
> + * Initialize the structure used to perform epoll file descriptor
> + * inclusion loops checks.
> + */
> + ep_nested_calls_init(&poll_loop_ncalls);
> +
> /* Initialize the structure used to perform safe poll wait head wake ups */
> ep_nested_calls_init(&poll_safewake_ncalls);
>
> --
> 1.7.2.43.g68ef4
>


- Davide

2011-02-12 19:05:36

by Nelson Elhage

Subject: [PATCH] epoll: Prevent creating circular epoll structures.

From: Davide Libenzi <[email protected]>

On insert, check whether the inserted file is itself a struct epoll, and if so,
do a recursive walk to detect whether inserting this file would create a loop of
epoll structures, which could lead to deadlock.

Signed-off-by: Davide Libenzi <[email protected]>
[[email protected]: Use epmutex to serialize concurrent inserts]
Signed-off-by: Nelson Elhage <[email protected]>
---
fs/eventpoll.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 95 insertions(+), 0 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cc8a9b7..d159cb7 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -63,6 +63,13 @@
* cleanup path and it is also acquired by eventpoll_release_file()
* if a file has been pushed inside an epoll set and it is then
* close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
+ * It is also acquired when inserting an epoll fd onto another epoll
+ * fd. We do this so that we walk the epoll tree and ensure that this
+ * insertion does not create a cycle of epoll file descriptors, which
+ * could lead to deadlock. We need a global mutex to prevent two
+ * simultaneous inserts (A into B and B into A) from racing and
+ * constructing a cycle without either insert observing that it is
+ * going to.
* It is possible to drop the "ep->mtx" and to use the global
* mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
@@ -224,6 +231,9 @@ static long max_user_watches __read_mostly;
*/
static DEFINE_MUTEX(epmutex);

+/* Used to check for epoll file descriptor inclusion loops */
+static struct nested_calls poll_loop_ncalls;
+
/* Used for safe wake up implementation */
static struct nested_calls poll_safewake_ncalls;

@@ -1188,6 +1198,62 @@ retry:
return res;
}

+/**
+ * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
+ * API, to verify that adding an epoll file inside another
+ * epoll structure, does not violate the constraints, in
+ * terms of closed loops, or too deep chains (which can
+ * result in excessive stack usage).
+ *
+ * @priv: Pointer to the epoll file to be currently checked.
+ * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
+ * data structure pointer.
+ * @call_nests: Current depth of the @ep_call_nested() call stack.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ * structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
+{
+ int error = 0;
+ struct file *file = priv;
+ struct eventpoll *ep = file->private_data;
+ struct rb_node *rbp;
+ struct epitem *epi;
+
+ mutex_lock(&ep->mtx);
+ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ epi = rb_entry(rbp, struct epitem, rbn);
+ if (unlikely(is_file_epoll(epi->ffd.file))) {
+ error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ep_loop_check_proc, epi->ffd.file,
+ epi->ffd.file->private_data, current);
+ if (error != 0)
+ break;
+ }
+ }
+ mutex_unlock(&ep->mtx);
+
+ return error;
+}
+
+/**
+ * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
+ * inside another epoll file (represented by @ep) does not create
+ * closed loops or too deep chains.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @file: Pointer to the epoll file to be checked.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ * structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check(struct eventpoll *ep, struct file *file)
+{
+ return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ep_loop_check_proc, file, ep, current);
+}
+
/*
* Open an eventpoll file descriptor.
*/
@@ -1236,6 +1302,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
int error;
+ int did_lock_epmutex = 0;
struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *epi;
@@ -1277,6 +1344,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
*/
ep = file->private_data;

+ /*
+ * When we insert an epoll file descriptor inside another epoll file
+ * descriptor, there is the chance of creating closed loops, which are
+ * better handled here than in more critical paths.
+ *
+ * We hold epmutex across the loop check and the insert in this case, in
+ * order to prevent two separate inserts from racing and each doing the
+ * insert "at the same time" such that ep_loop_check passes on both
+ * before either one does the insert, thereby creating a cycle.
+ */
+ if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+ mutex_lock(&epmutex);
+ did_lock_epmutex = 1;
+ error = -ELOOP;
+ if (ep_loop_check(ep, tfile) != 0)
+ goto error_tgt_fput;
+ }
+
+
mutex_lock(&ep->mtx);

/*
@@ -1312,6 +1398,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
mutex_unlock(&ep->mtx);

error_tgt_fput:
+ if (unlikely(did_lock_epmutex))
+ mutex_unlock(&epmutex);
+
fput(tfile);
error_fput:
fput(file);
@@ -1431,6 +1520,12 @@ static int __init eventpoll_init(void)
EP_ITEM_COST;
BUG_ON(max_user_watches < 0);

+ /*
+ * Initialize the structure used to perform epoll file descriptor
+ * inclusion loops checks.
+ */
+ ep_nested_calls_init(&poll_loop_ncalls);
+
/* Initialize the structure used to perform safe poll wait head wake ups */
ep_nested_calls_init(&poll_safewake_ncalls);

--
1.7.2.43.g36c08.dirty
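
[With this patch applied, closing an epoll loop should fail with -1/ELOOP
instead of deadlocking. A small hypothetical check of the new behaviour, not
posted in the thread:]

------------------------------8<------------------------------
#include <errno.h>
#include <stdio.h>
#include <sys/epoll.h>

int main(void)
{
        int e1 = epoll_create(1);
        int e2 = epoll_create(1);
        struct epoll_event evt = { .events = EPOLLIN };

        epoll_ctl(e2, EPOLL_CTL_ADD, e1, &evt);

        /* Closing the loop must now be rejected, not deadlock. */
        if (epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt) == -1 && errno == ELOOP)
                printf("loop rejected with ELOOP\n");

        return 0;
}
------------------------------8<------------------------------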

2011-02-14 21:56:08

by Davide Libenzi

Subject: Re: [PATCH] epoll: Prevent creating circular epoll structures.

On Sat, 12 Feb 2011, Nelson Elhage wrote:

> From: Davide Libenzi <[email protected]>
>
> On insert, check whether the inserted file is itself a struct epoll, and if so,
> do a recursive walk to detect whether inserting this file would create a loop of
> epoll structures, which could lead to deadlock.

Can't say I love it, but IMO it's the lesser damage compared to adding custom
checks all over the fast paths.


Acked-by: Davide Libenzi <[email protected]>



> Signed-off-by: Davide Libenzi <[email protected]>
> [[email protected]: Use epmutex to serialize concurrent inserts]
> Signed-off-by: Nelson Elhage <[email protected]>
> ---
> fs/eventpoll.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 files changed, 95 insertions(+), 0 deletions(-)
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index cc8a9b7..d159cb7 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -63,6 +63,13 @@
> * cleanup path and it is also acquired by eventpoll_release_file()
> * if a file has been pushed inside an epoll set and it is then
> * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
> + * It is also acquired when inserting an epoll fd onto another epoll
> + * fd. We do this so that we walk the epoll tree and ensure that this
> + * insertion does not create a cycle of epoll file descriptors, which
> + * could lead to deadlock. We need a global mutex to prevent two
> + * simultaneous inserts (A into B and B into A) from racing and
> + * constructing a cycle without either insert observing that it is
> + * going to.
> * It is possible to drop the "ep->mtx" and to use the global
> * mutex "epmutex" (together with "ep->lock") to have it working,
> * but having "ep->mtx" will make the interface more scalable.
> @@ -224,6 +231,9 @@ static long max_user_watches __read_mostly;
> */
> static DEFINE_MUTEX(epmutex);
>
> +/* Used to check for epoll file descriptor inclusion loops */
> +static struct nested_calls poll_loop_ncalls;
> +
> /* Used for safe wake up implementation */
> static struct nested_calls poll_safewake_ncalls;
>
> @@ -1188,6 +1198,62 @@ retry:
> return res;
> }
>
> +/**
> + * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
> + * API, to verify that adding an epoll file inside another
> + * epoll structure, does not violate the constraints, in
> + * terms of closed loops, or too deep chains (which can
> + * result in excessive stack usage).
> + *
> + * @priv: Pointer to the epoll file to be currently checked.
> + * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
> + * data structure pointer.
> + * @call_nests: Current depth of the @ep_call_nested() call stack.
> + *
> + * Returns: Returns zero if adding the epoll @file inside current epoll
> + * structure @ep does not violate the constraints, or -1 otherwise.
> + */
> +static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
> +{
> + int error = 0;
> + struct file *file = priv;
> + struct eventpoll *ep = file->private_data;
> + struct rb_node *rbp;
> + struct epitem *epi;
> +
> + mutex_lock(&ep->mtx);
> + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> + epi = rb_entry(rbp, struct epitem, rbn);
> + if (unlikely(is_file_epoll(epi->ffd.file))) {
> + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
> + ep_loop_check_proc, epi->ffd.file,
> + epi->ffd.file->private_data, current);
> + if (error != 0)
> + break;
> + }
> + }
> + mutex_unlock(&ep->mtx);
> +
> + return error;
> +}
> +
> +/**
> + * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
> + * inside another epoll file (represented by @ep) does not create
> + * closed loops or too deep chains.
> + *
> + * @ep: Pointer to the epoll private data structure.
> + * @file: Pointer to the epoll file to be checked.
> + *
> + * Returns: Returns zero if adding the epoll @file inside current epoll
> + * structure @ep does not violate the constraints, or -1 otherwise.
> + */
> +static int ep_loop_check(struct eventpoll *ep, struct file *file)
> +{
> + return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
> + ep_loop_check_proc, file, ep, current);
> +}
> +
> /*
> * Open an eventpoll file descriptor.
> */
> @@ -1236,6 +1302,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
> struct epoll_event __user *, event)
> {
> int error;
> + int did_lock_epmutex = 0;
> struct file *file, *tfile;
> struct eventpoll *ep;
> struct epitem *epi;
> @@ -1277,6 +1344,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
> */
> ep = file->private_data;
>
> + /*
> + * When we insert an epoll file descriptor inside another epoll file
> + * descriptor, there is the chance of creating closed loops, which are
> + * better handled here than in more critical paths.
> + *
> + * We hold epmutex across the loop check and the insert in this case, in
> + * order to prevent two separate inserts from racing and each doing the
> + * insert "at the same time" such that ep_loop_check passes on both
> + * before either one does the insert, thereby creating a cycle.
> + */
> + if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
> + mutex_lock(&epmutex);
> + did_lock_epmutex = 1;
> + error = -ELOOP;
> + if (ep_loop_check(ep, tfile) != 0)
> + goto error_tgt_fput;
> + }
> +
> +
> mutex_lock(&ep->mtx);
>
> /*
> @@ -1312,6 +1398,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
> mutex_unlock(&ep->mtx);
>
> error_tgt_fput:
> + if (unlikely(did_lock_epmutex))
> + mutex_unlock(&epmutex);
> +
> fput(tfile);
> error_fput:
> fput(file);
> @@ -1431,6 +1520,12 @@ static int __init eventpoll_init(void)
> EP_ITEM_COST;
> BUG_ON(max_user_watches < 0);
>
> + /*
> + * Initialize the structure used to perform epoll file descriptor
> + * inclusion loops checks.
> + */
> + ep_nested_calls_init(&poll_loop_ncalls);
> +
> /* Initialize the structure used to perform safe poll wait head wake ups */
> ep_nested_calls_init(&poll_safewake_ncalls);
>
> --
> 1.7.2.43.g36c08.dirty
>
>


- Davide

2011-02-14 23:46:06

by Andrew Morton

Subject: Re: [PATCH] epoll: Prevent creating circular epoll structures.

On Mon, 14 Feb 2011 13:56:01 -0800 (PST)
Davide Libenzi <[email protected]> wrote:

> On Sat, 12 Feb 2011, Nelson Elhage wrote:
>
> > From: Davide Libenzi <[email protected]>
> >
> > On insert, check whether the inserted file is itself a struct epoll, and if so,
> > do a recursive walk to detect whether inserting this file would create a loop of
> > epoll structures, which could lead to deadlock.
>
> Can't say I love it, but IMO it's the lesser damage compared to adding custom
> checks all over the fast paths.
>
>
> Acked-by: Davide Libenzi <[email protected]>

I beefed up the changelog so that it includes Nelson's original description
of the actual bug. Please don't omit that from changelogs!

I assume we want to backport this into -stable? The patch applies to
2.6.34 and possibly earlier - that's as far as I checked.



From: Davide Libenzi <[email protected]>

In several places, an epoll fd can call another file's ->f_op->poll()
method with ep->mtx held. This is in general unsafe, because that other
file could itself be an epoll fd that contains the original epoll fd.

The code defends against this possibility in its own ->poll() method using
ep_call_nested, but there are several other unsafe calls to ->poll
elsewhere that can be made to deadlock. For example, the following simple
program causes the call in ep_insert to recursively call the original fd's
->poll, leading to deadlock:

#include <unistd.h>
#include <sys/epoll.h>

int main(void) {
        int e1, e2, p[2];
        struct epoll_event evt = {
                .events = EPOLLIN
        };

        e1 = epoll_create(1);
        e2 = epoll_create(2);
        pipe(p);

        epoll_ctl(e2, EPOLL_CTL_ADD, e1, &evt);
        epoll_ctl(e1, EPOLL_CTL_ADD, p[0], &evt);
        write(p[1], p, sizeof p);
        epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);

        return 0;
}



On insertion, check whether the inserted file is itself a struct epoll,
and if so, do a recursive walk to detect whether inserting this file would
create a loop of epoll structures, which could lead to deadlock.

[[email protected]: Use epmutex to serialize concurrent inserts]
Signed-off-by: Davide Libenzi <[email protected]>
Signed-off-by: Nelson Elhage <[email protected]>
Reported-by: Nelson Elhage <[email protected]>
Tested-by: Nelson Elhage <[email protected]>
Cc: <[email protected]> [2.6.34+, possibly earlier]
Signed-off-by: Andrew Morton <[email protected]>
---

fs/eventpoll.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 95 insertions(+)

diff -puN fs/eventpoll.c~epoll-prevent-creating-circular-epoll-structures fs/eventpoll.c
--- a/fs/eventpoll.c~epoll-prevent-creating-circular-epoll-structures
+++ a/fs/eventpoll.c
@@ -63,6 +63,13 @@
* cleanup path and it is also acquired by eventpoll_release_file()
* if a file has been pushed inside an epoll set and it is then
* close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
+ * It is also acquired when inserting an epoll fd onto another epoll
+ * fd. We do this so that we walk the epoll tree and ensure that this
+ * insertion does not create a cycle of epoll file descriptors, which
+ * could lead to deadlock. We need a global mutex to prevent two
+ * simultaneous inserts (A into B and B into A) from racing and
+ * constructing a cycle without either insert observing that it is
+ * going to.
* It is possible to drop the "ep->mtx" and to use the global
* mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
@@ -224,6 +231,9 @@ static long max_user_watches __read_most
*/
static DEFINE_MUTEX(epmutex);

+/* Used to check for epoll file descriptor inclusion loops */
+static struct nested_calls poll_loop_ncalls;
+
/* Used for safe wake up implementation */
static struct nested_calls poll_safewake_ncalls;

@@ -1198,6 +1208,62 @@ retry:
return res;
}

+/**
+ * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
+ * API, to verify that adding an epoll file inside another
+ * epoll structure, does not violate the constraints, in
+ * terms of closed loops, or too deep chains (which can
+ * result in excessive stack usage).
+ *
+ * @priv: Pointer to the epoll file to be currently checked.
+ * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
+ * data structure pointer.
+ * @call_nests: Current depth of the @ep_call_nested() call stack.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ * structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
+{
+ int error = 0;
+ struct file *file = priv;
+ struct eventpoll *ep = file->private_data;
+ struct rb_node *rbp;
+ struct epitem *epi;
+
+ mutex_lock(&ep->mtx);
+ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ epi = rb_entry(rbp, struct epitem, rbn);
+ if (unlikely(is_file_epoll(epi->ffd.file))) {
+ error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ep_loop_check_proc, epi->ffd.file,
+ epi->ffd.file->private_data, current);
+ if (error != 0)
+ break;
+ }
+ }
+ mutex_unlock(&ep->mtx);
+
+ return error;
+}
+
+/**
+ * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
+ * inside another epoll file (represented by @ep) does not create
+ * closed loops or too deep chains.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @file: Pointer to the epoll file to be checked.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ * structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check(struct eventpoll *ep, struct file *file)
+{
+ return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ep_loop_check_proc, file, ep, current);
+}
+
/*
* Open an eventpoll file descriptor.
*/
@@ -1246,6 +1312,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, in
struct epoll_event __user *, event)
{
int error;
+ int did_lock_epmutex = 0;
struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *epi;
@@ -1287,6 +1354,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, in
*/
ep = file->private_data;

+ /*
+ * When we insert an epoll file descriptor inside another epoll file
+ * descriptor, there is the chance of creating closed loops, which are
+ * better handled here than in more critical paths.
+ *
+ * We hold epmutex across the loop check and the insert in this case, in
+ * order to prevent two separate inserts from racing and each doing the
+ * insert "at the same time" such that ep_loop_check passes on both
+ * before either one does the insert, thereby creating a cycle.
+ */
+ if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+ mutex_lock(&epmutex);
+ did_lock_epmutex = 1;
+ error = -ELOOP;
+ if (ep_loop_check(ep, tfile) != 0)
+ goto error_tgt_fput;
+ }
+
+
mutex_lock(&ep->mtx);

/*
@@ -1322,6 +1408,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, in
mutex_unlock(&ep->mtx);

error_tgt_fput:
+ if (unlikely(did_lock_epmutex))
+ mutex_unlock(&epmutex);
+
fput(tfile);
error_fput:
fput(file);
@@ -1441,6 +1530,12 @@ static int __init eventpoll_init(void)
EP_ITEM_COST;
BUG_ON(max_user_watches < 0);

+ /*
+ * Initialize the structure used to perform epoll file descriptor
+ * inclusion loops checks.
+ */
+ ep_nested_calls_init(&poll_loop_ncalls);
+
/* Initialize the structure used to perform safe poll wait head wake ups */
ep_nested_calls_init(&poll_safewake_ncalls);

_