2013-03-02 07:25:10

by Eric Wong

Subject: [PATCH] epoll: preserve ordering of events from ovflist

Events arriving in ovflist are stored in LIFO order, so
we should account for that when inserting them into rdllist.

Signed-off-by: Eric Wong <[email protected]>
Cc: Davide Libenzi <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Andrew Morton <[email protected]>
---
I think the existing behavior can lead to starvation in some rare cases,
but I have not been able to trigger it. The window for ovflist insertion
is tiny.
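
To make the double reversal concrete: ep_poll_callback() pushes each
epitem onto ovflist head-first, so events arriving as A, B, C are walked
as C, B, A. Head-inserting again while draining flips them back. Below
is a minimal userspace sketch of that, not kernel code ("item" and
"push" are made-up stand-ins for epitem and the ovflist prepend):

#include <stdio.h>

struct item {
	const char *name;
	struct item *next;
};

/* prepend, like: epi->next = ep->ovflist; ep->ovflist = epi; */
static struct item *push(struct item *head, struct item *it)
{
	it->next = head;
	return it;
}

int main(void)
{
	struct item a = { "A" }, b = { "B" }, c = { "C" };
	struct item *ovflist = NULL, *rdllist = NULL, *it, *next;

	/* events arrive in order A, B, C; ovflist ends up C -> B -> A */
	ovflist = push(ovflist, &a);
	ovflist = push(ovflist, &b);
	ovflist = push(ovflist, &c);

	/*
	 * drain ovflist into the *head* of rdllist (what list_add does);
	 * the second reversal restores arrival order A -> B -> C
	 */
	for (it = ovflist; it; it = next) {
		next = it->next;
		rdllist = push(rdllist, it);
	}

	for (it = rdllist; it; it = it->next)
		printf("%s\n", it->name);	/* prints A, B, C */
	return 0;
}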

 fs/eventpoll.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9fec183..5a1a596 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -598,7 +598,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 		 * contain them, and the list_splice() below takes care of them.
 		 */
 		if (!ep_is_linked(&epi->rdllink)) {
-			list_add_tail(&epi->rdllink, &ep->rdllist);
+			list_add(&epi->rdllink, &ep->rdllist);
 			__pm_stay_awake(epi->ws);
 		}
 	}
--
Eric Wong


2013-03-04 03:05:03

by Eric Wong

Subject: Re: [PATCH] epoll: preserve ordering of events from ovflist

Eric Wong <[email protected]> wrote:
> Events arriving in ovflist are stored in LIFO order, so
> we should account for that when inserting them into rdllist.

Fwiw, I noticed this oddity because I wanted to start tracing epitem
readiness (to detect when my application is not calling epoll_wait()
fast enough).

I have two more patches coming on top of this one:

Eric Wong (2):
      epoll: hoist out duplicated wake up logic
      epoll: add tracepoints for epitem enqueue/dequeue

 fs/eventpoll.c                   |  62 +++++++++++----------
 include/linux/eventpoll.h        |   8 +++
 include/trace/events/eventpoll.h | 113 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 155 insertions(+), 28 deletions(-)
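
For illustration, userspace alone can only approximate that check, since
it cannot see when an epitem was enqueued; it can only watch how quickly
the event loop gets back to epoll_wait(). A rough sketch of that
approximation (the 10ms threshold is an arbitrary value chosen for
illustration):

#include <stdio.h>
#include <time.h>
#include <sys/epoll.h>

#define THRESHOLD_NS 10000000LL	/* 10ms, arbitrary */

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
	struct epoll_event events[64];
	int epfd = epoll_create1(0);	/* fds would be EPOLL_CTL_ADDed here */
	long long gap, last;
	int i, n;

	if (epfd < 0) {
		perror("epoll_create1");
		return 1;
	}

	last = now_ns();
	for (;;) {
		/* how long were we away from epoll_wait()? */
		gap = now_ns() - last;
		if (gap > THRESHOLD_NS)
			fprintf(stderr, "slow to re-enter epoll_wait: %lld ns\n", gap);

		n = epoll_wait(epfd, events, 64, -1);
		last = now_ns();
		for (i = 0; i < n; i++)
			; /* handle events[i] */
	}
}

The tracepoints, by contrast, record the kernel-side enqueue, so they
can catch latency this loop cannot see.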

2013-03-04 03:06:28

by Eric Wong

Subject: [PATCH 2/2] epoll: add tracepoints for epitem enqueue/dequeue

Tracing the enqueue/dequeue of each epitem is useful for tracking
latency of userspace applications which use epoll.

This also adds tracing for spurious readiness of epitems. Tracing
spurious readiness should allow us to notice (and hopefully fix)
kernel bugs causing unnecessary wakeups.

Signed-off-by: Eric Wong <[email protected]>
Cc: Davide Libenzi <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Andrew Morton <[email protected]>
---
 fs/eventpoll.c                   |  35 +++++++-----
 include/linux/eventpoll.h        |   8 +++
 include/trace/events/eventpoll.h | 113 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 144 insertions(+), 12 deletions(-)
 create mode 100644 include/trace/events/eventpoll.h
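
Once applied, the new events should appear under the usual tracefs
paths. A minimal consumer sketch, assuming debugfs is mounted at
/sys/kernel/debug (equivalent to echoing 1 into
tracing/events/eventpoll/enable and reading tracing/trace_pipe):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define TRACING "/sys/kernel/debug/tracing"

int main(void)
{
	char buf[4096];
	ssize_t r;
	int fd;

	/* enable every event in the "eventpoll" subsystem */
	fd = open(TRACING "/events/eventpoll/enable", O_WRONLY);
	if (fd < 0 || write(fd, "1", 1) != 1) {
		perror("enable");
		return 1;
	}
	close(fd);

	/* stream records; each line ends with the TP_printk output,
	 * e.g. "... ep_ready: <ep>,<epi> ep_ready poll_callback" */
	fd = open(TRACING "/trace_pipe", O_RDONLY);
	if (fd < 0) {
		perror("trace_pipe");
		return 1;
	}
	while ((r = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, r, stdout);
	return 0;
}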

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 6a70b7d..cfc4b16 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -41,6 +41,9 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/eventpoll.h>
+
 /*
  * LOCKING:
  * There are three level of locking required by epoll :
@@ -543,6 +546,14 @@ static int __must_check ep_wake_up_locked(struct eventpoll *ep)
 	return waitqueue_active(&ep->poll_wait);
 }
 
+static void ep_enqueue(struct eventpoll *ep, struct epitem *epi,
+		       enum ep_ready_reason reason)
+{
+	trace_ep_ready(ep, epi, reason);
+	list_add_tail(&epi->rdllink, &ep->rdllist);
+	__pm_stay_awake(epi->ws);
+}
+
 /**
  * ep_scan_ready_list - Scans the ready list in a way that makes possible for
  *                      the scan code, to call f_op->poll(). Also allows for
@@ -971,6 +982,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	 */
 	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
 		if (epi->next == EP_UNACTIVE_PTR) {
+			trace_ep_ovfready(ep, epi);
 			epi->next = ep->ovflist;
 			ep->ovflist = epi;
 			if (epi->ws) {
@@ -986,10 +998,8 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	}
 
 	/* If this file is already in the ready list we exit soon */
-	if (!ep_is_linked(&epi->rdllink)) {
-		list_add_tail(&epi->rdllink, &ep->rdllist);
-		__pm_stay_awake(epi->ws);
-	}
+	if (!ep_is_linked(&epi->rdllink))
+		ep_enqueue(ep, epi, EPR_POLL_CB);
 
 	/*
 	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
@@ -1247,8 +1257,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	/* If the file is already "ready" we drop it inside the ready list */
 	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
-		list_add_tail(&epi->rdllink, &ep->rdllist);
-		__pm_stay_awake(epi->ws);
+		ep_enqueue(ep, epi, EPR_CTL_ADD);
 
 		/* Notify waiting tasks that events are available */
 		pwake = ep_wake_up_locked(ep);
@@ -1354,8 +1363,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	if (revents & event->events) {
 		spin_lock_irq(&ep->lock);
 		if (!ep_is_linked(&epi->rdllink)) {
-			list_add_tail(&epi->rdllink, &ep->rdllist);
-			__pm_stay_awake(epi->ws);
+			ep_enqueue(ep, epi, EPR_CTL_MOD);
 
 			/* Notify waiting tasks that events are available */
 			pwake = ep_wake_up_locked(ep);
@@ -1376,6 +1384,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 	struct ep_send_events_data *esed = priv;
 	int eventcnt;
 	unsigned int revents;
+	unsigned int pevents;
 	struct epitem *epi;
 	struct epoll_event __user *uevent;
 	poll_table pt;
@@ -1406,8 +1415,8 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 		list_del_init(&epi->rdllink);
 
 		pt._key = epi->event.events;
-		revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
-			epi->event.events;
+		pevents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
+		revents = pevents & epi->event.events;
 
 		/*
 		 * If the event mask intersect the caller-requested one,
@@ -1422,6 +1431,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 				__pm_stay_awake(epi->ws);
 				return eventcnt ? eventcnt : -EFAULT;
 			}
+			trace_ep_sent(ep, epi, revents);
 			eventcnt++;
 			uevent++;
 			if (epi->event.events & EPOLLONESHOT)
@@ -1438,9 +1448,10 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 				 * ep_scan_ready_list() holding "mtx" and the
 				 * poll callback will queue them in ep->ovflist.
 				 */
-				list_add_tail(&epi->rdllink, &ep->rdllist);
-				__pm_stay_awake(epi->ws);
+				ep_enqueue(ep, epi, EPR_LEVEL_TRIGGER);
 			}
+		} else {
+			trace_ep_spurious(ep, epi, epi->event.events, pevents);
 		}
 	}

diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 6daf6d4..53f3850 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -22,6 +22,14 @@ struct file;
 
 #ifdef CONFIG_EPOLL
 
+enum ep_ready_reason {
+	EPR_POLL_CB,
+	EPR_OVFLIST,	/* corner-case of EPR_POLL_CB */
+	EPR_CTL_ADD,
+	EPR_CTL_MOD,
+	EPR_LEVEL_TRIGGER
+};
+
 /* Used to initialize the epoll bits inside the "struct file" */
 static inline void eventpoll_init_file(struct file *file)
 {
diff --git a/include/trace/events/eventpoll.h b/include/trace/events/eventpoll.h
new file mode 100644
index 0000000..d17db05
--- /dev/null
+++ b/include/trace/events/eventpoll.h
@@ -0,0 +1,113 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM eventpoll
+
+#if !defined(_TRACE_EVENTPOLL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EVENTPOLL_H
+
+#include <linux/eventpoll.h>
+#include <linux/tracepoint.h>
+
+struct eventpoll;
+struct epitem;
+
+#define READY_REASON \
+	{EPR_POLL_CB, "poll_callback"}, \
+	{EPR_OVFLIST, "ovflist"}, \
+	{EPR_CTL_ADD, "EPOLL_CTL_ADD"}, \
+	{EPR_CTL_MOD, "EPOLL_CTL_MOD"}, \
+	{EPR_LEVEL_TRIGGER, "level_trigger"}
+
+TRACE_EVENT(ep_ready,
+
+	TP_PROTO(struct eventpoll *ep, struct epitem *epi,
+		 enum ep_ready_reason reason),
+
+	TP_ARGS(ep, epi, reason),
+
+	TP_STRUCT__entry(
+		__field(	struct eventpoll *,	ep	)
+		__field(	struct epitem *,	epi	)
+		__field(	enum ep_ready_reason,	reason	)
+	),
+
+	TP_fast_assign(
+		__entry->ep = ep;
+		__entry->epi = epi;
+		__entry->reason = reason;
+	),
+
+	TP_printk("%p,%p ep_ready %s", __entry->ep, __entry->epi,
+		  __print_symbolic(__entry->reason, READY_REASON))
+);
+
+TRACE_EVENT(ep_ovfready,
+
+	TP_PROTO(struct eventpoll *ep, struct epitem *epi),
+
+	TP_ARGS(ep, epi),
+
+	TP_STRUCT__entry(
+		__field(	struct eventpoll *,	ep	)
+		__field(	struct epitem *,	epi	)
+	),
+
+	TP_fast_assign(
+		__entry->ep = ep;
+		__entry->epi = epi;
+	),
+
+	TP_printk("%p,%p ep_ovfready", __entry->ep, __entry->epi)
+);
+
+TRACE_EVENT(ep_sent,
+
+	TP_PROTO(struct eventpoll *ep, struct epitem *epi, unsigned int events),
+
+	TP_ARGS(ep, epi, events),
+
+	TP_STRUCT__entry(
+		__field(	struct eventpoll *,	ep	)
+		__field(	struct epitem *,	epi	)
+		__field(	unsigned int,		events	)
+	),
+
+	TP_fast_assign(
+		__entry->ep = ep;
+		__entry->epi = epi;
+		__entry->events = events;
+	),
+
+	TP_printk("%p,%p ep_sent(events=0x%x)",
+		  __entry->ep, __entry->epi, __entry->events)
+);
+
+TRACE_EVENT(ep_spurious,
+
+	TP_PROTO(struct eventpoll *ep, struct epitem *epi,
+		 unsigned int events, unsigned int poll_events),
+
+	TP_ARGS(ep, epi, events, poll_events),
+
+	TP_STRUCT__entry(
+		__field(	struct eventpoll *,	ep	)
+		__field(	struct epitem *,	epi	)
+		__field(	unsigned int,		events	)
+		__field(	unsigned int,		poll_events	)
+	),
+
+	TP_fast_assign(
+		__entry->ep = ep;
+		__entry->epi = epi;
+		__entry->events = events;
+		__entry->poll_events = poll_events;
+	),
+
+	TP_printk("%p,%p ep_spurious events=0x%x,poll_events=0x%x",
+		  __entry->ep, __entry->epi,
+		  __entry->events, __entry->poll_events)
+);
+
+#endif /* _TRACE_EVENTPOLL_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--
1.8.1.2.526.gf51a757

2013-03-04 03:06:25

by Eric Wong

Subject: [PATCH 1/2] epoll: hoist out duplicated wake up logic

This makes the kernel slightly smaller, and hopefully easier to follow.

Signed-off-by: Eric Wong <[email protected]>
Cc: Davide Libenzi <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Andrew Morton <[email protected]>
---
 fs/eventpoll.c | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)
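
For reference, the calling pattern after this change looks like the
following sketch (assembled from the hunks below, not an extra hunk):
the helper wakes ep->wq while ep->lock is held and returns whether the
->poll() waitqueue also needs waking; that second wakeup must go through
ep_poll_safewake() only after the lock is dropped, since it may recurse
into nested epoll instances:

	spin_lock_irqsave(&ep->lock, flags);
	/* ... queue work on ep->rdllist ... */
	pwake = ep_wake_up_locked(ep);
	spin_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);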

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5a1a596..6a70b7d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -536,6 +536,13 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 	}
 }
 
+static int __must_check ep_wake_up_locked(struct eventpoll *ep)
+{
+	if (waitqueue_active(&ep->wq))
+		wake_up_locked(&ep->wq);
+	return waitqueue_active(&ep->poll_wait);
+}
+
 /**
  * ep_scan_ready_list - Scans the ready list in a way that makes possible for
  *                      the scan code, to call f_op->poll(). Also allows for
@@ -620,10 +627,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 		 * Wake up (if active) both the eventpoll wait list and
 		 * the ->poll() wait list (delayed after we release the lock).
 		 */
-		if (waitqueue_active(&ep->wq))
-			wake_up_locked(&ep->wq);
-		if (waitqueue_active(&ep->poll_wait))
-			pwake++;
+		pwake = ep_wake_up_locked(ep);
 	}
 	spin_unlock_irqrestore(&ep->lock, flags);

@@ -991,10 +995,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 	 * wait list.
 	 */
-	if (waitqueue_active(&ep->wq))
-		wake_up_locked(&ep->wq);
-	if (waitqueue_active(&ep->poll_wait))
-		pwake++;
+	pwake = ep_wake_up_locked(ep);
 
 out_unlock:
 	spin_unlock_irqrestore(&ep->lock, flags);
@@ -1250,10 +1251,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		__pm_stay_awake(epi->ws);
 
 		/* Notify waiting tasks that events are available */
-		if (waitqueue_active(&ep->wq))
-			wake_up_locked(&ep->wq);
-		if (waitqueue_active(&ep->poll_wait))
-			pwake++;
+		pwake = ep_wake_up_locked(ep);
 	}
 
 	spin_unlock_irqrestore(&ep->lock, flags);
@@ -1360,10 +1358,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 			__pm_stay_awake(epi->ws);
 
 			/* Notify waiting tasks that events are available */
-			if (waitqueue_active(&ep->wq))
-				wake_up_locked(&ep->wq);
-			if (waitqueue_active(&ep->poll_wait))
-				pwake++;
+			pwake = ep_wake_up_locked(ep);
 		}
 		spin_unlock_irq(&ep->lock);
 	}
--
1.8.1.2.526.gf51a757

2013-03-08 01:32:20

by Eric Wong

Subject: Re: [PATCH 2/2] epoll: add tracepoints for epitem enqueue/dequeue

Putting this on hold for now.

I'm awaiting answers to <[email protected]>
(Subject: epoll: possible bug from wakeup_source activation);
this patch may hide the possible bug I'm referring to in that email.
this patch may hide the possible bug I'm referring to in that email.