2008-11-20 13:58:43

by Tejun Heo

[permalink] [raw]
Subject: [PATCH RESEND] poll: allow f_op->poll to sleep, take #2

f_op->poll is the only vfs operation which is not allowed to sleep.
It's because the poll and select implementations used task state to
synchronize against wake ups, which doesn't have to be the case
anymore as the wait/wake interface can now use custom wake up functions.
The non-sleep restriction can be a bit tricky because ->poll is not
called from an atomic context and the result of accidentally sleeping
in ->poll only shows up as temporary busy looping when the timing is
right or rather wrong.

This patch converts poll/select to use a custom wake up function and a
separate triggered variable to synchronize against wake up events.
The only added overhead is an extra function call during wake up,
which is negligible.

This patch removes the one non-sleep exception from vfs locking rules
and is beneficial to userland filesystem implementations like FUSE, 9p
or peculiar fs like spufs as it's very difficult for those to
implement non-sleeping poll method.

Signed-off-by: Tejun Heo <[email protected]>
Cc: Eric Van Hensbergen <[email protected]>
Cc: Ron Minnich <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Christoph Hellwig <[email protected]>
---
I forgot to cc LKML when posting the patch. Resending for reference.
This patch has been acked by Linus and is scheduled to go through
Miklos' tree.

The original exchange is reproduced below.

Linus Torvalds wrote:
> On Thu, 13 Nov 2008, Tejun Heo wrote:
>> Updated to fit hrtimer changes. 9p changes are already merged, so
>> this patch is sufficient to allow all ->poll implementations to
>> sleep. spufs is broken without this change && future FUSE change
>> requires this. If there's no objection, I'd like to push this
>> along with other FUSE changes. Linus, are you still against this
>> change?
>
> Hmm. Looks ok to me.
>
>> FYI, the previous discussion on this subject can be found at...
>>
>> http://thread.gmane.org/gmane.linux.kernel/726176/focus=726178
>>

Thanks.

Documentation/filesystems/Locking | 2 -
drivers/media/video/v4l1-compat.c | 4 ---
fs/select.c | 50 ++++++++++++++++++++++++++++----------
include/linux/poll.h | 9 ++++++
4 files changed, 49 insertions(+), 16 deletions(-)

Index: work/fs/select.c
===================================================================
--- work.orig/fs/select.c
+++ work/fs/select.c
@@ -109,6 +109,7 @@ static void __pollwait(struct file *filp
void poll_initwait(struct poll_wqueues *pwq)
{
init_poll_funcptr(&pwq->pt, __pollwait);
+ pwq->polling_task = current;
pwq->error = 0;
pwq->table = NULL;
pwq->inline_index = 0;
@@ -145,9 +146,8 @@ void poll_freewait(struct poll_wqueues *

EXPORT_SYMBOL(poll_freewait);

-static struct poll_table_entry *poll_get_entry(poll_table *_p)
+static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
- struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
struct poll_table_page *table = p->table;

if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +159,6 @@ static struct poll_table_entry *poll_get
new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
if (!new_table) {
p->error = -ENOMEM;
- __set_current_state(TASK_RUNNING);
return NULL;
}
new_table->entry = new_table->entries;
@@ -171,20 +170,51 @@ static struct poll_table_entry *poll_get
return table->entry++;
}

+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ struct poll_wqueues *pwq = wait->private;
+ DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
+
+ set_mb(pwq->triggered, 1);
+
+ /* perform the default wake up operation */
+ return default_wake_function(&dummy_wait, mode, sync, key);
+}
+
/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
poll_table *p)
{
- struct poll_table_entry *entry = poll_get_entry(p);
+ struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
+ struct poll_table_entry *entry = poll_get_entry(pwq);
if (!entry)
return;
get_file(filp);
entry->filp = filp;
entry->wait_address = wait_address;
- init_waitqueue_entry(&entry->wait, current);
+ init_waitqueue_func_entry(&entry->wait, pollwake);
+ entry->wait.private = pwq;
add_wait_queue(wait_address, &entry->wait);
}

+int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+ ktime_t *expires, unsigned long slack)
+{
+ int rc = -EINTR;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!pwq->triggered)
+ rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+ __set_current_state(TASK_RUNNING);
+
+ /* clear triggered for the next iteration */
+ pwq->triggered = 0;
+
+ return rc;
+}
+
+EXPORT_SYMBOL(poll_schedule_timeout);
+
/**
* poll_select_set_timeout - helper function to setup the timeout value
* @to: pointer to timespec variable for the final timeout
@@ -340,8 +370,6 @@ int do_select(int n, fd_set_bits *fds, s
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

- set_current_state(TASK_INTERRUPTIBLE);
-
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

@@ -411,10 +439,10 @@ int do_select(int n, fd_set_bits *fds, s
to = &expire;
}

- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
+ to, slack))
timed_out = 1;
}
- __set_current_state(TASK_RUNNING);

poll_freewait(&table);

@@ -666,7 +694,6 @@ static int do_poll(unsigned int nfds, s
for (;;) {
struct poll_list *walk;

- set_current_state(TASK_INTERRUPTIBLE);
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;

@@ -709,10 +736,9 @@ static int do_poll(unsigned int nfds, s
to = &expire;
}

- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
- __set_current_state(TASK_RUNNING);
return count;
}

Index: work/include/linux/poll.h
===================================================================
--- work.orig/include/linux/poll.h
+++ work/include/linux/poll.h
@@ -57,6 +57,8 @@ struct poll_table_entry {
struct poll_wqueues {
poll_table pt;
struct poll_table_page * table;
+ struct task_struct * polling_task;
+ int triggered;
int error;
int inline_index;
struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
@@ -64,6 +66,13 @@ struct poll_wqueues {

extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
+extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+ ktime_t *expires, unsigned long slack);
+
+static inline int poll_schedule(struct poll_wqueues *pwq, int state)
+{
+ return poll_schedule_timeout(pwq, state, NULL, 0);
+}

/*
* Scaleable version of the fd_set.
Index: work/drivers/media/video/v4l1-compat.c
===================================================================
--- work.orig/drivers/media/video/v4l1-compat.c
+++ work/drivers/media/video/v4l1-compat.c
@@ -203,7 +203,6 @@ static int poll_one(struct file *file, s
table = &pwq->pt;
for (;;) {
int mask;
- set_current_state(TASK_INTERRUPTIBLE);
mask = file->f_op->poll(file, table);
if (mask & POLLIN)
break;
@@ -212,9 +211,8 @@ static int poll_one(struct file *file, s
retval = -ERESTARTSYS;
break;
}
- schedule();
+ poll_schedule(pwq, TASK_INTERRUPTIBLE);
}
- set_current_state(TASK_RUNNING);
poll_freewait(pwq);
return retval;
}
Index: work/Documentation/filesystems/Locking
===================================================================
--- work.orig/Documentation/filesystems/Locking
+++ work/Documentation/filesystems/Locking
@@ -398,7 +398,7 @@ prototypes:
};

locking rules:
- All except ->poll() may block.
+ All may block.
BKL
llseek: no (see below)
read: no


2008-11-21 16:43:29

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH RESEND] poll: allow f_op->poll to sleep, take #2

On Thu, 20 Nov 2008, Tejun Heo wrote:

[snip]

> +int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
> + ktime_t *expires, unsigned long slack)

The 'state' parameter is unused, and is always called with the
TASK_INTERRUPTIBLE value. Shouldn't it be removed?

> +{
> + int rc = -EINTR;
> +
> + set_current_state(TASK_INTERRUPTIBLE);
> + if (!pwq->triggered)
> + rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
> + __set_current_state(TASK_RUNNING);
> +
> + /* clear triggered for the next iteration */
> + pwq->triggered = 0;
> +
> + return rc;
> +}
> +
> +EXPORT_SYMBOL(poll_schedule_timeout);

Checkpatch warning:

WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable
#118: FILE: fs/select.c:216:
+EXPORT_SYMBOL(poll_schedule_timeout);

[snip]

> Index: work/include/linux/poll.h
> ===================================================================
> --- work.orig/include/linux/poll.h
> +++ work/include/linux/poll.h
> @@ -57,6 +57,8 @@ struct poll_table_entry {
> struct poll_wqueues {
> poll_table pt;
> struct poll_table_page * table;
> + struct task_struct * polling_task;
> + int triggered;

Checkpatch error:

ERROR: "foo * bar" should be "foo *bar"
#173: FILE: include/linux/poll.h:60:
+ struct task_struct * polling_task;

Miklos

2008-11-21 17:12:00

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH RESEND] poll: allow f_op->poll to sleep, take #2

Miklos Szeredi wrote:
> On Thu, 20 Nov 2008, Tejun Heo wrote:
>
> [snip]
>
>> +int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
>> + ktime_t *expires, unsigned long slack)
>
> The 'state' parameter is unused, and is always called with the
> TASK_INTERRUPTIBLE value. Shouldn't it be removed?
>
>> +{
>> + int rc = -EINTR;
>> +
>> + set_current_state(TASK_INTERRUPTIBLE);

Aieee... this should have been set_current_state(state). We can also
remove @state but this being a schedule() function I think it's better
to pass @state explicitly.

>> + if (!pwq->triggered)
>> + rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
>> + __set_current_state(TASK_RUNNING);
>> +
>> + /* clear triggered for the next iteration */
>> + pwq->triggered = 0;
>> +
>> + return rc;
>> +}
>> +
>> +EXPORT_SYMBOL(poll_schedule_timeout);
>
> Checkpatch warning:
>
> WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable
> #118: FILE: fs/select.c:216:
> +EXPORT_SYMBOL(poll_schedule_timeout);
>
> [snip]
>
>> Index: work/include/linux/poll.h
>> ===================================================================
>> --- work.orig/include/linux/poll.h
>> +++ work/include/linux/poll.h
>> @@ -57,6 +57,8 @@ struct poll_table_entry {
>> struct poll_wqueues {
>> poll_table pt;
>> struct poll_table_page * table;
>> + struct task_struct * polling_task;
>> + int triggered;
>
> Checkpatch error:
>
> ERROR: "foo * bar" should be "foo *bar"
> #173: FILE: include/linux/poll.h:60:
> + struct task_struct * polling_task;

For both, I was trying to stay consistent with the environment. I
find mixed styles in close proximity much uglier than slightly
different but consistent style. Eh... Is the consensus checkpatch or
die?

Thanks.

--
tejun

2008-11-21 17:23:52

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH RESEND] poll: allow f_op->poll to sleep, take #2

On Sat, 22 Nov 2008, Tejun Heo wrote:
> For both, I was trying to stay consistent with the environment. I
> find mixed styles in close proximity much uglier than slightly
> different but consistent style. Eh... Is the consensus checkpatch or
> die?

I think some common sense can be applied in these cases. For example
a small amount of offending style in close proximity can be fixed as
well, as long as it doesn't interfere too much with the readability of
the patch.

Miklos

2008-11-21 17:32:18

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH RESEND] poll: allow f_op->poll to sleep, take #2

Miklos Szeredi wrote:
> On Sat, 22 Nov 2008, Tejun Heo wrote:
>> For both, I was trying to stay consistent with the environment. I
>> find mixed styles in close proximity much uglier than slightly
>> different but consistent style. Eh... Is the consensus checkpatch or
>> die?
>
> I think some common sense can be applied in these cases. For example
> a small amount of offending style in close proximity can be fixed as
> well, as long as it doesn't interfere too much with the readability of
> the patch.

Well, yeah, given that fs/select.c is not a very hot file. Introduced
inconsistencies are gonna stay there for a long time. The two issues
in this patch are minor and can be easily adjusted but in general I'm
not too sure whether always sticking with checkpatch is a good idea
especially on fairly cold areas. Anyways, I'll fix it up.

Thanks.

--
tejun

2008-11-22 07:28:55

by Tejun Heo

[permalink] [raw]
Subject: [PATCH] poll: allow f_op->poll to sleep, take #3

f_op->poll is the only vfs operation which is not allowed to sleep.
It's because the poll and select implementations used task state to
synchronize against wake ups, which doesn't have to be the case
anymore as the wait/wake interface can now use custom wake up functions.
The non-sleep restriction can be a bit tricky because ->poll is not
called from an atomic context and the result of accidentally sleeping
in ->poll only shows up as temporary busy looping when the timing is
right or rather wrong.

This patch converts poll/select to use a custom wake up function and a
separate triggered variable to synchronize against wake up events.
The only added overhead is an extra function call during wake up,
which is negligible.

This patch removes the one non-sleep exception from vfs locking rules
and is beneficial to userland filesystem implementations like FUSE, 9p
or peculiar fs like spufs as it's very difficult for those to
implement non-sleeping poll method.

While at it, make the following cosmetic changes to make poll.h and
select.c checkpatch friendly.

* s/type * symbol/type *symbol/ : three places in poll.h
* remove blank line before EXPORT_SYMBOL() : two places in select.c

Signed-off-by: Tejun Heo <[email protected]>
Cc: Eric Van Hensbergen <[email protected]>
Cc: Ron Minnich <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Christoph Hellwig <[email protected]>
---
@state parameter is now honored properly and checkpatch is happy.

Thanks.

Documentation/filesystems/Locking | 2 -
drivers/media/video/v4l1-compat.c | 4 --
fs/select.c | 51 +++++++++++++++++++++++++++-----------
include/linux/poll.h | 15 ++++++++---
4 files changed, 51 insertions(+), 21 deletions(-)

Index: work/fs/select.c
===================================================================
--- work.orig/fs/select.c
+++ work/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp
void poll_initwait(struct poll_wqueues *pwq)
{
init_poll_funcptr(&pwq->pt, __pollwait);
+ pwq->polling_task = current;
pwq->error = 0;
pwq->table = NULL;
pwq->inline_index = 0;
}
-
EXPORT_SYMBOL(poll_initwait);

static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *
free_page((unsigned long) old);
}
}
-
EXPORT_SYMBOL(poll_freewait);

-static struct poll_table_entry *poll_get_entry(poll_table *_p)
+static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
- struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
struct poll_table_page *table = p->table;

if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get
new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
if (!new_table) {
p->error = -ENOMEM;
- __set_current_state(TASK_RUNNING);
return NULL;
}
new_table->entry = new_table->entries;
@@ -171,20 +168,50 @@ static struct poll_table_entry *poll_get
return table->entry++;
}

+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ struct poll_wqueues *pwq = wait->private;
+ DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
+
+ set_mb(pwq->triggered, 1);
+
+ /* perform the default wake up operation */
+ return default_wake_function(&dummy_wait, mode, sync, key);
+}
+
/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
poll_table *p)
{
- struct poll_table_entry *entry = poll_get_entry(p);
+ struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
+ struct poll_table_entry *entry = poll_get_entry(pwq);
if (!entry)
return;
get_file(filp);
entry->filp = filp;
entry->wait_address = wait_address;
- init_waitqueue_entry(&entry->wait, current);
+ init_waitqueue_func_entry(&entry->wait, pollwake);
+ entry->wait.private = pwq;
add_wait_queue(wait_address, &entry->wait);
}

+int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+ ktime_t *expires, unsigned long slack)
+{
+ int rc = -EINTR;
+
+ set_current_state(state);
+ if (!pwq->triggered)
+ rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+ __set_current_state(TASK_RUNNING);
+
+ /* clear triggered for the next iteration */
+ pwq->triggered = 0;
+
+ return rc;
+}
+EXPORT_SYMBOL(poll_schedule_timeout);
+
/**
* poll_select_set_timeout - helper function to setup the timeout value
* @to: pointer to timespec variable for the final timeout
@@ -340,8 +367,6 @@ int do_select(int n, fd_set_bits *fds, s
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

- set_current_state(TASK_INTERRUPTIBLE);
-
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

@@ -411,10 +436,10 @@ int do_select(int n, fd_set_bits *fds, s
to = &expire;
}

- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
+ to, slack))
timed_out = 1;
}
- __set_current_state(TASK_RUNNING);

poll_freewait(&table);

@@ -666,7 +691,6 @@ static int do_poll(unsigned int nfds, s
for (;;) {
struct poll_list *walk;

- set_current_state(TASK_INTERRUPTIBLE);
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;

@@ -709,10 +733,9 @@ static int do_poll(unsigned int nfds, s
to = &expire;
}

- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
- __set_current_state(TASK_RUNNING);
return count;
}

Index: work/include/linux/poll.h
===================================================================
--- work.orig/include/linux/poll.h
+++ work/include/linux/poll.h
@@ -46,9 +46,9 @@ static inline void init_poll_funcptr(pol
}

struct poll_table_entry {
- struct file * filp;
+ struct file *filp;
wait_queue_t wait;
- wait_queue_head_t * wait_address;
+ wait_queue_head_t *wait_address;
};

/*
@@ -56,7 +56,9 @@ struct poll_table_entry {
*/
struct poll_wqueues {
poll_table pt;
- struct poll_table_page * table;
+ struct poll_table_page *table;
+ struct task_struct *polling_task;
+ int triggered;
int error;
int inline_index;
struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
@@ -64,6 +66,13 @@ struct poll_wqueues {

extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
+extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+ ktime_t *expires, unsigned long slack);
+
+static inline int poll_schedule(struct poll_wqueues *pwq, int state)
+{
+ return poll_schedule_timeout(pwq, state, NULL, 0);
+}

/*
* Scaleable version of the fd_set.
Index: work/drivers/media/video/v4l1-compat.c
===================================================================
--- work.orig/drivers/media/video/v4l1-compat.c
+++ work/drivers/media/video/v4l1-compat.c
@@ -203,7 +203,6 @@ static int poll_one(struct file *file, s
table = &pwq->pt;
for (;;) {
int mask;
- set_current_state(TASK_INTERRUPTIBLE);
mask = file->f_op->poll(file, table);
if (mask & POLLIN)
break;
@@ -212,9 +211,8 @@ static int poll_one(struct file *file, s
retval = -ERESTARTSYS;
break;
}
- schedule();
+ poll_schedule(pwq, TASK_INTERRUPTIBLE);
}
- set_current_state(TASK_RUNNING);
poll_freewait(pwq);
return retval;
}
Index: work/Documentation/filesystems/Locking
===================================================================
--- work.orig/Documentation/filesystems/Locking
+++ work/Documentation/filesystems/Locking
@@ -398,7 +398,7 @@ prototypes:
};

locking rules:
- All except ->poll() may block.
+ All may block.
BKL
llseek: no (see below)
read: no