[Fwd to Andrew, CC fsdevel & Arjan]
Andrew, can you please apply this? I don't think it should be sneaked
in through the fuse tree (even though the immediate requirement is
from the poll implementation in fuse).
Thanks,
Miklos
----
From: Tejun Heo <[email protected]>
f_op->poll is the only vfs operation which is not allowed to sleep.
It's because poll and select implementation used task state to
synchronize against wake ups, which doesn't have to be the case
anymore as wait/wake interface can now use custom wake up functions.
The non-sleep restriction can be a bit tricky because ->poll is not
called from an atomic context and the result of accidentally sleeping
in ->poll only shows up as temporary busy looping when the timing is
right or rather wrong.
This patch converts poll/select to use custom wake up function and use
separate triggered variable to synchronize against wake up events.
The only added overhead is an extra function call during wake up and
negligible.
This patch removes the one non-sleep exception from vfs locking rules
and is beneficial to userland filesystem implementations like FUSE, 9p
or peculiar fs like spufs as it's very difficult for those to
implement non-sleeping poll method.
While at it, make the following cosmetic changes to make poll.h and
select.c checkpatch friendly.
* s/type * symbol/type *symbol/ : three places in poll.h
* remove blank line before EXPORT_SYMBOL() : two places in select.c
Signed-off-by: Tejun Heo <[email protected]>
Cc: Eric Van Hensbergen <[email protected]>
Cc: Ron Minnich <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
Documentation/filesystems/Locking | 2 -
drivers/media/video/v4l1-compat.c | 4 --
fs/select.c | 51 +++++++++++++++++++++++++++-----------
include/linux/poll.h | 15 ++++++++---
4 files changed, 51 insertions(+), 21 deletions(-)
Index: work/fs/select.c
===================================================================
--- work.orig/fs/select.c
+++ work/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp
void poll_initwait(struct poll_wqueues *pwq)
{
init_poll_funcptr(&pwq->pt, __pollwait);
+ pwq->polling_task = current;
pwq->error = 0;
pwq->table = NULL;
pwq->inline_index = 0;
}
-
EXPORT_SYMBOL(poll_initwait);
static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *
free_page((unsigned long) old);
}
}
-
EXPORT_SYMBOL(poll_freewait);
-static struct poll_table_entry *poll_get_entry(poll_table *_p)
+static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
- struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
struct poll_table_page *table = p->table;
if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get
new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
if (!new_table) {
p->error = -ENOMEM;
- __set_current_state(TASK_RUNNING);
return NULL;
}
new_table->entry = new_table->entries;
@@ -171,20 +168,50 @@ static struct poll_table_entry *poll_get
return table->entry++;
}
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ struct poll_wqueues *pwq = wait->private;
+ DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
+
+ set_mb(pwq->triggered, 1);
+
+ /* perform the default wake up operation */
+ return default_wake_function(&dummy_wait, mode, sync, key);
+}
+
/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
poll_table *p)
{
- struct poll_table_entry *entry = poll_get_entry(p);
+ struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
+ struct poll_table_entry *entry = poll_get_entry(pwq);
if (!entry)
return;
get_file(filp);
entry->filp = filp;
entry->wait_address = wait_address;
- init_waitqueue_entry(&entry->wait, current);
+ init_waitqueue_func_entry(&entry->wait, pollwake);
+ entry->wait.private = pwq;
add_wait_queue(wait_address, &entry->wait);
}
+int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+ ktime_t *expires, unsigned long slack)
+{
+ int rc = -EINTR;
+
+ set_current_state(state);
+ if (!pwq->triggered)
+ rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+ __set_current_state(TASK_RUNNING);
+
+ /* clear triggered for the next iteration */
+ pwq->triggered = 0;
+
+ return rc;
+}
+EXPORT_SYMBOL(poll_schedule_timeout);
+
/**
* poll_select_set_timeout - helper function to setup the timeout value
* @to: pointer to timespec variable for the final timeout
@@ -340,8 +367,6 @@ int do_select(int n, fd_set_bits *fds, s
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
- set_current_state(TASK_INTERRUPTIBLE);
-
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -411,10 +436,10 @@ int do_select(int n, fd_set_bits *fds, s
to = &expire;
}
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
+ to, slack))
timed_out = 1;
}
- __set_current_state(TASK_RUNNING);
poll_freewait(&table);
@@ -666,7 +691,6 @@ static int do_poll(unsigned int nfds, s
for (;;) {
struct poll_list *walk;
- set_current_state(TASK_INTERRUPTIBLE);
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
@@ -709,10 +733,9 @@ static int do_poll(unsigned int nfds, s
to = &expire;
}
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
- __set_current_state(TASK_RUNNING);
return count;
}
Index: work/include/linux/poll.h
===================================================================
--- work.orig/include/linux/poll.h
+++ work/include/linux/poll.h
@@ -46,9 +46,9 @@ static inline void init_poll_funcptr(pol
}
struct poll_table_entry {
- struct file * filp;
+ struct file *filp;
wait_queue_t wait;
- wait_queue_head_t * wait_address;
+ wait_queue_head_t *wait_address;
};
/*
@@ -56,7 +56,9 @@ struct poll_table_entry {
*/
struct poll_wqueues {
poll_table pt;
- struct poll_table_page * table;
+ struct poll_table_page *table;
+ struct task_struct *polling_task;
+ int triggered;
int error;
int inline_index;
struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
@@ -64,6 +66,13 @@ struct poll_wqueues {
extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
+extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+ ktime_t *expires, unsigned long slack);
+
+static inline int poll_schedule(struct poll_wqueues *pwq, int state)
+{
+ return poll_schedule_timeout(pwq, state, NULL, 0);
+}
/*
* Scaleable version of the fd_set.
Index: work/drivers/media/video/v4l1-compat.c
===================================================================
--- work.orig/drivers/media/video/v4l1-compat.c
+++ work/drivers/media/video/v4l1-compat.c
@@ -203,7 +203,6 @@ static int poll_one(struct file *file, s
table = &pwq->pt;
for (;;) {
int mask;
- set_current_state(TASK_INTERRUPTIBLE);
mask = file->f_op->poll(file, table);
if (mask & POLLIN)
break;
@@ -212,9 +211,8 @@ static int poll_one(struct file *file, s
retval = -ERESTARTSYS;
break;
}
- schedule();
+ poll_schedule(pwq, TASK_INTERRUPTIBLE);
}
- set_current_state(TASK_RUNNING);
poll_freewait(pwq);
return retval;
}
Index: work/Documentation/filesystems/Locking
===================================================================
--- work.orig/Documentation/filesystems/Locking
+++ work/Documentation/filesystems/Locking
@@ -398,7 +398,7 @@ prototypes:
};
locking rules:
- All except ->poll() may block.
+ All may block.
BKL
llseek: no (see below)
read: no
On Sat, 22 Nov 2008 09:58:33 +0100 Miklos Szeredi <[email protected]> wrote:
> Andrew, can you please apply this?
I did so earlier today.
On Sat, Nov 22, 2008 at 09:58:33AM +0100, Miklos Szeredi wrote:
> +int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
> + ktime_t *expires, unsigned long slack)
All callers of poll_schedule() and poll_schedule_timeout() pass
TASK_INTERRUPTIBLE. We can elide the 'state' argument.
--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
Matthew Wilcox wrote:
> On Sat, Nov 22, 2008 at 09:58:33AM +0100, Miklos Szeredi wrote:
>> +int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
>> + ktime_t *expires, unsigned long slack)
>
> All callers of poll_schedule() and poll_schedule_timeout() pass
> TASK_INTERRUPTIBLE. We can elide the 'state' argument.
Well, I wanted to keep it as to keep it more consistent with other
schedule() functions but both Miklos and you don't seem to like it, so I
might as well just drop it. Andrew, what do you think?
Thanks.
--
tejun
On Sat, 22 Nov 2008 21:43:51 +0900 Tejun Heo <[email protected]> wrote:
> Matthew Wilcox wrote:
> > On Sat, Nov 22, 2008 at 09:58:33AM +0100, Miklos Szeredi wrote:
> >> +int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
> >> + ktime_t *expires, unsigned long slack)
> >
> > All callers of poll_schedule() and poll_schedule_timeout() pass
> > TASK_INTERRUPTIBLE. We can elide the 'state' argument.
>
> Well, I wanted to keep it as to keep it more consistent with other
> schedule() functions but both Miklos and you don't seem to like it, so I
> might as well just drop it. Andrew, what do you think?
I guess that if any poll/select syscall were to sleep in
uninterruptible state, people would get upset about the effect upon their
load average and we'd have to go in and fix it.
So, yup, I expect that hard-coding TASK_INTERRUPTIBLE would be OK.
f_op->poll is the only vfs operation which is not allowed to sleep.
It's because poll and select implementation used task state to
synchronize against wake ups, which doesn't have to be the case
anymore as wait/wake interface can now use custom wake up functions.
The non-sleep restriction can be a bit tricky because ->poll is not
called from an atomic context and the result of accidentally sleeping
in ->poll only shows up as temporary busy looping when the timing is
right or rather wrong.
This patch converts poll/select to use custom wake up function and use
separate triggered variable to synchronize against wake up events.
The only added overhead is an extra function call during wake up and
negligible.
This patch removes the one non-sleep exception from vfs locking rules
and is beneficial to userland filesystem implementations like FUSE, 9p
or peculiar fs like spufs as it's very difficult for those to
implement non-sleeping poll method.
Signed-off-by: Tejun Heo <[email protected]>
Cc: Eric Van Hensbergen <[email protected]>
Cc: Ron Minnich <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Christoph Hellwig <[email protected]>
---
@state dropped.
Documentation/filesystems/Locking | 2 -
drivers/media/video/v4l1-compat.c | 4 ---
fs/select.c | 50 +++++++++++++++++++++++++++-----------
include/linux/poll.h | 15 +++++++++--
4 files changed, 50 insertions(+), 21 deletions(-)
Index: work/fs/select.c
===================================================================
--- work.orig/fs/select.c
+++ work/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp
void poll_initwait(struct poll_wqueues *pwq)
{
init_poll_funcptr(&pwq->pt, __pollwait);
+ pwq->polling_task = current;
pwq->error = 0;
pwq->table = NULL;
pwq->inline_index = 0;
}
-
EXPORT_SYMBOL(poll_initwait);
static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *
free_page((unsigned long) old);
}
}
-
EXPORT_SYMBOL(poll_freewait);
-static struct poll_table_entry *poll_get_entry(poll_table *_p)
+static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
- struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
struct poll_table_page *table = p->table;
if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get
new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
if (!new_table) {
p->error = -ENOMEM;
- __set_current_state(TASK_RUNNING);
return NULL;
}
new_table->entry = new_table->entries;
@@ -171,20 +168,50 @@ static struct poll_table_entry *poll_get
return table->entry++;
}
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ struct poll_wqueues *pwq = wait->private;
+ DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
+
+ set_mb(pwq->triggered, 1);
+
+ /* perform the default wake up operation */
+ return default_wake_function(&dummy_wait, mode, sync, key);
+}
+
/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
poll_table *p)
{
- struct poll_table_entry *entry = poll_get_entry(p);
+ struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
+ struct poll_table_entry *entry = poll_get_entry(pwq);
if (!entry)
return;
get_file(filp);
entry->filp = filp;
entry->wait_address = wait_address;
- init_waitqueue_entry(&entry->wait, current);
+ init_waitqueue_func_entry(&entry->wait, pollwake);
+ entry->wait.private = pwq;
add_wait_queue(wait_address, &entry->wait);
}
+int poll_schedule_timeout(struct poll_wqueues *pwq,
+ ktime_t *expires, unsigned long slack)
+{
+ int rc = -EINTR;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!pwq->triggered)
+ rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+ __set_current_state(TASK_RUNNING);
+
+ /* clear triggered for the next iteration */
+ pwq->triggered = 0;
+
+ return rc;
+}
+EXPORT_SYMBOL(poll_schedule_timeout);
+
/**
* poll_select_set_timeout - helper function to setup the timeout value
* @to: pointer to timespec variable for the final timeout
@@ -340,8 +367,6 @@ int do_select(int n, fd_set_bits *fds, s
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
- set_current_state(TASK_INTERRUPTIBLE);
-
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -411,10 +436,9 @@ int do_select(int n, fd_set_bits *fds, s
to = &expire;
}
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!poll_schedule_timeout(&table, to, slack))
timed_out = 1;
}
- __set_current_state(TASK_RUNNING);
poll_freewait(&table);
@@ -666,7 +690,6 @@ static int do_poll(unsigned int nfds, s
for (;;) {
struct poll_list *walk;
- set_current_state(TASK_INTERRUPTIBLE);
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
@@ -709,10 +732,9 @@ static int do_poll(unsigned int nfds, s
to = &expire;
}
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!poll_schedule_timeout(wait, to, slack))
timed_out = 1;
}
- __set_current_state(TASK_RUNNING);
return count;
}
Index: work/include/linux/poll.h
===================================================================
--- work.orig/include/linux/poll.h
+++ work/include/linux/poll.h
@@ -46,9 +46,9 @@ static inline void init_poll_funcptr(pol
}
struct poll_table_entry {
- struct file * filp;
+ struct file *filp;
wait_queue_t wait;
- wait_queue_head_t * wait_address;
+ wait_queue_head_t *wait_address;
};
/*
@@ -56,7 +56,9 @@ struct poll_table_entry {
*/
struct poll_wqueues {
poll_table pt;
- struct poll_table_page * table;
+ struct poll_table_page *table;
+ struct task_struct *polling_task;
+ int triggered;
int error;
int inline_index;
struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
@@ -64,6 +66,13 @@ struct poll_wqueues {
extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
+extern int poll_schedule_timeout(struct poll_wqueues *pwq,
+ ktime_t *expires, unsigned long slack);
+
+static inline int poll_schedule(struct poll_wqueues *pwq)
+{
+ return poll_schedule_timeout(pwq, NULL, 0);
+}
/*
* Scaleable version of the fd_set.
Index: work/drivers/media/video/v4l1-compat.c
===================================================================
--- work.orig/drivers/media/video/v4l1-compat.c
+++ work/drivers/media/video/v4l1-compat.c
@@ -203,7 +203,6 @@ static int poll_one(struct file *file, s
table = &pwq->pt;
for (;;) {
int mask;
- set_current_state(TASK_INTERRUPTIBLE);
mask = file->f_op->poll(file, table);
if (mask & POLLIN)
break;
@@ -212,9 +211,8 @@ static int poll_one(struct file *file, s
retval = -ERESTARTSYS;
break;
}
- schedule();
+ poll_schedule(pwq);
}
- set_current_state(TASK_RUNNING);
poll_freewait(pwq);
return retval;
}
Index: work/Documentation/filesystems/Locking
===================================================================
--- work.orig/Documentation/filesystems/Locking
+++ work/Documentation/filesystems/Locking
@@ -398,7 +398,7 @@ prototypes:
};
locking rules:
- All except ->poll() may block.
+ All may block.
BKL
llseek: no (see below)
read: no
On Sun, 23 Nov 2008, Tejun Heo wrote:
> +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
> +{
> + struct poll_wqueues *pwq = wait->private;
> + DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
> +
> + set_mb(pwq->triggered, 1);
> +
> + /* perform the default wake up operation */
> + return default_wake_function(&dummy_wait, mode, sync, key);
> +}
Wouldn't it be nicer to un-static try_to_wake_up() (or a wrapper) instead
of setting up a fake wait queue just to use default_wake_function(), just
to wake up a task?
- Davide
Davide Libenzi wrote:
> On Sun, 23 Nov 2008, Tejun Heo wrote:
>
>> +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
>> +{
>> + struct poll_wqueues *pwq = wait->private;
>> + DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
>> +
>> + set_mb(pwq->triggered, 1);
>> +
>> + /* perform the default wake up operation */
>> + return default_wake_function(&dummy_wait, mode, sync, key);
>> +}
>
> Wouldn't it be nicer to un-static try_to_wake_up() (or a wrapper) instead
> of setting up a fake wait queue just to use default_wake_function(), just
> to wake up a task?
I thought try_to_wake_up() was made static to avoid abuse but then again
creating dummy waitqueue is an obvious abuse of waitqueue. What do
other people think? I'll be happy to use try_to_wake_up() directly.
Thanks.
--
tejun
On Sun, Nov 23, 2008 at 12:05:53PM +0900, Tejun Heo wrote:
> I thought try_to_wake_up() was made static to avoid abuse but then again
> creating dummy waitqueue is an obvious abuse of waitqueue. What do
> other people think? I'll be happy to use try_to_wake_up() directly.
Do you need all the extra arguments? The function wake_up_process()
is already a wrapper around try_to_wake_up() and is exported, but
it doesn't have any arguments other than the task_struct and uses
defaults for the other arguments. I'm not sure if anything in your
code would break by ignoring the other possible values instead of
passing them along from the arguments into the caller.
Brad Boyer
[email protected]
Brad Boyer wrote:
> On Sun, Nov 23, 2008 at 12:05:53PM +0900, Tejun Heo wrote:
>> I thought try_to_wake_up() was made static to avoid abuse but then again
>> creating dummy waitqueue is an obvious abuse of waitqueue. What do
>> other people think? I'll be happy to use try_to_wake_up() directly.
>
> Do you need all the extra arguments? The function wake_up_process()
> is already a wrapper around try_to_wake_up() and is exported, but
> it doesn't have any arguments other than the task_struct and uses
> defaults for the other arguments. I'm not sure if anything in your
> code would break by ignoring the other possible values instead of
> passing them along from the arguments into the caller.
Hmmm... there was something which made wake_up_process() inappropriate.
Ah, okay, it was @mode. We can add a WARN_ON() if @mode is an
unexpected value and use a fixed one - TASK_INTERRUPTIBLE or TASK_ALL -
but that's even hackier than the waitqueue hack.
Thanks.
--
tejun
* Davide Libenzi <[email protected]> wrote:
> On Sun, 23 Nov 2008, Tejun Heo wrote:
>
> > +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
> > +{
> > + struct poll_wqueues *pwq = wait->private;
> > + DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
> > +
> > + set_mb(pwq->triggered, 1);
> > +
> > + /* perform the default wake up operation */
> > + return default_wake_function(&dummy_wait, mode, sync, key);
> > +}
>
> Wouldn't it be nicer to un-static try_to_wake_up() (or a wrapper)
> instead of setting up a fake wait queue just to use
> default_wake_function(), just to wake up a task?
the already existing single-task-wakeup shortcut method made available
by the scheduler is wake_up_process(). A number of primitives where
waitqueue overhead is of concern already make use of it.
(mutexes/rtmutexes and more)
if you need a different mode flag, then please extend the
wake_up_process() family of APIs.
Ingo
Ingo Molnar wrote:
> * Davide Libenzi <[email protected]> wrote:
>
>> On Sun, 23 Nov 2008, Tejun Heo wrote:
>>
>>> +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
>>> +{
>>> + struct poll_wqueues *pwq = wait->private;
>>> + DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
>>> +
>>> + set_mb(pwq->triggered, 1);
>>> +
>>> + /* perform the default wake up operation */
>>> + return default_wake_function(&dummy_wait, mode, sync, key);
>>> +}
>> Wouldn't it be nicer to un-static try_to_wake_up() (or a wrapper)
>> instead of setting up a fake wait queue just to use
>> default_wake_function(), just to wake up a task?
>
> the already existing single-task-wakeup shortcut method made available
> by the scheduler is wake_up_process(). A number of primitives where
> waitqueue overhead is of concern already make use of it.
> (mutexes/rtmutexes and more)
>
> if you need a different mode flag, then please extend the
> wake_up_process() family of APIs.
Well, the problem is that for wait wake up functions @mode and @sync are
passed as parameter and should be passed over to actual wake up function
as-is, if I add @mode and @sync to wake_up_process() family of APIs,
it's just try_to_wake_up().
Thanks.
--
tejun
* Tejun Heo <[email protected]> wrote:
> Ingo Molnar wrote:
> > * Davide Libenzi <[email protected]> wrote:
> >
> >> On Sun, 23 Nov 2008, Tejun Heo wrote:
> >>
> >>> +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
> >>> +{
> >>> + struct poll_wqueues *pwq = wait->private;
> >>> + DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
> >>> +
> >>> + set_mb(pwq->triggered, 1);
> >>> +
> >>> + /* perform the default wake up operation */
> >>> + return default_wake_function(&dummy_wait, mode, sync, key);
> >>> +}
> >> Wouldn't it be nicer to un-static try_to_wake_up() (or a wrapper)
> >> instead of setting up a fake wait queue just to use
> >> default_wake_function(), just to wake up a task?
> >
> > the already existing single-task-wakeup shortcut method made available
> > by the scheduler is wake_up_process(). A number of primitives where
> > waitqueue overhead is of concern already make use of it.
> > (mutexes/rtmutexes and more)
> >
> > if you need a different mode flag, then please extend the
> > wake_up_process() family of APIs.
>
> Well, the problem is that for wait wake up functions @mode and @sync
> are passed as parameter and should be passed over to actual wake up
> function as-is, if I add @mode and @sync to wake_up_process() family
> of APIs, it's just try_to_wake_up().
_sync() is not something that should normally be done from poll
handlers. But ->poll() handlers should all be TASK_INTERRUPTIBLE,
right? So wake_up_process_interruptible() should be the thing you
need?
Anyway, if you really want to pass in a state filter, you can use the
already existing wake_up_state() method as well.
Ingo
Ingo Molnar wrote:
> _sync() is not something that should normally be done from poll
> handlers. But ->poll() handlers should all be TASK_INTERRUPTIBLE,
> right? So wake_up_process_interruptible() should be the thing you
> need?
>
> Anyway, if you really want to pass in a state filter, you can use the
> already existing wake_up_state() method as well.
It's not really about what I want but more about how the interface
looks in the first place. Something like the following is simply
ugly.
int my_callback(param a, param b, param c)
{
WARN_ON(b != B);
do_something(a);
}
And @sync might be useful depending on who's waking it up, so we
either need to change the wake interface or give it an easier way to
pass those parameters as received. The callback function isn't the
right place to ignore those parameters. It simply doesn't know why
the caller is passing them in or what they mean under the
circumstances.
Thanks.
--
tejun
* Tejun Heo <[email protected]> wrote:
> Ingo Molnar wrote:
> > _sync() is not something that should normally be done from poll
> > handlers. But ->poll() handlers should all be TASK_INTERRUPTIBLE,
> > right? So wake_up_process_interruptible() should be the thing you
> > need?
> >
> > Anyway, if you really want to pass in a state filter, you can use the
> > already existing wake_up_state() method as well.
>
> It's not really about what I want but more about how the interface
> looks in the first place. Something like the following is simply
> ugly.
>
> int my_callback(param a, param b, param c)
> {
> WARN_ON(b != B);
> do_something(a);
> }
>
> And @sync might be useful depending on who's waking it up, so we
> either need to change the wake interface or give it an easier way to
> pass those parameters as received. The callback function isn't the
> right place to ignore those parameters. It simply doesn't know why
> the caller is passing them in or what they mean under the
> circumstances.
We'll likely eliminate the 'sync' parameter from the scheduler. It's
not a flag that should be proliferated.
Ingo
Ingo Molnar wrote:
>> And @sync might be useful depending on who's waking it up, so we
>> either need to change the wake interface or give it an easier way to
>> pass those parameters as received. The callback function isn't the
>> right place to ignore those parameters. It simply doesn't know why
>> the caller is passing them in or what they mean under the
>> circumstances.
>
> We'll likely eliminate the 'sync' parameter from the scheduler. It's
> not a flag that should be proliferated.
But it's still being used in quite hot paths (pipe, splice, socket)
and I don't really wanna mix up a change which can cause subtle
scheduling related performance regression into this patch. How about
using the dummy waitqueue hack for now and when removing the @sync
param, switch it to one of wakeup APIs? I'll be happy to add big /*
TODO */ comment in the function.
Thanks.
--
tejun
On Mon, 24 Nov 2008, Tejun Heo wrote:
> Ingo Molnar wrote:
> >> And @sync might be useful depending on who's waking it up, so we
> >> either need to change the wake interface or give it an easier way to
> >> pass those parameters as received. The callback function isn't the
> >> right place to ignore those parameters. It simply doesn't know why
> >> the caller is passing them in or what they mean under the
> >> circumstances.
> >
> > We'll likely eliminate the 'sync' parameter from the scheduler. It's
> > not a flag that should be proliferated.
>
> But it's still being used in quite hot paths (pipe, splice, socket)
> and I don't really wanna mix up a change which can cause subtle
> scheduling related performance regression into this patch. How about
> using the dummy waitqueue hack for now and when removing the @sync
> param, switch it to one of wakeup APIs? I'll be happy to add big /*
> TODO */ comment in the function.
I was noticing that wake ups on the poll queues do not use sync=1, AFAICS.
So using wake_up_state() and ignoring 'sync' should be fine. And 'key' is
already ignored by default by default_wake_function().
- Davide
On Sun, 23 Nov 2008, Davide Libenzi wrote:
> I was noticing that wake ups on the poll queues do not use sync=1, AFAICS.
> So using wake_up_state() and ignoring 'sync' should be fine. And 'key' is
> already ignored by default by default_wake_function().
*cough* wrong *cough* :) Pipes & more.
- Davide
f_op->poll is the only vfs operation which is not allowed to sleep.
It's because poll and select implementation used task state to
synchronize against wake ups, which doesn't have to be the case
anymore as wait/wake interface can now use custom wake up functions.
The non-sleep restriction can be a bit tricky because ->poll is not
called from an atomic context and the result of accidentally sleeping
in ->poll only shows up as temporary busy looping when the timing is
right or rather wrong.
This patch converts poll/select to use custom wake up function and use
separate triggered variable to synchronize against wake up events.
The only added overhead is an extra function call during wake up and
negligible.
This patch removes the one non-sleep exception from vfs locking rules
and is beneficial to userland filesystem implementations like FUSE, 9p
or peculiar fs like spufs as it's very difficult for those to
implement non-sleeping poll method.
While at it, make the following cosmetic changes to make poll.h and
select.c checkpatch friendly.
* s/type * symbol/type *symbol/ : three places in poll.h
* remove blank line before EXPORT_SYMBOL() : two places in select.c
Signed-off-by: Tejun Heo <[email protected]>
Cc: Eric Van Hensbergen <[email protected]>
Cc: Ron Minnich <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
Cc: Davide Libenzi <[email protected]>
Cc: Brad Boyer <[email protected]>
---
Added TODO comment to switch to wake_up_process() once @sync is gone.
Thanks.
Documentation/filesystems/Locking | 2 -
drivers/media/video/v4l1-compat.c | 4 --
fs/select.c | 58
++++++++++++++++++++++++++++----------
include/linux/poll.h | 15 +++++++--
4 files changed, 58 insertions(+), 21 deletions(-)
diff --git a/Documentation/filesystems/Locking
b/Documentation/filesystems/Locking
index 23d2f44..250f3c4 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -398,7 +398,7 @@ prototypes:
};
locking rules:
- All except ->poll() may block.
+ All may block.
BKL
llseek: no (see below)
read: no
diff --git a/drivers/media/video/v4l1-compat.c
b/drivers/media/video/v4l1-compat.c
index f13c0a9..d0253da 100644
--- a/drivers/media/video/v4l1-compat.c
+++ b/drivers/media/video/v4l1-compat.c
@@ -203,7 +203,6 @@ static int poll_one(struct file *file, struct
poll_wqueues *pwq)
table = &pwq->pt;
for (;;) {
int mask;
- set_current_state(TASK_INTERRUPTIBLE);
mask = file->f_op->poll(file, table);
if (mask & POLLIN)
break;
@@ -212,9 +211,8 @@ static int poll_one(struct file *file, struct
poll_wqueues *pwq)
retval = -ERESTARTSYS;
break;
}
- schedule();
+ poll_schedule(pwq, TASK_INTERRUPTIBLE);
}
- set_current_state(TASK_RUNNING);
poll_freewait(pwq);
return retval;
}
diff --git a/fs/select.c b/fs/select.c
index 87df51e..f2ef68b 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp,
wait_queue_head_t *wait_address,
void poll_initwait(struct poll_wqueues *pwq)
{
init_poll_funcptr(&pwq->pt, __pollwait);
+ pwq->polling_task = current;
pwq->error = 0;
pwq->table = NULL;
pwq->inline_index = 0;
}
-
EXPORT_SYMBOL(poll_initwait);
static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq)
free_page((unsigned long) old);
}
}
-
EXPORT_SYMBOL(poll_freewait);
-static struct poll_table_entry *poll_get_entry(poll_table *_p)
+static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
- struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
struct poll_table_page *table = p->table;
if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry
*poll_get_entry(poll_table *_p)
new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
if (!new_table) {
p->error = -ENOMEM;
- __set_current_state(TASK_RUNNING);
return NULL;
}
new_table->entry = new_table->entries;
@@ -171,20 +168,57 @@ static struct poll_table_entry
*poll_get_entry(poll_table *_p)
return table->entry++;
}
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ struct poll_wqueues *pwq = wait->private;
+ DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
+
+ set_mb(pwq->triggered, 1);
+
+ /*
+ * Perform the default wake up operation using a dummy
+ * waitqueue.
+ *
+ * TODO: This is hacky but there currently is no interface to
+ * pass in @sync. @sync is scheduled to be removed and once
+ * that happens, wake_up_process() can be used directly.
+ */
+ return default_wake_function(&dummy_wait, mode, sync, key);
+}
+
/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
poll_table *p)
{
- struct poll_table_entry *entry = poll_get_entry(p);
+ struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
+ struct poll_table_entry *entry = poll_get_entry(pwq);
if (!entry)
return;
get_file(filp);
entry->filp = filp;
entry->wait_address = wait_address;
- init_waitqueue_entry(&entry->wait, current);
+ init_waitqueue_func_entry(&entry->wait, pollwake);
+ entry->wait.private = pwq;
add_wait_queue(wait_address, &entry->wait);
}
+int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+ ktime_t *expires, unsigned long slack)
+{
+ int rc = -EINTR;
+
+ set_current_state(state);
+ if (!pwq->triggered)
+ rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+ __set_current_state(TASK_RUNNING);
+
+ /* clear triggered for the next iteration */
+ pwq->triggered = 0;
+
+ return rc;
+}
+EXPORT_SYMBOL(poll_schedule_timeout);
+
/**
* poll_select_set_timeout - helper function to setup the timeout value
* @to: pointer to timespec variable for the final timeout
@@ -340,8 +374,6 @@ int do_select(int n, fd_set_bits *fds, struct
timespec *end_time)
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
- set_current_state(TASK_INTERRUPTIBLE);
-
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -411,10 +443,10 @@ int do_select(int n, fd_set_bits *fds, struct
timespec *end_time)
to = &expire;
}
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
+ to, slack))
timed_out = 1;
}
- __set_current_state(TASK_RUNNING);
poll_freewait(&table);
@@ -666,7 +698,6 @@ static int do_poll(unsigned int nfds, struct
poll_list *list,
for (;;) {
struct poll_list *walk;
- set_current_state(TASK_INTERRUPTIBLE);
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
@@ -709,10 +740,9 @@ static int do_poll(unsigned int nfds, struct
poll_list *list,
to = &expire;
}
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
- __set_current_state(TASK_RUNNING);
return count;
}
diff --git a/include/linux/poll.h b/include/linux/poll.h
index badd98a..8c24ef8 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -46,9 +46,9 @@ static inline void init_poll_funcptr(poll_table *pt,
poll_queue_proc qproc)
}
struct poll_table_entry {
- struct file * filp;
+ struct file *filp;
wait_queue_t wait;
- wait_queue_head_t * wait_address;
+ wait_queue_head_t *wait_address;
};
/*
@@ -56,7 +56,9 @@ struct poll_table_entry {
*/
struct poll_wqueues {
poll_table pt;
- struct poll_table_page * table;
+ struct poll_table_page *table;
+ struct task_struct *polling_task;
+ int triggered;
int error;
int inline_index;
struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
@@ -64,6 +66,13 @@ struct poll_wqueues {
extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
+extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+ ktime_t *expires, unsigned long slack);
+
+static inline int poll_schedule(struct poll_wqueues *pwq, int state)
+{
+ return poll_schedule_timeout(pwq, state, NULL, 0);
+}
/*
* Scaleable version of the fd_set.