2010-04-28 05:30:16

by Changli Gao

Subject: [RFC] sched: implement the exclusive wait queue as a LIFO queue

implement the exclusive wait queue as a LIFO queue

If the exclusive wait queue is also a LIFO queue like the normal wait queue, the
process that went to sleep most recently will be woken up first. As its memory
is more likely to still be in cache, we will get better performance. And when
there are many processes waiting on an exclusive wait queue, some of them may
never be woken up if the others can handle the workload, which will reduce
the load on the scheduler.

Note: before applying this patch, you need to apply my previous patch first:
https://patchwork.kernel.org/patch/95600/

Signed-off-by: Changli Gao <[email protected]>
----
fs/eventpoll.c | 3 +--
include/linux/wait.h | 17 +++++++----------
kernel/sched.c | 8 ++++----
kernel/wait.c | 9 +++------
4 files changed, 15 insertions(+), 22 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5..e9b3ebe 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1140,8 +1140,7 @@ retry:
* ep_poll_callback() when events will become available.
*/
init_waitqueue_entry(&wait, current);
- wait.flags |= WQ_FLAG_EXCLUSIVE;
- __add_wait_queue(&ep->wq, &wait);
+ __add_wait_queue_ex(&ep->wq, &wait);

for (;;) {
/*
diff --git a/include/linux/wait.h b/include/linux/wait.h
index a48e16b..95c127d 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -30,8 +30,6 @@ typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, v
int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);

struct __wait_queue {
- unsigned int flags;
-#define WQ_FLAG_EXCLUSIVE 0x01
void *private;
wait_queue_func_t func;
struct list_head task_list;
@@ -50,6 +48,7 @@ struct wait_bit_queue {
struct __wait_queue_head {
spinlock_t lock;
struct list_head task_list;
+ struct list_head task_list_ex;
};
typedef struct __wait_queue_head wait_queue_head_t;

@@ -69,7 +68,8 @@ struct task_struct;

#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
- .task_list = { &(name).task_list, &(name).task_list } }
+ .task_list = { &(name).task_list, &(name).task_list }, \
+ .task_list_ex = { &(name).task_list_ex, &(name).task_list_ex } }

#define DECLARE_WAIT_QUEUE_HEAD(name) \
wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
@@ -97,7 +97,6 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *)

static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
- q->flags = 0;
q->private = p;
q->func = default_wake_function;
}
@@ -105,14 +104,13 @@ static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
static inline void init_waitqueue_func_entry(wait_queue_t *q,
wait_queue_func_t func)
{
- q->flags = 0;
q->private = NULL;
q->func = func;
}

static inline int waitqueue_active(wait_queue_head_t *q)
{
- return !list_empty(&q->task_list);
+ return !list_empty(&q->task_list) || !list_empty(&q->task_list);
}

extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
@@ -127,10 +125,10 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
/*
* Used for wake-one threads:
*/
-static inline void __add_wait_queue_tail(wait_queue_head_t *head,
+static inline void __add_wait_queue_ex(wait_queue_head_t *head,
wait_queue_t *new)
{
- list_add_tail(&new->task_list, &head->task_list);
+ list_add(&new->task_list, &head->task_list_ex);
}

static inline void __remove_wait_queue(wait_queue_head_t *head,
@@ -409,8 +407,7 @@ do { \
static inline void add_wait_queue_exclusive_locked(wait_queue_head_t *q,
wait_queue_t * wait)
{
- wait->flags |= WQ_FLAG_EXCLUSIVE;
- __add_wait_queue_tail(q, wait);
+ __add_wait_queue_ex(q, wait);
}

/*
diff --git a/kernel/sched.c b/kernel/sched.c
index be5ab70..59b1534 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3903,11 +3903,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
{
wait_queue_t *curr, *next;

- list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
- unsigned flags = curr->flags;
+ list_for_each_entry_safe(curr, next, &q->task_list, task_list)
+ curr->func(curr, mode, wake_flags, key);

- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ list_for_each_entry_safe(curr, next, &q->task_list_ex, task_list) {
+ if (curr->func(curr, mode, wake_flags, key) && !--nr_exclusive)
break;
}
}
diff --git a/kernel/wait.c b/kernel/wait.c
index c4bd3d8..a0559df 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -15,6 +15,7 @@ void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
spin_lock_init(&q->lock);
lockdep_set_class(&q->lock, key);
INIT_LIST_HEAD(&q->task_list);
+ INIT_LIST_HEAD(&q->task_list_ex);
}

EXPORT_SYMBOL(__init_waitqueue_head);
@@ -23,7 +24,6 @@ void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;

- wait->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue(q, wait);
spin_unlock_irqrestore(&q->lock, flags);
@@ -34,9 +34,8 @@ void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;

- wait->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue_tail(q, wait);
+ __add_wait_queue_ex(q, wait);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue_exclusive);
@@ -69,7 +68,6 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;

- wait->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list))
__add_wait_queue(q, wait);
@@ -83,10 +81,9 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;

- wait->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list))
- __add_wait_queue_tail(q, wait);
+ __add_wait_queue_ex(q, wait);
set_current_state(state);
spin_unlock_irqrestore(&q->lock, flags);
}


2010-04-28 06:23:11

by Changli Gao

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 1:03 PM, Changli Gao <[email protected]> wrote:
>
>  static inline int waitqueue_active(wait_queue_head_t *q)
>  {
> -       return !list_empty(&q->task_list);
> +       return !list_empty(&q->task_list) || !list_empty(&q->task_list);
>  }
>

I am sorry, the latter task_list should be task_list_ex. The updated
patch is attached. And I will replace the task_list list with an hlist to
save space.

--
Regards,
Changli Gao([email protected])


Attachments:
wq-ex-lifo.diff (5.21 kB)

2010-04-28 07:47:59

by Xiaotian Feng

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 1:03 PM, Changli Gao <[email protected]> wrote:
> implement the exclusive wait queue as a LIFO queue
>
> If the exclusive wait queue is also a LIFO queue as the normal wait queue, the
> process who goes to sleep recently, will be woke up first. As its memory is
> more likely in cache, we will get better performance. And when there are many
> processes waiting on a exclusive wait queue, some of them may not be woke up,
> if the others can handle the workload, and it will reduce the load of
> the scheduler.
>

Starve some processes for performance?

> Note: before applying this patch, you need my previous patch patched first.
> https://patchwork.kernel.org/patch/95600/
>
> Signed-off-by: Changli Gao <[email protected]>
> ----
>  fs/eventpoll.c       |    3 +--
>  include/linux/wait.h |   17 +++++++----------
>  kernel/sched.c       |    8 ++++----
>  kernel/wait.c        |    9 +++------
>  4 files changed, 15 insertions(+), 22 deletions(-)
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index bd056a5..e9b3ebe 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -1140,8 +1140,7 @@ retry:
>                 * ep_poll_callback() when events will become available.
>                 */
>                init_waitqueue_entry(&wait, current);
> -               wait.flags |= WQ_FLAG_EXCLUSIVE;
> -               __add_wait_queue(&ep->wq, &wait);
> +               __add_wait_queue_ex(&ep->wq, &wait);
>
>                for (;;) {
>                        /*
> diff --git a/include/linux/wait.h b/include/linux/wait.h
> index a48e16b..95c127d 100644
> --- a/include/linux/wait.h
> +++ b/include/linux/wait.h
> @@ -30,8 +30,6 @@ typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, v
>  int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
>
>  struct __wait_queue {
> -       unsigned int flags;
> -#define WQ_FLAG_EXCLUSIVE      0x01
>        void *private;
>        wait_queue_func_t func;
>        struct list_head task_list;
> @@ -50,6 +48,7 @@ struct wait_bit_queue {
>  struct __wait_queue_head {
>        spinlock_t lock;
>        struct list_head task_list;
> +       struct list_head task_list_ex;
>  };
>  typedef struct __wait_queue_head wait_queue_head_t;
>
> @@ -69,7 +68,8 @@ struct task_struct;
>
>  #define __WAIT_QUEUE_HEAD_INITIALIZER(name) {                          \
>        .lock           = __SPIN_LOCK_UNLOCKED(name.lock),              \
> -       .task_list      = { &(name).task_list, &(name).task_list } }
> +       .task_list      = { &(name).task_list, &(name).task_list },     \
> +       .task_list_ex   = { &(name).task_list_ex, &(name).task_list_ex } }
>
>  #define DECLARE_WAIT_QUEUE_HEAD(name) \
>        wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
> @@ -97,7 +97,6 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *)
>
>  static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
>  {
> -       q->flags = 0;
>        q->private = p;
>        q->func = default_wake_function;
>  }
> @@ -105,14 +104,13 @@ static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
>  static inline void init_waitqueue_func_entry(wait_queue_t *q,
>                                        wait_queue_func_t func)
>  {
> -       q->flags = 0;
>        q->private = NULL;
>        q->func = func;
>  }
>
>  static inline int waitqueue_active(wait_queue_head_t *q)
>  {
> -       return !list_empty(&q->task_list);
> +       return !list_empty(&q->task_list) || !list_empty(&q->task_list);
>  }
>
>  extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
> @@ -127,10 +125,10 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
>  /*
>  * Used for wake-one threads:
>  */
> -static inline void __add_wait_queue_tail(wait_queue_head_t *head,
> +static inline void __add_wait_queue_ex(wait_queue_head_t *head,
>                                                wait_queue_t *new)
>  {
> -       list_add_tail(&new->task_list, &head->task_list);
> +       list_add(&new->task_list, &head->task_list_ex);
>  }
>
>  static inline void __remove_wait_queue(wait_queue_head_t *head,
> @@ -409,8 +407,7 @@ do {                                                                        \
>  static inline void add_wait_queue_exclusive_locked(wait_queue_head_t *q,
>                                                   wait_queue_t * wait)
>  {
> -       wait->flags |= WQ_FLAG_EXCLUSIVE;
> -       __add_wait_queue_tail(q,  wait);
> +       __add_wait_queue_ex(q,  wait);
>  }
>
>  /*
> diff --git a/kernel/sched.c b/kernel/sched.c
> index be5ab70..59b1534 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -3903,11 +3903,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
>  {
>        wait_queue_t *curr, *next;
>
> -       list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
> -               unsigned flags = curr->flags;
> +       list_for_each_entry_safe(curr, next, &q->task_list, task_list)
> +               curr->func(curr, mode, wake_flags, key);
>
> -               if (curr->func(curr, mode, wake_flags, key) &&
> -                               (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
> +       list_for_each_entry_safe(curr, next, &q->task_list_ex, task_list) {
> +               if (curr->func(curr, mode, wake_flags, key) && !--nr_exclusive)
>                        break;
>        }
>  }
> diff --git a/kernel/wait.c b/kernel/wait.c
> index c4bd3d8..a0559df 100644
> --- a/kernel/wait.c
> +++ b/kernel/wait.c
> @@ -15,6 +15,7 @@ void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
>        spin_lock_init(&q->lock);
>        lockdep_set_class(&q->lock, key);
>        INIT_LIST_HEAD(&q->task_list);
> +       INIT_LIST_HEAD(&q->task_list_ex);
>  }
>
>  EXPORT_SYMBOL(__init_waitqueue_head);
> @@ -23,7 +24,6 @@ void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
>  {
>        unsigned long flags;
>
> -       wait->flags &= ~WQ_FLAG_EXCLUSIVE;
>        spin_lock_irqsave(&q->lock, flags);
>        __add_wait_queue(q, wait);
>        spin_unlock_irqrestore(&q->lock, flags);
> @@ -34,9 +34,8 @@ void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
>  {
>        unsigned long flags;
>
> -       wait->flags |= WQ_FLAG_EXCLUSIVE;
>        spin_lock_irqsave(&q->lock, flags);
> -       __add_wait_queue_tail(q, wait);
> +       __add_wait_queue_ex(q, wait);
>        spin_unlock_irqrestore(&q->lock, flags);
>  }
>  EXPORT_SYMBOL(add_wait_queue_exclusive);
> @@ -69,7 +68,6 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
>  {
>        unsigned long flags;
>
> -       wait->flags &= ~WQ_FLAG_EXCLUSIVE;
>        spin_lock_irqsave(&q->lock, flags);
>        if (list_empty(&wait->task_list))
>                __add_wait_queue(q, wait);
> @@ -83,10 +81,9 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
>  {
>        unsigned long flags;
>
> -       wait->flags |= WQ_FLAG_EXCLUSIVE;
>        spin_lock_irqsave(&q->lock, flags);
>        if (list_empty(&wait->task_list))
> -               __add_wait_queue_tail(q, wait);
> +               __add_wait_queue_ex(q, wait);
>        set_current_state(state);
>        spin_unlock_irqrestore(&q->lock, flags);
>  }

2010-04-28 13:22:09

by David Howells

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

Changli Gao <[email protected]> wrote:

> You are right. I made the wrong assumption. But we indeed need some
> primitive to add wait_queue at the head of the wait_queue_head, and I
> know epoll needs it, at least.
>
> fs/eventpoll.c: 1443.
> wait.flags |= WQ_FLAG_EXCLUSIVE;
> __add_wait_queue(&ep->wq, &wait);

You can add at either end of the wait queue with __add_wait_queue() and
__add_wait_queue_tail().
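
For reference, a simplified sketch of what those two helpers look like
(consistent with the diff context quoted earlier in this thread; the real
definitions live in include/linux/wait.h):

static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
{
        list_add(&new->task_list, &head->task_list);            /* add at the head */
}

static inline void __add_wait_queue_tail(wait_queue_head_t *head,
                                         wait_queue_t *new)
{
        list_add_tail(&new->task_list, &head->task_list);       /* add at the tail */
}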

David

2010-04-28 13:37:27

by David Howells

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

Changli Gao <[email protected]> wrote:

> @@ -50,6 +48,7 @@ struct wait_bit_queue {
> struct __wait_queue_head {
> spinlock_t lock;
> struct list_head task_list;
> + struct list_head task_list_ex;

It would be preferable if you could avoid making struct __wait_queue_head
bigger. That will increase the size of a lot of things.

David

2010-04-28 13:56:54

by Changli Gao

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 5:32 PM, David Howells <[email protected]> wrote:
> Changli Gao <[email protected]> wrote:
>
>> @@ -50,6 +48,7 @@ struct wait_bit_queue {
>>  struct __wait_queue_head {
>>       spinlock_t lock;
>>       struct list_head task_list;
>> +     struct list_head task_list_ex;
>
> It would be preferable it if you could avoid making struct __wait_queue_head
> bigger.  That will increase the size of a lot of things.
>

I don't know how to do that, as there may be both non-exclusive and
exclusive waiters on the same wait queue head. If we want to enqueue
exclusive waiters at the head of the exclusive part, we have to know where
that head is; otherwise, we have to loop to find it when enqueuing.

--
Regards,
Changli Gao([email protected])

2010-04-28 13:42:35

by Changli Gao

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 9:21 PM, Jamie Lokier <[email protected]> wrote:
> Changli Gao wrote:
>>
>> fs/eventpoll.c: 1443.
>>                 wait.flags |= WQ_FLAG_EXCLUSIVE;
>>                 __add_wait_queue(&ep->wq, &wait);
>
> The same thing about assumptions applies here.  The userspace process
> may be waiting for an epoll condition to get access to a resource,
> rather than being a worker thread interchangeable with others.

Oh, the lines above are the current ones. So the assumption applies
and works here.

>
> For example, userspace might be using a pipe as a signal-safe lock, or
> signal-safe multi-token semaphore, and epoll to wait for that pipe.
>
> WQ_FLAG_EXCLUSIVE means there is no point waking all tasks, to avoid a
> pointless thundering herd.  It doesn't mean unfairness is ok.

The users should not make any assumption about the waking up sequence,
neither LIFO nor FIFO.

>
> The LIFO idea _might_ make sense for interchangeable worker-thread
> situations - including userspace.  It would make sense for pipe
> waiters, socket waiters (especially accept), etc.

Yea, and my following patches are for socket waiters.

>
> Do you have any measurements which showing the LIFO mode performing
> better than FIFO, and by how much?
>

I haven't done any tests yet. But some work done by the LSE project years
ago showed that it is better.

http://lse.sourceforge.net/io/aionotes.txt

" Also in view of
better cache utilization the wake queue mechanism is LIFO by default.
(A new exclusive LIFO wakeup option has been introduced for this purpose)"

--
Regards,
Changli Gao([email protected])

2010-04-28 13:23:21

by David Howells

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue


Can you split the wait queue code and differentiate exclusive wait queues with
LIFO functionality from wait queues with FIFO functionality? I can see why
your suggestion is desirable.

David

2010-04-28 11:17:51

by Changli Gao

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 5:29 PM, David Howells <[email protected]> wrote:
> Changli Gao <[email protected]> wrote:
>
>> If there isn't enough work to be done, we'd better not disrupt them
>> and  leave them sleeping forever to keep the scheduler happier. Do we
>> have reason to keep fair to all the workers? Does it have benefit?
>
> You've made one important assumption: the processes on the wait queue are
> sleeping waiting to service things... but what if the wait queue governs
> access to a resource, and all the processes on that wait queue need access to
> that resource to do things?  Some of the processes waiting for it may never
> get a go, and so necessary work may be left undone.
>

You are right. I made the wrong assumption. But we do need some primitive
to add a wait_queue entry at the head of the wait_queue_head, and I know
epoll needs it, at least.

fs/eventpoll.c: 1443.
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue(&ep->wq, &wait);

--
Regards,
Changli Gao([email protected])

2010-04-28 14:09:00

by David Howells

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

Changli Gao <[email protected]> wrote:

> I don't know how to do that, as maybe there are non-exclusive and
> exclusive wait queues in the same wait queue head. If we want to
> enqueue exclusive wait queues at the head of exclusive queues, we have
> to know where the head is, otherwise, we have to loop to find the head
> when enqueuing.

I suspect you really want to have the semantics defined per-queue. _Either_ a
queue is FIFO (such as processes waiting for a resource so they can do
something with it) _or_ it is LIFO (such as a pool of processes waiting to be
given work).

How often do the two actually mix? And if they do, is that really an error?

David

2010-04-28 15:26:16

by Jamie Lokier

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

Changli Gao wrote:
> On Wed, Apr 28, 2010 at 9:21 PM, Jamie Lokier <[email protected]> wrote:
> > Changli Gao wrote:
> >>
> >> fs/eventpoll.c: 1443.
> >>                 wait.flags |= WQ_FLAG_EXCLUSIVE;
> >>                 __add_wait_queue(&ep->wq, &wait);
> >
> > The same thing about assumptions applies here.  The userspace process
> > may be waiting for an epoll condition to get access to a resource,
> > rather than being a worker thread interchangeable with others.
>
> Oh, the lines above are the current ones. So the assumptions applies
> and works here.

No, because WQ_FLAG_EXCLUSIVE doesn't have your LIFO semantic at the moment.

Your patch changes the behaviour of epoll, though I don't know if it
matters. Perhaps all programs which have multiple tasks waiting on
the same epoll fd are "interchangeable worker thread" types anyway :-)

> > For example, userspace might be using a pipe as a signal-safe lock, or
> > signal-safe multi-token semaphore, and epoll to wait for that pipe.
> >
> > WQ_FLAG_EXCLUSIVE means there is no point waking all tasks, to avoid a
> > pointless thundering herd.  It doesn't mean unfairness is ok.
>
> The users should not make any assumption about the waking up sequence,
> neither LIFO nor FIFO.

Correct, but they should be able to assume non-starvation (eventual
progress) for all waiters.

It's one of those subtle things, possibly a unixy thing: Non-RT tasks
should always make progress when the competition is just other non-RT
tasks, even if the progress is slow.

Starvation can spread out beyond the starved process, to cause
priority inversions in other tasks that are waiting on a resource
locked by the starved process. Among other things, that can cause
higher priority tasks, and RT priority tasks, to block permanently.
Very unpleasant.

> > The LIFO idea _might_ make sense for interchangeable worker-thread
> > situations - including userspace.  It would make sense for pipe
> > waiters, socket waiters (especially accept), etc.
>
> Yea, and my following patches are for socket waiters.

Unix socketpairs are occasionally used in the above ways too.

I'm not against your patch, but I worry that starvation is a new
semantic, and it may have a significant effect on something - either
in the kernel, or in userspace which is harder to check.

> > Do you have any measurements which showing the LIFO mode performing
> > better than FIFO, and by how much?
>
> I didn't do any test yet. But some work done by LSE project years ago
> showed that it is better.
>
> http://lse.sourceforge.net/io/aionotes.txt
>
> " Also in view of
> better cache utilization the wake queue mechanism is LIFO by default.
> (A new exclusive LIFO wakeup option has been introduced for this purpose)"

I suspect it's possible to combine LIFO-ish and FIFO-ish queuing to
prevent starvation while getting some of the locality benefit.
Something like add-LIFO and increment a small counter in the next wait
entry, but never add in front of an entry whose counter has reached
MAX_LIFO_WAITERS? :-)
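
In C, a rough sketch of that hybrid add could look like the following.
Everything here is hypothetical: wait_queue_t carries no such counter today,
so the sketch wraps it, and MAX_LIFO_WAITERS is an invented name. It assumes
every waiter on the queue uses the wrapper and that the caller holds the
queue lock.

struct lifo_wait {
        wait_queue_t    wait;
        unsigned int    skips;  /* how many later arrivals jumped ahead of us */
};

#define MAX_LIFO_WAITERS 8

/* Caller holds head->lock. */
static void add_wait_lifo_bounded(wait_queue_head_t *head, struct lifo_wait *new)
{
        new->skips = 0;
        if (!list_empty(&head->task_list)) {
                struct lifo_wait *front;

                front = list_first_entry(&head->task_list, struct lifo_wait,
                                         wait.task_list);
                if (front->skips >= MAX_LIFO_WAITERS) {
                        /* The front waiter has been bypassed enough times:
                         * queue behind it so it finally gets woken. */
                        list_add(&new->wait.task_list, &front->wait.task_list);
                        return;
                }
                front->skips++;         /* we are cutting in ahead of it */
        }
        list_add(&new->wait.task_list, &head->task_list);       /* plain LIFO add */
}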

-- Jamie

2010-04-28 13:48:06

by Changli Gao

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 5:34 PM, David Howells <[email protected]> wrote:
>
> Can you split the wait queue code and differentiate exclusive wait queues with
> LIFO functionality from wait queues with FIFO functionality.  I can see why
> your suggestion is desirable.
>

OK. I will use two lists: one for non-exclusive wait queues, and the
other for exclusive wait queues, and I'll add new interfaces for LIFO
functionality instead of changing the current interfaces.
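
Roughly, the split might look like this (a sketch only; it mirrors the patch
earlier in this thread, the helper name is provisional, and the existing FIFO
interfaces would stay untouched):

struct __wait_queue_head {
        spinlock_t              lock;
        struct list_head        task_list;      /* non-exclusive waiters */
        struct list_head        task_list_ex;   /* exclusive waiters, added LIFO */
};

/* New interface for LIFO exclusive waits. */
static inline void __add_wait_queue_ex(wait_queue_head_t *head, wait_queue_t *new)
{
        list_add(&new->task_list, &head->task_list_ex);
}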

--
Regards,
Changli Gao([email protected])

2010-04-28 14:53:48

by Changli Gao

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 10:06 PM, David Howells <[email protected]> wrote:
>
> I suspect you really want to have the semantics defined per-queue.  _Either_ a
> queue is FIFO (such as processes waiting for a resource so they can do
> something with it) _or_ it is LIFO (such as a pool of processes waiting to be
> given work).
>
> How often do the two actually mix?  And if they do, is that really an error?
>

The sock.sk_sleep wait queue head is used by both exclusive and
non-exclusive waiters. Exclusive vs. non-exclusive is identified by the
wait queue entries, not by the wait queue heads. Maybe there is a
historical reason. It is much like a hack.

--
Regards,
Changli Gao([email protected])

2010-04-28 08:17:27

by Yong Zhang

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 03:52:01PM +0800, Changli Gao wrote:
> On Wed, Apr 28, 2010 at 3:47 PM, Xiaotian Feng <[email protected]> wrote:
> > On Wed, Apr 28, 2010 at 1:03 PM, Changli Gao <[email protected]> wrote:
> >> implement the exclusive wait queue as a LIFO queue
> >>
> >> If the exclusive wait queue is also a LIFO queue as the normal wait queue, the
> >> process who goes to sleep recently, will be woke up first. As its memory is
> >> more likely in cache, we will get better performance. And when there are many
> >> processes waiting on a exclusive wait queue, some of them may not be woke up,
> >> if the others can handle the workload, and it will reduce the load of
> >> the scheduler.
> >>
> >
> > Starve some processes for performance?
> >
>
> Starve? Oh, No. If we don't need these processes, and we can do better

What do you mean "we don't need these processes"?

> without them, why we wake them up?

So some processes (at the tail of the exclusive list) will be treated
abnormally and will sleep for a long time; is this reasonable?

>
>
> --
> Regards,
> Changli Gao([email protected])

2010-04-28 08:06:23

by Changli Gao

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 2:22 PM, Changli Gao <[email protected]> wrote:
>
> I am sorry. the later task_list should be task_list_ex. The updated
> patch is attached. And I will replace task_list list with hlist for
> saving space.
>

This version replaces the list with an hlist.


--
Regards,
Changli Gao([email protected])


Attachments:
wq-ex-lifo.diff (8.43 kB)

2010-04-28 08:24:16

by Changli Gao

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 4:15 PM, Yong Zhang <[email protected]> wrote:
>
> What do you mean "we don't need these processes"?

If there is less work than there are workers, we don't need the workers at
the tail of the exclusive list.

>
>> without them, why we wake them up?
>
> So some processs(at the tail of exclusive list)will be treated abnormally
> and it will sleep for a long time, is this reasonable?
>

If there isn't enough work to be done, we'd better not disrupt them
and leave them sleeping forever to keep the scheduler happier. Do we have
any reason to keep things fair to all the workers? Does it have a benefit?

--
Regards,
Changli Gao([email protected])

2010-04-28 13:22:49

by Jamie Lokier

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

Changli Gao wrote:
> On Wed, Apr 28, 2010 at 5:29 PM, David Howells <[email protected]> wrote:
> > Changli Gao <[email protected]> wrote:
> >
> >> If there isn't enough work to be done, we'd better not disrupt them
> >> and leave them sleeping forever to keep the scheduler happier. Do we
> >> have reason to keep fair to all the workers? Does it have benefit?
> >
> > You've made one important assumption: the processes on the wait queue are
> > sleeping waiting to service things... but what if the wait queue governs
> > access to a resource, and all the processes on that wait queue need access to
> > that resource to do things?  Some of the processes waiting for it may never
> > get a go, and so necessary work may be left undone.
> >
>
> You are right. I made the wrong assumption. But we indeed need some
> primitive to add wait_queue at the head of the wait_queue_head, and I
> know epoll needs it, at least.
>
> fs/eventpoll.c: 1443.
> wait.flags |= WQ_FLAG_EXCLUSIVE;
> __add_wait_queue(&ep->wq, &wait);

The same thing about assumptions applies here. The userspace process
may be waiting for an epoll condition to get access to a resource,
rather than being a worker thread interchangeable with others.

For example, userspace might be using a pipe as a signal-safe lock, or
signal-safe multi-token semaphore, and epoll to wait for that pipe.

WQ_FLAG_EXCLUSIVE means there is no point waking all tasks, to avoid a
pointless thundering herd. It doesn't mean unfairness is ok.

The LIFO idea _might_ make sense for interchangeable worker-thread
situations - including userspace. It would make sense for pipe
waiters, socket waiters (especially accept), etc.

Do you have any measurements which showing the LIFO mode performing
better than FIFO, and by how much?

-- Jamie

2010-04-28 18:57:08

by Davide Libenzi

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, 28 Apr 2010, Changli Gao wrote:

> On Wed, Apr 28, 2010 at 5:29 PM, David Howells <[email protected]> wrote:
> > Changli Gao <[email protected]> wrote:
> >
> >> If there isn't enough work to be done, we'd better not disrupt them
> >> and  leave them sleeping forever to keep the scheduler happier. Do we
> >> have reason to keep fair to all the workers? Does it have benefit?
> >
> > You've made one important assumption: the processes on the wait queue are
> > sleeping waiting to service things... but what if the wait queue governs
> > access to a resource, and all the processes on that wait queue need access to
> > that resource to do things?  Some of the processes waiting for it may never
> > get a go, and so necessary work may be left undone.
> >
>
> You are right. I made the wrong assumption. But we indeed need some
> primitive to add wait_queue at the head of the wait_queue_head, and I
> know epoll needs it, at least.
>
> fs/eventpoll.c: 1443.
> wait.flags |= WQ_FLAG_EXCLUSIVE;
> __add_wait_queue(&ep->wq, &wait);

I'm not sure one user deserves a new function, but as has been noticed,
the patch for that should eventually be totally isolated from the other
bits.


- Davide

2010-04-28 15:34:10

by Changli Gao

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 11:00 PM, David Howells <[email protected]> wrote:
>
> But is it actually used in both modes at the same time?
>

It should seldom be the case in good applications. So I only need to add a
function, like this:

add_wait_queue_head_exclusive(wq, wait)
    /* fast, as there should be no or few non-exclusive waiters to skip */
    first_exclusive = find_the_first_exclusive(wq);
    list_add(wait, first_exclusive);
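
Fleshed out against the current single-list wait queue, that might look
roughly like the sketch below (the function name is just the one from the
pseudo-code above; find_the_first_exclusive() is open-coded as a scan):

static void add_wait_queue_head_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
{
        unsigned long flags;
        struct list_head *pos = &q->task_list;
        wait_queue_t *curr;

        wait->flags |= WQ_FLAG_EXCLUSIVE;
        spin_lock_irqsave(&q->lock, flags);
        /* Skip the (few) non-exclusive waiters at the front, then insert
         * right before the first exclusive waiter. */
        list_for_each_entry(curr, &q->task_list, task_list) {
                if (curr->flags & WQ_FLAG_EXCLUSIVE)
                        break;
                pos = &curr->task_list;
        }
        list_add(&wait->task_list, pos);
        spin_unlock_irqrestore(&q->lock, flags);
}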


--
Regards,
Changli Gao([email protected])

2010-04-28 15:02:39

by David Howells

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

Changli Gao <[email protected]> wrote:

> > How often do the two actually mix?  And if they do, is that really an
> > error?
>
> The sock.sk_sleep is used by exclusive and non-exclusive wait queues.
> exclusive and non-exclusive is identified by wait queues, not wait
> queue heads. Maybe there is a historical reason. It is much like a
> hack.

But is it actually used in both modes at the same time?

David

2010-04-28 15:49:47

by Changli Gao

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 11:25 PM, Jamie Lokier <[email protected]> wrote:
> Changli Gao wrote:
>> On Wed, Apr 28, 2010 at 9:21 PM, Jamie Lokier <[email protected]> wrote:
>> > Changli Gao wrote:
>> >>
>> >> fs/eventpoll.c: 1443.
>> >>                 wait.flags |= WQ_FLAG_EXCLUSIVE;
>> >>                 __add_wait_queue(&ep->wq, &wait);
>> >
>> > The same thing about assumptions applies here.  The userspace process
>> > may be waiting for an epoll condition to get access to a resource,
>> > rather than being a worker thread interchangeable with others.
>>
>> Oh, the lines above are the current ones. So the assumptions applies
>> and works here.
>
> No, because WQ_FLAG_EXCLUSIVE doesn't have your LIFO semantic at the moment.
>
> Your patch changes the behaviour of epoll, though I don't know if it
> matters.  Perhaps all programs which have multiple tasks waiting on
> the same epoll fd are "interchangeable worker thread" types anyway :-)
>

No. You are wrong. I meant that epoll already implements LIFO on its own.
You should check the code. :)

>> > For example, userspace might be using a pipe as a signal-safe lock, or
>> > signal-safe multi-token semaphore, and epoll to wait for that pipe.
>> >
>> > WQ_FLAG_EXCLUSIVE means there is no point waking all tasks, to avoid a
>> > pointless thundering herd.  It doesn't mean unfairness is ok.
>>
>> The users should not make any assumption about the waking up sequence,
>> neither LIFO nor FIFO.
>
> Correct, but they should be able to assume non-starvation (eventual
> progress) for all waiters.
>
> It's one of those subtle things, possibly a unixy thing: Non-RT tasks
> should always make progress when the competition is just other non-RT
> tasks, even if the progress is slow.
>
> Starvation can spread out beyond the starved process, to cause
> priority inversions in other tasks that are waiting on a resource
> locked by the starved process.  Among other things, that can cause
> higher priority tasks, and RT priority tasks, to block permanently.
> Very unpleasant.
>
>> > The LIFO idea _might_ make sense for interchangeable worker-thread
>> > situations - including userspace.  It would make sense for pipe
>> > waiters, socket waiters (especially accept), etc.
>>
>> Yea, and my following patches are for socket waiters.
>
> Occasionally unix socketpairs are occasionally used in the above ways too.
>
> I'm not against your patch, but I worry that starvation is a new
> semantic, and it may have a significant effect on something - either
> in the kernel, or in userspace which is harder to check.

Thanks for the reminder.

>
> I suspect it's possible to combine LIFO-ish and FIFO-ish queuing to
> prevent starvation while getting some of the locality benefit.
> Something like add-LIFO and increment a small counter in the next wait
> entry, but never add in front of an entry whose counter has reached
> MAX_LIFO_WAITERS? :-)
>

It is a little complex, and I'll keep it simple and improve it when necessary.


--
Regards,
Changli Gao([email protected])

2010-04-28 07:52:24

by Changli Gao

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 3:47 PM, Xiaotian Feng <[email protected]> wrote:
> On Wed, Apr 28, 2010 at 1:03 PM, Changli Gao <[email protected]> wrote:
>> implement the exclusive wait queue as a LIFO queue
>>
>> If the exclusive wait queue is also a LIFO queue as the normal wait queue, the
>> process who goes to sleep recently, will be woke up first. As its memory is
>> more likely in cache, we will get better performance. And when there are many
>> processes waiting on a exclusive wait queue, some of them may not be woke up,
>> if the others can handle the workload, and it will reduce the load of
>> the scheduler.
>>
>
> Starve some processes for performance?
>

Starve? Oh, no. If we don't need these processes, and we can do better
without them, why should we wake them up?


--
Regards,
Changli Gao([email protected])

2010-04-28 09:26:46

by Johannes Weiner

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

On Wed, Apr 28, 2010 at 04:23:52PM +0800, Changli Gao wrote:
> On Wed, Apr 28, 2010 at 4:15 PM, Yong Zhang <[email protected]> wrote:
> >
> > What do you mean "we don't need these processes"?
>
> If the work is less than the workers, we don't need the workers at the
> tail of the exculsive list.

Have you checked how exclusive waitqueues are even used?

> > So some processs(at the tail of exclusive list)will be treated abnormally
> > and it will sleep for a long time, is this reasonable?
> >
>
> If there isn't enough work to be done, we'd better not disrupt them
> and leave them sleeping forever to keep the scheduler happier. Do we
> have reason to keep fair to all the workers? Does it have benefit?

How about starving lock contenders? See wait_on_bit_lock() and grep for
its users, for example.
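
Those are exclusive waiters that all need the bit lock to make progress.
A rough sketch of the usual pattern (hypothetical names; the action callback
and unlock sequence follow the style of unlock_page() in this era):

static unsigned long my_flags;
#define MY_LOCK_BIT 0

static int my_lock_action(void *word)
{
        schedule();     /* sleep until the current holder wakes us */
        return 0;
}

static void my_locked_operation(void)
{
        /* Exclusive wait until the bit can be taken. */
        wait_on_bit_lock(&my_flags, MY_LOCK_BIT, my_lock_action,
                         TASK_UNINTERRUPTIBLE);

        /* ... do the work that needs exclusive access ... */

        clear_bit_unlock(MY_LOCK_BIT, &my_flags);
        smp_mb__after_clear_bit();
        wake_up_bit(&my_flags, MY_LOCK_BIT);    /* wake the next contender */
}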

2010-04-28 09:31:45

by David Howells

Subject: Re: [RFC] sched: implement the exclusive wait queue as a LIFO queue

Changli Gao <[email protected]> wrote:

> If there isn't enough work to be done, we'd better not disrupt them
> and leave them sleeping forever to keep the scheduler happier. Do we
> have reason to keep fair to all the workers? Does it have benefit?

You've made one important assumption: the processes on the wait queue are
sleeping waiting to service things... but what if the wait queue governs
access to a resource, and all the processes on that wait queue need access to
that resource to do things? Some of the processes waiting for it may never
get a go, and so necessary work may be left undone.

So NACK.

David