2008-11-24 11:24:41

by Mathieu Desnoyers

Subject: [RFC PATCH] Poll : add poll_wait_set_exclusive (fixing thundering herd problem in LTTng)

Problem description:

In LTTng, all lttd readers are polling all the available debugfs files
for data. This is principally because the number of reader threads is
user-defined and there are typical workloads where a single CPU is
producing most of the tracing data and all other CPUs are idle,
available to consume data. It therefore makes sense not to tie those
threads to specific buffers. However, when the number of threads grows,
we face a "thundering herd" problem where many threads can be woken up
and put back to sleep, leaving only a single thread doing useful work.

Solution:

I just created a patch which adds a poll_wait_set_exclusive() primitive
to poll(), so the code which implements the pollfd operation can specify
that only a single waiter must be woken up.

This patch applies both on 2.6.27.7 and current -tip. It is integrated
and used in the LTTng tree as of LTTng 0.59.

poll_wait_set_exclusive - set poll wait queue to exclusive
Sets up a poll wait queue to use exclusive wakeups, so that only one
waiter is woken up at each wakeup. Used to work around the "thundering
herd" problem.
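
Usage sketch (foo_wait_queue and foo_data_ready are hypothetical driver
names; poll_wait_set_exclusive() has to be called before poll_wait(),
since it replaces the queueing callback of the whole poll table):

#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(foo_wait_queue);
static bool foo_data_ready;	/* set by the producer before wake_up() */

static unsigned int foo_poll(struct file *file, poll_table *wait)
{
	/* Register as an exclusive waiter: a wake_up() on
	 * foo_wait_queue then wakes at most one poller. */
	poll_wait_set_exclusive(wait);
	poll_wait(file, &foo_wait_queue, wait);

	if (foo_data_ready)
		return POLLIN | POLLRDNORM;
	return 0;
}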

Signed-off-by: Mathieu Desnoyers <[email protected]>
CC: William Lee Irwin III <[email protected]>
CC: Ingo Molnar <[email protected]>
---
fs/select.c | 42 +++++++++++++++++++++++++++++++++++++++---
include/linux/poll.h | 2 ++
2 files changed, 41 insertions(+), 3 deletions(-)

Index: linux-2.6-lttng/fs/select.c
===================================================================
--- linux-2.6-lttng.orig/fs/select.c 2008-11-24 05:16:33.000000000 -0500
+++ linux-2.6-lttng/fs/select.c 2008-11-24 05:55:07.000000000 -0500
@@ -50,6 +50,9 @@ struct poll_table_page {
*/
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
poll_table *p);
+static void __pollwait_exclusive(struct file *filp,
+ wait_queue_head_t *wait_address,
+ poll_table *p);

void poll_initwait(struct poll_wqueues *pwq)
{
@@ -90,6 +93,21 @@ void poll_freewait(struct poll_wqueues *

EXPORT_SYMBOL(poll_freewait);

+/**
+ * poll_wait_set_exclusive - set poll wait queue to exclusive
+ *
+ * Sets up a poll wait queue to use exclusive wakeups, so that only one
+ * waiter is woken up at each wakeup. Used to work around the "thundering
+ * herd" problem.
+ */
+void poll_wait_set_exclusive(poll_table *p)
+{
+ if (p)
+ init_poll_funcptr(p, __pollwait_exclusive);
+}
+
+EXPORT_SYMBOL(poll_wait_set_exclusive);
+
static struct poll_table_entry *poll_get_entry(poll_table *_p)
{
struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
@@ -117,8 +135,10 @@ static struct poll_table_entry *poll_get
}

/* Add a new entry */
-static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
- poll_table *p)
+static void __pollwait_common(struct file *filp,
+ wait_queue_head_t *wait_address,
+ poll_table *p,
+ int exclusive)
{
struct poll_table_entry *entry = poll_get_entry(p);
if (!entry)
@@ -127,7 +147,23 @@ static void __pollwait(struct file *filp
entry->filp = filp;
entry->wait_address = wait_address;
init_waitqueue_entry(&entry->wait, current);
- add_wait_queue(wait_address, &entry->wait);
+ if (!exclusive)
+ add_wait_queue(wait_address, &entry->wait);
+ else
+ add_wait_queue_exclusive(wait_address, &entry->wait);
+}
+
+static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
+ poll_table *p)
+{
+ __pollwait_common(filp, wait_address, p, 0);
+}
+
+static void __pollwait_exclusive(struct file *filp,
+ wait_queue_head_t *wait_address,
+ poll_table *p)
+{
+ __pollwait_common(filp, wait_address, p, 1);
}

#define FDS_IN(fds, n) (fds->in + n)
Index: linux-2.6-lttng/include/linux/poll.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/poll.h 2008-11-24 05:16:28.000000000 -0500
+++ linux-2.6-lttng/include/linux/poll.h 2008-11-24 05:25:35.000000000 -0500
@@ -65,6 +65,8 @@ struct poll_wqueues {
extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);

+extern void poll_wait_set_exclusive(poll_table *p);
+
/*
* Scaleable version of the fd_set.
*/
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68


2008-11-24 11:44:26

by KOSAKI Motohiro

Subject: Re: [ltt-dev] [RFC PATCH] Poll : add poll_wait_set_exclusive (fixing thundering herd problem in LTTng)

Hi

> static struct poll_table_entry *poll_get_entry(poll_table *_p)
> {
> struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
> @@ -117,8 +135,10 @@ static struct poll_table_entry *poll_get
> }
>
> /* Add a new entry */
> -static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
> - poll_table *p)
> +static void __pollwait_common(struct file *filp,
> + wait_queue_head_t *wait_address,
> + poll_table *p,
> + int exclusive)
> {
> struct poll_table_entry *entry = poll_get_entry(p);
> if (!entry)
> @@ -127,7 +147,23 @@ static void __pollwait(struct file *filp
> entry->filp = filp;
> entry->wait_address = wait_address;
> init_waitqueue_entry(&entry->wait, current);
> - add_wait_queue(wait_address, &entry->wait);
> + if (!exclusive)
> + add_wait_queue(wait_address, &entry->wait);
> + else
> + add_wait_queue_exclusive(wait_address, &entry->wait);
> +}


I fully agree this feature is needed.
Actually, I made a similar patch about a year ago:

http://marc.info/?l=linux-kernel&m=120257050719087&w=2


But I have one question.
My version has epoll support, but yours doesn't.
Is that intentional?

This is just a dumb question, it doesn't mean any objection.



2008-11-24 11:51:48

by Mathieu Desnoyers

Subject: Re: [ltt-dev] [RFC PATCH] Poll : add poll_wait_set_exclusive (fixing thundering herd problem in LTTng)

* KOSAKI Motohiro ([email protected]) wrote:
> Hi
>
> > static struct poll_table_entry *poll_get_entry(poll_table *_p)
> > {
> > struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
> > @@ -117,8 +135,10 @@ static struct poll_table_entry *poll_get
> > }
> >
> > /* Add a new entry */
> > -static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
> > - poll_table *p)
> > +static void __pollwait_common(struct file *filp,
> > + wait_queue_head_t *wait_address,
> > + poll_table *p,
> > + int exclusive)
> > {
> > struct poll_table_entry *entry = poll_get_entry(p);
> > if (!entry)
> > @@ -127,7 +147,23 @@ static void __pollwait(struct file *filp
> > entry->filp = filp;
> > entry->wait_address = wait_address;
> > init_waitqueue_entry(&entry->wait, current);
> > - add_wait_queue(wait_address, &entry->wait);
> > + if (!exclusive)
> > + add_wait_queue(wait_address, &entry->wait);
> > + else
> > + add_wait_queue_exclusive(wait_address, &entry->wait);
> > +}
>
>
> I fully agree this feature is needed.
> Actually, I made a similar patch about a year ago:
>
> http://marc.info/?l=linux-kernel&m=120257050719087&w=2
>
>
> But I have one question.
> My version has epoll support, but yours doesn't.
> Is that intentional?
>
> This is just a dumb question, it doesn't mean any objection.

Hrm, actually, your patch seems cleaner than mine, but it adds a branch
in the standard hotpath, which mine does not (though I am not sure it is
such an important optimization...). Is there any reason why your patch
did not get merged?

The only reason I did not make an epoll version is simply that LTTng
currently does not support it. :)

Mathieu

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2008-11-24 12:11:41

by KOSAKI Motohiro

Subject: Re: [ltt-dev] [RFC PATCH] Poll : add poll_wait_set_exclusive (fixing thundering herd problem in LTTng)

> > I fully agree this feature is needed.
> > Actually, I made a similar patch about a year ago:
> >
> > http://marc.info/?l=linux-kernel&m=120257050719087&w=2
> >
> >
> > But I have one question.
> > My version has epoll support, but yours doesn't.
> > Is that intentional?
> >
> > This is just a dumb question, it doesn't mean any objection.
>
> Hrm, actually, your patch seems cleaner than mine, but it adds a branch
> in the standard hotpath, which mine does not (though I am not sure it is
> such an important optimization...).

Why do you think poll_wait() is a hot path?
I don't think sys_poll() is a hot path, because it usually puts the
task to sleep.


> Is there any reason why your patch
> did not get merged?

My patch was developed as part of the mem_notify patch series,
but mem_notify was NAKed by akpm,
so it lost its merging motivation ;-)

Ingo, I'll rebase and post my patch against -tip tomorrow.
Could you please review it?


> The only reason I did not make an epoll version is simply that LTTng
> currently does not support it. :)

Thanks.
I understand your original intention.


2008-11-24 12:17:17

by Mathieu Desnoyers

Subject: Re: [ltt-dev] [RFC PATCH] Poll : add poll_wait_set_exclusive (fixing thundering herd problem in LTTng)

* KOSAKI Motohiro ([email protected]) wrote:
> > > I fully agree this feature is needed.
> > > Actually, I made a similar patch about a year ago:
> > >
> > > http://marc.info/?l=linux-kernel&m=120257050719087&w=2
> > >
> > >
> > > But I have one question.
> > > My version has epoll support, but yours doesn't.
> > > Is that intentional?
> > >
> > > This is just a dumb question, it doesn't mean any objection.
> >
> > Hrm, actually, your patch seems cleaner than mine, but it adds a branch
> > in the standard hotpath, which mine does not (though I am not sure it is
> > such an important optimization...).
>
> Why do you think poll_wait() is a hot path?
> I don't think sys_poll() is a hot path, because it usually puts the
> task to sleep.
>
>
> > Is there any reason why your patch
> > did not get merged?
>
> My patch was developed as part of the mem_notify patch series,
> but mem_notify was NAKed by akpm,
> so it lost its merging motivation ;-)
>
> Ingo, I'll rebase and post my patch against -tip tomorrow.
> Could you please review it?
>
>
> > The only reason I did not make an epoll version is simply that LTTng
> > currently does not support it. :)
>
> Thanks.
> I understand your original intention.
>

Great, please CC me on this one so I can integrate it to the LTTng tree.

Mathieu


--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2008-11-25 10:50:44

by KOSAKI Motohiro

Subject: [PATCH] Poll : introduce poll_wait_exclusive() new function


patch against: tip/tracing/marker

==========
Currently, the behavior of wake_up() depends on how the wait queue
entry was added:


                             wake_up()          wake_up_all()
---------------------------------------------------------------
add_wait_queue()             wake up all        wake up all
add_wait_queue_exclusive()   wake up one task   wake up all


Unfortunately, poll_wait() always uses add_wait_queue(), which means
there is no way to wake up only one of the polling processes:
wake_up() wakes all of them, not just one.


Mathieu Desnoyers explained the problem this causes for LTTng:

In LTTng, all lttd readers are polling all the available debugfs files
for data. This is principally because the number of reader threads is
user-defined and there are typical workloads where a single CPU is
producing most of the tracing data and all other CPUs are idle,
available to consume data. It therefore makes sense not to tie those
threads to specific buffers. However, when the number of threads grows,
we face a "thundering herd" problem where many threads can be woken up
and put back to sleep, leaving only a single thread doing useful work.


This patch introduces a new API, poll_wait_exclusive(), to allow
waking up only one process.

<usage example>
unsigned int foo_device_poll(struct file *file,
			     struct poll_table_struct *wait)
{
	poll_wait_exclusive(file, &foo_wait_queue, wait);
	if (data_exist)
		return POLLIN | POLLRDNORM;
	return 0;
}
</usage example>


Signed-off-by: KOSAKI Motohiro <[email protected]>
CC: Mathieu Desnoyers <[email protected]>
CC: Ingo Molnar <[email protected]>
---
fs/eventpoll.c | 7 +++++--
fs/select.c | 9 ++++++---
include/linux/poll.h | 13 +++++++++++--
3 files changed, 22 insertions(+), 7 deletions(-)



Index: b/fs/eventpoll.c
===================================================================
--- a/fs/eventpoll.c 2008-11-25 19:05:28.000000000 +0900
+++ b/fs/eventpoll.c 2008-11-25 19:15:50.000000000 +0900
@@ -655,7 +655,7 @@ out_unlock:
* target file wakeup lists.
*/
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
- poll_table *pt)
+ poll_table *pt, int exclusive)
{
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
@@ -664,7 +664,10 @@ static void ep_ptable_queue_proc(struct
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
- add_wait_queue(whead, &pwq->wait);
+ if (exclusive)
+ add_wait_queue_exclusive(whead, &pwq->wait);
+ else
+ add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
Index: b/fs/select.c
===================================================================
--- a/fs/select.c 2008-11-25 19:04:26.000000000 +0900
+++ b/fs/select.c 2008-11-25 19:15:50.000000000 +0900
@@ -104,7 +104,7 @@ struct poll_table_page {
* poll table.
*/
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
- poll_table *p);
+ poll_table *p, int exclusive);

void poll_initwait(struct poll_wqueues *pwq)
{
@@ -173,7 +173,7 @@ static struct poll_table_entry *poll_get

/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
- poll_table *p)
+ poll_table *p, int exclusive)
{
struct poll_table_entry *entry = poll_get_entry(p);
if (!entry)
@@ -182,7 +182,10 @@ static void __pollwait(struct file *filp
entry->filp = filp;
entry->wait_address = wait_address;
init_waitqueue_entry(&entry->wait, current);
- add_wait_queue(wait_address, &entry->wait);
+ if (exclusive)
+ add_wait_queue_exclusive(wait_address, &entry->wait);
+ else
+ add_wait_queue(wait_address, &entry->wait);
}

/**
Index: b/include/linux/poll.h
===================================================================
--- a/include/linux/poll.h 2008-11-25 19:04:26.000000000 +0900
+++ b/include/linux/poll.h 2008-11-25 19:19:54.000000000 +0900
@@ -28,7 +28,8 @@ struct poll_table_struct;
/*
* structures and helpers for f_op->poll implementations
*/
-typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
+typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *,
+ struct poll_table_struct *, int);

typedef struct poll_table_struct {
poll_queue_proc qproc;
@@ -37,7 +38,15 @@ typedef struct poll_table_struct {
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
if (p && wait_address)
- p->qproc(filp, wait_address, p);
+ p->qproc(filp, wait_address, p, 0);
+}
+
+static inline void poll_wait_exclusive(struct file *filp,
+ wait_queue_head_t *wait_address,
+ poll_table *p)
+{
+ if (p && wait_address)
+ p->qproc(filp, wait_address, p, 1);
}

static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)

2008-11-25 21:21:45

by Davide Libenzi

Subject: Re: [PATCH] Poll : introduce poll_wait_exclusive() new function

On Tue, 25 Nov 2008, KOSAKI Motohiro wrote:

>
> patch against: tip/tracing/marker
>
> ==========
> Currently, the behavior of wake_up() depends on how the wait queue
> entry was added:
>
>
>                              wake_up()          wake_up_all()
> ---------------------------------------------------------------
> add_wait_queue()             wake up all        wake up all
> add_wait_queue_exclusive()   wake up one task   wake up all
>
>
> Unfortunately, poll_wait() always uses add_wait_queue(), which means
> there is no way to wake up only one of the polling processes:
> wake_up() wakes all of them, not just one.
>
>
> Mathieu Desnoyers explained the problem this causes for LTTng:
>
> In LTTng, all lttd readers are polling all the available debugfs files
> for data. This is principally because the number of reader threads is
> user-defined and there are typical workloads where a single CPU is
> producing most of the tracing data and all other CPUs are idle,
> available to consume data. It therefore makes sense not to tie those
> threads to specific buffers. However, when the number of threads grows,
> we face a "thundering herd" problem where many threads can be woken up
> and put back to sleep, leaving only a single thread doing useful work.

Why do you need so many threads banging on a single device/file?
Have one (or some other very small number of) puller thread(s) that
feed chunks of pulled data to the other processing threads. That
way there's no need for a new wakeup abstraction.



- Davide

2008-11-26 01:09:44

by KOSAKI Motohiro

Subject: Re: [PATCH] Poll : introduce poll_wait_exclusive() new function

> > Mathieu Desnoyers explained the problem this causes for LTTng:
> >
> > In LTTng, all lttd readers are polling all the available debugfs files
> > for data. This is principally because the number of reader threads is
> > user-defined and there are typical workloads where a single CPU is
> > producing most of the tracing data and all other CPUs are idle,
> > available to consume data. It therefore makes sense not to tie those
> > threads to specific buffers. However, when the number of threads grows,
> > we face a "thundering herd" problem where many threads can be woken up
> > and put back to sleep, leaving only a single thread doing useful work.
>
> Why do you need so many threads banging on a single device/file?
> Have one (or some other very small number of) puller thread(s) that
> feed chunks of pulled data to the other processing threads. That
> way there's no need for a new wakeup abstraction.

Mathieu, I hope I haven't explained LTTng inaccurately.
Could you please explain the LTTng design?


2008-11-26 11:15:27

by Mathieu Desnoyers

Subject: Re: [ltt-dev] [PATCH] Poll : introduce poll_wait_exclusive() new function

* Davide Libenzi ([email protected]) wrote:
> On Tue, 25 Nov 2008, KOSAKI Motohiro wrote:
>
> >
> > patch against: tip/tracing/marker
> >
> > ==========
> > Currently, the behavior of wake_up() depends on how the wait queue
> > entry was added:
> >
> >
> >                              wake_up()          wake_up_all()
> > ---------------------------------------------------------------
> > add_wait_queue()             wake up all        wake up all
> > add_wait_queue_exclusive()   wake up one task   wake up all
> >
> >
> > Unfortunately, poll_wait() always uses add_wait_queue(), which means
> > there is no way to wake up only one of the polling processes:
> > wake_up() wakes all of them, not just one.
> >
> >
> > Mathieu Desnoyers explained the problem this causes for LTTng:
> >
> > In LTTng, all lttd readers are polling all the available debugfs files
> > for data. This is principally because the number of reader threads is
> > user-defined and there are typical workloads where a single CPU is
> > producing most of the tracing data and all other CPUs are idle,
> > available to consume data. It therefore makes sense not to tie those
> > threads to specific buffers. However, when the number of threads grows,
> > we face a "thundering herd" problem where many threads can be woken up
> > and put back to sleep, leaving only a single thread doing useful work.
>
> Why do you need so many threads banging on a single device/file?
> Have one (or some other very small number of) puller thread(s) that
> feed chunks of pulled data to the other processing threads. That
> way there's no need for a new wakeup abstraction.
>
>
>
> - Davide

One of the key design rules of LTTng is not to depend on such
system-wide data structures or entities (e.g. a single manager thread).
Everything is per-cpu, and it scales very well.

I wonder how badly the approach you propose would scale on large NUMA
systems, where having to synchronize everything through a single thread
might become an important point of contention, just due to the cacheline
bouncing and extra scheduler activity involved.

Mathieu

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2008-11-26 11:22:17

by Andrew McDermott

Subject: Re: [ltt-dev] [PATCH] Poll : introduce poll_wait_exclusive() new function


Mathieu Desnoyers <[email protected]> writes:

[...]

>> > Mathieu Desnoyers explained the problem this causes for LTTng:
>> >
>> > In LTTng, all lttd readers are polling all the available debugfs files
>> > for data. This is principally because the number of reader threads is
>> > user-defined and there are typical workloads where a single CPU is
>> > producing most of the tracing data and all other CPUs are idle,
>> > available to consume data. It therefore makes sense not to tie those
>> > threads to specific buffers. However, when the number of threads grows,
>> > we face a "thundering herd" problem where many threads can be woken up
>> > and put back to sleep, leaving only a single thread doing useful work.
>>
>> Why do you need so many threads banging on a single device/file?
>> Have one (or some other very small number of) puller thread(s) that
>> feed chunks of pulled data to the other processing threads. That
>> way there's no need for a new wakeup abstraction.
>>
>>
>>
>> - Davide
>
> One of the key design rules of LTTng is not to depend on such
> system-wide data structures or entities (e.g. a single manager thread).
> Everything is per-cpu, and it scales very well.
>
> I wonder how badly the approach you propose would scale on large NUMA
> systems, where having to synchronize everything through a single thread
> might become an important point of contention, just due to the cacheline
> bouncing and extra scheduler activity involved.

But at the end of the day these threads end up writing to the (possibly)
single spindle. Isn't that the biggest bottleneck here?

--
andy

2008-11-26 22:27:28

by Mathieu Desnoyers

Subject: Re: [ltt-dev] [PATCH] Poll : introduce poll_wait_exclusive() new function

* Andrew McDermott ([email protected]) wrote:
>
> Mathieu Desnoyers <[email protected]> writes:
>
> [...]
>
> >> > Mathieu Desnoyers explained the problem this causes for LTTng:
> >> >
> >> > In LTTng, all lttd readers are polling all the available debugfs files
> >> > for data. This is principally because the number of reader threads is
> >> > user-defined and there are typical workloads where a single CPU is
> >> > producing most of the tracing data and all other CPUs are idle,
> >> > available to consume data. It therefore makes sense not to tie those
> >> > threads to specific buffers. However, when the number of threads grows,
> >> > we face a "thundering herd" problem where many threads can be woken up
> >> > and put back to sleep, leaving only a single thread doing useful work.
> >>
> >> Why do you need so many threads banging on a single device/file?
> >> Have one (or some other very small number of) puller thread(s) that
> >> feed chunks of pulled data to the other processing threads. That
> >> way there's no need for a new wakeup abstraction.
> >>
> >>
> >>
> >> - Davide
> >
> > One of the key design rules of LTTng is not to depend on such
> > system-wide data structures or entities (e.g. a single manager thread).
> > Everything is per-cpu, and it scales very well.
> >
> > I wonder how badly the approach you propose would scale on large NUMA
> > systems, where having to synchronize everything through a single thread
> > might become an important point of contention, just due to the cacheline
> > bouncing and extra scheduler activity involved.
>
> But at the end of the day these threads end up writing to the (possibly)
> single spindle. Isn't that the biggest bottleneck here?
>

Not if those threads are either:

- analysing the data on the fly without exporting it to disk,
- sending the data through more than one network card, or
- writing the data to multiple disks.

There are therefore ways to improve scalability by adding more data
output paths. I don't want to limit scalability through the inner
design, so that if someone has the resources to send the information
out at great speed scalably, they can.

Mathieu

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2008-11-27 00:08:22

by Davide Libenzi

Subject: Re: [ltt-dev] [PATCH] Poll : introduce poll_wait_exclusive() new function

On Wed, 26 Nov 2008, Mathieu Desnoyers wrote:

> One of the key design rules of LTTng is not to depend on such
> system-wide data structures or entities (e.g. a single manager thread).
> Everything is per-cpu, and it scales very well.
>
> I wonder how badly the approach you propose would scale on large NUMA
> systems, where having to synchronize everything through a single thread
> might become an important point of contention, just due to the cacheline
> bouncing and extra scheduler activity involved.

I dunno the LTT architecture, so I'm staying out of that discussion.
But if the patch you're trying to push is meant to avoid a thundering herd
of many threads waiting on a single file*, you've got the same problem
right there. You've got at least the spinlock protecting the queue
these threads are focusing on, whose cacheline gets bounced all over
the CPUs.
Do you have any measurements of the improvements such a poll_wait_exclusive()
will eventually bring?



- Davide

2008-11-27 12:50:29

by KOSAKI Motohiro

Subject: Re: [ltt-dev] [PATCH] Poll : introduce poll_wait_exclusive() new function

> > One of the key design rules of LTTng is not to depend on such
> > system-wide data structures or entities (e.g. a single manager thread).
> > Everything is per-cpu, and it scales very well.
> >
> > I wonder how badly the approach you propose would scale on large NUMA
> > systems, where having to synchronize everything through a single thread
> > might become an important point of contention, just due to the cacheline
> > bouncing and extra scheduler activity involved.
>
> I dunno the LTT architecture, so I'm staying out of that discussion.
> But if the patch you're trying to push is meant to avoid a thundering herd
> of many threads waiting on a single file*, you've got the same problem
> right there. You've got at least the spinlock protecting the queue
> these threads are focusing on, whose cacheline gets bounced all over
> the CPUs.
> Do you have any measurements of the improvements such a poll_wait_exclusive()
> will eventually bring?

My LTTng knowledge isn't perfect either, so I'd like to talk about
another aspect.

This patch was originally written for the memory shortage notification
to userland (mem_notify) patch series. Many applications maintain
various caches of their own (e.g. almost every GUI app has an image
cache, and the VM of a language with a GC feature has droppable
garbage memory).

However, waking up every process easily creates a thundering herd,
because it wakes up almost every process in the system. In addition,
those processes are all independent of each other, so I couldn't choose
your proposed workaround at that time.

Currently, we and the container folks are restarting the discussion of
cgroup OOM notification. I think this patch can provide the
infrastructure for that too.

If possible, could you please review not only the description, but also
the code?
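
For reference, the intended userland usage looked roughly like the
sketch below (the "/dev/mem_notify" path follows the original
mem_notify series; drop_my_caches() stands in for application-specific
cache shrinking):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>

static void drop_my_caches(void)
{
	/* application-specific: free image caches, GC garbage, ... */
}

int main(void)
{
	struct pollfd pfd;

	pfd.fd = open("/dev/mem_notify", O_RDONLY);
	if (pfd.fd < 0) {
		perror("open /dev/mem_notify");
		return 1;
	}
	pfd.events = POLLIN;

	for (;;) {
		/* With an exclusive poll_wait in the driver, a low-memory
		 * wake_up() wakes only one of the sleeping applications. */
		if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
			drop_my_caches();
	}
}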


2008-11-28 13:13:22

by Mathieu Desnoyers

Subject: Re: [ltt-dev] [PATCH] Poll : introduce poll_wait_exclusive() new function

* Davide Libenzi ([email protected]) wrote:
> On Wed, 26 Nov 2008, Mathieu Desnoyers wrote:
>
> > One of the key design rules of LTTng is not to depend on such
> > system-wide data structures or entities (e.g. a single manager thread).
> > Everything is per-cpu, and it scales very well.
> >
> > I wonder how badly the approach you propose would scale on large NUMA
> > systems, where having to synchronize everything through a single thread
> > might become an important point of contention, just due to the cacheline
> > bouncing and extra scheduler activity involved.
>
> I dunno the LTT architecture, so I'm staying out of that discussion.
> But if the patch you're trying to push is meant to avoid a thundering herd
> of many threads waiting on a single file*, you've got the same problem
> right there. You've got at least the spinlock protecting the queue
> these threads are focusing on, whose cacheline gets bounced all over
> the CPUs.
> Do you have any measurements of the improvements such a poll_wait_exclusive()
> will eventually bring?
>

Nope, sorry, I don't own any machine with such a huge number of CPUs,
so I can't test at that scalability level.

You say that "you've got at least the spinlock protecting the queue
these threads are focusing on", but that assumes we stay limited to the
current implementation's inability to scale correctly. We could think of a
scheme with:

- Per-CPU waiters: waiters are added to their own CPU's waiting list.
- Per-CPU wakeups, where the wakeup first tries to wake up a waiter on
the local CPU.

If there happen to be no waiters for a local CPU wakeup, the wakeup
would then be broadcast to the other CPUs, which involves proper locking
that I think could be done efficiently enough not to hurt the "add
waiter" fastpath.

By doing this, we could end up not even taking a global spinlock in the
add-waiter/wakeup fastpaths. We would then have fixed both the
thundering herd problem _and_ the global spinlock issue altogether.
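
A very rough sketch of such a scheme (all names are hypothetical; wait
queue initialization, locking and preemption handling are elided):

/* One exclusive-waiter list per CPU; a plain array indexed by CPU id
 * keeps the sketch simple (a real version would use per-cpu data). */
static wait_queue_head_t percpu_waitq[NR_CPUS];

static void percpu_add_waiter(wait_queue_t *wait)
{
	/* Fastpath: touch only the local CPU's list, no global lock. */
	add_wait_queue_exclusive(&percpu_waitq[smp_processor_id()], wait);
}

static void percpu_wake_one(void)
{
	int cpu = smp_processor_id();

	/* Prefer a waiter that went to sleep on this CPU... */
	if (waitqueue_active(&percpu_waitq[cpu])) {
		wake_up(&percpu_waitq[cpu]);
		return;
	}
	/* ...and only then scan the other CPUs' lists. */
	for_each_online_cpu(cpu) {
		if (waitqueue_active(&percpu_waitq[cpu])) {
			wake_up(&percpu_waitq[cpu]);
			return;
		}
	}
}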

Any thoughts?

Mathieu

>
>
> - Davide
>
>

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2008-12-02 01:27:34

by Davide Libenzi

Subject: Re: [ltt-dev] [PATCH] Poll : introduce poll_wait_exclusive() new function

On Fri, 28 Nov 2008, Mathieu Desnoyers wrote:

> Nope, sorry, I don't own any machine with such a huge number of CPUs,
> so I can't test at that scalability level.

OK, but maybe before trying to push a performance-based patch, some
pseudo-real numbers should be given ;)



> You say that "you've got at least the spinlock protecting the queue
> these threads are focusing on", but that assumes we stay limited to the
> current implementation's inability to scale correctly. We could think of a
> scheme with:
>
> - Per-CPU waiters: waiters are added to their own CPU's waiting list.
> - Per-CPU wakeups, where the wakeup first tries to wake up a waiter on
> the local CPU.
>
> If there happen to be no waiters for a local CPU wakeup, the wakeup
> would then be broadcast to the other CPUs, which involves proper locking
> that I think could be done efficiently enough not to hurt the "add
> waiter" fastpath.
>
> By doing this, we could end up not even taking a global spinlock in the
> add-waiter/wakeup fastpaths. We would then have fixed both the
> thundering herd problem _and_ the global spinlock issue altogether.
>
> Any thoughts?

That looks fine, but before waving the Thundering Herd flag some
performance numbers should be given IMO, considering also that this is a
rather particular use of the poll subsystem.
Eventually, isn't something like the patch below better than adding custom
case names to poll_wait() à la poll_wait_LONGNAMESFOLLOW()?
That way you can pass your own queueing function:

poll_wait_qfunc(file, head, pt, add_wait_queue_exclusive);
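
For example, a driver poll method that wants exclusive wakeups would
then look like this sketch (foo_wait_queue and foo_data_ready are
hypothetical):

static DECLARE_WAIT_QUEUE_HEAD(foo_wait_queue);
static bool foo_data_ready;

static unsigned int foo_poll(struct file *file, poll_table *pt)
{
	/* Hand the exclusive queueing function straight to the poll core. */
	poll_wait_qfunc(file, &foo_wait_queue, pt, add_wait_queue_exclusive);
	if (foo_data_ready)
		return POLLIN | POLLRDNORM;
	return 0;
}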



- Davide



---
fs/eventpoll.c | 7 +++++--
fs/select.c | 28 +++++++++++++---------------
include/linux/poll.h | 18 +++++++++++++++---
3 files changed, 33 insertions(+), 20 deletions(-)

Index: linux-2.6.mod/fs/select.c
===================================================================
--- linux-2.6.mod.orig/fs/select.c 2008-12-01 16:27:15.000000000 -0800
+++ linux-2.6.mod/fs/select.c 2008-12-01 16:47:53.000000000 -0800
@@ -103,19 +103,6 @@
* as all select/poll functions have to call it to add an entry to the
* poll table.
*/
-static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
- poll_table *p);
-
-void poll_initwait(struct poll_wqueues *pwq)
-{
- init_poll_funcptr(&pwq->pt, __pollwait);
- pwq->error = 0;
- pwq->table = NULL;
- pwq->inline_index = 0;
-}
-
-EXPORT_SYMBOL(poll_initwait);
-
static void free_poll_entry(struct poll_table_entry *entry)
{
remove_wait_queue(entry->wait_address, &entry->wait);
@@ -173,7 +160,8 @@

/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
- poll_table *p)
+ poll_table *p, void (*qfunc)(wait_queue_head_t *,
+ wait_queue_t *))
{
struct poll_table_entry *entry = poll_get_entry(p);
if (!entry)
@@ -182,9 +170,19 @@
entry->filp = filp;
entry->wait_address = wait_address;
init_waitqueue_entry(&entry->wait, current);
- add_wait_queue(wait_address, &entry->wait);
+ (*qfunc)(wait_address, &entry->wait);
}

+void poll_initwait(struct poll_wqueues *pwq)
+{
+ init_poll_funcptr(&pwq->pt, __pollwait);
+ pwq->error = 0;
+ pwq->table = NULL;
+ pwq->inline_index = 0;
+}
+
+EXPORT_SYMBOL(poll_initwait);
+
/**
* poll_select_set_timeout - helper function to setup the timeout value
* @to: pointer to timespec variable for the final timeout
Index: linux-2.6.mod/include/linux/poll.h
===================================================================
--- linux-2.6.mod.orig/include/linux/poll.h 2008-12-01 16:27:15.000000000 -0800
+++ linux-2.6.mod/include/linux/poll.h 2008-12-01 16:55:11.000000000 -0800
@@ -28,16 +28,28 @@
/*
* structures and helpers for f_op->poll implementations
*/
-typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
+typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *,
+ struct poll_table_struct *,
+ void (*)(wait_queue_head_t *, wait_queue_t *));

typedef struct poll_table_struct {
poll_queue_proc qproc;
} poll_table;

-static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
+static inline void poll_wait_qfunc(struct file *filp,
+ wait_queue_head_t *wait_address,
+ poll_table *p,
+ void (*qfunc)(wait_queue_head_t *,
+ wait_queue_t *))
{
if (p && wait_address)
- p->qproc(filp, wait_address, p);
+ p->qproc(filp, wait_address, p, qfunc);
+}
+
+static inline void poll_wait(struct file *filp, wait_queue_head_t *wait_address,
+ poll_table *p)
+{
+ poll_wait_qfunc(filp, wait_address, p, add_wait_queue);
}

static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
Index: linux-2.6.mod/fs/eventpoll.c
===================================================================
--- linux-2.6.mod.orig/fs/eventpoll.c 2008-12-01 16:27:15.000000000 -0800
+++ linux-2.6.mod/fs/eventpoll.c 2008-12-01 16:39:02.000000000 -0800
@@ -655,7 +655,10 @@
* target file wakeup lists.
*/
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
- poll_table *pt)
+ poll_table *pt,
+ void (*qfunc)(wait_queue_head_t *,
+ wait_queue_t *))
+
{
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
@@ -664,7 +667,7 @@
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
- add_wait_queue(whead, &pwq->wait);
+ (*qfunc)(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {

2008-12-02 04:45:10

by KOSAKI Motohiro

Subject: Re: [ltt-dev] [PATCH] Poll : introduce poll_wait_exclusive() new function

> > Any thoughts?
>
> That looks fine, but before waving the Thundering Herd flag some
> performance numbers should be given IMO, considering also that this is a
> rather particular use of the poll subsystem.
> Eventually, isn't something like the patch below better than adding custom
> case names to poll_wait() à la poll_wait_LONGNAMESFOLLOW()?
> That way you can pass your own queueing function:
>
> poll_wait_qfunc(file, head, pt, add_wait_queue_exclusive);

Makes sense.
I didn't find any bugs in your patch.

Reviewed-by: KOSAKI Motohiro <[email protected]>


Thanks.