2007-05-02 05:36:43

by Davi Arnaut

[permalink] [raw]
Subject: [patch 14/22] pollfs: pollable futex

Asynchronously wait for FUTEX_WAKE operation on a futex if it still contains
a given value. There can be only one futex wait per file descriptor. However,
it can be rearmed (possibly at a different address) anytime.

The pollable futex approach is far superior (send and receive events from
userspace or kernel) to eventfd and fixes (supersedes) FUTEX_FD at the same time.

Building block for pollable semaphores and user-defined events.

Signed-off-by: Davi E. M. Arnaut <[email protected]>

---
fs/pollfs/Makefile | 1
fs/pollfs/futex.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++++
init/Kconfig | 7 ++
3 files changed, 162 insertions(+)

Index: linux-2.6/fs/pollfs/Makefile
===================================================================
--- linux-2.6.orig/fs/pollfs/Makefile
+++ linux-2.6/fs/pollfs/Makefile
@@ -3,3 +3,4 @@ pollfs-y := file.o

pollfs-$(CONFIG_POLLFS_SIGNAL) += signal.o
pollfs-$(CONFIG_POLLFS_TIMER) += timer.o
+pollfs-$(CONFIG_POLLFS_FUTEX) += futex.o
Index: linux-2.6/fs/pollfs/futex.c
===================================================================
--- /dev/null
+++ linux-2.6/fs/pollfs/futex.c
@@ -0,0 +1,154 @@
+/*
+ * pollable futex
+ *
+ * Copyright (C) 2007 Davi E. M. Arnaut
+ *
+ * Licensed under the GNU GPL. See the file COPYING for details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/pollfs_fs.h>
+#include <linux/futex.h>
+
+struct futex_event { /* record copied to/from userspace by read() and write() */
+ union {
+ void __user *addr; /* user-space address handed to futex_wait_queue() */
+ u64 padding; /* presumably keeps the union 64 bits wide on 32-bit ABIs -- TODO confirm */
+ };
+ int val; /* value handed to futex_wait_queue() together with addr */
+};
+
+struct pfs_futex { /* state allocated by sys_plfutex() for one pollable futex */
+ struct futex_q q; /* futex queue entry; q.waiters is also used as the file's wait queue */
+ struct futex_event fevt; /* last event successfully armed by write() */
+ struct mutex mutex; /* taken by read(), write() and poll() around queued/fevt/q accesses */
+ unsigned volatile queued; /* set to 1 when write() arms a wait, cleared by read() or a failed write() */
+ struct pfs_file file; /* pollfs file object handed to pfs_open() */
+};
+
+static ssize_t read(struct pfs_futex *evs, struct futex_event __user *ufevt)
+{ /* Copy the armed event to *ufevt once its entry is off the futex list; returns 0 on success, a negative errno otherwise. */
+ int ret;
+ struct futex_event fevt;
+
+ mutex_lock(&evs->mutex);
+ /* snapshot under the lock so that copy_to_user() can run after unlocking */
+ fevt = evs->fevt;
+
+ ret = -EAGAIN; /* kept when a wait is armed but its entry is still on the list */
+
+ if (!evs->queued)
+ ret = -EINVAL; /* nothing has been armed by write() */
+ else if (list_empty(&evs->q.list))
+ ret = futex_wait_unqueue(&evs->q); /* result is decoded by the switch below */
+
+ switch (ret) {
+ case 1: /* NOTE(review): no break -- falls through so queued is cleared here too; confirm this is intended */
+ ret = -EAGAIN;
+ case 0: /* disarm: a following read() returns -EINVAL until write() rearms */
+ evs->queued = 0;
+ }
+
+ mutex_unlock(&evs->mutex);
+
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(ufevt, &fevt, sizeof(fevt)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static ssize_t write(struct pfs_futex *evs,
+ const struct futex_event __user *ufevt)
+{ /* (Re)arm the wait described by *ufevt; returns futex_wait_queue()'s result, or -EFAULT if the event cannot be read. */
+ int ret;
+ struct futex_event fevt;
+
+ if (copy_from_user(&fevt, ufevt, sizeof(fevt)))
+ return -EFAULT;
+
+ mutex_lock(&evs->mutex);
+
+ if (evs->queued)
+ futex_wait_unqueue(&evs->q); /* drop the previous wait before rearming; its result is ignored */
+
+ ret = futex_wait_queue(&evs->q, fevt.addr, fevt.val);
+
+ if (ret)
+ evs->queued = 0; /* arming failed: no wait is outstanding any more */
+ else {
+ evs->queued = 1;
+ evs->fevt = fevt; /* remembered so read() can hand it back */
+ }
+
+ mutex_unlock(&evs->mutex);
+
+ return ret;
+}
+
+static int poll(struct pfs_futex *evs)
+{ /* Returns POLLIN when a wait is armed and its entry is no longer on the futex list, 0 otherwise. */
+ int ret;
+
+ while (!mutex_trylock(&evs->mutex)) /* NOTE(review): busy-waits instead of mutex_lock(); presumably to avoid sleeping here -- confirm */
+ cpu_relax();
+
+ ret = evs->queued && list_empty(&evs->q.list) ? POLLIN : 0; /* parsed as (queued && list_empty(...)) ? POLLIN : 0 */
+
+ mutex_unlock(&evs->mutex);
+
+ return ret;
+}
+
+static int release(struct pfs_futex *evs)
+{ /* Tear down evs: unqueue an armed wait if any, destroy the mutex and free the object; always returns 0. */
+ if (evs->queued)
+ futex_wait_unqueue(&evs->q);
+
+ mutex_destroy(&evs->mutex);
+
+ kfree(evs); /* evs must not be touched after this point */
+
+ return 0;
+}
+
+static const struct pfs_operations futex_ops = { /* pollfs operation table installed by sys_plfutex() */
+ .read = PFS_READ(read, struct pfs_futex, struct futex_event),
+ .write = PFS_WRITE(write, struct pfs_futex, struct futex_event),
+ .poll = PFS_POLL(poll, struct pfs_futex),
+ .release = PFS_RELEASE(release, struct pfs_futex),
+ .rsize = sizeof(struct futex_event), /* presumably the record size pollfs expects per read() -- TODO confirm */
+ .wsize = sizeof(struct futex_event), /* presumably the record size pollfs expects per write() -- TODO confirm */
+};
+
+asmlinkage long sys_plfutex(void)
+{ /* Allocate a pollable futex object and open it through pfs_open(); returns pfs_open()'s result, or -ENOMEM. */
+ long error;
+ struct pfs_futex *evs;
+
+ evs = kzalloc(sizeof(*evs), GFP_KERNEL); /* zeroed, so queued starts at 0 */
+ if (!evs)
+ return -ENOMEM;
+
+ mutex_init(&evs->mutex);
+ init_waitqueue_head(&evs->q.waiters);
+
+ evs->file.data = evs;
+ evs->file.fops = &futex_ops;
+ evs->file.wait = &evs->q.waiters; /* the file's wait queue is the futex_q's waiter queue */
+
+ error = pfs_open(&evs->file); /* presumably yields a file descriptor on success -- TODO confirm */
+
+ if (error < 0)
+ release(evs); /* frees evs when the open failed */
+
+ return error;
+}
Index: linux-2.6/init/Kconfig
===================================================================
--- linux-2.6.orig/init/Kconfig
+++ linux-2.6/init/Kconfig
@@ -483,6 +483,13 @@ config POLLFS_TIMER
help
Pollable timer support

+config POLLFS_FUTEX
+ bool "Enable pollfs futex" if EMBEDDED
+ default y
+ depends on POLLFS && FUTEX
+ help
+ Pollable futex support
+
config SHMEM
bool "Use full shmem filesystem" if EMBEDDED
default y

--


2007-05-02 05:55:07

by Eric Dumazet

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

Davi Arnaut a écrit :
> Asynchronously wait for FUTEX_WAKE operation on a futex if it still contains
> a given value. There can be only one futex wait per file descriptor. However,
> it can be rearmed (possibly at a different address) anytime.
>
> The pollable futex approach is far superior (send and receive events from
> userspace or kernel) to eventfd and fixes (supercedes) FUTEX_FD at the same time.
>
> Building block for pollable semaphores and user-defined events.
>
> Signed-off-by: Davi E. M. Arnaut <[email protected]>
>
> ---
> fs/pollfs/Makefile | 1
> fs/pollfs/futex.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> init/Kconfig | 7 ++
> 3 files changed, 162 insertions(+)
>
> Index: linux-2.6/fs/pollfs/Makefile
> ===================================================================
> --- linux-2.6.orig/fs/pollfs/Makefile
> +++ linux-2.6/fs/pollfs/Makefile
> @@ -3,3 +3,4 @@ pollfs-y := file.o
>
> pollfs-$(CONFIG_POLLFS_SIGNAL) += signal.o
> pollfs-$(CONFIG_POLLFS_TIMER) += timer.o
> +pollfs-$(CONFIG_POLLFS_FUTEX) += futex.o
> Index: linux-2.6/fs/pollfs/futex.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6/fs/pollfs/futex.c
> @@ -0,0 +1,154 @@
> +/*
> + * pollable futex
> + *
> + * Copyright (C) 2007 Davi E. M. Arnaut
> + *
> + * Licensed under the GNU GPL. See the file COPYING for details.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/sched.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/err.h>
> +#include <linux/wait.h>
> +#include <linux/poll.h>
> +#include <linux/pollfs_fs.h>
> +#include <linux/futex.h>
> +
> +struct futex_event {
> + union {
> + void __user *addr;
> + u64 padding;
> + };
> + int val;
> +};

Hum... Here we might have a problem with 64 bit futexes, or private futexes

So I believe this interface is not well defined and not expandable: in case of
future additions to futexes, an old application compiled with an old pollable
futex_event type might fail.



2007-05-02 06:16:08

by Davi Arnaut

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

Eric Dumazet wrote:
> Davi Arnaut a écrit :
>> Asynchronously wait for FUTEX_WAKE operation on a futex if it still contains
>> a given value. There can be only one futex wait per file descriptor. However,
>> it can be rearmed (possibly at a different address) anytime.
>>
>> The pollable futex approach is far superior (send and receive events from
>> userspace or kernel) to eventfd and fixes (supercedes) FUTEX_FD at the same time.
>>
>> Building block for pollable semaphores and user-defined events.
>>
>> Signed-off-by: Davi E. M. Arnaut <[email protected]>
>>
>> ---
>> fs/pollfs/Makefile | 1
>> fs/pollfs/futex.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>> init/Kconfig | 7 ++
>> 3 files changed, 162 insertions(+)
>>
>> Index: linux-2.6/fs/pollfs/Makefile
>> ===================================================================
>> --- linux-2.6.orig/fs/pollfs/Makefile
>> +++ linux-2.6/fs/pollfs/Makefile
>> @@ -3,3 +3,4 @@ pollfs-y := file.o
>>
>> pollfs-$(CONFIG_POLLFS_SIGNAL) += signal.o
>> pollfs-$(CONFIG_POLLFS_TIMER) += timer.o
>> +pollfs-$(CONFIG_POLLFS_FUTEX) += futex.o
>> Index: linux-2.6/fs/pollfs/futex.c
>> ===================================================================
>> --- /dev/null
>> +++ linux-2.6/fs/pollfs/futex.c
>> @@ -0,0 +1,154 @@
>> +/*
>> + * pollable futex
>> + *
>> + * Copyright (C) 2007 Davi E. M. Arnaut
>> + *
>> + * Licensed under the GNU GPL. See the file COPYING for details.
>> + */
>> +
>> +#include <linux/kernel.h>
>> +#include <linux/sched.h>
>> +#include <linux/module.h>
>> +#include <linux/slab.h>
>> +#include <linux/err.h>
>> +#include <linux/wait.h>
>> +#include <linux/poll.h>
>> +#include <linux/pollfs_fs.h>
>> +#include <linux/futex.h>
>> +
>> +struct futex_event {
>> + union {
>> + void __user *addr;
>> + u64 padding;
>> + };
>> + int val;
>> +};
>
> Hum... Here we might have a problem with 64 bit futexes, or private futexes
>
> So I believe this interface is not well defined and not expandable: in case of
> future additions to futexes, an old application compiled with an old pollable
> futex_event type might fail.
>

Hmm, how about:

struct futex_event {
union {
void __user *addr;
u64 padding;
};
union {
int val;
s64 val64;
};
/* whatever room is necessary for future improvements */
};

I haven't been keeping up with 64 bit or private futexes. What else
could probably go wrong?

--
Davi Arnaut

2007-05-02 06:41:19

by Eric Dumazet

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

Davi Arnaut a écrit :
> Eric Dumazet wrote:
>> Davi Arnaut a écrit :
>>> Asynchronously wait for FUTEX_WAKE operation on a futex if it still contains
>>> a given value. There can be only one futex wait per file descriptor. However,
>>> it can be rearmed (possibly at a different address) anytime.
>>>
>>> The pollable futex approach is far superior (send and receive events from
>>> userspace or kernel) to eventfd and fixes (supercedes) FUTEX_FD at the same time.
>>>
>>> Building block for pollable semaphores and user-defined events.
>>>
>>> Signed-off-by: Davi E. M. Arnaut <[email protected]>
>>>
>>> ---
>>> fs/pollfs/Makefile | 1
>>> fs/pollfs/futex.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>>> init/Kconfig | 7 ++
>>> 3 files changed, 162 insertions(+)
>>>
>>> Index: linux-2.6/fs/pollfs/Makefile
>>> ===================================================================
>>> --- linux-2.6.orig/fs/pollfs/Makefile
>>> +++ linux-2.6/fs/pollfs/Makefile
>>> @@ -3,3 +3,4 @@ pollfs-y := file.o
>>>
>>> pollfs-$(CONFIG_POLLFS_SIGNAL) += signal.o
>>> pollfs-$(CONFIG_POLLFS_TIMER) += timer.o
>>> +pollfs-$(CONFIG_POLLFS_FUTEX) += futex.o
>>> Index: linux-2.6/fs/pollfs/futex.c
>>> ===================================================================
>>> --- /dev/null
>>> +++ linux-2.6/fs/pollfs/futex.c
>>> @@ -0,0 +1,154 @@
>>> +/*
>>> + * pollable futex
>>> + *
>>> + * Copyright (C) 2007 Davi E. M. Arnaut
>>> + *
>>> + * Licensed under the GNU GPL. See the file COPYING for details.
>>> + */
>>> +
>>> +#include <linux/kernel.h>
>>> +#include <linux/sched.h>
>>> +#include <linux/module.h>
>>> +#include <linux/slab.h>
>>> +#include <linux/err.h>
>>> +#include <linux/wait.h>
>>> +#include <linux/poll.h>
>>> +#include <linux/pollfs_fs.h>
>>> +#include <linux/futex.h>
>>> +
>>> +struct futex_event {
>>> + union {
>>> + void __user *addr;
>>> + u64 padding;
>>> + };
>>> + int val;
>>> +};
>> Hum... Here we might have a problem with 64 bit futexes, or private futexes
>>
>> So I believe this interface is not well defined and not expandable: in case of
>> future additions to futexes, an old application compiled with an old pollable
>> futex_event type might fail.
>>
>
> Hmm, how about:
>
> struct futex_event {
> union {
> void __user *addr;
> u64 padding;
> };
> union {
> int val;
> s64 val64;
> };
> /* whatever room is necessary for future improvements */
> };
>
> I haven't been keeping up with 64 bit or private futexes. What else
> could probably go wrong?

Well, that's the point : This interface is like an ioctl() one : pretty bad if
not properly designed :)

You probably need to stick one field containing one command or version number,
something like that.


struct futex_event {
int type;
union {
void __user *addr;
u64 padding;
};
union {
int val;
s64 val64;
};
};

#define FUTEX_EVENT_SHARED32 1
#define FUTEX_EVENT_SHARED64 2
#define FUTEX_EVENT_PRIVATE32 (128|1)
#define FUTEX_EVENT_PRIVATE64 (128|2)

...

Also, you should take care of alignment constraints (a 32bit user program
might run on a 64bit kernel)

2007-05-02 06:54:25

by Davi Arnaut

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

Eric Dumazet wrote:
> Davi Arnaut a écrit :
>> Eric Dumazet wrote:
>>> Davi Arnaut a écrit :
>>>> Asynchronously wait for FUTEX_WAKE operation on a futex if it still contains
>>>> a given value. There can be only one futex wait per file descriptor. However,
>>>> it can be rearmed (possibly at a different address) anytime.
>>>>
>>>> The pollable futex approach is far superior (send and receive events from
>>>> userspace or kernel) to eventfd and fixes (supercedes) FUTEX_FD at the same time.
>>>>
>>>> Building block for pollable semaphores and user-defined events.
>>>>
>>>> Signed-off-by: Davi E. M. Arnaut <[email protected]>
>>>>

<snip>

>>>> +
>>>> +struct futex_event {
>>>> + union {
>>>> + void __user *addr;
>>>> + u64 padding;
>>>> + };
>>>> + int val;
>>>> +};
>>> Hum... Here we might have a problem with 64 bit futexes, or private futexes
>>>
>>> So I believe this interface is not well defined and not expandable: in case of
>>> future additions to futexes, an old application compiled with an old pollable
>>> futex_event type might fail.
>>>
>> Hmm, how about:
>>
>> struct futex_event {
>> union {
>> void __user *addr;
>> u64 padding;
>> };
>> union {
>> int val;
>> s64 val64;
>> };
>> /* whatever room is necessary for future improvements */
>> };
>>
>> I haven't been keeping up with 64 bit or private futexes. What else
>> could probably go wrong?
>
> Well, that's the point : This interface is like an ioctl() one : pretty bad if
> not properly designed :)

I was merely mirroring the futex syscall arguments for FUTEX_WAIT. Will
those change? I hope not :)

> You probably need to stick one field containing one command or version number,
> something like that.

I'm a bit skeptical that we need versioning for such a simple operation
(command) as FUTEX_WAIT that takes an address and a value.

>
>
> struct futex_event {
> int type;
> union {
> void __user *addr;
> u64 padding;
> };
> union {
> int val;
> s64 val64;
> };
> };
>
> #define FUTEX_EVENT_SHARED32 1
> #define FUTEX_EVENT_SHARED64 2
> #define FUTEX_EVENT_PRIVATE32 (128|1)
> #define FUTEX_EVENT_PRIVATE64 (128|2)

I will take a look at the private futexes patches before commenting further.

> ...
>
> Also, you should take care of alignements constraints (a 32bit user program
> might run on a 64bit kernel)
>

Compat code? or futex alignment constraints?

--
Davi Arnaut

2007-05-02 07:12:06

by Davi Arnaut

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

Eric Dumazet wrote:
> Davi Arnaut a écrit :
>> Eric Dumazet wrote:
>>> Davi Arnaut a écrit :
>>>> Asynchronously wait for FUTEX_WAKE operation on a futex if it still contains
>>>> a given value. There can be only one futex wait per file descriptor. However,
>>>> it can be rearmed (possibly at a different address) anytime.
>>>>
>>>> The pollable futex approach is far superior (send and receive events from
>>>> userspace or kernel) to eventfd and fixes (supercedes) FUTEX_FD at the same time.
>>>>
>>>> Building block for pollable semaphores and user-defined events.
>>>>
>>>> Signed-off-by: Davi E. M. Arnaut <[email protected]>
>>>>
>>>> ---
>>>> fs/pollfs/Makefile | 1
>>>> fs/pollfs/futex.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>> init/Kconfig | 7 ++
>>>> 3 files changed, 162 insertions(+)
>>>>
>>>> Index: linux-2.6/fs/pollfs/Makefile
>>>> ===================================================================
>>>> --- linux-2.6.orig/fs/pollfs/Makefile
>>>> +++ linux-2.6/fs/pollfs/Makefile
>>>> @@ -3,3 +3,4 @@ pollfs-y := file.o
>>>>
>>>> pollfs-$(CONFIG_POLLFS_SIGNAL) += signal.o
>>>> pollfs-$(CONFIG_POLLFS_TIMER) += timer.o
>>>> +pollfs-$(CONFIG_POLLFS_FUTEX) += futex.o
>>>> Index: linux-2.6/fs/pollfs/futex.c
>>>> ===================================================================
>>>> --- /dev/null
>>>> +++ linux-2.6/fs/pollfs/futex.c
>>>> @@ -0,0 +1,154 @@
>>>> +/*
>>>> + * pollable futex
>>>> + *
>>>> + * Copyright (C) 2007 Davi E. M. Arnaut
>>>> + *
>>>> + * Licensed under the GNU GPL. See the file COPYING for details.
>>>> + */
>>>> +
>>>> +#include <linux/kernel.h>
>>>> +#include <linux/sched.h>
>>>> +#include <linux/module.h>
>>>> +#include <linux/slab.h>
>>>> +#include <linux/err.h>
>>>> +#include <linux/wait.h>
>>>> +#include <linux/poll.h>
>>>> +#include <linux/pollfs_fs.h>
>>>> +#include <linux/futex.h>
>>>> +
>>>> +struct futex_event {
>>>> + union {
>>>> + void __user *addr;
>>>> + u64 padding;
>>>> + };
>>>> + int val;
>>>> +};
>>> Hum... Here we might have a problem with 64 bit futexes, or private futexes
>>>
>>> So I believe this interface is not well defined and not expandable: in case of
>>> future additions to futexes, an old application compiled with an old pollable
>>> futex_event type might fail.
>>>
>> Hmm, how about:
>>
>> struct futex_event {
>> union {
>> void __user *addr;
>> u64 padding;
>> };
>> union {
>> int val;
>> s64 val64;
>> };
>> /* whatever room is necessary for future improvements */
>> };
>>
>> I haven't been keeping up with 64 bit or private futexes. What else
>> could probably go wrong?
>
> Well, that's the point : This interface is like an ioctl() one : pretty bad if
> not properly designed :)
>
> You probably need to stick one field containing one command or version number,
> something like that.
>
>
> struct futex_event {
> int type;
> union {
> void __user *addr;
> u64 padding;
> };
> union {
> int val;
> s64 val64;
> };
> };
>
> #define FUTEX_EVENT_SHARED32 1
> #define FUTEX_EVENT_SHARED64 2
> #define FUTEX_EVENT_PRIVATE32 (128|1)
> #define FUTEX_EVENT_PRIVATE64 (128|2)
>

I'm changing the structure to:

struct futex_event {
union {
void __user *addr;
u64 addr64;
};
union {
int val;
s64 val64;
};
union {
s32 flags;
s64 flags64;
};
};

Plenty room for future FUTEX_WAIT growth ?

--
Davi Arnaut

2007-05-02 07:40:21

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/1/07, Davi Arnaut <[email protected]> wrote:
> The pollable futex approach is far superior (send and receive events from
> userspace or kernel) to eventfd and fixes (supercedes) FUTEX_FD at the same time.
> [...]

You have to explain in detail how these interfaces are supposed to
work. At first sight, and without understanding (all) the details, it seems
it's far from useful.

Pollable futexes are useful, but any solution which gets implemented
must be sufficiently useful for all the uses we might have.

- the trivial case is that you have a futex and you are just interested in
  seeing it change. The same as FUTEX_WAIT. I cannot figure out how all this
  works in your code. Does your read() call (that's the one to wait, yes?)
  work with O_NONBLOCK or how else do you get that behavior?

- more complicated case: I have to wait for multiple futexes and lock
  them all at the same time or don't return at all. This is possible with
  SysV semaphores and generally useful and needed. How can this be
  implemented with your scheme?

- how does it work with PI futexes?

- can I use a futex at the same time through this mechanism and using the normal
FUTEX_WAIT operation? This is a killer if it's not the case.

- if you have multiple threads polling a futex and the waker wakes up
  one, what happens? It is simply not acceptable to have more than one
  thread return from the poll() call, this would waste too many cycles,
  just to put all threads but one back to sleep.

2007-05-02 07:55:08

by Eric Dumazet

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On Wed, 2 May 2007 00:40:17 -0700
"Ulrich Drepper" <[email protected]> wrote:

> - if you have multiple threads polling a futex and the waker wakes up
> one, what happens?
> It is simply not acceptable to have more than one thread return from
> the poll() call, this
> would waste too many cycles, just to put all threads but one back to sleep.
>

Well, poll() level edge semantic is well defined, you cannot cheat or change it.

If many threads call poll() on the same end point, they should *all* return POLLIN/whatever status.

This is why programs usually use one thread to dispatch events to workers, or at least dont queue XXXX threads calling poll() on one fd.

Only system calls that actually returns an 'work_done' can avoid waking all waiting threads and putting them back in queue. Example of such system calls are accept() or read()

2007-05-02 08:08:31

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/2/07, Eric Dumazet <[email protected]> wrote:
> Well, poll() level edge semantic is well defined, you cannot cheat or change it.
>
> If many threads call poll() on the same end point, they should *all* return POLLIN/whatever status.

This means to me it's the wrong abstraction for this. We had a nice
solution for this with Evgeniy's kevent interfaces. It worked without
forcing futexes into this inflexible poll() interface.



> This is why programs usually use one thread to dispatch events to workers, or at least dont queue XXXX threads calling poll() on one fd.

No. This is why programs are forced to waste cycles by doing this.
Ideally this would not happen. Ideally you'd park all worker thread
in the same place and have them woken up one by one. Again, Evgeniy's
code was able to do this. This approach seems to be a big step
backward.

2007-05-02 08:49:40

by Eric Dumazet

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On Wed, 2 May 2007 01:08:26 -0700
"Ulrich Drepper" <[email protected]> wrote:

> On 5/2/07, Eric Dumazet <[email protected]> wrote:
> > Well, poll() level edge semantic is well defined, you cannot cheat or change it.
> >
> > If many threads call poll() on the same end point, they should *all* return POLLIN/whatever status.
>
> This means to me it's the wrong abstraction for this. We had a nice
> solution for this with Evgeniy's kevent interfaces. It worked without
> forcing futexes is this inflexible poll() interface.

poll() is a generalist interface. Not the *perfect* one, but widespread on other OSes as well.

>
>
>
> > This is why programs usually use one thread to dispatch events to workers, or at least dont queue XXXX threads calling poll() on one fd.
>
> No. This is why programs are forced to waste cycles by doing this.
> Ideally this would not happen. Ideally you'd park all worker thread
> in the same place and have them woken up one by one. Again, Evgeniy's
> code was able to do this. This approach seems to be a big step
> backward.

I understand your concerns, but *this* patch bundle extends poll()/select()/epoll, and is not an alternative to kevent or other work in progress, (and linux centered)

Are you suggesting poll() system call should be deprecated ?

Most programs still use the archaic select() thing you know ...

2007-05-02 12:20:21

by Davi Arnaut

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

Ulrich Drepper wrote:
> On 5/1/07, Davi Arnaut <[email protected]> wrote:
>> The pollable futex approach is far superior (send and receive events from
>> userspace or kernel) to eventfd and fixes (supercedes) FUTEX_FD at the same time.
>> [...]
>
> You have to explain in detail how these interfaces are supposed to
> work. From first sight and without understanding (all) the it seems
> it's far from useful.

It's basically an asynchronous FUTEX_WAIT with notification delivery
through a file descriptor.

> Pollable futexes are useful, but any solution which gets implemented
> must be sufficiently useful for all the uses we might have.

It's very useful for asynchronous event notification libraries
(libevent, liboop, libivykis, etc) because it integrates nicely with
their (e)poll main loops.

Usage scenario: you have 10 worker threads (and 10 futexes) for disk
i/o (or whatever) and one manager thread which is a state machine
serving many clients (epoll loop).

In this scenario the workers threads have only two possible ways of
notifying the manager thread once a job is done: signals and pipe tricks.

For libraries, signals sux. They dont integrate well with poll() loops,
may have overflow issues (RT), and signal numbers may clash with other
libraries/code. The self-pipe trick waste resources (mostly unused pipe
buffer).

By using pollable futexes, all the manager thread has to do is to
associate each of these futexes with a file descriptor (plfutex) and
epoll() for their completion. Once the futex is signaled, epoll()
returns POLLIN for the file descriptor and the manager thread may
dequeue the notification status from anywhere.


> - the trivial is that you have a futex and you are just interest in
> seeing it change. The same as FUTEX_WAIT.

I'm just interested in seeing a FUTEX_WAKE. Yes, same as FUTEX_WAIT.

> I cannot figure out how all this works in your code.

Every futex has a wait queue (q->waiters) which is used to track
processes waiting on the futex. When the futex receives a FUTEX_WAKE it
wakes up all waiters on the wait queue. Also, a futex is considered
woken when its wait queue is empty (or lock_ptr == NULL).

When you register a file descriptor with select(), poll() or epoll() a
callback is queued into the futex wait queue. When the futex receives a
FUTEX_WAKE every callback is called and the event is registered within
each select(), poll() or epoll() table. This initiates a chain reaction
waking up all process sleeping on poll()/whatever.

> Does your read() call (that's the one to wait, yes?) work with O_NONBLOCK
> or how else do you get that behavior?

If the fd is marked O_NONBLOCK and the futex is not woken yet, it simply
returns -EAGAIN (pfs_read_nonblock). If O_NONBLOCK is not set, it waits
synchronously (pfs_read_block/wait_event_interruptible) on the futex
wait queue.

> - more complicated case: I have to wait for multiple futexes and lock
> them all at the same time or don't return at all. This is possible with
> SysV semaphores and generally useful and needed. How can this be
> implemented with your scheme?

Remember, it's only about FUTEX_WAIT.

> - how does it work with PI futexes?

It doesn't work. AFAICS PI futexes don't use FUTEX_WAKE.

> - can I use a futex at the same time through this mechanism and using the normal
> FUTEX_WAIT operation? This is a killer if it's not the case.

Yes.

> - if you have multiple threads polling a futex and the waker wakes up
> one, what happens? It is simply not acceptable to have more than one
> thread return from the poll() call, this would waste too many cycles,
> just to put all threads but one back to sleep.

Only one is woken up (whatever matches first on the futex hashed bucket).

--
Davi Arnaut

2007-05-02 12:39:35

by Davi Arnaut

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

Ulrich Drepper wrote:
> On 5/1/07, Davi Arnaut <[email protected]> wrote:
>> The pollable futex approach is far superior (send and receive events from
>> userspace or kernel) to eventfd and fixes (supercedes) FUTEX_FD at the same time.
>> [...]
>

<snip>

>
> - more complicated case: I have to wait for multiple futexes and lock
> them all at the same time or don't return at all. This is possible with
> SysV semaphores and generally useful and needed.
> How can this be implemented with your scheme?

It's quite easy to implement this scheme by write()ing the futexes all
at once but that would break the one futex per fd association. For
atomicity: if one of the futexes can't be queued, we would rollback
(unqueue) the others.

Sounds sane?

--
Davi Arnaut

2007-05-02 16:39:42

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/2/07, Eric Dumazet <[email protected]> wrote:
> I understand your concerns, but *this* patch bundle extends poll()/select()/epoll, and is not an alternative to kevent or other work in progress, (and linux centered)

It is adding huge amounts of complexity and at the same time is not
future-safe. I consider this enough reason to reject this approach.
You never can get rid of the interface. It's much cleaner and safer
to do it right instead of piling on more and more workarounds for
special situations.

2007-05-02 16:46:56

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/2/07, Davi Arnaut <[email protected]> wrote:
> It's quite easy to implement this scheme by write()ing the futexes all
> at once but that would break the one futex per fd association. For
> atomicity: if one of the futexes can't be queued, we would rollback
> (unqueue) the others.
>
> Sounds sane?

I don't know how you use "unqueue" in this context. If a queued futex
is one which is /locked/ by the call, then yes, this is the semantics
needed. Atomically locking a number of futexes means that if one of
the set cannot be locked all operations done to lock the others have
to be undone. It's an all-or-nothing situation.

Locking is not as easy as you might think, though. For non-PI futexes
there is deliberately no protocol in place describing what "locked"
means. The locking operation has to be customizable. This is what
the FUTEX_OP_* stuff is about.

And you wrote that currently each futex needs its own file descriptor.
So this would have to be changed, too.

2007-05-02 16:59:50

by Davi Arnaut

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

Ulrich Drepper wrote:
> On 5/2/07, Eric Dumazet <[email protected]> wrote:
>> I understand your concerns, but *this* patch bundle extends
>> poll()/select()/epoll, and is not an alternative to kevent or other
>> work in progress, (and linux centered)
>
> It is adding huge amounts of complexity and at the same time is not
> future-safe. I consider this enough reason to reject this approach.
Huge amounts of complexity? It just _moves_ some futex code around!
The intended use is not for locking, but for event signaling. Why can't
it be future-safe? It just needs a address and a value! Pseudocode:

thread A:
int fd = plfutex(addr, 0);
do
poll(fdset+fd);
process network events
queue obj to thread B
if fd:
job processed

thread B:
wait_job();
process_job();
raise_event(addr);
> You never can get rid of the interface. It's much cleaner and safer
> to do it right instead of piling on more and more workarounds for
> special situations.
It's simple as is, there is no need to overdesign.

--
Davi Arnaut

2007-05-02 17:05:55

by Davi Arnaut

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

Ulrich Drepper wrote:
> On 5/2/07, Davi Arnaut <[email protected]> wrote:
>> It's quite easy to implement this scheme by write()ing the futexes all
>> at once but that would break the one futex per fd association. For
>> atomicity: if one of the futexes can't be queued, we would rollback
>> (unqueue) the others.
>>
>> Sounds sane?
>
> I don't know how you use "unqueue" in this context. If a queued futex
> is one which is /locked/ by te call, then yes, this is the semantics
> needed. Atomically locking a number of futexes means that if one of
> the set cannot be locked all operations done to lock the others have
> to be undone. It's an all-or-nothing situation.
The waits are queued, thus they can be "unqueued". It's quite simple to
extend futex_wait_queue() to support this, but again you are thinking of
locks while what I want is fast events.
> Locking is not as easy as you might think, though. For non-PI futexes
> there is deliberately no protocol in place describing what "locked"
> means. The locking operation has to be customizable. This is what
> the FUTEX_OP_* stuff is about.
Events are simple. An event is either signaled or not. A futex value 0 means
not signaled, 1+ signaled.
> And you wrote that currently each futex needs its own file descriptor.
> So this would have to be changed, too.
If it's really worth it, I have no problem with it.

--
Davi Arnaut

2007-05-02 17:10:26

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/2/07, Davi Arnaut <[email protected]> wrote:
> thread A:
> int fd = plfutex(addr, 0);
> do
> poll(fdset+fd);
> process network events
> queue obj to thread B
> if fd:
> job processed
>
> thread B:
> wait_job();
> process_job();
> raise_event(addr);

This is not the model you can implement with your changes. Because
every single waiter is woken you need one thread listening for the
jobs and then distribute the work. Otherwise you have thundering
herds of threads and only one gets to do some work.


> It simple as is, there is no need to overdesign.

There is no reason to go with a limited, too-simple minded design if
we've already identified a much better design. The fact that poll is
used today does not excuse piling on more and more code which makes
additional functions which don't fit into the poll framework barely
work. Plus, poll/epoll itself is a problem.

And you cannot talk about little changes and no "overdesign". You
have 22 patches for all this. It's not just limited to futexes, it's
the whole thing which IMO is unnecessary ballast going forward.

2007-05-02 17:29:54

by Davide Libenzi

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On Wed, 2 May 2007, Ulrich Drepper wrote:

> > It simple as is, there is no need to overdesign.
>
> There is no reason to go with a limited, too-simple minded design if
> we've already identified a much better design. The fact that poll is
> used today does not excuse piling on more and more code which makes
> additional functions which don't fit into the poll framework barely
> work. Plus, poll/epoll itself is a problem.

Is it? Please do tell me more...


- Davide


2007-05-02 17:38:00

by Davi Arnaut

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

Ulrich Drepper wrote:
> On 5/2/07, Davi Arnaut <[email protected]> wrote:
>> thread A:
>> int fd = plfutex(addr, 0);
>> do
>> poll(fdset+fd);
>> process network events
>> queue obj to thread B
>> if fd:
>> job processed
>>
>> thread B:
>> wait_job();
>> process_job();
>> raise_event(addr);
>
> This is not the model you can implement with your changes. Because
> every single waiter is woken you need one thread listening for the
> jobs and then distribute the work. Otherwise you have thundering
> herds of threads and only one gets to do some work.
>

NO! Every single waiter of the _file descriptor_ is waked, not of the futex.
One can associate N fds with a single futex address. FUTEX_WAKE with
nproc = 1 will only wake one of the file descriptors. It's up to the user
to decide if he wants a broadcast or not.

Have you seen the email where I told you exactly this?

>> It simple as is, there is no need to overdesign.
>
> There is no reason to go with a limited, too-simple minded design if
> we've already identified a much better design. The fact that poll is
> used today does not excuse piling on more and more code which makes
> additional functions which don't fit into the poll framework barely
> work. Plus, poll/epoll itself is a problem.
>
epoll itself is a problem?! sorry, but i didn't know that. Care to
elaborate?
I really need some guidance here. I just want to unify the epoll for various
event sources. It seems a lot of people like this, just look at the popularity
of libevent and other "unifying" event loops.

I don't think we need another epoll clone.

> And you cannot talk about little changes and no "overdesign". You
> have 22 patches for all this. It's not just limited to futexes, it's
> the whole thing which IMO is unnecessary ballast going forward.
davi@karmic:~/git/linux-2.6$ find patches/ -name *.patch |grep -v
syscall | wc -l
10

davi@karmic:~/git/linux-2.6$ find patches/ -name *.patch |grep -v
syscall |grep futex
patches/pollfs-futex-async-wait.patch
patches/pollfs-futex.patch

--
Davi Arnaut

2007-05-02 17:49:39

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/2/07, Davi Arnaut <[email protected]> wrote:
> NO! Every single waiter of the _file descriptor_ is waked, not of the futex.

And how is this better? In this world of yours a program must have
one file descriptor for each single futex which is used like this *per
thread*. There can be hundreds, thousands of threads. And there can
be large numbers of futexes, too.

This is not going to fly. You reach the file descriptor limit just
with this. And this in many processes on the system.


> davi@karmic:~/git/linux-2.6$ find patches/ -name *.patch |grep -v
> syscall | wc -l
> 10
>
> davi@karmic:~/git/linux-2.6$ find patches/ -name *.patch |grep -v
> syscall |grep futex
> patches/pollfs-futex-async-wait.patch
> patches/pollfs-futex.patch

I don't know what you want to show here. You 10 new syscalls? You
have two patches alone modifying futexes? And 22 patches in total.
That's not "a lot"?

2007-05-02 17:53:43

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/2/07, Davide Libenzi <[email protected]> wrote:
> Is it? Please do tell me more...

Come on, we went through all this. Having to do syscalls for event
retrieval plus the limited channel available for feedback (the POLL*
bits) is too limiting. This is where the kevent stuff innovated and
really fixed the problems. Userlevel ring buffers are more efficient.

Yes, a unifying event loop is what is wanted. But it does not have to
be poll based. Given the right abstraction you can fit in the kevent
technology or similar things.

And seeing all these requirements of this approach: kevent is also
much more resource efficient. No "one file descriptor per thread per
object". These are important factors.

2007-05-02 18:05:55

by Davi Arnaut

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

Ulrich Drepper wrote:
> On 5/2/07, Davi Arnaut <[email protected]> wrote:
>> NO! Every single waiter of the _file descriptor_ is waked, not of the
>> futex.
>
> And how is this better? In this world of yours a program must have
> one file descriptor for each single futex which is used like this *per
> thread*. There can be hundreds, thousands of threads. And there can
> be large numbers of futexes, too.
>

The usage cases of yours are quite different from mine. We don't use a
single file descriptor to manage various resources. The worker threads
are _not going_ to have a file descriptor, _only_ the event dispatching
(poll)
thread. The worker threads are just going to increase the futex value and
call FUTEX_WAKE in case the previous value was 0.

A pollable futex is even more useful for _single_ threaded programs that
don't want to go into lengthy hacks to monitor events coming from the
outside
world.

I, at least, don't want to port my epoll applications to yet another event
notification facility.

> This is not going to fly. You reach the file descriptor limit just
> with this. And this in many processes on the system.
>
>
>> davi@karmic:~/git/linux-2.6$ find patches/ -name *.patch |grep -v
>> syscall | wc -l
>> 10
>>
>> davi@karmic:~/git/linux-2.6$ find patches/ -name *.patch |grep -v
>> syscall |grep futex
>> patches/pollfs-futex-async-wait.patch
>> patches/pollfs-futex.patch
>
> I don't know what you want to show here. You 10 new syscalls? You
> have two patches alone modifying futexes? And 22 patches in total.
> That's not "a lot"?

No. 12 patches are for i386/x86_64 obligatory syscall housekeeping. I don't
want to sound rude, but have you actually looked at the patches?

--
Davi Arnaut

2007-05-02 18:21:46

by Davide Libenzi

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On Wed, 2 May 2007, Ulrich Drepper wrote:

> On 5/2/07, Davide Libenzi <[email protected]> wrote:
> > Is it? Please do tell me more...
>
> Come on, we went through all this. Having to do syscalls for event
> retrieval plus the limited channel available for feedback (the POLL*
> bits) is to limiting. This is where the kevent stuff innovated and
> really fixed the problems. Userlevel ring buffers are more efficient.
>
> Yes, a unifying event look is what is wanted. But it does not have to
> be poll based. Given the right abstraction you can fit in the kevent
> technology or similar things.
>
> And seeing all these requirements of this approach: kevent is also
> much more resource efficient. No "one file desriptor per thread per
> object". These are important factors.

99% of the fds you'll find inside an event loop you care to scale about,
are *already* fd based. The handful of the remaining ones (signals,
timers, AIO signaling, ??) will likely account for a *very limited* number
of fds. On top of that, those fds are very cheap in terms of memory
(they're basically wakeup targets), since the new code shares the inode
for them. So we have a limited number of fds, using a pretty limited
amount of memory each.
And this approach is not bound to a completely new and monolithic interface.
All these things need to basically deliver notifications of completion,
and being able to read results. Things that the existing f_op->poll and
f_op->read are already able to give us. Is that really a strange concept
to base it on? Because, to me, it seems pretty natural.



- Davide


2007-05-03 13:40:57

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/2/07, Davi Arnaut <[email protected]> wrote:
> The usage cases of yours are quite different from mine. We don't use a
> single file descriptor to to manage various resources. The worker threads
> are _not going_ to have a file descriptor, _only_ the event dispatching
> (poll)
> thread.

A model which doesn't scale well.


> A pollable futex is even more useful for _single_ threaded programs that
> don't want to go into lengthy hacks to monitor events coming from the
> outside
> world.

There is nothing here that cannot be done with a more complete model
for event handling. It's Linus decision whether he wants to add yet
more code, yet more possible problems, yet more maintenance
overhead/nightmare for an interim solution which isn't necessary,
which cannot solve all the problems, and which is not as scalable as
other proposed methods.

I can only say that I would be strictly against it. It makes just no sense.

2007-05-03 13:46:24

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/2/07, Davide Libenzi <[email protected]> wrote:
> 99% of the fds you'll find inside an event loop you care to scale about,
> are *already* fd based.

You are missing the point. To get acceptable behavior of the wakeup
it is necessary with this approach to open one descriptor _per thread_
for a futex. Otherwise all threads get woken upon FUTEX_WAKE.

This also means you need individual epoll sets for each thread. You
cannot share them anymore among all the threads in the process.


> On top of that, those fds are very cheap in terms of memory

They might be when they are counted in dozens. But here we are
talking about the possible need to use thousands of additional file
descriptors. If they are so cheap to allow thousands of descriptors
with ease, why would the rlimit for files default to a small number
(1024 on Fedora right now)?


> And this approach is not bound to a completely new and monolitic interface.

So? It's still additional, new code for an approach which will have to
be superseded real soon. That's just pure overhead to me.

2007-05-03 18:24:55

by Davide Libenzi

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex


I thought you were talking about the poll/epoll interface in general, and
the approach on how to extend it for the very few cases that ppl asks for.
but I see we're focusing on futexes ...


On Thu, 3 May 2007, Ulrich Drepper wrote:

> On 5/2/07, Davide Libenzi <[email protected]> wrote:
> > 99% of the fds you'll find inside an event loop you care to scale about,
> > are *already* fd based.
>
> You are missing the point. To get acceptable behavior of the wakeup
> it is necessary with this approach to open one descriptor _per thread_
> for a futex. Otherwise all threads get woken upon FUTEX_WAKE.
>
> This also means you need individual epoll sets for each thread. You
> cannot share them anymore among all the threads in the process.

I'm not sure if futexes are the best approach to do that, but a way for
the user to signal an event into a main event loop is needed.



> > On top of that, those fds are very cheap in terms of memory
>
> They might be when they are counted in dozens. But here we are
> talking about the possible need to use thousands of additional file
> descriptors. If they are so cheap to allow thousands of descriptors
> with ease, why would the rlimit for files default to a small number
> (1024 on Fedora right now)?

Right now, ppl do that using pipes. That costs 2 file descriptors and at
least 4KB of kernel data (plus an inode, a dentry and a file). This just
to have a way to signal to an event loop dispatcher. The patches I posted
a few weeks ago introduce an eventfd, that reduces the amount of kernel
memory to basically a dentry and a file (plus it uses only one file
descriptor, and it's 2-3 times faster than pipes). Add to that cost about
200 lines of code in fs/eventfd.c.



> > And this approach is not bound to a completely new and monolitic interface.
>
> So? It's stil additional, new code for an approach which will have to
> be superceded real soon. That's just pure overhead to me.

IMO it is better to leave futexes alone. They are great for synchronizing
MT apps, but do not properly fit an fd-based solution. For that, something
like eventfd is enough.



- Davide


2007-05-03 19:03:40

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/3/07, Davide Libenzi <[email protected]> wrote:
>
> I thought you were talking about the poll/epoll interface in general, and
> the approach on how to extend it for the very few cases that ppl asks for.
> but I see we're focusing on futexes ...

Futexes must be part of the whole approach. If they cannot sanely be
integrated the whole approach is more than questionable IMO.


> I'm not sure if futexes are the best approach to do that, but a way for
> the user to signal an event into a main event loop is needed.

I haven't necessarily seen much of this demand and, as you pointed out
yourself, there is already a completely valid and POSIX compliant way
to achieve that. The situation would be very different if you
couldn't reliably implement this.

I don't suggest this as a long term solution, it's neither nice nor
fast. But it is a way to achieve the goal until a real solution comes
along. Signals cannot serve as a justification for introducing these
new concepts.


> IMO it is better to leave futexes alone. They are great for syncronizing
> MT apps, but do not properly fit an fd-based solution. For that, something
> like eventfd is enough.

That's ridiculously short-sighted. All objects upon which one can
wait must be unified. This is possible. The kevent interface gives
enough flexibility.


Let's just finish the design and implementation of the real solution.
Be it kevent (modified to meet the last comments, I think I still have
some myself), or something completely different which you can propose.
Then all programs which really care about performance can use that
code.

If a program doesn't care about performance then they might just as
well use pipes in signal handlers.

2007-05-03 22:14:51

by Davide Libenzi

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On Thu, 3 May 2007, Ulrich Drepper wrote:

> On 5/3/07, Davide Libenzi <[email protected]> wrote:
> >
> > I thought you were talking about the poll/epoll interface in general, and
> > the approach on how to extend it for the very few cases that ppl asks for.
> > but I see we're focusing on futexes ...
>
> Futexes must be part of the whole approach. If they cannot sanely be
> integrated the whole approach is more than questionable IMO.

Why is it that futexes *must* be part of the "whole solution"? Ppl need
solutions to specific problems, not a bloated interface that, like a
giant blob, includes everything just because it exists.



> > I'm not sure if futexes are the best approach to do that, but a way for
> > the user to signal an event into a main event loop is needed.
>
> I haven't necessarily seen much of this demand and, as you pointed out
> yourself, there is already a completely valid and POSIX compliant way
> to achieve that. The situation would be very different if you
> couldn't reliably implement this.

Before you try to bash a solution because it's costly, then you bounce
back from another angle, and say that a solution (pipes) that uses 2
descriptors, one file, one inode, one dentry and 4KB of kernel memory for
each instance, is a perfectly legal solution.
The 1024 file cap is a bogus problem. If you decided to leave the POSIX
compatibility (poll/select) for your code, to use something like epoll, it
means already that your application is handling quite a huge amount of
files and the 1024 cap must be out of the way. And here the cost
associated with each file is already pretty big (inode, dentry, file, and
buffers - for each one of them). We cannot change that cost.



> I don't suggest this as a long term solution, it's neither nice nor
> fast. But it is a way to achieve the goal until a real soution comes
> along. Signals cannot serve as a justification for introducing these
> new concepts.

Fast, I think we have that pretty much covered with Ingo pointing out a few
flaws in the numbers posted previously. Nice, I'll leave that out.
Monolithic and interface-centric solutions, or better, solutions in search
of a problem, do not fit the "nice" category IMO.
So, let's leave hand-waving and ugly/nice BS out of the picture, and let's
see what is currently missing.
Epoll scales and already covers a large amount of things you may be
interested in receiving events from. Basically everything that has a
working f_op->poll.
The other big piece is AIO. Now you can have *another* layer on top of
AIO, that is included in your blob interface, but why? The AIO API is
already defined, and all you need is a way to signal the main loop that
AIO events will be ready to be spilled out from the AIO context. And at
that point you use the *already existing* AIO API for it. Why do you want
to add another layer on top? What you end up doing, is pushing userspace
code into the kernel.
The 20 lines AIO patch I posted, simply signals to an eventfd when the
AIO context has something to be fetched.
Then we have signals and timers, covered in the other two patches. And all
this works without being bound to an interface. Your application can just
use poll if it does not have scalability problems.




- Davide


2007-05-04 15:28:47

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/3/07, Davide Libenzi <[email protected]> wrote:
> Why is that futexes *must* be part of the "whole solution"? Ppl needs
> solutions to specific problems, not an bloated interface that, like a
> giant blob, includes everything just because it exists.

Sync objects are essential parts of many programs today and most
programs tomorrow. Currently you cannot efficiently implement working
on multiple independent areas which are protected through some sync
object (mutex, condvar, ...). You have to create a separate thread
for each. Looping with the nonlocking mutex, for instance, is no
possibility. This is solved by being able to get events for the
availability of the sync object.

And before you start and claim that this is not a common case, take a
look at the waitformultipleobjects (with studly caps somewhere) for
windows' API. The actual interface is horrible, but the concept is
sound (it comes from VMS). This is the basis of many programs on that
platform. Basically, the central loop contains such a call.
Currently programs would have to be completely redesigned when ported
to Linux if they use any object which cannot be waited on.

There is much more. As I tried to point out in last year's OLS paper,
central loops around such a call are the perfect scalability mechanism
and this is what is needed for the processors from today and tomorrow.


> Before you try to bash a solution becuase it's costly, then you bounce
> back from another angle, and say that a solution (pipes) that uses 2
> descriptors, one file, one inode, one dentry and 4KB of kernel memory for
> each instance, is a perfectly legal solution.

Stop. I call the proposed code costly in terms of the code added to
the kernel which must be maintained and kept in mind when writing the
real next-gen event mechanism. Not having this code in the kernel
certainly would make a difference.


> Fast, I think we have that pretty much covered with Ingo poiting out a few
> flaws in the numbers posted previously. Nice, I'll leave that out.

You again miss the context. I was talking about the pipe-based
solution using a signal handler.


> Epoll scales and already covers a large amount of things you may be
> interested in receiving events from. Basically everything that have a
> working f_op->poll.

epoll doesn't scale if every thread needs its own epoll set. Besides
the overhead this also has huge program design problems: how do you
atomically remove a file descriptor from a collection of epoll sets?


> The other big piece is AIO. Now you can have *another* layer on top of
> AIO, that is included in your blob interface, but why?

I don't know how you arrive at AIO now. kevent itself is independent
of the AIO code which was done at the same time by the same person.
It was just one kernel service which uses the event functionality.
The two must be judged independently.

2007-05-04 19:15:21

by Davide Libenzi

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On Fri, 4 May 2007, Ulrich Drepper wrote:

> On 5/3/07, Davide Libenzi <[email protected]> wrote:
> > Why is that futexes *must* be part of the "whole solution"? Ppl needs
> > solutions to specific problems, not an bloated interface that, like a
> > giant blob, includes everything just because it exists.
>
> Sync objects are essential parts of many programs today and most
> programs tomorrow. Currently you cannot efficiently implement working
> on multiple independent areas which are protected through some sync
> object (mutex, condvar, ...). You have to create a separate thread
> for each. Looping with the nonlocking mutex, for instance, is no
> possibility. This is solved by being able to get events for the
> availability of the sync object.
>
> And before you start and claim that this is no common cases take a
> look at the waitformultipleobjects (with studdly caps somewhere) for
> windows' API. The actual interface is horrible, but the concept is
> sound (it comes from VMS). This is the basis of many programs on that
> platform. Basically, the central loop contains such a call.
> Currently programs would have to be completely redesigned when ported
> to Linux if they use any object which cannot be waited on.
>
> There is much more. As I tried to point out in last year's OLS paper,
> central loops around such a call are the perfect scalability mechanism
> and this is what is needed for the processors from today and tomorrow.

This is a pretty specific case, that is not very typical to find in the
usual common event loop dispatch application design.
But strange you went even there, because, as you know, WaitForMultipleObjects
works with HANDLEs, that are the closest thing to the Unix file you can
find. They can be read (ReadFile/read), written (WriteFile/write), closed
(CloseHandle/close), duplicated (DuplicateHandle/dup) and waited
(WaitForMultipleObjects/poll), with a common interface.
And if you *really* want your truly generic WaitForMultipleObjects
implementation, your only way is to base it on files. Files are our almost
perfect match to HANDLEs in our world. We have the basic infrastructure
already there.




- Davide


2007-05-04 19:46:04

by Ryan Ordway

[permalink] [raw]
Subject: 2.6.20.4 / 2.6.21.1 AT91SAM9260-EK oops



I am having issues getting both a working kernel and rootfs going on an
Atmel AT91SAM9260-EK board. I can boot the Atmel-provided 2.6.18-rc4 kernel
and my rootfs image created with buildroot. But when I try to boot my own
2.6.20.4 or 2.6.21.1 kernels, I get an oops as below. Forgive the
formatting... Below that is my kernel config.

Any ideas why the kernel might be dying trying to open and initialize a TTY?


Thanks!

Ryan




Uncompressing
Linux.............................................................
................... done, booting the kernel.
Linux version 2.6.21.1-pml1 ([email protected]) (gcc version
4.1.2) #2
Thu May 3 13:47:43 PDT 2007
CPU: ARM926EJ-S [41069265] revision 5 (ARMv5TEJ), cr=00053177
Machine: Atmel AT91SAM9260-EK
Memory policy: ECC disabled, Data cache writeback
Clocks: CPU 198 MHz, master 99 MHz, main 18.432 MHz
CPU0: D VIVT write-back cache
CPU0: I cache: 8192 bytes, associativity 4, 32 byte lines, 64 sets
CPU0: D cache: 8192 bytes, associativity 4, 32 byte lines, 64 sets
Built 1 zonelists. Total pages: 16256
Kernel command line: ram=64M console=ttyS0,115200 initrd=0x21000000
root=/dev/ram0 init=/linuxrc rw
AT91: 96 gpio irqs in 3 banks
PID hash table entries: 256 (order: 8, 1024 bytes)
Console: colour dummy device 80x30
Dentry cache hash table entries: 8192 (order: 3, 32768 bytes)
Inode-cache hash table entries: 4096 (order: 2, 16384 bytes)
Memory: 64MB = 64MB total
Memory: 59312KB available (2224K code, 330K data, 104K init)
Mount-cache hash table entries: 512
CPU: Testing write buffer coherency: ok
NET: Registered protocol family 16
Generic PHY: Registered new driver
usbcore: registered new interface driver usbfs
usbcore: registered new interface driver hub
usbcore: registered new device driver usb
NET: Registered protocol family 2
IP route cache hash table entries: 1024 (order: 0, 4096 bytes)
TCP established hash table entries: 2048 (order: 3, 40960 bytes)
TCP bind hash table entries: 2048 (order: 3, 40960 bytes)
TCP: Hash tables configured (established 2048 bind 2048)
TCP reno registered
checking if image is initramfs...it isn't (no cpio magic); looks like an
initrd
Freeing initrd memory: 2888K
NetWinder Floating Point Emulator V0.97 (extended precision)
JFFS2 version 2.2. (NAND) (C) 2001-2006 Red Hat, Inc.
io scheduler noop registered (default)
Serial: 8250/16550 driver $Revision: 1.90 $ 4 ports, IRQ sharing enabled
RAMDISK driver initialized: 16 RAM disks of 16384K size 1024 blocksize
loop: loaded (max 8 devices)
Davicom DM9161E: Registered new driver
Davicom DM9131: Registered new driver
dm9000 Ethernet Driver
macb macb: detected PHY at address 0 (ID 0181:b8a0)
eth0: Atmel MACB at 0xfffc4000 irq 21 (02:03:04:05:06:07)
NFTL driver: nftlcore.c $Revision: 1.98 $, nftlmount.c $Revision: 1.41 $
SSFDC read-only Flash Translation layer
NAND device: Manufacturer ID: 0xec, Chip ID: 0xda (Samsung NAND 256MiB 3,3V
8-bi
t)
NAND bus width 16 instead 8 bit
No NAND device found!!!
at91_ohci at91_ohci: AT91 OHCI
at91_ohci at91_ohci: new USB bus registered, assigned bus number 1
at91_ohci at91_ohci: irq 20, io mem 0x00500000
usb usb1: configuration #1 chosen from 1 choice
hub 1-0:1.0: USB hub found
hub 1-0:1.0: 2 ports detected
usbcore: registered new interface driver usbserial
drivers/usb/serial/usb-serial.c: USB Serial support registered for generic
usbcore: registered new interface driver usbserial_generic
drivers/usb/serial/usb-serial.c: USB Serial Driver core
drivers/usb/serial/usb-serial.c: USB Serial support registered for cp2101
usbcore: registered new interface driver cp2101
drivers/usb/serial/cp2101.c: Silicon Labs CP2101/CP2102 RS232 serial adaptor
dri
ver v0.07
udc: at91_udc version 3 May 2006
ether gadget: using random self ethernet address
ether gadget: using random host ethernet address
usb0: Ethernet Gadget, version: May Day 2005
usb0: using at91_udc, OUT ep2 IN ep1 STATUS ep4
usb0: MAC da:af:79:f3:97:1e
usb0: HOST MAC ee:9f:94:c8:5d:22
mice: PS/2 mouse device common for all mice
AT91 MMC: 4 wire bus mode not supported by this driver - using 1 wire
TCP cubic registered
Initializing XFRM netlink socket
NET: Registered protocol family 1
NET: Registered protocol family 17
NET: Registered protocol family 15
drivers/rtc/hctosys.c: unable to open rtc device (rtc0)
RAMDISK: Compressed image found at block 0
VFS: Mounted root (ext2 filesystem).
Freeing init memory: 104K
Unable to handle kernel NULL pointer dereference at virtual address 00000000
pgd = c0004000
[00000000] *pgd=00000000
Internal error: Oops: 5 [#1]
Modules linked in:
CPU: 0
PC is at init_dev+0x28/0x4e8
LR is at tty_open+0x120/0x304
pc : [<c00f8a18>] lr : [<c00fbce4>] Not tainted
sp : c12e1e08 ip : c12e1e50 fp : c12e1e4c
r10: 00000002 r9 : 00000000 r8 : c12e0000
r7 : 00000001 r6 : c1237c00 r5 : c0299c38 r4 : c12fb8a0
r3 : 00000000 r2 : c12e1e54 r1 : 00000000 r0 : c1237c00
Flags: nZCv IRQs on FIQs on Mode SVC_32 Segment kernel
Control: 5317F
Table: 20004000 DAC: 00000017
Process swapper (pid: 1, stack limit = 0xc12e0258)
Stack: (0xc12e1e08 to 0xc12e2000)
1e00: c12e1e54 c025fce8 c12e1e50 c1237c00 00000000
c12e0000
1e20: c12e1e4c c12fb8a0 c0299c38 00500001 00000001 c12e0000 00000000
00000002
1e40: c12e1e7c c12e1e50 c00fbce4 c00f8a00 00000000 c3d78a40 00000000
c0299c38
1e60: c3d78a40 c12fb8a0 c1257bc8 00000000 c12e1ea4 c12e1e80 c0080d7c
c00fbbd4
1e80: c12e1ea4 00000000 c12fb8a0 c3d78a40 c0080c2c c12d3220 c12e1ecc
c12e1ea8
1ea0: c007c758 c0080c3c c12fb8a0 c12e1ef8 c03a4000 00000000 ffffff9c
00000000
1ec0: c12e1eec c12e1ed0 c007c8f4 c007c674 00000000 ffffff9c 00000000
00000002
1ee0: c12e1f5c c12e1ef0 c007c950 c007c8d0 c12e1ef8 c00ef830 c1257bc8
c12d3220
1f00: c0065970 00000002 c12e0000 00000101 00000001 00000000 c12e1f34
c12e1f28
1f20: c01d74f0 c00ef830 c12e1f5c c12e1f38 c007c658 c01d74f0 00000002
00000003
1f40: 00000000 c12fb8a0 00000002 00000000 c12e1f84 c12e1f60 c007c9ac
c007c91c
1f60: c0029be8 c0269f98 c001f87c c12e0000 00000000 00000000 c12e1f94
c12e1f88
1f80: c007ca28 c007c968 c12e1fac c12e1f98 c002205c c007ca14 c0269f98
c0269f98
1fa0: c12e1ff4 c12e1fb0 c000886c c002203c 00000000 00000000 c00086fc
c00406bc
1fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000000
1fe0: 00000000 00000000 00000000 c12e1ff8 c00406bc c000870c 00220008
00020015
Backtrace:
[<c00f89f0>] (init_dev+0x0/0x4e8) from [<c00fbce4>] (tty_open+0x120/0x304)
[<c00fbbc4>] (tty_open+0x0/0x304) from [<c0080d7c>]
(chrdev_open+0x150/0x1a0)
[<c0080c2c>] (chrdev_open+0x0/0x1a0) from [<c007c758>]
(__dentry_open+0xf4/0x1e4)
r7 = C12D3220 r6 = C0080C2C r5 = C3D78A40 r4 = C12FB8A0
[<c007c664>] (__dentry_open+0x0/0x1e4) from [<c007c8f4>]
(nameidata_to_filp+0x34/0x4c)
[<c007c8c0>] (nameidata_to_filp+0x0/0x4c) from [<c007c950>]
(do_filp_open+0x44/0x4c)
r4 = 00000002
[<c007c90c>] (do_filp_open+0x0/0x4c) from [<c007c9ac>]
(do_sys_open+0x54/0x98)
r5 = 00000000 r4 = 00000002
[<c007c958>] (do_sys_open+0x0/0x98) from [<c007ca28>] (sys_open+0x24/0x28)
r8 = 00000000 r7 = 00000000 r6 = C12E0000 r5 = C001F87C
r4 = C0269F98
[<c007ca04>] (sys_open+0x0/0x28) from [<c002205c>] (init_post+0x30/0xe8)
[<c002202c>] (init_post+0x0/0xe8) from [<c000886c>] (init+0x170/0x1b8)
r4 = C0269F98
[<c00086fc>] (init+0x0/0x1b8) from [<c00406bc>] (do_exit+0x0/0x808)
r7 = 00000000 r6 = 00000000 r5 = 00000000 r4 = 00000000
Code: e3130010 059030c8 e1a06000 e1a09001 (07935101)
Kernel panic - not syncing: Attempted to kill init!






#
# Automatically generated make config: don't edit
# Linux kernel version: 2.6.20.4
# Fri May 4 11:13:33 2007
#
CONFIG_ARM=y
# CONFIG_GENERIC_TIME is not set
CONFIG_MMU=y
CONFIG_GENERIC_HARDIRQS=y
CONFIG_TRACE_IRQFLAGS_SUPPORT=y
CONFIG_HARDIRQS_SW_RESEND=y
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_RWSEM_GENERIC_SPINLOCK=y
# CONFIG_ARCH_HAS_ILOG2_U32 is not set
# CONFIG_ARCH_HAS_ILOG2_U64 is not set
CONFIG_GENERIC_HWEIGHT=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
CONFIG_VECTORS_BASE=0xffff0000
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"

#
# Code maturity level options
#
CONFIG_EXPERIMENTAL=y
CONFIG_BROKEN_ON_SMP=y
CONFIG_INIT_ENV_ARG_LIMIT=32

#
# General setup
#
CONFIG_LOCALVERSION="-pml1"
CONFIG_LOCALVERSION_AUTO=y
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
# CONFIG_IPC_NS is not set
CONFIG_POSIX_MQUEUE=y
# CONFIG_BSD_PROCESS_ACCT is not set
# CONFIG_TASKSTATS is not set
# CONFIG_UTS_NS is not set
# CONFIG_AUDIT is not set
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_SYSFS_DEPRECATED=y
# CONFIG_RELAY is not set
CONFIG_INITRAMFS_SOURCE=""
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_SYSCTL=y
# CONFIG_EMBEDDED is not set
CONFIG_UID16=y
CONFIG_SYSCTL_SYSCALL=y
CONFIG_KALLSYMS=y
# CONFIG_KALLSYMS_ALL is not set
CONFIG_KALLSYMS_EXTRA_PASS=y
CONFIG_HOTPLUG=y
CONFIG_PRINTK=y
CONFIG_BUG=y
CONFIG_ELF_CORE=y
CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
CONFIG_EPOLL=y
CONFIG_SHMEM=y
CONFIG_SLAB=y
CONFIG_VM_EVENT_COUNTERS=y
CONFIG_RT_MUTEXES=y
# CONFIG_TINY_SHMEM is not set
CONFIG_BASE_SMALL=0
# CONFIG_SLOB is not set

#
# Loadable module support
#
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_MODULE_FORCE_UNLOAD is not set
CONFIG_MODVERSIONS=y
# CONFIG_MODULE_SRCVERSION_ALL is not set
CONFIG_KMOD=y

#
# Block layer
#
CONFIG_BLOCK=y
# CONFIG_LBD is not set
# CONFIG_BLK_DEV_IO_TRACE is not set
# CONFIG_LSF is not set

#
# IO Schedulers
#
CONFIG_IOSCHED_NOOP=y
# CONFIG_IOSCHED_AS is not set
# CONFIG_IOSCHED_DEADLINE is not set
# CONFIG_IOSCHED_CFQ is not set
# CONFIG_DEFAULT_AS is not set
# CONFIG_DEFAULT_DEADLINE is not set
# CONFIG_DEFAULT_CFQ is not set
CONFIG_DEFAULT_NOOP=y
CONFIG_DEFAULT_IOSCHED="noop"

#
# System Type
#
# CONFIG_ARCH_AAEC2000 is not set
# CONFIG_ARCH_INTEGRATOR is not set
# CONFIG_ARCH_REALVIEW is not set
# CONFIG_ARCH_VERSATILE is not set
CONFIG_ARCH_AT91=y
# CONFIG_ARCH_CLPS7500 is not set
# CONFIG_ARCH_CLPS711X is not set
# CONFIG_ARCH_CO285 is not set
# CONFIG_ARCH_EBSA110 is not set
# CONFIG_ARCH_EP93XX is not set
# CONFIG_ARCH_FOOTBRIDGE is not set
# CONFIG_ARCH_NETX is not set
# CONFIG_ARCH_H720X is not set
# CONFIG_ARCH_IMX is not set
# CONFIG_ARCH_IOP32X is not set
# CONFIG_ARCH_IOP33X is not set
# CONFIG_ARCH_IOP13XX is not set
# CONFIG_ARCH_IXP4XX is not set
# CONFIG_ARCH_IXP2000 is not set
# CONFIG_ARCH_IXP23XX is not set
# CONFIG_ARCH_L7200 is not set
# CONFIG_ARCH_PNX4008 is not set
# CONFIG_ARCH_PXA is not set
# CONFIG_ARCH_RPC is not set
# CONFIG_ARCH_SA1100 is not set
# CONFIG_ARCH_S3C2410 is not set
# CONFIG_ARCH_SHARK is not set
# CONFIG_ARCH_LH7A40X is not set
# CONFIG_ARCH_OMAP is not set

#
# Atmel AT91 System-on-Chip
#
# CONFIG_ARCH_AT91RM9200 is not set
CONFIG_ARCH_AT91SAM9260=y
# CONFIG_ARCH_AT91SAM9261 is not set
# CONFIG_ARCH_AT91SAM9263 is not set

#
# AT91SAM9260 Variants
#
# CONFIG_ARCH_AT91SAM9260_SAM9XE is not set

#
# AT91SAM9260 / AT91SAM9XE Board Type
#
CONFIG_MACH_AT91SAM9260EK=y

#
# AT91 Board Options
#
CONFIG_MTD_AT91_DATAFLASH_CARD=y
CONFIG_MTD_NAND_AT91_BUSWIDTH_16=y

#
# AT91 Feature Selections
#
CONFIG_AT91_PROGRAMMABLE_CLOCKS=y

#
# Processor Type
#
CONFIG_CPU_32=y
CONFIG_CPU_ARM926T=y
CONFIG_CPU_32v5=y
CONFIG_CPU_ABRT_EV5TJ=y
CONFIG_CPU_CACHE_VIVT=y
CONFIG_CPU_COPY_V4WB=y
CONFIG_CPU_TLB_V4WBI=y
CONFIG_CPU_CP15=y
CONFIG_CPU_CP15_MMU=y

#
# Processor Features
#
CONFIG_ARM_THUMB=y
# CONFIG_CPU_ICACHE_DISABLE is not set
# CONFIG_CPU_DCACHE_DISABLE is not set
# CONFIG_CPU_DCACHE_WRITETHROUGH is not set
# CONFIG_CPU_CACHE_ROUND_ROBIN is not set

#
# Bus support
#

#
# PCCARD (PCMCIA/CardBus) support
#
# CONFIG_PCCARD is not set

#
# Kernel Features
#
# CONFIG_PREEMPT is not set
# CONFIG_NO_IDLE_HZ is not set
CONFIG_HZ=100
CONFIG_AEABI=y
CONFIG_OABI_COMPAT=y
# CONFIG_ARCH_DISCONTIGMEM_ENABLE is not set
CONFIG_SELECT_MEMORY_MODEL=y
CONFIG_FLATMEM_MANUAL=y
# CONFIG_DISCONTIGMEM_MANUAL is not set
# CONFIG_SPARSEMEM_MANUAL is not set
CONFIG_FLATMEM=y
CONFIG_FLAT_NODE_MEM_MAP=y
# CONFIG_SPARSEMEM_STATIC is not set
CONFIG_SPLIT_PTLOCK_CPUS=4096
CONFIG_RESOURCES_64BIT=y
CONFIG_LEDS=y
CONFIG_LEDS_TIMER=y
CONFIG_LEDS_CPU=y
CONFIG_ALIGNMENT_TRAP=y

#
# Boot options
#
CONFIG_ZBOOT_ROM_TEXT=0
CONFIG_ZBOOT_ROM_BSS=0
CONFIG_CMDLINE=""
# CONFIG_XIP_KERNEL is not set

#
# Floating point emulation
#

#
# At least one emulation must be selected
#
CONFIG_FPE_NWFPE=y
# CONFIG_FPE_NWFPE_XP is not set
# CONFIG_FPE_FASTFPE is not set
# CONFIG_VFP is not set

#
# Userspace binary formats
#
CONFIG_BINFMT_ELF=y
# CONFIG_BINFMT_AOUT is not set
CONFIG_BINFMT_MISC=y

#
# Power management options
#
# CONFIG_PM is not set
# CONFIG_APM is not set

#
# Networking
#
CONFIG_NET=y

#
# Networking options
#
# CONFIG_NETDEBUG is not set
CONFIG_PACKET=y
CONFIG_PACKET_MMAP=y
CONFIG_UNIX=y
CONFIG_XFRM=y
# CONFIG_XFRM_USER is not set
# CONFIG_XFRM_SUB_POLICY is not set
CONFIG_NET_KEY=y
CONFIG_INET=y
# CONFIG_IP_MULTICAST is not set
# CONFIG_IP_ADVANCED_ROUTER is not set
CONFIG_IP_FIB_HASH=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
# CONFIG_IP_PNP_BOOTP is not set
# CONFIG_IP_PNP_RARP is not set
# CONFIG_NET_IPIP is not set
# CONFIG_NET_IPGRE is not set
# CONFIG_ARPD is not set
# CONFIG_SYN_COOKIES is not set
# CONFIG_INET_AH is not set
# CONFIG_INET_ESP is not set
# CONFIG_INET_IPCOMP is not set
# CONFIG_INET_XFRM_TUNNEL is not set
# CONFIG_INET_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
# CONFIG_INET_DIAG is not set
# CONFIG_TCP_CONG_ADVANCED is not set
CONFIG_TCP_CONG_CUBIC=y
CONFIG_DEFAULT_TCP_CONG="cubic"
# CONFIG_TCP_MD5SIG is not set
# CONFIG_IPV6 is not set
# CONFIG_INET6_XFRM_TUNNEL is not set
# CONFIG_INET6_TUNNEL is not set
# CONFIG_NETWORK_SECMARK is not set
# CONFIG_NETFILTER is not set

#
# DCCP Configuration (EXPERIMENTAL)
#
# CONFIG_IP_DCCP is not set

#
# SCTP Configuration (EXPERIMENTAL)
#
# CONFIG_IP_SCTP is not set

#
# TIPC Configuration (EXPERIMENTAL)
#
# CONFIG_TIPC is not set
# CONFIG_ATM is not set
# CONFIG_BRIDGE is not set
# CONFIG_VLAN_8021Q is not set
# CONFIG_DECNET is not set
# CONFIG_LLC2 is not set
# CONFIG_IPX is not set
# CONFIG_ATALK is not set
# CONFIG_X25 is not set
# CONFIG_LAPB is not set
# CONFIG_ECONET is not set
# CONFIG_WAN_ROUTER is not set

#
# QoS and/or fair queueing
#
# CONFIG_NET_SCHED is not set

#
# Network testing
#
# CONFIG_NET_PKTGEN is not set
# CONFIG_HAMRADIO is not set
# CONFIG_IRDA is not set
# CONFIG_BT is not set
# CONFIG_IEEE80211 is not set

#
# Device Drivers
#

#
# Generic Driver Options
#
CONFIG_STANDALONE=y
CONFIG_PREVENT_FIRMWARE_BUILD=y
CONFIG_FW_LOADER=y
# CONFIG_DEBUG_DRIVER is not set
# CONFIG_SYS_HYPERVISOR is not set

#
# Connector - unified userspace <-> kernelspace linker
#
# CONFIG_CONNECTOR is not set

#
# Memory Technology Devices (MTD)
#
CONFIG_MTD=y
# CONFIG_MTD_DEBUG is not set
CONFIG_MTD_CONCAT=y
CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1
# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set
# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set
# CONFIG_MTD_CMDLINE_PARTS is not set
# CONFIG_MTD_AFS_PARTS is not set

#
# User Modules And Translation Layers
#
CONFIG_MTD_CHAR=y
CONFIG_MTD_BLKDEVS=y
CONFIG_MTD_BLOCK=y
CONFIG_FTL=y
CONFIG_NFTL=y
CONFIG_NFTL_RW=y
# CONFIG_INFTL is not set
# CONFIG_RFD_FTL is not set
CONFIG_SSFDC=y

#
# RAM/ROM/Flash chip drivers
#
CONFIG_MTD_CFI=y
CONFIG_MTD_JEDECPROBE=y
CONFIG_MTD_GEN_PROBE=y
# CONFIG_MTD_CFI_ADV_OPTIONS is not set
CONFIG_MTD_MAP_BANK_WIDTH_1=y
CONFIG_MTD_MAP_BANK_WIDTH_2=y
CONFIG_MTD_MAP_BANK_WIDTH_4=y
# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set
# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set
# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set
CONFIG_MTD_CFI_I1=y
CONFIG_MTD_CFI_I2=y
# CONFIG_MTD_CFI_I4 is not set
# CONFIG_MTD_CFI_I8 is not set
CONFIG_MTD_CFI_INTELEXT=y
CONFIG_MTD_CFI_AMDSTD=y
CONFIG_MTD_CFI_STAA=y
CONFIG_MTD_CFI_UTIL=y
CONFIG_MTD_RAM=y
CONFIG_MTD_ROM=y
CONFIG_MTD_ABSENT=y
# CONFIG_MTD_OBSOLETE_CHIPS is not set

#
# Mapping drivers for chip access
#
CONFIG_MTD_COMPLEX_MAPPINGS=y
# CONFIG_MTD_PHYSMAP is not set
# CONFIG_MTD_ARM_INTEGRATOR is not set
# CONFIG_MTD_IMPA7 is not set
# CONFIG_MTD_PLATRAM is not set

#
# Self-contained MTD device drivers
#
CONFIG_MTD_DATAFLASH=y
# CONFIG_MTD_M25P80 is not set
# CONFIG_MTD_SLRAM is not set
# CONFIG_MTD_PHRAM is not set
# CONFIG_MTD_MTDRAM is not set
# CONFIG_MTD_BLOCK2MTD is not set

#
# Disk-On-Chip Device Drivers
#
# CONFIG_MTD_DOC2000 is not set
# CONFIG_MTD_DOC2001 is not set
# CONFIG_MTD_DOC2001PLUS is not set

#
# NAND Flash Device Drivers
#
CONFIG_MTD_NAND=y
# CONFIG_MTD_NAND_VERIFY_WRITE is not set
# CONFIG_MTD_NAND_ECC_SMC is not set
CONFIG_MTD_NAND_IDS=y
# CONFIG_MTD_NAND_DISKONCHIP is not set
CONFIG_MTD_NAND_AT91=y
# CONFIG_MTD_NAND_NANDSIM is not set

#
# OneNAND Flash Device Drivers
#
# CONFIG_MTD_ONENAND is not set

#
# Parallel port support
#
# CONFIG_PARPORT is not set

#
# Plug and Play support
#

#
# Block devices
#
# CONFIG_BLK_DEV_COW_COMMON is not set
CONFIG_BLK_DEV_LOOP=y
# CONFIG_BLK_DEV_CRYPTOLOOP is not set
# CONFIG_BLK_DEV_NBD is not set
# CONFIG_BLK_DEV_UB is not set
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_COUNT=16
CONFIG_BLK_DEV_RAM_SIZE=16384
CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024
CONFIG_BLK_DEV_INITRD=y
# CONFIG_CDROM_PKTCDVD is not set
# CONFIG_ATA_OVER_ETH is not set

#
# SCSI device support
#
# CONFIG_RAID_ATTRS is not set
# CONFIG_SCSI is not set
# CONFIG_SCSI_NETLINK is not set

#
# Serial ATA (prod) and Parallel ATA (experimental) drivers
#
# CONFIG_ATA is not set

#
# Multi-device support (RAID and LVM)
#
# CONFIG_MD is not set

#
# Fusion MPT device support
#
# CONFIG_FUSION is not set

#
# IEEE 1394 (FireWire) support
#

#
# I2O device support
#

#
# Network device support
#
CONFIG_NETDEVICES=y
CONFIG_DUMMY=y
# CONFIG_BONDING is not set
# CONFIG_EQUALIZER is not set
# CONFIG_TUN is not set

#
# PHY device support
#
CONFIG_PHYLIB=y

#
# MII PHY device drivers
#
# CONFIG_MARVELL_PHY is not set
CONFIG_DAVICOM_PHY=y
# CONFIG_QSEMI_PHY is not set
# CONFIG_LXT_PHY is not set
# CONFIG_CICADA_PHY is not set
# CONFIG_VITESSE_PHY is not set
# CONFIG_SMSC_PHY is not set
# CONFIG_BROADCOM_PHY is not set
# CONFIG_FIXED_PHY is not set

#
# Ethernet (10 or 100Mbit)
#
CONFIG_NET_ETHERNET=y
CONFIG_MII=y
CONFIG_MACB=y
# CONFIG_SMC91X is not set
CONFIG_DM9000=y

#
# Ethernet (1000 Mbit)
#

#
# Ethernet (10000 Mbit)
#

#
# Token Ring devices
#

#
# Wireless LAN (non-hamradio)
#
# CONFIG_NET_RADIO is not set

#
# Wan interfaces
#
# CONFIG_WAN is not set
# CONFIG_PPP is not set
# CONFIG_SLIP is not set
# CONFIG_SHAPER is not set
# CONFIG_NETCONSOLE is not set
# CONFIG_NETPOLL is not set
# CONFIG_NET_POLL_CONTROLLER is not set

#
# ISDN subsystem
#
# CONFIG_ISDN is not set

#
# Input device support
#
CONFIG_INPUT=y
# CONFIG_INPUT_FF_MEMLESS is not set

#
# Userland interfaces
#
CONFIG_INPUT_MOUSEDEV=y
# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
# CONFIG_INPUT_JOYDEV is not set
# CONFIG_INPUT_TSDEV is not set
# CONFIG_INPUT_EVDEV is not set
# CONFIG_INPUT_EVBUG is not set

#
# Input Device Drivers
#
# CONFIG_INPUT_KEYBOARD is not set
# CONFIG_INPUT_MOUSE is not set
# CONFIG_INPUT_JOYSTICK is not set
# CONFIG_INPUT_TOUCHSCREEN is not set
# CONFIG_INPUT_MISC is not set

#
# Hardware I/O ports
#
CONFIG_SERIO=y
CONFIG_SERIO_SERPORT=y
# CONFIG_SERIO_RAW is not set
# CONFIG_GAMEPORT is not set

#
# Character devices
#
CONFIG_VT=y
CONFIG_VT_CONSOLE=y
CONFIG_HW_CONSOLE=y
# CONFIG_VT_HW_CONSOLE_BINDING is not set
# CONFIG_SERIAL_NONSTANDARD is not set

#
# Serial drivers
#
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
CONFIG_SERIAL_8250_NR_UARTS=4
CONFIG_SERIAL_8250_RUNTIME_UARTS=4
CONFIG_SERIAL_8250_EXTENDED=y
# CONFIG_SERIAL_8250_MANY_PORTS is not set
CONFIG_SERIAL_8250_SHARE_IRQ=y
CONFIG_SERIAL_8250_DETECT_IRQ=y
CONFIG_SERIAL_8250_RSA=y

#
# Non-8250 serial port support
#
CONFIG_SERIAL_ATMEL=y
CONFIG_SERIAL_ATMEL_CONSOLE=y
# CONFIG_SERIAL_ATMEL_TTYAT is not set
CONFIG_SERIAL_CORE=y
CONFIG_SERIAL_CORE_CONSOLE=y
CONFIG_UNIX98_PTYS=y
# CONFIG_LEGACY_PTYS is not set

#
# IPMI
#
# CONFIG_IPMI_HANDLER is not set

#
# Watchdog Cards
#
# CONFIG_WATCHDOG is not set
# CONFIG_HW_RANDOM is not set
# CONFIG_NVRAM is not set
# CONFIG_DTLK is not set
# CONFIG_R3964 is not set
# CONFIG_RAW_DRIVER is not set

#
# TPM devices
#
# CONFIG_TCG_TPM is not set

#
# I2C support
#
CONFIG_I2C=y
CONFIG_I2C_CHARDEV=y

#
# I2C Algorithms
#
# CONFIG_I2C_ALGOBIT is not set
# CONFIG_I2C_ALGOPCF is not set
# CONFIG_I2C_ALGOPCA is not set

#
# I2C Hardware Bus support
#
# CONFIG_I2C_ATMELTWI is not set
CONFIG_I2C_AT91=y
CONFIG_I2C_AT91_CLOCKRATE=100000
# CONFIG_I2C_OCORES is not set
# CONFIG_I2C_PARPORT_LIGHT is not set
# CONFIG_I2C_STUB is not set
# CONFIG_I2C_PCA_ISA is not set

#
# Miscellaneous I2C Chip support
#
# CONFIG_SENSORS_DS1337 is not set
# CONFIG_SENSORS_DS1374 is not set
# CONFIG_SENSORS_EEPROM is not set
# CONFIG_SENSORS_PCF8574 is not set
# CONFIG_SENSORS_PCA9539 is not set
# CONFIG_SENSORS_PCF8591 is not set
# CONFIG_SENSORS_MAX6875 is not set
# CONFIG_I2C_DEBUG_CORE is not set
# CONFIG_I2C_DEBUG_ALGO is not set
# CONFIG_I2C_DEBUG_BUS is not set
# CONFIG_I2C_DEBUG_CHIP is not set

#
# SPI support
#
CONFIG_SPI=y
# CONFIG_SPI_DEBUG is not set
CONFIG_SPI_MASTER=y

#
# SPI Master Controller Drivers
#
CONFIG_SPI_ATMEL=y
# CONFIG_SPI_BITBANG is not set

#
# SPI Protocol Masters
#

#
# Dallas's 1-wire bus
#
# CONFIG_W1 is not set

#
# Hardware Monitoring support
#
# CONFIG_HWMON is not set
# CONFIG_HWMON_VID is not set

#
# Misc devices
#
# CONFIG_TIFM_CORE is not set

#
# LED devices
#
CONFIG_NEW_LEDS=y
CONFIG_LEDS_CLASS=y

#
# LED drivers
#

#
# LED Triggers
#
CONFIG_LEDS_TRIGGERS=y
CONFIG_LEDS_TRIGGER_TIMER=y
CONFIG_LEDS_TRIGGER_HEARTBEAT=y

#
# Multimedia devices
#
# CONFIG_VIDEO_DEV is not set

#
# Digital Video Broadcasting Devices
#
# CONFIG_DVB is not set
# CONFIG_USB_DABUSB is not set

#
# Graphics support
#
# CONFIG_FIRMWARE_EDID is not set
# CONFIG_FB is not set

#
# Console display driver support
#
# CONFIG_VGA_CONSOLE is not set
CONFIG_DUMMY_CONSOLE=y
# CONFIG_BACKLIGHT_LCD_SUPPORT is not set

#
# Sound
#
# CONFIG_SOUND is not set

#
# HID Devices
#
CONFIG_HID=y

#
# USB support
#
CONFIG_USB_ARCH_HAS_HCD=y
CONFIG_USB_ARCH_HAS_OHCI=y
# CONFIG_USB_ARCH_HAS_EHCI is not set
CONFIG_USB=y
# CONFIG_USB_DEBUG is not set

#
# Miscellaneous USB options
#
CONFIG_USB_DEVICEFS=y
# CONFIG_USB_BANDWIDTH is not set
# CONFIG_USB_DYNAMIC_MINORS is not set
# CONFIG_USB_OTG is not set

#
# USB Host Controller Drivers
#
# CONFIG_USB_ISP116X_HCD is not set
CONFIG_USB_OHCI_HCD=y
# CONFIG_USB_OHCI_BIG_ENDIAN is not set
CONFIG_USB_OHCI_LITTLE_ENDIAN=y
# CONFIG_USB_SL811_HCD is not set

#
# USB Device Class drivers
#
# CONFIG_USB_ACM is not set
# CONFIG_USB_PRINTER is not set

#
# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
#

#
# may also be needed; see USB_STORAGE Help for more information
#
# CONFIG_USB_LIBUSUAL is not set

#
# USB Input Devices
#
CONFIG_USB_HID=y
# CONFIG_USB_HIDINPUT_POWERBOOK is not set
# CONFIG_HID_FF is not set
# CONFIG_USB_HIDDEV is not set
# CONFIG_USB_AIPTEK is not set
# CONFIG_USB_WACOM is not set
# CONFIG_USB_ACECAD is not set
# CONFIG_USB_KBTAB is not set
# CONFIG_USB_POWERMATE is not set
# CONFIG_USB_TOUCHSCREEN is not set
# CONFIG_USB_YEALINK is not set
# CONFIG_USB_XPAD is not set
# CONFIG_USB_ATI_REMOTE is not set
# CONFIG_USB_ATI_REMOTE2 is not set
# CONFIG_USB_KEYSPAN_REMOTE is not set
# CONFIG_USB_APPLETOUCH is not set

#
# USB Imaging devices
#
# CONFIG_USB_MDC800 is not set

#
# USB Network Adapters
#
# CONFIG_USB_CATC is not set
# CONFIG_USB_KAWETH is not set
# CONFIG_USB_PEGASUS is not set
# CONFIG_USB_RTL8150 is not set
# CONFIG_USB_USBNET_MII is not set
# CONFIG_USB_USBNET is not set
# CONFIG_USB_MON is not set

#
# USB port drivers
#

#
# USB Serial Converter support
#
CONFIG_USB_SERIAL=y
# CONFIG_USB_SERIAL_CONSOLE is not set
CONFIG_USB_SERIAL_GENERIC=y
# CONFIG_USB_SERIAL_AIRCABLE is not set
# CONFIG_USB_SERIAL_AIRPRIME is not set
# CONFIG_USB_SERIAL_ARK3116 is not set
# CONFIG_USB_SERIAL_BELKIN is not set
# CONFIG_USB_SERIAL_WHITEHEAT is not set
# CONFIG_USB_SERIAL_DIGI_ACCELEPORT is not set
CONFIG_USB_SERIAL_CP2101=y
# CONFIG_USB_SERIAL_CYPRESS_M8 is not set
# CONFIG_USB_SERIAL_EMPEG is not set
# CONFIG_USB_SERIAL_FTDI_SIO is not set
# CONFIG_USB_SERIAL_FUNSOFT is not set
# CONFIG_USB_SERIAL_VISOR is not set
# CONFIG_USB_SERIAL_IPAQ is not set
# CONFIG_USB_SERIAL_IR is not set
# CONFIG_USB_SERIAL_EDGEPORT is not set
# CONFIG_USB_SERIAL_EDGEPORT_TI is not set
# CONFIG_USB_SERIAL_GARMIN is not set
# CONFIG_USB_SERIAL_IPW is not set
# CONFIG_USB_SERIAL_KEYSPAN_PDA is not set
# CONFIG_USB_SERIAL_KEYSPAN is not set
# CONFIG_USB_SERIAL_KLSI is not set
# CONFIG_USB_SERIAL_KOBIL_SCT is not set
# CONFIG_USB_SERIAL_MCT_U232 is not set
# CONFIG_USB_SERIAL_MOS7720 is not set
# CONFIG_USB_SERIAL_MOS7840 is not set
# CONFIG_USB_SERIAL_NAVMAN is not set
# CONFIG_USB_SERIAL_PL2303 is not set
# CONFIG_USB_SERIAL_HP4X is not set
# CONFIG_USB_SERIAL_SAFE is not set
# CONFIG_USB_SERIAL_SIERRAWIRELESS is not set
# CONFIG_USB_SERIAL_TI is not set
# CONFIG_USB_SERIAL_CYBERJACK is not set
# CONFIG_USB_SERIAL_XIRCOM is not set
# CONFIG_USB_SERIAL_OPTION is not set
# CONFIG_USB_SERIAL_OMNINET is not set
# CONFIG_USB_SERIAL_DEBUG is not set

#
# USB Miscellaneous drivers
#
# CONFIG_USB_EMI62 is not set
# CONFIG_USB_EMI26 is not set
# CONFIG_USB_ADUTUX is not set
# CONFIG_USB_AUERSWALD is not set
# CONFIG_USB_RIO500 is not set
# CONFIG_USB_LEGOTOWER is not set
# CONFIG_USB_LCD is not set
# CONFIG_USB_LED is not set
# CONFIG_USB_CYPRESS_CY7C63 is not set
# CONFIG_USB_CYTHERM is not set
# CONFIG_USB_PHIDGET is not set
# CONFIG_USB_IDMOUSE is not set
# CONFIG_USB_FTDI_ELAN is not set
# CONFIG_USB_APPLEDISPLAY is not set
# CONFIG_USB_LD is not set
# CONFIG_USB_TRANCEVIBRATOR is not set
# CONFIG_USB_TEST is not set

#
# USB DSL modem support
#

#
# USB Gadget Support
#
CONFIG_USB_GADGET=y
# CONFIG_USB_GADGET_DEBUG_FILES is not set
CONFIG_USB_GADGET_SELECTED=y
# CONFIG_USB_GADGET_NET2280 is not set
# CONFIG_USB_GADGET_PXA2XX is not set
# CONFIG_USB_GADGET_GOKU is not set
# CONFIG_USB_GADGET_LH7A40X is not set
# CONFIG_USB_GADGET_HUSB2DEV is not set
# CONFIG_USB_GADGET_OMAP is not set
CONFIG_USB_GADGET_AT91=y
CONFIG_USB_AT91=y
# CONFIG_USB_GADGET_DUMMY_HCD is not set
# CONFIG_USB_GADGET_DUALSPEED is not set
# CONFIG_USB_ZERO is not set
CONFIG_USB_ETH=y
# CONFIG_USB_ETH_RNDIS is not set
# CONFIG_USB_GADGETFS is not set
# CONFIG_USB_FILE_STORAGE is not set
# CONFIG_USB_G_SERIAL is not set
# CONFIG_USB_MIDI_GADGET is not set

#
# MMC/SD Card support
#
CONFIG_MMC=y
# CONFIG_MMC_DEBUG is not set
CONFIG_MMC_BLOCK=y
CONFIG_MMC_AT91=y
# CONFIG_MMC_TIFM_SD is not set

#
# Real Time Clock
#
CONFIG_RTC_LIB=y
# CONFIG_RTC_CLASS is not set

#
# File systems
#
CONFIG_EXT2_FS=y
CONFIG_EXT2_FS_XATTR=y
CONFIG_EXT2_FS_POSIX_ACL=y
CONFIG_EXT2_FS_SECURITY=y
# CONFIG_EXT2_FS_XIP is not set
CONFIG_EXT3_FS=y
CONFIG_EXT3_FS_XATTR=y
# CONFIG_EXT3_FS_POSIX_ACL is not set
# CONFIG_EXT3_FS_SECURITY is not set
# CONFIG_EXT4DEV_FS is not set
CONFIG_JBD=y
# CONFIG_JBD_DEBUG is not set
CONFIG_FS_MBCACHE=y
# CONFIG_REISERFS_FS is not set
# CONFIG_JFS_FS is not set
CONFIG_FS_POSIX_ACL=y
# CONFIG_XFS_FS is not set
# CONFIG_GFS2_FS is not set
# CONFIG_OCFS2_FS is not set
# CONFIG_MINIX_FS is not set
# CONFIG_ROMFS_FS is not set
# CONFIG_INOTIFY is not set
# CONFIG_QUOTA is not set
CONFIG_DNOTIFY=y
# CONFIG_AUTOFS_FS is not set
# CONFIG_AUTOFS4_FS is not set
# CONFIG_FUSE_FS is not set

#
# CD-ROM/DVD Filesystems
#
# CONFIG_ISO9660_FS is not set
# CONFIG_UDF_FS is not set

#
# DOS/FAT/NT Filesystems
#
# CONFIG_MSDOS_FS is not set
# CONFIG_VFAT_FS is not set
# CONFIG_NTFS_FS is not set

#
# Pseudo filesystems
#
CONFIG_PROC_FS=y
CONFIG_PROC_SYSCTL=y
CONFIG_SYSFS=y
CONFIG_TMPFS=y
# CONFIG_TMPFS_POSIX_ACL is not set
# CONFIG_HUGETLB_PAGE is not set
CONFIG_RAMFS=y
CONFIG_CONFIGFS_FS=y

#
# Miscellaneous filesystems
#
# CONFIG_ADFS_FS is not set
# CONFIG_AFFS_FS is not set
# CONFIG_HFS_FS is not set
# CONFIG_HFSPLUS_FS is not set
# CONFIG_BEFS_FS is not set
# CONFIG_BFS_FS is not set
# CONFIG_EFS_FS is not set
CONFIG_JFFS2_FS=y
CONFIG_JFFS2_FS_DEBUG=0
CONFIG_JFFS2_FS_WRITEBUFFER=y
# CONFIG_JFFS2_SUMMARY is not set
# CONFIG_JFFS2_FS_XATTR is not set
# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set
CONFIG_JFFS2_ZLIB=y
CONFIG_JFFS2_RTIME=y
# CONFIG_JFFS2_RUBIN is not set
CONFIG_CRAMFS=y
# CONFIG_VXFS_FS is not set
# CONFIG_HPFS_FS is not set
# CONFIG_QNX4FS_FS is not set
# CONFIG_SYSV_FS is not set
# CONFIG_UFS_FS is not set

#
# Network File Systems
#
# CONFIG_NFS_FS is not set
# CONFIG_NFSD is not set
# CONFIG_SMB_FS is not set
# CONFIG_CIFS is not set
# CONFIG_NCP_FS is not set
# CONFIG_CODA_FS is not set
# CONFIG_AFS_FS is not set
# CONFIG_9P_FS is not set

#
# Partition Types
#
# CONFIG_PARTITION_ADVANCED is not set
CONFIG_MSDOS_PARTITION=y

#
# Native Language Support
#
# CONFIG_NLS is not set

#
# Distributed Lock Manager
#
# CONFIG_DLM is not set

#
# Profiling support
#
# CONFIG_PROFILING is not set

#
# Kernel hacking
#
# CONFIG_PRINTK_TIME is not set
CONFIG_ENABLE_MUST_CHECK=y
CONFIG_MAGIC_SYSRQ=y
# CONFIG_UNUSED_SYMBOLS is not set
# CONFIG_DEBUG_FS is not set
# CONFIG_HEADERS_CHECK is not set
CONFIG_DEBUG_KERNEL=y
CONFIG_LOG_BUF_SHIFT=17
CONFIG_DETECT_SOFTLOCKUP=y
# CONFIG_SCHEDSTATS is not set
# CONFIG_DEBUG_SLAB is not set
# CONFIG_DEBUG_RT_MUTEXES is not set
# CONFIG_RT_MUTEX_TESTER is not set
CONFIG_DEBUG_SPINLOCK=y
# CONFIG_DEBUG_MUTEXES is not set
# CONFIG_DEBUG_RWSEMS is not set
CONFIG_DEBUG_SPINLOCK_SLEEP=y
# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
# CONFIG_DEBUG_KOBJECT is not set
CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_INFO is not set
# CONFIG_DEBUG_VM is not set
# CONFIG_DEBUG_LIST is not set
CONFIG_FRAME_POINTER=y
CONFIG_FORCED_INLINING=y
# CONFIG_RCU_TORTURE_TEST is not set
# CONFIG_DEBUG_USER is not set
# CONFIG_DEBUG_ERRORS is not set
# CONFIG_DEBUG_LL is not set

#
# Security options
#
# CONFIG_KEYS is not set
# CONFIG_SECURITY is not set

#
# Cryptographic options
#
CONFIG_CRYPTO=y
CONFIG_CRYPTO_ALGAPI=y
CONFIG_CRYPTO_HASH=y
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_HMAC=y
# CONFIG_CRYPTO_XCBC is not set
# CONFIG_CRYPTO_NULL is not set
# CONFIG_CRYPTO_MD4 is not set
CONFIG_CRYPTO_MD5=y
# CONFIG_CRYPTO_SHA1 is not set
# CONFIG_CRYPTO_SHA256 is not set
# CONFIG_CRYPTO_SHA512 is not set
# CONFIG_CRYPTO_WP512 is not set
# CONFIG_CRYPTO_TGR192 is not set
# CONFIG_CRYPTO_GF128MUL is not set
# CONFIG_CRYPTO_ECB is not set
# CONFIG_CRYPTO_CBC is not set
# CONFIG_CRYPTO_LRW is not set
# CONFIG_CRYPTO_DES is not set
# CONFIG_CRYPTO_BLOWFISH is not set
# CONFIG_CRYPTO_TWOFISH is not set
# CONFIG_CRYPTO_SERPENT is not set
# CONFIG_CRYPTO_AES is not set
# CONFIG_CRYPTO_CAST5 is not set
# CONFIG_CRYPTO_CAST6 is not set
# CONFIG_CRYPTO_TEA is not set
# CONFIG_CRYPTO_ARC4 is not set
# CONFIG_CRYPTO_KHAZAD is not set
# CONFIG_CRYPTO_ANUBIS is not set
# CONFIG_CRYPTO_DEFLATE is not set
# CONFIG_CRYPTO_MICHAEL_MIC is not set
# CONFIG_CRYPTO_CRC32C is not set
# CONFIG_CRYPTO_TEST is not set

#
# Hardware crypto devices
#

#
# Library routines
#
CONFIG_BITREVERSE=y
CONFIG_CRC_CCITT=y
CONFIG_CRC16=y
CONFIG_CRC32=y
CONFIG_LIBCRC32C=y
CONFIG_ZLIB_INFLATE=y
CONFIG_ZLIB_DEFLATE=y
CONFIG_PLIST=y
CONFIG_IOMAP_COPY=y






--
Ryan Ordway E-mail: [email protected]
Unix Systems Administrator [email protected]
OSU Libraries, Corvallis, OR 97370 Office: Valley Library #4657


2007-05-04 23:40:23

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/4/07, Davide Libenzi <[email protected]> wrote:
> This is a pretty specific case, that is not very typical to find in the
> usual common event loop dispatch application design.

This is where you are very wrong. Yes, it's rare in the Unix world
because non-trivial programs cannot implement this in most cases with
the available infrastructure. But it is very common in other places
and what is more, it makes a lot of sense. It gives you scalability
with the size of the machines at no cost associated to reorganizing
the program.


> And if you *really* want your truly generic WaitForMultipleObjects
> implementation, your only way is to base it on files. Files are our almost
> perfect match to HANDLEs in our world. We have the basic infrastructure
> already there.

"basic", but not complete. And I never said that the implementation
they have is perfect, far from it. The concept is good and if we now
can implement it, with all the event sources available, using an
efficient event delivery mechanism we are far ahead of their design.

The proposal now on the table doesn't bring us there all the way and
it has the potential to make future work in the area of event delivery
harder just because there is more legacy code to be kept happy. This
is why I propose to not consider these changes and instead go for the
gold, i.e., the full solution.

2007-05-05 18:55:06

by Davide Libenzi

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On Fri, 4 May 2007, Ulrich Drepper wrote:

> On 5/4/07, Davide Libenzi <[email protected]> wrote:
> > This is a pretty specific case, that is not very typical to find in the
> > usual common event loop dispatch application design.
>
> This is where you are very wrong. Yes, it's rare in the Unix world
> because non-trivial programs cannot implement this in most cases with
> the available infrastructure. But it is very common in other places
> and what is more, it makes a lot of sense. It gives you scalability
> with the size of the machines at no cost associated to reorganizing
> the program.

But we have our own *sane* version of WaitForMultipleObjects, and it's
called poll(2).



> > And if you *really* want your truly generic WaitForMultipleObjects
> > implementation, your only way is to base it on files. Files are our almost
> > perfect match to HANDLEs in our world. We have the basic infrastructure
> > already there.
>
> "basic", but not complete. And I never said that the implementation
> they have is perfect, far from it. The concept is good and if we now
> can implement it, with all the event sources available, using an
> efficient event delivery mechanism we are far ahead of their design.
>
> The proposal now on the table doesn't bring us there all the way and
> it has the potential to make future work in the area of event delivery
> harder just because there is more legacy code to be kept happy. This
> is why I propose to not consider these changes and instead go for the
> gold, i.e., the full solution.

So, on one side we have a proposal made by a set of new modular objects
that fits our own infrastructure (internal - kernel, and external - POSIX)
and that are not bound to a specific interface.
On the other side we have a completely new, monolithic interface, whose
objects are strictly bound to it and are usable only inside the
interface itself.
Now, considering that POSIX is the backbone of Linux (and *nix in
general), and considering that we certainly cannot drop existing POSIX
semantics, where will the legacy code come from?
I really do not understand your point. You're too smart to not appreciate
the beauty and the symmetry of objects that respond to a common interface
(our files, win32 handles), and that fits our existing kernel infrastructure.




- Davide


2007-05-06 07:50:49

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/5/07, Davide Libenzi <[email protected]> wrote:
> But we have our own *sane* version of WaitForMultipleObjects, and it's
> called poll(2).

No, we don't. Don't start all over again. The interface of poll is
too primitive. See the kevent code, each record is, IIRC, 16 bytes in
size to return more data. For poll you only have bits.


> Now, considering that POSIX is the backbone of Linux (and *nix in
> general), and considering that we certainly cannot drop existing POSIX
> semantics, where the lagacy code will come from?

The legacy part comes from all this extra "make into a file
descriptor" stuff which is new, not needed now and especially not when
a full solution is available.


> I really do not understand your point. You're too smart to not appreciate
> the beauty and the simmetry of objects that responds to a common interface
> (our files, win32 handles), and that fits our existing kernel infrastructure.

You're blinded by this symmetry. Not everything that looks like a
good fit is a good idea. This is one case. Get over it, poll is not
powerful enough to serve as the unifying event mechanism.

2007-05-06 19:47:27

by Davide Libenzi

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On Sun, 6 May 2007, Ulrich Drepper wrote:

> On 5/5/07, Davide Libenzi <[email protected]> wrote:
> > But we have our own *sane* version of WaitForMultipleObjects, and it's
> > called poll(2).
>
> No, we don't. Don't start all over again. The interface of poll it
> to primitive. See the kevent code, each record is, IIRC, 16 bytes in
> size to return more data. For poll you only have bits.

Yes, event bits plus opaque token are enough for most of it. Then you use
POSIX read/write to fetch/store the data. All the files (sockets, pipes, ...)
work this way. For signals, you fetch a siginfo-like structure, through POSIX
read. Timers, you fetch a counter, through POSIX read. AIO, you use the
native AIO API (that I'd prefer, or you can choose to have a POSIX read
too). All these through isolated POSIX read semantics.
Now let's see how it'd look with a monolithic kevent-like interface. You'll
have a monster-union ala siginfo_t, with multiple nested structures, and
every time you need to extend it, you'll go through pain. Come on, that's
beyond ugly. With a file-like interface, each new addition comes to a
separate isolated interface, with separate POSIX read/write ABI.
Do you realise that to justify your all new bulk interface,
you had to pull out of the hat a Windows WaitForMultipleObjects?
Please drop the BS. I made you a full list of things that are readily and
POSIX-friendly handled/signaled with file-like interfaces.
Any sockets, pipes, all devices, signals, timers, AIO, and I'm probably
forgetting something.
You pulled "it's slow". False.
You pulled "it's memory expensive". It's not.
So far, I did not hear a single valid reason to go with a new, monolithic
interface. WaitForMultipleObjects? Please ...



- Davide


2007-05-06 19:56:59

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On Sun, 6 May 2007 00:50:47 -0700 "Ulrich Drepper" <[email protected]> wrote:

> > I really do not understand your point. You're too smart to not appreciate
> > the beauty and the simmetry of objects that responds to a common interface
> > (our files, win32 handles), and that fits our existing kernel infrastructure.
>
> You're blinded by this symmetry. Not everything that looks like a
> good fit is a good idea. This is one case. Get over it, poll is not
> powerful enough to serve as the unifying event mechanism.

What is your position on the timerfd/signalfd/etc patches?

Seems to me that if we were to have fancy new event-delivery machinery
like kevent then the timerfd/signalfd work is heading in the other
direction and ultimately would prove to have been unneeded?

2007-05-06 20:18:50

by Davide Libenzi

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On Sun, 6 May 2007, Andrew Morton wrote:

> On Sun, 6 May 2007 00:50:47 -0700 "Ulrich Drepper" <[email protected]> wrote:
>
> > > I really do not understand your point. You're too smart to not appreciate
> > > the beauty and the simmetry of objects that responds to a common interface
> > > (our files, win32 handles), and that fits our existing kernel infrastructure.
> >
> > You're blinded by this symmetry. Not everything that looks like a
> > good fit is a good idea. This is one case. Get over it, poll is not
> > powerful enough to serve as the unifying event mechanism.
>
> What is your position on the timerfd/signalfd/etc patches?
>
> Seems to me that if we were to have fancy new event-delivery machinery
> like kevent then the timerfd/signalfd work is heading in the other
> direction and ultimately would prove to have been unneeded?

Yes, of course. If we're heading to yet another monolithic interface, we're
heading there with no valid reasons given other than some handwaving. While
there are quite a few (modularity, compatibility, plus the other ones that
came to my mind and that I explained in the way-too-many emails) to back a
file-based approach.
Conversation with Uli, as often happens when arguing about software, got
stuck. And since no one else seems interested in bringing valid points
one way or another, I'll leave the discussion as is, and I'll let you sort
it out.



- Davide


2007-05-06 21:57:59

by Davi Arnaut

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

Andrew Morton wrote:
> On Sun, 6 May 2007 00:50:47 -0700 "Ulrich Drepper" <[email protected]> wrote:
>
>>> I really do not understand your point. You're too smart to not appreciate
>>> the beauty and the simmetry of objects that responds to a common interface
>>> (our files, win32 handles), and that fits our existing kernel infrastructure.
>> You're blinded by this symmetry. Not everything that looks like a
>> good fit is a good idea. This is one case. Get over it, poll is not
>> powerful enough to serve as the unifying event mechanism.
>
> What is your position on the timerfd/signalfd/etc patches?
>
> Seems to me that if we were to have fancy new event-delivery machinery
> like kevent then the timerfd/signalfd work is heading in the other
> direction and ultimately would prove to have been unneeded?

IMHO, I thought we had already gone down the *fd road with inotify,
POSIX message queues, and _hundreds_ of other file objects with poll methods.

I also think that inotify+(e)poll proves how well the fd/epoll model
fits together, scales, and that a new fancy event-delivery machinery is
not necessary. And it makes me wonder why I hadn't followed its "watch"
approach for futexes:

futex_init(); // Davide's anon fd
futex_add_watch(int fd, void *addr, int val, uint32_t mask);
futex_rm_watch(int fd, uint32_t wd);

Anyway, this unifying event machinery can be built, if needed, in user
space by libevent and others.

--
Davi Arnaut

2007-05-07 05:33:50

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/6/07, Andrew Morton <[email protected]> wrote:
> What is your position on the timerfd/signalfd/etc patches?
>
> Seems to me that if we were to have fancy new event-delivery machinery
> like kevent then the timerfd/signalfd work is heading in the other
> direction and ultimately would prove to have been unneeded?

That's my point. I think we ultimately have to have something like
kevent and then all this *fd() work is unnecessary and just adds code
to the kernel which has to be kept around and which might hinder
further work in this area.

2007-05-07 05:46:05

by Ulrich Drepper

[permalink] [raw]
Subject: Re: [patch 14/22] pollfs: pollable futex

On 5/6/07, Andrew Morton <[email protected]> wrote:
> What is your position on the timerfd/signalfd/etc patches?

One more thing: recently in a network-related discussion with DaveM
et.al. we came across a situation where we want events from the
kernel. The requirement is for fast event notification (or
non-blocking polling) and the event is only implicitly needed. Having
a file descriptor open is not an option. The possibilities are

- open a file in /proc or /sys or a socket for every call to the
function to check for events

- have a memory-mapped interface like kevent which does not keep file
descriptors open

File descriptors are problematic when it comes to implicit uses in the
runtime. This is, for instance, why we have MAP_ANON instead of
keeping a /dev/null file descriptor open all the time.