"This time for sure"
I have a new primitive (thanks Paul Mackerras): sleep if the word at
this address is equal to this value. It also has an optional timeout.
On top of this, I have made pthreads locking primitives and futexes
(marginally slower than the completely-in-kernel version, but not
much).
Userspace stuff (including nonpthreads.c code):
http://www.kernel.org/pub/linux/kernel/people/rusty/futex-2.0.tar.gz
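For illustration, here is roughly how userspace can use the two operations;
a minimal sketch, where the futex() wrapper, the flag protocol and the errno
handling are assumptions rather than part of the patch below:

	/* Sketch: wait until *flag becomes nonzero, using the two new ops.
	 * Assumes a futex() syscall wrapper that returns -1 and sets errno. */
	#include <errno.h>
	#include <time.h>

	#define FUTEX_WAIT 0
	#define FUTEX_WAKE 1

	extern int futex(int *uaddr, int op, int val, struct timespec *utime);

	void wait_for_flag(int *flag)
	{
		while (*flag == 0) {
			/* Sleeps only while *flag is still 0 when the kernel
			 * rechecks it; EWOULDBLOCK means it changed under us. */
			if (futex(flag, FUTEX_WAIT, 0, NULL) < 0
			    && errno != EWOULDBLOCK && errno != EINTR)
				break;
		}
	}

	void set_flag(int *flag)
	{
		*flag = 1;
		futex(flag, FUTEX_WAKE, 1, NULL);	/* wake one waiter */
	}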
Please complain now, or I'll send to Linus as is,
Rusty.
PS. Will be on plane in 24 hours, for 40 hours. Incommunicado.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.7/include/linux/futex.h working-2.5.7-futex/include/linux/futex.h
--- linux-2.5.7/include/linux/futex.h Thu Mar 21 14:14:54 2002
+++ working-2.5.7-futex/include/linux/futex.h Tue Apr 2 10:56:40 2002
@@ -2,7 +2,7 @@
#define _LINUX_FUTEX_H
/* Second argument to futex syscall */
-#define FUTEX_UP (0)
-#define FUTEX_DOWN (1)
+#define FUTEX_WAIT (0)
+#define FUTEX_WAKE (1)
#endif
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.7/kernel/futex.c working-2.5.7-futex/kernel/futex.c
--- linux-2.5.7/kernel/futex.c Thu Mar 21 14:14:56 2002
+++ working-2.5.7-futex/kernel/futex.c Tue Apr 2 10:56:40 2002
@@ -32,7 +32,8 @@
#include <linux/fs.h>
#include <linux/futex.h>
#include <linux/highmem.h>
-#include <asm/atomic.h>
+#include <linux/time.h>
+#include <asm/uaccess.h>
/* These mutexes are a very simple counter: the winner is the one who
decrements from 1 to 0. The counter starts at 1 when the lock is
@@ -68,22 +69,27 @@
return &futex_queues[hash_long(h, FUTEX_HASHBITS)];
}
-static inline void wake_one_waiter(struct list_head *head,
- struct page *page,
- unsigned int offset)
+static int futex_wake(struct list_head *head,
+ struct page *page,
+ unsigned int offset,
+ int num)
{
- struct list_head *i;
+ struct list_head *i, *next;
+ int num_woken = 0;
spin_lock(&futex_lock);
- list_for_each(i, head) {
+ list_for_each_safe(i, next, head) {
struct futex_q *this = list_entry(i, struct futex_q, list);
if (this->page == page && this->offset == offset) {
+ list_del_init(i);
wake_up_process(this->task);
- break;
+ num_woken++;
+ if (num_woken >= num) break;
}
}
spin_unlock(&futex_lock);
+ return num_woken;
}
/* Add at end to avoid starvation */
@@ -101,11 +107,17 @@
spin_unlock(&futex_lock);
}
-static inline void unqueue_me(struct futex_q *q)
+/* Return 1 if we were still queued (ie. 0 means we were woken) */
+static inline int unqueue_me(struct futex_q *q)
{
+ int ret = 0;
spin_lock(&futex_lock);
- list_del(&q->list);
+ if (!list_empty(&q->list)) {
+ list_del(&q->list);
+ ret = 1;
+ }
spin_unlock(&futex_lock);
+ return ret;
}
/* Get kernel address of the user page and pin it. */
@@ -129,74 +141,62 @@
return page;
}
-/* Try to decrement the user count to zero. */
-static int decrement_to_zero(struct page *page, unsigned int offset)
-{
- atomic_t *count;
- int ret = 0;
-
- count = kmap(page) + offset;
- /* If we take the semaphore from 1 to 0, it's ours. If it's
- zero, decrement anyway, to indicate we are waiting. If
- it's negative, don't decrement so we don't wrap... */
- if (atomic_read(count) >= 0 && atomic_dec_and_test(count))
- ret = 1;
- kunmap(page);
- return ret;
-}
-
-/* Simplified from arch/ppc/kernel/semaphore.c: Paul M. is a genius. */
-static int futex_down(struct list_head *head, struct page *page, int offset)
+static int futex_wait(struct list_head *head,
+ struct page *page,
+ int offset,
+ int val,
+ unsigned long time)
{
- int retval = 0;
+ int *count;
struct futex_q q;
+ int ret = 0;
- current->state = TASK_INTERRUPTIBLE;
+ set_current_state(TASK_INTERRUPTIBLE);
queue_me(head, &q, page, offset);
- while (!decrement_to_zero(page, offset)) {
- if (signal_pending(current)) {
- retval = -EINTR;
- break;
- }
- schedule();
- current->state = TASK_INTERRUPTIBLE;
- }
- current->state = TASK_RUNNING;
- unqueue_me(&q);
- /* If we were signalled, we might have just been woken: we
- must wake another one. Otherwise we need to wake someone
- else (if they are waiting) so they drop the count below 0,
- and when we "up" in userspace, we know there is a
- waiter. */
- wake_one_waiter(head, page, offset);
- return retval;
-}
-
-static int futex_up(struct list_head *head, struct page *page, int offset)
-{
- atomic_t *count;
-
count = kmap(page) + offset;
- atomic_set(count, 1);
- smp_wmb();
+ if (*count != val) {
+ ret = -EWOULDBLOCK;
+ set_current_state(TASK_RUNNING);
+ goto out;
+ }
+ time = schedule_timeout(time);
+ if (time == 0) {
+ ret = -ETIMEDOUT;
+ goto out;
+ }
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+ out:
kunmap(page);
- wake_one_waiter(head, page, offset);
- return 0;
+ /* Were we woken up anyway? */
+ if (!unqueue_me(&q))
+ return 0;
+ return ret;
}
-asmlinkage int sys_futex(void *uaddr, int op)
+asmlinkage int sys_futex(void *uaddr, int op, int val, struct timespec *utime)
{
int ret;
unsigned long pos_in_page;
struct list_head *head;
struct page *page;
+ unsigned long time = MAX_SCHEDULE_TIMEOUT;
+
+ if (utime) {
+ struct timespec t;
+ if (copy_from_user(&t, utime, sizeof(t)) != 0)
+ return -EFAULT;
+ time = timespec_to_jiffies(&t) + 1;
+ }
pos_in_page = ((unsigned long)uaddr) % PAGE_SIZE;
/* Must be "naturally" aligned, and not on page boundary. */
- if ((pos_in_page % __alignof__(atomic_t)) != 0
- || pos_in_page + sizeof(atomic_t) > PAGE_SIZE)
+ if ((pos_in_page % __alignof__(int)) != 0
+ || pos_in_page + sizeof(int) > PAGE_SIZE)
return -EINVAL;
/* Simpler if it doesn't vanish underneath us. */
@@ -206,13 +206,12 @@
head = hash_futex(page, pos_in_page);
switch (op) {
- case FUTEX_UP:
- ret = futex_up(head, page, pos_in_page);
+ case FUTEX_WAIT:
+ ret = futex_wait(head, page, pos_in_page, val, time);
break;
- case FUTEX_DOWN:
- ret = futex_down(head, page, pos_in_page);
+ case FUTEX_WAKE:
+ ret = futex_wake(head, page, pos_in_page, val);
break;
- /* Add other lock types here... */
default:
ret = -EINVAL;
}
On Thursday 04 April 2002 02:52 am, Rusty Russell wrote:
> "This time for sure"
>
> I have a new primitive (thanks Paul Mackerras): sleep if the word at
> this address is equal to this value. It also has an optional timeout.
> On top of this, I have made pthreads locking primitives and futexes
> (marginally slower than the completely-in-kernel version, but not
> much).
>
> Userspace stuff (including nonpthreads.c code):
> http://www.kernel.org/pub/linux/kernel/people/rusty/futex-2.0.tar.gz
>
> Please complain now, or I'll send to Linus as is,
> Rusty.
> PS. Will be on plane in 24 hours, for 40 hours. Incommunicado.
Rusty, here are a few comments!
In futex_wait we have
	kmap(page)
	schedule_timeout()
	kunmap()
Is there potential for deadlock here? The mappings can be held for a long
time, and KMAPs are limited.
Otherwise, kernel patch looks good. Will run it this weekend through
ulockflex.
Other changes proposed for FUTEX 2.0
---------------------
A) in futex_down_timeout
get rid of woken, I don't see why you need that.
optimize the while statement, unless there are some hidden gcc issues.
static inline int futex_down_timeout(struct futex *futx, struct timespec *rel)
{
	int val;

	/* Returns new value */
	while ((val = __futex_down(&futx->count)) != 0) {
		switch (__futex_down_slow(futx, val, rel)) {
		case -1:
			return -1;		/* error */
		case 0:
			futx->count = -1;	/* slept */
			/* fall through */
		case 1:
			return 0;		/* passed */
		}
	}
	return 0;	/* got it uncontended */
}
---------------------
Still missing something on the futex_trydown!!
futex_trydown ::= futex_down == 1 ? 0 : -1
So if P1 holds the lock and P2 runs "while (1) { futex_trydown }", P2 keeps
decrementing the counter until it wraps around and at some point yields "1",
thus granting the lock.
At one GHz on a 32-way system this only requires a lock hold time of a few
seconds. Doesn't sound like a good idea.
This brings back the discussion on compare and swap. This would be trivial to
do with compare and swap.
Another solution would be to sync through the kernel at wraparound, so that
the count variable reflects the number of waiting processes.
That's for now....
--
-- Hubertus Franke ([email protected])
In message <[email protected]> you write:
> In futex_wait we have
> kmap(page)
> schedule_timeout()
> kunmap()
Oops! Good catch... I've moved the kunmap to before the timeout...
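(Roughly, the reordered futex_wait becomes the following; a sketch
reconstructed from the patch above and this fix, not the actual follow-up
patch:)

	static int futex_wait(struct list_head *head,
			      struct page *page,
			      int offset,
			      int val,
			      unsigned long time)
	{
		int *count;
		struct futex_q q;
		int ret = 0;

		set_current_state(TASK_INTERRUPTIBLE);
		queue_me(head, &q, page, offset);

		/* Check the value and drop the kmap *before* we can sleep. */
		count = kmap(page) + offset;
		if (*count != val)
			ret = -EWOULDBLOCK;
		kunmap(page);

		if (ret) {
			set_current_state(TASK_RUNNING);
		} else {
			time = schedule_timeout(time);
			if (time == 0)
				ret = -ETIMEDOUT;
			else if (signal_pending(current))
				ret = -EINTR;
		}

		/* Were we woken up anyway? */
		if (!unqueue_me(&q))
			return 0;
		return ret;
	}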
> ---------------------
> A) in futex_down_timeout
> get rid of woken, I don't see why you need that.
> optimize the while statement, unless there are some hidden gcc issues.
We don't need to set to -1 if we never slept, that's why we have the
woken flag.
> static inline int futex_down_timeout(struct futex *futx, struct timespec *rel)
> {
> 	int val;
>
> 	/* Returns new value */
> 	while ((val = __futex_down(&futx->count)) != 0) {
> 		switch (__futex_down_slow(futx, val, rel)) {
> 		case -1:
> 			return -1;		/* error */
> 		case 0:
> 			futx->count = -1;	/* slept */
> 			/* fall through */
> 		case 1:
> 			return 0;		/* passed */
> 		}
> 	}
> 	return 0;	/* got it uncontended */
> }
case 0 does not return, it sleeps! This is wrong...
> Still missing something on the futex_trydown !!
>
> futex_trydown ::= futex_down == 1 ? 0 : -1
>
> So if P1 holds the lock and P2 runs "while (1) { futex_trydown }", P2 keeps
> decrementing the counter until it wraps around and at some point yields "1",
> thus granting the lock.
> At one GHz on a 32-way system this only requires a lock hold time of a few
> seconds. Doesn't sound like a good idea.
Look closer at __futex_down: it doesn't decrement if futx->count < 0.
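(For reference, a sketch of that guard; hypothetical, reconstructed from the
description rather than taken from the futex-2.0 tarball, and using a modern
GCC atomic builtin for brevity:)

	/* Returns the new count; 0 means we took the lock (1 -> 0). */
	static inline int __futex_down(int *count)
	{
		int old;

		do {
			old = *count;
			/* Already negative == locked with waiters: don't
			 * decrement, so a spinning trydown cannot wrap. */
			if (old < 0)
				return old;
		} while (!__sync_bool_compare_and_swap(count, old, old - 1));

		return old - 1;
	}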
> This brings back the discussion on compare and swap. This would be trivial to
> do with compare and swap.
Yes, this is what the PPC code does.
Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
On Saturday 06 April 2002 04:48 am, Rusty Russell wrote:
> In message <[email protected]> you write:
Enclosed is an "asynchronous" extension to futexes.
This mechanism is required to implement global locks in the presence of an
M:N threading package.
Here "N" denotes a set of kernel threads, that serve as virtual processors on
which the M user threads can be dispatched by the user level thread
manager/scheduler.
Under no circumstances can one block the kernel threads through the futex
interface. Instead, I want to indicate to the futex subsystem that I want a
notification for a particular futex when it's woken up.
Working with some M:N threading folks, we came up with the following design,
implemented in the attached patch to Rusty's previously posted futex
generalization patch. I can also provide an integrated patch over 2.5.7
vanilla.
The futex_q structure is extended to hold a <tgid>, the <uaddr> of the futex
and a signal with which to notify a caller.
We now provide a FUTEX_AWAIT call which attaches a futex_q object to the wait
queue, similar to FUTEX_WAIT. Differences are as follows.
(a) futex_q must be dynamically allocated in the AWAIT case, instead of on
the stack. We use the tgid to distinguish <async> (tgid != 0) from <sync>
(tgid == 0).
(b) there will be no blocking in the kernel and hence we don't release the
page upon return from this AWAIT call.
(c) Upon wakeup, if the futex_q is an async wait (tgid != 0) we move it to a
global notification queue and signal the <tgid> with the specified signal.
We release the page (put_page) at this point.
If the <tgid> doesn't exist (e.g. died), we delete the futex_q and signal the
next.
(d) A FUTEX_AWAKERS call is provided to the application to retrieve waiting
notifications, typically called within the signal handler.
We need a cleanup mechanism for the notification queue for when a process dies
prematurely, before it has had a chance to get its items out of the queue.
For this I introduced _exit_futex(task), to be called from do_exit().
It simply wakes the notification queue and deletes everything that is
associated with its pid. Right now that is done for each exiting task.
In general one might want to think about a flag indicating whether this
callback is necessary.
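(A sketch of the extended futex_q described here; the first four fields
follow the 2.5.7 patch, the async fields are guesses from the description:)

	struct futex_q {
		struct list_head list;
		struct task_struct *task;
		struct page *page;
		unsigned int offset;
		/* FUTEX_AWAIT extensions (names guessed): */
		pid_t tgid;		/* 0 == synchronous FUTEX_WAIT */
		void *uaddr;		/* reported back on notification */
		int signum;		/* signal to send the <tgid> */
	};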
For easier reading I also attach the futex.c program.
Comments?
--
-- Hubertus Franke ([email protected])
In message <[email protected]> you write:
> Enclosed is an "asynchronous" extension to futexes.
Wow... I never thought of that. Cool!
My main concern is the DoS of multiple kmallocs. Alan Cox suggested
tying it to an fd (ie. naturally limited), and I think this might work
(I don't know much about async io). ie. (int)utime is the fd to wake
up, and then it can be used for async io OR a poll/select interface
using existing infrastructure.
Probably it has to be a special fd (/dev/futex?).
Thoughts?
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
On Wednesday 10 April 2002 12:37 pm, Rusty Russell wrote:
> In message <[email protected]> you write:
> > Enclosed is an "asynchronous" extension to futexes.
>
> Wow... I never thought of that. Cool!
>
> My main concern is the DoS of multiple kmallocs. Alan Cox suggested
> tying it to an fd (ie. naturally limited), and I think this might work
> (I don't know much about async io). ie. (int)utime is the fd to wake
> up, and then it can be used for async io OR a poll/select interface
> using existing infrastructure.
>
> Probably it has to be a special fd (/dev/futex?).
>
> Thoughts?
> Rusty.
Is the idea to write one's own poll/select routine for the /dev/futex files?
Dependent on the circumstance:
(A) you want to globally limit the number of outstanding waits!
Why so complicated (assuming I understand the suggestion above)?
Simply have a /proc/futex subsystem that tells you how many outstanding
awaits can be active at any time.
	kmalloc() becomes: if (fq_count++ > LIMIT) return -EAGAIN;
	kfree() becomes: kfree(); fq_count--;
(B)
If you really want per-process limits, then of course you need something
different from my suggestion under (A).
We allocate a file structure with private data for opening /dev/futex!
We associate with that file descriptor:
(a) a counter (reflecting the number of outstanding futex_q)
(b) a notify_queue
(a) is limited by a per-FD limit of outstanding futex_q (we still need a
global limit here)
(b) solves the __exit_futex() problem.
We wouldn't need that call, as the pending notifications will be deleted with
the FD destruction... cool...
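(A sketch of private data that could hang off such a /dev/futex file;
hypothetical, all names invented for illustration:)

	struct futex_file {
		atomic_t nr_pending;		/* (a) outstanding futex_q's,
						 * checked against a per-FD limit */
		struct list_head notify_queue;	/* (b) wakeups awaiting read() */
		spinlock_t lock;
		struct fasync_struct *fasync;	/* SIGIO delivery */
	};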
Can somebody in the thread world weigh in on their preferred mechanism
regarding limits etc.? Do you need control over how the signal is
delivered? Is a file descriptor good enough, or do you want a syscall
interface?
All this is merely a "clean/useful interface" issue (other than the DoS).
One other idea here...
Can you sketch out the file design and I'll code it up.
We need this async functionality for M:N threads, and we need it soon.
--
-- Hubertus Franke ([email protected])
On 04/10/2002 at 12:37:53 PM AST, Hubertus Franke <[email protected]>
wrote:
>
> Can somebody in the thread world weigh in on their preferred mechanism
> regarding limits etc.? Do you need control over how the signal is
> delivered? Is a file descriptor good enough, or do you want a syscall
> interface?
>
I went thru the POSIX specification and couldn't find any specified limits
regarding this, and in most cases these limits are enforced at the library
level. It probably should be left up to the kernel folks to determine the
kernel limits they can live with. The library can adapt to this value. I
don't believe a pthread library would need any "extra" control of how the
signal is delivered. A file descriptor is good enough; it seems a waste to
have to provide a sys_call interface.
Regards,
Bill Abt
Senior Software Engineer
Next Generation POSIX Threading for Linux
IBM Cambridge, MA, USA 02142
Ext: +(00)1 617-693-1591
T/L: 693-1591 (M/W/F)
T/L: 253-9938 (T/Th/Eves.)
Cell: +(00)1 617-803-7514
[email protected] or [email protected]
http://oss.software.ibm.com/developerworks/opensource/pthreads
On Wednesday 10 April 2002 02:09 pm, Bill Abt wrote:
> On 04/10/2002 at 12:37:53 PM AST, Hubertus Franke <[email protected]>
> wrote:
> > Can somebody in the thread world weigh in on their preferred mechanism
> > regarding limits etc.? Do you need control over how the signal is
> > delivered? Is a file descriptor good enough, or do you want a syscall
> > interface?
>
> I went thru the POSIX specification and couldn't find any specified limits
> regarding this, and in most cases these limits are enforced at the library
> level. It probably should be left up to the kernel folks to determine the
> kernel limits they can live with. The library can adapt to this value. I
> don't believe a pthread library would need any "extra" control of how the
> signal is delivered. A file descriptor is good enough; it seems a waste to
> have to provide a sys_call interface.
>
So you are OK with having only poll or select. That seems odd.
It seems you still need SIGIO on your fd to get the async notification.
-- Hubertus
--
-- Hubertus Franke ([email protected])
On 04/10/2002 at 02:10:59 PM AST, Hubertus Franke <[email protected]>
wrote:
>
> So you are OK with having only poll or select. That seems odd.
> It seems you still need SIGIO on your fd to get the async notification.
>
Duh... You're right. I forgot about that...
Regards,
Bill Abt
Senior Software Engineer
Next Generation POSIX Threading for Linux
IBM Cambridge, MA, USA 02142
Ext: +(00)1 617-693-1591
T/L: 693-1591 (M/W/F)
T/L: 253-9938 (T/Th/Eves.)
Cell: +(00)1 617-803-7514
[email protected] or [email protected]
http://oss.software.ibm.com/developerworks/opensource/pthreads
On Wednesday 10 April 2002 03:30 pm, Bill Abt wrote:
> On 04/10/2002 at 02:10:59 PM AST, Hubertus Franke <[email protected]>
>
> wrote:
> > So you are OK with having only poll or select. That seems odd.
> > It seems you still need SIGIO on your fd to get the async notification.
>
> Duh... You're right. I forgot about that...
Yes,
The current interface is

(A)
     async wait:
          sys_futex (uaddr, FUTEX_AWAIT, value, (struct timespec*) sig);
     upon signal handling:
          sys_futex(uaddrs[], FUTEX_AWAKERS, size, NULL);
     to retrieve the uaddrs that got woken up...

If you simply want a notification with SIGIO (or whatever you desire),
we can change that to
(A)
     sys_futex(uaddr, FUTEX_AWAIT, value, (struct timespec*) fd);

I send a SIGIO and you can request via ioctl, or read the pending
notifications from the fd:
(B)  { struct futex *notarray[N];
       int n = read(futex_fd, notarray, N * sizeof(struct futex *));
     }

I am mainly concerned that SIGIO can be overloaded in a thread package.
How would you know whether a SIGIO came from the futex or from another
file handle?
That is your call to make. Let me know!
--
-- Hubertus Franke ([email protected])
On 04/10/2002 at 02:47:50 PM AST, Hubertus Franke <[email protected]>
wrote:
> The current interface is
>
> (A)
>      async wait:
>           sys_futex (uaddr, FUTEX_AWAIT, value, (struct timespec*) sig);
>      upon signal handling:
>           sys_futex(uaddrs[], FUTEX_AWAKERS, size, NULL);
>      to retrieve the uaddrs that got woken up...
This is actually the preferred way. I must've misinterpreted what you said
earlier. I believe this is actually a more generic way of handling it. A
thread package can specify the signal to be used, much in the way the current
LinuxThreads pre-allocates and uses certain real-time signals...
> I am mainly concerned that SIGIO can be overloaded in a thread package.
> How would you know whether a SIGIO came from the futex or from another
> file handle?
By keeping with the original interface, we don't have to contend with this
problem. The thread package can use the signal that best suits its
implementation...
Make sense?
Regards,
Bill Abt
Senior Software Engineer
Next Generation POSIX Threading for Linux
IBM Cambridge, MA, USA 02142
Ext: +(00)1 617-693-1591
T/L: 693-1591 (M/W/F)
T/L: 253-9938 (T/Th/Eves.)
Cell: +(00)1 617-803-7514
[email protected] or [email protected]
http://oss.software.ibm.com/developerworks/opensource/pthreads
On Wednesday 10 April 2002 03:59 pm, Bill Abt wrote:
> On 04/10/2002 at 02:47:50 PM AST, Hubertus Franke <[email protected]>
>
> wrote:
> > The current interface is
> >
> > (A)
> >      async wait:
> >           sys_futex (uaddr, FUTEX_AWAIT, value, (struct timespec*) sig);
> >      upon signal handling:
> >           sys_futex(uaddrs[], FUTEX_AWAKERS, size, NULL);
> >      to retrieve the uaddrs that got woken up...
>
> This is actually the preferred way. I must've misinterpreted what you said
> earlier. I believe this is actually a more generic way of handling it. A
> thread package can specify the signal to be used, much in the way the current
> LinuxThreads pre-allocates and uses certain real-time signals...
>
> > I am mainly concerned that SIGIO can be overloaded in a thread package.
> > How would you know whether a SIGIO came from the futex or from another
> > file handle?
>
> By keeping with the original interface, we don't have to contend with this
> problem. The thread package can use the signal that best suits its
> implementation...
>
> Make sense?
>
I wasn't precise...
There are ++'s and --'s to the file descriptor approach:
++)
(a) it automatically cleans up the notify queue associated with the FD
    [the notify queue could still be global, if we want that];
    when the process disappears, no callback is required in do_exit().
--) There is more overhead in the wakeup and the wait, because I need to
    go from the fd to the file struct, which always means some lookup, and
    I have to verify that the file descriptor is actually a valid /dev/futex
    file (yikes).
Another problem I see is that we only have one additional
argument (void *utime) to piggyback the fd and the signal on.
What could be done is the following:
Make utime a (void *) argument.
In FUTEX_WAIT, interpret it as a pointer to <struct timespec>.
In FUTEX_AWAIT, interpret it as { short fd, short sig }.
There should be no limitation in casting these to shorts?
Comments, anyone....
I suggest we leave it as is right now until more comments come.
--
-- Hubertus Franke ([email protected])
In message <[email protected]> you write:
> --) There is more overhead in the wakeup and the wait, because I need to
>     go from the fd to the file struct, which always means some lookup, and
>     I have to verify that the file descriptor is actually a valid /dev/futex
>     file (yikes).
Hmmm... verify on the FUTEX_AWAIT: you can maybe hold the struct file
directly (or maybe not).
> In FUTEX_AWAIT, interpret it as { short fd, short sig }.
> There should be no limitation in casting these to shorts?
No, that's bad. But I thought there was already a way to set which signal
is delivered instead of SIGIO. Is that not per-fd?
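(For reference: fcntl(F_SETSIG) is per-fd on Linux. A hypothetical sketch of
how a library might arm it:)

	#define _GNU_SOURCE	/* for F_SETSIG */
	#include <fcntl.h>
	#include <unistd.h>

	/* Deliver `signum' (e.g. a realtime signal) instead of SIGIO
	 * whenever this descriptor becomes ready. */
	static int arm_async_fd(int fd, int signum)
	{
		if (fcntl(fd, F_SETOWN, getpid()) < 0)	/* who to signal */
			return -1;
		if (fcntl(fd, F_SETSIG, signum) < 0)	/* which signal */
			return -1;
		return fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
	}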
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Hubertus Franke wrote:
>
> [...]
> Yes,
>
> The current interface is
>
> (A)
>      async wait:
>           sys_futex (uaddr, FUTEX_AWAIT, value, (struct timespec*) sig);
>      upon signal handling:
>           sys_futex(uaddrs[], FUTEX_AWAKERS, size, NULL);
>      to retrieve the uaddrs that got woken up...
>
> If you simply want a notification with SIGIO (or whatever you desire),
> we can change that to
> (A)
>      sys_futex(uaddr, FUTEX_AWAIT, value, (struct timespec*) fd);
>
> I send a SIGIO and you can request via ioctl, or read the pending
> notifications from the fd:
> (B)  { struct futex *notarray[N];
>        int n = read(futex_fd, notarray, N * sizeof(struct futex *));
>      }
> I am mainly concerned that SIGIO can be overloaded in a thread package.
> How would you know whether a SIGIO came from the futex or from another
> file handle?
>
I want to vote for using POSIX realtime signals. With them (and SA_SIGINFO)
you can carry small amounts of userdata, passed in struct siginfo_t:
---susv2---
The <signal.h> header defines the siginfo_t type as a structure that
includes at least the following members:
int si_signo signal number
int si_errno if non-zero, an errno value associated with
this signal, as defined in <errno.h>
int si_code signal code
pid_t si_pid sending process ID
uid_t si_uid real user ID of sending process
void *si_addr address of faulting instruction
int si_status exit value or signal
long si_band band event for SIGPOLL
union sigval si_value signal value
[and further on]
Implementations may support additional si_code values not included in this
list, may generate values included in this list under circumstances other
than those described in this list, and may contain extensions or limitations
that prevent some values from being generated. Implementations will not
generate a different value from the ones described in this list for
circumstances described in this list.
---susv2---
So we could use si_code=SI_QUEUE and pass the uaddr in sival_ptr
or even si_code=SIGPOLL and pass the data in si_band.
We could also add our own si_code (SI_FUTEX) and add the tid in
siginfo_t (if needed for NGPT)
Why pass this data over a file descriptor?
The user space library can block on sigtimedwait() for notifications.
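(A sketch of that userspace side; hypothetical, assuming the kernel queues
SIGRTMIN with si_value.sival_ptr set to the uaddr of the woken futex, and
that SIGRTMIN is blocked so notifications stay queued:)

	#include <signal.h>
	#include <stddef.h>
	#include <time.h>

	/* Returns the uaddr of a woken futex, or NULL on timeout. */
	static void *wait_futex_notification(void)
	{
		sigset_t set;
		siginfo_t info;
		struct timespec ts = { 1, 0 };	/* 1 second */

		sigemptyset(&set);
		sigaddset(&set, SIGRTMIN);
		/* Realtime signals queue, so no notification is lost while
		 * we are between sigtimedwait() calls. */
		if (sigtimedwait(&set, &info, &ts) == SIGRTMIN)
			return info.si_value.sival_ptr;
		return NULL;
	}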
And with the DoS (letting the kernel pin too much memory on behalf
of a user process) we could use the "max locked memory" ulimit.
On Friday 12 April 2002 11:36 am, Peter Wächtler wrote:
> [...]
> I want to vote for using POSIX realtime signals. With them (and SA_SIGINFO)
> you can carry small amounts of userdata, passed in struct siginfo_t:
> [...]
>
Need to digest your suggestion a bit more. Not too familiar with that
interface.
One question I have though is whether information can get lost?
Assume I have a few notifications pending and we signal the app: what would
the app call upon notification?
Remember, I don't want to pass a heavyweight object into the futex kernel
call.
> So we could use si_code=SI_QUEUE and pass the uaddr in sival_ptr
> or even si_code=SIGPOLL and pass the data in si_band.
>
> We could also add our own si_code (SI_FUTEX) and add the tid in
> siginfo_t (if needed for NGPT)
>
> Why pass this data over a file descriptor?
> The user space library can block on sigtimedwait() for notifications.
>
> And with the DoS (letting the kernel pin too much memory on behalf
> of a user process) we could use the "max locked memory" ulimit.
--
-- Hubertus Franke ([email protected])
Hubertus Franke wrote:
>
> [...]
> Need to digest your suggestion a bit more. Not too familiar with that
> interface.
> One question I have though is whether information can get lost?
>
> Assume I have a few notifications pending and we signal the app: what would
> the app call upon notification?
> Remember, I don't want to pass a heavyweight object into the futex kernel
> call.
>
Well, realtime signals are reliable in that the implementation has to
ensure that NO signal gets lost (in contrast to the early unreliable signals
[in Version 7?]). For every notification a realtime signal gets queued.
Do you want to be able to read all (or at least a lot of) pending notifications
at once? Then your interface _could_ be more efficient than using a realtime
signal for every notification.
OTOH, if you pass the uaddr with the signal, you save one syscall...
There is an advantage at low lock contention (few waiters).
So you want to read several WAKEUP events in your signal handler. Except for
pthread_cond_broadcast (which is not often used?), do you think that this is
a realistic real-world scenario, or only good for a synthetic benchmark that
shows: hey, on heavy lock contention, this implementation is faster?
Looking closer at the pthread_cond_broadcast:
a) NGPT and n waiters on the same kernel vehicle
the thread manager could request one WAITA for all, would
get just one signal and wakes up all threads on his own
b) NGPT and n waiters on m kernel vehicles
all threads on their own vehicle will block on sys_futex(FUTEX_WAIT)
all others are managed on user space queue and woken up
by manager who gets only 1 notification
c) NGPT and n waiters on n kernel vehicles (like on LinuxThreads)
every kernel vehicle will request a WAIT and block
So when we have 500 threads competing for dozens of mutexes on a big SMP box:
is the bottleneck in the kernel (and on the locked bus) or in the notification
mechanism? I can't imagine (don't hit me on this :) that, if the kernel
releases a lock and the app wants a fast notification (delivered/scheduled in
priority order), there is a big chance that a lot of notifications pile up...
What do you think?
Also: the waitqueues in the kernel are protected by a mutex themselves; while
a wakeup is in progress, no new waiters can be added.
If the waiter has higher priority, I want to deliver the notification
immediately, so that it gets the CPU and preempts the lock releaser - I
wouldn't want a lazy wakeup.
[And yes, I am missing wakeup in priority order. Uh, now we can discuss the
preempt patch again... and I will check how realtime processes are treated.]
On Saturday 13 April 2002 09:52 am, Peter Wächtler wrote:
> [...]
> Well, realtime signals are reliable in that the implementation has to
> ensure that NO signal gets lost (in contrast to the early unreliable
> signals [in Version 7?]). For every notification a realtime signal gets
> queued.
>
That's great. The question is whether the NGPT folks can live with that
realtime signal.
> Do you want to be able to read all (or at least a lot of) pending
> notifications at once? Then your interface _could_ be more efficient than
> using a realtime signal for every notification.
Occasionally, there is a chance that more than one notification is pending.
Hence I don't want to be in the situation of constantly getting one
notification and having to check for more.
I think your solution makes a whole lot of sense.
> OTOH, if you pass the uaddr with the signal, you save one syscall...
> There is an advantage at low lock contention (few waiters).
>
Yipp.
> So you want to read several WAKEUP events in your signal handler. Except for
> pthread_cond_broadcast (which is not often used?), do you think that this is
> a realistic real-world scenario, or only good for a synthetic benchmark that
> shows: hey, on heavy lock contention, this implementation is faster?
>
> Looking closer at the pthread_cond_broadcast:
>
> a) NGPT and n waiters on the same kernel vehicle
> the thread manager could request one WAITA for all, would
> get just one signal and wakes up all threads on his own
>
Correct.
> b) NGPT and n waiters on m kernel vehicles
> all threads on their own vehicle will block on sys_futex(FUTEX_WAIT)
> all others are managed on user space queue and woken up
> by manager who gets only 1 notification
>
Correct.
> c) NGPT and n waiters on n kernel vehicles (like on LinuxThreads)
> every kernel vehicle will request a WAIT and block
>
Correct.
All <correct>s contingent on my limited knowledge of how an M:N package is
implemented.
> So when we have 500 threads competing for dozens of mutexes on a big SMP
> box: is the bottleneck in the kernel (and on the locked bus) or in the
> notification mechanism? I can't imagine (don't hit me on this :) that, if
> the kernel releases a lock and the app wants a fast notification
> (delivered/scheduled in priority order), there is a big chance that a lot
> of notifications pile up... What do you think?
> Also: the waitqueues in the kernel are protected by a mutex themselves;
> while a wakeup is in progress, no new waiters can be added.
>
> If the waiter has higher priority, I want to deliver the notification
> immediately, so that it gets the CPU and preempts the lock releaser - I
> wouldn't want a lazy wakeup.
>
> [And yes, I am missing wakeup in priority order. Uh, now we can discuss
> the preempt patch again... and I will check how realtime processes are
> treated.]
--
-- Hubertus Franke ([email protected])
Dealing with the realtime signal is not a problem. Also, saving the extra
system call is a *BIG* bonus.
Regards,
Bill Abt
Senior Software Engineer
Next Generation POSIX Threading for Linux
IBM Cambridge, MA, USA 02142
Ext: +(00)1 617-693-1591
T/L: 693-1591 (M/W/F)
T/L: 253-9938 (T/Th/Eves.)
Cell: +(00)1 617-803-7514
[email protected] or [email protected]
http://oss.software.ibm.com/developerworks/opensource/pthreads
On Monday 15 April 2002 10:49 am, Bill Abt wrote:
> Dealing with the realtime signal is not a problem. Also, saving the extra
> system call is a *BIG* bonus.
Cool
As for Peter's initial message: I took a look at siginfo_t, and Peter's
statement needs to be corrected "a bit".
All the members he listed are NOT necessarily available:
typedef struct siginfo {
	int si_signo;
	int si_errno;
	int si_code;

	union {
		int _pad[SI_PAD_SIZE];

		/* kill() */
		struct {
			pid_t _pid;	/* sender's pid */
			uid_t _uid;	/* sender's uid */
		} _kill;

		/* POSIX.1b timers */
		struct {
			unsigned int _timer1;
			unsigned int _timer2;
		} _timer;

		/* POSIX.1b signals */
		struct {
			pid_t _pid;	/* sender's pid */
			uid_t _uid;	/* sender's uid */
			sigval_t _sigval;
		} _rt;

		/* SIGCHLD */
		struct {
			pid_t _pid;	/* which child */
			uid_t _uid;	/* sender's uid */
			int _status;	/* exit code */
			clock_t _utime;
			clock_t _stime;
		} _sigchld;

		/* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
		struct {
			void *_addr;	/* faulting insn/memory ref. */
		} _sigfault;

		/* SIGPOLL */
		struct {
			int _band;	/* POLL_IN, POLL_OUT, POLL_MSG */
			int _fd;
		} _sigpoll;
	} _sifields;
} siginfo_t;
I'd suggest we tag along with the _sigfault semantics.
We don't need to know who woke us up, just which <addr> got signalled.
--
-- Hubertus Franke ([email protected])
On Mon, Apr 15, 2002 at 12:22:59PM -0400, Hubertus Franke wrote:
> typedef struct siginfo {
> ...
> union {
> int _pad[SI_PAD_SIZE];
>
> struct {
> ...
> } _kill;
> ...
>
> I'd suggest we tag along the _sigfault semantics.
> We don't need to know who woke us up, just which <addr> got signalled.
>
Are there issues with creating a new struct in the union that represents
exactly what you wish it to represent?
mark
--
[email protected]/[email protected]/[email protected] __________________________
. . _ ._ . . .__ . . ._. .__ . . . .__ | Neighbourhood Coder
|\/| |_| |_| |/ |_ |\/| | |_ | |/ |_ |
| | | | | \ | \ |__ . | | .|. |__ |__ | \ |__ | Ottawa, Ontario, Canada
One ring to rule them all, one ring to find them, one ring to bring them all
and in the darkness bind them...
http://mark.mielke.cc/
On Monday 15 April 2002 04:57 pm, Mark Mielke wrote:
> On Mon, Apr 15, 2002 at 12:22:59PM -0400, Hubertus Franke wrote:
> > typedef struct siginfo {
> > ...
> > union {
> > int _pad[SI_PAD_SIZE];
> >
> > struct {
> > ...
> > } _kill;
> > ...
> >
> > I'd suggest we tag along the _sigfault semantics.
> > We don't need to know who woke us up, just which <addr> got signalled.
>
> Are there issues with creating a new struct in the union that represents
> exactly what you wish it to represent?
>
> mark
No, but then again there seems to be no need either.
All we need is the <addr> that is to be woken up, which
is similar to what a SEGV signal handler gets.
--
-- Hubertus Franke ([email protected])
Hubertus Franke wrote:
>
> On Monday 15 April 2002 10:49 am, Bill Abt wrote:
> > Dealing with the realtime signal is not a problem. Also, saving the extra
> > system call is a *BIG* bonus.
> >
>
> Cool
>
> As for Peter's initial message: I took a look at siginfo_t, and Peter's
> statement needs to be corrected "a bit".
> All the members he listed are NOT necessarily available.
>
Well, then we do not comply with susv2. But that's not so bad.
What we need is there, though:
typedef union sigval {
	int sival_int;
	void *sival_ptr;
} sigval_t;
[...]
> /* POSIX.1b signals */
> struct {
> pid_t _pid; /* sender's pid */
> uid_t _uid; /* sender's uid */
> sigval_t _sigval;
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> } _rt;
>
> } _sifields;
> } siginfo_t;
>
> I'd suggest we tag along with the _sigfault semantics.
> We don't need to know who woke us up, just which <addr> got signalled.
>
Please, no: this would be confusing.
We can add our own si_code and our own struct to the union, as suggested
by Mark and me.
--- /usr/local/src/linux-2.5.7/include/asm-i386/siginfo.h ---
/*
* si_code values
* Digital reserves positive values for kernel-generated signals.
*/
#define SI_USER 0 /* sent by kill, sigsend, raise */
#define SI_KERNEL 0x80 /* sent by the kernel from somewhere */
#define SI_QUEUE -1 /* sent by sigqueue */
#define SI_TIMER __SI_CODE(__SI_TIMER,-2) /* sent by timer expiration */
#define SI_MESGQ -3 /* sent by real time mesq state change */
#define SI_ASYNCIO -4 /* sent by AIO completion */
#define SI_SIGIO -5 /* sent by queued SIGIO */
#define SI_TKILL -6 /* sent by tkill system call */
#define SI_DETHREAD -7 /* sent by execve() killing subsidiary threads */
--- snip ---
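Concretely, something along these lines (hypothetical; simply takes the next
free si_code value and follows the pattern of the existing union members):

	#define SI_FUTEX -8	/* sent by futex wakeup */

	/* new member for the _sifields union in struct siginfo: */

		/* FUTEX */
		struct {
			void *_uaddr;	/* user address of the woken futex */
		} _sifutex;

	#define si_uaddr _sifields._sifutex._uaddr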