Hi,
Today, all threads waiting for a given futex are woken in FIFO order (first
waiter woken first) instead of priority order.
This patch makes use of plist (pirotity ordered lists) instead of simple list in
futex_hash_bucket.
---
futex.c | 66 +++++++++++++++++++++++++++++++++++-----------------------------
1 file changed, 37 insertions(+), 29 deletions(-)
---
Signed-off-by: S?bastien Dugu? <[email protected]>
Signed-off-by: Pierre Peiffer <[email protected]>
---
Index: linux-2.6/kernel/futex.c
===================================================================
--- linux-2.6.orig/kernel/futex.c 2007-01-08 10:40:22.000000000 +0100
+++ linux-2.6/kernel/futex.c 2007-01-08 10:42:07.000000000 +0100
@@ -106,12 +106,12 @@ struct futex_pi_state {
* we can wake only the relevant ones (hashed queues may be shared).
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
- * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0.
+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
* The order of wakup is always to make the first condition true, then
* wake up q->waiters, then make the second condition true.
*/
struct futex_q {
- struct list_head list;
+ struct plist_node list;
wait_queue_head_t waiters;
/* Which hash list lock to use: */
@@ -133,8 +133,8 @@ struct futex_q {
* Split the global futex_lock into every hash list lock.
*/
struct futex_hash_bucket {
- spinlock_t lock;
- struct list_head chain;
+ spinlock_t lock;
+ struct plist_head chain;
};
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
@@ -465,13 +465,13 @@ lookup_pi_state(u32 uval, struct futex_h
{
struct futex_pi_state *pi_state = NULL;
struct futex_q *this, *next;
- struct list_head *head;
+ struct plist_head *head;
struct task_struct *p;
pid_t pid;
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex(&this->key, &me->key)) {
/*
* Another waiter already exists - bump up
@@ -535,12 +535,12 @@ lookup_pi_state(u32 uval, struct futex_h
*/
static void wake_futex(struct futex_q *q)
{
- list_del_init(&q->list);
+ plist_del(&q->list, &q->list.plist);
if (q->filp)
send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
/*
* The lock in wake_up_all() is a crucial memory barrier after the
- * list_del_init() and also before assigning to q->lock_ptr.
+ * plist_del() and also before assigning to q->lock_ptr.
*/
wake_up_all(&q->waiters);
/*
@@ -653,7 +653,7 @@ static int futex_wake(u32 __user *uaddr,
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- struct list_head *head;
+ struct plist_head *head;
union futex_key key;
int ret;
@@ -667,7 +667,7 @@ static int futex_wake(u32 __user *uaddr,
spin_lock(&hb->lock);
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key)) {
if (this->pi_state) {
ret = -EINVAL;
@@ -695,7 +695,7 @@ futex_wake_op(u32 __user *uaddr1, u32 __
{
union futex_key key1, key2;
struct futex_hash_bucket *hb1, *hb2;
- struct list_head *head;
+ struct plist_head *head;
struct futex_q *this, *next;
int ret, op_ret, attempt = 0;
@@ -768,7 +768,7 @@ retry:
head = &hb1->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key1)) {
wake_futex(this);
if (++ret >= nr_wake)
@@ -780,7 +780,7 @@ retry:
head = &hb2->chain;
op_ret = 0;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key2)) {
wake_futex(this);
if (++op_ret >= nr_wake2)
@@ -807,7 +807,7 @@ static int futex_requeue(u32 __user *uad
{
union futex_key key1, key2;
struct futex_hash_bucket *hb1, *hb2;
- struct list_head *head1;
+ struct plist_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
@@ -856,7 +856,7 @@ static int futex_requeue(u32 __user *uad
}
head1 = &hb1->chain;
- list_for_each_entry_safe(this, next, head1, list) {
+ plist_for_each_entry_safe(this, next, head1, list) {
if (!match_futex (&this->key, &key1))
continue;
if (++ret <= nr_wake) {
@@ -867,9 +867,13 @@ static int futex_requeue(u32 __user *uad
* requeue.
*/
if (likely(head1 != &hb2->chain)) {
- list_move_tail(&this->list, &hb2->chain);
+ plist_del(&this->list, &hb1->chain);
+ plist_add(&this->list, &hb2->chain);
this->lock_ptr = &hb2->lock;
- }
+#ifdef CONFIG_DEBUG_PI_LIST
+ this->list.plist.lock = &hb2->lock;
+#endif
+ }
this->key = key2;
get_key_refs(&key2);
drop_count++;
@@ -914,7 +918,11 @@ queue_lock(struct futex_q *q, int fd, st
static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
- list_add_tail(&q->list, &hb->chain);
+ plist_node_init(&q->list, current->normal_prio);
+#ifdef CONFIG_DEBUG_PI_LIST
+ q->list.plist.lock = &hb->lock;
+#endif
+ plist_add(&q->list, &hb->chain);
q->task = current;
spin_unlock(&hb->lock);
}
@@ -969,8 +977,8 @@ static int unqueue_me(struct futex_q *q)
spin_unlock(lock_ptr);
goto retry;
}
- WARN_ON(list_empty(&q->list));
- list_del(&q->list);
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
BUG_ON(q->pi_state);
@@ -988,8 +996,8 @@ static int unqueue_me(struct futex_q *q)
*/
static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
{
- WARN_ON(list_empty(&q->list));
- list_del(&q->list);
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
BUG_ON(!q->pi_state);
free_pi_state(q->pi_state);
@@ -1082,10 +1090,10 @@ static int futex_wait(u32 __user *uaddr,
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&q.waiters, &wait);
/*
- * !list_empty() is safe here without any lock.
+ * !plist_node_empty() is safe here without any lock.
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
- if (likely(!list_empty(&q.list)))
+ if (likely(!plist_node_empty(&q.list)))
time = schedule_timeout(time);
__set_current_state(TASK_RUNNING);
@@ -1358,7 +1366,7 @@ static int futex_unlock_pi(u32 __user *u
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
u32 uval;
- struct list_head *head;
+ struct plist_head *head;
union futex_key key;
int ret, attempt = 0;
@@ -1409,7 +1417,7 @@ retry_locked:
*/
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (!match_futex (&this->key, &key))
continue;
ret = wake_futex_pi(uaddr, uval, this);
@@ -1483,10 +1491,10 @@ static unsigned int futex_poll(struct fi
poll_wait(filp, &q->waiters, wait);
/*
- * list_empty() is safe here without any lock.
+ * plist_node_empty() is safe here without any lock.
* q->lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
- if (list_empty(&q->list))
+ if (plist_node_empty(&q->list))
ret = POLLIN | POLLRDNORM;
return ret;
@@ -1869,7 +1877,7 @@ static int __init init(void)
}
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
- INIT_LIST_HEAD(&futex_queues[i].chain);
+ plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
spin_lock_init(&futex_queues[i].lock);
}
return 0;
--
Pierre Peiffer
Pierre Peiffer wrote:
> This patch makes use of plist (pirotity ordered lists) instead of simple
> list in
> futex_hash_bucket.
I have never seen performance numbers for this. If it is punishing
existing code in a measurable way I think it's not anacceptable default
behavior.
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
On Tue, 2007-01-09 at 17:16 +0100, Pierre Peiffer wrote:
> @@ -133,8 +133,8 @@ struct futex_q {
> * Split the global futex_lock into every hash list lock.
> */
> struct futex_hash_bucket {
> - spinlock_t lock;
> - struct list_head chain;
> + spinlock_t lock;
> + struct plist_head chain;
Should have tabs between spinlock_t and lock , and plist_head and
chain.. It looks like the original didn't, but as long as your cleaning
up may as well get add them.
Daniel
Ulrich Drepper a écrit :
>
> I have never seen performance numbers for this. If it is punishing
> existing code in a measurable way I think it's not anacceptable default
> behavior.
>
Here are some numbers. My test program measures the latency of pthread_broadcast
with 1000 pthreads (all threads are blocked on pthread_cond_wait, the time is
measured between the broadcast call and the last woken pthread).
Here are the average latencies after 5000 measures.
[only this patch is used, not the following.
The system is a dual Xeon 2.80GHz with HT enable]
First case: all threads are SCHED_OTHER
* with simple list:
Iterations=5000
Latency (us) min max avg stddev
3869 7400 6656.73 539.35
* with plist:
Iterations=5000
Latency (us) min max avg stddev
3684 7629 6787.97 479.41
Second case: all threads are SCHED_FIFO with priority equally distributed from
priomin to priomax
* with simple list:
Iterations=5000
Latency (us) min max avg stddev
4548 7197 6656.85 463.30
* with plist:
Iterations=5000
Latency (us) min max avg stddev
8289 11752 9720.12 426.45
So, yes it (logically) has a cost, depending of the number of different
priorities used, so it's specially measurable with real-time threads.
With SCHED_OTHER, I suppose that the priorities are not be very distributed.
May be, supposing it makes sense to respect the priority order only for
real-time pthreads, I can register all SCHED_OTHER threads to the same
MAX_RT_PRIO priotity ?
Or do you think this must be set behind a CONFIG* option ?
(Or finally not interesting enough for mainline ?)
--
Pierre
Pierre Peiffer a écrit :
> Ulrich Drepper a écrit :
>>
>> I have never seen performance numbers for this. If it is punishing
>> existing code in a measurable way I think it's not anacceptable default
>> behavior.
> May be, supposing it makes sense to respect the priority order only for
> real-time pthreads, I can register all SCHED_OTHER threads to the same
> MAX_RT_PRIO priotity ?
Moreover, the performance must be considered, sure, but after all, "man
pthread_cond_broadcast" says:
<<
If more than one thread is blocked on a condition variable, the
scheduling policy shall determine the order in which threads are
unblocked.
>>
... this is not true today ...
(of course, "shall" does not mean "mandatory", I know ;-) )
--
Pierre
On Wed, Jan 10, 2007 at 12:47:21PM +0100, Pierre Peiffer wrote:
> So, yes it (logically) has a cost, depending of the number of different
> priorities used, so it's specially measurable with real-time threads.
> With SCHED_OTHER, I suppose that the priorities are not be very distributed.
>
> May be, supposing it makes sense to respect the priority order only for
> real-time pthreads, I can register all SCHED_OTHER threads to the same
> MAX_RT_PRIO priotity ?
> Or do you think this must be set behind a CONFIG* option ?
> (Or finally not interesting enough for mainline ?)
As soon as there is at least one non-SCHED_OTHER thread among the waiters,
there is no question about whether plist should be used or not, that's
a correctness issue and if we want to conform to POSIX, we have to use that.
I guess Ulrich's question was mainly about performance differences
with/without plist wakeup when all threads are SCHED_OTHER. I'd say for
that a pure pthread_mutex_{lock,unlock} benchmark or even just a program
which uses futex FUTEX_WAIT/FUTEX_WAKE in a bunch of threads would be
better.
In the past we talked with Ingo about the possibilities here, one is use
plist always and prove that it doesn't add measurable overhead over current
FIFO (when only SCHED_OTHER is involved), the other possibility would be
to start using FIFOs as before, but when the first non-SCHED_OTHER thread
decides to wait on the futex, switch it to plist wakeup mode (convert the
FIFO into a plist) and from that point on just use plist wakeups on it.
Jakub
Yes, I agree with all, that was what I have in mind too.
And by registering all SHED_OTHER threads with the same priority MAX_RT_PRIO, we
have exactly this behavior, I think :
* the plist, when used with only one priority, behaves exactly as a simple list
(plist is a double simple list: first list contains all nodes sorted by
priority, second list contains the first element of each "priority-based"
sub-list of the first one):
Thus, when only one priority is used, there is nothing to sort at each add
operation. (first list contains all elements by FIFO order, second list contains
one element). So the overhead in this case is minimal and quasi-null.
* Now, if a SCHED_FIFO thread comes to the plist, its priority will be lower
than MAX_RT_PRIO and it will be "naturally" sorted by prio order, and thus, it
will be woken before the SCHED_OTHER threads because of its higher priority
(i.e. lower prio-value in the kernel).
But there can be a performance impact when several processes use different
futexes which have the same hash key.
In fact, the plist contains all waiters _of_all_futexes_ having the same hash
key, not only the waiters of a given futex. This can be more a problem, if one
process uses SCHED_FIFO threads, and the other SCHED_OTHER: the first will
penalize the second... but even in this case, as the second has a lower
priority, this can be acceptable, I think ?
Jakub Jelinek a ?crit :
> On Wed, Jan 10, 2007 at 12:47:21PM +0100, Pierre Peiffer wrote:
>> So, yes it (logically) has a cost, depending of the number of different
>> priorities used, so it's specially measurable with real-time threads.
>> With SCHED_OTHER, I suppose that the priorities are not be very distributed.
>>
>> May be, supposing it makes sense to respect the priority order only for
>> real-time pthreads, I can register all SCHED_OTHER threads to the same
>> MAX_RT_PRIO priotity ?
>> Or do you think this must be set behind a CONFIG* option ?
>> (Or finally not interesting enough for mainline ?)
>
> As soon as there is at least one non-SCHED_OTHER thread among the waiters,
> there is no question about whether plist should be used or not, that's
> a correctness issue and if we want to conform to POSIX, we have to use that.
>
> I guess Ulrich's question was mainly about performance differences
> with/without plist wakeup when all threads are SCHED_OTHER. I'd say for
> that a pure pthread_mutex_{lock,unlock} benchmark or even just a program
> which uses futex FUTEX_WAIT/FUTEX_WAKE in a bunch of threads would be
> better.
>
> In the past we talked with Ingo about the possibilities here, one is use
> plist always and prove that it doesn't add measurable overhead over current
> FIFO (when only SCHED_OTHER is involved), the other possibility would be
> to start using FIFOs as before, but when the first non-SCHED_OTHER thread
> decides to wait on the futex, switch it to plist wakeup mode (convert the
> FIFO into a plist) and from that point on just use plist wakeups on it.
>
> Jakub
>
--
Pierre Peiffer
On Tue, 2007-01-09 at 17:16 +0100, Pierre Peiffer wrote:
> @@ -1358,7 +1366,7 @@ static int futex_unlock_pi(u32 __user *u
> struct futex_hash_bucket *hb;
> struct futex_q *this, *next;
> u32 uval;
> - struct list_head *head;
> + struct plist_head *head;
> union futex_key key;
> int ret, attempt = 0;
>
> @@ -1409,7 +1417,7 @@ retry_locked:
> */
> head = &hb->chain;
>
> - list_for_each_entry_safe(this, next, head, list) {
> + plist_for_each_entry_safe(this, next, head, list) {
> if (!match_futex (&this->key, &key))
> continue;
> ret = wake_futex_pi(uaddr, uval, this);
Is this really necessary? The rtmutex will priority sort the waiters
when you enable priority inheritance. Inside the wake_futex_pi() it
actually just pulls the new owner off another plist inside the the
rtmutex structure.
Daniel
Daniel Walker a ?crit :
> On Tue, 2007-01-09 at 17:16 +0100, Pierre Peiffer wrote:
>> @@ -1358,7 +1366,7 @@ static int futex_unlock_pi(u32 __user *u
>> struct futex_hash_bucket *hb;
>> struct futex_q *this, *next;
>> u32 uval;
>> - struct list_head *head;
>> + struct plist_head *head;
>> union futex_key key;
>> int ret, attempt = 0;
>>
>> @@ -1409,7 +1417,7 @@ retry_locked:
>> */
>> head = &hb->chain;
>>
>> - list_for_each_entry_safe(this, next, head, list) {
>> + plist_for_each_entry_safe(this, next, head, list) {
>> if (!match_futex (&this->key, &key))
>> continue;
>> ret = wake_futex_pi(uaddr, uval, this);
>
>
> Is this really necessary? The rtmutex will priority sort the waiters
> when you enable priority inheritance. Inside the wake_futex_pi() it
> actually just pulls the new owner off another plist inside the the
> rtmutex structure.
Yes. ... necessary for non-PI-futex (ie "normal" futex)...
As the hash_bucket_list is used and common for both futex and PI-futex, yes, in
case of PI_futex, the task is queued two times in two plist.
--
Pierre
On Wed, 2007-01-10 at 17:29 +0100, Pierre Peiffer wrote:
> >
> > Is this really necessary? The rtmutex will priority sort the waiters
> > when you enable priority inheritance. Inside the wake_futex_pi() it
> > actually just pulls the new owner off another plist inside the the
> > rtmutex structure.
>
> Yes. ... necessary for non-PI-futex (ie "normal" futex)...
>
> As the hash_bucket_list is used and common for both futex and PI-futex, yes, in
> case of PI_futex, the task is queued two times in two plist.
You could make them distinct .. Also Did you consider merging the PI
path and the non-PI code path? then you could modify the rtmutex to
toggle PI depending .
Daniel
Pierre Peiffer wrote:
> But there can be a performance impact when several processes use
> different futexes which have the same hash key.
> In fact, the plist contains all waiters _of_all_futexes_ having the same
> hash key, not only the waiters of a given futex. This can be more a
> problem,
s/can be/is/
There are systems with thousands of active futexes, maybe tens of
thousands. Not only is hash collision likely, it's also a matter of
using and administering the plist. We have to make futexes less
connected, not more. Now I definitely want to see real world tests first.
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
Pierre Peiffer wrote:
> Here are the average latencies after 5000 measures.
> [...]
Use something more realistic. I suggest using the Volano benchmark
under your favorite JVM. I found it to be quite representative and you
get a nice number you can show.
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖