Adds a function call to sched_setscheduler() to update the waiter
position of a task if it happens to be waiting on a futex. This
ensures that the kernel level waiter ordering is correctly maintained
based on the changed priority of the task.
I fixed the locking issue noticed by Thomas Gleixner.
This doesn't address userspace at all, only the kernel level wakeups and
kernel level ordering.
The additional locking added to the futex_wait() function has no visible speed
impact, and only affects waiters which actually enter the kernel.
Signed-off-by: Daniel Walker <[email protected]>
---
include/linux/sched.h | 4 ++++
kernel/futex.c | 41 +++++++++++++++++++++++++++++++++++++++++
kernel/sched.c | 1 +
3 files changed, 46 insertions(+)
Index: linux-2.6.25/include/linux/sched.h
===================================================================
--- linux-2.6.25.orig/include/linux/sched.h
+++ linux-2.6.25/include/linux/sched.h
@@ -1027,6 +1027,7 @@ struct sched_rt_entity {
enum lock_waiter_type {
MUTEX_WAITER = 1,
RT_MUTEX_WAITER,
+ FUTEX_WAITER
};
struct lock_waiter_state {
@@ -1034,6 +1035,7 @@ struct lock_waiter_state {
union {
struct mutex_waiter *mutex_blocked_on;
struct rt_mutex_waiter *rt_blocked_on;
+ union futex_key *futex_blocked_on;
};
};
@@ -1675,6 +1677,8 @@ static inline int rt_mutex_getprio(struc
# define rt_mutex_adjust_pi(p) do { } while (0)
#endif
+extern void futex_adjust_waiters(struct task_struct *p);
+
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);
extern int task_nice(const struct task_struct *p);
Index: linux-2.6.25/kernel/futex.c
===================================================================
--- linux-2.6.25.orig/kernel/futex.c
+++ linux-2.6.25/kernel/futex.c
@@ -327,6 +327,38 @@ static int get_futex_value_locked(u32 *d
return ret ? -EFAULT : 0;
}
+void futex_adjust_waiters(struct task_struct *p)
+{
+
+ if (p->blocked_on) {
+ struct futex_hash_bucket *hb;
+ struct futex_q *q, *next;
+ union futex_key key;
+
+ spin_lock_irq(&p->pi_lock);
+ if (p->blocked_on && p->blocked_on->lock_type == FUTEX_WAITER) {
+ key = *p->blocked_on->futex_blocked_on;
+ spin_unlock_irq(&p->pi_lock);
+ } else {
+ spin_unlock_irq(&p->pi_lock);
+ return;
+ }
+
+ hb = hash_futex(&key);
+ spin_lock(&hb->lock);
+ plist_for_each_entry_safe(q, next, &hb->chain, list) {
+ if (match_futex(&q->key, &key) && q->task == p) {
+ int prio = min(p->normal_prio, MAX_RT_PRIO);
+ plist_del(&q->list, &hb->chain);
+ plist_node_init(&q->list, prio);
+ plist_add(&q->list, &hb->chain);
+ break;
+ }
+ }
+ spin_unlock(&hb->lock);
+ }
+}
+
/*
* Fault handling.
* if fshared is non NULL, current->mm->mmap_sem is already held
@@ -1159,6 +1191,8 @@ static int futex_wait(u32 __user *uaddr,
DECLARE_WAITQUEUE(wait, curr);
struct futex_hash_bucket *hb;
struct futex_q q;
+ struct lock_waiter_state blocked_on = {
+ .lock_type = FUTEX_WAITER, { .futex_blocked_on = &q.key } };
u32 uval;
int ret;
struct hrtimer_sleeper t;
@@ -1176,6 +1210,8 @@ static int futex_wait(u32 __user *uaddr,
if (unlikely(ret != 0))
goto out_release_sem;
+ set_blocked_on(current, &blocked_on);
+
hb = queue_lock(&q);
/*
@@ -1203,6 +1239,8 @@ static int futex_wait(u32 __user *uaddr,
if (unlikely(ret)) {
queue_unlock(&q, hb);
+ set_blocked_on(current, NULL);
+
/*
* If we would have faulted, release mmap_sem, fault it in and
* start all over again.
@@ -1276,6 +1314,8 @@ static int futex_wait(u32 __user *uaddr,
}
__set_current_state(TASK_RUNNING);
+ set_blocked_on(current, NULL);
+
/*
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
@@ -1310,6 +1350,7 @@ static int futex_wait(u32 __user *uaddr,
out_unlock_release_sem:
queue_unlock(&q, hb);
+ set_blocked_on(current, NULL);
out_release_sem:
futex_unlock_mm(fshared);
Index: linux-2.6.25/kernel/sched.c
===================================================================
--- linux-2.6.25.orig/kernel/sched.c
+++ linux-2.6.25/kernel/sched.c
@@ -5209,6 +5209,7 @@ recheck:
spin_unlock_irqrestore(&p->pi_lock, flags);
rt_mutex_adjust_pi(p);
+ futex_adjust_waiters(p);
return 0;
}
--
On Wed, 2008-06-11 at 13:49 -0700, Daniel Walker wrote:
> plain text document attachment (blocked_on-futex.patch)
I thought everybody and his dog didn't care for this?
> +void futex_adjust_waiters(struct task_struct *p)
> +{
> +
Seemingly extra whitespace ^
> + if (p->blocked_on) {
> + struct futex_hash_bucket *hb;
> + struct futex_q *q, *next;
> + union futex_key key;
> +
> + spin_lock_irq(&p->pi_lock);
> + if (p->blocked_on && p->blocked_on->lock_type == FUTEX_WAITER) {
> + key = *p->blocked_on->futex_blocked_on;
> + spin_unlock_irq(&p->pi_lock);
> + } else {
> + spin_unlock_irq(&p->pi_lock);
> + return;
> + }
> +
> + hb = hash_futex(&key);
> + spin_lock(&hb->lock);
> + plist_for_each_entry_safe(q, next, &hb->chain, list) {
> + if (match_futex(&q->key, &key) && q->task == p) {
> + int prio = min(p->normal_prio, MAX_RT_PRIO);
> + plist_del(&q->list, &hb->chain);
> + plist_node_init(&q->list, prio);
> + plist_add(&q->list, &hb->chain);
> + break;
> + }
> + }
> + spin_unlock(&hb->lock);
> + }
> +}
Also, if you write it like:
if (!p->blocked_on)
	return;

do_other_stuff

you lose one nesting level - which imho looks better.
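For illustration, the flattened form of the function from the patch would
look like this (a sketch only, not compile-tested, reusing the helpers and
locking from the patch above):

void futex_adjust_waiters(struct task_struct *p)
{
	struct futex_hash_bucket *hb;
	struct futex_q *q, *next;
	union futex_key key;

	if (!p->blocked_on)
		return;

	spin_lock_irq(&p->pi_lock);
	if (!p->blocked_on || p->blocked_on->lock_type != FUTEX_WAITER) {
		spin_unlock_irq(&p->pi_lock);
		return;
	}
	/* Snapshot the key under pi_lock, then drop it before taking
	 * the hash bucket lock. */
	key = *p->blocked_on->futex_blocked_on;
	spin_unlock_irq(&p->pi_lock);

	hb = hash_futex(&key);
	spin_lock(&hb->lock);
	plist_for_each_entry_safe(q, next, &hb->chain, list) {
		if (match_futex(&q->key, &key) && q->task == p) {
			/* Re-queue the waiter at its new priority. */
			int prio = min(p->normal_prio, MAX_RT_PRIO);

			plist_del(&q->list, &hb->chain);
			plist_node_init(&q->list, prio);
			plist_add(&q->list, &hb->chain);
			break;
		}
	}
	spin_unlock(&hb->lock);
}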
On Wed, 11 Jun 2008, Daniel Walker wrote:
> Adds a function call to sched_setscheduler() to update the waiter
> position of a task if it happens to be waiting on a futex. This
> ensures that the kernel level waiter ordering is correctly maintained
> based on the changed priority of the task.
>
> I fixed the locking issue noticed by Thomas Gleixner.
>
> This doesn't address userspace at all, only the kernel level wakeups and
> kernel level ordering.
>
> The additional locking added to the futex_wait() function has no visible speed
> impact, and only affects waiters which actually enter the kernel.
The additional locking is just broken and you did not even bother to
test your changes with lockdep.
Aside from this, these patches still add 100 lines of code to achieve
nothing - as discussed when you previously submitted your changes.
Please stop wasting everyone's time with that.
Thanks,
tglx
On Thu, 2008-06-12 at 08:07 +0200, Peter Zijlstra wrote:
> On Wed, 2008-06-11 at 13:49 -0700, Daniel Walker wrote:
> > plain text document attachment (blocked_on-futex.patch)
>
> I thought everybody and his dog didn't care for this?
Just Thomas ..
> Also, if you write it like:
>
> if (!p->blocked_on)
> 	return;
>
> do_other_stuff
>
> you lose one nesting level - which imho looks better.
Yeah ..
Daniel
On Thu, 2008-06-12 at 10:56 +0200, Thomas Gleixner wrote:
> On Wed, 11 Jun 2008, Daniel Walker wrote:
> > Adds a function call to sched_setscheduler() to update the waiter
> > position of a task if it happens to be waiting on a futex. This
> > ensures that the kernel level waiter ordering is correctly maintained
> > based on the changed priority of the task.
> >
> > I fixed the locking issue noticed by Thomas Gleixner.
> >
> > This doesn't address userspace at all, only the kernel level wakeups and
> > kernel level ordering.
> >
> > The additional locking added to the futex_wait() function has no visible speed
> > impact, and only affects waiters which actually enter the kernel.
>
> The additional locking is just broken and you did not even bother to
> test your changes with lockdep.
I ran it with lockdep enabled, and I didn't get any warnings..
> Aside from this, these patches still add 100 lines of code to achieve
> nothing - as discussed when you previously submitted your changes.
>
> Please stop wasting everyone's time with that.
It achieves correct ordering of the futex waiters inside the kernel,
that is in fact _something_ ..
Daniel
On Thu, 12 Jun 2008, Daniel Walker wrote:
> On Thu, 2008-06-12 at 10:56 +0200, Thomas Gleixner wrote:
> > Please stop wasting everyone's time with that.
>
> It achieves correct ordering of the futex waiters inside the kernel,
> that is in fact _something_ ..
Yeah, just something _useless_
Thanks,
tglx
On Thu, 2008-06-12 at 15:33 +0200, Thomas Gleixner wrote:
> On Thu, 12 Jun 2008, Daniel Walker wrote:
> > On Thu, 2008-06-12 at 10:56 +0200, Thomas Gleixner wrote:
> > > Please stop wasting everyone's time with that.
> >
> > It achieves correct ordering of the futex waiters inside the kernel,
> > that is in fact _something_ ..
>
> Yeah, just something _useless_
Just because you don't use it, doesn't make it useless .. At least
there's enough people asking for this that it warrants me writing it..
Daniel
On Thu, 2008-06-12 at 06:22 -0700, Daniel Walker wrote:
> On Thu, 2008-06-12 at 08:07 +0200, Peter Zijlstra wrote:
> > On Wed, 2008-06-11 at 13:49 -0700, Daniel Walker wrote:
> > > plain text document attachment (blocked_on-futex.patch)
> >
> > I thought everybody and his dog didn't care for this?
>
> Just Thomas ..
And me, I really don't see the point in this, use PI futexes already if
you care for wakeup ordering.
On Thu, 2008-06-12 at 15:57 +0200, Peter Zijlstra wrote:
> On Thu, 2008-06-12 at 06:22 -0700, Daniel Walker wrote:
> > On Thu, 2008-06-12 at 08:07 +0200, Peter Zijlstra wrote:
> > > On Wed, 2008-06-11 at 13:49 -0700, Daniel Walker wrote:
> > > > plain text document attachment (blocked_on-futex.patch)
> > >
> > > I thought everybody and his dog didn't care for this?
> >
> > Just Thomas ..
>
> And me, I really don't see the point in this, use PI futexes already if
> you care for wakeup ordering.
Does it make sense that we allow priority-based plist ordering, but we
don't keep the priorities up to date? That doesn't make sense to me.
Daniel
On Thu, 12 Jun 2008, Daniel Walker wrote:
> On Thu, 2008-06-12 at 15:33 +0200, Thomas Gleixner wrote:
> > On Thu, 12 Jun 2008, Daniel Walker wrote:
> > > On Thu, 2008-06-12 at 10:56 +0200, Thomas Gleixner wrote:
> > > > Please stop wasting everyone's time with that.
> > >
> > > It achieves correct ordering of the futex waiters inside the kernel,
> > > that is in fact _something_ ..
> >
> > Yeah, just something _useless_
>
> Just because you don't use it, doesn't make it useless .. At least
> there's enough people asking for this that it warrants me writing it..
Which is not really a good technical reason to merge such a
patch. Your handwaving about "enough people" is just irrelevant. Are
you going to implement a root hole as well when enough people ask for
it ?
But there is also a Real Good technical reason why these patches are
going nowhere else than into /dev/null:
your approach of hijacking blocked_on is fundamentally wrong as it
mixes kernel-internal state with user-space state.
It will break in preempt-rt at the point when this state is set and
the code blocks on a spinlock, which uses the rtmutex based sleeping
spinlock implementation and overwrites blocked_on.
If it can acquire the spinlock in the fast path without modifying
blocked_on it will cause trouble with the priority boosting chain
when a higher priority task becomes blocked on the spinlock.
If there were a really good technical reason to fix this priority
ordering it could be done with less than 20 lines of code without
extra locking and wreckage waiting left and right, but I have not yet
seen a single convincing technical argument or a relevant use case
which might justify that.
Thanks,
tglx
On Thu, 2008-06-12 at 17:24 +0200, Thomas Gleixner wrote:
> > Just because you don't use it, doesn't make it useless .. At least
> > there's enough people asking for this that it warrants me writing it..
>
> Which is not really a good technical reason to merge such a
> patch. Your handwaving about "enough people" is just irrelevant. Are
> you going to implement a root hole as well when enough people ask for
> it ?
People asking for something is a very good reason to merge "features" ..
You can like or dislike implementations, but that doesn't reflect on
the nature of the feature.
> But there is also a Real Good technical reason why these patches are
> going nowhere else than into /dev/null:
>
> > your approach of hijacking blocked_on is fundamentally wrong as it
> > mixes kernel-internal state with user-space state.
It mixes kernel state with kernel state, not to mention each state is
isolated from the others.
> It will break in preempt-rt at the point when this state is set and
> the code blocks on a spinlock, which uses the rtmutex based sleeping
> spinlock implementation and overwrites blocked_on.
That's an interesting point, however "preempt-rt" is out of tree, so it's
certainly not going to be a reason to reject mainline changes.
> If there were a really good technical reason to fix this priority
> ordering it could be done with less than 20 lines of code without
> extra locking and wreckage waiting left and right, but I have not yet
> seen a single convincing technical argument or a relevant use case
> which might justify that.
The technical reasons for including this are the same technical reasons
why we want the waiters queued in priority order .. It's a requirement
of POSIX, where many calls need the ordering and ultimately feed into
the futex interface. So we have every reason to do the ordering
correctly..
If you have a 20 line fix for this then great tell me what it is..
Daniel
On Thu, 12 Jun 2008, Daniel Walker wrote:
>
> On Thu, 2008-06-12 at 17:24 +0200, Thomas Gleixner wrote:
> > > Just because you don't use it, doesn't make it useless .. At least
> > > there's enough people asking for this that it warrants me writing it..
> >
> > Which is not really a good technical reason to merge such a
> > patch. Your handwaving about "enough people" is just irrelevant. Are
> > you going to implement a root hole as well when enough people ask for
> > it ?
>
> People asking for something is a very good reason to merge "features" ..
> You can like or dislike implementations, but that doesn't reflect on
> the nature of the feature.
Again, I have not yet seen a single request aside from yours. Who is
asking for that and what is the rationale for their request?
> > But there is also a Real Good technical reason why these patches are
> > going nowhere else than into /dev/null:
> >
> > your approach of hijacking blocked_on is fundamentally wrong as it
> > mixes kernel-internal state with user-space state.
>
> It mixes kernel state with kernel state, not to mention each state is
> isolated from the others.
Wrong. The task is blocked on the user-space futex and not on an
in-kernel concurrency control. What's the isolation, the enum or the
union? LOL.
The blocked_on mechanism is restricted to kernel-internal concurrency
controls, and while mutex and rt_mutex could share a common storage,
the user-space futex cannot. That would preclude ever using a
(rt_)mutex in the futex code.
The reason is simple: A task can only be blocked on one concurrency
control, but it can be blocked on a user-space futex and later on an
in-kernel concurrency control.
Just get it. These are different concepts and do not mix at all.
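To make the conflict concrete, here is a toy user-space model of the single
blocked_on slot (hypothetical standalone types, not the kernel's):

#include <stdio.h>

enum lock_waiter_type { MUTEX_WAITER = 1, RT_MUTEX_WAITER, FUTEX_WAITER };

struct lock_waiter_state {
	enum lock_waiter_type lock_type;
};

struct task {
	struct lock_waiter_state *blocked_on;	/* one slot only */
};

int main(void)
{
	struct lock_waiter_state futex_state = { FUTEX_WAITER };
	struct lock_waiter_state rtmutex_state = { RT_MUTEX_WAITER };
	struct task t = { NULL };

	t.blocked_on = &futex_state;	/* futex_wait() records the futex */
	t.blocked_on = &rtmutex_state;	/* blocking on an in-kernel lock
					   (e.g. an rtmutex-based sleeping
					   spinlock in preempt-rt) clobbers it */

	/* The futex record is lost; a priority change can no longer
	   find and re-queue the futex waiter. */
	printf("blocked_on lock_type: %d\n", t.blocked_on->lock_type);
	return 0;
}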
> > It will break in preempt-rt at the point when this state is set and
> > the code blocks on a spinlock, which uses the rtmutex based sleeping
> > spinlock implementation and overwrites blocked_on.
>
> That's an interesting point, however "preempt-rt" is out of tree, so it's
> certainly not going to be a reason to reject mainline changes.
It is a perfectly good reason, simply because we know that rt is on
the way to mainline and it would be pretty stupid to merge a change
of questionable technical merit which needs to be reverted and
overhauled in the foreseeable future. Other not-yet-mainline features
and changes coordinate as well to avoid wreckage like that.
Also, putting on my preempt-rt hat, I just wonder about your brash
impertinence in expecting that others clean up the mess you create.
> > If there were a really good technical reason to fix this priority
> > ordering it could be done with less than 20 lines of code without
> > extra locking and wreckage waiting left and right, but I have not yet
> > seen a single convincing technical argument or a relevant use case
> > which might justify that.
>
> The technical reasons for including this are the same technical reasons
> why we want the waiters queued in priority order .. It's a requirement
> of POSIX, where many calls need the ordering and ultimately feed into
> the futex interface. So we have every reason to do the ordering
> correctly..
Making a halfway correct thing a bit more correct is not going to
reach full correctness.
Also your interpretation of the POSIX requirement is very
questionable:
"If there are threads blocked on the mutex object referenced by mutex
when pthread_mutex_unlock() is called, resulting in the mutex
becoming available, the scheduling policy shall determine which
thread shall acquire the mutex."
So there is no explicit guarantee requirement AFAICT. "... the
scheduling policy shall determine ..." is a rather vague term which
has enough room for interpretation.
My interpretation is, whoever has/gets the CPU will get the mutex. So
why should I care about the priority order correctness, which is only
incorrect in a corner case. Especially in a corner case which is not a
hot path in any sane application:
It requires thread A to block on a futex together with other threads
and thread B to adjust the priority of thread A while it is
blocked.
This is definitely _NOT_ the common case in any application I'm aware
of.
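For reference, that corner case looks roughly like this in user space (a
minimal sketch; error handling omitted, and SCHED_FIFO requires the
appropriate privileges):

#include <pthread.h>
#include <sched.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void *thread_a(void *arg)
{
	pthread_mutex_lock(&lock);	/* thread A blocks here: futex_wait() */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t a;
	struct sched_param sp = { .sched_priority = 50 };

	pthread_mutex_lock(&lock);	/* hold the lock so A must wait */
	pthread_create(&a, NULL, thread_a, NULL);
	sleep(1);			/* let A reach futex_wait() */

	/* Thread B raises A's priority while A is blocked; without the
	   patch, A keeps its old position in the kernel waiter plist. */
	pthread_setschedparam(a, SCHED_FIFO, &sp);

	pthread_mutex_unlock(&lock);
	pthread_join(a, NULL);
	return 0;
}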
If you think that this actually matters, then please stop your
handwaving and provide proof. I'm really keen to add the absurdities
of those use cases to my Ugly Code Museum.
Furthermore the rationale section says explicitly:
"The mutex functions and the particular default settings of the mutex
attributes have been motivated by the desire to not preclude fast,
inlined implementations of mutex locking and unlocking."
This is another confirmation that the default pthread_mutex
implementation aims for performance and not for correctness.
We have a PI implementation which provides full correctness, so there
is no need to add artificial correctness to something which is by
default incorrect.
> If you have a 20 line fix for this then great tell me what it is..
I might write the patch once I can see a real world use case where
this actually matters.
Thanks,
tglx
On Thu, 2008-06-12 at 21:55 +0200, Thomas Gleixner wrote:
> Also your interpretation of the POSIX requirement is very
> questionable:
>
> "If there are threads blocked on the mutex object referenced by mutex
> when pthread_mutex_unlock() is called, resulting in the mutex
> becoming available, the scheduling policy shall determine which
> thread shall acquire the mutex."
The key is "scheduling policy" .. When the mutex is unlocked, the next
task to run is the same as if the scheduler were selecting tasks from the
list of blocked tasks .. For Linux, that means the highest-priority
task should be selected.. So it's no more acceptable for the scheduler
to priority-invert some tasks than it is for the futex to do it.
Daniel
On Thu, 12 Jun 2008, Daniel Walker wrote:
>
> On Thu, 2008-06-12 at 21:55 +0200, Thomas Gleixner wrote:
> > Also your interpretation of the POSIX requirement is very
> > questionable:
> >
> > "If there are threads blocked on the mutex object referenced by mutex
> > when pthread_mutex_unlock() is called, resulting in the mutex
> > becoming available, the scheduling policy shall determine which
> > thread shall acquire the mutex."
>
> The key is "scheduling policy" .. When the mutex is unlocked, the next
> task to run is the same as if the scheduler were selecting tasks from the
> list of blocked tasks .. For Linux, that means the highest-priority
> task should be selected.. So it's no more acceptable for the scheduler
> to priority-invert some tasks than it is for the futex to do it.
Sigh, when will you actually grasp that the default futex
implementation does not and cannot guarantee that at all, and therefore
your "correctness" patch is as important as a bag of rice which
toppled over in China?
Provide answers to the real questions I asked more than once:
What's the real-world problem? Who cares about that - except you?
Until you are actually able to come up with those answers, please
direct your replies to /dev/null. That saves me from having to touch
my .procmailrc.
Thanks,
tglx
On Fri, 2008-06-13 at 00:43 +0200, Thomas Gleixner wrote:
> On Thu, 12 Jun 2008, Daniel Walker wrote:
>
> >
> > On Thu, 2008-06-12 at 21:55 +0200, Thomas Gleixner wrote:
> > > Also your interpretation of the POSIX requirement is very
> > > questionable:
> > >
> > > "If there are threads blocked on the mutex object referenced by mutex
> > > when pthread_mutex_unlock() is called, resulting in the mutex
> > > becoming available, the scheduling policy shall determine which
> > > thread shall acquire the mutex."
> >
> > The key is "scheduling policy" .. When the mutex is unlocked, the next
> > task to run is the same as if the scheduler were selecting tasks from the
> > list of blocked tasks .. For Linux, that means the highest-priority
> > task should be selected.. So it's no more acceptable for the scheduler
> > to priority-invert some tasks than it is for the futex to do it.
>
> Sigh, when will you actually grasp that the default futex
> implementation does not and cannot guarantee that at all, and therefore
> your "correctness" patch is as important as a bag of rice which
> toppled over in China?
Well, the last email I got from Arjan said this,
".. Don't look at the release path... look at the acquire path.
If a thread sees the futex is free, it'll take it, without even going
to the kernel at all."
And yes, I understand that fully.
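Arjan's acquire path, sketched (the classic futex fast path in simplified
form, not glibc's actual implementation):

#include <stdatomic.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static atomic_int futex_word;	/* 0 = unlocked, 1 = locked */

static void fastpath_lock(void)
{
	int expected = 0;

	/* Fast path: an uncontended acquire happens entirely in user
	   space. The kernel never sees this waiter, so no kernel-side
	   waiter ordering can apply to it. */
	while (!atomic_compare_exchange_strong(&futex_word, &expected, 1)) {
		/* Slow path: sleep in the kernel while the word is 1. */
		syscall(SYS_futex, &futex_word, FUTEX_WAIT, 1, NULL, NULL, 0);
		expected = 0;
	}
}

static void fastpath_unlock(void)
{
	atomic_store(&futex_word, 0);
	syscall(SYS_futex, &futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
}

int main(void)
{
	fastpath_lock();	/* uncontended: no syscall at all */
	fastpath_unlock();
	return 0;
}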
> Provide answers to the real questions I asked more than once:
>
> What's the real-world problem? Who cares about that - except you?
Any application which starts a thread and later changes its priority
can observe the mis-ordering.. That's pretty common..
Daniel
On Thu, 12 Jun 2008, Daniel Walker wrote:
> On Fri, 2008-06-13 at 00:43 +0200, Thomas Gleixner wrote:
> > On Thu, 12 Jun 2008, Daniel Walker wrote:
> >
> > >
> > > On Thu, 2008-06-12 at 21:55 +0200, Thomas Gleixner wrote:
> > > > Also your interpretation of the POSIX requirement is very
> > > > questionable:
> > > >
> > > > "If there are threads blocked on the mutex object referenced by mutex
> > > > when pthread_mutex_unlock() is called, resulting in the mutex
> > > > becoming available, the scheduling policy shall determine which
> > > > thread shall acquire the mutex."
> > >
> > > The key is "scheduling policy" .. When the mutex is unlocked, the next
> > > task to run is the same as if the scheduler were selecting tasks from the
> > > list of blocked tasks .. For Linux, that means the highest-priority
> > > task should be selected.. So it's no more acceptable for the scheduler
> > > to priority-invert some tasks than it is for the futex to do it.
> >
> > Sigh, when will you actually grasp that the default futex
> > implementation does not and cannot guarantee that at all, and therefore
> > your "correctness" patch is as important as a bag of rice which
> > toppled over in China?
>
> Well, the last email I got from Arjan said this,
>
> ".. Don't look at the release path... look at the acquire path.
> If a thread sees the futex is free, it'll take it, without even going
> to the kernel at all."
>
> And yes, I understand that fully.
Great. Case closed, nothing to argue about.
Thanks,
tglx