Adds a function call in sched_setscheduler() to update the
waiter position of a task if it happens to be waiting on a futex. This
ensures that the kernel level waiter ordering is correctly maintained
based on the changed priority of the task.
I fixed the locking issue noticed by Thomas Gleixner.
This doesn't address userspace at all, only the kernel level wakeups and
kernel level ordering.
The additional locking added to the futex_wait function has no visible speed
impact, and only affects waiters which actually enter the kernel.
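For illustration only (not part of the patch), a minimal userspace sketch of
the case this targets: two threads block in FUTEX_WAIT, one of them is then
boosted to SCHED_FIFO through pthread_setschedparam() (which ends up in
sched_setscheduler()), and a single FUTEX_WAKE should then pick the boosted
waiter first. It assumes privileges for SCHED_FIFO and uses sleep() only to
get a deterministic queueing order:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <pthread.h>
#include <sched.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>

static int futex_word;	/* waiters block while this is 0 */

static long futex(int *uaddr, int op, int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *name)
{
	/* Queue in the kernel until the word changes and we are woken. */
	futex(&futex_word, FUTEX_WAIT, 0);
	printf("%s woke up\n", (char *)name);
	return NULL;
}

int main(void)
{
	struct sched_param sp = { .sched_priority = 50 };
	pthread_t a, b;
	int ret;

	pthread_create(&a, NULL, waiter, "waiter A");
	pthread_create(&b, NULL, waiter, "waiter B");
	sleep(1);				/* let both enter FUTEX_WAIT */

	/* Boost B after it is already queued; with this patch its position
	 * in the futex hash bucket plist is updated at this point. */
	ret = pthread_setschedparam(b, SCHED_FIFO, &sp);
	if (ret)
		fprintf(stderr, "pthread_setschedparam: %s\n", strerror(ret));

	futex_word = 1;
	futex(&futex_word, FUTEX_WAKE, 1);	/* should now pick B first */
	sleep(1);
	futex(&futex_word, FUTEX_WAKE, 1);	/* wake the remaining waiter */

	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}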
Signed-off-by: Daniel Walker <[email protected]>
---
include/linux/sched.h | 10 ++++++++--
kernel/fork.c | 3 ++-
kernel/futex.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
kernel/sched.c | 1 +
4 files changed, 56 insertions(+), 3 deletions(-)
Index: linux-2.6.25/include/linux/sched.h
===================================================================
--- linux-2.6.25.orig/include/linux/sched.h
+++ linux-2.6.25/include/linux/sched.h
@@ -1026,6 +1026,7 @@ struct sched_rt_entity {
enum lock_waiter_type {
MUTEX_WAITER = 1,
RT_MUTEX_WAITER,
+ FUTEX_WAITER
};
struct lock_waiter_state {
@@ -1033,6 +1034,7 @@ struct lock_waiter_state {
union {
struct mutex_waiter *mutex_blocked_on;
struct rt_mutex_waiter *rt_blocked_on;
+ union futex_key *futex_blocked_on;
};
struct lock_waiter_state *next;
};
@@ -1222,7 +1224,8 @@ struct task_struct {
struct plist_head pi_waiters;
#endif
-#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_RT_MUTEXES)
+#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_RT_MUTEXES) \
+ || defined(CONFIG_FUTEX)
/*
* Deadlock detection and priority inheritance handling,
* and any other out of line mutex operations
@@ -1321,7 +1324,8 @@ struct task_struct {
#endif
};
-#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_RT_MUTEXES)
+#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_RT_MUTEXES) \
+ || defined(CONFIG_FUTEX)
/*
* set_blocked_on - Set the blocked on field in the task struct.
*/
@@ -1680,6 +1684,8 @@ static inline int rt_mutex_getprio(struc
# define rt_mutex_adjust_pi(p) do { } while (0)
#endif
+extern void futex_adjust_waiters(struct task_struct *p);
+
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);
extern int task_nice(const struct task_struct *p);
Index: linux-2.6.25/kernel/fork.c
===================================================================
--- linux-2.6.25.orig/kernel/fork.c
+++ linux-2.6.25/kernel/fork.c
@@ -1027,7 +1027,8 @@ static struct task_struct *copy_process(
p->lockdep_recursion = 0;
#endif
-#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_RT_MUTEXES)
+#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_RT_MUTEXES) \
+ || defined(CONFIG_FUTEX)
p->blocked_on = NULL; /* not blocked yet */
#endif
Index: linux-2.6.25/kernel/futex.c
===================================================================
--- linux-2.6.25.orig/kernel/futex.c
+++ linux-2.6.25/kernel/futex.c
@@ -328,6 +328,42 @@ static int get_futex_value_locked(u32 *d
}
/*
+ * Used to update a waiter's priority in the plist structure.
+ */
+void futex_adjust_waiters(struct task_struct *p)
+{
+ struct futex_hash_bucket *hb;
+ struct futex_q *q, *next;
+ union futex_key key;
+
+ if (!p->blocked_on)
+ return;
+
+ spin_lock_irq(&p->pi_lock);
+ if (p->blocked_on && p->blocked_on->lock_type == FUTEX_WAITER) {
+ key = *p->blocked_on->futex_blocked_on;
+ spin_unlock_irq(&p->pi_lock);
+ } else {
+ spin_unlock_irq(&p->pi_lock);
+ return;
+ }
+
+ hb = hash_futex(&key);
+ spin_lock(&hb->lock);
+ plist_for_each_entry_safe(q, next, &hb->chain, list) {
+ if (match_futex(&q->key, &key) && q->task == p) {
+ int prio = min(p->normal_prio, MAX_RT_PRIO);
+
+ plist_del(&q->list, &hb->chain);
+ plist_node_init(&q->list, prio);
+ plist_add(&q->list, &hb->chain);
+ break;
+ }
+ }
+ spin_unlock(&hb->lock);
+}
+
+/*
* Fault handling.
* if fshared is non NULL, current->mm->mmap_sem is already held
*/
@@ -1160,6 +1196,8 @@ static int futex_wait(u32 __user *uaddr,
DECLARE_WAITQUEUE(wait, curr);
struct futex_hash_bucket *hb;
struct futex_q q;
+ struct lock_waiter_state blocked_on = { .lock_type = FUTEX_WAITER,
+ { .futex_blocked_on = &q.key }, .next = NULL};
u32 uval;
int ret;
struct hrtimer_sleeper t;
@@ -1177,6 +1215,8 @@ static int futex_wait(u32 __user *uaddr,
if (unlikely(ret != 0))
goto out_release_sem;
+ set_blocked_on(current, &blocked_on);
+
hb = queue_lock(&q);
/*
@@ -1204,6 +1244,8 @@ static int futex_wait(u32 __user *uaddr,
if (unlikely(ret)) {
queue_unlock(&q, hb);
+ set_blocked_on(current, NULL);
+
/*
* If we would have faulted, release mmap_sem, fault it in and
* start all over again.
@@ -1277,6 +1319,8 @@ static int futex_wait(u32 __user *uaddr,
}
__set_current_state(TASK_RUNNING);
+ set_blocked_on(current, NULL);
+
/*
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
@@ -1311,6 +1355,7 @@ static int futex_wait(u32 __user *uaddr,
out_unlock_release_sem:
queue_unlock(&q, hb);
+ set_blocked_on(current, NULL);
out_release_sem:
futex_unlock_mm(fshared);
Index: linux-2.6.25/kernel/sched.c
===================================================================
--- linux-2.6.25.orig/kernel/sched.c
+++ linux-2.6.25/kernel/sched.c
@@ -4869,6 +4869,7 @@ recheck:
spin_unlock_irqrestore(&p->pi_lock, flags);
rt_mutex_adjust_pi(p);
+ futex_adjust_waiters(p);
return 0;
}
--
On Tue, 2008-06-24 at 16:20 -0700, Daniel Walker wrote:
> plain text document attachment (blocked_on-futex.patch)
> Adds a function call in sched_setscheduler() to update the
> waiter position of a task if it happens to be waiting on a futex. This
> ensures that the kernel level waiter ordering is correctly maintained
> based on the changed priority of the task.
>
> I fixed the locking issue noticed by Thomas Gleixner.
>
> This doesn't address userspace at all, only the kernel level wakeups and
> kernel level ordering.
>
> The additional locking added to the futex_wait function has no visible speed
> impact, and only affects waiters which actually enter the kernel.
Daniel, I'm not sure what to think,.. you were told how broken this
approach was, you were told to give proper justification for this
change. You did neither and just reposted the same old broken shite
again.
Wake up.
(Just in case you missed it, that's a NAK)
On Wed, 2008-06-25 at 07:29 +0200, Peter Zijlstra wrote:
> Daniel, I'm not sure what to think,.. you were told how broken this
> approach was, you were told to give proper justification for this
> change. You did neither and just reposted the same old broken shite
> again.
Broken approach ? Never heard that before, in fact the problem is
whether or not the changes are needed (not whether they're broken).. I
gave justification in the last thread, and I'm not sure why it's unclear
to you..
Daniel
On Wed, 2008-06-25 at 07:36 -0700, Daniel Walker wrote:
> On Wed, 2008-06-25 at 07:29 +0200, Peter Zijlstra wrote:
>
> > Daniel, I'm not sure what to think,.. you were told how broken this
> > approach was, you were told to give proper justification for this
> > change. You did neither and just reposted the same old broken shite
> > again.
>
> Broken approach ? Never heard that before,
I suggest you re-read some of Thomas' emails from last time...
http://lkml.org/lkml/2008/6/12/275
> in fact the problem is
> whether or not the changes are needed (not whether they're broken).. I
> gave justification in the last thread, and I'm not sure why it's unclear
> to you..
You failed to convince, also justification goes in the changelog, not in
random lkml threads.
On Wed, 2008-06-25 at 17:07 +0200, Peter Zijlstra wrote:
> On Wed, 2008-06-25 at 07:36 -0700, Daniel Walker wrote:
> > On Wed, 2008-06-25 at 07:29 +0200, Peter Zijlstra wrote:
> >
> > > Daniel, I'm not sure what to think,.. you were told how broken this
> > > approach was, you were told to give proper justification for this
> > > change. You did neither and just reposted the same old broken shite
> > > again.
> >
> > Broken approach ? Never heard that before,
>
> I suggest you re-read some of Thomas' emails from last time...
>
> http://lkml.org/lkml/2008/6/12/275
Most of what he's saying there is that it breaks real time, and I
provided a real time fix in this set of patches. I don't have a problem
with the state mixing, since 99.9% of the time we're dealing with operations
that don't interact (and it's perfectly ok when they do interact).
> > in fact the problem is
> > whether or not the changes are needed (not whether they're broken).. I
> > gave justification in the last thread, and I'm not sure why it's unclear
> > to you..
>
> You failed to convince, also justification goes in the changelog, not in
> random lkml threads.
It boils down to POSIX compliance which was discussed in the last
thread. POSIX requires the waiters to be sorted for 5-10 different APIs
which ultimately use the futex (most of which aren't at all related to
PI).
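To make that concrete, here is a small sketch (mine, illustrative only, not
text from the spec) of one such API: plain pthread mutexes, which glibc builds
on futexes, contended by SCHED_FIFO threads of different priorities. The
question being argued is whether unlock hands the mutex over in priority order
or simply in whatever order the waiters happened to queue in the kernel.
SCHED_FIFO creation needs privileges, and the sleep() is only there to pin the
queueing order:

#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void *contender(void *arg)
{
	int prio = *(int *)arg;

	pthread_mutex_lock(&lock);
	printf("priority %d got the mutex\n", prio);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	int prios[3] = { 10, 30, 20 };
	struct sched_param sp;
	pthread_attr_t attr;
	pthread_t t[3];
	int i;

	pthread_mutex_lock(&lock);		/* hold it so all three must wait */

	pthread_attr_init(&attr);
	pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
	pthread_attr_setschedpolicy(&attr, SCHED_FIFO);
	for (i = 0; i < 3; i++) {
		sp.sched_priority = prios[i];
		pthread_attr_setschedparam(&attr, &sp);
		if (pthread_create(&t[i], &attr, contender, &prios[i])) {
			fprintf(stderr, "pthread_create failed (SCHED_FIFO needs privileges)\n");
			return 1;
		}
		sleep(1);			/* fix the order the waiters queue in */
	}

	pthread_mutex_unlock(&lock);		/* which waiter gets it first? */
	for (i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}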
And yes I can add it to the headers, before it goes upstream.
Daniel
On Wed, 2008-06-25 at 08:25 -0700, Daniel Walker wrote:
> On Wed, 2008-06-25 at 17:07 +0200, Peter Zijlstra wrote:
> > On Wed, 2008-06-25 at 07:36 -0700, Daniel Walker wrote:
> > > On Wed, 2008-06-25 at 07:29 +0200, Peter Zijlstra wrote:
> > >
> > > > Daniel, I'm not sure what to think,.. you were told how broken this
> > > > approach was, you were told to give proper justification for this
> > > > change. You did neither and just reposted the same old broken shite
> > > > again.
> > >
> > > Broken approach ? Never heard that before,
> >
> > I suggest you re-read some of Thomas' emails from last time...
> >
> > http://lkml.org/lkml/2008/6/12/275
>
> Most of what he's saying there is that it breaks real time, and I
> provided a real time fix in this set of patches. I don't have a problem
> with the state mixing, since 99.9% of the time we're dealing with operations
> that don't interact (and it's perfectly ok when they do interact).
You're not the maintainer, and you fail to respect their opinion - so
what makes you think your patches are going anywhere but /dev/null?
Also, the main point was about mixing user and kernel space state, you
still do so by including the futex waiter in the same union. That's a
fundamental fugly - no matter if you can make it work.
> > > in fact the problem is
> > > whether or not the changes are needed (not whether they're broken).. I
> > > gave justification in the last thread, and I'm not sure why it's unclear
> > > to you..
> >
> > You failed to convince, also justification goes in the changelog, not in
> > random lkml threads.
>
> It boils down to POSIX compliance which was discussed in the last
> thread. POSIX requires the waiters to be sorted for 5-10 different APIs
> which ultimately use the futex (most of which aren't at all related to
> PI).
I'm unconvinced, my reading of the spec doesn't say that at all. It says
it's up to how things get scheduled.
Also, you have failed to say what real world use cases care about this
behaviour. This was asked multiple times - you never answered any of
those queries.
> And yes I can add it to the headers, before it goes upstream.
Don't bother, at this rate that will be never.
On Wed, 2008-06-25 at 18:17 +0200, Peter Zijlstra wrote:
> You're not the maintainer, and you fail to respect their opinion - so
> what makes you think your patches are going anywhere but /dev/null?
It may not be going anyplace. I'm not submitting it to anyone but LKML,
at this point anyway.
> Also, the main point was about mixing user and kernel space state, you
> still do so by including the futex waiter in the same union. That's a
> fundamental fugly - no matter if you can make it work.
I don't think it's ugly at all, but I'm open to suggestion for alternate
methods of implementing it .. I don't need to unify the blocked_on
structures, but it does allow for some nice things like reducing the
size of the task struct, and potentially later doing PI across different
APIs.
> > > > in fact the problem is
> > > > whether or not the changes are needed (not whether they're broken).. I
> > > > gave justification in the last thread, and I'm not sure why it's unclear
> > > > to you..
> > >
> > > You failed to convince, also justification goes in the changelog, not in
> > > random lkml threads.
> >
> > It boils down to POSIX compliance which was discussed in the last
> > thread. POSIX requires the waiters to be sorted for 5-10 different APIs
> > which ultimately use the futex (most of which aren't at all related to
> > PI).
>
> I'm unconvinced, my reading of the spec doesn't say that at all. It says
> it's up to how things get scheduled.
Any way you read it, there is an ordering that isn't random, FIFO, or
unordered, which is what we end up with now depending on the order in which
the syscalls are used.
We already go to the trouble of sorting the waiters, which I think
speaks for itself ..
> Also, you have failed to say what real world use cases care about this
> behaviour. This was asked multiple times - you never answered any of
> those queries.
At this point it all seems academic .. We all know how the code works
now, and the issue I'm addressing .. There's no crash associated with
this ..
I don't have access to code where this causes a devastating problem; all
I have is people asking me about the lack of determinism in this
interface.
Daniel
On Wed, 25 Jun 2008, Daniel Walker wrote:
> On Wed, 2008-06-25 at 18:17 +0200, Peter Zijlstra wrote:
> > Also, the main point was about mixing user and kernel space state, you
> > still do so by including the futex waiter in the same union. That's a
> > fundamental fugly - no matter if you can make it work.
>
> I don't think it's ugly at all, but I'm open to suggestion for alternate
> methods of implementing it .. I don't need to unify the blocked_on
> structures, but it does allow for some nice things like reducing the
> size of the task struct, and potentially later doing PI across different
> APIs.
Just get it. Mixing concurrency controls and futex waiters is
fundamentally wrong. A task can be blocked on exactly one concurrency
control, but it can be on a futex waiter _AND_ then block on a
concurrency control.
Unifying the mutex and the rtmutex blocked_on is semantically correct
and is a worthwhile thing to do, but adding the futex waiter to it is
simply a semantical brain fart which can not be excused by reducing
the size of task struct.
No thanks,
tglx
On Wed, 2008-06-25 at 21:06 +0200, Thomas Gleixner wrote:
> Unifying the mutex and the rtmutex blocked_on is semantically correct
> and is a worthwhile thing to do, but adding the futex waiter to it is
> simply a semantical brain fart which can not be excused by reducing
> the size of task struct.
I can unify just the rtmutex and the mutex, then add the futex waiter as
another entity.
Daniel
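Concretely, that split might look something like the following rough sketch
(hypothetical, a compile-only fragment with placeholder declarations, not a
posted patch): the mutex and rtmutex waiters stay unified in blocked_on, while
the futex key gets its own field, since a task queued on a futex can
additionally block on a real concurrency control inside the kernel.

struct mutex_waiter;
struct rt_mutex_waiter;
union futex_key;

enum lock_waiter_type {
	MUTEX_WAITER = 1,
	RT_MUTEX_WAITER,
};

struct lock_waiter_state {
	enum lock_waiter_type lock_type;
	union {
		struct mutex_waiter *mutex_blocked_on;
		struct rt_mutex_waiter *rt_blocked_on;
	};
	struct lock_waiter_state *next;
};

struct task_struct {
	/* ... only the fields relevant here ... */
	struct lock_waiter_state *blocked_on;	/* at most one concurrency control */
	union futex_key *futex_blocked_on;	/* non-NULL while queued in futex_wait() */
	/* ... */
};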