2014-06-11 20:45:44

by Thomas Gleixner

[permalink] [raw]
Subject: [patch 1/5] futex: Make unlock_pi more robust

The kernel tries to atomically unlock the futex without checking
whether there is kernel state associated to the futex.

So if user space manipulated the user space value, this will leave
kernel internal state around associated to the owner task.

For robustness sake, lookup first whether there are waiters on the
futex. If there are waiters, wake the top priority waiter with all the
proper sanity checks applied.

If there are no waiters, do the atomic release. We do not have to
preserve the waiters bit in this case, because a potentially incoming
waiter is blocked on the hb->lock and will acquire the futex
atomically. We neither have to preserve the owner died bit. The caller
is the owner and it was supposed to cleanup the mess.

Signed-off-by: Thomas Gleixner <[email protected]>
---
kernel/futex.c | 78 +++++++++++++++++++--------------------------------------
1 file changed, 26 insertions(+), 52 deletions(-)

Index: linux/kernel/futex.c
===================================================================
--- linux.orig/kernel/futex.c
+++ linux/kernel/futex.c
@@ -1186,22 +1186,6 @@ static int wake_futex_pi(u32 __user *uad
return 0;
}

-static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
-{
- u32 uninitialized_var(oldval);
-
- /*
- * There is no waiter, so we unlock the futex. The owner died
- * bit has not to be preserved here. We are the owner:
- */
- if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
- return -EFAULT;
- if (oldval != uval)
- return -EAGAIN;
-
- return 0;
-}
-
/*
* Express the locking dependencies for lockdep:
*/
@@ -2401,10 +2385,10 @@ uaddr_faulted:
*/
static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
- struct futex_hash_bucket *hb;
- struct futex_q *this, *next;
+ u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
- u32 uval, vpid = task_pid_vnr(current);
+ struct futex_hash_bucket *hb;
+ struct futex_q *match;
int ret;

retry:
@@ -2417,57 +2401,47 @@ retry:
return -EPERM;

ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
- if (unlikely(ret != 0))
- goto out;
+ if (ret)
+ return ret;

hb = hash_futex(&key);
spin_lock(&hb->lock);

/*
- * To avoid races, try to do the TID -> 0 atomic transition
- * again. If it succeeds then we can return without waking
- * anyone else up. We only try this if neither the waiters nor
- * the owner died bit are set.
- */
- if (!(uval & ~FUTEX_TID_MASK) &&
- cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
- goto pi_faulted;
- /*
- * Rare case: we managed to release the lock atomically,
- * no need to wake anyone else up:
- */
- if (unlikely(uval == vpid))
- goto out_unlock;
-
- /*
- * Ok, other tasks may need to be woken up - check waiters
- * and do the wakeup if necessary:
- */
- plist_for_each_entry_safe(this, next, &hb->chain, list) {
- if (!match_futex (&this->key, &key))
- continue;
- ret = wake_futex_pi(uaddr, uval, this);
+ * Check waiters first. We do not trust user space values at
+ * all and we at least want to know if user space fiddled
+ * with the futex value instead of blindly unlocking.
+ */
+ match = futex_top_waiter(hb, &key);
+ if (match) {
+ ret = wake_futex_pi(uaddr, uval, match);
/*
- * The atomic access to the futex value
- * generated a pagefault, so retry the
- * user-access and the wakeup:
+ * The atomic access to the futex value generated a
+ * pagefault, so retry the user-access and the wakeup:
*/
if (ret == -EFAULT)
goto pi_faulted;
goto out_unlock;
}
+
/*
- * No waiters - kernel unlocks the futex:
+ * We have no kernel internal state, i.e. no waiters in the
+ * kernel. Waiters which are about to queue themself are stuck
+ * on hb->lock. So we can safely ignore them. We do neither
+ * preserve the WAITERS bit not the OWNER_DIED one. We are the
+ * owner.
*/
- ret = unlock_futex_pi(uaddr, uval);
- if (ret == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
goto pi_faulted;

+ /*
+ * If uval has changed, let user space handle it.
+ */
+ ret = (curval == uval) ? 0 : -EAGAIN;
+
out_unlock:
spin_unlock(&hb->lock);
put_futex_key(&key);
-
-out:
return ret;

pi_faulted:


2014-06-16 16:11:31

by Darren Hart

[permalink] [raw]
Subject: Re: [patch 1/5] futex: Make unlock_pi more robust

On Wed, 2014-06-11 at 20:45 +0000, Thomas Gleixner wrote:
> The kernel tries to atomically unlock the futex without checking
> whether there is kernel state associated to the futex.
>
> So if user space manipulated the user space value, this will leave
> kernel internal state around associated to the owner task.
>
> For robustness sake, lookup first whether there are waiters on the
> futex. If there are waiters, wake the top priority waiter with all the
> proper sanity checks applied.
>
> If there are no waiters, do the atomic release. We do not have to
> preserve the waiters bit in this case, because a potentially incoming
> waiter is blocked on the hb->lock and will acquire the futex
> atomically. We neither have to preserve the owner died bit. The caller
> is the owner and it was supposed to cleanup the mess.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---
> kernel/futex.c | 78
> +++++++++++++++++++--------------------------------------
> 1 file changed, 26 insertions(+), 52 deletions(-)
>
> Index: linux/kernel/futex.c
> ===================================================================
> --- linux.orig/kernel/futex.c
> +++ linux/kernel/futex.c
> @@ -1186,22 +1186,6 @@ static int wake_futex_pi(u32 __user *uad
> return 0;
> }
>
> -static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
> -{
> - u32 uninitialized_var(oldval);
> -
> - /*
> - * There is no waiter, so we unlock the futex. The owner died
> - * bit has not to be preserved here. We are the owner:
> - */
> - if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
> - return -EFAULT;
> - if (oldval != uval)
> - return -EAGAIN;
> -
> - return 0;
> -}

I thought we used that in multiple places, but apparently not. Worth
consolidating.

> -
> /*
> * Express the locking dependencies for lockdep:
> */
> @@ -2401,10 +2385,10 @@ uaddr_faulted:
> */
> static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
> {
> - struct futex_hash_bucket *hb;
> - struct futex_q *this, *next;
> + u32 uninitialized_var(curval), uval, vpid =
> task_pid_vnr(current);
> union futex_key key = FUTEX_KEY_INIT;
> - u32 uval, vpid = task_pid_vnr(current);
> + struct futex_hash_bucket *hb;
> + struct futex_q *match;
> int ret;
>
> retry:
> @@ -2417,57 +2401,47 @@ retry:
> return -EPERM;
>
> ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key,
> VERIFY_WRITE);
> - if (unlikely(ret != 0))
> - goto out;
> + if (ret)
> + return ret;

Looks like you're also trying to move away from a single exit point to
multiple exit points. I prefer the single exit (which you've probably
noticed :-), but it's a subjective thing, and so long as we are not
duplicating cleanup logic, I guess it's fine either way. This change was
not mentioned in the commit message though.

>
> hb = hash_futex(&key);
> spin_lock(&hb->lock);
>
> /*
> - * To avoid races, try to do the TID -> 0 atomic transition
> - * again. If it succeeds then we can return without waking
> - * anyone else up. We only try this if neither the waiters nor
> - * the owner died bit are set.
> - */
> - if (!(uval & ~FUTEX_TID_MASK) &&
> - cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
> - goto pi_faulted;
> - /*
> - * Rare case: we managed to release the lock atomically,
> - * no need to wake anyone else up:
> - */
> - if (unlikely(uval == vpid))
> - goto out_unlock;
> -
> - /*
> - * Ok, other tasks may need to be woken up - check waiters
> - * and do the wakeup if necessary:
> - */
> - plist_for_each_entry_safe(this, next, &hb->chain, list) {
> - if (!match_futex (&this->key, &key))
> - continue;
> - ret = wake_futex_pi(uaddr, uval, this);
> + * Check waiters first. We do not trust user space values at
> + * all and we at least want to know if user space fiddled
> + * with the futex value instead of blindly unlocking.
> + */
> + match = futex_top_waiter(hb, &key);
> + if (match) {
> + ret = wake_futex_pi(uaddr, uval, match);
> /*
> - * The atomic access to the futex value
> - * generated a pagefault, so retry the
> - * user-access and the wakeup:
> + * The atomic access to the futex value generated a
> + * pagefault, so retry the user-access and the wakeup:
> */
> if (ret == -EFAULT)
> goto pi_faulted;
> goto out_unlock;
> }
> +
> /*
> - * No waiters - kernel unlocks the futex:
> + * We have no kernel internal state, i.e. no waiters in the
> + * kernel. Waiters which are about to queue themself are stuck

themselves

> + * on hb->lock. So we can safely ignore them. We do neither
> + * preserve the WAITERS bit not the OWNER_DIED one. We are the

We preserve neither the WAITERS bit nor the OWNER_DIED bit.
(the above use of "do" and "not" is incorrect and could easily be
misinterpreted).

> + * owner.

In wake_futex_pi we verify ownership by matching pi_state->owner ==
current, but here the only test is the TID value, which is set by
userspace - which we don't trust...

I'm trying to determine if it matters in this case... if there are no
waiters, is the pi_state still around? If so, it does indeed matter, and
we should be verifying.


> */
> - ret = unlock_futex_pi(uaddr, uval);
> - if (ret == -EFAULT)
> + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
> goto pi_faulted;
>

This refactoring seems like it would be best done as a prequel patch so
as not to confuse cleanup with functional change. At least that is what
you and others have beaten into me over the years ;-)

> + /*
> + * If uval has changed, let user space handle it.
> + */
> + ret = (curval == uval) ? 0 : -EAGAIN;
> +
> out_unlock:
> spin_unlock(&hb->lock);
> put_futex_key(&key);
> -
> -out:
> return ret;
>

By dropping this you won't return ret, but rather fall through into
pi_faulted... which certainly isn't what you wanted.

The need for better test coverage is very evident now :-)

--
Darren

> pi_faulted:
>

2014-06-16 22:16:04

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch 1/5] futex: Make unlock_pi more robust

On Mon, 16 Jun 2014, Darren Hart wrote:
> On Wed, 2014-06-11 at 20:45 +0000, Thomas Gleixner wrote:
> > static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
> > @@ -2417,57 +2401,47 @@ retry:
> > return -EPERM;
> >
> > ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key,
> > VERIFY_WRITE);
> > - if (unlikely(ret != 0))
> > - goto out;
> > + if (ret)
> > + return ret;
>
> Looks like you're also trying to move away from a single exit point to
> multiple exit points. I prefer the single exit (which you've probably
> noticed :-), but it's a subjective thing, and so long as we are not
> duplicating cleanup logic, I guess it's fine either way. This change was
> not mentioned in the commit message though.

I really did not think about mentioning that :)

> > + * Check waiters first. We do not trust user space values at
> > + * all and we at least want to know if user space fiddled
> > + * with the futex value instead of blindly unlocking.
> > + */
> > + match = futex_top_waiter(hb, &key);
> > + if (match) {
> > + ret = wake_futex_pi(uaddr, uval, match);
> > /*
> > - * The atomic access to the futex value
> > - * generated a pagefault, so retry the
> > - * user-access and the wakeup:
> > + * The atomic access to the futex value generated a
> > + * pagefault, so retry the user-access and the wakeup:
> > */
> > if (ret == -EFAULT)
> > goto pi_faulted;
> > goto out_unlock;
> > }
> > +
> > /*
> > - * No waiters - kernel unlocks the futex:
> > + * We have no kernel internal state, i.e. no waiters in the
> > + * kernel. Waiters which are about to queue themself are stuck
>
> themselves
>
> > + * on hb->lock. So we can safely ignore them. We do neither
> > + * preserve the WAITERS bit not the OWNER_DIED one. We are the
>
> We preserve neither the WAITERS bit nor the OWNER_DIED bit.
> (the above use of "do" and "not" is incorrect and could easily be
> misinterpreted).
>
> > + * owner.
>
> In wake_futex_pi we verify ownership by matching pi_state->owner ==
> current, but here the only test is the TID value, which is set by
> userspace - which we don't trust...
>
> I'm trying to determine if it matters in this case... if there are no
> waiters, is the pi_state still around? If so, it does indeed matter, and
> we should be verifying.

Erm. The whole point of this patch is to do:

- Find existing state first and handle it.

- If no state exists and TID == current, take it

- Otherwise create state

This all happens under hb->lock. So how should something create new
state after we looked up existing state?

> > */
> > - ret = unlock_futex_pi(uaddr, uval);
> > - if (ret == -EFAULT)
> > + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
> > goto pi_faulted;
> >
>
> This refactoring seems like it would be best done as a prequel patch so
> as not to confuse cleanup with functional change. At least that is what
> you and others have beaten into me over the years ;-)

Well, yes and no. I'll hapilly discuss that without after clarifying
the issue below.

> > + /*
> > + * If uval has changed, let user space handle it.
> > + */
> > + ret = (curval == uval) ? 0 : -EAGAIN;
> > +
> > out_unlock:
> > spin_unlock(&hb->lock);
> > put_futex_key(&key);
> > -
> > -out:
> > return ret;
> >
>
> By dropping this you won't return ret, but rather fall through into
> pi_faulted... which certainly isn't what you wanted.

By dropping the now unused "out" label I'm not longer returning ret?

The resulting code is:

out_unlock:
spin_unlock(&hb->lock);
put_futex_key(&key);
return ret;

If you can explain me how that "return ret" falls through to
pi_faulted magically, then I'm definitely agreeing with you on this:

> The need for better test coverage is very evident now :-)

-ENOTENOUGHSLEEP or -ENOTENOUGHCOFFEE or -ENOGLASSES perhaps?

I'm omitting some other politicially incorrect speculations for now.

Thanks,

tglx

2014-06-16 22:28:08

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch 1/5] futex: Make unlock_pi more robust

On Tue, 17 Jun 2014, Thomas Gleixner wrote:
> On Mon, 16 Jun 2014, Darren Hart wrote:
> > On Wed, 2014-06-11 at 20:45 +0000, Thomas Gleixner wrote:
> > In wake_futex_pi we verify ownership by matching pi_state->owner ==
> > current, but here the only test is the TID value, which is set by
> > userspace - which we don't trust...
> >
> > I'm trying to determine if it matters in this case... if there are no
> > waiters, is the pi_state still around? If so, it does indeed matter, and
> > we should be verifying.
>
> Erm. The whole point of this patch is to do:
>
> - Find existing state first and handle it.
>
> - If no state exists and TID == current, take it
>
> - Otherwise create state

Duh, that was the lock path. But here the point is:

- Find existing state first and handle it.

- If no state exists and TID == current, release it

The retry is obvious, right?

Thanks,

tglx

2014-06-16 22:32:20

by Darren Hart

[permalink] [raw]
Subject: Re: [patch 1/5] futex: Make unlock_pi more robust

On Tue, 2014-06-17 at 00:15 +0200, Thomas Gleixner wrote:
> On Mon, 16 Jun 2014, Darren Hart wrote:
> > On Wed, 2014-06-11 at 20:45 +0000, Thomas Gleixner wrote:
> > > static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
> > > @@ -2417,57 +2401,47 @@ retry:
> > > return -EPERM;
> > >
> > > ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key,
> > > VERIFY_WRITE);
> > > - if (unlikely(ret != 0))
> > > - goto out;
> > > + if (ret)
> > > + return ret;
> >
> > Looks like you're also trying to move away from a single exit point to
> > multiple exit points. I prefer the single exit (which you've probably
> > noticed :-), but it's a subjective thing, and so long as we are not
> > duplicating cleanup logic, I guess it's fine either way. This change was
> > not mentioned in the commit message though.
>
> I really did not think about mentioning that :)
>
> > > + * Check waiters first. We do not trust user space values at
> > > + * all and we at least want to know if user space fiddled
> > > + * with the futex value instead of blindly unlocking.
> > > + */
> > > + match = futex_top_waiter(hb, &key);
> > > + if (match) {
> > > + ret = wake_futex_pi(uaddr, uval, match);
> > > /*
> > > - * The atomic access to the futex value
> > > - * generated a pagefault, so retry the
> > > - * user-access and the wakeup:
> > > + * The atomic access to the futex value generated a
> > > + * pagefault, so retry the user-access and the wakeup:
> > > */
> > > if (ret == -EFAULT)
> > > goto pi_faulted;
> > > goto out_unlock;
> > > }
> > > +
> > > /*
> > > - * No waiters - kernel unlocks the futex:
> > > + * We have no kernel internal state, i.e. no waiters in the
> > > + * kernel. Waiters which are about to queue themself are stuck
> >
> > themselves
> >
> > > + * on hb->lock. So we can safely ignore them. We do neither
> > > + * preserve the WAITERS bit not the OWNER_DIED one. We are the
> >
> > We preserve neither the WAITERS bit nor the OWNER_DIED bit.
> > (the above use of "do" and "not" is incorrect and could easily be
> > misinterpreted).
> >
> > > + * owner.
> >
> > In wake_futex_pi we verify ownership by matching pi_state->owner ==
> > current, but here the only test is the TID value, which is set by
> > userspace - which we don't trust...
> >
> > I'm trying to determine if it matters in this case... if there are no
> > waiters, is the pi_state still around? If so, it does indeed matter, and
> > we should be verifying.
>
> Erm. The whole point of this patch is to do:
>
> - Find existing state first and handle it.hehheh
>
> - If no state exists and TID == current, take it
>
> - Otherwise create state

Right, by 5/5 I realized that, duh, if there are no waiters, you can't
find a pi_state to check, so we have no choice here. Sorry, should have
come back and said as much. We can drop this concern.

> This all happens under hb->lock. So how should something create new
> state after we looked up existing state?
>
> > > */
> > > - ret = unlock_futex_pi(uaddr, uval);
> > > - if (ret == -EFAULT)
> > > + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
> > > goto pi_faulted;
> > >
> >
> > This refactoring seems like it would be best done as a prequel patch so
> > as not to confuse cleanup with functional change. At least that is what
> > you and others have beaten into me over the years ;-)
>
> Well, yes and no. I'll hapilly discuss that without after clarifying
> the issue below.
>
> > > + /*
> > > + * If uval has changed, let user space handle it.
> > > + */
> > > + ret = (curval == uval) ? 0 : -EAGAIN;
> > > +
> > > out_unlock:
> > > spin_unlock(&hb->lock);
> > > put_futex_key(&key);
> > > -
> > > -out:
> > > return ret;
> > >
> >
> > By dropping this you won't return ret, but rather fall through into
> > pi_faulted... which certainly isn't what you wanted.
>
> By dropping the now unused "out" label I'm not longer returning ret?
>

Sigh. Nevermind.

> The resulting code is:
>
> out_unlock:
> spin_unlock(&hb->lock);
> put_futex_key(&key);
> return ret;
>
> If you can explain me how that "return ret" falls through to
> pi_faulted magically, then I'm definitely agreeing with you on this:
>
> > The need for better test coverage is very evident now :-)
>
> -ENOTENOUGHSLEEP or -ENOTENOUGHCOFFEE or -ENOGLASSES perhaps?
>
> I'm omitting some other politicially incorrect speculations for now.

Guilty as charged, on all counts I'm afraid.

So, my only reservation is the possible splitting out of the refactoring
bit. But that's minor. Either way:

Reviewed-by: Darren Hart <[email protected]>


--
Darren Hart
Intel Open Source Technology Center

2014-06-16 22:42:07

by Darren Hart

[permalink] [raw]
Subject: Re: [patch 1/5] futex: Make unlock_pi more robust

On Tue, 2014-06-17 at 00:28 +0200, Thomas Gleixner wrote:
> On Tue, 17 Jun 2014, Thomas Gleixner wrote:
> > On Mon, 16 Jun 2014, Darren Hart wrote:
> > > On Wed, 2014-06-11 at 20:45 +0000, Thomas Gleixner wrote:
> > > In wake_futex_pi we verify ownership by matching pi_state->owner ==
> > > current, but here the only test is the TID value, which is set by
> > > userspace - which we don't trust...
> > >
> > > I'm trying to determine if it matters in this case... if there are no
> > > waiters, is the pi_state still around? If so, it does indeed matter, and
> > > we should be verifying.
> >
> > Erm. The whole point of this patch is to do:
> >
> > - Find existing state first and handle it.
> >
> > - If no state exists and TID == current, take it
> >
> > - Otherwise create state
>
> Duh, that was the lock path. But here the point is:
>
> - Find existing state first and handle it.
>
> - If no state exists and TID == current, release it
>

Right, I understood your meaning, and I withdraw the concern.

> The retry is obvious, right?

Yes.

--
Darren Hart
Intel Open Source Technology Center

Subject: [tip:locking/core] futex: Make unlock_pi more robust

Commit-ID: ccf9e6a80d9e1b9df69c98e6b9745cf49869ee15
Gitweb: http://git.kernel.org/tip/ccf9e6a80d9e1b9df69c98e6b9745cf49869ee15
Author: Thomas Gleixner <[email protected]>
AuthorDate: Wed, 11 Jun 2014 20:45:38 +0000
Committer: Thomas Gleixner <[email protected]>
CommitDate: Sat, 21 Jun 2014 22:26:23 +0200

futex: Make unlock_pi more robust

The kernel tries to atomically unlock the futex without checking
whether there is kernel state associated to the futex.

So if user space manipulated the user space value, this will leave
kernel internal state around associated to the owner task.

For robustness sake, lookup first whether there are waiters on the
futex. If there are waiters, wake the top priority waiter with all the
proper sanity checks applied.

If there are no waiters, do the atomic release. We do not have to
preserve the waiters bit in this case, because a potentially incoming
waiter is blocked on the hb->lock and will acquire the futex
atomically. We neither have to preserve the owner died bit. The caller
is the owner and it was supposed to cleanup the mess.

Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Darren Hart <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
kernel/futex.c | 76 +++++++++++++++++++---------------------------------------
1 file changed, 25 insertions(+), 51 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index e5c6c40..346d5c2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1186,22 +1186,6 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
return 0;
}

-static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
-{
- u32 uninitialized_var(oldval);
-
- /*
- * There is no waiter, so we unlock the futex. The owner died
- * bit has not to be preserved here. We are the owner:
- */
- if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
- return -EFAULT;
- if (oldval != uval)
- return -EAGAIN;
-
- return 0;
-}
-
/*
* Express the locking dependencies for lockdep:
*/
@@ -2401,10 +2385,10 @@ uaddr_faulted:
*/
static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
- struct futex_hash_bucket *hb;
- struct futex_q *this, *next;
+ u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
- u32 uval, vpid = task_pid_vnr(current);
+ struct futex_hash_bucket *hb;
+ struct futex_q *match;
int ret;

retry:
@@ -2417,57 +2401,47 @@ retry:
return -EPERM;

ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
- if (unlikely(ret != 0))
- goto out;
+ if (ret)
+ return ret;

hb = hash_futex(&key);
spin_lock(&hb->lock);

/*
- * To avoid races, try to do the TID -> 0 atomic transition
- * again. If it succeeds then we can return without waking
- * anyone else up. We only try this if neither the waiters nor
- * the owner died bit are set.
+ * Check waiters first. We do not trust user space values at
+ * all and we at least want to know if user space fiddled
+ * with the futex value instead of blindly unlocking.
*/
- if (!(uval & ~FUTEX_TID_MASK) &&
- cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
- goto pi_faulted;
- /*
- * Rare case: we managed to release the lock atomically,
- * no need to wake anyone else up:
- */
- if (unlikely(uval == vpid))
- goto out_unlock;
-
- /*
- * Ok, other tasks may need to be woken up - check waiters
- * and do the wakeup if necessary:
- */
- plist_for_each_entry_safe(this, next, &hb->chain, list) {
- if (!match_futex (&this->key, &key))
- continue;
- ret = wake_futex_pi(uaddr, uval, this);
+ match = futex_top_waiter(hb, &key);
+ if (match) {
+ ret = wake_futex_pi(uaddr, uval, match);
/*
- * The atomic access to the futex value
- * generated a pagefault, so retry the
- * user-access and the wakeup:
+ * The atomic access to the futex value generated a
+ * pagefault, so retry the user-access and the wakeup:
*/
if (ret == -EFAULT)
goto pi_faulted;
goto out_unlock;
}
+
/*
- * No waiters - kernel unlocks the futex:
+ * We have no kernel internal state, i.e. no waiters in the
+ * kernel. Waiters which are about to queue themselves are stuck
+ * on hb->lock. So we can safely ignore them. We do neither
+ * preserve the WAITERS bit not the OWNER_DIED one. We are the
+ * owner.
*/
- ret = unlock_futex_pi(uaddr, uval);
- if (ret == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
goto pi_faulted;

+ /*
+ * If uval has changed, let user space handle it.
+ */
+ ret = (curval == uval) ? 0 : -EAGAIN;
+
out_unlock:
spin_unlock(&hb->lock);
put_futex_key(&key);
-
-out:
return ret;

pi_faulted: