I investigate a fio hang issue. When I run fio multi-process
testing on many disks, fio traps into kernel and doesn't exit
(mostly hit once after runing sub test cases for hundreds of times).
Oprofile data shows kernel consumes time with some futex functions.
Command kill couldn't kill the process and machine reboot also hangs.
Eventually, I locate the root cause as a bug of futex. Kernel enters
a deadloop between 'retry' and 'goto retry' in function futex_wake_op.
By unknown reason (might be an issue of fio or glibc), parameter uaddr2
points to an area which is READONLY. So futex_atomic_op_inuser returns
-EFAULT when trying to changing the data at uaddr2, but later get_user
still succeeds becasue the area is READONLY. Then go back to retry.
I create a simple test case to trigger it, which just shmat an READONLY
area for address uaddr2.
It could be used as a DOS attack.
'git log kernel/futex.c' shows below commit creates the issue obviously.
commit e4dc5b7a36a49eff97050894cf1b3a9a02523717
Author: Darren Hart <[email protected]>
Date: Thu Mar 12 00:56:13 2009 -0700
futex: clean up fault logic
Impact: cleanup
Older versions of the futex code held the mmap_sem which had to
be dropped in order to call get_user(), so a two-pronged fault
handling mechanism was employed to handle faults of the atomic
operations. The mmap_sem is no longer held, so get_user()
should be adequate. This patch greatly simplifies the logic and
improves legibility.
Reverting it fixes the issue. In addition, the patch deletes some comments
around calling futex_handle_fault, but forgots in futex_unlock_pi.
There is a confliction to revert the commit. I worked out a patch.
The futex codes seem complicated. We could work out cleanup patches later after
applying the reverting patch.
---
--- linux-2.6.30-rc8/kernel/futex.c 2009-06-10 06:32:19.000000000 +0800
+++ linux-2.6.30-rc8_futex/kernel/futex.c 2009-06-10 00:07:08.000000000 +0800
@@ -300,6 +300,41 @@ static int get_futex_value_locked(u32 *d
return ret ? -EFAULT : 0;
}
+/*
+ * Fault handling.
+ */
+static int futex_handle_fault(unsigned long address, int attempt)
+{
+ struct vm_area_struct * vma;
+ struct mm_struct *mm = current->mm;
+ int ret = -EFAULT;
+
+ if (attempt > 2)
+ return ret;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, address);
+ if (vma && address >= vma->vm_start &&
+ (vma->vm_flags & VM_WRITE)) {
+ int fault;
+ fault = handle_mm_fault(mm, vma, address, 1);
+ if (unlikely((fault & VM_FAULT_ERROR))) {
+#if 0
+ /* XXX: let's do this when we verify it is OK */
+ if (ret & VM_FAULT_OOM)
+ ret = -ENOMEM;
+#endif
+ } else {
+ ret = 0;
+ if (fault & VM_FAULT_MAJOR)
+ current->maj_flt++;
+ else
+ current->min_flt++;
+ }
+ }
+ up_read(&mm->mmap_sem);
+ return ret;
+}
/*
* PI code:
@@ -722,9 +757,9 @@ futex_wake_op(u32 __user *uaddr1, int fs
struct futex_hash_bucket *hb1, *hb2;
struct plist_head *head;
struct futex_q *this, *next;
- int ret, op_ret;
+ int ret, op_ret, attempt = 0;
-retry:
+retryfull:
ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
@@ -735,8 +770,9 @@ retry:
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
+retry:
double_lock_hb(hb1, hb2);
-retry_private:
+
op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) {
u32 dummy;
@@ -757,16 +793,28 @@ retry_private:
goto out_put_keys;
}
+ /*
+ * futex_atomic_op_inuser needs to both read and write
+ * *(int __user *)uaddr2, but we can't modify it
+ * non-atomically. Therefore, if get_user below is not
+ * enough, we need to handle the fault ourselves, while
+ * still holding the mmap_sem.
+ */
+ if (attempt++) {
+ ret = futex_handle_fault((unsigned long)uaddr2,
+ attempt);
+ if (ret)
+ goto out_put_keys;
+ goto retry;
+ }
+
ret = get_user(dummy, uaddr2);
if (ret)
goto out_put_keys;
- if (!fshared)
- goto retry_private;
-
put_futex_key(fshared, &key2);
put_futex_key(fshared, &key1);
- goto retry;
+ goto retryfull;
}
head = &hb1->chain;
@@ -826,7 +874,6 @@ retry:
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
-retry_private:
double_lock_hb(hb1, hb2);
if (likely(cmpval != NULL)) {
@@ -837,16 +884,15 @@ retry_private:
if (unlikely(ret)) {
double_unlock_hb(hb1, hb2);
+ put_futex_key(fshared, &key2);
+ put_futex_key(fshared, &key1);
+
ret = get_user(curval, uaddr1);
- if (ret)
- goto out_put_keys;
- if (!fshared)
- goto retry_private;
+ if (!ret)
+ goto retry;
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
- goto retry;
+ goto out_put_keys;
}
if (curval != *cmpval) {
ret = -EAGAIN;
@@ -1026,7 +1072,7 @@ static int fixup_pi_state_owner(u32 __us
struct futex_pi_state *pi_state = q->pi_state;
struct task_struct *oldowner = pi_state->owner;
u32 uval, curval, newval;
- int ret;
+ int ret, attempt = 0;
/* Owner died? */
if (!pi_state->owner)
@@ -1097,7 +1143,7 @@ retry:
handle_fault:
spin_unlock(q->lock_ptr);
- ret = get_user(uval, uaddr);
+ ret = futex_handle_fault((unsigned long)uaddr, attempt++);
spin_lock(q->lock_ptr);
@@ -1146,7 +1192,6 @@ retry:
if (unlikely(ret != 0))
goto out;
-retry_private:
hb = queue_lock(&q);
/*
@@ -1173,16 +1218,13 @@ retry_private:
if (unlikely(ret)) {
queue_unlock(&q, hb);
+ put_futex_key(fshared, &q.key);
ret = get_user(uval, uaddr);
- if (ret)
- goto out_put_key;
- if (!fshared)
- goto retry_private;
-
- put_futex_key(fshared, &q.key);
- goto retry;
+ if (!ret)
+ goto retry;
+ goto out;
}
ret = -EWOULDBLOCK;
if (unlikely(uval != val)) {
@@ -1316,7 +1358,7 @@ static int futex_lock_pi(u32 __user *uad
struct futex_hash_bucket *hb;
u32 uval, newval, curval;
struct futex_q q;
- int ret, lock_taken, ownerdied = 0;
+ int ret, lock_taken, ownerdied = 0, attempt = 0;
if (refill_pi_state_cache())
return -ENOMEM;
@@ -1336,7 +1378,7 @@ retry:
if (unlikely(ret != 0))
goto out;
-retry_private:
+retry_unlocked:
hb = queue_lock(&q);
retry_locked:
@@ -1561,15 +1603,18 @@ uaddr_faulted:
*/
queue_unlock(&q, hb);
- ret = get_user(uval, uaddr);
- if (ret)
- goto out_put_key;
+ if (attempt++) {
+ ret = futex_handle_fault((unsigned long)uaddr, attempt);
+ if (ret)
+ goto out_put_key;
+ goto retry_unlocked;
+ }
- if (!fshared)
- goto retry_private;
+ ret = get_user(uval, uaddr);
+ if (!ret)
+ goto retry_unlocked;
- put_futex_key(fshared, &q.key);
- goto retry;
+ goto out_put_key;
}
@@ -1585,7 +1630,7 @@ static int futex_unlock_pi(u32 __user *u
u32 uval;
struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
- int ret;
+ int ret, attempt = 0;
retry:
if (get_user(uval, uaddr))
@@ -1601,6 +1646,7 @@ retry:
goto out;
hb = hash_futex(&key);
+retry_unlocked:
spin_lock(&hb->lock);
/*
@@ -1665,9 +1711,17 @@ pi_faulted:
* we have to drop the mmap_sem in order to call get_user().
*/
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+
+ if (attempt++) {
+ ret = futex_handle_fault((unsigned long)uaddr, attempt);
+ if (ret)
+ goto out;
+ uval = 0;
+ goto retry_unlocked;
+ }
ret = get_user(uval, uaddr);
+ put_futex_key(fshared, &key);
if (!ret)
goto retry;
On Thu, 2009-06-11 at 11:08 +0800, Zhang, Yanmin wrote:
> I investigate a fio hang issue. When I run fio multi-process
> testing on many disks, fio traps into kernel and doesn't exit
> (mostly hit once after runing sub test cases for hundreds of times).
>
> Oprofile data shows kernel consumes time with some futex functions.
> Command kill couldn't kill the process and machine reboot also hangs.
>
> Eventually, I locate the root cause as a bug of futex. Kernel enters
> a deadloop between 'retry' and 'goto retry' in function futex_wake_op.
> By unknown reason (might be an issue of fio or glibc), parameter uaddr2
> points to an area which is READONLY. So futex_atomic_op_inuser returns
> -EFAULT when trying to changing the data at uaddr2, but later get_user
> still succeeds becasue the area is READONLY. Then go back to retry.
>
> I create a simple test case to trigger it, which just shmat an READONLY
> area for address uaddr2.
>
> It could be used as a DOS attack.
commit 2070887fdeacd9c13f3e805e3f0086c9f22a4d93
Author: Thomas Gleixner <[email protected]>
Date: Tue May 19 23:04:59 2009 +0200
futex: fix restart in wait_requeue_pi
If the waiter has been requeued to the outer PI futex and is
interrupted by a signal and the thread handles the signal then
ERESTART_RESTARTBLOCK is changed to EINTR and the restart block is
discarded. That way we return an unexcpected EINTR to user space
instead of ending up in futex_lock_pi_restart.
But we do not need to restart the syscall because we know that the
condition has changed since we have been requeued. If we would simply
restart the syscall then we would drop out via the comparison of the
user space value with EWOULDBLOCK.
The user space side needs to handle EWOULDBLOCK anyway as the
enqueueing on the inner futex can race with a requeue/wake. So we can
simply return EWOULDBLOCK to user space which also signals that we did
not take the outer futex and let user space handle it in the same way
it has to handle the requeue/wake race.
Signed-off-by: Thomas Gleixner <[email protected]>
Zhang, Yanmin wrote:
Hi Zhang,
> I investigate a fio hang issue. When I run fio multi-process
> testing on many disks, fio traps into kernel and doesn't exit
> (mostly hit once after runing sub test cases for hundreds of times).
>
> Oprofile data shows kernel consumes time with some futex functions.
> Command kill couldn't kill the process and machine reboot also hangs.
>
> Eventually, I locate the root cause as a bug of futex. Kernel enters
> a deadloop between 'retry' and 'goto retry' in function futex_wake_op.
> By unknown reason (might be an issue of fio or glibc), parameter uaddr2
> points to an area which is READONLY. So futex_atomic_op_inuser returns
> -EFAULT when trying to changing the data at uaddr2, but later get_user
> still succeeds becasue the area is READONLY. Then go back to retry.
>
> I create a simple test case to trigger it, which just shmat an READONLY
> area for address uaddr2.
>
> It could be used as a DOS attack.
Nice work on the diagnosis. I recall discussing something like this a
couple weeks back. I thought this was fixed with a patch to ensure the
pages were writable. Cc'ing Thomas G. to confirm. I didn't see a
kernel version in your report, what are you running?
--
Darren
>
> 'git log kernel/futex.c' shows below commit creates the issue obviously.
>
> commit e4dc5b7a36a49eff97050894cf1b3a9a02523717
> Author: Darren Hart <[email protected]>
> Date: Thu Mar 12 00:56:13 2009 -0700
>
> futex: clean up fault logic
>
> Impact: cleanup
>
> Older versions of the futex code held the mmap_sem which had to
> be dropped in order to call get_user(), so a two-pronged fault
> handling mechanism was employed to handle faults of the atomic
> operations. The mmap_sem is no longer held, so get_user()
> should be adequate. This patch greatly simplifies the logic and
> improves legibility.
>
>
> Reverting it fixes the issue. In addition, the patch deletes some comments
> around calling futex_handle_fault, but forgots in futex_unlock_pi.
>
> There is a confliction to revert the commit. I worked out a patch.
>
> The futex codes seem complicated. We could work out cleanup patches later after
> applying the reverting patch.
>
> ---
>
> --- linux-2.6.30-rc8/kernel/futex.c 2009-06-10 06:32:19.000000000 +0800
> +++ linux-2.6.30-rc8_futex/kernel/futex.c 2009-06-10 00:07:08.000000000 +0800
> @@ -300,6 +300,41 @@ static int get_futex_value_locked(u32 *d
> return ret ? -EFAULT : 0;
> }
>
> +/*
> + * Fault handling.
> + */
> +static int futex_handle_fault(unsigned long address, int attempt)
> +{
> + struct vm_area_struct * vma;
> + struct mm_struct *mm = current->mm;
> + int ret = -EFAULT;
> +
> + if (attempt > 2)
> + return ret;
> +
> + down_read(&mm->mmap_sem);
> + vma = find_vma(mm, address);
> + if (vma && address >= vma->vm_start &&
> + (vma->vm_flags & VM_WRITE)) {
> + int fault;
> + fault = handle_mm_fault(mm, vma, address, 1);
> + if (unlikely((fault & VM_FAULT_ERROR))) {
> +#if 0
> + /* XXX: let's do this when we verify it is OK */
> + if (ret & VM_FAULT_OOM)
> + ret = -ENOMEM;
> +#endif
> + } else {
> + ret = 0;
> + if (fault & VM_FAULT_MAJOR)
> + current->maj_flt++;
> + else
> + current->min_flt++;
> + }
> + }
> + up_read(&mm->mmap_sem);
> + return ret;
> +}
>
> /*
> * PI code:
> @@ -722,9 +757,9 @@ futex_wake_op(u32 __user *uaddr1, int fs
> struct futex_hash_bucket *hb1, *hb2;
> struct plist_head *head;
> struct futex_q *this, *next;
> - int ret, op_ret;
> + int ret, op_ret, attempt = 0;
>
> -retry:
> +retryfull:
> ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
> if (unlikely(ret != 0))
> goto out;
> @@ -735,8 +770,9 @@ retry:
> hb1 = hash_futex(&key1);
> hb2 = hash_futex(&key2);
>
> +retry:
> double_lock_hb(hb1, hb2);
> -retry_private:
> +
> op_ret = futex_atomic_op_inuser(op, uaddr2);
> if (unlikely(op_ret < 0)) {
> u32 dummy;
> @@ -757,16 +793,28 @@ retry_private:
> goto out_put_keys;
> }
>
> + /*
> + * futex_atomic_op_inuser needs to both read and write
> + * *(int __user *)uaddr2, but we can't modify it
> + * non-atomically. Therefore, if get_user below is not
> + * enough, we need to handle the fault ourselves, while
> + * still holding the mmap_sem.
> + */
> + if (attempt++) {
> + ret = futex_handle_fault((unsigned long)uaddr2,
> + attempt);
> + if (ret)
> + goto out_put_keys;
> + goto retry;
> + }
> +
> ret = get_user(dummy, uaddr2);
> if (ret)
> goto out_put_keys;
>
> - if (!fshared)
> - goto retry_private;
> -
> put_futex_key(fshared, &key2);
> put_futex_key(fshared, &key1);
> - goto retry;
> + goto retryfull;
> }
>
> head = &hb1->chain;
> @@ -826,7 +874,6 @@ retry:
> hb1 = hash_futex(&key1);
> hb2 = hash_futex(&key2);
>
> -retry_private:
> double_lock_hb(hb1, hb2);
>
> if (likely(cmpval != NULL)) {
> @@ -837,16 +884,15 @@ retry_private:
> if (unlikely(ret)) {
> double_unlock_hb(hb1, hb2);
>
> + put_futex_key(fshared, &key2);
> + put_futex_key(fshared, &key1);
> +
> ret = get_user(curval, uaddr1);
> - if (ret)
> - goto out_put_keys;
>
> - if (!fshared)
> - goto retry_private;
> + if (!ret)
> + goto retry;
>
> - put_futex_key(fshared, &key2);
> - put_futex_key(fshared, &key1);
> - goto retry;
> + goto out_put_keys;
> }
> if (curval != *cmpval) {
> ret = -EAGAIN;
> @@ -1026,7 +1072,7 @@ static int fixup_pi_state_owner(u32 __us
> struct futex_pi_state *pi_state = q->pi_state;
> struct task_struct *oldowner = pi_state->owner;
> u32 uval, curval, newval;
> - int ret;
> + int ret, attempt = 0;
>
> /* Owner died? */
> if (!pi_state->owner)
> @@ -1097,7 +1143,7 @@ retry:
> handle_fault:
> spin_unlock(q->lock_ptr);
>
> - ret = get_user(uval, uaddr);
> + ret = futex_handle_fault((unsigned long)uaddr, attempt++);
>
> spin_lock(q->lock_ptr);
>
> @@ -1146,7 +1192,6 @@ retry:
> if (unlikely(ret != 0))
> goto out;
>
> -retry_private:
> hb = queue_lock(&q);
>
> /*
> @@ -1173,16 +1218,13 @@ retry_private:
>
> if (unlikely(ret)) {
> queue_unlock(&q, hb);
> + put_futex_key(fshared, &q.key);
>
> ret = get_user(uval, uaddr);
> - if (ret)
> - goto out_put_key;
>
> - if (!fshared)
> - goto retry_private;
> -
> - put_futex_key(fshared, &q.key);
> - goto retry;
> + if (!ret)
> + goto retry;
> + goto out;
> }
> ret = -EWOULDBLOCK;
> if (unlikely(uval != val)) {
> @@ -1316,7 +1358,7 @@ static int futex_lock_pi(u32 __user *uad
> struct futex_hash_bucket *hb;
> u32 uval, newval, curval;
> struct futex_q q;
> - int ret, lock_taken, ownerdied = 0;
> + int ret, lock_taken, ownerdied = 0, attempt = 0;
>
> if (refill_pi_state_cache())
> return -ENOMEM;
> @@ -1336,7 +1378,7 @@ retry:
> if (unlikely(ret != 0))
> goto out;
>
> -retry_private:
> +retry_unlocked:
> hb = queue_lock(&q);
>
> retry_locked:
> @@ -1561,15 +1603,18 @@ uaddr_faulted:
> */
> queue_unlock(&q, hb);
>
> - ret = get_user(uval, uaddr);
> - if (ret)
> - goto out_put_key;
> + if (attempt++) {
> + ret = futex_handle_fault((unsigned long)uaddr, attempt);
> + if (ret)
> + goto out_put_key;
> + goto retry_unlocked;
> + }
>
> - if (!fshared)
> - goto retry_private;
> + ret = get_user(uval, uaddr);
> + if (!ret)
> + goto retry_unlocked;
>
> - put_futex_key(fshared, &q.key);
> - goto retry;
> + goto out_put_key;
> }
>
>
> @@ -1585,7 +1630,7 @@ static int futex_unlock_pi(u32 __user *u
> u32 uval;
> struct plist_head *head;
> union futex_key key = FUTEX_KEY_INIT;
> - int ret;
> + int ret, attempt = 0;
>
> retry:
> if (get_user(uval, uaddr))
> @@ -1601,6 +1646,7 @@ retry:
> goto out;
>
> hb = hash_futex(&key);
> +retry_unlocked:
> spin_lock(&hb->lock);
>
> /*
> @@ -1665,9 +1711,17 @@ pi_faulted:
> * we have to drop the mmap_sem in order to call get_user().
> */
> spin_unlock(&hb->lock);
> - put_futex_key(fshared, &key);
> +
> + if (attempt++) {
> + ret = futex_handle_fault((unsigned long)uaddr, attempt);
> + if (ret)
> + goto out;
> + uval = 0;
> + goto retry_unlocked;
> + }
>
> ret = get_user(uval, uaddr);
> + put_futex_key(fshared, &key);
> if (!ret)
> goto retry;
>
>
>
--
Darren Hart
IBM Linux Technology Center
Real-Time Linux Team
On Wed, 2009-06-10 at 22:58 -0700, Darren Hart wrote:
> Zhang, Yanmin wrote:
>
> Hi Zhang,
>
> > I investigate a fio hang issue. When I run fio multi-process
> > testing on many disks, fio traps into kernel and doesn't exit
> > (mostly hit once after runing sub test cases for hundreds of times).
> >
> > Oprofile data shows kernel consumes time with some futex functions.
> > Command kill couldn't kill the process and machine reboot also hangs.
> >
> > Eventually, I locate the root cause as a bug of futex. Kernel enters
> > a deadloop between 'retry' and 'goto retry' in function futex_wake_op.
> > By unknown reason (might be an issue of fio or glibc), parameter uaddr2
> > points to an area which is READONLY. So futex_atomic_op_inuser returns
> > -EFAULT when trying to changing the data at uaddr2, but later get_user
> > still succeeds becasue the area is READONLY. Then go back to retry.
> >
> > I create a simple test case to trigger it, which just shmat an READONLY
> > area for address uaddr2.
> >
> > It could be used as a DOS attack.
>
> Nice work on the diagnosis. I recall discussing something like this a
> couple weeks back. I thought this was fixed with a patch to ensure the
> pages were writable. Cc'ing Thomas G. to confirm.
> I didn't see a
> kernel version in your report, what are you running?
2.6.30-rc1~rc8.
On Thu, 2009-06-11 at 07:55 +0200, Peter Zijlstra wrote:
> On Thu, 2009-06-11 at 11:08 +0800, Zhang, Yanmin wrote:
> > I investigate a fio hang issue. When I run fio multi-process
> > testing on many disks, fio traps into kernel and doesn't exit
> > (mostly hit once after runing sub test cases for hundreds of times).
> >
> > Oprofile data shows kernel consumes time with some futex functions.
> > Command kill couldn't kill the process and machine reboot also hangs.
> >
> > Eventually, I locate the root cause as a bug of futex. Kernel enters
> > a deadloop between 'retry' and 'goto retry' in function futex_wake_op.
> > By unknown reason (might be an issue of fio or glibc), parameter uaddr2
> > points to an area which is READONLY. So futex_atomic_op_inuser returns
> > -EFAULT when trying to changing the data at uaddr2, but later get_user
> > still succeeds becasue the area is READONLY. Then go back to retry.
> >
> > I create a simple test case to trigger it, which just shmat an READONLY
> > area for address uaddr2.
> >
> > It could be used as a DOS attack.
/me has morning juice and notices he sent the wrong commit...
commit 64d1304a64477629cb16b75491a77bafe6f86963
Author: Thomas Gleixner <[email protected]>
Date: Mon May 18 21:20:10 2009 +0200
futex: setup writeable mapping for futex ops which modify user space data
The futex code installs a read only mapping via get_user_pages_fast()
even if the futex op function has to modify user space data. The
eventual fault was fixed up by futex_handle_fault() which walked the
VMA with mmap_sem held.
After the cleanup patches which removed the mmap_sem dependency of the
futex code commit 4dc5b7a36a49eff97050894cf1b3a9a02523717 (futex:
clean up fault logic) removed the private VMA walk logic from the
futex code. This change results in a stale RO mapping which is not
fixed up.
Instead of reintroducing the previous fault logic we set up the
mapping in get_user_pages_fast() read/write for all operations which
modify user space data. Also handle private futexes in the same way
and make the current unconditional access_ok(VERIFY_WRITE) depend on
the futex op.
Reported-by: Andreas Schwab <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
CC: [email protected]
Peter Zijlstra wrote:
> On Thu, 2009-06-11 at 07:55 +0200, Peter Zijlstra wrote:
>> On Thu, 2009-06-11 at 11:08 +0800, Zhang, Yanmin wrote:
>>> I investigate a fio hang issue. When I run fio multi-process
>>> testing on many disks, fio traps into kernel and doesn't exit
>>> (mostly hit once after runing sub test cases for hundreds of times).
>>>
>>> Oprofile data shows kernel consumes time with some futex functions.
>>> Command kill couldn't kill the process and machine reboot also hangs.
>>>
>>> Eventually, I locate the root cause as a bug of futex. Kernel enters
>>> a deadloop between 'retry' and 'goto retry' in function futex_wake_op.
>>> By unknown reason (might be an issue of fio or glibc), parameter uaddr2
>>> points to an area which is READONLY. So futex_atomic_op_inuser returns
>>> -EFAULT when trying to changing the data at uaddr2, but later get_user
>>> still succeeds becasue the area is READONLY. Then go back to retry.
>>>
>>> I create a simple test case to trigger it, which just shmat an READONLY
>>> area for address uaddr2.
>>>
>>> It could be used as a DOS attack.
>
> /me has morning juice and notices he sent the wrong commit...
>
> commit 64d1304a64477629cb16b75491a77bafe6f86963
> Author: Thomas Gleixner <[email protected]>
> Date: Mon May 18 21:20:10 2009 +0200
>
> futex: setup writeable mapping for futex ops which modify user space data
Yup, that's the one. I was trying to locate it myself, but you beat me
to it. Thanks Peter.
--
Darren Hart
IBM Linux Technology Center
Real-Time Linux Team
On Thu, 2009-06-11 at 08:18 +0200, Peter Zijlstra wrote:
> On Thu, 2009-06-11 at 07:55 +0200, Peter Zijlstra wrote:
> > On Thu, 2009-06-11 at 11:08 +0800, Zhang, Yanmin wrote:
> > > I investigate a fio hang issue. When I run fio multi-process
> > > testing on many disks, fio traps into kernel and doesn't exit
> > > (mostly hit once after runing sub test cases for hundreds of times).
> > >
> > > Oprofile data shows kernel consumes time with some futex functions.
> > > Command kill couldn't kill the process and machine reboot also hangs.
> > >
> > > Eventually, I locate the root cause as a bug of futex. Kernel enters
> > > a deadloop between 'retry' and 'goto retry' in function futex_wake_op.
> > > By unknown reason (might be an issue of fio or glibc), parameter uaddr2
> > > points to an area which is READONLY. So futex_atomic_op_inuser returns
> > > -EFAULT when trying to changing the data at uaddr2, but later get_user
> > > still succeeds becasue the area is READONLY. Then go back to retry.
> > >
> > > I create a simple test case to trigger it, which just shmat an READONLY
> > > area for address uaddr2.
> > >
> > > It could be used as a DOS attack.
>
> /me has morning juice and notices he sent the wrong commit...
>
> commit 64d1304a64477629cb16b75491a77bafe6f86963
> Author: Thomas Gleixner <[email protected]>
> Date: Mon May 18 21:20:10 2009 +0200
2.6.30 includes the new commit. I did a quick testing with my simple
test case and it traps into kernel without exiting.
The reason is I use flag FUTEX_PRIVATE_FLAG. So the fshared part in function
get_futex_key should be deleted. That might hurt performance.
Yanmin
On Thu, 2009-06-11 at 16:33 +0800, Zhang, Yanmin wrote:
> On Thu, 2009-06-11 at 08:18 +0200, Peter Zijlstra wrote:
> > On Thu, 2009-06-11 at 07:55 +0200, Peter Zijlstra wrote:
> > > On Thu, 2009-06-11 at 11:08 +0800, Zhang, Yanmin wrote:
> > > > I investigate a fio hang issue. When I run fio multi-process
> > > > testing on many disks, fio traps into kernel and doesn't exit
> > > > (mostly hit once after runing sub test cases for hundreds of times).
> > > >
> > > > Oprofile data shows kernel consumes time with some futex functions.
> > > > Command kill couldn't kill the process and machine reboot also hangs.
> > > >
> > > > Eventually, I locate the root cause as a bug of futex. Kernel enters
> > > > a deadloop between 'retry' and 'goto retry' in function futex_wake_op.
> > > > By unknown reason (might be an issue of fio or glibc), parameter uaddr2
> > > > points to an area which is READONLY. So futex_atomic_op_inuser returns
> > > > -EFAULT when trying to changing the data at uaddr2, but later get_user
> > > > still succeeds becasue the area is READONLY. Then go back to retry.
> > > >
> > > > I create a simple test case to trigger it, which just shmat an READONLY
> > > > area for address uaddr2.
> > > >
> > > > It could be used as a DOS attack.
> >
> > /me has morning juice and notices he sent the wrong commit...
> >
> > commit 64d1304a64477629cb16b75491a77bafe6f86963
> > Author: Thomas Gleixner <[email protected]>
> > Date: Mon May 18 21:20:10 2009 +0200
> 2.6.30 includes the new commit. I did a quick testing with my simple
> test case and it traps into kernel without exiting.
>
> The reason is I use flag FUTEX_PRIVATE_FLAG. So the fshared part in function
> get_futex_key should be deleted. That might hurt performance.
No reason to, the easiest fix would be to simply propagate that -EFAULT
to userspace and let them chew on it, there's really nothing you can do
with a RO mapping.
On Thu, 2009-06-11 at 16:33 +0800, Zhang, Yanmin wrote:
> On Thu, 2009-06-11 at 08:18 +0200, Peter Zijlstra wrote:
> > On Thu, 2009-06-11 at 07:55 +0200, Peter Zijlstra wrote:
> > > On Thu, 2009-06-11 at 11:08 +0800, Zhang, Yanmin wrote:
> > > > I investigate a fio hang issue. When I run fio multi-process
> > > > testing on many disks, fio traps into kernel and doesn't exit
> > > > (mostly hit once after runing sub test cases for hundreds of times).
> > > >
> > > > Oprofile data shows kernel consumes time with some futex functions.
> > > > Command kill couldn't kill the process and machine reboot also hangs.
> > > >
> > > > Eventually, I locate the root cause as a bug of futex. Kernel enters
> > > > a deadloop between 'retry' and 'goto retry' in function futex_wake_op.
> > > > By unknown reason (might be an issue of fio or glibc), parameter uaddr2
> > > > points to an area which is READONLY. So futex_atomic_op_inuser returns
> > > > -EFAULT when trying to changing the data at uaddr2, but later get_user
> > > > still succeeds becasue the area is READONLY. Then go back to retry.
> > > >
> > > > I create a simple test case to trigger it, which just shmat an READONLY
> > > > area for address uaddr2.
> > > >
> > > > It could be used as a DOS attack.
> >
> > /me has morning juice and notices he sent the wrong commit...
> >
> > commit 64d1304a64477629cb16b75491a77bafe6f86963
> > Author: Thomas Gleixner <[email protected]>
> > Date: Mon May 18 21:20:10 2009 +0200
> 2.6.30 includes the new commit. I did a quick testing with my simple
> test case and it traps into kernel without exiting.
>
> The reason is I use flag FUTEX_PRIVATE_FLAG. So the fshared part in function
> get_futex_key should be deleted. That might hurt performance.
FWIW, using a private futex on a shm section is wrong in and of itself.
tglx: should we create CONFIG_DEBUG_FUTEX and do a vma lookup to
validate that private futexes are indeed in private anonymous memory?
But you would be able to trigger the same using an PROT_READ anonymous
mmap().
It appears access_ok() isn't as strict as we'd like it to be:
/*
...
* Note that, depending on architecture, this function probably just
* checks that the pointer is in the user space range - after calling
* this function, memory access functions may still return -EFAULT.
*/
#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
Thomas is working on a fix for this.
On Thu, 2009-06-11 at 13:36 +0200, Peter Zijlstra wrote:
> On Thu, 2009-06-11 at 16:33 +0800, Zhang, Yanmin wrote:
> > On Thu, 2009-06-11 at 08:18 +0200, Peter Zijlstra wrote:
> > > On Thu, 2009-06-11 at 07:55 +0200, Peter Zijlstra wrote:
> > > > On Thu, 2009-06-11 at 11:08 +0800, Zhang, Yanmin wrote:
> > > > > I investigate a fio hang issue. When I run fio multi-process
> > > > > testing on many disks, fio traps into kernel and doesn't exit
> > > > > (mostly hit once after runing sub test cases for hundreds of times).
> > > > >
> > > > > Oprofile data shows kernel consumes time with some futex functions.
> > > > > Command kill couldn't kill the process and machine reboot also hangs.
> > > > >
> > > > > Eventually, I locate the root cause as a bug of futex. Kernel enters
> > > > > a deadloop between 'retry' and 'goto retry' in function futex_wake_op.
> > > > > By unknown reason (might be an issue of fio or glibc), parameter uaddr2
> > > > > points to an area which is READONLY. So futex_atomic_op_inuser returns
> > > > > -EFAULT when trying to changing the data at uaddr2, but later get_user
> > > > > still succeeds becasue the area is READONLY. Then go back to retry.
> > > > >
> > > > > I create a simple test case to trigger it, which just shmat an READONLY
> > > > > area for address uaddr2.
> > > > >
> > > > > It could be used as a DOS attack.
> > >
> > > /me has morning juice and notices he sent the wrong commit...
> > >
> > > commit 64d1304a64477629cb16b75491a77bafe6f86963
> > > Author: Thomas Gleixner <[email protected]>
> > > Date: Mon May 18 21:20:10 2009 +0200
> > 2.6.30 includes the new commit. I did a quick testing with my simple
> > test case and it traps into kernel without exiting.
> >
> > The reason is I use flag FUTEX_PRIVATE_FLAG. So the fshared part in function
> > get_futex_key should be deleted. That might hurt performance.
>
> FWIW, using a private futex on a shm section is wrong in and of itself.
What I mean is it could be used as a DOS attack.
Did you try my test case? Could you kill it when it runs?
>
> tglx: should we create CONFIG_DEBUG_FUTEX and do a vma lookup to
> validate that private futexes are indeed in private anonymous memory?
>
> But you would be able to trigger the same using an PROT_READ anonymous
> mmap().
>
> It appears access_ok() isn't as strict as we'd like it to be:
>
> /*
> ...
> * Note that, depending on architecture, this function probably just
> * checks that the pointer is in the user space range - after calling
> * this function, memory access functions may still return -EFAULT.
> */
> #define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
>
> Thomas is working on a fix for this.
>
On Fri, 12 Jun 2009, Zhang, Yanmin wrote:
> On Thu, 2009-06-11 at 13:36 +0200, Peter Zijlstra wrote:
> > FWIW, using a private futex on a shm section is wrong in and of itself.
>
> What I mean is it could be used as a DOS attack.
Right. Fix is on the way.
> Did you try my test case? Could you kill it when it runs?
No, you can not kill it. That's why it needs a proper fix. Will send
out today.
Thanks,
tglx
On Fri, 12 Jun 2009, Thomas Gleixner wrote:
> On Fri, 12 Jun 2009, Zhang, Yanmin wrote:
> > On Thu, 2009-06-11 at 13:36 +0200, Peter Zijlstra wrote:
> > > FWIW, using a private futex on a shm section is wrong in and of itself.
> >
> > What I mean is it could be used as a DOS attack.
>
> Right. Fix is on the way.
>
> > Did you try my test case? Could you kill it when it runs?
>
> No, you can not kill it. That's why it needs a proper fix. Will send
> out today.
Can you please verify the patch below ? It's against 2.6.30.
Thanks,
tglx
-------------->
futex: Fix the write access fault problem for real
commit 64d1304a64 (futex: setup writeable mapping for futex ops which
modify user space data) did address only half of the problem of write
access faults.
The patch was made on two wrong assumptions:
1) access_ok(VERIFY_WRITE,...) would actually check write access.
On x86 it does _NOT_. It's a pure address range check.
2) a RW mapped region can not go away under us.
That's wrong as well. Nobody can prevent another thread to call
mprotect(PROT_READ) on that region where the futex resides. If that
call hits between the get_user_pages_fast() verification and the
actual write access in the atomic region we are toast again.
The solution is to not rely on access_ok and get_user() for any write
access related fault on private and shared futexes. Instead we need to
go through get_user_pages_fast() in the fault path to avoid any of the
above pitfalls. If get_user_pages_fast() returns -EFAULT we know that
we can not fix it anymore and need to bail out to user space.
Remove a bunch of confusing comments on this issue as well.
Signed-off-by: Thomas Gleixner <[email protected]>
---
kernel/futex.c | 48 +++++++++++++++++++++++++++++-------------------
1 file changed, 29 insertions(+), 19 deletions(-)
Index: linux-2.6-tip/kernel/futex.c
===================================================================
--- linux-2.6-tip.orig/kernel/futex.c
+++ linux-2.6-tip/kernel/futex.c
@@ -278,6 +278,31 @@ void put_futex_key(int fshared, union fu
drop_futex_key_refs(key);
}
+/*
+ * get_user_writeable - get user page and verify RW access
+ * @uaddr: pointer to faulting user space address
+ *
+ * We cannot write to the user space address and get_user just faults
+ * the page in, but does not tell us whether the mapping is writeable.
+ *
+ * We can not rely on access_ok() for private futexes as it is just a
+ * range check and we can neither rely on get_user_pages() as there
+ * might be a mprotect(PROT_READ) for that mapping after
+ * get_user_pages() and before the fault in the atomic write access.
+ */
+static int get_user_writeable(u32 __user *uaddr)
+{
+ unsigned long addr = (unsigned long)uaddr;
+ struct page *page;
+ int ret;
+
+ ret = get_user_pages_fast(addr, 1, 1, &page);
+ if (!ret)
+ put_page(page);
+
+ return ret;
+}
+
static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
{
u32 curval;
@@ -739,7 +764,6 @@ retry:
retry_private:
op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) {
- u32 dummy;
double_unlock_hb(hb1, hb2);
@@ -757,7 +781,7 @@ retry_private:
goto out_put_keys;
}
- ret = get_user(dummy, uaddr2);
+ ret = get_user_writeable(uaddr2);
if (ret)
goto out_put_keys;
@@ -1097,7 +1121,7 @@ retry:
handle_fault:
spin_unlock(q->lock_ptr);
- ret = get_user(uval, uaddr);
+ ret = get_user_writeable(uaddr);
spin_lock(q->lock_ptr);
@@ -1552,16 +1576,9 @@ out:
return ret;
uaddr_faulted:
- /*
- * We have to r/w *(int __user *)uaddr, and we have to modify it
- * atomically. Therefore, if we continue to fault after get_user()
- * below, we need to handle the fault ourselves, while still holding
- * the mmap_sem. This can occur if the uaddr is under contention as
- * we have to drop the mmap_sem in order to call get_user().
- */
queue_unlock(&q, hb);
- ret = get_user(uval, uaddr);
+ ret = get_user_writeable(uaddr);
if (ret)
goto out_put_key;
@@ -1657,17 +1674,10 @@ out:
return ret;
pi_faulted:
- /*
- * We have to r/w *(int __user *)uaddr, and we have to modify it
- * atomically. Therefore, if we continue to fault after get_user()
- * below, we need to handle the fault ourselves, while still holding
- * the mmap_sem. This can occur if the uaddr is under contention as
- * we have to drop the mmap_sem in order to call get_user().
- */
spin_unlock(&hb->lock);
put_futex_key(fshared, &key);
- ret = get_user(uval, uaddr);
+ ret = get_user_writeable(uaddr);
if (!ret)
goto retry;
On Fri, 2009-06-12 at 10:39 +0200, Thomas Gleixner wrote:
> On Fri, 12 Jun 2009, Thomas Gleixner wrote:
> > On Fri, 12 Jun 2009, Zhang, Yanmin wrote:
> > > On Thu, 2009-06-11 at 13:36 +0200, Peter Zijlstra wrote:
> > > > FWIW, using a private futex on a shm section is wrong in and of itself.
> > >
> > > What I mean is it could be used as a DOS attack.
> >
> > Right. Fix is on the way.
> >
> > > Did you try my test case? Could you kill it when it runs?
> >
> > No, you can not kill it. That's why it needs a proper fix. Will send
> > out today.
>
> Can you please verify the patch below ? It's against 2.6.30.
I tested it with my test case and it doesn't hang again. But I still
have considerations.
>
> Thanks,
>
> tglx
>
> -------------->
> futex: Fix the write access fault problem for real
>
> commit 64d1304a64 (futex: setup writeable mapping for futex ops which
> modify user space data) did address only half of the problem of write
> access faults.
>
> The patch was made on two wrong assumptions:
>
> 1) access_ok(VERIFY_WRITE,...) would actually check write access.
>
> On x86 it does _NOT_. It's a pure address range check.
>
> 2) a RW mapped region can not go away under us.
>
> That's wrong as well. Nobody can prevent another thread to call
> mprotect(PROT_READ) on that region where the futex resides. If that
> call hits between the get_user_pages_fast() verification and the
> actual write access in the atomic region we are toast again.
>
> The solution is to not rely on access_ok and get_user() for any write
> access related fault on private and shared futexes. Instead we need to
> go through get_user_pages_fast() in the fault path to avoid any of the
> above pitfalls. If get_user_pages_fast() returns -EFAULT we know that
> we can not fix it anymore and need to bail out to user space.
>
> Remove a bunch of confusing comments on this issue as well.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---
> kernel/futex.c | 48 +++++++++++++++++++++++++++++-------------------
> 1 file changed, 29 insertions(+), 19 deletions(-)
>
> Index: linux-2.6-tip/kernel/futex.c
> ===================================================================
> --- linux-2.6-tip.orig/kernel/futex.c
> +++ linux-2.6-tip/kernel/futex.c
> @@ -278,6 +278,31 @@ void put_futex_key(int fshared, union fu
> drop_futex_key_refs(key);
> }
>
> +/*
> + * get_user_writeable - get user page and verify RW access
> + * @uaddr: pointer to faulting user space address
> + *
> + * We cannot write to the user space address and get_user just faults
> + * the page in, but does not tell us whether the mapping is writeable.
> + *
> + * We can not rely on access_ok() for private futexes as it is just a
> + * range check and we can neither rely on get_user_pages() as there
> + * might be a mprotect(PROT_READ) for that mapping after
> + * get_user_pages() and before the fault in the atomic write access.
> + */
> +static int get_user_writeable(u32 __user *uaddr)
> +{
> + unsigned long addr = (unsigned long)uaddr;
> + struct page *page;
> + int ret;
> +
> + ret = get_user_pages_fast(addr, 1, 1, &page);
I checked function get_user_pages_fast. It might return negative, 0, or
positive value. 0 means it doesn't pin any page. So why does below statement
use (!ret) to put_page?
I changed my test case and run it for unlimited times. It seems memory
leak is big.
get_user_pages_fast is used by get_futex_key with the similiar issue.
> + if (!ret)
> + put_page(page);
> +
> + return ret;
> +}
> +
> static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
> {
> u32 curval;
> @@ -739,7 +764,6 @@ retry:
> retry_private:
> op_ret = futex_atomic_op_inuser(op, uaddr2);
> if (unlikely(op_ret < 0)) {
> - u32 dummy;
>
> double_unlock_hb(hb1, hb2);
>
> @@ -757,7 +781,7 @@ retry_private:
> goto out_put_keys;
> }
>
> - ret = get_user(dummy, uaddr2);
> + ret = get_user_writeable(uaddr2);
> if (ret)
> goto out_put_keys;
>
> @@ -1097,7 +1121,7 @@ retry:
> handle_fault:
> spin_unlock(q->lock_ptr);
>
> - ret = get_user(uval, uaddr);
> + ret = get_user_writeable(uaddr);
>
> spin_lock(q->lock_ptr);
>
> @@ -1552,16 +1576,9 @@ out:
> return ret;
>
> uaddr_faulted:
> - /*
> - * We have to r/w *(int __user *)uaddr, and we have to modify it
> - * atomically. Therefore, if we continue to fault after get_user()
> - * below, we need to handle the fault ourselves, while still holding
> - * the mmap_sem. This can occur if the uaddr is under contention as
> - * we have to drop the mmap_sem in order to call get_user().
> - */
> queue_unlock(&q, hb);
>
> - ret = get_user(uval, uaddr);
> + ret = get_user_writeable(uaddr);
X86 pte entry has no READABLE flag. Other platforms might have. If their pte
only set WRITE flag, Is it poosible to create a similiar DOS attack with
WRITEONLY area on such platforms?
> if (ret)
> goto out_put_key;
>
> @@ -1657,17 +1674,10 @@ out:
> return ret;
>
> pi_faulted:
> - /*
> - * We have to r/w *(int __user *)uaddr, and we have to modify it
> - * atomically. Therefore, if we continue to fault after get_user()
> - * below, we need to handle the fault ourselves, while still holding
> - * the mmap_sem. This can occur if the uaddr is under contention as
> - * we have to drop the mmap_sem in order to call get_user().
> - */
> spin_unlock(&hb->lock);
> put_futex_key(fshared, &key);
>
> - ret = get_user(uval, uaddr);
> + ret = get_user_writeable(uaddr);
> if (!ret)
> goto retry;
>
On Mon, 15 Jun 2009, Zhang, Yanmin wrote:
> > + ret = get_user_pages_fast(addr, 1, 1, &page);
>
> I checked function get_user_pages_fast. It might return negative, 0, or
> positive value. 0 means it doesn't pin any page. So why does below statement
> use (!ret) to put_page?
Hmm, darn. You are right. It needs to be (ret > 0)
Thanks,
tglx
On Mon, 15 Jun 2009, Zhang, Yanmin wrote:
> On Fri, 2009-06-12 at 10:39 +0200, Thomas Gleixner wrote:
>
> > + ret = get_user_writeable(uaddr);
>
> X86 pte entry has no READABLE flag. Other platforms might have. If their pte
> only set WRITE flag, Is it poosible to create a similiar DOS attack with
> WRITEONLY area on such platforms?
Just checked with Peter. The kernel assumes that when its present we
can read. We looked at the arches that do support wr-only and it seems
that this is a valid assumption. i.e. we don't support wr-only
mappings.
Thanks,
tglx
On Mon, 2009-06-15 at 14:03 +0800, Zhang, Yanmin wrote:
> > + ret = get_user_writeable(uaddr);
> X86 pte entry has no READABLE flag. Other platforms might have. If their pte
> only set WRITE flag, Is it poosible to create a similiar DOS attack with
> WRITEONLY area on such platforms?
I just checked a few such platforms and generic code, we seem to assume
read on write, and read when present in the generic code.
And those few platforms that supported wr-only also implemented that
(alpha, avr32 -- there might be more didn't go through all).
On Mon, 2009-06-15 at 09:57 +0200, Thomas Gleixner wrote:
> On Mon, 15 Jun 2009, Zhang, Yanmin wrote:
> > > + ret = get_user_pages_fast(addr, 1, 1, &page);
> >
> > I checked function get_user_pages_fast. It might return negative, 0, or
> > positive value. 0 means it doesn't pin any page. So why does below statement
> > use (!ret) to put_page?
>
> Hmm, darn. You are right. It needs to be (ret > 0)
I tested (ret > 0). It does work and I didn't find memory leak.