This patch need both futex and zero-page developer's ack.
==========================================================
commit a13ea5b7 (mm: reinstate ZERO_PAGE) made the unfortunate regression.
following test program never finish and waste 100% cpu time.
At the making commit 38d47c1b7 (rely on get_user_pages() for shared
futexes). There isn't zero page in linux kernel. then, futex developers
thought gup retry is safe. but we reinstated zero page later...
This patch fixes it.
futex-zero.c
---------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <linux/futex.h>
#include <pthread.h>
int main(int argc, char **argv)
{
long page_size;
int ret;
void *buf;
page_size = sysconf(_SC_PAGESIZE);
buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
if (buf == (void *)-1) {
perror("mmap error.\n");
exit(1);
}
fprintf(stderr, "futex wait\n");
ret = syscall( SYS_futex, buf, FUTEX_WAIT, 1, NULL, NULL, NULL);
if (ret != 0 && errno != EWOULDBLOCK) {
perror("futex error.\n");
exit(1);
}
fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
return 0;
}
---------------------------------------------------------------------
Signed-off-by: KOSAKI Motohiro <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: KAMEZAWA Hiroyuki <[email protected]>
Cc: Nick Piggin <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
include/linux/mm.h | 16 ++++++++++++++++
kernel/futex.c | 3 +++
mm/memory.c | 14 --------------
3 files changed, 19 insertions(+), 14 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2265f28..dd755ea 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -751,6 +751,22 @@ struct zap_details {
unsigned long truncate_count; /* Compare vm_truncate_count */
};
+#ifndef is_zero_pfn
+extern unsigned long zero_pfn;
+static inline int is_zero_pfn(unsigned long pfn)
+{
+ return pfn == zero_pfn;
+}
+#endif
+
+#ifndef my_zero_pfn
+extern unsigned long zero_pfn;
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+ return zero_pfn;
+}
+#endif
+
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
diff --git a/kernel/futex.c b/kernel/futex.c
index 8e3c3ff..79b89cc 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -254,6 +254,8 @@ again:
page = compound_head(page);
lock_page(page);
+ if (is_zero_pfn(page_to_pfn(page)))
+ goto anon_key;
if (!page->mapping) {
unlock_page(page);
put_page(page);
@@ -268,6 +270,7 @@ again:
* the object not the particular process.
*/
if (PageAnon(page)) {
+ anon_key:
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
diff --git a/mm/memory.c b/mm/memory.c
index 09e4b1b..3743fb5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -457,20 +457,6 @@ static inline int is_cow_mapping(unsigned int flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
-#ifndef is_zero_pfn
-static inline int is_zero_pfn(unsigned long pfn)
-{
- return pfn == zero_pfn;
-}
-#endif
-
-#ifndef my_zero_pfn
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
- return zero_pfn;
-}
-#endif
-
/*
* vm_normal_page -- This function gets the "struct page" associated with a pte.
*
--
1.6.5.2
Added tglx and dvhart to the CC.
On Thu, 2009-12-24 at 18:29 +0900, KOSAKI Motohiro wrote:
> This patch need both futex and zero-page developer's ack.
> ==========================================================
>
> commit a13ea5b7 (mm: reinstate ZERO_PAGE) made the unfortunate regression.
> following test program never finish and waste 100% cpu time.
>
> At the making commit 38d47c1b7 (rely on get_user_pages() for shared
> futexes). There isn't zero page in linux kernel. then, futex developers
> thought gup retry is safe. but we reinstated zero page later...
>
> This patch fixes it.
>
> futex-zero.c
> ---------------------------------------------------------------------
> #include <stdio.h>
> #include <stdlib.h>
> #include <sys/mman.h>
> #include <syscall.h>
> #include <sys/time.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <unistd.h>
> #include <errno.h>
> #include <linux/futex.h>
> #include <pthread.h>
>
> int main(int argc, char **argv)
> {
> long page_size;
> int ret;
> void *buf;
>
> page_size = sysconf(_SC_PAGESIZE);
>
> buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE,
> MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
> if (buf == (void *)-1) {
> perror("mmap error.\n");
> exit(1);
> }
>
> fprintf(stderr, "futex wait\n");
> ret = syscall( SYS_futex, buf, FUTEX_WAIT, 1, NULL, NULL, NULL);
> if (ret != 0 && errno != EWOULDBLOCK) {
> perror("futex error.\n");
> exit(1);
> }
> fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
>
> return 0;
> }
> ---------------------------------------------------------------------
>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
> Cc: Hugh Dickins <[email protected]>
> Cc: KAMEZAWA Hiroyuki <[email protected]>
> Cc: Nick Piggin <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> ---
> include/linux/mm.h | 16 ++++++++++++++++
> kernel/futex.c | 3 +++
> mm/memory.c | 14 --------------
> 3 files changed, 19 insertions(+), 14 deletions(-)
> diff --git a/kernel/futex.c b/kernel/futex.c
> index 8e3c3ff..79b89cc 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -254,6 +254,8 @@ again:
>
> page = compound_head(page);
> lock_page(page);
> + if (is_zero_pfn(page_to_pfn(page)))
> + goto anon_key;
> if (!page->mapping) {
> unlock_page(page);
> put_page(page);
> @@ -268,6 +270,7 @@ again:
> * the object not the particular process.
> */
> if (PageAnon(page)) {
> + anon_key:
> key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
> key->private.mm = mm;
> key->private.address = address;
Doesn't it make more sense to simply fail the futex op?
What I mean is, all (?) futex ops assume userspace actually wrote
something to the address in question to begin with, therefore it can
never be the zero page.
So it being the zero page means someone goofed up, and we should bail,
no?
On Thu, Dec 24, 2009 at 01:39, Peter Zijlstra <[email protected]> wrote:
> What I mean is, all (?) futex ops assume userspace actually wrote
> something to the address in question to begin with, therefore it can
> never be the zero page.
In practice today this is likely always the case. But it doesn't have
to be like that. In theory a zero-initialized futex is a reasonable
object on which to wait. You should not preclude this from happening
just because it's not done like this today.
> > diff --git a/kernel/futex.c b/kernel/futex.c
> > index 8e3c3ff..79b89cc 100644
> > --- a/kernel/futex.c
> > +++ b/kernel/futex.c
> > @@ -254,6 +254,8 @@ again:
> >
> > page = compound_head(page);
> > lock_page(page);
> > + if (is_zero_pfn(page_to_pfn(page)))
> > + goto anon_key;
> > if (!page->mapping) {
> > unlock_page(page);
> > put_page(page);
> > @@ -268,6 +270,7 @@ again:
> > * the object not the particular process.
> > */
> > if (PageAnon(page)) {
> > + anon_key:
> > key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
> > key->private.mm = mm;
> > key->private.address = address;
>
> Doesn't it make more sense to simply fail the futex op?
>
> What I mean is, all (?) futex ops assume userspace actually wrote
> something to the address in question to begin with, therefore it can
> never be the zero page.
>
> So it being the zero page means someone goofed up, and we should bail,
> no?
Hm. probably we need to discuss more.
Firstly, if we assume current glibc implimentation, you are right,
we can assume userland always initialize the page explicitly before using
futex. then we never seen zero page in futex.
but, I don't think futex itself assume it now. at least man page
doesn't describe such limilation. so, if you prefer bail and man fix,
I'm acceptable maybe.
I don't know any library except glibc use futex directly, or not.
but we don't have any reason to assume futex is used only glibc.
(plus, glibc might change its implementation in future release, perhaps)
following testcase is more practice than last mail's one.
if we add zero page bail logic, this testcase mihgt be fail.
----------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <linux/futex.h>
#include <pthread.h>
void * futex_wait(void *arg)
{
void *addr = arg;
int ret;
fprintf(stderr, "futex wait\n");
ret = syscall( SYS_futex, addr, FUTEX_WAIT, 0, NULL, NULL, NULL);
if (ret != 0 && errno != EWOULDBLOCK) {
perror("futex error.\n");
exit(1);
}
fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
return NULL;
}
int main(int argc, char **argv)
{
long page_size;
pthread_t thr;
int ret;
void *buf;
page_size = sysconf(_SC_PAGESIZE);
buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
if (buf == (void *)-1) {
perror("mmap error.\n");
exit(1);
}
ret = pthread_create(&thr, NULL, futex_wait, buf);
if (ret < 0) {
fprintf(stderr, "pthread_create error\n");
exit(1);
}
sleep(10);
fprintf(stderr, "futex wake\n");
*((int*)buf) = 1;
ret = syscall( SYS_futex, buf, FUTEX_WAKE, 1, NULL, NULL, NULL);
fprintf(stderr, "futex_wake: ret = %d, errno = %d\n", ret, errno);
fprintf(stderr, "join\n");
pthread_join(thr, NULL);
return 0;
}
--------------------------------------------------------------------------
man page fix has another difficulty. in past days, zero pages was perfectly
transparent from userland. then any man page don't describe "what's zero page".
then some administrator don't know about zero page at all.
So, if your main disliked point is goto statement, following patch is ok
to me.
Finally, I agree this is really corner case issue. I can agree man page fix
too. but I hope to know which point you dislike in my patch.
Thanks.
>From 09b0a7426c707e2fab5ac8e1ef8feec14415ee62 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <[email protected]>
Date: Thu, 24 Dec 2009 18:07:42 +0900
Subject: [PATCH] futex: Fix ZERO_PAGE cause infinite loop
commit a13ea5b7 (mm: reinstate ZERO_PAGE) made the unfortunate regression.
following test program never finish and waste 100% cpu time.
At the making commit 38d47c1b7 (rely on get_user_pages() for shared
futexes). There isn't zero page in linux kernel. then, futex developers
thought gup retry is safe. but we reinstated zero page later...
This patch fixes it.
futex-zero.c
---------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <linux/futex.h>
#include <pthread.h>
int main(int argc, char **argv)
{
long page_size;
int ret;
void *buf;
page_size = sysconf(_SC_PAGESIZE);
buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
if (buf == (void *)-1) {
perror("mmap error.\n");
exit(1);
}
fprintf(stderr, "futex wait\n");
ret = syscall( SYS_futex, buf, FUTEX_WAIT, 1, NULL, NULL, NULL);
if (ret != 0 && errno != EWOULDBLOCK) {
perror("futex error.\n");
exit(1);
}
fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
return 0;
}
---------------------------------------------------------------------
Signed-off-by: KOSAKI Motohiro <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: KAMEZAWA Hiroyuki <[email protected]>
Cc: Nick Piggin <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
include/linux/mm.h | 16 ++++++++++++++++
kernel/futex.c | 6 ++++--
mm/memory.c | 14 --------------
3 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2265f28..dd755ea 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -751,6 +751,22 @@ struct zap_details {
unsigned long truncate_count; /* Compare vm_truncate_count */
};
+#ifndef is_zero_pfn
+extern unsigned long zero_pfn;
+static inline int is_zero_pfn(unsigned long pfn)
+{
+ return pfn == zero_pfn;
+}
+#endif
+
+#ifndef my_zero_pfn
+extern unsigned long zero_pfn;
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+ return zero_pfn;
+}
+#endif
+
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
diff --git a/kernel/futex.c b/kernel/futex.c
index 8e3c3ff..ad72989 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -222,6 +222,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
struct mm_struct *mm = current->mm;
struct page *page;
int err;
+ int is_zero_page;
/*
* The futex address must be "naturally" aligned.
@@ -253,8 +254,9 @@ again:
return err;
page = compound_head(page);
+ is_zero_page = is_zero_pfn(page_to_pfn(page));
lock_page(page);
- if (!page->mapping) {
+ if (!is_zero_page && !page->mapping) {
unlock_page(page);
put_page(page);
goto again;
@@ -267,7 +269,7 @@ again:
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page)) {
+ if (is_zero_page || PageAnon(page)) {
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
diff --git a/mm/memory.c b/mm/memory.c
index 09e4b1b..3743fb5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -457,20 +457,6 @@ static inline int is_cow_mapping(unsigned int flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
-#ifndef is_zero_pfn
-static inline int is_zero_pfn(unsigned long pfn)
-{
- return pfn == zero_pfn;
-}
-#endif
-
-#ifndef my_zero_pfn
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
- return zero_pfn;
-}
-#endif
-
/*
* vm_normal_page -- This function gets the "struct page" associated with a pte.
*
--
1.6.5.2
This was a good find, thanks a lot for discovering it.
I'm afraid I'd quite forgotten all the mm-implementation-dependence
that has accumulated (for good performance reasons) in kernel/futex.c.
(Luckily, given that KSM pages are also PageAnon, it will happen to
be working correctly on them: that's not your concern, but something
your report has reminded me to consider.)
On Fri, 25 Dec 2009, KOSAKI Motohiro wrote:
> > > diff --git a/kernel/futex.c b/kernel/futex.c
> > > index 8e3c3ff..79b89cc 100644
> > > --- a/kernel/futex.c
> > > +++ b/kernel/futex.c
> > > @@ -254,6 +254,8 @@ again:
> > >
> > > page = compound_head(page);
> > > lock_page(page);
> > > + if (is_zero_pfn(page_to_pfn(page)))
> > > + goto anon_key;
If something like this or your replacment does go forward,
then I think that test is better inside the "if (!page->mapping)"
below. Admittedly that adds even more mm-dependence here (relying
on a zero page to have NULL page->mapping); but isn't page_to_pfn()
one of those functions which is trivial on many configs but expensive
on some? Better call it only in the rare case that it's needed.
Though wouldn't it be even better not to use is_zero_pfn() at all?
That was convenient in mm/memory.c because it had the pfn or pte right
at hand, but here a traditional (page == ZERO_PAGE(address)) would be
more efficient.
Which would save having to move is_zero_pfn() from mm/memory.c
to include/linux/mm.h - I'd prefer to keep it private if we can.
But for completeness, this would involve resurrecting the 2.6.19
MIPS move_pte(), which makes sure mremap() move doesn't interfere
with our assumptions. Something like
#define __HAVE_ARCH_MOVE_PTE
pte_t move_pte(pte_t pte, pgprot_t prot, unsigned long old_addr,
unsigned long new_addr)
{
if (pte_present(pte) && is_zero_pfn(pte_pfn(pte)))
pte = mk_pte(ZERO_PAGE(new_addr), prot);
return pte;
}
in arch/mips/include/asm/pgtable.h.
> > > if (!page->mapping) {
> > > unlock_page(page);
> > > put_page(page);
> > > @@ -268,6 +270,7 @@ again:
> > > * the object not the particular process.
> > > */
> > > if (PageAnon(page)) {
> > > + anon_key:
> > > key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
> > > key->private.mm = mm;
> > > key->private.address = address;
> >
> > Doesn't it make more sense to simply fail the futex op?
If we were to decide to fail the futex op on an unmodified ZERO_PAGE,
couldn't we simply delete the "again" label and just say
if (!page->mapping) {
/* Truncated file page, or a ZERO_PAGE */
unlock_page(page);
put_page(page);
return -EFAULT;
}
I don't know what the retry on that case was expected to achieve.
> >
> > What I mean is, all (?) futex ops assume userspace actually wrote
> > something to the address in question to begin with, therefore it can
> > never be the zero page.
> >
> > So it being the zero page means someone goofed up, and we should bail,
> > no?
I share Peter's wish to avoid cluttering this up with more special
case testing, but also your and Ulrich's wish to keep generality.
>
> Hm. probably we need to discuss more.
>
> Firstly, if we assume current glibc implimentation, you are right,
> we can assume userland always initialize the page explicitly before using
> futex. then we never seen zero page in futex.
>
> but, I don't think futex itself assume it now. at least man page
> doesn't describe such limilation. so, if you prefer bail and man fix,
> I'm acceptable maybe.
Here's another worry with the current futex implementation,
which might help me to decide which way to jump on the ZERO_PAGE.
Originally, a futex on a MAP_PRIVATE (!VM_MAYSHARE) area was necessarily
FUT_OFF_MMSHARED. Nowadays, with the get_user_pages_fast implementation,
we have no idea whether the vma is VM_MAYSHARE or not. So if a futex is
placed in a private area backed by a file, then it could be regarded as
FUT_OFF_INODE at futex_wait() time, but FUT_OFF_MMSHARED at futex_wake()
time.
Perhaps that's no problem at all, it's a long time since I was involved
with futexes, I think you and Peter will grasp the consequences quicker
than I shall.
But it seems no more improbable than the ZERO_PAGE case: some app
might place its futexes in the .data section of the executable,
which is a private mapping of the executable file.
If this case is also an issue, then perhaps we just need to update
the man page to say that whatever is responsible for initializing a
futex does need to write to it (or the page it's in) before it's used,
otherwise behaviour is undefined. (But we should then use the -EFAULT
patch above, we'd all prefer an error to busylooping.)
>
> I don't know any library except glibc use futex directly, or not.
> but we don't have any reason to assume futex is used only glibc.
> (plus, glibc might change its implementation in future release, perhaps)
I expect there are other implementations; but apparently none
that have yet reported this ZERO_PAGE behaviour as a problem.
...
>
> man page fix has another difficulty. in past days, zero pages was perfectly
> transparent from userland. then any man page don't describe "what's zero page".
> then some administrator don't know about zero page at all.
>
> So, if your main disliked point is goto statement, following patch is ok
> to me.
Although I'm as indecisive as Hamlet,
"goto or no goto, that is NOT the question":
I don't care, I think the goto in your original was fine.
>
> Finally, I agree this is really corner case issue. I can agree man page fix
> too. but I hope to know which point you dislike in my patch.
I just prefer not to have to add extra tests for the ZERO_PAGE
if we can get away without them.
Hugh
Hi
sorry very delayed responce, I've review futex code again.
> > Hm. probably we need to discuss more.
> >
> > Firstly, if we assume current glibc implimentation, you are right,
> > we can assume userland always initialize the page explicitly before using
> > futex. then we never seen zero page in futex.
> >
> > but, I don't think futex itself assume it now. at least man page
> > doesn't describe such limilation. so, if you prefer bail and man fix,
> > I'm acceptable maybe.
>
> Here's another worry with the current futex implementation,
> which might help me to decide which way to jump on the ZERO_PAGE.
>
> Originally, a futex on a MAP_PRIVATE (!VM_MAYSHARE) area was necessarily
> FUT_OFF_MMSHARED. Nowadays, with the get_user_pages_fast implementation,
> we have no idea whether the vma is VM_MAYSHARE or not. So if a futex is
> placed in a private area backed by a file, then it could be regarded as
> FUT_OFF_INODE at futex_wait() time, but FUT_OFF_MMSHARED at futex_wake()
> time.
very true!
> Perhaps that's no problem at all, it's a long time since I was involved
> with futexes, I think you and Peter will grasp the consequences quicker
> than I shall.
>
> But it seems no more improbable than the ZERO_PAGE case: some app
> might place its futexes in the .data section of the executable,
> which is a private mapping of the executable file.
>
> If this case is also an issue, then perhaps we just need to update
> the man page to say that whatever is responsible for initializing a
> futex does need to write to it (or the page it's in) before it's used,
> otherwise behaviour is undefined. (But we should then use the -EFAULT
> patch above, we'd all prefer an error to busylooping.)
I have a question. Why can't we use write mode get_user_pages_fast()?
I mean glibc always mekes write access before calling futex. it mean
write mode get_user_pages() doesn't mekes cow on practical usage.
Following patch is implemented such policy. What do you think?
>From c3e2dfdff84b9b720e646fd6dd3c38fff293c7e6 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <[email protected]>
Date: Tue, 5 Jan 2010 11:33:00 +0900
Subject: [PATCH] futex: remove rw parameter from get_futex_key()
Currently, futex have two problem.
A) current futex doesn't handle private file mappings properly.
get_futex_key() use PageAnon() to distinguish file and anon. it can
makes following bad scenario.
1) thread-A call futex(private-mapping, FUTEX_WAIT). it makes to
sleep on file mapping object.
2) thread-B write a variable and it makes cow.
3) thread-B call futex(private-mapping, FUTEX_WAKE). it wake up
sleeped thread on the anonymous page. (but it's nothing)
following testcase reproduce this issue.
actual result)
FUTEX_WAIT thread never wake up.
expect result)
FUTEX_WAIT thread wake up by FUTEX_WAKE.
--------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <syscall.h>
#include <unistd.h>
#include <errno.h>
#include <linux/futex.h>
#include <pthread.h>
char pad[4096] = {1};
int val = 1;
char pad2[4096] = {1};
void * futex_wait(void *arg)
{
int ret;
fprintf(stderr, "futex wait\n");
ret = syscall( SYS_futex, &val, FUTEX_WAIT, 1, NULL, NULL, NULL);
if (ret != 0 && errno != EWOULDBLOCK) {
perror("futex error.\n");
exit(1);
}
fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
return NULL;
}
int main(int argc, char **argv)
{
pthread_t thr;
int ret;
ret = pthread_create(&thr, NULL, futex_wait, NULL);
if (ret < 0) {
fprintf(stderr, "pthread_create error\n");
exit(1);
}
sleep(10);
fprintf(stderr, "futex wake\n");
val = 2;
ret = syscall( SYS_futex, &val, FUTEX_WAKE, 1, NULL, NULL, NULL);
fprintf(stderr, "futex_wake: ret = %d, errno = %d\n", ret, errno);
fprintf(stderr, "join\n");
pthread_join(thr, NULL);
return 0;
}
--------------------------------------------------------------------
B) current futex doesn't handle zero page properly.
read mode get_user_pages() can return zero page. but current futex code doesn't
handle it at all. Then, zero page makes infinite loop internally.
following testcase can reproduce this issue.
actual result)
FUTEX_WAIT never return and waste cpu time 100%.
expected result)
FUTEX_WAIT return immediately.
------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <linux/futex.h>
#include <pthread.h>
int main(int argc, char **argv)
{
long page_size;
int ret;
void *buf;
page_size = sysconf(_SC_PAGESIZE);
buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
if (buf == (void *)-1) {
perror("mmap error.\n");
exit(1);
}
fprintf(stderr, "futex wait\n");
ret = syscall( SYS_futex, buf, FUTEX_WAIT, 1, NULL, NULL, NULL);
if (ret != 0 && errno != EWOULDBLOCK) {
perror("futex error.\n");
exit(1);
}
fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
return 0;
}
------------------------------------------------------------------
The solution is to use write mode get_user_page() always for page lookup.
It prevent to lookup both file page of private mappings and zero page.
performance concern:
Probaly very little. because glibc always initialize variables for futex
before to call futex(). It mean glibc user never seen the overhead of this
patch.
compatibility concern:
This patch have few compatibility break. After this patch, FUTEX_WAIT require
writable access to futex variables (read-only mappings makes EFAULT).
But practically it's no problem. again glibc always initalize variables for futex
explicitly. nobody use read-only mappings.
Reported-by: Hugh Dickins <[email protected]>
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
kernel/futex.c | 27 ++++++++++++---------------
1 files changed, 12 insertions(+), 15 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 8e3c3ff..d9b3a22 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
- * @rw: mapping needs to be read/write (values: VERIFY_READ,
- * VERIFY_WRITE)
*
* Returns a negative error code or 0
* The key words are stored in *key on success.
@@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
* lock_page() might sleep, the caller should not hold a spinlock.
*/
static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+ if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
return -EFAULT;
key->private.mm = mm;
key->private.address = address;
@@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
}
again:
- err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
+ err = get_user_pages_fast(address, 1, 1, &page);
if (err < 0)
return err;
@@ -867,7 +865,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -913,10 +911,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
int ret, op_ret;
retry:
- ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -1175,11 +1173,10 @@ retry:
pi_state = NULL;
}
- ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2,
- requeue_pi ? VERIFY_WRITE : VERIFY_READ);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -1738,7 +1735,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
*/
retry:
q->key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
+ ret = get_futex_key(uaddr, fshared, &q->key);
if (unlikely(ret != 0))
return ret;
@@ -1904,7 +1901,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
q.requeue_pi_key = NULL;
retry:
q.key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
+ ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out;
@@ -2023,7 +2020,7 @@ retry:
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
return -EPERM;
- ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -2215,7 +2212,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
rt_waiter.task = NULL;
key2 = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out;
--
1.6.5.2
On Tue, 5 Jan 2010, KOSAKI Motohiro wrote:
> > > Hm. probably we need to discuss more.
> > >
> > > Firstly, if we assume current glibc implimentation, you are right,
> > > we can assume userland always initialize the page explicitly before using
> > > futex. then we never seen zero page in futex.
> > >
> > > but, I don't think futex itself assume it now. at least man page
> > > doesn't describe such limilation. so, if you prefer bail and man fix,
> > > I'm acceptable maybe.
> >
> > Here's another worry with the current futex implementation,
> > which might help me to decide which way to jump on the ZERO_PAGE.
> >
> > Originally, a futex on a MAP_PRIVATE (!VM_MAYSHARE) area was necessarily
> > FUT_OFF_MMSHARED. Nowadays, with the get_user_pages_fast implementation,
> > we have no idea whether the vma is VM_MAYSHARE or not. So if a futex is
> > placed in a private area backed by a file, then it could be regarded as
> > FUT_OFF_INODE at futex_wait() time, but FUT_OFF_MMSHARED at futex_wake()
> > time.
>
> very true!
>
>
> > Perhaps that's no problem at all, it's a long time since I was involved
> > with futexes, I think you and Peter will grasp the consequences quicker
> > than I shall.
> >
> > But it seems no more improbable than the ZERO_PAGE case: some app
> > might place its futexes in the .data section of the executable,
> > which is a private mapping of the executable file.
> >
> > If this case is also an issue, then perhaps we just need to update
> > the man page to say that whatever is responsible for initializing a
> > futex does need to write to it (or the page it's in) before it's used,
> > otherwise behaviour is undefined. (But we should then use the -EFAULT
> > patch above, we'd all prefer an error to busylooping.)
>
> I have a question. Why can't we use write mode get_user_pages_fast()?
> I mean glibc always mekes write access before calling futex. it mean
> write mode get_user_pages() doesn't mekes cow on practical usage.
>
> Following patch is implemented such policy. What do you think?
It crossed my mind to do it that way too - honest!
I pushed it away thinking that some app may sometimes be using
mprotect PROT_READ or PROT_NONE over these areas, for one reason
or another - perhaps debugging or self-monitoring.
But I've no experience of futex use at all: if futexperts think it's
reasonable always to get_futex_key() for writing, then that is much
the neatest way to deal with both zero page and private file cases.
Over to the experts...
Hugh
>
> From c3e2dfdff84b9b720e646fd6dd3c38fff293c7e6 Mon Sep 17 00:00:00 2001
> From: KOSAKI Motohiro <[email protected]>
> Date: Tue, 5 Jan 2010 11:33:00 +0900
> Subject: [PATCH] futex: remove rw parameter from get_futex_key()
>
> Currently, futex have two problem.
>
> A) current futex doesn't handle private file mappings properly.
>
> get_futex_key() use PageAnon() to distinguish file and anon. it can
> makes following bad scenario.
>
> 1) thread-A call futex(private-mapping, FUTEX_WAIT). it makes to
> sleep on file mapping object.
> 2) thread-B write a variable and it makes cow.
> 3) thread-B call futex(private-mapping, FUTEX_WAKE). it wake up
> sleeped thread on the anonymous page. (but it's nothing)
>
> following testcase reproduce this issue.
>
> actual result)
> FUTEX_WAIT thread never wake up.
>
> expect result)
> FUTEX_WAIT thread wake up by FUTEX_WAKE.
>
> --------------------------------------------------------------------
> #include <stdio.h>
> #include <stdlib.h>
> #include <syscall.h>
> #include <unistd.h>
> #include <errno.h>
> #include <linux/futex.h>
> #include <pthread.h>
>
> char pad[4096] = {1};
> int val = 1;
> char pad2[4096] = {1};
>
> void * futex_wait(void *arg)
> {
> int ret;
>
> fprintf(stderr, "futex wait\n");
> ret = syscall( SYS_futex, &val, FUTEX_WAIT, 1, NULL, NULL, NULL);
> if (ret != 0 && errno != EWOULDBLOCK) {
> perror("futex error.\n");
> exit(1);
> }
> fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
>
> return NULL;
> }
>
> int main(int argc, char **argv)
> {
> pthread_t thr;
> int ret;
>
> ret = pthread_create(&thr, NULL, futex_wait, NULL);
> if (ret < 0) {
> fprintf(stderr, "pthread_create error\n");
> exit(1);
> }
>
> sleep(10);
>
> fprintf(stderr, "futex wake\n");
> val = 2;
> ret = syscall( SYS_futex, &val, FUTEX_WAKE, 1, NULL, NULL, NULL);
> fprintf(stderr, "futex_wake: ret = %d, errno = %d\n", ret, errno);
>
> fprintf(stderr, "join\n");
> pthread_join(thr, NULL);
>
> return 0;
> }
> --------------------------------------------------------------------
>
> B) current futex doesn't handle zero page properly.
>
> read mode get_user_pages() can return zero page. but current futex code doesn't
> handle it at all. Then, zero page makes infinite loop internally.
>
> following testcase can reproduce this issue.
>
> actual result)
> FUTEX_WAIT never return and waste cpu time 100%.
>
> expected result)
> FUTEX_WAIT return immediately.
>
> ------------------------------------------------------------------
> #include <stdio.h>
> #include <stdlib.h>
> #include <sys/mman.h>
> #include <syscall.h>
> #include <sys/time.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <unistd.h>
> #include <errno.h>
> #include <linux/futex.h>
> #include <pthread.h>
>
> int main(int argc, char **argv)
> {
> long page_size;
> int ret;
> void *buf;
>
> page_size = sysconf(_SC_PAGESIZE);
>
> buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
> if (buf == (void *)-1) {
> perror("mmap error.\n");
> exit(1);
> }
>
> fprintf(stderr, "futex wait\n");
> ret = syscall( SYS_futex, buf, FUTEX_WAIT, 1, NULL, NULL, NULL);
> if (ret != 0 && errno != EWOULDBLOCK) {
> perror("futex error.\n");
> exit(1);
> }
> fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
>
> return 0;
> }
> ------------------------------------------------------------------
>
> The solution is to use write mode get_user_page() always for page lookup.
> It prevent to lookup both file page of private mappings and zero page.
>
> performance concern:
>
> Probaly very little. because glibc always initialize variables for futex
> before to call futex(). It mean glibc user never seen the overhead of this
> patch.
>
> compatibility concern:
>
> This patch have few compatibility break. After this patch, FUTEX_WAIT require
> writable access to futex variables (read-only mappings makes EFAULT).
> But practically it's no problem. again glibc always initalize variables for futex
> explicitly. nobody use read-only mappings.
>
> Reported-by: Hugh Dickins <[email protected]>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
> ---
> kernel/futex.c | 27 ++++++++++++---------------
> 1 files changed, 12 insertions(+), 15 deletions(-)
>
> diff --git a/kernel/futex.c b/kernel/futex.c
> index 8e3c3ff..d9b3a22 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
> * @uaddr: virtual address of the futex
> * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
> * @key: address where result is stored.
> - * @rw: mapping needs to be read/write (values: VERIFY_READ,
> - * VERIFY_WRITE)
> *
> * Returns a negative error code or 0
> * The key words are stored in *key on success.
> @@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
> * lock_page() might sleep, the caller should not hold a spinlock.
> */
> static int
> -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
> +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
> {
> unsigned long address = (unsigned long)uaddr;
> struct mm_struct *mm = current->mm;
> @@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
> * but access_ok() should be faster than find_vma()
> */
> if (!fshared) {
> - if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
> + if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
> return -EFAULT;
> key->private.mm = mm;
> key->private.address = address;
> @@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
> }
>
> again:
> - err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
> + err = get_user_pages_fast(address, 1, 1, &page);
> if (err < 0)
> return err;
>
> @@ -867,7 +865,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
> if (!bitset)
> return -EINVAL;
>
> - ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
> + ret = get_futex_key(uaddr, fshared, &key);
> if (unlikely(ret != 0))
> goto out;
>
> @@ -913,10 +911,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
> int ret, op_ret;
>
> retry:
> - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
> + ret = get_futex_key(uaddr1, fshared, &key1);
> if (unlikely(ret != 0))
> goto out;
> - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
> + ret = get_futex_key(uaddr2, fshared, &key2);
> if (unlikely(ret != 0))
> goto out_put_key1;
>
> @@ -1175,11 +1173,10 @@ retry:
> pi_state = NULL;
> }
>
> - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
> + ret = get_futex_key(uaddr1, fshared, &key1);
> if (unlikely(ret != 0))
> goto out;
> - ret = get_futex_key(uaddr2, fshared, &key2,
> - requeue_pi ? VERIFY_WRITE : VERIFY_READ);
> + ret = get_futex_key(uaddr2, fshared, &key2);
> if (unlikely(ret != 0))
> goto out_put_key1;
>
> @@ -1738,7 +1735,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
> */
> retry:
> q->key = FUTEX_KEY_INIT;
> - ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
> + ret = get_futex_key(uaddr, fshared, &q->key);
> if (unlikely(ret != 0))
> return ret;
>
> @@ -1904,7 +1901,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
> q.requeue_pi_key = NULL;
> retry:
> q.key = FUTEX_KEY_INIT;
> - ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
> + ret = get_futex_key(uaddr, fshared, &q.key);
> if (unlikely(ret != 0))
> goto out;
>
> @@ -2023,7 +2020,7 @@ retry:
> if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
> return -EPERM;
>
> - ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
> + ret = get_futex_key(uaddr, fshared, &key);
> if (unlikely(ret != 0))
> goto out;
>
> @@ -2215,7 +2212,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
> rt_waiter.task = NULL;
>
> key2 = FUTEX_KEY_INIT;
> - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
> + ret = get_futex_key(uaddr2, fshared, &key2);
> if (unlikely(ret != 0))
> goto out;
>
> --
> 1.6.5.2
KOSAKI Motohiro wrote:
> I don't know any library except glibc use futex directly, or not.
> but we don't have any reason to assume futex is used only glibc.
> (plus, glibc might change its implementation in future release, perhaps)
I only know of 3 off-hand. The checkpoint restart guys have test suites
that use the futex syscalls. The futextest test suite obviously does.
There is also the userspace RCU library that Mathieu Desnoyers has been
workingon. None of these make use off the zero page as far as I know.
However, once we sort this out, I shall have to add such a test to
futextest, perhaps something similar to the test in this thread.
--
Darren Hart
IBM Linux Technology Center
Real-Time Linux Team
Hugh Dickins wrote:
> On Tue, 5 Jan 2010, KOSAKI Motohiro wrote:
>> From c3e2dfdff84b9b720e646fd6dd3c38fff293c7e6 Mon Sep 17 00:00:00 2001
>> From: KOSAKI Motohiro <[email protected]>
>> Date: Tue, 5 Jan 2010 11:33:00 +0900
>> Subject: [PATCH] futex: remove rw parameter from get_futex_key()
>>
>> Currently, futex have two problem.
>>
>> A) current futex doesn't handle private file mappings properly.
>>
>> get_futex_key() use PageAnon() to distinguish file and anon. it can
>> makes following bad scenario.
>>
>> 1) thread-A call futex(private-mapping, FUTEX_WAIT). it makes to
>> sleep on file mapping object.
>> 2) thread-B write a variable and it makes cow.
>> 3) thread-B call futex(private-mapping, FUTEX_WAKE). it wake up
>> sleeped thread on the anonymous page. (but it's nothing)
>>
Excellent test case, thank you! Would you consider preparing a patch to
futextest?
http://git.kernel.org/?p=linux/kernel/git/dvhart/futextest.git;a=summary
I did some experimentation here and found that:
o The test works if the *_PRIVATE op codes are used.
This is because the futex keys are generated using only the virtual
address of the page, which doesn't change on a COW.
o If the waiter writes to the val first, it works.
This forces the COW before the waiter generates it's futex key.
So the waiter's key is generated based on the page cache page address
for shared futexes when the value hasn't been written to prior to wait.
The only scenario where I could think of wanting this behavior is if
another process were to try and wake the waiter via the same file backed
page. However, as I understand it, the re-use of the same page for
unwritten-to private pages is an optimization and can't be relied upon.
So this scenario is out. Another would be to use the futex as a very
simple wait queue where the value is never changed. In this case
however, the implementation is racy as the value check is effectively
negated, so this use case is also out.
As such, I see no reason not to always use VERIFY_WRITE and force a COW
prior to generating the futex_key for shared futexes. It is not
necessary for private futexes however as they use only the virtual address.
I am not sure on whether or not it makes sense to avoid the VERIFY_WRITE
on the private futexes. Could be it is just more code for negligible
benefit. Thoughts?
--
Darren Hart
IBM Linux Technology Center
Real-Time Linux Team
> Hugh Dickins wrote:
> > On Tue, 5 Jan 2010, KOSAKI Motohiro wrote:
>
> >> From c3e2dfdff84b9b720e646fd6dd3c38fff293c7e6 Mon Sep 17 00:00:00 2001
> >> From: KOSAKI Motohiro <[email protected]>
> >> Date: Tue, 5 Jan 2010 11:33:00 +0900
> >> Subject: [PATCH] futex: remove rw parameter from get_futex_key()
> >>
> >> Currently, futex have two problem.
> >>
> >> A) current futex doesn't handle private file mappings properly.
> >>
> >> get_futex_key() use PageAnon() to distinguish file and anon. it can
> >> makes following bad scenario.
> >>
> >> 1) thread-A call futex(private-mapping, FUTEX_WAIT). it makes to
> >> sleep on file mapping object.
> >> 2) thread-B write a variable and it makes cow.
> >> 3) thread-B call futex(private-mapping, FUTEX_WAKE). it wake up
> >> sleeped thread on the anonymous page. (but it's nothing)
> >>
>
> Excellent test case, thank you! Would you consider preparing a patch to
> futextest?
>
> http://git.kernel.org/?p=linux/kernel/git/dvhart/futextest.git;a=summary
Patch attached. you can feel free any modify such file. thanks.
> I did some experimentation here and found that:
>
> o The test works if the *_PRIVATE op codes are used.
> This is because the futex keys are generated using only the virtual
> address of the page, which doesn't change on a COW.
> o If the waiter writes to the val first, it works.
> This forces the COW before the waiter generates it's futex key.
True.
> So the waiter's key is generated based on the page cache page address
> for shared futexes when the value hasn't been written to prior to wait.
>
> The only scenario where I could think of wanting this behavior is if
> another process were to try and wake the waiter via the same file backed
> page. However, as I understand it, the re-use of the same page for
> unwritten-to private pages is an optimization and can't be relied upon.
> So this scenario is out. Another would be to use the futex as a very
> simple wait queue where the value is never changed. In this case
> however, the implementation is racy as the value check is effectively
> negated, so this use case is also out.
>
> As such, I see no reason not to always use VERIFY_WRITE and force a COW
> prior to generating the futex_key for shared futexes. It is not
> necessary for private futexes however as they use only the virtual address.
>
> I am not sure on whether or not it makes sense to avoid the VERIFY_WRITE
> on the private futexes. Could be it is just more code for negligible
> benefit. Thoughts?
I think it's no problem. because,
as performance view:
access_ok() of almost arch (included x86) ignore VERIFY_WRITE.
then, this change doesn't cause performance loss of course.
(current futex mainly handle ro-mapping problem by fault_in_user_writeable)
as consistency view:
This patch have better consistency without FUTEX_PRIVATE_FLAG case.
as usability view:
Nobody want to use ro-mappings for private futex. it's obviously meaningless
and useless.
KOSAKI Motohiro wrote:
>> Hugh Dickins wrote:
>>> On Tue, 5 Jan 2010, KOSAKI Motohiro wrote:
>>>> From c3e2dfdff84b9b720e646fd6dd3c38fff293c7e6 Mon Sep 17 00:00:00 2001
>>>> From: KOSAKI Motohiro <[email protected]>
>>>> Date: Tue, 5 Jan 2010 11:33:00 +0900
>>>> Subject: [PATCH] futex: remove rw parameter from get_futex_key()
>>>>
>>>> Currently, futex have two problem.
>>>>
>>>> A) current futex doesn't handle private file mappings properly.
>>>>
>>>> get_futex_key() use PageAnon() to distinguish file and anon. it can
>>>> makes following bad scenario.
>>>>
>>>> 1) thread-A call futex(private-mapping, FUTEX_WAIT). it makes to
>>>> sleep on file mapping object.
>>>> 2) thread-B write a variable and it makes cow.
>>>> 3) thread-B call futex(private-mapping, FUTEX_WAKE). it wake up
>>>> sleeped thread on the anonymous page. (but it's nothing)
>>>>
>> Excellent test case, thank you! Would you consider preparing a patch to
>> futextest?
>>
>> http://git.kernel.org/?p=linux/kernel/git/dvhart/futextest.git;a=summary
>
> Patch attached. you can feel free any modify such file. thanks.
Kosaki,
Thank you for the patch! I've merged your patch into futextest and added
some logic to detect failure and not hang the box (so the test can be
run unattended).
Thanks,
--
Darren Hart
IBM Linux Technology Center
Real-Time Linux Team
KOSAKI Motohiro wrote:
Hi Kosaki,
Some test results below. Things look good, you have my Ack.
> From c3e2dfdff84b9b720e646fd6dd3c38fff293c7e6 Mon Sep 17 00:00:00 2001
> From: KOSAKI Motohiro <[email protected]>
> Date: Tue, 5 Jan 2010 11:33:00 +0900
> Subject: [PATCH] futex: remove rw parameter from get_futex_key()
>
> Currently, futex have two problem.
>
> A) current futex doesn't handle private file mappings properly.
>
> get_futex_key() use PageAnon() to distinguish file and anon. it can
> makes following bad scenario.
>
> 1) thread-A call futex(private-mapping, FUTEX_WAIT). it makes to
> sleep on file mapping object.
> 2) thread-B write a variable and it makes cow.
> 3) thread-B call futex(private-mapping, FUTEX_WAKE). it wake up
> sleeped thread on the anonymous page. (but it's nothing)
>
> following testcase reproduce this issue.
>
> actual result)
> FUTEX_WAIT thread never wake up.
>
> expect result)
> FUTEX_WAIT thread wake up by FUTEX_WAKE.
>
<snipped test source>
>
> B) current futex doesn't handle zero page properly.
>
> read mode get_user_pages() can return zero page. but current futex code doesn't
> handle it at all. Then, zero page makes infinite loop internally.
>
> following testcase can reproduce this issue.
>
> actual result)
> FUTEX_WAIT never return and waste cpu time 100%.
>
> expected result)
> FUTEX_WAIT return immediately.
>
<snipped test source>
> The solution is to use write mode get_user_page() always for page lookup.
> It prevent to lookup both file page of private mappings and zero page.
>
> performance concern:
>
> Probaly very little. because glibc always initialize variables for futex
> before to call futex(). It mean glibc user never seen the overhead of this
> patch.
>
> compatibility concern:
>
> This patch have few compatibility break. After this patch, FUTEX_WAIT require
> writable access to futex variables (read-only mappings makes EFAULT).
> But practically it's no problem. again glibc always initalize variables for futex
> explicitly. nobody use read-only mappings.
>
> Reported-by: Hugh Dickins <[email protected]>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
The tests have been added to futextest. In addition to confirming these
tests, I compared the performance/futex_wait test results before and
after this patch and found no significant delta.
Threads 2.6.33-rc3 2.6.33-rc3-patched
------------------------------------------
1 42373 42373
2 24938 24450
3 5931 7163
4 4715 4480
5 4593 4227
6 4993 4608
8 4448 5294
10 4778 5149
12 4836 5079
16 4476 3828
24 4188 4314
32 4380 4384
64 4598 4764
128 4429 5102
256 5081 4462
521 4854 4444
1024 4933 4272
------------------------------------------
System: 4xAMD Opteron 270 @ 2GHz
Units: Thousands of iterations per second
Ingo and Thomas, please consider applying this patch. Should this also
be sent to Stable?
Acked-by: Darren Hart <[email protected]>
> ---
> kernel/futex.c | 27 ++++++++++++---------------
> 1 files changed, 12 insertions(+), 15 deletions(-)
>
> diff --git a/kernel/futex.c b/kernel/futex.c
> index 8e3c3ff..d9b3a22 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
> * @uaddr: virtual address of the futex
> * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
> * @key: address where result is stored.
> - * @rw: mapping needs to be read/write (values: VERIFY_READ,
> - * VERIFY_WRITE)
> *
> * Returns a negative error code or 0
> * The key words are stored in *key on success.
> @@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
> * lock_page() might sleep, the caller should not hold a spinlock.
> */
> static int
> -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
> +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
> {
> unsigned long address = (unsigned long)uaddr;
> struct mm_struct *mm = current->mm;
> @@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
> * but access_ok() should be faster than find_vma()
> */
> if (!fshared) {
> - if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
> + if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
> return -EFAULT;
> key->private.mm = mm;
> key->private.address = address;
> @@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
> }
>
> again:
> - err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
> + err = get_user_pages_fast(address, 1, 1, &page);
> if (err < 0)
> return err;
>
> @@ -867,7 +865,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
> if (!bitset)
> return -EINVAL;
>
> - ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
> + ret = get_futex_key(uaddr, fshared, &key);
> if (unlikely(ret != 0))
> goto out;
>
> @@ -913,10 +911,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
> int ret, op_ret;
>
> retry:
> - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
> + ret = get_futex_key(uaddr1, fshared, &key1);
> if (unlikely(ret != 0))
> goto out;
> - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
> + ret = get_futex_key(uaddr2, fshared, &key2);
> if (unlikely(ret != 0))
> goto out_put_key1;
>
> @@ -1175,11 +1173,10 @@ retry:
> pi_state = NULL;
> }
>
> - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
> + ret = get_futex_key(uaddr1, fshared, &key1);
> if (unlikely(ret != 0))
> goto out;
> - ret = get_futex_key(uaddr2, fshared, &key2,
> - requeue_pi ? VERIFY_WRITE : VERIFY_READ);
> + ret = get_futex_key(uaddr2, fshared, &key2);
> if (unlikely(ret != 0))
> goto out_put_key1;
>
> @@ -1738,7 +1735,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
> */
> retry:
> q->key = FUTEX_KEY_INIT;
> - ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
> + ret = get_futex_key(uaddr, fshared, &q->key);
> if (unlikely(ret != 0))
> return ret;
>
> @@ -1904,7 +1901,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
> q.requeue_pi_key = NULL;
> retry:
> q.key = FUTEX_KEY_INIT;
> - ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
> + ret = get_futex_key(uaddr, fshared, &q.key);
> if (unlikely(ret != 0))
> goto out;
>
> @@ -2023,7 +2020,7 @@ retry:
> if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
> return -EPERM;
>
> - ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
> + ret = get_futex_key(uaddr, fshared, &key);
> if (unlikely(ret != 0))
> goto out;
>
> @@ -2215,7 +2212,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
> rt_waiter.task = NULL;
>
> key2 = FUTEX_KEY_INIT;
> - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
> + ret = get_futex_key(uaddr2, fshared, &key2);
> if (unlikely(ret != 0))
> goto out;
>
--
Darren Hart
IBM Linux Technology Center
Real-Time Linux Team
CC to mips folks.
> If something like this or your replacment does go forward,
> then I think that test is better inside the "if (!page->mapping)"
> below. Admittedly that adds even more mm-dependence here (relying
> on a zero page to have NULL page->mapping); but isn't page_to_pfn()
> one of those functions which is trivial on many configs but expensive
> on some? Better call it only in the rare case that it's needed.
>
> Though wouldn't it be even better not to use is_zero_pfn() at all?
> That was convenient in mm/memory.c because it had the pfn or pte right
> at hand, but here a traditional (page == ZERO_PAGE(address)) would be
> more efficient.
>
> Which would save having to move is_zero_pfn() from mm/memory.c
> to include/linux/mm.h - I'd prefer to keep it private if we can.
> But for completeness, this would involve resurrecting the 2.6.19
> MIPS move_pte(), which makes sure mremap() move doesn't interfere
> with our assumptions. Something like
>
> #define __HAVE_ARCH_MOVE_PTE
> pte_t move_pte(pte_t pte, pgprot_t prot, unsigned long old_addr,
> unsigned long new_addr)
> {
> if (pte_present(pte) && is_zero_pfn(pte_pfn(pte)))
> pte = mk_pte(ZERO_PAGE(new_addr), prot);
> return pte;
> }
>
> in arch/mips/include/asm/pgtable.h.
I agree with resurrecting mips move_pte. At least your patch
passed my cross compile test :)
Ralf, can you please review following patch?
======================================================
Subject: [PATCH] mips,mm: Reinstate move_pte optimization
From: Hugh Dickins <[email protected]>
About three years ago, we removed mips specific move_pte by commit
701dfbc1cb (mm: mremap correct rmap accounting). because it is only
small optimization and it has bug.
However new zero-page thing doesn't have such problem and behavior
consistency of mremap have worth a bit.
This patch reinstate it.
Signed-off-by: Hugh Dickins <[email protected]>
Signed-off-by: KOSAKI Motohiro <[email protected]>
Cc: Ralf Baechle <[email protected]>
Cc: [email protected]
Cc: Nick Piggin <[email protected]>
Cc: Carsten Otte <[email protected]>
---
arch/mips/include/asm/pgtable.h | 8 ++++++++
1 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 1854336..6ad2f73 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -387,6 +387,14 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
remap_pfn_range(vma, vaddr, pfn, size, prot)
#endif
+#define __HAVE_ARCH_MOVE_PTE
+pte_t move_pte(pte_t pte, pgprot_t prot, unsigned long old_addr, unsigned long new_addr)
+{
+ if (pte_present(pte) && is_zero_pfn(pte_pfn(pte)))
+ pte = mk_pte(ZERO_PAGE(new_addr), prot);
+ return pte;
+}
+
#include <asm-generic/pgtable.h>
/*
--
1.6.5.2
On Thu, Jan 07, 2010 at 03:32:57PM +0900, KOSAKI Motohiro wrote:
> > to include/linux/mm.h - I'd prefer to keep it private if we can.
> > But for completeness, this would involve resurrecting the 2.6.19
> > MIPS move_pte(), which makes sure mremap() move doesn't interfere
> > with our assumptions. Something like
> >
> > #define __HAVE_ARCH_MOVE_PTE
> > pte_t move_pte(pte_t pte, pgprot_t prot, unsigned long old_addr,
> > unsigned long new_addr)
> > {
> > if (pte_present(pte) && is_zero_pfn(pte_pfn(pte)))
> > pte = mk_pte(ZERO_PAGE(new_addr), prot);
> > return pte;
> > }
> >
> > in arch/mips/include/asm/pgtable.h.
>
> I agree with resurrecting mips move_pte. At least your patch
> passed my cross compile test :)
>
> Ralf, can you please review following patch?
Looks good.
Acked-by: Ralf Baechle <[email protected]>
Thanks,
Ralf
Commit-ID: 7485d0d3758e8e6491a5c9468114e74dc050785d
Gitweb: http://git.kernel.org/tip/7485d0d3758e8e6491a5c9468114e74dc050785d
Author: KOSAKI Motohiro <[email protected]>
AuthorDate: Tue, 5 Jan 2010 16:32:43 +0900
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 13 Jan 2010 09:17:36 +0100
futexes: Remove rw parameter from get_futex_key()
Currently, futexes have two problem:
A) The current futex code doesn't handle private file mappings properly.
get_futex_key() uses PageAnon() to distinguish file and
anon, which can cause the following bad scenario:
1) thread-A call futex(private-mapping, FUTEX_WAIT), it
sleeps on file mapping object.
2) thread-B writes a variable and it makes it cow.
3) thread-B calls futex(private-mapping, FUTEX_WAKE), it
wakes up blocked thread on the anonymous page. (but it's nothing)
B) Current futex code doesn't handle zero page properly.
Read mode get_user_pages() can return zero page, but current
futex code doesn't handle it at all. Then, zero page makes
infinite loop internally.
The solution is to use write mode get_user_page() always for
page lookup. It prevents the lookup of both file page of private
mappings and zero page.
Performance concerns:
Probaly very little, because glibc always initialize variables
for futex before to call futex(). It means glibc users never see
the overhead of this patch.
Compatibility concerns:
This patch has few compatibility issues. After this patch,
FUTEX_WAIT require writable access to futex variables (read-only
mappings makes EFAULT). But practically it's not a problem,
glibc always initalizes variables for futexes explicitly - nobody
uses read-only mappings.
Reported-by: Hugh Dickins <[email protected]>
Signed-off-by: KOSAKI Motohiro <[email protected]>
Acked-by: Peter Zijlstra <[email protected]>
Acked-by: Darren Hart <[email protected]>
Cc: <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: KAMEZAWA Hiroyuki <[email protected]>
Cc: Nick Piggin <[email protected]>
Cc: Ulrich Drepper <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/futex.c | 27 ++++++++++++---------------
1 files changed, 12 insertions(+), 15 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 8e3c3ff..d9b3a22 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
- * @rw: mapping needs to be read/write (values: VERIFY_READ,
- * VERIFY_WRITE)
*
* Returns a negative error code or 0
* The key words are stored in *key on success.
@@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
* lock_page() might sleep, the caller should not hold a spinlock.
*/
static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+ if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
return -EFAULT;
key->private.mm = mm;
key->private.address = address;
@@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
}
again:
- err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
+ err = get_user_pages_fast(address, 1, 1, &page);
if (err < 0)
return err;
@@ -867,7 +865,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -913,10 +911,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
int ret, op_ret;
retry:
- ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -1175,11 +1173,10 @@ retry:
pi_state = NULL;
}
- ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2,
- requeue_pi ? VERIFY_WRITE : VERIFY_READ);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -1738,7 +1735,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
*/
retry:
q->key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
+ ret = get_futex_key(uaddr, fshared, &q->key);
if (unlikely(ret != 0))
return ret;
@@ -1904,7 +1901,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
q.requeue_pi_key = NULL;
retry:
q.key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
+ ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out;
@@ -2023,7 +2020,7 @@ retry:
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
return -EPERM;
- ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -2215,7 +2212,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
rt_waiter.task = NULL;
key2 = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out;