Hi Andrew, Ingo,
Ingo was (rightfully) concerned about the amount of memory
which could be pinned using FUTEX_FD. This solution doesn't keep
pages pinned any more: it hashes on mapping + offset, or for anonymous
pages, uses a callback to move them out and back into the hash table
as they are swapped out and in.
Searching the (256-bucket) hash table to find potential pages
is fairly slow, but we're swapping anyway, so it's lost in the noise.
The hash table is usually fairly empty.
My only concern is for a race between swapping a page in and
it being accessed, but I think I have the callbacks in the right place
in the swap code: more VM-aware eyes welcome.
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Name: Futexes without pinning pages
Author: Rusty Russell
Status: Tested on 2.6.0-test4-bk2
Depends: Misc/futex-minor-tweaks.patch.gz
Depends: Misc/qemu-page-offset.patch.gz
D: Avoid pinning pages with futexes in them, to resolve FUTEX_FD DoS.
D: For file-backed mappings, change hash to use page->mapping and
D: page->index, which should uniquely identify each one. For
D: anonymous mappings, insert callbacks in swap code to unhash them
D: when they are swapped out and rehash them when they are swapped
D: back in.
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .24048-2.6.0-test4-bk2-futex-swap.pre/include/linux/futex.h .24048-2.6.0-test4-bk2-futex-swap/include/linux/futex.h
--- .24048-2.6.0-test4-bk2-futex-swap.pre/include/linux/futex.h 2003-05-27 15:02:21.000000000 +1000
+++ .24048-2.6.0-test4-bk2-futex-swap/include/linux/futex.h 2003-08-26 12:43:19.000000000 +1000
@@ -17,4 +17,7 @@ asmlinkage long sys_futex(u32 __user *ua
long do_futex(unsigned long uaddr, int op, int val,
unsigned long timeout, unsigned long uaddr2, int val2);
+/* For mm/page_io.c to tell us about swapping of (anonymous) pages. */
+extern void futex_swap_out(struct page *page);
+extern void futex_swap_in(struct page *page);
#endif
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .24048-2.6.0-test4-bk2-futex-swap.pre/kernel/futex.c .24048-2.6.0-test4-bk2-futex-swap/kernel/futex.c
--- .24048-2.6.0-test4-bk2-futex-swap.pre/kernel/futex.c 2003-08-26 12:43:18.000000000 +1000
+++ .24048-2.6.0-test4-bk2-futex-swap/kernel/futex.c 2003-08-26 12:43:19.000000000 +1000
@@ -52,14 +52,17 @@ struct futex_q {
/* the virtual => physical COW-safe cache */
vcache_t vcache;
+ /* When anonymous memory is swapped out, this stores the index. */
+ unsigned long swap_index;
+
/* For fd, sigio sent using these. */
int fd;
struct file *filp;
};
-/* The key for the hash is the address + index + offset within page */
static struct list_head futex_queues[1<<FUTEX_HASHBITS];
static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(futex_swapped);
extern void send_sigio(struct fown_struct *fown, int fd, int band);
@@ -84,13 +87,62 @@ static inline void unlock_futex_mm(void)
spin_unlock(&current->mm->page_table_lock);
}
-/*
- * The physical page is shared, so we can hash on its address:
+/* For pages which are file backed, we can simply hash by mapping and
+ * index. For anonymous regions, we hash by the actual struct page *,
+ * and move them in and out of the hash if they are swapped out.
*/
static inline struct list_head *hash_futex(struct page *page, int offset)
{
- return &futex_queues[hash_long((unsigned long)page + offset,
- FUTEX_HASHBITS)];
+ unsigned long hashin;
+ if (page->mapping)
+ hashin = (unsigned long)page->mapping + page->index;
+ else
+ hashin = (unsigned long)page;
+
+ return &futex_queues[hash_long(hashin+offset, FUTEX_HASHBITS)];
+}
+
+/* Called when we're going to swap this page out. */
+void futex_swap_out(struct page *page)
+{
+ unsigned int i;
+
+ /* It should have the mapping (== &swapper_space) and index
+ * set by now */
+ BUG_ON(!page->mapping);
+
+ spin_lock(&futex_lock);
+ for (i = 0; i < 1 << FUTEX_HASHBITS; i++) {
+ struct list_head *l, *next;
+ list_for_each_safe(l, next, &futex_queues[i]) {
+ struct futex_q *q = list_entry(l, struct futex_q,list);
+ if (q->page == page) {
+ list_del(&q->list);
+ q->swap_index = page->index;
+ q->page = NULL;
+ list_add(&q->list, &futex_swapped);
+ }
+ }
+ }
+ spin_unlock(&futex_lock);
+}
+
+/* Called when we're going to swap this page in. */
+void futex_swap_in(struct page *page)
+{
+ struct list_head *l, *next;
+
+ spin_lock(&futex_lock);
+ list_for_each_safe(l, next, &futex_swapped) {
+ struct futex_q *q = list_entry(l, struct futex_q, list);
+
+ if (q->swap_index == page->index) {
+ list_del(&q->list);
+ q->page = page;
+ list_add(&q->list, hash_futex(q->page, q->offset));
+ }
+ }
+ spin_unlock(&futex_lock);
}
/*
@@ -201,10 +253,8 @@ static void futex_vcache_callback(vcache
spin_lock(&futex_lock);
if (!list_empty(&q->list)) {
- put_page(q->page);
- q->page = new_page;
- __pin_page_atomic(new_page);
list_del(&q->list);
+ q->page = new_page;
list_add_tail(&q->list, head);
}
@@ -246,8 +296,6 @@ static inline int futex_requeue(unsigned
send_sigio(&this->filp->f_owner,
this->fd, POLL_IN);
} else {
- put_page(this->page);
- __pin_page_atomic (page2);
list_add_tail(i, head2);
__attach_vcache(&this->vcache, uaddr2,
current->mm, futex_vcache_callback);
@@ -331,18 +379,19 @@ static inline int futex_wait(unsigned lo
*/
if (get_user(curval, (int *)uaddr) != 0) {
ret = -EFAULT;
- goto unlock;
+ goto putpage;
}
if (curval != val) {
ret = -EWOULDBLOCK;
- goto unlock;
+ goto putpage;
}
__queue_me(&q, page, uaddr, offset, -1, NULL);
add_wait_queue(&q.waiters, &wait);
set_current_state(TASK_INTERRUPTIBLE);
unlock_futex_mm();
+ put_page(page);
time = schedule_timeout(time);
@@ -361,9 +410,10 @@ static inline int futex_wait(unsigned lo
else
ret = -EINTR;
- put_page(q.page);
return ret;
+putpage:
+ put_page(page);
unlock:
unlock_futex_mm();
return ret;
@@ -374,7 +424,6 @@ static int futex_close(struct inode *ino
struct futex_q *q = filp->private_data;
unqueue_me(q);
- put_page(q->page);
kfree(filp->private_data);
return 0;
}
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .24048-2.6.0-test4-bk2-futex-swap.pre/mm/page_io.c .24048-2.6.0-test4-bk2-futex-swap/mm/page_io.c
--- .24048-2.6.0-test4-bk2-futex-swap.pre/mm/page_io.c 2003-02-07 19:18:58.000000000 +1100
+++ .24048-2.6.0-test4-bk2-futex-swap/mm/page_io.c 2003-08-26 12:43:19.000000000 +1000
@@ -19,6 +19,7 @@
#include <linux/buffer_head.h> /* for block_sync_page() */
#include <linux/mpage.h>
#include <linux/writeback.h>
+#include <linux/futex.h>
#include <asm/pgtable.h>
static struct bio *
@@ -77,6 +78,7 @@ static int end_swap_bio_read(struct bio
ClearPageUptodate(page);
} else {
SetPageUptodate(page);
+ futex_swap_in(page);
}
unlock_page(page);
bio_put(bio);
@@ -105,6 +107,7 @@ int swap_writepage(struct page *page, st
}
inc_page_state(pswpout);
SetPageWriteback(page);
+ futex_swap_out(page);
unlock_page(page);
submit_bio(WRITE, bio);
out:
Rusty Russell <[email protected]> wrote:
>
> Hi Andrew, Ingo,
>
> Ingo was (rightfully) concerned about the amount of memory
> which could be pinned using FUTEX_FD. This solution doesn't keep
> pages pinned any more: it hashes on mapping + offset, or for anonymous
> pages, uses a callback to move them out and back into the hash table
> as they are swapped out and in.
I have a bad feeling about this. We shall see...
> Searching the (256-bucket) hash table to find potential pages
> is fairly slow, but we're swapping anyway, so it's lost in the noise.
> The hash table is usually fairly empty.
>
> My only concern is for a race between swapping a page in and
> it being accessed, but I think I have the callbacks in the right place
> in the swap code: more VM-aware eyes welcome.
Looks to be OK: as long as the page is locked you can't go far wrong.
But end_swap_bio_read() is called from interrupt context. Hence the
spinlock you have in there needs to become IRQ safe.
Two issues:
a) what to do about futexes in file-backed pages? At present the
attacker can pin an arbitrary amount of memory by backing it with a file.
Your solution won't scale to solving this, because we need to perform
a futex lookup on every add_to_page_cache(). (Well, it will scale
fairly well because add_to_page_cache() is ratelimited by the IO speed.
But it will still suck quite a bit for some people).
b) Assuming a) is solved: futexes which are backed by tmpfs. tmpfs
(which is a study in conceptual baroquenness) will rewrite page->mapping
as pages are moved between tmpfs inodes and swapper_space. I _think_
lock_page() is sufficient to serialise against this. If not, both
->page_locks are needed.
This of course will break your hashing scheme. Hooks in
move_to_swap_cache() and move_from_swap_cache() are needed.
So what to do? One option is to just not pin the pages at all: let them be
reclaimed and/or swapped out. Let the kernel fault them in. We have all
the stuff to do this over in fs/aio.c, with use_mm().
It does mean that the futex code would have to change its representation of
a futex's location from page/offset to mm/vaddr, and the futex code would
need to hook into the exit path, similar to exit_aio().
Or create RLIM_NRFUTEX.
On Mon, 25 Aug 2003, Andrew Morton wrote:
> Two issues:
>
> a) what to do about futexes in file-backed pages? At present the
> attacker can pin an arbitrary amount of memory by backing it with a file.
>
> Your solution won't scale to solving this, because we need to perform
> a futex lookup on every add_to_page_cache(). (Well, it will scale
> fairly well because add_to_page_cache() is ratelimited by the IO speed.
> But it will still suck quite a bit for some people).
add_to_page_cache() can be fairly high-frequency when the pagecache is
created anew (write) and torn down immediately afterwards (temporary
files). So i dont think it's 100% ratelimited by IO speed.
the cleanest thing would be to do the hashtable registration with every
futex existing in the system. But we might get away with doing it only for
FUTEX_FD 'asynchronous' futexes (which are only limited by the # of open
files, per process), and do the usual page pinning (and thus fast) thing
with the 'synchronous' futexes (which are limited to one per context, so
they are resource-limited automatically). As long as the hashtable is
empty, the lookup ought to be really fast.
> Or create RLIM_NRFUTEX.
futexes do have some small lowmem footprint nevertheless (they have a
queue entry structure, etc.), but much less than PAGE_SIZE. So basically
the overhead can be added to the already existing fd footprint, without
changing the balance too much.
but if all futexes pin down one page (worst-case), then to make it really
safe we'll have to use a fairly low default RLIM_NRFUTEX value - which
will decrease the generic utility of futexes.
Ingo
Ingo Molnar <[email protected]> wrote:
>
> but if all futexes pin down one page (worst-case), then to make it really
> safe we'll have to use a fairly low default RLIM_NRFUTEX value - which
> will decrease the generic utility of futexes.
We could make it RLIM_NRFUTEX_PAGES: the number of pages which the
user can pin via futexes, perhaps.
On Mon, 25 Aug 2003, Andrew Morton wrote:
> > but if all futexes pin down one page (worst-case), then to make it really
> > safe we'll have to use a fairly low default RLIM_NRFUTEX value - which
> > will decrease the generic utility of futexes.
>
> We could make it RLIM_NRFUTEX_PAGES: the number of pages which the user
> can pin via futexes, perhaps.
the problem is that this is not really a deterministic limit. The nr of
threads or open files limit is deterministic: it will either fail or
succeed at clone() or open() time - and can be freely used afterwards. The
kernel doesnt in fact know about the first use of a futex: no-contention
futexes have zero kernel footprint. This is the big plus of them. So i'd
really favor some sort of hashing method and no limits, that way the Linux
VM is extended and every VM address is waitable and wakable on - a pretty
powerful concept.
Ingo
Ingo Molnar <[email protected]> wrote:
>
> The
> kernel doesnt in fact know about the first use of a futex: no-contention
> futexes have zero kernel footprint. This is the big plus of them. So i'd
> really favor some sort of hashing method and no limits, that way the Linux
> VM is extended and every VM address is waitable and wakable on - a pretty
> powerful concept.
What about the option of not pinning the pages at all: just fault
them in when required?
On Mon, 25 Aug 2003, Andrew Morton wrote:
> Ingo Molnar <[email protected]> wrote:
> >
> > The
> > kernel doesnt in fact know about the first use of a futex: no-contention
> > futexes have zero kernel footprint. This is the big plus of them. So i'd
> > really favor some sort of hashing method and no limits, that way the Linux
> > VM is extended and every VM address is waitable and wakable on - a pretty
> > powerful concept.
>
> What about the option of not pinning the pages at all: just fault
> them in when required?
precisely what scheme do you mean by this? There are two important points:
when a thread sleeps on a futex, and when some thread does a wakeup on a
futex address. We cannot hash the futex based on the virtual address
(arbitrary share-ability of futex pages is another key appeal of them),
and if by the time the wakeup happens the physical page goes away how do
we find which threads are queued on this address?
Ingo
Ingo Molnar <[email protected]> wrote:
>
> > What about the option of not pinning the pages at all: just fault
> > them in when required?
>
> precisely what scheme do you mean by this? There are two important points:
> when a thread sleeps on a futex, and when some thread does a wakeup on a
> futex address. We cannot hash the futex based on the virtual address
> (arbitrary share-ability of futex pages is another key appeal of them),
> and if by the time the wakeup happens the physical page goes away how do
> we find which threads are queued on this address?
I was suggesting that we hash the futex on mm+vaddr. Yes, differing vaddrs
break that.
umm, how about hashing only on offset into page? That reduces the number of
threads which need to be visited in futex_wake() by a factor of up to 1024.
In futex_wake(), fault in and pin the page, then for each thread which is
waiting on a futex which has the same offset into a page, do a
get_user_pages+follow_page, see if he's waiting on the just-faulted-in
page?
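
To make that concrete, a rough sketch of such a wake path (hypothetical:
futex_q is assumed to grow mm and uaddr fields, queue_for_offset() stands in
for the offset-keyed bucket lookup, and lock ordering against the waiters'
page_table_lock is glossed over):

static int futex_wake_by_offset(unsigned long uaddr, int offset, int num)
{
	struct list_head *head, *i, *next;
	struct page *page;
	int woken = 0;

	/* Fault in and pin the waker's view of the page. */
	down_read(&current->mm->mmap_sem);
	if (get_user_pages(current, current->mm, uaddr - offset, 1,
			   0, 0, &page, NULL) != 1) {
		up_read(&current->mm->mmap_sem);
		return -EFAULT;
	}
	up_read(&current->mm->mmap_sem);

	spin_lock(&futex_lock);
	head = queue_for_offset(offset);	/* hypothetical bucket lookup */
	list_for_each_safe(i, next, head) {
		struct futex_q *this = list_entry(i, struct futex_q, list);
		struct page *theirs;

		if (this->offset != offset)
			continue;
		/* Resolve the waiter's vaddr in its own mm, compare pages. */
		spin_lock(&this->mm->page_table_lock);
		theirs = follow_page(this->mm, this->uaddr, 0);
		spin_unlock(&this->mm->page_table_lock);
		if (theirs == page) {
			list_del_init(i);
			wake_up_all(&this->waiters);
			if (++woken >= num)
				break;
		}
	}
	spin_unlock(&futex_lock);
	put_page(page);
	return woken;
}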
On Tue, 2003-08-26 at 09:02, Andrew Morton wrote:
> umm, how about hashing only on offset into page? That reduces the number of
> threads which need to be visited in futex_wake() by a factor of up to 1024.
How likely do you consider it that we then get a major collision?
I wouldn't be surprised if, say, glibc lays some hot futexes out in a
way that's the same for all processes in the system, like start of the
page.... Might as well not hash :)
Greetings,
Arjan van de Ven
On Tue, Aug 26, 2003 at 11:08:00AM +0300, Muli Ben-Yehuda wrote:
> On Tue, Aug 26, 2003 at 09:56:51AM +0200, Arjan van de Ven wrote:
> > On Tue, 2003-08-26 at 09:02, Andrew Morton wrote:
> >
> > > umm, how about hashing only on offset into page? That reduces the number of
> > > threads which need to be visited in futex_wake() by a factor of up to 1024.
> >
> > How likely do you consider it that we then get a major collision?
> > I wouldn't be surprised if, say, glibc lays some hot futexes out in a
> > way that's the same for all processes in the system, like start of the
> > page.... Might as well not hash :)
>
> How about combining something that's shared to all of the threads that
> share a futex but not system wide (the mm?) with something simple that
> won't change, like the page offset? Adding the mm into the mix will
> make collisions harder, and limiting the buckets to the number of
> different futex offsets will make it simple and differentiate between
> different futexes in the same mm.
The problem is that you can (and
want to) share futexes between different processes via shm....
glibc actually uses this to implement cross-process posix mutexes (mutici?)
On Tue, Aug 26, 2003 at 09:56:51AM +0200, Arjan van de Ven wrote:
> On Tue, 2003-08-26 at 09:02, Andrew Morton wrote:
>
> > umm, how about hashing only on offset into page? That reduces the number of
> > threads which need to be visited in futex_wake() by a factor of up to 1024.
>
> How likely do you consider it that we then get a major collision?
> I wouldn't be surprised if, say, glibc lays some hot futexes out in a
> way that's the same for all processes in the system, like start of the
> page.... Might as well not hash :)
How about combining something that's shared to all of the threads that
share a futex but not system wide (the mm?) with something simple that
won't change, like the page offset? Adding the mm into the mix will
make collisions harder, and limiting the buckets to the number of
different futex offsets will make it simple and differentiate between
different futexes in the same mm.
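
As a sketch, the bucket choice being described might look like this
(hypothetical helper; as the replies note, futexes shared across different
mm's via shm would then hash to different buckets):

static inline struct list_head *hash_futex_mm(struct mm_struct *mm, int offset)
{
	return &futex_queues[hash_long((unsigned long)mm + offset,
				       FUTEX_HASHBITS)];
}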
--
Muli Ben-Yehuda
http://www.mulix.org
Muli Ben-Yehuda <[email protected]> wrote:
>
> > > umm, how about hashing only on offset into page? That reduces the number of
> > > threads which need to be visited in futex_wake() by a factor of up to 1024.
> >
> > How likely do you consider it that we then get a major collision?
> > I wouldn't be surprised if, say, glibc lays some hot futexes out in a
> > way that's the same for all processes in the system, like start of the
> > page.... Might as well not hash :)
>
> How about combining something that's shared to all of the threads that
> share a futex but not system wide (the mm?) with something simple that
> won't change, like the page offset?
The mm's could well be independent.
Some userspace help would be needed to avoid defeating the hash. In the
case where a bunch of threads with a shared mm are waiting on the same
futex things should automatically be OK.
William Lee Irwin III <[email protected]> wrote:
>
> except you have the usual intractable disaster
> whenever file-backed pages are anonymized via truncate().
They only arose due to races between major faults and truncate.
That got fixed.
William Lee Irwin III <[email protected]> wrote:
>> except you have the usual intractable disaster
>> whenever file-backed pages are anonymized via truncate().
On Tue, Aug 26, 2003 at 03:44:58AM -0700, Andrew Morton wrote:
> They only arose due to races between major faults and truncate.
> That got fixed.
Then it sounds relatively easy to localize the search structure (if you
care to do so), apart from a policy decision about what on earth to do
about waiters on truncated futexes.
-- wli
On Tue, Aug 26, 2003 at 11:08:00AM +0300, Muli Ben-Yehuda wrote:
> How about combining something that's shared to all of the threads that
> share a futex but not system wide (the mm?) with something simple that
> won't change, like the page offset? Adding the mm into the mix will
> make collisions harder, and limiting the buckets to the number of
> different futex offsets will make it simple and differentiate between
> different futexes in the same mm.
The mm is not the sharing boundary; it can be shared along the usual
sharing boundaries; hence we have the usual two cases:
(a) shared between all those mmap()'ing some inode for file-backed memory
Can be handled by embedding it into a struct address_space.
(b) shared between all fork()'s between exec()'s for anonymous memory
Can be handled by embedding it in some new object representing
all fork()'s between exec()'s.
(c) anonymous pages formerly owned by inodes orphaned by truncate().
This is hard.
The containing object (for (a) and (b)) can be located with a back
pointer from the futex, except you have the usual intractable disaster
whenever file-backed pages are anonymized via truncate().
-- wli
On Tue, Aug 26, 2003 at 01:25:29AM -0700, Andrew Morton wrote:
> Muli Ben-Yehuda <[email protected]> wrote:
> > How about combining something that's shared to all of the threads that
> > share a futex but not system wide (the mm?) with something simple that
> > won't change, like the page offset?
>
> The mm's could well be independent.
I assumed all threads sharing a futex share an mm since you suggested
hashing on mm + vaddr?
> Some userspace help would be needed to avoid defeating the hash. In the
> case where a bunch of threads with a shared mm are waiting on the same
> futex things should automatically be OK.
Userspace help would be fine, but relying only on userspace could lead
to an immediate DoS by forcing the hash to always hash to the same
bucket.
--
Muli Ben-Yehuda
http://www.mulix.org
William Lee Irwin III <[email protected]> wrote:
>
> William Lee Irwin III <[email protected]> wrote:
> >> except you have the usual intractable disaster
> >> whenever file-backed pages are anonymized via truncate().
>
> On Tue, Aug 26, 2003 at 03:44:58AM -0700, Andrew Morton wrote:
> > They only arose due to races between major faults and truncate.
> > That got fixed.
>
> Then it sounds relatively easy to localize the search structure (if you
> care to do so),
The "group of all processes which could potentially (or really do) share a
chunk of anon memory" thing sounds tricky.
> apart from a policy decision about what on earth to do
> about waiters on truncated futexes.
erk, screwed.
William Lee Irwin III <[email protected]> wrote:
>> Then it sounds relatively easy to localize the search structure (if you
>> care to do so),
On Tue, Aug 26, 2003 at 10:29:31AM -0700, Andrew Morton wrote:
> The "group of all processes which could potentially (or really do) share a
> chunk of anon memory" thing sounds tricky.
Not really; it's just a random data structure with very low odds of
proliferating. Hugh did it once (for anobjrmap) and I rearranged and/or
rewrote it to suit my preferences (I think I RCU'd the lock in it or
some ridiculous nonsense on that order).
William Lee Irwin III <[email protected]> wrote:
>> apart from a policy decision about what on earth to do
>> about waiters on truncated futexes.
On Tue, Aug 26, 2003 at 10:29:31AM -0700, Andrew Morton wrote:
> erk, screwed.
Well, the decision is essentially arbitrary, it just has to be made
(and by definition some decision is being made regardless of what the
implementation does or fails to do).
-- wli
In message <[email protected]> you write:
> But end_swap_bio_read() is called from interrupt context. Hence the
> spinlock you have in there needs to become IRQ safe.
OK, I've fixed that, with conservative assumptions (so it doesn't
assume context). Or is _bh sufficient?
> Two issues:
>
> a) what to do about futexes in file-backed pages? At present the
> attacker can pin an arbitrary amount of memory by backing it with a file.
At present == 2.6.0-test4? In 2.6.0-test4, the attacker can pin one
page per process (OK), or one per FD using FUTEX_FD (not OK). This
patch changes it so that pages are *never* pinned, whatever is backing
them.
> Your solution won't scale to solving this, because we need to perform
> a futex lookup on every add_to_page_cache(). (Well, it will scale
> fairly well because add_to_page_cache() is ratelimited by the IO speed.
> But it will still suck quite a bit for some people).
I assumed that for non-anonymous pages the mapping + index was always
a unique identifier, even as they were swapped out. We need a
persistent unique identifier for a page, OR a callback to
unhash/rehash it when the identifier changes. Hence mapping + index
where mapping != NULL, and the struct page and callbacks for swap
pages. Using the callbacks for wherever else page->mapping changes is
simple (but may be slow).
> b) Assuming a) is solved: futexes which are backed by tmpfs. tmpfs
> (which is a study in conceptual baroquenness) will rewrite page->mapping
> as pages are moved between tmpfs inodes and swapper_space. I _think_
> lock_page() is sufficient to serialise against this. If not, both
> ->page_locks are needed.
Looks like the call to move_to_swap_cache is already serialized by
spin_lock(&info->lock) in shmem_writepage, AFAICT; in fact,
move_to_swap_cache seems to assume serialization, since it does this
(outside any of its locks):
/* shift page from clean_pages to dirty_pages list */
BUG_ON(PageDirty(page));
set_page_dirty(page);
> This of course will break your hashing scheme. Hooks in
> move_to_swap_cache() and move_from_swap_cache() are needed.
I've added it, but how often does that get called?
A simple optimization would be a page bit which says "has ever had
someone waiting on a futex": the futex callbacks could reset it when
they find no one waiting on that page.
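
Something along those lines, say (hypothetical sketch: PG_futex and a
count-returning futex_unhash_page() don't exist in any of these patches):

/* Hypothetical: PG_futex marks pages that have ever had a futex waiter,
 * so the swap callbacks can skip the hash-table walk for everything else. */
void futex_swap_out_checked(struct page *page)
{
	if (!test_bit(PG_futex, &page->flags))
		return;		/* never had a waiter: nothing to move */

	/* Assume futex_unhash_page() returns how many waiters it moved to
	 * the swapped list; zero means the bit can safely be cleared. */
	if (futex_unhash_page(page) == 0)
		clear_bit(PG_futex, &page->flags);
}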
> Or create RLIM_NRFUTEX.
This is the option of last resort: shades of SysV semaphores. Even if
it only applied to FUTEX_FD, it makes them basically unusable.
OK, here's a new one (modified previous patch prepended for completeness),
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Name: Minor futex comment tweaks and cleanups
Author: Rusty Russell
Status: Tested on 2.6.0-test4-bk2
D: Changes:
D:
D: (1) don't return 0 from futex_wait if we are somehow
D: spuriously woken up, loop in that case.
D:
D: (2) remove bogus comment about address no longer being in this
D: address space: we hold the mm lock, and __pin_page succeeded, so it
D: can't be true,
D:
D: (3) remove bogus comment about "get_user might fault and schedule",
D:
D: (4) clarify comment about hashing: we hash address of struct page,
D: not page itself,
D:
D: (5) remove list_empty check: we still hold the lock, so it can
D: never happen, and
D:
D: (6) single error exit path, and move __queue_me to the end (order
D: doesn't matter since we're inside the futex lock).
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .9416-linux-2.6.0-test4-bk2/kernel/futex.c .9416-linux-2.6.0-test4-bk2.updated/kernel/futex.c
--- .9416-linux-2.6.0-test4-bk2/kernel/futex.c 2003-05-27 15:02:23.000000000 +1000
+++ .9416-linux-2.6.0-test4-bk2.updated/kernel/futex.c 2003-08-27 12:34:53.000000000 +1000
@@ -84,9 +84,7 @@ static inline void unlock_futex_mm(void)
spin_unlock(&current->mm->page_table_lock);
}
-/*
- * The physical page is shared, so we can hash on its address:
- */
+/* The struct page is shared, so we can hash on its address. */
static inline struct list_head *hash_futex(struct page *page, int offset)
{
return &futex_queues[hash_long((unsigned long)page + offset,
@@ -311,67 +309,68 @@ static inline int futex_wait(unsigned lo
int val,
unsigned long time)
{
- DECLARE_WAITQUEUE(wait, current);
- int ret = 0, curval;
+ wait_queue_t wait;
+ int ret, curval;
struct page *page;
struct futex_q q;
+again:
init_waitqueue_head(&q.waiters);
+ init_waitqueue_entry(wait, current);
lock_futex_mm();
page = __pin_page(uaddr - offset);
if (!page) {
- unlock_futex_mm();
- return -EFAULT;
+ ret = -EFAULT;
+ goto unlock;
}
- __queue_me(&q, page, uaddr, offset, -1, NULL);
/*
- * Page is pinned, but may no longer be in this address space.
+ * Page is pinned, but may be a kernel address.
* It cannot schedule, so we access it with the spinlock held.
*/
if (get_user(curval, (int *)uaddr) != 0) {
- unlock_futex_mm();
ret = -EFAULT;
- goto out;
+ goto unlock;
}
+
if (curval != val) {
- unlock_futex_mm();
ret = -EWOULDBLOCK;
- goto out;
+ goto unlock;
}
- /*
- * The get_user() above might fault and schedule so we
- * cannot just set TASK_INTERRUPTIBLE state when queueing
- * ourselves into the futex hash. This code thus has to
- * rely on the futex_wake() code doing a wakeup after removing
- * the waiter from the list.
- */
+
+ __queue_me(&q, page, uaddr, offset, -1, NULL);
add_wait_queue(&q.waiters, &wait);
set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&q.list)) {
- unlock_futex_mm();
- time = schedule_timeout(time);
- }
+ unlock_futex_mm();
+
+ time = schedule_timeout(time);
+
set_current_state(TASK_RUNNING);
/*
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
*/
- if (time == 0) {
- ret = -ETIMEDOUT;
- goto out;
- }
- if (signal_pending(current))
- ret = -EINTR;
-out:
- /* Were we woken up anyway? */
+ put_page(q.page);
+
+ /* Were we woken up (and removed from queue)? Always return
+ * success when this happens. */
if (!unqueue_me(&q))
ret = 0;
- put_page(q.page);
+ else if (time == 0)
+ ret = -ETIMEDOUT;
+ else if (signal_pending(current))
+ ret = -EINTR;
+ else
+ /* Spurious wakeup somehow. Loop. */
+ goto again;
return ret;
+
+unlock:
+ unlock_futex_mm();
+ return ret;
}
static int futex_close(struct inode *inode, struct file *filp)
Name: Futexes without pinning pages
Author: Rusty Russell
Status: Tested on 2.6.0-test4-bk2
Depends: Misc/futex-minor-tweaks.patch.gz
Depends: Misc/qemu-page-offset.patch.gz
D: Avoid pinning pages with futexes in them, to resolve FUTEX_FD DoS.
D: Insert callbacks in swap code to unhash them when they are swapped
D: out and rehash them when they are swapped back in.
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26024-linux-2.6.0-test4-bk2/include/linux/futex.h .26024-linux-2.6.0-test4-bk2.updated/include/linux/futex.h
--- .26024-linux-2.6.0-test4-bk2/include/linux/futex.h 2003-05-27 15:02:21.000000000 +1000
+++ .26024-linux-2.6.0-test4-bk2.updated/include/linux/futex.h 2003-08-27 15:05:37.000000000 +1000
@@ -17,4 +17,7 @@ asmlinkage long sys_futex(u32 __user *ua
long do_futex(unsigned long uaddr, int op, int val,
unsigned long timeout, unsigned long uaddr2, int val2);
+/* For mm/page_io.c to tell us about swapping of (anonymous) pages. */
+extern void futex_swap_out(struct page *page);
+extern void futex_swap_in(struct page *page);
#endif
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26024-linux-2.6.0-test4-bk2/kernel/futex.c .26024-linux-2.6.0-test4-bk2.updated/kernel/futex.c
--- .26024-linux-2.6.0-test4-bk2/kernel/futex.c 2003-08-27 15:05:36.000000000 +1000
+++ .26024-linux-2.6.0-test4-bk2.updated/kernel/futex.c 2003-08-27 15:07:31.000000000 +1000
@@ -52,14 +52,17 @@ struct futex_q {
/* the virtual => physical COW-safe cache */
vcache_t vcache;
+ /* When anonymous memory is swapped out, this stores the index. */
+ unsigned long swap_index;
+
/* For fd, sigio sent using these. */
int fd;
struct file *filp;
};
-/* The key for the hash is the address + index + offset within page */
static struct list_head futex_queues[1<<FUTEX_HASHBITS];
static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(futex_swapped);
extern void send_sigio(struct fown_struct *fown, int fd, int band);
@@ -74,21 +77,74 @@ static inline void lock_futex_mm(void)
{
spin_lock(&current->mm->page_table_lock);
spin_lock(&vcache_lock);
- spin_lock(&futex_lock);
+ spin_lock_irq(&futex_lock);
}
static inline void unlock_futex_mm(void)
{
- spin_unlock(&futex_lock);
+ spin_unlock_irq(&futex_lock);
spin_unlock(&vcache_lock);
spin_unlock(&current->mm->page_table_lock);
}
-/* The struct page is shared, so we can hash on its address. */
+/* For pages which are file backed, we can simply hash by mapping and
+ * index. For anonymous regions, we hash by the actual struct page *,
+ * and move them in and out of the hash if they are swapped out.
+ */
static inline struct list_head *hash_futex(struct page *page, int offset)
{
- return &futex_queues[hash_long((unsigned long)page + offset,
- FUTEX_HASHBITS)];
+ unsigned long hashin;
+ if (page->mapping)
+ hashin = (unsigned long)page->mapping + page->index;
+ else
+ hashin = (unsigned long)page;
+
+ return &futex_queues[hash_long(hashin+offset, FUTEX_HASHBITS)];
+}
+
+/* Called when we're going to swap this page out (ie. whenever mapping
+ * changes). */
+void futex_swap_out(struct page *page)
+{
+ unsigned int i;
+
+ /* It should have the mapping (== &swapper_space) and index
+ * set by now */
+ BUG_ON(!page->mapping);
+
+ spin_lock_irq(&futex_lock);
+ for (i = 0; i < 1 << FUTEX_HASHBITS; i++) {
+ struct list_head *l, *next;
+ list_for_each_safe(l, next, &futex_queues[i]) {
+ struct futex_q *q = list_entry(l, struct futex_q,list);
+ if (q->page == page) {
+ list_del(&q->list);
+ q->swap_index = page->index;
+ q->page = NULL;
+ list_add(&q->list, &futex_swapped);
+ }
+ }
+ }
+ spin_unlock_irq(&futex_lock);
+}
+
+/* Called when we're going to swap this page in (can be interrupt context) */
+void futex_swap_in(struct page *page)
+{
+ struct list_head *l, *next;
+ unsigned long flags;
+
+ spin_lock_irqsave(&futex_lock, flags);
+ list_for_each_safe(l, next, &futex_swapped) {
+ struct futex_q *q = list_entry(l, struct futex_q, list);
+
+ if (q->swap_index == page->index) {
+ list_del(&q->list);
+ q->page = page;
+ list_add(&q->list, hash_futex(q->page, q->offset));
+ }
+ }
+ spin_unlock_irqrestore(&futex_lock, flags);
}
/*
@@ -196,17 +252,15 @@ static void futex_vcache_callback(vcache
struct futex_q *q = container_of(vcache, struct futex_q, vcache);
struct list_head *head = hash_futex(new_page, q->offset);
- spin_lock(&futex_lock);
+ spin_lock_irq(&futex_lock);
if (!list_empty(&q->list)) {
- put_page(q->page);
- q->page = new_page;
- __pin_page_atomic(new_page);
list_del(&q->list);
+ q->page = new_page;
list_add_tail(&q->list, head);
}
- spin_unlock(&futex_lock);
+ spin_unlock_irq(&futex_lock);
}
/*
@@ -244,8 +298,6 @@ static inline int futex_requeue(unsigned
send_sigio(&this->filp->f_owner,
this->fd, POLL_IN);
} else {
- put_page(this->page);
- __pin_page_atomic (page2);
list_add_tail(i, head2);
__attach_vcache(&this->vcache, uaddr2,
current->mm, futex_vcache_callback);
@@ -293,13 +345,13 @@ static inline int unqueue_me(struct fute
int ret = 0;
spin_lock(&vcache_lock);
- spin_lock(&futex_lock);
+ spin_lock_irq(&futex_lock);
if (!list_empty(&q->list)) {
list_del(&q->list);
__detach_vcache(&q->vcache);
ret = 1;
}
- spin_unlock(&futex_lock);
+ spin_unlock_irq(&futex_lock);
spin_unlock(&vcache_lock);
return ret;
}
@@ -316,7 +368,7 @@ static inline int futex_wait(unsigned lo
again:
init_waitqueue_head(&q.waiters);
- init_waitqueue_entry(wait, current);
+ init_waitqueue_entry(&wait, current);
lock_futex_mm();
@@ -332,18 +384,19 @@ again:
*/
if (get_user(curval, (int *)uaddr) != 0) {
ret = -EFAULT;
- goto unlock;
+ goto putpage;
}
if (curval != val) {
ret = -EWOULDBLOCK;
- goto unlock;
+ goto putpage;
}
__queue_me(&q, page, uaddr, offset, -1, NULL);
add_wait_queue(&q.waiters, &wait);
set_current_state(TASK_INTERRUPTIBLE);
unlock_futex_mm();
+ put_page(page);
time = schedule_timeout(time);
@@ -352,7 +405,6 @@ again:
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
*/
- put_page(q.page);
/* Were we woken up (and removed from queue)? Always return
* success when this happens. */
@@ -368,6 +420,8 @@ again:
return ret;
+putpage:
+ put_page(page);
unlock:
unlock_futex_mm();
return ret;
@@ -378,7 +432,6 @@ static int futex_close(struct inode *ino
struct futex_q *q = filp->private_data;
unqueue_me(q);
- put_page(q->page);
kfree(filp->private_data);
return 0;
}
@@ -391,10 +444,10 @@ static unsigned int futex_poll(struct fi
int ret = 0;
poll_wait(filp, &q->waiters, wait);
- spin_lock(&futex_lock);
+ spin_lock_irq(&futex_lock);
if (list_empty(&q->list))
ret = POLLIN | POLLRDNORM;
- spin_unlock(&futex_lock);
+ spin_unlock_irq(&futex_lock);
return ret;
}
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26024-linux-2.6.0-test4-bk2/mm/page_io.c .26024-linux-2.6.0-test4-bk2.updated/mm/page_io.c
--- .26024-linux-2.6.0-test4-bk2/mm/page_io.c 2003-02-07 19:18:58.000000000 +1100
+++ .26024-linux-2.6.0-test4-bk2.updated/mm/page_io.c 2003-08-27 15:05:37.000000000 +1000
@@ -19,6 +19,7 @@
#include <linux/buffer_head.h> /* for block_sync_page() */
#include <linux/mpage.h>
#include <linux/writeback.h>
+#include <linux/futex.h>
#include <asm/pgtable.h>
static struct bio *
@@ -77,6 +78,7 @@ static int end_swap_bio_read(struct bio
ClearPageUptodate(page);
} else {
SetPageUptodate(page);
+ futex_swap_in(page);
}
unlock_page(page);
bio_put(bio);
@@ -105,6 +107,7 @@ int swap_writepage(struct page *page, st
}
inc_page_state(pswpout);
SetPageWriteback(page);
+ futex_swap_out(page);
unlock_page(page);
submit_bio(WRITE, bio);
out:
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26024-linux-2.6.0-test4-bk2/mm/swap_state.c .26024-linux-2.6.0-test4-bk2.updated/mm/swap_state.c
--- .26024-linux-2.6.0-test4-bk2/mm/swap_state.c 2003-08-12 06:58:06.000000000 +1000
+++ .26024-linux-2.6.0-test4-bk2.updated/mm/swap_state.c 2003-08-27 15:09:30.000000000 +1000
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
+#include <linux/futex.h>
#include <asm/pgtable.h>
@@ -214,6 +215,7 @@ int move_to_swap_cache(struct page *page
BUG_ON(PageDirty(page));
set_page_dirty(page);
INC_CACHE_INFO(add_total);
+ futex_swap_out(page);
} else if (err == -EEXIST)
INC_CACHE_INFO(exist_race);
return err;
@@ -248,6 +250,7 @@ int move_from_swap_cache(struct page *pa
/* shift page from clean_pages to dirty_pages list */
ClearPageDirty(page);
set_page_dirty(page);
+ futex_swap_in(page);
}
return err;
}
Rusty Russell <[email protected]> wrote:
>
> In message <[email protected]> you write:
> > But end_swap_bio_read() is called from interrupt context. Hence the
> > spinlock you have in there needs to become IRQ safe.
>
> OK, I've fixed that, with conservative assumptions (so it doesn't
> assume context). Or is _bh sufficient?
spin_lock_irq/irqsave is correct.
> > Two issues:
> >
> > a) what to do about futexes in file-backed pages? At present the
> > attacker can pin an arbitrary amount of memory by backing it with a file.
>
> At present == 2.6.0-test4? In 2.6.0-test4, the attacker can pin one
> page per process (OK), or one per FD using FUTEX_FD (not OK). This
> patch changes it so that pages are *never* pinned, whatever is backing
> them.
oh, OK.
> > Your solution won't scale to solving this, because we need to perform
> > a futex lookup on every add_to_page_cache(). (Well, it will scale
> > fairly well because add_to_page_cache() is ratelimited by the IO speed.
> > But it will still suck quite a bit for some people).
>
> I assumed that for non-anonymous pages the mapping + index was always
> a unique identifier, even as they were swapped out. We need a
> persistent unique identifier for a page, OR a callback to
> unhash/rehash it when the identifier changes. Hence mapping + index
> where mapping != NULL, and the struct page and callbacks for swap
> pages. Using the callbacks for wherever else page->mapping changes is
> simple (but may be slow).
swap_writepage() and end_swap_bio_read() are not really companion
functions. The page is in use and may be mapped into user pagetables
during swap_writepage(). It won't actually be freed up for a very long
time, if at all.
I guess this means that there could be a large number of futexes which are
considered "swapped out" which are in fact not swapped out at all.
I'm starting to dimly understand what this code does. You get 2/10 for
patch explanation ;)
I think a better place to rehash the futex would be at the point where the
page is added to and removed from swapcache.
When the page is in swapcache it has stable ->mapping and ->index and can
be treated in the same way as file-backed MAP_SHARED memory.
If this works then the places to be looking are:
__delete_from_swap_cache(): page moves from swapcache to anon
add_to_swap(): page moves from anon to swapcache.
move_to_swap_cache(): file-backed to swapcache
move_from_swap_cache(): swapcache to file-backed.
The locking you have there in move_to_swap_cache() and
move_from_swap_cache() look wrong. Take move_to_swap_cache(): there is a
window in which the page has mapping==&swapper_space, but it is hashed over
in futex land by the old tmpfs mapping. A futex lookup which is concurrent
with move_to_swap_cache() will fail to find the futex.
I think that to resolve this you need to take futex_lock while swizzling
the mapping and index in move_to_swap_cache():
spin_lock(&swapper_space.page_lock);
spin_lock(&mapping->page_lock);
+ spin_lock(&futex_lock);
err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
if (!err) {
__remove_from_page_cache(page);
___add_to_page_cache(page, &swapper_space, entry.val);
+ __futex_rehash(page);
}
+ spin_unlock(&futex_lock);
spin_unlock(&mapping->page_lock);
spin_unlock(&swapper_space.page_lock);
Similarly, all places which change the page's hash keys (mapping and index)
need to be locked against the futex lookup code.
None of the above four functions are performance-critical; they already take
a ton of global locks.
Alternative: just use swapper_space.page_lock when you're doing futex
lookups. That will pin down the ->mapping and ->index of anonymous,
swapcache and tmpfs pages.
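
For illustration, a lookup under that alternative might be shaped like this
(hypothetical: __futex_find() stands in for the bucket search, and ordering
against vcache_lock/page_table_lock is not worked out here):

static struct futex_q *futex_find_stable(struct page *page, int offset)
{
	struct futex_q *q;

	/* swapper_space.page_lock pins ->mapping/->index for anonymous,
	 * swapcache and tmpfs pages while we search the hash. */
	spin_lock(&swapper_space.page_lock);
	spin_lock_irq(&futex_lock);
	q = __futex_find(hash_futex(page, offset), page, offset);
	spin_unlock_irq(&futex_lock);
	spin_unlock(&swapper_space.page_lock);
	return q;
}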
Please make sure it builds with CONFIG_SWAP=n
Please make sure it builds with CONFIG_FUTEX=n (sorry)
Please augment the lock ranking comment at the top of filemap.c
If a futex resides in a pagecache page which is then truncated, a
futex_wake() should really send the caller a SIGBUS; it looks like the code
will return -EFAULT, which is good enough. Any waiters on that futex will
not be wakeable, but they will be killable.
Not pinning the pages, and going by mapping,index (including
swapper_space,swap_index case) seems fine; with a special case
of struct page * for the anonymous not yet assigned to swap.
(Very tempting to assign to swap early to avoid that special
case; but swapoff,swapless,swapon make that unreasonable.)
But I do not understand how futex_wake can still be doing a
this->page == page test: its __pin_page will ensure that some
page is faulted in, but that's not necessarily the same page
as in this->page.
I believe you need mapping,index in struct futex_q (with
swapper_space just one possibility for mapping), and the
struct page * considered an exceptional (though common) case.
I strongly agree with Andrew that add_to_swap and
delete_from_swap_cache (probably the one without the __s)
are the places for switching the anonymous, not page_io.c:
page->mapping will be set and unset in those, and it's
page->mapping that you're keying off in hash_futex.
But I disagree over move_to/from_swap_cache: nothing should
be done there at all. Once you have mapping,index in struct
futex_q, it's irrelevant what tmpfs might be doing to the
page->mapping,page->index of the unmapped page.
I dare not think what locking may be necessary, to manage
the switch from hashing by struct page * to hashing by
swapper_space,index.
Hugh
On Wed, Aug 27, 2003 at 09:37:25AM +0100, Hugh Dickins wrote:
> But I disagree over move_to/from_swap_cache: nothing should
> be done there at all. Once you have mapping,index in struct
> futex_q, it's irrelevant what tmpfs might be doing to the
> page->mapping,page->index of the unmapped page.
> I dare not think what locking may be necessary, to manage
> the switch from hashing by struct page * to hashing by
> swapper_space,index.
PG_locked and mapping->page_lock held for writing are needed to
switch in general AIUI; adding the vcache/futex locks into the mix
sounds like some deep hierarchy, especially since the page is
meant to go away in the middle of this process. move_to_swap_cache()
has mapping->page_lock held for writing, as is needed; PG_locked is
required to add_to_swap(), and some other callers are from call chains
not checking BUG_ON(!PageLocked(page)), so it sounds as if things are
partly there, assuming vcache_lock/futex_lock stay at the bottom.
I think it's worth coming up with an answer to in order to remove
the DoS scenario and/or resource scalability limitations.
-- wli
Hugh Dickins <[email protected]> wrote:
>
> But I disagree over move_to/from_swap_cache: nothing should
> be done there at all. Once you have mapping,index in struct
> futex_q, it's irrelevant what tmpfs might be doing to the
> page->mapping,page->index of the unmapped page.
But move_to_swap_cache() alters a page's ->mapping and ->index when that
page is potentially mapped into user pagetables.
On Wed, 27 Aug 2003, Andrew Morton wrote:
> Hugh Dickins <[email protected]> wrote:
> >
> > But I disagree over move_to/from_swap_cache: nothing should
> > be done there at all. Once you have mapping,index in struct
> > futex_q, it's irrelevant what tmpfs might be doing to the
> > page->mapping,page->index of the unmapped page.
>
> But move_to_swap_cache() alters a page's ->mapping and ->index when that
> page is potentially mapped into user pagetables.
It'd better not: BUG_ON(page_mapped(page)) at start of shmem_writepage,
sole caller of move_to_swap_cache. Things fall apart if tmpfs pages
get reassigned to swap while they're mapped.
Hugh
In message <[email protected]> you write:
> But I do not understand how futex_wake can still be doing a
> this->page == page test: its __pin_page will ensure that some
> page is faulted in, but that's not necessarily the same page
> as in this->page.
Yeah, my complete screwup. See below, with new routine
page_matches().
> I strongly agree with Andrew that add_to_swap and
> delete_from_swap_cache (probably the one without the __s)
> are the places for switching the anonymous, not page_io.c:
> page->mapping will be set and unset in those, and it's
> page->mapping that you're keying off in hash_futex.
Agreed, it's simplest, at least, to call futex_rehash every time
->mapping changes. But that's a lot, so we'll need that futex
page->flags optimization.
I've included 3 patches and my test code. This includes debugging
output, so it's not a final patch.
The test code sets up 3 futexes (one in a file-backed mmap, one in a
malloc, and one in a shared memory segment), then runs through <N> MB
forcing things to swap. When you see the kernel message saying a
futex has been swapped out, press Enter and it will wake the futexes.
A successful run looks like (it can take a few attempts before the
futex gets swapped out, depending on the phase of the moon):
test:~# swapon /swap
Adding 14992k swap on /swap. Priority:-1 extents:82
test:~# ./test-swap2 36
MMAP=8, SHM=16, ANON=24, beginning chill...
.
futex_rehash 13370: offset 24 00000000 -> 902bd100
Exiting...
Waking up 0x30012008
Waking up 0x804c018
Waking up 0x30013010
test:~#
Thanks for feedback,
Rusty
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Name: Minor futex comment tweaks and cleanups
Author: Rusty Russell
Status: Booted on 2.6.0-test4-bk2
D: Changes:
D:
D: (1) don't return 0 from futex_wait if we are somehow
D: spuriously woken up, loop in that case.
D:
D: (2) remove bogus comment about address no longer being in this
D: address space: we hold the mm lock, and __pin_page succeeded, so it
D: can't be true,
D:
D: (3) remove bogus comment about "get_user might fault and schedule",
D:
D: (4) clarify comment about hashing: we hash address of struct page,
D: not page itself,
D:
D: (5) remove list_empty check: we still hold the lock, so it can
D: never happen, and
D:
D: (6) single error exit path, and move __queue_me to the end (order
D: doesn't matter since we're inside the futex lock).
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .9416-linux-2.6.0-test4-bk2/kernel/futex.c .9416-linux-2.6.0-test4-bk2.updated/kernel/futex.c
--- .9416-linux-2.6.0-test4-bk2/kernel/futex.c 2003-05-27 15:02:23.000000000 +1000
+++ .9416-linux-2.6.0-test4-bk2.updated/kernel/futex.c 2003-08-27 12:34:53.000000000 +1000
@@ -84,9 +84,7 @@ static inline void unlock_futex_mm(void)
spin_unlock(&current->mm->page_table_lock);
}
-/*
- * The physical page is shared, so we can hash on its address:
- */
+/* The struct page is shared, so we can hash on its address. */
static inline struct list_head *hash_futex(struct page *page, int offset)
{
return &futex_queues[hash_long((unsigned long)page + offset,
@@ -311,67 +309,68 @@ static inline int futex_wait(unsigned lo
int val,
unsigned long time)
{
- DECLARE_WAITQUEUE(wait, current);
- int ret = 0, curval;
+ wait_queue_t wait;
+ int ret, curval;
struct page *page;
struct futex_q q;
+again:
init_waitqueue_head(&q.waiters);
+ init_waitqueue_entry(&wait, current);
lock_futex_mm();
page = __pin_page(uaddr - offset);
if (!page) {
- unlock_futex_mm();
- return -EFAULT;
+ ret = -EFAULT;
+ goto unlock;
}
- __queue_me(&q, page, uaddr, offset, -1, NULL);
/*
- * Page is pinned, but may no longer be in this address space.
+ * Page is pinned, but may be a kernel address.
* It cannot schedule, so we access it with the spinlock held.
*/
if (get_user(curval, (int *)uaddr) != 0) {
- unlock_futex_mm();
ret = -EFAULT;
- goto out;
+ goto unlock;
}
+
if (curval != val) {
- unlock_futex_mm();
ret = -EWOULDBLOCK;
- goto out;
+ goto unlock;
}
- /*
- * The get_user() above might fault and schedule so we
- * cannot just set TASK_INTERRUPTIBLE state when queueing
- * ourselves into the futex hash. This code thus has to
- * rely on the futex_wake() code doing a wakeup after removing
- * the waiter from the list.
- */
+
+ __queue_me(&q, page, uaddr, offset, -1, NULL);
add_wait_queue(&q.waiters, &wait);
set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&q.list)) {
- unlock_futex_mm();
- time = schedule_timeout(time);
- }
+ unlock_futex_mm();
+
+ time = schedule_timeout(time);
+
set_current_state(TASK_RUNNING);
/*
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
*/
- if (time == 0) {
- ret = -ETIMEDOUT;
- goto out;
- }
- if (signal_pending(current))
- ret = -EINTR;
-out:
- /* Were we woken up anyway? */
+ put_page(q.page);
+
+ /* Were we woken up (and removed from queue)? Always return
+ * success when this happens. */
if (!unqueue_me(&q))
ret = 0;
- put_page(q.page);
+ else if (time == 0)
+ ret = -ETIMEDOUT;
+ else if (signal_pending(current))
+ ret = -EINTR;
+ else
+ /* Spurious wakeup somehow. Loop. */
+ goto again;
return ret;
+
+unlock:
+ unlock_futex_mm();
+ return ret;
}
static int futex_close(struct inode *inode, struct file *filp)
Name: Allow Futex Rehashing In Interrupt
Author: Rusty Russell
Status: Tested on 2.6.0-test4-bk2
Depends: Misc/futex-minor-tweaks.patch.gz
Depends: Misc/qemu-page-offset.patch.gz
D: This patch simply uses spin_lock_irq() instead of spin_lock for the
D: futex lock, in preparation for the futex_rehash patch which needs to
D: operate on the futex hash table in IRQ context.
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .5392-linux-2.6.0-test4-bk2/kernel/futex.c .5392-linux-2.6.0-test4-bk2.updated/kernel/futex.c
--- .5392-linux-2.6.0-test4-bk2/kernel/futex.c 2003-08-28 17:51:06.000000000 +1000
+++ .5392-linux-2.6.0-test4-bk2.updated/kernel/futex.c 2003-08-28 17:51:37.000000000 +1000
@@ -74,12 +74,12 @@ static inline void lock_futex_mm(void)
{
spin_lock(&current->mm->page_table_lock);
spin_lock(&vcache_lock);
- spin_lock(&futex_lock);
+ spin_lock_irq(&futex_lock);
}
static inline void unlock_futex_mm(void)
{
- spin_unlock(&futex_lock);
+ spin_unlock_irq(&futex_lock);
spin_unlock(&vcache_lock);
spin_unlock(&current->mm->page_table_lock);
}
@@ -196,7 +196,7 @@ static void futex_vcache_callback(vcache
struct futex_q *q = container_of(vcache, struct futex_q, vcache);
struct list_head *head = hash_futex(new_page, q->offset);
- spin_lock(&futex_lock);
+ spin_lock_irq(&futex_lock);
if (!list_empty(&q->list)) {
put_page(q->page);
@@ -206,7 +206,7 @@ static void futex_vcache_callback(vcache
list_add_tail(&q->list, head);
}
- spin_unlock(&futex_lock);
+ spin_unlock_irq(&futex_lock);
}
/*
@@ -293,13 +293,13 @@ static inline int unqueue_me(struct fute
int ret = 0;
spin_lock(&vcache_lock);
- spin_lock(&futex_lock);
+ spin_lock_irq(&futex_lock);
if (!list_empty(&q->list)) {
list_del(&q->list);
__detach_vcache(&q->vcache);
ret = 1;
}
- spin_unlock(&futex_lock);
+ spin_unlock_irq(&futex_lock);
spin_unlock(&vcache_lock);
return ret;
}
@@ -391,10 +391,10 @@ static unsigned int futex_poll(struct fi
int ret = 0;
poll_wait(filp, &q->waiters, wait);
- spin_lock(&futex_lock);
+ spin_lock_irq(&futex_lock);
if (list_empty(&q->list))
ret = POLLIN | POLLRDNORM;
- spin_unlock(&futex_lock);
+ spin_unlock_irq(&futex_lock);
return ret;
}
Name: Futexes without pinning pages
Author: Rusty Russell
Status: Tested on 2.6.0-test4-bk2
Depends: Misc/futex-irqsave.patch.gz
Depends: Misc/qemu-page-offset.patch.gz
D: TODO: Fix compilation with CONFIG_FUTEX=n.
D:
D: Avoid pinning pages with futexes in them, to resolve FUTEX_FD DoS.
D: This means we need another way to uniquely identify pages, rather
D: than simply comparing the "struct page *" (and using a hash table
D: based on the "struct page *"). For file-backed pages, the
D: page->mapping and page->index provide a unique identifier, which is
D: persistent even if they get swapped out to the file and back.
D: There are cases where the mapping changes: for these we need a
D: callback to rehash all futexes in that page.
D:
D: For anonymous pages, page->mapping == NULL. So for this case we
D: use the "struct page *" itself to hash and compare, because the
D: mapping will be set (to &swapper_space) if the page is moved to the
D: swap cache, and will be rehashed, and we can find it again.
D:
D: The current futex_rehash() call walks the entire hash table, which
D: is slow. The simplest optimization is to have a page->flags bit
D: which indicates a futex has been placed in this page: we can clear
D: it if futex_rehash() finds no futexes. This will come in a
D: followup patch.
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .5947-2.6.0-test4-bk2-futex-swap.pre/include/linux/futex.h .5947-2.6.0-test4-bk2-futex-swap/include/linux/futex.h
--- .5947-2.6.0-test4-bk2-futex-swap.pre/include/linux/futex.h 2003-05-27 15:02:21.000000000 +1000
+++ .5947-2.6.0-test4-bk2-futex-swap/include/linux/futex.h 2003-08-28 17:52:47.000000000 +1000
@@ -17,4 +17,10 @@ asmlinkage long sys_futex(u32 __user *ua
long do_futex(unsigned long uaddr, int op, int val,
unsigned long timeout, unsigned long uaddr2, int val2);
+/* To tell us when the page->mapping or page->index changes. */
+struct page;
+struct address_space;
+extern void futex_rehash(struct page *page,
+ struct address_space *new_mapping,
+ unsigned long new_index);
#endif
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .5947-2.6.0-test4-bk2-futex-swap.pre/include/linux/pagemap.h .5947-2.6.0-test4-bk2-futex-swap/include/linux/pagemap.h
--- .5947-2.6.0-test4-bk2-futex-swap.pre/include/linux/pagemap.h 2003-08-25 11:58:34.000000000 +1000
+++ .5947-2.6.0-test4-bk2-futex-swap/include/linux/pagemap.h 2003-08-28 17:52:47.000000000 +1000
@@ -11,6 +11,7 @@
#include <linux/pagemap.h>
#include <asm/uaccess.h>
#include <linux/gfp.h>
+#include <linux/futex.h>
/*
* Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page
@@ -143,6 +144,7 @@ static inline void ___add_to_page_cache(
struct address_space *mapping, unsigned long index)
{
list_add(&page->list, &mapping->clean_pages);
+ futex_rehash(page, mapping, index);
page->mapping = mapping;
page->index = index;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .5947-2.6.0-test4-bk2-futex-swap.pre/kernel/futex.c .5947-2.6.0-test4-bk2-futex-swap/kernel/futex.c
--- .5947-2.6.0-test4-bk2-futex-swap.pre/kernel/futex.c 2003-08-28 17:52:46.000000000 +1000
+++ .5947-2.6.0-test4-bk2-futex-swap/kernel/futex.c 2003-08-28 17:52:47.000000000 +1000
@@ -35,6 +35,15 @@
#include <linux/vcache.h>
#include <linux/mount.h>
+/* Futexes need to have a way of identifying pages which are the same,
+ when they may be in different address spaces (ie. virtual address
+ might be different, eg. shared mmap). We don't want to pin the
+ pages, so we use page->mapping & page->index where page->mapping is
+ not NULL (file-backed pages), and hash the page struct itself for
+ other pages. Callbacks rehash pages when page->mapping is changed
+ or set (such as when anonymous pages enter the swap cache), so we
+ recognize them when they get swapped back in. */
+
#define FUTEX_HASHBITS 8
/*
@@ -45,7 +54,10 @@ struct futex_q {
struct list_head list;
wait_queue_head_t waiters;
- /* Page struct and offset within it. */
+ /* These match if mapping != NULL */
+ struct address_space *mapping;
+ unsigned long index;
+ /* Otherwise, the page itself. */
struct page *page;
int offset;
@@ -57,7 +69,6 @@ struct futex_q {
struct file *filp;
};
-/* The key for the hash is the address + index + offset within page */
static struct list_head futex_queues[1<<FUTEX_HASHBITS];
static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
@@ -84,11 +95,68 @@ static inline void unlock_futex_mm(void)
spin_unlock(¤t->mm->page_table_lock);
}
-/* The struct page is shared, so we can hash on its address. */
-static inline struct list_head *hash_futex(struct page *page, int offset)
+static inline int page_matches(struct page *page, struct futex_q *elem)
{
- return &futex_queues[hash_long((unsigned long)page + offset,
- FUTEX_HASHBITS)];
+ if (elem->mapping)
+ return page->mapping == elem->mapping
+ && page->index == elem->index;
+ return page == elem->page;
+}
+
+/* For pages which are file backed, we can simply hash by mapping and
+ * index, which is persistent as they get swapped out. For anonymous
+ * regions, we hash by the actual struct page *: their mapping will
+ * change and they will be rehashed if they are swapped out.
+ */
+static inline struct list_head *hash_futex(struct address_space *mapping,
+ unsigned long index,
+ struct page *page,
+ int offset)
+{
+ unsigned long hashin;
+ if (mapping)
+ hashin = (unsigned long)mapping + index;
+ else
+ hashin = (unsigned long)page;
+ return &futex_queues[hash_long(hashin+offset, FUTEX_HASHBITS)];
+}
+
+/* Called when we change page->mapping (or page->index). Can be in
+ * interrupt context. */
+void futex_rehash(struct page *page,
+ struct address_space *new_mapping, unsigned long new_index)
+{
+ unsigned int i;
+ unsigned long flags;
+ struct futex_q *q;
+ LIST_HEAD(gather);
+ static int rehash_count = 0;
+
+ spin_lock_irqsave(&futex_lock, flags);
+ rehash_count++;
+ for (i = 0; i < 1 << FUTEX_HASHBITS; i++) {
+ struct list_head *l, *next;
+ list_for_each_safe(l, next, &futex_queues[i]) {
+ q = list_entry(l, struct futex_q, list);
+ if (page_matches(page, q)) {
+ list_del(&q->list);
+ list_add(&q->list, &gather);
+ printk("futex_rehash %i: offset %i %p -> %p\n",
+ rehash_count,
+ q->offset, page->mapping, new_mapping);
+ }
+ }
+ }
+ while (!list_empty(&gather)) {
+ q = list_entry(gather.next, struct futex_q, list);
+ q->mapping = new_mapping;
+ q->index = new_index;
+ q->page = page;
+ list_del(&q->list);
+ list_add(&q->list,
+ hash_futex(new_mapping, new_index, page, q->offset));
+ }
+ spin_unlock_irqrestore(&futex_lock, flags);
}
/*
@@ -162,12 +230,12 @@ static inline int futex_wake(unsigned lo
return -EFAULT;
}
- head = hash_futex(page, offset);
+ head = hash_futex(page->mapping, page->index, page, offset);
list_for_each_safe(i, next, head) {
struct futex_q *this = list_entry(i, struct futex_q, list);
- if (this->page == page && this->offset == offset) {
+ if (page_matches(page, this) && this->offset == offset) {
list_del_init(i);
__detach_vcache(&this->vcache);
wake_up_all(&this->waiters);
@@ -194,15 +262,16 @@ static inline int futex_wake(unsigned lo
static void futex_vcache_callback(vcache_t *vcache, struct page *new_page)
{
struct futex_q *q = container_of(vcache, struct futex_q, vcache);
- struct list_head *head = hash_futex(new_page, q->offset);
+ struct list_head *head = hash_futex(new_page->mapping, new_page->index,
+ new_page, q->offset);
spin_lock_irq(&futex_lock);
if (!list_empty(&q->list)) {
- put_page(q->page);
- q->page = new_page;
- __pin_page_atomic(new_page);
list_del(&q->list);
+ q->mapping = new_page->mapping;
+ q->index = new_page->index;
+ q->page = new_page;
list_add_tail(&q->list, head);
}
@@ -229,13 +298,13 @@ static inline int futex_requeue(unsigned
if (!page2)
goto out;
- head1 = hash_futex(page1, offset1);
- head2 = hash_futex(page2, offset2);
+ head1 = hash_futex(page1->mapping, page1->index, page1, offset1);
+ head2 = hash_futex(page2->mapping, page2->index, page2, offset2);
list_for_each_safe(i, next, head1) {
struct futex_q *this = list_entry(i, struct futex_q, list);
- if (this->page == page1 && this->offset == offset1) {
+ if (page_matches(page1, this) && this->offset == offset1) {
list_del_init(i);
__detach_vcache(&this->vcache);
if (++ret <= nr_wake) {
@@ -244,8 +313,6 @@ static inline int futex_requeue(unsigned
send_sigio(&this->filp->f_owner,
this->fd, POLL_IN);
} else {
- put_page(this->page);
- __pin_page_atomic (page2);
list_add_tail(i, head2);
__attach_vcache(&this->vcache, uaddr2,
current->mm, futex_vcache_callback);
@@ -272,11 +339,14 @@ static inline void __queue_me(struct fut
unsigned long uaddr, int offset,
int fd, struct file *filp)
{
- struct list_head *head = hash_futex(page, offset);
+ struct list_head *head
+ = hash_futex(page->mapping, page->index, page, offset);
q->offset = offset;
q->fd = fd;
q->filp = filp;
+ q->mapping = page->mapping;
+ q->index = page->index;
q->page = page;
list_add_tail(&q->list, head);
@@ -332,18 +402,19 @@ again:
*/
if (get_user(curval, (int *)uaddr) != 0) {
ret = -EFAULT;
- goto unlock;
+ goto putpage;
}
if (curval != val) {
ret = -EWOULDBLOCK;
- goto unlock;
+ goto putpage;
}
__queue_me(&q, page, uaddr, offset, -1, NULL);
add_wait_queue(&q.waiters, &wait);
set_current_state(TASK_INTERRUPTIBLE);
unlock_futex_mm();
+ put_page(page);
time = schedule_timeout(time);
@@ -352,7 +423,6 @@ again:
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
*/
- put_page(q.page);
/* Were we woken up (and removed from queue)? Always return
* success when this happens. */
@@ -368,6 +438,8 @@ again:
return ret;
+putpage:
+ put_page(page);
unlock:
unlock_futex_mm();
return ret;
@@ -378,7 +450,6 @@ static int futex_close(struct inode *ino
struct futex_q *q = filp->private_data;
unqueue_me(q);
- put_page(q->page);
kfree(filp->private_data);
return 0;
}
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .5947-2.6.0-test4-bk2-futex-swap.pre/mm/filemap.c .5947-2.6.0-test4-bk2-futex-swap/mm/filemap.c
--- .5947-2.6.0-test4-bk2-futex-swap.pre/mm/filemap.c 2003-08-25 11:58:36.000000000 +1000
+++ .5947-2.6.0-test4-bk2-futex-swap/mm/filemap.c 2003-08-28 17:52:47.000000000 +1000
@@ -27,6 +27,7 @@
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
+#include <linux/futex.h>
/*
* This is needed for the following functions:
* - try_to_release_page
@@ -92,6 +93,7 @@ void __remove_from_page_cache(struct pag
radix_tree_delete(&mapping->page_tree, page->index);
list_del(&page->list);
+ futex_rehash(page, NULL, 0);
page->mapping = NULL;
mapping->nrpages--;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .5947-2.6.0-test4-bk2-futex-swap.pre/mm/page_io.c .5947-2.6.0-test4-bk2-futex-swap/mm/page_io.c
--- .5947-2.6.0-test4-bk2-futex-swap.pre/mm/page_io.c 2003-02-07 19:18:58.000000000 +1100
+++ .5947-2.6.0-test4-bk2-futex-swap/mm/page_io.c 2003-08-28 17:52:47.000000000 +1000
@@ -19,6 +19,7 @@
#include <linux/buffer_head.h> /* for block_sync_page() */
#include <linux/mpage.h>
#include <linux/writeback.h>
+#include <linux/futex.h>
#include <asm/pgtable.h>
static struct bio *
@@ -151,6 +152,7 @@ int rw_swap_page_sync(int rw, swp_entry_
lock_page(page);
BUG_ON(page->mapping);
+ futex_rehash(page, &swapper_space, entry.val);
page->mapping = &swapper_space;
page->index = entry.val;
@@ -161,7 +163,10 @@ int rw_swap_page_sync(int rw, swp_entry_
ret = swap_writepage(page, &swap_wbc);
wait_on_page_writeback(page);
}
+ lock_page(page);
+ futex_rehash(page, NULL, 0);
page->mapping = NULL;
+ unlock_page(page);
if (ret == 0 && (!PageUptodate(page) || PageError(page)))
ret = -EIO;
return ret;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .5947-2.6.0-test4-bk2-futex-swap.pre/mm/swap_state.c .5947-2.6.0-test4-bk2-futex-swap/mm/swap_state.c
--- .5947-2.6.0-test4-bk2-futex-swap.pre/mm/swap_state.c 2003-08-12 06:58:06.000000000 +1000
+++ .5947-2.6.0-test4-bk2-futex-swap/mm/swap_state.c 2003-08-28 17:52:47.000000000 +1000
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
+#include <linux/futex.h>
#include <asm/pgtable.h>
@@ -200,6 +201,7 @@ int move_to_swap_cache(struct page *page
err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
if (!err) {
+ futex_rehash(page, &swapper_space, entry.val);
__remove_from_page_cache(page);
___add_to_page_cache(page, &swapper_space, entry.val);
}
@@ -236,6 +238,7 @@ int move_from_swap_cache(struct page *pa
err = radix_tree_insert(&mapping->page_tree, index, page);
if (!err) {
+ futex_rehash(page, mapping, index);
__delete_from_swap_cache(page);
___add_to_page_cache(page, mapping, index);
}
================
/* Test program for unpinned futexes. */
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <errno.h>
#include <string.h>
#include <sys/shm.h>
#include <sys/poll.h>
#include <sys/mman.h>
#include "usersem.h"
#define ARRAY_SIZE(arr) (sizeof(arr)/sizeof((arr)[0]))
#define streq(a,b) (strcmp((a),(b)) == 0)
#define UNTOUCHED_VALUE 7
#define TOUCHED_VALUE 100
/* Recognizable offsets within page so printks can easily identify the futex */
#define MMAP_OFFSET 8
#define SHMEM_OFFSET 16
#define ANON_OFFSET 24
static void chill(int size)
{
char *mem;
unsigned int i, numpages;
unsigned int pagesize = getpagesize();
numpages = size * 1024*1024 / pagesize;
mem = malloc(numpages * pagesize);
if (!mem) {
fprintf(stderr, "Out of memory\n");
exit(1);
}
for (;;) {
for (i = 0; i < numpages; i++)
*(mem + i*pagesize) = '\0';
printf(".\n");
}
}
static void *adjust_to_offset(void *addr, unsigned int off)
{
unsigned int pagesize = getpagesize();
while ((unsigned long)addr % pagesize != off)
addr++;
return addr;
}
/* Give an FD waiting on futex at this addr. */
static int fd_for_addr(void *addr)
{
return sys_futex(addr, FUTEX_FD, 0, NULL);
}
static void *get_anon_page(void)
{
/* Two pages, so adjust_to_offset() can't step past the end of the buffer. */
return malloc(2 * getpagesize());
}
static void *get_shared_memory(void)
{
unsigned int pagesize = getpagesize();
void *memory;
static int shm;
shm = shmget(IPC_PRIVATE, pagesize, IPC_CREAT | 0600);
if (shm < 0) {
perror("shmget");
return NULL;
}
memory = shmat(shm, NULL, 0);
if (memory == (void *)-1) {
perror("shmat");
shmctl(shm, IPC_RMID, NULL);
return NULL;
}
/* Delete when we exit. */
shmctl(shm, IPC_RMID, NULL);
return memory;
}
static void *get_mmap(void)
{
unsigned int pagesize = getpagesize();
void *memory;
int fd;
fd = open("/tmp/test-swap2", O_CREAT|O_EXCL|O_RDWR, 0600);
if (fd < 0) {
perror("Opening /tmp/test-swap2");
return NULL;
}
unlink("/tmp/test-swap2");
if (write(fd, malloc(pagesize), pagesize) != pagesize) {
perror("Writing /tmp/test-swap2");
return NULL;
}
memory = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (memory == MAP_FAILED) {
perror("mmap /tmp/test-swap2");
return NULL;
}
return memory;
}
/* Touching the page swaps it back in, which is slightly different
* from waking on a page which is swapped out. */
static void wake_futex(int *futex, int touch, const char action[])
{
if (touch)
*futex = TOUCHED_VALUE;
printf("Waking up %p\n", futex);
if (sys_futex(futex, FUTEX_WAKE, 1, NULL) != 1) {
perror(action);
exit(1);
}
}
int main(int argc, char *argv[])
{
int *map, *anon, *shm, touch = 0;
pid_t child;
char c;
struct pollfd pollarr[3];
if (argc > 1 && streq(argv[1], "-t")) {
touch = 1;
argc--;
argv++;
}
if (argc != 2) {
fprintf(stderr, "Usage: test-swap2 [-t] <mb-to-thrash>\n"
" Where -t means touch before waking futexes\n");
exit(1);
}
map = get_mmap();
anon = get_anon_page();
shm = get_shared_memory();
if (!map || !anon || !shm)
exit(1);
/* Different offsets so we can recognize them in printks. */
map = adjust_to_offset(map, MMAP_OFFSET);
anon = adjust_to_offset(anon, ANON_OFFSET);
shm = adjust_to_offset(shm, SHMEM_OFFSET);
*map = *anon = *shm = UNTOUCHED_VALUE;
pollarr[0].fd = fd_for_addr(map);
pollarr[1].fd = fd_for_addr(anon);
pollarr[2].fd = fd_for_addr(shm);
pollarr[0].events = pollarr[1].events = pollarr[2].events = POLLIN;
/* None should be ready yet */
if (poll(pollarr, ARRAY_SIZE(pollarr), 0)) {
perror("poll");
exit(1);
}
printf("MMAP=%u, SHM=%u, ANON=%u, beginning chill...\n",
MMAP_OFFSET, SHMEM_OFFSET, ANON_OFFSET);
child = fork();
if (child < 0) {
perror("fork");
exit(1);
}
if (child == 0)
chill(atoi(argv[1]));
/* Wait until they hit <CR>. */
if (read(0, &c, 1) != 1) {
perror("read");
exit(1);
}
kill(child, SIGTERM);
printf("Exiting...\n");
wake_futex(map, touch, "wake up mmap");
wake_futex(anon, touch, "wake up anon");
wake_futex(shm, touch, "wake up shm");
if (poll(pollarr, ARRAY_SIZE(pollarr), -1) != 3) {
perror("poll after wakeup");
exit(1);
}
if (touch)
c = TOUCHED_VALUE;
else
c = UNTOUCHED_VALUE;
if (*map != c) {
fprintf(stderr, "mmap didn't get value %u: it's %u\n",
c, *map);
exit(1);
}
if (*anon != c) {
fprintf(stderr, "anon didn't get value %u: it's %u\n",
c, *anon);
exit(1);
}
if (*shm != c) {
fprintf(stderr, "shm didn't get value %u: it's %u\n",
c, *shm);
exit(1);
}
return 0;
}
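/* Example run (the sizes here are illustrative only, not from the
 * original posting): on a box with 256MB of RAM,
 *
 *	./test-swap2 -t 512
 *
 * forks a child which keeps dirtying 512MB so the three futex pages get
 * pushed out to swap; hitting <CR> then kills the child, touches and
 * wakes each futex, waits for the three FUTEX_FD descriptors to become
 * ready, and checks each value was updated to TOUCHED_VALUE. */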
================
In message <[email protected]> you write:
> > I assumed that for non-anonymous pages the mapping + index was always
> > a unique identifier, even as they were swapped out. We need a
> > persistent unique identifier for a page, OR a callback to
> > unhash/rehash it when the identifier changes. Hence mapping + index
> > where mapping != NULL, and the struct page and callbacks for swap
> > pages. Using the callbacks for wherever else page->mapping changes is
> > simple (but may be slow).
>
> swap_writepage() and end_swap_bio_read() are not really companion
> functions. The page is in use and may be mapped into user pagetables
> during swap_writepage(). It won't actually be freed up for a very long
> time, if at all.
>
> I guess this means that there could be a large number of futexes which are
> considered "swapped out" which are in fact not swapped out at all.
>
> I'm starting to dimly understand what this code does. You get 2/10 for
> patch explanation ;)
Heh, you get 4/10 for reading comprehension. See: "Name: Futexes
without pinning pages" at the top of the patch.
lkml trains people to use Alan Cox-style maximal density descriptions,
to avoid being accused of insulting colleagues' intelligence (or being
accused of not being 31337 enough). You seem to be cut of a different
cloth. I will try to be more verbose here, and I think we'll all be
better for it.
> I think a better place to rehash the futex would be at the point where the
> page is added to and removed from swapcache.
This is simplest: the current code actually moves the futex queue out
of the hash. If we make the rule: "call futex_rehash" every time
page->mapping (or page->index) changes, we avoid races and make the
code simpler.
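In other words, every site that changes the mapping becomes (sketch of
the pattern, as in the hunks above):
	futex_rehash(page, new_mapping, new_index);
	page->mapping = new_mapping;
	page->index = new_index;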
But this means it could be called quite often. One answer is to
restrict the futex hashing so we don't have to search the entire
table. Another is to have a separate futexed pages hash. The third
is to implement the "futex_was_here" bit in the page->flags, which I
think will work well in practice. I'll implement it as a separate
patch however.
> Similarly, all places which change the page's hash keys (mapping and index)
> need to be locked against the futex lookup code.
Yes. I'll look at everywhere that mapping is changed: thanks for the
hints [snipped].
> Please make sure it builds with CONFIG_SWAP=n
>
> Please make sure it builds with CONFIG_FUTEX=n (sorry)
Will do.
> Please augment the lock ranking comment at the top of filemap.c
Yes.
> If a futex resides in a pagecache page which is then truncated, a
> futex_wake() should really send the caller a SIGBUS; it looks like the code
> will return -EFAULT, which is good enough. Any waiters on that futex will
> not be wakeable, but they will be killable.
It would be nice, but it's not worth more than a couple of lines of
kernel code. There's a similar case where one thread is waiting in an
mmapped file and it is unmapped by the other thread. It's a
programmer bug since obviously no one can now wake the futex you're
waiting on...
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Rusty Russell <[email protected]> wrote:
>
> > I think a better place to rehash the futex would be at the point where the
> > page is added to and removed from swapcache.
>
> This is simplest: the current code actually moves the futex queue out
> of the hash. If we make the rule: "call futex_rehash" every time
> page->mapping (or page->index) changes, we avoid races and make the
> code simpler.
>
> But this means it could be called quite often.
Moving pages to and from swapcache really is not a fastpath at all,
so I wouldn't be worrying about that.
And even if the code is sucky, it will only be sucky when there is a lot of
swapcache activity AND a lot of futexes are in use.
In message <[email protected]> you write:
> Rusty Russell <[email protected]> wrote:
> > But this means it could be called quite often.
>
> Moving pages to and from swapcache really is not a fastpath at all,
> so I wouldn't be worrying about that.
>
> And even if the code is sucky, it will only be sucky when there is a lot of
> swapcache activity AND a lot of futexes are in use.
Well, the patch I posted adds a futex_rehash to ___add_to_page_cache,
which seems to get called more often (ie. even for new pages added to the
page cache), but it is by far the most logical and simple solution
(ie. where you actually change the mapping, the locking *has* to be
sufficient).
Walking a 256 entry hash table isn't free even if it's empty. Example
patch below.
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Name: Futexes Rehash Optimization
Author: Rusty Russell
Status: Experimental
Depends: Misc/futex-swap.patch.gz
D: The current futex_rehash() call walks the entire hash table, which
D: is slow. The simplest optimization is to have a page->flags bit
D: which indicates a futex has been placed in this page: we can clear
D: it if futex_rehash() finds no futexes.
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .5816-2.6.0-test4-bk2-futex-page-bit.pre/include/linux/page-flags.h .5816-2.6.0-test4-bk2-futex-page-bit/include/linux/page-flags.h
--- .5816-2.6.0-test4-bk2-futex-page-bit.pre/include/linux/page-flags.h 2003-06-23 10:52:59.000000000 +1000
+++ .5816-2.6.0-test4-bk2-futex-page-bit/include/linux/page-flags.h 2003-08-29 13:43:32.000000000 +1000
@@ -75,6 +75,7 @@
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */
+#define PG_futexed 20 /* Has/had a futex waiting in it */
/*
@@ -267,6 +268,10 @@ extern void get_full_page_state(struct p
#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags)
#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags)
+#define PageFutexed(page) test_bit(PG_futexed, &(page)->flags)
+#define SetPageFutexed(page) set_bit(PG_futexed, &(page)->flags)
+#define ClearPageFutexed(page) clear_bit(PG_futexed, &(page)->flags)
+
/*
* The PageSwapCache predicate doesn't use a PG_flag at this time,
* but it may again do so one day.
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .5816-2.6.0-test4-bk2-futex-page-bit.pre/kernel/futex.c .5816-2.6.0-test4-bk2-futex-page-bit/kernel/futex.c
--- .5816-2.6.0-test4-bk2-futex-page-bit.pre/kernel/futex.c 2003-08-29 13:43:32.000000000 +1000
+++ .5816-2.6.0-test4-bk2-futex-page-bit/kernel/futex.c 2003-08-29 13:43:32.000000000 +1000
@@ -130,6 +131,10 @@ void futex_rehash(struct page *page,
static int rehash_count = 0;
spin_lock_irqsave(&futex_lock, flags);
+ if (likely(!PageFutexed(page)))
+ goto out;
+
+ ClearPageFutexed(page);
rehash_count++;
for (i = 0; i < 1 << FUTEX_HASHBITS; i++) {
struct list_head *l, *next;
@@ -141,6 +145,7 @@ void futex_rehash(struct page *page,
if (page_matches(page, q)) {
list_del(&q->list);
list_add(&q->list, &gather);
+ SetPageFutexed(page);
printk("futex_rehash %i: offset %i %p -> %p\n",
rehash_count,
q->offset, page->mapping, new_mapping);
@@ -156,6 +161,7 @@ void futex_rehash(struct page *page,
list_add(&q->list,
hash_futex(new_mapping, new_index, page, q->offset));
}
+out:
spin_unlock_irqrestore(&futex_lock, flags);
}
@@ -348,6 +354,7 @@ static inline void __queue_me(struct fut
q->mapping = page->mapping;
q->index = page->index;
q->page = page;
+ SetPageFutexed(page);
list_add_tail(&q->list, head);
/*
Rusty Russell <[email protected]> wrote:
>
> Well, the patch I posted adds a futex_rehash to ___add_to_page_cache,
That's a real hotpath. Certainly we couldn't take a global lock there.
Need to find a way to rehash when moving pages around only in swapcache. I
thought the earlier patches were structured that way?
In message <[email protected]> you write:
> Rusty Russell <[email protected]> wrote:
> >
> > Well, the patch I posted adds a futex_rehash to ___add_to_page_cache,
>
> That's a real hotpath. Certainly we couldn't take a global lock there.
Agreed.
> Need to find a way to rehash when moving pages around only in swapcache. I
> thought the earlier patches were structured that way?
Yes, but I'm fairly sure they were racy. Which is enough for me to
dislike it.
Rehashing when the ->mapping changes is simple and clear. It's fairly
easy to push it up to the callers, and then I'll figure out which ones
are adding new pages, and which ones are moving pages from one
->mapping to another (these ones we care about). But it has to be
under the same locks (ie. rehashing and changing the mapping must be an
atomic operation), otherwise someone has to prove that no one can do a futex
lookup on the page with the new mapping before it's been updated in
the hash.
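In other words, each such site ends up shaped like this (sketch only;
the rehash and the ->mapping/->index assignment sit inside the one
lock):
	spin_lock(&futex_lock);
	futex_rehash(page, new_mapping, new_index);
	page->mapping = new_mapping;
	page->index = new_index;
	spin_unlock(&futex_lock);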
The other possibility is to use some lock we already have, rather than
the futex_lock, to protect that PG_futexed bit.
I'll keep working on it. I'm learning more about the VM, at least.
Thanks,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Rusty Russell wrote:
> Walking a 256 entry hash table isn't free even if it's empty. Example
> patch below.
You can avoid walking the whole hash table. Instead of this:
(page, offset) -> list of futexes at (p,o)
You can use a two-level map like this:
(page) -> (offset) -> list of futexes at (p,o)
Or you can use two one-level maps, like this:
(page) -> list of futexes at (p)
(page, offset) -> list of futexes at (p,o)
Either of these gives you the list of futexes given the page in
O(list_size). They also mean you don't need the page->flags bit to
get O(1) for a page with no futexes, but it's probably a good
optimisation anyway.
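A rough userspace sketch of the second variant (two one-level maps),
just to make the shape concrete. Everything below is invented for
illustration: it ignores the mapping/index vs struct page * split and
all locking, and is not kernel code.
#include <stdio.h>

#define NBUCKETS 256

struct waiter {
	const void *page;	/* stands in for the page's identity */
	int offset;		/* offset of the futex within the page */
	struct waiter *next_po;	/* (page, offset) chain, used for wake/wait */
	struct waiter *next_p;	/* per-page chain, used for rehash */
};

static struct waiter *by_page_offset[NBUCKETS];	/* (p,o) -> waiters at (p,o) */
static struct waiter *by_page[NBUCKETS];	/* (p)   -> all waiters in p */

static unsigned int bucket(const void *page, int offset)
{
	return (unsigned int)(((unsigned long)page >> 6) + offset) % NBUCKETS;
}

static void queue_waiter(struct waiter *w, const void *page, int offset)
{
	w->page = page;
	w->offset = offset;
	w->next_po = by_page_offset[bucket(page, offset)];
	by_page_offset[bucket(page, offset)] = w;
	w->next_p = by_page[bucket(page, 0)];
	by_page[bucket(page, 0)] = w;
}

/* The rehash walks only the per-page chain: O(waiters in this page),
 * not O(hash table size).  Re-inserting at the new buckets is elided. */
static void rehash_page(const void *page)
{
	struct waiter *w;

	for (w = by_page[bucket(page, 0)]; w; w = w->next_p)
		if (w->page == page)
			printf("would rehash waiter at offset %d\n", w->offset);
}

int main(void)
{
	static char page[4096];
	struct waiter a = { 0 }, b = { 0 };

	queue_waiter(&a, page, 8);
	queue_waiter(&b, page, 16);
	rehash_page(page);	/* finds both without scanning 256 buckets */
	return 0;
}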
-- Jamie
In message <[email protected]> you write:
> Rusty Russell wrote:
> > Walking a 256 entry hash table isn't free even if it's empty. Example
> > patch below.
>
> You can avoid walking the whole hash table. Instead of this:
>
> (page, offset) -> list of futexes at (p,o)
>
> You can use a two-level map like this:
>
> (page) -> (offset) -> list of futexes at (p,o)
>
> Or you can use two one-level maps, like this:
>
> (page) -> list of futexes at (p)
> (page, offset) -> list of futexes at (p,o)
>
> Either of these gives you the list of futexes given the page in
> O(list_size).
Yes, it's effectively segmenting your hash table, which reduces the
effective hash table size for futexes in the same page. I'm reluctant
to do that unless there's a real speed problem.
This is the latest version of my futex patch. I think we're getting
closer.
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Name: Futexes without pinning pages
Author: Rusty Russell
Status: Tested on 2.6.0-test4-bk2
Depends: Misc/futex-minor-tweaks.patch.gz
Depends: Misc/qemu-page-offset.patch.gz
D: Avoid pinning pages with futexes in them, to resolve FUTEX_FD DoS.
D: This means we need another way to uniquely identify pages, rather
D: than simply comparing the "struct page *" (and using a hash table
D: based on the "struct page *"). For file-backed pages, the
D: page->mapping and page->index provide a unique identifier, which is
D: persistent even if they get swapped out to the file and back.
D:
D: For anonymous pages, page->mapping == NULL. So for this case we
D: use the "struct page *" itself to hash and compare: if the page is
D: going to be swapped out, the mapping will be changed (to
D: &swapper_space).
D:
D: We need to catch cases where the mapping changes (anon or tmpfs
D: pages moving into swapcache and back): for these we now have a
D: futex_rehash() callback to rehash all futexes in that page, which
D: must be done atomic with the change in ->mapping, so we hold the
D: futex_lock around both. (This also means any calls to hash_futex()
D: must be inside the futex lock, as in futex_vcache_callback).
D:
D: Since most calls to ___add_to_page_cache() (which actually does the
D: ->mapping change) are actually new pages (for which the
D: futex_rehash is not required), we do the locking and futex_rehash
D: in the (few) callers who require it.
D:
D: The main twist is that add_to_page_cache() can only be called for
D: pages which are actually unused (ie. new pages in which there can
D: be no futexes): add_to_swap() called it on already "live" pages.
D: So a new variant "move_to_page_cache()" which calls the
D: futex_rehash() is added, and called from add_to_swap().
D:
D: One remaining FIXME is in truncate. It would be nice to wake all
D: futexes in mmaped pages when the pages are truncated underneath the
D: mmap: the callers would then get -EFAULT. It's a politeness thing,
D: not a requirement.
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29219-2.6.0-test4-bk2-futex-swap.pre/include/linux/futex.h .29219-2.6.0-test4-bk2-futex-swap/include/linux/futex.h
--- .29219-2.6.0-test4-bk2-futex-swap.pre/include/linux/futex.h 2003-05-27 15:02:21.000000000 +1000
+++ .29219-2.6.0-test4-bk2-futex-swap/include/linux/futex.h 2003-09-01 14:10:33.000000000 +1000
@@ -1,9 +1,9 @@
#ifndef _LINUX_FUTEX_H
#define _LINUX_FUTEX_H
+#include <linux/config.h>
+#include <linux/spinlock.h>
/* Second argument to futex syscall */
-
-
#define FUTEX_WAIT (0)
#define FUTEX_WAKE (1)
#define FUTEX_FD (2)
@@ -17,4 +17,22 @@ asmlinkage long sys_futex(u32 __user *ua
long do_futex(unsigned long uaddr, int op, int val,
unsigned long timeout, unsigned long uaddr2, int val2);
+/* To tell us when the page->mapping or page->index changes on page
+ * which might have futex: must hold futex_lock across futex_rehash
+ * and actual change. */
+extern spinlock_t futex_lock;
+struct page;
+struct address_space;
+#ifdef CONFIG_FUTEX
+extern void futex_rehash(struct page *page,
+ struct address_space *new_mapping,
+ unsigned long new_index);
+#else
+static inline void futex_rehash(struct page *page,
+ struct address_space *new_mapping,
+ unsigned long new_index)
+{
+}
+#endif /*CONFIG_FUTEX*/
+
#endif
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29219-2.6.0-test4-bk2-futex-swap.pre/include/linux/pagemap.h .29219-2.6.0-test4-bk2-futex-swap/include/linux/pagemap.h
--- .29219-2.6.0-test4-bk2-futex-swap.pre/include/linux/pagemap.h 2003-08-25 11:58:34.000000000 +1000
+++ .29219-2.6.0-test4-bk2-futex-swap/include/linux/pagemap.h 2003-09-01 14:10:33.000000000 +1000
@@ -94,6 +94,8 @@ int add_to_page_cache(struct page *page,
unsigned long index, int gfp_mask);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
unsigned long index, int gfp_mask);
+int move_to_page_cache(struct page *page, struct address_space *mapping,
+ unsigned long index, int gfp_mask);
extern void remove_from_page_cache(struct page *page);
extern void __remove_from_page_cache(struct page *page);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29219-2.6.0-test4-bk2-futex-swap.pre/kernel/futex.c .29219-2.6.0-test4-bk2-futex-swap/kernel/futex.c
--- .29219-2.6.0-test4-bk2-futex-swap.pre/kernel/futex.c 2003-09-01 14:10:32.000000000 +1000
+++ .29219-2.6.0-test4-bk2-futex-swap/kernel/futex.c 2003-09-01 14:10:33.000000000 +1000
@@ -35,6 +35,15 @@
#include <linux/vcache.h>
#include <linux/mount.h>
+/* Futexes need to have a way of identifying pages which are the same,
+ when they may be in different address spaces (ie. virtual address
+ might be different, eg. shared mmap). We don't want to pin the
+ pages, so we use page->mapping & page->index where page->mapping is
+ not NULL (file-backed pages), and hash the page struct itself for
+ other pages. Callbacks rehash pages when page->mapping is changed
+ or set (such as when anonymous pages enter the swap cache), so we
+ recognize them when they get swapped back in. */
+
#define FUTEX_HASHBITS 8
/*
@@ -45,7 +54,10 @@ struct futex_q {
struct list_head list;
wait_queue_head_t waiters;
- /* Page struct and offset within it. */
+ /* These match if mapping != NULL */
+ struct address_space *mapping;
+ unsigned long index;
+ /* Otherwise, the page itself. */
struct page *page;
int offset;
@@ -57,9 +69,8 @@ struct futex_q {
struct file *filp;
};
-/* The key for the hash is the address + index + offset within page */
static struct list_head futex_queues[1<<FUTEX_HASHBITS];
-static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
extern void send_sigio(struct fown_struct *fown, int fd, int band);
@@ -84,11 +95,66 @@ static inline void unlock_futex_mm(void)
spin_unlock(&current->mm->page_table_lock);
}
-/* The struct page is shared, so we can hash on its address. */
-static inline struct list_head *hash_futex(struct page *page, int offset)
+static inline int page_matches(struct page *page, struct futex_q *elem)
{
- return &futex_queues[hash_long((unsigned long)page + offset,
- FUTEX_HASHBITS)];
+ if (elem->mapping)
+ return page->mapping == elem->mapping
+ && page->index == elem->index;
+ return page == elem->page;
+}
+
+/* For pages which are file backed, we can simply hash by mapping and
+ * index, which is persistent as they get swapped out. For anonymous
+ * regions, we hash by the actual struct page *: their mapping will
+ * change and they will be rehashed if they are swapped out.
+ */
+static inline struct list_head *hash_futex(struct address_space *mapping,
+ unsigned long index,
+ struct page *page,
+ int offset)
+{
+ unsigned long hashin;
+ if (mapping)
+ hashin = (unsigned long)mapping + index;
+ else
+ hashin = (unsigned long)page;
+ return &futex_queues[hash_long(hashin+offset, FUTEX_HASHBITS)];
+}
+
+/* Called when we change page->mapping (or page->index). Must be
+ * holding futex_lock across change of page->mapping and call to
+ * futex_rehash. */
+void futex_rehash(struct page *page,
+ struct address_space *new_mapping, unsigned long new_index)
+{
+ unsigned int i;
+ struct futex_q *q;
+ LIST_HEAD(gather);
+ static int rehash_count = 0;
+
+ rehash_count++;
+ for (i = 0; i < 1 << FUTEX_HASHBITS; i++) {
+ struct list_head *l, *next;
+ list_for_each_safe(l, next, &futex_queues[i]) {
+ q = list_entry(l, struct futex_q, list);
+ if (page_matches(page, q)) {
+ list_del(&q->list);
+ list_add(&q->list, &gather);
+ printk("futex_rehash %i: offset %i %p -> %p\n",
+ rehash_count,
+ q->offset, page->mapping, new_mapping);
+ }
+ }
+ }
+ while (!list_empty(&gather)) {
+ q = list_entry(gather.next, struct futex_q, list);
+ q->mapping = new_mapping;
+ q->index = new_index;
+ q->page = page;
+ list_del(&q->list);
+ list_add(&q->list,
+ hash_futex(new_mapping, new_index, page, q->offset));
+ }
}
/*
@@ -162,12 +228,12 @@ static inline int futex_wake(unsigned lo
return -EFAULT;
}
- head = hash_futex(page, offset);
+ head = hash_futex(page->mapping, page->index, page, offset);
list_for_each_safe(i, next, head) {
struct futex_q *this = list_entry(i, struct futex_q, list);
- if (this->page == page && this->offset == offset) {
+ if (page_matches(page, this) && this->offset == offset) {
list_del_init(i);
__detach_vcache(&this->vcache);
wake_up_all(&this->waiters);
@@ -194,15 +260,16 @@ static inline int futex_wake(unsigned lo
static void futex_vcache_callback(vcache_t *vcache, struct page *new_page)
{
struct futex_q *q = container_of(vcache, struct futex_q, vcache);
- struct list_head *head = hash_futex(new_page, q->offset);
+ struct list_head *head;
spin_lock(&futex_lock);
-
+ head = hash_futex(new_page->mapping, new_page->index,
+ new_page, q->offset);
if (!list_empty(&q->list)) {
- put_page(q->page);
- q->page = new_page;
- __pin_page_atomic(new_page);
list_del(&q->list);
+ q->mapping = new_page->mapping;
+ q->index = new_page->index;
+ q->page = new_page;
list_add_tail(&q->list, head);
}
@@ -229,13 +296,13 @@ static inline int futex_requeue(unsigned
if (!page2)
goto out;
- head1 = hash_futex(page1, offset1);
- head2 = hash_futex(page2, offset2);
+ head1 = hash_futex(page1->mapping, page1->index, page1, offset1);
+ head2 = hash_futex(page2->mapping, page2->index, page2, offset2);
list_for_each_safe(i, next, head1) {
struct futex_q *this = list_entry(i, struct futex_q, list);
- if (this->page == page1 && this->offset == offset1) {
+ if (page_matches(page1, this) && this->offset == offset1) {
list_del_init(i);
__detach_vcache(&this->vcache);
if (++ret <= nr_wake) {
@@ -244,8 +311,6 @@ static inline int futex_requeue(unsigned
send_sigio(&this->filp->f_owner,
this->fd, POLL_IN);
} else {
- put_page(this->page);
- __pin_page_atomic (page2);
list_add_tail(i, head2);
__attach_vcache(&this->vcache, uaddr2,
current->mm, futex_vcache_callback);
@@ -272,11 +337,14 @@ static inline void __queue_me(struct fut
unsigned long uaddr, int offset,
int fd, struct file *filp)
{
- struct list_head *head = hash_futex(page, offset);
+ struct list_head *head
+ = hash_futex(page->mapping, page->index, page, offset);
q->offset = offset;
q->fd = fd;
q->filp = filp;
+ q->mapping = page->mapping;
+ q->index = page->index;
q->page = page;
list_add_tail(&q->list, head);
@@ -332,18 +400,19 @@ again:
*/
if (get_user(curval, (int *)uaddr) != 0) {
ret = -EFAULT;
- goto unlock;
+ goto putpage;
}
if (curval != val) {
ret = -EWOULDBLOCK;
- goto unlock;
+ goto putpage;
}
__queue_me(&q, page, uaddr, offset, -1, NULL);
add_wait_queue(&q.waiters, &wait);
set_current_state(TASK_INTERRUPTIBLE);
unlock_futex_mm();
+ put_page(page);
time = schedule_timeout(time);
@@ -352,7 +421,6 @@ again:
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
*/
- put_page(q.page);
/* Were we woken up (and removed from queue)? Always return
* success when this happens. */
@@ -368,6 +436,8 @@ again:
return ret;
+putpage:
+ put_page(page);
unlock:
unlock_futex_mm();
return ret;
@@ -378,7 +448,6 @@ static int futex_close(struct inode *ino
struct futex_q *q = filp->private_data;
unqueue_me(q);
- put_page(q->page);
kfree(filp->private_data);
return 0;
}
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29219-2.6.0-test4-bk2-futex-swap.pre/mm/filemap.c .29219-2.6.0-test4-bk2-futex-swap/mm/filemap.c
--- .29219-2.6.0-test4-bk2-futex-swap.pre/mm/filemap.c 2003-08-25 11:58:36.000000000 +1000
+++ .29219-2.6.0-test4-bk2-futex-swap/mm/filemap.c 2003-09-01 14:10:33.000000000 +1000
@@ -27,6 +27,7 @@
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
+#include <linux/futex.h>
/*
* This is needed for the following functions:
* - try_to_release_page
@@ -79,6 +80,9 @@
* ->private_lock (try_to_unmap_one)
* ->page_lock (try_to_unmap_one)
* ->zone.lru_lock (follow_page->mark_page_accessed)
+ *
+ * ->mapping->page_lock
+ * ->futex_lock (move_to_page_cache, move_xxx_swap_cache)
*/
/*
@@ -220,16 +224,11 @@ restart:
* This adds a page to the page cache, starting out as locked, unreferenced,
* not uptodate and with no errors.
*
- * This function is used for two things: adding newly allocated pagecache
- * pages and for moving existing anon pages into swapcache.
- *
- * In the case of pagecache pages, the page is new, so we can just run
- * SetPageLocked() against it. The other page state flags were set by
- * rmqueue()
+ * This function is used for adding newly allocated pagecache pages.
+ * See move_to_page_cache for moving existing pages into pagecache.
*
- * In the case of swapcache, try_to_swap_out() has already locked the page, so
- * SetPageLocked() is ugly-but-OK there too. The required page state has been
- * set up by swap_out_add_to_swap_cache().
+ * The page is new, so we can just run SetPageLocked() against it.
+ * The other page state flags were set by rmqueue()
*
* This function does not add the page to the LRU. The caller must do that.
*/
@@ -264,6 +263,39 @@ int add_to_page_cache_lru(struct page *p
return ret;
}
+/*
+ * This is exactly like add_to_page_cache(), except the page may have
+ * a futex in it (ie. it's not a new page).
+ *
+ * This is currently called from try_to_swap_out(), which has already
+ * locked the page, so SetPageLocked() is unneeded, but harmless. The
+ * required page state has been set up by
+ * swap_out_add_to_swap_cache().
+ */
+int move_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t offset, int gfp_mask)
+{
+ int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+
+ if (error == 0) {
+ page_cache_get(page);
+ spin_lock(&mapping->page_lock);
+ error = radix_tree_insert(&mapping->page_tree, offset, page);
+ if (!error) {
+ SetPageLocked(page);
+ spin_lock(&futex_lock);
+ futex_rehash(page, mapping, offset);
+ ___add_to_page_cache(page, mapping, offset);
+ spin_unlock(&futex_lock);
+ } else {
+ page_cache_release(page);
+ }
+ spin_unlock(&mapping->page_lock);
+ radix_tree_preload_end();
+ }
+ return error;
+}
+
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29219-2.6.0-test4-bk2-futex-swap.pre/mm/page_io.c .29219-2.6.0-test4-bk2-futex-swap/mm/page_io.c
--- .29219-2.6.0-test4-bk2-futex-swap.pre/mm/page_io.c 2003-02-07 19:18:58.000000000 +1100
+++ .29219-2.6.0-test4-bk2-futex-swap/mm/page_io.c 2003-09-01 14:10:33.000000000 +1000
@@ -19,6 +19,7 @@
#include <linux/buffer_head.h> /* for block_sync_page() */
#include <linux/mpage.h>
#include <linux/writeback.h>
+#include <linux/futex.h>
#include <asm/pgtable.h>
static struct bio *
@@ -151,8 +152,11 @@ int rw_swap_page_sync(int rw, swp_entry_
lock_page(page);
BUG_ON(page->mapping);
+ spin_lock(&futex_lock);
+ futex_rehash(page, &swapper_space, entry.val);
page->mapping = &swapper_space;
page->index = entry.val;
+ spin_unlock(&futex_lock);
if (rw == READ) {
ret = swap_readpage(NULL, page);
@@ -161,7 +165,10 @@ int rw_swap_page_sync(int rw, swp_entry_
ret = swap_writepage(page, &swap_wbc);
wait_on_page_writeback(page);
}
+ spin_lock(&futex_lock);
+ futex_rehash(page, NULL, 0);
page->mapping = NULL;
+ spin_unlock(&futex_lock);
if (ret == 0 && (!PageUptodate(page) || PageError(page)))
ret = -EIO;
return ret;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29219-2.6.0-test4-bk2-futex-swap.pre/mm/swap_state.c .29219-2.6.0-test4-bk2-futex-swap/mm/swap_state.c
--- .29219-2.6.0-test4-bk2-futex-swap.pre/mm/swap_state.c 2003-08-12 06:58:06.000000000 +1000
+++ .29219-2.6.0-test4-bk2-futex-swap/mm/swap_state.c 2003-09-01 14:10:33.000000000 +1000
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
+#include <linux/futex.h>
#include <asm/pgtable.h>
@@ -140,8 +141,8 @@ int add_to_swap(struct page * page)
/*
* Add it to the swap cache and mark it dirty
*/
- err = add_to_page_cache(page, &swapper_space,
- entry.val, GFP_ATOMIC);
+ err = move_to_page_cache(page, &swapper_space,
+ entry.val, GFP_ATOMIC);
if (pf_flags & PF_MEMALLOC)
current->flags |= PF_MEMALLOC;
@@ -200,8 +201,11 @@ int move_to_swap_cache(struct page *page
err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
if (!err) {
+ spin_lock(&futex_lock);
+ futex_rehash(page, &swapper_space, entry.val);
__remove_from_page_cache(page);
___add_to_page_cache(page, &swapper_space, entry.val);
+ spin_unlock(&futex_lock);
}
spin_unlock(&mapping->page_lock);
@@ -236,8 +240,11 @@ int move_from_swap_cache(struct page *pa
err = radix_tree_insert(&mapping->page_tree, index, page);
if (!err) {
+ spin_lock(&futex_lock);
+ futex_rehash(page, mapping, index);
__delete_from_swap_cache(page);
___add_to_page_cache(page, mapping, index);
+ spin_unlock(&futex_lock);
}
spin_unlock(&mapping->page_lock);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29219-2.6.0-test4-bk2-futex-swap.pre/mm/truncate.c .29219-2.6.0-test4-bk2-futex-swap/mm/truncate.c
--- .29219-2.6.0-test4-bk2-futex-swap.pre/mm/truncate.c 2003-05-27 15:02:24.000000000 +1000
+++ .29219-2.6.0-test4-bk2-futex-swap/mm/truncate.c 2003-09-01 14:10:33.000000000 +1000
@@ -53,6 +53,7 @@ truncate_complete_page(struct address_sp
clear_page_dirty(page);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
+ /* FIXME: Rehash futexes, or send signal? --RR */
remove_from_page_cache(page);
page_cache_release(page); /* pagecache ref */
}
On Mon, 1 Sep 2003, Rusty Russell wrote:
>
> This is the latest version of my futex patch. I think we're getting
> closer.
Miscellaneous comments:
1. Please leave mm/page_io.c out of it. rw_swap_page_sync used to
be just for swapon to read the header page of a swap area: it seems
swapon no longer uses it, but kernel/power/swsusp.c has grabbed it.
It would be very much better if that had a bdev_write_page to match
its bdev_read_page, and we could delete rw_swap_page_sync; but that's
not for your patch (and I've no inclination to get into swsusp either).
rw_swap_page_sync is just a way of hacking page->mapping and page->index
temporarily, in order to make use of some lowlevel swapio routines. If
futexes really need to be looked up at that stage of swsusp, you're in
trouble anyway. And (though I may be quite wrong, not knowing swsusp)
it looks like doing futex_rehash there could actually introduce errors -
the page containing the corresponding struct futex_q would get suspended
to disk with mapping wrongly set to &swapper_space, so lookups would get
missed after resume.
2. Please leave mm/swap_state.c's move_to_swap_cache and move_from_swap_
cache out of it. I already explained how those are for tmpfs files, and
it's only the file mapping and index you need to worry about; you won't
see such a page while it's assigned to swapper_space. If you're anxious
to show that you've visited everywhere that modifies page->mapping, then
add a comment or BUG, but not code which could mislead people into
thinking futexes really need to be rehashed there.
3. Please remove reference to move_xxx_swap_cache from locking hierarchy
comment in mm/filemap.c. And reference to swap_out_add_to_swap_cache:
probably meant to say move_to_page_cache (I'm not keen on that name,
too much like the unrelated _swap_cache pair, but never mind, it'll do).
4. You've added a comment line to mm/truncate.c truncate_complete_page,
but that's the wrong place for it. If you are going to do something
about truncation, then you need to do it whether or not the page is
currently in cache: needs to be a mapping/indexrange thing not a struct
page* thing. Do you need to do anything at all? I hope not, but unsure.
5. If you're not doing anything in __remove_from_page_cache (rightly
trying to avoid hotpath), you do need to futex_rehash in mm/swap_state.c
__delete_from_swap_cache (last time I did say without the __s, but that
would miss an instance you need to catch). That will handle the swapoff
case amongst others.
6. The futex_q has no reference count on struct address_space *mapping.
A task might set a futex in a shared file mapping, get a futex fd,
unmap the mapping, delete the file, poll the fd, and be woken for
events on whatever (if anything) next got a struct address_space at
that same address? Is this a possible scenario, and is it a worry?
Hugh
On Mon, 1 Sep 2003, Hugh Dickins wrote:
>
> 5. If you're not doing anything in __remove_from_page_cache (rightly
> trying to avoid hotpath), you do need to futex_rehash in mm/swap_state.c
> __delete_from_swap_cache (last time I did say without the __s, but that
> would miss an instance you need to catch). That will handle the swapoff
> case amongst others.
Of course, the reason I originally said without the __s, was because
move_from_swap_cache uses __delete_from_swap_cache, and we don't want
interference there. So best convert that to use __remove_from_page_cache
instead, with INC_CACHE_INFO(del_total) outside the locking, after the
set_page_dirty: would improve symmetry between move_from_ and move_to_.
The instance of __delete_from_swap_cache I say you need to catch, is
that in remove_exclusive_swap_page; though I think its count restrictions
limit it to cornercases like your test program, rather than real
communication between processes. But hold on, in such a cornercase
(no other references to the swap), what if the process unmaps the vma
containing the futex while it is swapped out and not even in page cache?
Unclear.
Hugh
In message <[email protected]> you
write:
> On Mon, 1 Sep 2003, Rusty Russell wrote:
> >
> > This is the latest version of my futex patch. I think we're getting
> > closer.
>
> Miscellaneous comments:
Hi Hugh!
Thanks for the analysis.
> 1. Please leave mm/page_io.c out of it. rw_swap_page_sync used to
> be just for swapon to read the header page of a swap area: it seems
> swapon no longer uses it, but kernel/power/swsusp.c has grabbed it.
> It would be very much better if that had a bdev_write_page to match
> its bdev_read_page, and we could delete rw_swap_page_sync; but that's
> not for your patch (and I've no inclination to get into swsusp either).
Right, I just looked for everywhere that ->mapping was set, and
touched every one I wasn't *sure* was safe. I've deleted the change
to rw_swap_page_sync().
> 2. Please leave mm/swap_state.c's move_to_swap_cache and move_from_swap_
> cache out of it. I already explained how those are for tmpfs files, and
> it's only the file mapping and index you need to worry about, you won't
> see such a page while it's assigned to swapper_space. If you're anxious
> to show that you've visited everywhere that modifies page->mapping, then
> add a comment or BUG, but not code which could mislead people into
> thinking futexes really need to be rehashed there.
OK. I've removed it, too. Patch is getting shorter 8)
> 3. Please remove reference to move_xxx_swap_cache from locking hierarchy
> comment in mm/filemap.c.
Andrew told me to add a comment 8(. I've removed it again.
> And reference to swap_out_add_to_swap_cache:
> probably meant to say move_to_page_cache (I'm not keen on that name,
> too much like the unrelated _swap_cache pair, but never mind, it'll do).
Hmm, comment was moved, not modified. If it was wrong before, it's
wrong now 8(
> 4. You've added a comment line to mm/truncate.c truncate_complete_page,
> but that's the wrong place for it. If you are going to do something
> about truncation, then you need to do it whether or not the page is
> currently in cache: needs to be a mapping/indexrange thing not a struct
> page* thing. Do you need to do anything at all? I hope not, but unsure.
No, we don't *need* to do anything. If a file is mmapped and someone
truncates the file, it'd be nice to scan the futex table and wake
everyone so they can -EFAULT or SEGV. Similar case is whenever any
area is munmaped, it'd be nice to find any futexes waiting in those
pages and wake them. But it's not worth working up a sweat over,
IMHO, it's undefined behavior (don't do that, then).
A FIXME at the top of futex.c in case someone gets ambitious is a
better choice: done.
> 5. If you're not doing anything in __remove_from_page_cache (rightly
> trying to avoid hotpath), you do need to futex_rehash in mm/swap_state.c
> __delete_from_swap_cache (last time I did say without the __s, but that
> would miss an instance you need to catch). That will handle the swapoff
> case amongst others.
Thanks. I looked at the callers, and thought it unnecessary, but
only mm/vmscan.c's shrink_list seems to just free the page. Done.
> 6. The futex_q has no reference count on struct address_space *mapping.
> A task might set a futex in a shared file mapping, get a futex fd,
> unmap the mapping, delete the file, poll the fd, and be woken for
> events on whatever (if anything) next got a struct address_space at
> that same address? Is this a possible scenario, and is it a worry?
Yes. It's the same "it'd be nice to wake on unmap" problem: undefined
behavior but not a requirement.
A futex page bit would probably be the best way to do this sanely if
someone was interested.
Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Name: Minor futex comment tweaks and cleanups
Author: Rusty Russell
Status: Booted on 2.6.0-test4-bk3
D: Changes:
D:
D: (1) don't return 0 from futex_wait if we are somehow
D: spuriously woken up, loop in that case.
D:
D: (2) remove bogus comment about address no longer being in this
D: address space: we hold the mm lock, and __pin_page succeeded, so it
D: can't be true,
D:
D: (3) remove bogus comment about "get_user might fault and schedule",
D:
D: (4) clarify comment about hashing: we hash address of struct page,
D: not page itself,
D:
D: (5) remove list_empty check: we still hold the lock, so it can
D: never happen, and
D:
D: (6) single error exit path, and move __queue_me to the end (order
D: doesn't matter since we're inside the futex lock).
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .9416-linux-2.6.0-test4-bk2/kernel/futex.c .9416-linux-2.6.0-test4-bk2.updated/kernel/futex.c
--- .9416-linux-2.6.0-test4-bk2/kernel/futex.c 2003-05-27 15:02:23.000000000 +1000
+++ .9416-linux-2.6.0-test4-bk2.updated/kernel/futex.c 2003-08-27 12:34:53.000000000 +1000
@@ -84,9 +84,7 @@ static inline void unlock_futex_mm(void)
spin_unlock(&current->mm->page_table_lock);
}
-/*
- * The physical page is shared, so we can hash on its address:
- */
+/* The struct page is shared, so we can hash on its address. */
static inline struct list_head *hash_futex(struct page *page, int offset)
{
return &futex_queues[hash_long((unsigned long)page + offset,
@@ -311,67 +309,68 @@ static inline int futex_wait(unsigned lo
int val,
unsigned long time)
{
- DECLARE_WAITQUEUE(wait, current);
- int ret = 0, curval;
+ wait_queue_t wait;
+ int ret, curval;
struct page *page;
struct futex_q q;
+again:
init_waitqueue_head(&q.waiters);
+ init_waitqueue_entry(&wait, current);
lock_futex_mm();
page = __pin_page(uaddr - offset);
if (!page) {
- unlock_futex_mm();
- return -EFAULT;
+ ret = -EFAULT;
+ goto unlock;
}
- __queue_me(&q, page, uaddr, offset, -1, NULL);
/*
- * Page is pinned, but may no longer be in this address space.
+ * Page is pinned, but may be a kernel address.
* It cannot schedule, so we access it with the spinlock held.
*/
if (get_user(curval, (int *)uaddr) != 0) {
- unlock_futex_mm();
ret = -EFAULT;
- goto out;
+ goto unlock;
}
+
if (curval != val) {
- unlock_futex_mm();
ret = -EWOULDBLOCK;
- goto out;
+ goto unlock;
}
- /*
- * The get_user() above might fault and schedule so we
- * cannot just set TASK_INTERRUPTIBLE state when queueing
- * ourselves into the futex hash. This code thus has to
- * rely on the futex_wake() code doing a wakeup after removing
- * the waiter from the list.
- */
+
+ __queue_me(&q, page, uaddr, offset, -1, NULL);
add_wait_queue(&q.waiters, &wait);
set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&q.list)) {
- unlock_futex_mm();
- time = schedule_timeout(time);
- }
+ unlock_futex_mm();
+
+ time = schedule_timeout(time);
+
set_current_state(TASK_RUNNING);
/*
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
*/
- if (time == 0) {
- ret = -ETIMEDOUT;
- goto out;
- }
- if (signal_pending(current))
- ret = -EINTR;
-out:
- /* Were we woken up anyway? */
+ put_page(q.page);
+
+ /* Were we woken up (and removed from queue)? Always return
+ * success when this happens. */
if (!unqueue_me(&q))
ret = 0;
- put_page(q.page);
+ else if (time == 0)
+ ret = -ETIMEDOUT;
+ else if (signal_pending(current))
+ ret = -EINTR;
+ else
+ /* Spurious wakeup somehow. Loop. */
+ goto again;
return ret;
+
+unlock:
+ unlock_futex_mm();
+ return ret;
}
static int futex_close(struct inode *inode, struct file *filp)
Name: Futexes without pinning pages
Author: Rusty Russell
Status: Booted on 2.6.0-test4-bk3
Depends: Misc/futex-minor-tweaks.patch.gz
Depends: Misc/qemu-page-offset.patch.gz
D: Avoid pinning pages with futexes in them, to resolve FUTEX_FD DoS.
D: This means we need another way to uniquely identify pages, rather
D: than simply comparing the "struct page *" (and using a hash table
D: based on the "struct page *"). For file-backed pages, the
D: page->mapping and page->index provide a unique identifier, which is
D: persistent even if they get swapped out to the file and back.
D:
D: For anonymous pages, page->mapping == NULL. So for this case we
D: use the "struct page *" itself to hash and compare: if the page is
D: going to be swapped out, the mapping will be changed (to
D: &swapper_space).
D:
D: We need to catch cases where the mapping changes (anon or tmpfs
D: pages moving into swapcache and back): for these we now have a
D: futex_rehash() callback to rehash all futexes in that page, which
D: must be done atomic with the change in ->mapping, so we hold the
D: futex_lock around both. (This also means any calls to hash_futex()
D: must be inside the futex lock, as in futex_vcache_callback).
D:
D: Since most calls to ___add_to_page_cache() (which actually does the
D: ->mapping change) are actually new pages (for which the
D: futex_rehash is not required), we do the locking and futex_rehash
D: in the (few) callers who require it.
D:
D: The main twist is that add_to_page_cache() can only be called for
D: pages which are actually unused (ie. new pages in which there can
D: be no futexes): add_to_swap() called it on already "live" pages.
D: So a new variant "move_to_page_cache()" which calls the
D: futex_rehash() is added, and called from add_to_swap().
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .20125-linux-2.6.0-test4-bk3/include/linux/futex.h .20125-linux-2.6.0-test4-bk3.updated/include/linux/futex.h
--- .20125-linux-2.6.0-test4-bk3/include/linux/futex.h 2003-05-27 15:02:21.000000000 +1000
+++ .20125-linux-2.6.0-test4-bk3.updated/include/linux/futex.h 2003-09-02 12:49:33.000000000 +1000
@@ -1,9 +1,9 @@
#ifndef _LINUX_FUTEX_H
#define _LINUX_FUTEX_H
+#include <linux/config.h>
+#include <linux/spinlock.h>
/* Second argument to futex syscall */
-
-
#define FUTEX_WAIT (0)
#define FUTEX_WAKE (1)
#define FUTEX_FD (2)
@@ -17,4 +17,26 @@ asmlinkage long sys_futex(u32 __user *ua
long do_futex(unsigned long uaddr, int op, int val,
unsigned long timeout, unsigned long uaddr2, int val2);
+/* To tell us when the page->mapping or page->index changes on page
+ * which might have futex: must hold futex_lock across futex_rehash
+ * and actual change. */
+struct page;
+struct address_space;
+#ifdef CONFIG_FUTEX
+extern spinlock_t futex_lock;
+extern void futex_rehash(struct page *page,
+ struct address_space *new_mapping,
+ unsigned long new_index);
+#define lock_futex() spin_lock(&futex_lock)
+#define unlock_futex() spin_unlock(&futex_lock)
+#else
+static inline void futex_rehash(struct page *page,
+ struct address_space *new_mapping,
+ unsigned long new_index)
+{
+}
+#define lock_futex()
+#define unlock_futex()
+#endif /*CONFIG_FUTEX*/
+
#endif
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .20125-linux-2.6.0-test4-bk3/include/linux/pagemap.h .20125-linux-2.6.0-test4-bk3.updated/include/linux/pagemap.h
--- .20125-linux-2.6.0-test4-bk3/include/linux/pagemap.h 2003-08-25 11:58:34.000000000 +1000
+++ .20125-linux-2.6.0-test4-bk3.updated/include/linux/pagemap.h 2003-09-02 12:48:44.000000000 +1000
@@ -94,6 +94,8 @@ int add_to_page_cache(struct page *page,
unsigned long index, int gfp_mask);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
unsigned long index, int gfp_mask);
+int move_to_page_cache(struct page *page, struct address_space *mapping,
+ unsigned long index, int gfp_mask);
extern void remove_from_page_cache(struct page *page);
extern void __remove_from_page_cache(struct page *page);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .20125-linux-2.6.0-test4-bk3/kernel/futex.c .20125-linux-2.6.0-test4-bk3.updated/kernel/futex.c
--- .20125-linux-2.6.0-test4-bk3/kernel/futex.c 2003-09-02 12:48:43.000000000 +1000
+++ .20125-linux-2.6.0-test4-bk3.updated/kernel/futex.c 2003-09-02 12:48:44.000000000 +1000
@@ -36,6 +36,18 @@
#include <linux/vcache.h>
#include <linux/mount.h>
+/* Futexes need to have a way of identifying pages which are the same,
+ when they may be in different address spaces (ie. virtual address
+ might be different, eg. shared mmap). We don't want to pin the
+ pages, so we use page->mapping & page->index where page->mapping is
+ not NULL (file-backed pages), and hash the page struct itself for
+ other pages. Callbacks rehash pages when page->mapping is changed
+ or set (such as when anonymous pages enter the swap cache), so we
+ recognize them when they get swapped back in. */
+/* FIXME: It'd be polite to wake any waiters if the page is munmapped
+ underneath them (or mmaped file truncated). But I consider that
+ undefined behavior, so if someone implements it cheaply, great. --RR */
+
#define FUTEX_HASHBITS 8
/*
@@ -46,7 +58,10 @@ struct futex_q {
struct list_head list;
wait_queue_head_t waiters;
- /* Page struct and offset within it. */
+ /* These match if mapping != NULL */
+ struct address_space *mapping;
+ unsigned long index;
+ /* Otherwise, the page itself. */
struct page *page;
int offset;
@@ -58,9 +73,8 @@ struct futex_q {
struct file *filp;
};
-/* The key for the hash is the address + index + offset within page */
static struct list_head futex_queues[1<<FUTEX_HASHBITS];
-static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
/* Futex-fs vfsmount entry: */
static struct vfsmount *futex_mnt;
@@ -83,11 +97,66 @@ static inline void unlock_futex_mm(void)
spin_unlock(¤t->mm->page_table_lock);
}
-/* The struct page is shared, so we can hash on its address. */
-static inline struct list_head *hash_futex(struct page *page, int offset)
+static inline int page_matches(struct page *page, struct futex_q *elem)
{
- return &futex_queues[hash_long((unsigned long)page + offset,
- FUTEX_HASHBITS)];
+ if (elem->mapping)
+ return page->mapping == elem->mapping
+ && page->index == elem->index;
+ return page == elem->page;
+}
+
+/* For pages which are file backed, we can simply hash by mapping and
+ * index, which is persistent as they get swapped out. For anonymous
+ * regions, we hash by the actual struct page *: their mapping will
+ * change and they will be rehashed if they are swapped out.
+ */
+static inline struct list_head *hash_futex(struct address_space *mapping,
+ unsigned long index,
+ struct page *page,
+ int offset)
+{
+ unsigned long hashin;
+ if (mapping)
+ hashin = (unsigned long)mapping + index;
+ else
+ hashin = (unsigned long)page;
+ return &futex_queues[hash_long(hashin+offset, FUTEX_HASHBITS)];
+}
+
+/* Called when we change page->mapping (or page->index). Must be
+ * holding futex_lock across change of page->mapping and call to
+ * futex_rehash. */
+void futex_rehash(struct page *page,
+ struct address_space *new_mapping, unsigned long new_index)
+{
+ unsigned int i;
+ struct futex_q *q;
+ LIST_HEAD(gather);
+ static int rehash_count = 0;
+
+ rehash_count++;
+ for (i = 0; i < 1 << FUTEX_HASHBITS; i++) {
+ struct list_head *l, *next;
+ list_for_each_safe(l, next, &futex_queues[i]) {
+ q = list_entry(l, struct futex_q, list);
+ if (page_matches(page, q)) {
+ list_del(&q->list);
+ list_add(&q->list, &gather);
+ printk("futex_rehash %i: offset %i %p -> %p\n",
+ rehash_count,
+ q->offset, page->mapping, new_mapping);
+ }
+ }
+ }
+ while (!list_empty(&gather)) {
+ q = list_entry(gather.next, struct futex_q, list);
+ q->mapping = new_mapping;
+ q->index = new_index;
+ q->page = page;
+ list_del(&q->list);
+ list_add(&q->list,
+ hash_futex(new_mapping, new_index, page, q->offset));
+ }
}
/*
@@ -161,12 +230,12 @@ static inline int futex_wake(unsigned lo
return -EFAULT;
}
- head = hash_futex(page, offset);
+ head = hash_futex(page->mapping, page->index, page, offset);
list_for_each_safe(i, next, head) {
struct futex_q *this = list_entry(i, struct futex_q, list);
- if (this->page == page && this->offset == offset) {
+ if (page_matches(page, this) && this->offset == offset) {
list_del_init(i);
__detach_vcache(&this->vcache);
wake_up_all(&this->waiters);
@@ -193,15 +262,16 @@ static inline int futex_wake(unsigned lo
static void futex_vcache_callback(vcache_t *vcache, struct page *new_page)
{
struct futex_q *q = container_of(vcache, struct futex_q, vcache);
- struct list_head *head = hash_futex(new_page, q->offset);
+ struct list_head *head;
spin_lock(&futex_lock);
-
+ head = hash_futex(new_page->mapping, new_page->index,
+ new_page, q->offset);
if (!list_empty(&q->list)) {
- put_page(q->page);
- q->page = new_page;
- __pin_page_atomic(new_page);
list_del(&q->list);
+ q->mapping = new_page->mapping;
+ q->index = new_page->index;
+ q->page = new_page;
list_add_tail(&q->list, head);
}
@@ -228,13 +298,13 @@ static inline int futex_requeue(unsigned
if (!page2)
goto out;
- head1 = hash_futex(page1, offset1);
- head2 = hash_futex(page2, offset2);
+ head1 = hash_futex(page1->mapping, page1->index, page1, offset1);
+ head2 = hash_futex(page2->mapping, page2->index, page2, offset2);
list_for_each_safe(i, next, head1) {
struct futex_q *this = list_entry(i, struct futex_q, list);
- if (this->page == page1 && this->offset == offset1) {
+ if (page_matches(page1, this) && this->offset == offset1) {
list_del_init(i);
__detach_vcache(&this->vcache);
if (++ret <= nr_wake) {
@@ -243,8 +313,6 @@ static inline int futex_requeue(unsigned
send_sigio(&this->filp->f_owner,
this->fd, POLL_IN);
} else {
- put_page(this->page);
- __pin_page_atomic (page2);
list_add_tail(i, head2);
__attach_vcache(&this->vcache, uaddr2,
current->mm, futex_vcache_callback);
@@ -271,11 +339,14 @@ static inline void __queue_me(struct fut
unsigned long uaddr, int offset,
int fd, struct file *filp)
{
- struct list_head *head = hash_futex(page, offset);
+ struct list_head *head
+ = hash_futex(page->mapping, page->index, page, offset);
q->offset = offset;
q->fd = fd;
q->filp = filp;
+ q->mapping = page->mapping;
+ q->index = page->index;
q->page = page;
list_add_tail(&q->list, head);
@@ -331,18 +402,19 @@ again:
*/
if (get_user(curval, (int *)uaddr) != 0) {
ret = -EFAULT;
- goto unlock;
+ goto putpage;
}
if (curval != val) {
ret = -EWOULDBLOCK;
- goto unlock;
+ goto putpage;
}
__queue_me(&q, page, uaddr, offset, -1, NULL);
add_wait_queue(&q.waiters, &wait);
set_current_state(TASK_INTERRUPTIBLE);
unlock_futex_mm();
+ put_page(page);
time = schedule_timeout(time);
@@ -351,7 +423,6 @@ again:
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
*/
- put_page(q.page);
/* Were we woken up (and removed from queue)? Always return
* success when this happens. */
@@ -367,6 +438,8 @@ again:
return ret;
+putpage:
+ put_page(page);
unlock:
unlock_futex_mm();
return ret;
@@ -377,7 +450,6 @@ static int futex_close(struct inode *ino
struct futex_q *q = filp->private_data;
unqueue_me(q);
- put_page(q->page);
kfree(filp->private_data);
return 0;
}
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .20125-linux-2.6.0-test4-bk3/mm/filemap.c .20125-linux-2.6.0-test4-bk3.updated/mm/filemap.c
--- .20125-linux-2.6.0-test4-bk3/mm/filemap.c 2003-08-25 11:58:36.000000000 +1000
+++ .20125-linux-2.6.0-test4-bk3.updated/mm/filemap.c 2003-09-02 12:48:44.000000000 +1000
@@ -27,6 +27,7 @@
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
+#include <linux/futex.h>
/*
* This is needed for the following functions:
* - try_to_release_page
@@ -220,16 +221,11 @@ restart:
* This adds a page to the page cache, starting out as locked, unreferenced,
* not uptodate and with no errors.
*
- * This function is used for two things: adding newly allocated pagecache
- * pages and for moving existing anon pages into swapcache.
- *
- * In the case of pagecache pages, the page is new, so we can just run
- * SetPageLocked() against it. The other page state flags were set by
- * rmqueue()
+ * This function is used for adding newly allocated pagecache pages.
+ * See move_to_page_cache for moving existing pages into pagecache.
*
- * In the case of swapcache, try_to_swap_out() has already locked the page, so
- * SetPageLocked() is ugly-but-OK there too. The required page state has been
- * set up by swap_out_add_to_swap_cache().
+ * The page is new, so we can just run SetPageLocked() against it.
+ * The other page state flags were set by rmqueue()
*
* This function does not add the page to the LRU. The caller must do that.
*/
@@ -264,6 +260,39 @@ int add_to_page_cache_lru(struct page *p
return ret;
}
+/*
+ * This is exactly like add_to_page_cache(), except the page may have
+ * a futex in it (ie. it's not a new page).
+ *
+ * This is currently called from try_to_swap_out(), which has already
+ * locked the page, so SetPageLocked() is unneeded, but harmless. The
+ * required page state has been set up by
+ * swap_out_add_to_swap_cache().
+ */
+int move_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t offset, int gfp_mask)
+{
+ int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+
+ if (error == 0) {
+ page_cache_get(page);
+ spin_lock(&mapping->page_lock);
+ error = radix_tree_insert(&mapping->page_tree, offset, page);
+ if (!error) {
+ SetPageLocked(page);
+ lock_futex();
+ futex_rehash(page, mapping, offset);
+ ___add_to_page_cache(page, mapping, offset);
+ unlock_futex();
+ } else {
+ page_cache_release(page);
+ }
+ spin_unlock(&mapping->page_lock);
+ radix_tree_preload_end();
+ }
+ return error;
+}
+
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .20125-linux-2.6.0-test4-bk3/mm/swap_state.c .20125-linux-2.6.0-test4-bk3.updated/mm/swap_state.c
--- .20125-linux-2.6.0-test4-bk3/mm/swap_state.c 2003-08-12 06:58:06.000000000 +1000
+++ .20125-linux-2.6.0-test4-bk3.updated/mm/swap_state.c 2003-09-02 12:48:44.000000000 +1000
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
+#include <linux/futex.h>
#include <asm/pgtable.h>
@@ -96,7 +97,10 @@ void __delete_from_swap_cache(struct pag
BUG_ON(!PageLocked(page));
BUG_ON(!PageSwapCache(page));
BUG_ON(PageWriteback(page));
+ lock_futex();
+ futex_rehash(page, NULL, 0);
__remove_from_page_cache(page);
+ unlock_futex();
INC_CACHE_INFO(del_total);
}
@@ -140,8 +144,8 @@ int add_to_swap(struct page * page)
/*
* Add it to the swap cache and mark it dirty
*/
- err = add_to_page_cache(page, &swapper_space,
- entry.val, GFP_ATOMIC);
+ err = move_to_page_cache(page, &swapper_space,
+ entry.val, GFP_ATOMIC);
if (pf_flags & PF_MEMALLOC)
current->flags |= PF_MEMALLOC;
What happens after this sequence:
1. process A forks, making process B
2. B does FUTEX_FD, or splits into threads and one does FUTEX_WAIT,
on a private page that has not been written to since the fork
3. A does FUTEX_WAIT on the same address
3. The page is swapped out
4. B does FUTEX_WAKE at the same address
Won't the futex be hashed on the swap entry at step 4, so that
both processes are woken, yet only the waiter in B should be woken?
Related: could COW sharing after fork() explain the spurious wakeups I
saw mentioned earlier in the thread?
-- Jamie
On Tue, 2 Sep 2003, Jamie Lokier wrote:
> What happens after this sequence:
>
> 1. process A forks, making process B
> 2. B does FUTEX_FD, or splits into threads and one does FUTEX_WAIT,
> on a private page that has not been written to since the fork
> 3. A does FUTEX_WAIT on the same address
> 3. The page is swapped out
> 4. B does FUTEX_WAKE at the same address
>
> Won't the futex be hashed on the swap entry at step 4, so that
> both processes are woken, yet only the waiter in B should be woken?
I don't see that step 3 (the second!) makes any difference:
it behaves like that whether or not the page is swapped out, doesn't it?
I agree with you that behaviour seems wrong for a private anonymous page.
And we'd agree it's right behaviour for a shared file page. The case of
a MAP_PRIVATE or !PROT_WRITE file page may be harder to decide, but I'm
inclined to follow you and say distinction should depend on MAP_SHARED
(shm included as MAP_SHARED mapping of unnamed shmem object).
I know nothing of the user/glibc end of futexes, perhaps it makes
your case academic. But I'd still like a consistent definition
for how sys_futex should behave.
I'd been wondering along similar lines (worried by futex on uninstantiated
anon page, which would end up on the empty zero page), thinking futex.c's
__pin_page ought to pass write flag set to follow_page and get_user_pages.
But now (and I'd like to switch to capitals, but restrain myself) I think
most of the COW/vcache/callback/swap/page/pinning stuff is just a waste
of space and time, creating its own problems which it then has to solve.
When sys_futex passes a uaddr in a VM_MAYSHARE vma, it should be handled
by mapping/index (or inode/offset). When sys_futex passes a uaddr in a
!VM_MAYSHARE vma, it should be handled by mm/uaddr. (If outside vma?)
That's it. Doesn't a whole lot of code and complication fall away?
The physical page is pretty much irrelevant.
For a while I thought this would change the behaviour if futex is
mremapped. Well, yes, but nobody has remembered to do anything
about vcache in mremap anyway, so it's already broken.
What am I missing?
Hugh
Hugh Dickins wrote:
> > 1. process A forks, making process B
> > 2. B does FUTEX_FD, or splits into threads and one does FUTEX_WAIT,
> > on a private page that has not been written to since the fork
> > 3. A does FUTEX_WAIT on the same address
> > 3. The page is swapped out
> > 4. B does FUTEX_WAKE at the same address
> >
> > Won't the futex be hashed on the swap entry at step 4, so that
> > both processes are woken, yet only the waiter in B should be woken?
>
> I don't see that step 3 (the second!) makes any difference:
> it behaves like that whether or not the page is swapped out, doesn't it?
You're right.
I had assumed that the reformed futex keyed on (mapping, offset) for
shared pages and (mm, page) for private anonymous pages. But I see it
only keys on (page) in the latter case.
Now I see that it _ought_ to key on (mapping, index, offset) for
shared mappings, (mm, uaddr) for private mappings.
> I agree with you that behaviour seems wrong for a private anonymous page.
> And we'd agree it's right behaviour for a shared file page.
Yes.
> The case of a MAP_PRIVATE or !PROT_WRITE file page may be harder to
> decide, but I'm inclined to follow you and say distinction should
> depend on MAP_SHARED (shm included as MAP_SHARED mapping of unnamed
> shmem object).
Yes, absolutely.
> I know nothing of the user/glibc end of futexes, perhaps it makes
> your case academic. But I'd still like a consistent definition
> for how sys_futex should behave.
Futexes are actually a very nice primitive, and used by more than
glibc already. In particular, they are used inside database files and
similar things.
> I'd been wondering along similar lines (worried by futex on uninstantiated
> anon page, which would end up on the empty zero page), thinking futex.c's
> __pin_page ought to pass write flag set to follow_page and get_user_pages.
Shared read-only mappings should be futexable too. Also, there is no
need to clone a page at __pin_page() time, because the vcache should
take care of any subsequent COW. (Also it wouldn't work because the
page can become a COW page after the futex is established, due to a
subsequent fork).
> But now (and I'd like to switch to capitals, but restrain myself) I think
> most of the COW/vcache/callback/swap/page/pinning stuff is just a waste
> of space and time, creating its own problems which it then has to solve.
Not pinning pages fixes a real resource leak problem, and is obviously
the right thing to do - lots of threads queued up on futexes in a
database file, for example, should still be pageable as much as possible.
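(As a throwaway userspace sketch of that resource problem - my illustration, not code from any patch in this thread: one process can park a FUTEX_FD waiter on every page of a mapping, up to its fd limit, and with the old scheme every one of those waiters kept a page pinned in RAM. The 512-page size and the final pause() are arbitrary.)

#include <linux/futex.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
        size_t size = 512 * 4096;               /* 512 pages, arbitrary */
        char *map = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        size_t off;
        int n = 0;

        if (map == MAP_FAILED)
                return 1;
        for (off = 0; off < size; off += 4096) {
                int fd;
                map[off] = 1;   /* instantiate the page so it is a real one */
                /* FUTEX_FD: queue a waiter, return an fd, signal 0 = none */
                fd = syscall(SYS_futex, map + off, FUTEX_FD, 0,
                             NULL, NULL, 0);
                if (fd < 0)
                        break;                  /* most likely EMFILE */
                n++;
        }
        printf("%d pages now have a queued futex waiter\n", n);
        pause();                                /* keep the waiters queued */
        return 0;
}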
> When sys_futex passes a uaddr in a VM_MAYSHARE vma, it should be handled
> by mapping/index (or inode/offset). When sys_futex passes a uaddr in a
> !VM_MAYSHARE vma, it should be handled by mm/uaddr. (If outside vma?)
>
> That's it. Doesn't a whole lot of code and complication fall away?
> The physical page is pretty much irrelevant.
>
> For a while I thought this would change the behaviour if futex is
> mremapped. Well, yes, but nobody has remembered to do anything
> about vcache in mremap anyway, so it's already broken.
I have thought it over and I agree.
Keying on (mm,uaddr) instead of (page) does solve the problem of
private mappings and COW. It also removes the need for the page to be
in memory (a minor bonus, not an important reason).
The vcache is horribly broken with mremap(). If vcache is fixed to
rehash vcache entries on mremap, that could rehash the futexes too.
Rusty, I see that futex.c does a vcache lookup for every wait and wake
at the moment.
That means every wait and wake does two hash lookups at the moment.
We can reduce that to one :)
I think this solves the COW incorrect/spurious wakeups problem _and_
removes page pinning (without mm hooks) _and_ makes futex_rehash unnecessary _and_
speeds up futex operations. Credit for the idea goes to Hugh:
1. Remove all references to struct page from futex.c (!)
2. Call find_extend_vma instead of get_user_pages,
and decide how to look up the futex_q based on
(vma & VM_SHARED).
3. For (vma & VM_SHARED), look up futex_qs keyed on
(vma->vm_file, vma->vm_pgoff + (uaddr - vma->vm_start) >>
PAGE_SHIFT, offset).
4. For !(vma & VM_SHARED), look up futex_qs keyed on
(mm, uaddr).
Note how pages aren't pinned by this, vcache isn't needed, it is fine
with swapping, and works correctly with COW pages. tmpfs & shared
memory (all 3 kinds) are also correct: if they are mapped privately,
(mm, uaddr) is a fine key, and if they are mapped shared, (file,
offset) is a correct key at all times whether swapped or not.
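In rough standalone C (a model only - struct vma_model, its field names and
the constants below are made-up stand-ins, not the kernel's types), the key
selection amounts to this:

#include <stdio.h>

#define MODEL_PAGE_SHIFT 12

struct vma_model {
        int           shared;     /* stands in for vma->vm_flags & VM_SHARED */
        unsigned long vm_start;   /* start address of the mapping */
        unsigned long vm_pgoff;   /* file page offset of vm_start */
        void         *file;       /* stands in for vma->vm_file (or inode) */
        void         *mm;         /* stands in for vma->vm_mm */
};

/* Shared mappings key on (file, page offset in the file); private
 * mappings key on (mm, uaddr).  The offset within the page is hashed
 * in separately, as in the list above. */
static void futex_keys(const struct vma_model *vma, unsigned long uaddr,
                       unsigned long keys[2])
{
        if (vma->shared) {
                keys[0] = (unsigned long)vma->file;
                keys[1] = vma->vm_pgoff
                        + ((uaddr - vma->vm_start) >> MODEL_PAGE_SHIFT);
        } else {
                keys[0] = (unsigned long)vma->mm;
                keys[1] = uaddr;
        }
}

int main(void)
{
        struct vma_model shared  = { 1, 0x40000000, 16, (void *)0xf11e, 0 };
        struct vma_model private = { 0, 0x08048000, 0, 0, (void *)0x3300 };
        unsigned long k[2];

        futex_keys(&shared, 0x40002004, k);     /* file key, pgoff 16+2 = 18 */
        printf("shared:  %#lx %#lx\n", k[0], k[1]);
        futex_keys(&private, 0x08049abc, k);    /* mm key, the uaddr itself */
        printf("private: %#lx %#lx\n", k[0], k[1]);
        return 0;
}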
mremap does not work with the above, but mremap is broken anyway, as
all futexes use the vcache at present which is broken by mremap. When
mremap is fixed to rehash a vcache address range, it can be fixed to
rehash a futex address range too.
The more I think about it, the better it looks. So sure, I am, that I
must have missed something. What do you think, Rusty?
-- Jamie
Andrew Morton wrote:
> Jamie Lokier <[email protected]> wrote:
> >
> > 3. For (vma & VM_SHARED), look up futex_qs keyed on
> > (vma->vm_file, vma->vm_pgoff + (uaddr - vma->vm_start) >>
> > PAGE_SHIFT, offset).
>
> That's a bit meaningless in non-linear mappings.
You think it's worth supporting futexes on non-linear mappings?
I see your point of view; they could be useful.
In that case, the (vma & VM_SHARED) case needs to look up the correct
pgoff or page after all.
It might be worth a VM_NONLINEAR flag, to skip walking the page table
in the non-linear case, but that's just an optimisation.
-- Jamie
Jamie Lokier <[email protected]> wrote:
>
> 3. For (vma & VM_SHARED), look up futex_qs keyed on
> (vma->vm_file, vma->vm_pgoff + (uaddr - vma->vm_start) >>
> PAGE_SHIFT, offset).
That's a bit meaningless in non-linear mappings.
In message <[email protected]> you write:
> What happens after this sequence:
>
> 1. process A forks, making process B
> 2. B does FUTEX_FD, or splits into threads and one does FUTEX_WAIT,
> on a private page that has not been written to since the fork
> 3. A does FUTEX_WAIT on the same address
> 3. The page is swapped out
> 4. B does FUTEX_WAKE at the same address
>
> Won't the futex be hashed on the swap entry at step 4, so that
> both processes are woken, yet only the waiter in B should be woken?
Part of step (4) is to swap the page back in (see __pin_page).
> Related: could COW sharing after fork() explain the spurious wakeups I
> saw mentioned earlier in the thread?
In case others are sharing this misconception: there *are* no spurious
wakeups. But if they were to happen, the current code doesn't handle
them correctly, unlike every other primitive I know of in the kernel,
which is why I fixed it while tidying the code.
I don't know of a rule which says "thou shalt not wake a random thread
in the kernel": for all I know wierd things like CPU hotplug or
software suspend may do this in the future.
Hope that clarifies,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
In message <[email protected]> you
write:
> On Mon, 1 Sep 2003, Hugh Dickins wrote:
> >
> > 5. If you're not doing anything in __remove_from_page_cache (rightly
> > trying to avoid hotpath), you do need to futex_rehash in mm/swap_state.c
> > __delete_from_swap_cache (last time I did say without the __s, but that
> > would miss an instance you need to catch). That will handle the swapoff
> > case amongst others.
>
> Of course, the reason I originally said without the __s, was because
> move_from_swap_cache uses __delete_from_swap_cache, and we don't want
> interference there. So best convert that to use __remove_from_page_cache
> instead, with INC_CACHE_INFO(del_total) outside the locking, after the
> set_page_dirty: would improve symmetry between move_from_ and move_to_.
But you previously said:
Message-ID: <[email protected]>
> 2. Please leave mm/swap_state.c's move_to_swap_cache and move_from_swap_
> cache out of it. I already explained how those are for tmpfs files, and
> it's only the file mapping and index you need to worry about, you won't
> see a such page while it's assigned to swapper_space. If you're anxious
> to show that you've visited everywhere that modifies page->mapping, then
> add a comment or BUG, but not code which could mislead people into
> thinking futexes really need to be rehashed there.
Confused,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Rusty Russell <[email protected]> wrote:
>
> I don't know of a rule which says "thou shalt not wake a random thread
> in the kernel": for all I know wierd things like CPU hotplug or
> software suspend may do this in the future.
pdflush is sensitive to that. It emits angry squeaks if unexpectedly woken.
And up until a couple of months ago there were sporadic squeaking reports,
but they seem to have gone away.
Yes, we should treat a random wakeup like that as a bug.
On Tue, 2003-09-02 at 18:16, Andrew Morton wrote:
> Rusty Russell <[email protected]> wrote:
> >
> > I don't know of a rule which says "thou shalt not wake a random thread
> > in the kernel": for all I know wierd things like CPU hotplug or
> > software suspend may do this in the future.
>
> pdflush is sensitive to that. It emits angry squeaks if unexpectedly woken.
>
> And up until a couple of months ago there were sporadic squeaking reports,
> but they seem to have gone away.
I still run into the pdflush problem once a month or so, but only with
boxes that are up for a week or more. It usually takes the box down if
for no other reason than it's too busy printk()ing to do anything else.
I haven't been able to sysrq it and the particular box that it happens
on doesn't like NMIs so kgdb and the NMI oopser are out.
Are there any good reasons not to do something like the attached patch?
It would at least keep pdflush from evicting everything interesting that
may have preceded it in dmesg.
--
Dave Hansen
[email protected]
Dave Hansen <[email protected]> wrote:
>
> On Tue, 2003-09-02 at 18:16, Andrew Morton wrote:
> > Rusty Russell <[email protected]> wrote:
> > >
> > > I don't know of a rule which says "thou shalt not wake a random thread
> > > in the kernel": for all I know wierd things like CPU hotplug or
> > > software suspend may do this in the future.
> >
> > pdflush is sensitive to that. It emits angry squeaks if unexpectedly woken.
> >
> > And up until a couple of months ago there were sporadic squeaking reports,
> > but they seem to have gone away.
>
> I still run into the pdflush problem once a month or so, but only with
> boxes that are up for a week or more. It usually takes the box down if
> for no other reason than it's too busy prink()ing to do anything else.
That serves you right for not telling me!
> I haven't been able to sysrq it and the particular box that it happens
> on doesn't like NMIs so kgdb and the NMI oopser are out.
>
> Are there any good reasons not to do something like the attached patch?
> It would at least keep pdflush from evicting everthing interesting that
> may have preceded it in dmesg.
I'd prefer a more intricate patch which does something like the below.
Seriously, please: this shouldn't be happening. We need to work out the
cause.
mm/pdflush.c | 18 +++++++++++++++++-
1 files changed, 17 insertions(+), 1 deletion(-)
diff -puN mm/pdflush.c~pdflush-diag mm/pdflush.c
--- 25/mm/pdflush.c~pdflush-diag 2003-09-02 19:50:13.000000000 -0700
+++ 25-akpm/mm/pdflush.c 2003-09-02 19:53:38.000000000 -0700
@@ -84,6 +84,8 @@ struct pdflush_work {
unsigned long when_i_went_to_sleep;
};
+static int wakeup_count = 100;
+
static int __pdflush(struct pdflush_work *my_work)
{
daemonize("pdflush");
@@ -112,7 +114,10 @@ static int __pdflush(struct pdflush_work
spin_lock_irq(&pdflush_lock);
if (!list_empty(&my_work->list)) {
- printk("pdflush: bogus wakeup!\n");
+ if (wakeup_count > 0) {
+ wakeup_count--;
+ printk("pdflush: bogus wakeup!\n");
+ }
my_work->fn = NULL;
continue;
}
@@ -182,6 +187,7 @@ int pdflush_operation(void (*fn)(unsigne
{
unsigned long flags;
int ret = 0;
+ static int poke_count = 0;
if (fn == NULL)
BUG(); /* Hard to diagnose if it's deferred */
@@ -190,9 +196,19 @@ int pdflush_operation(void (*fn)(unsigne
if (list_empty(&pdflush_list)) {
spin_unlock_irqrestore(&pdflush_lock, flags);
ret = -1;
+ if (wakeup_count < 100 && poke_count < 10) {
+ printk("%s: no threads\n", __FUNCTION__);
+ dump_stack();
+ poke_count++;
+ }
} else {
struct pdflush_work *pdf;
+ if (wakeup_count < 100 && poke_count < 10) {
+ printk("%s: found a thread\n", __FUNCTION__);
+ dump_stack();
+ poke_count++;
+ }
pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
list_del_init(&pdf->list);
if (list_empty(&pdflush_list))
_
In message <[email protected]> you
write:
> On Tue, 2 Sep 2003, Hugh Dickins wrote:
> When sys_futex passes a uaddr in a VM_MAYSHARE vma, it should be handled
> by mapping/index (or inode/offset). When sys_futex passes a uaddr in a
> !VM_MAYSHARE vma, it should be handled by mm/uaddr. (If outside vma?)
>
> That's it. Doesn't a whole lot of code and complication fall away?
> The physical page is pretty much irrelevant.
The physical page is a relic from my original implementation, which
did "pin page and hash on it". Life was simple and good, and then
came FUTEX_FD (which allows more than one futex per process) and
before long Ingo found the COW issue, and added the vcache stuff.
Now, I am lost in a maze of VM hackers' advice, all slightly
different 8)
Assume that we do:
1) Look up vma.
2) If vma->vm_flags & VM_SHARED, index by page->mapping & page->index.
3) Otherwise, index by vma->vm_mm & uaddr.
Questions:
1) What is the difference between VM_SHARED and VM_MAYSHARE? They
always seem to be set/reset together.
2) If VM_SHARED, and page->mapping is NULL, what to do? AFAICT, this
can happen in the case of anonymous shared mappings, say mmap
/dev/zero MAP_SHARED and fork()? Treating it as !VM_SHARED (and
hence matching in mm & uaddr) won't work, since the mm's will be
different (and with mremap, the uaddrs may be different).
3) Since we need the offset in the file anyway for the VM_SHARED, it
makes more sense to use get_user_pages() to get the vma and page in
one call, rather than find_extend_vma().
4) mremap on a futex: same case as munmap, it's undefined behavior. A
correct program will need to re-wait on the futex anyway.
BTW, the other solution to the COW problem which Ingo thought about (I
was away on my honeymoon), was to have the child always get the copied
page, even if the parent caused the COW fault. If you also always
un-COW the page in FUTEX_WAIT, this scheme works. IIRC he said the
implementation was icky.
Thanks,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Rusty Russell wrote:
> The physical page is a relic from my original implementation, which
> did "pin page and hash on it". Life was simple and good, and then
> came FUTEX_FD (which allows more than one futex per process) and
> before Ingo found the COW issue, and added the vcache stuff.
Hi Rusty,
You will be pleased to know I have written a complete patch :)
> Assume that we do:
> 1) Look up vma.
> 2) If vma->vm_flags & VM_SHARED, index by page->mapping & page->index.
> 3) Otherwise, index by vma->vm_mm & uaddr.
Like that, but 2) uses vma->vm_file->f_dentry->d_inode.
That way, there is no need to walk the page table at all unless it's a
non-linear mapping (which my patch does handle).
> Questions:
> 1) What is the difference between VM_SHARED and VM_MAYSHARE? They
> always seem to be set/reset together.
Good question. No kernel code seems to check VM_MAYSHARE - the one to
check is VM_SHARED.
> 2) If VM_SHARED, and page->mapping is NULL, what to do? AFAICT, this
> can happen in the case of anonymous shared mappings, say mmap
> /dev/zero MAP_SHARED and fork()? Treating it as !VM_SHARED (and
> hence matching in mm & uaddr) won't work, since the mm's will be
> different (and with mremap, the uaddrs may be different).
No, that doesn't happen. An anonymous shared mapping calls
shmem_zero_setup(), which creates an anonymous tmpfs file to back the
mapping. It then looks the same as IPC shm or any other tmpfs file.
So it works :)
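Here is a tiny runnable sketch of that case (illustration only, nothing
below comes from the patch): both sides of the fork see the same
tmpfs-backed object, so the wake in the parent reaches the waiter in the
child regardless of which physical page is resident.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

static long futex(int *uaddr, int op, int val)
{
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

int main(void)
{
        /* Anonymous MAP_SHARED memory: shmem_zero_setup() gives it a
         * tmpfs inode, so (inode, pgoff) identifies the futex word in
         * both processes, swapped out or not. */
        int *f = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_ANONYMOUS, -1, 0);

        if (f == MAP_FAILED)
                return 1;
        *f = 0;
        if (fork() == 0) {
                futex(f, FUTEX_WAIT, 0);        /* sleep while *f == 0 */
                _exit(0);
        }
        sleep(1);                               /* crude: let the child queue */
        *f = 1;
        futex(f, FUTEX_WAKE, 1);                /* wake the child */
        wait(NULL);
        return 0;
}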
> 3) Since we need the offset in the file anyway for the VM_SHARED, it
> makes more sense to use get_user_pages() to get the vma and page in
> one call, rather than find_extend_vma().
You need the offset, but you don't need the page. For a linear
mapping, the offset is a very simple calculation - no page table lock
and no page table walk. As a silly bonus it doesn't touch the page.
For non-linear mappings, I try follow_page() and then
get_user_pages(), as usual, to get page->index. Technically you don't
need to swap the page in, but there's no point using complicated code
for that unimportant case.
I added a flag VM_NONLINEAR to distinguish them.
> 4) mremap on a futex: same case as munmap, it's undefined behavior. A
> correct program will need to re-wait on the futex anyway.
>
> BTW, the other solution to the COW problem which Ingo thought about (I
> was away on my honeymoon), was to have the child always get the copied
> page, even if the parent caused the COW fault. If you also always
> un-COW the page in FUTEX_WAIT, this scheme works. IIRC he said the
> implementation was icky.
That's icky in a lot of ways, including unnecessary un-COWing.
I have an obvious fix for mremap(): rehash all the futexes in its
range. That's not in the attached patch, but it will be in the next one.
Please take a look at the patch and see what you think. I have tried
some basic tests with it (playing with your futex-2.2 lib, and Red Hat
9 uses plenty of futexes while booting & running too).
Patch is against 2.6.0-test4. Net reduction of about 100 lines from
test4, too :)
Thanks,
-- Jamie
===========================================
include/linux/mm.h | 1
include/linux/vcache.h | 26 ---
kernel/futex.c | 368 +++++++++++++++++++++++++------------------------
mm/Makefile | 2
mm/fremap.c | 9 +
mm/memory.c | 2
mm/vcache.c | 90 -----------
7 files changed, 200 insertions(+), 298 deletions(-)
Patch name: futex-fixes-2.6.0-test4-01jl
This patch changes the way futexes are indexed, so that they do not
pin pages and also corrects some problems with private mappings and COW
pages.
Currently, all futexes look up the page at the userspace address and
pin it, using the pair (page,offset) as an index into a table of
waiting futexes. Any page with a futex waiting on it remains pinned
in RAM, which is a problem when many futexes are used, especially with
FUTEX_FD.
Another problem is that the page is not always the correct one, if it
can be changed later by a COW (copy on write) operation. This can
happen when waiting on a futex without writing to it after fork(),
exec() or mmap(), if the page is then written to before attempting to
wake a futex at the same address.
There are two symptoms of the COW problem: 1. The wrong process can
receive wakeups; 2. A process can fail to receive required wakeups.
This patch fixes both by changing the indexing so that VM_SHARED
mappings use the triple (inode,offset,index), and private mappings use
the pair (mm,virtual_address).
The former correctly handles all shared mappings, including tmpfs and
therefore all kinds of shared memory (IPC shm, /dev/shm and
MAP_ANON|MAP_SHARED). This works because every mapping which is
VM_SHARED has an associated non-zero vma->vm_file, and hence inode.
(This is ensured in do_mmap_pgoff, where it calls shmem_zero_setup).
The latter handles all private mappings, both files and anonymous. It
isn't affected by COW, because it doesn't care about the actual pages,
just the virtual address.
The only obvious problem is that mremap() can move a private mapping
without informing futexes waiting on that mapping. However, mremap()
was already broken with futexes, because it doesn't update the vcache,
which is used by futexes, so this just changes an existing bug.
(A later patch from me will fix this problem with mremap(), by moving
the futexes).
This patch has a few bonuses:
1. It removes the vcache implementation, as only futexes were
using it, and they don't any more.
2. Removing the vcache should make COW page faults a bit faster.
3. Futex operations no longer take the page table lock, walk
the page table, fault in pages that aren't mapped in the
page table, or do a vcache hash lookup - they are mostly a
simple offset calculation with one hash for the futex
table. So they should be noticeably faster.
4. The patch reduces the kernel size by 98 lines.
-- Jamie
==========================
diff -urN --exclude-from=dontdiff orig-2.6.0-test4/include/linux/mm.h laptop-2.6.0-test4/include/linux/mm.h
--- orig-2.6.0-test4/include/linux/mm.h 2003-09-02 23:06:10.000000000 +0100
+++ laptop-2.6.0-test4/include/linux/mm.h 2003-09-02 23:06:10.000000000 +0100
@@ -110,6 +110,7 @@
#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
+#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
diff -urN --exclude-from=dontdiff orig-2.6.0-test4/include/linux/vcache.h laptop-2.6.0-test4/include/linux/vcache.h
--- orig-2.6.0-test4/include/linux/vcache.h 2003-07-08 21:44:12.000000000 +0100
+++ laptop-2.6.0-test4/include/linux/vcache.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,26 +0,0 @@
-/*
- * virtual => physical mapping cache support.
- */
-#ifndef _LINUX_VCACHE_H
-#define _LINUX_VCACHE_H
-
-typedef struct vcache_s {
- unsigned long address;
- struct mm_struct *mm;
- struct list_head hash_entry;
- void (*callback)(struct vcache_s *data, struct page *new_page);
-} vcache_t;
-
-extern spinlock_t vcache_lock;
-
-extern void __attach_vcache(vcache_t *vcache,
- unsigned long address,
- struct mm_struct *mm,
- void (*callback)(struct vcache_s *data, struct page *new_page));
-
-extern void __detach_vcache(vcache_t *vcache);
-
-extern void invalidate_vcache(unsigned long address, struct mm_struct *mm,
- struct page *new_page);
-
-#endif
diff -urN --exclude-from=dontdiff orig-2.6.0-test4/kernel/futex.c laptop-2.6.0-test4/kernel/futex.c
--- orig-2.6.0-test4/kernel/futex.c 2003-07-08 21:44:25.000000000 +0100
+++ laptop-2.6.0-test4/kernel/futex.c 2003-09-03 06:07:22.922779181 +0100
@@ -5,6 +5,9 @@
* Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
* (C) Copyright 2003 Red Hat Inc, All Rights Reserved
*
+ * Changed to remove page pinning and fix privately mapped COW pages
+ * Copyright (C) Jamie Lokier 2003
+ *
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
@@ -32,7 +35,6 @@
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/futex.h>
-#include <linux/vcache.h>
#include <linux/mount.h>
#define FUTEX_HASHBITS 8
@@ -45,13 +47,10 @@
struct list_head list;
wait_queue_head_t waiters;
- /* Page struct and offset within it. */
- struct page *page;
+ /* Page keys and offset within the page. */
+ unsigned long keys[2];
int offset;
- /* the virtual => physical COW-safe cache */
- vcache_t vcache;
-
/* For fd, sigio sent using these. */
int fd;
struct file *filp;
@@ -67,85 +66,110 @@
static struct vfsmount *futex_mnt;
/*
- * These are all locks that are necessery to look up a physical
- * mapping safely, and modify/search the futex hash, atomically:
- */
-static inline void lock_futex_mm(void)
-{
- spin_lock(¤t->mm->page_table_lock);
- spin_lock(&vcache_lock);
- spin_lock(&futex_lock);
-}
-
-static inline void unlock_futex_mm(void)
-{
- spin_unlock(&futex_lock);
- spin_unlock(&vcache_lock);
- spin_unlock(¤t->mm->page_table_lock);
-}
-
-/*
- * The physical page is shared, so we can hash on its address:
+ * We hash on the keys returned from __get_page_keys (see below),
+ * and the offset into the page.
*/
-static inline struct list_head *hash_futex(struct page *page, int offset)
+static inline struct list_head *hash_futex(unsigned long key0,
+ unsigned long key1,
+ int offset)
{
- return &futex_queues[hash_long((unsigned long)page + offset,
- FUTEX_HASHBITS)];
+ return &futex_queues[hash_long(key0 + key1 + offset, FUTEX_HASHBITS)];
}
/*
- * Get kernel address of the user page and pin it.
+ * Get two parameters which are the keys for a futex
+ * other than the offset within page.
*
- * Must be called with (and returns with) all futex-MM locks held.
+ * For shared mappings, it's "vma->vm_file->f_dentry->d_inode" and
+ * "page->index". For private mappings, it's "current->mm" and "addr".
+ * We can usually work out the index without swapping in the page.
+ *
+ * Returns: 0, or negative error code.
+ * The two key words are stored in key[0] and key[1] on success.
+ *
+ * Should be called with &current->mm->mmap_sem,
+ * but NOT &futex_lock or &current->mm->page_table_lock.
*/
-static inline struct page *__pin_page_atomic (struct page *page)
-{
- if (!PageReserved(page))
- get_page(page);
- return page;
-}
-
-static struct page *__pin_page(unsigned long addr)
+static int __get_page_keys(unsigned long addr, unsigned long * keys)
{
struct mm_struct *mm = current->mm;
- struct page *page, *tmp;
+ struct vm_area_struct *vma;
+ struct page *page;
int err;
/*
- * Do a quick atomic lookup first - this is the fastpath.
+ * The futex is hashed differently depending on whether
+ * it's in a shared or private mapping. So check vma first.
*/
- page = follow_page(mm, addr, 0);
- if (likely(page != NULL))
- return __pin_page_atomic(page);
+ vma = find_extend_vma(mm, addr);
+
+ if (unlikely(!vma)) {
+#ifdef FIXADDR_USER_START
+ if (addr >= FIXADDR_USER_START && addr < FIXADDR_USER_END) {
+ keys[0] = 1; /* Different from any pointer value. */
+ keys[1] = addr - FIXADDR_USER_START;
+ return 0;
+ }
+#endif
+ return -EFAULT;
+ }
+
+ /*
+ * Permissions.
+ */
+ if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
+ return -EFAULT;
/*
- * No luck - need to fault in the page:
+ * Private mappings are handled in a simple way.
*/
-repeat_lookup:
+ if (likely(!(vma->vm_flags & VM_SHARED))) {
+ keys[0] = (unsigned long) mm;
+ keys[1] = addr;
+ return 0;
+ }
- unlock_futex_mm();
+ /*
+ * Linear mappings are also simple.
+ */
+ keys[0] = (unsigned long) vma->vm_file->f_dentry->d_inode;
+ if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
+ keys[1] = (((addr - vma->vm_start) >> PAGE_SHIFT)
+ + vma->vm_pgoff);
+ return 0;
+ }
- down_read(&mm->mmap_sem);
- err = get_user_pages(current, mm, addr, 1, 0, 0, &page, NULL);
- up_read(&mm->mmap_sem);
+ /*
+ * We could walk the page table to read the non-linear
+ * pte, and get the page index without fetching the page
+ * from swap. But that's a lot of code to duplicate here
+ * for a rare case, so we simply fetch the page.
+ */
- lock_futex_mm();
+ /*
+ * Do a quick atomic lookup first - this is the fastpath.
+ */
+ spin_lock(¤t->mm->page_table_lock);
+ page = follow_page(mm, addr, 0);
+ if (likely(page != NULL)) {
+ keys[1] = page->index;
+ spin_unlock(¤t->mm->page_table_lock);
+ return 0;
+ }
+ spin_unlock(¤t->mm->page_table_lock);
- if (err < 0)
- return NULL;
/*
- * Since the faulting happened with locks released, we have to
- * check for races:
+ * Do it the general way.
*/
- tmp = follow_page(mm, addr, 0);
- if (tmp != page) {
+ err = get_user_pages(current, mm, addr, 1, 0, 0, &page, NULL);
+ if (err >= 0) {
+ keys[1] = page->index;
put_page(page);
- goto repeat_lookup;
}
-
- return page;
+ return err;
}
+
/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
@@ -153,25 +177,25 @@
static inline int futex_wake(unsigned long uaddr, int offset, int num)
{
struct list_head *i, *next, *head;
- struct page *page;
- int ret = 0;
+ unsigned long keys[2];
+ int ret;
- lock_futex_mm();
+ down_read(¤t->mm->mmap_sem);
- page = __pin_page(uaddr - offset);
- if (!page) {
- unlock_futex_mm();
- return -EFAULT;
- }
+ ret = __get_page_keys(uaddr - offset, keys);
+ if (unlikely(ret != 0))
+ goto out;
- head = hash_futex(page, offset);
+ head = hash_futex(keys[0], keys[1], offset);
+ spin_lock(&futex_lock);
list_for_each_safe(i, next, head) {
struct futex_q *this = list_entry(i, struct futex_q, list);
- if (this->page == page && this->offset == offset) {
+ if (this->keys[0] == keys[0] && this->keys[1] == keys[1]
+ && this->offset == offset) {
+
list_del_init(i);
- __detach_vcache(&this->vcache);
wake_up_all(&this->waiters);
if (this->filp)
send_sigio(&this->filp->f_owner, this->fd, POLL_IN);
@@ -180,38 +204,14 @@
break;
}
}
+ spin_unlock(&futex_lock);
- unlock_futex_mm();
- put_page(page);
-
+out:
+ up_read(¤t->mm->mmap_sem);
return ret;
}
/*
- * This gets called by the COW code, we have to rehash any
- * futexes that were pending on the old physical page, and
- * rehash it to the new physical page. The pagetable_lock
- * and vcache_lock is already held:
- */
-static void futex_vcache_callback(vcache_t *vcache, struct page *new_page)
-{
- struct futex_q *q = container_of(vcache, struct futex_q, vcache);
- struct list_head *head = hash_futex(new_page, q->offset);
-
- spin_lock(&futex_lock);
-
- if (!list_empty(&q->list)) {
- put_page(q->page);
- q->page = new_page;
- __pin_page_atomic(new_page);
- list_del(&q->list);
- list_add_tail(&q->list, head);
- }
-
- spin_unlock(&futex_lock);
-}
-
-/*
* Requeue all waiters hashed on one physical page to another
* physical page.
*/
@@ -219,74 +219,66 @@
unsigned long uaddr2, int offset2, int nr_wake, int nr_requeue)
{
struct list_head *i, *next, *head1, *head2;
- struct page *page1 = NULL, *page2 = NULL;
- int ret = 0;
+ unsigned long keys1[2], keys2[2];
+ int ret;
- lock_futex_mm();
+ down_read(¤t->mm->mmap_sem);
- page1 = __pin_page(uaddr1 - offset1);
- if (!page1)
+ ret = __get_page_keys(uaddr2 - offset1, keys1);
+ if (unlikely(ret != 0))
goto out;
- page2 = __pin_page(uaddr2 - offset2);
- if (!page2)
+ ret = __get_page_keys(uaddr2 - offset2, keys2);
+ if (unlikely(ret != 0))
goto out;
- head1 = hash_futex(page1, offset1);
- head2 = hash_futex(page2, offset2);
+ head1 = hash_futex(keys1[0], keys1[1], offset1);
+ head2 = hash_futex(keys2[0], keys2[1], offset2);
+ spin_lock(&futex_lock);
list_for_each_safe(i, next, head1) {
struct futex_q *this = list_entry(i, struct futex_q, list);
- if (this->page == page1 && this->offset == offset1) {
+ if (this->keys[0] == keys1[0] && this->keys[1] == keys1[1]
+ && this->offset == offset1) {
+
list_del_init(i);
- __detach_vcache(&this->vcache);
if (++ret <= nr_wake) {
wake_up_all(&this->waiters);
if (this->filp)
send_sigio(&this->filp->f_owner,
this->fd, POLL_IN);
} else {
- put_page(this->page);
- __pin_page_atomic (page2);
list_add_tail(i, head2);
- __attach_vcache(&this->vcache, uaddr2,
- current->mm, futex_vcache_callback);
+ this->keys[0] = keys2[0];
+ this->keys[1] = keys2[1];
this->offset = offset2;
- this->page = page2;
if (ret - nr_wake >= nr_requeue)
break;
}
}
}
+ spin_unlock(&futex_lock);
out:
- unlock_futex_mm();
-
- if (page1)
- put_page(page1);
- if (page2)
- put_page(page2);
-
+ up_read(¤t->mm->mmap_sem);
return ret;
}
-static inline void __queue_me(struct futex_q *q, struct page *page,
- unsigned long uaddr, int offset,
- int fd, struct file *filp)
+static inline void queue_me(struct futex_q *q, unsigned long *keys,
+ unsigned long uaddr, int offset,
+ int fd, struct file *filp)
{
- struct list_head *head = hash_futex(page, offset);
+ struct list_head *head = hash_futex(keys[0], keys[1], offset);
+ q->keys[0] = keys[0];
+ q->keys[1] = keys[1];
q->offset = offset;
q->fd = fd;
q->filp = filp;
- q->page = page;
+ spin_lock(&futex_lock);
list_add_tail(&q->list, head);
- /*
- * We register a futex callback to this virtual address,
- * to make sure a COW properly rehashes the futex-queue.
- */
- __attach_vcache(&q->vcache, uaddr, current->mm, futex_vcache_callback);
+ spin_unlock(&futex_lock);
}
/* Return 1 if we were still queued (ie. 0 means we were woken) */
@@ -294,15 +286,12 @@
{
int ret = 0;
- spin_lock(&vcache_lock);
spin_lock(&futex_lock);
if (!list_empty(&q->list)) {
list_del(&q->list);
- __detach_vcache(&q->vcache);
ret = 1;
}
spin_unlock(&futex_lock);
- spin_unlock(&vcache_lock);
return ret;
}
@@ -312,65 +301,95 @@
unsigned long time)
{
DECLARE_WAITQUEUE(wait, current);
- int ret = 0, curval;
- struct page *page;
+ int ret, curval;
+ unsigned long keys[2];
struct futex_q q;
+ try_again:
init_waitqueue_head(&q.waiters);
- lock_futex_mm();
+ down_read(¤t->mm->mmap_sem);
- page = __pin_page(uaddr - offset);
- if (!page) {
- unlock_futex_mm();
- return -EFAULT;
- }
- __queue_me(&q, page, uaddr, offset, -1, NULL);
+ ret = __get_page_keys(uaddr - offset, keys);
+ if (unlikely(ret != 0))
+ goto out_release_sem;
+
+ queue_me(&q, keys, uaddr, offset, -1, NULL);
/*
- * Page is pinned, but may no longer be in this address space.
- * It cannot schedule, so we access it with the spinlock held.
+ * Access the page after the futex is queued.
+ * We hold the mmap semaphore, so the mapping cannot have changed
+ * since we looked it up.
*/
if (get_user(curval, (int *)uaddr) != 0) {
- unlock_futex_mm();
ret = -EFAULT;
- goto out;
+ goto out_unqueue;
}
if (curval != val) {
- unlock_futex_mm();
ret = -EWOULDBLOCK;
- goto out;
+ goto out_unqueue;
}
+
/*
- * The get_user() above might fault and schedule so we
- * cannot just set TASK_INTERRUPTIBLE state when queueing
- * ourselves into the futex hash. This code thus has to
+ * Now the futex is queued and we have checked the data, we
+ * don't want to hold mmap_sem while we sleep.
+ */
+ up_read(¤t->mm->mmap_sem);
+
+ /*
+ * There might have been scheduling since the queue_me(), as we
+ * cannot hold a spinlock across the get_user() in case it
+ * faults. So we cannot just set TASK_INTERRUPTIBLE state when
+ * queueing ourselves into the futex hash. This code thus has to
* rely on the futex_wake() code doing a wakeup after removing
* the waiter from the list.
*/
add_wait_queue(&q.waiters, &wait);
+ spin_lock(&futex_lock);
set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&q.list)) {
- unlock_futex_mm();
- time = schedule_timeout(time);
+
+ if (unlikely(list_empty(&q.list))) {
+ /*
+ * We were woken already.
+ */
+ spin_unlock(&futex_lock);
+ set_current_state(TASK_RUNNING);
+ return 0;
}
+
+ spin_unlock(&futex_lock);
+ time = schedule_timeout(time);
set_current_state(TASK_RUNNING);
+
/*
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
*/
- if (time == 0) {
- ret = -ETIMEDOUT;
- goto out;
- }
+
+ /*
+ * Were we woken or interrupted for a valid reason?
+ */
+ ret = unqueue_me(&q);
+ if (ret == 0)
+ return 0;
+ if (time == 0)
+ return -ETIMEDOUT;
if (signal_pending(current))
- ret = -EINTR;
-out:
- /* Were we woken up anyway? */
+ return -EINTR;
+
+ /*
+ * No, it was a spurious wakeup. Try again. Should never happen. :)
+ */
+ goto try_again;
+
+ out_unqueue:
+ /*
+ * Were we unqueued anyway?
+ */
if (!unqueue_me(&q))
ret = 0;
- put_page(q.page);
-
+ out_release_sem:
+ up_read(¤t->mm->mmap_sem);
return ret;
}
@@ -379,7 +398,6 @@
struct futex_q *q = filp->private_data;
unqueue_me(q);
- put_page(q->page);
kfree(filp->private_data);
return 0;
}
@@ -409,10 +427,10 @@
set the sigio stuff up afterwards. */
static int futex_fd(unsigned long uaddr, int offset, int signal)
{
- struct page *page = NULL;
struct futex_q *q;
+ unsigned long keys[2];
struct file *filp;
- int ret;
+ int ret, err;
ret = -EINVAL;
if (signal < 0 || signal > _NSIG)
@@ -451,31 +469,25 @@
goto out;
}
- lock_futex_mm();
-
- page = __pin_page(uaddr - offset);
- if (!page) {
- unlock_futex_mm();
+ down_read(¤t->mm->mmap_sem);
+ err = __get_page_keys(uaddr - offset, keys);
+ up_read(¤t->mm->mmap_sem);
+ if (unlikely(err != 0)) {
put_unused_fd(ret);
put_filp(filp);
kfree(q);
- return -EFAULT;
+ return err;
}
init_waitqueue_head(&q->waiters);
filp->private_data = q;
- __queue_me(q, page, uaddr, offset, ret, filp);
-
- unlock_futex_mm();
+ queue_me(q, keys, uaddr, offset, ret, filp);
/* Now we map fd to filp, so userspace can access it */
fd_install(ret, filp);
- page = NULL;
out:
- if (page)
- put_page(page);
return ret;
}
diff -urN --exclude-from=dontdiff orig-2.6.0-test4/mm/fremap.c laptop-2.6.0-test4/mm/fremap.c
--- orig-2.6.0-test4/mm/fremap.c 2003-07-08 21:44:29.000000000 +0100
+++ laptop-2.6.0-test4/mm/fremap.c 2003-09-03 03:00:30.000000000 +0100
@@ -151,9 +151,16 @@
if (vma && (vma->vm_flags & VM_SHARED) &&
vma->vm_ops && vma->vm_ops->populate &&
end > start && start >= vma->vm_start &&
- end <= vma->vm_end)
+ end <= vma->vm_end) {
+
+ if (start != vma->vm_start || end != vma->vm_end)
+ vma->vm_flags |= VM_NONLINEAR;
+ else
+ vma->vm_flags &= ~VM_NONLINEAR;
+
err = vma->vm_ops->populate(vma, start, size, vma->vm_page_prot,
pgoff, flags & MAP_NONBLOCK);
+ }
up_read(&mm->mmap_sem);
diff -urN --exclude-from=dontdiff orig-2.6.0-test4/mm/Makefile laptop-2.6.0-test4/mm/Makefile
--- orig-2.6.0-test4/mm/Makefile 2003-07-08 21:44:29.000000000 +0100
+++ laptop-2.6.0-test4/mm/Makefile 2003-09-03 04:42:46.000000000 +0100
@@ -9,6 +9,6 @@
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o readahead.o \
- slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
+ slab.o swap.o truncate.o vmscan.o $(mmu-y)
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
diff -urN --exclude-from=dontdiff orig-2.6.0-test4/mm/memory.c laptop-2.6.0-test4/mm/memory.c
--- orig-2.6.0-test4/mm/memory.c 2003-09-02 23:06:13.000000000 +0100
+++ laptop-2.6.0-test4/mm/memory.c 2003-09-03 04:49:10.000000000 +0100
@@ -43,7 +43,6 @@
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
-#include <linux/vcache.h>
#include <linux/rmap-locking.h>
#include <asm/pgalloc.h>
@@ -960,7 +959,6 @@
static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
pte_t *page_table)
{
- invalidate_vcache(address, vma->vm_mm, new_page);
flush_cache_page(vma, address);
establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
}
diff -urN --exclude-from=dontdiff orig-2.6.0-test4/mm/vcache.c laptop-2.6.0-test4/mm/vcache.c
--- orig-2.6.0-test4/mm/vcache.c 2003-07-08 21:44:31.000000000 +0100
+++ laptop-2.6.0-test4/mm/vcache.c 1970-01-01 01:00:00.000000000 +0100
@@ -1,90 +0,0 @@
-/*
- * linux/mm/vcache.c
- *
- * virtual => physical page mapping cache. Users of this mechanism
- * register callbacks for a given (virt,mm,phys) page mapping, and
- * the kernel guarantees to call back when this mapping is invalidated.
- * (ie. upon COW or unmap.)
- *
- * Started by Ingo Molnar, Copyright (C) 2002
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/hash.h>
-#include <linux/vcache.h>
-
-#define VCACHE_HASHBITS 8
-#define VCACHE_HASHSIZE (1 << VCACHE_HASHBITS)
-
-spinlock_t vcache_lock = SPIN_LOCK_UNLOCKED;
-
-static struct list_head hash[VCACHE_HASHSIZE];
-
-static struct list_head *hash_vcache(unsigned long address,
- struct mm_struct *mm)
-{
- return &hash[hash_long(address + (unsigned long)mm, VCACHE_HASHBITS)];
-}
-
-void __attach_vcache(vcache_t *vcache,
- unsigned long address,
- struct mm_struct *mm,
- void (*callback)(struct vcache_s *data, struct page *new))
-{
- struct list_head *hash_head;
-
- address &= PAGE_MASK;
- vcache->address = address;
- vcache->mm = mm;
- vcache->callback = callback;
-
- hash_head = hash_vcache(address, mm);
-
- list_add_tail(&vcache->hash_entry, hash_head);
-}
-
-void __detach_vcache(vcache_t *vcache)
-{
- list_del_init(&vcache->hash_entry);
-}
-
-void invalidate_vcache(unsigned long address, struct mm_struct *mm,
- struct page *new_page)
-{
- struct list_head *l, *hash_head;
- vcache_t *vcache;
-
- address &= PAGE_MASK;
-
- hash_head = hash_vcache(address, mm);
- /*
- * This is safe, because this path is called with the pagetable
- * lock held. So while other mm's might add new entries in
- * parallel, *this* mm is locked out, so if the list is empty
- * now then we do not have to take the vcache lock to see it's
- * really empty.
- */
- if (likely(list_empty(hash_head)))
- return;
-
- spin_lock(&vcache_lock);
- list_for_each(l, hash_head) {
- vcache = list_entry(l, vcache_t, hash_entry);
- if (vcache->address != address || vcache->mm != mm)
- continue;
- vcache->callback(vcache, new_page);
- }
- spin_unlock(&vcache_lock);
-}
-
-static int __init vcache_init(void)
-{
- unsigned int i;
-
- for (i = 0; i < VCACHE_HASHSIZE; i++)
- INIT_LIST_HEAD(hash + i);
- return 0;
-}
-__initcall(vcache_init);
-
On Wed, 3 Sep 2003, Jamie Lokier wrote:
>
> You will be please to know I have written a complete patch :)
Me too, well, mine wasn't quite complete yet, so I'll switch to
reviewing yours later instead. I've not glanced at it so far, but
what you've said about it leaves no doubt that you got my point.
> That way, there is no need to walk the page table at all unless it's a
> non-linear mapping (which my patch does handle).
Gosh, I thought it was just a bit of one-upmanship from Andrew,
futex on non-linear! I doubt anyone really cares about that case.
> Good question. No kernel code seems to check VM_MAYSHARE - the one to
> check is VM_SHARED.
No, it should be VM_MAYSHARE (if the behaviour is to depend on
whether user said MAP_SHARED or not: which is a good starting point,
but if odd readonly compatibility issues force us away from that
position, perhaps VM_MAYSHARE won't in the end be the right test).
I agree it's peculiar, I agree (search LKML archives for VM_MAYSHARE)
that again and again I'm having to make the distinction (I can't pretend
to explain it, just indicate it), which strongly suggests it should be
done better. But that's some other patch, some other time,
for now use VM_MAYSHARE.
Observe fs/proc/task_mmu.c show_map checking VM_MAYSHARE for 's'.
Observe mm/mmap.c do_mmap_pgoff vm_flags &= ~(VM_MAYWRITE | VM_SHARED).
VM_MAYSHARE reflects whether user chose MAP_SHARED, VM_SHARED may not.
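A quick throwaway demo of that distinction (mine, not from the thread):
map a read-only file MAP_SHARED and look at /proc/self/maps - the 's' is
still reported because show_map tests VM_MAYSHARE, even though
do_mmap_pgoff cleared VM_SHARED for the unwritable file.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/etc/passwd", O_RDONLY);
        char cmd[64];

        if (fd < 0)
                return 1;
        if (mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0) == MAP_FAILED)
                return 1;
        snprintf(cmd, sizeof(cmd), "grep passwd /proc/%d/maps",
                 (int)getpid());
        return system(cmd) ? 1 : 0;     /* flags column ends in "s", not "p" */
}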
> I added a flag VM_NONLINEAR to distinguish them.
Yes, I had that flag removed while it served no purpose,
but I'm happy to have it back once it's useful for efficiency.
> I have an obvious fix for mremap(): rehash all the futexes in its
> range. That's not in the attached patch, but it will be in the next one.
Will it be worth the code added to handle it? I wonder the same of
non-linear (sys_mremap and sys_remap_file_pages, familiar troublemakers).
But all credit for handling them, good to reduce "undefined behaviour"s.
Hugh
Patch name: futex-nonlinear-2.6.0-test4-02jl
Depends on: futex-fixes-2.6.0-test4-01jl
This fixes a couple of bugs in the previous patch:
1. A typo in futex.c. It affects the FUTEX_REQUEUE operation,
which is a fairly recent addition.
2. VM_NONLINEAR would not be set under some conditions when it should be.
The bug only affected futexes on non-linear shared mappings.
Enjoy,
-- Jamie
diff -urN --exclude-from=dontdiff futex1-2.6.0-test4/kernel/futex.c futex2-2.6.0-test4/kernel/futex.c
--- futex1-2.6.0-test4/kernel/futex.c 2003-09-03 12:48:59.000000000 +0100
+++ futex2-2.6.0-test4/kernel/futex.c 2003-09-03 12:56:03.000000000 +0100
@@ -224,7 +224,7 @@
down_read(¤t->mm->mmap_sem);
- ret = __get_page_keys(uaddr2 - offset1, keys1);
+ ret = __get_page_keys(uaddr1 - offset1, keys1);
if (unlikely(ret != 0))
goto out;
ret = __get_page_keys(uaddr2 - offset2, keys2);
diff -urN --exclude-from=dontdiff futex1-2.6.0-test4/mm/fremap.c futex2-2.6.0-test4/mm/fremap.c
--- futex1-2.6.0-test4/mm/fremap.c 2003-09-03 12:29:36.000000000 +0100
+++ futex2-2.6.0-test4/mm/fremap.c 2003-09-03 13:02:35.000000000 +0100
@@ -153,9 +153,9 @@
end > start && start >= vma->vm_start &&
end <= vma->vm_end) {
- if (start != vma->vm_start || end != vma->vm_end)
- vma->vm_flags |= VM_NONLINEAR;
- else
+ vma->vm_flags |= VM_NONLINEAR;
+ if (start == vma->vm_start && end == vma->vm_end &&
+ pgoff == vma->vm_pgoff)
vma->vm_flags &= ~VM_NONLINEAR;
err = vma->vm_ops->populate(vma, start, size, vma->vm_page_prot,
Hugh Dickins wrote:
> Will it be worth the code added to handle it? I wonder the same of
> non-linear (sys_mremap and sys_remap_file_pages, familiar troublemakers).
> But all credit for handling them, good to reduce "undefined behaviour"s.
I dismissed remap_file_pages the same as you at first, but since
Andrew mentioned it, I think it's a fair point. As long as it's
there, programmers should get the natural behaviour from it.
Databases (tdb I think) use futexes in database files, and
remap_file_pages is used to look at different views of database files,
and for huge files, so... It seems reasonable. That's quite an easy
bit of code anyway. The futexes nicely stay persistent even when
their particular offset into the file isn't mapped.
mremap() is used for moving anonymous data structures around mainly.
Applications don't really need to depend on the futexes moving in
that, but it is what the current implementation does so they might
exist by now. The code for this is less defensible as it is more
complicated.
-- Jamie
Jamie Lokier <[email protected]> wrote:
>
> You will be pleased to know I have written a complete patch :)
Looks pretty sane to me. A couple of (untested) fixups:
Take mmap_sem for writing around the modification of vma->vm_flags.
But hold it for reading across the populate function: it does I/O and is
slow.
25-akpm/mm/fremap.c | 14 ++++++++------
1 files changed, 8 insertions(+), 6 deletions(-)
diff -puN mm/fremap.c~futex-non-page-pinning-akpm-1 mm/fremap.c
--- 25/mm/fremap.c~futex-non-page-pinning-akpm-1 Wed Sep 3 08:16:30 2003
+++ 25-akpm/mm/fremap.c Wed Sep 3 08:19:21 2003
@@ -144,7 +144,10 @@ long sys_remap_file_pages(unsigned long
return err;
#endif
- down_read(&mm->mmap_sem);
+ /*
+ * vm_flags is protected by down_write(mmap_sem)
+ */
+ down_write(&mm->mmap_sem);
vma = find_vma(mm, start);
/*
@@ -161,13 +164,12 @@ long sys_remap_file_pages(unsigned long
if (start == vma->vm_start && end == vma->vm_end &&
pgoff == vma->vm_pgoff)
vma->vm_flags &= ~VM_NONLINEAR;
-
+ downgrade_write(&mm->mmap_sem);
err = vma->vm_ops->populate(vma, start, size, vma->vm_page_prot,
pgoff, flags & MAP_NONBLOCK);
+ up_read(&mm->mmap_sem);
+ } else {
+ up_write(&mm->mmap_sem);
}
-
- up_read(&mm->mmap_sem);
-
return err;
}
-
_
schedule_timeout() returns in state TASK_RUNNING.
25-akpm/kernel/futex.c | 1 -
1 files changed, 1 deletion(-)
diff -puN kernel/futex.c~futex-non-page-pinning-akpm-2 kernel/futex.c
--- 25/kernel/futex.c~futex-non-page-pinning-akpm-2 Wed Sep 3 08:31:34 2003
+++ 25-akpm/kernel/futex.c Wed Sep 3 08:31:47 2003
@@ -358,7 +358,6 @@ static inline int futex_wait(unsigned lo
spin_unlock(&futex_lock);
time = schedule_timeout(time);
- set_current_state(TASK_RUNNING);
/*
* NOTE: we don't remove ourselves from the waitqueue because
_
Andrew Morton wrote:
> Take mmap_sem for writing around the modification of vma->vm_flags.
> But hold it for reading across the populate function: it does I/O and is
> slow.
Agreed. Well spotted.
-- Jamie
Hugh Dickins wrote:
> > Good question. No kernel code seems to check VM_MAYSHARE - the one to
> > check is VM_SHARED.
>
> Observe fs/proc/task_mmu.c show_map checking VM_MAYSHARE for 's'.
> Observe mm/mmap.c do_mmap_pgoff vm_flags &= ~(VM_MAYWRITE | VM_SHARED).
> VM_MAYSHARE reflects whether user chose MAP_SHARED, VM_SHARED may not.
Hugh, thank you. In the case of futex.c, either flag could be used to
mean "is this a shared" mapping, and each choice has a different
user-visible meaning.
Most of the VM code uses VM_SHARED to ask the question "is this a
shared mapping", but it turns out that most of the VM code means
something different by that than what is meant in userspace. To the
VM code, a "shared mapping" means something which can dirty
backing-file pages, either after mmap() or maybe mprotect() as well.
This is different to the userspace point of view. Using the same term
for both is unfortunate. (IMHO the flag should be called VM_SHAREDWRITABLE).
With futex.c using VM_SHARED to determine whether to hash on
(inode,offset,index), mappings of read-only file handles will hash on
(mm,uaddr) which is not correct: a FUTEX_WAIT on a shared mapping
using userspace's meaning of "shared mapping" should notice changes in
a different mm.
Therefore, futex.c will use VM_MAYSHARE in my next patch.
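(For illustration, the check that changes is roughly the following; a
sketch only, using the variable names from the __get_page_keys() code
posted later in this thread, not the actual text of the next patch:)

	/* Sketch: key selection with VM_MAYSHARE instead of VM_SHARED. */
	if (!(vma->vm_flags & VM_MAYSHARE)) {
		/* Private mapping: the futex is local to this mm. */
		keys[0] = (unsigned long) current->mm;
		keys[1] = addr;
	} else {
		/* Shared mapping (even if currently read-only): key on
		 * the backing inode and the page index in the file. */
		keys[0] = (unsigned long) vma->vm_file->f_dentry->d_inode;
		keys[1] = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	}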
Thanks,
-- Jamie
On Wed, 3 Sep 2003, Jamie Lokier wrote:
>
> Hugh Dickins wrote:
> > > Good question. No kernel code seems to check VM_MAYSHARE - the one to
> > > check is VM_SHARED.
> >
> > Observe fs/proc/task_mmu.c show_map checking VM_MAYSHARE for 's'.
> > Observe mm/mmap.c do_mmap_pgoff vm_flags &= ~(VM_MAYWRITE | VM_SHARED).
> > VM_MAYSHARE reflects whether user chose MAP_SHARED, VM_SHARED may not.
>
> Hugh, thank you. In the case of futex.c, either flag could be used to
> mean "is this a shared" mapping, and each choice has a different
> user-visible meaning.
Actually: the VM_SHARED flag will never change, so testing VM_SHARED is
actually the _right_ thing from a mm perspective.
The only person who should ever test VM_MAYSHARE is somebody who does
reporting back to user space: VM_MAYSHARE basically ends up meaning "the
user _asked_ for a shared mapping". While "VM_SHARED" means "this mapping
can actually contain a shared dirty page".
The VM itself should only ever care about VM_SHARED. Because that's the
only bit that has real semantic meaning.
Linus
On Wed, 3 Sep 2003, Linus Torvalds wrote:
>
> Actually: the VM_SHARED flag will never change, so testing VM_SHARED is
> actually the _right_ thing from a mm perspective.
>
> The only person who should ever test VM_MAYSHARE is somebody who does
> reporting back to user space: VM_MAYSHARE basically ends up meaning "the
> user _asked_ for a shared mapping". While "VM_SHARED" means "this mapping
> can actually contain a shared dirty page".
>
> The VM itself should only ever care about VM_SHARED. Because that's the
> only bit that has real semantic meaning.
To that part of the kernel interested in dirty pages, yes.
But when interested in futexes, it seems not.
If we're going to document a behaviour as depending on whether the user
said MAP_SHARED or MAP_PRIVATE, then it's VM_MAYSHARE we should check to
decide which behaviour to use.
We could use VM_SHARED, and document the behaviour of the futex as
depending on whether it's in an area that was MAP_SHARED from a file
which was opened for writing as well as reading - but do we really
want to complicate the documentation that way? Principle of least
surprise, principle of minimal doc.
Hugh
On Wed, 3 Sep 2003, Hugh Dickins wrote:
> >
> > The VM itself should only ever care about VM_SHARED. Because that's the
> > only bit that has real semantic meaning.
>
> To that part of the kernel interested in dirty pages, yes.
> But when interested in futexes, it seems not.
I don't like it.
If the patches can't be made to work for private mappings, then there's
something fundamentally wrong with them.
A non-writable shared mapping has degenerated into a private mapping since
the very first releases of Linux that supported mmap. It started out as a
"hey, we don't support true writable shared mappings, but we _do_ support
cache coherency on read-only mmaps through the normal private mappings,
so..".
And even later on, when true shared mappings were supported, we continued
the "degenerate to a private mapping" approach because the private
mappings tend to be simpler and require less overhead (exactly because
they don't need to worry about dirty bits).
So part of the picture is that this is just how the Linux VM fundamentally
works.
The other part of the picture is that futex'es should "just work" even
when it comes to regular private mappings. Regardless of any VM_SHARED or
VM_MAYSHARE bits. Even if the user did a totally private mmap() in the
first place, that does not mean that the futex shouldn't work properly.
So the thing boils down to:
- if the futex works on a proper private mapping, then the downgrade is
still proper, and the futex should never care about anything but a real
VM_SHARED.
- if the futex doesn't work with a proper private mapping, then that is a
bug _regardless_ of anything else, and VM_SHARED vs VM_MAYSHARE never
enters into the picture anyway.
What?
Linus
On Wed, 3 Sep 2003, Linus Torvalds wrote:
>
> If the patches can't be made to work for private mappings, then there's
> something fundamentally wrong with them.
Of course (not). That's the point, they do work on private mappings, but
the semantics are different on private mappings from on shared mappings:
on private mappings they're private to the mm, on shared mappings they're
shared with other mms (via the shared file).
> So the thing boils down to:
>
> - if the futex works on a proper private mapping, then the downgrade is
> still proper, and the futex should never care about anything but a real
> VM_SHARED.
In the usual mm case, yes, deciding by VM_SHARED and
ignoring VM_MAYSHARE turns out to be the right thing to do.
But a futex differs from the usual mm case, that much was clear when
they were invented, but we're still discovering just how they differ.
As I've said before, I haven't a clue about the user/glibc end of
futexes, and for all I know a futex on a shared-readonly-cannot-be-
mprotected-for-writing mapping cannot be used as a futex. If that's
so, then perhaps we should simply prohibit sys_futex on such an area,
and settle this dispute in that way. Is that the case?
Hugh
On Wed, 3 Sep 2003, Hugh Dickins wrote:
>
> Of course (not). That's the point, they do work on private mappings, but
> the semantics are different on private mappings from on shared mappings:
> on private mappings they're private to the mm, on shared mappings they're
> shared with other mms (via the shared file).
That's not true. It never has been true in Linux.
Private mappings that haven't been broken by COW (and a read-only mapping
never will be) will see updates as they happen on the file that backs it.
That's the fundamental difference between "mmap(MAP_PRIVATE)" and
"read()".
You may not like it, and others too have not liked it (Hurd and Mach do
this big dance about MAP_COPY that really creates a static _copy_ of the
state at the time of the mmap), but it's just a fact.
Repeat after me: private read-only mappings are 100% equivalent to shared
read-only mappings. No ifs, buts, or maybes. This is a FACT. It's a fact
codified in many years of Linux implementation, but it's a fact outside of
that too.
(Yeah, yeah, I know some broken old Unixes do not offer mmap consistency
guarantees, and nntpd is unhappy. But Linux isn't broken.)
Linus
On Wed, 3 Sep 2003, Linus Torvalds wrote:
> On Wed, 3 Sep 2003, Hugh Dickins wrote:
> >
> > Of course (not). That's the point, they do work on private mappings, but
> > the semantics are different on private mappings from on shared mappings:
> > on private mappings they're private to the mm, on shared mappings they're
> > shared with other mms (via the shared file).
>
> That's not true. It never has been true in Linux.
>
> Private mappings that haven't been broken by COW (and a read-only mapping
> never will be) will see updates as they happen on the file that backs it.
> That's the fundamental difference between "mmap(MAP_PRIVATE)" and
> "read()".
>
> You may not like it, and others too have not liked it (Hurd and Mach do
> this big dance about MAP_COPY that really creates a static _copy_ of the
> state at the time of the mmap), but it's just a fact.
I like the way Linux does that fine, it's the right way. We have
a misunderstanding. You're talking about the behaviour of mmaps,
I was talking about the behaviour of futexes placed within mmaps.
I'm not sure whether you've read this thread from the beginning,
Jamie started CCing you today. The background is that Rusty's been
working on unpinning futex pages, Jamie discovered inconsistency with
current futex COW behaviour, that pushed me into realizing that the
whole physical-page-based futex implementation has been misguided
(adding more problems than it solves), Jamie has now made a patch
to implement sys_futex the simpler way, we're discussing the test
to distinguish a "private futex" from a "shared futex".
> Repeat after me: private read-only mappings are 100% equivalent to shared
> read-only mappings. No ifs, buts, or maybes. This is a FACT. It's a fact
> codified in many years of Linux implementation, but it's a fact outside of
> that too.
Maybe, but, if the file was opened for writing as well as reading, the
shared read-only mapping can be mprotected to read-write at any point,
which does lead to differences: which is why Linux is very careful
about deciding VM_SHARED, and it's quite difficult to explain.
If we document how sys_futex (which does not dirty a page, doesn't
even need a page there) behaves when placed within different kinds
of mmaps, it's easier for the reader to understand if we don't get
into such sophistications - hence choice of VM_MAYSHARE equivalent
to MAP_SHARED, never mind the readwriteness.
We'd both do better to be reading Jamie's patch.
Hugh
On Wed, 3 Sep 2003, Hugh Dickins wrote:
>
> Maybe, but, if the file was opened for writing as well as reading, the
> shared read-only mapping can be mprotected to read-write at any point,
> which does lead to differences: which is why Linux is very careful
> about deciding VM_SHARED, and it's quite difficult to explain.
And that's why the kernel does this:
case MAP_SHARED:
....
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
...
ie it only degenerates the shared mapping to a private mapping if it
_also_ removes the MAYWRITE bit.
So if the mapping is a shared mapping and read-only - but the file was
opened read-write and the mapping may later be changed to a writable one -
then Linux will keep the mapping VM_SHARED.
> If we document how sys_futex (which does not dirty a page, doesn't
> even need a page there) behaves when placed within different kinds
> of mmaps, it's easier for the reader to understand if we don't get
> into such sophistications - hence choice of VM_MAYSHARE equivalent
> to MAP_SHARED, never mind the readwriteness.
I'd be very very nervous about anything that documents a read-only
MAP_SHARED as anything but a MAP_PRIVATE. That just is fundamentally not
right, and it _will_ bite us at some point, since all of the rest of the
VM thinks that they are the same.
Linus
In message <[email protected]> you write:
> On Wed, 3 Sep 2003, Jamie Lokier wrote:
> > I have an obvious fix for mremap(): rehash all the futexes in its
> > range. That's not in the attached patch, but it will be in the next one.
>
> Will it be worth the code added to handle it? I wonder the same of
> non-linear (sys_mremap and sys_remap_file_pages, familiar troublemakers).
> But all credit for handling them, good to reduce "undefined behaviour"s.
I don't have a problem with the omission. mremap is logically
equivalent to munmap + mmap, so it's a subset of the "I unmapped
underneath my futex!". It's not like it's going to happen without the
caller knowing: if the address doesn't change, then the futexes won't
break. If they do, the caller needs to reset them anyway.
Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
In message <[email protected]> you write:
> Hi Rusty,
>
> You will be pleased to know I have written a complete patch :)
Hi Jamie,
Very pleased! Remember, Open Source is all about having other
people do your work for you 8)
> > Assume that we do:
> > 1) Look up vma.
> > 2) If vma->vm_flags & VM_SHARED, index by page->mapping & page->index.
> > 3) Otherwise, index by vma->vm_mm & uaddr.
>
> Like that, but 2) uses vma->vm_file->f_dentry->d_inode.
>
> That way, there is no need to walk the page table at all unless it's a
> non-linear mapping (which my patch does handle).
OK.
> > 2) If VM_SHARED, and page->mapping is NULL, what to do? AFAICT, this
> > can happen in the case of anonymous shared mappings, say mmap
> > /dev/zero MAP_SHARED and fork()? Treating it as !VM_SHARED (and
> > hence matching in mm & uaddr) won't work, since the mm's will be
> > different (and with mremap, the uaddrs may be different).
>
> No, that doesn't happen. An anonymous shared mapping calls
> shmem_zero_setup(), which creates an anonymous tmpfs file to back the
> mapping. It then looks the same as IPC shm or any other tmpfs file.
>
> So it works :)
Ah, I didn't look down that far in do_mmap_pgoff. Right: that makes
things much simpler.
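(For reference, the do_mmap_pgoff() path being referred to is roughly
the following; a paraphrase from memory of mm/mmap.c, not an exact quote:)

	/* Anonymous MAP_SHARED regions get an internal tmpfs file as
	 * backing, so every VM_SHARED vma has a vm_file and an inode. */
	if (!file && (vm_flags & VM_SHARED))
		error = shmem_zero_setup(vma);	/* error handling elided */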
> > 3) Since we need the offset in the file anyway for the VM_SHARED, it
> > makes more sense to use get_user_pages() to get the vma and page in
> > one call, rather than find_extend_vma().
>
> You need the offset, but you don't need the page. For a linear
> mapping, the offset is a very simple calculation - no page table lock
> and no page table walk. As a silly bonus it doesn't touch the page.
>
> For non-linear mappings, I try follow_page() and then
> get_user_pages(), as usual, to get page->index. Technically you don't
> need to swap the page in, but there's no point using complicated code
> for that unimportant case.
>
> I added a flag VM_NONLINEAR to distinguish them.
OK, I would have done it the naive way, but Ingo would probably have
just written what you did (he did the follow_page optimization) 8)
The rest is just nitpicking...
> + /* Page keys and offset within the page. */
> + unsigned long keys[2];
> int offset;
I prefer a union here. It's a little more verbose, but I think it's
clearer:
struct anon_key
{
	struct mm_struct *mm;
	unsigned long uaddr;
};

struct filebacked_key
{
	struct inode *inode;
	unsigned long page_index;
};

union hash_key
{
	struct anon_key anon;
	struct filebacked_key filebacked;
	unsigned long keys[2];
};
> +#ifdef FIXADDR_USER_START
> + if (addr >= FIXADDR_USER_START && addr < FIXADDR_USER_END) {
> + keys[0] = 1; /* Different from any pointer value. */
> + keys[1] = addr - FIXADDR_USER_START;
> + return 0;
> + }
> +#endif
I think this is a bit extreme: this would allow futexes in the
VSYSCALL region, right? I admire your thoroughness, but perhaps this
should wait until someone comes up with a reason to do it?
The rest looks ok, I'll do a differential once the rest settles down...
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
In message <[email protected]> you write:
> Private mappings that haven't been broken by COW (and a read-only mapping
> never will be) will see updates as they happen on the file that backs it.
> That's the fundamental difference between "mmap(MAP_PRIVATE)" and
> "read()".
Right, so it would be consistent for someone doing a FUTEX_WAIT on an
"intact" (not broken by COW) private mapping to see a FUTEX_WAKE done
on that file.
However, Jamie's futex code will see !VM_SHARED on the mapping, and
compare futexes by mm + uaddr (rather than inode + file offset), so
this is NOT the case. Using VM_MAYSHARE instead would make the
MAP_SHARED readonly case work as above, though.
The way futexes are used now, they're both "don't care". If you have
a private mapping or read-only mapping, you'll never get woken by
others with the same file mapped writable shared, but WTF were you
waiting for a futex if the mapping is private anyway: the lock
acquisition won't work (and sleeping forever is easier to debug than
two tasks getting the lock).
Oh no, I think I'm starting to understand the VM a little.
Ick.
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
On Thu, 4 Sep 2003, Rusty Russell wrote:
>
> However, Jamie's futex code will see !VM_SHARED on the mapping, and
> compare futexes by mm + uaddr (rather than inode + file offset), so
> this is NOT the case. Using VM_MAYSHARE instead would make the
> MAP_SHARED readonly case work as above, though.
But that is WORSE!
Now MAP_SHARED works, but MAP_PRIVATE does not. That's still the same bug,
but now it's an inconsistent bug!
I'd rather have a consistent bug than one that makes no sense.
Linus
On Thu, 4 Sep 2003, Linus Torvalds wrote:
> On Thu, 4 Sep 2003, Rusty Russell wrote:
> >
> > However, Jamie's futex code will see !VM_SHARED on the mapping, and
> > compare futexes by mm + uaddr (rather than inode + file offset), so
> > this is NOT the case. Using VM_MAYSHARE instead would make the
> > MAP_SHARED readonly case work as above, though.
>
> But that is WORSE!
>
> Now MAP_SHARED works, but MAP_PRIVATE does not. That's still the same bug,
> but now it's an inconsistent bug!
>
> I'd rather have a consistent bug than one that makes no sense.
Aren't we arguing back and forth about a totally pointless case?
I've at last read the futex manpages and looked at Rusty's futex-2.2,
to repair my understanding of the userspace end.
Isn't it the case that to use sys_futex (in the way it's intended),
userspace needs write access to the futex? FUTEX_WAIT and FUTEX_WAKE
are used (depending on condition) after decrementing or incrementing
the futex in userspace. FUTEX_FD is not such a clear case, but again
it appears that you'd use it for an async wait after decrementing.
FUTEX_REQUEUE seems to be a move or remap, doesn't change the picture.
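(To make that concrete, the usual userspace protocol writes the futex
word first and only then calls into the kernel. An illustrative sketch,
not Rusty's actual futex library code:)

	#include <linux/futex.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Illustrative wrappers only.  A waiter typically decrements the
	 * word in userspace (hence needs write access) and then sleeps
	 * while the word still holds the contended value it last saw;
	 * a waker increments the word and then calls FUTEX_WAKE. */
	static int futex_wait(int *addr, int expected)
	{
		return syscall(SYS_futex, addr, FUTEX_WAIT, expected, NULL);
	}

	static int futex_wake(int *addr, int nwake)
	{
		return syscall(SYS_futex, addr, FUTEX_WAKE, nwake, NULL);
	}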
So, isn't discussing sys_futex behaviour on a readonly mapping just
academic? I'd like us to define that behaviour precisely by returning
-EACCES if sys_futex is attempted on a !VM_WRITE mapping, but it's
not worth arguing over. And it doesn't matter whether Jamie tests
VM_MAYSHARE (as I argued) or VM_SHARED (as you insist): they're
set or clear together on all the writable mappings.
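(Hugh's suggestion would amount to something like this early in the key
lookup; a sketch only, it is not in any of the posted patches:)

	/* Sketch: refuse futex operations on mappings userspace cannot
	 * write, so the readonly cases never arise. */
	if (!(vma->vm_flags & VM_WRITE))
		return -EACCES;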
The particular case above: if it's !PROT_WRITE MAP_PRIVATE, I'm
saying that's not an area you can manipulate mutexes in anyway;
if it's PROT_WRITE MAP_PRIVATE but the page readonly while shared
with parent, child or sibling, the prior decrement or increment
on the futex in userspace will break COW, so it's private to the
mm by the time sys_futex WAIT or WAKE is called, in either the
old or the new implementation.
(But you can construct fork-with-futex examples in which the old
implementation would share private futex between parent and child,
because no write after fork to break the COW.)
Hugh
On Thu, 4 Sep 2003, Hugh Dickins wrote:
>
> Isn't it the case that to use sys_futex (in the way it's intended),
> userspace needs write access to the futex? FUTEX_WAIT and FUTEX_WAKE
> are used (depending on condition) after decrementing or incrementing
> the futex in userspace. FUTEX_FD is not such a clear case, but again
> it appears that you'd use it for an async wait after decrementing.
> FUTEX_REQUEUE seems to be a move or remap, doesn't change the picture.
Yes.
We can certainly just document it as a nonsense op. All I care about is
that it is _consistently_ broken, and that people don't make read-only
MAP_SHARED do something it has never ever done before - differ from a
semantic standpoint.
> The particular case above: if it's !PROT_WRITE MAP_PRIVATE, I'm
> saying that's not an area you can manipulate mutexes in anyway;
However, the thing is, the case really can be a totally writable
MAP_PRIVATE that just hasn't been modified (and thus not COW'ed) _yet_.
But sure, we could just require that futex pages are dirty in this case.
Linus
On Wed, 3 Sep 2003, Jamie Lokier wrote:
> Patch name: futex-nonlinear-2.6.0-test4-02jl
> Depends on: futex-fixes-2.6.0-test4-01jl
I've now read your patches, they look good to me:
I particularly like the way you divided up the locking.
In sys_remap_file_pages, you set the VM_NONLINEAR flag, then clear
it if this particular population matches the vma. No, you cannot
clear that flag once set, without checking every page and pte_file
already set within the vma. Check if population matches vma first,
and if it doesn't match just set the VM_NONLINEAR flag in that case.
(Andrew already mentioned locking: I'd have said page_table_lock,
but his mmap_sem is also appropriate: it's an odd case.)
I think rip out the FIXADDR_USER_START bit, it's rather over-the-top,
ugly: and that area is readonly, so not a useful place for a futex.
The units of keys[1]: bytes if private but pages if shared.
That's okay for now I think, but if a hashing expert comes along
later s/he'll probably want to change it. The current hash does
add key1 to offset, which is okay: if it xor'ed you'd lose the
offset bits in the private case.
Those keys[1] pages: in units of PAGE_SIZE in the linear case,
of PAGE_CACHE_SIZE in the nonlinear case. Oh well, this is far
from the only place with such an inconsistency, let's worry
about that when never comes.
The err at the end of __get_page_keys would be 1 from successful
get_user_pages, treated as error by the callers: need to make it 0.
futex_wait: I didn't get around to it in my version, so haven't
thought through the issues, but I'm a bit worried that you get
curval for -EWOULDBLOCK check without holding the futex_lock.
That looks suspicious to me, but I'm going to be lazy and not
try to think about it, because Rusty is sure to understand the
races there. If that code is insufficient as you have it, may
need __pin_page reinstated for just that case (hmm, was that
get_user right before? I'd expect it to kmap_atomic the pinned page.)
Hugh
On Thu, 4 Sep 2003, Linus Torvalds wrote:
>
> However, the thing is, the case really can be a totally writable
> MAP_PRIVATE that just hasn't been modified (and thus not COW'ed) _yet_.
>
> But sure, we could just require that futex pages are dirty in this case.
There's no problem here in Jamie's implementation, no need to demand that;
but the previous implementation did make COWing problems for itself, yes.
Hugh
Linus Torvalds wrote:
> The only person who should ever test VM_MAYSHARE is somebody who does
> reporting back to user space: VM_MAYSHARE basically ends up meaning "the
> user _asked_ for a shared mapping". While "VM_SHARED" means "this mapping
> can actually contain a shared dirty page".
Precisely. And for futex, the correct meaning is "the user asked for
a shared mapping": a FUTEX_WAIT on a shared mapping of a file opened
read-only _should_ be woken by a FUTEX_WAKE on the same file by
another process.
> The VM itself should only ever care about VM_SHARED. Because that's the
> only bit that has real semantic meaning.
remap_file_pages() returns -EINVAL on mappings of
read-only file handles because it tests VM_SHARED. There is no good
reason not to remap a read-only file - it doesn't complain about
remapping a PROT_READ mapping of a writable file!
Patch tested and attached.
Enjoy,
-- Jamie
Subject: [PATCH] Allow remap_file_pages() on read-only files
Patch: nonlinear-mayshare-2.6.0-test4-02.2jl
This changes remap_file_pages() to work on mappings of read-only file
handles. Without this it returns -EINVAL.
diff -urN --exclude-from=dontdiff orig-2.6.0-test4/mm/fremap.c nonlinear-2.6.0-test4/mm/fremap.c
--- orig-2.6.0-test4/mm/fremap.c 2003-07-08 21:44:29.000000000 +0100
+++ nonlinear-2.6.0-test4/mm/fremap.c 2003-09-04 18:19:45.000000000 +0100
@@ -145,10 +145,11 @@
vma = find_vma(mm, start);
/*
* Make sure the vma is shared, that it supports prefaulting,
- * and that the remapped range is valid and fully within
- * the single existing vma:
+ * and that the remapped range is valid and fully within the
+ * single existing vma. VM_MAYSHARE is checked (not VM_SHARED)
+ * so that read-only files can be remapped too:
*/
- if (vma && (vma->vm_flags & VM_SHARED) &&
+ if (vma && (vma->vm_flags & VM_MAYSHARE) &&
vma->vm_ops && vma->vm_ops->populate &&
end > start && start >= vma->vm_start &&
end <= vma->vm_end)
Linus Torvalds wrote:
> Actually: the VM_SHARED flag will never change, so testing VM_SHARED is
> actually the _right_ thing from a mm perspective.
Yes it can. See sys_mprotect(). If that's not intended, it's a bug
in mprotect(). What does PROT_SEM mean for Linux, btw?
See:
if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM))
return -EINVAL;
and:
newflags = prot | (vma->vm_flags & ~(PROT_READ | PROT_WRITE | PROT_EXEC));
if ((newflags & ~(newflags >> 4)) & 0xf) {
error = -EACCES;
goto out;
}
newflags is then used to index protection_map[], like this:
newprot = protection_map[newflags & 0xf];
and that is stored in the page tables.
-- Jamie
Rusty Russell wrote:
> I think this is a bit extreme: this would allow futexes in the
> VSYSCALL region, right? I admire your thoroughness, but perhaps this
> should wait until someone comes up with a reason to do it?
I only put that in because get_user_pages does it. It isn't important.
If you look carefully, you see that these patches work exactly like
the old implementation in all cases where the old one worked.
-- Jamie
On Thu, 4 Sep 2003, Jamie Lokier wrote:
>
> Yes it can. See sys_mprotect(). If that's not intended, it's a bug
> in mprotect().
Oh. I see. Yes - it's accessing "vm_flags" with "MAP_SEM". That's really
wrong, since it's not even the same _domain_.
"vm_flags" should use the "VM_xxxx" bits. Trying to use "PROT_xxx" bits is
totally improper, but it so happens that the low three bits
(READ|WRITE|EXEC) are supposed to be the same.
Good catch.
It really should do what mmap() does, and translate from the "PROT_xxx"
domain to the "VM_xxx" domain:
flag = _trans(prot, PROT_READ, VM_READ) |
_trans(prot, PROT_WRITE, VM_WRITE) |
_trans(prot, PROT_EXEC, VM_EXEC);
and the only reason sys_mprotect _looks_ like it is working is that those
three bits (but _not_ MAP_SEM) happen to be the same anyway.
I'm inclined to be lazy, and say "we know the low three bits of "prot" and
"flags" are the same, and leave it as-is, but remove the MAP_SEM, which
clearly is a bug.
But the proper thing is to move that part of calc_vm_flags() to a header
file. Does anybody want to take that on?
Linus
Hugh Dickins wrote:
> In sys_remap_file_pages, you set the VM_NONLINEAR flag, then clear
> it if this particular population matches the vma. No, you cannot
> clear that flag once set, without checking every page and pte_file
> already set within the vma. Check if population matches vma first,
> and if it doesn't match just set the VM_NONLINEAR flag in that case.
> (Andrew already mentioned locking: I'd have said page_table_lock,
> but his mmap_sem is also appropriate: it's an odd case.)
I don't see why you can't clear the flag: the call to ->populate will
change every page and pte_file to correspond with the linear page
offsets, which is all that !VM_NONLINEAR indicates.
However, it _is_ wrong to clear VM_NONLINEAR before the call to
->populate() has finished, with Andrew's patch which uses
downgrade_write(). Instead, the clear must come after ->populate()
has finished.
> I think rip out the FIXADDR_USER_START bit, it's rather over-the-top,
> ugly: and that area is readonly, so not a useful place for a futex.
Agreed. I put it because the old futex has it as a side effect of
get_user_pages(). It can go.
> The units of keys[1]: bytes if private but pages if shared.
> That's okay for now I think, but if a hashing expert comes along
> later s/he'll probably want to change it. The current hash does
> add key1 to offset, which is okay: if it xor'ed you'd lose the
> the offset bits in the private case.
Feel free to think up a better hash that isn't slow. Two iterations
of hash_long() would be a good hash, but slower.
> Those keys[1] pages: in units of PAGE_SIZE in the linear case,
> of PAGE_CACHE_SIZE in the nonlinear case. Oh well, this is far
> from the only place with such an inconsistency, let's worry
> about that when never comes.
Ew.
> The err at the end of __get_page_keys would be 1 from successful
> get_user_pages, treated as error by the callers: need to make it 0.
Well spotted.
> futex_wait: I didn't get around to it in my version, so haven't
> thought through the issues, but I'm a bit worried that you get
> curval for -EWOULDBLOCK check without holding the futex_lock.
> That looks suspicious to me, but I'm going to be lazy and not
> try to think about it, because Rusty is sure to understand the
> races there. If that code is insufficient as you have it, may
> need __pin_page reinstated for just that case (hmm, was that
> get_user right before? I'd expect it to kmap_atomic pinned page.)
The important things are that the futex is queued prior to checking
curval, the requested page won't change (it's protected by mmap_sem),
and any parallel waker changes the word prior to waking us.
You made me notice a rather subtle memory ordering condition, though.
We must issue the read after queuing the futex. There needs to be a
smp_rmb() after queuing and before the read, because the spin_unlock()
barrier only constrains earlier reads, not later ones.
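(A sketch of the ordering being described, not the exact futex_wait()
code from the patch:)

	spin_lock(&futex_lock);
	list_add_tail(&q->list, head);	/* make ourselves visible to wakers */
	spin_unlock(&futex_lock);

	smp_rmb();	/* the barrier argued for here; see Rusty's reply below */

	if (get_user(curval, (int __user *)uaddr))
		return -EFAULT;		/* unqueueing on the error paths elided */
	if (curval != val)
		return -EWOULDBLOCK;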
Thanks for all your great insights,
-- Jamie
On Thu, 4 Sep 2003, Linus Torvalds wrote:
>
> Oh. I see. Yes - it's accessing "vm_flags" with "MAP_SEM". That's really
> wrong, since it's not even the same _domain_.
How about something like this that at least gets it closer? It fixes the
fact that incorrect usage of PROT_SEM would allow users to set the
VM_SHARED bit behind the back of the OS, which sounds like a total
disaster and which can potentially confuse other parts of the VM.
It's not a very _pretty_ fix, but there really is no excuse for PROT_xxx
to not match VM_xxx for the three standard protection flags, so in that
sense it is the technically "sane" approach. We might want to just
simplify the mmap() code too..
Linus
On Thu, 4 Sep 2003, Jamie Lokier wrote:
>
> I don't see why you can't clear the flag: the call to ->populate will
> change every page and pte_file to correspond with the linear page
> offsets, which is all that !VM_NONLINEAR indicates.
You're assuming that one call to sys_remap_file_pages precisely populates
a whole vma: no, it's quite likely it'll just do a single page of the vma.
> The important things are that the futex is queued prior to checking
> curval, the requested page won't change (it's protected by mmap_sem),
> and any parallel waker changes the word prior to waking us.
Ah, that may well be so, it's beyond me,
just so long as Rusty is happy with it.
(I don't think you mean "the requested page won't change" - the
down_read on mmap_sem does not prevent it from being swapped out
before the get_user, but nor does it prevent a replacement page
being faulted back in by get_user, and we no longer have any
dependence on those being the same physical page.)
Hugh
Linus Torvalds wrote:
> > Of course (not). That's the point, they do work on private mappings, but
> > the semantics are different on private mappings from on shared mappings:
> > on private mappings they're private to the mm, on shared mappings they're
> > shared with other mms (via the shared file).
>
> Repeat after me: private read-only mappings are 100% equivalent to shared
> read-only mappings. No ifs, buts, or maybes. This is a FACT. It's a fact
> codified in many years of Linux implementation, but it's a fact outside of
> that too.
Thanks Linus. I already knew this, I was in the audience of the old
thread about MAP_COPY, remember? :)
Please read below and think about it, because I'm convinced from your
3 emails later in this thread that you haven't thought about how COW
should interact with futexes.
If you don't have time, skip to the last paragraph.
The new futexes key off (mm,address) for a private mapping, and
(file,offset) for a shared mapping. That is actually a user-visible
distinction, so I have to explain and justify it.
Private writable mapping: futex must be mm-local, obviously. This is
a bug in the old futex code, which could be fixed as you say by
forcibly COWing the page. But that's unnecessary: (mm,address) is fine.
Shared writable mapping: futex must be shared, obviously.
Read-only mapping: as you say, private and shared are the same for a
read-only mapping, until you call mprotect() if you're permitted.
Anything which breaks that is wrong.
So what shall a futex do on a read-only mapping. First, does it even
make sense? A: Yes it does. If I hand you a scoreboard file and tell
you to wait for changes to words in it, it's a legitimate use of
futexes on a read-only mapping.
Ok, now we understand that _this_ read-only mapping should not be mm-local.
But Linux does something weird at this point, if the new futex code's
hash keys on VM_SHARED.
If I hand you a scoreboard file opened O_RDWR, your futexes are keyed
on file pages. But if I open it O_RDONLY, your futexes are mm-local.
* I contend that the user-visible behaviour of a mapping should
* _not_ depend on whether the file was opened with O_RDWR or O_RDONLY.
Thanks,
-- Jamie
[ Oops. Forgot the actual patch. ]
On Thu, 4 Sep 2003, Linus Torvalds wrote:
>
> How about something like this [ ... ]
THIS.
Linus
===== mm/mprotect.c 1.23 vs edited =====
--- 1.23/mm/mprotect.c Wed Jul 2 21:22:38 2003
+++ edited/mm/mprotect.c Thu Sep 4 11:12:09 2003
@@ -224,7 +224,7 @@
asmlinkage long
sys_mprotect(unsigned long start, size_t len, unsigned long prot)
{
- unsigned long nstart, end, tmp;
+ unsigned long flags, nstart, end, tmp;
struct vm_area_struct * vma, * next, * prev;
int error = -EINVAL;
@@ -239,6 +239,12 @@
if (end == start)
return 0;
+ /*
+ * FIXME! This assumes that PROT_xxx == VM_xxxx for READ, WRITE, EXEC
+ * That does happen to be true, but it's ugly.. mmap() gets this right.
+ */
+ flags = prot & (VM_READ | VM_WRITE | VM_EXEC);
+
down_write(&current->mm->mmap_sem);
vma = find_vma_prev(current->mm, start, &prev);
@@ -257,7 +263,7 @@
goto out;
}
- newflags = prot | (vma->vm_flags & ~(PROT_READ | PROT_WRITE | PROT_EXEC));
+ newflags = flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
if ((newflags & ~(newflags >> 4)) & 0xf) {
error = -EACCES;
goto out;
On Thu, 4 Sep 2003, Jamie Lokier wrote:
>
> * I contend that the user-visible behaviour of a mapping should
> * _not_ depend on whether the file was opened with O_RDWR or O_RDONLY.
And I violently agree. But I also add the _other_ requirement:
* I contend that user-visible behaviour of a mapping should be 100% the
* same for an unwritable MAP_SHARED and an unwritten MAP_PRIVATE
Put the two together, and see what you get. You get the requirement that
if MAP_SHARED works, then MAP_PRIVATE also has to work.
That's my requirement. Consistency.
Linus
Linus Torvalds wrote:
> > * I contend that the user-visible behaviour of a mapping should
> > * _not_ depend on whether the file was opened with O_RDWR or O_RDONLY.
>
> And I violently agree. But I also add the _other_ requirement:
>
> * I contend that user-visible behaviour of a mapping should be 100% the
> * same for an unwritable MAP_SHARED and an unwritten MAP_PRIVATE
>
> Put the two together, and see what you get. You get the requirement that
> if MAP_SHARED works, then MAP_PRIVATE also has to work.
I'll add three more conditions to be explicit:
* A futex on a MAP_PRIVATE must be mm-local: the canonical
* example being MAP_PRIVATE of /dev/zero.
* A FUTEX_WAIT on an unwritten mapping should be woken by a
* FUTEX_WAKE to the same address after writing.
* A FUTEX_WAIT on a read-only mapping should wait for the same
* thing from other processes as if it were a writable mapping.
> That's my requirement. Consistency.
Unfortunately I think the above 5 conditions do not have a consistent
solution. Please prove me wrong :)
-- Jamie
Hugh Dickins wrote:
> > I don't see why you can't clear the flag: the call to ->populate will
> > change every page and pte_file to correspond with the linear page
> > offsets, which is all that !VM_NONLINEAR indicates.
>
> You're assuming that one call to sys_remap_file_pages precisely populates
> a whole vma: no, it's quite likely it'll just do a single page of the vma.
What are you talking about? The condition for clearing VM_NONLINEAR
is an explicit check to see if the range to be populated covers the
whole vma.
> > The important things are that the futex is queued prior to checking
> > curval, the requested page won't change (it's protected by mmap_sem),
> > and any parallel waker changes the word prior to waking us.
>
> Ah, that may well be so, it's beyond me,
> just so long as Rusty is happy with it.
If that condition isn't enough, then async futexes are in trouble,
because the curval check equivalent is done in userspace for async
futexes...
> (I don't think you mean "the requested page won't change" - the
> down_read on mmap_sem does not prevent it from being swapped out
> before the get_user, but nor does it prevent a replacement page
> being faulted back in by get_user, and we no longer have any
> dependence on those being the same physical page.)
I mean it prevents the futex key corresponding to the userspace word
from changing before we read the word. For all reasonable uses this
doesn't matter anyway.
-- Jamie
Rusty Russell wrote:
> I don't have a problem with the omission. mremap is logically
> equivalent to munmap + mmap, so it's a subset of the "I unmapped
> underneath my futex!". It's not like it's going to happen without the
> caller knowing: if the address doesn't change, then the futexes won't
> break. If they do, the caller needs to reset them anyway.
I think mremap() on a block of memory containing futexes is reasonable.
Imagine a big data structure with a table of futex locks at the start of
it. I'm not sure how useful it is, but it's not worthless.
Anyway, I have a patch, tested, which moves remapped futexes _and_
returns EFAULT to waiters when pages are unmapped. It's kept separate
from the main futex patch so you can accept it or not.
-- Jamie
On Thu, 4 Sep 2003, Jamie Lokier wrote:
> Hugh Dickins wrote:
> >
> > You're assuming that one call to sys_remap_file_pages precisely populates
> > a whole vma: no, it's quite likely it'll just do a single page of the vma.
>
> What are you talking about? The condition for clearing VM_NONLINEAR
> is an explicit check to see if the range to be populated covers the
> whole vma.
I apologize, I'm just not reading, am I? Thanks for re-explaining.
You're right, the condition on clearing is fine. It's the (lack of
condition on) setting that's over-enthusiastic, should be saying:
if (start - vma->vm_start != (pgoff - vma->vm_pgoff) << PAGE_SHIFT)
vma->vm_flags |= VM_NONLINEAR;
(Unless I'm making a fool of myself again.)
Hugh
On Thu, 4 Sep 2003, Jamie Lokier wrote:
>
> * A futex on a MAP_PRIVATE must be mm-local: the canonical
> * example being MAP_PRIVATE of /dev/zero.
Actually, /dev/zero is a special case in itself. It is an anonymous
mapping, and is equivalent to MAP_ANON for private mappings. For
MAP_SHARED it is something _totally_ different.
So /dev/zero isn't even an interesting case.
> Unfortunately I think the above 5 conditions do not have a consistent
> solution. Please prove me wrong :)
I don't think there is any inconsistency.
Linus
Hugh Dickins wrote:
> You're right, the condition on clearing is fine. It's the (lack of
> condition on) setting that's over-enthusiastic, should be saying:
>
> if (start - vma->vm_start != (pgoff - vma->vm_pgoff) << PAGE_SHIFT)
> vma->vm_flags |= VM_NONLINEAR;
I hadn't thought of that.
I wonder if it's useful, though. The only time it would avoid setting
the flag (when it isn't already set), in real use, is when other code
calls remap_file_pages() over the whole vma. But then the flag is
cleared by the next line.
So it's correct and nice, but I'm not sure it adds anything practical.
-- Jamie
Linus Torvalds wrote:
> > * A futex on a MAP_PRIVATE must be mm-local: the canonical
> > * example being MAP_PRIVATE of /dev/zero.
>
> Actually, /dev/zero is a special case in itself. It is an anonymous
> mapping, and is equivalent to MAP_ANON for private mappings. For
> MAP_SHARED it is something _totally_ different.
Well yes, but conceptually its behaviour is that of a private mapping
of a file-like object. But fine, let's not get sidetracked by /dev/zero.
I'll restate it:
* A futex on a MAP_PRIVATE must be mm-local. The canonical
example being the data section of your executable.
> > Unfortunately I think the above 5 conditions do not have a consistent
> > solution. Please prove me wrong :)
>
> I don't think there is any inconsistency.
I can't think of a behaviour which satisfies all 5 conditions, so
you'll have to help me out. :/
-- Jamie
In message <[email protected]> you write:
> Rusty Russell wrote:
> > I don't have a problem with the omission. mremap is logically
> > equivalent to munmap + mmap, so it's a subset of the "I unmapped
> > underneath my futex!". It's not like it's going to happen without the
> > caller knowing: if the address doesn't change, then the futexes won't
> > break. If they do, the caller needs to reset them anyway.
>
> I think mremap() on block of memory containing futexes is reasonable.
> Imagine a big data structure with a table futex locks at the start of
> it. I'm not sure how useful it is, but it's not worthless.
Think about the code that does this:
struct futex_file
{
struct futex lock;
int content_len;
char contents[0];
};
fd = sys_futex(&futfile->lock, FUTEX_FD, 0, NULL);
...
futfile = mremap(futfile, oldsize, newsize, MREMAP_MAYMOVE);
Now, if mremap doesn't move the memory, futexes aren't broken, even
without your patch, right? If it does move, you've got a futex
sitting in invalid memory, no surprise if it doesn't work.
OTOH, I'm interested in returning EFAULT on waiters when pages are
unmapped, because I realized that stale waiters could "match" live
futex wakeups (an mm_struct gets recycled), and steal the wakeup. Bad
juju. We could do some uid check or something for anon pages, but
cleaner to flush them at unmap.
Cheers!
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
In message <[email protected]> you write:
> Feel free to think up a better hash that isn't slow. Two iterations
> of hash_long() would be a good hash, but slower.
I've used jhash below.
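(Something along these lines; a sketch of a jhash-based hash_futex()
assuming jhash_2words() from <linux/jhash.h>, not necessarily the exact
code in the patches below:)

	#include <linux/jhash.h>

	/* Sketch: mix both keys and the offset properly rather than
	 * feeding their sum to a single hash_long(). */
	static inline struct list_head *hash_futex(unsigned long key0,
						   unsigned long key1,
						   int offset)
	{
		u32 h = jhash_2words((u32)key0, (u32)key1, offset);
		return &futex_queues[h & ((1 << FUTEX_HASHBITS) - 1)];
	}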
> > The err at the end of __get_page_keys would be 1 from successful
> > get_user_pages, treated as error by the callers: need to make it 0.
>
> Well spotted.
Fixed below.
> > futex_wait: I didn't get around to it in my version, so haven't
> > thought through the issues, but I'm a bit worried that you get
> > curval for -EWOULDBLOCK check without holding the futex_lock.
This works: the only danger is that the WAKE side will wake us even
though we were going to fail with -EWOULDBLOCK, which is why we notice
this in out_unqueue and return 0 in this case (if it's doing wake-one,
it *really* must successfully wake one...).
> > That looks suspicious to me, but I'm going to be lazy and not
> > try to think about it, because Rusty is sure to understand the
> > races there. If that code is insufficient as you have it, may
> > need __pin_page reinstated for just that case (hmm, was that
> > get_user right before? I'd expect it to kmap_atomic pinned page.)
>
> The important things are that the futex is queued prior to checking
> curval, the requested page won't change (it's protected by mmap_sem),
> and any parallel waker changes the word prior to waking us.
>
> You made me notice a rather subtle memory ordering condition, though.
>
> We must issue the read after queuing the futex. There needs to be a
> smp_rmb() after queuing and before the read, because the spin_unlock()
> barrier only constrains earlier reads, not later ones.
Ah, the joys of thinking too hard: I've been here before 8).
In my analysis, the earliest the read can move is to the beginning of
the futex_lock, ie equivalent to:
spin_lock(&futex_lock);
get_user(curval, (int *)uaddr);
list_add_tail(&q->list, head);
spin_unlock(&futex_lock);
Since the wake side has to take the futex lock too, this ordering is
still safe.
> Thanks for all your great insights,
Definitely seconded, Hugh. Thanks!
Here's my accumulated patch set (Jamie's fixed patch included first
for completeness):
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Name: Futexes without pinning pages
Author: Jamie Lokier
Status: Booted on 2.6.0-test4-bk6
D: [ Later fixes from Jamie and Andrew Morton added --RR ]
D: include/linux/mm.h | 1
D: include/linux/vcache.h | 26 ---
D: kernel/futex.c | 368 +++++++++++++++++++++++++------------------------
D: mm/Makefile | 2
D: mm/fremap.c | 9 +
D: mm/memory.c | 2
D: mm/vcache.c | 90 -----------
D: 7 files changed, 200 insertions(+), 298 deletions(-)
D:
D: Patch name: futex-fixes-2.6.0-test4-01jl
D:
D: This patch changes the way futexes are indexed, so that they do not
D: pin pages and also corrects some problems with private mappings and COW
D: pages.
D:
D: Currently, all futexes look up the page at the userspace address and
D: pin it, using the pair (page,offset) as an index into a table of
D: waiting futexes. Any page with a futex waiting on it remains pinned
D: in RAM, which is a problem when many futexes are used, especially with
D: FUTEX_FD.
D:
D: Another problem is that the page is not always the correct one, if it
D: can be changed later by a COW (copy on write) operation. This can
D: happen when waiting on a futex without writing to it after fork(),
D: exec() or mmap(), if the page is then written to before attempting to
D: wake a futex at the same address.
D:
D: There are two symptoms of the COW problem: 1. The wrong process can
D: receive wakeups; 2. A process can fail to receive required wakeups.
D:
D: This patch fixes both by changing the indexing so that VM_SHARED
D: mappings use the triple (inode,offset,index), and private mappings use
D: the pair (mm,virtual_address).
D:
D: The former correctly handles all shared mappings, including tmpfs and
D: therefore all kinds of shared memory (IPC shm, /dev/shm and
D: MAP_ANON|MAP_SHARED). This works because every mapping which is
D: VM_SHARED has an associated non-zero vma->vm_file, and hence inode.
D: (This is ensured in do_mmap_pgoff, where it calls shmem_zero_setup).
D:
D: The latter handles all private mappings, both files and anonymous. It
D: isn't affected by COW, because it doesn't care about the actual pages,
D: just the virtual address.
D:
D: The only obvious problem is that mremap() can move a private mapping
D: without informing futexes waiting on that mapping. However, mremap()
D: was already broken with futexes, because it doesn't update the vcache,
D: which is used by futexes, so this just changes an existing bug.
D:
D: (A later patch from me will fix this problem with mremap(), by moving
D: the futexes).
D:
D: This patch has a few bonuses:
D:
D: 1. It removes the vcache implementation, as only futexes were
D: using it, and they don't any more.
D:
D: 2. Removing the vcache should make COW page faults a bit faster.
D:
D: 3. Futex operations no longer take the page table lock, walk
D: the page table, fault in pages that aren't mapped in the
D: page table, or do a vcache hash lookup - they are mostly a
D: simple offset calculation with one hash for the futex
D: table. So they should be noticably faster.
D:
D: 4. The patch reduces the kernel size by 98 lines.
D:
D: -- Jamie
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .16361-linux-2.6.0-test4-bk6/include/linux/mm.h .16361-linux-2.6.0-test4-bk6.updated/include/linux/mm.h
--- .16361-linux-2.6.0-test4-bk6/include/linux/mm.h 2003-08-25 11:58:34.000000000 +1000
+++ .16361-linux-2.6.0-test4-bk6.updated/include/linux/mm.h 2003-09-05 14:54:31.000000000 +1000
@@ -110,6 +110,7 @@ struct vm_area_struct {
#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
+#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .16361-linux-2.6.0-test4-bk6/include/linux/vcache.h .16361-linux-2.6.0-test4-bk6.updated/include/linux/vcache.h
--- .16361-linux-2.6.0-test4-bk6/include/linux/vcache.h 2003-01-02 12:30:47.000000000 +1100
+++ .16361-linux-2.6.0-test4-bk6.updated/include/linux/vcache.h 1970-01-01 10:00:00.000000000 +1000
@@ -1,26 +0,0 @@
-/*
- * virtual => physical mapping cache support.
- */
-#ifndef _LINUX_VCACHE_H
-#define _LINUX_VCACHE_H
-
-typedef struct vcache_s {
- unsigned long address;
- struct mm_struct *mm;
- struct list_head hash_entry;
- void (*callback)(struct vcache_s *data, struct page *new_page);
-} vcache_t;
-
-extern spinlock_t vcache_lock;
-
-extern void __attach_vcache(vcache_t *vcache,
- unsigned long address,
- struct mm_struct *mm,
- void (*callback)(struct vcache_s *data, struct page *new_page));
-
-extern void __detach_vcache(vcache_t *vcache);
-
-extern void invalidate_vcache(unsigned long address, struct mm_struct *mm,
- struct page *new_page);
-
-#endif
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .16361-linux-2.6.0-test4-bk6/kernel/futex.c .16361-linux-2.6.0-test4-bk6.updated/kernel/futex.c
--- .16361-linux-2.6.0-test4-bk6/kernel/futex.c 2003-09-05 09:16:38.000000000 +1000
+++ .16361-linux-2.6.0-test4-bk6.updated/kernel/futex.c 2003-09-05 14:54:31.000000000 +1000
@@ -5,6 +5,9 @@
* Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
* (C) Copyright 2003 Red Hat Inc, All Rights Reserved
*
+ * Changed to remove page pinning and fix privately mapped COW pages
+ * Copyright (C) Jamie Lokier 2003
+ *
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
@@ -33,7 +36,6 @@
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/futex.h>
-#include <linux/vcache.h>
#include <linux/mount.h>
#define FUTEX_HASHBITS 8
@@ -46,13 +48,10 @@ struct futex_q {
struct list_head list;
wait_queue_head_t waiters;
- /* Page struct and offset within it. */
- struct page *page;
+ /* Page keys and offset within the page. */
+ unsigned long keys[2];
int offset;
- /* the virtual => physical COW-safe cache */
- vcache_t vcache;
-
/* For fd, sigio sent using these. */
int fd;
struct file *filp;
@@ -66,85 +65,110 @@ static spinlock_t futex_lock = SPIN_LOCK
static struct vfsmount *futex_mnt;
/*
- * These are all locks that are necessery to look up a physical
- * mapping safely, and modify/search the futex hash, atomically:
- */
-static inline void lock_futex_mm(void)
-{
- spin_lock(&current->mm->page_table_lock);
- spin_lock(&vcache_lock);
- spin_lock(&futex_lock);
-}
-
-static inline void unlock_futex_mm(void)
-{
- spin_unlock(&futex_lock);
- spin_unlock(&vcache_lock);
- spin_unlock(&current->mm->page_table_lock);
-}
-
-/*
- * The physical page is shared, so we can hash on its address:
+ * We hash on the keys returned from __get_page_keys (see below),
+ * and the offset into the page.
*/
-static inline struct list_head *hash_futex(struct page *page, int offset)
+static inline struct list_head *hash_futex(unsigned long key0,
+ unsigned long key1,
+ int offset)
{
- return &futex_queues[hash_long((unsigned long)page + offset,
- FUTEX_HASHBITS)];
+ return &futex_queues[hash_long(key0 + key1 + offset, FUTEX_HASHBITS)];
}
/*
- * Get kernel address of the user page and pin it.
+ * Get two parameters which are the keys for a futex
+ * other than the offset within page.
*
- * Must be called with (and returns with) all futex-MM locks held.
+ * For shared mappings, it's "vma->vm_file->f_dentry->d_inode" and
+ * "page->index". For private mappings, it's "current->mm" and "addr".
+ * We can usually work out the index without swapping in the page.
+ *
+ * Returns: 0, or negative error code.
+ * The two key words are stored in key[0] and key[1] on success.
+ *
+ * Should be called with &current->mm->mmap_sem,
+ * but NOT &futex_lock or &current->mm->page_table_lock.
*/
-static inline struct page *__pin_page_atomic (struct page *page)
-{
- if (!PageReserved(page))
- get_page(page);
- return page;
-}
-
-static struct page *__pin_page(unsigned long addr)
+static int __get_page_keys(unsigned long addr, unsigned long * keys)
{
struct mm_struct *mm = current->mm;
- struct page *page, *tmp;
+ struct vm_area_struct *vma;
+ struct page *page;
int err;
/*
- * Do a quick atomic lookup first - this is the fastpath.
+ * The futex is hashed differently depending on whether
+ * it's in a shared or private mapping. So check vma first.
*/
- page = follow_page(mm, addr, 0);
- if (likely(page != NULL))
- return __pin_page_atomic(page);
+ vma = find_extend_vma(mm, addr);
+
+ if (unlikely(!vma)) {
+#ifdef FIXADDR_USER_START
+ if (addr >= FIXADDR_USER_START && addr < FIXADDR_USER_END) {
+ keys[0] = 1; /* Different from any pointer value. */
+ keys[1] = addr - FIXADDR_USER_START;
+ return 0;
+ }
+#endif
+ return -EFAULT;
+ }
/*
- * No luck - need to fault in the page:
+ * Permissions.
*/
-repeat_lookup:
+ if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
+ return -EFAULT;
- unlock_futex_mm();
+ /*
+ * Private mappings are handled in a simple way.
+ */
+ if (likely(!(vma->vm_flags & VM_SHARED))) {
+ keys[0] = (unsigned long) mm;
+ keys[1] = addr;
+ return 0;
+ }
- down_read(&mm->mmap_sem);
- err = get_user_pages(current, mm, addr, 1, 0, 0, &page, NULL);
- up_read(&mm->mmap_sem);
+ /*
+ * Linear mappings are also simple.
+ */
+ keys[0] = (unsigned long) vma->vm_file->f_dentry->d_inode;
+ if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
+ keys[1] = (((addr - vma->vm_start) >> PAGE_SHIFT)
+ + vma->vm_pgoff);
+ return 0;
+ }
- lock_futex_mm();
+ /*
+ * We could walk the page table to read the non-linear
+ * pte, and get the page index without fetching the page
+ * from swap. But that's a lot of code to duplicate here
+ * for a rare case, so we simply fetch the page.
+ */
- if (err < 0)
- return NULL;
/*
- * Since the faulting happened with locks released, we have to
- * check for races:
+ * Do a quick atomic lookup first - this is the fastpath.
*/
- tmp = follow_page(mm, addr, 0);
- if (tmp != page) {
- put_page(page);
- goto repeat_lookup;
+ spin_lock(&current->mm->page_table_lock);
+ page = follow_page(mm, addr, 0);
+ if (likely(page != NULL)) {
+ keys[1] = page->index;
+ spin_unlock(&current->mm->page_table_lock);
+ return 0;
}
+ spin_unlock(&current->mm->page_table_lock);
- return page;
+ /*
+ * Do it the general way.
+ */
+ err = get_user_pages(current, mm, addr, 1, 0, 0, &page, NULL);
+ if (err >= 0) {
+ keys[1] = page->index;
+ put_page(page);
+ }
+ return err;
}
+
/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
@@ -152,25 +176,25 @@ repeat_lookup:
static inline int futex_wake(unsigned long uaddr, int offset, int num)
{
struct list_head *i, *next, *head;
- struct page *page;
- int ret = 0;
+ unsigned long keys[2];
+ int ret;
- lock_futex_mm();
+ down_read(&current->mm->mmap_sem);
- page = __pin_page(uaddr - offset);
- if (!page) {
- unlock_futex_mm();
- return -EFAULT;
- }
+ ret = __get_page_keys(uaddr - offset, keys);
+ if (unlikely(ret != 0))
+ goto out;
- head = hash_futex(page, offset);
+ head = hash_futex(keys[0], keys[1], offset);
+ spin_lock(&futex_lock);
list_for_each_safe(i, next, head) {
struct futex_q *this = list_entry(i, struct futex_q, list);
- if (this->page == page && this->offset == offset) {
+ if (this->keys[0] == keys[0] && this->keys[1] == keys[1]
+ && this->offset == offset) {
+
list_del_init(i);
- __detach_vcache(&this->vcache);
wake_up_all(&this->waiters);
if (this->filp)
send_sigio(&this->filp->f_owner, this->fd, POLL_IN);
@@ -179,38 +203,14 @@ static inline int futex_wake(unsigned lo
break;
}
}
+ spin_unlock(&futex_lock);
- unlock_futex_mm();
- put_page(page);
-
+out:
+ up_read(&current->mm->mmap_sem);
return ret;
}
/*
- * This gets called by the COW code, we have to rehash any
- * futexes that were pending on the old physical page, and
- * rehash it to the new physical page. The pagetable_lock
- * and vcache_lock is already held:
- */
-static void futex_vcache_callback(vcache_t *vcache, struct page *new_page)
-{
- struct futex_q *q = container_of(vcache, struct futex_q, vcache);
- struct list_head *head = hash_futex(new_page, q->offset);
-
- spin_lock(&futex_lock);
-
- if (!list_empty(&q->list)) {
- put_page(q->page);
- q->page = new_page;
- __pin_page_atomic(new_page);
- list_del(&q->list);
- list_add_tail(&q->list, head);
- }
-
- spin_unlock(&futex_lock);
-}
-
-/*
* Requeue all waiters hashed on one physical page to another
* physical page.
*/
@@ -218,74 +218,66 @@ static inline int futex_requeue(unsigned
unsigned long uaddr2, int offset2, int nr_wake, int nr_requeue)
{
struct list_head *i, *next, *head1, *head2;
- struct page *page1 = NULL, *page2 = NULL;
- int ret = 0;
+ unsigned long keys1[2], keys2[2];
+ int ret;
- lock_futex_mm();
+ down_read(&current->mm->mmap_sem);
- page1 = __pin_page(uaddr1 - offset1);
- if (!page1)
+ ret = __get_page_keys(uaddr1 - offset1, keys1);
+ if (unlikely(ret != 0))
goto out;
- page2 = __pin_page(uaddr2 - offset2);
- if (!page2)
+ ret = __get_page_keys(uaddr2 - offset2, keys2);
+ if (unlikely(ret != 0))
goto out;
- head1 = hash_futex(page1, offset1);
- head2 = hash_futex(page2, offset2);
+ head1 = hash_futex(keys1[0], keys1[1], offset1);
+ head2 = hash_futex(keys2[0], keys2[1], offset2);
+ spin_lock(&futex_lock);
list_for_each_safe(i, next, head1) {
struct futex_q *this = list_entry(i, struct futex_q, list);
- if (this->page == page1 && this->offset == offset1) {
+ if (this->keys[0] == keys1[0] && this->keys[1] == keys1[1]
+ && this->offset == offset1) {
+
list_del_init(i);
- __detach_vcache(&this->vcache);
if (++ret <= nr_wake) {
wake_up_all(&this->waiters);
if (this->filp)
send_sigio(&this->filp->f_owner,
this->fd, POLL_IN);
} else {
- put_page(this->page);
- __pin_page_atomic (page2);
list_add_tail(i, head2);
- __attach_vcache(&this->vcache, uaddr2,
- current->mm, futex_vcache_callback);
+ this->keys[0] = keys2[0];
+ this->keys[1] = keys2[1];
this->offset = offset2;
- this->page = page2;
if (ret - nr_wake >= nr_requeue)
break;
}
}
}
+ spin_unlock(&futex_lock);
out:
- unlock_futex_mm();
-
- if (page1)
- put_page(page1);
- if (page2)
- put_page(page2);
-
+ up_read(&current->mm->mmap_sem);
return ret;
}
-static inline void __queue_me(struct futex_q *q, struct page *page,
- unsigned long uaddr, int offset,
- int fd, struct file *filp)
+static inline void queue_me(struct futex_q *q, unsigned long *keys,
+ unsigned long uaddr, int offset,
+ int fd, struct file *filp)
{
- struct list_head *head = hash_futex(page, offset);
+ struct list_head *head = hash_futex(keys[0], keys[1], offset);
+ q->keys[0] = keys[0];
+ q->keys[1] = keys[1];
q->offset = offset;
q->fd = fd;
q->filp = filp;
- q->page = page;
+ spin_lock(&futex_lock);
list_add_tail(&q->list, head);
- /*
- * We register a futex callback to this virtual address,
- * to make sure a COW properly rehashes the futex-queue.
- */
- __attach_vcache(&q->vcache, uaddr, current->mm, futex_vcache_callback);
+ spin_unlock(&futex_lock);
}
/* Return 1 if we were still queued (ie. 0 means we were woken) */
@@ -293,15 +285,12 @@ static inline int unqueue_me(struct fute
{
int ret = 0;
- spin_lock(&vcache_lock);
spin_lock(&futex_lock);
if (!list_empty(&q->list)) {
list_del(&q->list);
- __detach_vcache(&q->vcache);
ret = 1;
}
spin_unlock(&futex_lock);
- spin_unlock(&vcache_lock);
return ret;
}
@@ -311,65 +300,94 @@ static inline int futex_wait(unsigned lo
unsigned long time)
{
DECLARE_WAITQUEUE(wait, current);
- int ret = 0, curval;
- struct page *page;
+ int ret, curval;
+ unsigned long keys[2];
struct futex_q q;
+ try_again:
init_waitqueue_head(&q.waiters);
- lock_futex_mm();
+ down_read(&current->mm->mmap_sem);
- page = __pin_page(uaddr - offset);
- if (!page) {
- unlock_futex_mm();
- return -EFAULT;
- }
- __queue_me(&q, page, uaddr, offset, -1, NULL);
+ ret = __get_page_keys(uaddr - offset, keys);
+ if (unlikely(ret != 0))
+ goto out_release_sem;
+
+ queue_me(&q, keys, uaddr, offset, -1, NULL);
/*
- * Page is pinned, but may no longer be in this address space.
- * It cannot schedule, so we access it with the spinlock held.
+ * Access the page after the futex is queued.
+ * We hold the mmap semaphore, so the mapping cannot have changed
+ * since we looked it up.
*/
if (get_user(curval, (int *)uaddr) != 0) {
- unlock_futex_mm();
ret = -EFAULT;
- goto out;
+ goto out_unqueue;
}
if (curval != val) {
- unlock_futex_mm();
ret = -EWOULDBLOCK;
- goto out;
+ goto out_unqueue;
}
+
/*
- * The get_user() above might fault and schedule so we
- * cannot just set TASK_INTERRUPTIBLE state when queueing
- * ourselves into the futex hash. This code thus has to
+ * Now the futex is queued and we have checked the data, we
+ * don't want to hold mmap_sem while we sleep.
+ */
+ up_read(&current->mm->mmap_sem);
+
+ /*
+ * There might have been scheduling since the queue_me(), as we
+ * cannot hold a spinlock across the get_user() in case it
+ * faults. So we cannot just set TASK_INTERRUPTIBLE state when
+ * queueing ourselves into the futex hash. This code thus has to
* rely on the futex_wake() code doing a wakeup after removing
* the waiter from the list.
*/
add_wait_queue(&q.waiters, &wait);
+ spin_lock(&futex_lock);
set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&q.list)) {
- unlock_futex_mm();
- time = schedule_timeout(time);
+
+ if (unlikely(list_empty(&q.list))) {
+ /*
+ * We were woken already.
+ */
+ spin_unlock(&futex_lock);
+ set_current_state(TASK_RUNNING);
+ return 0;
}
- set_current_state(TASK_RUNNING);
+
+ spin_unlock(&futex_lock);
+ time = schedule_timeout(time);
+
/*
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
*/
- if (time == 0) {
- ret = -ETIMEDOUT;
- goto out;
- }
+
+ /*
+ * Were we woken or interrupted for a valid reason?
+ */
+ ret = unqueue_me(&q);
+ if (ret == 0)
+ return 0;
+ if (time == 0)
+ return -ETIMEDOUT;
if (signal_pending(current))
- ret = -EINTR;
-out:
- /* Were we woken up anyway? */
+ return -EINTR;
+
+ /*
+ * No, it was a spurious wakeup. Try again. Should never happen. :)
+ */
+ goto try_again;
+
+ out_unqueue:
+ /*
+ * Were we unqueued anyway?
+ */
if (!unqueue_me(&q))
ret = 0;
- put_page(q.page);
-
+ out_release_sem:
+ up_read(&current->mm->mmap_sem);
return ret;
}
@@ -378,7 +396,6 @@ static int futex_close(struct inode *ino
struct futex_q *q = filp->private_data;
unqueue_me(q);
- put_page(q->page);
kfree(filp->private_data);
return 0;
}
@@ -408,10 +425,10 @@ static struct file_operations futex_fops
set the sigio stuff up afterwards. */
static int futex_fd(unsigned long uaddr, int offset, int signal)
{
- struct page *page = NULL;
struct futex_q *q;
+ unsigned long keys[2];
struct file *filp;
- int ret;
+ int ret, err;
ret = -EINVAL;
if (signal < 0 || signal > _NSIG)
@@ -450,31 +467,25 @@ static int futex_fd(unsigned long uaddr,
goto out;
}
- lock_futex_mm();
-
- page = __pin_page(uaddr - offset);
- if (!page) {
- unlock_futex_mm();
+ down_read(&current->mm->mmap_sem);
+ err = __get_page_keys(uaddr - offset, keys);
+ up_read(&current->mm->mmap_sem);
+ if (unlikely(err != 0)) {
put_unused_fd(ret);
put_filp(filp);
kfree(q);
- return -EFAULT;
+ return err;
}
init_waitqueue_head(&q->waiters);
filp->private_data = q;
- __queue_me(q, page, uaddr, offset, ret, filp);
-
- unlock_futex_mm();
+ queue_me(q, keys, uaddr, offset, ret, filp);
/* Now we map fd to filp, so userspace can access it */
fd_install(ret, filp);
- page = NULL;
out:
- if (page)
- put_page(page);
return ret;
}
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .16361-linux-2.6.0-test4-bk6/mm/Makefile .16361-linux-2.6.0-test4-bk6.updated/mm/Makefile
--- .16361-linux-2.6.0-test4-bk6/mm/Makefile 2003-02-11 14:26:20.000000000 +1100
+++ .16361-linux-2.6.0-test4-bk6.updated/mm/Makefile 2003-09-05 14:54:31.000000000 +1000
@@ -9,6 +9,6 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o readahead.o \
- slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
+ slab.o swap.o truncate.o vmscan.o $(mmu-y)
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .16361-linux-2.6.0-test4-bk6/mm/fremap.c .16361-linux-2.6.0-test4-bk6.updated/mm/fremap.c
--- .16361-linux-2.6.0-test4-bk6/mm/fremap.c 2003-09-05 09:16:38.000000000 +1000
+++ .16361-linux-2.6.0-test4-bk6.updated/mm/fremap.c 2003-09-05 14:54:31.000000000 +1000
@@ -144,7 +144,10 @@ long sys_remap_file_pages(unsigned long
return err;
#endif
- down_read(&mm->mmap_sem);
+ /*
+ * vm_flags is protected by down_write(mmap_sem)
+ */
+ down_write(&mm->mmap_sem);
vma = find_vma(mm, start);
/*
@@ -155,12 +158,18 @@ long sys_remap_file_pages(unsigned long
if (vma && (vma->vm_flags & VM_SHARED) &&
vma->vm_ops && vma->vm_ops->populate &&
end > start && start >= vma->vm_start &&
- end <= vma->vm_end)
+ end <= vma->vm_end) {
+
+ vma->vm_flags |= VM_NONLINEAR;
+ if (start == vma->vm_start && end == vma->vm_end &&
+ pgoff == vma->vm_pgoff)
+ vma->vm_flags &= ~VM_NONLINEAR;
+ downgrade_write(&mm->mmap_sem);
err = vma->vm_ops->populate(vma, start, size, vma->vm_page_prot,
pgoff, flags & MAP_NONBLOCK);
-
- up_read(&mm->mmap_sem);
-
+ up_read(&mm->mmap_sem);
+ } else {
+ up_write(&mm->mmap_sem);
+ }
return err;
}
-
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .16361-linux-2.6.0-test4-bk6/mm/memory.c .16361-linux-2.6.0-test4-bk6.updated/mm/memory.c
--- .16361-linux-2.6.0-test4-bk6/mm/memory.c 2003-09-05 09:16:38.000000000 +1000
+++ .16361-linux-2.6.0-test4-bk6.updated/mm/memory.c 2003-09-05 14:54:31.000000000 +1000
@@ -43,7 +43,6 @@
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
-#include <linux/vcache.h>
#include <linux/rmap-locking.h>
#include <linux/module.h>
@@ -962,7 +961,6 @@ static inline void establish_pte(struct
static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
pte_t *page_table)
{
- invalidate_vcache(address, vma->vm_mm, new_page);
flush_cache_page(vma, address);
establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
}
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .16361-linux-2.6.0-test4-bk6/mm/vcache.c .16361-linux-2.6.0-test4-bk6.updated/mm/vcache.c
--- .16361-linux-2.6.0-test4-bk6/mm/vcache.c 2003-01-02 12:30:47.000000000 +1100
+++ .16361-linux-2.6.0-test4-bk6.updated/mm/vcache.c 1970-01-01 10:00:00.000000000 +1000
@@ -1,90 +0,0 @@
-/*
- * linux/mm/vcache.c
- *
- * virtual => physical page mapping cache. Users of this mechanism
- * register callbacks for a given (virt,mm,phys) page mapping, and
- * the kernel guarantees to call back when this mapping is invalidated.
- * (ie. upon COW or unmap.)
- *
- * Started by Ingo Molnar, Copyright (C) 2002
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/hash.h>
-#include <linux/vcache.h>
-
-#define VCACHE_HASHBITS 8
-#define VCACHE_HASHSIZE (1 << VCACHE_HASHBITS)
-
-spinlock_t vcache_lock = SPIN_LOCK_UNLOCKED;
-
-static struct list_head hash[VCACHE_HASHSIZE];
-
-static struct list_head *hash_vcache(unsigned long address,
- struct mm_struct *mm)
-{
- return &hash[hash_long(address + (unsigned long)mm, VCACHE_HASHBITS)];
-}
-
-void __attach_vcache(vcache_t *vcache,
- unsigned long address,
- struct mm_struct *mm,
- void (*callback)(struct vcache_s *data, struct page *new))
-{
- struct list_head *hash_head;
-
- address &= PAGE_MASK;
- vcache->address = address;
- vcache->mm = mm;
- vcache->callback = callback;
-
- hash_head = hash_vcache(address, mm);
-
- list_add_tail(&vcache->hash_entry, hash_head);
-}
-
-void __detach_vcache(vcache_t *vcache)
-{
- list_del_init(&vcache->hash_entry);
-}
-
-void invalidate_vcache(unsigned long address, struct mm_struct *mm,
- struct page *new_page)
-{
- struct list_head *l, *hash_head;
- vcache_t *vcache;
-
- address &= PAGE_MASK;
-
- hash_head = hash_vcache(address, mm);
- /*
- * This is safe, because this path is called with the pagetable
- * lock held. So while other mm's might add new entries in
- * parallel, *this* mm is locked out, so if the list is empty
- * now then we do not have to take the vcache lock to see it's
- * really empty.
- */
- if (likely(list_empty(hash_head)))
- return;
-
- spin_lock(&vcache_lock);
- list_for_each(l, hash_head) {
- vcache = list_entry(l, vcache_t, hash_entry);
- if (vcache->address != address || vcache->mm != mm)
- continue;
- vcache->callback(vcache, new_page);
- }
- spin_unlock(&vcache_lock);
-}
-
-static int __init vcache_init(void)
-{
- unsigned int i;
-
- for (i = 0; i < VCACHE_HASHSIZE; i++)
- INIT_LIST_HEAD(hash + i);
- return 0;
-}
-__initcall(vcache_init);
-
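For concreteness, the key rule the patch above implements can be condensed
into a few lines. This is only an illustrative sketch (the names two_keys
and sketch_futex_keys are made up, and the non-linear mapping case is
omitted), not the patch's own __get_page_keys():

	#include <linux/mm.h>

	struct two_keys { unsigned long k0, k1; };

	/* Caller holds mm->mmap_sem and has already looked up vma for addr. */
	static int sketch_futex_keys(struct mm_struct *mm,
				     struct vm_area_struct *vma,
				     unsigned long addr, struct two_keys *key)
	{
		if (!(vma->vm_flags & VM_SHARED)) {
			/* Private mapping: (mm, virtual address) is unique
			 * and stable across swap-out, so no page pinning. */
			key->k0 = (unsigned long)mm;
			key->k1 = addr;
			return 0;
		}
		/* Shared linear mapping: (inode, file page index) is the same
		 * in every process mapping the file, and survives swapping. */
		key->k0 = (unsigned long)vma->vm_file->f_dentry->d_inode;
		key->k1 = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		return 0;
	}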
Name: Minor Tweaks To Jamie Lokier's Futex Patch
Author: Rusty Russell
Status: Booted on 2.6.0-test4-bk6
Depends: Misc/futex-jamie.patch.gz
D: Minor changes to Jamie's excellent futex patch.
D: 1) Declare and use a union for the hash key, and rename __get_page_keys
D: to get_page_key.
D: 2) Remove obsolete comment above hash array decl.
D: 3) Simply return -EFAULT on futexes in the VSYSCALL area.
D: 4) Clarify comment about TASK_INTERRUPTIBLE.
D: 5) Andrew Morton says spurious wakeup is a bug. Catch it.
D: 6) Semantics of futex on read-only pages unclear: require write perm.
D: 7) Use Jenkins hash.
D: 8) Make get_page_keys return 0 on successful get_user_pages().
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .10155-linux-2.6.0-test4-bk6/kernel/futex.c .10155-linux-2.6.0-test4-bk6.updated/kernel/futex.c
--- .10155-linux-2.6.0-test4-bk6/kernel/futex.c 2003-09-05 12:31:41.000000000 +1000
+++ .10155-linux-2.6.0-test4-bk6.updated/kernel/futex.c 2003-09-05 14:28:03.000000000 +1000
@@ -33,13 +33,38 @@
#include <linux/poll.h>
#include <linux/fs.h>
#include <linux/file.h>
-#include <linux/hash.h>
+#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/futex.h>
#include <linux/mount.h>
#define FUTEX_HASHBITS 8
+/* For shared mappings, comparison key is
+ * "vma->vm_file->f_dentry->d_inode" and "page->index". For private
+ * mappings, it's "current->mm" and "addr". We can usually work out
+ * the index without swapping in the page.
+ * Note that they never clash: mm and inode ptrs cannot be equal.
+ */
+struct private_key
+{
+ struct mm_struct *mm;
+ unsigned long uaddr;
+};
+
+struct shared_key
+{
+ struct inode *inode;
+ unsigned long page_index;
+};
+
+union hash_key
+{
+ struct private_key private;
+ struct shared_key shared;
+ unsigned long raw[2];
+};
+
/*
* We use this hashed waitqueue instead of a normal wait_queue_t, so
* we can wake only the relevant ones (hashed queues may be shared):
@@ -49,7 +74,7 @@ struct futex_q {
wait_queue_head_t waiters;
/* Page keys and offset within the page. */
- unsigned long keys[2];
+ union hash_key key;
int offset;
/* For fd, sigio sent using these. */
@@ -57,7 +82,6 @@ struct futex_q {
struct file *filp;
};
-/* The key for the hash is the address + index + offset within page */
static struct list_head futex_queues[1<<FUTEX_HASHBITS];
static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
@@ -65,31 +89,29 @@ static spinlock_t futex_lock = SPIN_LOCK
static struct vfsmount *futex_mnt;
/*
- * We hash on the keys returned from __get_page_keys (see below),
+ * We hash on the keys returned from get_page_key (see below),
* and the offset into the page.
*/
-static inline struct list_head *hash_futex(unsigned long key0,
- unsigned long key1,
+static inline struct list_head *hash_futex(const union hash_key *key,
int offset)
{
- return &futex_queues[hash_long(key0 + key1 + offset, FUTEX_HASHBITS)];
+ u32 hash = jhash2((u32*)key, sizeof(*key)/sizeof(u32), offset);
+
+ /* Just in case someone changes something... */
+ BUILD_BUG_ON(sizeof(*key) % sizeof(u32) != 0);
+ return &futex_queues[hash & ((1<<FUTEX_HASHBITS)-1)];
}
/*
* Get two parameters which are the keys for a futex
* other than the offset within page.
*
- * For shared mappings, it's "vma->vm_file->f_dentry->d_inode" and
- * "page->index". For private mappings, it's "current->mm" and "addr".
- * We can usually work out the index without swapping in the page.
- *
* Returns: 0, or negative error code.
- * The two key words are stored in key[0] and key[1] on success.
*
 * Should be called with &current->mm->mmap_sem,
 * but NOT &futex_lock or &current->mm->page_table_lock.
*/
-static int __get_page_keys(unsigned long addr, unsigned long * keys)
+static int get_page_key(unsigned long addr, union hash_key *key)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
@@ -102,38 +124,32 @@ static int __get_page_keys(unsigned long
*/
vma = find_extend_vma(mm, addr);
- if (unlikely(!vma)) {
-#ifdef FIXADDR_USER_START
- if (addr >= FIXADDR_USER_START && addr < FIXADDR_USER_END) {
- keys[0] = 1; /* Different from any pointer value. */
- keys[1] = addr - FIXADDR_USER_START;
- return 0;
- }
-#endif
+ if (unlikely(!vma))
return -EFAULT;
- }
/*
* Permissions.
*/
- if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
+ if (unlikely((vma->vm_flags & (VM_IO|VM_READ|VM_WRITE))
+ != (VM_READ | VM_WRITE)))
return -EFAULT;
/*
* Private mappings are handled in a simple way.
*/
- if (likely(!(vma->vm_flags & VM_SHARED))) {
- keys[0] = (unsigned long) mm;
- keys[1] = addr;
+ if (!(vma->vm_flags & VM_SHARED)) {
+ key->private.mm = mm;
+ key->private.uaddr = (addr & PAGE_MASK);
return 0;
}
/*
* Linear mappings are also simple.
*/
- keys[0] = (unsigned long) vma->vm_file->f_dentry->d_inode;
+ key->shared.inode = vma->vm_file->f_dentry->d_inode;
if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
- keys[1] = (((addr - vma->vm_start) >> PAGE_SHIFT)
+ key->shared.page_index
+ = (((addr - vma->vm_start) >> PAGE_SHIFT)
+ vma->vm_pgoff);
return 0;
}
@@ -151,7 +167,7 @@ static int __get_page_keys(unsigned long
 spin_lock(&current->mm->page_table_lock);
page = follow_page(mm, addr, 0);
if (likely(page != NULL)) {
- keys[1] = page->index;
+ key->shared.page_index = page->index;
 spin_unlock(&current->mm->page_table_lock);
return 0;
}
@@ -161,11 +177,11 @@ static int __get_page_keys(unsigned long
* Do it the general way.
*/
err = get_user_pages(current, mm, addr, 1, 0, 0, &page, NULL);
- if (err >= 0) {
- keys[1] = page->index;
- put_page(page);
- }
- return err;
+ if (err < 0)
+ return err;
+ key->shared.page_index = page->index;
+ put_page(page);
+ return 0;
}
@@ -176,24 +192,24 @@ static int __get_page_keys(unsigned long
static inline int futex_wake(unsigned long uaddr, int offset, int num)
{
struct list_head *i, *next, *head;
- unsigned long keys[2];
+ union hash_key key;
int ret;
 down_read(&current->mm->mmap_sem);
- ret = __get_page_keys(uaddr - offset, keys);
+ ret = get_page_key(uaddr - offset, &key);
if (unlikely(ret != 0))
goto out;
- head = hash_futex(keys[0], keys[1], offset);
+ head = hash_futex(&key, offset);
spin_lock(&futex_lock);
list_for_each_safe(i, next, head) {
struct futex_q *this = list_entry(i, struct futex_q, list);
- if (this->keys[0] == keys[0] && this->keys[1] == keys[1]
+ if (this->key.raw[0] == key.raw[0]
+ && this->key.raw[1] == key.raw[1]
&& this->offset == offset) {
-
list_del_init(i);
wake_up_all(&this->waiters);
if (this->filp)
@@ -218,28 +234,28 @@ static inline int futex_requeue(unsigned
unsigned long uaddr2, int offset2, int nr_wake, int nr_requeue)
{
struct list_head *i, *next, *head1, *head2;
- unsigned long keys1[2], keys2[2];
+ union hash_key key1, key2;
int ret;
 down_read(&current->mm->mmap_sem);
- ret = __get_page_keys(uaddr1 - offset1, keys1);
+ ret = get_page_key(uaddr1 - offset1, &key1);
if (unlikely(ret != 0))
goto out;
- ret = __get_page_keys(uaddr2 - offset2, keys2);
+ ret = get_page_key(uaddr2 - offset2, &key2);
if (unlikely(ret != 0))
goto out;
- head1 = hash_futex(keys1[0], keys1[1], offset1);
- head2 = hash_futex(keys2[0], keys2[1], offset2);
+ head1 = hash_futex(&key1, offset1);
+ head2 = hash_futex(&key2, offset2);
spin_lock(&futex_lock);
list_for_each_safe(i, next, head1) {
struct futex_q *this = list_entry(i, struct futex_q, list);
- if (this->keys[0] == keys1[0] && this->keys[1] == keys1[1]
+ if (this->key.raw[0] == key1.raw[0]
+ && this->key.raw[1] == key1.raw[1]
&& this->offset == offset1) {
-
list_del_init(i);
if (++ret <= nr_wake) {
wake_up_all(&this->waiters);
@@ -248,8 +264,7 @@ static inline int futex_requeue(unsigned
this->fd, POLL_IN);
} else {
list_add_tail(i, head2);
- this->keys[0] = keys2[0];
- this->keys[1] = keys2[1];
+ this->key = key2;
this->offset = offset2;
if (ret - nr_wake >= nr_requeue)
break;
@@ -263,14 +278,13 @@ out:
return ret;
}
-static inline void queue_me(struct futex_q *q, unsigned long *keys,
+static inline void queue_me(struct futex_q *q, union hash_key *key,
unsigned long uaddr, int offset,
int fd, struct file *filp)
{
- struct list_head *head = hash_futex(keys[0], keys[1], offset);
+ struct list_head *head = hash_futex(key, offset);
- q->keys[0] = keys[0];
- q->keys[1] = keys[1];
+ q->key = *key;
q->offset = offset;
q->fd = fd;
q->filp = filp;
@@ -301,19 +315,18 @@ static inline int futex_wait(unsigned lo
{
DECLARE_WAITQUEUE(wait, current);
int ret, curval;
- unsigned long keys[2];
+ union hash_key key;
struct futex_q q;
- try_again:
init_waitqueue_head(&q.waiters);
 down_read(&current->mm->mmap_sem);
- ret = __get_page_keys(uaddr - offset, keys);
+ ret = get_page_key(uaddr - offset, &key);
if (unlikely(ret != 0))
goto out_release_sem;
- queue_me(&q, keys, uaddr, offset, -1, NULL);
+ queue_me(&q, &key, uaddr, offset, -1, NULL);
/*
* Access the page after the futex is queued.
@@ -338,10 +351,10 @@ static inline int futex_wait(unsigned lo
/*
* There might have been scheduling since the queue_me(), as we
* cannot hold a spinlock across the get_user() in case it
- * faults. So we cannot just set TASK_INTERRUPTIBLE state when
+ * faults, and we cannot just set TASK_INTERRUPTIBLE state when
* queueing ourselves into the futex hash. This code thus has to
- * rely on the futex_wake() code doing a wakeup after removing
- * the waiter from the list.
+ * rely on the futex_wake() code removing us from hash when it
+ * wakes us up.
*/
add_wait_queue(&q.waiters, &wait);
spin_lock(&futex_lock);
@@ -364,26 +377,19 @@ static inline int futex_wait(unsigned lo
* we are the only user of it.
*/
- /*
- * Were we woken or interrupted for a valid reason?
- */
- ret = unqueue_me(&q);
- if (ret == 0)
+ /* If we were woken (and unqueued), we succeeded, whatever. */
+ if (!unqueue_me(&q))
return 0;
if (time == 0)
return -ETIMEDOUT;
if (signal_pending(current))
return -EINTR;
- /*
- * No, it was a spurious wakeup. Try again. Should never happen. :)
- */
- goto try_again;
+ /* A spurious wakeup. Should never happen. */
+ BUG();
out_unqueue:
- /*
- * Were we unqueued anyway?
- */
+ /* If we were woken (and unqueued), we succeeded, whatever. */
if (!unqueue_me(&q))
ret = 0;
out_release_sem:
@@ -426,7 +432,7 @@ static struct file_operations futex_fops
static int futex_fd(unsigned long uaddr, int offset, int signal)
{
struct futex_q *q;
- unsigned long keys[2];
+ union hash_key key;
struct file *filp;
int ret, err;
@@ -468,7 +474,7 @@ static int futex_fd(unsigned long uaddr,
}
 down_read(&current->mm->mmap_sem);
- err = __get_page_keys(uaddr - offset, keys);
+ err = get_page_key(uaddr - offset, &key);
 up_read(&current->mm->mmap_sem);
if (unlikely(err != 0)) {
@@ -481,7 +487,7 @@ static int futex_fd(unsigned long uaddr,
init_waitqueue_head(&q->waiters);
filp->private_data = q;
- queue_me(q, keys, uaddr, offset, ret, filp);
+ queue_me(q, &key, uaddr, offset, ret, filp);
/* Now we map fd to filp, so userspace can access it */
fd_install(ret, filp);
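To spell out what the jhash2() call above is doing: the union is treated as
an array of u32 words and mixed with the page offset as the hash seed, then
masked down to a bucket index. A stand-alone restatement (the names
futex_bucket and SKETCH_HASHBITS are made up here; the real code uses
FUTEX_HASHBITS):

	#include <linux/jhash.h>

	#define SKETCH_HASHBITS 8	/* matches FUTEX_HASHBITS above */

	static unsigned long futex_bucket(const union hash_key *key, int offset)
	{
		/* jhash2() consumes whole u32 words; offset seeds the hash. */
		u32 h = jhash2((u32 *)key, sizeof(*key) / sizeof(u32), offset);

		return h & ((1 << SKETCH_HASHBITS) - 1);
	}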
In message <[email protected]> you write:
> What does PROT_SEM mean for Linux, btw?
It's a relic: some archs might need a special flag to ensure
inter-process atomic ops worked as expected. It was never fully
implemented, with the assumption that such archs just won't be able to
use futexes, and if someone really wants to fix it, they will.
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Rusty Russell wrote:
> In message <[email protected]> you write:
> > What does PROT_SEM mean for Linux, btw?
>
> It's a relic: some archs might need a special flag to ensure
> inter-process atomic ops worked as expected. It was never fully
~~~~~~~~~~~
> implemented, with the assumption that such archs just won't be able to
~~~~~~~~~~~
> use futexes, and if someone really wants to fix it, they will.
Looking at the kernel I can confidently say it was never implemented
at all. sys_mprotect mentioned PROT_SEM but it was a misleading logic
error that did nothing useful :)
-- Jamie
Rusty Russell wrote:
> Now, if mremap doesn't move the memory, futexes aren't broken, even
> without your patch, right? If it does move, you've got a futex
> sitting in invalid memory, no surprise if it doesn't work.
If the mremap doesn't move the memory it's fine. No surprise :)
If it's moved, then the program isn't broken - it knows it just did an
mremap, and it sends the wakeup to the new address.
This makes sense if async futexes are used on an in-memory private
database. But such programs can just use MAP_ANON|MAP_SHARED if they
want mremap to work.
> OTOH, I'm interested in returning EFAULT on waiters when pages are
> unmapped, because I realized that stale waiters could "match" live
> futex wakeups (an mm_struct gets recycled), and steal the wakeup. Bad
> juju. We could do some uid check or something for anon pages, but
> cleaner to flush them at unmap.
Ah, you're right. Not fixing that is a serious bug.
It can happen when an async futex fd is passed to another process.
Not only can the mm_struct be recycled, it might be recycled into an
inode so it could match a file futex too.
This can be fixed more simply than the full do_unmap patch I posted
earlier, by invalidating all the futexes in an mm when it is destroyed.
Another fix would be to prevent futex fds of private mappings being
passed to another process, somehow.
It must be fixed somehow.
Linus, which patch do you prefer? Invalidate all futexes in an mm
when it's destroyed, or invalidate ranges in do_munmap?
-- Jamie
ps. There's another bug: shared waiters match inodes, which they don't
hold a reference to. Inodes can be recycled too. Fix is easy: just
need to take an inode reference.
In message <[email protected]> you write:
> Rusty Russell wrote:
> > Now, if mremap doesn't move the memory, futexes aren't broken, even
> > without your patch, right? If it does move, you've got a futex
> > sitting in invalid memory, no surprise if it doesn't work.
>
> If the mremap doesn't move the memory it's fine. No surprise :)
>
> If it's moved, then the program isn't broken - it knows it just did an
> mremap, and it sends the wakeup to the new address.
>
> This makes sense if async futexes are used on an in-memory private
> database. But such programs can just use MAP_ANON|MAP_SHARED if they
> want mremap to work.
wakeup is not a problem: from the kernel's POV, between wakeups the
futex doesn't exist.
The only real case (ignoring the "one thread FUTEX_WAIT while the
other mremaps underneath" case, which is gonna break anyway) is FUTEX_FD. I
don't see a problem with having to manually move your futex fds in
this case when the memory underneath them has been remapped. In fact,
it'd be surprising if you didn't have to.
> > OTOH, I'm interested in returning EFAULT on waiters when pages are
> > unmapped, because I realized that stale waiters could "match" live
> Ah, you're right. Not fixing that is a serious bug.
> It can happen when an async futex fd is passed to another process.
> ps. There's another bug: shared waiters match inodes, which they don't
> hold a reference to. Inodes can be recycled too. Fix is easy: just
> need to take an inode reference.
Yes. Invalidate is nice because it catches a programmer mistake. But
why not solve the problem by just holding an mm reference, too?
Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
btw., regarding this fix:
[email protected], 2003-09-06 12:28:20-07:00, [email protected]
[PATCH] Fix futex hashing bugs
why dont we do this:
} else {
/* Make sure to stop if key1 == key2 */
if (head1 == head2)
break;
list_add_tail(i, head2);
this->key = key2;
if (ret - nr_wake >= nr_requeue)
break;
}
instead of the current:
} else {
list_add_tail(i, head2);
this->key = key2;
if (ret - nr_wake >= nr_requeue)
break;
/* Make sure to stop if key1 == key2 */
if (head1 == head2 && head1 != next)
head1 = i;
}
what's the point in requeueing once, and then exiting the loop by changing
the loop exit condition variable? You are trying to avoid the lockup but
the first one ought to be the most straightforward way to do it.
Ingo
On Sun, 7 Sep 2003, Ingo Molnar wrote:
>
> btw., regarding this fix:
>
> [email protected], 2003-09-06 12:28:20-07:00, [email protected]
> [PATCH] Fix futex hashing bugs
>
> why dont we do this:
>
> } else {
> /* Make sure to stop if key1 == key2 */
> if (head1 == head2)
> break;
> list_add_tail(i, head2);
> this->key = key2;
> if (ret - nr_wake >= nr_requeue)
> break;
> }
>
> instead of the current:
>
> } else {
> list_add_tail(i, head2);
> this->key = key2;
> if (ret - nr_wake >= nr_requeue)
> break;
> /* Make sure to stop if key1 == key2 */
> if (head1 == head2 && head1 != next)
> head1 = i;
> }
>
> what's the point in requeueing once, and then exiting the loop by changing
> the loop exit condition variable? You are trying to avoid the lockup but
> the first one ought to be the most straightforward way to do it.
I think you're reading it as a "list_for_each(i, head1)" loop,
whereas it is and must be a "list_for_each_safe(i, next, head1)" loop.
So it won't (in general) terminate after this one requeueing (as
list_for_each would, finding i->next == head1): termination depends on
next (already set) and head1, so I repoint head1 to the first requeued.
So it should terminate after one pass down the list, when it reaches the
first requeued, and can then return the appropriate "ret" count to user.
You may perhaps know that the ret count is not important, but I don't
know that, so I wanted to get it right. (At the time, I also wanted to
have the list sorted exactly as intended, but now I can't see that the
relative positions of different keys could matter at all.)
It may be bad practice to use a familiar macro like list_for_each_safe,
yet play with its controlling variables within the loop. I just felt
safer that way than expanding it, or adding extraneous variables.
Hugh
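For readers puzzling over the trick: list_for_each_safe() (from
include/linux/list.h, roughly as it looked at the time) pre-fetches the next
pointer before the body runs and re-tests the cursor against the head
expression on every iteration:

	#define list_for_each_safe(pos, n, head) \
		for (pos = (head)->next, n = pos->next; pos != (head); \
			pos = n, n = pos->next)

So list_del()/list_add() on pos inside the body cannot derail the walk
(n was already taken), while assigning head1 = i moves the stopping point
to the first requeued entry, which is exactly what Hugh relies on.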
Ingo Molnar wrote:
> btw., regarding this fix:
>
> [email protected], 2003-09-06 12:28:20-07:00, [email protected]
> [PATCH] Fix futex hashing bugs
>
> why dont we do this:
>
> } else {
> /* Make sure to stop if key1 == key2 */
> if (head1 == head2)
> break;
> list_add_tail(i, head2);
> this->key = key2;
> if (ret - nr_wake >= nr_requeue)
> break;
> }
>
> instead of the current:
>
> } else {
> list_add_tail(i, head2);
> this->key = key2;
> if (ret - nr_wake >= nr_requeue)
> break;
> /* Make sure to stop if key1 == key2 */
> if (head1 == head2 && head1 != next)
> head1 = i;
> }
>
> what's the point in requeueing once, and then exiting the loop by changing
> the loop exit condition variable?
Hugh's patch is clever and subtle. It doesn't exit the loop; the
loop continues from "next".
What it does is change the end condition so that the loop stops just
before the first requeued futex. Let's call that one REQUEUED1.
If there are other futexes to requeue, they are inserted after REQUEUED1
(because head2 wasn't changed), yet the end condition _isn't_ changed
when this happens, because now head1 != head2.
This causes the correct number of futexes to be requeued at the end of
the wait list.
> You are trying to avoid the lockup but the first one ought to be the
> most straightforward way to do it.
Hugh's patch returns the correct retval _and_ requeues the correct
number of waiters to the end of the queue. And it does it without
fancy code.
Remember that the waiter order is visible to userspace - it's used by
"fair" operations, so it's appropriate that requeuing a futex to
itself moves nr_requeues waiters to the end of the queue, just like it
does when it requeues to a different futex.
If the code to handle that were complicated, I'd vote for dropping it.
But Hugh's patch does exactly the right thing in a simple way. Lovely!
-- Jamie
Rusty Russell wrote:
> In message <[email protected]> you write:
> > Rusty Russell wrote:
> > > Now, if mremap doesn't move the memory, futexes aren't broken, even
> > > without your patch, right? If it does move, you've got a futex
> > > sitting in invalid memory, no surprise if it doesn't work.
> >
> > If the mremap doesn't move the memory it's fine. No surprise :)
> >
> > If it's moved, then the program isn't broken - it knows it just did an
> > mremap, and it sends the wakeup to the new address.
> >
> > This makes sense if async futexes are used on an in-memory private
> > database. But such programs can just use MAP_ANON|MAP_SHARED if they
> > want mremap to work.
>
> The only real case (ignoring the "one thread FUTEX_WAIT while the
> other mremaps underneath" which is gonna break anyway), is FUTEX_FD,
By "async futex" I mean FUTEX_FD; sorry if that wasn't clear.
By "sync futex" I mean FUTEX_WAIT.
> I don't see a problem with having to manually move your futex fds in
> this case when the memory underneath them has been remapped. In
> fact, it'd be surprising if you didn't have to.
I don't see a problem, as long as it is documented. Anybody grokking
the old futex code would expect futexes to move with mappings.
It's a mild surprise whichever behaviour you choose, because:
- if it's a MAP_SHARED, then the program doesn't have to move futexes,
and this is a good thing (think locks in a remap_file_pages
database mapping).
- the old (page pinning) version moves FUTEX_FD futexes "automatically",
in the sense that they're attached to the page which moves.
> Yes. Invalidate is nice because it catches a programmer mistake. But
> why not solve the problem by just holding an mm reference, too?
That would work. An mm isn't that huge once everything's been
unmapped by exit. Alternatively, mm-private futexes can be woken when
the mm is destroyed.
I just implemented the latter, but come to think of it a reference to
a dead mm is light enough not to bother with a list of "futexes
attached to mm to destroy on exit".
So I'll throw that away and provide a patch which just takes a reference.
(Also, takes inode references).
-- Jamie
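A minimal sketch of what "just takes a reference" could look like for the
two key kinds, using the union hash_key from the earlier patch (the helper
names and the explicit shared flag are made up for illustration; this is not
the actual follow-up patch):

	#include <linux/sched.h>
	#include <linux/fs.h>

	/* Pin whatever the key points at, so a recycled mm or inode can
	 * never "match" a stale waiter. */
	static int get_key_refs(union hash_key *key, int shared)
	{
		if (shared)
			/* igrab() fails if the inode is already being freed. */
			return igrab(key->shared.inode) ? 0 : -EFAULT;
		/* mm_count keeps the mm_struct itself alive past exit_mmap(). */
		atomic_inc(&key->private.mm->mm_count);
		return 0;
	}

	static void drop_key_refs(union hash_key *key, int shared)
	{
		if (shared)
			iput(key->shared.inode);
		else
			mmdrop(key->private.mm);
	}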
Hugh Dickins wrote:
> You may perhaps know that the ret count is not important, but I don't
> know that, so wanted to get it right. (At the time, I also wanted to
> have the list sorted exactly as intended, but now I can't see that the
> relative positions of different keys could matter at all.)
The position of different keys doesn't matter, but the relative
position of identical keys does.
-- jamie
In message <[email protected]> you write:
> I don't see a problem, as long as it is documented. Anybody grokking
> the old futex code would expect futexes to move with mappings.
BTW, I don't know of anyone *doing* this, but IMHO it's not worth a
single line of kernel code, since if you don't adjust your futex
addresses when you mremap, the try_down_futex will segv after the poll
or whatever. As a programmer, I would *expect* to have to reset the
futexes (along with every other pointer into the map) when mremap
happens: after all, I told the kernel to watch the old address. If it
still works, great, but I'd not expect it.
> > Yes. Invalidate is nice because it catches a programmer mistake. But
> > why not solve the problem by just holding an mm reference, too?
>
> That would work. An mm isn't that huge once everything's been
> unmapped by exit. Alternatively, mm-private futexes can be woken when
> the mm is destroyed.
>
> I just implemented the latter, but come to think of it a reference to
> a dead mm is light enough not to bother with a list of "futexes
> attached to mm to destroy on exit".
>
> So I'll throw that away and provide a patch which just takes a reference.
> (Also, takes inode references).
Agreed. You can get a page per 2 fds trivially anyway with pipes, so
staying within that bound is fairly safe.
Cheers!
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
In message <[email protected]> you write:
> On Sun, 7 Sep 2003, Ingo Molnar wrote:
> >
> > btw., regarding this fix:
> >
> > [email protected], 2003-09-06 12:28:20-07:00, [email protected]
> > [PATCH] Fix futex hashing bugs
> >
> > why dont we do this:
> >
> > } else {
> > /* Make sure to stop if key1 == key2 */
> > if (head1 == head2)
> > break;
> > list_add_tail(i, head2);
> > this->key = key2;
> > if (ret - nr_wake >= nr_requeue)
> > break;
> > }
Why not make the code a *whole* lot more readable (and only marginally
slower, if at all) by doing it in two passes: pull them off onto an
(on-stack) list in one pass, then requeue them all in another.
This on top of Hugh's patch on top of Jamie's. Untested, but you get
the idea...
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Name: FUTEX_REQUEUE simplification
Author: Rusty Russell
Status: Booted on 2.6.0-test4-bk9
Depends: Misc/futex-hugh.patch.gz
D: Simplify the logic of FUTEX_REQUEUE.
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .32421-linux-2.6.0-test4-bk9/kernel/futex.c .32421-linux-2.6.0-test4-bk9.updated/kernel/futex.c
--- .32421-linux-2.6.0-test4-bk9/kernel/futex.c 2003-09-08 10:44:26.000000000 +1000
+++ .32421-linux-2.6.0-test4-bk9.updated/kernel/futex.c 2003-09-08 11:24:15.000000000 +1000
@@ -253,8 +253,10 @@ out:
static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
int nr_wake, int nr_requeue)
{
- struct list_head *i, *next, *head1, *head2;
+ struct list_head *head1, *head2;
+ struct futex_q *this, *next;
union futex_key key1, key2;
+ LIST_HEAD(moved);
int ret;
 down_read(&current->mm->mmap_sem);
@@ -270,27 +272,29 @@ static int futex_requeue(unsigned long u
head2 = hash_futex(&key2);
spin_lock(&futex_lock);
- list_for_each_safe(i, next, head1) {
- struct futex_q *this = list_entry(i, struct futex_q, list);
-
- if (match_futex (&this->key, &key1)) {
- list_del_init(i);
+ list_for_each_entry_safe(this, next, head1, list) {
+ if (match_futex(&this->key, &key1)) {
if (++ret <= nr_wake) {
+ list_del_init(&this->list);
wake_up_all(&this->waiters);
if (this->filp)
send_sigio(&this->filp->f_owner,
this->fd, POLL_IN);
} else {
- list_add_tail(i, head2);
- this->key = key2;
+ /* Dequeue. */
+ list_del(&this->list);
+ list_add(&this->list, &moved);
if (ret - nr_wake >= nr_requeue)
break;
- /* Make sure to stop if key1 == key2 */
- if (head1 == head2 && head1 != next)
- head1 = i;
}
}
}
+
+ /* Requeue */
+ list_for_each_entry_safe(this, next, &moved, list) {
+ list_del(&this->list);
+ list_add_tail(&this->list, head2);
+ }
spin_unlock(&futex_lock);
out:
On Sun, 7 Sep 2003, Jamie Lokier wrote:
> Hugh's patch is clever and subtle. It doesn't exit the loop; the loop
> continues from "next".
ugh. It would be much cleaner to simply do a list_add() instead of a
list_add_tail(). (the ordering of the queue doesn't matter anyway)
Ingo
Rusty Russell wrote:
> In message <[email protected]> you write:
> > I don't see a problem, as long as it is documented. Anybody grokking
> > the old futex code would expect futexes to move with mappings.
>
> BTW, I don't know of anyone *doing* this, but IMHO it's not worth a
> single line of kernel code, since if you don't adjust your futex
> addresses when you mremap, the try_down_futex will segv after the poll
> or whatever. As a programmer, I would *expect* to have to reset the
> futexes (along with every other pointer into the map) when mremap
> happens: after all, I told the kernel to watch the old address. If it
> still works, great, but I'd not expect it.
Sure. As long as it's documented, because my expectation is the
opposite of yours :)
(Some uses of futex don't read the memory after they are woken, until
they have re-tested some other condition and can recalculate the
address, so segv and pointers-into-the-map don't occur in these uses).
-- Jamie
Ingo Molnar wrote:
> > Hugh's patch is clever and subtle. It doesn't exit the loop; the loop
> > continues from "next".
>
> ugh. It would be much cleaner to simply do a list_add() instead of a
> list_add_tail(). (the ordering of the queue doesn't matter anyway)
Why do you say the order doesn't matter? If you change the order in
FUTEX_WAIT & FUTEX_WAKE, then "fair" operations aren't fair any more.
Is there a reason why FUTEX_REQUEUE is exempt from this?
-- Jamie
On Mon, 8 Sep 2003, Jamie Lokier wrote:
> Why do you say the order doesn't matter? If you change the order in
> FUTEX_WAIT & FUTEX_WAKE, then "fair" operations aren't fair any more.
hm, indeed, the ordering of wake-one/wake-few wakeups would be impacted.
> Is there a reason why FUTEX_REQUEUE is exempt from this?
no, you are right - FIFO queueing must be preserved there too.
Ingo
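To make the FIFO point concrete: list_add() links the new entry right after
the list head, while list_add_tail() links it just before the head, i.e. at
the tail. With hypothetical waiters A and B already queued in that order,
requeueing a third waiter C gives:

	/* either: */
	list_add(&c->list, head2);	/* head2: C, A, B -- C jumps the queue */
	/* or: */
	list_add_tail(&c->list, head2);	/* head2: A, B, C -- FIFO preserved */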