Generic speculative fault handler, tries to service a pagefault
without holding mmap_sem.
Signed-off-by: Peter Zijlstra <[email protected]>
---
include/linux/mm.h | 2 +
mm/memory.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 60 insertions(+), 1 deletion(-)
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -1998,7 +1998,7 @@ again:
if (!*ptep)
goto out;
- if (vma_is_dead(vma, seq))
+ if (vma && vma_is_dead(vma, seq))
goto unlock;
unpin_page_tables();
@@ -3112,6 +3112,63 @@ int handle_mm_fault(struct mm_struct *mm
return handle_pte_fault(mm, vma, address, entry, pmd, flags, 0);
}
+int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
+ unsigned int flags)
+{
+ pmd_t *pmd = NULL;
+ pte_t *pte, entry;
+ spinlock_t *ptl;
+ struct vm_area_struct *vma;
+ unsigned int seq;
+ int ret = VM_FAULT_RETRY;
+ int dead;
+
+ __set_current_state(TASK_RUNNING);
+ flags |= FAULT_FLAG_SPECULATIVE;
+
+ count_vm_event(PGFAULT);
+
+ rcu_read_lock();
+ if (!pte_map_lock(mm, NULL, address, pmd, flags, 0, &pte, &ptl))
+ goto out_unlock;
+
+ vma = find_vma(mm, address);
+
+ if (!vma)
+ goto out_unmap;
+
+ dead = RB_EMPTY_NODE(&vma->vm_rb);
+ seq = vma->vm_sequence.sequence;
+ /*
+ * Matches both the wmb in write_seqcount_begin/end() and
+ * the wmb in detach_vmas_to_be_unmapped()/__unlink_vma().
+ */
+ smp_rmb();
+ if (dead || seq & 1)
+ goto out_unmap;
+
+ if (!(vma->vm_end > address && vma->vm_start <= address))
+ goto out_unmap;
+
+ if (read_seqcount_retry(&vma->vm_sequence, seq))
+ goto out_unmap;
+
+ entry = *pte;
+
+ pte_unmap_unlock(pte, ptl);
+
+ ret = handle_pte_fault(mm, vma, address, entry, pmd, flags, seq);
+
+out_unlock:
+ rcu_read_unlock();
+ return ret;
+
+out_unmap:
+ pte_unmap_unlock(pte, ptl);
+ goto out_unlock;
+}
+
+
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -829,6 +829,8 @@ int invalidate_inode_page(struct page *p
#ifdef CONFIG_MMU
extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
+extern int handle_speculative_fault(struct mm_struct *mm,
+ unsigned long address, unsigned int flags);
#else
static inline int handle_mm_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
--
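The arch-side hookup that calls this new entry point lives in other patches of
the series and is not quoted in this thread. Purely as an illustrative sketch,
with an invented function name and with the bad_area/stack-expansion handling
omitted, a caller could look roughly like this:

/*
 * Illustrative sketch only: try the speculative path first and fall back to
 * the classic mmap_sem-protected path on VM_FAULT_RETRY. Error handling and
 * stack expansion are omitted; the real glue is in other patches.
 */
static int fault_speculative_then_locked(struct mm_struct *mm,
					 unsigned long address,
					 unsigned int flags)
{
	struct vm_area_struct *vma;
	int fault;

	fault = handle_speculative_fault(mm, address, flags);
	if (!(fault & VM_FAULT_RETRY))
		return fault;			/* serviced without mmap_sem */

	down_read(&mm->mmap_sem);		/* slow path, as before */
	vma = find_vma(mm, address);
	if (!vma) {
		up_read(&mm->mmap_sem);
		return VM_FAULT_SIGBUS;		/* real code goes via bad_area */
	}
	fault = handle_mm_fault(mm, vma, address, flags);
	up_read(&mm->mmap_sem);
	return fault;
}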
On Mon, 04 Jan 2010 19:24:35 +0100
Peter Zijlstra <[email protected]> wrote:
> Generic speculative fault handler, tries to service a pagefault
> without holding mmap_sem.
>
> Signed-off-by: Peter Zijlstra <[email protected]>
I'm sorry if I'm missing something... how does this patch series avoid
the vma being removed while __do_fault()->vma->vm_ops->fault() is called?
("vma is removed" includes everything else too, such as freeing the file struct etc.)
Thanks,
-Kame
On Tue, 5 Jan 2010, KAMEZAWA Hiroyuki wrote:
>
> I'm sorry if I'm missing something... how does this patch series avoid
> the vma being removed while __do_fault()->vma->vm_ops->fault() is called?
> ("vma is removed" includes everything else too, such as freeing the file struct etc.)
I don't think you're missing anything.
Protecting the vma isn't enough. You need to protect the whole FS stack
with rcu. Probably by moving _all_ of "free_vma()" into the RCU path
(which means that the whole file/inode gets de-allocated at that later RCU
point, rather than synchronously). Not just the actual kfree.
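Concretely, the naive version of that would look something like this (the
rcu_head field and function names are invented, just for illustration):

/*
 * Sketch only: defer the whole of remove_vma() to an RCU callback. Assumes
 * vm_area_struct grew an rcu_head (vm_rcu). Note that vm_ops->close() and
 * fput() can sleep, which a plain RCU callback cannot do.
 */
static void remove_vma_rcu_cb(struct rcu_head *head)
{
	struct vm_area_struct *vma =
		container_of(head, struct vm_area_struct, vm_rcu);

	if (vma->vm_ops && vma->vm_ops->close)
		vma->vm_ops->close(vma);
	if (vma->vm_file)
		fput(vma->vm_file);	/* the file stays busy until now */
	mpol_put(vma_policy(vma));
	kmem_cache_free(vm_area_cachep, vma);
}

static void remove_vma_deferred(struct vm_area_struct *vma)
{
	call_rcu(&vma->vm_rcu, remove_vma_rcu_cb);
}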
However, it's worth noting that that actually has some very subtle and
major consequences. If you have a temporary file that was removed, where
the mmap() was the last user that kind of delayed freeing would also delay
the final fput of that file that actually deletes it.
Or put another way: if the vma was a writable mapping, a user may do
munmap(mapping, size);
and the backing file is still active and writable AFTER THE MUNMAP! This
can be a huge problem for something that wants to unmount the volume, for
example, or depends on the whole writability-vs-executability thing. The
user may have unmapped it, and expects the file to be immediately
non-busy, but with the delayed free that isn't the case any more.
In other words, now you may well need to make munmap() wait for the RCU
grace period, so that the user who did the unmap really is synchronous wrt
the file accesses. We've had things like that before, and they have been
_huge_ performance problems (ie it may take just a timer tick or two, but
then people do tens of thousands of munmaps, and now that takes many
seconds just due to RCU grace period waiting.
I would say that this whole series is _very_ far from being mergeable.
Peter seems to have been thinking about the details, while missing all the
subtle big picture effects that seem to actually change semantics.
Linus
Hi, Kame.
On Tue, Jan 5, 2010 at 9:25 AM, KAMEZAWA Hiroyuki
<[email protected]> wrote:
> On Mon, 04 Jan 2010 19:24:35 +0100
> Peter Zijlstra <[email protected]> wrote:
>
>> Generic speculative fault handler, tries to service a pagefault
>> without holding mmap_sem.
>>
>> Signed-off-by: Peter Zijlstra <[email protected]>
>
>
> I'm sorry if I'm missing something... how does this patch series avoid
> the vma being removed while __do_fault()->vma->vm_ops->fault() is called?
> ("vma is removed" includes everything else too, such as freeing the file struct etc.)
Isn't it protected by get_file and iget?
Am I missing something?
--
Kind regards,
Minchan Kim
On Tue, 5 Jan 2010 13:29:40 +0900
Minchan Kim <[email protected]> wrote:
> Hi, Kame.
>
> On Tue, Jan 5, 2010 at 9:25 AM, KAMEZAWA Hiroyuki
> <[email protected]> wrote:
> > On Mon, 04 Jan 2010 19:24:35 +0100
> > Peter Zijlstra <[email protected]> wrote:
> >
> >> Generic speculative fault handler, tries to service a pagefault
> >> without holding mmap_sem.
> >>
> >> Signed-off-by: Peter Zijlstra <[email protected]>
> >
> >
> > I'm sorry if I'm missing something... how does this patch series avoid
> > the vma being removed while __do_fault()->vma->vm_ops->fault() is called?
> > ("vma is removed" includes everything else too, such as freeing the file struct etc.)
>
> Isn't it protected by get_file and iget?
> Am I missing something?
>
Only the kmem_cache_free() part of the following code is modified by the patch.
==
229 static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
230 {
231 struct vm_area_struct *next = vma->vm_next;
232
233 might_sleep();
234 if (vma->vm_ops && vma->vm_ops->close)
235 vma->vm_ops->close(vma);
236 if (vma->vm_file) {
237 fput(vma->vm_file);
238 if (vma->vm_flags & VM_EXECUTABLE)
239 removed_exe_file_vma(vma->vm_mm);
240 }
241 mpol_put(vma_policy(vma));
242 kmem_cache_free(vm_area_cachep, vma);
243 return next;
244 }
==
Then, fput() can be called. The whole of the above code should be delayed until an RCU
grace period if we use RCU here.
Then, my patch dropped speculative trial of page fault and did synchronous
job here. I'm still considering how to insert some barrier to delay calling
remove_vma() until all page fault goes. One idea was reference count but
it was said not-enough crazy.
Thanks,
-Kame
On Tue, 5 Jan 2010, Minchan Kim wrote:
>
> Isn't it protected by get_file and iget?
When the vma is mapped, yes.
> Am I missing something?
remove_vma() will have done a
fput(vma->vm_file);
and other house-keeping (removing the executable info, doing
vm_ops->close() etc).
And that is _not_ done delayed by RCU, and as outlined in my previous
email I think that if the code really _does_ delay it, then munmap() (and
exit) need to wait for the RCU callbacks to have been done, because
otherwise the file may end up being busy "asynchronously" in ways that
break existing semantics.
Just as an example: imagine a script that does "fork()+execve()" on a
temporary file, and then after waiting for it all to finish with wait4()
does some re-write of the file. It currently works. But what if the
open-for-writing gets ETXTBUSY because the file is still marked as being
VM_DENYWRITE, and RCU hasn't done all the callbacks?
Or if you do the i_writecount handling synchronously (which is likely fine
- it really is just for ETXTBUSY handling, and I don't think speculative
page faults matter), what about a shutdown sequence (or whatever) that
wants to unmount the filesystem, but the file is still open - as it has to
be - because the actual close is delayed by RCU.
So the patch-series as-is is fundamentally buggy - and trying to fix it
seems painful.
I'm also not entirely clear on how the race with page table tear-down vs
page-fault got handled, but I didn't read the whole patch-series very
carefully. I skimmed through it and got rather nervous about it all. It
doesn't seem too large, but it _does_ seem rather cavalier about all the
object lifetimes.
Linus
On Tue, 5 Jan 2010, KAMEZAWA Hiroyuki wrote:
>
> Then, my patch dropped speculative trial of page fault and did synchronous
> job here. I'm still considering how to insert some barrier to delay calling
> remove_vma() until all page fault goes. One idea was reference count but
> it was said not-enough crazy.
What lock would you use to protect the vma lookup (in order to then
increase the refcount)? A sequence lock with RCU lookup of the vma?
Sounds doable. But it also sounds way more expensive than the current VM
fault handling, which is pretty close to optimal for single-threaded
cases.. That RCU lookup might be cheap, but just the refcount is generally
going to be as expensive as a lock.
Are there some particular mappings that people care about more than
others? If we limit the speculative lookup purely to anonymous memory,
that might simplify the problem space?
[ From past experiences, I suspect DB people would be upset and really
want it for the general file mapping case.. But maybe the main usage
scenario is something else this time? ]
Linus
On Mon, 4 Jan 2010 21:10:29 -0800 (PST)
Linus Torvalds <[email protected]> wrote:
>
>
> On Tue, 5 Jan 2010, KAMEZAWA Hiroyuki wrote:
> >
> > Then, my patch dropped speculative trial of page fault and did synchronous
> > job here. I'm still considering how to insert some barrier to delay calling
> > remove_vma() until all page fault goes. One idea was reference count but
> > it was said not-enough crazy.
>
> What lock would you use to protect the vma lookup (in order to then
> increase the refcount)? A sequence lock with RCU lookup of the vma?
>
Ah, I just used a reference counter to show "how many threads are in a
page fault against this vma now". Below is from my post.
==
+ rb_node = rcu_dereference(rb_node->rb_left);
+ } else
+ rb_node = rcu_dereference(rb_node->rb_right);
+ }
+ if (vma) {
+ if ((vma->vm_start <= addr) && (addr < vma->vm_end)) {
+ if (!atomic_inc_not_zero(&vma->refcnt))
+ vma = NULL;
+ } else
+ vma = NULL;
+ }
+ rcu_read_unlock();
...
+void vma_put(struct vm_area_struct *vma)
+{
+ if ((atomic_dec_return(&vma->refcnt) == 1) &&
+ waitqueue_active(&vma->wait_queue))
+ wake_up(&vma->wait_queue);
+ return;
+}
==
And we wait for this reference count to drop to a safe value before calling
remove_vma():
==
+/* called when vma is unlinked and wait for all racy access.*/
+static void invalidate_vma_before_free(struct vm_area_struct *vma)
+{
+ atomic_dec(&vma->refcnt);
+ wait_event(vma->wait_queue, !atomic_read(&vma->refcnt));
+}
+
....
* us to remove next before dropping the locks.
*/
__vma_unlink(mm, next, vma);
+ invalidate_vma_before_free(next);
if (file)
__remove_shared_vm_struct(next, file, mapping);
etc....
==
The above code is a bit heavy (and buggy). I have some fixes.
> Sounds doable. But it also sounds way more expensive than the current VM
> fault handling, which is pretty close to optimal for single-threaded
> cases.. That RCU lookup might be cheap, but just the refcount is generally
> going to be as expensive as a lock.
>
For single-threaded apps, my patch will have no benefits
(but will not make anything worse).
I'll add a CONFIG option, and I wonder whether I can enable the speculative vma lookup
only after mm_struct is shared (but the patch may be messy...).
> Are there some particular mappings that people care about more than
> others? If we limit the speculative lookup purely to anonymous memory,
> that might simplify the problem space?
>
I wonder whether, for ordinary people who don't write highly optimized programs,
a small benefit of skipping mmap_sem is to reduce the mmap_sem ping-pong
after doing fork()->exec(). That ping-pong can cause some jitter in the application.
So, I'd be glad if I can help file-backed vmas.
> [ From past experiences, I suspect DB people would be upset and really
> want it for the general file mapping case.. But maybe the main usage
> scenario is something else this time? ]
>
I'd like to hear use cases of really heavy users, too. Christoph ?
Thanks,
-Kame
On Tue, Jan 5, 2010 at 1:43 PM, KAMEZAWA Hiroyuki
<[email protected]> wrote:
> On Tue, 5 Jan 2010 13:29:40 +0900
> Minchan Kim <[email protected]> wrote:
>
>> Hi, Kame.
>>
>> On Tue, Jan 5, 2010 at 9:25 AM, KAMEZAWA Hiroyuki
>> <[email protected]> wrote:
>> > On Mon, 04 Jan 2010 19:24:35 +0100
>> > Peter Zijlstra <[email protected]> wrote:
>> >
>> >> Generic speculative fault handler, tries to service a pagefault
>> >> without holding mmap_sem.
>> >>
>> >> Signed-off-by: Peter Zijlstra <[email protected]>
>> >
>> >
>> > I'm sorry if I'm missing something... how does this patch series avoid
>> > the vma being removed while __do_fault()->vma->vm_ops->fault() is called?
>> > ("vma is removed" includes everything else too, such as freeing the file struct etc.)
>>
>> Isn't it protected by get_file and iget?
>> Am I missing something?
>>
> Only the kmem_cache_free() part of the following code is modified by the patch.
That's what I missed. Thanks, Kame. ;-)
--
Kind regards,
Minchan Kim
Hi, Linus.
On Tue, Jan 5, 2010 at 1:48 PM, Linus Torvalds
<[email protected]> wrote:
Thanks for the careful explanation, Linus.
My humble opinion is as follows.
Couldn't we synchronize RCU in those cases (munmap, exit and so on)?
It can delay munmap and exit, but it would be better than handling them with more
complicated things, I think. And both cases aren't frequent, so wouldn't we
gain more advantage than disadvantage?
--
Kind regards,
Minchan Kim
On Tue, 5 Jan 2010 15:09:47 +0900
Minchan Kim <[email protected]> wrote:
> My humble opinion is as follows.
>
> Couldn't we synchronize RCU in those cases (munmap, exit and so on)?
> It can delay munmap and exit, but it would be better than handling them with more
> complicated things, I think. And both cases aren't frequent, so wouldn't we
> gain more advantage than disadvantage?
>
In most cases, a program is single-threaded. And synchronize_rcu() in the unmap path
just adds very big overhead.
Thanks,
-Kame
On Tue, Jan 5, 2010 at 3:09 PM, KAMEZAWA Hiroyuki
<[email protected]> wrote:
> On Tue, 5 Jan 2010 15:09:47 +0900
> Minchan Kim <[email protected]> wrote:
>> My humble opinion is as follows.
>>
>> Couldn't we synchronize RCU in those cases (munmap, exit and so on)?
>> It can delay munmap and exit, but it would be better than handling them with more
>> complicated things, I think. And both cases aren't frequent, so wouldn't we
>> gain more advantage than disadvantage?
>>
>
> In most cases, a program is single-threaded. And synchronize_rcu() in the unmap path
> just adds very big overhead.
Yes.
I suggested that you consider single-threaded apps' regression, please. :)
The first thing that comes to my mind is that we could count the number of threads.
Yes, the thread count is not a good choice.
As a matter of fact, I want it to work adaptively:
if the process starts to have many threads, speculative page fault turns
on or off.
I know it's not easy. I hope other guys have good ideas.
>
> Thanks,
> -Kame
>
>
--
Kind regards,
Minchan Kim
On Tue, 5 Jan 2010 14:30:46 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:
> ==
> The above code is a bit heavy (and buggy). I have some fixes.
>
Here is my latest one. But I myself think this should be improved more,
and I don't think this patch is bug-free. I have some concerns around
fork() etc., and am still considering how to avoid atomic ops.
I post this just as a reference, rather than a cut-out from old e-mails.
Here is a comparison before/after the patch. A test program does the following
in multiple threads (attached):
    while (1) {
        touch memory (with write)
        pthread_barrier_wait()
        fork() if I'm the first thread
            -> exit() immediately
        wait()
        pthread_barrier_wait()
    }
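A rough userspace reconstruction of that loop (the real attachment is not
included here; thread count, buffer size and other details are guesses) could
look like this:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#define NTHREADS 8
#define CHUNK    (4UL * 1024 * 1024)

static pthread_barrier_t barrier;

static void *worker(void *arg)
{
	long id = (long)arg;
	char *buf = malloc(CHUNK);

	for (;;) {				/* runs until interrupted */
		memset(buf, id, CHUNK);		/* touch memory (with write) */
		pthread_barrier_wait(&barrier);
		if (id == 0) {			/* first thread forks ... */
			pid_t pid = fork();
			if (pid == 0)
				_exit(0);	/* ... child exits immediately */
			waitpid(pid, NULL, 0);
		}
		pthread_barrier_wait(&barrier);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];
	long i;

	pthread_barrier_init(&barrier, NULL, NTHREADS);
	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, (void *)i);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}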
And see the overheads with perf. This is the result on an 8-core/2-socket host.
This is highly affected by the number of sockets.
# Samples: 1164449628090
#
# Overhead Command Shared Object Symbol
# ........ ............... ........................ ......
#
43.23% multi-fault-all [kernel] [k] smp_invalidate_interrupt
16.27% multi-fault-all [kernel] [k] flush_tlb_others_ipi
11.55% multi-fault-all [kernel] [k] _raw_spin_lock_irqsave <========(*)
6.23% multi-fault-all [kernel] [k] intel_pmu_enable_all
2.17% multi-fault-all [kernel] [k] _raw_spin_unlock_irqrestore
1.59% multi-fault-all [kernel] [k] page_fault
1.45% multi-fault-all [kernel] [k] do_wp_page
1.35% multi-fault-all ./multi-fault-all-fork [.] worker
1.26% multi-fault-all [kernel] [k] handle_mm_fault
1.19% multi-fault-all [kernel] [k] _raw_spin_lock
1.03% multi-fault-all [kernel] [k] invalidate_interrupt7
1.00% multi-fault-all [kernel] [k] invalidate_interrupt6
0.99% multi-fault-all [kernel] [k] invalidate_interrupt3
# Samples: 181505050964
#
# Overhead Command Shared Object Symbol
# ........ ............... ........................ ......
#
45.08% multi-fault-all [kernel] [k] smp_invalidate_interrupt
19.45% multi-fault-all [kernel] [k] intel_pmu_enable_all
14.17% multi-fault-all [kernel] [k] flush_tlb_others_ipi
1.89% multi-fault-all [kernel] [k] do_wp_page
1.58% multi-fault-all [kernel] [k] page_fault
1.46% multi-fault-all ./multi-fault-all-fork [.] worker
1.26% multi-fault-all [kernel] [k] do_page_fault
1.14% multi-fault-all [kernel] [k] _raw_spin_lock
1.10% multi-fault-all [kernel] [k] flush_tlb_page
1.09% multi-fault-all [kernel] [k] vma_put <========(**)
0.99% multi-fault-all [kernel] [k] invalidate_interrupt0
0.98% multi-fault-all [kernel] [k] find_vma_speculative <=====(**)
0.81% multi-fault-all [kernel] [k] invalidate_interrupt3
0.81% multi-fault-all [kernel] [k] native_apic_mem_write
0.79% multi-fault-all [kernel] [k] invalidate_interrupt7
0.76% multi-fault-all [kernel] [k] invalidate_interrupt5
(*) is the removed overhead of mmap_sem and (**) is the added overhead of atomic ops.
And yes, 1% still seems not ideally small. And it's unknown how this false sharing
of atomic ops may affect very big SMP systems... maybe very badly.
BTW, I'm not sure why intel_pmu_enable_all() is so big... because of fork()?
My patch is below, just as a reference to my work from last year, not
very clean yet. I need more time for improvements (and I am not wedded to
this particular implementation).
When you test this, please turn off SPINLOCK_DEBUG to enable split-page-table-lock.
Regards,
-Kame
==
From: KAMEZAWA Hiroyuki <[email protected]>
Asynchronous page fault.
This patch is for avoiding mmap_sem in the usual page fault path. When running highly
multi-threaded programs, mm->mmap_sem can use much CPU because of false
sharing when page faults are taken in parallel. (Right after fork() is a typical
case, I think.)
This patch uses a speculative vma lookup to reduce that cost.
Considering the vma lookup, i.e. the rb-tree lookup, the only operation we do is
checking node->rb_left and node->rb_right; there are no complicated operations.
At page fault time, in most cases there is no need to access the sorted vma list
or the prev/next pointers. Except for stack expansion, we always just need a vma
which contains the page-fault address. So we can walk the vma rb-tree in a
speculative way.
Even if an rb-tree rotation occurs while we walk the tree for the lookup, we just
miss the vma without an oops. In other words, we can _try_ to find the vma in a
lockless manner. If that fails, a retry is ok... we take the lock and access the vma.
For lockless walking, this uses RCU and adds find_vma_speculative(), plus a
per-vma wait queue and reference count. This refcnt+wait_queue combination
guarantees that no threads are accessing the vma when we call the subsystem's
unmap functions.
Changelog:
- removed rcu_xxx macros.
- fixed reference count handling bug at vma_put().
- removed atomic_add_unless() etc. But use Big number for handling race.
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
arch/x86/mm/fault.c | 14 ++++++
include/linux/mm.h | 14 ++++++
include/linux/mm_types.h | 5 ++
mm/mmap.c | 98 +++++++++++++++++++++++++++++++++++++++++++++--
4 files changed, 127 insertions(+), 4 deletions(-)
Index: linux-2.6.33-rc2/include/linux/mm_types.h
===================================================================
--- linux-2.6.33-rc2.orig/include/linux/mm_types.h
+++ linux-2.6.33-rc2/include/linux/mm_types.h
@@ -11,6 +11,7 @@
#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
+#include <linux/rcupdate.h>
#include <linux/page-debug-flags.h>
#include <asm/page.h>
#include <asm/mmu.h>
@@ -180,6 +181,10 @@ struct vm_area_struct {
void * vm_private_data; /* was vm_pte (shared mem) */
unsigned long vm_truncate_count;/* truncate_count or restart_addr */
+ atomic_t refcnt;
+ wait_queue_head_t wait_queue;
+ struct rcu_head rcuhead;
+
#ifndef CONFIG_MMU
struct vm_region *vm_region; /* NOMMU mapping region */
#endif
Index: linux-2.6.33-rc2/mm/mmap.c
===================================================================
--- linux-2.6.33-rc2.orig/mm/mmap.c
+++ linux-2.6.33-rc2/mm/mmap.c
@@ -187,6 +187,26 @@ error:
return -ENOMEM;
}
+static void __free_vma_rcu_cb(struct rcu_head *head)
+{
+ struct vm_area_struct *vma;
+ vma = container_of(head, struct vm_area_struct, rcuhead);
+ kmem_cache_free(vm_area_cachep, vma);
+}
+/* Call this if vma was linked to rb-tree */
+static void free_vma_rcu(struct vm_area_struct *vma)
+{
+ call_rcu(&vma->rcuhead, __free_vma_rcu_cb);
+}
+#define VMA_FREE_MAGIC (10000000)
+/* called when vma is unlinked and wait for all racy access.*/
+static void invalidate_vma_before_free(struct vm_area_struct *vma)
+{
+ atomic_sub(VMA_FREE_MAGIC, &vma->refcnt);
+ wait_event(vma->wait_queue,
+ (atomic_read(&vma->refcnt) == -VMA_FREE_MAGIC));
+}
+
/*
* Requires inode->i_mapping->i_mmap_lock
*/
@@ -238,7 +258,7 @@ static struct vm_area_struct *remove_vma
removed_exe_file_vma(vma->vm_mm);
}
mpol_put(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
+ free_vma_rcu(vma);
return next;
}
@@ -404,8 +424,12 @@ __vma_link_list(struct mm_struct *mm, st
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
+ atomic_set(&vma->refcnt, 0);
+ init_waitqueue_head(&vma->wait_queue);
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+ /* For speculative vma lookup */
+ smp_wmb();
}
static void __vma_link_file(struct vm_area_struct *vma)
@@ -490,6 +514,7 @@ __vma_unlink(struct mm_struct *mm, struc
rb_erase(&vma->vm_rb, &mm->mm_rb);
if (mm->mmap_cache == vma)
mm->mmap_cache = prev;
+ smp_wmb();
}
/*
@@ -614,6 +639,7 @@ again: remove_next = 1 + (end > next->
* us to remove next before dropping the locks.
*/
__vma_unlink(mm, next, vma);
+ invalidate_vma_before_free(next);
if (file)
__remove_shared_vm_struct(next, file, mapping);
if (next->anon_vma)
@@ -640,7 +666,7 @@ again: remove_next = 1 + (end > next->
}
mm->map_count--;
mpol_put(vma_policy(next));
- kmem_cache_free(vm_area_cachep, next);
+ free_vma_rcu(next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
@@ -1544,6 +1570,71 @@ out:
}
/*
+ * Returns vma which contains given address. This scans rb-tree in speculative
+ * way and increment a reference count if found. Even if vma exists in rb-tree,
+ * this function may return NULL in racy case. So, this function cannot be used
+ * for checking whether given address is valid or not.
+ */
+
+struct vm_area_struct *
+find_vma_speculative(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = NULL;
+ struct vm_area_struct *vma_tmp;
+ struct rb_node *rb_node;
+
+ if (unlikely(!mm))
+ return NULL;;
+
+ rcu_read_lock();
+ /*
+ * Barrier against modification of the rb-tree.
+ * The rb-tree update is not an atomic op and no barrier is used while
+ * modifying it, so modifications to the rb-tree can be reordered. This
+ * memory barrier pairs with vma_(un)link_rb() to avoid reading data that
+ * is too old to catch all the changes made before we got rcu_read_lock.
+ *
+ * We may see a broken rb-tree and fail to find an existing vma. But that's ok:
+ * we are allowed to return NULL even if a valid one exists. The caller will
+ * then use find_vma() with the read semaphore.
+ */
+
+ smp_read_barrier_depends();
+ rb_node = mm->mm_rb.rb_node;
+ vma = NULL;
+ while (rb_node) {
+ vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+ if (vma_tmp->vm_end > addr) {
+ vma = vma_tmp;
+ if (vma_tmp->vm_start <= addr)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
+ }
+ if (vma) {
+ if ((vma->vm_start <= addr) && (addr < vma->vm_end)) {
+ if (atomic_inc_return(&vma->refcnt) < 0) {
+ vma_put(vma);
+ vma = NULL;
+ }
+ } else
+ vma = NULL;
+ }
+ rcu_read_unlock();
+ return vma;
+}
+
+void vma_put(struct vm_area_struct *vma)
+{
+ if ((atomic_dec_return(&vma->refcnt) == -VMA_FREE_MAGIC) &&
+ waitqueue_active(&vma->wait_queue))
+ wake_up(&vma->wait_queue);
+ return;
+}
+
+/*
* Verify that the stack growth is acceptable and
* update accounting. This is shared with both the
* grow-up and grow-down cases.
@@ -1808,6 +1899,7 @@ detach_vmas_to_be_unmapped(struct mm_str
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
do {
rb_erase(&vma->vm_rb, &mm->mm_rb);
+ invalidate_vma_before_free(vma);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
Index: linux-2.6.33-rc2/include/linux/mm.h
===================================================================
--- linux-2.6.33-rc2.orig/include/linux/mm.h
+++ linux-2.6.33-rc2/include/linux/mm.h
@@ -1235,6 +1235,20 @@ static inline struct vm_area_struct * fi
return vma;
}
+/*
+ * Look up the vma for a given address in a speculative way. This allows lockless
+ * lookup of vmas, but in racy cases the vma may not be found. You have to call
+ * find_vma() under the read lock of mm->mmap_sem even if this function returns
+ * NULL. The returned vma's reference count is incremented to show that the vma
+ * is being accessed asynchronously; the caller has to call vma_put().
+ *
+ * Unlike find_vma(), this returns a vma which contains specified address.
+ * This doesn't return the nearest vma.
+ */
+extern struct vm_area_struct *find_vma_speculative(struct mm_struct *mm,
+ unsigned long addr);
+void vma_put(struct vm_area_struct *vma);
+
static inline unsigned long vma_pages(struct vm_area_struct *vma)
{
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
Index: linux-2.6.33-rc2/arch/x86/mm/fault.c
===================================================================
--- linux-2.6.33-rc2.orig/arch/x86/mm/fault.c
+++ linux-2.6.33-rc2/arch/x86/mm/fault.c
@@ -952,6 +952,7 @@ do_page_fault(struct pt_regs *regs, unsi
struct mm_struct *mm;
int write;
int fault;
+ int speculative = 0;
tsk = current;
mm = tsk->mm;
@@ -1040,6 +1041,14 @@ do_page_fault(struct pt_regs *regs, unsi
return;
}
+ if ((error_code & PF_USER)) {
+ vma = find_vma_speculative(mm, address);
+ if (vma) {
+ speculative = 1;
+ goto good_area;
+ }
+ }
+
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
@@ -1136,5 +1145,8 @@ good_area:
check_v8086_mode(regs, address, tsk);
- up_read(&mm->mmap_sem);
+ if (speculative)
+ vma_put(vma);
+ else
+ up_read(&mm->mmap_sem);
}
On Mon, 2010-01-04 at 19:13 -0800, Linus Torvalds wrote:
> I would say that this whole series is _very_ far from being mergeable.
Never said anything else ;-)
> Peter seems to have been thinking about the details, while missing all the
> subtle big picture effects that seem to actually change semantics.
Yes, I seem to have missed a rather significant side of the whole
issue.. back to the drawing room for me then.
On Mon, 2010-01-04 at 21:10 -0800, Linus Torvalds wrote:
> Sounds doable. But it also sounds way more expensive than the current VM
> fault handling, which is pretty close to optimal for single-threaded
> cases.. That RCU lookup might be cheap, but just the refcount is generally
> going to be as expensive as a lock.
Right, that refcount adds two atomic ops; the only grace it has is that
it's in the vma as opposed to the mm, but there are plenty of workloads that
concentrate on a single vma, in which case you get an equally contended
cacheline as with the mmap_sem.
I was trying to avoid having to have that refcount, but then sorta
forgot about the actual fault handlers also poking at the vma :/
On Tue, 2010-01-05 at 15:09 +0900, Minchan Kim wrote:
> Couldn't we synchronize RCU in those cases (munmap, exit and so on)?
> It can delay munmap and exit, but it would be better than handling them with more
> complicated things, I think. And both cases aren't frequent, so wouldn't we
> gain more advantage than disadvantage?
Sadly there are programs that mmap()/munmap() at a staggering rate
(clamav comes to mind), so munmap() performance is important too.
On Mon, 2010-01-04 at 19:13 -0800, Linus Torvalds wrote:
> Or put another way: if the vma was a writable mapping, a user may do
>
> munmap(mapping, size);
>
> and the backing file is still active and writable AFTER THE MUNMAP! This
> can be a huge problem for something that wants to unmount the volume, for
> example, or depends on the whole writability-vs-executability thing. The
> user may have unmapped it, and expects the file to be immediately
> non-busy, but with the delayed free that isn't the case any more.
If it were only unmount it would be rather easy to fix by putting that
RCU synchronization in unmount, unmount does a lot of sync things
anyway. But I suspect there's more cases where that non-busy matters
(but I'd need to educate myself on filesystems/vfs to come up with any).
On Mon, 2010-01-04 at 19:13 -0800, Linus Torvalds wrote:
>
> Protecting the vma isn't enough. You need to protect the whole FS stack
> with rcu. Probably by moving _all_ of "free_vma()" into the RCU path
> (which means that the whole file/inode gets de-allocated at that later RCU
> point, rather than synchronously). Not just the actual kfree.
Right, looking at that I found another interesting challenge: fput() can
sleep, and I suspect that even with call_srcu() its callbacks have to be
atomic.
While looking at that code, I found the placement of might_sleep() a tad
confusing; I'd expect it to be in fput(), since that is the regular
entry point (everything except AIO, which does crazy things).
---
fs/file_table.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/file_table.c b/fs/file_table.c
index 69652c5..6070c32 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -196,6 +196,8 @@ EXPORT_SYMBOL(alloc_file);
void fput(struct file *file)
{
+ might_sleep();
+
if (atomic_long_dec_and_test(&file->f_count))
__fput(file);
}
@@ -236,8 +238,6 @@ void __fput(struct file *file)
struct vfsmount *mnt = file->f_path.mnt;
struct inode *inode = dentry->d_inode;
- might_sleep();
-
fsnotify_close(file);
/*
* The function eventpoll_release() should be the first called
On Mon, 04 Jan 2010 19:24:35 +0100
Peter Zijlstra <[email protected]> wrote:
> Generic speculative fault handler, tries to service a pagefault
> without holding mmap_sem.
while I appreciate the goal of reducing contention on this lock...
wouldn't step one be to remove the page zeroing from under this lock?
that's by far (easily by 10x I would guess) the most expensive thing
that's done under the lock, and I would expect a first order of
contention reduction just by having the zeroing of a page not done
under the lock...
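As a sketch of that idea only (not from any posted patch; the task_struct field
and helpers below are invented):

/*
 * Sketch only: pre-zero a page before mmap_sem is taken, so the expensive
 * clear no longer happens under the lock. 'prezeroed_page' is a hypothetical
 * new task_struct field; do_anonymous_page() would consume it via
 * take_prezeroed_page() instead of allocating and zeroing under the lock.
 */
static void prefault_prepare_zeroed_page(void)
{
	if (!current->prezeroed_page)
		current->prezeroed_page =
			alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_ZERO);
}

static struct page *take_prezeroed_page(void)
{
	struct page *page = current->prezeroed_page;

	current->prezeroed_page = NULL;
	return page;	/* may be NULL: fall back to the existing path */
}

do_page_fault() would call prefault_prepare_zeroed_page() before
down_read(&mm->mmap_sem); whether the accounting and NUMA placement details
work out is exactly what such a patch would have to sort out.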
--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
Arjan van de Ven <[email protected]> writes:
> On Mon, 04 Jan 2010 19:24:35 +0100
> Peter Zijlstra <[email protected]> wrote:
>
>> Generic speculative fault handler, tries to service a pagefault
>> without holding mmap_sem.
>
>
> while I appreciate the goal of reducing contention on this lock...
> wouldn't step one be to remove the page zeroing from under this lock?
> that's by far (easily by 10x I would guess) the most expensive thing
> that's done under the lock, and I would expect a first order of
> contention reduction just by having the zeroing of a page not done
> under the lock...
The cache line bouncing of the shared cache lines hurts too.
I suspect fixing this all properly will need some deeper changes.
-Andi
--
[email protected] -- Speaking for myself only.
On Tue, 5 Jan 2010, KAMEZAWA Hiroyuki wrote:
> I'd like to hear use cases of really heavy users, too. Christoph ?
A typical use case is a highly parallel memory intensive process
(simulations f.e.). Those are configured with the number of hardware
threads supported in mind to max out the performance of the underlying
hardware. On startup they start N threads and then each thread begins
initializing its memory (to get proper locality it has to be done this
way, you also want concurrency during this expensive operation).
The larger the number of threads the more contention on the
cachelines containing mmap_sem and the other cacheline containing the rss
counters. In extreme cases we had to wait 30mins to an hour in order for
the cacheline bouncing to complete (startup of a big HPC app on
IA64 with 1k threads).
On Tue, 5 Jan 2010, Arjan van de Ven wrote:
> while I appreciate the goal of reducing contention on this lock...
> wouldn't step one be to remove the page zeroing from under this lock?
> that's by far (easily by 10x I would guess) the most expensive thing
> that's done under the lock, and I would expect a first order of
> contention reduction just by having the zeroing of a page not done
> under the lock...
The main issue is cacheline bouncing. mmap sem is a rw semaphore and only
held for read during a fault.
On Tue, 5 Jan 2010, KAMEZAWA Hiroyuki wrote:
> #
> # Overhead Command Shared Object Symbol
> # ........ ............... ........................ ......
> #
> 43.23% multi-fault-all [kernel] [k] smp_invalidate_interrupt
> 16.27% multi-fault-all [kernel] [k] flush_tlb_others_ipi
> 11.55% multi-fault-all [kernel] [k] _raw_spin_lock_irqsave <========(*)
> 6.23% multi-fault-all [kernel] [k] intel_pmu_enable_all
> 2.17% multi-fault-all [kernel] [k] _raw_spin_unlock_irqrestore
Hmm.. The default rwsem implementation shouldn't have any spin-locks in
the fast-path. And your profile doesn't seem to have any scheduler
footprint, so I wonder what is going on.
Oh.
Lookie here:
- arch/x86/Kconfig.cpu:
config X86_XADD
def_bool y
depends on X86_32 && !M386
- arch/x86/Kconfig:
config RWSEM_GENERIC_SPINLOCK
def_bool !X86_XADD
config RWSEM_XCHGADD_ALGORITHM
def_bool X86_XADD
it looks like X86_XADD only gets enabled on 32-bit builds. Which means
that x86-64 in turn seems to end up always using the slower "generic
spinlock" version.
Are you sure this isn't the reason why your profiles are horrible?
Linus
On Tue, 5 Jan 2010, Peter Zijlstra wrote:
>
> If it were only unmount it would be rather easy to fix by putting that
> RCU synchronization in unmount, unmount does a lot of sync things
> anyway. But I suspect there's more cases where that non-busy matters
> (but I'd need to educate myself on filesystems/vfs to come up with any).
unmount may well be the only really huge piece.
The only other effects of delaying closing a file I can see are
- the ETXTBUSY thing, but we don't need to delay _that_ part, so this may
be a non-issue.
- the actual freeing of the data on disk (ie people may expect that the
last close really frees up the space on the filesystem). However, this
is _such_ a subtle semantic thing that maybe nobody cares.
It's perhaps worth noting that I think Nick's VFS scalability patches did
at least _some_ of the "struct filp" freeing in RCU context too, so this
whole "vfs delays things in RCU" is not a new thing.
But I think that in Nick's case it was stricly just the freeing of the
inode/dentry data structure (because he needed to traverse the dentry list
locklessly - he didn't then _use_ the results locklessly). So the actual
filesystem operations didn't get deferred, and as a result it didn't have
this particular semantic nightmare.
Linus
On Tue, Jan 05, 2010 at 07:34:02AM -0800, Linus Torvalds wrote:
> The only other effects of delaying closing a file I can see are
>
> - the ETXTBUSY thing, but we don't need to delay _that_ part, so this may
> be a non-issue.
>
> - the actual freeing of the data on disk (ie people may expect that the
> last close really frees up the space on the filesystem). However, this
> is _such_ a subtle semantic thing that maybe nobody cares.
- a bunch of fs operations done from RCU callbacks. Including severely
blocking ones. As in "for minutes" in the most severe cases, and with
large numbers of objects involved. Can get very unpleasant...
On Tue, 5 Jan 2010, Al Viro wrote:
>
> - a bunch of fs operations done from RCU callbacks. Including severely
> blocking ones.
Yeah, you're right (and Peter also pointed out the might_sleep). That is
likely to be the really fundamental issue.
You _can_ handle it (make the RCU callback just schedule the work instead
of doing it directly), but it does sound really nasty. I suspect we should
explore just about any other approach over this one.
Linus
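The "schedule the work" variant Linus mentions would look roughly like this
(field names invented, purely illustrative):

/*
 * Sketch only: bounce from the RCU callback (atomic context) to a workqueue
 * (process context) so that the sleeping parts of the teardown are legal.
 * Assumes vm_area_struct grew vm_rcu and vm_free_work fields.
 */
static void vma_free_workfn(struct work_struct *work)
{
	struct vm_area_struct *vma =
		container_of(work, struct vm_area_struct, vm_free_work);

	if (vma->vm_ops && vma->vm_ops->close)
		vma->vm_ops->close(vma);
	if (vma->vm_file)
		fput(vma->vm_file);		/* may sleep: fine here */
	mpol_put(vma_policy(vma));
	kmem_cache_free(vm_area_cachep, vma);
}

static void vma_free_rcu_cb(struct rcu_head *head)
{
	struct vm_area_struct *vma =
		container_of(head, struct vm_area_struct, vm_rcu);

	INIT_WORK(&vma->vm_free_work, vma_free_workfn);
	schedule_work(&vma->vm_free_work);	/* nothing sleeps in this callback */
}

This only addresses the cannot-sleep part, of course; the munmap() semantics
problem discussed above is unchanged by it.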
On Tue, 5 Jan 2010, Linus Torvalds wrote:
>
> Lookie here:
>
> - arch/x86/Kconfig.cpu:
>
> config X86_XADD
> def_bool y
> depends on X86_32 && !M386
>
> - arch/x86/Kconfig:
>
> config RWSEM_GENERIC_SPINLOCK
> def_bool !X86_XADD
>
> config RWSEM_XCHGADD_ALGORITHM
> def_bool X86_XADD
>
> it looks like X86_XADD only gets enabled on 32-bit builds. Which means
> that x86-64 in turn seems to end up always using the slower "generic
> spinlock" version.
Sadly, it's not as easy as just changing the X86_XADD "depends on" to say
"X86_64 || !M386" instead. That just results in
kernel/built-in.o: In function `up_read':
(.text+0x2d8e5): undefined reference to `call_rwsem_wake'
etc, because the x86-64 code has obviously never seen the optimized
call-paths, and they need the asm wrappers for full semantics.
Oh well. Somebody who is bored might look at trying to make the wrapper
code in arch/x86/lib/semaphore_32.S work on x86-64 too. It should make the
successful rwsem cases much faster.
Linus
Linus Torvalds <[email protected]> writes:
>
> etc, because the x86-64 code has obviously never seen the optimized
> call-paths, and they need the asm wrappers for full semantics.
iirc Andrea ran benchmarks at some point and it didn't make too much
difference on the systems back then (K8 era). Given K8 has fast atomics.
> Oh well. Somebody who is bored might look at trying to make the wrapper
> code in arch/x86/lib/semaphore_32.S work on x86-64 too. It should make the
> successful rwsem cases much faster.
Maybe, maybe not.
-Andi
--
[email protected] -- Speaking for myself only.
On Tue, 5 Jan 2010, Andi Kleen wrote:
> > Oh well. Somebody who is bored might look at trying to make the wrapper
> > code in arch/x86/lib/semaphore_32.S work on x86-64 too. It should make the
> > successful rwsem cases much faster.
>
> Maybe, maybe not.
What we saw in the past was that the cost of acquiring exclusive
access to the contended cachelines containing mmap_sem and rss counters
was most important.
On Tue, 5 Jan 2010, Andi Kleen wrote:
>
> > Oh well. Somebody who is bored might look at trying to make the wrapper
> > code in arch/x86/lib/semaphore_32.S work on x86-64 too. It should make the
> > successful rwsem cases much faster.
>
> Maybe, maybe not.
If there is actual contention on the lock, but mainly just readers (which
is what the profile indicates: since there is no scheduler footprint, the
actual writer-vs-reader case is probably very rare), then the xadd is
likely to be _much_ faster than the spinlock.
Sure, the cacheline is going to bounce regardless (since it's a shared
per-mm data structure), but the spinlock is going to bounce wildly
back-and-forth between everybody who _tries_ to get it, while the regular
xadd is going to bounce just once per actual successful xadd.
So a spinlock is as cheap as an atomic when there is no contention (which
is the common single-thread case - the real cost of both lock and atomic
is simply the fact that CPU serialization is expensive), but when there is
actual lock contention, I bet the atomic xadd is going to be shown to be
superior.
Remember: we commonly claim that 'spin_unlock' is basically free on x86 -
and that's true, but it is _only_ true for the uncontended state.
Linus
On Tue, Jan 05, 2010 at 11:47:49AM -0600, Christoph Lameter wrote:
> On Tue, 5 Jan 2010, Andi Kleen wrote:
>
> > > Oh well. Somebody who is bored might look at trying to make the wrapper
> > > code in arch/x86/lib/semaphore_32.S work on x86-64 too. It should make the
> > > successful rwsem cases much faster.
> >
> > Maybe, maybe not.
>
> What we saw in the past was that the cost of acquiring exclusive
> access to the contended cachelines containing mmap_sem and rss counters
> was most important.
That cost should be about the same between the spinlock and the atomic version.
-Andi
--
[email protected] -- Speaking for myself only.
On Tue, 5 Jan 2010, Linus Torvalds wrote:
> So a spinlock is as cheap as an atomic when there is no contention (which
> is the common single-thread case - the real cost of both lock and atomic
> is simply the fact that CPU serialization is expensive), but when there is
> actual lock contention, I bet the atomic xadd is going to be shown to be
> superior.
>
> Remember: we commonly claim that 'spin_unlock' is basically free on x86 -
> and that's true, but it is _only_ true for the uncontended state.
It's also free if the MESI algorithm has been tuned in such a way that the
exclusive cacheline that was just acquired is not immediately released
after a single access.
If the critical section protected by the spinlock is small then the
delay will keep the cacheline exclusive until we hit the unlock. This
is the case here as far as I can tell.
On Tue, 5 Jan 2010, Christoph Lameter wrote:
>
> If the critical section protected by the spinlock is small then the
> delay will keep the cacheline exclusive until we hit the unlock. This
> is the case here as far as I can tell.
I hope somebody can time it. Because I think the idle reads on all the
(unsuccessful) spinlocks will kill it.
Think of it this way: under heavy contention, you'll see a lot of people
waiting for the spinlocks and one of them succeeds at writing it, reading
the line. So you get an O(n^2) bus traffic access pattern. In contrast,
with an xadd, you get O(n) behavior - everybody does _one_ acquire-for-
write bus access.
Remember: the critical section is small, but since you're contending on
the spinlock, that doesn't much _help_. The readers are all hitting the
lock (and you can try to solve the O(n^2) issue with back-off, but quite
frankly, anybody who does that has basically already lost - I'm personally
convinced you should never do lock backoff, and instead look at what you
did wrong at a higher level instead).
Linus
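The effect being argued about can be seen outside the kernel with a small toy
like the following (thread and iteration counts are arbitrary; build with
gcc -O2 -pthread; pass any argument to use the bare xadd instead of the
ticket lock):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NTHREADS 8
#define ITERS    1000000

static volatile uint32_t ticket_head, ticket_tail;
static volatile uint64_t counter;
static int use_xadd;

static void ticket_lock(void)
{
	uint32_t t = __sync_fetch_and_add(&ticket_tail, 1);
	while (ticket_head != t)	/* every waiter keeps re-reading the line */
		__asm__ __volatile__("rep; nop");
}

static void ticket_unlock(void)
{
	ticket_head++;			/* only the lock holder writes this */
}

static void *worker(void *arg)
{
	long i;

	(void)arg;
	for (i = 0; i < ITERS; i++) {
		if (use_xadd) {
			__sync_fetch_and_add(&counter, 1);	/* one bus transaction */
		} else {
			ticket_lock();
			counter++;				/* tiny critical section */
			ticket_unlock();
		}
	}
	return NULL;
}

int main(int argc, char **argv)
{
	pthread_t tid[NTHREADS];
	long i;

	use_xadd = argc > 1;		/* any argument selects the xadd variant */
	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, NULL);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
	printf("counter = %llu\n", (unsigned long long)counter);
	return 0;
}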
On Tue, 5 Jan 2010, Linus Torvalds wrote:
> I hope somebody can time it. Because I think the idle reads on all the
> (unsuccessful) spinlocks will kill it.
>
> Think of it this way: under heavy contention, you'll see a lot of people
> waiting for the spinlocks and one of them succeeds at writing it, reading
> the line. So you get an O(n^2) bus traffic access pattern. In contrast,
> with an xadd, you get O(n) behavior - everybody does _one_ acquire-for-
> write bus access.
There will not be much spinning. The cacheline will be held exclusively by
one processor. A request by other processors for shared access to the
cacheline will effectively stop the execution on those processors until
the cacheline is available.
Access to main memory is mediated by the cache controller anyways. As
long as the cacheline has not been invalidated it will be reused. Maybe
Andi can give us more details on the exact behavior on recent Intel
processors?
One thing that could change with the removal of the spinlock is that the
initial request for the cacheline will not be for read anymore but for
write. This saves us the upgrade from read / shared state to exclusive access.
On Tue, Jan 05, 2010 at 10:25:43AM -0800, Linus Torvalds wrote:
>
>
> On Tue, 5 Jan 2010, Christoph Lameter wrote:
> >
> > If the critical section protected by the spinlock is small then the
> > delay will keep the cacheline exclusive until we hit the unlock. This
> > is the case here as far as I can tell.
>
> I hope somebody can time it. Because I think the idle reads on all the
> (unsuccessful) spinlocks will kill it.
But on many systems, it does take some time for the idle reads to make
their way to the CPU that just acquired the lock. My (admittedly dated)
experience is that the CPU acquiring the lock has a few bus clocks
before the cache line containing the lock gets snatched away.
> Think of it this way: under heavy contention, you'll see a lot of people
> waiting for the spinlocks and one of them succeeds at writing it, reading
> the line. So you get an O(n^2) bus traffic access pattern. In contrast,
> with an xadd, you get O(n) behavior - everybody does _one_ acquire-for-
> write bus access.
xadd (and xchg) certainly are nicer where they apply!
> Remember: the critical section is small, but since you're contending on
> the spinlock, that doesn't much _help_. The readers are all hitting the
> lock (and you can try to solve the O(n*2) issue with back-off, but quite
> frankly, anybody who does that has basically already lost - I'm personally
> convinced you should never do lock backoff, and instead look at what you
> did wrong at a higher level instead).
Music to my ears! ;-)
Thanx, Paul
On Tue, 5 Jan 2010, Christoph Lameter wrote:
>
> There will not be much spinning. The cacheline will be held exclusively by
> one processor. A request by other processors for shared access to the
> cacheline will effectively stop the execution on those processors until
> the cacheline is available.
AT WHICH POINT THEY ALL RACE TO GET IT TO SHARED MODE, only to then have
one of them actually see that it got a ticket!
Don't you see the problem? The spinlock (with ticket locks) essentially
does the "xadd" atomic increment anyway, but then it _waits_ for it. All
totally pointless, and all just making for problems, and wasting CPU time
and causing more cross-node traffic.
In contrast, a native xadd-based rwsem will basically do the atomic
increment (that ticket spinlocks also do) and then just go on with its
life. Never waiting for all the other CPU's to also do their ticket
spinlocks.
The "short critical region" you talk about is totally meaningless, since
the contention will mean that everybody is waiting and hitting that
cacheline for a long time regardless - they'll be waiting for O(n)
_other_ CPU's to get the lock first!
Linus
On Tue, 5 Jan 2010, Paul E. McKenney wrote:
>
> But on many systems, it does take some time for the idle reads to make
> their way to the CPU that just acquired the lock.
Yes. But the point is that there is lots of them.
So think of it this way: every time _one_ CPU acquires a lock (and
then releases it), _all_ CPU's will read the new value. Imagine the
cross-socket traffic.
In contrast, doing just a single xadd (which replaces the whole
"spin_lock+non-atomics+spin_unlock"), every times _once_ CPU cquires a
lock, that's it. The other CPU's arent' all waiting in line for the lock
to be released, and reading the cacheline to see if it's their turn.
Sure, after they got the lock they'll all eventually end up reading from
that cacheline that contains 'struct mm_struct', but that's something we
could even think about trying to minimize by putting the mmap_sem as far
away from the other fields as possible.
Now, it's very possible that if you have a broadcast model of cache
coherency, none of this much matters and you end up with almost all the
same bus traffic anyway. But I do think it could matter a lot.
Linus
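What "putting the mmap_sem as far away as possible" could look like, purely as
an illustration (this is not the real mm_struct layout; it also separates the
rss counters mentioned earlier in the thread):

/*
 * Illustrative only: force the hot synchronization fields onto their own
 * cachelines so readers of other mm_struct fields don't bounce them.
 */
struct mm_struct_layout_sketch {
	struct rb_root mm_rb;			/* read-mostly lookup data */
	/* ... other read-mostly fields ... */

	struct rw_semaphore mmap_sem ____cacheline_aligned_in_smp;

	mm_counter_t _file_rss ____cacheline_aligned_in_smp;
	mm_counter_t _anon_rss;			/* rss counters, own cacheline */
};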
On Tue, 5 Jan 2010, Linus Torvalds wrote:
> Don't you see the problem? The spinlock (with ticket locks) essentially
> does the "xadd" atomic increment anyway, but then it _waits_ for it. All
> totally pointless, and all just making for problems, and wasting CPU time
> and causing more cross-node traffic.
For the xadd to work it first would have to acquire the cacheline
exclusively. That can only occur for a single processor and with the
fitting minimum holdtime for exclusive cachelines everything should be
fine. If the critical section is done (unlock complete) then the next
processor (which had to stopped waiting to access the cacheline) can
acquire the lock.
The wait state is the processor being stopped due to not being able to
access the cacheline. Not the processor spinning in the xadd loop. That
only occurs if the critical section is longer than the timeout.
On Tue, Jan 05, 2010 at 11:08:46AM -0800, Linus Torvalds wrote:
>
>
> On Tue, 5 Jan 2010, Paul E. McKenney wrote:
> >
> > But on many systems, it does take some time for the idle reads to make
> > their way to the CPU that just acquired the lock.
>
> Yes. But the point is that there is lots of them.
>
> So think of it this way: every time _one_ CPU acquires a lock (and
> then releases it), _all_ CPU's will read the new value. Imagine the
> cross-socket traffic.
Yep, been there, and with five-microsecond cross-"socket" latencies.
Of course, the CPU clock frequencies were a bit slower back then.
> In contrast, doing just a single xadd (which replaces the whole
> "spin_lock+non-atomics+spin_unlock"), every times _once_ CPU cquires a
> lock, that's it. The other CPU's arent' all waiting in line for the lock
> to be released, and reading the cacheline to see if it's their turn.
>
> Sure, after they got the lock they'll all eventually end up reading from
> that cacheline that contains 'struct mm_struct', but that's something we
> could even think about trying to minimize by putting the mmap_sem as far
> away from the other fields as possible.
>
> Now, it's very possible that if you have a broadcast model of cache
> coherency, none of this much matters and you end up with almost all the
> same bus traffic anyway. But I do think it could matter a lot.
I have seen systems that work both ways. If the CPU has enough time
between getting the cache line in unlocked state to lock it, do the
modification, and release the lock before the first in the flurry of
reads arrives, then performance will be just fine. Each reader will
see the cache line with an unlocked lock and life will be good.
On the other hand, as you say, if the first in the flurry of reads arrives
before the CPU has managed to make its way through the critical section,
then your n^2 nightmare comes true.
If the critical section operates only on the cache line containing the
lock, and if the critical section has only a handful of instructions,
I bet you win on almost all platforms. But, like you, I would still
prefer the xadd (or xchg or whatever) to the lock, where feasible.
But you cannot expect all systems to see a major performance boost from
switching from locks to xadds. Often, all you get is sounder sleep.
Which is valuable in its own right. ;-)
Thanx, Paul
On Tue, 5 Jan 2010, Christoph Lameter wrote:
>
> The wait state is the processor being stopped due to not being able to
> access the cacheline. Not the processor spinning in the xadd loop. That
> only occurs if the critical section is longer than the timeout.
You don't know what you're talking about, do you?
Just go and read the source code.
The processor is currently spinning in the spin_lock loop. Here, I'll quote
it to you:
LOCK_PREFIX "xaddw %w0, %1\n"
"1:\t"
"cmpb %h0, %b0\n\t"
"je 2f\n\t"
"rep ; nop\n\t"
"movb %1, %b0\n\t"
/* don't need lfence here, because loads are in-order */
"jmp 1b\n"
note the loop that spins - reading the thing over and over - waiting for
_that_ CPU to be the owner of the xadd ticket?
That's the one you have now, only because x86-64 uses the STUPID FALLBACK
CODE for the rwsemaphores!
In contrast, look at what the non-stupid rwsemaphore code does (which
triggers on x86-32):
LOCK_PREFIX " incl (%%eax)\n\t"
/* adds 0x00000001, returns the old value */
" jns 1f\n"
" call call_rwsem_down_read_failed\n"
(that's a "down_read()", which happens to be the op we care most about.
See? That's a single locked "inc" (it avoids the xadd on the read side
because of how we've biased things). In particular, notice how this means
that we do NOT have fifty million CPU's all trying to read the same
location while one writes to it successfully.
Spot the difference?
Here's putting it another way. Which of these scenarios do you think
should result in less cross-node traffic:
- multiple CPU's that - one by one - get the cacheline for exclusive
access.
- multiple CPU's that - one by one - get the cacheline for exclusive
access, while other CPU's are all trying to read the same cacheline at
the same time, over and over again, in a loop.
See the shared part? See the difference? If you look at just a single lock
acquire, it boils down to these two scenarios:
- one CPU gets the cacheline exclusively
- one CPU gets the cacheline exclusively while <n> other CPU's are all
trying to read the old and the new value.
It really is that simple.
Linus
On Tue, 2010-01-05 at 10:25 -0800, Linus Torvalds wrote:
> The readers are all hitting the
> lock (and you can try to solve the O(n^2) issue with back-off, but quite
> frankly, anybody who does that has basically already lost
/me sneaks in a reference to local spinning spinlocks just to have them
mentioned.
On Tue, 5 Jan 2010, Peter Zijlstra wrote:
>
> On Tue, 2010-01-05 at 10:25 -0800, Linus Torvalds wrote:
> > The readers are all hitting the
> > lock (and you can try to solve the O(n^2) issue with back-off, but quite
> > frankly, anybody who does that has basically already lost
>
> /me sneaks in a reference to local spinning spinlocks just to have them
> mentioned.
Were you in on the discussion with (I think) Andy Glew about that? I don't
think he ever came up with an actual workable solution, but he tried. He
worked for Intel at the time (maybe still does), and was looking at
architectural improvements. I think you need a "compare-and-exchange-2-
separate-words" instruction to make it work (not "cmpxchg8/16b" -
literally two _different_ words).
I think m68k can do it. Even then it's a lot more expensive than a regular
lock for any reasonable common case.
Linus
On Tue, 5 Jan 2010, Linus Torvalds wrote:
>
> I think you need a "compare-and-exchange-2-separate-words" instruction
> to make it work (not "cmpxchg8/16b" - literally two _different_ words).
Btw, I might be misremembering - Andy was looking at various lockless
algorithms too. Maybe the problem was the non-local space requirement.
There were several spin-lock variants that would be improved if we could
pass a cookie from the 'lock' to the 'unlock'.
In fact, even the ticket locks would be improved by that, since we could
then possibly do the unlock as a plain 'store' rather than an 'add', and
keep the next-owner cookie in a register over the lock rather than unlock
by just incrementing it in the nasty lock cacheline.
Linus
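A minimal sketch of that cookie idea, again in user-space C11 atomics with made-up names: lock() returns the value the owner field must take for the next waiter, so unlock() becomes a plain release store instead of another locked add on the contended cacheline.

#include <stdatomic.h>

struct cookie_ticket_lock {
	atomic_uint next;
	atomic_uint owner;
};

/* Returns the cookie that must be passed to the matching unlock. */
static unsigned int cookie_lock(struct cookie_ticket_lock *l)
{
	unsigned int me = atomic_fetch_add(&l->next, 1);

	while (atomic_load_explicit(&l->owner, memory_order_acquire) != me)
		;				/* spin until it's our turn */

	return me + 1;		/* next-owner value, kept in a register */
}

static void cookie_unlock(struct cookie_ticket_lock *l, unsigned int cookie)
{
	/* plain release store: no locked read-modify-write on the hot line */
	atomic_store_explicit(&l->owner, cookie, memory_order_release);
}

A caller would hold the cookie across the critical section, e.g. "c = cookie_lock(&l); ...; cookie_unlock(&l, c);", so the unlock never has to read the lock word at all.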
On Tue, Jan 05, 2010 at 09:29:16PM +0100, Peter Zijlstra wrote:
> On Tue, 2010-01-05 at 10:25 -0800, Linus Torvalds wrote:
> > The readers are all hitting the
> > lock (and you can try to solve the O(n^2) issue with back-off, but quite
> > frankly, anybody who does that has basically already lost
>
> /me sneaks in a reference to local spinning spinlocks just to have them
> mentioned.
Been there, done that. More than once. One of them remains in
production use.
The trick is to use a normal spinlock at low levels of contention, and
switch to a more elaborate structure if contention becomes a problem --
if a task spins for too long on the normal spinlock, it sets a bit
in the normal spinlock word. A separate structure allowed tasks to
spin on their own lock word, and also arranged to hand the lock off
to requestors on the same NUMA node as the task releasing the lock in
order to reduce the average cache-miss latency, but also bounding the
resulting unfairness. It also avoided handing locks off to tasks whose
local spins were interrupted, the idea being that if contention is high,
the probability of being interrupted while spinning is higher than that
of being interrupted while holding the lock (since the time spent spinning
is much greater than the time spent actually holding the lock).
The race conditions were extremely challenging, so, in most cases,
designing for low contention seems -much- more productive. ;-)
Thanx, Paul
On Tue, 5 Jan 2010, Peter Zijlstra wrote:
>
> While looking at that code, I found the placement of might_sleep() a tad
> confusing, I'd expect that to be in fput() since that is the regular
> entry point (all except AIO, which does crazy things).
I think it's the crazy AIO thing that is one of the reasons for doing it
in __fput. If it's just in fput() it covers the _normal_ cases better, but
I doubt that those were the primary worry ;).
Linus
On Tue, 5 Jan 2010 07:26:31 -0800 (PST)
Linus Torvalds <[email protected]> wrote:
>
>
> On Tue, 5 Jan 2010, KAMEZAWA Hiroyuki wrote:
> > #
> > # Overhead Command Shared Object Symbol
> > # ........ ............... ........................ ......
> > #
> > 43.23% multi-fault-all [kernel] [k] smp_invalidate_interrupt
> > 16.27% multi-fault-all [kernel] [k] flush_tlb_others_ipi
> > 11.55% multi-fault-all [kernel] [k] _raw_spin_lock_irqsave <========(*)
> > 6.23% multi-fault-all [kernel] [k] intel_pmu_enable_all
> > 2.17% multi-fault-all [kernel] [k] _raw_spin_unlock_irqrestore
>
> Hmm.. The default rwsem implementation shouldn't have any spin-locks in
> the fast-path. And your profile doesn't seem to have any scheduler
> footprint, so I wonder what is going on.
>
> Oh.
>
> Lookie here:
>
> - arch/x86/Kconfig.cpu:
>
> config X86_XADD
> def_bool y
> depends on X86_32 && !M386
>
> - arch/x86/Kconfig:
>
> config RWSEM_GENERIC_SPINLOCK
> def_bool !X86_XADD
>
> config RWSEM_XCHGADD_ALGORITHM
> def_bool X86_XADD
>
> it looks like X86_XADD only gets enabled on 32-bit builds. Which means
> that x86-64 in turn seems to end up always using the slower "generic
> spinlock" version.
>
> Are you sure this isn't the reason why your profiles are horrible?
>
I think this is the 1st reason, but I haven't rewritten rwsem itself and tested it,
sorry.
This is a profile in other test.
==
2.6.33-rc2's score of the same test program is here.
75.42% multi-fault-all [kernel] [k] _raw_spin_lock_irqsav
|
--- _raw_spin_lock_irqsave
|
|--49.13%-- __down_read_trylock
| down_read_trylock
| do_page_fault
| page_fault
| 0x400950
| |
| --100.00%-- (nil)
|
|--46.92%-- __up_read
| up_read
| |
| |--99.99%-- do_page_fault
| | page_fault
| | 0x400950
| | (nil)
| --0.01%-- [...]
==
yes, spinlock is from rwsem.
The reason I tried "skipping rwsem" is that I'd rather avoid the lock than rewrite
the lock itself, when I think of the influence of the patch....
Thanks,
-Kame
On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
>
> I think this is the 1st reason but haven't rewrote rwsem itself and tested,
> sorry.
Here's a totally untested patch! It may or may not work. It builds for me,
but that may be some cosmic accident. I _think_ I got the callee-clobbered
register set right, but somebody should check the comment in the new
rwsem_64.S, and double-check that the code actually matches what I tried
to do.
I had to change the inline asm to get the register sizes right too, so for
all I know this screws up x86-32 too.
In other words: UNTESTED! It may molest your pets and drink all your beer.
You have been warned.
(Is a 16-bit rwsem count even enough for x86-64? Possibly not. That's a
separate issue entirely, though).
Linus
---
arch/x86/Kconfig.cpu | 2 +-
arch/x86/include/asm/rwsem.h | 18 +++++-----
arch/x86/lib/Makefile | 1 +
arch/x86/lib/rwsem_64.S | 81 ++++++++++++++++++++++++++++++++++++++++++
4 files changed, 92 insertions(+), 10 deletions(-)
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index f20ddf8..a198293 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -319,7 +319,7 @@ config X86_L1_CACHE_SHIFT
config X86_XADD
def_bool y
- depends on X86_32 && !M386
+ depends on X86_64 || !M386
config X86_PPRO_FENCE
bool "PentiumPro memory ordering errata workaround"
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index ca7517d..625baca 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -65,7 +65,7 @@ extern asmregparm struct rw_semaphore *
#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
struct rw_semaphore {
- signed long count;
+ __s32 count;
spinlock_t wait_lock;
struct list_head wait_list;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -105,7 +105,7 @@ do { \
static inline void __down_read(struct rw_semaphore *sem)
{
asm volatile("# beginning down_read\n\t"
- LOCK_PREFIX " incl (%%eax)\n\t"
+ LOCK_PREFIX " incl (%1)\n\t"
/* adds 0x00000001, returns the old value */
" jns 1f\n"
" call call_rwsem_down_read_failed\n"
@@ -147,7 +147,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
tmp = RWSEM_ACTIVE_WRITE_BIAS;
asm volatile("# beginning down_write\n\t"
- LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
+ LOCK_PREFIX " xaddl %%edx,(%2)\n\t"
/* subtract 0x0000ffff, returns the old value */
" testl %%edx,%%edx\n\t"
/* was the count 0 before? */
@@ -170,9 +170,9 @@ static inline void __down_write(struct rw_semaphore *sem)
*/
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
- signed long ret = cmpxchg(&sem->count,
- RWSEM_UNLOCKED_VALUE,
- RWSEM_ACTIVE_WRITE_BIAS);
+ __s32 ret = cmpxchg(&sem->count,
+ RWSEM_UNLOCKED_VALUE,
+ RWSEM_ACTIVE_WRITE_BIAS);
if (ret == RWSEM_UNLOCKED_VALUE)
return 1;
return 0;
@@ -185,7 +185,7 @@ static inline void __up_read(struct rw_semaphore *sem)
{
__s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
asm volatile("# beginning __up_read\n\t"
- LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
+ LOCK_PREFIX " xaddl %%edx,(%2)\n\t"
/* subtracts 1, returns the old value */
" jns 1f\n\t"
" call call_rwsem_wake\n"
@@ -203,7 +203,7 @@ static inline void __up_write(struct rw_semaphore *sem)
{
asm volatile("# beginning __up_write\n\t"
" movl %2,%%edx\n\t"
- LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t"
+ LOCK_PREFIX " xaddl %%edx,(%1)\n\t"
/* tries to transition
0xffff0001 -> 0x00000000 */
" jz 1f\n"
@@ -221,7 +221,7 @@ static inline void __up_write(struct rw_semaphore *sem)
static inline void __downgrade_write(struct rw_semaphore *sem)
{
asm volatile("# beginning __downgrade_write\n\t"
- LOCK_PREFIX " addl %2,(%%eax)\n\t"
+ LOCK_PREFIX " addl %2,(%1)\n\t"
/* transitions 0xZZZZ0001 -> 0xYYYY0001 */
" jns 1f\n\t"
" call call_rwsem_downgrade_wake\n"
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index cffd754..c802451 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -39,4 +39,5 @@ else
lib-y += thunk_64.o clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
+ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
endif
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
new file mode 100644
index 0000000..15acecf
--- /dev/null
+++ b/arch/x86/lib/rwsem_64.S
@@ -0,0 +1,81 @@
+/*
+ * x86-64 rwsem wrappers
+ *
+ * This interfaces the inline asm code to the slow-path
+ * C routines. We need to save the call-clobbered regs
+ * that the asm does not mark as clobbered, and move the
+ * argument from %rax to %rdi.
+ *
+ * NOTE! We don't need to save %rax, because the functions
+ * will always return the semaphore pointer in %rax (which
+ * is also the input argument to these helpers)
+ *
+ * The following can clobber %rdx because the asm clobbers it:
+ * call_rwsem_down_write_failed
+ * call_rwsem_wake
+ * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
+ */
+
+#include <linux/linkage.h>
+#include <asm/rwlock.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+#define save_common_regs \
+ pushq %rdi; \
+ pushq %rsi; \
+ pushq %rcx; \
+ pushq %r8; \
+ pushq %r9; \
+ pushq %r10; \
+ pushq %r11
+
+#define restore_common_regs \
+ popq %r11; \
+ popq %r10; \
+ popq %r9; \
+ popq %r8; \
+ popq %rcx; \
+ popq %rsi; \
+ popq %rdi
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_down_read_failed)
+ save_common_regs
+ pushq %rdx
+ movq %rax,%rdi
+ call rwsem_down_read_failed
+ popq %rdx
+ restore_common_regs
+ ret
+ ENDPROC(call_rwsem_down_read_failed)
+
+ENTRY(call_rwsem_down_write_failed)
+ save_common_regs
+ movq %rax,%rdi
+ call rwsem_down_write_failed
+ restore_common_regs
+ ret
+ ENDPROC(call_rwsem_down_write_failed)
+
+ENTRY(call_rwsem_wake)
+ decw %dx /* do nothing if still outstanding active readers */
+ jnz 1f
+ save_common_regs
+ movq %rax,%rdi
+ call rwsem_wake
+ restore_common_regs
+1: ret
+ ENDPROC(call_rwsem_wake)
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_downgrade_wake)
+ save_common_regs
+ pushq %rdx
+ movq %rax,%rdi
+ call rwsem_downgrade_wake
+ popq %rdx
+ restore_common_regs
+ ret
+ ENDPROC(call_rwsem_downgrade_wake)
On Tue, 5 Jan 2010 17:37:08 -0800 (PST)
Linus Torvalds <[email protected]> wrote:
>
>
> On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> >
> > I think this is the 1st reason but haven't rewrote rwsem itself and tested,
> > sorry.
>
> Here's a totally untested patch! It may or may not work. It builds for me,
> but that may be some cosmic accident. I _think_ I got the callee-clobbered
> register set right, but somebody should check the comment in the new
> rwsem_64.S, and double-check that the code actually matches what I tried
> to do.
>
> I had to change the inline asm to get the register sizes right too, so for
> all I know this screws up x86-32 too.
>
> In other words: UNTESTED! It may molest your pets and drink all your beer.
> You have been warned.
>
Thank you for warning ;)
My host boots successfully. Here is the result.
Result of Linus's rwsem XADD patch.
Test:
    while (1) {
        touch memory
        barrier
        fork()->exit() if cpu==0
        barrier
    }
# Samples: 1121655736712
#
# Overhead Command Shared Object Symbol
# ........ ............... ........................ ......
#
50.26% multi-fault-all [kernel] [k] smp_invalidate_interrup
15.94% multi-fault-all [kernel] [k] flush_tlb_others_ipi
6.50% multi-fault-all [kernel] [k] intel_pmu_enable_all
3.17% multi-fault-all [kernel] [k] down_read_trylock
2.08% multi-fault-all [kernel] [k] do_wp_page
1.69% multi-fault-all [kernel] [k] page_fault
1.63% multi-fault-all ./multi-fault-all-fork [.] worker
1.53% multi-fault-all [kernel] [k] up_read
1.35% multi-fault-all [kernel] [k] do_page_fault
1.24% multi-fault-all [kernel] [k] _raw_spin_lock
1.10% multi-fault-all [kernel] [k] flush_tlb_page
0.96% multi-fault-all [kernel] [k] invalidate_interrupt0
0.92% multi-fault-all [kernel] [k] invalidate_interrupt3
0.90% multi-fault-all [kernel] [k] invalidate_interrupt2
Test:
    while (1) {
        touch memory
        barrier
        madvise DONTNEED to locally touched memory.
        barrier
    }
# Samples: 1335012531823
#
# Overhead Command Shared Object Symbol
# ........ ............... ........................ ......
#
32.17% multi-fault-all [kernel] [k] clear_page_c
9.60% multi-fault-all [kernel] [k] _raw_spin_lock
8.14% multi-fault-all [kernel] [k] _raw_spin_lock_irqsave
6.23% multi-fault-all [kernel] [k] down_read_trylock
4.98% multi-fault-all [kernel] [k] _raw_spin_lock_irq
4.63% multi-fault-all [kernel] [k] __mem_cgroup_try_charge
4.45% multi-fault-all [kernel] [k] up_read
3.83% multi-fault-all [kernel] [k] handle_mm_fault
3.19% multi-fault-all [kernel] [k] __rmqueue
3.05% multi-fault-all [kernel] [k] __mem_cgroup_commit_cha
2.39% multi-fault-all [kernel] [k] bad_range
1.78% multi-fault-all [kernel] [k] page_fault
1.74% multi-fault-all [kernel] [k] mem_cgroup_charge_commo
1.71% multi-fault-all [kernel] [k] lookup_page_cgroup
Then, the result is much improved by XADD rwsem.
In above profile, rwsem is still there.
But page-fault/sec is good. I hope some "big" machine users join to the test.
(I hope 4 sockets, at least..)
Here is performance counter result of DONTNEED test. Counting the number of page
faults in 60 sec. So, bigger number of page fault is better.
[XADD rwsem]
[root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
Performance counter stats for './multi-fault-all 8' (5 runs):
41950863 page-faults ( +- 1.355% )
502983592 cache-misses ( +- 0.628% )
60.002682206 seconds time elapsed ( +- 0.000% )
[my patch]
[root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
Performance counter stats for './multi-fault-all 8' (5 runs):
35835485 page-faults ( +- 0.257% )
511445661 cache-misses ( +- 0.770% )
60.004243198 seconds time elapsed ( +- 0.002% )
Ah....xadd-rwsem seems to be faster than my patch ;)
Maybe my patch adds some big overhead (see below)
Then, on my host, I can get enough page-fault throughput by modifying rwsem.
Just for my interest, profile on my patch is here.
24.69% multi-fault-all [kernel] [k] clear_page_c
20.26% multi-fault-all [kernel] [k] _raw_spin_lock
8.59% multi-fault-all [kernel] [k] _raw_spin_lock_irq
4.88% multi-fault-all [kernel] [k] page_add_new_anon_rmap
4.33% multi-fault-all [kernel] [k] _raw_spin_lock_irqsave
4.27% multi-fault-all [kernel] [k] vma_put
3.55% multi-fault-all [kernel] [k] __mem_cgroup_try_charge
3.36% multi-fault-all [kernel] [k] find_vma_speculative
2.90% multi-fault-all [kernel] [k] handle_mm_fault
2.77% multi-fault-all [kernel] [k] __rmqueue
2.49% multi-fault-all [kernel] [k] bad_range
Hmm...spinlock contention is twice as big.....????
20.46% multi-fault-all [kernel] [k] _raw_spin_lock
|
--- _raw_spin_lock
|
|--81.42%-- free_pcppages_bulk
| free_hot_cold_page
| __pagevec_free
| release_pages
| free_pages_and_swap_cache
| |
| |--99.57%-- unmap_vmas
| | zap_page_range
| | sys_madvise
| | system_call_fastpath
| | 0x3f6b0e2cf7
| --0.43%-- [...]
|
|--17.86%-- get_page_from_freelist
| __alloc_pages_nodemask
| handle_mm_fault
| do_page_fault
| page_fault
| 0x400940
| (nil)
--0.71%-- [...]
This seems to be the page allocator lock. Hmm...why is this so big..
Thanks,
-Kame
On Tue, 5 Jan 2010 09:17:11 -0600 (CST)
Christoph Lameter <[email protected]> wrote:
> On Tue, 5 Jan 2010, Arjan van de Ven wrote:
>
> > while I appreciate the goal of reducing contention on this lock...
> > wouldn't step one be to remove the page zeroing from under this
> > lock? that's by far (easily by 10x I would guess) the most
> > expensive thing that's done under the lock, and I would expect a
> > first order of contention reduction just by having the zeroing of a
> > page not done under the lock...
>
> The main issue is cacheline bouncing. mmap sem is a rw semaphore and
> only held for read during a fault.
depends on the workload; on a many-threads-java workload, you also get
it for write quite a bit (lots of malloc/frees in userspace in addition
to pagefaults).. at which point you do end up serializing on the
zeroing.
There's some real life real big workloads that show this pretty badly;
so far the workaround is to have glibc batch up a lot of the free()s..
but that's just pushing it a little further out.
>
--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
>
> My host boots successfully. Here is the result.
Hey, looks good. It does have that 3% trylock overhead:
3.17% multi-fault-all [kernel] [k] down_read_trylock
but that doesn't seem excessive.
Of course, your other load with MADV_DONTNEED seems to be horrible, and
has some nasty spinlock issues, but that looks like a separate deal (I
assume that load is just very hard on the pgtable lock).
That said, profiles are hard to compare performance with - the main thing
that matters for performance is not how the profile looks, but how it
actually performs. So:
> Then, the result is much improved by XADD rwsem.
>
> In above profile, rwsem is still there.
> But page-fault/sec is good. I hope some "big" machine users join to the test.
That "page-fault/sec" number is ultimately the only thing that matters.
> Here is peformance counter result of DONTNEED test. Counting the number of page
> faults in 60 sec. So, bigger number of page fault is better.
>
> [XADD rwsem]
> [root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
>
> Performance counter stats for './multi-fault-all 8' (5 runs):
>
> 41950863 page-faults ( +- 1.355% )
> 502983592 cache-misses ( +- 0.628% )
>
> 60.002682206 seconds time elapsed ( +- 0.000% )
>
> [my patch]
> [root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
>
> Performance counter stats for './multi-fault-all 8' (5 runs):
>
> 35835485 page-faults ( +- 0.257% )
> 511445661 cache-misses ( +- 0.770% )
>
> 60.004243198 seconds time elapsed ( +- 0.002% )
>
> Ah....xadd-rwsem seems to be faster than my patch ;)
Hey, that sounds great. NOTE! My patch really could be improved. In
particular, I suspect that on x86-64, we should take advantage of the
64-bit counter, and use a different RW_BIAS. That way we're not limited to
32k readers, which _could_ otherwise be a problem.
So consider my rwsem patch to be purely preliminary. Now that you've
tested it, I feel a lot better about it being basically correct, but it
has room for improvement.
Linus
On Tue, 5 Jan 2010 19:27:07 -0800 (PST)
Linus Torvalds <[email protected]> wrote:
>
>
> On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> >
> > My host boots successfully. Here is the result.
>
> Hey, looks good. It does have that 3% trylock overhead:
>
> 3.17% multi-fault-all [kernel] [k] down_read_trylock
>
> but that doesn't seem excessive.
>
> Of course, your other load with MADV_DONTNEED seems to be horrible, and
> has some nasty spinlock issues, but that looks like a separate deal (I
> assume that load is just very hard on the pgtable lock).
>
It's zone->lock, I guess. My test program avoids pgtable lock problem.
> That said, profiles are hard to compare performance with - the main thing
> that matters for performance is not how the profile looks, but how it
> actually performs. So:
>
> > Then, the result is much improved by XADD rwsem.
> >
> > In above profile, rwsem is still there.
> > But page-fault/sec is good. I hope some "big" machine users join to the test.
>
> That "page-fault/sec" number is ultimately the only thing that matters.
>
yes.
> > Here is peformance counter result of DONTNEED test. Counting the number of page
> > faults in 60 sec. So, bigger number of page fault is better.
> >
> > [XADD rwsem]
> > [root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
> >
> > Performance counter stats for './multi-fault-all 8' (5 runs):
> >
> > 41950863 page-faults ( +- 1.355% )
> > 502983592 cache-misses ( +- 0.628% )
> >
> > 60.002682206 seconds time elapsed ( +- 0.000% )
> >
> > [my patch]
> > [root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
> >
> > Performance counter stats for './multi-fault-all 8' (5 runs):
> >
> > 35835485 page-faults ( +- 0.257% )
> > 511445661 cache-misses ( +- 0.770% )
> >
> > 60.004243198 seconds time elapsed ( +- 0.002% )
> >
> > Ah....xadd-rwsem seems to be faster than my patch ;)
>
> Hey, that sounds great. NOTE! My patch really could be improved. In
> particular, I suspect that on x86-64, we should take advantage of the
> 64-bit counter, and use a different RW_BIAS. That way we're not limited to
> 32k readers, which _could_ otherwise be a problem.
>
> So consider my rwsem patch to be purely preliminary. Now that you've
> tested it, I feel a lot better about it being basically correct, but it
> has room for improvement.
>
I'd like to stop updating my patch and wait and see how this issue goes.
Anyway, a test on a big machine is appreciated, because I can test only on a
2-socket host.
Thanks,
-Kame
On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> >
> > Of course, your other load with MADV_DONTNEED seems to be horrible, and
> > has some nasty spinlock issues, but that looks like a separate deal (I
> > assume that load is just very hard on the pgtable lock).
>
> It's zone->lock, I guess. My test program avoids pgtable lock problem.
Yeah, I should have looked more at your callchain. That's nasty. Much
worse than the per-mm lock. I thought the page buffering would avoid the
zone lock becoming a huge problem, but clearly not in this case.
Linus
On Tue, 5 Jan 2010 20:20:56 -0800 (PST)
Linus Torvalds <[email protected]> wrote:
>
>
> On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> > >
> > > Of course, your other load with MADV_DONTNEED seems to be horrible, and
> > > has some nasty spinlock issues, but that looks like a separate deal (I
> > > assume that load is just very hard on the pgtable lock).
> >
> > It's zone->lock, I guess. My test program avoids pgtable lock problem.
>
> Yeah, I should have looked more at your callchain. That's nasty. Much
> worse than the per-mm lock. I thought the page buffering would avoid the
> zone lock becoming a huge problem, but clearly not in this case.
>
For my mental peace, I rewrote the test program as
    while () {
        touch memory
        barrier
        madvise DONTNEED all range by cpu 0
        barrier
    }
And serialized the madvise() calls.
Then, zone->lock disappears and I don't see a big difference between the XADD rwsem and
my tricky patch. I think I got a reasonable result, and fixing rwsem is the sane way.
next target will be clear_page()? hehe.
What catches my eyes is cost of memcg... (>_<
Thank you all,
-Kame
==
[XADD rwsem]
[root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
Performance counter stats for './multi-fault-all 8' (5 runs):
33029186 page-faults ( +- 0.146% )
348698659 cache-misses ( +- 0.149% )
60.002876268 seconds time elapsed ( +- 0.001% )
# Samples: 815596419603
#
# Overhead Command Shared Object Symbol
# ........ ............... ........................ ......
#
41.51% multi-fault-all [kernel] [k] clear_page_c
9.08% multi-fault-all [kernel] [k] down_read_trylock
6.23% multi-fault-all [kernel] [k] up_read
6.17% multi-fault-all [kernel] [k] __mem_cgroup_try_charg
4.76% multi-fault-all [kernel] [k] handle_mm_fault
3.77% multi-fault-all [kernel] [k] __mem_cgroup_commit_ch
3.62% multi-fault-all [kernel] [k] __rmqueue
2.30% multi-fault-all [kernel] [k] _raw_spin_lock
2.30% multi-fault-all [kernel] [k] page_fault
2.12% multi-fault-all [kernel] [k] mem_cgroup_charge_comm
2.05% multi-fault-all [kernel] [k] bad_range
1.78% multi-fault-all [kernel] [k] _raw_spin_lock_irq
1.53% multi-fault-all [kernel] [k] lookup_page_cgroup
1.44% multi-fault-all [kernel] [k] __mem_cgroup_uncharge_
1.41% multi-fault-all ./multi-fault-all [.] worker
1.30% multi-fault-all [kernel] [k] get_page_from_freelist
1.06% multi-fault-all [kernel] [k] page_remove_rmap
[async page fault]
[root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
Performance counter stats for './multi-fault-all 8' (5 runs):
33345089 page-faults ( +- 0.555% )
357660074 cache-misses ( +- 1.438% )
60.003711279 seconds time elapsed ( +- 0.002% )
40.94% multi-fault-all [kernel] [k] clear_page_c
6.96% multi-fault-all [kernel] [k] vma_put
6.82% multi-fault-all [kernel] [k] page_add_new_anon_rmap
5.86% multi-fault-all [kernel] [k] __mem_cgroup_try_charg
4.40% multi-fault-all [kernel] [k] __rmqueue
4.14% multi-fault-all [kernel] [k] find_vma_speculative
3.97% multi-fault-all [kernel] [k] handle_mm_fault
3.52% multi-fault-all [kernel] [k] _raw_spin_lock
3.46% multi-fault-all [kernel] [k] __mem_cgroup_commit_ch
2.23% multi-fault-all [kernel] [k] bad_range
2.16% multi-fault-all [kernel] [k] mem_cgroup_charge_comm
1.96% multi-fault-all [kernel] [k] _raw_spin_lock_irq
1.75% multi-fault-all [kernel] [k] mem_cgroup_add_lru_lis
1.73% multi-fault-all [kernel] [k] page_fault
At last, your patient try got the problem solved, although it's not from your patch series.
Thanks for the very patient try and testing until now, Kame. :) I learned a lot of things from this thread.
Thanks, all.
--
Kind regards,
Minchan Kim
On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
>
> 9.08% multi-fault-all [kernel] [k] down_read_trylock
Btw, the "trylock" implementation of rwsemaphores doesn't look very good
on x86, even after teaching x86-64 to use the xadd versions of rwsems.
The reason? It sadly does a "load/inc/cmpxchg" sequence to do an "add if
there are no writers", and that may be the obvious model, but it sucks
from a cache access standpoint.
Why? Because it starts the sequence with a plain read. So if it doesn't
have the cacheline, it will likely get it first in non-exclusive mode, and
then the cmpxchg a few instructions later will need to turn it into an
exclusive ownership.
So now that trylock may well end up doing two bus accesses rather than
one.
It's possible that an OoO x86 CPU might notice the "read followed by
r-m-w" pattern and just turn it into an exclusive cache fetch immediately,
but I don't think they are quite that smart. But who knows?
Anyway, for the above reason it might actually be better to _avoid_ the
load entirely, and just make __down_read_trylock() assume the rwsem starts
out unlocked - replace the initial memory load with just loading a
constant.
That way, it will do the cmpxchg first, and if it wasn't unlocked and had
other readers active, it will end up doing an extra cmpxchg, but still
hopefully avoid the extra bus cycles.
So it might be worth testing this trivial patch on top of my other one.
UNTESTED. But the patch is small and simple, so maybe it works anyway. It
would be interesting to hear if it makes any difference. Better? Worse? Or
is it a "There's no difference at all, Linus. You're on drugs with that
whole initial bus cycle thing"
Linus
---
arch/x86/include/asm/rwsem.h | 3 +--
1 files changed, 1 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 625baca..275c0a1 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -123,7 +123,6 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
{
__s32 result, tmp;
asm volatile("# beginning __down_read_trylock\n\t"
- " movl %0,%1\n\t"
"1:\n\t"
" movl %1,%2\n\t"
" addl %3,%2\n\t"
@@ -133,7 +132,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
"2:\n\t"
"# ending __down_read_trylock\n\t"
: "+m" (sem->count), "=&a" (result), "=&r" (tmp)
- : "i" (RWSEM_ACTIVE_READ_BIAS)
+ : "i" (RWSEM_ACTIVE_READ_BIAS), "1" (RWSEM_UNLOCKED_VALUE)
: "memory", "cc");
return result >= 0 ? 1 : 0;
}
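The same idea, rendered as a rough C sketch with GCC atomic builtins instead of the kernel's inline assembly (struct and constant names are simplified and renamed for illustration): the retry loop starts from an assumed-unlocked value instead of reading sem->count first, so the very first access to the line is the cmpxchg that wants it exclusive anyway.

#define SKETCH_RWSEM_UNLOCKED		0
#define SKETCH_ACTIVE_READ_BIAS		1

struct sketch_rwsem {
	int count;	/* wait_lock, wait_list etc. left out of the sketch */
};

static int sketch_down_read_trylock(struct sketch_rwsem *sem)
{
	int old = SKETCH_RWSEM_UNLOCKED;	/* optimistic: no initial load */

	for (;;) {
		int newval = old + SKETCH_ACTIVE_READ_BIAS;

		if (newval <= 0)		/* a writer is active or pending */
			return 0;
		/* on failure, 'old' is refreshed with the current count */
		if (__atomic_compare_exchange_n(&sem->count, &old, newval,
						0, __ATOMIC_ACQUIRE,
						__ATOMIC_RELAXED))
			return 1;
	}
}

If the semaphore really was unlocked, the single cmpxchg both fetches the line exclusively and takes the read lock; only on contention does the loop pay for a second attempt with the refreshed value.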
On Tue, 2010-01-05 at 08:10 -0800, Linus Torvalds wrote:
> You _can_ handle it (make the RCU callback just schedule the work instead
> of doing it directly), but it does sound really nasty. I suspect we should
> explore just about any other approach over this one.
Agreed, scheduling work also has the bonus that sync rcu doesn't quite
do what you expect it to etc..
Anyway, the best I could come up with is something like the below. I
tried to come up with something that would iterate mm->cpu_vm_mask, but
since faults can schedule, that doesn't work out.
Iterating the full thread group seems like a mighty expensive thing to
do in light of these massive thread freaks like Java and Google.
Will ponder things more...
---
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1277,6 +1277,7 @@ struct task_struct {
struct plist_node pushable_tasks;
struct mm_struct *mm, *active_mm;
+ struct vm_area_struct *fault_vma;
/* task state */
int exit_state;
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -3157,6 +3157,7 @@ int handle_speculative_fault(struct mm_s
goto out_unmap;
entry = *pte;
+ rcu_assign_pointer(current->fault_vma, vma);
if (read_seqcount_retry(&vma->vm_sequence, seq))
goto out_unmap;
@@ -3167,6 +3168,7 @@ int handle_speculative_fault(struct mm_s
ret = handle_pte_fault(mm, vma, address, entry, pmd, flags, seq);
out_unlock:
+ rcu_assign_pointer(current->fault_vma, NULL);
rcu_read_unlock();
return ret;
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -235,6 +235,27 @@ static void free_vma(struct vm_area_stru
call_rcu(&vma->vm_rcu_head, free_vma_rcu);
}
+static void sync_vma(struct vm_area_struct *vma)
+{
+ struct task_struct *t;
+ int block = 0;
+
+ if (!vma->vm_file)
+ return;
+
+ rcu_read_lock();
+ for (t = current; (t = next_thread(t)) != current; ) {
+ if (rcu_dereference(t->fault_vma) == vma) {
+ block = 1;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (block)
+ synchronize_rcu();
+}
+
/*
* Close a vm structure and free it, returning the next.
*/
@@ -243,6 +264,7 @@ static struct vm_area_struct *remove_vma
struct vm_area_struct *next = vma->vm_next;
might_sleep();
+ sync_vma(vma);
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (vma->vm_file) {
On Wed, 6 Jan 2010 01:39:17 -0800 (PST)
Linus Torvalds <[email protected]> wrote:
>
>
> On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> >
> > 9.08% multi-fault-all [kernel] [k] down_read_trylock
<snip>
> That way, it will do the cmpxchg first, and if it wasn't unlocked and had
> other readers active, it will end up doing an extra cmpxchg, but still
> hopefully avoid the extra bus cycles.
>
> So it might be worth testing this trivial patch on top of my other one.
>
Test: on 8-core/2-socket x86-64
    while () {
        touch memory
        barrier
        madvise DONTNEED all range by cpu 0
        barrier
    }
<Before> (cut from my post)
> [root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
>
> Performance counter stats for './multi-fault-all 8' (5 runs):
>
> 33029186 page-faults ( +- 0.146% )
> 348698659 cache-misses ( +- 0.149% )
>
> 60.002876268 seconds time elapsed ( +- 0.001% )
> 41.51% multi-fault-all [kernel] [k] clear_page_c
> 9.08% multi-fault-all [kernel] [k] down_read_trylock
> 6.23% multi-fault-all [kernel] [k] up_read
> 6.17% multi-fault-all [kernel] [k] __mem_cgroup_try_charg
<After>
[root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
Performance counter stats for './multi-fault-all 8' (5 runs):
33782787 page-faults ( +- 2.650% )
332753197 cache-misses ( +- 0.477% )
60.003984337 seconds time elapsed ( +- 0.004% )
# Samples: 1014408915089
#
# Overhead Command Shared Object Symbol
# ........ ............... ........................ ......
#
44.42% multi-fault-all [kernel] [k] clear_page_c
7.73% multi-fault-all [kernel] [k] down_read_trylock
6.65% multi-fault-all [kernel] [k] __mem_cgroup_try_char
6.15% multi-fault-all [kernel] [k] up_read
4.87% multi-fault-all [kernel] [k] handle_mm_fault
3.70% multi-fault-all [kernel] [k] __rmqueue
3.69% multi-fault-all [kernel] [k] __mem_cgroup_commit_c
2.35% multi-fault-all [kernel] [k] bad_range
Yes, it seems slightly improved, at least on this test.
But the page-fault-throughput test score is within the error range.
Thanks,
-Kame
On Tue, 5 Jan 2010, Arjan van de Ven wrote:
> On Tue, 5 Jan 2010 09:17:11 -0600 (CST)
> Christoph Lameter <[email protected]> wrote:
>
> > On Tue, 5 Jan 2010, Arjan van de Ven wrote:
> >
> > > while I appreciate the goal of reducing contention on this lock...
> > > wouldn't step one be to remove the page zeroing from under this
> > > lock? that's by far (easily by 10x I would guess) the most
> > > expensive thing that's done under the lock, and I would expect a
> > > first order of contention reduction just by having the zeroing of a
> > > page not done under the lock...
> >
> > The main issue is cacheline bouncing. mmap sem is a rw semaphore and
> > only held for read during a fault.
>
> depends on the workload; on a many-threads-java workload, you also get
> it for write quite a bit (lots of malloc/frees in userspace in addition
> to pagefaults).. at which point you do end up serializing on the
> zeroing.
>
> There's some real life real big workloads that show this pretty badly;
> so far the workaround is to have glibc batch up a lot of the free()s..
> but that's just pushing it a little further out.
Again mmap_sem is a rwsem and only a read lock is held. Zeroing in
do_anonymous_page can occur concurrently on multiple processors in the
same address space. The pte lock is intentionally taken *after* zeroing to
allow concurrent zeroing to occur.
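For context, a heavily simplified sketch of the ordering being described (this is not the real do_anonymous_page(); error paths, rmap, counters and memcg charging are all omitted): the page is allocated and zeroed before the pte lock is taken, so threads sharing an mm can zero pages in parallel while holding mmap_sem only for read.

static int sketch_anonymous_fault(struct mm_struct *mm,
				  struct vm_area_struct *vma,
				  unsigned long address, pmd_t *pmd)
{
	struct page *page;
	spinlock_t *ptl;
	pte_t *pte, entry;

	/* the expensive part: allocate and zero the page, no lock held */
	page = alloc_zeroed_user_highpage_movable(vma, address);
	if (!page)
		return VM_FAULT_OOM;

	entry = mk_pte(page, vma->vm_page_prot);

	/* only now serialize, and only on the pte lock, not mmap_sem */
	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (pte_none(*pte))
		set_pte_at(mm, address, pte, entry);
	else
		page_cache_release(page);	/* raced with another fault */
	pte_unmap_unlock(pte, ptl);

	return 0;
}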
On Thu, 7 Jan 2010, Christoph Lameter wrote:
> >
> > depends on the workload; on a many-threads-java workload, you also get
> > it for write quite a bit (lots of malloc/frees in userspace in addition
> > to pagefaults).. at which point you do end up serializing on the
> > zeroing.
> >
> > There's some real life real big workloads that show this pretty badly;
> > so far the workaround is to have glibc batch up a lot of the free()s..
> > but that's just pushing it a little further out.
>
> Again mmap_sem is a rwsem and only a read lock is held. Zeroing in
> do_anonymous_page can occur concurrently on multiple processors in the
> same address space. The pte lock is intentionally taken *after* zeroing to
> allow concurrent zeroing to occur.
You're missing what Arjan said - the java workload does a lot of memory
allocations too, causing mmap/munmap.
So now some paths are indeed holding it for writing (or need to wait for
it to become writable). And the fairness of rwsems quite possibly then
impacts throughput a _lot_..
(Side note: I wonder if we should wake up _all_ readers when we wake up
any. Right now, we wake up all readers - but only until we hit a writer.
Which is the _fair_ thing to do, but it does mean that we can end up in
horrible patterns of alternating readers/writers, when it could be much
better to just say "release the hounds" and let all pending readers go
after a writer has had its turn).
Linus
On Thu, 7 Jan 2010, Linus Torvalds wrote:
>
> (Side note: I wonder if we should wake up _all_ readers when we wake up
> any. Right now, we wake up all readers - but only until we hit a writer.
> Which is the _fair_ thing to do, but it does mean that we can end up in
> horrible patterns of alternating readers/writers, when it could be much
> better to just say "release the hounds" and let all pending readers go
> after a writer has had its turn).
Btw, this would still be "mostly fair" in the sense that you couldn't
starve writers. Any writer on the list is still guaranteed to be woken up
next, because now it will be at the front of the queue.
So it would be starvation-proof - new readers that come in _after_ we've
woken up all the old ones would not get to pass the writers. It might be
interesting to test, if somebody has a problematic threaded workload with
lots of page faults and allocations mixxed.
Linus
On Thu, Jan 07, 2010 at 08:19:56AM -0800, Linus Torvalds wrote:
>
>
> On Thu, 7 Jan 2010, Christoph Lameter wrote:
> > >
> > > depends on the workload; on a many-threads-java workload, you also get
> > > it for write quite a bit (lots of malloc/frees in userspace in addition
> > > to pagefaults).. at which point you do end up serializing on the
> > > zeroing.
> > >
> > > There's some real life real big workloads that show this pretty badly;
> > > so far the workaround is to have glibc batch up a lot of the free()s..
> > > but that's just pushing it a little further out.
> >
> > Again mmap_sem is a rwsem and only a read lock is held. Zeroing in
> > do_anonymous_page can occur concurrently on multiple processors in the
> > same address space. The pte lock is intentionally taken *after* zeroing to
> > allow concurrent zeroing to occur.
>
> You're missing what Arjan said - the jav workload does a lot of memory
> allocations too, causing mmap/munmap.
>
> So now some paths are indeed holding it for writing (or need to wait for
> it to become writable). And the fairness of rwsems quite possibly then
> impacts throughput a _lot_..
>
> (Side note: I wonder if we should wake up _all_ readers when we wake up
> any. Right now, we wake up all readers - but only until we hit a writer.
> Which is the _fair_ thing to do, but it does mean that we can end up in
> horrible patterns of alternating readers/writers, when it could be much
> better to just say "release the hounds" and let all pending readers go
> after a writer has had its turn).
This can indeed work well in many cases. The situation where it can
get you in trouble is where there are many more readers than CPUs (or
disk spindles or whatever it is that limits the amount of effective
parallelism readers can attain). In this case, releasing more readers
than can run in parallel will delay the writers for no good reason.
So one strategy is to release readers, but no more than the number of
CPUs (or whatever the limit is). More complicated strategies are out
there, but there is a limit to how much of the scheduler one should
involve in lock-granting decisions.
Thanx, Paul
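A sketch of that strategy on a made-up FIFO waiter list (this is not the kernel's rwsem_wake(), just an illustration of the policy): grant waiting readers in order, but stop at the first queued writer or once as many readers as there are CPUs have been woken.

#include <stdbool.h>

struct sketch_waiter {
	struct sketch_waiter *next;
	bool is_writer;
	void (*wake)(struct sketch_waiter *w);	/* hypothetical wake hook */
};

/*
 * Grant the lock to queued readers in FIFO order, but never past a
 * queued writer (who keeps its place at the head of the line) and
 * never to more readers than there are CPUs to run them on.  Actually
 * handing the lock to a writer is a separate path, omitted here.
 */
static unsigned int grant_readers_capped(struct sketch_waiter **head,
					 unsigned int nr_cpus)
{
	unsigned int woken = 0;

	while (*head && !(*head)->is_writer && woken < nr_cpus) {
		struct sketch_waiter *w = *head;

		*head = w->next;
		w->wake(w);
		woken++;
	}
	return woken;
}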
On Thu, 7 Jan 2010, Linus Torvalds wrote:
> You're missing what Arjan said - the jav workload does a lot of memory
> allocations too, causing mmap/munmap.
Well isn't that tunable on the app level? Get bigger chunks of memory in
order to reduce the frequency of mmap operations? If you want concurrency
of faults then mmap_sem write locking currently needs to be limited.
> So now some paths are indeed holding it for writing (or need to wait for
> it to become writable). And the fairness of rwsems quite possibly then
> impacts throughput a _lot_..
Very true. Doing range locking (maybe using the split pte lock
boundaries, shifting some state from mm_struct into vmas) may be a way to
avoid holding mmap_sem for write in that case.
> (Side note: I wonder if we should wake up _all_ readers when we wake up
> any. Right now, we wake up all readers - but only until we hit a writer.
> Which is the _fair_ thing to do, but it does mean that we can end up in
> horrible patterns of alternating readers/writers, when it could be much
> better to just say "release the hounds" and let all pending readers go
> after a writer has had its turn).
Having a cycle with concurrent readers followed by a cycle of serialized
writers may be best under heavy load. The writers need to be limited in
frequency, otherwise they will starve the readers.
On Thu, 2010-01-07 at 08:19 -0800, Linus Torvalds wrote:
>
> So now some paths are indeed holding it for writing (or need to wait for
> it to become writable). And the fairness of rwsems quite possibly then
> impacts throughput a _lot_..
Right, supposing we can make this speculative fault stuff work, then we
can basically reduce the mmap_sem usage in fault to:
- allocating new page tables
- extending the growable vmas
And do everything else without holding it, including zeroing and IO.
On Thu, 7 Jan 2010, Peter Zijlstra wrote:
>
> Right, supposing we can make this speculative fault stuff work, then we
> can basically reduce the mmap_sem usage in fault to:
>
> - allocating new page tables
> - extending the growable vmas
>
> And do everything else without holding it, including zeroing and IO.
Well, I have yet to hear a realistic scenario of _how_ to do it all
speculatively in the first place, at least not without horribly subtle
complexity issues. So I'd really rather see how far we can possibly get by
just improving mmap_sem.
Linus
On Thu, 7 Jan 2010, Linus Torvalds wrote:
>
> Well, I have yet to hear a realistic scenario of _how_ to do it all
> speculatively in the first place, at least not without horribly subtle
> complexity issues. So I'd really rather see how far we can possibly get by
> just improving mmap_sem.
For an example of this: it's entirely possible that one avenue of mmap_sem
improvement would be to look at the _writer_ side, and see how that can be
improved.
An example of where we've done that is in madvise(): we used to always
take it for writing (because _some_ madvise versions needed the exclusive
access). And suddenly some operations got way more scalable, and work in
the presence of concurrent page faults.
And quite frankly, I'd _much_ rather look at that kind of simple and
logically fairly straightforward solutions, instead of doing the whole
speculative page fault work.
For example: there's no real reason why we take mmap_sem for writing when
extending an existing vma. And while 'brk()' is a very oldfashioned way of
doing memory management, it's still quite common. So rather than looking
at subtle lockless algorithms, why not look at doing the common cases of
an extending brk? Make that one take the mmap_sem for _reading_, and then
do the extending of the brk area with a simple cmpxchg or something?
And "extending brk" is actually a lot more common than shrinking it, and
is common for exactly the kind of workloads that are often nasty right now
(threaded allocators with lots and lots of smallish allocations).
The thing is, I can pretty much _guarantee_ that the speculative page
fault is going to end up doing a lot of nasty stuff that still needs
almost-global locking, and it's likely to be more complicated and slower
for the single-threaded case (you end up needing refcounts, a new "local"
lock or something).
Sure, moving to a per-vma lock can help, but it doesn't help a lot. It
doesn't help AT ALL for the single-threaded case, and for the
multi-threaded case I will bet you that a _lot_ of cases will have one
very hot vma - the regular data vma that gets shared for normal malloc()
etc.
So I'm personally rather doubtful about the whole speculative work. It's a
fair amount of complexity without any really obvious upside. Yes, the
mmap_sem can be very annoying, but nobody can really honestly claim that
we've really optimized it all that much.
Linus
On Thu, 2010-01-07 at 09:49 -0800, Linus Torvalds wrote:
>
> The thing is, I can pretty much _guarantee_ that the speculative page
> fault is going to end up doing a lot of nasty stuff that still needs
> almost-global locking, and it's likely to be more complicated and slower
> for the single-threaded case (you end up needing refcounts, a new "local"
> lock or something).
Well, with that sync_vma() thing I posted the other day all the
speculative page fault needs is a write to current->fault_vma, the ptl
and an O(nr_threads) loop on unmap() for file vmas -- aside from writing
the pte itself of course.
On Thu, 7 Jan 2010, Peter Zijlstra wrote:
>
> Well, with that sync_vma() thing I posted the other day all the
> speculative page fault needs is a write to current->fault_vma, the ptl
> and an O(nr_threads) loop on unmap() for file vmas -- aside from writing
> the pte itself of course.
How do you handle the race of

   fault handler:                      munmap
     look up vma without locks
       .. interrupt happens or
          something delays things ..
                                       remove the vma from the list
                                       sync_vma()
     current->fault_vma = vma
     etc?
Maybe I'm missing something, but quite frankly, the above looks pretty
damn fundamental. Something needs to take a lock. If you get rid of the
mmap_sem, you need to replace it with another lock.
There's no sane way to "look up vma and atomically mark it as busy"
without locks. You can do it with extra work, ie something like
   look up vma without locks
   current->fault_vma = vma
   smp_mb();
   check that the vma is still on the list
(with the appropriate barriers on the munmap side too, of course) where you
avoid the lock by basically turning it into an ordering problem, but it
ends up being pretty much as expensive anyway for single threads.
And for lots and lots of threads, you now made that munmap be pretty
expensive.
Linus
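For reference, the ordering-based variant sketched above might look roughly like this in kernel-style C. The helpers find_vma_nolock(), vma_still_linked() and for_each_thread_in_mm() are hypothetical stand-ins, and the munmap side busy-waits instead of using synchronize_rcu(); this only illustrates the store/smp_mb()/recheck handshake and its O(nr_threads) cost, not a complete or correct solution.

/* fault side: advertise the vma we are about to use, then re-check it */
static struct vm_area_struct *speculative_vma_get(struct mm_struct *mm,
						  unsigned long address)
{
	struct vm_area_struct *vma;

	vma = find_vma_nolock(mm, address);	/* hypothetical: no mmap_sem */
	if (!vma)
		return NULL;

	current->fault_vma = vma;		/* advertise our use of it */
	smp_mb();				/* pairs with the munmap side */

	if (!vma_still_linked(mm, vma)) {	/* hypothetical re-check */
		current->fault_vma = NULL;	/* lost the race with munmap */
		return NULL;			/* caller falls back to mmap_sem */
	}
	return vma;
}

/* munmap side: runs after the vma has been unlinked from the mm */
static void wait_for_speculative_faults(struct mm_struct *mm,
					struct vm_area_struct *vma)
{
	struct task_struct *t;

	smp_mb();	/* the unlink must be visible before we scan fault_vma */

	for_each_thread_in_mm(mm, t) {		/* hypothetical iterator */
		while (ACCESS_ONCE(t->fault_vma) == vma)
			cpu_relax();		/* the O(nr_threads) cost */
	}
}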
On Thu, 7 Jan 2010, Linus Torvalds wrote:
>
> For example: there's no real reason why we take mmap_sem for writing when
> extending an existing vma. And while 'brk()' is a very oldfashioned way of
> doing memory management, it's still quite common. So rather than looking
> at subtle lockless algorithms, why not look at doing the common cases of
> an extending brk? Make that one take the mmap_sem for _reading_, and then
> do the extending of the brk area with a simple cmpxchg or something?
I didn't use cmpxchg, because we actually want to update both
'current->brk' _and_ the vma->vm_end atomically, so here's a totally
untested patch that uses the page_table_lock spinlock for it instead (it
could be a new spinlock, not worth it).
It's also totally untested and might be horribly broken. But you get the
idea.
We could probably do things like this in regular mmap() too for the
"extend a mmap" case. brk() is just especially simple.
Linus
---
mm/mmap.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
1 files changed, 79 insertions(+), 10 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index ee22989..3d07e5f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -242,23 +242,14 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
return next;
}
-SYSCALL_DEFINE1(brk, unsigned long, brk)
+static long slow_brk(unsigned long brk)
{
unsigned long rlim, retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
- unsigned long min_brk;
down_write(&mm->mmap_sem);
-#ifdef CONFIG_COMPAT_BRK
- min_brk = mm->end_code;
-#else
- min_brk = mm->start_brk;
-#endif
- if (brk < min_brk)
- goto out;
-
/*
* Check against rlimit here. If this check is done later after the test
* of oldbrk with newbrk then it can escape the test and let the data
@@ -297,6 +288,84 @@ out:
return retval;
}
+/*
+ * NOTE NOTE NOTE!
+ *
+ * We do all this speculatively with just the read lock held.
+ * If anything goes wrong, we release the read lock, and punt
+ * to the traditional write-locked version.
+ *
+ * Here "goes wrong" includes:
+ * - having to unmap the area and actually shrink it
+ * - the final cmpxchg doesn't succeed
+ * - the old brk area wasn't a simple anonymous vma
+ * etc etc
+ */
+static struct vm_area_struct *ok_to_extend_brk(struct mm_struct *mm, unsigned long cur_brk, unsigned long brk)
+{
+ struct vm_area_struct *vma, *nextvma;
+
+ nextvma = find_vma_prev(mm, cur_brk, &vma);
+ if (vma) {
+ /*
+ * Needs to be an anonymous vma that ends at the current brk,
+ * and with no following vma that would start before the
+ * new brk
+ */
+ if (vma->vm_file || cur_brk != vma->vm_end || (nextvma && brk > nextvma->vm_start))
+ vma = NULL;
+ }
+ return vma;
+}
+
+SYSCALL_DEFINE1(brk, unsigned long, brk)
+{
+ unsigned long cur_brk, min_brk;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ cur_brk = mm->brk;
+#ifdef CONFIG_COMPAT_BRK
+ min_brk = mm->end_code;
+#else
+ min_brk = mm->start_brk;
+#endif
+ if (brk < min_brk)
+ goto out;
+
+ brk = PAGE_ALIGN(brk);
+ cur_brk = PAGE_ALIGN(cur_brk);
+
+ if (brk < cur_brk)
+ goto slow_case;
+ if (brk == cur_brk)
+ goto out;
+
+ vma = ok_to_extend_brk(mm, cur_brk, brk);
+ if (!vma)
+ goto slow_case;
+
+ spin_lock(&mm->page_table_lock);
+ if (vma->vm_end == cur_brk) {
+ vma->vm_end = brk;
+ mm->brk = brk;
+ cur_brk = brk;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ if (cur_brk != brk)
+ goto slow_case;
+
+out:
+ up_read(&mm->mmap_sem);
+ return cur_brk;
+
+slow_case:
+ up_read(&mm->mmap_sem);
+ return slow_brk(brk);
+}
+
#ifdef DEBUG_MM_RB
static int browse_rb(struct rb_root *root)
{
On Thu, Jan 07, 2010 at 10:44:13AM -0800, Linus Torvalds wrote:
>
>
> On Thu, 7 Jan 2010, Linus Torvalds wrote:
> >
> > For example: there's no real reason why we take mmap_sem for writing when
> > extending an existing vma. And while 'brk()' is a very oldfashioned way of
> > doing memory management, it's still quite common. So rather than looking
> > at subtle lockless algorithms, why not look at doing the common cases of
> > an extending brk? Make that one take the mmap_sem for _reading_, and then
> > do the extending of the brk area with a simple cmpxchg or something?
>
> I didn't use cmpxchg, because we actually want to update both
> 'current->brk' _and_ the vma->vm_end atomically, so here's a totally
> untested patch that uses the page_table_lock spinlock for it instead (it
> could be a new spinlock, not worth it).
>
> It's also totally untested and might be horribly broken. But you get the
> idea.
>
> We could probably do things like this in regular mmap() too for the
> "extend a mmap" case. brk() is just especially simple.
One question on the final test...
> Linus
>
> ---
> mm/mmap.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
> 1 files changed, 79 insertions(+), 10 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index ee22989..3d07e5f 100644
[ . . . ]
> + if (!vma)
> + goto slow_case;
> +
> + spin_lock(&mm->page_table_lock);
> + if (vma->vm_end == cur_brk) {
> + vma->vm_end = brk;
> + mm->brk = brk;
> + cur_brk = brk;
> + }
> + spin_unlock(&mm->page_table_lock);
> +
> + if (cur_brk != brk)
Can this be "if (cur_brk < brk)"? Seems like it should, given the
earlier tests, but I don't claim to understand the VM code.
Thanx, Paul
> + goto slow_case;
> +
> +out:
> + up_read(&mm->mmap_sem);
> + return cur_brk;
> +
> +slow_case:
> + up_read(&mm->mmap_sem);
> + return slow_brk(brk);
> +}
> +
> #ifdef DEBUG_MM_RB
> static int browse_rb(struct rb_root *root)
> {
On Thu, 7 Jan 2010, Linus Torvalds wrote:
> + if (brk < cur_brk)
> + goto slow_case;
> + if (brk == cur_brk)
> + goto out;
> +
> + vma = ok_to_extend_brk(mm, cur_brk, brk);
> + if (!vma)
> + goto slow_case;
> +
> + spin_lock(&mm->page_table_lock);
page_table_lock used to serialize multiple fast brks?
CONFIG_SPLIT_PTLOCK implies that code will not use this lock in fault
handling. So no serialization with faults.
Also, the current code assumes that vm_end and so on are stable if mmap_sem is
held. E.g. find_vma() from do_fault() is now running while vm_end may be changing
under it.
On Thu, 7 Jan 2010, Paul E. McKenney wrote:
> > +
> > + spin_lock(&mm->page_table_lock);
> > + if (vma->vm_end == cur_brk) {
> > + vma->vm_end = brk;
> > + mm->brk = brk;
> > + cur_brk = brk;
> > + }
> > + spin_unlock(&mm->page_table_lock);
> > +
> > + if (cur_brk != brk)
>
> Can this be "if (cur_brk < brk)"? Seems like it should, given the
> earlier tests, but I don't claim to understand the VM code.
It's really just a flag, to test whether the final check (inside the
spinlock) succeeded, or whether we perhaps raced with _another_ brk() call
that also had the mm_sem for reading.
We know that cur_brk was different from brk before - because otherwise
we'd have just returned early (or done the slow case). So testing whether
it's different afterwards really only tests whether that
cur_brk = brk;
statement was executed or not.
I could have used a separate flag called "success" or something.
Linus
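For comparison, the same fast path written with the explicit flag Linus mentions would look roughly like this; a sketch only, not part of the posted patch:

	int success = 0;

	spin_lock(&mm->page_table_lock);
	if (vma->vm_end == cur_brk) {		/* nothing changed under us */
		vma->vm_end = brk;
		mm->brk = brk;
		success = 1;
	}
	spin_unlock(&mm->page_table_lock);

	if (!success)
		goto slow_case;
	cur_brk = brk;				/* fast path return value */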
On Thu, 7 Jan 2010, Christoph Lameter wrote:
>
> page_table_lock used to serialize multiple fast brks?
>
> CONFIG_SPLIT_PTLOCK implies that code will not use this lock in fault
> handling. So no serialization with faults.
Correct. The faults we do not need to serialize with. It doesn't matter
whether they see the old or the new end.
> Also, the current code assumes that vm_end and so on are stable if mmap_sem is
> held. E.g. find_vma() from do_fault() is now running while vm_end may be changing
> under it.
Again, it doesn't matter. Old or new - if some other thread looks up the
vma, either is fine.
Linus
On Thu, 7 Jan 2010, Linus Torvalds wrote:
>
> Again, it doesn't matter. Old or new - if some other thread looks up the
> vma, either is fine.
Btw, don't get me wrong. That patch may compile (I checked it), but I am
not in any way claiming that it is anything else than a total throw-away
"this is something we could look at doing" suggestion.
For example, I'm not at all wedded to using 'mm->page_table_lock': I in
fact wanted to use a per-vma lock, but I picked a lock we already had. The
fact that picking a lock we already had also means that it serializes page
table updates (sometimes) is actually a downside, not a good thing.
So the patch was meant to get people thinking about alternatives, rather
than anything else.
The point being that there are things we can play with on mmap_sem, that
don't involve getting rid of it - just being a bit more aggressive in how
we use it.
Linus
On Thu, Jan 07, 2010 at 12:06:48PM -0800, Linus Torvalds wrote:
>
>
> On Thu, 7 Jan 2010, Paul E. McKenney wrote:
> > > +
> > > + spin_lock(&mm->page_table_lock);
> > > + if (vma->vm_end == cur_brk) {
> > > + vma->vm_end = brk;
> > > + mm->brk = brk;
> > > + cur_brk = brk;
> > > + }
> > > + spin_unlock(&mm->page_table_lock);
> > > +
> > > + if (cur_brk != brk)
> >
> > Can this be "if (cur_brk < brk)"? Seems like it should, given the
> > earlier tests, but I don't claim to understand the VM code.
>
> It's really just a flag, to test whether the final check (inside the
> spinlock) succeeded, or whether we perhaps raced with _another_ brk() call
> that also had the mm_sem for reading.
>
> We know that cur_brk was different from brk before - because otherwise
> we'd have just returned early (or done the slow case). So testing whether
> it's different afterwards really only tests whether that
>
> cur_brk = brk;
>
> statement was executed or not.
>
> I could have used a separate flag called "success" or something.
I was (perhaps confusedly) thinking of a pair of threads both trying
to sbrk() at the same time. One of them wins by acquiring the
->page_table_lock first. Then the other acquires the spinlock, but
sees vma->vm_end != cur_brk. But if the first one extended the region
at least as far as the second intended to, the second one's work is done.
Of course, we can debate the sanity of an application that actually does
concurrent sbrk() calls.
Thanx, Paul
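A sketch of how the fast path could tolerate the race Paul describes, treating "another brk() already extended at least this far" as success instead of punting; purely illustrative, and it assumes that concurrent read-locked brk() calls only ever grow the area:

	spin_lock(&mm->page_table_lock);
	if (vma->vm_end >= brk) {
		/* a racing brk() already extended far enough */
		cur_brk = mm->brk;
	} else if (vma->vm_end == cur_brk) {
		vma->vm_end = brk;
		mm->brk = brk;
		cur_brk = brk;
	}
	spin_unlock(&mm->page_table_lock);

	if (cur_brk < brk)
		goto slow_case;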
On Thu, 2010-01-07 at 10:44 -0800, Linus Torvalds wrote:
>
> On Thu, 7 Jan 2010, Linus Torvalds wrote:
> >
> > For example: there's no real reason why we take mmap_sem for writing when
> > extending an existing vma. And while 'brk()' is a very oldfashioned way of
> > doing memory management, it's still quite common. So rather than looking
> > at subtle lockless algorithms, why not look at doing the common cases of
> > an extending brk? Make that one take the mmap_sem for _reading_, and then
> > do the extending of the brk area with a simple cmpxchg or something?
>
> I didn't use cmpxchg, because we actually want to update both
> 'current->brk' _and_ the vma->vm_end atomically, so here's a totally
> untested patch that uses the page_table_lock spinlock for it instead (it
> could be a new spinlock, not worth it).
>
> It's also totally untested and might be horribly broken. But you get the
> idea.
>
> We could probably do things like this in regular mmap() too for the
> "extend a mmap" case. brk() is just especially simple.
I haven't yet looked at the patch, but isn't expand_stack() kinda like
what you want? That serializes using anon_vma_lock().
On Thu, 2010-01-07 at 10:15 -0800, Linus Torvalds wrote:
>
> On Thu, 7 Jan 2010, Peter Zijlstra wrote:
> >
> > Well, with that sync_vma() thing I posted the other day all the
> > speculative page fault needs is a write to current->fault_vma, the ptl
> > and an O(nr_threads) loop on unmap() for file vmas -- aside from writing
> > the pte itself of course.
>
> How do you handle the race of
>
> fault handler: munmap
>
> look up vma without locks
>
> .. interrupt happens or
> something delays things ..
> remove the vma from the list
> sync_vma()
>
> current->fault_vma = vma
>
> etc?
>
> Maybe I'm missing something, but quite frankly, the above looks pretty
> damn fundamental. Something needs to take a lock. If you get rid of the
> mmap_sem, you need to replace it with another lock.
>
> There's no sane way to "look up vma and atomically mark it as busy"
> without locks. You can do it with extra work, ie something like
>
> look up vma without locks
> current->fault_vma = vma
> smp_mb();
> check that the vma is still on the list
>
> (with the appropriate barriers on the munmap side too, of course) where you
> avoid the lock by basically turning it into an ordering problem, but it
> ends up being pretty much as expensive anyway for single threads.
As expensive for single threads is just fine with me, as long as we get
cheaper with lots of threads.
I basically did the above revalidation with seqlocks: we look up the vma,
mark it busy, and revalidate the vma sequence count.
> And for lots and lots of threads, you now made that munmap be pretty
> expensive.
Yeah, I really hate that part too, trying hard to come up with something
smarter but my brain isn't yet co-operating :-)
On Thu, 7 Jan 2010, Peter Zijlstra wrote:
>
> I haven't yet looked at the patch, but isn't expand_stack() kinda like
> what you want? That serializes using anon_vma_lock().
Yeah, that sounds like the right thing to do. It is the same operation,
after all (and has the same effects, especially for the special case of
upwards-growing stacks).
So basically the idea is to extend that stack expansion to brk(), and
possibly mmap() in general.
Doing the same for munmap() (or shrinking things in general, which you can
do with brk but not with the stack) is quite a bit harder. As can be seen
by the fact that all the problems with the speculative approach are in the
unmap cases.
But the good news is that shrinking mappings is _much_ less common than
growing them. Many memory allocators never shrink at all, or shrink only
when they hit certain big chunks. In a lot of cases, the only time you
shrink a mapping ends up being at the final exit, which doesn't have any
locking issues anyway, since even if we take the mmap_sem lock for
writing, there aren't going to be any readers possibly left.
And a lot of growing mmaps end up just extending an old one. No, not
always, but I suspect that if we really put some effort into it, we could
probably make the write-lock frequency go down by something like an order
of magnitude on many loads.
Not all loads, no. Some loads will do a lot of file mmap's, or use
MAP_FIXED and/or mprotect to split vma's on purpose. But that is certainly
not likely to be the common case.
Linus
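A rough sketch of what extending the stack-expansion approach to brk() could look like: grow the vma with mmap_sem held only for reading, serializing concurrent growers on the anon_vma lock. The helper name is made up, and the rlimit, accounting and overlap checks the real code would need are omitted:

static int grow_vma_read_locked(struct vm_area_struct *vma,
				unsigned long new_end)
{
	/* make sure there is an anon_vma whose lock we can take */
	if (unlikely(anon_vma_prepare(vma)))
		return -ENOMEM;

	anon_vma_lock(vma);
	/*
	 * Re-check under the lock: another thread holding mmap_sem for
	 * reading may have grown this vma already.
	 */
	if (new_end > vma->vm_end)
		vma->vm_end = new_end;
	anon_vma_unlock(vma);

	return 0;
}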
On 01/07/2010 12:36 PM, Linus Torvalds wrote:
> On Thu, 7 Jan 2010, Peter Zijlstra wrote:
>>
>> Right, supposing we can make this speculative fault stuff work, then we
>> can basically reduce the mmap_sem usage in fault to:
>>
>> - allocating new page tables
>> - extending the growable vmas
>>
>> And do everything else without holding it, including zeroing and IO.
>
> Well, I have yet to hear a realistic scenario of _how_ to do it all
> speculatively in the first place, at least not without horribly subtle
> complexity issues. So I'd really rather see how far we can possibly get by
> just improving mmap_sem.
I would like to second this sentiment. I am trying to make the
anon_vma and rmap bits more scalable for multi-process server
workloads and it is quite worrying how complex locking already
is.
--
All rights reversed.
On Thu, 7 Jan 2010 14:33:50 -0800 (PST)
Linus Torvalds <[email protected]> wrote:
>
>
> On Thu, 7 Jan 2010, Peter Zijlstra wrote:
> >
> > I haven't yet looked at the patch, but isn't expand_stack() kinda like
> > what you want? That serializes using anon_vma_lock().
>
> Yeah, that sounds like the right thing to do. It is the same operation,
> after all (and has the same effects, especially for the special case of
> upwards-growing stacks).
>
> So basically the idea is to extend that stack expansion to brk(), and
> possibly mmap() in general.
>
Hmm, do_brk() sometimes unmaps a conflicting mapping. Isn't that a problem?
Stack expansion just fails with SEGV if it hits other mmaps....
Thanks,
-Kame
On Fri, 8 Jan 2010 09:23:33 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:
> On Thu, 7 Jan 2010 14:33:50 -0800 (PST)
> Linus Torvalds <[email protected]> wrote:
>
> >
> >
> > On Thu, 7 Jan 2010, Peter Zijlstra wrote:
> > >
> > > I haven't yet looked at the patch, but isn't expand_stack() kinda like
> > > what you want? That serializes using anon_vma_lock().
> >
> > Yeah, that sounds like the right thing to do. It is the same operation,
> > after all (and has the same effects, especially for the special case of
> > upwards-growing stacks).
> >
> > So basically the idea is to extend that stack expansion to brk(), and
> > possibly mmap() in general.
> >
> Hmm, do_brk() sometimes unmaps a conflicting mapping. Isn't that a problem?
> Stack expansion just fails with SEGV if it hits other mmaps....
Sorry, please ignore the last line...
Sorry for the noise.
-Kame
On Fri, 8 Jan 2010, KAMEZAWA Hiroyuki wrote:
>
> Hmm, do_brk() sometimes unmaps a conflicting mapping. Isn't that a problem?
No. For two reasons:
- sys_brk() doesn't actually do that (see the "find_vma_intersection()"
call). I'm not sure why do_brk() does, but it might have to do with
execve().
- the patch I sent out just falls back to the old code if it finds
something fishy, so it will do whatever do_brk() does regardless.
(Yes, brk() does unmap the old brk for the _shrinking_ case, of course.
Again, the patch I sent just falls back to the old behavior in that case)
Linus
On Thu, 7 Jan 2010, Linus Torvalds wrote:
>
> - the patch I sent out just falls back to the old code if it finds
> something fishy, so it will do whatever do_brk() does regardless.
Btw, I'd like to state it again - the patch I sent out was not ready to be
applied. I'm pretty sure it should check things like certain vm_flags
too (VM_LOCKED etc.), and fall back for those cases as well.
So the patch was more meant to illustrate the _concept_ than meant to
necessarily be taken seriously as-is.
Linus
On Thu, 7 Jan 2010 10:36:52 -0600 (CST)
Christoph Lameter <[email protected]> wrote:
> On Thu, 7 Jan 2010, Linus Torvalds wrote:
>
> > You're missing what Arjan said - the java workload does a lot of
> > memory allocations too, causing mmap/munmap.
>
> Well isnt that tunable on the app level? Get bigger chunks of memory
> in order to reduce the frequency of mmap operations? If you want
> concurrency of faults then mmap_sem write locking currently needs to
> be limited.
if an app has to change because our kernel sucks (for no good reason),
"change the app" really is the lame type of answer.
--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
On Thu, 7 Jan 2010, Arjan van de Ven wrote:
>
> if an app has to change because our kernel sucks (for no good reason),
> "change the app" really is the lame type of answer.
Well, in all fairness, I doubt other kernels do any better. So changing
the app is likely to help in general, and thus be at least part of the
right solution.
But as outlined, we _can_ almost certainly do better on many simple and
common cases, so changing the kernel - in addition to fixing app memory
allocation patterns - sounds like a good avenue.
Linus
On Thu, 7 Jan 2010, Arjan van de Ven wrote:
> if an app has to change because our kernel sucks (for no good reason),
> "change the app" really is the lame type of answer.
We are changing apps all of the time here to reduce the number of system
calls. Any system call usually requires context switching, scheduling
activities, etc. These are evil effects if you want the processor for computation
and are sensitive to cpu caching effects. It is good to reduce the number
of system calls as much as possible.
System calls are best placed so that each one affects the largest amount of
memory possible in a given context, and they should be avoided in loops.
On Tue, 2010-01-05 at 20:20 -0800, Linus Torvalds wrote:
>
> On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> > >
> > > Of course, your other load with MADV_DONTNEED seems to be horrible, and
> > > has some nasty spinlock issues, but that looks like a separate deal (I
> > > assume that load is just very hard on the pgtable lock).
> >
> > It's zone->lock, I guess. My test program avoids pgtable lock problem.
>
> Yeah, I should have looked more at your callchain. That's nasty. Much
> worse than the per-mm lock. I thought the page buffering would avoid the
> zone lock becoming a huge problem, but clearly not in this case.
Right, so I ran some numbers on a multi-socket (2) machine as well:
pf/min
-tip 56398626
-tip + xadd 174753190
-tip + speculative 189274319
-tip + xadd + speculative 200174641
[ variance is around 0.5% for this workload, ran most of these numbers
with --repeat 5 ]
At both the xadd/speculative point the workload is dominated by the
zone->lock, the xadd+speculative removes some of the contention, and
removing the various RSS counters could yield another few percent
according to the profiles, but then we're pretty much there.
One way around those RSS counters is to track it per task, a quick grep
shows its only the oom-killer and proc that use them.
A quick hack removing them gets us: 203158058
So from a throughput pov. the whole speculative fault thing might not be
interesting until the rest of the vm gets a lift to go along with it.
From a blocking-on-mmap_sem pov, I think Linus is right in that we
should first consider things like dropping mmap_sem around IO and page
zeroing, and generally looking at reducing hold times and such.
So while I think its quite feasible to do these speculative faults, it
appears we're not quite ready for them.
Maybe I can get -rt to carry it for a while, there we have to reduce
mmap_sem to a mutex, which hurts lots.
On Fri, 8 Jan 2010, Peter Zijlstra wrote:
> On Tue, 2010-01-05 at 20:20 -0800, Linus Torvalds wrote:
> >
> > Yeah, I should have looked more at your callchain. That's nasty. Much
> > worse than the per-mm lock. I thought the page buffering would avoid the
> > zone lock becoming a huge problem, but clearly not in this case.
>
> Right, so I ran some numbers on a multi-socket (2) machine as well:
>
> pf/min
>
> -tip 56398626
> -tip + xadd 174753190
> -tip + speculative 189274319
> -tip + xadd + speculative 200174641
>
> [ variance is around 0.5% for this workload, ran most of these numbers
> with --repeat 5 ]
That's a huge jump. It's clear that the spinlock-based rwsem's simply
suck. The speculation gets rid of some additional mmap_sem contention,
but at least for two sockets it looks like the rwsem implementation was
the biggest problem by far.
> At both the xadd/speculative point the workload is dominated by the
> zone->lock, the xadd+speculative removes some of the contention, and
> removing the various RSS counters could yield another few percent
> according to the profiles, but then we're pretty much there.
I don't know if worrying about a few percent is worth it. "Perfect is the
enemy of good", and the workload is pretty dang artificial with the whole
"remove pages and re-fault them as fast as you can".
So the benchmark is pointless and extreme, and I think it's not worth
worrying too much about details. Especially when compared to just the
*three-fold* jump from just the fairly trivial rwsem implementation change
(with speculation on top of it then adding another 15% improvement -
nothing to sneeze at, but it's still in a different class).
Of course, larger numbers of sockets will likely change the situation, but
at the same time I do suspect that workloads designed for hundreds of
cores will need to try to behave better than that benchmark anyway ;)
> One way around those RSS counters is to track it per task, a quick grep
> shows its only the oom-killer and proc that use them.
>
> A quick hack removing them gets us: 203158058
Yeah, well.. After that 200% and 15% improvement, a 1.5% improvement on a
totally artificial benchmark looks less interesting.
Because let's face it - if your workload does several million page faults
per second, you're just doing something fundamentally _wrong_.
Linus
On Fri, 8 Jan 2010, Linus Torvalds wrote:
> That's a huge jump. It's clear that the spinlock-based rwsem's simply
> suck. The speculation gets rid of some additional mmap_sem contention,
> but at least for two sockets it looks like the rwsem implementation was
> the biggest problem by far.
I'd say that the ticket lock sucks for short critical sections vs. a
simple spinlock since it forces the cacheline into shared mode.
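For readers unfamiliar with the distinction: a ticket lock acquires with an atomic add on the lock word and then spins reading that same word, so under contention every waiter keeps pulling the one cacheline. A minimal userspace sketch of the idea (not the kernel's implementation):

#include <stdatomic.h>

struct ticket_lock {
	atomic_uint next;	/* next ticket to hand out */
	atomic_uint owner;	/* ticket currently being served */
};

static void ticket_lock_acquire(struct ticket_lock *l)
{
	unsigned int me = atomic_fetch_add(&l->next, 1);

	/* every waiter spins reading the shared line until its turn */
	while (atomic_load(&l->owner) != me)
		;
}

static void ticket_lock_release(struct ticket_lock *l)
{
	atomic_fetch_add(&l->owner, 1);
}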
> Of course, larger numbers of sockets will likely change the situation, but
> at the same time I do suspect that workloads designed for hundreds of
> cores will need to try to behave better than that benchmark anyway ;)
Can we at least consider a typical standard business server, dual quad
core hyperthreaded with 16 "cpus"? Cacheline contention will increase
significantly there.
> Because let's face it - if your workload does several million page faults
> per second, you're just doing something fundamentally _wrong_.
You may just want to get your app running and it's trying to initialize
its memory in parallel on all threads. Nothing wrong with that.
On Fri, 8 Jan 2010, Christoph Lameter wrote:
>
> Can we at least consider a typical standard business server, dual quad
> core hyperthreaded with 16 "cpus"? Cacheline contention will increase
> significantly there.
I bet it won't be a problem. It's when things go cross-socket that they
suck. So 16 cpu's across two sockets I wouldn't worry about.
> > Because let's face it - if your workload does several million page faults
> > per second, you're just doing something fundamentally _wrong_.
>
> You may just want to get your app running and it's trying to initialize
> its memory in parallel on all threads. Nothing wrong with that.
Umm. That's going to be limited by the memset/memcpy, not the rwlock, I
bet.
The benchmark in question literally did a single byte write to each page
in order to show just the kernel component. That really isn't realistic
for any real load.
Linus
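For reference, the kind of microbenchmark being described boils down to touching one byte per page of a fresh anonymous mapping, so nearly all the time goes to the kernel's fault path rather than to memset(); the size and flags below are illustrative, not the actual benchmark:

#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t pagesize = sysconf(_SC_PAGESIZE);
	size_t len = 1UL << 30;		/* 1 GiB of anonymous memory */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* one write per page: one fault per page, almost no user work */
	for (size_t off = 0; off < len; off += pagesize)
		p[off] = 1;

	munmap(p, len);
	return 0;
}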
On Fri, 8 Jan 2010, Linus Torvalds wrote:
> I bet it won't be a problem. It's when things go cross-socket that they
> suck. So 16 cpu's across two sockets I wouldn't worry about.
>
> > > Because let's face it - if your workload does several million page faults
> > > per second, you're just doing something fundamentally _wrong_.
> >
> > You may just want to get your app running and it's trying to initialize
> > its memory in parallel on all threads. Nothing wrong with that.
>
> Umm. That's going to be limited by the memset/memcpy, not the rwlock, I
> bet.
That may be true for a system with 2 threads. As the number of threads
increases, so does the cacheline contention. In larger systems the
memset/memcpy is negligible.
> The benchmark in question literally did a single byte write to each page
> in order to show just the kernel component. That really isn't realistic
> for any real load.
Each anon fault also includes zeroing the page before it's ready to be
written to. The cachelines will be hot after a fault, so initialization of
any variables in the page will be fast due to that warm-up effect.
Christoph Lameter <[email protected]> writes:
> Can we at least consider a typical standard business server, dual quad
> core hyperthreaded with 16 "cpus"? Cacheline contention will increase
> significantly there.
This year's standard server will be more like 24-64 "cpus"
Cheating with locks usually doesn't work anymore at these sizes.
-Andi
--
[email protected] -- Speaking for myself only.
On Fri, 8 Jan 2010, Andi Kleen wrote:
> This year's standard server will be more like 24-64 "cpus"
What will it be? 2 or 4 sockets?
> Cheating with locks usually doesn't work anymore at these sizes.
Cheating means?
On Fri, Jan 08, 2010 at 12:56:08PM -0600, Christoph Lameter wrote:
> On Fri, 8 Jan 2010, Andi Kleen wrote:
>
> > This year's standard server will be more like 24-64 "cpus"
>
> What will it be? 2 or 4 sockets?
2-4 sockets.
> > Cheating with locks usually doesn't work anymore at these sizes.
>
> Cheating means?
Not really fixing the underlying data structure problem.
-Andi
--
[email protected] -- Speaking for myself only.
On Fri, 8 Jan 2010, Christoph Lameter wrote:
> On Fri, 8 Jan 2010, Andi Kleen wrote:
>
> > This year's standard server will be more like 24-64 "cpus"
>
> What will it be? 2 or 4 sockets?
I think we can be pretty safe in saying that two sockets is going to be
overwhelmingly the more common case.
It's simply physics and form factor. It's hard to put four powerful CPU's
on one board in any normal form-factor, so when you go from 2->4 sockets,
you almost inevitably have to go to rather fancier form-factors (or
low-power sockets designed for socket-density rather than multi-core
density, which is kind of against the point these days).
So often you end up with CPU daughter-cards etc, which involves a lot more
design and cost, and no longer fit in standard desktop enclosures for
people who want stand-alone servers etc (or even in rack setups if you
want local disks too etc).
Think about it this way: just four sockets and associated per-socket RAM
DIMM's (never mind anything else) take up a _lot_ of space. And you can
only make your boards so big before they start having purely mechanical
issues due to flexing etc.
Which is why I suspect that two sockets will be the bulk of the server
space for the foreseeable future. It's been true before, and multiple
memory channels per socket to feed all those cores are just making it even
more so.
Linus
On Fri, Jan 08, 2010 at 11:11:32AM -0800, Linus Torvalds wrote:
>
>
> On Fri, 8 Jan 2010, Christoph Lameter wrote:
>
> > On Fri, 8 Jan 2010, Andi Kleen wrote:
> >
> > > This year's standard server will be more like 24-64 "cpus"
> >
> > What will it be? 2 or 4 sockets?
>
> I think we can be pretty safe in saying that two sockets is going to be
> overwhelmingly the more common case.
With 24 CPU threads cheating is very difficult too.
Besides, even the "uncommon" part of a large pie can still be a lot of systems.
-Andi
On Fri, 8 Jan 2010, Andi Kleen wrote:
>
> With 24 CPU threads cheating is very difficult too.
Stop making value judgements in your word choice, like "cheating".
The fact is, the mmap_sem is per-mm, and works perfectly well. Other
locking can be vma-specific, but as already mentioned, it's not going to
_help_, since most of the time even hugely threaded programs will be using
malloc-like functionality and you have allocations not only cross threads,
but in general using the same vma.
Another fact is simply that you shouldn't write your app so that it needs
to do millions of page faults per second.
So this whole "cheating" argument of yours is total bullshit. It bears no
relation to reality.
Linus
On Fri, 8 Jan 2010, Linus Torvalds wrote:
>
> Another fact is simply that you shouldn't write your app so that it needs
> to do millions of page faults per second.
Side note: an app with lots of threads, and that needs to fault in lots of
pages at startup time, and has performance problems, can - and should -
likely use interfaces that are _designed_ for that.
There are things like madvise(MADV_WILLNEED) etc., which can batch up the filling
of a memory area.
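As a concrete example of that kind of batching, a thread that knows it will walk a large file-backed mapping can ask for it up front with madvise(MADV_WILLNEED) instead of paying one fault at a time; the function name and parameters here are only an illustration:

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int prefault_file(const char *path, size_t len)
{
	int fd = open(path, O_RDONLY);
	void *p;

	if (fd < 0)
		return -1;

	p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		close(fd);
		return -1;
	}

	/* ask the kernel to read the whole range in ahead of time */
	madvise(p, len, MADV_WILLNEED);

	/* ... touch p[0..len) here ... */

	munmap(p, len);
	close(fd);
	return 0;
}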
If you're doing a performance-sensitive application with hundreds of
threads, and hundreds of gigabytes of data, you had better know about
simple concepts like "batch fill", rather than whine about "oh my, my
totally special-case app takes a few seconds to start because I'M A
F*CKING MORON AND DID EVERYTHING WRONG".
Linus
On Fri, 8 Jan 2010, Christoph Lameter wrote:
>
> I'd say that the ticket lock sucks for short critical sections vs. a
> simple spinlock since it forces the cacheline into shared mode.
Btw, I do agree that it's likely made worse by the fairness of the ticket
locks and the resulting extra traffic of people waiting for their turn.
Often for absolutely no good reason, since in this case the rwlock itself
will then be granted for reading in most cases - and there are no
ordering issues on readers.
We worried about the effects of fair spinlocks when introducing the ticket
locks, but nobody ever actually had a load that seemed to indicate it made
much of a difference, and we did have a few cases where starvation was a
very noticeable problem.
Linus
On Fri, 8 Jan 2010, Linus Torvalds wrote:
> We worried about the effects of fair spinlocks when introducing the ticket
> locks, but nobody ever actually had a load that seemed to indicate it made
> much of a difference, and we did have a few cases where starvation was a
> very noticeable problem.
And I made the point that starvation was a hardware issue due to immature
cacheline handling. Now the software patchup job for the hardware breakage
is causing regressions for everyone.
On Fri, 8 Jan 2010, Christoph Lameter wrote:
>
> And I made the point that starvation was a hardware issue due to immature
> cacheline handling. Now the software patchup job for the hardware breakage
> is causing regressions for everyone.
Well, in all fairness, (a) existing hardware doesn't do a good job, and
would have a really hard time doing so in general (ie the whole issue of
on-die vs directly-between-sockets vs between-complex-fabric), and (b) in
this case, the problem really was that the x86-64 rwsems were badly
implemented.
The fact that somebody _thought_ that it might be ok to do them with
spinlocks and had done some limited testing without ever hitting the
problem spot (probably never having tested any amount of contention at
all) is immaterial. We should have had real native rwsemaphores for
x86-64, and complaining about the fallback sucking under load is kind of
pointless.
Linus
On Friday 08 January 2010 11:53:30 Peter Zijlstra wrote:
> On Tue, 2010-01-05 at 20:20 -0800, Linus Torvalds wrote:
> >
> > On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> > > >
> > > > Of course, your other load with MADV_DONTNEED seems to be horrible, and
> > > > has some nasty spinlock issues, but that looks like a separate deal (I
> > > > assume that load is just very hard on the pgtable lock).
> > >
> > > It's zone->lock, I guess. My test program avoids pgtable lock problem.
> >
> > Yeah, I should have looked more at your callchain. That's nasty. Much
> > worse than the per-mm lock. I thought the page buffering would avoid the
> > zone lock becoming a huge problem, but clearly not in this case.
>
> Right, so I ran some numbers on a multi-socket (2) machine as well:
>
> pf/min
>
> -tip 56398626
> -tip + xadd 174753190
> -tip + speculative 189274319
> -tip + xadd + speculative 200174641
Has anyone tried these patches with ramzswap? Nitin, do they help with the locking
issues you mentioned?
Thanks,
Ed
> [ variance is around 0.5% for this workload, ran most of these numbers
> with --repeat 5 ]
>
> At both the xadd/speculative point the workload is dominated by the
> zone->lock, the xadd+speculative removes some of the contention, and
> removing the various RSS counters could yield another few percent
> according to the profiles, but then we're pretty much there.
>
> One way around those RSS counters is to track it per task, a quick grep
> shows its only the oom-killer and proc that use them.
>
> A quick hack removing them gets us: 203158058
>
> So from a throughput pov. the whole speculative fault thing might not be
> interesting until the rest of the vm gets a lift to go along with it.
>
> From a blocking-on-mmap_sem pov, I think Linus is right in that we
> should first consider things like dropping mmap_sem around IO and page
> zeroing, and generally looking at reducing hold times and such.
>
> So while I think its quite feasible to do these speculative faults, it
> appears we're not quite ready for them.
>
> Maybe I can get -rt to carry it for a while, there we have to reduce
> mmap_sem to a mutex, which hurts lots.
On Fri, 8 Jan 2010 09:51:49 -0600 (CST)
Christoph Lameter <[email protected]> wrote:
> On Thu, 7 Jan 2010, Arjan van de Ven wrote:
>
> > if an app has to change because our kernel sucks (for no good
> > reason), "change the app" really is the lame type of answer.
>
> We are changing apps all of the time here to reduce the number of
> system calls.
it's fine to fix an app for something fundamental.
it sucks to need to fix an app because we decided to clear a page
inside a lock rather than outside it.
--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
On 01/09/2010 08:17 PM, Ed Tomlinson wrote:
> On Friday 08 January 2010 11:53:30 Peter Zijlstra wrote:
>> On Tue, 2010-01-05 at 20:20 -0800, Linus Torvalds wrote:
>>>
>>> On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
>>>>>
>>>>> Of course, your other load with MADV_DONTNEED seems to be horrible, and
>>>>> has some nasty spinlock issues, but that looks like a separate deal (I
>>>>> assume that load is just very hard on the pgtable lock).
>>>>
>>>> It's zone->lock, I guess. My test program avoids pgtable lock problem.
>>>
>>> Yeah, I should have looked more at your callchain. That's nasty. Much
>>> worse than the per-mm lock. I thought the page buffering would avoid the
>>> zone lock becoming a huge problem, but clearly not in this case.
>>
>> Right, so I ran some numbers on a multi-socket (2) machine as well:
>>
>> pf/min
>>
>> -tip 56398626
>> -tip + xadd 174753190
>> -tip + speculative 189274319
>> -tip + xadd + speculative 200174641
>
> Has anyone tried these patches with ramzswap? Nitin do they help with the locking
> issues you mentioned?
>
The locking problem with ramzswap seems completely unrelated to what is being discussed here.
Thanks,
Nitin