The RT patch has been having lots of trouble lately with large machines
and applications running lots of threads. This usually boils down to a
bottleneck on a single lock: the mm->mmap_sem.
The mmap_sem is an rwsem, a sleeping lock that can be taken for either
read or write: a read lock can be taken by several tasks at the same
time, while the write lock can only be taken by a single task.
But due to priority inheritance, having multiple readers makes the code
much more complex, thus the -rt patch converts all rwsems into a single
mutex, where readers may nest (the same task may grab the same rwsem for
read multiple times), but only one task may hold the rwsem at any given
time (for read or write).
When we have lots of threads, the rwsem may be taken often, either for
memory allocation or for handling page faults. This becomes a bottleneck
for threads, as only one thread at a time may grab the mmap_sem (which is
shared by all threads of a process).
Previous attempts at adding multiple readers became too complex and were
error prone. This approach takes a much simpler technique, one that is
actually used by per-cpu locks.
The idea here is to have an rwsem create a rt_mutex for each CPU.
Actually, it creates a rwsem for each CPU that can only be acquired by
one task at a time. This allows for readers on separate CPUs to take
only the per cpu lock. When a writer needs to take a lock, it must grab
all CPU locks before continuing.
This approach does nothing special with the rt_mutex or priority
inheritance code. That stays the same, and works normally (thus less
error prone). The trick here is that when a reader takes an rwsem for
read, it must disable migration, so that it can unlock the rwsem
without needing any special searches (which lock did it take?).
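In other words, the read and write fast paths boil down to roughly the
following (simplified sketch only: no lockdep, no recursive read handling,
no initialization; the real code is in the patch below):

void rt_down_read(struct rw_semaphore *rwsem)
{
	/* Pin to this CPU so rt_up_read() unlocks the same per CPU lock */
	migrate_disable();
	rt_mutex_lock(&rwsem->lock[smp_processor_id()].lock);
}

void rt_down_write(struct rw_semaphore *rwsem)
{
	int i;

	/* A writer sweeps up every CPU's lock, excluding all readers */
	for_each_possible_cpu(i)
		rt_mutex_lock(&rwsem->lock[i].lock);
}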
I've tested this a bit, and so far it works well. I haven't found a nice
way to initialize the locks, so I'm using the silly initialize_rwsem()
at all places that acquire the lock. But we can work on this later.
Also, I don't use per_cpu sections for the locks, which means we have
cache line collisions, but a normal (mainline) rwsem has that as well.
These are all room for improvement (and why this is just an RFC patch).
I'll see if I can get some numbers to see how this fixes the issues with
multi threads on big boxes.
Thoughts?
-- Steve
Not-yet-signed-off-by: Steven Rostedt <[email protected]>
diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
index 802c690..cd0c812 100644
--- a/include/linux/rwsem_rt.h
+++ b/include/linux/rwsem_rt.h
@@ -18,7 +18,7 @@
#include <linux/rtmutex.h>
-struct rw_semaphore {
+struct __rw_semaphore {
struct rt_mutex lock;
int read_depth;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -26,22 +26,40 @@ struct rw_semaphore {
#endif
};
+struct rw_semaphore {
+ int initialized;
+ struct __rw_semaphore lock[NR_CPUS];
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ const char *name;
+ struct lock_class_key __key[NR_CPUS];
+#endif
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define __RWSEM_INITIALIZER(_name) \
+ { .name = _name }
+#else
#define __RWSEM_INITIALIZER(name) \
- { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
- RW_DEP_MAP_INIT(name) }
+ { }
+
+#endif
#define DECLARE_RWSEM(lockname) \
struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
-extern void __rt_rwsem_init(struct rw_semaphore *rwsem, char *name,
+extern void __rt_rwsem_init(struct __rw_semaphore *rwsem, char *name,
struct lock_class_key *key);
-# define rt_init_rwsem(sem) \
-do { \
- static struct lock_class_key __key; \
- \
- rt_mutex_init(&(sem)->lock); \
- __rt_rwsem_init((sem), #sem, &__key); \
+# define rt_init_rwsem(sem) \
+do { \
+ static struct lock_class_key __key[NR_CPUS]; \
+ int ____i; \
+ \
+ for (____i = 0; ____i < NR_CPUS; ____i++) { \
+ rt_mutex_init(&((sem)->lock[____i]).lock); \
+ __rt_rwsem_init(&((sem)->lock[____i]), #sem, &__key[____i]); \
+ } \
+ (sem)->initialized = 1; \
} while (0)
extern void rt_down_write(struct rw_semaphore *rwsem);
@@ -55,7 +73,11 @@ extern void rt_up_write(struct rw_semaphore *rwsem);
extern void rt_downgrade_write(struct rw_semaphore *rwsem);
#define init_rwsem(sem) rt_init_rwsem(sem)
-#define rwsem_is_locked(s) rt_mutex_is_locked(&(s)->lock)
+/*
+ * Use raw_smp_processor_id(), as readlocks use migrate disable,
+ * and write locks lock all of them (we don't care which one we test.
+ */
+#define rwsem_is_locked(s) rt_mutex_is_locked(&(s)->lock[raw_smp_processor_id()].lock)
static inline void down_read(struct rw_semaphore *sem)
{
diff --git a/kernel/rt.c b/kernel/rt.c
index 092d6b3..f8dab27 100644
--- a/kernel/rt.c
+++ b/kernel/rt.c
@@ -306,18 +306,52 @@ EXPORT_SYMBOL(__rt_rwlock_init);
* rw_semaphores
*/
+static void __initialize_rwsem(struct rw_semaphore *rwsem)
+{
+ int i;
+
+ /* TODO add spinlock here? */
+ rwsem->initialized = 1;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ rt_mutex_init(&rwsem->lock[i].lock);
+ __rt_rwsem_init(&rwsem->lock[i],
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem->name, &rwsem->key[i]
+#else
+ "", 0
+#endif
+ );
+ }
+}
+
+#define initialize_rwsem(rwsem) \
+ do { \
+ if (unlikely(!rwsem->initialized)) \
+ __initialize_rwsem(rwsem); \
+ } while (0)
+
void rt_up_write(struct rw_semaphore *rwsem)
{
- rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
- rt_mutex_unlock(&rwsem->lock);
+ int i;
+
+ for_each_possible_cpu(i) {
+ rwsem_release(&rwsem->lock[i].dep_map, 1, _RET_IP_);
+ rt_mutex_unlock(&rwsem->lock[i].lock);
+ }
}
EXPORT_SYMBOL(rt_up_write);
void rt_up_read(struct rw_semaphore *rwsem)
{
- rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
- if (--rwsem->read_depth == 0)
- rt_mutex_unlock(&rwsem->lock);
+ int cpu;
+
+ cpu = smp_processor_id();
+ rwsem_release(&rwsem->lock[cpu].dep_map, 1, _RET_IP_);
+ if (--rwsem->lock[cpu].read_depth == 0) {
+ rt_mutex_unlock(&rwsem->lock[cpu].lock);
+ migrate_enable();
+ }
}
EXPORT_SYMBOL(rt_up_read);
@@ -327,67 +361,112 @@ EXPORT_SYMBOL(rt_up_read);
*/
void rt_downgrade_write(struct rw_semaphore *rwsem)
{
- BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
- rwsem->read_depth = 1;
+ int cpu;
+ int i;
+
+ migrate_disable();
+ cpu = smp_processor_id();
+ for_each_possible_cpu(i) {
+ if (cpu == i) {
+ BUG_ON(rt_mutex_owner(&rwsem->lock[i].lock) != current);
+ rwsem->lock[i].read_depth = 1;
+ } else {
+ rwsem_release(&rwsem->lock[i].dep_map, 1, _RET_IP_);
+ rt_mutex_unlock(&rwsem->lock[i].lock);
+ }
+ }
}
EXPORT_SYMBOL(rt_downgrade_write);
int rt_down_write_trylock(struct rw_semaphore *rwsem)
{
- int ret = rt_mutex_trylock(&rwsem->lock);
+ int ret;
+ int i;
- if (ret)
- rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
- return ret;
+ initialize_rwsem(rwsem);
+
+ for_each_possible_cpu(i) {
+ ret = rt_mutex_trylock(&rwsem->lock[i].lock);
+ if (ret)
+ rwsem_acquire(&rwsem->lock[i].dep_map, 0, 1, _RET_IP_);
+ else
+ goto release;
+ }
+ return 1;
+ release:
+ while (--i >= 0) {
+ rwsem_release(&rwsem->lock[i].dep_map, 1, _RET_IP_);
+ rt_mutex_unlock(&rwsem->lock[i].lock);
+ }
+ return 0;
}
EXPORT_SYMBOL(rt_down_write_trylock);
void rt_down_write(struct rw_semaphore *rwsem)
{
- rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
- rt_mutex_lock(&rwsem->lock);
+ int i;
+ initialize_rwsem(rwsem);
+ for_each_possible_cpu(i) {
+ rwsem_acquire(&rwsem->lock[i].dep_map, 0, 0, _RET_IP_);
+ rt_mutex_lock(&rwsem->lock[i].lock);
+ }
}
EXPORT_SYMBOL(rt_down_write);
void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
{
- rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
- rt_mutex_lock(&rwsem->lock);
+ int i;
+
+ initialize_rwsem(rwsem);
+ for_each_possible_cpu(i) {
+ rwsem_acquire(&rwsem->lock[i].dep_map, subclass, 0, _RET_IP_);
+ rt_mutex_lock(&rwsem->lock[i].lock);
+ }
}
EXPORT_SYMBOL(rt_down_write_nested);
int rt_down_read_trylock(struct rw_semaphore *rwsem)
{
- struct rt_mutex *lock = &rwsem->lock;
+ struct rt_mutex *lock;
int ret = 1;
+ int cpu;
+ initialize_rwsem(rwsem);
+ migrate_disable();
+ cpu = smp_processor_id();
+ lock = &rwsem->lock[cpu].lock;
/*
* recursive read locks succeed when current owns the rwsem,
* but not when read_depth == 0 which means that the rwsem is
* write locked.
*/
if (rt_mutex_owner(lock) != current)
- ret = rt_mutex_trylock(&rwsem->lock);
- else if (!rwsem->read_depth)
+ ret = rt_mutex_trylock(lock);
+ else if (!rwsem->lock[cpu].read_depth)
ret = 0;
if (ret) {
- rwsem->read_depth++;
- rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
- }
+ rwsem->lock[cpu].read_depth++;
+ rwsem_acquire(&rwsem->lock[cpu].dep_map, 0, 1, _RET_IP_);
+ } else
+ migrate_enable();
return ret;
}
EXPORT_SYMBOL(rt_down_read_trylock);
static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
{
- struct rt_mutex *lock = &rwsem->lock;
+ struct rt_mutex *lock;
+ int cpu;
- rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
+ migrate_disable();
+ cpu = smp_processor_id();
+ lock = &rwsem->lock[cpu].lock;
+ rwsem_acquire_read(&rwsem->lock[cpu].dep_map, subclass, 0, _RET_IP_);
if (rt_mutex_owner(lock) != current)
- rt_mutex_lock(&rwsem->lock);
- rwsem->read_depth++;
+ rt_mutex_lock(lock);
+ rwsem->lock[cpu].read_depth++;
}
void rt_down_read(struct rw_semaphore *rwsem)
@@ -402,7 +481,7 @@ void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
}
EXPORT_SYMBOL(rt_down_read_nested);
-void __rt_rwsem_init(struct rw_semaphore *rwsem, char *name,
+void __rt_rwsem_init(struct __rw_semaphore *rwsem, char *name,
struct lock_class_key *key)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
On Tue, 2012-05-15 at 10:03 -0400, Steven Rostedt wrote:
>
> where readers may nest (the same task may grab the same rwsem for
> read multiple times), but only one task may hold the rwsem at any
> given
> time (for read or write).
Humm, that sounds iffy, rwsem isn't a recursive read lock only rwlock_t
is.
> The idea here is to have an rwsem create a rt_mutex for each CPU.
> Actually, it creates a rwsem for each CPU that can only be acquired by
> one task at a time. This allows for readers on separate CPUs to take
> only the per cpu lock. When a writer needs to take a lock, it must
> grab
> all CPU locks before continuing.
So you've turned it into a global/local or br or whatever that thing was
called lock.
>
> Also, I don't use per_cpu sections for the locks, which means we have
> cache line collisions, but a normal (mainline) rwsem has that as well.
>
Why not?
> Thoughts?
Ideally someone would try and get rid of mmap_sem itself.. but that's a
tough nut.
> void rt_down_write(struct rw_semaphore *rwsem)
> {
> - rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
> - rt_mutex_lock(&rwsem->lock);
> + int i;
> + initialize_rwsem(rwsem);
> + for_each_possible_cpu(i) {
> + rwsem_acquire(&rwsem->lock[i].dep_map, 0, 0,
> _RET_IP_);
> + rt_mutex_lock(&rwsem->lock[i].lock);
> + }
> }
> EXPORT_SYMBOL(rt_down_write);
>
That'll make lockdep explode.. you'll want to make the whole set a
single lock and not treat it as nr_cpus locks.
On Tue, 2012-05-15 at 17:06 +0200, Peter Zijlstra wrote:
> On Tue, 2012-05-15 at 10:03 -0400, Steven Rostedt wrote:
> >
> > where readers may nest (the same task may grab the same rwsem for
> > read multiple times), but only one task may hold the rwsem at any
> > given
> > time (for read or write).
>
> Humm, that sounds iffy, rwsem isn't a recursive read lock only rwlock_t
> is.
In that case, current -rt is broken. As it has it being a recursive lock
(without my patch).
>
> > The idea here is to have an rwsem create a rt_mutex for each CPU.
> > Actually, it creates a rwsem for each CPU that can only be acquired by
> > one task at a time. This allows for readers on separate CPUs to take
> > only the per cpu lock. When a writer needs to take a lock, it must
> > grab
> > all CPU locks before continuing.
>
> So you've turned it into a global/local or br or whatever that thing was
> called lock.
Yeah, basically I'm doing what that silly thing did.
> >
> > Also, I don't use per_cpu sections for the locks, which means we have
> > cache line collisions, but a normal (mainline) rwsem has that as well.
> >
> Why not?
Because it was hard to figure out how to do it transparently from the
rest of the kernel (with non-rt being unaffected). For now I wanted a
proof of concept. If we can figure out how to make this real per cpu,
I'm all for it. In fact, I'm wondering if that wouldn't make normal
rwsems even faster in mainline (no cacheline bouncing from readers).
I'll have to look at the br_lock thingy again and see how they did it. I
couldn't remember what lock did that, thanks for the reminder ;-)
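If we do make it real per cpu, I'd guess the shape would be something
like the below (totally untested sketch; alloc_percpu() so it also works
for dynamically allocated rwsems, function names made up):

struct rw_semaphore {
	struct __rw_semaphore __percpu *lock;
};

static int rt_rwsem_alloc(struct rw_semaphore *rwsem)
{
	int i;

	/* each CPU's lock lives in that CPU's percpu area (no false sharing) */
	rwsem->lock = alloc_percpu(struct __rw_semaphore);
	if (!rwsem->lock)
		return -ENOMEM;

	for_each_possible_cpu(i)
		rt_mutex_init(&per_cpu_ptr(rwsem->lock, i)->lock);
	return 0;
}

static void rt_down_read_fast(struct rw_semaphore *rwsem)
{
	migrate_disable();
	rt_mutex_lock(&this_cpu_ptr(rwsem->lock)->lock);
}

The catch is that init_rwsem() could then fail (or would need to
preallocate), which is probably a big part of why it's not trivial to do
transparently.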
>
> > Thoughts?
>
> Ideally someone would try and get rid of mmap_sem itself.. but that's a
> tough nut.
Yeah, that would be the best scenario, but we are getting complaints
about today's -rt. :-/
>
>
>
> > void rt_down_write(struct rw_semaphore *rwsem)
> > {
> > - rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
> > - rt_mutex_lock(&rwsem->lock);
> > + int i;
> > + initialize_rwsem(rwsem);
> > + for_each_possible_cpu(i) {
> > + rwsem_acquire(&rwsem->lock[i].dep_map, 0, 0,
> > _RET_IP_);
> > + rt_mutex_lock(&rwsem->lock[i].lock);
> > + }
> > }
> > EXPORT_SYMBOL(rt_down_write);
> >
> That'll make lockdep explode.. you'll want to make the whole set a
> single lock and not treat it as nr_cpus locks.
Yeah, I thought about making it a single entity, but I was thinking the
write lock would complain or something. But looking back, I think we can
do this. Also note, I wrote this at 4am this morning as I had insomnia
and couldn't sleep. I thought of this then and decided to write it out.
Thus this is an I-can't-sleep-let-me-write-a-patch-to-piss-off-tglx
patch.
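Something like the below, I guess (untested sketch: a single dep_map for
the whole set, so lockdep only ever sees one lock per rwsem instead of
NR_CPUS of them):

struct rw_semaphore {
	struct __rw_semaphore lock[NR_CPUS];
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;	/* one map/class for the whole set */
#endif
};

void rt_down_write(struct rw_semaphore *rwsem)
{
	int i;

	/* tell lockdep about the rwsem once, not once per CPU */
	rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
	for_each_possible_cpu(i)
		rt_mutex_lock(&rwsem->lock[i].lock);
}

void rt_up_write(struct rw_semaphore *rwsem)
{
	int i;

	rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
	for_each_possible_cpu(i)
		rt_mutex_unlock(&rwsem->lock[i].lock);
}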
-- Steve
On Tue, 2012-05-15 at 10:03 -0400, Steven Rostedt wrote:
> I'll see if I can get some numbers to see how this fixes the issues with
> multi threads on big boxes.
>
I couldn't get access to the big box, so I wrote my own test. The
attached program is what I used. It creates 400 threads and allocates a
memory range (with mmap) of 10 gigs. Then it runs all 400 threads, where
each fights to read this new memory, causing lots of page faults.
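Roughly, it does something like this (just a sketch along the lines of
the attached program, not the actual attachment; the page-touch stride
and error handling are from memory):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/time.h>

#define NR_THREADS	400
#define MAP_SIZE	(10UL * 1024 * 1024 * 1024)	/* 10 gigs */

static char *map;
static volatile char sink;

static void *reader(void *arg)
{
	unsigned long i;

	/* touch every page, each touch is a page fault the first time */
	for (i = 0; i < MAP_SIZE; i += 4096)
		sink = map[i];
	return NULL;
}

int main(void)
{
	pthread_t threads[NR_THREADS];
	struct timeval start, end;
	unsigned long usecs;
	int i;

	map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}
	printf("map=%lu\n", MAP_SIZE);

	gettimeofday(&start, NULL);
	for (i = 0; i < NR_THREADS; i++)
		pthread_create(&threads[i], NULL, reader, NULL);
	for (i = 0; i < NR_THREADS; i++)
		pthread_join(threads[i], NULL);
	gettimeofday(&end, NULL);

	usecs = (end.tv_sec - start.tv_sec) * 1000000UL +
		(end.tv_usec - start.tv_usec);
	printf("time = %lu usecs\n", usecs);
	return 0;
}

Built with something like: gcc -O2 -pthread test.c -o test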
I tested on a 4 CPU box with 3.4.0-rc7-rt6:
Without the patch:
map=10737418240
time = 11302617 usecs
map=10737418240
time = 11229341 usecs
map=10737418240
time = 11171463 usecs
map=10737418240
time = 11435549 usecs
map=10737418240
time = 11299086 usecs
With the patch:
map=10737418240
time = 6493796 usecs
map=10737418240
time = 6726186 usecs
map=10737418240
time = 3978194 usecs
map=10737418240
time = 6796688 usecs
So it went from roughly 11 secs to 6 secs (even had one 4 sec run). This
shows that it cut the page fault time almost in half.
-- Steve
On Tue, 2012-05-15 at 11:42 -0400, Steven Rostedt wrote:
> On Tue, 2012-05-15 at 17:06 +0200, Peter Zijlstra wrote:
> > On Tue, 2012-05-15 at 10:03 -0400, Steven Rostedt wrote:
> > >
> > > where readers may nest (the same task may grab the same rwsem for
> > > read multiple times), but only one task may hold the rwsem at any
> > > given
> > > time (for read or write).
> >
> > Humm, that sounds iffy, rwsem isn't a recursive read lock only rwlock_t
> > is.
>
> In that case, current -rt is broken. As it has it being a recursive lock
> (without my patch).
>
Why wouldn't it be recursive. If two different tasks are allowed to grab
a read lock at the same time, why can't the same task grab a read lock
twice? As long as it releases it the same amount of times.
Now you can't grab a read lock if you have the write lock.
-- Steve
On Tue, 2012-05-15 at 13:25 -0400, Steven Rostedt wrote:
> On Tue, 2012-05-15 at 11:42 -0400, Steven Rostedt wrote:
> > On Tue, 2012-05-15 at 17:06 +0200, Peter Zijlstra wrote:
> > > On Tue, 2012-05-15 at 10:03 -0400, Steven Rostedt wrote:
> > > >
> > > > where readers may nest (the same task may grab the same rwsem for
> > > > read multiple times), but only one task may hold the rwsem at any
> > > > given
> > > > time (for read or write).
> > >
> > > Humm, that sounds iffy, rwsem isn't a recursive read lock only rwlock_t
> > > is.
> >
> > In that case, current -rt is broken. As it has it being a recursive lock
> > (without my patch).
Nah not broken, just pointless. A recursive lock that's not used
recursively is fine.
>
> Why wouldn't it be recursive. If two different tasks are allowed to grab
> a read lock at the same time, why can't the same task grab a read lock
> twice? As long as it releases it the same amount of times.
>
> Now you can't grab a read lock if you have the write lock.
rwsem is fifo-fair; if a writer comes in between, the second read
acquisition (even by the same task) would block and you'd have a deadlock,
since the write won't succeed while you're still holding a reader.
On Tue, 2012-05-15 at 19:31 +0200, Peter Zijlstra wrote:
> On Tue, 2012-05-15 at 13:25 -0400, Steven Rostedt wrote:
> > On Tue, 2012-05-15 at 11:42 -0400, Steven Rostedt wrote:
> > > On Tue, 2012-05-15 at 17:06 +0200, Peter Zijlstra wrote:
> > > > On Tue, 2012-05-15 at 10:03 -0400, Steven Rostedt wrote:
> > > > >
> > > > > where readers may nest (the same task may grab the same rwsem for
> > > > > read multiple times), but only one task may hold the rwsem at any
> > > > > given
> > > > > time (for read or write).
> > > >
> > > > Humm, that sounds iffy, rwsem isn't a recursive read lock only rwlock_t
> > > > is.
> > >
> > > In that case, current -rt is broken. As it has it being a recursive lock
> > > (without my patch).
>
> Nah not broken, just pointless. A recursive lock that's not used
> recursively is fine.
Heh, sure :-) But as -rt keeps it recursive, I didn't want to change
that.
>
> >
> > Why wouldn't it be recursive. If two different tasks are allowed to grab
> > a read lock at the same time, why can't the same task grab a read lock
> > twice? As long as it releases it the same amount of times.
> >
> > Now you can't grab a read lock if you have the write lock.
>
> rwsem is fifo-fair; if a writer comes in between, the second read
> acquisition (even by the same task) would block and you'd have a deadlock,
> since the write won't succeed while you're still holding a reader.
Yep agreed. And this patch didn't change that either.
-- Steve
On Tue, 15 May 2012, Steven Rostedt wrote:
> The RT patch has been having lots of trouble lately with large machines
> and applications running lots of threads. This usually boils down to a
> bottle neck of a single lock: the mm->mmap_sem.
It looks interesting. I wanted to compile it and test it, but started
running into some problems. I fixed two simple things, but wanted to wait
to see if you would follow Peter's suggestion for lockdep before
proceeding too far.
Thanks
John
>From b70162eaaaa72263d6f13571c1f4675192f4f6cc Mon Sep 17 00:00:00 2001
From: John Kacur <[email protected]>
Date: Tue, 15 May 2012 18:25:06 +0200
Subject: [PATCH 1/2] Stringify "name" in __RWSEM_INITIALIZER
This fixes compile errors of the type:
error: initializer element is not constant
Signed-off-by: John Kacur <[email protected]>
---
include/linux/rwsem_rt.h | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
index cd0c812..dba3b50 100644
--- a/include/linux/rwsem_rt.h
+++ b/include/linux/rwsem_rt.h
@@ -37,7 +37,7 @@ struct rw_semaphore {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define __RWSEM_INITIALIZER(_name) \
- { .name = _name }
+ { .name = #_name }
#else
#define __RWSEM_INITIALIZER(name) \
{ }
--
1.7.2.3
>From faefd7e9189b29aa8f8c2b3961b1c05889c27cd7 Mon Sep 17 00:00:00 2001
From: John Kacur <[email protected]>
Date: Tue, 15 May 2012 18:49:36 +0200
Subject: [PATCH 2/2] Fix wrong member name in __initialize_rwsem - change key to __key
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Fix the following error
linux-rt/kernel/rt.c:320: error: ‘struct rw_semaphore’ has no member named ‘key’
Signed-off-by: John Kacur <[email protected]>
---
kernel/rt.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/kernel/rt.c b/kernel/rt.c
index f8dab27..86efaa6 100644
--- a/kernel/rt.c
+++ b/kernel/rt.c
@@ -317,7 +317,7 @@ static void __initialize_rwsem(struct rw_semaphore *rwsem)
rt_mutex_init(&rwsem->lock[i].lock);
__rt_rwsem_init(&rwsem->lock[i],
#ifdef CONFIG_DEBUG_LOCK_ALLOC
- rwsem->name, &rwsem->key[i]
+ rwsem->name, &rwsem->__key[i]
#else
"", 0
#endif
--
1.7.2.3
On Tue, 2012-05-15 at 20:00 +0200, John Kacur wrote:
>
> It looks interesting. I wanted to compile it and test it, but started
> running into some problems. I fixed two simple things, but wanted to wait
> to see if you would follow Peter's suggestion for lockdep before
> proceeding too far.
I wouldn't bother compiling it with lockdep (yet), as it is completely
broken. I'll need to implement Peter's suggestion.
-- Steve
On Tue, May 15, 2012 at 10:03:45AM -0400, Steven Rostedt wrote:
> The RT patch has been having lots of trouble lately with large machines
> and applications running lots of threads. This usually boils down to a
> bottle neck of a single lock: the mm->mmap_sem.
Some researchers at MIT RCU-ified this lock:
http://people.csail.mit.edu/nickolai/papers/clements-bonsai.pdf
They have the patches in a git tree that can be found here:
http://pdos.csail.mit.edu/mosbench/
Looks like some work required to get this mainline-ready -- for one thing,
it is based on an old kernel.
Thanx, Paul
On Thu, May 17, 2012 at 08:18:38AM -0700, Paul E. McKenney wrote:
> On Tue, May 15, 2012 at 10:03:45AM -0400, Steven Rostedt wrote:
> > The RT patch has been having lots of trouble lately with large machines
> > and applications running lots of threads. This usually boils down to a
> > bottle neck of a single lock: the mm->mmap_sem.
>
> Some researchers at MIT RCU-ified this lock:
>
> http://people.csail.mit.edu/nickolai/papers/clements-bonsai.pdf
>
> They have the patches in a git tree that can be found here:
>
> http://pdos.csail.mit.edu/mosbench/
>
> Looks like some work required to get this mainline-ready -- for one thing,
> it is based on an old kernel.
More specifically, git://pdos.csail.mit.edu/mosbench/pk.git. The branch
that covers their stuff is rcuvm-pure, which is a merge of rcuvm-hybrid
and rcuvm-early-release, each of which is discussed in their paper
(above PDF). The commits are based on 2.6.37, so a bit on the old side.
About 100 non-merge commits, so not a trivial change. ;-)
Thanx, Paul
On Thu, 2012-05-17 at 08:18 -0700, Paul E. McKenney wrote:
> Some researchers at MIT RCU-ified this lock:
>
> http://people.csail.mit.edu/nickolai/papers/clements-bonsai.pdf
Ah, as have I [1].. and they seem to have gotten about as far as I have,
which means almost there but not quite [2] :-)
The most interesting case is file maps and they simply ignored those.
While I appreciate that from an academic pov, -- they can still write a
paper on the other interesting bits -- I don't really like it from a
practical point.
[1] https://lkml.org/lkml/2010/1/4/257
[2] https://lkml.org/lkml/2010/1/4/532
On Thu, May 17, 2012 at 05:32:59PM +0200, Peter Zijlstra wrote:
> On Thu, 2012-05-17 at 08:18 -0700, Paul E. McKenney wrote:
> > Some researchers at MIT RCU-ified this lock:
> >
> > http://people.csail.mit.edu/nickolai/papers/clements-bonsai.pdf
>
> Ah, as have I [1].. and they seem to have gotten about as far as I have,
> which means almost there but not quite [2] :-)
I had forgotten about that -- that was the first call for call_srcu(),
if I remember correctly.
> The most interesting case is file maps and they simply ignored those.
> While I appreciate that from an academic pov, -- they can still write a
> paper on the other interesting bits -- I don't really like it from a
> practical point.
>
> [1] https://lkml.org/lkml/2010/1/4/257
> [2] https://lkml.org/lkml/2010/1/4/532
Hmmm... Do the recent dcache changes cover some of the things that
Linus called out? Probably not, but some at least.
Thanx, Paul
On Thu, 2012-05-17 at 08:47 -0700, Paul E. McKenney wrote:
> On Thu, May 17, 2012 at 05:32:59PM +0200, Peter Zijlstra wrote:
> > On Thu, 2012-05-17 at 08:18 -0700, Paul E. McKenney wrote:
> > > Some researchers at MIT RCU-ified this lock:
> > >
> > > http://people.csail.mit.edu/nickolai/papers/clements-bonsai.pdf
> >
> > Ah, as have I [1].. and they seem to have gotten about as far as I have,
> > which means almost there but not quite [2] :-)
>
> I had forgotten about that -- that was the first call for call_srcu(),
> if I remember correctly.
>
> > The most interesting case is file maps and they simply ignored those.
> > While I appreciate that from an academic pov, -- they can still write a
> > paper on the other interesting bits -- I don't really like it from a
> > practical point.
> >
> > [1] https://lkml.org/lkml/2010/1/4/257
> > [2] https://lkml.org/lkml/2010/1/4/532
>
> Hmmm... Do the recent dcache changes cover some of the things that
> Linus called out? Probably not, but some at least.
No, and the points viro made:
https://lkml.org/lkml/2010/1/5/194
are still very much an issue, you really don't want to do fput() from an
asynchronous context. Which means you have to do synchronize_rcu() or
similar from munmap() which will be rather unpopular :/
Since we should not use per-cpu data for either files or processes
(there are simply too many of those around) the alternative is
horrendously hideous things like:
https://lkml.org/lkml/2010/1/6/136
which one cannot get away with either.
The whole thing is very vexing indeed since all of this is only needed
for ill-behaved applications since a well-constructed application will
never fault in a range it is concurrently unmapping.
Most annoying.
On Thu, May 17, 2012 at 06:17:47PM +0200, Peter Zijlstra wrote:
> On Thu, 2012-05-17 at 08:47 -0700, Paul E. McKenney wrote:
> > On Thu, May 17, 2012 at 05:32:59PM +0200, Peter Zijlstra wrote:
> > > On Thu, 2012-05-17 at 08:18 -0700, Paul E. McKenney wrote:
> > > > Some researchers at MIT RCU-ified this lock:
> > > >
> > > > http://people.csail.mit.edu/nickolai/papers/clements-bonsai.pdf
> > >
> > > Ah, as have I [1].. and they seem to have gotten about as far as I have,
> > > which means almost there but not quite [2] :-)
> >
> > I had forgotten about that -- that was the first call for call_srcu(),
> > if I remember correctly.
> >
> > > The most interesting case is file maps and they simply ignored those.
> > > While I appreciate that from an academic pov, -- they can still write a
> > > paper on the other interesting bits -- I don't really like it from a
> > > practical point.
> > >
> > > [1] https://lkml.org/lkml/2010/1/4/257
> > > [2] https://lkml.org/lkml/2010/1/4/532
> >
> > Hmmm... Do the recent dcache changes cover some of the things that
> > Linus called out? Probably not, but some at least.
>
> No, and the points viro made:
>
> https://lkml.org/lkml/2010/1/5/194
>
> are still very much an issue, you really don't want to do fput() from an
> asynchronous context. Which means you have to do synchronize_rcu() or
> similar from munmap() which will be rather unpopular :/
I don't claim to understand all of the code, but I am also unafraid to
ask stupid questions. ;-)
So, is it possible to do something like the following?
1. Schedule a workqueue from an RCU callback, and have that
workqueue do the fput (rough sketch below).
2. Make things like unmount() do rcu_barrier() followed by
flush_workqueue(), or probably multiple flush_workqueue()s.
3. If someone concurrently does munmap() and a write to the
to-be-unmapped region, then the write can legally happen.
4. Acquire mmap_sem in the fault path, but only if the fault
requires blocking, and recheck the situation under
mmap_sem -- the hope being to prevent long-lived page
faults from messing things up.
Fire away! ;-)
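For #1, I was thinking of something along the following lines (hand-waving
sketch only; the deferred_fput structure and function names are invented
for illustration):

#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct deferred_fput {
	struct rcu_head		rcu;
	struct work_struct	work;
	struct file		*filp;
};

static void deferred_fput_work(struct work_struct *work)
{
	struct deferred_fput *df = container_of(work, struct deferred_fput, work);

	/* process context: safe to do the potentially expensive fput() */
	fput(df->filp);
	kfree(df);
}

static void deferred_fput_rcu(struct rcu_head *rcu)
{
	struct deferred_fput *df = container_of(rcu, struct deferred_fput, rcu);

	/* RCU callbacks run in softirq context, so punt the fput to a workqueue */
	INIT_WORK(&df->work, deferred_fput_work);
	schedule_work(&df->work);
}

/* and the munmap path would end with: call_rcu(&df->rcu, deferred_fput_rcu); */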
> Since we should not use per-cpu data for either files or processes
> (there are simply too many of those around) the alternative is
> horrendously hideous things like:
>
> https://lkml.org/lkml/2010/1/6/136
>
> which one cannot get away with either.
>
> The whole thing is very vexing indeed since all of this is only needed
> for ill-behaved applications since a well-constructed application will
> never fault in a range it is concurrently unmapping.
>
> Most annoying.
No argument there!!!
Thanx, Paul
On Thu, 2012-05-17 at 13:08 -0700, Paul E. McKenney wrote:
> I don't claim to understand all of the code, but I am also unafraid to
> ask stupid questions. ;-)
>
> So, is it possible to do something like the following?
>
> 1. Schedule a workqueue from an RCU callback, and to have that
> workqueue do the fput.
Possible, yes, but also undesirable: fput() can do a lot of work. Viro
very much didn't want this.
> 2. Make things like unmount() do rcu_barrier() followed by
> flush_workqueue(), or probably multiple flush_workqueue()s.
For unmount() we could get away with this; unmount() isn't usually
(ever?) a critical path. However, as noted by viro, the fput() which is
still required can itself cause a tremendous amount of work, and even if
it is only synced against an unmount, having this work done from an async
context isn't desired.
> 3. If someone concurrently does munmap() and a write to the
> to-be-unmapped region, then the write can legally happen.
Not entirely different from the current situation -- the timing changes
between the RCU and current implementation, but imagine the write
happens while the unmap() is in progress but hasn't quite reached the
range we write to.
Anyway, this is all firmly in 'undefined' territory so anybody breaking
from this change deserves all the pain (and probably more) they get.
As already stated, any fault in a region that's being unmapped is the
result of an ill-formed program.
> 4. Acquire mmap_sem in the fault path, but only if the fault
> requires blocking, and recheck the situation under
> mmap_sem -- the hope being to prevent long-lived page
> faults from messing things up.
Not relevant: a fault might not need to block but could still extend the
refcount lifetime of the file object beyond the unmap and thus bear the
responsibility of the final fput, which we cannot know a priori.
It's all made much more complex by the fact that we're avoiding taking
the refcount from the speculative fault in order to avoid the 'global'
synchronization on that cacheline -- which is the real problem
really :-)
On Tue, 15 May 2012, Steven Rostedt wrote:
> +struct rw_semaphore {
> + int initialized;
> + struct __rw_semaphore lock[NR_CPUS];
So that will blow up every rw_semaphore user by
NR_CPUS * sizeof(struct __rw_semaphore)
With lockdep off thats: NR_CPUS * 48
With lockdep on thats: NR_CPUS * 128 + NR_CPUS * 8 (__key)
So for NR_CPUS=64 that's 3072 or 8704 Bytes.
That'll make e.g. XFS happy. xfs_inode has two rw_sems.
sizeof(xfs_inode) in mainline is: 856 bytes
sizeof(xfs_inode) on RT is: 1028 bytes
But with your change it would goto (NR_CPUS = 64):
1028 - 96 + 2 * 3072 = 7076 bytes
That's almost an order of magnitude!
NFS has an rwsem in the inode as well, and ext4 has two.
So we trade massive memory waste for how much performance?
We really need numbers for various scenarios. There are applications
which are pretty mmap heavy and it would really surprise me when
taking NR_CPUS locks in one go is not going to cause a massive
overhead.
Thanks,
tglx
On Tue, 2012-05-22 at 17:26 +0200, Thomas Gleixner wrote:
> On Tue, 15 May 2012, Steven Rostedt wrote:
> > +struct rw_semaphore {
> > + int initialized;
> > + struct __rw_semaphore lock[NR_CPUS];
>
> So that will blow up every rw_semaphore user by
>
> NR_CPUS * sizeof(struct __rw_semaphore)
>
> With lockdep off thats: NR_CPUS * 48
>
> With lockdep on thats: NR_CPUS * 128 + NR_CPUS * 8 (__key)
>
> So for NR_CPUS=64 that's 3072 or 8704 Bytes.
For a box that has 64 CPUS, 8k should be nothing (even for every task).
But then again, NR_CPUS is compile time option. It would be nice if we
could make NR_CPUS just what was actually available :-/
>
> That'll make e.g. XFS happy. xfs_inode has two rw_sems.
>
> sizeof(xfs_inode) in mainline is: 856 bytes
>
> sizeof(xfs_inode) on RT is: 1028 bytes
>
> But with your change it would goto (NR_CPUS = 64):
>
> 1028 - 96 + 2 * 3072 = 7076 bytes
>
> That's almost an order of magnitude!
>
> NFS has an rwsem in the inode as well, and ext4 has two.
>
> So we trade massive memory waste for how much performance?
We could always make this an option. I may be able to also do linker
tricks to make it a boot time option where the memory is allocated in
sections that can be freed if the option is not enabled. Just a thought,
I know this is making it more complex than necessary.
>
> We really need numbers for various scenarios. There are applications
> which are pretty mmap heavy and it would really surprise me when
> taking NR_CPUS locks in one go is not going to cause a massive
> overhead.
Well, it doesn't take NR_CPUS locks, it takes possible_cpus() locks,
which may be much smaller. A kernel compiled with NR_CPUS=64 but running
on a box with just 4 cpus will do a loop of 4 and not 64.
I'm all for benchmarks. But right now, making all readers pass through a
single mutex is a huge bottle neck for a lot of loads. Yes, they are
mostly Java loads, but for some strange reason, our customers seems to
like to run Java on our RT kernel :-p
-- Steve
On Tue, 22 May 2012, Steven Rostedt wrote:
> On Tue, 2012-05-22 at 17:26 +0200, Thomas Gleixner wrote:
> > On Tue, 15 May 2012, Steven Rostedt wrote:
> > > +struct rw_semaphore {
> > > + int initialized;
> > > + struct __rw_semaphore lock[NR_CPUS];
> >
> > So that will blow up every rw_semaphore user by
> >
> > NR_CPUS * sizeof(struct __rw_semaphore)
> >
> > With lockdep off thats: NR_CPUS * 48
> >
> > With lockdep on thats: NR_CPUS * 128 + NR_CPUS * 8 (__key)
> >
> > So for NR_CPUS=64 that's 3072 or 8704 Bytes.
>
> For a box that has 64 CPUS, 8k should be nothing (even for every task).
> But then again, NR_CPUS is compile time option. It would be nice if we
> could make NR_CPUS just what was actually available :-/
We are talking about inodes, not tasks. My 32 core machine has
ext4_inode_cache 1997489
xfs_inode 838780
and those are not my largest filesystems. So I pretty much care whether
my inode cache eats 20 GB or 2 GB of RAM. And so does everyone else
with a machine with large filesystems.
Even if I compile the kernel with NR_CPUS=32, it's still 11GB
vs. 2GB.
> > So we trade massive memory waste for how much performance?
>
> We could always make this an option. I may be able to also do linker
> tricks to make it a boot time option where the memory is allocated in
> sections that can be freed if the option is not enabled. Just a thought,
> I know this is making it more complex than necessary.
Oh yes, we all know your affinity to the most complex solutions. :)
> > We really need numbers for various scenarios. There are applications
> > which are pretty mmap heavy and it would really surprise me when
> > taking NR_CPUS locks in one go is not going to cause a massive
> > overhead.
>
> Well, it doesn't take NR_CPUS locks, it takes possible_cpus() locks,
> which may be much smaller. A kernel compiled with NR_CPUS=64 but running
> on a box with just 4 cpus will do a loop of 4 and not 64.
Then let's talk about 32 cores, which is what I have and not really an
exotic machine anymore.
> I'm all for benchmarks. But right now, making all readers pass through a
> single mutex is a huge bottle neck for a lot of loads. Yes, they are
> mostly Java loads, but for some strange reason, our customers seems to
> like to run Java on our RT kernel :-p
I'm well aware that mmap_sem is a PITA but replacing one nightmare
with the next one is not the best approach.
Thanks,
tglx
On Tue, 2012-05-22 at 18:40 +0200, Thomas Gleixner wrote:
> > I'm all for benchmarks. But right now, making all readers pass through a
> > single mutex is a huge bottle neck for a lot of loads. Yes, they are
> > mostly Java loads, but for some strange reason, our customers seems to
> > like to run Java on our RT kernel :-p
>
> I'm well aware that mmap_sem is a PITA but replacing one nightmare
> with the next one is not the best approach.
Perhaps we could just change the mmap_sem to use this approach. Create a
new type of rwsem/lock for -rt that we can be picky about.
Yeah, mmap_sem is a real PITA and it would be nice to have a solution
that can be used until we can convert it to an RCU lock.
-- Steve
On Tue, 22 May 2012, Steven Rostedt wrote:
> On Tue, 2012-05-22 at 18:40 +0200, Thomas Gleixner wrote:
>
> > > I'm all for benchmarks. But right now, making all readers pass through a
> > > single mutex is a huge bottle neck for a lot of loads. Yes, they are
> > > mostly Java loads, but for some strange reason, our customers seems to
> > > like to run Java on our RT kernel :-p
> >
> > I'm well aware that mmap_sem is a PITA but replacing one nightmare
> > with the next one is not the best approach.
>
> Perhaps we could just change the mmap_sem to use this approach. Create a
> new type of rwsem/lock for -rt that we can be picky about.
>
> Yeah, mmap_sem is a real PITA and it would be nice to have a solution
> that can be used until we can convert it to an RCU lock.
That still wants to be verified with numbers on a machine with at
least 32 cores and workloads which are mmap heavy. And until we have
such numbers, we can really stop arguing about that solution.
Thanks,
tglx
On Tue, 2012-05-22 at 19:07 +0200, Thomas Gleixner wrote:
> On Tue, 22 May 2012, Steven Rostedt wrote:
>
> > On Tue, 2012-05-22 at 18:40 +0200, Thomas Gleixner wrote:
> >
> > > > I'm all for benchmarks. But right now, making all readers pass through a
> > > > single mutex is a huge bottle neck for a lot of loads. Yes, they are
> > > > mostly Java loads, but for some strange reason, our customers seems to
> > > > like to run Java on our RT kernel :-p
> > >
> > > I'm well aware that mmap_sem is a PITA but replacing one nightmare
> > > with the next one is not the best approach.
> >
> > Perhaps we could just change the mmap_sem to use this approach. Create a
> > new type of rwsem/lock for -rt that we can be picky about.
> >
> > Yeah, mmap_sem is a real PITA and it would be nice to have a solution
> > that can be used until we can convert it to an RCU lock.
>
> That still wants to be verified with numbers on a machine with at
> least 32 cores and workloads which are mmap heavy. And until we have
> such numbers, we can really stop arguing about that solution.
>
Agreed. I now have to find those that complained before, and see how
this patch can help. We need the patch to get the numbers (otherwise
it's a chicken vs egg deal).
I'd also like to see what problems would happen from taking all cpu
reader locks for a given writer.
I never said that this code must be merged. I want to see the numbers
too before we decide anything. I'll still clean up the patch and
hopefully we can get others to test it out and give their feedback.
Otherwise we're just hand waving back and forth at each other, and it's
not fair because you have bigger hands than I do ;-)
-- Steve
PS. Whatever the outcome, I did say that I will *not* be porting this to
any of the stable releases, as it is a new 'feature' and not a true bug
fix.