Split the futex global spinlock futex_lock into per-hash-bucket spinlocks.
Add a check so a bucket spinlock is never taken recursively; fix from Jakub Jelinek.
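For readers skimming the diff: operations that touch two buckets (futex_requeue) always take the lower-addressed lock first, and take only one lock when both keys hash to the same bucket, so a bucket spinlock is never acquired recursively and two requeues on crossing bucket pairs cannot deadlock. A minimal sketch of that ordering, pulled out into hypothetical helpers using the struct futex_hash_bucket type introduced below (the patch itself open-codes the same logic inline):

	/* Lock two buckets in address order; take just one lock if they match. */
	static void futex_lock_two(struct futex_hash_bucket *bh1,
				   struct futex_hash_bucket *bh2)
	{
		if (bh1 < bh2) {
			spin_lock(&bh1->lock);
			spin_lock(&bh2->lock);
		} else {
			spin_lock(&bh2->lock);
			if (bh1 > bh2)		/* bh1 == bh2: only one lock needed */
				spin_lock(&bh1->lock);
		}
	}

	/* Unlock in the reverse order; again a single unlock if bh1 == bh2. */
	static void futex_unlock_two(struct futex_hash_bucket *bh1,
				     struct futex_hash_bucket *bh2)
	{
		if (bh1 < bh2) {
			spin_unlock(&bh2->lock);
			spin_unlock(&bh1->lock);
		} else {
			if (bh1 > bh2)
				spin_unlock(&bh1->lock);
			spin_unlock(&bh2->lock);
		}
	}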
--- bk-linux-2.6/kernel/futex.c 2003-09-10 13:16:54.000000000 +0800
+++ bk-linux-2.6.dev/kernel/futex.c 2003-09-10 13:09:52.000000000 +0800
@@ -79,9 +79,16 @@
struct file *filp;
};
+/*
+ * Split the global futex_lock into every hash list lock.
+ */
+struct futex_hash_bucket {
+ struct list_head chain;
+ spinlock_t lock;
+};
+
/* The key for the hash is the address + index + offset within page */
-static struct list_head futex_queues[1<<FUTEX_HASHBITS];
-static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
+static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
/* Futex-fs vfsmount entry: */
static struct vfsmount *futex_mnt;
@@ -89,7 +96,7 @@
/*
* We hash on the keys returned from get_futex_key (see below).
*/
-static inline struct list_head *hash_futex(union futex_key *key)
+static inline struct futex_hash_bucket *hash_futex(union futex_key *key)
{
return &futex_queues[hash_long(key->both.word
+ (unsigned long) key->both.ptr
@@ -214,6 +221,7 @@
static int futex_wake(unsigned long uaddr, int num)
{
struct list_head *i, *next, *head;
+ struct futex_hash_bucket *bh;
union futex_key key;
int ret;
@@ -223,9 +231,10 @@
if (unlikely(ret != 0))
goto out;
- head = hash_futex(&key);
+ bh = hash_futex(&key);
+ spin_lock(&bh->lock);
+ head = &bh->chain;
- spin_lock(&futex_lock);
list_for_each_safe(i, next, head) {
struct futex_q *this = list_entry(i, struct futex_q, list);
@@ -239,7 +248,7 @@
break;
}
}
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
out:
up_read(&current->mm->mmap_sem);
@@ -254,6 +263,7 @@
int nr_wake, int nr_requeue)
{
struct list_head *i, *next, *head1, *head2;
+ struct futex_hash_bucket *bh1, *bh2;
union futex_key key1, key2;
int ret;
@@ -266,10 +276,19 @@
if (unlikely(ret != 0))
goto out;
- head1 = hash_futex(&key1);
- head2 = hash_futex(&key2);
+ bh1 = hash_futex(&key1);
+ bh2 = hash_futex(&key2);
+ if (bh1 < bh2) {
+ spin_lock(&bh1->lock);
+ spin_lock(&bh2->lock);
+ } else {
+ spin_lock(&bh2->lock);
+ if (bh1 > bh2)
+ spin_lock(&bh1->lock);
+ }
+ head1 = &bh1->chain;
+ head2 = &bh2->chain;
- spin_lock(&futex_lock);
list_for_each_safe(i, next, head1) {
struct futex_q *this = list_entry(i, struct futex_q, list);
@@ -291,8 +310,14 @@
}
}
}
- spin_unlock(&futex_lock);
-
+ if (bh1 < bh2) {
+ spin_unlock(&bh2->lock);
+ spin_unlock(&bh1->lock);
+ } else {
+ if (bh1 > bh2)
+ spin_unlock(&bh1->lock);
+ spin_unlock(&bh2->lock);
+ }
out:
up_read(&current->mm->mmap_sem);
return ret;
@@ -301,28 +326,30 @@
static inline void queue_me(struct futex_q *q, union futex_key *key,
int fd, struct file *filp)
{
- struct list_head *head = hash_futex(key);
+ struct futex_hash_bucket *bh = hash_futex(key);
+ struct list_head *head = &bh->chain;
q->key = *key;
q->fd = fd;
q->filp = filp;
- spin_lock(&futex_lock);
+ spin_lock(&bh->lock);
list_add_tail(&q->list, head);
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
}
/* Return 1 if we were still queued (ie. 0 means we were woken) */
static inline int unqueue_me(struct futex_q *q)
{
+ struct futex_hash_bucket *bh = hash_futex(&q->key);
int ret = 0;
- spin_lock(&futex_lock);
+ spin_lock(&bh->lock);
if (!list_empty(&q->list)) {
list_del(&q->list);
ret = 1;
}
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
return ret;
}
@@ -332,6 +359,7 @@
int ret, curval;
union futex_key key;
struct futex_q q;
+ struct futex_hash_bucket *bh = NULL;
try_again:
init_waitqueue_head(&q.waiters);
@@ -373,19 +401,20 @@
* the waiter from the list.
*/
add_wait_queue(&q.waiters, &wait);
- spin_lock(&futex_lock);
+ bh = hash_futex(&key);
+ spin_lock(&bh->lock);
set_current_state(TASK_INTERRUPTIBLE);
if (unlikely(list_empty(&q.list))) {
/*
* We were woken already.
*/
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
set_current_state(TASK_RUNNING);
return 0;
}
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
time = schedule_timeout(time);
set_current_state(TASK_RUNNING);
@@ -435,13 +464,14 @@
struct poll_table_struct *wait)
{
struct futex_q *q = filp->private_data;
+ struct futex_hash_bucket *bh = hash_futex(&q->key);
int ret = 0;
poll_wait(filp, &q->waiters, wait);
- spin_lock(&futex_lock);
+ spin_lock(&bh->lock);
if (list_empty(&q->list))
ret = POLLIN | POLLRDNORM;
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
return ret;
}
@@ -587,8 +617,10 @@
register_filesystem(&futex_fs_type);
futex_mnt = kern_mount(&futex_fs_type);
- for (i = 0; i < ARRAY_SIZE(futex_queues); i++)
- INIT_LIST_HEAD(&futex_queues[i]);
+ for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+ INIT_LIST_HEAD(&futex_queues[i].chain);
+ futex_queues[i].lock = SPIN_LOCK_UNLOCKED;
+ }
return 0;
}
__initcall(init);
Good luck!
Boris Hu
(Hu Jiangtao)
On Wed, Sep 10, 2003 at 01:23:47PM +0800, Hu, Boris wrote:
> Split futex global spinlock futex_lock into hash bucket spinlocks.
> Add bucket spinlock recursively lock check fixed by Jakub Jelinek.
This looks nice and straightforward. How much did it speed up the
benchmarks?
-- wli
William Lee Irwin III wrote:
> This looks nice and straightforward. How much did it speed up the
> benchmarks?
I just tested it on a UP P4 HT machine. I wouldn't expect much
difference here; bigger machines should show more (I'll try to get
things working on my 4p machine). Volano performance increased by a bit
(~1%), but this is close to the error range.
Before: about 15479 messages per second
Now: about 15615 messages per second
The glibc test suite still passes so the code should be fine.
--
--------------. ,-. 444 Castro Street
Ulrich Drepper \ ,-----------------' \ Mountain View, CA 94041 USA
Red Hat `--' drepper at redhat.com `---------------------------
I've run some tests on a 4p P4 HT machine. Some heavy futex-using
benchmark code runs about 6% faster with the patch. So, despite the
reduced time the futex lock is actually held after Jamie's patch, the
separate locks still add benefits.
Plus, currently we do not allocate locks so that the futexes fall in
different buckets. To some extent this is possible and would be a
possible optimization if this patch goes in.
--
--------------. ,-. 444 Castro Street
Ulrich Drepper \ ,-----------------' \ Mountain View, CA 94041 USA
Red Hat `--' drepper at redhat.com `---------------------------
Ulrich Drepper wrote:
> Plus, currently we do not allocate locks so that the futexes fall in
> different buckets. To some extent this is possible and would be a
> possible optimization if this patch goes in.
What do you mean? The futexes should be in different buckets already
because of the hash function.
-- Jamie
Hu, Boris wrote:
> Split futex global spinlock futex_lock into hash bucket spinlocks.
> +/*
> + * Split the global futex_lock into every hash list lock.
> + */
> +struct futex_hash_bucket {
> + struct list_head chain;
> + spinlock_t lock;
> +};
Put "lock" first: it is always the first field accessed. That will
save a few clock cycles on some systems.
I was going to suggest something about cache alignment, but of course
that doesn't make sense. If the structure is made any larger,
it might as well contain more hash buckets.
Thinking a little deeper, it occurs to me that for scalable SMP
performance, you want:
(1 << FUTEX_HASHBITS) > some factor * SMP_CACHE_BYTES * NR_CPUS / sizeof (bucket)
To put it into perspective, consider a hypothetical 16-way P4.
SMP_CACHE_BYTES is 128 on a P4. That's 24 cache lines in the whole hash table.
If there are only a few futexes in the table at any time, the dominant
time for each operation is going to be the spinlock and associated
cache line transfers, not traversing a bucket's list.
So that hypothetical box would have an effective hash table of only 24 buckets.
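To put numbers on that, a throwaway userspace calculation (not kernel code; the 12-byte bucket assumes 32-bit pointers and a 4-byte spinlock, and FUTEX_HASHBITS is taken as 8 to match the 24-cache-line figure above):

	#include <stdio.h>

	int main(void)
	{
		unsigned long hashbits    = 8;    /* FUTEX_HASHBITS */
		unsigned long bucket_size = 12;   /* sizeof(struct futex_hash_bucket) */
		unsigned long cache_bytes = 128;  /* SMP_CACHE_BYTES on a P4 */
		unsigned long nr_cpus     = 16;   /* hypothetical 16-way box */
		unsigned long buckets     = 1UL << hashbits;
		unsigned long lines       = buckets * bucket_size / cache_bytes;

		/* Prints: 256 buckets = 3072 bytes = 24 cache lines (vs. 16 CPUs) */
		printf("%lu buckets = %lu bytes = %lu cache lines (vs. %lu CPUs)\n",
		       buckets, buckets * bucket_size, lines, nr_cpus);
		return 0;
	}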
What I'm saying is:
If you're able to benchmark changes to the code on a big box,
and you want to tune its performance, try changing
FUTEX_HASHBITS. Also try adding a dummy word into
futex_hash_bucket, and use __cacheline_aligned_in_smp on the array so that
no buckets straddle two cache lines.
Thought for the day...
-- Jamie
Sounds good. Here is the updated patch. :)
--- bk-linux-2.6/kernel/futex.c 2003-09-10 13:16:54.000000000 +0800
+++ bk-linux-2.6.dev/kernel/futex.c 2003-09-11 14:57:08.000000000 +0800
@@ -79,9 +79,17 @@
struct file *filp;
};
+/*
+ * Split the global futex_lock into every hash list lock.
+ */
+struct futex_hash_bucket {
+ spinlock_t lock;
+ struct list_head chain;
+};
+
/* The key for the hash is the address + index + offset within page */
-static struct list_head futex_queues[1<<FUTEX_HASHBITS];
-static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
+static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS] \
+ __cacheline_aligned_in_smp;
/* Futex-fs vfsmount entry: */
static struct vfsmount *futex_mnt;
@@ -89,7 +97,7 @@
/*
* We hash on the keys returned from get_futex_key (see below).
*/
-static inline struct list_head *hash_futex(union futex_key *key)
+static inline struct futex_hash_bucket *hash_futex(union futex_key *key)
{
return &futex_queues[hash_long(key->both.word
+ (unsigned long) key->both.ptr
@@ -214,6 +222,7 @@
static int futex_wake(unsigned long uaddr, int num)
{
struct list_head *i, *next, *head;
+ struct futex_hash_bucket *bh;
union futex_key key;
int ret;
@@ -223,9 +232,10 @@
if (unlikely(ret != 0))
goto out;
- head = hash_futex(&key);
+ bh = hash_futex(&key);
+ spin_lock(&bh->lock);
+ head = &bh->chain;
- spin_lock(&futex_lock);
list_for_each_safe(i, next, head) {
struct futex_q *this = list_entry(i, struct futex_q, list);
@@ -239,7 +249,7 @@
break;
}
}
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
out:
up_read(&current->mm->mmap_sem);
@@ -254,6 +264,7 @@
int nr_wake, int nr_requeue)
{
struct list_head *i, *next, *head1, *head2;
+ struct futex_hash_bucket *bh1, *bh2;
union futex_key key1, key2;
int ret;
@@ -266,10 +277,19 @@
if (unlikely(ret != 0))
goto out;
- head1 = hash_futex(&key1);
- head2 = hash_futex(&key2);
+ bh1 = hash_futex(&key1);
+ bh2 = hash_futex(&key2);
+ if (bh1 < bh2) {
+ spin_lock(&bh1->lock);
+ spin_lock(&bh2->lock);
+ } else {
+ spin_lock(&bh2->lock);
+ if (bh1 > bh2)
+ spin_lock(&bh1->lock);
+ }
+ head1 = &bh1->chain;
+ head2 = &bh2->chain;
- spin_lock(&futex_lock);
list_for_each_safe(i, next, head1) {
struct futex_q *this = list_entry(i, struct futex_q, list);
@@ -291,8 +311,14 @@
}
}
}
- spin_unlock(&futex_lock);
-
+ if (bh1 < bh2) {
+ spin_unlock(&bh2->lock);
+ spin_unlock(&bh1->lock);
+ } else {
+ if (bh1 > bh2)
+ spin_unlock(&bh1->lock);
+ spin_unlock(&bh2->lock);
+ }
out:
up_read(&current->mm->mmap_sem);
return ret;
@@ -301,28 +327,30 @@
static inline void queue_me(struct futex_q *q, union futex_key *key,
int fd, struct file *filp)
{
- struct list_head *head = hash_futex(key);
+ struct futex_hash_bucket *bh = hash_futex(key);
+ struct list_head *head = &bh->chain;
q->key = *key;
q->fd = fd;
q->filp = filp;
- spin_lock(&futex_lock);
+ spin_lock(&bh->lock);
list_add_tail(&q->list, head);
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
}
/* Return 1 if we were still queued (ie. 0 means we were woken) */
static inline int unqueue_me(struct futex_q *q)
{
+ struct futex_hash_bucket *bh = hash_futex(&q->key);
int ret = 0;
- spin_lock(&futex_lock);
+ spin_lock(&bh->lock);
if (!list_empty(&q->list)) {
list_del(&q->list);
ret = 1;
}
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
return ret;
}
@@ -332,6 +360,7 @@
int ret, curval;
union futex_key key;
struct futex_q q;
+ struct futex_hash_bucket *bh = NULL;
try_again:
init_waitqueue_head(&q.waiters);
@@ -373,19 +402,20 @@
* the waiter from the list.
*/
add_wait_queue(&q.waiters, &wait);
- spin_lock(&futex_lock);
+ bh = hash_futex(&key);
+ spin_lock(&bh->lock);
set_current_state(TASK_INTERRUPTIBLE);
if (unlikely(list_empty(&q.list))) {
/*
* We were woken already.
*/
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
set_current_state(TASK_RUNNING);
return 0;
}
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
time = schedule_timeout(time);
set_current_state(TASK_RUNNING);
@@ -435,13 +465,14 @@
struct poll_table_struct *wait)
{
struct futex_q *q = filp->private_data;
+ struct futex_hash_bucket *bh = hash_futex(&q->key);
int ret = 0;
poll_wait(filp, &q->waiters, wait);
- spin_lock(&futex_lock);
+ spin_lock(&bh->lock);
if (list_empty(&q->list))
ret = POLLIN | POLLRDNORM;
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
return ret;
}
@@ -587,8 +618,10 @@
register_filesystem(&futex_fs_type);
futex_mnt = kern_mount(&futex_fs_type);
- for (i = 0; i < ARRAY_SIZE(futex_queues); i++)
- INIT_LIST_HEAD(&futex_queues[i]);
+ for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+ INIT_LIST_HEAD(&futex_queues[i].chain);
+ futex_queues[i].lock = SPIN_LOCK_UNLOCKED;
+ }
return 0;
}
__initcall(init);
> Hu, Boris wrote:
> > Split futex global spinlock futex_lock into hash bucket spinlocks.
>
> > +/*
> > + * Split the global futex_lock into every hash list lock.
> > + */
> > +struct futex_hash_bucket {
> > + struct list_head chain;
> > + spinlock_t lock;
> > +};
>
> Put "lock" first: it is always the first field accessed. That will
> save a few clock cycles on some systems.
>
> I was going to suggest something about cache alignment, but of course
> that doesn't make sense. If the structure is made any larger,
> it might as well contain more hash buckets.
>
> Thinking a little deeper, it occurs to me that for scalable SMP
> performance, you want:
>
> (1 << FUTEX_HASHBITS) > some factor * SMP_CACHE_BYTES * NR_CPUS / sizeof (bucket)
>
> To put it into perspective, consider a hypothetical 16-way P4.
> SMP_CACHE_BYTES is 128 on a P4. That's 24 cache lines in the whole hash table.
>
> If there are only a few futexes in the table at any time, the dominant
> time for each operation is going to be the spinlock and associated
> cache line transfers, not traversing a bucket's list.
>
> So that hypothetical box would have an effective hash table of only 24
> buckets.
>
> What I'm saying is:
>
> If you're able to benchmark changes to the code on a big box,
> and you want to tune its performance, try changing
> FUTEX_HASHBITS. Also try adding a dummy word into
> futex_hash_bucket, and use __cacheline_aligned_in_smp on the array so that
> no buckets straddle two cache lines.
>
> Thought for the day...
> -- Jamie
In message <37FBBA5F3A361C41AB7CE44558C3448E01C0B8DE@pdsmsx403.ccr.corp.intel.com> you write:
> +/*
> + * Split the global futex_lock into every hash list lock.
> + */
> +struct futex_hash_bucket {
> + spinlock_t lock;
> + struct list_head chain;
> +};
> +
> /* The key for the hash is the address + index + offset within page */
> -static struct list_head futex_queues[1<<FUTEX_HASHBITS];
> -static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
> +static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS] \
> + __cacheline_aligned_in_smp;
This doesn't do what you expect, unfortunately. You need:
struct futex_hash_bucket {
spinlock_t lock;
struct list_head chain;
} ____cacheline_aligned_in_smp;
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]
__cacheline_aligned_in_smp;
Also, Jamie was hinting at a sectored approach: optimal memory
footprint/speed balance might be one lock in one cache-line-worth of
list_head. But the above will do unless someone gets extremely
excited.
Uli, can we ask you for benchmarks with this change, too?
Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Rusty Russell wrote:
> Uli, can we ask you for benchmarks with this change, too?
After these changes the code still works but I couldn't really measure
any differences to the code without the extra attributes. This is on a
4p machine with 10 processes running concurrently, using mutexes and
condvars with 250 threads each. This might be because either the hash
function is good or very bad (i.e., hashes all futexes in the same
bucket or far away). I guess the extra attributes don't hurt.
--
--------------. ,-. 444 Castro Street
Ulrich Drepper \ ,-----------------' \ Mountain View, CA 94041 USA
Red Hat `--' drepper at redhat.com `---------------------------
IIRC, according to Ulrich's previous mail, even after Jamie's patch the
separate spinlocks could still add a 6% bonus in one of his benchmarks. :)
Here is the update of the patch.
Thanks.
-boris
--- bk-linux-2.6/kernel/futex.c 2003-09-10 13:16:54.000000000 +0800
+++ bk-linux-2.6.dev/kernel/futex.c 2003-09-16 15:16:27.000000000 +0800
@@ -79,9 +79,17 @@
struct file *filp;
};
+/*
+ * Split the global futex_lock into every hash list lock.
+ */
+struct futex_hash_bucket {
+ spinlock_t lock;
+ struct list_head chain;
+} ____cacheline_aligned_in_smp;
+
/* The key for the hash is the address + index + offset within page */
-static struct list_head futex_queues[1<<FUTEX_HASHBITS];
-static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
+static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS] \
+ __cacheline_aligned_in_smp;
/* Futex-fs vfsmount entry: */
static struct vfsmount *futex_mnt;
@@ -89,7 +97,7 @@
/*
* We hash on the keys returned from get_futex_key (see below).
*/
-static inline struct list_head *hash_futex(union futex_key *key)
+static inline struct futex_hash_bucket *hash_futex(union futex_key *key)
{
return &futex_queues[hash_long(key->both.word
+ (unsigned long) key->both.ptr
@@ -214,6 +222,7 @@
static int futex_wake(unsigned long uaddr, int num)
{
struct list_head *i, *next, *head;
+ struct futex_hash_bucket *bh;
union futex_key key;
int ret;
@@ -223,9 +232,10 @@
if (unlikely(ret != 0))
goto out;
- head = hash_futex(&key);
+ bh = hash_futex(&key);
+ spin_lock(&bh->lock);
+ head = &bh->chain;
- spin_lock(&futex_lock);
list_for_each_safe(i, next, head) {
struct futex_q *this = list_entry(i, struct futex_q, list);
@@ -239,7 +249,7 @@
break;
}
}
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
out:
up_read(&current->mm->mmap_sem);
@@ -254,6 +264,7 @@
int nr_wake, int nr_requeue)
{
struct list_head *i, *next, *head1, *head2;
+ struct futex_hash_bucket *bh1, *bh2;
union futex_key key1, key2;
int ret;
@@ -266,10 +277,19 @@
if (unlikely(ret != 0))
goto out;
- head1 = hash_futex(&key1);
- head2 = hash_futex(&key2);
+ bh1 = hash_futex(&key1);
+ bh2 = hash_futex(&key2);
+ if (bh1 < bh2) {
+ spin_lock(&bh1->lock);
+ spin_lock(&bh2->lock);
+ } else {
+ spin_lock(&bh2->lock);
+ if (bh1 > bh2)
+ spin_lock(&bh1->lock);
+ }
+ head1 = &bh1->chain;
+ head2 = &bh2->chain;
- spin_lock(&futex_lock);
list_for_each_safe(i, next, head1) {
struct futex_q *this = list_entry(i, struct futex_q, list);
@@ -291,8 +311,14 @@
}
}
}
- spin_unlock(&futex_lock);
-
+ if (bh1 < bh2) {
+ spin_unlock(&bh2->lock);
+ spin_unlock(&bh1->lock);
+ } else {
+ if (bh1 > bh2)
+ spin_unlock(&bh1->lock);
+ spin_unlock(&bh2->lock);
+ }
out:
up_read(&current->mm->mmap_sem);
return ret;
@@ -301,28 +327,30 @@
static inline void queue_me(struct futex_q *q, union futex_key *key,
int fd, struct file *filp)
{
- struct list_head *head = hash_futex(key);
+ struct futex_hash_bucket *bh = hash_futex(key);
+ struct list_head *head = &bh->chain;
q->key = *key;
q->fd = fd;
q->filp = filp;
- spin_lock(&futex_lock);
+ spin_lock(&bh->lock);
list_add_tail(&q->list, head);
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
}
/* Return 1 if we were still queued (ie. 0 means we were woken) */
static inline int unqueue_me(struct futex_q *q)
{
+ struct futex_hash_bucket *bh = hash_futex(&q->key);
int ret = 0;
- spin_lock(&futex_lock);
+ spin_lock(&bh->lock);
if (!list_empty(&q->list)) {
list_del(&q->list);
ret = 1;
}
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
return ret;
}
@@ -332,6 +360,7 @@
int ret, curval;
union futex_key key;
struct futex_q q;
+ struct futex_hash_bucket *bh = NULL;
try_again:
init_waitqueue_head(&q.waiters);
@@ -373,19 +402,20 @@
* the waiter from the list.
*/
add_wait_queue(&q.waiters, &wait);
- spin_lock(&futex_lock);
+ bh = hash_futex(&key);
+ spin_lock(&bh->lock);
set_current_state(TASK_INTERRUPTIBLE);
if (unlikely(list_empty(&q.list))) {
/*
* We were woken already.
*/
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
set_current_state(TASK_RUNNING);
return 0;
}
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
time = schedule_timeout(time);
set_current_state(TASK_RUNNING);
@@ -435,13 +465,14 @@
struct poll_table_struct *wait)
{
struct futex_q *q = filp->private_data;
+ struct futex_hash_bucket *bh = hash_futex(&q->key);
int ret = 0;
poll_wait(filp, &q->waiters, wait);
- spin_lock(&futex_lock);
+ spin_lock(&bh->lock);
if (list_empty(&q->list))
ret = POLLIN | POLLRDNORM;
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
return ret;
}
@@ -587,8 +618,10 @@
register_filesystem(&futex_fs_type);
futex_mnt = kern_mount(&futex_fs_type);
- for (i = 0; i < ARRAY_SIZE(futex_queues); i++)
- INIT_LIST_HEAD(&futex_queues[i]);
+ for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+ INIT_LIST_HEAD(&futex_queues[i].chain);
+ futex_queues[i].lock = SPIN_LOCK_UNLOCKED;
+ }
return 0;
}
__initcall(init);
> In message <37FBBA5F3A361C41AB7CE44558C3448E01C0B8DE@pdsmsx403.ccr.corp.intel.com>
> you write:
> > +/*
> > + * Split the global futex_lock into every hash list lock.
> > + */
> > +struct futex_hash_bucket {
> > + spinlock_t lock;
> > + struct list_head chain;
> > +};
> > +
> > /* The key for the hash is the address + index + offset within page */
> > -static struct list_head futex_queues[1<<FUTEX_HASHBITS];
> > -static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
> > +static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS] \
> > + __cacheline_aligned_in_smp;
>
> This doesn't do what you expect, unfortunately. You need:
>
> struct futex_hash_bucket {
> spinlock_t lock;
> struct list_head chain;
> } ____cacheline_aligned_in_smp;
>
> static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]
> __cacheline_aligned_in_smp;
>
> Also, Jamie was hinting at a sectored approach: optimal memory
> footprint/speed balance might be one lock in one cache-line-worth of
> list_head. But the above will do unless someone gets extremely
> excited.
>
> Uli, can we ask you for benchmarks with this change, too?
>
> Cheers,
> Rusty.
> --
> Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Ulrich Drepper wrote:
> After these changes the code still works but I couldn't really measure
> any differences to the code without the extra attributes. This is on a
> 4p machine with 10 processes running concurrently, using mutexes and
> condvars with 250 threads each. This might be because either the hash
> function is good or very bad (i.e., hashes all futexes in the same
> bucket or far away). I guess the extra attributes don't hurt.
On a 4 CPU machine the hash table is plenty large enough without the
extra attributes. (Assuming 128 byte cache lines: 3072 bytes without
(24 cache lines); 32768 bytes with).
The extra cache lines might hurt a bit when all the threads run on a
single CPU, or on a HT, just because 32768 bytes is a lot more L1 than
3072 bytes.
-- Jamie
I don't agree with this change:
Rusty Russell wrote:
> This doesn't do what you expect, unfortunately. You need:
>
> struct futex_hash_bucket {
> spinlock_t lock;
> struct list_head chain;
> } ____cacheline_aligned_in_smp;
>
> static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]
> __cacheline_aligned_in_smp;
> Also, Jamie was hinting at a sectored approach: optimal memory
> footprint/speed balance might be one lock in one cache-line-worth of
> list_head.
That was before I thought a bit more :)
If you're increasing the size of futex_hash_bucket, you can put
multiple lists in each one (as you observed). That reduces the
average list length, making hash operations faster when the table is
quite full.
There isn't any speed penalty for having a lock per list_head when you
do that, because the cache line has to bounce anyway for the
list_head, and locks are small. (On HT, where cache lines are shared
between sibling CPUs, a lock per list is a slight improvement).
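A sketch of what that packed layout could look like, assuming 32-bit sizes (4-byte spinlock, 8-byte list_head, so ten 12-byte buckets fit in a 128-byte line); the names futex_bucket_line and BUCKETS_PER_LINE are made up for illustration and nothing like this is in the patch:

	#define BUCKETS_PER_LINE 10	/* 10 * 12 bytes = 120 <= one 128-byte cache line */

	struct futex_bucket {
		spinlock_t lock;	/* a lock per list, as argued above */
		struct list_head chain;
	};

	struct futex_bucket_line {
		struct futex_bucket b[BUCKETS_PER_LINE];
	} ____cacheline_aligned_in_smp;

	/* hash_futex() would then index the flattened array of buckets. */
	static struct futex_bucket_line
	futex_lines[((1 << FUTEX_HASHBITS) + BUCKETS_PER_LINE - 1) / BUCKETS_PER_LINE];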
Therefore, if cache line transfer between CPUs is a bottleneck, it
makes more sense to increase FUTEX_HASHBITS than to increase the
alignment of each bucket.
Increasing the alignment from 12 bytes to 16 bytes, to prevent buckets
straddling 2 cache lines, makes sense though. (Alternatively using a
single-linked list would do the trick, and be kinder on UP).
> Uli, can we ask you for benchmarks with this change, too?
Theoretically, I'd expect slightly better performance from increasing
FUTEX_HASHBITS than from this change, if there is any improvement at all.
Note that we're still bouncing mmap_sem on every futex operation, so
if the 6% improvement from splitting locks is anything to go by,
splitting mmap_sem should make a further measurable improvement,
although it might slow down the rest of the kernel.
-- Jamie
In message <[email protected]> you write:
> Therefore, if cache line transfer between CPUs is a bottleneck, it
> makes more sense to increase FUTEX_HASHBITS than to increase the
> alignment of each bucket.
Clear thinking as usual. One request: could you please make a really
stupid mistake to make me feel better? 8)
So I think we should take Boris' original patch, but just drop the
"__cacheline_aligned_in_smp" bit.
Below is that patch again, with the hashing change patch following
(Uli, if you're feeling bored, benchmarks on the hash change would be
interesting).
When Linus comes back I'll send them both, unless there are
complaints.
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Name: Split futex global spinlock futex_lock
Author: "Hu, Boris" <[email protected]>
Status: Booted on 2.6.0-test5-bk3
D: Simple change to split the futex lock into a per-hashchain lock.
D: Don't bother cacheline aligning: Jamie points out that increasing
D: FUTEX_HASHBITS would have more payoff.
D:
D: Ulrich Drepper reports a 6% improvement on a 4-way futex-thrashing
D: benchmark.
--- bk-linux-2.6/kernel/futex.c 2003-09-10 13:16:54.000000000 +0800
+++ bk-linux-2.6.dev/kernel/futex.c 2003-09-11 14:57:08.000000000 +0800
@@ -79,9 +79,16 @@
struct file *filp;
};
+/*
+ * Split the global futex_lock into every hash list lock.
+ */
+struct futex_hash_bucket {
+ spinlock_t lock;
+ struct list_head chain;
+};
+
/* The key for the hash is the address + index + offset within page */
-static struct list_head futex_queues[1<<FUTEX_HASHBITS];
-static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
+static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
/* Futex-fs vfsmount entry: */
static struct vfsmount *futex_mnt;
@@ -89,7 +97,7 @@
/*
* We hash on the keys returned from get_futex_key (see below).
*/
-static inline struct list_head *hash_futex(union futex_key *key)
+static inline struct futex_hash_bucket *hash_futex(union futex_key *key)
{
return &futex_queues[hash_long(key->both.word
+ (unsigned long) key->both.ptr
@@ -214,6 +222,7 @@
static int futex_wake(unsigned long uaddr, int num)
{
struct list_head *i, *next, *head;
+ struct futex_hash_bucket *bh;
union futex_key key;
int ret;
@@ -223,9 +232,10 @@
if (unlikely(ret != 0))
goto out;
- head = hash_futex(&key);
+ bh = hash_futex(&key);
+ spin_lock(&bh->lock);
+ head = &bh->chain;
- spin_lock(&futex_lock);
list_for_each_safe(i, next, head) {
struct futex_q *this = list_entry(i, struct futex_q, list);
@@ -239,7 +249,7 @@
break;
}
}
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
out:
up_read(&current->mm->mmap_sem);
@@ -254,6 +264,7 @@
int nr_wake, int nr_requeue)
{
struct list_head *i, *next, *head1, *head2;
+ struct futex_hash_bucket *bh1, *bh2;
union futex_key key1, key2;
int ret;
@@ -266,10 +277,19 @@
if (unlikely(ret != 0))
goto out;
- head1 = hash_futex(&key1);
- head2 = hash_futex(&key2);
+ bh1 = hash_futex(&key1);
+ bh2 = hash_futex(&key2);
+ if (bh1 < bh2) {
+ spin_lock(&bh1->lock);
+ spin_lock(&bh2->lock);
+ } else {
+ spin_lock(&bh2->lock);
+ if (bh1 > bh2)
+ spin_lock(&bh1->lock);
+ }
+ head1 = &bh1->chain;
+ head2 = &bh2->chain;
- spin_lock(&futex_lock);
list_for_each_safe(i, next, head1) {
struct futex_q *this = list_entry(i, struct futex_q, list);
@@ -291,8 +311,14 @@
}
}
}
- spin_unlock(&futex_lock);
-
+ if (bh1 < bh2) {
+ spin_unlock(&bh2->lock);
+ spin_unlock(&bh1->lock);
+ } else {
+ if (bh1 > bh2)
+ spin_unlock(&bh1->lock);
+ spin_unlock(&bh2->lock);
+ }
out:
up_read(&current->mm->mmap_sem);
return ret;
@@ -301,28 +327,30 @@
static inline void queue_me(struct futex_q *q, union futex_key *key,
int fd, struct file *filp)
{
- struct list_head *head = hash_futex(key);
+ struct futex_hash_bucket *bh = hash_futex(key);
+ struct list_head *head = &bh->chain;
q->key = *key;
q->fd = fd;
q->filp = filp;
- spin_lock(&futex_lock);
+ spin_lock(&bh->lock);
list_add_tail(&q->list, head);
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
}
/* Return 1 if we were still queued (ie. 0 means we were woken) */
static inline int unqueue_me(struct futex_q *q)
{
+ struct futex_hash_bucket *bh = hash_futex(&q->key);
int ret = 0;
- spin_lock(&futex_lock);
+ spin_lock(&bh->lock);
if (!list_empty(&q->list)) {
list_del(&q->list);
ret = 1;
}
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
return ret;
}
@@ -332,6 +360,7 @@
int ret, curval;
union futex_key key;
struct futex_q q;
+ struct futex_hash_bucket *bh = NULL;
try_again:
init_waitqueue_head(&q.waiters);
@@ -373,19 +402,20 @@
* the waiter from the list.
*/
add_wait_queue(&q.waiters, &wait);
- spin_lock(&futex_lock);
+ bh = hash_futex(&key);
+ spin_lock(&bh->lock);
set_current_state(TASK_INTERRUPTIBLE);
if (unlikely(list_empty(&q.list))) {
/*
* We were woken already.
*/
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
set_current_state(TASK_RUNNING);
return 0;
}
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
time = schedule_timeout(time);
set_current_state(TASK_RUNNING);
@@ -435,13 +465,14 @@
struct poll_table_struct *wait)
{
struct futex_q *q = filp->private_data;
+ struct futex_hash_bucket *bh = hash_futex(&q->key);
int ret = 0;
poll_wait(filp, &q->waiters, wait);
- spin_lock(&futex_lock);
+ spin_lock(&bh->lock);
if (list_empty(&q->list))
ret = POLLIN | POLLRDNORM;
- spin_unlock(&futex_lock);
+ spin_unlock(&bh->lock);
return ret;
}
@@ -587,8 +618,10 @@
register_filesystem(&futex_fs_type);
futex_mnt = kern_mount(&futex_fs_type);
- for (i = 0; i < ARRAY_SIZE(futex_queues); i++)
- INIT_LIST_HEAD(&futex_queues[i]);
+ for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+ INIT_LIST_HEAD(&futex_queues[i].chain);
+ futex_queues[i].lock = SPIN_LOCK_UNLOCKED;
+ }
return 0;
}
__initcall(init);
Name: Minor Tweaks To Jamie Lokier & Hugh Dickins's Futex Patch
Author: Rusty Russell
Status: Booted on 2.6.0-test5-bk3
Depends: Misc/futex-split-lock.patch.gz
D: Minor changes to Jamie & Hugh's excellent futex patch.
D: 1) Remove obsolete comment above hash array decl.
D: 2) Clarify comment about TASK_INTERRUPTIBLE.
D: 3) Andrew Morton says spurious wakeup is a bug. Catch it.
D: 4) Use Jenkins hash.
D: 5) Make hash function non-inline.
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31053-linux-2.6.0-test5-bk3/kernel/futex.c .31053-linux-2.6.0-test5-bk3.updated/kernel/futex.c
--- .31053-linux-2.6.0-test5-bk3/kernel/futex.c 2003-09-17 10:27:05.000000000 +1000
+++ .31053-linux-2.6.0-test5-bk3.updated/kernel/futex.c 2003-09-17 10:28:20.000000000 +1000
@@ -33,7 +33,7 @@
#include <linux/poll.h>
#include <linux/fs.h>
#include <linux/file.h>
-#include <linux/hash.h>
+#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/futex.h>
#include <linux/mount.h>
@@ -44,6 +44,7 @@
/*
* Futexes are matched on equal values of this key.
* The key type depends on whether it's a shared or private mapping.
+ * Don't rearrange members without looking at hash_futex().
*/
union futex_key {
struct {
@@ -87,7 +88,6 @@ struct futex_hash_bucket {
struct list_head chain;
};
-/* The key for the hash is the address + index + offset within page */
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
/* Futex-fs vfsmount entry: */
@@ -96,11 +96,12 @@ static struct vfsmount *futex_mnt;
/*
* We hash on the keys returned from get_futex_key (see below).
*/
-static inline struct futex_hash_bucket *hash_futex(union futex_key *key)
+static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
- return &futex_queues[hash_long(key->both.word
- + (unsigned long) key->both.ptr
- + key->both.offset, FUTEX_HASHBITS)];
+ u32 hash = jhash2((u32*)&key->both.word,
+ (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+ key->both.offset);
+ return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
}
/*
@@ -361,7 +362,6 @@ static int futex_wait(unsigned long uadd
struct futex_q q;
struct futex_hash_bucket *bh = NULL;
- try_again:
init_waitqueue_head(&q.waiters);
down_read(&current->mm->mmap_sem);
@@ -395,10 +395,10 @@ static int futex_wait(unsigned long uadd
/*
* There might have been scheduling since the queue_me(), as we
* cannot hold a spinlock across the get_user() in case it
- * faults. So we cannot just set TASK_INTERRUPTIBLE state when
+ * faults, and we cannot just set TASK_INTERRUPTIBLE state when
* queueing ourselves into the futex hash. This code thus has to
- * rely on the futex_wake() code doing a wakeup after removing
- * the waiter from the list.
+ * rely on the futex_wake() code removing us from hash when it
+ * wakes us up.
*/
add_wait_queue(&q.waiters, &wait);
bh = hash_futex(&key);
@@ -423,26 +423,17 @@ static int futex_wait(unsigned long uadd
* we are the only user of it.
*/
- /*
- * Were we woken or interrupted for a valid reason?
- */
- ret = unqueue_me(&q);
- if (ret == 0)
+ /* If we were woken (and unqueued), we succeeded, whatever. */
+ if (!unqueue_me(&q))
return 0;
if (time == 0)
return -ETIMEDOUT;
- if (signal_pending(current))
- return -EINTR;
-
- /*
- * No, it was a spurious wakeup. Try again. Should never happen. :)
- */
- goto try_again;
+ /* A spurious wakeup should never happen. */
+ WARN_ON(!signal_pending(current));
+ return -EINTR;
out_unqueue:
- /*
- * Were we unqueued anyway?
- */
+ /* If we were woken (and unqueued), we succeeded, whatever. */
if (!unqueue_me(&q))
ret = 0;
out_release_sem: