2007-03-12 18:20:13

by Frank Filz

[permalink] [raw]
Subject: [PATCH] [RESEND] Improve idmap parallelism

Resend: I sent this a while ago, but it may have been missed with the excitement of Connectathon, weddings, and honeymoons. I've verified it compiles against 2.6.21 (I originally tested it on 2.6.20).

Frank

This patch improves idmap parallelism by reducing the serialization of
idmap lookups. Currently, each client can only process one idmap lookup
at a time, potentially delaying other lookups that might be satisfied by
the in kernel cache while waiting for a user space lookup.

The existing code uses two mutexes, but one of them is held the entire
time of the lookup.

The biggest change this patch makes is to re-order lock use so that one
lock serializes multiple user space lookups (for the same nfs_client).
This lock is only held when the kernel cache lookup fails and a user
space lookup must occur. The other lock is used to protect the in kernel
cache (a pair of hash tables). This lock is only held when the internal
cache is being accessed. Further, since most accesses are lookups, this
second lock is changed to a read-write lock.

After acquiring the mutex, a second cache lookup is made just in case a
user space lookup had been in progress for this id.

I tested this using fsstress on an SMP machine. While testing, I put in
some metering code which showed as many as 1000 cache lookups satisfied
while an upcall was in progress, and noted occasional lookups for the id
that an upcall was in progress for.

This patch was modified from an initial patch by Usha Ketineni.

Signed-off-by: Frank Filz <[email protected]>

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 9d4a6b2..2877a02 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -86,9 +86,9 @@ struct idmap_hashtable {
struct idmap {
struct dentry *idmap_dentry;
wait_queue_head_t idmap_wq;
- struct idmap_msg idmap_im;
+ struct idmap_msg idmap_im; /* protected by mutex */
struct mutex idmap_lock; /* Serializes upcalls */
- struct mutex idmap_im_lock; /* Protects the hashtable */
+ rwlock_t idmap_im_lock; /* Protects the hashtable */
struct idmap_hashtable idmap_user_hash;
struct idmap_hashtable idmap_group_hash;
};
@@ -127,7 +127,7 @@ nfs_idmap_new(struct nfs_client *clp)
}

mutex_init(&idmap->idmap_lock);
- mutex_init(&idmap->idmap_im_lock);
+ rwlock_init(&idmap->idmap_im_lock);
init_waitqueue_head(&idmap->idmap_wq);
idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER;
idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
@@ -243,14 +243,28 @@ nfs_idmap_id(struct idmap *idmap, struct
idmap_hashtable *h,
if (namelen >= IDMAP_NAMESZ)
return -EINVAL;

+ read_lock(&idmap->idmap_im_lock);
+ he = idmap_lookup_name(h, name, namelen);
+ if (he != NULL) {
+ *id = he->ih_id;
+ read_unlock(&idmap->idmap_im_lock);
+ return 0;
+ }
+ read_unlock(&idmap->idmap_im_lock);
+
mutex_lock(&idmap->idmap_lock);
- mutex_lock(&idmap->idmap_im_lock);

+ /* Attempt lookup again in case we blocked
+ * because another attempt on this name
+ * was in progress.
+ */
+ read_lock(&idmap->idmap_im_lock);
he = idmap_lookup_name(h, name, namelen);
if (he != NULL) {
*id = he->ih_id;
- ret = 0;
- goto out;
+ read_unlock(&idmap->idmap_im_lock);
+ mutex_unlock(&idmap->idmap_lock);
+ return 0;
}

memset(im, 0, sizeof(*im));
@@ -270,11 +284,10 @@ nfs_idmap_id(struct idmap *idmap, struct
idmap_hashtable *h,
}

set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&idmap->idmap_im_lock);
+ read_unlock(&idmap->idmap_im_lock);
schedule();
current->state = TASK_RUNNING;
remove_wait_queue(&idmap->idmap_wq, &wq);
- mutex_lock(&idmap->idmap_im_lock);

if (im->im_status & IDMAP_STATUS_SUCCESS) {
*id = im->im_id;
@@ -283,7 +296,6 @@ nfs_idmap_id(struct idmap *idmap, struct
idmap_hashtable *h,

out:
memset(im, 0, sizeof(*im));
- mutex_unlock(&idmap->idmap_im_lock);
mutex_unlock(&idmap->idmap_lock);
return (ret);
}
@@ -304,14 +316,30 @@ nfs_idmap_name(struct idmap *idmap, struct
idmap_hashtable *h,

im = &idmap->idmap_im;

+ read_lock(&idmap->idmap_im_lock);
+ he = idmap_lookup_id(h, id);
+ if (he != 0) {
+ memcpy(name, he->ih_name, he->ih_namelen);
+ ret = he->ih_namelen;
+ read_unlock(&idmap->idmap_im_lock);
+ return ret;
+ }
+ read_unlock(&idmap->idmap_im_lock);
+
mutex_lock(&idmap->idmap_lock);
- mutex_lock(&idmap->idmap_im_lock);

+ /* Attempt lookup again in case we blocked
+ * because another attempt on this id
+ * was in progress.
+ */
+ read_lock(&idmap->idmap_im_lock);
he = idmap_lookup_id(h, id);
if (he != 0) {
memcpy(name, he->ih_name, he->ih_namelen);
ret = he->ih_namelen;
- goto out;
+ read_unlock(&idmap->idmap_im_lock);
+ mutex_unlock(&idmap->idmap_lock);
+ return ret;
}

memset(im, 0, sizeof(*im));
@@ -331,11 +359,10 @@ nfs_idmap_name(struct idmap *idmap, struct
idmap_hashtable *h,
}

set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&idmap->idmap_im_lock);
+ read_unlock(&idmap->idmap_im_lock);
schedule();
current->state = TASK_RUNNING;
remove_wait_queue(&idmap->idmap_wq, &wq);
- mutex_lock(&idmap->idmap_im_lock);

if (im->im_status & IDMAP_STATUS_SUCCESS) {
if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0)
@@ -346,7 +373,6 @@ nfs_idmap_name(struct idmap *idmap, struct
idmap_hashtable *h,

out:
memset(im, 0, sizeof(*im));
- mutex_unlock(&idmap->idmap_im_lock);
mutex_unlock(&idmap->idmap_lock);
return ret;
}
@@ -391,7 +417,7 @@ idmap_pipe_downcall(struct file *filp, const char
__user *src, size_t mlen)
if (copy_from_user(&im_in, src, mlen) != 0)
return (-EFAULT);

- mutex_lock(&idmap->idmap_im_lock);
+ write_lock(&idmap->idmap_im_lock);

ret = mlen;
im->im_status = im_in.im_status;
@@ -451,7 +477,7 @@ idmap_pipe_downcall(struct file *filp, const char
__user *src, size_t mlen)
idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id);
ret = mlen;
out:
- mutex_unlock(&idmap->idmap_im_lock);
+ write_unlock(&idmap->idmap_im_lock);
return ret;
}

@@ -463,10 +489,10 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)

if (msg->errno >= 0)
return;
- mutex_lock(&idmap->idmap_im_lock);
+ write_lock(&idmap->idmap_im_lock);
im->im_status = IDMAP_STATUS_LOOKUPFAIL;
wake_up(&idmap->idmap_wq);
- mutex_unlock(&idmap->idmap_im_lock);
+ write_unlock(&idmap->idmap_im_lock);
}

/*



-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
NFS maillist - [email protected]
https://lists.sourceforge.net/lists/listinfo/nfs


2007-03-14 16:57:26

by Frank Filz

[permalink] [raw]
Subject: [PATCH] [RESUBMIT] Improve idmap parallelism

On Mon, 2007-03-12 at 22:17 -0400, Trond Myklebust wrote:
> On Mon, 2007-03-12 at 15:08 -0400, Trond Myklebust wrote:
> > On Mon, 2007-03-12 at 11:19 -0700, Frank Filz wrote:
> > > Resend: I sent this awhile ago, but it may have been missed with the excitement of Connectathon, weddings, and honeymoons. I've verified it compiles against 2.6.21 (I originally tested it on 2.6.20).
> >
> > You forgot the Linux Storage and Filesystem Workshop and FAST. It has
> > been an exciting life in the past 2 months. :-)
> >
> > At first glance it looks OK. I'll take it for a spin...
>
> Hmm... Apart from having been mangled by your mailer, I immediately
> started getting major complaints about scheduling under a spin lock.

My apologies on the formatting.

> Looking more closely, it seems you are holding the read lock while
> calling rpc_queue_upcall(), which again will result in a call to
> idmap_pipe_upcall(). That won't work since copy_to_user() is allowed to
> sleep.

Ah, oops. I have rewritten the code using rwsem to avoid this problem.
This may not have caused a problem in my testing because the only
contention should be the write, which shouldn't be attempted until after
the copy_to_user is done (given the serialization of upcalls). Still, no
excuse for faulty coding.

> In addition, there were several leakages of the read lock.

Fixed those.

I ran the same testing overnight on this updated patch.

Here is the re-submitted patch:

This patch improves idmap parallelism by reducing the serialization of
idmap lookups. Currently, each client can only process one idmap lookup
at a time, potentially delaying other lookups that might be satisfied by
the in kernel cache while waiting for a user space lookup.

The existing code uses two mutexes, but one of them is held the entire
time of the lookup.

The biggest change this patch makes is to re-order lock use so that one
lock serializes multiple user space lookups (for the same nfs_client).
This lock is only held when the kernel cache lookup fails and a user
space lookup must occur. The other lock is used to protect the in kernel
cache (a pair of hash tables). This lock is only held when the internal
cache is being accessed. Further, since most accesses are lookups, this
second lock is changed to a read-write lock.

After acquiring the mutex, a second cache lookup is made just in case a
user space lookup had been in progress for this id.

I tested this using fsstress on an SMP machine. While testing, I put in
some metering code which showed as many as 1000 cache lookups satisfied
while an upcall was in progress, and noted occasional lookups for the id
that an upcall was in progress for.

This patch was modified from an initial patch by Usha Ketineni.

Signed-off-by: Frank Filz <[email protected]>

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 9d4a6b2..6c5c0f8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -36,6 +36,7 @@

#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
@@ -86,9 +87,9 @@ struct idmap_hashtable {
struct idmap {
struct dentry *idmap_dentry;
wait_queue_head_t idmap_wq;
- struct idmap_msg idmap_im;
+ struct idmap_msg idmap_im; /* protected by mutex */
struct mutex idmap_lock; /* Serializes upcalls */
- struct mutex idmap_im_lock; /* Protects the hashtable */
+ struct rw_semaphore idmap_im_lock; /* Protects the hashtable */
struct idmap_hashtable idmap_user_hash;
struct idmap_hashtable idmap_group_hash;
};
@@ -127,7 +128,7 @@ nfs_idmap_new(struct nfs_client *clp)
}

mutex_init(&idmap->idmap_lock);
- mutex_init(&idmap->idmap_im_lock);
+ init_rwsem(&idmap->idmap_im_lock);
init_waitqueue_head(&idmap->idmap_wq);
idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER;
idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
@@ -243,14 +244,28 @@ nfs_idmap_id(struct idmap *idmap, struct
idmap_hashtable *h,
if (namelen >= IDMAP_NAMESZ)
return -EINVAL;

+ down_read(&idmap->idmap_im_lock);
+ he = idmap_lookup_name(h, name, namelen);
+ if (he != NULL) {
+ *id = he->ih_id;
+ up_read(&idmap->idmap_im_lock);
+ return 0;
+ }
+ up_read(&idmap->idmap_im_lock);
+
mutex_lock(&idmap->idmap_lock);
- mutex_lock(&idmap->idmap_im_lock);

+ /* Attempt lookup again in case we blocked
+ * because another attempt on this name
+ * was in progress.
+ */
+ down_read(&idmap->idmap_im_lock);
he = idmap_lookup_name(h, name, namelen);
if (he != NULL) {
*id = he->ih_id;
- ret = 0;
- goto out;
+ up_read(&idmap->idmap_im_lock);
+ mutex_unlock(&idmap->idmap_lock);
+ return 0;
}

memset(im, 0, sizeof(*im));
@@ -266,15 +281,15 @@ nfs_idmap_id(struct idmap *idmap, struct
idmap_hashtable *h,
add_wait_queue(&idmap->idmap_wq, &wq);
if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
remove_wait_queue(&idmap->idmap_wq, &wq);
+ up_read(&idmap->idmap_im_lock);
goto out;
}

set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&idmap->idmap_im_lock);
+ up_read(&idmap->idmap_im_lock);
schedule();
current->state = TASK_RUNNING;
remove_wait_queue(&idmap->idmap_wq, &wq);
- mutex_lock(&idmap->idmap_im_lock);

if (im->im_status & IDMAP_STATUS_SUCCESS) {
*id = im->im_id;
@@ -283,7 +298,6 @@ nfs_idmap_id(struct idmap *idmap, struct
idmap_hashtable *h,

out:
memset(im, 0, sizeof(*im));
- mutex_unlock(&idmap->idmap_im_lock);
mutex_unlock(&idmap->idmap_lock);
return (ret);
}
@@ -304,14 +318,30 @@ nfs_idmap_name(struct idmap *idmap, struct
idmap_hashtable *h,

im = &idmap->idmap_im;

+ down_read(&idmap->idmap_im_lock);
+ he = idmap_lookup_id(h, id);
+ if (he != 0) {
+ memcpy(name, he->ih_name, he->ih_namelen);
+ ret = he->ih_namelen;
+ up_read(&idmap->idmap_im_lock);
+ return ret;
+ }
+ up_read(&idmap->idmap_im_lock);
+
mutex_lock(&idmap->idmap_lock);
- mutex_lock(&idmap->idmap_im_lock);

+ /* Attempt lookup again in case we blocked
+ * because another attempt on this id
+ * was in progress.
+ */
+ down_read(&idmap->idmap_im_lock);
he = idmap_lookup_id(h, id);
if (he != 0) {
memcpy(name, he->ih_name, he->ih_namelen);
ret = he->ih_namelen;
- goto out;
+ up_read(&idmap->idmap_im_lock);
+ mutex_unlock(&idmap->idmap_lock);
+ return ret;
}

memset(im, 0, sizeof(*im));
@@ -327,15 +357,15 @@ nfs_idmap_name(struct idmap *idmap, struct
idmap_hashtable *h,

if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
remove_wait_queue(&idmap->idmap_wq, &wq);
+ up_read(&idmap->idmap_im_lock);
goto out;
}

set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&idmap->idmap_im_lock);
+ up_read(&idmap->idmap_im_lock);
schedule();
current->state = TASK_RUNNING;
remove_wait_queue(&idmap->idmap_wq, &wq);
- mutex_lock(&idmap->idmap_im_lock);

if (im->im_status & IDMAP_STATUS_SUCCESS) {
if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0)
@@ -346,7 +376,6 @@ nfs_idmap_name(struct idmap *idmap, struct
idmap_hashtable *h,

out:
memset(im, 0, sizeof(*im));
- mutex_unlock(&idmap->idmap_im_lock);
mutex_unlock(&idmap->idmap_lock);
return ret;
}
@@ -391,7 +420,7 @@ idmap_pipe_downcall(struct file *filp, const char
__user *src, size_t mlen)
if (copy_from_user(&im_in, src, mlen) != 0)
return (-EFAULT);

- mutex_lock(&idmap->idmap_im_lock);
+ down_write(&idmap->idmap_im_lock);

ret = mlen;
im->im_status = im_in.im_status;
@@ -451,7 +480,7 @@ idmap_pipe_downcall(struct file *filp, const char
__user *src, size_t mlen)
idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id);
ret = mlen;
out:
- mutex_unlock(&idmap->idmap_im_lock);
+ up_write(&idmap->idmap_im_lock);
return ret;
}

@@ -463,10 +492,10 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)

if (msg->errno >= 0)
return;
- mutex_lock(&idmap->idmap_im_lock);
+ down_write(&idmap->idmap_im_lock);
im->im_status = IDMAP_STATUS_LOOKUPFAIL;
wake_up(&idmap->idmap_wq);
- mutex_unlock(&idmap->idmap_im_lock);
+ up_write(&idmap->idmap_im_lock);
}

/*



-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
NFS maillist - [email protected]
https://lists.sourceforge.net/lists/listinfo/nfs

2007-03-12 19:09:11

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH] [RESEND] Improve idmap parallelism

On Mon, 2007-03-12 at 11:19 -0700, Frank Filz wrote:
> Resend: I sent this awhile ago, but it may have been missed with the excitement of Connectathon, weddings, and honeymoons. I've verified it compiles against 2.6.21 (I originally tested it on 2.6.20).

You forgot the Linux Storage and Filesystem Workshop and FAST. It has
been an exciting life in the past 2 months. :-)

At first glance it looks OK. I'll take it for a spin...

Cheers,
Trond

> Frank
>
> This patch improves idmap parallelism by reducing the serialization of
> idmap lookups. Currently, each client can only process one idmap lookup
> at a time, potentially delaying other lookups that might be satisfied by
> the in kernel cache while waiting for a user space lookup.
>
> The existing code uses two mutexes, but one of them is held the entire
> time of the lookup.
>
> The biggest change this patch makes is to re-order lock use so that one
> lock serializes multiple user space lookups (for the same nfs_client).
> This lock is only held when the kernel cache lookup fails and a user
> space lookup must occur. The other lock is used to protect the in kernel
> cache (a pair of hash tables). This lock is only held when the internal
> cache is being accessed. Further, since most accesses are lookups, this
> second lock is changed to a read-write lock.
>
> After acquiring the mutex, a second cache lookup is made just in case a
> user space lookup had been in progress for this id.
>
> I tested this using fsstress on an SMP machine. While testing, I put in
> some metering code which showed as many as 1000 cache lookups satisfied
> while an upcall was in progress, and noted occasional lookups for the id
> that an upcall was in progress for.
>
> This patch was modified from an initial patch by Usha Ketineni.
>
> Signed-off-by: Frank Filz <[email protected]>
>
> diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
> index 9d4a6b2..2877a02 100644
> --- a/fs/nfs/idmap.c
> +++ b/fs/nfs/idmap.c
> @@ -86,9 +86,9 @@ struct idmap_hashtable {
> struct idmap {
> struct dentry *idmap_dentry;
> wait_queue_head_t idmap_wq;
> - struct idmap_msg idmap_im;
> + struct idmap_msg idmap_im; /* protected by mutex */
> struct mutex idmap_lock; /* Serializes upcalls */
> - struct mutex idmap_im_lock; /* Protects the hashtable */
> + rwlock_t idmap_im_lock; /* Protects the hashtable */
> struct idmap_hashtable idmap_user_hash;
> struct idmap_hashtable idmap_group_hash;
> };
> @@ -127,7 +127,7 @@ nfs_idmap_new(struct nfs_client *clp)
> }
>
> mutex_init(&idmap->idmap_lock);
> - mutex_init(&idmap->idmap_im_lock);
> + rwlock_init(&idmap->idmap_im_lock);
> init_waitqueue_head(&idmap->idmap_wq);
> idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER;
> idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
> @@ -243,14 +243,28 @@ nfs_idmap_id(struct idmap *idmap, struct
> idmap_hashtable *h,
> if (namelen >= IDMAP_NAMESZ)
> return -EINVAL;
>
> + read_lock(&idmap->idmap_im_lock);
> + he = idmap_lookup_name(h, name, namelen);
> + if (he != NULL) {
> + *id = he->ih_id;
> + read_unlock(&idmap->idmap_im_lock);
> + return 0;
> + }
> + read_unlock(&idmap->idmap_im_lock);
> +
> mutex_lock(&idmap->idmap_lock);
> - mutex_lock(&idmap->idmap_im_lock);
>
> + /* Attempt lookup again in case we blocked
> + * because another attempt on this name
> + * was in progress.
> + */
> + read_lock(&idmap->idmap_im_lock);
> he = idmap_lookup_name(h, name, namelen);
> if (he != NULL) {
> *id = he->ih_id;
> - ret = 0;
> - goto out;
> + read_unlock(&idmap->idmap_im_lock);
> + mutex_unlock(&idmap->idmap_lock);
> + return 0;
> }
>
> memset(im, 0, sizeof(*im));
> @@ -270,11 +284,10 @@ nfs_idmap_id(struct idmap *idmap, struct
> idmap_hashtable *h,
> }
>
> set_current_state(TASK_UNINTERRUPTIBLE);
> - mutex_unlock(&idmap->idmap_im_lock);
> + read_unlock(&idmap->idmap_im_lock);
> schedule();
> current->state = TASK_RUNNING;
> remove_wait_queue(&idmap->idmap_wq, &wq);
> - mutex_lock(&idmap->idmap_im_lock);
>
> if (im->im_status & IDMAP_STATUS_SUCCESS) {
> *id = im->im_id;
> @@ -283,7 +296,6 @@ nfs_idmap_id(struct idmap *idmap, struct
> idmap_hashtable *h,
>
> out:
> memset(im, 0, sizeof(*im));
> - mutex_unlock(&idmap->idmap_im_lock);
> mutex_unlock(&idmap->idmap_lock);
> return (ret);
> }
> @@ -304,14 +316,30 @@ nfs_idmap_name(struct idmap *idmap, struct
> idmap_hashtable *h,
>
> im = &idmap->idmap_im;
>
> + read_lock(&idmap->idmap_im_lock);
> + he = idmap_lookup_id(h, id);
> + if (he != 0) {
> + memcpy(name, he->ih_name, he->ih_namelen);
> + ret = he->ih_namelen;
> + read_unlock(&idmap->idmap_im_lock);
> + return ret;
> + }
> + read_unlock(&idmap->idmap_im_lock);
> +
> mutex_lock(&idmap->idmap_lock);
> - mutex_lock(&idmap->idmap_im_lock);
>
> + /* Attempt lookup again in case we blocked
> + * because another attempt on this id
> + * was in progress.
> + */
> + read_lock(&idmap->idmap_im_lock);
> he = idmap_lookup_id(h, id);
> if (he != 0) {
> memcpy(name, he->ih_name, he->ih_namelen);
> ret = he->ih_namelen;
> - goto out;
> + read_unlock(&idmap->idmap_im_lock);
> + mutex_unlock(&idmap->idmap_lock);
> + return ret;
> }
>
> memset(im, 0, sizeof(*im));
> @@ -331,11 +359,10 @@ nfs_idmap_name(struct idmap *idmap, struct
> idmap_hashtable *h,
> }
>
> set_current_state(TASK_UNINTERRUPTIBLE);
> - mutex_unlock(&idmap->idmap_im_lock);
> + read_unlock(&idmap->idmap_im_lock);
> schedule();
> current->state = TASK_RUNNING;
> remove_wait_queue(&idmap->idmap_wq, &wq);
> - mutex_lock(&idmap->idmap_im_lock);
>
> if (im->im_status & IDMAP_STATUS_SUCCESS) {
> if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0)
> @@ -346,7 +373,6 @@ nfs_idmap_name(struct idmap *idmap, struct
> idmap_hashtable *h,
>
> out:
> memset(im, 0, sizeof(*im));
> - mutex_unlock(&idmap->idmap_im_lock);
> mutex_unlock(&idmap->idmap_lock);
> return ret;
> }
> @@ -391,7 +417,7 @@ idmap_pipe_downcall(struct file *filp, const char
> __user *src, size_t mlen)
> if (copy_from_user(&im_in, src, mlen) != 0)
> return (-EFAULT);
>
> - mutex_lock(&idmap->idmap_im_lock);
> + write_lock(&idmap->idmap_im_lock);
>
> ret = mlen;
> im->im_status = im_in.im_status;
> @@ -451,7 +477,7 @@ idmap_pipe_downcall(struct file *filp, const char
> __user *src, size_t mlen)
> idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id);
> ret = mlen;
> out:
> - mutex_unlock(&idmap->idmap_im_lock);
> + write_unlock(&idmap->idmap_im_lock);
> return ret;
> }
>
> @@ -463,10 +489,10 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
>
> if (msg->errno >= 0)
> return;
> - mutex_lock(&idmap->idmap_im_lock);
> + write_lock(&idmap->idmap_im_lock);
> im->im_status = IDMAP_STATUS_LOOKUPFAIL;
> wake_up(&idmap->idmap_wq);
> - mutex_unlock(&idmap->idmap_im_lock);
> + write_unlock(&idmap->idmap_im_lock);
> }
>
> /*
>
>
>
> -------------------------------------------------------------------------
> Take Surveys. Earn Cash. Influence the Future of IT
> Join SourceForge.net's Techsay panel and you'll get the chance to share your
> opinions on IT & business topics through brief surveys-and earn cash
> http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
> _______________________________________________
> NFS maillist - [email protected]
> https://lists.sourceforge.net/lists/listinfo/nfs


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
NFS maillist - [email protected]
https://lists.sourceforge.net/lists/listinfo/nfs

2007-03-13 02:17:56

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH] [RESEND] Improve idmap parallelism

On Mon, 2007-03-12 at 15:08 -0400, Trond Myklebust wrote:
> On Mon, 2007-03-12 at 11:19 -0700, Frank Filz wrote:
> > Resend: I sent this awhile ago, but it may have been missed with the excitement of Connectathon, weddings, and honeymoons. I've verified it compiles against 2.6.21 (I originally tested it on 2.6.20).
>
> You forgot the Linux Storage and Filesystem Workshop and FAST. It has
> been an exciting life in the past 2 months. :-)
>
> At first glance it looks OK. I'll take it for a spin...

Hmm... Apart from having been mangled by your mailer, I immediately
started getting major complaints about scheduling under a spin lock.

Looking more closely, it seems you are holding the read lock while
calling rpc_queue_upcall(), which again will result in a call to
idmap_pipe_upcall(). That won't work since copy_to_user() is allowed to
sleep.

In addition, there were several leakages of the read lock.

Cheers
Trond


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
NFS maillist - [email protected]
https://lists.sourceforge.net/lists/listinfo/nfs

2007-04-04 20:09:31

by Frank Filz

[permalink] [raw]
Subject: [PATCH] [RESEND] Improve idmap parallelism

Not sure if this got lost in the noise, resending...

On Mon, 2007-03-12 at 22:17 -0400, Trond Myklebust wrote:
> On Mon, 2007-03-12 at 15:08 -0400, Trond Myklebust wrote:
> > On Mon, 2007-03-12 at 11:19 -0700, Frank Filz wrote:
> > > Resend: I sent this awhile ago, but it may have been missed with the excitement of Connectathon, weddings, and honeymoons. I've verified it compiles against 2.6.21 (I originally tested it on 2.6.20).
> >
> > You forgot the Linux Storage and Filesystem Workshop and FAST. It has
> > been an exciting life in the past 2 months. :-)
> >
> > At first glance it looks OK. I'll take it for a spin...
>
> Hmm... Apart from having been mangled by your mailer, I immediately
> started getting major complaints about scheduling under a spin lock.

My apologies on the formatting.

> Looking more closely, it seems you are holding the read lock while
> calling rpc_queue_upcall(), which again will result in a call to
> idmap_pipe_upcall(). That won't work since copy_to_user() is allowed to
> sleep.

Ah, oops. I have rewritten the code using rwsem to avoid this problem.
This may not have caused a problem in my testing because the only
contention should be the write, which shouldn't be attempted until after
the copy_to_user is done (given the serialization of upcalls). Still, no
excuse for faulty coding.

> In addition, there were several leakages of the read lock.

Fixed those.

I ran the same testing overnight on this updated patch.

Here is the re-submitted patch:

This patch improves idmap parallelism by reducing the serialization of
idmap lookups. Currently, each client can only process one idmap lookup
at a time, potentially delaying other lookups that might be satisfied by
the in kernel cache while waiting for a user space lookup.

The existing code uses two mutexes, but one of them is held the entire
time of the lookup.

The biggest change this patch makes is to re-order lock use so that one
lock serializes multiple user space lookups (for the same nfs_client).
This lock is only held when the kernel cache lookup fails and a user
space lookup must occur. The other lock is used to protect the in kernel
cache (a pair of hash tables). This lock is only held when the internal
cache is being accessed. Further, since most accesses are lookups, this
second lock is changed to a read-write lock.

After acquiring the mutex, a second cache lookup is made just in case a
user space lookup had been in progress for this id.

I tested this using fsstress on an SMP machine. While testing, I put in
some metering code which showed as many as 1000 cache lookups satisfied
while an upcall was in progress, and noted occasional lookups for the id
that an upcall was in progress for.

This patch was modified from an initial patch by Usha Ketineni.

Signed-off-by: Frank Filz <[email protected]>

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 9d4a6b2..6c5c0f8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -36,6 +36,7 @@

#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
@@ -86,9 +87,9 @@ struct idmap_hashtable {
struct idmap {
struct dentry *idmap_dentry;
wait_queue_head_t idmap_wq;
- struct idmap_msg idmap_im;
+ struct idmap_msg idmap_im; /* protected by mutex */
struct mutex idmap_lock; /* Serializes upcalls */
- struct mutex idmap_im_lock; /* Protects the hashtable */
+ struct rw_semaphore idmap_im_lock; /* Protects the hashtable */
struct idmap_hashtable idmap_user_hash;
struct idmap_hashtable idmap_group_hash;
};
@@ -127,7 +128,7 @@ nfs_idmap_new(struct nfs_client *clp)
}

mutex_init(&idmap->idmap_lock);
- mutex_init(&idmap->idmap_im_lock);
+ init_rwsem(&idmap->idmap_im_lock);
init_waitqueue_head(&idmap->idmap_wq);
idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER;
idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
@@ -243,14 +244,28 @@ nfs_idmap_id(struct idmap *idmap, struct
idmap_hashtable *h,
if (namelen >= IDMAP_NAMESZ)
return -EINVAL;

+ down_read(&idmap->idmap_im_lock);
+ he = idmap_lookup_name(h, name, namelen);
+ if (he != NULL) {
+ *id = he->ih_id;
+ up_read(&idmap->idmap_im_lock);
+ return 0;
+ }
+ up_read(&idmap->idmap_im_lock);
+
mutex_lock(&idmap->idmap_lock);
- mutex_lock(&idmap->idmap_im_lock);

+ /* Attempt lookup again in case we blocked
+ * because another attempt on this name
+ * was in progress.
+ */
+ down_read(&idmap->idmap_im_lock);
he = idmap_lookup_name(h, name, namelen);
if (he != NULL) {
*id = he->ih_id;
- ret = 0;
- goto out;
+ up_read(&idmap->idmap_im_lock);
+ mutex_unlock(&idmap->idmap_lock);
+ return 0;
}

memset(im, 0, sizeof(*im));
@@ -266,15 +281,15 @@ nfs_idmap_id(struct idmap *idmap, struct
idmap_hashtable *h,
add_wait_queue(&idmap->idmap_wq, &wq);
if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
remove_wait_queue(&idmap->idmap_wq, &wq);
+ up_read(&idmap->idmap_im_lock);
goto out;
}

set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&idmap->idmap_im_lock);
+ up_read(&idmap->idmap_im_lock);
schedule();
current->state = TASK_RUNNING;
remove_wait_queue(&idmap->idmap_wq, &wq);
- mutex_lock(&idmap->idmap_im_lock);

if (im->im_status & IDMAP_STATUS_SUCCESS) {
*id = im->im_id;
@@ -283,7 +298,6 @@ nfs_idmap_id(struct idmap *idmap, struct
idmap_hashtable *h,

out:
memset(im, 0, sizeof(*im));
- mutex_unlock(&idmap->idmap_im_lock);
mutex_unlock(&idmap->idmap_lock);
return (ret);
}
@@ -304,14 +318,30 @@ nfs_idmap_name(struct idmap *idmap, struct
idmap_hashtable *h,

im = &idmap->idmap_im;

+ down_read(&idmap->idmap_im_lock);
+ he = idmap_lookup_id(h, id);
+ if (he != 0) {
+ memcpy(name, he->ih_name, he->ih_namelen);
+ ret = he->ih_namelen;
+ up_read(&idmap->idmap_im_lock);
+ return ret;
+ }
+ up_read(&idmap->idmap_im_lock);
+
mutex_lock(&idmap->idmap_lock);
- mutex_lock(&idmap->idmap_im_lock);

+ /* Attempt lookup again in case we blocked
+ * because another attempt on this id
+ * was in progress.
+ */
+ down_read(&idmap->idmap_im_lock);
he = idmap_lookup_id(h, id);
if (he != 0) {
memcpy(name, he->ih_name, he->ih_namelen);
ret = he->ih_namelen;
- goto out;
+ up_read(&idmap->idmap_im_lock);
+ mutex_unlock(&idmap->idmap_lock);
+ return ret;
}

memset(im, 0, sizeof(*im));
@@ -327,15 +357,15 @@ nfs_idmap_name(struct idmap *idmap, struct
idmap_hashtable *h,

if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
remove_wait_queue(&idmap->idmap_wq, &wq);
+ up_read(&idmap->idmap_im_lock);
goto out;
}

set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&idmap->idmap_im_lock);
+ up_read(&idmap->idmap_im_lock);
schedule();
current->state = TASK_RUNNING;
remove_wait_queue(&idmap->idmap_wq, &wq);
- mutex_lock(&idmap->idmap_im_lock);

if (im->im_status & IDMAP_STATUS_SUCCESS) {
if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0)
@@ -346,7 +376,6 @@ nfs_idmap_name(struct idmap *idmap, struct
idmap_hashtable *h,

out:
memset(im, 0, sizeof(*im));
- mutex_unlock(&idmap->idmap_im_lock);
mutex_unlock(&idmap->idmap_lock);
return ret;
}
@@ -391,7 +420,7 @@ idmap_pipe_downcall(struct file *filp, const char
__user *src, size_t mlen)
if (copy_from_user(&im_in, src, mlen) != 0)
return (-EFAULT);

- mutex_lock(&idmap->idmap_im_lock);
+ down_write(&idmap->idmap_im_lock);

ret = mlen;
im->im_status = im_in.im_status;
@@ -451,7 +480,7 @@ idmap_pipe_downcall(struct file *filp, const char
__user *src, size_t mlen)
idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id);
ret = mlen;
out:
- mutex_unlock(&idmap->idmap_im_lock);
+ up_write(&idmap->idmap_im_lock);
return ret;
}

@@ -463,10 +492,10 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)

if (msg->errno >= 0)
return;
- mutex_lock(&idmap->idmap_im_lock);
+ down_write(&idmap->idmap_im_lock);
im->im_status = IDMAP_STATUS_LOOKUPFAIL;
wake_up(&idmap->idmap_wq);
- mutex_unlock(&idmap->idmap_im_lock);
+ up_write(&idmap->idmap_im_lock);
}

/*



-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
NFS maillist - [email protected]
https://lists.sourceforge.net/lists/listinfo/nfs

2007-07-09 22:28:23

by Frank Filz

[permalink] [raw]
Subject: [PATCH] [RESEND] Improve idmap parallelism

Not sure if this got lost in the noise, resending...

On Mon, 2007-03-12 at 22:17 -0400, Trond Myklebust wrote:
> On Mon, 2007-03-12 at 15:08 -0400, Trond Myklebust wrote:
> > On Mon, 2007-03-12 at 11:19 -0700, Frank Filz wrote:
> > > Resend: I sent this awhile ago, but it may have been missed with the excitement of Connectathon, weddings, and honeymoons. I've verified it compiles against 2.6.21 (I originally tested it on 2.6.20).
> >
> > You forgot the Linux Storage and Filesystem Workshop and FAST. It has
> > been an exciting life in the past 2 months. :-)
> >
> > At first glance it looks OK. I'll take it for a spin...
>
> Hmm... Apart from having been mangled by your mailer, I immediately
> started getting major complaints about scheduling under a spin lock.

My apologies on the formatting.

> Looking more closely, it seems you are holding the read lock while
> calling rpc_queue_upcall(), which again will result in a call to
> idmap_pipe_upcall(). That won't work since copy_to_user() is allowed to
> sleep.

Ah, oops. I have rewritten the code using rwsem to avoid this problem.
This may not have caused a problem in my testing because the only
contention should be the write, which shouldn't be attempted until after
the copy_to_user is done (given the serialization of upcalls). Still, no
excuse for faulty coding.

> In addition, there were several leakages of the read lock.

Fixed those.

I ran the same testing overnight on this updated patch.

Here is the re-submitted patch:

This patch improves idmap parallelism by reducing the serialization of
idmap lookups. Currently, each client can only process one idmap lookup
at a time, potentially delaying other lookups that might be satisfied by
the in-kernel cache while waiting for a user-space lookup.

The existing code uses two mutexes, but one of them is held the entire
time of the lookup.

The biggest change this patch makes is to re-order lock use so that one
lock serializes multiple user space lookups (for the same nfs_client).
This lock is only held when the kernel cache lookup fails and a
user-space lookup must occur. The other lock is used to protect the
in-kernel cache (a pair of hash tables). This lock is only held when the internal
cache is being accessed. Further, since most accesses are lookups, this
second lock is changed to a read-write lock.

After acquiring the mutex, a second cache lookup is made just in case a
user-space lookup had been in progress for this id.

I tested this using fsstress on an SMP machine. While testing, I put in
some metering code which showed as many as 1000 cache lookups satisfied
while an upcall was in progress, and noted occasional lookups for the id
that an upcall was in progress for.

This patch was modified from an initial patch by Usha Ketineni.

Signed-off-by: Frank Filz <[email protected]>

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 9d4a6b2..6c5c0f8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -36,6 +36,7 @@

#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
@@ -86,9 +87,9 @@ struct idmap_hashtable {
struct idmap {
struct dentry *idmap_dentry;
wait_queue_head_t idmap_wq;
- struct idmap_msg idmap_im;
+ struct idmap_msg idmap_im; /* protected by mutex */
struct mutex idmap_lock; /* Serializes upcalls */
- struct mutex idmap_im_lock; /* Protects the hashtable */
+ struct rw_semaphore idmap_im_lock; /* Protects the hashtable */
struct idmap_hashtable idmap_user_hash;
struct idmap_hashtable idmap_group_hash;
};
@@ -127,7 +128,7 @@ nfs_idmap_new(struct nfs_client *clp)
}

mutex_init(&idmap->idmap_lock);
- mutex_init(&idmap->idmap_im_lock);
+ init_rwsem(&idmap->idmap_im_lock);
init_waitqueue_head(&idmap->idmap_wq);
idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER;
idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
@@ -243,14 +244,28 @@ nfs_idmap_id(struct idmap *idmap, struct
idmap_hashtable *h,
if (namelen >= IDMAP_NAMESZ)
return -EINVAL;

+ down_read(&idmap->idmap_im_lock);
+ he = idmap_lookup_name(h, name, namelen);
+ if (he != NULL) {
+ *id = he->ih_id;
+ up_read(&idmap->idmap_im_lock);
+ return 0;
+ }
+ up_read(&idmap->idmap_im_lock);
+
mutex_lock(&idmap->idmap_lock);
- mutex_lock(&idmap->idmap_im_lock);

+ /* Attempt lookup again in case we blocked
+ * because another attempt on this name
+ * was in progress.
+ */
+ down_read(&idmap->idmap_im_lock);
he = idmap_lookup_name(h, name, namelen);
if (he != NULL) {
*id = he->ih_id;
- ret = 0;
- goto out;
+ up_read(&idmap->idmap_im_lock);
+ mutex_unlock(&idmap->idmap_lock);
+ return 0;
}

memset(im, 0, sizeof(*im));
@@ -266,15 +281,15 @@ nfs_idmap_id(struct idmap *idmap, struct
idmap_hashtable *h,
add_wait_queue(&idmap->idmap_wq, &wq);
if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
remove_wait_queue(&idmap->idmap_wq, &wq);
+ up_read(&idmap->idmap_im_lock);
goto out;
}

set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&idmap->idmap_im_lock);
+ up_read(&idmap->idmap_im_lock);
schedule();
current->state = TASK_RUNNING;
remove_wait_queue(&idmap->idmap_wq, &wq);
- mutex_lock(&idmap->idmap_im_lock);

if (im->im_status & IDMAP_STATUS_SUCCESS) {
*id = im->im_id;
@@ -283,7 +298,6 @@ nfs_idmap_id(struct idmap *idmap, struct
idmap_hashtable *h,

out:
memset(im, 0, sizeof(*im));
- mutex_unlock(&idmap->idmap_im_lock);
mutex_unlock(&idmap->idmap_lock);
return (ret);
}
@@ -304,14 +318,30 @@ nfs_idmap_name(struct idmap *idmap, struct
idmap_hashtable *h,

im = &idmap->idmap_im;

+ down_read(&idmap->idmap_im_lock);
+ he = idmap_lookup_id(h, id);
+ if (he != 0) {
+ memcpy(name, he->ih_name, he->ih_namelen);
+ ret = he->ih_namelen;
+ up_read(&idmap->idmap_im_lock);
+ return ret;
+ }
+ up_read(&idmap->idmap_im_lock);
+
mutex_lock(&idmap->idmap_lock);
- mutex_lock(&idmap->idmap_im_lock);

+ /* Attempt lookup again in case we blocked
+ * because another attempt on this id
+ * was in progress.
+ */
+ down_read(&idmap->idmap_im_lock);
he = idmap_lookup_id(h, id);
if (he != 0) {
memcpy(name, he->ih_name, he->ih_namelen);
ret = he->ih_namelen;
- goto out;
+ up_read(&idmap->idmap_im_lock);
+ mutex_unlock(&idmap->idmap_lock);
+ return ret;
}

memset(im, 0, sizeof(*im));
@@ -327,15 +357,15 @@ nfs_idmap_name(struct idmap *idmap, struct
idmap_hashtable *h,

if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
remove_wait_queue(&idmap->idmap_wq, &wq);
+ up_read(&idmap->idmap_im_lock);
goto out;
}

set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&idmap->idmap_im_lock);
+ up_read(&idmap->idmap_im_lock);
schedule();
current->state = TASK_RUNNING;
remove_wait_queue(&idmap->idmap_wq, &wq);
- mutex_lock(&idmap->idmap_im_lock);

if (im->im_status & IDMAP_STATUS_SUCCESS) {
if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0)
@@ -346,7 +376,6 @@ nfs_idmap_name(struct idmap *idmap, struct
idmap_hashtable *h,

out:
memset(im, 0, sizeof(*im));
- mutex_unlock(&idmap->idmap_im_lock);
mutex_unlock(&idmap->idmap_lock);
return ret;
}
@@ -391,7 +420,7 @@ idmap_pipe_downcall(struct file *filp, const char
__user *src, size_t mlen)
if (copy_from_user(&im_in, src, mlen) != 0)
return (-EFAULT);

- mutex_lock(&idmap->idmap_im_lock);
+ down_write(&idmap->idmap_im_lock);

ret = mlen;
im->im_status = im_in.im_status;
@@ -451,7 +480,7 @@ idmap_pipe_downcall(struct file *filp, const char
__user *src, size_t mlen)
idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id);
ret = mlen;
out:
- mutex_unlock(&idmap->idmap_im_lock);
+ up_write(&idmap->idmap_im_lock);
return ret;
}

@@ -463,10 +492,10 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)

if (msg->errno >= 0)
return;
- mutex_lock(&idmap->idmap_im_lock);
+ down_write(&idmap->idmap_im_lock);
im->im_status = IDMAP_STATUS_LOOKUPFAIL;
wake_up(&idmap->idmap_wq);
- mutex_unlock(&idmap->idmap_im_lock);
+ up_write(&idmap->idmap_im_lock);
}

/*



-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
NFS maillist - [email protected]
https://lists.sourceforge.net/lists/listinfo/nfs

2007-07-10 03:18:51

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH] [RESEND] Improve idmap parallelism

On Mon, 2007-07-09 at 15:32 -0700, Frank Filz wrote:
> Not sure if this got lost in the noise, resending...

This one I'm less sure about: I have some long held ideas about how I
want to restructure the idmapper cache.

* For one thing, I want to get completely rid of the locking: this
is a write-once cache, so it should be using RCU.
* Secondly, the idmap cache should never have been a
per-nfs_client structure. We should rather be using a single
global cache that can translate between any owner/owner_group to
a uid/gid and any uid/gid to any owner/owner_group (as long as
we know to which NFSv4 domain the server belongs).

Cheers
Trond


> On Mon, 2007-03-12 at 22:17 -0400, Trond Myklebust wrote:
> > On Mon, 2007-03-12 at 15:08 -0400, Trond Myklebust wrote:
> > > On Mon, 2007-03-12 at 11:19 -0700, Frank Filz wrote:
> > > > Resend: I sent this awhile ago, but it may have been missed with the excitement of Connectathon, weddings, and honeymoons. I've verified it compiles against 2.6.21 (I originally tested it on 2.6.20).
> > >
> > > You forgot the Linux Storage and Filesystem Workshop and FAST. It has
> > > been an exciting life in the past 2 months. :-)
> > >
> > > At first glance it looks OK. I'll take it for a spin...
> >
> > Hmm... Apart from having been mangled by your mailer, I immediately
> > started getting major complaints about scheduling under a spin lock.
>
> My apologies on the formatting.
>
> > Looking more closely, it seems you are holding the read lock while
> > calling rpc_queue_upcall(), which again will result in a call to
> > idmap_pipe_upcall(). That won't work since copy_to_user() is allowed to
> > sleep.
>
> Ah, oops. I have rewritten the code using rwsem to avoid this problem.
> This may not have caused a problem in my testing because the only
> contention should be the write, which shouldn't be attempted until after
> the copy_to_user is done (given the serialization of upcalls). Still, no
> excuse for faulty coding.
>
> > In addition, there were several leakages of the read lock.
>
> Fixed those.
>
> I ran the same testing overnight on this updated patch.
>
> Here is the re-submitted patch:
>
> This patch improves idmap parallelism by reducing the serialization of
> idmap lookups. Currently, each client can only process one idmap lookup
> at a time, potentially delaying other lookups that might be satisfied by
> the in kernel cache while waiting for a user space lookup.
>
> The existing code uses two mutexes, but one of them is held the entire
> time of the lookup.
>
> The biggest change this patch makes is to re-order lock use so that one
> lock serializes multiple user space lookups (for the same nfs_client).
> This lock is only held when the kernel cache lookup fails and a user
> space lookup must occur. The other lock is used to protect the in kernel
> cache (a pair of hash tables). This lock is only held when the internal
> cache is being accessed. Further, since most accesses are lookups, this
> second lock is changed to a read-write lock.
>
> After acquiring the mutex, a second cache lookup is made just in case a
> user space lookup had been in progress for this id.
>
> I tested this using fsstress on an SMP machine. While testing, I put in
> some metering code which showed as many as 1000 cache lookups satisfied
> while an upcall was in progress, and noted occasional lookups for the id
> that an upcall was in progress for.
>
> This patch was modified from an initial patch by Usha Ketineni.
>
> Signed-off-by: Frank Filz <[email protected]>
>
> diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
> index 9d4a6b2..6c5c0f8 100644
> --- a/fs/nfs/idmap.c
> +++ b/fs/nfs/idmap.c
> @@ -36,6 +36,7 @@
>
> #include <linux/module.h>
> #include <linux/mutex.h>
> +#include <linux/rwsem.h>
> #include <linux/init.h>
> #include <linux/types.h>
> #include <linux/slab.h>
> @@ -86,9 +87,9 @@ struct idmap_hashtable {
> struct idmap {
> struct dentry *idmap_dentry;
> wait_queue_head_t idmap_wq;
> - struct idmap_msg idmap_im;
> + struct idmap_msg idmap_im; /* protected by mutex */
> struct mutex idmap_lock; /* Serializes upcalls */
> - struct mutex idmap_im_lock; /* Protects the hashtable */
> + struct rw_semaphore idmap_im_lock; /* Protects the hashtable */
> struct idmap_hashtable idmap_user_hash;
> struct idmap_hashtable idmap_group_hash;
> };
> @@ -127,7 +128,7 @@ nfs_idmap_new(struct nfs_client *clp)
> }
>
> mutex_init(&idmap->idmap_lock);
> - mutex_init(&idmap->idmap_im_lock);
> + init_rwsem(&idmap->idmap_im_lock);
> init_waitqueue_head(&idmap->idmap_wq);
> idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER;
> idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
> @@ -243,14 +244,28 @@ nfs_idmap_id(struct idmap *idmap, struct
> idmap_hashtable *h,
> if (namelen >= IDMAP_NAMESZ)
> return -EINVAL;
>
> + down_read(&idmap->idmap_im_lock);
> + he = idmap_lookup_name(h, name, namelen);
> + if (he != NULL) {
> + *id = he->ih_id;
> + up_read(&idmap->idmap_im_lock);
> + return 0;
> + }
> + up_read(&idmap->idmap_im_lock);
> +
> mutex_lock(&idmap->idmap_lock);
> - mutex_lock(&idmap->idmap_im_lock);
>
> + /* Attempt lookup again in case we blocked
> + * because another attempt on this name
> + * was in progress.
> + */
> + down_read(&idmap->idmap_im_lock);
> he = idmap_lookup_name(h, name, namelen);
> if (he != NULL) {
> *id = he->ih_id;
> - ret = 0;
> - goto out;
> + up_read(&idmap->idmap_im_lock);
> + mutex_unlock(&idmap->idmap_lock);
> + return 0;
> }
>
> memset(im, 0, sizeof(*im));
> @@ -266,15 +281,15 @@ nfs_idmap_id(struct idmap *idmap, struct
> idmap_hashtable *h,
> add_wait_queue(&idmap->idmap_wq, &wq);
> if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
> remove_wait_queue(&idmap->idmap_wq, &wq);
> + up_read(&idmap->idmap_im_lock);
> goto out;
> }
>
> set_current_state(TASK_UNINTERRUPTIBLE);
> - mutex_unlock(&idmap->idmap_im_lock);
> + up_read(&idmap->idmap_im_lock);
> schedule();
> current->state = TASK_RUNNING;
> remove_wait_queue(&idmap->idmap_wq, &wq);
> - mutex_lock(&idmap->idmap_im_lock);
>
> if (im->im_status & IDMAP_STATUS_SUCCESS) {
> *id = im->im_id;
> @@ -283,7 +298,6 @@ nfs_idmap_id(struct idmap *idmap, struct
> idmap_hashtable *h,
>
> out:
> memset(im, 0, sizeof(*im));
> - mutex_unlock(&idmap->idmap_im_lock);
> mutex_unlock(&idmap->idmap_lock);
> return (ret);
> }
> @@ -304,14 +318,30 @@ nfs_idmap_name(struct idmap *idmap, struct
> idmap_hashtable *h,
>
> im = &idmap->idmap_im;
>
> + down_read(&idmap->idmap_im_lock);
> + he = idmap_lookup_id(h, id);
> + if (he != 0) {
> + memcpy(name, he->ih_name, he->ih_namelen);
> + ret = he->ih_namelen;
> + up_read(&idmap->idmap_im_lock);
> + return ret;
> + }
> + up_read(&idmap->idmap_im_lock);
> +
> mutex_lock(&idmap->idmap_lock);
> - mutex_lock(&idmap->idmap_im_lock);
>
> + /* Attempt lookup again in case we blocked
> + * because another attempt on this id
> + * was in progress.
> + */
> + down_read(&idmap->idmap_im_lock);
> he = idmap_lookup_id(h, id);
> if (he != 0) {
> memcpy(name, he->ih_name, he->ih_namelen);
> ret = he->ih_namelen;
> - goto out;
> + up_read(&idmap->idmap_im_lock);
> + mutex_unlock(&idmap->idmap_lock);
> + return ret;
> }
>
> memset(im, 0, sizeof(*im));
> @@ -327,15 +357,15 @@ nfs_idmap_name(struct idmap *idmap, struct
> idmap_hashtable *h,
>
> if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
> remove_wait_queue(&idmap->idmap_wq, &wq);
> + up_read(&idmap->idmap_im_lock);
> goto out;
> }
>
> set_current_state(TASK_UNINTERRUPTIBLE);
> - mutex_unlock(&idmap->idmap_im_lock);
> + up_read(&idmap->idmap_im_lock);
> schedule();
> current->state = TASK_RUNNING;
> remove_wait_queue(&idmap->idmap_wq, &wq);
> - mutex_lock(&idmap->idmap_im_lock);
>
> if (im->im_status & IDMAP_STATUS_SUCCESS) {
> if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0)
> @@ -346,7 +376,6 @@ nfs_idmap_name(struct idmap *idmap, struct
> idmap_hashtable *h,
>
> out:
> memset(im, 0, sizeof(*im));
> - mutex_unlock(&idmap->idmap_im_lock);
> mutex_unlock(&idmap->idmap_lock);
> return ret;
> }
> @@ -391,7 +420,7 @@ idmap_pipe_downcall(struct file *filp, const char
> __user *src, size_t mlen)
> if (copy_from_user(&im_in, src, mlen) != 0)
> return (-EFAULT);
>
> - mutex_lock(&idmap->idmap_im_lock);
> + down_write(&idmap->idmap_im_lock);
>
> ret = mlen;
> im->im_status = im_in.im_status;
> @@ -451,7 +480,7 @@ idmap_pipe_downcall(struct file *filp, const char
> __user *src, size_t mlen)
> idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id);
> ret = mlen;
> out:
> - mutex_unlock(&idmap->idmap_im_lock);
> + up_write(&idmap->idmap_im_lock);
> return ret;
> }
>
> @@ -463,10 +492,10 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
>
> if (msg->errno >= 0)
> return;
> - mutex_lock(&idmap->idmap_im_lock);
> + down_write(&idmap->idmap_im_lock);
> im->im_status = IDMAP_STATUS_LOOKUPFAIL;
> wake_up(&idmap->idmap_wq);
> - mutex_unlock(&idmap->idmap_im_lock);
> + up_write(&idmap->idmap_im_lock);
> }
>
> /*
>
>


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
NFS maillist - [email protected]
https://lists.sourceforge.net/lists/listinfo/nfs