2014-08-03 17:06:07

by Trond Myklebust

Subject: [PATCH 0/6] NFS DRC scalability patches

Hi Bruce,

The following patchset is again more of an FYI than a must-have, but it
addresses a significant scalability problem for NFSv3 when running
over 10GigE and 40GigE networks. At one point, I was seeing >60% of CPU
time spent spinning on the DRC global spinlock...

The one thing to note is that because the statistics are now updated
outside the spinlock, their accuracy is slightly reduced (I do convert
the number of entries into an atomic, so that one is still 100%
correct). The right thing to do there is probably to convert those
into per-cpu counters; however, I've not had time to do so yet.
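
For reference, a rough sketch of what that per-cpu conversion could look
like for one of the affected stats (payload_misses). This is only an
illustration, not part of this series; the drc_stats_* helpers are
invented names, and the real conversion would also cover drc_mem_usage
and the chain-length stats:

	#include <linux/percpu_counter.h>
	#include <linux/seq_file.h>

	static struct percpu_counter payload_misses;

	static int drc_stats_init(void)
	{
		/* cheap per-cpu increments, only summed when the stats file is read */
		return percpu_counter_init(&payload_misses, 0);
	}

	static void drc_stats_shutdown(void)
	{
		percpu_counter_destroy(&payload_misses);
	}

	/* called where nfsd_cache_match() bumps payload_misses today */
	static inline void note_payload_miss(void)
	{
		percpu_counter_inc(&payload_misses);
	}

	/* reply_cache_stats: an approximate sum is good enough here */
	static int drc_stats_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "payload misses: %lld\n",
			   percpu_counter_sum_positive(&payload_misses));
		return 0;
	}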

Cheers,
Trond

Trond Myklebust (6):
nfsd: Clean up drc cache in preparation for global spinlock
elimination
nfsd: convert the lru list into a per-bucket thing
nfsd: Remove the cache_hash list
nfsd: convert num_drc_entries to an atomic_t
nfsd: split DRC global spinlock into per-bucket locks
nfsd: Reorder nfsd_cache_match to check more powerful discriminators
first

fs/nfsd/cache.h | 1 -
fs/nfsd/nfscache.c | 205 +++++++++++++++++++++++++++++------------------------
2 files changed, 113 insertions(+), 93 deletions(-)

--
1.9.3



2014-08-04 11:44:46

by Jeff Layton

Subject: Re: [PATCH 5/6] nfsd: split DRC global spinlock into per-bucket locks

On Sun, 3 Aug 2014 13:06:03 -0400
Trond Myklebust <[email protected]> wrote:

> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> fs/nfsd/nfscache.c | 43 ++++++++++++++++++++-----------------------
> 1 file changed, 20 insertions(+), 23 deletions(-)
>

nit: There are a number of comments in nfscache.c that refer to the
cache_lock. Those should be updated, since this change will make them
terribly confusing later.


> diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
> index dc909091349b..74603654b7f9 100644
> --- a/fs/nfsd/nfscache.c
> +++ b/fs/nfsd/nfscache.c
> @@ -29,6 +29,7 @@
>
> struct nfsd_drc_bucket {
> struct list_head lru_head;
> + spinlock_t cache_lock;
> };
>
> static struct nfsd_drc_bucket *drc_hashtbl;
> @@ -79,7 +80,6 @@ static struct shrinker nfsd_reply_cache_shrinker = {
> * A cache entry is "single use" if c_state == RC_INPROG
> * Otherwise, it when accessing _prev or _next, the lock must be held.
> */
> -static DEFINE_SPINLOCK(cache_lock);
> static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
>
> /*
> @@ -154,11 +154,11 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
> }
>
> static void
> -nfsd_reply_cache_free(struct svc_cacherep *rp)
> +nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
> {
> - spin_lock(&cache_lock);
> + spin_lock(&b->cache_lock);
> nfsd_reply_cache_free_locked(rp);
> - spin_unlock(&cache_lock);
> + spin_unlock(&b->cache_lock);
> }
>
> int nfsd_reply_cache_init(void)
> @@ -180,8 +180,10 @@ int nfsd_reply_cache_init(void)
> drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL);
> if (!drc_hashtbl)
> goto out_nomem;
> - for (i = 0; i < hashsize; i++)
> + for (i = 0; i < hashsize; i++) {
> INIT_LIST_HEAD(&drc_hashtbl[i].lru_head);
> + spin_lock_init(&drc_hashtbl[i].cache_lock);
> + }
> drc_hashsize = hashsize;
>
> return 0;
> @@ -265,9 +267,13 @@ prune_cache_entries(void)
> for (i = 0; i < drc_hashsize; i++) {
> struct nfsd_drc_bucket *b = &drc_hashtbl[i];
>
> + if (list_empty(&b->lru_head))
> + continue;
> + spin_lock(&b->cache_lock);
> freed += prune_bucket(b);
> if (!list_empty(&b->lru_head))
> cancel = false;
> + spin_unlock(&b->cache_lock);
> }
>
> /*
> @@ -282,9 +288,7 @@ prune_cache_entries(void)
> static void
> cache_cleaner_func(struct work_struct *unused)
> {
> - spin_lock(&cache_lock);
> prune_cache_entries();
> - spin_unlock(&cache_lock);
> }
>
> static unsigned long
> @@ -296,12 +300,7 @@ nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
> static unsigned long
> nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
> {
> - unsigned long freed;
> -
> - spin_lock(&cache_lock);
> - freed = prune_cache_entries();
> - spin_unlock(&cache_lock);
> - return freed;
> + return prune_cache_entries();
> }
> /*
> * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
> @@ -426,14 +425,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
> * preallocate an entry.
> */
> rp = nfsd_reply_cache_alloc();
> - spin_lock(&cache_lock);
> + spin_lock(&b->cache_lock);
> if (likely(rp)) {
> atomic_inc(&num_drc_entries);
> drc_mem_usage += sizeof(*rp);
> }
>
> /* go ahead and prune the cache */
> - prune_cache_entries();
> + prune_bucket(b);
>
> found = nfsd_cache_search(b, rqstp, csum);
> if (found) {
> @@ -470,7 +469,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
> }
> rp->c_type = RC_NOCACHE;
> out:
> - spin_unlock(&cache_lock);
> + spin_unlock(&b->cache_lock);
> return rtn;
>
> found_entry:
> @@ -548,7 +547,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
>
> /* Don't cache excessive amounts of data and XDR failures */
> if (!statp || len > (256 >> 2)) {
> - nfsd_reply_cache_free(rp);
> + nfsd_reply_cache_free(b, rp);
> return;
> }
>
> @@ -563,23 +562,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
> bufsize = len << 2;
> cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
> if (!cachv->iov_base) {
> - nfsd_reply_cache_free(rp);
> + nfsd_reply_cache_free(b, rp);
> return;
> }
> cachv->iov_len = bufsize;
> memcpy(cachv->iov_base, statp, bufsize);
> break;
> case RC_NOCACHE:
> - nfsd_reply_cache_free(rp);
> + nfsd_reply_cache_free(b, rp);
> return;
> }
> - spin_lock(&cache_lock);
> + spin_lock(&b->cache_lock);
> drc_mem_usage += bufsize;
> lru_put_end(b, rp);
> rp->c_secure = rqstp->rq_secure;
> rp->c_type = cachetype;
> rp->c_state = RC_DONE;
> - spin_unlock(&cache_lock);
> + spin_unlock(&b->cache_lock);
> return;
> }
>
> @@ -610,7 +609,6 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
> */
> static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
> {
> - spin_lock(&cache_lock);
> seq_printf(m, "max entries: %u\n", max_drc_entries);
> seq_printf(m, "num entries: %u\n",
> atomic_read(&num_drc_entries));
> @@ -622,7 +620,6 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
> seq_printf(m, "payload misses: %u\n", payload_misses);
> seq_printf(m, "longest chain len: %u\n", longest_chain);
> seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize);
> - spin_unlock(&cache_lock);
> return 0;
> }
>

Other than that, this looks good to me:

Reviewed-by: Jeff Layton <[email protected]>

2014-08-03 17:06:09

by Trond Myklebust

Subject: [PATCH 2/6] nfsd: convert the lru list into a per-bucket thing

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/nfscache.c | 73 +++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 50 insertions(+), 23 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index c9e25bc7ea56..a387d443641f 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -29,10 +29,10 @@

struct nfsd_drc_bucket {
struct hlist_head cache_hash;
+ struct list_head lru_head;
};

static struct nfsd_drc_bucket *drc_hashtbl;
-static struct list_head lru_head;
static struct kmem_cache *drc_slab;

/* max number of entries allowed in the cache */
@@ -40,6 +40,7 @@ static unsigned int max_drc_entries;

/* number of significant bits in the hash value */
static unsigned int maskbits;
+static unsigned int drc_hashsize;

/*
* Stats and other tracking of on the duplicate reply cache. All of these and
@@ -167,8 +168,8 @@ nfsd_reply_cache_free(struct svc_cacherep *rp)
int nfsd_reply_cache_init(void)
{
unsigned int hashsize;
+ unsigned int i;

- INIT_LIST_HEAD(&lru_head);
max_drc_entries = nfsd_cache_size_limit();
num_drc_entries = 0;
hashsize = nfsd_hashsize(max_drc_entries);
@@ -183,6 +184,9 @@ int nfsd_reply_cache_init(void)
drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL);
if (!drc_hashtbl)
goto out_nomem;
+ for (i = 0; i < hashsize; i++)
+ INIT_LIST_HEAD(&drc_hashtbl[i].lru_head);
+ drc_hashsize = hashsize;

return 0;
out_nomem:
@@ -194,17 +198,22 @@ out_nomem:
void nfsd_reply_cache_shutdown(void)
{
struct svc_cacherep *rp;
+ unsigned int i;

unregister_shrinker(&nfsd_reply_cache_shrinker);
cancel_delayed_work_sync(&cache_cleaner);

- while (!list_empty(&lru_head)) {
- rp = list_entry(lru_head.next, struct svc_cacherep, c_lru);
- nfsd_reply_cache_free_locked(rp);
+ for (i = 0; i < drc_hashsize; i++) {
+ struct list_head *head = &drc_hashtbl[i].lru_head;
+ while (!list_empty(head)) {
+ rp = list_first_entry(head, struct svc_cacherep, c_lru);
+ nfsd_reply_cache_free_locked(rp);
+ }
}

kfree (drc_hashtbl);
drc_hashtbl = NULL;
+ drc_hashsize = 0;

if (drc_slab) {
kmem_cache_destroy(drc_slab);
@@ -217,10 +226,10 @@ void nfsd_reply_cache_shutdown(void)
* not already scheduled.
*/
static void
-lru_put_end(struct svc_cacherep *rp)
+lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
rp->c_timestamp = jiffies;
- list_move_tail(&rp->c_lru, &lru_head);
+ list_move_tail(&rp->c_lru, &b->lru_head);
schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
}

@@ -234,17 +243,13 @@ hash_refile(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
hlist_add_head(&rp->c_hash, &b->cache_hash);
}

-/*
- * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
- * Also prune the oldest ones when the total exceeds the max number of entries.
- */
static long
-prune_cache_entries(void)
+prune_bucket(struct nfsd_drc_bucket *b)
{
struct svc_cacherep *rp, *tmp;
long freed = 0;

- list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
+ list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) {
/*
* Don't free entries attached to calls that are still
* in-progress, but do keep scanning the list.
@@ -257,16 +262,33 @@ prune_cache_entries(void)
nfsd_reply_cache_free_locked(rp);
freed++;
}
+ return freed;
+}
+
+/*
+ * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
+ * Also prune the oldest ones when the total exceeds the max number of entries.
+ */
+static long
+prune_cache_entries(void)
+{
+ unsigned int i;
+ long freed = 0;
+ bool cancel = true;
+
+ for (i = 0; i < drc_hashsize; i++) {
+ struct nfsd_drc_bucket *b = &drc_hashtbl[i];
+
+ freed += prune_bucket(b);
+ if (!list_empty(&b->lru_head))
+ cancel = false;
+ }

/*
- * Conditionally rearm the job. If we cleaned out the list, then
- * cancel any pending run (since there won't be any work to do).
- * Otherwise, we rearm the job or modify the existing one to run in
- * RC_EXPIRE since we just ran the pruner.
+ * Conditionally rearm the job to run in RC_EXPIRE since we just
+ * ran the pruner.
*/
- if (list_empty(&lru_head))
- cancel_delayed_work(&cache_cleaner);
- else
+ if (!cancel)
mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
return freed;
}
@@ -458,7 +480,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
rp->c_csum = csum;

hash_refile(b, rp);
- lru_put_end(rp);
+ lru_put_end(b, rp);

/* release any buffer */
if (rp->c_type == RC_REPLBUFF) {
@@ -475,7 +497,7 @@ found_entry:
nfsdstats.rchits++;
/* We found a matching entry which is either in progress or done. */
age = jiffies - rp->c_timestamp;
- lru_put_end(rp);
+ lru_put_end(b, rp);

rtn = RC_DROPIT;
/* Request being processed or excessive rexmits */
@@ -530,12 +552,17 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
{
struct svc_cacherep *rp = rqstp->rq_cacherep;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
+ u32 hash;
+ struct nfsd_drc_bucket *b;
int len;
size_t bufsize = 0;

if (!rp)
return;

+ hash = nfsd_cache_hash(rp->c_xid);
+ b = &drc_hashtbl[hash];
+
len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
len >>= 2;

@@ -568,7 +595,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
}
spin_lock(&cache_lock);
drc_mem_usage += bufsize;
- lru_put_end(rp);
+ lru_put_end(b, rp);
rp->c_secure = rqstp->rq_secure;
rp->c_type = cachetype;
rp->c_state = RC_DONE;
--
1.9.3


2014-08-03 17:06:07

by Trond Myklebust

Subject: [PATCH 1/6] nfsd: Clean up drc cache in preparation for global spinlock elimination

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/nfscache.c | 38 +++++++++++++++++++++++++-------------
1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 6040da8830ff..c9e25bc7ea56 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -27,7 +27,11 @@
*/
#define TARGET_BUCKET_SIZE 64

-static struct hlist_head * cache_hash;
+struct nfsd_drc_bucket {
+ struct hlist_head cache_hash;
+};
+
+static struct nfsd_drc_bucket *drc_hashtbl;
static struct list_head lru_head;
static struct kmem_cache *drc_slab;

@@ -116,6 +120,12 @@ nfsd_hashsize(unsigned int limit)
return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
}

+static u32
+nfsd_cache_hash(__be32 xid)
+{
+ return hash_32(be32_to_cpu(xid), maskbits);
+}
+
static struct svc_cacherep *
nfsd_reply_cache_alloc(void)
{
@@ -170,8 +180,8 @@ int nfsd_reply_cache_init(void)
if (!drc_slab)
goto out_nomem;

- cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL);
- if (!cache_hash)
+ drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL);
+ if (!drc_hashtbl)
goto out_nomem;

return 0;
@@ -193,8 +203,8 @@ void nfsd_reply_cache_shutdown(void)
nfsd_reply_cache_free_locked(rp);
}

- kfree (cache_hash);
- cache_hash = NULL;
+ kfree (drc_hashtbl);
+ drc_hashtbl = NULL;

if (drc_slab) {
kmem_cache_destroy(drc_slab);
@@ -218,10 +228,10 @@ lru_put_end(struct svc_cacherep *rp)
* Move a cache entry from one hash list to another
*/
static void
-hash_refile(struct svc_cacherep *rp)
+hash_refile(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
- hlist_del_init(&rp->c_hash);
- hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits));
+ hlist_del(&rp->c_hash);
+ hlist_add_head(&rp->c_hash, &b->cache_hash);
}

/*
@@ -350,13 +360,13 @@ nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
* NULL on failure.
*/
static struct svc_cacherep *
-nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
+nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
+ __wsum csum)
{
struct svc_cacherep *rp, *ret = NULL;
- struct hlist_head *rh;
+ struct hlist_head *rh = &b->cache_hash;
unsigned int entries = 0;

- rh = &cache_hash[hash_32(rqstp->rq_xid, maskbits)];
hlist_for_each_entry(rp, rh, c_hash) {
++entries;
if (nfsd_cache_match(rqstp, csum, rp)) {
@@ -394,6 +404,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
vers = rqstp->rq_vers,
proc = rqstp->rq_proc;
__wsum csum;
+ u32 hash = nfsd_cache_hash(xid);
+ struct nfsd_drc_bucket *b = &drc_hashtbl[hash];
unsigned long age;
int type = rqstp->rq_cachetype;
int rtn = RC_DOIT;
@@ -420,7 +432,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
/* go ahead and prune the cache */
prune_cache_entries();

- found = nfsd_cache_search(rqstp, csum);
+ found = nfsd_cache_search(b, rqstp, csum);
if (found) {
if (likely(rp))
nfsd_reply_cache_free_locked(rp);
@@ -445,7 +457,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
rp->c_len = rqstp->rq_arg.len;
rp->c_csum = csum;

- hash_refile(rp);
+ hash_refile(b, rp);
lru_put_end(rp);

/* release any buffer */
--
1.9.3


2014-08-03 17:06:14

by Trond Myklebust

Subject: [PATCH 6/6] nfsd: Reorder nfsd_cache_match to check more powerful discriminators first

We would normally expect the xid and the checksum to be the best
discriminators. Check them before looking at the procedure number,
etc.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/nfscache.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 74603654b7f9..122f69185ef5 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -338,20 +338,24 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
static bool
nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
{
- /* Check RPC header info first */
- if (rqstp->rq_xid != rp->c_xid || rqstp->rq_proc != rp->c_proc ||
- rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers ||
- rqstp->rq_arg.len != rp->c_len ||
- !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
- rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
+ /* Check RPC XID first */
+ if (rqstp->rq_xid != rp->c_xid)
return false;
-
/* compare checksum of NFS data */
if (csum != rp->c_csum) {
++payload_misses;
return false;
}

+ /* Other discriminators */
+ if (rqstp->rq_proc != rp->c_proc ||
+ rqstp->rq_prot != rp->c_prot ||
+ rqstp->rq_vers != rp->c_vers ||
+ rqstp->rq_arg.len != rp->c_len ||
+ !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
+ rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
+ return false;
+
return true;
}

--
1.9.3


2014-08-05 20:51:07

by J. Bruce Fields

Subject: Re: [PATCH 0/6] NFS DRC scalability patches

On Mon, Aug 04, 2014 at 07:45:46AM -0400, Jeff Layton wrote:
> On Sun, 3 Aug 2014 13:05:58 -0400
> Trond Myklebust <[email protected]> wrote:
>
> > Hi Bruce,
> >
> > The following patchset is again more of an FYI than a must-have, but it
> > addresses a significant scalability problem for NFSv3 when running
> > over 10GigE and 40GigE networks. At one point, I was seeing >60% of CPU
> > time spent spinning on the DRC global spinlock...
> >
> > The one thing to note is that because the statistics are now updated
> > outside the spinlock, their accuracy is slightly reduced (I do convert
> > the number of entries into an atomic, so that one is still 100%
> > correct). The right thing to do there is probably to convert those
> > into per-cpu counters; however, I've not had time to do so yet.
> >
>
> Consider adding some FIXME comments to that effect? While not terribly
> important, it would be good to fix them at some point.
>
> > Cheers,
> > Trond
> >
> > Trond Myklebust (6):
> > nfsd: Clean up drc cache in preparation for global spinlock
> > elimination
> > nfsd: convert the lru list into a per-bucket thing
> > nfsd: Remove the cache_hash list
> > nfsd: convert num_drc_entries to an atomic_t
> > nfsd: split DRC global spinlock into per-bucket locks
> > nfsd: Reorder nfsd_cache_match to check more powerful discriminators
> > first
> >
> > fs/nfsd/cache.h | 1 -
> > fs/nfsd/nfscache.c | 205 +++++++++++++++++++++++++++++------------------------
> > 2 files changed, 113 insertions(+), 93 deletions(-)
> >
>
> Looks good. Nice work! You can add this to each patch, since they all
> look good to me. It would be good to fix up the stats collection and
> the comments, but I wouldn't want to hold up this work for that.
>
> Reviewed-by: Jeff Layton <[email protected]>

Could you also rebase this to my for-3.17? (Or to 3.17-rc1, whenever
that comes out.) Not urgent, but there appear to be some minor
conflicts.

--b.

2014-08-04 11:45:49

by Jeff Layton

Subject: Re: [PATCH 0/6] NFS DRC scalability patches

On Sun, 3 Aug 2014 13:05:58 -0400
Trond Myklebust <[email protected]> wrote:

> Hi Bruce,
>
> The following patchset is again more of an FYI than a must-have, but it
> addresses a significant scalability problem for NFSv3 when running
> over 10GigE and 40GigE networks. At one point, I was seeing >60% of CPU
> time spent spinning on the DRC global spinlock...
>
> The one thing to note is that because the statistics are now updated
> outside the spinlock, their accuracy is slightly reduced (I do convert
> the number of entries into an atomic, so that one is still 100%
> correct). The right thing to do there is probably to convert those
> into per-cpu counters; however, I've not had time to do so yet.
>

Consider adding some FIXME comments to that effect? While not terribly
important, it would be good to fix them at some point.
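
Something like this, perhaps (exact wording is just a suggestion):

	/*
	 * FIXME: these stats are now updated without any global lock, so
	 * they may be slightly inaccurate under load. Converting them to
	 * per-cpu counters would be the right long-term fix.
	 */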

> Cheers,
> Trond
>
> Trond Myklebust (6):
> nfsd: Clean up drc cache in preparation for global spinlock
> elimination
> nfsd: convert the lru list into a per-bucket thing
> nfsd: Remove the cache_hash list
> nfsd: convert num_drc_entries to an atomic_t
> nfsd: split DRC global spinlock into per-bucket locks
> nfsd: Reorder nfsd_cache_match to check more powerful discriminators
> first
>
> fs/nfsd/cache.h | 1 -
> fs/nfsd/nfscache.c | 205 +++++++++++++++++++++++++++++------------------------
> 2 files changed, 113 insertions(+), 93 deletions(-)
>

Looks good. Nice work! You can add this to each patch, since they all
look good to me. It would be good to fix up the stats collection and
the comments, but I wouldn't want to hold up this work for that.

Reviewed-by: Jeff Layton <[email protected]>

2014-08-03 17:06:10

by Trond Myklebust

Subject: [PATCH 3/6] nfsd: Remove the cache_hash list

Now that the lru list is per-bucket, we don't need a second list for
searches.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/cache.h | 1 -
fs/nfsd/nfscache.c | 19 ++-----------------
2 files changed, 2 insertions(+), 18 deletions(-)

diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index b582f9ab6b2a..dd96a3830004 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -18,7 +18,6 @@
* is much larger than a sockaddr_in6.
*/
struct svc_cacherep {
- struct hlist_node c_hash;
struct list_head c_lru;

unsigned char c_state, /* unused, inprog, done */
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index a387d443641f..8abec475f80f 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -28,7 +28,6 @@
#define TARGET_BUCKET_SIZE 64

struct nfsd_drc_bucket {
- struct hlist_head cache_hash;
struct list_head lru_head;
};

@@ -137,7 +136,6 @@ nfsd_reply_cache_alloc(void)
rp->c_state = RC_UNUSED;
rp->c_type = RC_NOCACHE;
INIT_LIST_HEAD(&rp->c_lru);
- INIT_HLIST_NODE(&rp->c_hash);
}
return rp;
}
@@ -149,8 +147,6 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
drc_mem_usage -= rp->c_replvec.iov_len;
kfree(rp->c_replvec.iov_base);
}
- if (!hlist_unhashed(&rp->c_hash))
- hlist_del(&rp->c_hash);
list_del(&rp->c_lru);
--num_drc_entries;
drc_mem_usage -= sizeof(*rp);
@@ -233,16 +229,6 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
}

-/*
- * Move a cache entry from one hash list to another
- */
-static void
-hash_refile(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
-{
- hlist_del(&rp->c_hash);
- hlist_add_head(&rp->c_hash, &b->cache_hash);
-}
-
static long
prune_bucket(struct nfsd_drc_bucket *b)
{
@@ -386,10 +372,10 @@ nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
__wsum csum)
{
struct svc_cacherep *rp, *ret = NULL;
- struct hlist_head *rh = &b->cache_hash;
+ struct list_head *rh = &b->lru_head;
unsigned int entries = 0;

- hlist_for_each_entry(rp, rh, c_hash) {
+ list_for_each_entry(rp, rh, c_lru) {
++entries;
if (nfsd_cache_match(rqstp, csum, rp)) {
ret = rp;
@@ -479,7 +465,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
rp->c_len = rqstp->rq_arg.len;
rp->c_csum = csum;

- hash_refile(b, rp);
lru_put_end(b, rp);

/* release any buffer */
--
1.9.3


2014-08-03 17:06:13

by Trond Myklebust

Subject: [PATCH 5/6] nfsd: split DRC global spinlock into per-bucket locks

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/nfscache.c | 43 ++++++++++++++++++++-----------------------
1 file changed, 20 insertions(+), 23 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index dc909091349b..74603654b7f9 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -29,6 +29,7 @@

struct nfsd_drc_bucket {
struct list_head lru_head;
+ spinlock_t cache_lock;
};

static struct nfsd_drc_bucket *drc_hashtbl;
@@ -79,7 +80,6 @@ static struct shrinker nfsd_reply_cache_shrinker = {
* A cache entry is "single use" if c_state == RC_INPROG
* Otherwise, it when accessing _prev or _next, the lock must be held.
*/
-static DEFINE_SPINLOCK(cache_lock);
static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);

/*
@@ -154,11 +154,11 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
}

static void
-nfsd_reply_cache_free(struct svc_cacherep *rp)
+nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
- spin_lock(&cache_lock);
+ spin_lock(&b->cache_lock);
nfsd_reply_cache_free_locked(rp);
- spin_unlock(&cache_lock);
+ spin_unlock(&b->cache_lock);
}

int nfsd_reply_cache_init(void)
@@ -180,8 +180,10 @@ int nfsd_reply_cache_init(void)
drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL);
if (!drc_hashtbl)
goto out_nomem;
- for (i = 0; i < hashsize; i++)
+ for (i = 0; i < hashsize; i++) {
INIT_LIST_HEAD(&drc_hashtbl[i].lru_head);
+ spin_lock_init(&drc_hashtbl[i].cache_lock);
+ }
drc_hashsize = hashsize;

return 0;
@@ -265,9 +267,13 @@ prune_cache_entries(void)
for (i = 0; i < drc_hashsize; i++) {
struct nfsd_drc_bucket *b = &drc_hashtbl[i];

+ if (list_empty(&b->lru_head))
+ continue;
+ spin_lock(&b->cache_lock);
freed += prune_bucket(b);
if (!list_empty(&b->lru_head))
cancel = false;
+ spin_unlock(&b->cache_lock);
}

/*
@@ -282,9 +288,7 @@ prune_cache_entries(void)
static void
cache_cleaner_func(struct work_struct *unused)
{
- spin_lock(&cache_lock);
prune_cache_entries();
- spin_unlock(&cache_lock);
}

static unsigned long
@@ -296,12 +300,7 @@ nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
static unsigned long
nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- unsigned long freed;
-
- spin_lock(&cache_lock);
- freed = prune_cache_entries();
- spin_unlock(&cache_lock);
- return freed;
+ return prune_cache_entries();
}
/*
* Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
@@ -426,14 +425,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
* preallocate an entry.
*/
rp = nfsd_reply_cache_alloc();
- spin_lock(&cache_lock);
+ spin_lock(&b->cache_lock);
if (likely(rp)) {
atomic_inc(&num_drc_entries);
drc_mem_usage += sizeof(*rp);
}

/* go ahead and prune the cache */
- prune_cache_entries();
+ prune_bucket(b);

found = nfsd_cache_search(b, rqstp, csum);
if (found) {
@@ -470,7 +469,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
}
rp->c_type = RC_NOCACHE;
out:
- spin_unlock(&cache_lock);
+ spin_unlock(&b->cache_lock);
return rtn;

found_entry:
@@ -548,7 +547,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)

/* Don't cache excessive amounts of data and XDR failures */
if (!statp || len > (256 >> 2)) {
- nfsd_reply_cache_free(rp);
+ nfsd_reply_cache_free(b, rp);
return;
}

@@ -563,23 +562,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
bufsize = len << 2;
cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
if (!cachv->iov_base) {
- nfsd_reply_cache_free(rp);
+ nfsd_reply_cache_free(b, rp);
return;
}
cachv->iov_len = bufsize;
memcpy(cachv->iov_base, statp, bufsize);
break;
case RC_NOCACHE:
- nfsd_reply_cache_free(rp);
+ nfsd_reply_cache_free(b, rp);
return;
}
- spin_lock(&cache_lock);
+ spin_lock(&b->cache_lock);
drc_mem_usage += bufsize;
lru_put_end(b, rp);
rp->c_secure = rqstp->rq_secure;
rp->c_type = cachetype;
rp->c_state = RC_DONE;
- spin_unlock(&cache_lock);
+ spin_unlock(&b->cache_lock);
return;
}

@@ -610,7 +609,6 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
*/
static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
{
- spin_lock(&cache_lock);
seq_printf(m, "max entries: %u\n", max_drc_entries);
seq_printf(m, "num entries: %u\n",
atomic_read(&num_drc_entries));
@@ -622,7 +620,6 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
seq_printf(m, "payload misses: %u\n", payload_misses);
seq_printf(m, "longest chain len: %u\n", longest_chain);
seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize);
- spin_unlock(&cache_lock);
return 0;
}

--
1.9.3


2014-08-03 17:06:11

by Trond Myklebust

Subject: [PATCH 4/6] nfsd: convert num_drc_entries to an atomic_t

...so we can remove the spinlocking around it.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/nfscache.c | 28 ++++++++++++----------------
1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 8abec475f80f..dc909091349b 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -47,7 +47,7 @@ static unsigned int drc_hashsize;
*/

/* total number of entries */
-static unsigned int num_drc_entries;
+static atomic_t num_drc_entries;

/* cache misses due only to checksum comparison failures */
static unsigned int payload_misses;
@@ -148,7 +148,7 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
kfree(rp->c_replvec.iov_base);
}
list_del(&rp->c_lru);
- --num_drc_entries;
+ atomic_dec(&num_drc_entries);
drc_mem_usage -= sizeof(*rp);
kmem_cache_free(drc_slab, rp);
}
@@ -167,7 +167,7 @@ int nfsd_reply_cache_init(void)
unsigned int i;

max_drc_entries = nfsd_cache_size_limit();
- num_drc_entries = 0;
+ atomic_set(&num_drc_entries, 0);
hashsize = nfsd_hashsize(max_drc_entries);
maskbits = ilog2(hashsize);

@@ -242,7 +242,7 @@ prune_bucket(struct nfsd_drc_bucket *b)
*/
if (rp->c_state == RC_INPROG)
continue;
- if (num_drc_entries <= max_drc_entries &&
+ if (atomic_read(&num_drc_entries) <= max_drc_entries &&
time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
break;
nfsd_reply_cache_free_locked(rp);
@@ -290,13 +290,7 @@ cache_cleaner_func(struct work_struct *unused)
static unsigned long
nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
- unsigned long num;
-
- spin_lock(&cache_lock);
- num = num_drc_entries;
- spin_unlock(&cache_lock);
-
- return num;
+ return atomic_read(&num_drc_entries);
}

static unsigned long
@@ -386,11 +380,12 @@ nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
/* tally hash chain length stats */
if (entries > longest_chain) {
longest_chain = entries;
- longest_chain_cachesize = num_drc_entries;
+ longest_chain_cachesize = atomic_read(&num_drc_entries);
} else if (entries == longest_chain) {
/* prefer to keep the smallest cachesize possible here */
- longest_chain_cachesize = min(longest_chain_cachesize,
- num_drc_entries);
+ longest_chain_cachesize = min_t(unsigned int,
+ longest_chain_cachesize,
+ atomic_read(&num_drc_entries));
}

return ret;
@@ -433,7 +428,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
rp = nfsd_reply_cache_alloc();
spin_lock(&cache_lock);
if (likely(rp)) {
- ++num_drc_entries;
+ atomic_inc(&num_drc_entries);
drc_mem_usage += sizeof(*rp);
}

@@ -617,7 +612,8 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
{
spin_lock(&cache_lock);
seq_printf(m, "max entries: %u\n", max_drc_entries);
- seq_printf(m, "num entries: %u\n", num_drc_entries);
+ seq_printf(m, "num entries: %u\n",
+ atomic_read(&num_drc_entries));
seq_printf(m, "hash buckets: %u\n", 1 << maskbits);
seq_printf(m, "mem usage: %u\n", drc_mem_usage);
seq_printf(m, "cache hits: %u\n", nfsdstats.rchits);
--
1.9.3