knfsd: The duplicate request cache has a static size whose value
was chosen in the dim mists of history and is now far too small for
the data structure to be useful. So, allow it to expand on demand,
when too young an entry falls off the end of an LRU chain.
There is no way to tune the maximum size or minimum age parameters at
runtime, nor has any provision been made for shrinking the repcache
under memory pressure. The maximum size observed, on a multi-CPU
multi-NIC machine handling an extreme workload of about 80,000
non-idempotent calls per second, has been only about 16MiB.
An earlier version of this patch set contained code that also expanded
the hash index under load, but testing shows that merely overallocating
the initial hash index provides a good enough index for observed
loads so the extra complexity was not considered worthwhile.
Signed-off-by: Greg Banks <[email protected]>
---
fs/nfsd/nfscache.c | 66 ++++++++++++++++++++++++++++++++++++++----
1 files changed, 61 insertions(+), 5 deletions(-)
Index: linux-git-20061009/fs/nfsd/nfscache.c
===================================================================
--- linux-git-20061009.orig/fs/nfsd/nfscache.c 2006-10-10 16:39:12.624126466 +1000
+++ linux-git-20061009/fs/nfsd/nfscache.c 2006-10-10 16:41:07.121363949 +1000
@@ -9,7 +9,7 @@
*
* Copyright (C) 1995, 1996 Olaf Kirch <[email protected]>
*
- * SMP lock splitting by Greg Banks <[email protected]>
+ * Dynamic expansion and SMP lock splitting by Greg Banks <[email protected]>
* Copyright (c) 2005-2006 Silicon Graphics, Inc.
*/
@@ -24,11 +24,21 @@
#include <linux/nfsd/nfsd.h>
#include <linux/nfsd/cache.h>
-/* Size of reply cache. Common values are:
+/* Initial size of reply cache. Common values are:
* 4.3BSD: 128
* 4.4BSD: 256
* Solaris2: 1024
* DEC Unix: 512-4096
+ *
+ * All these values reflect network packet rates and NFS loads common
+ * somewhen around 1990, and are utterly inadequate for modern NFS
+ * servers. To be at all effective the reply cache needs to hold all
+ * NFS calls seen by the server for at least a client RPC timeout period
+ * (typically 1.1 seconds), and to handle weird IP routing issues should
+ * really hold 120 seconds of traffic. A modern NFS server can be
+ * fielding upwards of 10,000 calls per second, which means the default
+ * cache size of 1024 holds about 102 milliseconds' traffic, i.e. the
+ * default size is three orders of magnitude too small.
*/
#define CACHESIZE 1024
/* number of buckets used to manage LRU lists and cache locks (power of 2) */
@@ -41,10 +51,16 @@
#define CACHE_MAX_SIZE (16*1024*CACHE_NUM_BUCKETS)
/* largest possible number of entries in LRU per bucket */
#define CACHE_BUCKET_MAX_SIZE (CACHE_MAX_SIZE/CACHE_NUM_BUCKETS)
+/* number of entries each bucket will expand by */
+#define CACHE_BUCKET_INCREMENT (1024/CACHE_NUM_BUCKETS)
/* log2 of largest desired hash chain length */
#define MAX_CHAIN_ORDER 2
/* size of the per-bucket hash table */
#define HASHSIZE ((CACHE_MAX_SIZE>>MAX_CHAIN_ORDER)/CACHE_NUM_BUCKETS)
+/* the cache attempts to expand if an entry younger than this is evicted */
+#define CACHE_THRESH_AGE (11 * HZ / 10) /* in jiffies */
+/* parameters for rate limiting cache expansion */
+#define CACHE_RATE_JIFFIES (HZ/2)
/*
* locking for the reply cache:
@@ -64,6 +80,9 @@ struct svc_cache_bucket
struct list_head lru;
unsigned int size;
struct hlist_head *hash;
+ /* parameters for expand rate limiting */
+ unsigned long last;
+ unsigned long nhits;
} ____cacheline_aligned_in_smp;
static struct svc_cache_bucket cache_buckets[CACHE_NUM_BUCKETS];
@@ -87,12 +106,12 @@ request_hash(u32 xid)
static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
/*
- * Initialise the reply cache data structures.
+ * Expand (or initialise) the reply cache data structures.
* Called without cache_lock, uses it internally. Returns
* 0 on success, an error otherwise.
*/
static void
-nfsd_cache_bucket_init(struct svc_cache_bucket *b, unsigned int nrecs)
+nfsd_cache_bucket_expand(struct svc_cache_bucket *b, unsigned int nrecs)
{
struct svc_cacherep *rp;
unsigned int i;
@@ -136,7 +155,7 @@ nfsd_cache_init(void)
INIT_LIST_HEAD(&b->lru);
spin_lock_init(&b->lock);
- nfsd_cache_bucket_init(b, CACHESIZE/CACHE_NUM_BUCKETS);
+ nfsd_cache_bucket_expand(b, CACHESIZE/CACHE_NUM_BUCKETS);
b->hash = kmalloc (HASHSIZE * sizeof(struct hlist_head), GFP_KERNEL);
if (!b->hash) {
@@ -186,6 +205,28 @@ lru_put_end(struct svc_cache_bucket *b,
}
/*
+ * Decide whether it is time to expand the cache. Returns 1 iff
+ * the cache is to be expanded. Called with bucket lock held.
+ */
+static int
+nfsd_cache_expand_ratelimit(struct svc_cache_bucket *b)
+{
+ unsigned long now = jiffies;
+
+ b->nhits++;
+ if (b->last == 0) {
+ b->last = now;
+ } else if ((now - b->last) > CACHE_RATE_JIFFIES &&
+ b->nhits > (b->size >> 4)) {
+ b->nhits = 0;
+ b->last = now;
+ return 1;
+ }
+ return 0;
+}
+
+
+/*
* Try to find an entry matching the current call in the cache. When none
* is found, we grab the oldest unlocked entry off the LRU list.
* Note that no operation within the loop may sleep.
@@ -205,6 +246,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
unsigned long age;
int rtn;
int safe = 0;
+ int expand = 0;
h = request_hash(xid);
b = bucket_for_hash(h);
@@ -255,6 +297,18 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
goto out;
}
+ if (rp->c_state != RC_UNUSED) {
+ /* reusing an existing cache entry */
+ age = jiffies - rp->c_timestamp;
+ if (age < CACHE_THRESH_AGE &&
+ b->size < CACHE_BUCKET_MAX_SIZE &&
+ nfsd_cache_expand_ratelimit(b)) {
+ expand = CACHE_BUCKET_INCREMENT;
+ if (b->size + expand > CACHE_BUCKET_MAX_SIZE)
+ expand = CACHE_BUCKET_MAX_SIZE - b->size;
+ }
+ }
+
rqstp->rq_cacherep = rp;
rp->c_state = RC_INPROG;
rp->c_xid = xid;
@@ -276,6 +330,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
rp->c_type = RC_NOCACHE;
out:
spin_unlock(&b->lock);
+ if (expand)
+ nfsd_cache_bucket_expand(b, expand);
return rtn;
found_entry:
Greg.
--
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.
-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
NFS maillist - [email protected]
https://lists.sourceforge.net/lists/listinfo/nfs
On Wednesday October 11, [email protected] wrote:
> /*
> + * Decide whether it is time to expand the cache. Returns 1 iff
> + * the cache is to be expanded. Called with bucket lock held.
> + */
> +static int
> +nfsd_cache_expand_ratelimit(struct svc_cache_bucket *b)
> +{
> + unsigned long now = jiffies;
> +
> + b->nhits++;
> + if (b->last == 0) {
> + b->last = now;
> + } else if ((now - b->last) > CACHE_RATE_JIFFIES &&
> + b->nhits > (b->size >> 4)) {
> + b->nhits = 0;
> + b->last = now;
> + return 1;
> + }
> + return 0;
> +}
I'm wondering a bit about this function.
Firstly (and this is very minor) I expected to see "time_before" or
similar when comparing times. The code you have is correct, but maybe
it will also be obvious if it read
if (time_after(now, b->last + CACHE_RATE_JIFFIES))
??
Also, nhits is only reset when we return 1, and hence grow the bucket.
So if we only occasionally had large bursts which included
CACHE_BUCKET_MAX_SIZE requests in CACHE_THRESH_AGE jiffies, we would
still eventually grow the bucket to it's max size.
I think I would like to have something like
b->nhits++;
if (b->last == 0) {
b->last = now;
} else if (time_after(now, b->last + CACHE_MAX_WAIT)) {
b->last = now; /* too slow, forget it */
b->nhits = 0;
return 0;
} else if (time_after(now, b->last + CACHE_RATE_JIFFIES) &&
b->nhits > (b->size >> 4)) {
b->nhits = 0;
b->last = now;
return 1;
}
return 0;
with CACHE_MAX_WAIT maybe 60*HZ.
Make sense?
NeilBrown
-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
NFS maillist - [email protected]
https://lists.sourceforge.net/lists/listinfo/nfs
On Mon, Oct 16, 2006 at 12:14:01PM +1000, Neil Brown wrote:
> On Wednesday October 11, [email protected] wrote:
> > /*
> > + * Decide whether it is time to expand the cache. Returns 1 iff
> > + * the cache is to be expanded. Called with bucket lock held.
> > + */
> > +static int
> > +nfsd_cache_expand_ratelimit(struct svc_cache_bucket *b)
> > +{
> > + unsigned long now = jiffies;
> > +
> > + b->nhits++;
> > + if (b->last == 0) {
> > + b->last = now;
> > + } else if ((now - b->last) > CACHE_RATE_JIFFIES &&
> > + b->nhits > (b->size >> 4)) {
> > + b->nhits = 0;
> > + b->last = now;
> > + return 1;
> > + }
> > + return 0;
> > +}
>
> I'm wondering a bit about this function.
> Firstly (and this is very minor) I expected to see "time_before" or
> similar when comparing times. The code you have is correct, but maybe
> it will also be obvious if it read
> if (time_after(now, b->last + CACHE_RATE_JIFFIES))
> ??
Ok.
> Also, nhits is only reset when we return 1, and hence grow the bucket.
> So if we only occasionally had large bursts which included
> CACHE_BUCKET_MAX_SIZE requests in CACHE_THRESH_AGE jiffies, we would
> still eventually grow the bucket to it's max size.
> I think I would like to have something like
>
>
> b->nhits++;
> if (b->last == 0) {
> b->last = now;
> } else if (time_after(now, b->last + CACHE_MAX_WAIT)) {
> b->last = now; /* too slow, forget it */
> b->nhits = 0;
> return 0;
> } else if (time_after(now, b->last + CACHE_RATE_JIFFIES) &&
> b->nhits > (b->size >> 4)) {
> b->nhits = 0;
> b->last = now;
> return 1;
> }
> return 0;
>
> with CACHE_MAX_WAIT maybe 60*HZ.
>
> Make sense?
>
How about this?
b->nhits++;
if (b->last == 0) {
b->last = now;
} else if (time_after(now, b->last + CACHE_MAX_WAIT)) {
b->last = now; /* too slow */
b->nhits >>= 1;
return 0;
} else if (time_after(now, b->last + CACHE_RATE_JIFFIES) &&
b->nhits > (b->size >> 4)) {
b->nhits = 0;
b->last = now;
return 1;
}
return 0;
It gives a damping factor without totally forgetting the past hits.
Otherwise the case of a regular burst every 61 seconds fails to expand
the repcache, which I think it should.
Greg.
--
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.
-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
NFS maillist - [email protected]
https://lists.sourceforge.net/lists/listinfo/nfs
On Monday October 16, [email protected] wrote:
>
> How about this?
>
> b->nhits++;
> if (b->last == 0) {
> b->last = now;
> } else if (time_after(now, b->last + CACHE_MAX_WAIT)) {
> b->last = now; /* too slow */
> b->nhits >>= 1;
> return 0;
> } else if (time_after(now, b->last + CACHE_RATE_JIFFIES) &&
> b->nhits > (b->size >> 4)) {
> b->nhits = 0;
> b->last = now;
> return 1;
> }
> return 0;
>
> It gives a damping factor without totally forgetting the past hits.
> Otherwise the case of a regular burst every 61 seconds fails to expand
> the repcache, which I think it should.
Works for me. Definite improvement.
Thanks,
NeilBrown
-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
NFS maillist - [email protected]
https://lists.sourceforge.net/lists/listinfo/nfs