From: Krishna Kumar Subject: [RFC PATCH 1/1]: nfsd: By changing RA caching to file handle caching Date: Tue, 30 Dec 2008 16:12:59 +0530 Message-ID: <20081230104259.9409.88789.sendpatchset@localhost.localdomain> References: <20081230104245.9409.30030.sendpatchset@localhost.localdomain> Cc: krkumar2@in.ibm.com, Krishna Kumar To: linux-nfs@vger.kernel.org Return-path: Received: from e36.co.us.ibm.com ([32.97.110.154]:46368 "EHLO e36.co.us.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751557AbYL3KnD (ORCPT ); Tue, 30 Dec 2008 05:43:03 -0500 Received: from d03relay02.boulder.ibm.com (d03relay02.boulder.ibm.com [9.17.195.227]) by e36.co.us.ibm.com (8.13.1/8.13.1) with ESMTP id mBUAgA4G026616 for ; Tue, 30 Dec 2008 03:42:10 -0700 Received: from d03av04.boulder.ibm.com (d03av04.boulder.ibm.com [9.17.195.170]) by d03relay02.boulder.ibm.com (8.13.8/8.13.8/NCO v9.1) with ESMTP id mBUAh2Cn056538 for ; Tue, 30 Dec 2008 03:43:02 -0700 Received: from d03av04.boulder.ibm.com (loopback [127.0.0.1]) by d03av04.boulder.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id mBUAh2jn007928 for ; Tue, 30 Dec 2008 03:43:02 -0700 In-Reply-To: <20081230104245.9409.30030.sendpatchset-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> Sender: linux-nfs-owner@vger.kernel.org List-ID: From: Krishna Kumar Implement the FH caching. List of changes: 1. Rename RA to FH, parm to cache, and remove all users of readahead. 2. Add fields in the fhparms to cache file, svc_export, expiry time and expiry list. Modify some other fields (eg p_count is atomic). 3. Implement a daemon to clean up cached FH's. 4. Added four helper functions: fh_cache_get: Hold a reference to dentry and svc_export. fh_cache_put: Drop a reference to file, dentry and svc_export. fh_get_cached_values: Returns file and svc_export. fh_cache_upd: Updates file and svc_export. Add entry to list for daemon to cleanup. 5. get_raparms is slightly rewritten. 6. nfsd_read rewritten to use the cache. 7. 
File remove operation from the client results in the server checking the cache and dropping the reference immediately (remove operation on the server still retains the reference for some time). 8. init and shutdown are slightly modified. 9. ra_size, ra_depth, nfsd_racache_init and nfsd_racache_shutdown still retain the "ra" prefix for now. Signed-off-by: Krishna Kumar --- fs/nfsd/vfs.c | 449 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 358 insertions(+), 91 deletions(-) diff -ruNp linux-2.6.28.org/fs/nfsd/vfs.c linux-2.6.28.new/fs/nfsd/vfs.c --- linux-2.6.28.org/fs/nfsd/vfs.c 2008-12-30 09:52:43.000000000 +0530 +++ linux-2.6.28.new/fs/nfsd/vfs.c 2008-12-30 12:09:57.000000000 +0530 @@ -55,38 +55,53 @@ #include #endif /* CONFIG_NFSD_V4 */ #include +#include #include #define NFSDDBG_FACILITY NFSDDBG_FILEOP +/* Number of jiffies to cache the file before releasing */ +#define NFSD_CACHE_JIFFIES 100 /* - * This is a cache of readahead params that help us choose the proper - * readahead strategy. Initially, we set all readahead parameters to 0 - * and let the VFS handle things. + * This is a cache of file handles to quicken file lookup. This also helps + * to prevent multiple open/close of a file when the client reads it. + * * If you increase the number of cached files very much, you'll need to * add a hash table here. 
*/ -struct raparms { - struct raparms *p_next; - unsigned int p_count; - ino_t p_ino; - dev_t p_dev; - int p_set; - struct file_ra_state p_ra; +struct fhcache { + struct fhcache *p_next; + + /* Hashed on this parameter */ + __u32 p_auth; + + /* Cached information */ + struct file *p_filp; + struct svc_export *p_exp; + + /* Refcount for overwrite */ + atomic_t p_count; + + /* When this entry expires */ + unsigned long p_expires; + + /* List of entries linked to 'nfsd_daemon_list' */ + struct list_head p_list; + unsigned int p_hindex; }; -struct raparm_hbucket { - struct raparms *pb_head; +struct fhcache_hbucket { + struct fhcache *pb_head; spinlock_t pb_lock; } ____cacheline_aligned_in_smp; -#define RAPARM_HASH_BITS 4 -#define RAPARM_HASH_SIZE (1<d_inode->i_fop); } +/* Daemon to handle expired fh cache entries */ +static struct task_struct *k_nfsd_task; + +/* Synchronization for daemon with enqueuer's */ +static spinlock_t k_nfsd_lock; + +/* List of FH cache entries that has to be cleaned up when they expire */ +static struct list_head nfsd_daemon_list; + /* - * Obtain the readahead parameters for the file - * specified by (dev, ino). + * Returns cached values of 'file' and svc_export; resets these entries + * to NULL. */ +static inline void fh_get_cached_values(struct fhcache *fh, struct file **filep, + struct svc_export **expp) +{ + *filep = fh->p_filp; + *expp = fh->p_exp; + + fh->p_filp = NULL; + fh->p_exp = NULL; +} + +/* + * Hold a reference to dentry and svc_export (file already has an extra + * reference count as it is not closed normally. 
+ */ +static inline void fh_cache_get(struct file *file, struct svc_export *exp) +{ + dget(file->f_path.dentry); + cache_get(&exp->h); +} + +/* Drop a reference to file, dentry and svc_export */ +static inline void fh_cache_put(struct file *file, struct svc_export *exp) +{ + cache_put(&exp->h, &svc_export_cache); + dput(file->f_path.dentry); + fput(file); +} + +/* + * Holds a reference to 'file' and svc_export, and caches both. Add fh entry + * to list for daemon to cleanup later. + */ +static inline void fh_cache_upd(struct fhcache *fh, struct file *file, + struct svc_export *exp) +{ + struct fhcache_hbucket *fhb = &fhcache_hash[fh->p_hindex]; + + fh_cache_get(file, exp); + + spin_lock(&fhb->pb_lock); + fh->p_filp = file; + fh->p_exp = exp; + + /* + * Once we add the entry to the list, we'd rather it expire + * prematurely rather than updating it on every read. + */ + if (likely(list_empty(&fh->p_list))) { + fh->p_expires = jiffies + NFSD_CACHE_JIFFIES; + spin_lock(&k_nfsd_lock); + list_add_tail(&fh->p_list, &nfsd_daemon_list); + spin_unlock(&k_nfsd_lock); + } + spin_unlock(&fhb->pb_lock); +} -static inline struct raparms * -nfsd_get_raparms(dev_t dev, ino_t ino) +/* Daemon cache cleanup handler */ +void daemon_free_entries(void) { - struct raparms *ra, **rap, **frap = NULL; - int depth = 0; - unsigned int hash; - struct raparm_hbucket *rab; + unsigned long now = jiffies; + + spin_lock(&k_nfsd_lock); + while (!list_empty(&nfsd_daemon_list)) { + struct fhcache *fh = list_entry(nfsd_daemon_list.next, + struct fhcache, p_list); + struct fhcache_hbucket *fhb; - hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; - rab = &raparm_hash[hash]; + if (time_after(fh->p_expires, now) || now != jiffies) { + /* + * This (and all subsequent entries) have not expired; + * or we have spent too long in this loop. 
+ */ + break; + } + + fhb = &fhcache_hash[fh->p_hindex]; + + /* + * Make sure we do not deadlock with updaters - we can free + * entry next time in case of a race. + */ + if (!spin_trylock(&fhb->pb_lock)) { + /* + * Entry is being used, no need to free this, try later + */ + break; + } + + if (unlikely(!fh->p_filp)) { + /* + * Handle race with get_fhcache where it overwrites + * the fh. We remove this entry - it will be added + * back later by upd() which is racing with us. + */ + list_del_init(&fh->p_list); + spin_unlock(&fhb->pb_lock); + } else { + struct file *file; + struct svc_export *exp; + + if (atomic_read(&fh->p_count)) { + spin_unlock(&fhb->pb_lock); + break; + } + + list_del_init(&fh->p_list); + fh_get_cached_values(fh, &file, &exp); + spin_unlock(&fhb->pb_lock); + spin_unlock(&k_nfsd_lock); + + fh_cache_put(file, exp); + spin_lock(&k_nfsd_lock); + } + } + spin_unlock(&k_nfsd_lock); +} + +static int k_nfsd_thread(void *unused) +{ + while (!kthread_should_stop()) { + schedule_timeout_interruptible(NFSD_CACHE_JIFFIES); - spin_lock(&rab->pb_lock); - for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { - if (ra->p_ino == ino && ra->p_dev == dev) + if (kthread_should_stop()) + break; + + daemon_free_entries(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +/* + * Obtain the cached file, export and d_inode values for the FH + * specified by fh->auth[3] + */ +static inline struct fhcache * +nfsd_get_fhcache(__u32 auth) +{ + struct fhcache *fh, **fhp, **ffhp = NULL; + int depth = 0; + unsigned int hash; + struct fhcache_hbucket *fhb; + struct file *file = NULL; + struct svc_export *exp = NULL; + + if (!auth) + return NULL; + + hash = jhash_1word(auth, 0xfeedbeef) & FHPARM_HASH_MASK; + fhb = &fhcache_hash[hash]; + + spin_lock(&fhb->pb_lock); + for (fhp = &fhb->pb_head; (fh = *fhp); fhp = &fh->p_next) { + if (fh->p_auth == auth) { + /* Same inode */ + if (!fh->p_filp) { + /* Someone is racing in the same code */ + spin_unlock(&fhb->pb_lock); + 
return NULL; + } + + /* + * Hold an extra reference to dentry/exp since these + * are released in fh_put(). 'file' already has an + * extra hold from the first lookup which was never + * dropped. + */ + fh_cache_get(fh->p_filp, fh->p_exp); goto found; + } + depth++; - if (ra->p_count == 0) - frap = rap; + + /* Unused or different inode */ + if (!atomic_read(&fh->p_count)) { + if (!ffhp || (*ffhp)->p_filp) + ffhp = fhp; + } } - depth = nfsdstats.ra_size*11/10; - if (!frap) { - spin_unlock(&rab->pb_lock); + + if (!ffhp) { + spin_unlock(&fhb->pb_lock); return NULL; } - rap = frap; - ra = *frap; - ra->p_dev = dev; - ra->p_ino = ino; - ra->p_set = 0; - ra->p_hindex = hash; + + depth = nfsdstats.ra_size*11/10; + fhp = ffhp; + fh = *ffhp; + fh->p_hindex = hash; + fh->p_auth = auth; + + if (fh->p_filp) + fh_get_cached_values(fh, &file, &exp); + found: - if (rap != &rab->pb_head) { - *rap = ra->p_next; - ra->p_next = rab->pb_head; - rab->pb_head = ra; + if (fhp != &fhb->pb_head) { + *fhp = fh->p_next; + fh->p_next = fhb->pb_head; + fhb->pb_head = fh; } - ra->p_count++; + + atomic_inc(&fh->p_count); nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; - spin_unlock(&rab->pb_lock); - return ra; + spin_unlock(&fhb->pb_lock); + + if (file) { + /* + * Free the existing entry. The new entry will expire + * prematurely, but it will be updated to the correct expiry + * and be cached for the full time duration if it is used + * again after expiry. 
+ */ + fh_cache_put(file, exp); + } + return fh; } /* @@ -892,7 +1091,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { struct inode *inode; - struct raparms *ra; mm_segment_t oldfs; __be32 err; int host_err; @@ -903,11 +1101,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count)) goto out; - /* Get readahead parameters */ - ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); - - if (ra && ra->p_set) - file->f_ra = ra->p_ra; if (file->f_op->splice_read && rqstp->rq_splice_ok) { struct splice_desc sd = { @@ -926,16 +1119,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st set_fs(oldfs); } - /* Write back readahead params */ - if (ra) { - struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; - spin_lock(&rab->pb_lock); - ra->p_ra = file->f_ra; - ra->p_set = 1; - ra->p_count--; - spin_unlock(&rab->pb_lock); - } - if (host_err >= 0) { nfsdstats.io_read += host_err; *count = host_err; @@ -1078,12 +1261,38 @@ nfsd_read(struct svc_rqst *rqstp, struct goto out; err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); } else { - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); - if (err) - goto out; - err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); - nfsd_close(file); + struct fhcache *fh; + + /* Check if this fh is cached */ + fh = nfsd_get_fhcache(fhp->fh_handle.fh_auth[3]); + if (fh && fh->p_filp) { + /* Got cached values */ + file = fh->p_filp; + fhp->fh_dentry = file->f_path.dentry; + fhp->fh_export = fh->p_exp; + err = fh_verify(rqstp, fhp, S_IFREG, NFSD_MAY_READ); + } else { + /* Nothing in cache, or no free cache entry available */ + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, + &file); + } + + if (!err) + err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, + count); + + if (fh) { + if (!fh->p_filp && file) { + /* Write back cached values */ + fh_cache_upd(fh, file, fhp->fh_export); + } + + /* Drop our 
reference */ + atomic_dec(&fh->p_count); + } else if (file) + nfsd_close(file); } + out: return err; } @@ -1791,6 +2000,38 @@ nfsd_unlink(struct svc_rqst *rqstp, stru goto out_nfserr; if (type != S_IFDIR) { /* It's UNLINK */ + int i, found = 0; + + for (i = 0 ; i < FHPARM_HASH_SIZE && !found; i++) { + struct fhcache_hbucket *fhb = &fhcache_hash[i]; + struct fhcache *fh; + + spin_lock(&fhb->pb_lock); + for (fh = fhb->pb_head; fh; fh = fh->p_next) { + if (fh->p_filp && + fh->p_filp->f_path.dentry == rdentry) { + /* Found the entry for removed file */ + struct file *file; + struct svc_export *exp; + + fh_get_cached_values(fh, &file, &exp); + spin_lock(&k_nfsd_lock); + list_del_init(&fh->p_list); + spin_unlock(&k_nfsd_lock); + + spin_unlock(&fhb->pb_lock); + + /* Drop reference to this entry */ + fh_cache_put(file, exp); + + spin_lock(&fhb->pb_lock); + found = 1; + break; + } + } + spin_unlock(&fhb->pb_lock); + } + #ifdef MSNFS if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && (atomic_read(&rdentry->d_count) > 1)) { @@ -2061,23 +2302,36 @@ nfsd_permission(struct svc_rqst *rqstp, void nfsd_racache_shutdown(void) { - struct raparms *raparm, *last_raparm; unsigned int i; - dprintk("nfsd: freeing readahead buffers.\n"); + dprintk("nfsd: freeing FH buffers.\n"); - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - raparm = raparm_hash[i].pb_head; - while(raparm) { - last_raparm = raparm; - raparm = raparm->p_next; - kfree(last_raparm); + /* First stop the daemon, and we will clean up here ourselves */ + kthread_stop(k_nfsd_task); + k_nfsd_task = NULL; + + for (i = 0; i < FHPARM_HASH_SIZE; i++) { + struct fhcache *fhcache, *last_fhcache; + + fhcache = fhcache_hash[i].pb_head; + while(fhcache) { + last_fhcache = fhcache; + if (fhcache->p_filp) { + struct file *file; + struct svc_export *exp; + + fh_get_cached_values(fhcache, &file, &exp); + list_del(&fhcache->p_list); + fh_cache_put(file, exp); + } + fhcache = fhcache->p_next; + kfree(last_fhcache); } - raparm_hash[i].pb_head = NULL; 
+ fhcache_hash[i].pb_head = NULL; } } /* - * Initialize readahead param cache + * Initialize file cache */ int nfsd_racache_init(int cache_size) @@ -2085,36 +2339,49 @@ nfsd_racache_init(int cache_size) int i; int j = 0; int nperbucket; - struct raparms **raparm = NULL; + struct fhcache **fhcache = NULL; - if (raparm_hash[0].pb_head) + if (fhcache_hash[0].pb_head) return 0; - nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); + nperbucket = DIV_ROUND_UP(cache_size, FHPARM_HASH_SIZE); if (nperbucket < 2) nperbucket = 2; - cache_size = nperbucket * RAPARM_HASH_SIZE; + cache_size = nperbucket * FHPARM_HASH_SIZE; - dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); + dprintk("nfsd: allocating %d file cache buffers.\n", cache_size); - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - spin_lock_init(&raparm_hash[i].pb_lock); + for (i = 0; i < FHPARM_HASH_SIZE; i++) { + spin_lock_init(&fhcache_hash[i].pb_lock); - raparm = &raparm_hash[i].pb_head; + fhcache = &fhcache_hash[i].pb_head; for (j = 0; j < nperbucket; j++) { - *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); - if (!*raparm) + *fhcache = kzalloc(sizeof(struct fhcache), GFP_KERNEL); + if (!*fhcache) { + dprintk("nfsd: kmalloc failed, freeing file cache buffers\n"); goto out_nomem; - raparm = &(*raparm)->p_next; + } + INIT_LIST_HEAD(&(*fhcache)->p_list); + fhcache = &(*fhcache)->p_next; } - *raparm = NULL; + *fhcache = NULL; } nfsdstats.ra_size = cache_size; + + INIT_LIST_HEAD(&nfsd_daemon_list); + spin_lock_init(&k_nfsd_lock); + k_nfsd_task = kthread_run(k_nfsd_thread, NULL, "nfsd_cacher"); + + if (IS_ERR(k_nfsd_task)) { + printk(KERN_ERR "%s: unable to create kernel thread: %ld\n", + __FUNCTION__, PTR_ERR(k_nfsd_task)); + goto out_nomem; + } + return 0; out_nomem: - dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); nfsd_racache_shutdown(); return -ENOMEM; }