From: Krishna Kumar
Subject: [RFC] [PATCH v2 1/1]: nfsd: Change caching from ino/dev to file-handle
Date: Sat, 24 Jan 2009 18:05:11 +0530
Message-ID: <20090124123511.10995.88449.sendpatchset@localhost.localdomain>
In-Reply-To: <20090124123457.10995.57636.sendpatchset@localhost.localdomain>
References: <20090124123457.10995.57636.sendpatchset@localhost.localdomain>
To: bfields@fieldses.org
Cc: linux-nfs@vger.kernel.org, Krishna Kumar

From: Krishna Kumar

Implement file handle caching. List of changes:

1. Remove all implementation and users of readahead; rename "RA" to "FH"
   and "parm" to "cache".
2. Add fields to the fh cache entry to hold the cached file, svc_export,
   expiry list linkage and expiry time. Modify some other fields (e.g.
   p_count is now atomic).
3. Implement a daemon to clean up cached FHs.
4. Add four helper functions:
	fh_cache_get: Hold a reference to the dentry and the svc_export.
	fh_cache_put: Drop the references to the file, dentry and svc_export.
	fh_get_cached_entries: Return the cached file and svc_export.
	fh_cache_upd: Cache the file and svc_export, and add the entry to the
		list for the daemon to clean up.
5. nfsd_get_raparms is slightly rewritten (and renamed to nfsd_get_fhcache).
6. nfsd_read is rewritten to use the cache; readahead handling is removed
   from nfsd_vfs_read.
7. A file remove operation from a client makes the server check the cache
   and drop the reference immediately (a remove operation on the server
   itself still retains the reference for a short time, until the daemon
   frees up the entry).
8. init and shutdown are slightly modified (ra_size, ra_depth,
   nfsd_racache_init and nfsd_racache_shutdown still retain the "ra" prefix
   for now).

(A simplified, illustration-only sketch of the caching scheme follows the
patch.)

Signed-off-by: Krishna Kumar
---
 fs/nfsd/vfs.c |  429 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 338 insertions(+), 91 deletions(-)

diff -ruNp 2.6.29-rc2.org/fs/nfsd/vfs.c 2.6.29-rc2.new/fs/nfsd/vfs.c
--- 2.6.29-rc2.org/fs/nfsd/vfs.c	2009-01-22 13:23:18.000000000 +0530
+++ 2.6.29-rc2.new/fs/nfsd/vfs.c	2009-01-22 13:23:18.000000000 +0530
@@ -55,38 +55,53 @@
 #include
 #endif /* CONFIG_NFSD_V4 */
 #include
+#include <linux/kthread.h>
 #include
 
 #define NFSDDBG_FACILITY		NFSDDBG_FILEOP
 
+/* Number of jiffies to cache the file before releasing */
+#define NFSD_CACHE_JIFFIES	100
 /*
- * This is a cache of readahead params that help us choose the proper
- * readahead strategy. Initially, we set all readahead parameters to 0
- * and let the VFS handle things.
+ * This is a cache of file handles to quicken file lookup. This also
+ * helps prevent multiple open/close of a file when a client reads it.
+ *
  * If you increase the number of cached files very much, you'll need to
  * add a hash table here.
 */
-struct raparms {
-	struct raparms		*p_next;
-	unsigned int		p_count;
-	ino_t			p_ino;
-	dev_t			p_dev;
-	int			p_set;
-	struct file_ra_state	p_ra;
+struct fhcache {
+	struct fhcache		*p_next;
+
+	/* Hashed on this parameter */
+	__u32			p_auth;
+
+	/* Cached information */
+	struct file		*p_filp;
+	struct svc_export	*p_exp;
+
+	/* Refcount for overwrite */
+	atomic_t		p_count;
+
+	/* When this entry expires */
+	unsigned long		p_expires;
+
+	/* List of entries linked to 'nfsd_daemon_list' */
+	struct list_head	p_list;
 	unsigned int		p_hindex;
 };
 
-struct raparm_hbucket {
-	struct raparms		*pb_head;
+struct fhcache_hbucket {
+	struct fhcache		*pb_head;
 	spinlock_t		pb_lock;
 } ____cacheline_aligned_in_smp;
 
-#define RAPARM_HASH_BITS	4
-#define RAPARM_HASH_SIZE	(1<<RAPARM_HASH_BITS)
-#define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
-static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];
+#define FHPARM_HASH_BITS	4
+#define FHPARM_HASH_SIZE	(1<<FHPARM_HASH_BITS)
+#define FHPARM_HASH_MASK	(FHPARM_HASH_SIZE-1)
+static struct fhcache_hbucket	fhcache_hash[FHPARM_HASH_SIZE];
@@ ... @@
 	return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
 }
 
+/* Daemon to handle expired fh cache entries */
+static struct task_struct *k_nfsd_task;
+
+/* Synchronization for daemon with enqueuers */
+static DEFINE_SPINLOCK(k_nfsd_lock);
+
+/* List of FH cache entries that have to be cleaned up when they expire */
+static struct list_head nfsd_daemon_list;
+
+/*
+ * Returns cached values of 'file' and svc_export; resets these entries
+ * to NULL.
+ */
+static inline void fh_get_cached_entries(struct fhcache *fh,
+					 struct file **filep,
+					 struct svc_export **expp)
+{
+	*filep = fh->p_filp;
+	*expp = fh->p_exp;
+
+	fh->p_filp = NULL;
+	fh->p_exp = NULL;
+}
+
+/*
+ * Hold a reference to dentry and svc_export (file already has an extra
+ * reference count as it is not closed normally).
+ */
+static inline void fh_cache_get(struct file *file, struct svc_export *exp)
+{
+	dget(file->f_dentry);
+	cache_get(&exp->h);
+}
+
+/* Drop a reference to file, dentry and svc_export */
+static inline void fh_cache_put(struct file *file, struct svc_export *exp)
+{
+	cache_put(&exp->h, &svc_export_cache);
+	dput(file->f_dentry);
+	fput(file);
+}
+
 /*
- * Obtain the readahead parameters for the file
- * specified by (dev, ino).
+ * Holds a reference to dentry and svc_export, and caches both. Add the fh
+ * entry to the list for the daemon to clean up later. Once we add the
+ * entry to the list, we'd rather let it expire prematurely than update it
+ * on every read.
  */
+static inline void fh_cache_upd(struct fhcache *fh, struct file *file,
+				struct svc_export *exp)
+{
+	if (fh) {
+		if (!fh->p_filp && file) {
+			struct fhcache_hbucket *fhb;
+
+			fh_cache_get(file, exp);
+
+			fhb = &fhcache_hash[fh->p_hindex];
+
+			spin_lock(&fhb->pb_lock);
+			fh->p_filp = file;
+			fh->p_exp = exp;
+			fh->p_expires = jiffies + NFSD_CACHE_JIFFIES;
 
-static inline struct raparms *
-nfsd_get_raparms(dev_t dev, ino_t ino)
+			spin_lock(&k_nfsd_lock);
+			list_add_tail(&fh->p_list, &nfsd_daemon_list);
+			spin_unlock(&k_nfsd_lock);
+
+			spin_unlock(&fhb->pb_lock);
+		}
+
+		/* Drop our reference */
+		atomic_dec(&fh->p_count);
+	} else if (file)
+		nfsd_close(file);
+}
+
+/* Daemon cache cleanup handler */
+void daemon_free_entries(void)
 {
-	struct raparms	*ra, **rap, **frap = NULL;
-	int depth = 0;
-	unsigned int hash;
-	struct raparm_hbucket *rab;
+	unsigned long now = jiffies;
+
+	spin_lock(&k_nfsd_lock);
+	while (!list_empty(&nfsd_daemon_list)) {
+		struct fhcache *fh = list_entry(nfsd_daemon_list.next,
+						struct fhcache, p_list);
+		struct fhcache_hbucket *fhb;
+		struct file *file;
+		struct svc_export *exp;
+
+		if (time_after(fh->p_expires, now) || now != jiffies) {
+			/*
+			 * This entry (and all subsequent entries) has not
+			 * expired, or we have spent too long in this loop.
+			 */
+			break;
+		}
+
+		fhb = &fhcache_hash[fh->p_hindex];
+
+		/*
+		 * Make sure we do not deadlock with updaters - we can free
+		 * entry next time in case of a race.
+		 */
+		if (!spin_trylock(&fhb->pb_lock)) {
+			/*
+			 * Entry is being used, no need to free this, try later
+			 */
+			break;
+		}
+
+		if (atomic_read(&fh->p_count)) {
+			spin_unlock(&fhb->pb_lock);
+			break;
+		}
 
-	hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK;
-	rab = &raparm_hash[hash];
+		list_del(&fh->p_list);
+		fh_get_cached_entries(fh, &file, &exp);
+		spin_unlock(&fhb->pb_lock);
+		spin_unlock(&k_nfsd_lock);
+
+		fh_cache_put(file, exp);
+		spin_lock(&k_nfsd_lock);
+	}
+	spin_unlock(&k_nfsd_lock);
+}
+
+static int k_nfsd_thread(void *unused)
+{
+	while (!kthread_should_stop()) {
+		schedule_timeout_interruptible(NFSD_CACHE_JIFFIES);
 
-	spin_lock(&rab->pb_lock);
-	for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) {
-		if (ra->p_ino == ino && ra->p_dev == dev)
+		if (kthread_should_stop())
+			break;
+
+		daemon_free_entries();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	return 0;
+}
+
+/*
+ * Obtain the cached file, export and d_inode values for the FH
+ * specified by fh->auth[3]
+ */
+static inline struct fhcache *
+nfsd_get_fhcache(__u32 auth)
+{
+	struct fhcache *fh, **fhp, **ffhp = NULL;
+	int depth = 0;
+	unsigned int hash;
+	struct fhcache_hbucket *fhb;
+
+	if (!auth)
+		return NULL;
+
+	hash = jhash_1word(auth, 0xfeedbeef) & FHPARM_HASH_MASK;
+	fhb = &fhcache_hash[hash];
+
+	spin_lock(&fhb->pb_lock);
+	for (fhp = &fhb->pb_head; (fh = *fhp); fhp = &fh->p_next) {
+		if (fh->p_auth == auth) {
+			/* Same inode */
+			if (unlikely(!fh->p_filp)) {
+				/*
+				 * Someone is racing in the same code, and
+				 * this is the first reference to this file.
+				 */
+				spin_unlock(&fhb->pb_lock);
+				return NULL;
+			}
+
+			/*
+			 * Hold an extra reference to dentry/exp since these
+			 * are released in fh_put(). 'file' already has an
+			 * extra hold from the first lookup which was never
+			 * dropped.
+			 */
+			fh_cache_get(fh->p_filp, fh->p_exp);
 			goto found;
+		}
+
 		depth++;
-		if (ra->p_count == 0)
-			frap = rap;
+
+		if (!ffhp && !fh->p_filp) {
+			/*
+			 * This is an unused inode (or a different one), and no
+			 * entry was found till now.
+			 */
+			if (!atomic_read(&fh->p_count))	/* Entry is unused */
+				ffhp = fhp;
+		}
 	}
-	depth = nfsdstats.ra_size*11/10;
-	if (!frap) {
-		spin_unlock(&rab->pb_lock);
+
+	if (!ffhp) {
+		spin_unlock(&fhb->pb_lock);
 		return NULL;
 	}
-	rap = frap;
-	ra = *frap;
-	ra->p_dev = dev;
-	ra->p_ino = ino;
-	ra->p_set = 0;
-	ra->p_hindex = hash;
+
+	depth = nfsdstats.ra_size*11/10;
+	fhp = ffhp;
+	fh = *ffhp;
+	fh->p_hindex = hash;
+	fh->p_auth = auth;
+
 found:
-	if (rap != &rab->pb_head) {
-		*rap = ra->p_next;
-		ra->p_next = rab->pb_head;
-		rab->pb_head = ra;
+	if (fhp != &fhb->pb_head) {
+		*fhp = fh->p_next;
+		fh->p_next = fhb->pb_head;
+		fhb->pb_head = fh;
 	}
-	ra->p_count++;
+
+	atomic_inc(&fh->p_count);
 	nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
-	spin_unlock(&rab->pb_lock);
-	return ra;
+	spin_unlock(&fhb->pb_lock);
+
+	return fh;
 }
 
 /*
@@ -892,7 +1079,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st
 	      loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
 {
 	struct inode *inode;
-	struct raparms	*ra;
 	mm_segment_t	oldfs;
 	__be32		err;
 	int		host_err;
@@ -903,11 +1089,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st
 	if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
 		goto out;
 
-	/* Get readahead parameters */
-	ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
-
-	if (ra && ra->p_set)
-		file->f_ra = ra->p_ra;
 
 	if (file->f_op->splice_read && rqstp->rq_splice_ok) {
 		struct splice_desc sd = {
@@ -926,16 +1107,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st
 		set_fs(oldfs);
 	}
 
-	/* Write back readahead params */
-	if (ra) {
-		struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
-		spin_lock(&rab->pb_lock);
-		ra->p_ra = file->f_ra;
-		ra->p_set = 1;
-		ra->p_count--;
-		spin_unlock(&rab->pb_lock);
-	}
-
 	if (host_err >= 0) {
 		nfsdstats.io_read += host_err;
 		*count = host_err;
@@ -1078,12 +1249,30 @@ nfsd_read(struct svc_rqst *rqstp, struct
 			goto out;
 		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
 	} else {
-		err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
-		if (err)
-			goto out;
-		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
-		nfsd_close(file);
+		struct fhcache *fh;
+
+		/* Check if this fh is cached */
+		fh = nfsd_get_fhcache(fhp->fh_handle.fh_auth[3]);
+		if (fh && fh->p_filp) {
+			/* Got cached values */
+			file = fh->p_filp;
+			fhp->fh_dentry = file->f_dentry;
+			fhp->fh_export = fh->p_exp;
+			err = fh_verify(rqstp, fhp, S_IFREG, NFSD_MAY_READ);
+		} else {
+			/* Nothing in cache */
+			err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ,
+					&file);
+		}
+
+		if (!err)
+			err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen,
+					    count);
+
+		/* Update cached values if required, and clean up */
+		fh_cache_upd(fh, file, fhp->fh_export);
 	}
+
 out:
 	return err;
 }
@@ -1791,6 +1980,39 @@ nfsd_unlink(struct svc_rqst *rqstp, stru
 		goto out_nfserr;
 
 	if (type != S_IFDIR) { /* It's UNLINK */
+		int i, found = 0;
+
+		for (i = 0 ; i < FHPARM_HASH_SIZE && !found; i++) {
+			struct fhcache_hbucket *fhb = &fhcache_hash[i];
+			struct fhcache *fh;
+
+			spin_lock(&fhb->pb_lock);
+			for (fh = fhb->pb_head; fh; fh = fh->p_next) {
+				if (fh->p_filp &&
+				    fh->p_filp->f_dentry == rdentry) {
+					/* Found the entry for removed file */
+					struct file *file;
+					struct svc_export *exp;
+
+					fh_get_cached_entries(fh, &file, &exp);
+
+					spin_lock(&k_nfsd_lock);
+					list_del(&fh->p_list);
+					spin_unlock(&k_nfsd_lock);
+
+					spin_unlock(&fhb->pb_lock);
+
+					/* Drop reference to this entry */
+					fh_cache_put(file, exp);
+
+					spin_lock(&fhb->pb_lock);
+					found = 1;
+					break;
+				}
+			}
+			spin_unlock(&fhb->pb_lock);
+		}
+
 #ifdef MSNFS
 	if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
 		(atomic_read(&rdentry->d_count) > 1)) {
@@ -2061,23 +2283,36 @@ nfsd_permission(struct svc_rqst *rqstp,
 void
 nfsd_racache_shutdown(void)
 {
-	struct raparms *raparm, *last_raparm;
 	unsigned int i;
-	dprintk("nfsd: freeing readahead buffers.\n");
+	dprintk("nfsd: freeing FH buffers.\n");
 
-	for (i = 0; i < RAPARM_HASH_SIZE; i++) {
-		raparm = raparm_hash[i].pb_head;
-		while(raparm) {
-			last_raparm = raparm;
-			raparm = raparm->p_next;
-			kfree(last_raparm);
+	/* Stop the daemon and free the list entries */
+	kthread_stop(k_nfsd_task);
+	k_nfsd_task = NULL;
+
+	for (i = 0; i < FHPARM_HASH_SIZE; i++) {
+		struct fhcache *fhcache, *last_fhcache;
+
+		fhcache = fhcache_hash[i].pb_head;
+		while(fhcache) {
+			last_fhcache = fhcache;
+			if (fhcache->p_filp) {
+				struct file *file;
+				struct svc_export *exp;
+
+				fh_get_cached_entries(fhcache, &file, &exp);
+				list_del(&fhcache->p_list);
+				fh_cache_put(file, exp);
+			}
+			fhcache = fhcache->p_next;
+			kfree(last_fhcache);
 		}
-		raparm_hash[i].pb_head = NULL;
+		fhcache_hash[i].pb_head = NULL;
 	}
 }
 
 /*
- * Initialize readahead param cache
+ * Initialize file handle cache
  */
 int
 nfsd_racache_init(int cache_size)
@@ -2085,36 +2320,48 @@ nfsd_racache_init(int cache_size)
 	int	i;
 	int	j = 0;
 	int	nperbucket;
-	struct raparms **raparm = NULL;
+	struct fhcache **fhcache = NULL;
 
 
-	if (raparm_hash[0].pb_head)
+	if (fhcache_hash[0].pb_head)
 		return 0;
-	nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
+	nperbucket = DIV_ROUND_UP(cache_size, FHPARM_HASH_SIZE);
 	if (nperbucket < 2)
 		nperbucket = 2;
-	cache_size = nperbucket * RAPARM_HASH_SIZE;
+	cache_size = nperbucket * FHPARM_HASH_SIZE;
 
-	dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
+	dprintk("nfsd: allocating %d file handle cache buffers.\n", cache_size);
 
-	for (i = 0; i < RAPARM_HASH_SIZE; i++) {
-		spin_lock_init(&raparm_hash[i].pb_lock);
+	for (i = 0; i < FHPARM_HASH_SIZE; i++) {
+		spin_lock_init(&fhcache_hash[i].pb_lock);
 
-		raparm = &raparm_hash[i].pb_head;
+		fhcache = &fhcache_hash[i].pb_head;
 		for (j = 0; j < nperbucket; j++) {
-			*raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
-			if (!*raparm)
+			*fhcache = kzalloc(sizeof(struct fhcache), GFP_KERNEL);
+			if (!*fhcache) {
+				dprintk("nfsd: kmalloc failed, freeing file cache buffers\n");
 				goto out_nomem;
-			raparm = &(*raparm)->p_next;
+			}
+			INIT_LIST_HEAD(&(*fhcache)->p_list);
+			fhcache = &(*fhcache)->p_next;
 		}
-		*raparm = NULL;
+		*fhcache = NULL;
 	}
 
 	nfsdstats.ra_size = cache_size;
+
+	INIT_LIST_HEAD(&nfsd_daemon_list);
+	k_nfsd_task = kthread_run(k_nfsd_thread, NULL, "nfsd_cacher");
+
+	if (IS_ERR(k_nfsd_task)) {
+		printk(KERN_ERR "%s: unable to create kernel thread: %ld\n",
+			__FUNCTION__, PTR_ERR(k_nfsd_task));
+		goto out_nomem;
+	}
+
 	return 0;
 
 out_nomem:
-	dprintk("nfsd: kmalloc failed, freeing readahead buffers\n");
 	nfsd_racache_shutdown();
 	return -ENOMEM;
 }
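
The subtle part of the patch is the lifetime rule: a lookup bumps p_count so the
cleanup daemon leaves the entry alone, the post-read update publishes the held
file/export and arms an expiry time, and the daemon later drops the references
once the entry is both expired and unreferenced. Below is a simplified,
userspace-only model of that scheme, for illustration only - it is not the
patch's code. All names (fhc_*) are invented, pthread mutexes stand in for the
per-bucket spinlocks, time(NULL) stands in for jiffies, and a plain free()
stands in for dropping the dentry/export/file references. Entry allocation, the
fh_auth[3] keying, the global nfsd_daemon_list walked by the kthread, the
trylock against updaters, and the unlink-time purge are all left out; the
model's reaper simply walks every bucket.

/*
 * Illustration only: userspace model of the nfsd file-handle cache idea
 * (hashed entries, per-bucket lock, refcount vs. a reaper, expiry time).
 * Hypothetical names; NOT the kernel patch.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

#define FHC_HASH_SIZE	16
#define FHC_HASH_MASK	(FHC_HASH_SIZE - 1)
#define FHC_TTL_SECS	1			/* ~ NFSD_CACHE_JIFFIES */

struct fhc_entry {
	struct fhc_entry *next;
	uint32_t	 auth;		/* hash key, like fh_auth[3] */
	void		*cached;	/* stands in for the held file/export */
	int		 refcount;	/* lookups currently using the entry */
	time_t		 expires;	/* when the reaper may drop 'cached' */
};

struct fhc_bucket {
	struct fhc_entry *head;
	pthread_mutex_t	  lock;
};

static struct fhc_bucket fhc_hash[FHC_HASH_SIZE];

static unsigned int fhc_bucket_of(uint32_t auth)
{
	return (auth * 2654435761u) & FHC_HASH_MASK;	/* any 32-bit mix works */
}

static void fhc_init(void)
{
	unsigned int i;

	for (i = 0; i < FHC_HASH_SIZE; i++)
		pthread_mutex_init(&fhc_hash[i].lock, NULL);
}

/* Lookup: return the entry with its refcount held, or NULL if not present. */
static struct fhc_entry *fhc_get(uint32_t auth)
{
	struct fhc_bucket *b = &fhc_hash[fhc_bucket_of(auth)];
	struct fhc_entry *e;

	pthread_mutex_lock(&b->lock);
	for (e = b->head; e; e = e->next) {
		if (e->auth == auth) {
			e->refcount++;		/* keeps the reaper away */
			break;
		}
	}
	pthread_mutex_unlock(&b->lock);
	return e;
}

/* After a read: publish 'obj' if nothing is cached yet, arm the expiry,
 * and drop the reference taken by fhc_get(). Assumes e is non-NULL. */
static void fhc_upd(struct fhc_entry *e, void *obj)
{
	struct fhc_bucket *b = &fhc_hash[fhc_bucket_of(e->auth)];

	pthread_mutex_lock(&b->lock);
	if (!e->cached && obj) {
		e->cached = obj;
		e->expires = time(NULL) + FHC_TTL_SECS;
	}
	e->refcount--;
	pthread_mutex_unlock(&b->lock);
}

/* Reaper pass: drop cached objects that have expired and are unreferenced. */
static void fhc_reap(void)
{
	time_t now = time(NULL);
	unsigned int i;

	for (i = 0; i < FHC_HASH_SIZE; i++) {
		struct fhc_bucket *b = &fhc_hash[i];
		struct fhc_entry *e;

		pthread_mutex_lock(&b->lock);
		for (e = b->head; e; e = e->next) {
			if (e->cached && e->refcount == 0 && now >= e->expires) {
				free(e->cached);	/* ~ fh_cache_put() */
				e->cached = NULL;
			}
		}
		pthread_mutex_unlock(&b->lock);
	}
}

The point of the model is only to make the refcount/expiry interplay easy to
see: a reader holds the entry across the read, the update republishes the
cached object and re-arms the timer, and the reaper frees the cached object
only when nobody holds it and its time is up - the same ordering the patch
enforces with p_count, p_expires and the nfsd_cacher thread.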