These patches implement wave 3 of the pNFS submission, which encompasses file
layout data server connection, READ I/O, and recovery through the MDS.
Responded to all Version 1 comments.
-->Andy
Changes since Version 1:
Renamed patches from "pnfs: wave3:XXX" to "NFSv4.1:XXX"
0001-NFSv4-remove-CONFIG_NFS_V4-from-nfs_read_data.patch
Patch comment and title adjusted
0002-NFSv4.1-put_layout_hdr-can-remove-nfsi-layout.patch
Unchanged
0003-NFS-move-nfs_client-initialization-into-nfs_get_clie.patch
added init_client rpc_ops
0004-NFSv4.1-send-zero-stateid-seqid-on-v4.1-i-o.patch
0005-NFSv4.1-new-flag-for-state-renewal-check.patch
0006-NFSv4.1-new-flag-for-lease-time-check.patch
0007-NFSv4.1-add-MDS-mount-DS-only-check.patch
Unchanged
0008-NFSv4.1-lseg-refcounting.patch
Changed parameter name ino=>inode
Changed function name put_lseg_common
inlined put_lseg_locked
0009-NFSv4.1-coelesce-across-layout-stripes.patch
0010-NFSv4.1-shift-pnfs_update_layout-locations.patch
Switched the order of the above two patches
Moved pnfs_update_layout to pg_doio/pg_test
switched put_lseg/put_nfs_open_context
0011-NFSv4.1-generic-read.patch
Keep nfs_initiate_read static, don't EXPORT here
Removed bfields Signed-off-by;
Removed Reported-by: Alexandros
0012-NFSv4.1-data-server-connection.patch
Unchanged
0013-NFSv4.1-filelayout-i-o-helpers.patch
Added dense stripe helper function
0014-NFSv4.1-filelayout-read.patch
Declare nfs_initiate_read non-static and EXPORT_SYMBOL_GPL
0015-NFSv4.1-filelayout-async-error-handler.patch
Unchanged
0016-NFSv4.1-move-deviceid-cache-to-filelayout-driver.patch
New patch from Christoph
0017-NFSv4.1-turn-off-pNFS-on-ds-connection-failure.patch
Modified to use filelayout driver deviceid cache
0018-NFSv4.1-lseg-documentation.patch
New patch from Fred.
This whole subthread was not readable as none of the participants knows
how to quote. Please stop posting comments after full quotes and only
quote a couple of lines of context that are relevant to your
contribution. Thanks!
On 2011-02-16 14:55, Fred Isaman wrote:
>
> On Feb 16, 2011, at 2:42 PM, Benny Halevy wrote:
>
>> On 2011-02-15 03:38, [email protected] wrote:
>>> From: Fred Isaman <[email protected]>
>>>
>>> Move the pnfs_update_layout call location to nfs_pageio_do_add_request().
>>> Grab the lseg sent in the doio function to nfs_read_rpcsetup and attach
>>> it to each nfs_read_data so it can be sent to the layout driver.
>>>
>>> @@ -131,10 +132,12 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
>>> zero_user_segment(page, len, PAGE_CACHE_SIZE);
>>>
>>> nfs_list_add_request(new, &one_request);
>>> + lseg = pnfs_update_layout(inode, ctx, IOMODE_READ);
>>> if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
>>> - nfs_pagein_multi(inode, &one_request, 1, len, 0);
>>> + nfs_pagein_multi(inode, &one_request, 1, len, 0, lseg);
>>> else
>>> - nfs_pagein_one(inode, &one_request, 1, len, 0);
>>> + nfs_pagein_one(inode, &one_request, 1, len, 0, lseg);
>>> + put_lseg(lseg);
>>> return 0;
>>> }
>>>
>>> @@ -160,7 +163,8 @@ static void nfs_readpage_release(struct nfs_page *req)
>>> */
>>> static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
>>> const struct rpc_call_ops *call_ops,
>>> - unsigned int count, unsigned int offset)
>>> + unsigned int count, unsigned int offset,
>>> + struct pnfs_layout_segment *lseg)
>>> {
>>> struct inode *inode = req->wb_context->path.dentry->d_inode;
>>> int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
>>> @@ -183,6 +187,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
>>> data->req = req;
>>> data->inode = inode;
>>> data->cred = msg.rpc_cred;
>>> + data->lseg = get_lseg(lseg);
>>>
>>> data->args.fh = NFS_FH(inode);
>>> data->args.offset = req_offset(req) + offset;
>>> @@ -240,7 +245,7 @@ nfs_async_read_error(struct list_head *head)
>>> * won't see the new data until our attribute cache is updated. This is more
>>> * or less conventional NFS client behavior.
>>> */
>>> -static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
>>> +static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
>>> {
>>> struct nfs_page *req = nfs_list_entry(head->next);
>>> struct page *page = req->wb_page;
>>> @@ -266,6 +271,8 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
>>> } while(nbytes != 0);
>>> atomic_set(&req->wb_complete, requests);
>>>
>>> + /* We know lseg==NULL */
Can you provide more details?
If it's always NULL why bother to pass it in?
>>> + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
>>> ClearPageError(page);
>>> offset = 0;
>>> nbytes = count;
>>> @@ -280,12 +287,13 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
>>> if (nbytes < rsize)
>>> rsize = nbytes;
>>> ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
>>> - rsize, offset);
>>> + rsize, offset, lseg);
>>> if (ret == 0)
>>> ret = ret2;
>>> offset += rsize;
>>> nbytes -= rsize;
>>> } while (nbytes != 0);
>>> + put_lseg(lseg);
>>>
>>> return ret;
>>>
>>> @@ -300,7 +308,7 @@ out_bad:
>>> return -ENOMEM;
>>> }
>>>
>>> -static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
>>> +static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
>>> {
>>> struct nfs_page *req;
>>> struct page **pages;
>>> @@ -320,9 +328,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
>>> *pages++ = req->wb_page;
>>> }
>>> req = nfs_list_entry(data->pages.next);
>>> + if ((!lseg) && list_is_singular(&data->pages))
>>> + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
When is lseg NULL, and why does getting it here work better than in nfs_readpage_async?
>>>
>>> - return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
>>> + ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0, lseg);
>>> + put_lseg(lseg);
>>
>> Shouldn't that be done only if pnfs_update_layout was called here?
>> Otherwise, the caller, nfs_readpage_async puts the lseg it passes down.
>>
>
> You are right there is a problem. But it needs to be fixed by removing the put_lseg from nfs_readpage_async.
>
>
If we can avoid getting the lseg in one place and putting it in another that would be better.
Benny
>>> + return ret;
>>> out_bad:
>>> + put_lseg(lseg);
>>
>> I'd unify the common exit path by doing nfs_async_read_error on the error path
>> and then goto out for the common code.
>>
>
> OK.
>
> Fred
>
This should be authored by Christoph
-->Andy
On Feb 15, 2011, at 3:39 AM, [email protected] wrote:
> From: Andy Adamson <[email protected]>
>
> No need for generic cache with only one user.
> Keep a simple hash of deviceids in the filelayout driver.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Acked-by: Andy Adamson <[email protected]>
> ---
> fs/nfs/nfs4filelayout.c | 46 +++-----------
> fs/nfs/nfs4filelayout.h | 8 ++-
> fs/nfs/nfs4filelayoutdev.c | 106 +++++++++++++++++++++++---------
> fs/nfs/pnfs.c | 147 +-------------------------------------------
> fs/nfs/pnfs.h | 48 --------------
> include/linux/nfs_fs_sb.h | 1 -
> 6 files changed, 92 insertions(+), 264 deletions(-)
>
> diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
> index 9ae1a47e..84c7577 100644
> --- a/fs/nfs/nfs4filelayout.c
> +++ b/fs/nfs/nfs4filelayout.c
> @@ -42,32 +42,6 @@ MODULE_DESCRIPTION("The NFSv4 file layout driver");
>
> #define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
>
> -static int
> -filelayout_set_layoutdriver(struct nfs_server *nfss)
> -{
> - int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
> - nfs4_fl_free_deviceid_callback);
> - if (status) {
> - printk(KERN_WARNING "%s: deviceid cache could not be "
> - "initialized\n", __func__);
> - return status;
> - }
> - dprintk("%s: deviceid cache has been initialized successfully\n",
> - __func__);
> - return 0;
> -}
> -
> -/* Clear out the layout by destroying its device list */
> -static int
> -filelayout_clear_layoutdriver(struct nfs_server *nfss)
> -{
> - dprintk("--> %s\n", __func__);
> -
> - if (nfss->nfs_client->cl_devid_cache)
> - pnfs_put_deviceid_cache(nfss->nfs_client);
> - return 0;
> -}
> -
> static loff_t
> filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
> loff_t offset)
> @@ -291,7 +265,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
> }
>
> /* find and reference the deviceid */
> - dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
> + dsaddr = nfs4_fl_find_get_deviceid(id);
> if (dsaddr == NULL) {
> dsaddr = get_device_info(lo->plh_inode, id);
> if (dsaddr == NULL)
> @@ -326,7 +300,7 @@ out:
> dprintk("--> %s returns %d\n", __func__, status);
> return status;
> out_put:
> - pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
> + nfs4_fl_put_deviceid(dsaddr);
> goto out;
> }
>
> @@ -435,12 +409,10 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
> static void
> filelayout_free_lseg(struct pnfs_layout_segment *lseg)
> {
> - struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
> struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
>
> dprintk("--> %s\n", __func__);
> - pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
> - &fl->dsaddr->deviceid);
> + nfs4_fl_put_deviceid(fl->dsaddr);
> _filelayout_free_lseg(fl);
> }
>
> @@ -470,13 +442,11 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
> }
>
> static struct pnfs_layoutdriver_type filelayout_type = {
> - .id = LAYOUT_NFSV4_1_FILES,
> - .name = "LAYOUT_NFSV4_1_FILES",
> - .owner = THIS_MODULE,
> - .set_layoutdriver = filelayout_set_layoutdriver,
> - .clear_layoutdriver = filelayout_clear_layoutdriver,
> - .alloc_lseg = filelayout_alloc_lseg,
> - .free_lseg = filelayout_free_lseg,
> + .id = LAYOUT_NFSV4_1_FILES,
> + .name = "LAYOUT_NFSV4_1_FILES",
> + .owner = THIS_MODULE,
> + .alloc_lseg = filelayout_alloc_lseg,
> + .free_lseg = filelayout_free_lseg,
> .pg_test = filelayout_pg_test,
> .read_pagelist = filelayout_read_pagelist,
> };
> diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
> index 9fef76e..23f1e1e 100644
> --- a/fs/nfs/nfs4filelayout.h
> +++ b/fs/nfs/nfs4filelayout.h
> @@ -56,7 +56,9 @@ struct nfs4_pnfs_ds {
> };
>
> struct nfs4_file_layout_dsaddr {
> - struct pnfs_deviceid_node deviceid;
> + struct hlist_node node;
> + struct nfs4_deviceid deviceid;
> + atomic_t ref;
> u32 stripe_count;
> u8 *stripe_indices;
> u32 ds_num;
> @@ -86,7 +88,6 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
> extern struct nfs_fh *
> nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
>
> -extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
> extern void print_ds(struct nfs4_pnfs_ds *ds);
> extern void print_deviceid(struct nfs4_deviceid *dev_id);
> u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
> @@ -94,7 +95,8 @@ u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
> struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
> u32 ds_idx);
> extern struct nfs4_file_layout_dsaddr *
> -nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
> +nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
> +extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
> struct nfs4_file_layout_dsaddr *
> get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
>
> diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
> index e8496f3..ac38c75 100644
> --- a/fs/nfs/nfs4filelayoutdev.c
> +++ b/fs/nfs/nfs4filelayoutdev.c
> @@ -37,6 +37,30 @@
> #define NFSDBG_FACILITY NFSDBG_PNFS_LD
>
> /*
> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
> + */
> +#define NFS4_FL_DEVICE_ID_HASH_BITS 5
> +#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
> +#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
> +
> +static inline u32
> +nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
> +{
> + unsigned char *cptr = (unsigned char *)id->data;
> + unsigned int nbytes = NFS4_DEVICEID4_SIZE;
> + u32 x = 0;
> +
> + while (nbytes--) {
> + x *= 37;
> + x += *cptr++;
> + }
> + return x & NFS4_FL_DEVICE_ID_HASH_MASK;
> +}
> +
> +static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
> +static DEFINE_SPINLOCK(filelayout_deviceid_lock);
> +
> +/*
> * Data server cache
> *
> * Data servers can be mapped to different device ids.
> @@ -183,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
> struct nfs4_pnfs_ds *ds;
> int i;
>
> - print_deviceid(&dsaddr->deviceid.de_id);
> + print_deviceid(&dsaddr->deviceid);
>
> for (i = 0; i < dsaddr->ds_num; i++) {
> ds = dsaddr->ds_list[i];
> @@ -200,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
> kfree(dsaddr);
> }
>
> -void
> -nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
> -{
> - struct nfs4_file_layout_dsaddr *dsaddr =
> - container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
> -
> - nfs4_fl_free_deviceid(dsaddr);
> -}
> -
> static struct nfs4_pnfs_ds *
> nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
> {
> @@ -357,7 +372,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
> dsaddr->stripe_count = cnt;
> dsaddr->ds_num = num;
>
> - memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
> + memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
>
> /* Go back an read stripe indices */
> p = indicesp;
> @@ -407,28 +422,37 @@ out_err:
> }
>
> /*
> - * Decode the opaque device specified in 'dev'
> - * and add it to the list of available devices.
> - * If the deviceid is already cached, nfs4_add_deviceid will return
> - * a pointer to the cached struct and throw away the new.
> + * Decode the opaque device specified in 'dev' and add it to the cache of
> + * available devices.
> */
> -static struct nfs4_file_layout_dsaddr*
> +static struct nfs4_file_layout_dsaddr *
> decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
> {
> - struct nfs4_file_layout_dsaddr *dsaddr;
> - struct pnfs_deviceid_node *d;
> + struct nfs4_file_layout_dsaddr *d, *new;
> + long hash;
>
> - dsaddr = decode_device(inode, dev);
> - if (!dsaddr) {
> + new = decode_device(inode, dev);
> + if (!new) {
> printk(KERN_WARNING "%s: Could not decode or add device\n",
> __func__);
> return NULL;
> }
>
> - d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
> - &dsaddr->deviceid);
> + spin_lock(&filelayout_deviceid_lock);
> + d = nfs4_fl_find_get_deviceid(&new->deviceid);
> + if (d) {
> + spin_unlock(&filelayout_deviceid_lock);
> + nfs4_fl_free_deviceid(new);
> + return d;
> + }
> +
> + INIT_HLIST_NODE(&new->node);
> + atomic_set(&new->ref, 1);
> + hash = nfs4_fl_deviceid_hash(&new->deviceid);
> + hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
> + spin_unlock(&filelayout_deviceid_lock);
>
> - return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
> + return new;
> }
>
> /*
> @@ -503,14 +527,38 @@ out_free:
> return dsaddr;
> }
>
> -struct nfs4_file_layout_dsaddr *
> -nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
> +void
> +nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
> {
> - struct pnfs_deviceid_node *d;
> + if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
> + hlist_del_rcu(&dsaddr->node);
> + spin_unlock(&filelayout_deviceid_lock);
> +
> + synchronize_rcu();
> + nfs4_fl_free_deviceid(dsaddr);
> + }
> +}
>
> - d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
> - return (d == NULL) ? NULL :
> - container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
> +struct nfs4_file_layout_dsaddr *
> +nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
> +{
> + struct nfs4_file_layout_dsaddr *d;
> + struct hlist_node *n;
> + long hash = nfs4_fl_deviceid_hash(id);
> +
> +
> + rcu_read_lock();
> + hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
> + if (!memcmp(&d->deviceid, id, sizeof(*id))) {
> + if (!atomic_inc_not_zero(&d->ref))
> + goto fail;
> + rcu_read_unlock();
> + return d;
> + }
> + }
> +fail:
> + rcu_read_unlock();
> + return NULL;
> }
>
> /*
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 92c55a4..349a378 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -75,10 +75,8 @@ find_pnfs_driver(u32 id)
> void
> unset_pnfs_layoutdriver(struct nfs_server *nfss)
> {
> - if (nfss->pnfs_curr_ld) {
> - nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
> + if (nfss->pnfs_curr_ld)
> module_put(nfss->pnfs_curr_ld->owner);
> - }
> nfss->pnfs_curr_ld = NULL;
> }
>
> @@ -116,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
> goto out_no_driver;
> }
> server->pnfs_curr_ld = ld_type;
> - if (ld_type->set_layoutdriver(server)) {
> - printk(KERN_ERR
> - "%s: Error initializing mount point for layout driver %u.\n",
> - __func__, id);
> - module_put(ld_type->owner);
> - goto out_no_driver;
> - }
> +
> dprintk("%s: pNFS module for %u set\n", __func__, id);
> return;
>
> @@ -909,138 +901,3 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
> dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
> return trypnfs;
> }
> -
> -/*
> - * Device ID cache. Currently supports one layout type per struct nfs_client.
> - * Add layout type to the lookup key to expand to support multiple types.
> - */
> -int
> -pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
> - void (*free_callback)(struct pnfs_deviceid_node *))
> -{
> - struct pnfs_deviceid_cache *c;
> -
> - c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
> - if (!c)
> - return -ENOMEM;
> - spin_lock(&clp->cl_lock);
> - if (clp->cl_devid_cache != NULL) {
> - atomic_inc(&clp->cl_devid_cache->dc_ref);
> - dprintk("%s [kref [%d]]\n", __func__,
> - atomic_read(&clp->cl_devid_cache->dc_ref));
> - kfree(c);
> - } else {
> - /* kzalloc initializes hlists */
> - spin_lock_init(&c->dc_lock);
> - atomic_set(&c->dc_ref, 1);
> - c->dc_free_callback = free_callback;
> - clp->cl_devid_cache = c;
> - dprintk("%s [new]\n", __func__);
> - }
> - spin_unlock(&clp->cl_lock);
> - return 0;
> -}
> -EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
> -
> -/*
> - * Called from pnfs_layoutdriver_type->free_lseg
> - * last layout segment reference frees deviceid
> - */
> -void
> -pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
> - struct pnfs_deviceid_node *devid)
> -{
> - struct nfs4_deviceid *id = &devid->de_id;
> - struct pnfs_deviceid_node *d;
> - struct hlist_node *n;
> - long h = nfs4_deviceid_hash(id);
> -
> - dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
> - if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
> - return;
> -
> - hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
> - if (!memcmp(&d->de_id, id, sizeof(*id))) {
> - hlist_del_rcu(&d->de_node);
> - spin_unlock(&c->dc_lock);
> - synchronize_rcu();
> - c->dc_free_callback(devid);
> - return;
> - }
> - spin_unlock(&c->dc_lock);
> - /* Why wasn't it found in the list? */
> - BUG();
> -}
> -EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
> -
> -/* Find and reference a deviceid */
> -struct pnfs_deviceid_node *
> -pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
> -{
> - struct pnfs_deviceid_node *d;
> - struct hlist_node *n;
> - long hash = nfs4_deviceid_hash(id);
> -
> - dprintk("--> %s hash %ld\n", __func__, hash);
> - rcu_read_lock();
> - hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
> - if (!memcmp(&d->de_id, id, sizeof(*id))) {
> - if (!atomic_inc_not_zero(&d->de_ref)) {
> - goto fail;
> - } else {
> - rcu_read_unlock();
> - return d;
> - }
> - }
> - }
> -fail:
> - rcu_read_unlock();
> - return NULL;
> -}
> -EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
> -
> -/*
> - * Add a deviceid to the cache.
> - * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
> - */
> -struct pnfs_deviceid_node *
> -pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
> -{
> - struct pnfs_deviceid_node *d;
> - long hash = nfs4_deviceid_hash(&new->de_id);
> -
> - dprintk("--> %s hash %ld\n", __func__, hash);
> - spin_lock(&c->dc_lock);
> - d = pnfs_find_get_deviceid(c, &new->de_id);
> - if (d) {
> - spin_unlock(&c->dc_lock);
> - dprintk("%s [discard]\n", __func__);
> - c->dc_free_callback(new);
> - return d;
> - }
> - INIT_HLIST_NODE(&new->de_node);
> - atomic_set(&new->de_ref, 1);
> - hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
> - spin_unlock(&c->dc_lock);
> - dprintk("%s [new]\n", __func__);
> - return new;
> -}
> -EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
> -
> -void
> -pnfs_put_deviceid_cache(struct nfs_client *clp)
> -{
> - struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
> -
> - dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref));
> - if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
> - int i;
> - /* Verify cache is empty */
> - for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
> - BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
> - clp->cl_devid_cache = NULL;
> - spin_unlock(&clp->cl_lock);
> - kfree(local);
> - }
> -}
> -EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 585023f..acbb778 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -68,8 +68,6 @@ struct pnfs_layoutdriver_type {
> const u32 id;
> const char *name;
> struct module *owner;
> - int (*set_layoutdriver) (struct nfs_server *);
> - int (*clear_layoutdriver) (struct nfs_server *);
> struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
> void (*free_lseg) (struct pnfs_layout_segment *lseg);
>
> @@ -106,52 +104,6 @@ struct pnfs_device {
> unsigned int pglen;
> };
>
> -/*
> - * Device ID RCU cache. A device ID is unique per client ID and layout type.
> - */
> -#define NFS4_DEVICE_ID_HASH_BITS 5
> -#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
> -#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
> -
> -static inline u32
> -nfs4_deviceid_hash(struct nfs4_deviceid *id)
> -{
> - unsigned char *cptr = (unsigned char *)id->data;
> - unsigned int nbytes = NFS4_DEVICEID4_SIZE;
> - u32 x = 0;
> -
> - while (nbytes--) {
> - x *= 37;
> - x += *cptr++;
> - }
> - return x & NFS4_DEVICE_ID_HASH_MASK;
> -}
> -
> -struct pnfs_deviceid_node {
> - struct hlist_node de_node;
> - struct nfs4_deviceid de_id;
> - atomic_t de_ref;
> -};
> -
> -struct pnfs_deviceid_cache {
> - spinlock_t dc_lock;
> - atomic_t dc_ref;
> - void (*dc_free_callback)(struct pnfs_deviceid_node *);
> - struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
> -};
> -
> -extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
> - void (*free_callback)(struct pnfs_deviceid_node *));
> -extern void pnfs_put_deviceid_cache(struct nfs_client *);
> -extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
> - struct pnfs_deviceid_cache *,
> - struct nfs4_deviceid *);
> -extern struct pnfs_deviceid_node *pnfs_add_deviceid(
> - struct pnfs_deviceid_cache *,
> - struct pnfs_deviceid_node *);
> -extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
> - struct pnfs_deviceid_node *devid);
> -
> extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
> extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
>
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 2669a9a..7f71698 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -81,7 +81,6 @@ struct nfs_client {
> u32 cl_exchange_flags;
> struct nfs4_session *cl_session; /* sharred session */
> struct list_head cl_layouts;
> - struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
> #endif /* CONFIG_NFS_V4_1 */
>
> #ifdef CONFIG_NFS_FSCACHE
> --
> 1.7.2.3
>
From: Andy Adamson <[email protected]>
Attempt a pNFS file layout read by setting up the nfs_read_data struct and
calling nfs_initiate_read with the data server rpc client and the
filelayout rpc call ops.
Error handling is implemented in a subsequent patch.
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Dean Hildebrand <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Mingyang Guo <[email protected]>
Signed-off-by: Oleg Drokin <[email protected]>
Signed-off-by: Ricardo Labiaga <[email protected]>
Tested-by: Guo Mingyang <[email protected]>
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/internal.h | 2 +
fs/nfs/nfs4_fs.h | 3 ++
fs/nfs/nfs4filelayout.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++
fs/nfs/nfs4proc.c | 3 +-
fs/nfs/read.c | 3 +-
include/linux/nfs_xdr.h | 1 +
6 files changed, 90 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5cc9201..5e9df99 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -271,6 +271,8 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
#endif
/* read.c */
+extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops);
extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
/* write.c */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 5dc378e..457b1fe 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -252,6 +252,9 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
extern int nfs4_setup_sequence(const struct nfs_server *server,
struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
int cache_reply, struct rpc_task *task);
+extern int nfs41_setup_sequence(struct nfs4_session *session,
+ struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
+ int cache_reply, struct rpc_task *task);
extern void nfs4_destroy_session(struct nfs4_session *session);
extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
extern int nfs4_proc_create_session(struct nfs_client *);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index a2cde39..f421ef0 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -101,6 +101,85 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
}
/*
+ * Call ops for the async read/write cases
+ * In the case of dense layouts, the offset needs to be reset to its
+ * original value.
+ */
+static void filelayout_read_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+ if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
+ &rdata->args.seq_args, &rdata->res.seq_res,
+ 0, task))
+ return;
+
+ rpc_call_start(task);
+}
+
+static void filelayout_read_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+ dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
+
+ /* Note this may cause RPC to be resent */
+ rdata->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_read_release(void *data)
+{
+ struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+ rdata->mds_ops->rpc_release(data);
+}
+
+struct rpc_call_ops filelayout_read_call_ops = {
+ .rpc_call_prepare = filelayout_read_prepare,
+ .rpc_call_done = filelayout_read_call_done,
+ .rpc_release = filelayout_read_release,
+};
+
+static enum pnfs_try_status
+filelayout_read_pagelist(struct nfs_read_data *data)
+{
+ struct pnfs_layout_segment *lseg = data->lseg;
+ struct nfs4_pnfs_ds *ds;
+ loff_t offset = data->args.offset;
+ u32 j, idx;
+ struct nfs_fh *fh;
+
+ dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
+ __func__, data->inode->i_ino,
+ data->args.pgbase, (size_t)data->args.count, offset);
+
+ /* Retrieve the correct rpc_client for the byte range */
+ j = nfs4_fl_calc_j_index(lseg, offset);
+ idx = nfs4_fl_calc_ds_index(lseg, j);
+ ds = nfs4_fl_prepare_ds(lseg, idx);
+ if (!ds) {
+ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+ return PNFS_NOT_ATTEMPTED;
+ }
+ dprintk("%s USE DS:ip %x %hu\n", __func__,
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+
+ /* No multipath support. Use first DS */
+ data->ds_clp = ds->ds_clp;
+ fh = nfs4_fl_select_ds_fh(lseg, j);
+ if (fh)
+ data->args.fh = fh;
+
+ data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+ data->mds_offset = offset;
+
+ /* Perform an asynchronous read to ds */
+ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
+ &filelayout_read_call_ops);
+ return PNFS_ATTEMPTED;
+}
+
+/*
* filelayout_check_layout()
*
* Make sure layout segment parameters are sane WRT the device.
@@ -320,6 +399,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.alloc_lseg = filelayout_alloc_lseg,
.free_lseg = filelayout_free_lseg,
.pg_test = filelayout_pg_test,
+ .read_pagelist = filelayout_read_pagelist,
};
static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index df7d617..64fb39b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -505,7 +505,7 @@ out:
return ret_id;
}
-static int nfs41_setup_sequence(struct nfs4_session *session,
+int nfs41_setup_sequence(struct nfs4_session *session,
struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res,
int cache_reply,
@@ -571,6 +571,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
res->sr_status = 1;
return 0;
}
+EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
int nfs4_setup_sequence(const struct nfs_server *server,
struct nfs4_sequence_args *args,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 5e6f7cc..5fc4ecc 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -160,7 +160,7 @@ static void nfs_readpage_release(struct nfs_page *req)
nfs_release_request(req);
}
-static int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
const struct rpc_call_ops *call_ops)
{
struct inode *inode = data->inode;
@@ -198,6 +198,7 @@ static int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
rpc_put_task(task);
return 0;
}
+EXPORT_SYMBOL_GPL(nfs_initiate_read);
/*
* Set up the NFS read request struct
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index c66ff7f..b63faef 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1020,6 +1020,7 @@ struct nfs_read_data {
struct pnfs_layout_segment *lseg;
struct nfs_client *ds_clp; /* pNFS data server */
const struct rpc_call_ops *mds_ops;
+ __u64 mds_offset;
struct page *page_array[NFS_PAGEVEC_SIZE];
};
--
1.7.2.3
From: Andy Adamson <[email protected]>
If a data server is unavailable, go through MDS.
Mark the deviceid containing the data server as a negative cache entry.
Do not try to connect to any data server on a deviceid marked as a negative
cache entry. Mark any layout that tries to use the marked deviceid as failed.
Inodes with a layout marked as failed will not use the layout for I/O, and will
not perform any more layoutgets.
Inodes without a layout will still do layoutget, but the layout will get
marked immediately.
Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/nfs4filelayout.c | 4 +++-
fs/nfs/nfs4filelayout.h | 4 ++++
fs/nfs/nfs4filelayoutdev.c | 28 ++++++++++++++++++++++++----
fs/nfs/pnfs.c | 9 +++++----
4 files changed, 36 insertions(+), 9 deletions(-)
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 84c7577..eba9873 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -211,7 +211,9 @@ filelayout_read_pagelist(struct nfs_read_data *data)
idx = nfs4_fl_calc_ds_index(lseg, j);
ds = nfs4_fl_prepare_ds(lseg, idx);
if (!ds) {
- printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+ /* Either layout fh index faulty, or ds connect failed */
+ set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+ set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
return PNFS_NOT_ATTEMPTED;
}
dprintk("%s USE DS:ip %x %hu\n", __func__,
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 23f1e1e..ee0c907 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -55,10 +55,14 @@ struct nfs4_pnfs_ds {
atomic_t ds_count;
};
+/* nfs4_file_layout_dsaddr flags */
+#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
+
struct nfs4_file_layout_dsaddr {
struct hlist_node node;
struct nfs4_deviceid deviceid;
atomic_t ref;
+ unsigned long flags;
u32 stripe_count;
u8 *stripe_indices;
u32 ds_num;
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index ac38c75..433204f 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -602,6 +602,21 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
return flseg->fh_array[i];
}
+static void
+filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
+ int err, u32 ds_addr)
+{
+ u32 *p = (u32 *)&dsaddr->deviceid;
+
+ printk(KERN_ERR "NFS: data server %x connection error %d."
+ " Deviceid [%x%x%x%x] marked out of use.\n",
+ ds_addr, err, p[0], p[1], p[2], p[3]);
+
+ spin_lock(&filelayout_deviceid_lock);
+ dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
+ spin_unlock(&filelayout_deviceid_lock);
+}
+
struct nfs4_pnfs_ds *
nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
{
@@ -615,13 +630,18 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
}
if (!ds->ds_clp) {
+ struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
int err;
- err = nfs4_ds_connect(NFS_SERVER(lseg->pls_layout->plh_inode),
- dsaddr->ds_list[ds_idx]);
+ if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) {
+ /* Already tried to connect, don't try again */
+ dprintk("%s Deviceid marked out of use\n", __func__);
+ return NULL;
+ }
+ err = nfs4_ds_connect(s, ds);
if (err) {
- printk(KERN_ERR "%s nfs4_ds_connect error %d\n",
- __func__, err);
+ filelayout_mark_devid_negative(dsaddr, err,
+ ntohl(ds->ds_ip_addr));
return NULL;
}
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 349a378..a7ea646 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -739,15 +739,16 @@ pnfs_update_layout(struct inode *ino,
dprintk("%s matches recall, use MDS\n", __func__);
goto out_unlock;
}
- /* Check to see if the layout for the given range already exists */
- lseg = pnfs_find_lseg(lo, iomode);
- if (lseg)
- goto out_unlock;
/* if LAYOUTGET already failed once we don't try again */
if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
goto out_unlock;
+ /* Check to see if the layout for the given range already exists */
+ lseg = pnfs_find_lseg(lo, iomode);
+ if (lseg)
+ goto out_unlock;
+
if (pnfs_layoutgets_blocked(lo, NULL, 0))
goto out_unlock;
atomic_inc(&lo->plh_outstanding);
--
1.7.2.3
On Wed, Feb 16, 2011 at 3:08 PM, Benny Halevy <[email protected]> wrote:
> On 2011-02-16 14:55, Fred Isaman wrote:
>>
>> On Feb 16, 2011, at 2:42 PM, Benny Halevy wrote:
>>
>>> On 2011-02-15 03:38, [email protected] wrote:
>>>> From: Fred Isaman <[email protected]>
>>>>
>>>> Move the pnfs_update_layout call location to nfs_pageio_do_add_request().
>>>> Grab the lseg sent in the doio function to nfs_read_rpcsetup and attach
>>>> it to each nfs_read_data so it can be sent to the layout driver.
>>>>
>>>> @@ -131,10 +132,12 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
>>>> ? ? ? ? ? ? zero_user_segment(page, len, PAGE_CACHE_SIZE);
>>>>
>>>> ? ? nfs_list_add_request(new, &one_request);
>>>> + ? lseg = pnfs_update_layout(inode, ctx, IOMODE_READ);
>>>> ? ? if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
>>>> - ? ? ? ? ? nfs_pagein_multi(inode, &one_request, 1, len, 0);
>>>> + ? ? ? ? ? nfs_pagein_multi(inode, &one_request, 1, len, 0, lseg);
>>>> ? ? else
>>>> - ? ? ? ? ? nfs_pagein_one(inode, &one_request, 1, len, 0);
>>>> + ? ? ? ? ? nfs_pagein_one(inode, &one_request, 1, len, 0, lseg);
>>>> + ? put_lseg(lseg);
>>>> ? ? return 0;
>>>> }
>>>>
>>>> @@ -160,7 +163,8 @@ static void nfs_readpage_release(struct nfs_page *req)
>>>> ?*/
>>>> static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
>>>> ? ? ? ? ? ? const struct rpc_call_ops *call_ops,
>>>> - ? ? ? ? ? unsigned int count, unsigned int offset)
>>>> + ? ? ? ? ? unsigned int count, unsigned int offset,
>>>> + ? ? ? ? ? struct pnfs_layout_segment *lseg)
>>>> {
>>>> ? ? struct inode *inode = req->wb_context->path.dentry->d_inode;
>>>> ? ? int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
>>>> @@ -183,6 +187,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
>>>> ? ? data->req ? ? ? ? = req;
>>>> ? ? data->inode ? ? ? = inode;
>>>> ? ? data->cred ? ? ? ?= msg.rpc_cred;
>>>> + ? data->lseg ? ? ? ?= get_lseg(lseg);
>>>>
>>>> ? ? data->args.fh ? ? = NFS_FH(inode);
>>>> ? ? data->args.offset = req_offset(req) + offset;
>>>> @@ -240,7 +245,7 @@ nfs_async_read_error(struct list_head *head)
>>>> ?* won't see the new data until our attribute cache is updated. ?This is more
>>>> ?* or less conventional NFS client behavior.
>>>> ?*/
>>>> -static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
>>>> +static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
>>>> {
>>>> ? ? struct nfs_page *req = nfs_list_entry(head->next);
>>>> ? ? struct page *page = req->wb_page;
>>>> @@ -266,6 +271,8 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
>>>> ? ? } while(nbytes != 0);
>>>> ? ? atomic_set(&req->wb_complete, requests);
>>>>
>>>> + ? /* We know lseg==NULL */
>
> Can you provide more details?
> If it's always NULL why bother to pass it in?
>
>>>> + ? lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
>>>> ? ? ClearPageError(page);
>>>> ? ? offset = 0;
>>>> ? ? nbytes = count;
>>>> @@ -280,12 +287,13 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
>>>> ? ? ? ? ? ? if (nbytes < rsize)
>>>> ? ? ? ? ? ? ? ? ? ? rsize = nbytes;
>>>> ? ? ? ? ? ? ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
>>>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? rsize, offset);
>>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?rsize, offset, lseg);
>>>> ? ? ? ? ? ? if (ret == 0)
>>>> ? ? ? ? ? ? ? ? ? ? ret = ret2;
>>>> ? ? ? ? ? ? offset += rsize;
>>>> ? ? ? ? ? ? nbytes -= rsize;
>>>> ? ? } while (nbytes != 0);
>>>> + ? put_lseg(lseg);
>>>>
>>>> ? ? return ret;
>>>>
>>>> @@ -300,7 +308,7 @@ out_bad:
>>>> ? ? return -ENOMEM;
>>>> }
>>>>
>>>> -static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
>>>> +static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
>>>> {
>>>> ? ? struct nfs_page ? ? ? ? *req;
>>>> ? ? struct page ? ? ? ? ? ? **pages;
>>>> @@ -320,9 +328,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
>>>> ? ? ? ? ? ? *pages++ = req->wb_page;
>>>> ? ? }
>>>> ? ? req = nfs_list_entry(data->pages.next);
>>>> + ? if ((!lseg) && list_is_singular(&data->pages))
>>>> + ? ? ? ? ? lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
>
> When is lseg NULL, and why does getting it here work better than in nfs_readpage_async?
>
>>>>
>>>> - ? return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
>>>> + ? ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0, lseg);
>>>> + ? put_lseg(lseg);
>>>
>>> Shouldn't that be done only if pnfs_update_layout was called here?
>>> Otherwise, the caller, nfs_readpage_async puts the lseg it passes down.
>>>
>>
>> You are right there is a problem. ?But it needs to be fixed by removing the put_lseg from nfs_readpage_async.
>>
>>
>
> If we can avoid getting the lseg in one place and putting it in another that would be better.
>
> Benny
>
I agree, but I don't see how it is possible with the current code,
where the pnfs_update_layout occurs in pg_test.
Fred
>>>> + ? return ret;
>>>> out_bad:
>>>> + ? put_lseg(lseg);
>>>
>>> I'd unify the common exit path by doing nfs_async_read_error on the error path
>>> and then goto out for the common code.
>>>
>>
>> OK.
>>
>> Fred
>>
>
From: Fred Isaman <[email protected]>
Move the pnfs_update_layout call location to nfs_pageio_do_add_request().
Grab the lseg sent in the doio function to nfs_read_rpcsetup and attach
it to each nfs_read_data so it can be sent to the layout driver.
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Dean Hildebrand <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Boaz Harrosh <[email protected]>
Signed-off-by: Oleg Drokin <[email protected]>
Signed-off-by: Tao Guo <[email protected]>
---
fs/nfs/file.c | 4 ----
fs/nfs/pagelist.c | 6 ++++--
fs/nfs/pnfs.c | 27 ++++++++++++++++-----------
fs/nfs/pnfs.h | 1 +
fs/nfs/read.c | 36 ++++++++++++++++++++++++------------
fs/nfs/write.c | 4 ++--
include/linux/nfs_page.h | 4 ++--
include/linux/nfs_xdr.h | 1 +
8 files changed, 50 insertions(+), 33 deletions(-)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7bf029e..d85a534 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -387,10 +387,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
file->f_path.dentry->d_name.name,
mapping->host->i_ino, len, (long long) pos);
- pnfs_update_layout(mapping->host,
- nfs_file_open_context(file),
- IOMODE_RW);
-
start:
/*
* Prevent starvation issues if someone is doing a consistency
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 9b9a65c..b49cb4b 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -20,6 +20,7 @@
#include <linux/nfs_mount.h>
#include "internal.h"
+#include "pnfs.h"
static struct kmem_cache *nfs_page_cachep;
@@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req)
*/
void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
struct inode *inode,
- int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
+ int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *),
size_t bsize,
int io_flags)
{
@@ -315,7 +316,8 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
nfs_page_array_len(desc->pg_base,
desc->pg_count),
desc->pg_count,
- desc->pg_ioflags);
+ desc->pg_ioflags,
+ desc->pg_lseg);
if (error < 0)
desc->pg_error = error;
else
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d12f463..a09e3a0 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -245,7 +245,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg)
rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
}
-static void
+void
put_lseg(struct pnfs_layout_segment *lseg)
{
struct inode *inode;
@@ -784,7 +784,6 @@ pnfs_update_layout(struct inode *ino,
out:
dprintk("%s end, state 0x%lx lseg %p\n", __func__,
nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
- put_lseg(lseg); /* STUB - callers currently ignore return value */
return lseg;
out_unlock:
spin_unlock(&ino->i_lock);
@@ -858,23 +857,29 @@ out_forget_reply:
goto out;
}
-static void
-pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio)
+static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *prev,
+ struct nfs_page *req)
{
- struct pnfs_layoutdriver_type *ld;
-
- ld = NFS_SERVER(inode)->pnfs_curr_ld;
- pgio->pg_test = (ld ? ld->pg_test : NULL);
+ if (pgio->pg_count == prev->wb_bytes) {
+ /* This is first coelesce call for a series of nfs_pages */
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ prev->wb_context,
+ IOMODE_READ);
+ }
+ return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
}
/*
* rsize is already set by caller to MDS rsize.
*/
void
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
- struct inode *inode)
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
{
- pnfs_set_pg_test(inode, pgio);
+ struct pnfs_layoutdriver_type *ld;
+
+ ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
}
/*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index db52d96..5107d14 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -151,6 +151,7 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
/* pnfs.c */
void get_layout_hdr(struct pnfs_layout_hdr *lo);
+void put_lseg(struct pnfs_layout_segment *lseg);
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
enum pnfs_iomode access_type);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2a27659..7896e3d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -20,17 +20,17 @@
#include <linux/nfs_page.h>
#include <asm/system.h>
+#include "pnfs.h"
#include "nfs4_fs.h"
#include "internal.h"
#include "iostat.h"
#include "fscache.h"
-#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
-static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int);
-static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int);
+static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *);
+static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *);
static const struct rpc_call_ops nfs_read_partial_ops;
static const struct rpc_call_ops nfs_read_full_ops;
@@ -69,6 +69,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
static void nfs_readdata_release(struct nfs_read_data *rdata)
{
+ put_lseg(rdata->lseg);
put_nfs_open_context(rdata->args.context);
nfs_readdata_free(rdata);
}
@@ -117,11 +118,11 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
LIST_HEAD(one_request);
struct nfs_page *new;
unsigned int len;
+ struct pnfs_layout_segment *lseg;
len = nfs_page_length(page);
if (len == 0)
return nfs_return_empty_page(page);
- pnfs_update_layout(inode, ctx, IOMODE_READ);
new = nfs_create_request(ctx, inode, page, 0, len);
if (IS_ERR(new)) {
unlock_page(page);
@@ -131,10 +132,12 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
zero_user_segment(page, len, PAGE_CACHE_SIZE);
nfs_list_add_request(new, &one_request);
+ lseg = pnfs_update_layout(inode, ctx, IOMODE_READ);
if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
- nfs_pagein_multi(inode, &one_request, 1, len, 0);
+ nfs_pagein_multi(inode, &one_request, 1, len, 0, lseg);
else
- nfs_pagein_one(inode, &one_request, 1, len, 0);
+ nfs_pagein_one(inode, &one_request, 1, len, 0, lseg);
+ put_lseg(lseg);
return 0;
}
@@ -160,7 +163,8 @@ static void nfs_readpage_release(struct nfs_page *req)
*/
static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
const struct rpc_call_ops *call_ops,
- unsigned int count, unsigned int offset)
+ unsigned int count, unsigned int offset,
+ struct pnfs_layout_segment *lseg)
{
struct inode *inode = req->wb_context->path.dentry->d_inode;
int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
@@ -183,6 +187,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
data->req = req;
data->inode = inode;
data->cred = msg.rpc_cred;
+ data->lseg = get_lseg(lseg);
data->args.fh = NFS_FH(inode);
data->args.offset = req_offset(req) + offset;
@@ -240,7 +245,7 @@ nfs_async_read_error(struct list_head *head)
* won't see the new data until our attribute cache is updated. This is more
* or less conventional NFS client behavior.
*/
-static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
+static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
{
struct nfs_page *req = nfs_list_entry(head->next);
struct page *page = req->wb_page;
@@ -266,6 +271,8 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
} while(nbytes != 0);
atomic_set(&req->wb_complete, requests);
+ /* We know lseg==NULL */
+ lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
ClearPageError(page);
offset = 0;
nbytes = count;
@@ -280,12 +287,13 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
if (nbytes < rsize)
rsize = nbytes;
ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
- rsize, offset);
+ rsize, offset, lseg);
if (ret == 0)
ret = ret2;
offset += rsize;
nbytes -= rsize;
} while (nbytes != 0);
+ put_lseg(lseg);
return ret;
@@ -300,7 +308,7 @@ out_bad:
return -ENOMEM;
}
-static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
+static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
{
struct nfs_page *req;
struct page **pages;
@@ -320,9 +328,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
*pages++ = req->wb_page;
}
req = nfs_list_entry(data->pages.next);
+ if ((!lseg) && list_is_singular(&data->pages))
+ lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
- return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
+ ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0, lseg);
+ put_lseg(lseg);
+ return ret;
out_bad:
+ put_lseg(lseg);
nfs_async_read_error(head);
return ret;
}
@@ -625,7 +638,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
if (ret == 0)
goto read_complete; /* all pages were read */
- pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
pnfs_pageio_init_read(&pgio, inode);
if (rsize < PAGE_CACHE_SIZE)
nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 6e90cdf..aca0268 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -880,7 +880,7 @@ static void nfs_redirty_request(struct nfs_page *req)
* Generate multiple small requests to write out a single
* contiguous dirty area on one page.
*/
-static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
+static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg)
{
struct nfs_page *req = nfs_list_entry(head->next);
struct page *page = req->wb_page;
@@ -947,7 +947,7 @@ out_bad:
* This is the case if nfs_updatepage detects a conflicting request
* that has been written but not committed.
*/
-static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
+static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg)
{
struct nfs_page *req;
struct page **pages;
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 4eaf27a..ba88ff4 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -59,7 +59,7 @@ struct nfs_pageio_descriptor {
unsigned int pg_base;
struct inode *pg_inode;
- int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int);
+ int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *);
int pg_ioflags;
int pg_error;
struct pnfs_layout_segment *pg_lseg;
@@ -81,7 +81,7 @@ extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst,
pgoff_t idx_start, unsigned int npages, int tag);
extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
struct inode *inode,
- int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
+ int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *),
size_t bsize,
int how);
extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index d159fe7..560923e 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1017,6 +1017,7 @@ struct nfs_read_data {
struct nfs_readargs args;
struct nfs_readres res;
unsigned long timestamp; /* For lease renewal */
+ struct pnfs_layout_segment *lseg;
struct page *page_array[NFS_PAGEVEC_SIZE];
};
--
1.7.2.3
From: Andy Adamson <[email protected]>
Prevents an Oops triggered by a CB_LAYOUTRECALL and LAYOUTGET race on a
pnfs_layout_hdr's first pnfs_layout_segment.
Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/pnfs.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0f5b66f..7d031cd 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -768,7 +768,7 @@ pnfs_update_layout(struct inode *ino,
put_layout_hdr(lo);
out:
dprintk("%s end, state 0x%lx lseg %p\n", __func__,
- nfsi->layout->plh_flags, lseg);
+ nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
return lseg;
out_unlock:
spin_unlock(&ino->i_lock);
--
1.7.2.3
From: Andy Adamson <[email protected]>
Now nfs_get_client returns an nfs_client that is ready to be used, regardless
of whether it was found or newly created.
Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/client.c | 56 +++++++++++++++++++++++++---------------------
fs/nfs/internal.h | 9 +++++++
fs/nfs/nfs3proc.c | 1 +
fs/nfs/nfs4proc.c | 1 +
fs/nfs/proc.c | 1 +
include/linux/nfs_xdr.h | 3 ++
6 files changed, 45 insertions(+), 26 deletions(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index bd3ca32..b9ed2a8 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -481,7 +481,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
* Look up a client by IP address and protocol version
* - creates a new record if one doesn't yet exist
*/
-static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
+static struct nfs_client *
+nfs_get_client(const struct nfs_client_initdata *cl_init,
+ const struct rpc_timeout *timeparms,
+ const char *ip_addr,
+ rpc_authflavor_t authflavour,
+ int noresvport)
{
struct nfs_client *clp, *new = NULL;
int error;
@@ -512,6 +517,13 @@ install_client:
clp = new;
list_add(&clp->cl_share_link, &nfs_client_list);
spin_unlock(&nfs_client_lock);
+
+ error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
+ authflavour, noresvport);
+ if (error < 0) {
+ nfs_put_client(clp);
+ return ERR_PTR(error);
+ }
dprintk("--> nfs_get_client() = %p [new]\n", clp);
return clp;
@@ -767,9 +779,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
/*
* Initialise an NFS2 or NFS3 client
*/
-static int nfs_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const struct nfs_parsed_mount_data *data)
+int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
+ const char *ip_addr, rpc_authflavor_t authflavour,
+ int noresvport)
{
int error;
@@ -784,7 +796,7 @@ static int nfs_init_client(struct nfs_client *clp,
* - RFC 2623, sec 2.3.2
*/
error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
- 0, data->flags & NFS_MOUNT_NORESVPORT);
+ 0, noresvport);
if (error < 0)
goto error;
nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -820,19 +832,17 @@ static int nfs_init_server(struct nfs_server *server,
cl_init.rpc_ops = &nfs_v3_clientops;
#endif
+ nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+ data->timeo, data->retrans);
+
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init);
+ clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX,
+ data->flags & NFS_MOUNT_NORESVPORT);
if (IS_ERR(clp)) {
dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
return PTR_ERR(clp);
}
- nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
- data->timeo, data->retrans);
- error = nfs_init_client(clp, &timeparms, data);
- if (error < 0)
- goto error;
-
server->nfs_client = clp;
/* Initialise the client representation from the mount data */
@@ -1307,11 +1317,11 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
/*
* Initialise an NFS4 client record
*/
-static int nfs4_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr,
- rpc_authflavor_t authflavour,
- int flags)
+int nfs4_init_client(struct nfs_client *clp,
+ const struct rpc_timeout *timeparms,
+ const char *ip_addr,
+ rpc_authflavor_t authflavour,
+ int noresvport)
{
int error;
@@ -1325,7 +1335,7 @@ static int nfs4_init_client(struct nfs_client *clp,
clp->rpc_ops = &nfs_v4_clientops;
error = nfs_create_rpc_client(clp, timeparms, authflavour,
- 1, flags & NFS_MOUNT_NORESVPORT);
+ 1, noresvport);
if (error < 0)
goto error;
strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1378,22 +1388,16 @@ static int nfs4_set_client(struct nfs_server *server,
dprintk("--> nfs4_set_client()\n");
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init);
+ clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour,
+ server->flags & NFS_MOUNT_NORESVPORT);
if (IS_ERR(clp)) {
error = PTR_ERR(clp);
goto error;
}
- error = nfs4_init_client(clp, timeparms, ip_addr, authflavour,
- server->flags);
- if (error < 0)
- goto error_put;
server->nfs_client = clp;
dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
return 0;
-
-error_put:
- nfs_put_client(clp);
error:
dprintk("<-- nfs4_set_client() = xerror %d\n", error);
return error;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index cf9fdbd..4d7b3a9 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -215,6 +215,10 @@ extern struct rpc_procinfo nfs4_procedures[];
/* proc.c */
void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+extern int nfs_init_client(struct nfs_client *clp,
+ const struct rpc_timeout *timeparms,
+ const char *ip_addr, rpc_authflavor_t authflavour,
+ int noresvport);
/* dir.c */
extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -274,6 +278,11 @@ extern int nfs_migrate_page(struct address_space *,
#endif
/* nfs4proc.c */
+extern int nfs4_init_client(struct nfs_client *clp,
+ const struct rpc_timeout *timeparms,
+ const char *ip_addr,
+ rpc_authflavor_t authflavour,
+ int noresvport);
extern int _nfs4_call_sync(struct nfs_server *server,
struct rpc_message *msg,
struct nfs4_sequence_args *args,
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ce939c0..d0c80d8 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -885,4 +885,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.lock = nfs3_proc_lock,
.clear_acl_cache = nfs3_forget_cached_acls,
.close_context = nfs_close_context,
+ .init_client = nfs_init_client,
};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 78936a8..7faec0f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5595,6 +5595,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.clear_acl_cache = nfs4_zap_acl_attr,
.close_context = nfs4_close_context,
.open_context = nfs4_atomic_open,
+ .init_client = nfs4_init_client,
};
static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 77d5e21..b8ec170 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -741,4 +741,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.lock = nfs_proc_lock,
.lock_check_bounds = nfs_lock_check_bounds,
.close_context = nfs_close_context,
+ .init_client = nfs_init_client,
};
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 51bfadb..d159fe7 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1040,6 +1040,7 @@ struct nfs_write_data {
};
struct nfs_access_entry;
+struct nfs_client;
/*
* RPC procedure vector for NFSv2/NFSv3 demuxing
@@ -1104,6 +1105,8 @@ struct nfs_rpc_ops {
struct nfs_open_context *ctx,
int open_flags,
struct iattr *iattr);
+ int (*init_client) (struct nfs_client *, const struct rpc_timeout *,
+ const char *, rpc_authflavor_t, int);
};
/*
--
1.7.2.3
From: Andy Adamson <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
---
Documentation/filesystems/nfs/pnfs.txt | 7 +++++++
1 files changed, 7 insertions(+), 0 deletions(-)
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
index bc0b9cf..983e14a 100644
--- a/Documentation/filesystems/nfs/pnfs.txt
+++ b/Documentation/filesystems/nfs/pnfs.txt
@@ -46,3 +46,10 @@ data server cache
file driver devices refer to data servers, which are kept in a module
level cache. Its reference is held over the lifetime of the deviceid
pointing to it.
+
+lseg
+----
+lseg maintains an extra reference corresponding to the NFS_LSEG_VALID
+bit which holds it in the pnfs_layout_hdr's list. When the final lseg
+is removed from the pnfs_layout_hdr's list, the NFS_LAYOUT_DESTROYED
+bit is set, preventing any new lsegs from being added.
--
1.7.2.3
From: Andy Adamson <[email protected]>
Use our own async error handler.
Mark the layout as failed and retry i/o through the MDS on specified errors.
Update the mds_offset in nfs_readpage_retry so that a failed short-read retry
to a DS gets correctly resent through the MDS.
Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/internal.h | 1 +
fs/nfs/nfs4filelayout.c | 79 +++++++++++++++++++++++++++++++++++++++++++
fs/nfs/nfs4proc.c | 33 +++++++++++++++---
fs/nfs/nfs4state.c | 1 +
fs/nfs/read.c | 1 +
include/linux/nfs_xdr.h | 1 +
include/linux/sunrpc/clnt.h | 1 +
net/sunrpc/clnt.c | 8 ++++
8 files changed, 119 insertions(+), 6 deletions(-)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5e9df99..1a3228e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -285,6 +285,7 @@ extern int nfs_migrate_page(struct address_space *,
#endif
/* nfs4proc.c */
+extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
extern int nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
const char *ip_addr,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index f421ef0..9ae1a47e 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -40,6 +40,8 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Dean Hildebrand <[email protected]>");
MODULE_DESCRIPTION("The NFSv4 file layout driver");
+#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
+
static int
filelayout_set_layoutdriver(struct nfs_server *nfss)
{
@@ -100,6 +102,81 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
BUG();
}
+/* For data server errors we don't recover from */
+static void
+filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+ if (lseg->pls_range.iomode == IOMODE_RW) {
+ dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
+ set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+ } else {
+ dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
+ set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+ }
+}
+
+static int filelayout_async_handle_error(struct rpc_task *task,
+ struct nfs4_state *state,
+ struct nfs_client *clp,
+ int *reset)
+{
+ if (task->tk_status >= 0)
+ return 0;
+ switch (task->tk_status) {
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ case -NFS4ERR_SEQ_MISORDERED:
+ dprintk("%s ERROR %d, Reset session. Exchangeid "
+ "flags 0x%x\n", __func__, task->tk_status,
+ clp->cl_exchange_flags);
+ nfs4_schedule_state_recovery(clp);
+ task->tk_status = 0;
+ return -EAGAIN;
+ case -NFS4ERR_DELAY:
+ case -NFS4ERR_GRACE:
+ case -EKEYEXPIRED:
+ rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
+ task->tk_status = 0;
+ return -EAGAIN;
+ default:
+ dprintk("%s DS error. Retry through MDS %d\n", __func__,
+ task->tk_status);
+ *reset = 1;
+ task->tk_status = 0;
+ return -EAGAIN;
+ }
+}
+
+/* NFS_PROTO call done callback routines */
+
+static int filelayout_read_done_cb(struct rpc_task *task,
+ struct nfs_read_data *data)
+{
+ struct nfs_client *clp = data->ds_clp;
+ int reset = 0;
+
+ dprintk("%s DS read\n", __func__);
+
+ if (filelayout_async_handle_error(task, data->args.context->state,
+ data->ds_clp, &reset) == -EAGAIN) {
+ dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
+ __func__, data->ds_clp, data->ds_clp->cl_session);
+ if (reset) {
+ nfs4_reset_read(task, data);
+ filelayout_set_lo_fail(data->lseg);
+ clp = NFS_SERVER(data->inode)->nfs_client;
+ }
+ nfs_restart_rpc(task, clp);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
/*
* Call ops for the async read/write cases
* In the case of dense layouts, the offset needs to be reset to its
@@ -109,6 +186,8 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
{
struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+ rdata->read_done_cb = filelayout_read_done_cb;
+
if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
&rdata->args.seq_args, &rdata->res.seq_res,
0, task))
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 64fb39b..f91e259 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3075,15 +3075,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return err;
}
-static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
{
struct nfs_server *server = NFS_SERVER(data->inode);
- dprintk("--> %s\n", __func__);
-
- if (!nfs4_sequence_done(task, &data->res.seq_res))
- return -EAGAIN;
-
if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
nfs_restart_rpc(task, server->nfs_client);
return -EAGAIN;
@@ -3095,12 +3090,38 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
return 0;
}
+static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
+{
+
+ dprintk("--> %s\n", __func__);
+
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return -EAGAIN;
+
+ return data->read_done_cb(task, data);
+}
+
static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
{
data->timestamp = jiffies;
+ data->read_done_cb = nfs4_read_done_cb;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
}
+/* Reset the nfs_read_data to send the read to the MDS. */
+void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
+{
+ dprintk("%s Reset task for i/o through\n", __func__);
+ /* offsets will differ in the dense stripe case */
+ data->args.offset = data->mds_offset;
+ data->ds_clp = NULL;
+ data->args.fh = NFS_FH(data->inode);
+ data->read_done_cb = nfs4_read_done_cb;
+ task->tk_ops = data->mds_ops;
+ rpc_task_reset_client(task, NFS_CLIENT(data->inode));
+}
+EXPORT_SYMBOL_GPL(nfs4_reset_read);
+
static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
{
struct inode *inode = data->inode;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 9e33e88..6da026a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1022,6 +1022,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
nfs4_schedule_state_manager(clp);
}
+EXPORT_SYMBOL_GPL(nfs4_schedule_state_recovery);
int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
{
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 5fc4ecc..5c5fbac 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -395,6 +395,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
return;
/* Yes, so retry the read at the end of the data */
+ data->mds_offset += resp->count;
argp->offset += resp->count;
argp->pgbase += resp->count;
argp->count -= resp->count;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b63faef..eb0e870 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1020,6 +1020,7 @@ struct nfs_read_data {
struct pnfs_layout_segment *lseg;
struct nfs_client *ds_clp; /* pNFS data server */
const struct rpc_call_ops *mds_ops;
+ int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
__u64 mds_offset;
struct page *page_array[NFS_PAGEVEC_SIZE];
};
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index ef9476a..db7bcaf 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -129,6 +129,7 @@ struct rpc_create_args {
struct rpc_clnt *rpc_create(struct rpc_create_args *args);
struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *,
struct rpc_program *, u32);
+void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt);
struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
void rpc_shutdown_client(struct rpc_clnt *);
void rpc_release_client(struct rpc_clnt *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 57d344c..5c4df70 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -597,6 +597,14 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
}
}
+void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt)
+{
+ rpc_task_release_client(task);
+ rpc_task_set_client(task, clnt);
+}
+EXPORT_SYMBOL_GPL(rpc_task_reset_client);
+
+
static void
rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
{
--
1.7.2.3
From: Andy Adamson <[email protected]>
No need for generic cache with only one user.
Keep a simple hash of deviceids in the filelayout driver.
Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Andy Adamson <[email protected]>
---
fs/nfs/nfs4filelayout.c | 46 +++-----------
fs/nfs/nfs4filelayout.h | 8 ++-
fs/nfs/nfs4filelayoutdev.c | 106 +++++++++++++++++++++++---------
fs/nfs/pnfs.c | 147 +-------------------------------------------
fs/nfs/pnfs.h | 48 --------------
include/linux/nfs_fs_sb.h | 1 -
6 files changed, 92 insertions(+), 264 deletions(-)
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 9ae1a47e..84c7577 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -42,32 +42,6 @@ MODULE_DESCRIPTION("The NFSv4 file layout driver");
#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
-static int
-filelayout_set_layoutdriver(struct nfs_server *nfss)
-{
- int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
- nfs4_fl_free_deviceid_callback);
- if (status) {
- printk(KERN_WARNING "%s: deviceid cache could not be "
- "initialized\n", __func__);
- return status;
- }
- dprintk("%s: deviceid cache has been initialized successfully\n",
- __func__);
- return 0;
-}
-
-/* Clear out the layout by destroying its device list */
-static int
-filelayout_clear_layoutdriver(struct nfs_server *nfss)
-{
- dprintk("--> %s\n", __func__);
-
- if (nfss->nfs_client->cl_devid_cache)
- pnfs_put_deviceid_cache(nfss->nfs_client);
- return 0;
-}
-
static loff_t
filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
loff_t offset)
@@ -291,7 +265,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
}
/* find and reference the deviceid */
- dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
+ dsaddr = nfs4_fl_find_get_deviceid(id);
if (dsaddr == NULL) {
dsaddr = get_device_info(lo->plh_inode, id);
if (dsaddr == NULL)
@@ -326,7 +300,7 @@ out:
dprintk("--> %s returns %d\n", __func__, status);
return status;
out_put:
- pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
+ nfs4_fl_put_deviceid(dsaddr);
goto out;
}
@@ -435,12 +409,10 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
static void
filelayout_free_lseg(struct pnfs_layout_segment *lseg)
{
- struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
dprintk("--> %s\n", __func__);
- pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
- &fl->dsaddr->deviceid);
+ nfs4_fl_put_deviceid(fl->dsaddr);
_filelayout_free_lseg(fl);
}
@@ -470,13 +442,11 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
}
static struct pnfs_layoutdriver_type filelayout_type = {
- .id = LAYOUT_NFSV4_1_FILES,
- .name = "LAYOUT_NFSV4_1_FILES",
- .owner = THIS_MODULE,
- .set_layoutdriver = filelayout_set_layoutdriver,
- .clear_layoutdriver = filelayout_clear_layoutdriver,
- .alloc_lseg = filelayout_alloc_lseg,
- .free_lseg = filelayout_free_lseg,
+ .id = LAYOUT_NFSV4_1_FILES,
+ .name = "LAYOUT_NFSV4_1_FILES",
+ .owner = THIS_MODULE,
+ .alloc_lseg = filelayout_alloc_lseg,
+ .free_lseg = filelayout_free_lseg,
.pg_test = filelayout_pg_test,
.read_pagelist = filelayout_read_pagelist,
};
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 9fef76e..23f1e1e 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -56,7 +56,9 @@ struct nfs4_pnfs_ds {
};
struct nfs4_file_layout_dsaddr {
- struct pnfs_deviceid_node deviceid;
+ struct hlist_node node;
+ struct nfs4_deviceid deviceid;
+ atomic_t ref;
u32 stripe_count;
u8 *stripe_indices;
u32 ds_num;
@@ -86,7 +88,6 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
extern struct nfs_fh *
nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
-extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
extern void print_ds(struct nfs4_pnfs_ds *ds);
extern void print_deviceid(struct nfs4_deviceid *dev_id);
u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
@@ -94,7 +95,8 @@ u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
u32 ds_idx);
extern struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
+nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
+extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
struct nfs4_file_layout_dsaddr *
get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index e8496f3..ac38c75 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,6 +37,30 @@
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+#define NFS4_FL_DEVICE_ID_HASH_BITS 5
+#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
+#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
+
+static inline u32
+nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
+{
+ unsigned char *cptr = (unsigned char *)id->data;
+ unsigned int nbytes = NFS4_DEVICEID4_SIZE;
+ u32 x = 0;
+
+ while (nbytes--) {
+ x *= 37;
+ x += *cptr++;
+ }
+ return x & NFS4_FL_DEVICE_ID_HASH_MASK;
+}
+
+static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
+static DEFINE_SPINLOCK(filelayout_deviceid_lock);
+
+/*
* Data server cache
*
* Data servers can be mapped to different device ids.
@@ -183,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
struct nfs4_pnfs_ds *ds;
int i;
- print_deviceid(&dsaddr->deviceid.de_id);
+ print_deviceid(&dsaddr->deviceid);
for (i = 0; i < dsaddr->ds_num; i++) {
ds = dsaddr->ds_list[i];
@@ -200,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
kfree(dsaddr);
}
-void
-nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
-{
- struct nfs4_file_layout_dsaddr *dsaddr =
- container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
-
- nfs4_fl_free_deviceid(dsaddr);
-}
-
static struct nfs4_pnfs_ds *
nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
{
@@ -357,7 +372,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
dsaddr->stripe_count = cnt;
dsaddr->ds_num = num;
- memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
+ memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
/* Go back an read stripe indices */
p = indicesp;
@@ -407,28 +422,37 @@ out_err:
}
/*
- * Decode the opaque device specified in 'dev'
- * and add it to the list of available devices.
- * If the deviceid is already cached, nfs4_add_deviceid will return
- * a pointer to the cached struct and throw away the new.
+ * Decode the opaque device specified in 'dev' and add it to the cache of
+ * available devices.
*/
-static struct nfs4_file_layout_dsaddr*
+static struct nfs4_file_layout_dsaddr *
decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
{
- struct nfs4_file_layout_dsaddr *dsaddr;
- struct pnfs_deviceid_node *d;
+ struct nfs4_file_layout_dsaddr *d, *new;
+ long hash;
- dsaddr = decode_device(inode, dev);
- if (!dsaddr) {
+ new = decode_device(inode, dev);
+ if (!new) {
printk(KERN_WARNING "%s: Could not decode or add device\n",
__func__);
return NULL;
}
- d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
- &dsaddr->deviceid);
+ spin_lock(&filelayout_deviceid_lock);
+ d = nfs4_fl_find_get_deviceid(&new->deviceid);
+ if (d) {
+ spin_unlock(&filelayout_deviceid_lock);
+ nfs4_fl_free_deviceid(new);
+ return d;
+ }
+
+ INIT_HLIST_NODE(&new->node);
+ atomic_set(&new->ref, 1);
+ hash = nfs4_fl_deviceid_hash(&new->deviceid);
+ hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
+ spin_unlock(&filelayout_deviceid_lock);
- return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+ return new;
}
/*
@@ -503,14 +527,38 @@ out_free:
return dsaddr;
}
-struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
+void
+nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
- struct pnfs_deviceid_node *d;
+ if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
+ hlist_del_rcu(&dsaddr->node);
+ spin_unlock(&filelayout_deviceid_lock);
+
+ synchronize_rcu();
+ nfs4_fl_free_deviceid(dsaddr);
+ }
+}
- d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
- return (d == NULL) ? NULL :
- container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
+{
+ struct nfs4_file_layout_dsaddr *d;
+ struct hlist_node *n;
+ long hash = nfs4_fl_deviceid_hash(id);
+
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
+ if (!memcmp(&d->deviceid, id, sizeof(*id))) {
+ if (!atomic_inc_not_zero(&d->ref))
+ goto fail;
+ rcu_read_unlock();
+ return d;
+ }
+ }
+fail:
+ rcu_read_unlock();
+ return NULL;
}
/*
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 92c55a4..349a378 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -75,10 +75,8 @@ find_pnfs_driver(u32 id)
void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
- if (nfss->pnfs_curr_ld) {
- nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
+ if (nfss->pnfs_curr_ld)
module_put(nfss->pnfs_curr_ld->owner);
- }
nfss->pnfs_curr_ld = NULL;
}
@@ -116,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
goto out_no_driver;
}
server->pnfs_curr_ld = ld_type;
- if (ld_type->set_layoutdriver(server)) {
- printk(KERN_ERR
- "%s: Error initializing mount point for layout driver %u.\n",
- __func__, id);
- module_put(ld_type->owner);
- goto out_no_driver;
- }
+
dprintk("%s: pNFS module for %u set\n", __func__, id);
return;
@@ -909,138 +901,3 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
return trypnfs;
}
-
-/*
- * Device ID cache. Currently supports one layout type per struct nfs_client.
- * Add layout type to the lookup key to expand to support multiple types.
- */
-int
-pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
- void (*free_callback)(struct pnfs_deviceid_node *))
-{
- struct pnfs_deviceid_cache *c;
-
- c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
- if (!c)
- return -ENOMEM;
- spin_lock(&clp->cl_lock);
- if (clp->cl_devid_cache != NULL) {
- atomic_inc(&clp->cl_devid_cache->dc_ref);
- dprintk("%s [kref [%d]]\n", __func__,
- atomic_read(&clp->cl_devid_cache->dc_ref));
- kfree(c);
- } else {
- /* kzalloc initializes hlists */
- spin_lock_init(&c->dc_lock);
- atomic_set(&c->dc_ref, 1);
- c->dc_free_callback = free_callback;
- clp->cl_devid_cache = c;
- dprintk("%s [new]\n", __func__);
- }
- spin_unlock(&clp->cl_lock);
- return 0;
-}
-EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
-
-/*
- * Called from pnfs_layoutdriver_type->free_lseg
- * last layout segment reference frees deviceid
- */
-void
-pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
- struct pnfs_deviceid_node *devid)
-{
- struct nfs4_deviceid *id = &devid->de_id;
- struct pnfs_deviceid_node *d;
- struct hlist_node *n;
- long h = nfs4_deviceid_hash(id);
-
- dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
- if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
- return;
-
- hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
- if (!memcmp(&d->de_id, id, sizeof(*id))) {
- hlist_del_rcu(&d->de_node);
- spin_unlock(&c->dc_lock);
- synchronize_rcu();
- c->dc_free_callback(devid);
- return;
- }
- spin_unlock(&c->dc_lock);
- /* Why wasn't it found in the list? */
- BUG();
-}
-EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
-
-/* Find and reference a deviceid */
-struct pnfs_deviceid_node *
-pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
-{
- struct pnfs_deviceid_node *d;
- struct hlist_node *n;
- long hash = nfs4_deviceid_hash(id);
-
- dprintk("--> %s hash %ld\n", __func__, hash);
- rcu_read_lock();
- hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
- if (!memcmp(&d->de_id, id, sizeof(*id))) {
- if (!atomic_inc_not_zero(&d->de_ref)) {
- goto fail;
- } else {
- rcu_read_unlock();
- return d;
- }
- }
- }
-fail:
- rcu_read_unlock();
- return NULL;
-}
-EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
-
-/*
- * Add a deviceid to the cache.
- * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
- */
-struct pnfs_deviceid_node *
-pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
-{
- struct pnfs_deviceid_node *d;
- long hash = nfs4_deviceid_hash(&new->de_id);
-
- dprintk("--> %s hash %ld\n", __func__, hash);
- spin_lock(&c->dc_lock);
- d = pnfs_find_get_deviceid(c, &new->de_id);
- if (d) {
- spin_unlock(&c->dc_lock);
- dprintk("%s [discard]\n", __func__);
- c->dc_free_callback(new);
- return d;
- }
- INIT_HLIST_NODE(&new->de_node);
- atomic_set(&new->de_ref, 1);
- hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
- spin_unlock(&c->dc_lock);
- dprintk("%s [new]\n", __func__);
- return new;
-}
-EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
-
-void
-pnfs_put_deviceid_cache(struct nfs_client *clp)
-{
- struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
-
- dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref));
- if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
- int i;
- /* Verify cache is empty */
- for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
- BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
- clp->cl_devid_cache = NULL;
- spin_unlock(&clp->cl_lock);
- kfree(local);
- }
-}
-EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 585023f..acbb778 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -68,8 +68,6 @@ struct pnfs_layoutdriver_type {
const u32 id;
const char *name;
struct module *owner;
- int (*set_layoutdriver) (struct nfs_server *);
- int (*clear_layoutdriver) (struct nfs_server *);
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
void (*free_lseg) (struct pnfs_layout_segment *lseg);
@@ -106,52 +104,6 @@ struct pnfs_device {
unsigned int pglen;
};
-/*
- * Device ID RCU cache. A device ID is unique per client ID and layout type.
- */
-#define NFS4_DEVICE_ID_HASH_BITS 5
-#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
-#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
-
-static inline u32
-nfs4_deviceid_hash(struct nfs4_deviceid *id)
-{
- unsigned char *cptr = (unsigned char *)id->data;
- unsigned int nbytes = NFS4_DEVICEID4_SIZE;
- u32 x = 0;
-
- while (nbytes--) {
- x *= 37;
- x += *cptr++;
- }
- return x & NFS4_DEVICE_ID_HASH_MASK;
-}
-
-struct pnfs_deviceid_node {
- struct hlist_node de_node;
- struct nfs4_deviceid de_id;
- atomic_t de_ref;
-};
-
-struct pnfs_deviceid_cache {
- spinlock_t dc_lock;
- atomic_t dc_ref;
- void (*dc_free_callback)(struct pnfs_deviceid_node *);
- struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
-};
-
-extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
- void (*free_callback)(struct pnfs_deviceid_node *));
-extern void pnfs_put_deviceid_cache(struct nfs_client *);
-extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
- struct pnfs_deviceid_cache *,
- struct nfs4_deviceid *);
-extern struct pnfs_deviceid_node *pnfs_add_deviceid(
- struct pnfs_deviceid_cache *,
- struct pnfs_deviceid_node *);
-extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
- struct pnfs_deviceid_node *devid);
-
extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 2669a9a..7f71698 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -81,7 +81,6 @@ struct nfs_client {
u32 cl_exchange_flags;
struct nfs4_session *cl_session; /* sharred session */
struct list_head cl_layouts;
- struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
#endif /* CONFIG_NFS_V4_1 */
#ifdef CONFIG_NFS_FSCACHE
--
1.7.2.3
This should be authored by Fred
-->Andy
On Feb 15, 2011, at 3:39 AM, [email protected] wrote:
> From: Andy Adamson <[email protected]>
>
> Signed-off-by: Fred Isaman <[email protected]>
> ---
> Documentation/filesystems/nfs/pnfs.txt | 7 +++++++
> 1 files changed, 7 insertions(+), 0 deletions(-)
>
> diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
> index bc0b9cf..983e14a 100644
> --- a/Documentation/filesystems/nfs/pnfs.txt
> +++ b/Documentation/filesystems/nfs/pnfs.txt
> @@ -46,3 +46,10 @@ data server cache
> file driver devices refer to data servers, which are kept in a module
> level cache. Its reference is held over the lifetime of the deviceid
> pointing to it.
> +
> +lseg
> +----
> +lseg maintains an extra reference corresponding to the NFS_LSEG_VALID
> +bit which holds it in the pnfs_layout_hdr's list. When the final lseg
> +is removed from the pnfs_layout_hdr's list, the NFS_LAYOUT_DESTROYED
> +bit is set, preventing any new lsegs from being added.
> --
> 1.7.2.3
>
From: Andy Adamson <[email protected]>
Data servers not sharing a session with the mount MDS always have an empty
cl_superblocks list.
To decide when to shut down renewd, replace the empty cl_superblocks list check
with a test of the NFS_CS_STOP_RENEW bit, which is never set for such a data server.
Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/client.c | 5 +++++
fs/nfs/nfs4renewd.c | 6 +-----
include/linux/nfs_fs_sb.h | 1 +
3 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index b9ed2a8..a86698c 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1019,14 +1019,19 @@ static void nfs_server_insert_lists(struct nfs_server *server)
spin_lock(&nfs_client_lock);
list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
list_add_tail(&server->master_link, &nfs_volume_list);
+ clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
spin_unlock(&nfs_client_lock);
}
static void nfs_server_remove_lists(struct nfs_server *server)
{
+ struct nfs_client *clp = server->nfs_client;
+
spin_lock(&nfs_client_lock);
list_del_rcu(&server->client_link);
+ if (clp && list_empty(&clp->cl_superblocks))
+ set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
list_del(&server->master_link);
spin_unlock(&nfs_client_lock);
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 402143d..df8e7f3 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -64,12 +64,8 @@ nfs4_renew_state(struct work_struct *work)
ops = clp->cl_mvops->state_renewal_ops;
dprintk("%s: start\n", __func__);
- rcu_read_lock();
- if (list_empty(&clp->cl_superblocks)) {
- rcu_read_unlock();
+ if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
goto out;
- }
- rcu_read_unlock();
spin_lock(&clp->cl_lock);
lease = clp->cl_lease_time;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index b197563..2c2dc18 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -30,6 +30,7 @@ struct nfs_client {
#define NFS_CS_CALLBACK 1 /* - callback started */
#define NFS_CS_IDMAP 2 /* - idmap started */
#define NFS_CS_RENEWD 3 /* - renewd started */
+#define NFS_CS_STOP_RENEW 4 /* no more state to renew */
struct sockaddr_storage cl_addr; /* server identifier */
size_t cl_addrlen;
char * cl_hostname; /* hostname of server */
--
1.7.2.3
From: Fred Isaman <[email protected]>
Add a pg_test layout driver hook which is used to avoid coalescing I/O across
layout stripes.
Signed-off-by: Andy Adamon <[email protected]>
Signed-off-by: Andy Adamon <[email protected]>
Signed-off-by: Dean Hildebrand <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Boaz Harrosh <[email protected]>
Signed-off-by: Oleg Drokin <[email protected]>
Signed-off-by: Tao Guo <[email protected]>
---
fs/nfs/nfs4filelayout.c | 26 ++++++++++++++++++++++++++
fs/nfs/pagelist.c | 12 ++++++++++--
fs/nfs/pnfs.c | 19 +++++++++++++++++++
fs/nfs/pnfs.h | 12 ++++++++++++
fs/nfs/read.c | 1 +
fs/nfs/write.c | 3 +++
include/linux/nfs_page.h | 2 ++
7 files changed, 73 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 23f930c..98e26e0 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -252,6 +252,31 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg)
_filelayout_free_lseg(fl);
}
+/*
+ * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
+ *
+ * return 1 : coalesce page
+ * return 0 : don't coalesce page
+ */
+int
+filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+ struct nfs_page *req)
+{
+ u64 p_stripe, r_stripe;
+ u32 stripe_unit;
+
+ if (!pgio->pg_lseg)
+ return 1;
+ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
+ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
+ stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+
+ do_div(p_stripe, stripe_unit);
+ do_div(r_stripe, stripe_unit);
+
+ return (p_stripe == r_stripe);
+}
+
static struct pnfs_layoutdriver_type filelayout_type = {
.id = LAYOUT_NFSV4_1_FILES,
.name = "LAYOUT_NFSV4_1_FILES",
@@ -260,6 +285,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.clear_layoutdriver = filelayout_clear_layoutdriver,
.alloc_lseg = filelayout_alloc_lseg,
.free_lseg = filelayout_free_lseg,
+ .pg_test = filelayout_pg_test,
};
static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e1164e3..9b9a65c 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -226,6 +226,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
desc->pg_doio = doio;
desc->pg_ioflags = io_flags;
desc->pg_error = 0;
+ desc->pg_lseg = NULL;
}
/**
@@ -240,7 +241,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
* Return 'true' if this is the case, else return 'false'.
*/
static int nfs_can_coalesce_requests(struct nfs_page *prev,
- struct nfs_page *req)
+ struct nfs_page *req,
+ struct nfs_pageio_descriptor *pgio)
{
if (req->wb_context->cred != prev->wb_context->cred)
return 0;
@@ -254,6 +256,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
return 0;
if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
return 0;
+ /*
+ * Non-whole file layouts need to check that req is inside of
+ * pgio->pg_lseg.
+ */
+ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
+ return 0;
return 1;
}
@@ -286,7 +294,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
if (newlen > desc->pg_bsize)
return 0;
prev = nfs_list_entry(desc->pg_list.prev);
- if (!nfs_can_coalesce_requests(prev, req))
+ if (!nfs_can_coalesce_requests(prev, req, desc))
return 0;
} else
desc->pg_base = req->wb_pgbase;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 1173434..d12f463 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -858,6 +858,25 @@ out_forget_reply:
goto out;
}
+static void
+pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio)
+{
+ struct pnfs_layoutdriver_type *ld;
+
+ ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ pgio->pg_test = (ld ? ld->pg_test : NULL);
+}
+
+/*
+ * rsize is already set by caller to MDS rsize.
+ */
+void
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode)
+{
+ pnfs_set_pg_test(inode, pgio);
+}
+
/*
* Device ID cache. Currently supports one layout type per struct nfs_client.
* Add layout type to the lookup key to expand to support multiple types.
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9a994bc..db52d96 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,8 @@
#ifndef FS_NFS_PNFS_H
#define FS_NFS_PNFS_H
+#include <linux/nfs_page.h>
+
enum {
NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
NFS_LSEG_ROC, /* roc bit received from server */
@@ -65,6 +67,9 @@ struct pnfs_layoutdriver_type {
int (*clear_layoutdriver) (struct nfs_server *);
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
void (*free_lseg) (struct pnfs_layout_segment *lseg);
+
+ /* test for nfs page cache coalescing */
+ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
};
struct pnfs_layout_hdr {
@@ -151,6 +156,7 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
enum pnfs_iomode access_type);
void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
void unset_pnfs_layoutdriver(struct nfs_server *);
+void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
@@ -250,6 +256,12 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
{
}
+static inline void
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
+{
+ pgio->pg_test = NULL;
+}
+
#endif /* CONFIG_NFS_V4_1 */
#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index aedcaa7..2a27659 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -626,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
goto read_complete; /* all pages were read */
pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
+ pnfs_pageio_init_read(&pgio, inode);
if (rsize < PAGE_CACHE_SIZE)
nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
else
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c8278f4..6e90cdf 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -28,6 +28,7 @@
#include "iostat.h"
#include "nfs4_fs.h"
#include "fscache.h"
+#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
@@ -982,6 +983,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
{
size_t wsize = NFS_SERVER(inode)->wsize;
+ pgio->pg_test = NULL;
+
if (wsize < PAGE_CACHE_SIZE)
nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
else
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index d55cee7..4eaf27a 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -62,6 +62,8 @@ struct nfs_pageio_descriptor {
int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int);
int pg_ioflags;
int pg_error;
+ struct pnfs_layout_segment *pg_lseg;
+ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
};
#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags))
--
1.7.2.3
On 2011-02-15 03:38, [email protected] wrote:
> From: Fred Isaman <[email protected]>
>
> Move the pnfs_update_layout call location to nfs_pageio_do_add_request().
> Grab the lseg sent in the doio function to nfs_read_rpcsetup and attach
> it to each nfs_read_data so it can be sent to the layout driver.
>
> Signed-off-by: Andy Adamon <[email protected]>
> Signed-off-by: Andy Adamon <[email protected]>
> Signed-off-by: Dean Hildebrand <[email protected]>
> Signed-off-by: Fred Isaman <[email protected]>
> Signed-off-by: Fred Isaman <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> Signed-off-by: Boaz Harrosh <[email protected]>
> Signed-off-by: Oleg Drokin <[email protected]>
> Signed-off-by: Tao Guo <[email protected]>
> ---
> fs/nfs/file.c | 4 ----
> fs/nfs/pagelist.c | 6 ++++--
> fs/nfs/pnfs.c | 27 ++++++++++++++++-----------
> fs/nfs/pnfs.h | 1 +
> fs/nfs/read.c | 36 ++++++++++++++++++++++++------------
> fs/nfs/write.c | 4 ++--
> include/linux/nfs_page.h | 4 ++--
> include/linux/nfs_xdr.h | 1 +
> 8 files changed, 50 insertions(+), 33 deletions(-)
>
> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> index 7bf029e..d85a534 100644
> --- a/fs/nfs/file.c
> +++ b/fs/nfs/file.c
> @@ -387,10 +387,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
> file->f_path.dentry->d_name.name,
> mapping->host->i_ino, len, (long long) pos);
>
> - pnfs_update_layout(mapping->host,
> - nfs_file_open_context(file),
> - IOMODE_RW);
> -
> start:
> /*
> * Prevent starvation issues if someone is doing a consistency
> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> index 9b9a65c..b49cb4b 100644
> --- a/fs/nfs/pagelist.c
> +++ b/fs/nfs/pagelist.c
> @@ -20,6 +20,7 @@
> #include <linux/nfs_mount.h>
>
> #include "internal.h"
> +#include "pnfs.h"
>
> static struct kmem_cache *nfs_page_cachep;
>
> @@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req)
> */
> void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
> struct inode *inode,
> - int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
> + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *),
> size_t bsize,
> int io_flags)
> {
> @@ -315,7 +316,8 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
> nfs_page_array_len(desc->pg_base,
> desc->pg_count),
> desc->pg_count,
> - desc->pg_ioflags);
> + desc->pg_ioflags,
> + desc->pg_lseg);
> if (error < 0)
> desc->pg_error = error;
> else
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index d12f463..a09e3a0 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -245,7 +245,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg)
> rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
> }
>
> -static void
> +void
> put_lseg(struct pnfs_layout_segment *lseg)
> {
> struct inode *inode;
> @@ -784,7 +784,6 @@ pnfs_update_layout(struct inode *ino,
> out:
> dprintk("%s end, state 0x%lx lseg %p\n", __func__,
> nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
> - put_lseg(lseg); /* STUB - callers currently ignore return value */
> return lseg;
> out_unlock:
> spin_unlock(&ino->i_lock);
> @@ -858,23 +857,29 @@ out_forget_reply:
> goto out;
> }
>
> -static void
> -pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio)
> +static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
> + struct nfs_page *prev,
> + struct nfs_page *req)
> {
> - struct pnfs_layoutdriver_type *ld;
> -
> - ld = NFS_SERVER(inode)->pnfs_curr_ld;
> - pgio->pg_test = (ld ? ld->pg_test : NULL);
> + if (pgio->pg_count == prev->wb_bytes) {
> + /* This is the first coalesce call for a series of nfs_pages */
> + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
> + prev->wb_context,
> + IOMODE_READ);
> + }
> + return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
> }
>
> /*
> * rsize is already set by caller to MDS rsize.
> */
> void
> -pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
> - struct inode *inode)
> +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
> {
> - pnfs_set_pg_test(inode, pgio);
> + struct pnfs_layoutdriver_type *ld;
> +
> + ld = NFS_SERVER(inode)->pnfs_curr_ld;
> + pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
> }
>
> /*
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index db52d96..5107d14 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -151,6 +151,7 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
>
> /* pnfs.c */
> void get_layout_hdr(struct pnfs_layout_hdr *lo);
> +void put_lseg(struct pnfs_layout_segment *lseg);
> struct pnfs_layout_segment *
> pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
> enum pnfs_iomode access_type);
> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
> index 2a27659..7896e3d 100644
> --- a/fs/nfs/read.c
> +++ b/fs/nfs/read.c
> @@ -20,17 +20,17 @@
> #include <linux/nfs_page.h>
>
> #include <asm/system.h>
> +#include "pnfs.h"
>
> #include "nfs4_fs.h"
> #include "internal.h"
> #include "iostat.h"
> #include "fscache.h"
> -#include "pnfs.h"
>
> #define NFSDBG_FACILITY NFSDBG_PAGECACHE
>
> -static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int);
> -static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int);
> +static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *);
> +static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *);
> static const struct rpc_call_ops nfs_read_partial_ops;
> static const struct rpc_call_ops nfs_read_full_ops;
>
> @@ -69,6 +69,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
>
> static void nfs_readdata_release(struct nfs_read_data *rdata)
> {
> + put_lseg(rdata->lseg);
> put_nfs_open_context(rdata->args.context);
> nfs_readdata_free(rdata);
> }
> @@ -117,11 +118,11 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
> LIST_HEAD(one_request);
> struct nfs_page *new;
> unsigned int len;
> + struct pnfs_layout_segment *lseg;
>
> len = nfs_page_length(page);
> if (len == 0)
> return nfs_return_empty_page(page);
> - pnfs_update_layout(inode, ctx, IOMODE_READ);
> new = nfs_create_request(ctx, inode, page, 0, len);
> if (IS_ERR(new)) {
> unlock_page(page);
> @@ -131,10 +132,12 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
> zero_user_segment(page, len, PAGE_CACHE_SIZE);
>
> nfs_list_add_request(new, &one_request);
> + lseg = pnfs_update_layout(inode, ctx, IOMODE_READ);
> if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
> - nfs_pagein_multi(inode, &one_request, 1, len, 0);
> + nfs_pagein_multi(inode, &one_request, 1, len, 0, lseg);
> else
> - nfs_pagein_one(inode, &one_request, 1, len, 0);
> + nfs_pagein_one(inode, &one_request, 1, len, 0, lseg);
> + put_lseg(lseg);
> return 0;
> }
>
> @@ -160,7 +163,8 @@ static void nfs_readpage_release(struct nfs_page *req)
> */
> static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
> const struct rpc_call_ops *call_ops,
> - unsigned int count, unsigned int offset)
> + unsigned int count, unsigned int offset,
> + struct pnfs_layout_segment *lseg)
> {
> struct inode *inode = req->wb_context->path.dentry->d_inode;
> int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
> @@ -183,6 +187,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
> data->req = req;
> data->inode = inode;
> data->cred = msg.rpc_cred;
> + data->lseg = get_lseg(lseg);
>
> data->args.fh = NFS_FH(inode);
> data->args.offset = req_offset(req) + offset;
> @@ -240,7 +245,7 @@ nfs_async_read_error(struct list_head *head)
> * won't see the new data until our attribute cache is updated. This is more
> * or less conventional NFS client behavior.
> */
> -static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
> +static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
> {
> struct nfs_page *req = nfs_list_entry(head->next);
> struct page *page = req->wb_page;
> @@ -266,6 +271,8 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
> } while(nbytes != 0);
> atomic_set(&req->wb_complete, requests);
>
> + /* We know lseg==NULL */
> + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
> ClearPageError(page);
> offset = 0;
> nbytes = count;
> @@ -280,12 +287,13 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
> if (nbytes < rsize)
> rsize = nbytes;
> ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
> - rsize, offset);
> + rsize, offset, lseg);
> if (ret == 0)
> ret = ret2;
> offset += rsize;
> nbytes -= rsize;
> } while (nbytes != 0);
> + put_lseg(lseg);
>
> return ret;
>
> @@ -300,7 +308,7 @@ out_bad:
> return -ENOMEM;
> }
>
> -static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
> +static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
> {
> struct nfs_page *req;
> struct page **pages;
> @@ -320,9 +328,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
> *pages++ = req->wb_page;
> }
> req = nfs_list_entry(data->pages.next);
> + if ((!lseg) && list_is_singular(&data->pages))
> + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
>
> - return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
> + ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0, lseg);
> + put_lseg(lseg);
Shouldn't that be done only if pnfs_update_layout was called here?
Otherwise, the caller, nfs_readpage_async, puts the lseg it passes down.
> + return ret;
> out_bad:
> + put_lseg(lseg);
I'd unify the common exit path by doing nfs_async_read_error on the error path
and then goto out for the common code.
Benny
> nfs_async_read_error(head);
> return ret;
> }
> @@ -625,7 +638,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
> if (ret == 0)
> goto read_complete; /* all pages were read */
>
> - pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
> pnfs_pageio_init_read(&pgio, inode);
> if (rsize < PAGE_CACHE_SIZE)
> nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index 6e90cdf..aca0268 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -880,7 +880,7 @@ static void nfs_redirty_request(struct nfs_page *req)
> * Generate multiple small requests to write out a single
> * contiguous dirty area on one page.
> */
> -static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
> +static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg)
> {
> struct nfs_page *req = nfs_list_entry(head->next);
> struct page *page = req->wb_page;
> @@ -947,7 +947,7 @@ out_bad:
> * This is the case if nfs_updatepage detects a conflicting request
> * that has been written but not committed.
> */
> -static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
> +static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg)
> {
> struct nfs_page *req;
> struct page **pages;
> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
> index 4eaf27a..ba88ff4 100644
> --- a/include/linux/nfs_page.h
> +++ b/include/linux/nfs_page.h
> @@ -59,7 +59,7 @@ struct nfs_pageio_descriptor {
> unsigned int pg_base;
>
> struct inode *pg_inode;
> - int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int);
> + int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *);
> int pg_ioflags;
> int pg_error;
> struct pnfs_layout_segment *pg_lseg;
> @@ -81,7 +81,7 @@ extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst,
> pgoff_t idx_start, unsigned int npages, int tag);
> extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
> struct inode *inode,
> - int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
> + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *),
> size_t bsize,
> int how);
> extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> index d159fe7..560923e 100644
> --- a/include/linux/nfs_xdr.h
> +++ b/include/linux/nfs_xdr.h
> @@ -1017,6 +1017,7 @@ struct nfs_read_data {
> struct nfs_readargs args;
> struct nfs_readres res;
> unsigned long timestamp; /* For lease renewal */
> + struct pnfs_layout_segment *lseg;
> struct page *page_array[NFS_PAGEVEC_SIZE];
> };
>
From: Andy Adamson <[email protected]>
Data servers require a zero stateid seqid, and there is no advantage to not
doing the same for all NFSv4.1 I/O.
Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/nfs4xdr.c | 10 +++++++---
1 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4e2c168..2380c45 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1384,7 +1384,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
hdr->replen += decode_putrootfh_maxsz;
}
-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx)
+static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid)
{
nfs4_stateid stateid;
__be32 *p;
@@ -1392,6 +1392,8 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
p = reserve_space(xdr, NFS4_STATEID_SIZE);
if (ctx->state != NULL) {
nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
+ if (zero_seqid)
+ stateid.stateid.seqid = 0;
xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
} else
xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1404,7 +1406,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
p = reserve_space(xdr, 4);
*p = cpu_to_be32(OP_READ);
- encode_stateid(xdr, args->context, args->lock_context);
+ encode_stateid(xdr, args->context, args->lock_context,
+ hdr->minorversion);
p = reserve_space(xdr, 12);
p = xdr_encode_hyper(p, args->offset);
@@ -1592,7 +1595,8 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
p = reserve_space(xdr, 4);
*p = cpu_to_be32(OP_WRITE);
- encode_stateid(xdr, args->context, args->lock_context);
+ encode_stateid(xdr, args->context, args->lock_context,
+ hdr->minorversion);
p = reserve_space(xdr, 16);
p = xdr_encode_hyper(p, args->offset);
--
1.7.2.3
On 2011-02-15 03:39, [email protected] wrote:
> From: Andy Adamson <[email protected]>
>
> Use our own async error handler.
> Mark the layout as failed and retry i/o through the MDS on specified errors.
>
> Update the mds_offset in nfs_readpage_retry so that a failed short-read retry
> to a DS gets correctly resent through the MDS.
>
> Signed-off-by: Andy Adamson <[email protected]>
> ---
> fs/nfs/internal.h | 1 +
> fs/nfs/nfs4filelayout.c | 79 +++++++++++++++++++++++++++++++++++++++++++
> fs/nfs/nfs4proc.c | 33 +++++++++++++++---
> fs/nfs/nfs4state.c | 1 +
> fs/nfs/read.c | 1 +
> include/linux/nfs_xdr.h | 1 +
> include/linux/sunrpc/clnt.h | 1 +
> net/sunrpc/clnt.c | 8 ++++
> 8 files changed, 119 insertions(+), 6 deletions(-)
>
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index 5e9df99..1a3228e 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -285,6 +285,7 @@ extern int nfs_migrate_page(struct address_space *,
> #endif
>
> /* nfs4proc.c */
> +extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
> extern int nfs4_init_client(struct nfs_client *clp,
> const struct rpc_timeout *timeparms,
> const char *ip_addr,
> diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
> index f421ef0..9ae1a47e 100644
> --- a/fs/nfs/nfs4filelayout.c
> +++ b/fs/nfs/nfs4filelayout.c
> @@ -40,6 +40,8 @@ MODULE_LICENSE("GPL");
> MODULE_AUTHOR("Dean Hildebrand <[email protected]>");
> MODULE_DESCRIPTION("The NFSv4 file layout driver");
>
> +#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
> +
> static int
> filelayout_set_layoutdriver(struct nfs_server *nfss)
> {
> @@ -100,6 +102,81 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
> BUG();
> }
>
> +/* For data server errors we don't recover from */
> +static void
> +filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
> +{
> + if (lseg->pls_range.iomode == IOMODE_RW) {
> + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
> + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
> + } else {
> + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
> + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
> + }
> +}
> +
> +static int filelayout_async_handle_error(struct rpc_task *task,
> + struct nfs4_state *state,
> + struct nfs_client *clp,
> + int *reset)
> +{
> + if (task->tk_status >= 0)
> + return 0;
> + switch (task->tk_status) {
> + case -NFS4ERR_BADSESSION:
> + case -NFS4ERR_BADSLOT:
> + case -NFS4ERR_BAD_HIGH_SLOT:
> + case -NFS4ERR_DEADSESSION:
> + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
> + case -NFS4ERR_SEQ_FALSE_RETRY:
> + case -NFS4ERR_SEQ_MISORDERED:
> + dprintk("%s ERROR %d, Reset session. Exchangeid "
> + "flags 0x%x\n", __func__, task->tk_status,
> + clp->cl_exchange_flags);
> + nfs4_schedule_state_recovery(clp);
> + task->tk_status = 0;
> + return -EAGAIN;
> + case -NFS4ERR_DELAY:
> + case -NFS4ERR_GRACE:
> + case -EKEYEXPIRED:
> + rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
> + task->tk_status = 0;
> + return -EAGAIN;
> + default:
> + dprintk("%s DS error. Retry through MDS %d\n", __func__,
> + task->tk_status);
> + *reset = 1;
> + task->tk_status = 0;
> + return -EAGAIN;
Since all cases end with
task->tk_status = 0;
return -EAGAIN;
how about moving it out to the function body and break from the switch statement instead?
Also, *reset had better always be set when returning -EAGAIN. Although the current
caller sets its initial value, this is not documented anywhere and may break easily
in the future.
Benny
> + }
> +}
> +
> +/* NFS_PROTO call done callback routines */
> +
> +static int filelayout_read_done_cb(struct rpc_task *task,
> + struct nfs_read_data *data)
> +{
> + struct nfs_client *clp = data->ds_clp;
> + int reset = 0;
> +
> + dprintk("%s DS read\n", __func__);
> +
> + if (filelayout_async_handle_error(task, data->args.context->state,
> + data->ds_clp, &reset) == -EAGAIN) {
> + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
> + __func__, data->ds_clp, data->ds_clp->cl_session);
> + if (reset) {
> + nfs4_reset_read(task, data);
> + filelayout_set_lo_fail(data->lseg);
> + clp = NFS_SERVER(data->inode)->nfs_client;
> + }
> + nfs_restart_rpc(task, clp);
> + return -EAGAIN;
> + }
> +
> + return 0;
> +}
> +
> /*
> * Call ops for the async read/write cases
> * In the case of dense layouts, the offset needs to be reset to its
> @@ -109,6 +186,8 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
> {
> struct nfs_read_data *rdata = (struct nfs_read_data *)data;
>
> + rdata->read_done_cb = filelayout_read_done_cb;
> +
> if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
> &rdata->args.seq_args, &rdata->res.seq_res,
> 0, task))
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 64fb39b..f91e259 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -3075,15 +3075,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
> return err;
> }
>
> -static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
> +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
> {
> struct nfs_server *server = NFS_SERVER(data->inode);
>
> - dprintk("--> %s\n", __func__);
> -
> - if (!nfs4_sequence_done(task, &data->res.seq_res))
> - return -EAGAIN;
> -
> if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
> nfs_restart_rpc(task, server->nfs_client);
> return -EAGAIN;
> @@ -3095,12 +3090,38 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
> return 0;
> }
>
> +static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
> +{
> +
> + dprintk("--> %s\n", __func__);
> +
> + if (!nfs4_sequence_done(task, &data->res.seq_res))
> + return -EAGAIN;
> +
> + return data->read_done_cb(task, data);
> +}
> +
> static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
> {
> data->timestamp = jiffies;
> + data->read_done_cb = nfs4_read_done_cb;
> msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> }
>
> +/* Reset the nfs_read_data to send the read to the MDS. */
> +void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
> +{
> + dprintk("%s Reset task for i/o through\n", __func__);
> + /* offsets will differ in the dense stripe case */
> + data->args.offset = data->mds_offset;
> + data->ds_clp = NULL;
> + data->args.fh = NFS_FH(data->inode);
> + data->read_done_cb = nfs4_read_done_cb;
> + task->tk_ops = data->mds_ops;
> + rpc_task_reset_client(task, NFS_CLIENT(data->inode));
> +}
> +EXPORT_SYMBOL_GPL(nfs4_reset_read);
> +
> static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
> {
> struct inode *inode = data->inode;
> diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> index 9e33e88..6da026a 100644
> --- a/fs/nfs/nfs4state.c
> +++ b/fs/nfs/nfs4state.c
> @@ -1022,6 +1022,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
> set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
> nfs4_schedule_state_manager(clp);
> }
> +EXPORT_SYMBOL_GPL(nfs4_schedule_state_recovery);
>
> int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
> {
> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
> index 5fc4ecc..5c5fbac 100644
> --- a/fs/nfs/read.c
> +++ b/fs/nfs/read.c
> @@ -395,6 +395,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
> return;
>
> /* Yes, so retry the read at the end of the data */
> + data->mds_offset += resp->count;
> argp->offset += resp->count;
> argp->pgbase += resp->count;
> argp->count -= resp->count;
> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> index b63faef..eb0e870 100644
> --- a/include/linux/nfs_xdr.h
> +++ b/include/linux/nfs_xdr.h
> @@ -1020,6 +1020,7 @@ struct nfs_read_data {
> struct pnfs_layout_segment *lseg;
> struct nfs_client *ds_clp; /* pNFS data server */
> const struct rpc_call_ops *mds_ops;
> + int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
> __u64 mds_offset;
> struct page *page_array[NFS_PAGEVEC_SIZE];
> };
> diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
> index ef9476a..db7bcaf 100644
> --- a/include/linux/sunrpc/clnt.h
> +++ b/include/linux/sunrpc/clnt.h
> @@ -129,6 +129,7 @@ struct rpc_create_args {
> struct rpc_clnt *rpc_create(struct rpc_create_args *args);
> struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *,
> struct rpc_program *, u32);
> +void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt);
> struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
> void rpc_shutdown_client(struct rpc_clnt *);
> void rpc_release_client(struct rpc_clnt *);
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> index 57d344c..5c4df70 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -597,6 +597,14 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
> }
> }
>
> +void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt)
> +{
> + rpc_task_release_client(task);
> + rpc_task_set_client(task, clnt);
> +}
> +EXPORT_SYMBOL_GPL(rpc_task_reset_client);
> +
> +
> static void
> rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
> {
From: Andy Adamson <[email protected]>
Introduce a data server set_client and init session following the
nfs4_set_client and nfs4_init_session convention.
Once a new nfs_client is on the nfs_client_list, the nfs_client cl_cons_state
serializes access to creating an nfs_client struct with matching properties.
Use the new nfs_get_client() that initializes new clients.
Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/client.c | 41 +++++++++++++++++++++++++++++
fs/nfs/internal.h | 5 +++
fs/nfs/nfs4_fs.h | 12 ++++++++
fs/nfs/nfs4filelayoutdev.c | 61 ++++++++++++++++++++++++++++++++++++++++++++
fs/nfs/nfs4proc.c | 29 +++++++++++++++++++-
include/linux/nfs_xdr.h | 1 +
6 files changed, 147 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 4a501d0..738f624 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1423,6 +1423,47 @@ error:
return error;
}
+/*
+ * Set up a pNFS Data Server client.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+ const struct sockaddr *ds_addr,
+ int ds_addrlen, int ds_proto)
+{
+ struct nfs_client_initdata cl_init = {
+ .addr = ds_addr,
+ .addrlen = ds_addrlen,
+ .rpc_ops = &nfs_v4_clientops,
+ .proto = ds_proto,
+ .minorversion = mds_clp->cl_minorversion,
+ };
+ struct rpc_timeout ds_timeout = {
+ .to_initval = 15 * HZ,
+ .to_maxval = 15 * HZ,
+ .to_retries = 1,
+ .to_exponential = 1,
+ };
+ struct nfs_client *clp;
+
+ /*
+ * Set an authflavor equal to the MDS value. Use the MDS nfs_client
+ * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
+ * (section 13.1 RFC 5661).
+ */
+ clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+ mds_clp->cl_rpcclient->cl_auth->au_flavor, 0);
+
+ dprintk("<-- %s %p\n", __func__, clp);
+ return clp;
+}
+EXPORT_SYMBOL(nfs4_set_ds_client);
/*
* Session has been established, and the client marked ready.
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 4d7b3a9..5cc9201 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -148,6 +148,9 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
struct nfs_fattr *);
extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
extern int nfs4_check_client_ready(struct nfs_client *clp);
+extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+ const struct sockaddr *ds_addr,
+ int ds_addrlen, int ds_proto);
#ifdef CONFIG_PROC_FS
extern int __init nfs_fs_proc_init(void);
extern void nfs_fs_proc_exit(void);
@@ -213,6 +216,8 @@ extern const u32 nfs41_maxwrite_overhead;
extern struct rpc_procinfo nfs4_procedures[];
#endif
+extern int nfs4_init_ds_session(struct nfs_client *clp);
+
/* proc.c */
void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
extern int nfs_init_client(struct nfs_client *clp,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 5d84642..5dc378e 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -266,6 +266,12 @@ is_ds_only_client(struct nfs_client *clp)
return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
EXCHGID4_FLAG_USE_PNFS_DS;
}
+
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+ return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
+}
#else /* CONFIG_NFS_v4_1 */
static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
{
@@ -289,6 +295,12 @@ is_ds_only_client(struct nfs_client *clp)
{
return false;
}
+
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+ return false;
+}
#endif /* CONFIG_NFS_V4_1 */
extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index f5c9b12..8e21e65 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -104,6 +104,67 @@ _data_server_lookup_locked(u32 ip_addr, u32 port)
return NULL;
}
+/*
+ * Create an rpc connection to the nfs4_pnfs_ds data server
+ * Currently only supports IPv4
+ */
+static int
+nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
+{
+ struct nfs_client *clp;
+ struct sockaddr_in sin;
+ int status = 0;
+
+ dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
+ mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
+
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ds->ds_ip_addr;
+ sin.sin_port = ds->ds_port;
+
+ clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
+ sizeof(sin), IPPROTO_TCP);
+ if (IS_ERR(clp)) {
+ status = PTR_ERR(clp);
+ goto out;
+ }
+
+ if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) {
+ if (!is_ds_client(clp)) {
+ status = -ENODEV;
+ goto out_put;
+ }
+ ds->ds_clp = clp;
+ dprintk("%s [existing] ip=%x, port=%hu\n", __func__,
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+ goto out;
+ }
+
+ /*
+ * Do not set NFS_CS_CHECK_LEASE_TIME; instead set the DS lease to
+ * be equal to the MDS lease. Renewal is scheduled in create_session.
+ */
+ spin_lock(&mds_srv->nfs_client->cl_lock);
+ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
+ spin_unlock(&mds_srv->nfs_client->cl_lock);
+ clp->cl_last_renewal = jiffies;
+
+ /* New nfs_client */
+ status = nfs4_init_ds_session(clp);
+ if (status)
+ goto out_put;
+
+ ds->ds_clp = clp;
+ dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr),
+ ntohs(ds->ds_port));
+out:
+ return status;
+out_put:
+ nfs_put_client(clp);
+ goto out;
+}
+
static void
destroy_ds(struct nfs4_pnfs_ds *ds)
{
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7faec0f..df7d617 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1574,9 +1574,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
return 0;
}
-static int nfs4_recover_expired_lease(struct nfs_server *server)
+static int nfs4_client_recover_expired_lease(struct nfs_client *clp)
{
- struct nfs_client *clp = server->nfs_client;
unsigned int loop;
int ret;
@@ -1593,6 +1592,11 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
return ret;
}
+static int nfs4_recover_expired_lease(struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+
+	/* Per-server entry point; the recovery itself is done per nfs_client */
+	return nfs4_client_recover_expired_lease(clp);
+}
+
/*
* OPEN_EXPIRED:
* reclaim state on the server after a network partition.
@@ -5073,6 +5077,27 @@ int nfs4_init_session(struct nfs_server *server)
return ret;
}
+/*
+ * Initialize the session of a data server nfs_client.
+ *
+ * Returns 0 if NFS4_SESSION_INITING was not set (nothing to do), -ENODEV if
+ * the client does not have the DS role, otherwise the result of the lease
+ * recovery / client readiness checks.
+ */
+int nfs4_init_ds_session(struct nfs_client *clp)
+{
+	struct nfs4_session *session = clp->cl_session;
+	int ret;
+
+	if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
+		return 0;
+
+	ret = nfs4_client_recover_expired_lease(clp);
+	if (ret)
+		return ret;
+
+	/* Test for the DS role */
+	if (!is_ds_client(clp))
+		return -ENODEV;
+
+	return nfs4_check_client_ready(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
+
+
/*
* Renew the cl_session lease.
*/
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 9d2b9da..c66ff7f 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1018,6 +1018,7 @@ struct nfs_read_data {
struct nfs_readres res;
unsigned long timestamp; /* For lease renewal */
struct pnfs_layout_segment *lseg;
+ struct nfs_client *ds_clp; /* pNFS data server */
const struct rpc_call_ops *mds_ops;
struct page *page_array[NFS_PAGEVEC_SIZE];
};
--
1.7.2.3
On Wed, Feb 16, 2011 at 4:09 PM, Fred Isaman <[email protected]> wrote:
> On Wed, Feb 16, 2011 at 3:08 PM, Benny Halevy <[email protected]> wrote:
>> On 2011-02-16 14:55, Fred Isaman wrote:
>>>
>>> On Feb 16, 2011, at 2:42 PM, Benny Halevy wrote:
>>>
>>>> On 2011-02-15 03:38, [email protected] wrote:
>>>>> From: Fred Isaman <[email protected]>
>>>>>
>>>>> Move the pnfs_update_layout call location to nfs_pageio_do_add_request().
>>>>> Grab the lseg sent in the doio function to nfs_read_rpcsetup and attach
>>>>> it to each nfs_read_data so it can be sent to the layout driver.
>>>>>
>>>>> @@ -131,10 +132,12 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
>>>>>             zero_user_segment(page, len, PAGE_CACHE_SIZE);
>>>>>
>>>>>     nfs_list_add_request(new, &one_request);
>>>>> +   lseg = pnfs_update_layout(inode, ctx, IOMODE_READ);
>>>>>     if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
>>>>> -           nfs_pagein_multi(inode, &one_request, 1, len, 0);
>>>>> +           nfs_pagein_multi(inode, &one_request, 1, len, 0, lseg);
>>>>>     else
>>>>> -           nfs_pagein_one(inode, &one_request, 1, len, 0);
>>>>> +           nfs_pagein_one(inode, &one_request, 1, len, 0, lseg);
>>>>> +   put_lseg(lseg);
>>>>>     return 0;
>>>>> }
>>>>>
>>>>> @@ -160,7 +163,8 @@ static void nfs_readpage_release(struct nfs_page *req)
>>>>>  */
>>>>> static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
>>>>>             const struct rpc_call_ops *call_ops,
>>>>> -           unsigned int count, unsigned int offset)
>>>>> +           unsigned int count, unsigned int offset,
>>>>> +           struct pnfs_layout_segment *lseg)
>>>>> {
>>>>>     struct inode *inode = req->wb_context->path.dentry->d_inode;
>>>>>     int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
>>>>> @@ -183,6 +187,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
>>>>>     data->req         = req;
>>>>>     data->inode       = inode;
>>>>>     data->cred        = msg.rpc_cred;
>>>>> +   data->lseg        = get_lseg(lseg);
>>>>>
>>>>>     data->args.fh     = NFS_FH(inode);
>>>>>     data->args.offset = req_offset(req) + offset;
>>>>> @@ -240,7 +245,7 @@ nfs_async_read_error(struct list_head *head)
>>>>>  * won't see the new data until our attribute cache is updated.  This is more
>>>>>  * or less conventional NFS client behavior.
>>>>>  */
>>>>> -static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
>>>>> +static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
>>>>> {
>>>>>     struct nfs_page *req = nfs_list_entry(head->next);
>>>>>     struct page *page = req->wb_page;
>>>>> @@ -266,6 +271,8 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
>>>>>     } while(nbytes != 0);
>>>>>     atomic_set(&req->wb_complete, requests);
>>>>>
>>>>> + ? /* We know lseg==NULL */
>>
>> Can you provide more details?
>> If it's always NULL why bother to pass it in?
>>
>>>>> + ? lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
>>>>> ? ? ClearPageError(page);
>>>>> ? ? offset = 0;
>>>>> ? ? nbytes = count;
>>>>> @@ -280,12 +287,13 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
>>>>> ? ? ? ? ? ? if (nbytes < rsize)
>>>>> ? ? ? ? ? ? ? ? ? ? rsize = nbytes;
>>>>> ? ? ? ? ? ? ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
>>>>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? rsize, offset);
>>>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?rsize, offset, lseg);
>>>>> ? ? ? ? ? ? if (ret == 0)
>>>>> ? ? ? ? ? ? ? ? ? ? ret = ret2;
>>>>> ? ? ? ? ? ? offset += rsize;
>>>>> ? ? ? ? ? ? nbytes -= rsize;
>>>>> ? ? } while (nbytes != 0);
>>>>> + ? put_lseg(lseg);
>>>>>
>>>>> ? ? return ret;
>>>>>
>>>>> @@ -300,7 +308,7 @@ out_bad:
>>>>> ? ? return -ENOMEM;
>>>>> }
>>>>>
>>>>> -static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
>>>>> +static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
>>>>> {
>>>>> ? ? struct nfs_page ? ? ? ? *req;
>>>>> ? ? struct page ? ? ? ? ? ? **pages;
>>>>> @@ -320,9 +328,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
>>>>> ? ? ? ? ? ? *pages++ = req->wb_page;
>>>>> ? ? }
>>>>> ? ? req = nfs_list_entry(data->pages.next);
>>>>> + ? if ((!lseg) && list_is_singular(&data->pages))
>>>>> + ? ? ? ? ? lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
>>
>> When is lseg NULL and why getting it here works better than in nfs_readpage_async?
>>
>>>>>
>>>>> - ? return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
>>>>> + ? ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0, lseg);
>>>>> + ? put_lseg(lseg);
>>>>
>>>> Shouldn't that be done only if pnfs_update_layout was called here?
>>>> Otherwise, the caller, nfs_readpage_async puts the lseg it passes down.
>>>>
>>>
>>> You are right, there is a problem.  But it needs to be fixed by removing the put_lseg from nfs_readpage_async.
>>>
>>>
>>
>> If we can avoid getting the lseg in one place and putting it in another that would be better.
>>
>> Benny
>>
>
> I agree, but I don't see how it is possible with the current code,
> where the pnfs_update_layout occurs in pg_test.
>
> Fred
>
Actually, in this case we can remove both the getting and the putting
entirely from nfs_readpage_async, and pass in a NULL lseg. The
->pg_doio functions will handle it correctly.
Fred
>>>>> + ? return ret;
>>>>> out_bad:
>>>>> + ? put_lseg(lseg);
>>>>
>>>> I'd unify the common exit path by doing nfs_async_read_error on the error path
>>>> and then goto out for the common code.
>>>>
>>>
>>> OK.
>>>
>>> Fred
>>>
>>
>
From: Fred Isaman <[email protected]>
Prepare for filelayout_read_pagelist with helper functions that find the correct
data server, filehandle, and offset.
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Dean Hildebrand <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Marc Eshel <[email protected]>
Signed-off-by: Mike Sager <[email protected]>
Signed-off-by: Oleg Drokin <[email protected]>
Signed-off-by: Tao Guo <[email protected]>
Signed-off-by: Tigran Mkrtchyan <[email protected]>
Signed-off-by: Tigran Mkrtchyan <[email protected]>
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4filelayout.c | 34 ++++++++++++++++++++++
fs/nfs/nfs4filelayout.h | 7 ++++
fs/nfs/nfs4filelayoutdev.c | 67 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 108 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 98e26e0..a2cde39 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -66,6 +66,40 @@ filelayout_clear_layoutdriver(struct nfs_server *nfss)
return 0;
}
+/*
+ * Compute the data server offset for a dense layout.
+ *
+ * With rel = offset - pattern_offset and
+ * stripe_width = stripe_unit * stripe_count, the result is
+ *	(rel / stripe_width) * stripe_unit + (rel % stripe_unit)
+ */
+static loff_t
+filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
+			    loff_t offset)
+{
+	u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
+	u64 stripe_no;
+	u64 rel;
+	u32 rem;
+
+	offset -= flseg->pattern_offset;
+
+	/*
+	 * do_div() requires an unsigned 64-bit lvalue and overwrites it with
+	 * the quotient, so never apply it to the signed loff_t directly:
+	 * work on u64 copies instead.
+	 */
+	stripe_no = offset;
+	do_div(stripe_no, stripe_width);
+
+	rel = offset;
+	rem = do_div(rel, flseg->stripe_unit);
+
+	return stripe_no * flseg->stripe_unit + rem;
+}
+
+/*
+ * Map a file offset to the offset used on the data server, depending on
+ * the stripe type of the layout segment: STRIPE_SPARSE uses the file offset
+ * unchanged, STRIPE_DENSE goes through filelayout_get_dense_offset().
+ * Any other stripe type hits BUG().
+ */
+static loff_t
+filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+	if (flseg->stripe_type == STRIPE_SPARSE)
+		return offset;
+	if (flseg->stripe_type == STRIPE_DENSE)
+		return filelayout_get_dense_offset(flseg, offset);
+
+	BUG();
+}
+
/*
* filelayout_check_layout()
*
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index bbf60dd..9fef76e 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -83,9 +83,16 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
generic_hdr);
}
+extern struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
+
extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
extern void print_ds(struct nfs4_pnfs_ds *ds);
extern void print_deviceid(struct nfs4_deviceid *dev_id);
+u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
+u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
+struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
+ u32 ds_idx);
extern struct nfs4_file_layout_dsaddr *
nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
struct nfs4_file_layout_dsaddr *
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 8e21e65..e8496f3 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -512,3 +512,70 @@ nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
return (d == NULL) ? NULL :
container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
}
+
+/*
+ * Compute the stripe index j for a file offset:
+ *	stripe_no = (offset - pattern_offset) / stripe_unit
+ *	j = (stripe_no + first_stripe_index) % dsaddr->stripe_count
+ */
+u32
+nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+	u64 stripe_no = offset - flseg->pattern_offset;
+
+	/* do_div() leaves the quotient in stripe_no and returns the remainder */
+	do_div(stripe_no, flseg->stripe_unit);
+	stripe_no += flseg->first_stripe_index;
+	return do_div(stripe_no, flseg->dsaddr->stripe_count);
+}
+
+/* Look up entry @j of the device address's stripe_indices table. */
+u32
+nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+	return flseg->dsaddr->stripe_indices[j];
+}
+
+/*
+ * Pick the filehandle to use for stripe index @j.
+ *
+ * Non-sparse layouts index fh_array by @j directly.  Sparse layouts use the
+ * single filehandle when num_fh == 1, return NULL when num_fh == 0, and
+ * otherwise index fh_array by the data server index derived from @j.
+ */
+struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+	if (flseg->stripe_type != STRIPE_SPARSE)
+		return flseg->fh_array[j];
+
+	switch (flseg->num_fh) {
+	case 0:
+		/* Use the MDS OPEN fh set in nfs_read_rpcsetup */
+		return NULL;
+	case 1:
+		return flseg->fh_array[0];
+	default:
+		return flseg->fh_array[nfs4_fl_calc_ds_index(lseg, j)];
+	}
+}
+
+/*
+ * Return the data server stored at @ds_idx in the layout segment's device
+ * address, connecting to it first if it has no nfs_client (ds_clp) yet.
+ *
+ * Returns NULL if the slot is empty or if nfs4_ds_connect() fails.
+ */
+struct nfs4_pnfs_ds *
+nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
+	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
+	int err;
+
+	if (ds == NULL) {
+		/* ds_idx is a u32, so print it as unsigned */
+		printk(KERN_ERR "%s: No data server for offset index %u\n",
+			__func__, ds_idx);
+		return NULL;
+	}
+
+	/* Already connected: nothing more to prepare */
+	if (ds->ds_clp)
+		return ds;
+
+	err = nfs4_ds_connect(NFS_SERVER(lseg->pls_layout->plh_inode), ds);
+	if (err) {
+		printk(KERN_ERR "%s nfs4_ds_connect error %d\n",
+			__func__, err);
+		return NULL;
+	}
+	return ds;
+}
--
1.7.2.3
From: Andy Adamson <[email protected]>
Separate the rpc run portion of nfs_read_rpcsetup into a new function
nfs_initiate_read that is called for normal NFS I/O.
Add a pNFS read_pagelist function that is called instead of nfs_initiate_read
for pNFS reads.
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Boaz Harrosh <[email protected]>
Signed-off-by: Dean Hildebrand <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Mike Sager <[email protected]>
Signed-off-by: Mingyang Guo <[email protected]>
Signed-off-by: Ricardo Labiaga <[email protected]>
Signed-off-by: Tao Guo <[email protected]>
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/pnfs.c | 28 +++++++++++++++++++
fs/nfs/pnfs.h | 20 +++++++++++++
fs/nfs/read.c | 65 +++++++++++++++++++++++++++-----------------
include/linux/nfs_iostat.h | 1 +
include/linux/nfs_xdr.h | 1 +
5 files changed, 90 insertions(+), 25 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a09e3a0..92c55a4 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -30,6 +30,7 @@
#include <linux/nfs_fs.h>
#include "internal.h"
#include "pnfs.h"
+#include "iostat.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
@@ -883,6 +884,33 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
}
/*
+ * Call the appropriate parallel I/O subsystem read function.
+ */
+enum pnfs_try_status
+pnfs_try_to_read_data(struct nfs_read_data *rdata,
+		      const struct rpc_call_ops *call_ops)
+{
+	struct inode *inode = rdata->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	enum pnfs_try_status status;
+
+	/* Record the caller's rpc_call_ops before invoking the layout driver */
+	rdata->mds_ops = call_ops;
+
+	dprintk("%s: Reading ino:%lu %u@%llu\n",
+		__func__, inode->i_ino, rdata->args.count, rdata->args.offset);
+
+	status = server->pnfs_curr_ld->read_pagelist(rdata);
+	if (status != PNFS_NOT_ATTEMPTED) {
+		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
+	} else {
+		/* Driver did not attempt the I/O: drop rdata's lseg reference */
+		put_lseg(rdata->lseg);
+		rdata->lseg = NULL;
+	}
+	dprintk("%s End (trypnfs:%d)\n", __func__, status);
+	return status;
+}
+
+/*
* Device ID cache. Currently supports one layout type per struct nfs_client.
* Add layout type to the lookup key to expand to support multiple types.
*/
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 5107d14..585023f 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -45,6 +45,11 @@ struct pnfs_layout_segment {
struct pnfs_layout_hdr *pls_layout;
};
+/*
+ * Result of handing an I/O request to the layout driver:
+ * PNFS_ATTEMPTED means the layout code has attempted the I/O,
+ * PNFS_NOT_ATTEMPTED means the caller should fall back to normal NFS.
+ */
+enum pnfs_try_status {
+ PNFS_ATTEMPTED = 0,
+ PNFS_NOT_ATTEMPTED = 1,
+};
+
#ifdef CONFIG_NFS_V4_1
#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
@@ -70,6 +75,12 @@ struct pnfs_layoutdriver_type {
/* test for nfs page cache coalescing */
int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+
+ /*
+ * Return PNFS_ATTEMPTED to indicate the layout code has attempted
+ * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
+ */
+ enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
};
struct pnfs_layout_hdr {
@@ -157,6 +168,8 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
enum pnfs_iomode access_type);
void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
void unset_pnfs_layoutdriver(struct nfs_server *);
+enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
+ const struct rpc_call_ops *);
void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_free_lseg_list(struct list_head *tmp_list);
@@ -227,6 +240,13 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
return NULL;
}
+/*
+ * Stub variant: never attempts pNFS I/O and always reports
+ * PNFS_NOT_ATTEMPTED, so the caller uses the regular read path.
+ * NOTE(review): presumably the build without CONFIG_NFS_V4_1 - confirm
+ * against the surrounding #ifdef.
+ */
+static inline enum pnfs_try_status
+pnfs_try_to_read_data(struct nfs_read_data *data,
+ const struct rpc_call_ops *call_ops)
+{
+ return PNFS_NOT_ATTEMPTED;
+}
+
static inline bool
pnfs_roc(struct inode *ino)
{
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 7896e3d..5e6f7cc 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -18,6 +18,8 @@
#include <linux/sunrpc/clnt.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
#include <asm/system.h>
#include "pnfs.h"
@@ -158,25 +160,20 @@ static void nfs_readpage_release(struct nfs_page *req)
nfs_release_request(req);
}
-/*
- * Set up the NFS read request struct
- */
-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
- const struct rpc_call_ops *call_ops,
- unsigned int count, unsigned int offset,
- struct pnfs_layout_segment *lseg)
+static int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops)
{
- struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct inode *inode = data->inode;
int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
struct rpc_task *task;
struct rpc_message msg = {
.rpc_argp = &data->args,
.rpc_resp = &data->res,
- .rpc_cred = req->wb_context->cred,
+ .rpc_cred = data->cred,
};
struct rpc_task_setup task_setup_data = {
.task = &data->task,
- .rpc_client = NFS_CLIENT(inode),
+ .rpc_client = clnt,
.rpc_message = &msg,
.callback_ops = call_ops,
.callback_data = data,
@@ -184,9 +181,37 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
.flags = RPC_TASK_ASYNC | swap_flags,
};
+ /* Set up the initial task struct. */
+ NFS_PROTO(inode)->read_setup(data, &msg);
+
+ dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ "
+ "offset %llu)\n",
+ data->task.tk_pid,
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ data->args.count,
+ (unsigned long long)data->args.offset);
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task);
+ return 0;
+}
+
+/*
+ * Set up the NFS read request struct
+ */
+static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
+ const struct rpc_call_ops *call_ops,
+ unsigned int count, unsigned int offset,
+ struct pnfs_layout_segment *lseg)
+{
+ struct inode *inode = req->wb_context->path.dentry->d_inode;
+
data->req = req;
data->inode = inode;
- data->cred = msg.rpc_cred;
+ data->cred = req->wb_context->cred;
data->lseg = get_lseg(lseg);
data->args.fh = NFS_FH(inode);
@@ -202,21 +227,11 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
data->res.eof = 0;
nfs_fattr_init(&data->fattr);
- /* Set up the initial task struct. */
- NFS_PROTO(inode)->read_setup(data, &msg);
-
- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- count,
- (unsigned long long)data->args.offset);
+ if (data->lseg &&
+ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
+ return 0;
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task))
- return PTR_ERR(task);
- rpc_put_task(task);
- return 0;
+ return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
}
static void
diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h
index 68b10f5..37a1437 100644
--- a/include/linux/nfs_iostat.h
+++ b/include/linux/nfs_iostat.h
@@ -113,6 +113,7 @@ enum nfs_stat_eventcounters {
NFSIOS_SHORTREAD,
NFSIOS_SHORTWRITE,
NFSIOS_DELAY,
+ NFSIOS_PNFS_READ,
__NFSIOS_COUNTSMAX,
};
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 560923e..9d2b9da 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1018,6 +1018,7 @@ struct nfs_read_data {
struct nfs_readres res;
unsigned long timestamp; /* For lease renewal */
struct pnfs_layout_segment *lseg;
+ const struct rpc_call_ops *mds_ops;
struct page *page_array[NFS_PAGEVEC_SIZE];
};
--
1.7.2.3
On Feb 16, 2011, at 2:42 PM, Benny Halevy wrote:
> On 2011-02-15 03:38, [email protected] wrote:
>> From: Fred Isaman <[email protected]>
>>
>> Move the pnfs_update_layout call location to nfs_pageio_do_add_request().
>> Grab the lseg sent in the doio function to nfs_read_rpcsetup and attach
>> it to each nfs_read_data so it can be sent to the layout driver.
>>
>> Signed-off-by: Andy Adamon <[email protected]>
>> Signed-off-by: Andy Adamon <[email protected]>
>> Signed-off-by: Dean Hildebrand <[email protected]>
>> Signed-off-by: Fred Isaman <[email protected]>
>> Signed-off-by: Fred Isaman <[email protected]>
>> Signed-off-by: Benny Halevy <[email protected]>
>> Signed-off-by: Boaz Harrosh <[email protected]>
>> Signed-off-by: Oleg Drokin <[email protected]>
>> Signed-off-by: Tao Guo <[email protected]>
>> ---
>> fs/nfs/file.c | 4 ----
>> fs/nfs/pagelist.c | 6 ++++--
>> fs/nfs/pnfs.c | 27 ++++++++++++++++-----------
>> fs/nfs/pnfs.h | 1 +
>> fs/nfs/read.c | 36 ++++++++++++++++++++++++------------
>> fs/nfs/write.c | 4 ++--
>> include/linux/nfs_page.h | 4 ++--
>> include/linux/nfs_xdr.h | 1 +
>> 8 files changed, 50 insertions(+), 33 deletions(-)
>>
>> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> index 7bf029e..d85a534 100644
>> --- a/fs/nfs/file.c
>> +++ b/fs/nfs/file.c
>> @@ -387,10 +387,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
>> file->f_path.dentry->d_name.name,
>> mapping->host->i_ino, len, (long long) pos);
>>
>> - pnfs_update_layout(mapping->host,
>> - nfs_file_open_context(file),
>> - IOMODE_RW);
>> -
>> start:
>> /*
>> * Prevent starvation issues if someone is doing a consistency
>> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
>> index 9b9a65c..b49cb4b 100644
>> --- a/fs/nfs/pagelist.c
>> +++ b/fs/nfs/pagelist.c
>> @@ -20,6 +20,7 @@
>> #include <linux/nfs_mount.h>
>>
>> #include "internal.h"
>> +#include "pnfs.h"
>>
>> static struct kmem_cache *nfs_page_cachep;
>>
>> @@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req)
>> */
>> void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
>> struct inode *inode,
>> - int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
>> + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *),
>> size_t bsize,
>> int io_flags)
>> {
>> @@ -315,7 +316,8 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
>> nfs_page_array_len(desc->pg_base,
>> desc->pg_count),
>> desc->pg_count,
>> - desc->pg_ioflags);
>> + desc->pg_ioflags,
>> + desc->pg_lseg);
>> if (error < 0)
>> desc->pg_error = error;
>> else
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index d12f463..a09e3a0 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -245,7 +245,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg)
>> rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
>> }
>>
>> -static void
>> +void
>> put_lseg(struct pnfs_layout_segment *lseg)
>> {
>> struct inode *inode;
>> @@ -784,7 +784,6 @@ pnfs_update_layout(struct inode *ino,
>> out:
>> dprintk("%s end, state 0x%lx lseg %p\n", __func__,
>> nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
>> - put_lseg(lseg); /* STUB - callers currently ignore return value */
>> return lseg;
>> out_unlock:
>> spin_unlock(&ino->i_lock);
>> @@ -858,23 +857,29 @@ out_forget_reply:
>> goto out;
>> }
>>
>> -static void
>> -pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio)
>> +static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
>> + struct nfs_page *prev,
>> + struct nfs_page *req)
>> {
>> - struct pnfs_layoutdriver_type *ld;
>> -
>> - ld = NFS_SERVER(inode)->pnfs_curr_ld;
>> - pgio->pg_test = (ld ? ld->pg_test : NULL);
>> + if (pgio->pg_count == prev->wb_bytes) {
>> + /* This is first coelesce call for a series of nfs_pages */
>> + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
>> + prev->wb_context,
>> + IOMODE_READ);
>> + }
>> + return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
>> }
>>
>> /*
>> * rsize is already set by caller to MDS rsize.
>> */
>> void
>> -pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
>> - struct inode *inode)
>> +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
>> {
>> - pnfs_set_pg_test(inode, pgio);
>> + struct pnfs_layoutdriver_type *ld;
>> +
>> + ld = NFS_SERVER(inode)->pnfs_curr_ld;
>> + pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
>> }
>>
>> /*
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index db52d96..5107d14 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -151,6 +151,7 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
>>
>> /* pnfs.c */
>> void get_layout_hdr(struct pnfs_layout_hdr *lo);
>> +void put_lseg(struct pnfs_layout_segment *lseg);
>> struct pnfs_layout_segment *
>> pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
>> enum pnfs_iomode access_type);
>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
>> index 2a27659..7896e3d 100644
>> --- a/fs/nfs/read.c
>> +++ b/fs/nfs/read.c
>> @@ -20,17 +20,17 @@
>> #include <linux/nfs_page.h>
>>
>> #include <asm/system.h>
>> +#include "pnfs.h"
>>
>> #include "nfs4_fs.h"
>> #include "internal.h"
>> #include "iostat.h"
>> #include "fscache.h"
>> -#include "pnfs.h"
>>
>> #define NFSDBG_FACILITY NFSDBG_PAGECACHE
>>
>> -static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int);
>> -static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int);
>> +static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *);
>> +static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *);
>> static const struct rpc_call_ops nfs_read_partial_ops;
>> static const struct rpc_call_ops nfs_read_full_ops;
>>
>> @@ -69,6 +69,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
>>
>> static void nfs_readdata_release(struct nfs_read_data *rdata)
>> {
>> + put_lseg(rdata->lseg);
>> put_nfs_open_context(rdata->args.context);
>> nfs_readdata_free(rdata);
>> }
>> @@ -117,11 +118,11 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
>> LIST_HEAD(one_request);
>> struct nfs_page *new;
>> unsigned int len;
>> + struct pnfs_layout_segment *lseg;
>>
>> len = nfs_page_length(page);
>> if (len == 0)
>> return nfs_return_empty_page(page);
>> - pnfs_update_layout(inode, ctx, IOMODE_READ);
>> new = nfs_create_request(ctx, inode, page, 0, len);
>> if (IS_ERR(new)) {
>> unlock_page(page);
>> @@ -131,10 +132,12 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
>> zero_user_segment(page, len, PAGE_CACHE_SIZE);
>>
>> nfs_list_add_request(new, &one_request);
>> + lseg = pnfs_update_layout(inode, ctx, IOMODE_READ);
>> if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
>> - nfs_pagein_multi(inode, &one_request, 1, len, 0);
>> + nfs_pagein_multi(inode, &one_request, 1, len, 0, lseg);
>> else
>> - nfs_pagein_one(inode, &one_request, 1, len, 0);
>> + nfs_pagein_one(inode, &one_request, 1, len, 0, lseg);
>> + put_lseg(lseg);
>> return 0;
>> }
>>
>> @@ -160,7 +163,8 @@ static void nfs_readpage_release(struct nfs_page *req)
>> */
>> static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
>> const struct rpc_call_ops *call_ops,
>> - unsigned int count, unsigned int offset)
>> + unsigned int count, unsigned int offset,
>> + struct pnfs_layout_segment *lseg)
>> {
>> struct inode *inode = req->wb_context->path.dentry->d_inode;
>> int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
>> @@ -183,6 +187,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
>> data->req = req;
>> data->inode = inode;
>> data->cred = msg.rpc_cred;
>> + data->lseg = get_lseg(lseg);
>>
>> data->args.fh = NFS_FH(inode);
>> data->args.offset = req_offset(req) + offset;
>> @@ -240,7 +245,7 @@ nfs_async_read_error(struct list_head *head)
>> * won't see the new data until our attribute cache is updated. This is more
>> * or less conventional NFS client behavior.
>> */
>> -static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
>> +static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
>> {
>> struct nfs_page *req = nfs_list_entry(head->next);
>> struct page *page = req->wb_page;
>> @@ -266,6 +271,8 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
>> } while(nbytes != 0);
>> atomic_set(&req->wb_complete, requests);
>>
>> + /* We know lseg==NULL */
>> + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
>> ClearPageError(page);
>> offset = 0;
>> nbytes = count;
>> @@ -280,12 +287,13 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
>> if (nbytes < rsize)
>> rsize = nbytes;
>> ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
>> - rsize, offset);
>> + rsize, offset, lseg);
>> if (ret == 0)
>> ret = ret2;
>> offset += rsize;
>> nbytes -= rsize;
>> } while (nbytes != 0);
>> + put_lseg(lseg);
>>
>> return ret;
>>
>> @@ -300,7 +308,7 @@ out_bad:
>> return -ENOMEM;
>> }
>>
>> -static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
>> +static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
>> {
>> struct nfs_page *req;
>> struct page **pages;
>> @@ -320,9 +328,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
>> *pages++ = req->wb_page;
>> }
>> req = nfs_list_entry(data->pages.next);
>> + if ((!lseg) && list_is_singular(&data->pages))
>> + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
>>
>> - return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
>> + ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0, lseg);
>> + put_lseg(lseg);
>
> Shouldn't that be done only if pnfs_update_layout was called here?
> Otherwise, the caller, nfs_readpage_async puts the lseg it passes down.
>
You are right there is a problem. But it needs to be fixed by removing the put_lseg from nfs_readpage_async.
>> + return ret;
>> out_bad:
>> + put_lseg(lseg);
>
> I'd unify the common exit path by doing nfs_async_read_error on the error path
> and then goto out for the common code.
>
OK.
Fred
> Benny
>
>> nfs_async_read_error(head);
>> return ret;
>> }
>> @@ -625,7 +638,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
>> if (ret == 0)
>> goto read_complete; /* all pages were read */
>>
>> - pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
>> pnfs_pageio_init_read(&pgio, inode);
>> if (rsize < PAGE_CACHE_SIZE)
>> nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> index 6e90cdf..aca0268 100644
>> --- a/fs/nfs/write.c
>> +++ b/fs/nfs/write.c
>> @@ -880,7 +880,7 @@ static void nfs_redirty_request(struct nfs_page *req)
>> * Generate multiple small requests to write out a single
>> * contiguous dirty area on one page.
>> */
>> -static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
>> +static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg)
>> {
>> struct nfs_page *req = nfs_list_entry(head->next);
>> struct page *page = req->wb_page;
>> @@ -947,7 +947,7 @@ out_bad:
>> * This is the case if nfs_updatepage detects a conflicting request
>> * that has been written but not committed.
>> */
>> -static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
>> +static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg)
>> {
>> struct nfs_page *req;
>> struct page **pages;
>> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
>> index 4eaf27a..ba88ff4 100644
>> --- a/include/linux/nfs_page.h
>> +++ b/include/linux/nfs_page.h
>> @@ -59,7 +59,7 @@ struct nfs_pageio_descriptor {
>> unsigned int pg_base;
>>
>> struct inode *pg_inode;
>> - int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int);
>> + int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *);
>> int pg_ioflags;
>> int pg_error;
>> struct pnfs_layout_segment *pg_lseg;
>> @@ -81,7 +81,7 @@ extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst,
>> pgoff_t idx_start, unsigned int npages, int tag);
>> extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
>> struct inode *inode,
>> - int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
>> + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *),
>> size_t bsize,
>> int how);
>> extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
>> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
>> index d159fe7..560923e 100644
>> --- a/include/linux/nfs_xdr.h
>> +++ b/include/linux/nfs_xdr.h
>> @@ -1017,6 +1017,7 @@ struct nfs_read_data {
>> struct nfs_readargs args;
>> struct nfs_readres res;
>> unsigned long timestamp; /* For lease renewal */
>> + struct pnfs_layout_segment *lseg;
>> struct page *page_array[NFS_PAGEVEC_SIZE];
>> };
>>
From: Fred Isaman <[email protected]>
Prepare put_lseg and get_lseg to be called from the pNFS I/O code.
Pull common code from put_lseg_locked to call from put_lseg.
Inline put_lseg_locked into its only caller.
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/pnfs.c | 62 +++++++++++++++++++++++++++++++++++---------------------
fs/nfs/pnfs.h | 20 ++++++++++++++++++
2 files changed, 59 insertions(+), 23 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 7d031cd..1173434 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -230,32 +230,41 @@ static void free_lseg(struct pnfs_layout_segment *lseg)
put_layout_hdr(NFS_I(ino)->layout);
}
-/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
- * could sleep, so must be called outside of the lock.
- * Returns 1 if object was removed, otherwise return 0.
- */
-static int
-put_lseg_locked(struct pnfs_layout_segment *lseg,
- struct list_head *tmp_list)
+static void
+put_lseg_common(struct pnfs_layout_segment *lseg)
+{
+ struct inode *inode = lseg->pls_layout->plh_inode;
+
+ BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ list_del(&lseg->pls_list);
+ if (list_empty(&lseg->pls_layout->plh_segs)) {
+ set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
+ /* Matched by initial refcount set in alloc_init_layout_hdr */
+ put_layout_hdr_locked(lseg->pls_layout);
+ }
+ rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
+}
+
+static void
+put_lseg(struct pnfs_layout_segment *lseg)
{
+ struct inode *inode;
+
+ if (!lseg)
+ return;
+
dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
atomic_read(&lseg->pls_refcount),
test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
- if (atomic_dec_and_test(&lseg->pls_refcount)) {
- struct inode *ino = lseg->pls_layout->plh_inode;
+ inode = lseg->pls_layout->plh_inode;
+ if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
+ LIST_HEAD(free_me);
- BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
- list_del(&lseg->pls_list);
- if (list_empty(&lseg->pls_layout->plh_segs)) {
- set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
- /* Matched by initial refcount set in alloc_init_layout_hdr */
- put_layout_hdr_locked(lseg->pls_layout);
- }
- rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
- list_add(&lseg->pls_list, tmp_list);
- return 1;
+ put_lseg_common(lseg);
+ list_add(&lseg->pls_list, &free_me);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&free_me);
}
- return 0;
}
static bool
@@ -276,7 +285,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
* list. It will now be removed when all
* outstanding io is finished.
*/
- rv = put_lseg_locked(lseg, tmp_list);
+ dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+ atomic_read(&lseg->pls_refcount));
+ if (atomic_dec_and_test(&lseg->pls_refcount)) {
+ put_lseg_common(lseg);
+ list_add(&lseg->pls_list, tmp_list);
+ rv = 1;
+ }
}
return rv;
}
@@ -689,7 +704,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
is_matching_lseg(lseg, iomode)) {
- ret = lseg;
+ ret = get_lseg(lseg);
break;
}
if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
@@ -769,6 +784,7 @@ pnfs_update_layout(struct inode *ino,
out:
dprintk("%s end, state 0x%lx lseg %p\n", __func__,
nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
+ put_lseg(lseg); /* STUB - callers currently ignore return value */
return lseg;
out_unlock:
spin_unlock(&ino->i_lock);
@@ -821,7 +837,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
}
init_lseg(lo, lseg);
lseg->pls_range = res->range;
- *lgp->lsegpp = lseg;
+ *lgp->lsegpp = get_lseg(lseg);
pnfs_insert_layout(lo, lseg);
if (res->return_on_close) {
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e2612ea..9a994bc 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -177,6 +177,16 @@ static inline int lo_fail_bit(u32 iomode)
NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
}
+static inline struct pnfs_layout_segment *
+get_lseg(struct pnfs_layout_segment *lseg)
+{
+ if (lseg) {
+ atomic_inc(&lseg->pls_refcount);
+ smp_mb__after_atomic_inc();
+ }
+ return lseg;
+}
+
/* Return true if a layout driver is being used for this mountpoint */
static inline int pnfs_enabled_sb(struct nfs_server *nfss)
{
@@ -194,6 +204,16 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
}
static inline struct pnfs_layout_segment *
+get_lseg(struct pnfs_layout_segment *lseg)
+{
+ return NULL;
+}
+
+static inline void put_lseg(struct pnfs_layout_segment *lseg)
+{
+}
+
+static inline struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
enum pnfs_iomode access_type)
{
--
1.7.2.3
From: Andy Adamson <[email protected]>
The DS only role cannot be used to mount.
Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/client.c | 6 ++++++
fs/nfs/nfs4_fs.h | 13 +++++++++++++
2 files changed, 19 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 280d41f6..4a501d0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1400,6 +1400,12 @@ static int nfs4_set_client(struct nfs_server *server,
goto error;
}
+ /* Cannot mount a DS only server */
+ if (is_ds_only_client(clp)) {
+ error = -ENODEV;
+ goto error;
+ }
+
/*
* Query for the lease time on clientid setup or renewal
*
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7a74740..5d84642 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -259,6 +259,13 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *);
extern int nfs4_init_session(struct nfs_server *server);
extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
struct nfs_fsinfo *fsinfo);
+
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+ return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
+ EXCHGID4_FLAG_USE_PNFS_DS;
+}
#else /* CONFIG_NFS_v4_1 */
static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
{
@@ -276,6 +283,12 @@ static inline int nfs4_init_session(struct nfs_server *server)
{
return 0;
}
+
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+ return false;
+}
#endif /* CONFIG_NFS_V4_1 */
extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
--
1.7.2.3
From: Andy Adamson <[email protected]>
Clean up nfs_read_data by removing the CONFIG_NFS_V4 guard around the
timestamp field. We also won't use CONFIG_NFS_V4_1 for additional
NFSv4.1 fields in subsequent patches.
Signed-off-by: Andy Adamson <[email protected]>
---
include/linux/nfs_xdr.h | 2 --
1 files changed, 0 insertions(+), 2 deletions(-)
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b006857..51bfadb 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1016,9 +1016,7 @@ struct nfs_read_data {
unsigned int npages; /* Max length of pagevec */
struct nfs_readargs args;
struct nfs_readres res;
-#ifdef CONFIG_NFS_V4
unsigned long timestamp; /* For lease renewal */
-#endif
struct page *page_array[NFS_PAGEVEC_SIZE];
};
--
1.7.2.3
From: Andy Adamson <[email protected]>
Data servers cannot send nfs4_proc_get_lease_time, but still need to set up
state renewal. Add the NFS_CS_CHECK_LEASE_TIME bit to indicate if the lease
time can be checked.
Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/client.c | 9 +++++++++
fs/nfs/nfs4state.c | 5 +++++
include/linux/nfs_fs_sb.h | 1 +
3 files changed, 15 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a86698c..280d41f6 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1400,6 +1400,15 @@ static int nfs4_set_client(struct nfs_server *server,
goto error;
}
+ /*
+ * Query for the lease time on clientid setup or renewal
+ *
+ * Note that this will be set on nfs_clients that were created
+ * only for the DS role and did not set this bit, but now will
+ * serve a dual role.
+ */
+ set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
+
server->nfs_client = clp;
dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
return 0;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e6742b5..9e33e88 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
int status;
struct nfs_fsinfo fsinfo;
+ if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
+ nfs4_schedule_state_renewal(clp);
+ return 0;
+ }
+
status = nfs4_proc_get_lease_time(clp, &fsinfo);
if (status == 0) {
/* Update lease time and schedule renewal */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 2c2dc18..2669a9a 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -31,6 +31,7 @@ struct nfs_client {
#define NFS_CS_IDMAP 2 /* - idmap started */
#define NFS_CS_RENEWD 3 /* - renewd started */
#define NFS_CS_STOP_RENEW 4 /* no more state to renew */
+#define NFS_CS_CHECK_LEASE_TIME 5 /* need to check lease time */
struct sockaddr_storage cl_addr; /* server identifier */
size_t cl_addrlen;
char * cl_hostname; /* hostname of server */
--
1.7.2.3