2010-11-12 08:49:23

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 00/22] rewrite of CB_LAYOUTRECALL and layoutstate code, try 2

This is version 2 of code that rewrites the layout state handling and
the CB_LAYOUTRECALL paths, incorporating suggestions from Benny, as
well as some compile fixes for the non-v4.1 case, and reordering
the patches to put the simpler, less intrusive changes first.

I'm not entirely happy with the non-v4.1 compile fixes I added, but
they are a starting point.

It applies to Benny's current pnfs-submit branch (commit eb04948a).

patches 01-05 are straightforward tweaking preparing for the bigger changes

patches 06-08 modify CB_COMPOUND processing to pass the CB_SEQUENCE info (the session in particular) up to subsequent operations.

patches 09-15 are more substantial changes affecting refcounting and blocking.

patches 16-17 are the point of it all, the rewrite of the stateid and race handling for LAYOUTGET, LAYOUTRETURN, and CB_LAYOUTRECALL

patches 18-22 are Andy's code that puts LAYOUTCOMMIT and LAYOUTRETURN in the same compound as CLOSE



Fred



2010-11-15 15:02:02

by Fred Isaman

[permalink] [raw]
Subject: Re: [PATCH] SQUASHME: pnfs-submit: encode layoutreturn on close before close

On Sun, Nov 14, 2010 at 9:21 AM, Benny Halevy <[email protected]> wrote:
> And handle errors from layoutcommit and layoutreturn on the reply path.
>
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> ?fs/nfs/nfs4xdr.c | ? 35 ++++++++++++++++++-----------------
> ?fs/nfs/pnfs.c ? ?| ? ?1 +
> ?2 files changed, 19 insertions(+), 17 deletions(-)
>
> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> index 1804f35..0e6e5e4 100644
> --- a/fs/nfs/nfs4xdr.c
> +++ b/fs/nfs/nfs4xdr.c
> @@ -441,17 +441,17 @@ static int nfs4_stat_to_errno(int);
> ?#define NFS4_enc_close_sz ? ? ?(compound_encode_hdr_maxsz + \
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? encode_sequence_maxsz + \
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? encode_putfh_maxsz + \
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_close_maxsz + \
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_getattr_maxsz + \
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_layoutcommit_maxsz + \
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? encode_layoutreturn_maxsz + \
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_layoutcommit_maxsz)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_close_maxsz + \
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_getattr_maxsz)
> ?#define NFS4_dec_close_sz ? ? ?(compound_decode_hdr_maxsz + \
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? decode_sequence_maxsz + \
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? decode_putfh_maxsz + \
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?decode_close_maxsz + \
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?decode_getattr_maxsz + \
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?decode_layoutcommit_maxsz + \
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? decode_layoutreturn_maxsz + \
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?decode_layoutcommit_maxsz)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?decode_close_maxsz + \
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?decode_getattr_maxsz)
> ?#define NFS4_enc_setattr_sz ? ?(compound_encode_hdr_maxsz + \
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? encode_sequence_maxsz + \
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? encode_putfh_maxsz + \
> @@ -2160,10 +2160,10 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closea
> ? ? ? ?encode_putfh(&xdr, args->fh, &hdr);
> ? ? ? ?if (args->op_bitmask & NFS4_HAS_LAYOUTCOMMIT) /* layoutcommit set */
> ? ? ? ? ? ? ? ?encode_layoutcommit(&xdr, &args->lc_args, &hdr);
> - ? ? ? encode_close(&xdr, args, &hdr);
> - ? ? ? encode_getfattr(&xdr, args->bitmask, &hdr);
> ? ? ? ?if (args->op_bitmask & NFS4_HAS_LAYOUTRETURN) /* layoutreturn set */
> ? ? ? ? ? ? ? ?encode_layoutreturn(&xdr, &args->lr_args, &hdr);
> + ? ? ? encode_close(&xdr, args, &hdr);
> + ? ? ? encode_getfattr(&xdr, args->bitmask, &hdr);
> ? ? ? ?encode_nops(&hdr);
> ? ? ? ?return 0;
> ?}
> @@ -5743,9 +5743,16 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
> ? ? ? ?status = decode_putfh(&xdr);
> ? ? ? ?if (status)
> ? ? ? ? ? ? ? ?goto out;
> - ? ? ? /* We pay no attention to the layoutcommit return */
> - ? ? ? if (res->op_bitmask & NFS4_HAS_LAYOUTCOMMIT)
> - ? ? ? ? ? ? ? decode_layoutcommit(&xdr);
> + ? ? ? if (res->op_bitmask & NFS4_HAS_LAYOUTCOMMIT) {
> + ? ? ? ? ? ? ? status = decode_layoutcommit(&xdr);
> + ? ? ? ? ? ? ? if (status)
> + ? ? ? ? ? ? ? ? ? ? ? goto out;
> + ? ? ? }
> + ? ? ? if (res->op_bitmask & NFS4_HAS_LAYOUTRETURN) {
> + ? ? ? ? ? ? ? status = decode_layoutreturn(&xdr, &res->lr_res);
> + ? ? ? ? ? ? ? if (status)
> + ? ? ? ? ? ? ? ? ? ? ? goto out;

What prevents an infinite loop here?  With LAYOUTCOMMIT, the inode data
is cleared so that on retry it will not be called. I see no
comparable "pre-cleaning" done for LAYOUTRETURN.

Fred

> + ? ? ? }
> ? ? ? ?status = decode_close(&xdr, res);
> ? ? ? ?if (status != 0)
> ? ? ? ? ? ? ? ?goto out;
> @@ -5757,12 +5764,6 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
> ? ? ? ? */
> ? ? ? ?decode_getfattr(&xdr, res->fattr, res->server,
> ? ? ? ? ? ? ? ? ? ? ? ?!RPC_IS_ASYNC(rqstp->rq_task));
> - ? ? ? /*
> - ? ? ? ?* With the forgetful model, we pay no attention to the
> - ? ? ? ?* layoutreturn status.
> - ? ? ? ?*/
> - ? ? ? if (res->op_bitmask & NFS4_HAS_LAYOUTRETURN)
> - ? ? ? ? ? ? ? decode_layoutreturn(&xdr, &res->lr_res);
> ?out:
> ? ? ? ?return status;
> ?}
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 15673d0..90a868b 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -640,6 +640,7 @@ pnfs_roc(struct nfs4_closedata *data)
> ? ? ? ?LIST_HEAD(tmp_list);
> ? ? ? ?bool found = false;
>
> + ? ? ? data->arg.op_bitmask = data->res.op_bitmask = 0;
> ? ? ? ?spin_lock(&data->inode->i_lock);
> ? ? ? ?lo = NFS_I(data->inode)->layout;
> ? ? ? ?if (!lo || lo->roc_iomode == 0 ||
> --
> 1.7.2.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

2010-11-12 16:56:49

by Fred Isaman

[permalink] [raw]
Subject: Re: [PATCH 18/22] pnfs-submit: roc add layoutreturn op to close compound

On Fri, Nov 12, 2010 at 11:31 AM, Benny Halevy <[email protected]> wrote:
> On 2010-11-12 10:48, Fred Isaman wrote:
>> From: Andy Adamson <[email protected]>
>>
>> Signed-off-by: Andy Adamson <[email protected]>
>> ---
>> ?fs/nfs/nfs4proc.c ? ? ? | ? 73 +++++++++++++++++++++++++++++++++-------------
>> ?fs/nfs/nfs4state.c ? ? ?| ? 18 +-----------
>> ?fs/nfs/nfs4xdr.c ? ? ? ?| ? 14 ++++++++-
>> ?fs/nfs/pnfs.c ? ? ? ? ? | ? 64 +++++++++++++++++++++++++++++++++++++----
>> ?fs/nfs/pnfs.h ? ? ? ? ? | ? ?1 +
>> ?include/linux/nfs_xdr.h | ? 19 ++++++++++++
>> ?6 files changed, 143 insertions(+), 46 deletions(-)
>>
>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
>> index 6223c6a..2b47c59 100644
>> --- a/fs/nfs/nfs4proc.c
>> +++ b/fs/nfs/nfs4proc.c
>> @@ -74,6 +74,8 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
>> ?static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
>> ? ? ? ? ? ? ? ? ? ? ? ? ? struct nfs_fattr *fattr, struct iattr *sattr,
>> ? ? ? ? ? ? ? ? ? ? ? ? ? struct nfs4_state *state);
>> +static void nfs4_layoutreturn_set_stateid(struct inode *ino,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct nfs4_layoutreturn_res *res);
>>
>> ?/* Prevent leaks of NFSv4 errors into userland */
>> ?static int nfs4_map_errors(int err)
>> @@ -1821,16 +1823,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
>> ? ? ? return err;
>> ?}
>>
>> -struct nfs4_closedata {
>> - ? ? struct path path;
>> - ? ? struct inode *inode;
>> - ? ? struct nfs4_state *state;
>> - ? ? struct nfs_closeargs arg;
>> - ? ? struct nfs_closeres res;
>> - ? ? struct nfs_fattr fattr;
>> - ? ? unsigned long timestamp;
>> -};
>> -
>> ?static void nfs4_free_closedata(void *data)
>> ?{
>> ? ? ? struct nfs4_closedata *calldata = data;
>> @@ -1840,6 +1832,17 @@ static void nfs4_free_closedata(void *data)
>> ? ? ? nfs_free_seqid(calldata->arg.seqid);
>> ? ? ? nfs4_put_state_owner(sp);
>> ? ? ? path_put(&calldata->path);
>> + ? ? if (calldata->res.op_bitmask & NFS4_HAS_LAYOUTRETURN) {
>> + ? ? ? ? ? ? struct pnfs_layout_hdr *lo = NFS_I(calldata->inode)->layout;
>> +
>> + ? ? ? ? ? ? spin_lock(&lo->inode->i_lock);
>> + ? ? ? ? ? ? lo->plh_block_lgets--;
>> + ? ? ? ? ? ? lo->plh_outstanding--;
>> + ? ? ? ? ? ? if (!pnfs_layoutgets_blocked(lo, NULL))
>> + ? ? ? ? ? ? ? ? ? ? rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
>> + ? ? ? ? ? ? spin_unlock(&lo->inode->i_lock);
>> + ? ? ? ? ? ? put_layout_hdr(lo->inode);
>> + ? ? }
>> ? ? ? kfree(calldata);
>> ?}
>>
>> @@ -1869,6 +1872,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
>> ? ? ? switch (task->tk_status) {
>> ? ? ? ? ? ? ? case 0:
>> ? ? ? ? ? ? ? ? ? ? ? nfs_set_open_stateid(state, &calldata->res.stateid, 0);
>> + ? ? ? ? ? ? ? ? ? ? if (calldata->res.op_bitmask & NFS4_HAS_LAYOUTRETURN)
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? nfs4_layoutreturn_set_stateid(calldata->inode,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? &calldata->res.lr_res);
>> ? ? ? ? ? ? ? ? ? ? ? renew_lease(server, calldata->timestamp);
>> ? ? ? ? ? ? ? ? ? ? ? nfs4_close_clear_stateid_flags(state,
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? calldata->arg.fmode);
>> @@ -1920,8 +1926,27 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
>> ? ? ? ? ? ? ? return;
>> ? ? ? }
>>
>> - ? ? if (calldata->arg.fmode == 0)
>> + ? ? if (calldata->arg.fmode == 0) {
>> ? ? ? ? ? ? ? task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
>> + ? ? ? ? ? ? /* Are there layout segments to return on close? */
>> + ? ? ? ? ? ? if (pnfs_roc(calldata)) {
>> + ? ? ? ? ? ? ? ? ? ? struct nfs_inode *nfsi = NFS_I(calldata->inode);
>> + ? ? ? ? ? ? ? ? ? ? if (pnfs_return_layout_barrier(nfsi,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? &calldata->arg.lr_args.range)) {
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? dprintk("%s: waiting on barrier\n", __func__);
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? /* FIXME race with wake here */
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? spin_lock(&calldata->inode->i_lock);
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? nfsi->layout->plh_block_lgets--;
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? nfsi->layout->plh_outstanding--;
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? if (!pnfs_layoutgets_blocked(nfsi->layout, NULL))
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? rpc_wake_up(&nfsi->lo_rpcwaitq_stateid);
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? spin_unlock(&calldata->inode->i_lock);
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? put_layout_hdr(calldata->inode);
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? return;
>> + ? ? ? ? ? ? ? ? ? ? }
>> + ? ? ? ? ? ? }
>> + ? ? }
>>
>> ? ? ? nfs_fattr_init(calldata->res.fattr);
>> ? ? ? calldata->timestamp = jiffies;
>> @@ -5587,6 +5612,7 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
>>
>> ? ? ? ? ? ? ? if (pnfs_return_layout_barrier(nfsi, &lrp->args.range)) {
>> ? ? ? ? ? ? ? ? ? ? ? dprintk("%s: waiting on barrier\n", __func__);
>> + ? ? ? ? ? ? ? ? ? ? /* FIXME race with wake here */
>> ? ? ? ? ? ? ? ? ? ? ? rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
>> ? ? ? ? ? ? ? ? ? ? ? return;
>> ? ? ? ? ? ? ? }
>> @@ -5602,6 +5628,19 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
>> ? ? ? rpc_call_start(task);
>> ?}
>>
>> +static void nfs4_layoutreturn_set_stateid(struct inode *ino,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct nfs4_layoutreturn_res *res)
>> +{
>> + ? ? struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
>> +
>> + ? ? spin_lock(&ino->i_lock);
>> + ? ? if (res->lrs_present)
>> + ? ? ? ? ? ? pnfs_set_layout_stateid(lo, &res->stateid, true);
>> + ? ? else
>> + ? ? ? ? ? ? BUG_ON(!list_empty(&lo->segs));
>> + ? ? spin_unlock(&ino->i_lock);
>> +}
>> +
>> ?static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
>> ?{
>> ? ? ? struct nfs4_layoutreturn *lrp = calldata;
>> @@ -5620,16 +5659,8 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
>> ? ? ? ? ? ? ? nfs_restart_rpc(task, lrp->clp);
>> ? ? ? ? ? ? ? return;
>> ? ? ? }
>> - ? ? if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE)) {
>> - ? ? ? ? ? ? struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
>> -
>> - ? ? ? ? ? ? spin_lock(&lo->inode->i_lock);
>> - ? ? ? ? ? ? if (lrp->res.lrs_present)
>> - ? ? ? ? ? ? ? ? ? ? pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
>> - ? ? ? ? ? ? else
>> - ? ? ? ? ? ? ? ? ? ? BUG_ON(!list_empty(&lo->segs));
>> - ? ? ? ? ? ? spin_unlock(&lo->inode->i_lock);
>> - ? ? }
>> + ? ? if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE))
>> + ? ? ? ? ? ? nfs4_layoutreturn_set_stateid(lrp->args.inode, &lrp->res);
>> ? ? ? dprintk("<-- %s\n", __func__);
>> ?}
>>
>> diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
>> index ceb0d66..784f122 100644
>> --- a/fs/nfs/nfs4state.c
>> +++ b/fs/nfs/nfs4state.c
>> @@ -601,24 +601,8 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
>> ? ? ? if (!call_close) {
>> ? ? ? ? ? ? ? nfs4_put_open_state(state);
>> ? ? ? ? ? ? ? nfs4_put_state_owner(owner);
>> - ? ? } else {
>> - ? ? ? ? ? ? u32 roc_iomode;
>> - ? ? ? ? ? ? struct nfs_inode *nfsi = NFS_I(state->inode);
>> -
>> - ? ? ? ? ? ? if (has_layout(nfsi) &&
>> - ? ? ? ? ? ? ? ? (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) {
>> - ? ? ? ? ? ? ? ? ? ? struct pnfs_layout_range range = {
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? .iomode = roc_iomode,
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? .offset = 0,
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? .length = NFS4_MAX_UINT64,
>> - ? ? ? ? ? ? ? ? ? ? };
>> -
>> - ? ? ? ? ? ? ? ? ? ? pnfs_return_layout(state->inode, &range, NULL,
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?RETURN_FILE, wait);
>> - ? ? ? ? ? ? }
>> -
>> + ? ? } else
>> ? ? ? ? ? ? ? nfs4_do_close(path, state, gfp_mask, wait);
>> - ? ? }
>> ?}
>>
>> ?void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>> index f530c7e..adb4c47 100644
>> --- a/fs/nfs/nfs4xdr.c
>> +++ b/fs/nfs/nfs4xdr.c
>> @@ -438,12 +438,14 @@ static int nfs4_stat_to_errno(int);
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_sequence_maxsz + \
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_putfh_maxsz + \
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_close_maxsz + \
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_getattr_maxsz)
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_getattr_maxsz + \
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_layoutreturn_maxsz)
>> ?#define NFS4_dec_close_sz ? ?(compound_decode_hdr_maxsz + \
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?decode_sequence_maxsz + \
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?decode_putfh_maxsz + \
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?decode_close_maxsz + \
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?decode_getattr_maxsz)
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?decode_getattr_maxsz + \
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?decode_layoutreturn_maxsz)
>> ?#define NFS4_enc_setattr_sz ?(compound_encode_hdr_maxsz + \
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_sequence_maxsz + \
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?encode_putfh_maxsz + \
>> @@ -2143,6 +2145,8 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closea
>> ? ? ? encode_putfh(&xdr, args->fh, &hdr);
>> ? ? ? encode_close(&xdr, args, &hdr);
>> ? ? ? encode_getfattr(&xdr, args->bitmask, &hdr);
>> + ? ? if (args->op_bitmask & NFS4_HAS_LAYOUTRETURN) /* layoutreturn set */
>> + ? ? ? ? ? ? encode_layoutreturn(&xdr, &args->lr_args, &hdr);
>
> Sorry, I just noticed, but if there's no objection I'll move the layoutreturn op
> before close in the compound.
>
> Benny
>

The reason the LAYOUTRETURN was last was so that we could ignore any
error on the return. Otherwise an error on the LAYOUTRETURN stops the
CLOSE from being processed. I'll defer to Andy, but while I see why
you would want the return first, moving it will require paying careful
attention to how an error is dealt with. (Actually, we have the same
issue with the LAYOUTCOMMIT when it is later added.)

Fred

>> ? ? ? encode_nops(&hdr);
>> ? ? ? return 0;
>> ?}
>> @@ -5719,6 +5723,12 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
>> ? ? ? ?*/
>> ? ? ? decode_getfattr(&xdr, res->fattr, res->server,
>> ? ? ? ? ? ? ? ? ? ? ? !RPC_IS_ASYNC(rqstp->rq_task));
>> + ? ? /*
>> + ? ? ?* With the forgetful model, we pay no attention to the
>> + ? ? ?* layoutreturn status.
>> + ? ? ?*/
>> + ? ? if (res->op_bitmask & NFS4_HAS_LAYOUTRETURN)
>> + ? ? ? ? ? ? decode_layoutreturn(&xdr, &res->lr_res);
>> ?out:
>> ? ? ? return status;
>> ?}
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 22abf83..76cfb11 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -623,6 +623,63 @@ pnfs_return_layout_barrier(struct nfs_inode *nfsi,
>> ? ? ? return ret;
>> ?}
>>
>> +/*
>> + * Return on close
>> + *
>> + * No LAYOUTRETURNS can be sent when BULK RECALL flag is set.
>> + * FIXME: add layoutcommit operation if layoutcommit_needed is true.
>> + */
>> +bool
>> +pnfs_roc(struct nfs4_closedata *data)
>> +{
>> + ? ? struct nfs4_layoutreturn_args *lr_args = &data->arg.lr_args;
>> + ? ? struct pnfs_layout_hdr *lo;
>> + ? ? struct pnfs_layout_segment *lseg, *tmp;
>> + ? ? struct pnfs_layout_range range = {
>> + ? ? ? ? ? ? .length = NFS4_MAX_UINT64,
>> + ? ? };
>> + ? ? LIST_HEAD(tmp_list);
>> + ? ? bool found = false;
>> +
>> + ? ? spin_lock(&data->inode->i_lock);
>> + ? ? lo = NFS_I(data->inode)->layout;
>> + ? ? if (!lo || lo->roc_iomode == 0 ||
>> + ? ? ? ? test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
>> + ? ? ? ? ? ? goto out_nolayout;
>> +
>> + ? ? range.iomode = lo->roc_iomode;
>> + ? ? list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
>> + ? ? ? ? ? ? if (should_free_lseg(&lseg->range, &range)) {
>> + ? ? ? ? ? ? ? ? ? ? mark_lseg_invalid(lseg, &tmp_list);
>> + ? ? ? ? ? ? ? ? ? ? found = true;
>> + ? ? ? ? ? ? }
>> + ? ? if (found == false)
>> + ? ? ? ? ? ? goto out_nolayout;
>> + ? ? /* Stop new and drop response to outstanding LAYOUTGETS */
>> + ? ? lo->plh_block_lgets++;
>> + ? ? lo->plh_outstanding++;
>> + ? ? /* Reference matched in pnfs_layoutreturn_release */
>> + ? ? get_layout_hdr(lo);
>> +
>> + ? ? spin_unlock(&data->inode->i_lock);
>> +
>> + ? ? pnfs_free_lseg_list(&tmp_list);
>> +
>> + ? ? lr_args->reclaim = 0;
>> + ? ? lr_args->layout_type = NFS_SERVER(data->inode)->pnfs_curr_ld->id;
>> + ? ? lr_args->return_type = RETURN_FILE;
>> + ? ? lr_args->range = range;
>> + ? ? lr_args->inode = data->inode;
>> + ? ? data->res.op_bitmask |= NFS4_HAS_LAYOUTRETURN;
>> + ? ? data->arg.op_bitmask |= NFS4_HAS_LAYOUTRETURN;
>> +
>> + ? ? return true;
>> +
>> +out_nolayout:
>> + ? ? spin_unlock(&data->inode->i_lock);
>> + ? ? return false;
>> +}
>> +
>> ?static int
>> ?return_layout(struct inode *ino, struct pnfs_layout_range *range,
>> ? ? ? ? ? ? enum pnfs_layoutreturn_type type, struct pnfs_layout_hdr *lo,
>> @@ -997,13 +1054,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
>> ? ? ? *lgp->lsegpp = lseg;
>> ? ? ? pnfs_insert_layout(lo, lseg);
>>
>> - ? ? if (res->return_on_close) {
>> - ? ? ? ? ? ? /* FI: This needs to be re-examined. ?At lo level,
>> - ? ? ? ? ? ? ?* all it needs is a bit indicating whether any of
>> - ? ? ? ? ? ? ?* the lsegs in the list have the flags set.
>> - ? ? ? ? ? ? ?*/
>> + ? ? if (res->return_on_close)
>> ? ? ? ? ? ? ? lo->roc_iomode |= res->range.iomode;
>> - ? ? }
>>
>> ? ? ? /* Done processing layoutget. Set the layout stateid */
>> ? ? ? pnfs_set_layout_stateid(lo, &res->stateid, false);
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index 7fd1f5d..916a057 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -234,6 +234,7 @@ void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct pnfs_layout_range *range,
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? int notify_bit, atomic_t *notify_count,
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct list_head *tmp_list);
>> +bool pnfs_roc(struct nfs4_closedata *data);
>>
>> ?static inline bool
>> ?has_layout(struct nfs_inode *nfsi)
>> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
>> index f472405..6c4ba71 100644
>> --- a/include/linux/nfs_xdr.h
>> +++ b/include/linux/nfs_xdr.h
>> @@ -351,12 +351,18 @@ struct nfs_open_confirmres {
>> ?/*
>> ? * Arguments to the close call.
>> ? */
>> +
>> +/* op_bitmask bits */
>> +#define NFS4_HAS_LAYOUTRETURN ?0x01
>> +
>> ?struct nfs_closeargs {
>> ? ? ? struct nfs_fh * ? ? ? ? fh;
>> ? ? ? nfs4_stateid * ? ? ? ? ?stateid;
>> ? ? ? struct nfs_seqid * ? ? ?seqid;
>> ? ? ? fmode_t ? ? ? ? ? ? ? ? fmode;
>> ? ? ? const u32 * ? ? ? ? ? ? bitmask;
>> + ? ? u32 ? ? ? ? ? ? ? ? ? ? op_bitmask; /* which optional ops to encode */
>> + ? ? struct nfs4_layoutreturn_args ? lr_args; /* optional */
>> ? ? ? struct nfs4_sequence_args ? ? ? seq_args;
>> ?};
>>
>> @@ -365,8 +371,21 @@ struct nfs_closeres {
>> ? ? ? struct nfs_fattr * ? ? ?fattr;
>> ? ? ? struct nfs_seqid * ? ? ?seqid;
>> ? ? ? const struct nfs_server *server;
>> + ? ? u32 ? ? ? ? ? ? ? ? ? ? op_bitmask; /* which optional ops encoded */
>> + ? ? struct nfs4_layoutreturn_res ? ?lr_res; /* optional */
>> ? ? ? struct nfs4_sequence_res ? ? ? ?seq_res;
>> ?};
>> +
>> +struct nfs4_closedata {
>> + ? ? struct path path;
>> + ? ? struct inode *inode;
>> + ? ? struct nfs4_state *state;
>> + ? ? struct nfs_closeargs arg;
>> + ? ? struct nfs_closeres res;
>> + ? ? struct nfs_fattr fattr;
>> + ? ? unsigned long timestamp;
>> +};
>> +
>> ?/*
>> ? * ?* Arguments to the lock,lockt, and locku call.
>> ? * ? */
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

2010-11-15 12:54:12

by Benny Halevy

[permalink] [raw]
Subject: [PATCH 2/2 v2] pnfs-submit: handle NFS4ERR_DELEG_REVOKED for LAYOUTRETURN

When getting NFS4ERR_DELEG_REVOKED just clear the lseg list.
If this was part of a singular LAYOUTRETURN call, simulate success
(for now). If this was part of return on close, retry the close
compound without sending LAYOUTCOMMIT or LAYOUTRETURN.

[use helper for pnfs_mark_layout_revoked to support !CONFIG_NFS_V4_1]
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4proc.c | 39 ++++++++++++++++++++++++++++++++++++---
fs/nfs/pnfs.c | 6 ++++--
fs/nfs/pnfs.h | 14 ++++++++++++++
3 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 76b3c7d..55505e4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1880,6 +1880,16 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
nfs4_close_clear_stateid_flags(state,
calldata->arg.fmode);
break;
+ case -NFS4ERR_DELEG_REVOKED:
+ if (calldata->res.op_bitmask & (NFS4_HAS_LAYOUTCOMMIT |
+ NFS4_HAS_LAYOUTRETURN)) {
+ pnfs_mark_layout_revoked(calldata->inode);
+ /* Retry without layout operations as
+ * pnfs_roc will find roc_iomode==0 next time around
+ */
+ rpc_restart_call_prepare(task);
+ break;
+ }
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_BAD_STATEID:
@@ -5639,6 +5649,7 @@ void nfs4_layoutreturn_set_stateid(struct inode *ino,
static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutreturn *lrp = calldata;
+ struct inode *ino = lrp->args.inode;
struct nfs_server *server;

dprintk("--> %s\n", __func__);
@@ -5647,28 +5658,50 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
return;

if (lrp->args.return_type == RETURN_FILE)
- server = NFS_SERVER(lrp->args.inode);
+ server = NFS_SERVER(ino);
else
server = NULL;
if (nfs4_async_handle_error(task, server, NULL, lrp->clp) == -EAGAIN) {
nfs_restart_rpc(task, lrp->clp);
return;
}
- if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE))
- nfs4_layoutreturn_set_stateid(lrp->args.inode, &lrp->res);
+ switch (task->tk_status) {
+ case -NFS4ERR_DELEG_REVOKED:
+ task->tk_status = 0; /* TODO: revalidate remaining layouts? */
+ if (lrp->args.return_type == RETURN_FILE)
+ pnfs_mark_layout_revoked(ino);
+ break;
+ case 0:
+ if (lrp->args.return_type == RETURN_FILE)
+ nfs4_layoutreturn_set_stateid(lrp->args.inode, &lrp->res);
+ }
dprintk("<-- %s\n", __func__);
}

void nfs4_layoutreturn_file_release(struct inode *ino)
{
struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
+ LIST_HEAD(tmp_list);

spin_lock(&ino->i_lock);
+ if (test_bit(NFS_LAYOUT_REVOKED, &lo->plh_flags)) {
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ /* layout driver's free_lseg may block, hence we don't
+ * call pnfs_free_lseg_list under the spin_lock */
+ pnfs_clear_lseg_list(lo, &tmp_list, &range);
+ clear_bit(NFS_LAYOUT_REVOKED, &lo->plh_flags);
+ }
lo->plh_block_lgets--;
lo->plh_outstanding--;
if (!pnfs_layoutgets_blocked(lo, NULL))
rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&tmp_list);
put_layout_hdr(ino);
}

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 90a868b..0b7fc1d 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -375,7 +375,7 @@ static void mark_lseg_invalid(struct pnfs_layout_segment *lseg,
}
}

-static void
+void
pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
struct pnfs_layout_range *range)
{
@@ -644,10 +644,12 @@ pnfs_roc(struct nfs4_closedata *data)
spin_lock(&data->inode->i_lock);
lo = NFS_I(data->inode)->layout;
if (!lo || lo->roc_iomode == 0 ||
- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+ test_bit(NFS_LAYOUT_REVOKED, &lo->plh_flags))
goto out_nolayout;

range.iomode = lo->roc_iomode;
+ lo->roc_iomode = 0;
list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
if (should_free_lseg(&lseg->range, &range)) {
mark_lseg_invalid(lseg, &tmp_list);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e553311..c5b4282 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -56,6 +56,7 @@ enum {
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
NFS_LAYOUT_NEED_LCOMMIT, /* LAYOUTCOMMIT needed */
+ NFS_LAYOUT_REVOKED, /* layout revoked by the server */
};

/* Per-layout driver specific registration structure */
@@ -223,6 +224,8 @@ void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_clear_lseg_list(struct pnfs_layout_hdr *, struct list_head *tmp_list,
+ struct pnfs_layout_range *);
void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
void pnfs_destroy_all_layouts(struct nfs_client *);
@@ -297,6 +300,12 @@ layoutcommit_needed(struct nfs_inode *nfsi)
test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags);
}

+static inline void
+pnfs_mark_layout_revoked(struct inode *ino)
+{
+ set_bit(NFS_LAYOUT_REVOKED, &NFS_I(ino)->layout->plh_flags);
+}
+
#else /* CONFIG_NFS_V4_1 */

static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -423,6 +432,11 @@ static inline void nfs4_layoutreturn_set_stateid(struct inode *ino,
{
}

+static inline void
+pnfs_mark_layout_revoked(struct inode *ino)
+{
+}
+
#endif /* CONFIG_NFS_V4_1 */

#endif /* FS_NFS_PNFS_H */
--
1.7.2.3


2010-11-14 15:43:37

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

On 2010-11-12 10:48, Fred Isaman wrote:
> +int
> +pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
> + struct nfs4_state *open_state)
> {
> + int status = 0;
> +
> dprintk("--> %s\n", __func__);
> spin_lock(&lo->inode->i_lock);
> - if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags)) {
> + if (lo->plh_block_lgets ||
> + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
> + /* We avoid -EAGAIN, as that has special meaning to
> + * some callers.
> + */
> + status = -NFS4ERR_LAYOUTTRYLATER;
> + } else if (list_empty(&lo->segs)) {
> int seq;
>
> do {
> @@ -494,12 +514,11 @@ pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
> memcpy(dst->data, open_state->stateid.data,
> sizeof(open_state->stateid.data));
> } while (read_seqretry(&open_state->seqlock, seq));

Using the open stateid after forgetting the layout could be a protocol bug,
or at least it falls into undefined territory.

The RFC says:

The loga_stateid field specifies a valid stateid. If a layout is not
currently held by the client, the loga_stateid field represents a
stateid reflecting the correspondingly valid open, byte-range lock,
or delegation stateid. Once a layout is held on the file by the
client, the loga_stateid field MUST be a stateid as returned from a
previous LAYOUTGET or LAYOUTRETURN operation or provided by a
CB_LAYOUTRECALL operation (see Section 12.5.3).

So the question is whether the text above refers to the client's view of the state or to
the server's view.
In other words, with the forgetful client model, when the client unilaterally forgets
the layout without letting the server know about it (no LAYOUTRETURN was sent),
does that mean "a layout is not currently held by the client"?

The server will see a LAYOUTGET with an open/lock/deleg stateid in this case
while it still thinks that the client is holding a layout.
Since this could normally happen if the client sends multiple LAYOUTGETs in
parallel before it has received any layout stateid, the server should allow it
within the VALID_SEQID_RANGE constraints (see 12.5.5.2.1.4, although it is
not explicitly called out there); otherwise, it seems like the server is supposed
to return NFS4ERR_OLD_STATEID.

Strictly reading the spec, the client should use the most recent layout stateid
even in the forgetful model, until it gets a LAYOUTRETURN reply with lrs_present==false
or until it replies NFS4ERR_NOMATCHING_LAYOUT to CB_LAYOUTRECALL with
clora_iomode==LAYOUTIOMODE4_ANY or other values where the client never dropped
a layout (did I say recently how much I hate the forgetful model which introduces
more corner cases rather than simplifying the protocol as it was supposed to do? ;-)

Benny

2010-11-12 16:31:37

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 18/22] pnfs-submit: roc add layoutreturn op to close compound

On 2010-11-12 10:48, Fred Isaman wrote:
> From: Andy Adamson <[email protected]>
>
> Signed-off-by: Andy Adamson <[email protected]>
> ---
> fs/nfs/nfs4proc.c | 73 +++++++++++++++++++++++++++++++++-------------
> fs/nfs/nfs4state.c | 18 +-----------
> fs/nfs/nfs4xdr.c | 14 ++++++++-
> fs/nfs/pnfs.c | 64 +++++++++++++++++++++++++++++++++++++----
> fs/nfs/pnfs.h | 1 +
> include/linux/nfs_xdr.h | 19 ++++++++++++
> 6 files changed, 143 insertions(+), 46 deletions(-)
>
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 6223c6a..2b47c59 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -74,6 +74,8 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
> static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
> struct nfs_fattr *fattr, struct iattr *sattr,
> struct nfs4_state *state);
> +static void nfs4_layoutreturn_set_stateid(struct inode *ino,
> + struct nfs4_layoutreturn_res *res);
>
> /* Prevent leaks of NFSv4 errors into userland */
> static int nfs4_map_errors(int err)
> @@ -1821,16 +1823,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
> return err;
> }
>
> -struct nfs4_closedata {
> - struct path path;
> - struct inode *inode;
> - struct nfs4_state *state;
> - struct nfs_closeargs arg;
> - struct nfs_closeres res;
> - struct nfs_fattr fattr;
> - unsigned long timestamp;
> -};
> -
> static void nfs4_free_closedata(void *data)
> {
> struct nfs4_closedata *calldata = data;
> @@ -1840,6 +1832,17 @@ static void nfs4_free_closedata(void *data)
> nfs_free_seqid(calldata->arg.seqid);
> nfs4_put_state_owner(sp);
> path_put(&calldata->path);
> + if (calldata->res.op_bitmask & NFS4_HAS_LAYOUTRETURN) {
> + struct pnfs_layout_hdr *lo = NFS_I(calldata->inode)->layout;
> +
> + spin_lock(&lo->inode->i_lock);
> + lo->plh_block_lgets--;
> + lo->plh_outstanding--;
> + if (!pnfs_layoutgets_blocked(lo, NULL))
> + rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
> + spin_unlock(&lo->inode->i_lock);
> + put_layout_hdr(lo->inode);
> + }
> kfree(calldata);
> }
>
> @@ -1869,6 +1872,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
> switch (task->tk_status) {
> case 0:
> nfs_set_open_stateid(state, &calldata->res.stateid, 0);
> + if (calldata->res.op_bitmask & NFS4_HAS_LAYOUTRETURN)
> + nfs4_layoutreturn_set_stateid(calldata->inode,
> + &calldata->res.lr_res);
> renew_lease(server, calldata->timestamp);
> nfs4_close_clear_stateid_flags(state,
> calldata->arg.fmode);
> @@ -1920,8 +1926,27 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
> return;
> }
>
> - if (calldata->arg.fmode == 0)
> + if (calldata->arg.fmode == 0) {
> task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
> + /* Are there layout segments to return on close? */
> + if (pnfs_roc(calldata)) {
> + struct nfs_inode *nfsi = NFS_I(calldata->inode);
> + if (pnfs_return_layout_barrier(nfsi,
> + &calldata->arg.lr_args.range)) {
> + dprintk("%s: waiting on barrier\n", __func__);
> + /* FIXME race with wake here */
> + rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
> + spin_lock(&calldata->inode->i_lock);
> + nfsi->layout->plh_block_lgets--;
> + nfsi->layout->plh_outstanding--;
> + if (!pnfs_layoutgets_blocked(nfsi->layout, NULL))
> + rpc_wake_up(&nfsi->lo_rpcwaitq_stateid);
> + spin_unlock(&calldata->inode->i_lock);
> + put_layout_hdr(calldata->inode);
> + return;
> + }
> + }
> + }
>
> nfs_fattr_init(calldata->res.fattr);
> calldata->timestamp = jiffies;
> @@ -5587,6 +5612,7 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
>
> if (pnfs_return_layout_barrier(nfsi, &lrp->args.range)) {
> dprintk("%s: waiting on barrier\n", __func__);
> + /* FIXME race with wake here */
> rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
> return;
> }
> @@ -5602,6 +5628,19 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
> rpc_call_start(task);
> }
>
> +static void nfs4_layoutreturn_set_stateid(struct inode *ino,
> + struct nfs4_layoutreturn_res *res)
> +{
> + struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
> +
> + spin_lock(&ino->i_lock);
> + if (res->lrs_present)
> + pnfs_set_layout_stateid(lo, &res->stateid, true);
> + else
> + BUG_ON(!list_empty(&lo->segs));
> + spin_unlock(&ino->i_lock);
> +}
> +
> static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
> {
> struct nfs4_layoutreturn *lrp = calldata;
> @@ -5620,16 +5659,8 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
> nfs_restart_rpc(task, lrp->clp);
> return;
> }
> - if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE)) {
> - struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
> -
> - spin_lock(&lo->inode->i_lock);
> - if (lrp->res.lrs_present)
> - pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
> - else
> - BUG_ON(!list_empty(&lo->segs));
> - spin_unlock(&lo->inode->i_lock);
> - }
> + if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE))
> + nfs4_layoutreturn_set_stateid(lrp->args.inode, &lrp->res);
> dprintk("<-- %s\n", __func__);
> }
>
> diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> index ceb0d66..784f122 100644
> --- a/fs/nfs/nfs4state.c
> +++ b/fs/nfs/nfs4state.c
> @@ -601,24 +601,8 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
> if (!call_close) {
> nfs4_put_open_state(state);
> nfs4_put_state_owner(owner);
> - } else {
> - u32 roc_iomode;
> - struct nfs_inode *nfsi = NFS_I(state->inode);
> -
> - if (has_layout(nfsi) &&
> - (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) {
> - struct pnfs_layout_range range = {
> - .iomode = roc_iomode,
> - .offset = 0,
> - .length = NFS4_MAX_UINT64,
> - };
> -
> - pnfs_return_layout(state->inode, &range, NULL,
> - RETURN_FILE, wait);
> - }
> -
> + } else
> nfs4_do_close(path, state, gfp_mask, wait);
> - }
> }
>
> void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> index f530c7e..adb4c47 100644
> --- a/fs/nfs/nfs4xdr.c
> +++ b/fs/nfs/nfs4xdr.c
> @@ -438,12 +438,14 @@ static int nfs4_stat_to_errno(int);
> encode_sequence_maxsz + \
> encode_putfh_maxsz + \
> encode_close_maxsz + \
> - encode_getattr_maxsz)
> + encode_getattr_maxsz + \
> + encode_layoutreturn_maxsz)
> #define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \
> decode_sequence_maxsz + \
> decode_putfh_maxsz + \
> decode_close_maxsz + \
> - decode_getattr_maxsz)
> + decode_getattr_maxsz + \
> + decode_layoutreturn_maxsz)
> #define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
> encode_sequence_maxsz + \
> encode_putfh_maxsz + \
> @@ -2143,6 +2145,8 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closea
> encode_putfh(&xdr, args->fh, &hdr);
> encode_close(&xdr, args, &hdr);
> encode_getfattr(&xdr, args->bitmask, &hdr);
> + if (args->op_bitmask & NFS4_HAS_LAYOUTRETURN) /* layoutreturn set */
> + encode_layoutreturn(&xdr, &args->lr_args, &hdr);

Sorry, I just noticed, but if there's no objection I'll move the layoutreturn op
before close in the compound.

Benny

> encode_nops(&hdr);
> return 0;
> }
> @@ -5719,6 +5723,12 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
> */
> decode_getfattr(&xdr, res->fattr, res->server,
> !RPC_IS_ASYNC(rqstp->rq_task));
> + /*
> + * With the forgetful model, we pay no attention to the
> + * layoutreturn status.
> + */
> + if (res->op_bitmask & NFS4_HAS_LAYOUTRETURN)
> + decode_layoutreturn(&xdr, &res->lr_res);
> out:
> return status;
> }
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 22abf83..76cfb11 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -623,6 +623,63 @@ pnfs_return_layout_barrier(struct nfs_inode *nfsi,
> return ret;
> }
>
> +/*
> + * Return on close
> + *
> + * No LAYOUTRETURNS can be sent when BULK RECALL flag is set.
> + * FIXME: add layoutcommit operation if layoutcommit_needed is true.
> + */
> +bool
> +pnfs_roc(struct nfs4_closedata *data)
> +{
> + struct nfs4_layoutreturn_args *lr_args = &data->arg.lr_args;
> + struct pnfs_layout_hdr *lo;
> + struct pnfs_layout_segment *lseg, *tmp;
> + struct pnfs_layout_range range = {
> + .length = NFS4_MAX_UINT64,
> + };
> + LIST_HEAD(tmp_list);
> + bool found = false;
> +
> + spin_lock(&data->inode->i_lock);
> + lo = NFS_I(data->inode)->layout;
> + if (!lo || lo->roc_iomode == 0 ||
> + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
> + goto out_nolayout;
> +
> + range.iomode = lo->roc_iomode;
> + list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
> + if (should_free_lseg(&lseg->range, &range)) {
> + mark_lseg_invalid(lseg, &tmp_list);
> + found = true;
> + }
> + if (found == false)
> + goto out_nolayout;
> + /* Stop new and drop response to outstanding LAYOUTGETS */
> + lo->plh_block_lgets++;
> + lo->plh_outstanding++;
> + /* Reference matched in pnfs_layoutreturn_release */
> + get_layout_hdr(lo);
> +
> + spin_unlock(&data->inode->i_lock);
> +
> + pnfs_free_lseg_list(&tmp_list);
> +
> + lr_args->reclaim = 0;
> + lr_args->layout_type = NFS_SERVER(data->inode)->pnfs_curr_ld->id;
> + lr_args->return_type = RETURN_FILE;
> + lr_args->range = range;
> + lr_args->inode = data->inode;
> + data->res.op_bitmask |= NFS4_HAS_LAYOUTRETURN;
> + data->arg.op_bitmask |= NFS4_HAS_LAYOUTRETURN;
> +
> + return true;
> +
> +out_nolayout:
> + spin_unlock(&data->inode->i_lock);
> + return false;
> +}
> +
> static int
> return_layout(struct inode *ino, struct pnfs_layout_range *range,
> enum pnfs_layoutreturn_type type, struct pnfs_layout_hdr *lo,
> @@ -997,13 +1054,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
> *lgp->lsegpp = lseg;
> pnfs_insert_layout(lo, lseg);
>
> - if (res->return_on_close) {
> - /* FI: This needs to be re-examined. At lo level,
> - * all it needs is a bit indicating whether any of
> - * the lsegs in the list have the flags set.
> - */
> + if (res->return_on_close)
> lo->roc_iomode |= res->range.iomode;
> - }
>
> /* Done processing layoutget. Set the layout stateid */
> pnfs_set_layout_stateid(lo, &res->stateid, false);
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 7fd1f5d..916a057 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -234,6 +234,7 @@ void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
> struct pnfs_layout_range *range,
> int notify_bit, atomic_t *notify_count,
> struct list_head *tmp_list);
> +bool pnfs_roc(struct nfs4_closedata *data);
>
> static inline bool
> has_layout(struct nfs_inode *nfsi)
> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> index f472405..6c4ba71 100644
> --- a/include/linux/nfs_xdr.h
> +++ b/include/linux/nfs_xdr.h
> @@ -351,12 +351,18 @@ struct nfs_open_confirmres {
> /*
> * Arguments to the close call.
> */
> +
> +/* op_bitmask bits */
> +#define NFS4_HAS_LAYOUTRETURN 0x01
> +
> struct nfs_closeargs {
> struct nfs_fh * fh;
> nfs4_stateid * stateid;
> struct nfs_seqid * seqid;
> fmode_t fmode;
> const u32 * bitmask;
> + u32 op_bitmask; /* which optional ops to encode */
> + struct nfs4_layoutreturn_args lr_args; /* optional */
> struct nfs4_sequence_args seq_args;
> };
>
> @@ -365,8 +371,21 @@ struct nfs_closeres {
> struct nfs_fattr * fattr;
> struct nfs_seqid * seqid;
> const struct nfs_server *server;
> + u32 op_bitmask; /* which optional ops encoded */
> + struct nfs4_layoutreturn_res lr_res; /* optional */
> struct nfs4_sequence_res seq_res;
> };
> +
> +struct nfs4_closedata {
> + struct path path;
> + struct inode *inode;
> + struct nfs4_state *state;
> + struct nfs_closeargs arg;
> + struct nfs_closeres res;
> + struct nfs_fattr fattr;
> + unsigned long timestamp;
> +};
> +
> /*
> * * Arguments to the lock,lockt, and locku call.
> * */

2010-11-12 08:49:25

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 11/22] pnfs-submit: remove _pnfs_can_return_lseg call from pnfs_clear_lseg_list

Instead, add a mark_lseg_invalid function that marks the lseg invalid and
removes the reference that holds it in the list. Now, when I/O is finished,
the lseg will automatically be removed from the list. This is
at the heart of many of the upcoming cb_layoutrecall changes.

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 130 +++++++++++++++++++++++++++++++++++----------------------
1 files changed, 80 insertions(+), 50 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 120590b..1147eb3 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -272,10 +272,42 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
lseg->layout = lo;
}

+static void
+_put_lseg_common(struct pnfs_layout_segment *lseg)
+{
+ BUG_ON(lseg->valid == true);
+ list_del(&lseg->fi_list);
+ if (list_empty(&lseg->layout->segs)) {
+ struct nfs_client *clp;
+
+ clp = NFS_SERVER(lseg->layout->inode)->nfs_client;
+ spin_lock(&clp->cl_lock);
+ /* List does not take a reference, so no need for put here */
+ list_del_init(&lseg->layout->layouts);
+ spin_unlock(&clp->cl_lock);
+ pnfs_invalidate_layout_stateid(lseg->layout);
+ }
+ rpc_wake_up(&NFS_I(lseg->layout->inode)->lo_rpcwaitq);
+}
+
+/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
+ * could sleep, so must be called outside of the lock.
+ */
+static void
+put_lseg_locked(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list)
+{
+ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
+ atomic_read(&lseg->pls_refcount), lseg->valid);
+ if (atomic_dec_and_test(&lseg->pls_refcount)) {
+ _put_lseg_common(lseg);
+ list_add(&lseg->fi_list, tmp_list);
+ }
+}
+
void
put_lseg(struct pnfs_layout_segment *lseg)
{
- bool do_wake_up;
struct inode *ino;

if (!lseg)
@@ -283,15 +315,14 @@ put_lseg(struct pnfs_layout_segment *lseg)

dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
atomic_read(&lseg->pls_refcount), lseg->valid);
- do_wake_up = !lseg->valid;
ino = lseg->layout->inode;
- if (atomic_dec_and_test(&lseg->pls_refcount)) {
+ if (atomic_dec_and_lock(&lseg->pls_refcount, &ino->i_lock)) {
+ _put_lseg_common(lseg);
+ spin_unlock(&ino->i_lock);
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
put_layout_hdr(ino);
}
- if (do_wake_up)
- rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq);
}
EXPORT_SYMBOL_GPL(put_lseg);

@@ -314,10 +345,18 @@ should_free_lseg(struct pnfs_layout_range *lseg_range,
lseg_range->iomode == recall_range->iomode);
}

-static bool
-_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg)
+static void mark_lseg_invalid(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list)
{
- return atomic_read(&lseg->pls_refcount) == 1;
+ assert_spin_locked(&lseg->layout->inode->i_lock);
+ if (lseg->valid) {
+ lseg->valid = false;
+ /* Remove the reference keeping the lseg in the
+ * list. It will now be removed when all
+ * outstanding io is finished.
+ */
+ put_lseg_locked(lseg, tmp_list);
+ }
}

static void
@@ -330,42 +369,31 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
__func__, lo, range->offset, range->length, range->iomode);

assert_spin_locked(&lo->inode->i_lock);
- list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
- if (!should_free_lseg(&lseg->range, range) ||
- !_pnfs_can_return_lseg(lseg))
- continue;
- dprintk("%s: freeing lseg %p iomode %d "
- "offset %llu length %llu\n", __func__,
- lseg, lseg->range.iomode, lseg->range.offset,
- lseg->range.length);
- list_move(&lseg->fi_list, tmp_list);
- }
- if (list_empty(&lo->segs)) {
- struct nfs_client *clp;
-
- clp = NFS_SERVER(lo->inode)->nfs_client;
- spin_lock(&clp->cl_lock);
- /* List does not take a reference, so no need for put here */
- list_del_init(&lo->layouts);
- spin_unlock(&clp->cl_lock);
- pnfs_invalidate_layout_stateid(lo);
- }
-
+ list_for_each_entry_safe(lseg, next, &lo->segs, fi_list)
+ if (should_free_lseg(&lseg->range, range)) {
+ dprintk("%s: freeing lseg %p iomode %d "
+ "offset %llu length %llu\n", __func__,
+ lseg, lseg->range.iomode, lseg->range.offset,
+ lseg->range.length);
+ mark_lseg_invalid(lseg, tmp_list);
+ }
dprintk("%s:Return\n", __func__);
}

static void
-pnfs_free_lseg_list(struct list_head *tmp_list)
+pnfs_free_lseg_list(struct list_head *free_me)
{
- struct pnfs_layout_segment *lseg;
+ struct pnfs_layout_segment *lseg, *tmp;
+ struct inode *ino;

- while (!list_empty(tmp_list)) {
- lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
- fi_list);
- dprintk("%s calling put_lseg on %p\n", __func__, lseg);
- list_del(&lseg->fi_list);
- put_lseg(lseg);
+ list_for_each_entry_safe(lseg, tmp, free_me, fi_list) {
+ BUG_ON(atomic_read(&lseg->pls_refcount) != 0);
+ ino = lseg->layout->inode;
+ NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
+ put_layout_hdr(ino);
}
+ INIT_LIST_HEAD(free_me);
}

void
@@ -530,6 +558,10 @@ has_layout_to_return(struct pnfs_layout_hdr *lo,
return out;
}

+/* Return true if there is layout based io in progress in the given range.
+ * Assumes range has already been marked invalid, and layout marked to
+ * prevent any new lseg from being inserted.
+ */
bool
pnfs_return_layout_barrier(struct nfs_inode *nfsi,
struct pnfs_layout_range *range)
@@ -538,17 +570,11 @@ pnfs_return_layout_barrier(struct nfs_inode *nfsi,
bool ret = false;

spin_lock(&nfsi->vfs_inode.i_lock);
- list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) {
- if (!should_free_lseg(&lseg->range, range))
- continue;
- lseg->valid = false;
- if (!_pnfs_can_return_lseg(lseg)) {
- dprintk("%s: wait on lseg %p refcount %d\n",
- __func__, lseg,
- atomic_read(&lseg->pls_refcount));
+ list_for_each_entry(lseg, &nfsi->layout->segs, fi_list)
+ if (should_free_lseg(&lseg->range, range)) {
ret = true;
+ break;
}
- }
spin_unlock(&nfsi->vfs_inode.i_lock);
dprintk("%s:Return %d\n", __func__, ret);
return ret;
@@ -558,13 +584,11 @@ void
pnfs_layoutreturn_release(struct nfs4_layoutreturn *lrp)
{
struct pnfs_layout_hdr *lo;
- LIST_HEAD(tmp_list);

if (lrp->args.return_type != RETURN_FILE)
return;
lo = NFS_I(lrp->args.inode)->layout;
spin_lock(&lrp->args.inode->i_lock);
- pnfs_clear_lseg_list(lo, &tmp_list, &lrp->args.range);
lo->plh_block_lgets--;
if (!pnfs_layoutgets_blocked(lo))
rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
@@ -576,7 +600,6 @@ pnfs_layoutreturn_release(struct nfs4_layoutreturn *lrp)
pnfs_set_layout_stateid(lo, &lrp->res.stateid);
put_layout_hdr_locked(lo); /* Matched in _pnfs_return_layout */
spin_unlock(&lrp->args.inode->i_lock);
- pnfs_free_lseg_list(&tmp_list);
}

static int
@@ -630,7 +653,11 @@ _pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range,
arg.offset = 0;
arg.length = NFS4_MAX_UINT64;

+ /* probably should BUGON if type != RETURN_FILE */
if (type == RETURN_FILE) {
+ LIST_HEAD(tmp_list);
+ struct pnfs_layout_segment *lseg, *tmp;
+
spin_lock(&ino->i_lock);
lo = nfsi->layout;
if (lo && !has_layout_to_return(lo, &arg))
@@ -642,10 +669,13 @@ _pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range,
}

lo->plh_block_lgets++;
+ list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
+ if (should_free_lseg(&lseg->range, &arg))
+ mark_lseg_invalid(lseg, &tmp_list);
/* Reference matched in pnfs_layoutreturn_release */
get_layout_hdr_locked(lo);
-
spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&tmp_list);

if (layoutcommit_needed(nfsi)) {
if (stateid && !wait) { /* callback */
--
1.7.2.1


2010-11-14 18:12:47

by Benny Halevy

[permalink] [raw]
Subject: [PATCH 2/2] pnfs-submit: handle NFS4ERR_DELEG_REVOKED for LAYOUTRETURN

When getting NFS4ERR_DELEG_REVOKED just clear the lseg list.
If this was part of a singular LAYOUTRETURN call, simulate success
(for now). If this was part of return on close, retry the close
compound without sending either LAYOUTCOMMIT or LAYOUTRETURN.

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4proc.c | 40 +++++++++++++++++++++++++++++++++++++---
fs/nfs/pnfs.c | 6 ++++--
fs/nfs/pnfs.h | 3 +++
3 files changed, 44 insertions(+), 5 deletions(-)

Untested yet...

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 76b3c7d..7aa902d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1880,6 +1880,17 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
nfs4_close_clear_stateid_flags(state,
calldata->arg.fmode);
break;
+ case -NFS4ERR_DELEG_REVOKED:
+ if (calldata->res.op_bitmask & (NFS4_HAS_LAYOUTCOMMIT |
+ NFS4_HAS_LAYOUTRETURN)) {
+ set_bit(NFS_LAYOUT_REVOKED,
+ &NFS_I(calldata->inode)->layout->plh_flags);
+ /* Retry without layout operations as
+ * pnfs_roc will find roc_iomode==0 next time around
+ */
+ rpc_restart_call_prepare(task);
+ break;
+ }
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_BAD_STATEID:
@@ -5639,6 +5650,7 @@ void nfs4_layoutreturn_set_stateid(struct inode *ino,
static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutreturn *lrp = calldata;
+ struct inode *ino = lrp->args.inode;
struct nfs_server *server;

dprintk("--> %s\n", __func__);
@@ -5647,28 +5659,50 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
return;

if (lrp->args.return_type == RETURN_FILE)
- server = NFS_SERVER(lrp->args.inode);
+ server = NFS_SERVER(ino);
else
server = NULL;
if (nfs4_async_handle_error(task, server, NULL, lrp->clp) == -EAGAIN) {
nfs_restart_rpc(task, lrp->clp);
return;
}
- if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE))
- nfs4_layoutreturn_set_stateid(lrp->args.inode, &lrp->res);
+ switch (task->tk_status) {
+ case -NFS4ERR_DELEG_REVOKED:
+ task->tk_status = 0; /* TODO: revalidate remaining layouts? */
+ if (lrp->args.return_type == RETURN_FILE)
+ set_bit(NFS_LAYOUT_REVOKED, &NFS_I(ino)->layout->plh_flags);
+ break;
+ case 0:
+ if (lrp->args.return_type == RETURN_FILE)
+ nfs4_layoutreturn_set_stateid(lrp->args.inode, &lrp->res);
+ }
dprintk("<-- %s\n", __func__);
}

void nfs4_layoutreturn_file_release(struct inode *ino)
{
struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
+ LIST_HEAD(tmp_list);

spin_lock(&ino->i_lock);
+ if (test_bit(NFS_LAYOUT_REVOKED, &lo->plh_flags)) {
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ /* layout driver's free_lseg may block, hence we don't
+ * call pnfs_free_lseg_list under the spin_lock */
+ pnfs_clear_lseg_list(lo, &tmp_list, &range);
+ clear_bit(NFS_LAYOUT_REVOKED, &lo->plh_flags);
+ }
lo->plh_block_lgets--;
lo->plh_outstanding--;
if (!pnfs_layoutgets_blocked(lo, NULL))
rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&tmp_list);
put_layout_hdr(ino);
}

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 90a868b..0b7fc1d 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -375,7 +375,7 @@ static void mark_lseg_invalid(struct pnfs_layout_segment *lseg,
}
}

-static void
+void
pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
struct pnfs_layout_range *range)
{
@@ -644,10 +644,12 @@ pnfs_roc(struct nfs4_closedata *data)
spin_lock(&data->inode->i_lock);
lo = NFS_I(data->inode)->layout;
if (!lo || lo->roc_iomode == 0 ||
- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+ test_bit(NFS_LAYOUT_REVOKED, &lo->plh_flags))
goto out_nolayout;

range.iomode = lo->roc_iomode;
+ lo->roc_iomode = 0;
list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
if (should_free_lseg(&lseg->range, &range)) {
mark_lseg_invalid(lseg, &tmp_list);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e553311..471813a 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -56,6 +56,7 @@ enum {
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
NFS_LAYOUT_NEED_LCOMMIT, /* LAYOUTCOMMIT needed */
+ NFS_LAYOUT_REVOKED, /* layout revoked by the server */
};

/* Per-layout driver specific registration structure */
@@ -223,6 +224,8 @@ void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_clear_lseg_list(struct pnfs_layout_hdr *, struct list_head *tmp_list,
+ struct pnfs_layout_range *);
void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
void pnfs_destroy_all_layouts(struct nfs_client *);
--
1.7.2.3


2010-11-12 08:49:23

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 01/22] pnfs-submit: remove RPC_ASSASSINATED(task) checks

See Trond's commit a6f03393ec8 "NFSv4: Get rid of the bogus
RPC_ASSASSINATED(task) checks"

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4proc.c | 6 ------
1 files changed, 0 insertions(+), 6 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 73bd44e..ce322e5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5483,9 +5483,6 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
if (!nfs4_sequence_done(task, &data->res.seq_res))
return;

- if (RPC_ASSASSINATED(task))
- return;
-
if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
nfs_restart_rpc(task, server->nfs_client);

@@ -5590,9 +5587,6 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
if (!nfs4_sequence_done(task, &lrp->res.seq_res))
return;

- if (RPC_ASSASSINATED(task))
- return;
-
if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
nfs_restart_rpc(task, server->nfs_client);

--
1.7.2.1


2010-11-12 08:49:24

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 08/22] SQUASHME: allow cb_sequence changes to compile without v4.1

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback.h | 26 ++++++++++++++++++++++++++
fs/nfs/callback_proc.c | 6 ------
fs/nfs/callback_xdr.c | 3 +--
fs/nfs/internal.h | 4 ++++
4 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 89fee05..0b1f3c4 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -8,6 +8,8 @@
#ifndef __LINUX_FS_NFS_CALLBACK_H
#define __LINUX_FS_NFS_CALLBACK_H

+#include "internal.h"
+
#define NFS4_CALLBACK 0x40000000
#define NFS4_CALLBACK_XDRSIZE 2048
#define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE)
@@ -158,6 +160,30 @@ extern unsigned nfs4_callback_layoutrecall(
struct cb_layoutrecallargs *args,
void *dummy, struct cb_process_state *cps);

+static inline void put_session_client(struct nfs4_session *session)
+{
+ if (session) /* matched by cb_sequence find_client_with_session */
+ nfs_put_client(session->clp);
+}
+
+static inline struct nfs_client *
+find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
+{
+ return cps->session ? cps->session->clp : nfs_find_client(addr, 4);
+}
+
+#else
+
+static inline struct nfs_client *
+find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
+{
+ return nfs_find_client(addr, 4);
+}
+
+static inline void put_session_client(struct nfs4_session *session)
+{
+}
+
#endif /* CONFIG_NFS_V4_1 */

extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2e62155..d02997a 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -21,12 +21,6 @@
#define NFSDBG_FACILITY NFSDBG_CALLBACK
#endif

-static struct nfs_client *
-find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
-{
- return cps->session ? cps->session->clp : nfs_find_client(addr, 4);
-}
-
__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
struct cb_getattrres *res,
struct cb_process_state *cps)
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 1650ab0..01688ce 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -770,8 +770,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r

*hdr_res.status = status;
*hdr_res.nops = htonl(nops);
- if (cps.session) /* matched by cb_sequence find_client_with_session */
- nfs_put_client(cps.session->clp);
+ put_session_client(cps.session);
dprintk("%s: done, status = %u\n", __func__, ntohl(status));
return rpc_success;
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 755e555..6f14089 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -2,6 +2,8 @@
* NFS internal definitions
*/

+#ifndef __LINUX_FS_NFS_INTERNAL_H
+#define __LINUX_FS_NFS_INTERNAL_H
#include "nfs4_fs.h"
#include <linux/mount.h>
#include <linux/security.h>
@@ -415,3 +417,5 @@ static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client
return rpc_restart_call_prepare(task);
return rpc_restart_call(task);
}
+
+#endif /* __LINUX_FS_NFS_INTERNAL_H */
--
1.7.2.1


2010-11-12 08:49:26

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

Remove NFS_LAYOUT_STATEID_SET in favor of just checking list_empty(lo->segs).

LAYOUTGETs with openstateid are serialized. Waiting on the condition
(list_empty(lo->segs) && plh_outstanding>0) both drains outstanding RPCs once
the stateid is invalidated and allows only a single LAYOUTGET(openstateid)
through at a time.

Before sending a LAYOUTRETURN, plh_block_lgets is incremented. It is
decremented in the rpc_release function. While set, LAYOUTGETs are
paused in their rpc_prepare function, and any responses are
forgotten.

Callbacks are handled by blocking any matching LAYOUTGETS while processing and
initiating drain of IO. A notification system is set up so that when
all relevant IO is finished, the state manager thread is invoked, which
synchronously sends the final matching LAYOUTRETURN before unblocking
LAYOUTGETS.

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback.h | 7 +
fs/nfs/callback_proc.c | 466 +++++++++++++++++++++++----------------------
fs/nfs/client.c | 3 +
fs/nfs/nfs4proc.c | 81 ++++++--
fs/nfs/nfs4state.c | 4 +
fs/nfs/nfs4xdr.c | 16 ++-
fs/nfs/pnfs.c | 177 +++++++++++++-----
fs/nfs/pnfs.h | 41 +++-
include/linux/nfs_fs_sb.h | 4 +
9 files changed, 497 insertions(+), 302 deletions(-)

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index cea58cc..4a9905b 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -163,6 +163,9 @@ struct cb_layoutrecallargs {
extern unsigned nfs4_callback_layoutrecall(
struct cb_layoutrecallargs *args,
void *dummy, struct cb_process_state *cps);
+extern bool matches_outstanding_recall(struct inode *ino,
+ struct pnfs_layout_range *range);
+extern void nfs_client_return_layouts(struct nfs_client *clp);

static inline void put_session_client(struct nfs4_session *session)
{
@@ -178,6 +181,10 @@ find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)

#else

+static inline void nfs_client_return_layouts(struct nfs_client *clp)
+{
+}
+
static inline struct nfs_client *
find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
{
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 6e0fc40..af405cf 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -124,265 +124,283 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
#if defined(CONFIG_NFS_V4_1)

static bool
-pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo,
- const nfs4_stateid stateid)
+_recall_matches_lget(struct pnfs_cb_lrecall_info *cb_info,
+ struct inode *ino, struct pnfs_layout_range *range)
{
- bool res;
- u32 oldseqid, newseqid;
-
- spin_lock(&lo->inode->i_lock);
- {
- oldseqid = be32_to_cpu(lo->stateid.stateid.seqid);
- newseqid = be32_to_cpu(stateid.stateid.seqid);
- res = !memcmp(lo->stateid.stateid.other,
- stateid.stateid.other,
- NFS4_STATEID_OTHER_SIZE);
- if (res) { /* comparing layout stateids */
- if (oldseqid == ~0)
- res = (newseqid == 1);
- else
- res = (newseqid == oldseqid + 1);
- } else { /* open stateid */
- res = !memcmp(lo->stateid.data,
- &zero_stateid,
- NFS4_STATEID_SIZE);
- if (res)
- res = (newseqid == 1);
- }
- }
- spin_unlock(&lo->inode->i_lock);
+ struct cb_layoutrecallargs *cb_args = &cb_info->pcl_args;

- return res;
+ switch (cb_args->cbl_recall_type) {
+ case RETURN_ALL:
+ return true;
+ case RETURN_FSID:
+ return !memcmp(&NFS_SERVER(ino)->fsid, &cb_args->cbl_fsid,
+ sizeof(struct nfs_fsid));
+ case RETURN_FILE:
+ return (ino == cb_info->pcl_ino) &&
+ should_free_lseg(range, &cb_args->cbl_range);
+ default:
+ BUG();
+ }
}

-/*
- * Retrieve an inode based on layout recall parameters
- *
- * Note: caller must iput(inode) to dereference the inode.
- */
-static struct inode *
-nfs_layoutrecall_find_inode(struct nfs_client *clp,
- const struct cb_layoutrecallargs *args)
+bool
+matches_outstanding_recall(struct inode *ino, struct pnfs_layout_range *range)
{
- struct nfs_inode *nfsi;
- struct pnfs_layout_hdr *lo;
- struct nfs_server *server;
- struct inode *ino = NULL;
-
- dprintk("%s: Begin recall_type=%d clp %p\n",
- __func__, args->cbl_recall_type, clp);
-
- spin_lock(&clp->cl_lock);
- list_for_each_entry(lo, &clp->cl_layouts, layouts) {
- nfsi = NFS_I(lo->inode);
- if (!nfsi)
- continue;
-
- dprintk("%s: Searching inode=%lu\n",
- __func__, nfsi->vfs_inode.i_ino);
-
- if (args->cbl_recall_type == RETURN_FILE) {
- if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh))
- continue;
- } else if (args->cbl_recall_type == RETURN_FSID) {
- server = NFS_SERVER(&nfsi->vfs_inode);
- if (server->fsid.major != args->cbl_fsid.major ||
- server->fsid.minor != args->cbl_fsid.minor)
- continue;
+ struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
+ struct pnfs_cb_lrecall_info *cb_info;
+ bool rv = false;
+
+ assert_spin_locked(&clp->cl_lock);
+ list_for_each_entry(cb_info, &clp->cl_layoutrecalls, pcl_list) {
+ if (_recall_matches_lget(cb_info, ino, range)) {
+ rv = true;
+ break;
}
-
- /* Make sure client didn't clean up layout without
- * telling the server */
- if (!has_layout(nfsi))
- continue;
-
- ino = igrab(&nfsi->vfs_inode);
- dprintk("%s: Found inode=%p\n", __func__, ino);
- break;
}
- spin_unlock(&clp->cl_lock);
- return ino;
+ return rv;
}

-struct recall_layout_threadargs {
- struct inode *inode;
- struct nfs_client *clp;
- struct completion started;
- struct cb_layoutrecallargs *rl;
- int result;
-};
-
-static int pnfs_recall_layout(void *data)
+/* Send a synchronous LAYOUTRETURN. By the time this is called, we know
+ * all IO has been drained, any matching lsegs deleted, and that no
+ * overlapping LAYOUTGETs will be sent or processed for the duration
+ * of this call.
+ * Note that it is possible that when this is called, the stateid has
+ * been invalidated. But will not be cleared, so can still use.
+ */
+static int
+pnfs_send_layoutreturn(struct nfs_client *clp,
+ struct pnfs_cb_lrecall_info *cb_info)
{
- struct inode *inode, *ino;
- struct nfs_client *clp;
- struct cb_layoutrecallargs rl;
+ struct cb_layoutrecallargs *args = &cb_info->pcl_args;
struct nfs4_layoutreturn *lrp;
- struct recall_layout_threadargs *args =
- (struct recall_layout_threadargs *)data;
- int status = 0;
-
- daemonize("nfsv4-layoutreturn");
-
- dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n",
- __func__, args->rl->cbl_recall_type,
- args->rl->cbl_fsid.major, args->rl->cbl_fsid.minor);
-
- clp = args->clp;
- inode = args->inode;
- rl = *args->rl;
-
- /* support whole file layouts only */
- rl.cbl_range.offset = 0;
- rl.cbl_range.length = NFS4_MAX_UINT64;
-
- if (rl.cbl_recall_type == RETURN_FILE) {
- if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout,
- rl.cbl_stateid))
- status = pnfs_return_layout(inode, &rl.cbl_range,
- &rl.cbl_stateid, RETURN_FILE,
- false);
- else
- status = cpu_to_be32(NFS4ERR_DELAY);
- if (status)
- dprintk("%s RETURN_FILE error: %d\n", __func__, status);
- else
- status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
- args->result = status;
- complete(&args->started);
- goto out;
- }
-
- status = cpu_to_be32(NFS4_OK);
- args->result = status;
- complete(&args->started);
- args = NULL;
-
- /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */
- while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) {
- /* FIXME: need to check status on pnfs_return_layout */
- pnfs_return_layout(ino, &rl.cbl_range, NULL, RETURN_FILE, false);
- iput(ino);
- }

lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
- if (!lrp) {
- dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n",
- __func__);
- goto out;
- }
-
- /* send final layoutreturn */
+ if (!lrp)
+ return -ENOMEM;
lrp->args.reclaim = 0;
- lrp->args.layout_type = rl.cbl_layout_type;
- lrp->args.return_type = rl.cbl_recall_type;
+ lrp->args.layout_type = args->cbl_layout_type;
+ lrp->args.return_type = args->cbl_recall_type;
lrp->clp = clp;
- lrp->args.range = rl.cbl_range;
- lrp->args.inode = inode;
- nfs4_proc_layoutreturn(lrp, true);
-
-out:
- clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
- nfs_put_client(clp);
- module_put_and_exit(0);
- dprintk("%s: exit status %d\n", __func__, 0);
- return 0;
+ if (args->cbl_recall_type == RETURN_FILE) {
+ lrp->args.range = args->cbl_range;
+ lrp->args.inode = cb_info->pcl_ino;
+ } else {
+ lrp->args.range.iomode = IOMODE_ANY;
+ lrp->args.inode = NULL;
+ }
+ return nfs4_proc_layoutreturn(lrp, true);
}

-/*
- * Asynchronous layout recall!
+/* Called by state manager to finish CB_LAYOUTRECALLS initiated by
+ * nfs4_callback_layoutrecall().
*/
-static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
- struct cb_layoutrecallargs *rl)
+void nfs_client_return_layouts(struct nfs_client *clp)
{
- struct recall_layout_threadargs data = {
- .clp = clp,
- .inode = inode,
- .rl = rl,
- };
- struct task_struct *t;
- int status = -EAGAIN;
+ struct pnfs_cb_lrecall_info *cb_info;

- dprintk("%s: -->\n", __func__);
+ spin_lock(&clp->cl_lock);
+ while (true) {
+ if (list_empty(&clp->cl_layoutrecalls)) {
+ spin_unlock(&clp->cl_lock);
+ break;
+ }
+ cb_info = list_first_entry(&clp->cl_layoutrecalls,
+ struct pnfs_cb_lrecall_info,
+ pcl_list);
+ spin_unlock(&clp->cl_lock);
+ if (atomic_read(&cb_info->pcl_count) != 0)
+ break;
+ /* What to do on error return? These layoutreturns are
+ * required by the protocol. So if do not get
+ * successful reply, probably have to do something
+ * more drastic.
+ */
+ pnfs_send_layoutreturn(clp, cb_info);
+ spin_lock(&clp->cl_lock);
+ /* Removing from the list unblocks LAYOUTGETs */
+ list_del(&cb_info->pcl_list);
+ clp->cl_cb_lrecall_count--;
+ rpc_wake_up(&clp->cl_rpcwaitq_recall);
+ kfree(cb_info);
+ }
+}

- /* FIXME: do not allow two concurrent layout recalls */
- if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state))
- return status;
-
- init_completion(&data.started);
- __module_get(THIS_MODULE);
- atomic_inc(&clp->cl_count);
-
- t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout");
- if (IS_ERR(t)) {
- printk(KERN_INFO "NFS: Layout recall callback thread failed "
- "for client (clientid %08x/%08x)\n",
- (unsigned)(clp->cl_clientid >> 32),
- (unsigned)(clp->cl_clientid));
- status = PTR_ERR(t);
- goto out_module_put;
+void notify_drained(struct pnfs_cb_lrecall_info *d)
+{
+ if (d && atomic_dec_and_test(&d->pcl_count)) {
+ set_bit(NFS4CLNT_LAYOUT_RECALL, &d->pcl_clp->cl_state);
+ nfs4_schedule_state_manager(d->pcl_clp);
}
- wait_for_completion(&data.started);
- return data.result;
-out_module_put:
- nfs_put_client(clp);
- clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
- module_put(THIS_MODULE);
- return status;
}

-static int pnfs_recall_all_layouts(struct nfs_client *clp)
+static int initiate_layout_draining(struct pnfs_cb_lrecall_info *cb_info)
{
- struct cb_layoutrecallargs rl;
- struct inode *inode;
- int status = 0;
-
- rl.cbl_recall_type = RETURN_ALL;
- rl.cbl_range.iomode = IOMODE_ANY;
- rl.cbl_range.offset = 0;
- rl.cbl_range.length = NFS4_MAX_UINT64;
-
- /* we need the inode to get the nfs_server struct */
- inode = nfs_layoutrecall_find_inode(clp, &rl);
- if (!inode)
- return status;
- status = pnfs_async_return_layout(clp, inode, &rl);
- iput(inode);
+ struct nfs_client *clp = cb_info->pcl_clp;
+ struct pnfs_layout_hdr *lo;
+ int rv = NFS4ERR_NOMATCHING_LAYOUT;
+ struct cb_layoutrecallargs *args = &cb_info->pcl_args;
+
+ if (args->cbl_recall_type == RETURN_FILE) {
+ LIST_HEAD(free_me_list);
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry(lo, &clp->cl_layouts, layouts) {
+ if (nfs_compare_fh(&args->cbl_fh,
+ &NFS_I(lo->inode)->fh))
+ continue;
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+ rv = NFS4ERR_DELAY;
+ else {
+ /* FIXME I need to better understand igrab and
+ * does having a layout ref keep ino around?
+ * It should.
+ */
+ /* We need to hold the reference until any
+ * potential LAYOUTRETURN is finished.
+ */
+ get_layout_hdr(lo);
+ cb_info->pcl_ino = lo->inode;
+ rv = NFS4_OK;
+ }
+ break;
+ }
+ spin_unlock(&clp->cl_lock);
+
+ spin_lock(&lo->inode->i_lock);
+ if (rv == NFS4_OK) {
+ lo->plh_block_lgets++;
+ nfs4_asynch_forget_layouts(lo, &args->cbl_range,
+ cb_info, &free_me_list);
+ }
+ pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+ spin_unlock(&lo->inode->i_lock);
+ pnfs_free_lseg_list(&free_me_list);
+ } else {
+ struct pnfs_layout_hdr *tmp;
+ LIST_HEAD(recall_list);
+ LIST_HEAD(free_me_list);
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ spin_lock(&clp->cl_lock);
+ /* Per RFC 5661, 12.5.5.2.1.5, bulk recall must be serialized */
+ if (!list_is_singular(&clp->cl_layoutrecalls)) {
+ spin_unlock(&clp->cl_lock);
+ return NFS4ERR_DELAY;
+ }
+ list_for_each_entry(lo, &clp->cl_layouts, layouts) {
+ if ((args->cbl_recall_type == RETURN_FSID) &&
+ memcmp(&NFS_SERVER(lo->inode)->fsid,
+ &args->cbl_fsid, sizeof(struct nfs_fsid)))
+ continue;
+ get_layout_hdr(lo);
+ /* We could list_del(&lo->layouts) here */
+ BUG_ON(!list_empty(&lo->plh_bulk_recall));
+ list_add(&lo->plh_bulk_recall, &recall_list);
+ }
+ spin_unlock(&clp->cl_lock);
+ list_for_each_entry_safe(lo, tmp,
+ &recall_list, plh_bulk_recall) {
+ spin_lock(&lo->inode->i_lock);
+ set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+ nfs4_asynch_forget_layouts(lo, &range, cb_info,
+ &free_me_list);
+ list_del_init(&lo->plh_bulk_recall);
+ spin_unlock(&lo->inode->i_lock);
+ put_layout_hdr(lo->inode);
+ rv = NFS4_OK;
+ }
+ pnfs_free_lseg_list(&free_me_list);
+ }
+ return rv;
+}
+
+static u32 do_callback_layoutrecall(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{
+ struct pnfs_cb_lrecall_info *new;
+ u32 res;
+
+ dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
+ new = kmalloc(sizeof(*new), GFP_KERNEL);
+ if (!new) {
+ res = NFS4ERR_RESOURCE;
+ goto out;
+ }
+ memcpy(&new->pcl_args, args, sizeof(*args));
+ atomic_set(&new->pcl_count, 1);
+ new->pcl_clp = clp;
+ new->pcl_ino = NULL;
+ spin_lock(&clp->cl_lock);
+ if (clp->cl_cb_lrecall_count >= PNFS_MAX_CB_LRECALLS) {
+ kfree(new);
+ res = NFS4ERR_DELAY;
+ spin_unlock(&clp->cl_lock);
+ goto out;
+ }
+ clp->cl_cb_lrecall_count++;
+ /* Adding to the list will block conflicting LGET activity */
+ list_add_tail(&new->pcl_list, &clp->cl_layoutrecalls);
+ spin_unlock(&clp->cl_lock);
+ res = initiate_layout_draining(new);
+ if (res || atomic_dec_and_test(&new->pcl_count)) {
+ spin_lock(&clp->cl_lock);
+ list_del(&new->pcl_list);
+ clp->cl_cb_lrecall_count--;
+ rpc_wake_up(&clp->cl_rpcwaitq_recall);
+ spin_unlock(&clp->cl_lock);
+ if (res == NFS4_OK) {
+ if (args->cbl_recall_type == RETURN_FILE) {
+ struct pnfs_layout_hdr *lo;
+
+ lo = NFS_I(new->pcl_ino)->layout;
+ spin_lock(&lo->inode->i_lock);
+ lo->plh_block_lgets--;
+ if (!pnfs_layoutgets_blocked(lo, NULL))
+ rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
+ spin_unlock(&lo->inode->i_lock);
+ put_layout_hdr(new->pcl_ino);
+ }
+ res = NFS4ERR_NOMATCHING_LAYOUT;
+ }
+ kfree(new);
+ }
+out:
+ dprintk("%s returning %i\n", __func__, res);
+ return res;

- return status;
}

__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
void *dummy, struct cb_process_state *cps)
{
struct nfs_client *clp;
- struct inode *inode = NULL;
- __be32 res;
- int status;
+ u32 res;

dprintk("%s: -->\n", __func__);

- res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
- if (cps->session) /* set in cb_sequence */
+ if (cps->session) { /* set in cb_sequence */
clp = cps->session->clp;
- else
- goto out;
+ res = do_callback_layoutrecall(clp, args);
+ } else
+ res = NFS4ERR_OP_NOT_IN_SESSION;

- res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
- /*
- * In the _ALL or _FSID case, we need the inode to get
- * the nfs_server struct.
- */
- inode = nfs_layoutrecall_find_inode(clp, args);
- if (!inode)
- goto out;
- status = pnfs_async_return_layout(clp, inode, args);
- if (status)
- res = cpu_to_be32(NFS4ERR_DELAY);
- iput(inode);
-out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
- return res;
+ dprintk("%s: exit with status = %d\n", __func__, res);
+ return cpu_to_be32(res);
+}
+
+static void pnfs_recall_all_layouts(struct nfs_client *clp)
+{
+ struct cb_layoutrecallargs args;
+
+ /* Pretend we got a CB_LAYOUTRECALL(ALL) */
+ memset(&args, 0, sizeof(args));
+ args.cbl_recall_type = RETURN_ALL;
+ /* FIXME we ignore errors, what should we do? */
+ do_callback_layoutrecall(clp, &args);
}

int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
@@ -665,9 +683,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
flags |= FMODE_WRITE;
if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
&args->craa_type_mask))
- if (pnfs_recall_all_layouts(clp) == -EAGAIN)
- status = cpu_to_be32(NFS4ERR_DELAY);
-
+ pnfs_recall_all_layouts(clp);
if (flags)
nfs_expire_all_delegation_types(clp, flags);
out:
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 3c8c841..dbf43e7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -158,6 +158,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
clp->cl_machine_cred = cred;
#if defined(CONFIG_NFS_V4_1)
INIT_LIST_HEAD(&clp->cl_layouts);
+ INIT_LIST_HEAD(&clp->cl_layoutrecalls);
+ rpc_init_wait_queue(&clp->cl_rpcwaitq_recall,
+ "NFS client CB_LAYOUTRECALLS");
#endif
nfs_fscache_get_client_cookie(clp);

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index fe79872..6223c6a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5346,31 +5346,58 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
struct inode *ino = lgp->args.inode;
struct nfs_inode *nfsi = NFS_I(ino);
struct nfs_server *server = NFS_SERVER(ino);
+ struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;

dprintk("--> %s\n", __func__);
+ spin_lock(&clp->cl_lock);
+ if (matches_outstanding_recall(ino, &lgp->args.range)) {
+ rpc_sleep_on(&clp->cl_rpcwaitq_recall, task, NULL);
+ spin_unlock(&clp->cl_lock);
+ return;
+ }
+ spin_unlock(&clp->cl_lock);
+ /* Note there is a race here, where a CB_LAYOUTRECALL can come in
+ * right now covering the LAYOUTGET we are about to send.
+ * However, that is not so catastrophic, and there seems
+ * to be no way to prevent it completely.
+ */
spin_lock(&ino->i_lock);
- if (pnfs_layoutgets_blocked(nfsi->layout)) {
+ if (pnfs_layoutgets_blocked(nfsi->layout, NULL)) {
rpc_sleep_on(&nfsi->lo_rpcwaitq_stateid, task, NULL);
spin_unlock(&ino->i_lock);
return;
}
+ /* This needs after above check but atomic with it in order to properly
+ * serialize openstateid LAYOUTGETs.
+ */
+ nfsi->layout->plh_outstanding++;
spin_unlock(&ino->i_lock);
+
if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
- &lgp->res.seq_res, 0, task))
+ &lgp->res.seq_res, 0, task)) {
+ spin_lock(&ino->i_lock);
+ nfsi->layout->plh_outstanding--;
+ spin_unlock(&ino->i_lock);
return;
+ }
rpc_call_start(task);
}

static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutget *lgp = calldata;
- struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+ struct inode *ino = lgp->args.inode;

dprintk("--> %s\n", __func__);

- if (!nfs4_sequence_done(task, &lgp->res.seq_res))
+ if (!nfs4_sequence_done(task, &lgp->res.seq_res)) {
+ /* layout code relies on fact that in this case
+ * code falls back to tk_action=call_start, but not
+ * back to rpc_prepare_task, to keep plh_outstanding
+ * correct.
+ */
return;
-
+ }
switch (task->tk_status) {
case 0:
break;
@@ -5379,7 +5406,11 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
task->tk_status = -NFS4ERR_DELAY;
/* Fall through */
default:
- if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, NFS_SERVER(ino),
+ NULL, NULL) == -EAGAIN) {
+ spin_lock(&ino->i_lock);
+ NFS_I(ino)->layout->plh_outstanding--;
+ spin_unlock(&ino->i_lock);
rpc_restart_call_prepare(task);
return;
}
@@ -5437,13 +5468,20 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
if (IS_ERR(task))
return PTR_ERR(task);
status = nfs4_wait_for_completion_rpc_task(task);
- if (status != 0)
- goto out;
- status = task->tk_status;
- if (status != 0)
- goto out;
- status = pnfs_layout_process(lgp);
-out:
+ if (status == 0)
+ status = task->tk_status;
+ if (status == 0)
+ status = pnfs_layout_process(lgp);
+ else {
+ struct inode *ino = lgp->args.inode;
+ struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
+
+ spin_lock(&ino->i_lock);
+ lo->plh_outstanding--;
+ if (!pnfs_layoutgets_blocked(lo, NULL))
+ rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
+ spin_unlock(&ino->i_lock);
+ }
rpc_put_task(task);
dprintk("<-- %s status=%d\n", __func__, status);
return status;
@@ -5587,9 +5625,9 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)

spin_lock(&lo->inode->i_lock);
if (lrp->res.lrs_present)
- pnfs_set_layout_stateid(lo, &lrp->res.stateid);
+ pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
else
- pnfs_invalidate_layout_stateid(lo);
+ BUG_ON(!list_empty(&lo->segs));
spin_unlock(&lo->inode->i_lock);
}
dprintk("<-- %s\n", __func__);
@@ -5606,10 +5644,11 @@ static void nfs4_layoutreturn_release(void *calldata)

spin_lock(&ino->i_lock);
lo->plh_block_lgets--;
- if (!pnfs_layoutgets_blocked(lo))
+ lo->plh_outstanding--;
+ if (!pnfs_layoutgets_blocked(lo, NULL))
rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
spin_unlock(&ino->i_lock);
- put_layout_hdr(lrp->args.inode);
+ put_layout_hdr(ino);
}
kfree(calldata);
dprintk("<-- %s\n", __func__);
@@ -5639,6 +5678,14 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
int status = 0;

dprintk("--> %s\n", __func__);
+ if (lrp->args.return_type == RETURN_FILE) {
+ struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
+ /* FIXME we should test for BULK here */
+ spin_lock(&lo->inode->i_lock);
+ BUG_ON(lo->plh_block_lgets == 0);
+ lo->plh_outstanding++;
+ spin_unlock(&lo->inode->i_lock);
+ }
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 00632f6..ceb0d66 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1560,6 +1560,10 @@ static void nfs4_state_manager(struct nfs_client *clp)
nfs_client_return_marked_delegations(clp);
continue;
}
+ if (test_and_clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) {
+ nfs_client_return_layouts(clp);
+ continue;
+ }
/* Recall session slots */
if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)
&& nfs4_has_session(clp)) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 328cca5..f530c7e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1827,13 +1827,14 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
hdr->replen += decode_getdeviceinfo_maxsz;
}

-static void
+static int
encode_layoutget(struct xdr_stream *xdr,
const struct nfs4_layoutget_args *args,
struct compound_hdr *hdr)
{
nfs4_stateid stateid;
__be32 *p;
+ int status;

p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
*p++ = cpu_to_be32(OP_LAYOUTGET);
@@ -1843,8 +1844,11 @@ encode_layoutget(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, args->range.offset);
p = xdr_encode_hyper(p, args->range.length);
p = xdr_encode_hyper(p, args->minlength);
- pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
- args->ctx->state);
+ status = pnfs_choose_layoutget_stateid(&stateid,
+ NFS_I(args->inode)->layout,
+ args->ctx->state);
+ if (status)
+ return status;
p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
*p = cpu_to_be32(args->maxcount);

@@ -1857,6 +1861,7 @@ encode_layoutget(struct xdr_stream *xdr,
args->maxcount);
hdr->nops++;
hdr->replen += decode_layoutget_maxsz;
+ return 0;
}

static int
@@ -2782,12 +2787,15 @@ static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
};
+ int status;

xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, req, &hdr);
encode_sequence(&xdr, &args->seq_args, &hdr);
encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
- encode_layoutget(&xdr, args, &hdr);
+ status = encode_layoutget(&xdr, args, &hdr);
+ if (status)
+ return status;
encode_nops(&hdr);
return 0;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 07b04e8..2d817be 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
*/

/* Need to hold i_lock if caller does not already hold reference */
-static void
+void
get_layout_hdr(struct pnfs_layout_hdr *lo)
{
atomic_inc(&lo->plh_refcount);
@@ -278,24 +278,29 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
smp_mb();
lseg->valid = true;
lseg->layout = lo;
+ lseg->drain_notification = NULL;
}

static void
_put_lseg_common(struct pnfs_layout_segment *lseg)
{
+ struct inode *ino = lseg->layout->inode;
+
BUG_ON(lseg->valid == true);
list_del(&lseg->fi_list);
if (list_empty(&lseg->layout->segs)) {
struct nfs_client *clp;

- clp = NFS_SERVER(lseg->layout->inode)->nfs_client;
+ clp = NFS_SERVER(ino)->nfs_client;
spin_lock(&clp->cl_lock);
/* List does not take a reference, so no need for put here */
list_del_init(&lseg->layout->layouts);
spin_unlock(&clp->cl_lock);
- pnfs_invalidate_layout_stateid(lseg->layout);
+ clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->layout->plh_flags);
+ if (!pnfs_layoutgets_blocked(lseg->layout, NULL))
+ rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
}
- rpc_wake_up(&NFS_I(lseg->layout->inode)->lo_rpcwaitq);
+ rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq);
}

/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
@@ -325,9 +330,12 @@ put_lseg(struct pnfs_layout_segment *lseg)
atomic_read(&lseg->pls_refcount), lseg->valid);
ino = lseg->layout->inode;
if (atomic_dec_and_lock(&lseg->pls_refcount, &ino->i_lock)) {
+ struct pnfs_cb_lrecall_info *drain_info = lseg->drain_notification;
+
_put_lseg_common(lseg);
spin_unlock(&ino->i_lock);
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ notify_drained(drain_info);
/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
put_layout_hdr(ino);
}
@@ -345,7 +353,7 @@ EXPORT_SYMBOL_GPL(put_lseg);
* READ READ true
* READ RW false
*/
-static int
+bool
should_free_lseg(struct pnfs_layout_range *lseg_range,
struct pnfs_layout_range *recall_range)
{
@@ -388,16 +396,19 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
dprintk("%s:Return\n", __func__);
}

-static void
+void
pnfs_free_lseg_list(struct list_head *free_me)
{
struct pnfs_layout_segment *lseg, *tmp;
struct inode *ino;
+ struct pnfs_cb_lrecall_info *drain_info;

list_for_each_entry_safe(lseg, tmp, free_me, fi_list) {
BUG_ON(atomic_read(&lseg->pls_refcount) != 0);
ino = lseg->layout->inode;
+ drain_info = lseg->drain_notification;
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ notify_drained(drain_info);
/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
put_layout_hdr(ino);
}
@@ -453,40 +464,49 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
}
}

-/* update lo->stateid with new if is more recent
- *
- * lo->stateid could be the open stateid, in which case we just use what given.
- */
+/* update lo->stateid with new if is more recent */
void
-pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
- const nfs4_stateid *new)
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
+ bool update_barrier)
{
- nfs4_stateid *old = &lo->stateid;
- bool overwrite = false;
+ u32 oldseq, newseq;

assert_spin_locked(&lo->inode->i_lock);
- if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags) ||
- memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
- overwrite = true;
- else {
- u32 oldseq, newseq;
-
- oldseq = be32_to_cpu(old->stateid.seqid);
- newseq = be32_to_cpu(new->stateid.seqid);
- if ((int)(newseq - oldseq) > 0)
- overwrite = true;
+ oldseq = be32_to_cpu(lo->stateid.stateid.seqid);
+ newseq = be32_to_cpu(new->stateid.seqid);
+ if ((int)(newseq - oldseq) > 0) {
+ memcpy(&lo->stateid, &new->stateid, sizeof(new->stateid));
+ if (update_barrier)
+ lo->plh_barrier = be32_to_cpu(new->stateid.seqid);
+ else {
+ /* Because of wraparound, we want to keep the barrier
+ * "close" to the current seqids. It needs to be
+ * within 2**31 to count as "behind", so if it
+ * gets too near that limit, give us a little leeway
+ * and bring it to within 2**30.
+ * NOTE - and yes, this is all unsigned arithmetic.
+ */
+ if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
+ lo->plh_barrier = newseq - (1 << 30);
+ }
}
- if (overwrite)
- memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
}

-void
-pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
- struct nfs4_state *open_state)
+int
+pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+ struct nfs4_state *open_state)
{
+ int status = 0;
+
dprintk("--> %s\n", __func__);
spin_lock(&lo->inode->i_lock);
- if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags)) {
+ if (lo->plh_block_lgets ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ /* We avoid -EAGAIN, as that has special meaning to
+ * some callers.
+ */
+ status = -NFS4ERR_LAYOUTTRYLATER;
+ } else if (list_empty(&lo->segs)) {
int seq;

do {
@@ -494,12 +514,11 @@ pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
memcpy(dst->data, open_state->stateid.data,
sizeof(open_state->stateid.data));
} while (read_seqretry(&open_state->seqlock, seq));
- set_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
} else
- memcpy(dst->data, lo->stateid.data,
- sizeof(lo->stateid.data));
+ memcpy(dst->data, lo->stateid.data, sizeof(lo->stateid.data));
spin_unlock(&lo->inode->i_lock);
dprintk("<-- %s\n", __func__);
+ return status;
}

/*
@@ -566,6 +585,28 @@ has_layout_to_return(struct pnfs_layout_hdr *lo,
return out;
}

+void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range,
+ struct pnfs_cb_lrecall_info *drain_info,
+ struct list_head *tmp_list)
+{
+ struct pnfs_layout_segment *lseg, *tmp;
+
+ assert_spin_locked(&lo->inode->i_lock);
+ list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
+ if (should_free_lseg(&lseg->range, range)) {
+ /* FIXME - need to change to something like a
+ * notification bitmap to remove the restriction
+ * of only being able to process a single
+ * CB_LAYOUTRECALL at a time.
+ */
+ BUG_ON(lseg->drain_notification);
+ lseg->drain_notification = drain_info;
+ atomic_inc(&drain_info->pcl_count);
+ mark_lseg_invalid(lseg, tmp_list);
+ }
+}
+
/* Return true if there is layout based io in progress in the given range.
* Assumes range has already been marked invalid, and layout marked to
* prevent any new lseg from being inserted.
@@ -711,14 +752,6 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
dprintk("%s:Begin\n", __func__);

assert_spin_locked(&lo->inode->i_lock);
- if (list_empty(&lo->segs)) {
- struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
-
- spin_lock(&clp->cl_lock);
- BUG_ON(!list_empty(&lo->layouts));
- list_add_tail(&lo->layouts, &clp->cl_layouts);
- spin_unlock(&clp->cl_lock);
- }
list_for_each_entry(lp, &lo->segs, fi_list) {
if (cmp_layout(&lp->range, &lseg->range) > 0)
continue;
@@ -735,6 +768,9 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
}
if (!found) {
list_add_tail(&lseg->fi_list, &lo->segs);
+ if (list_is_singular(&lo->segs) &&
+ !pnfs_layoutgets_blocked(lo, NULL))
+ rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
dprintk("%s: inserted lseg %p "
"iomode %d offset %llu length %llu at tail\n",
__func__, lseg, lseg->range.iomode,
@@ -756,6 +792,7 @@ alloc_init_layout_hdr(struct inode *ino)
atomic_set(&lo->plh_refcount, 1);
INIT_LIST_HEAD(&lo->layouts);
INIT_LIST_HEAD(&lo->segs);
+ INIT_LIST_HEAD(&lo->plh_bulk_recall);
lo->inode = ino;
return lo;
}
@@ -843,6 +880,7 @@ pnfs_update_layout(struct inode *ino,
.length = NFS4_MAX_UINT64,
};
struct nfs_inode *nfsi = NFS_I(ino);
+ struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
struct pnfs_layout_hdr *lo;
struct pnfs_layout_segment *lseg = NULL;

@@ -878,9 +916,28 @@ pnfs_update_layout(struct inode *ino,
goto out_unlock;

get_layout_hdr(lo); /* Matched in pnfs_layoutget_release */
+ if (list_empty(&lo->segs)) {
+ /* The lo must be on the clp list if there is any
+ * chance of a CB_LAYOUTRECALL(FILE) coming in.
+ */
+ spin_lock(&clp->cl_lock);
+ BUG_ON(!list_empty(&lo->layouts));
+ list_add_tail(&lo->layouts, &clp->cl_layouts);
+ spin_unlock(&clp->cl_lock);
+ }
spin_unlock(&ino->i_lock);

lseg = send_layoutget(lo, ctx, &arg);
+ if (!lseg) {
+ spin_lock(&ino->i_lock);
+ if (list_empty(&lo->segs)) {
+ spin_lock(&clp->cl_lock);
+ list_del_init(&lo->layouts);
+ spin_unlock(&clp->cl_lock);
+ clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+ }
+ spin_unlock(&ino->i_lock);
+ }
out:
dprintk("%s end, state 0x%lx lseg %p\n", __func__,
nfsi->layout->plh_flags, lseg);
@@ -891,10 +948,15 @@ out_unlock:
}

bool
-pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo)
+pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid)
{
assert_spin_locked(&lo->inode->i_lock);
- return lo->plh_block_lgets;
+ if ((stateid) &&
+ (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
+ return true;
+ return lo->plh_block_lgets ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+ (list_empty(&lo->segs) && lo->plh_outstanding);
}

int
@@ -904,6 +966,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct nfs4_layoutget_res *res = &lgp->res;
struct pnfs_layout_segment *lseg;
struct inode *ino = lo->inode;
+ struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
int status = 0;

/* Inject layout blob into I/O device driver */
@@ -915,10 +978,25 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
status = PTR_ERR(lseg);
dprintk("%s: Could not allocate layout: error %d\n",
__func__, status);
+ spin_lock(&ino->i_lock);
goto out;
}

spin_lock(&ino->i_lock);
+ /* decrement needs to be done before call to pnfs_layoutgets_blocked */
+ lo->plh_outstanding--;
+ spin_lock(&clp->cl_lock);
+ if (matches_outstanding_recall(ino, &res->range)) {
+ spin_unlock(&clp->cl_lock);
+ dprintk("%s forget reply due to recall\n", __func__);
+ goto out_forget_reply;
+ }
+ spin_unlock(&clp->cl_lock);
+
+ if (pnfs_layoutgets_blocked(lo, &res->stateid)) {
+ dprintk("%s forget reply due to state\n", __func__);
+ goto out_forget_reply;
+ }
init_lseg(lo, lseg);
lseg->range = res->range;
get_lseg(lseg);
@@ -934,10 +1012,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
}

/* Done processing layoutget. Set the layout stateid */
- pnfs_set_layout_stateid(lo, &res->stateid);
- spin_unlock(&ino->i_lock);
+ pnfs_set_layout_stateid(lo, &res->stateid, false);
out:
+ if (!pnfs_layoutgets_blocked(lo, NULL))
+ rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
+ spin_unlock(&ino->i_lock);
return status;
+
+out_forget_reply:
+ spin_unlock(&ino->i_lock);
+ lseg->layout = lo;
+ NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ spin_lock(&ino->i_lock);
+ goto out;
}

void
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 891aeab..7ea121f 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -31,6 +31,7 @@
#define FS_NFS_PNFS_H

#include <linux/nfs_page.h>
+#include "callback.h" /* for cb_layoutrecallargs */

struct pnfs_layout_segment {
struct list_head fi_list;
@@ -38,6 +39,7 @@ struct pnfs_layout_segment {
atomic_t pls_refcount;
bool valid;
struct pnfs_layout_hdr *layout;
+ struct pnfs_cb_lrecall_info *drain_notification;
};

enum pnfs_try_status {
@@ -52,7 +54,7 @@ enum pnfs_try_status {
enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
- NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */
+ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
NFS_LAYOUT_NEED_LCOMMIT, /* LAYOUTCOMMIT needed */
};

@@ -94,10 +96,13 @@ struct pnfs_layoutdriver_type {
struct pnfs_layout_hdr {
atomic_t plh_refcount;
struct list_head layouts; /* other client layouts */
+ struct list_head plh_bulk_recall; /* clnt list of bulk recalls */
struct list_head segs; /* layout segments list */
int roc_iomode;/* return on close iomode, 0=none */
nfs4_stateid stateid;
+ unsigned long plh_outstanding; /* number of RPCs out */
unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
+ u32 plh_barrier; /* ignore lower seqids */
unsigned long plh_flags;
struct rpc_cred *cred; /* layoutcommit credential */
/* DH: These vars keep track of the maximum write range
@@ -118,6 +123,14 @@ struct pnfs_device {
unsigned int pglen;
};

+struct pnfs_cb_lrecall_info {
+ struct list_head pcl_list; /* hook into cl_layoutrecalls list */
+ atomic_t pcl_count;
+ struct nfs_client *pcl_clp;
+ struct inode *pcl_ino;
+ struct cb_layoutrecallargs pcl_args;
+};
+
/*
* Device ID RCU cache. A device ID is unique per client ID and layout type.
*/
@@ -176,7 +189,10 @@ extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);

/* pnfs.c */
+void get_layout_hdr(struct pnfs_layout_hdr *lo);
void put_lseg(struct pnfs_layout_segment *lseg);
+bool should_free_lseg(struct pnfs_layout_range *lseg_range,
+ struct pnfs_layout_range *recall_range);
struct pnfs_layout_segment *
pnfs_has_layout(struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range);
struct pnfs_layout_segment *
@@ -201,15 +217,24 @@ enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *,
void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
struct nfs_open_context *, struct list_head *);
void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
-bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo);
+bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
void pnfs_destroy_all_layouts(struct nfs_client *);
void put_layout_hdr(struct inode *inode);
void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
- const nfs4_stateid *new);
-void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
- struct nfs4_state *open_state);
+ const nfs4_stateid *new,
+ bool update_barrier);
+int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
+ struct pnfs_layout_hdr *lo,
+ struct nfs4_state *open_state);
+void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range,
+ struct pnfs_cb_lrecall_info *drain_info,
+ struct list_head *tmp_list);
+/* FIXME - this should be in callback.h, but pnfs_cb_lrecall_info needs to be there too */
+extern void notify_drained(struct pnfs_cb_lrecall_info *d);

static inline bool
has_layout(struct nfs_inode *nfsi)
@@ -223,12 +248,6 @@ static inline int lo_fail_bit(u32 iomode)
NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
}

-static inline void pnfs_invalidate_layout_stateid(struct pnfs_layout_hdr *lo)
-{
- assert_spin_locked(&lo->inode->i_lock);
- clear_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
-}
-
static inline void get_lseg(struct pnfs_layout_segment *lseg)
{
atomic_inc(&lseg->pls_refcount);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 3cae408..80dcc00 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -83,6 +83,10 @@ struct nfs_client {
u32 cl_exchange_flags;
struct nfs4_session *cl_session; /* sharred session */
struct list_head cl_layouts;
+ struct list_head cl_layoutrecalls;
+ unsigned long cl_cb_lrecall_count;
+#define PNFS_MAX_CB_LRECALLS (1)
+ struct rpc_wait_queue cl_rpcwaitq_recall;
struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
#endif /* CONFIG_NFS_V4_1 */

--
1.7.2.1


2010-11-14 12:05:17

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 08/22] SQUASHME: allow cb_sequence changes to compile without v4.1

On 2010-11-12 10:48, Fred Isaman wrote:
> Signed-off-by: Fred Isaman <[email protected]>
> ---
> fs/nfs/callback.h | 26 ++++++++++++++++++++++++++
> fs/nfs/callback_proc.c | 6 ------
> fs/nfs/callback_xdr.c | 3 +--
> fs/nfs/internal.h | 4 ++++
> 4 files changed, 31 insertions(+), 8 deletions(-)
>
> diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
> index 89fee05..0b1f3c4 100644
> --- a/fs/nfs/callback.h
> +++ b/fs/nfs/callback.h
> @@ -8,6 +8,8 @@
> #ifndef __LINUX_FS_NFS_CALLBACK_H
> #define __LINUX_FS_NFS_CALLBACK_H
>
> +#include "internal.h"
> +
> #define NFS4_CALLBACK 0x40000000
> #define NFS4_CALLBACK_XDRSIZE 2048
> #define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE)
> @@ -158,6 +160,30 @@ extern unsigned nfs4_callback_layoutrecall(
> struct cb_layoutrecallargs *args,
> void *dummy, struct cb_process_state *cps);
>
> +static inline void put_session_client(struct nfs4_session *session)
> +{
> + if (session) /* matched by cb_sequence find_client_with_session */

nit: comment out of scope. belongs to the call site, not here...

> + nfs_put_client(session->clp);
> +}
> +
> +static inline struct nfs_client *
> +find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
> +{
> + return cps->session ? cps->session->clp : nfs_find_client(addr, 4);
> +}
> +
> +#else

nit: /* CONFIG_NFS_V4_1 */ comment missing

(I'll fix both in my tree)

Benny

> +
> +static inline struct nfs_client *
> +find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
> +{
> + return nfs_find_client(addr, 4);
> +}
> +
> +static inline void put_session_client(struct nfs4_session *session)
> +{
> +}
> +
> #endif /* CONFIG_NFS_V4_1 */
>
> extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
> index 2e62155..d02997a 100644
> --- a/fs/nfs/callback_proc.c
> +++ b/fs/nfs/callback_proc.c
> @@ -21,12 +21,6 @@
> #define NFSDBG_FACILITY NFSDBG_CALLBACK
> #endif
>
> -static struct nfs_client *
> -find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
> -{
> - return cps->session ? cps->session->clp : nfs_find_client(addr, 4);
> -}
> -
> __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
> struct cb_getattrres *res,
> struct cb_process_state *cps)
> diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
> index 1650ab0..01688ce 100644
> --- a/fs/nfs/callback_xdr.c
> +++ b/fs/nfs/callback_xdr.c
> @@ -770,8 +770,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
>
> *hdr_res.status = status;
> *hdr_res.nops = htonl(nops);
> - if (cps.session) /* matched by cb_sequence find_client_with_session */
> - nfs_put_client(cps.session->clp);
> + put_session_client(cps.session);
> dprintk("%s: done, status = %u\n", __func__, ntohl(status));
> return rpc_success;
> }
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index 755e555..6f14089 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -2,6 +2,8 @@
> * NFS internal definitions
> */
>
> +#ifndef __LINUX_FS_NFS_INTERNAL_H
> +#define __LINUX_FS_NFS_INTERNAL_H
> #include "nfs4_fs.h"
> #include <linux/mount.h>
> #include <linux/security.h>
> @@ -415,3 +417,5 @@ static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client
> return rpc_restart_call_prepare(task);
> return rpc_restart_call(task);
> }
> +
> +#endif /* __LINUX_FS_NFS_INTERNAL_H */

2010-11-13 09:12:09

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

On Fri, 2010-11-12 at 03:48 -0500, Fred Isaman wrote:
> Remove NFS_LAYOUT_STATEID_SET in favor of just checking list_empty(lo->segs).
>
> LAYOUTGETs with openstateid are serialized. Waiting on the condition
> (list_empty(lo->segs) && plh_outstanding>0) both drains outstanding RPCs once
> the stateid is invalidated and allows only a single LAYOUTGET(openstateid)
> through at a time.
>
> Before sending a LAYOUTRETURN, plh_block_lgets is incremented. It is
> decremented in the rpc_release function. While set, LAYOUTGETs are
> paused in their rpc_prepare function, and any responses are
> forgotten.
>
> Callbacks are handled by blocking any matching LAYOUTGETS while processing and
> initiating drain of IO. A notification system is set up so that when
> all relevant IO is finished, the state manager thread is invoked, which
> synchronously sends the final matching LAYOUTRETURN before unblocking
> LAYOUTGETS.
>
> Signed-off-by: Fred Isaman <[email protected]>
> ---
> fs/nfs/callback.h | 7 +
> fs/nfs/callback_proc.c | 466 +++++++++++++++++++++++----------------------
> fs/nfs/client.c | 3 +
> fs/nfs/nfs4proc.c | 81 ++++++--
> fs/nfs/nfs4state.c | 4 +
> fs/nfs/nfs4xdr.c | 16 ++-
> fs/nfs/pnfs.c | 177 +++++++++++++-----
> fs/nfs/pnfs.h | 41 +++-
> include/linux/nfs_fs_sb.h | 4 +
> 9 files changed, 497 insertions(+), 302 deletions(-)
>
> diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
> index cea58cc..4a9905b 100644
> --- a/fs/nfs/callback.h
> +++ b/fs/nfs/callback.h
> @@ -163,6 +163,9 @@ struct cb_layoutrecallargs {
> extern unsigned nfs4_callback_layoutrecall(
> struct cb_layoutrecallargs *args,
> void *dummy, struct cb_process_state *cps);
> +extern bool matches_outstanding_recall(struct inode *ino,
> + struct pnfs_layout_range *range);
> +extern void nfs_client_return_layouts(struct nfs_client *clp);
>
> static inline void put_session_client(struct nfs4_session *session)
> {
> @@ -178,6 +181,10 @@ find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
>
> #else
>
> +static inline void nfs_client_return_layouts(struct nfs_client *clp)
> +{
> +}
> +
> static inline struct nfs_client *
> find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
> {
> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
> index 6e0fc40..af405cf 100644
> --- a/fs/nfs/callback_proc.c
> +++ b/fs/nfs/callback_proc.c
> @@ -124,265 +124,283 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
> #if defined(CONFIG_NFS_V4_1)
>
> static bool
> -pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo,
> - const nfs4_stateid stateid)
> +_recall_matches_lget(struct pnfs_cb_lrecall_info *cb_info,
> + struct inode *ino, struct pnfs_layout_range *range)
> {
> - bool res;
> - u32 oldseqid, newseqid;
> -
> - spin_lock(&lo->inode->i_lock);
> - {
> - oldseqid = be32_to_cpu(lo->stateid.stateid.seqid);
> - newseqid = be32_to_cpu(stateid.stateid.seqid);
> - res = !memcmp(lo->stateid.stateid.other,
> - stateid.stateid.other,
> - NFS4_STATEID_OTHER_SIZE);
> - if (res) { /* comparing layout stateids */
> - if (oldseqid == ~0)
> - res = (newseqid == 1);
> - else
> - res = (newseqid == oldseqid + 1);
> - } else { /* open stateid */
> - res = !memcmp(lo->stateid.data,
> - &zero_stateid,
> - NFS4_STATEID_SIZE);
> - if (res)
> - res = (newseqid == 1);
> - }
> - }
> - spin_unlock(&lo->inode->i_lock);
> + struct cb_layoutrecallargs *cb_args = &cb_info->pcl_args;
>
> - return res;
> + switch (cb_args->cbl_recall_type) {
> + case RETURN_ALL:
> + return true;
> + case RETURN_FSID:
> + return !memcmp(&NFS_SERVER(ino)->fsid, &cb_args->cbl_fsid,
> + sizeof(struct nfs_fsid));
> + case RETURN_FILE:
> + return (ino == cb_info->pcl_ino) &&
> + should_free_lseg(range, &cb_args->cbl_range);
> + default:
> + BUG();

Why should we BUG() just because the server is screwed up? That's not a
client bug.

> + }
> }
>
> -/*
> - * Retrieve an inode based on layout recall parameters
> - *
> - * Note: caller must iput(inode) to dereference the inode.
> - */
> -static struct inode *
> -nfs_layoutrecall_find_inode(struct nfs_client *clp,
> - const struct cb_layoutrecallargs *args)
> +bool
> +matches_outstanding_recall(struct inode *ino, struct pnfs_layout_range *range)
> {
> - struct nfs_inode *nfsi;
> - struct pnfs_layout_hdr *lo;
> - struct nfs_server *server;
> - struct inode *ino = NULL;
> -
> - dprintk("%s: Begin recall_type=%d clp %p\n",
> - __func__, args->cbl_recall_type, clp);
> -
> - spin_lock(&clp->cl_lock);
> - list_for_each_entry(lo, &clp->cl_layouts, layouts) {
> - nfsi = NFS_I(lo->inode);
> - if (!nfsi)
> - continue;
> -
> - dprintk("%s: Searching inode=%lu\n",
> - __func__, nfsi->vfs_inode.i_ino);
> -
> - if (args->cbl_recall_type == RETURN_FILE) {
> - if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh))
> - continue;
> - } else if (args->cbl_recall_type == RETURN_FSID) {
> - server = NFS_SERVER(&nfsi->vfs_inode);
> - if (server->fsid.major != args->cbl_fsid.major ||
> - server->fsid.minor != args->cbl_fsid.minor)
> - continue;
> + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
> + struct pnfs_cb_lrecall_info *cb_info;
> + bool rv = false;
> +
> + assert_spin_locked(&clp->cl_lock);

Can we please go easy on the asserts? There is way too much asserting
going on in the NFSv4.1 code. This isn't a publicly visible interface,
so just get it right in the debugging process before the merge, and then
kill these asserts...

> + list_for_each_entry(cb_info, &clp->cl_layoutrecalls, pcl_list) {
> + if (_recall_matches_lget(cb_info, ino, range)) {
> + rv = true;
> + break;
> }
> -
> - /* Make sure client didn't clean up layout without
> - * telling the server */
> - if (!has_layout(nfsi))
> - continue;
> -
> - ino = igrab(&nfsi->vfs_inode);
> - dprintk("%s: Found inode=%p\n", __func__, ino);
> - break;
> }
> - spin_unlock(&clp->cl_lock);
> - return ino;
> + return rv;
> }
>
> -struct recall_layout_threadargs {
> - struct inode *inode;
> - struct nfs_client *clp;
> - struct completion started;
> - struct cb_layoutrecallargs *rl;
> - int result;
> -};
> -
> -static int pnfs_recall_layout(void *data)
> +/* Send a synchronous LAYOUTRETURN. By the time this is called, we know
> + * all IO has been drained, any matching lsegs deleted, and that no
> + * overlapping LAYOUTGETs will be sent or processed for the duration
> + * of this call.
> + * Note that it is possible that when this is called, the stateid has
> + * been invalidated. But it will not be cleared, so we can still use it.
> + */
> +static int
> +pnfs_send_layoutreturn(struct nfs_client *clp,
> + struct pnfs_cb_lrecall_info *cb_info)
> {
> - struct inode *inode, *ino;
> - struct nfs_client *clp;
> - struct cb_layoutrecallargs rl;
> + struct cb_layoutrecallargs *args = &cb_info->pcl_args;
> struct nfs4_layoutreturn *lrp;
> - struct recall_layout_threadargs *args =
> - (struct recall_layout_threadargs *)data;
> - int status = 0;
> -
> - daemonize("nfsv4-layoutreturn");
> -
> - dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n",
> - __func__, args->rl->cbl_recall_type,
> - args->rl->cbl_fsid.major, args->rl->cbl_fsid.minor);
> -
> - clp = args->clp;
> - inode = args->inode;
> - rl = *args->rl;
> -
> - /* support whole file layouts only */
> - rl.cbl_range.offset = 0;
> - rl.cbl_range.length = NFS4_MAX_UINT64;
> -
> - if (rl.cbl_recall_type == RETURN_FILE) {
> - if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout,
> - rl.cbl_stateid))
> - status = pnfs_return_layout(inode, &rl.cbl_range,
> - &rl.cbl_stateid, RETURN_FILE,
> - false);
> - else
> - status = cpu_to_be32(NFS4ERR_DELAY);
> - if (status)
> - dprintk("%s RETURN_FILE error: %d\n", __func__, status);
> - else
> - status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
> - args->result = status;
> - complete(&args->started);
> - goto out;
> - }
> -
> - status = cpu_to_be32(NFS4_OK);
> - args->result = status;
> - complete(&args->started);
> - args = NULL;
> -
> - /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */
> - while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) {
> - /* FIXME: need to check status on pnfs_return_layout */
> - pnfs_return_layout(ino, &rl.cbl_range, NULL, RETURN_FILE, false);
> - iput(ino);
> - }
>
> lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
> - if (!lrp) {
> - dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n",
> - __func__);
> - goto out;
> - }
> -
> - /* send final layoutreturn */
> + if (!lrp)
> + return -ENOMEM;
> lrp->args.reclaim = 0;
> - lrp->args.layout_type = rl.cbl_layout_type;
> - lrp->args.return_type = rl.cbl_recall_type;
> + lrp->args.layout_type = args->cbl_layout_type;
> + lrp->args.return_type = args->cbl_recall_type;
> lrp->clp = clp;
> - lrp->args.range = rl.cbl_range;
> - lrp->args.inode = inode;
> - nfs4_proc_layoutreturn(lrp, true);
> -
> -out:
> - clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
> - nfs_put_client(clp);
> - module_put_and_exit(0);
> - dprintk("%s: exit status %d\n", __func__, 0);
> - return 0;
> + if (args->cbl_recall_type == RETURN_FILE) {
> + lrp->args.range = args->cbl_range;
> + lrp->args.inode = cb_info->pcl_ino;
> + } else {
> + lrp->args.range.iomode = IOMODE_ANY;
> + lrp->args.inode = NULL;
> + }
> + return nfs4_proc_layoutreturn(lrp, true);
> }
>
> -/*
> - * Asynchronous layout recall!
> +/* Called by state manager to finish CB_LAYOUTRECALLS initiated by
> + * nfs4_callback_layoutrecall().
> */
> -static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
> - struct cb_layoutrecallargs *rl)
> +void nfs_client_return_layouts(struct nfs_client *clp)
> {
> - struct recall_layout_threadargs data = {
> - .clp = clp,
> - .inode = inode,
> - .rl = rl,
> - };
> - struct task_struct *t;
> - int status = -EAGAIN;
> + struct pnfs_cb_lrecall_info *cb_info;
>
> - dprintk("%s: -->\n", __func__);
> + spin_lock(&clp->cl_lock);
> + while (true) {
> + if (list_empty(&clp->cl_layoutrecalls)) {
> + spin_unlock(&clp->cl_lock);
> + break;
> + }
> + cb_info = list_first_entry(&clp->cl_layoutrecalls,
> + struct pnfs_cb_lrecall_info,
> + pcl_list);
> + spin_unlock(&clp->cl_lock);
> + if (atomic_read(&cb_info->pcl_count) != 0)
> + break;
> + /* What to do on error return? These layoutreturns are
> + * required by the protocol. So if we do not get a
> + * successful reply, we probably have to do something
> + * more drastic.
> + */
> + pnfs_send_layoutreturn(clp, cb_info);
> + spin_lock(&clp->cl_lock);
> + /* Removing from the list unblocks LAYOUTGETs */
> + list_del(&cb_info->pcl_list);
> + clp->cl_cb_lrecall_count--;
> + rpc_wake_up(&clp->cl_rpcwaitq_recall);
> + kfree(cb_info);
> + }
> +}
>
> - /* FIXME: do not allow two concurrent layout recalls */
> - if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state))
> - return status;
> -
> - init_completion(&data.started);
> - __module_get(THIS_MODULE);
> - atomic_inc(&clp->cl_count);
> -
> - t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout");
> - if (IS_ERR(t)) {
> - printk(KERN_INFO "NFS: Layout recall callback thread failed "
> - "for client (clientid %08x/%08x)\n",
> - (unsigned)(clp->cl_clientid >> 32),
> - (unsigned)(clp->cl_clientid));
> - status = PTR_ERR(t);
> - goto out_module_put;
> +void notify_drained(struct pnfs_cb_lrecall_info *d)
> +{
> + if (d && atomic_dec_and_test(&d->pcl_count)) {
> + set_bit(NFS4CLNT_LAYOUT_RECALL, &d->pcl_clp->cl_state);
> + nfs4_schedule_state_manager(d->pcl_clp);
> }
> - wait_for_completion(&data.started);
> - return data.result;
> -out_module_put:
> - nfs_put_client(clp);
> - clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
> - module_put(THIS_MODULE);
> - return status;
> }
>
> -static int pnfs_recall_all_layouts(struct nfs_client *clp)
> +static int initiate_layout_draining(struct pnfs_cb_lrecall_info *cb_info)
> {
> - struct cb_layoutrecallargs rl;
> - struct inode *inode;
> - int status = 0;
> -
> - rl.cbl_recall_type = RETURN_ALL;
> - rl.cbl_range.iomode = IOMODE_ANY;
> - rl.cbl_range.offset = 0;
> - rl.cbl_range.length = NFS4_MAX_UINT64;
> -
> - /* we need the inode to get the nfs_server struct */
> - inode = nfs_layoutrecall_find_inode(clp, &rl);
> - if (!inode)
> - return status;
> - status = pnfs_async_return_layout(clp, inode, &rl);
> - iput(inode);
> + struct nfs_client *clp = cb_info->pcl_clp;
> + struct pnfs_layout_hdr *lo;
> + int rv = NFS4ERR_NOMATCHING_LAYOUT;
> + struct cb_layoutrecallargs *args = &cb_info->pcl_args;
> +
> + if (args->cbl_recall_type == RETURN_FILE) {
> + LIST_HEAD(free_me_list);
> +
> + spin_lock(&clp->cl_lock);
> + list_for_each_entry(lo, &clp->cl_layouts, layouts) {
> + if (nfs_compare_fh(&args->cbl_fh,
> + &NFS_I(lo->inode)->fh))
> + continue;
> + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
> + rv = NFS4ERR_DELAY;
> + else {
> + /* FIXME I need to better understand igrab and
> + * does having a layout ref keep ino around?
> + * It should.
> + */
> + /* We need to hold the reference until any
> + * potential LAYOUTRETURN is finished.
> + */
> + get_layout_hdr(lo);
> + cb_info->pcl_ino = lo->inode;
> + rv = NFS4_OK;
> + }
> + break;
> + }
> + spin_unlock(&clp->cl_lock);
> +
> + spin_lock(&lo->inode->i_lock);
> + if (rv == NFS4_OK) {
> + lo->plh_block_lgets++;
> + nfs4_asynch_forget_layouts(lo, &args->cbl_range,
> + cb_info, &free_me_list);
> + }
> + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
> + spin_unlock(&lo->inode->i_lock);
> + pnfs_free_lseg_list(&free_me_list);
> + } else {
> + struct pnfs_layout_hdr *tmp;
> + LIST_HEAD(recall_list);
> + LIST_HEAD(free_me_list);
> + struct pnfs_layout_range range = {
> + .iomode = IOMODE_ANY,
> + .offset = 0,
> + .length = NFS4_MAX_UINT64,
> + };
> +
> + spin_lock(&clp->cl_lock);
> + /* Per RFC 5661, 12.5.5.2.1.5, bulk recall must be serialized */
> + if (!list_is_singular(&clp->cl_layoutrecalls)) {
> + spin_unlock(&clp->cl_lock);
> + return NFS4ERR_DELAY;
> + }
> + list_for_each_entry(lo, &clp->cl_layouts, layouts) {
> + if ((args->cbl_recall_type == RETURN_FSID) &&
> + memcmp(&NFS_SERVER(lo->inode)->fsid,
> + &args->cbl_fsid, sizeof(struct nfs_fsid)))
> + continue;
> + get_layout_hdr(lo);
> + /* We could list_del(&lo->layouts) here */
> + BUG_ON(!list_empty(&lo->plh_bulk_recall));
> + list_add(&lo->plh_bulk_recall, &recall_list);
> + }
> + spin_unlock(&clp->cl_lock);
> + list_for_each_entry_safe(lo, tmp,
> + &recall_list, plh_bulk_recall) {
> + spin_lock(&lo->inode->i_lock);
> + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
> + nfs4_asynch_forget_layouts(lo, &range, cb_info,
> + &free_me_list);
> + list_del_init(&lo->plh_bulk_recall);
> + spin_unlock(&lo->inode->i_lock);
> + put_layout_hdr(lo->inode);
> + rv = NFS4_OK;
> + }
> + pnfs_free_lseg_list(&free_me_list);
> + }
> + return rv;
> +}
> +
> +static u32 do_callback_layoutrecall(struct nfs_client *clp,
> + struct cb_layoutrecallargs *args)
> +{
> + struct pnfs_cb_lrecall_info *new;
> + u32 res;
> +
> + dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
> + new = kmalloc(sizeof(*new), GFP_KERNEL);
> + if (!new) {
> + res = NFS4ERR_RESOURCE;
> + goto out;
> + }
> + memcpy(&new->pcl_args, args, sizeof(*args));
> + atomic_set(&new->pcl_count, 1);
> + new->pcl_clp = clp;
> + new->pcl_ino = NULL;
> + spin_lock(&clp->cl_lock);
> + if (clp->cl_cb_lrecall_count >= PNFS_MAX_CB_LRECALLS) {
> + kfree(new);
> + res = NFS4ERR_DELAY;
> + spin_unlock(&clp->cl_lock);
> + goto out;
> + }
> + clp->cl_cb_lrecall_count++;
> + /* Adding to the list will block conflicting LGET activity */
> + list_add_tail(&new->pcl_list, &clp->cl_layoutrecalls);
> + spin_unlock(&clp->cl_lock);
> + res = initiate_layout_draining(new);
> + if (res || atomic_dec_and_test(&new->pcl_count)) {
> + spin_lock(&clp->cl_lock);
> + list_del(&new->pcl_list);
> + clp->cl_cb_lrecall_count--;
> + rpc_wake_up(&clp->cl_rpcwaitq_recall);
> + spin_unlock(&clp->cl_lock);
> + if (res == NFS4_OK) {
> + if (args->cbl_recall_type == RETURN_FILE) {
> + struct pnfs_layout_hdr *lo;
> +
> + lo = NFS_I(new->pcl_ino)->layout;
> + spin_lock(&lo->inode->i_lock);
> + lo->plh_block_lgets--;
> + if (!pnfs_layoutgets_blocked(lo, NULL))
> + rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
> + spin_unlock(&lo->inode->i_lock);
> + put_layout_hdr(new->pcl_ino);
> + }
> + res = NFS4ERR_NOMATCHING_LAYOUT;
> + }
> + kfree(new);
> + }
> +out:
> + dprintk("%s returning %i\n", __func__, res);
> + return res;
>
> - return status;
> }
>
> __be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
> void *dummy, struct cb_process_state *cps)
> {
> struct nfs_client *clp;
> - struct inode *inode = NULL;
> - __be32 res;
> - int status;
> + u32 res;
>
> dprintk("%s: -->\n", __func__);
>
> - res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
> - if (cps->session) /* set in cb_sequence */
> + if (cps->session) { /* set in cb_sequence */
> clp = cps->session->clp;
> - else
> - goto out;
> + res = do_callback_layoutrecall(clp, args);
> + } else
> + res = NFS4ERR_OP_NOT_IN_SESSION;
>
> - res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
> - /*
> - * In the _ALL or _FSID case, we need the inode to get
> - * the nfs_server struct.
> - */
> - inode = nfs_layoutrecall_find_inode(clp, args);
> - if (!inode)
> - goto out;
> - status = pnfs_async_return_layout(clp, inode, args);
> - if (status)
> - res = cpu_to_be32(NFS4ERR_DELAY);
> - iput(inode);
> -out:
> - dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
> - return res;
> + dprintk("%s: exit with status = %d\n", __func__, res);
> + return cpu_to_be32(res);
> +}
> +
> +static void pnfs_recall_all_layouts(struct nfs_client *clp)
> +{
> + struct cb_layoutrecallargs args;
> +
> + /* Pretend we got a CB_LAYOUTRECALL(ALL) */
> + memset(&args, 0, sizeof(args));
> + args.cbl_recall_type = RETURN_ALL;
> + /* FIXME we ignore errors, what should we do? */

We're a forgetful client: we don't care...

> + do_callback_layoutrecall(clp, &args);
> }



>
> int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
> @@ -665,9 +683,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
> flags |= FMODE_WRITE;
> if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
> &args->craa_type_mask))
> - if (pnfs_recall_all_layouts(clp) == -EAGAIN)
> - status = cpu_to_be32(NFS4ERR_DELAY);
> -
> + pnfs_recall_all_layouts(clp);
> if (flags)
> nfs_expire_all_delegation_types(clp, flags);
> out:
> diff --git a/fs/nfs/client.c b/fs/nfs/client.c
> index 3c8c841..dbf43e7 100644
> --- a/fs/nfs/client.c
> +++ b/fs/nfs/client.c
> @@ -158,6 +158,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
> clp->cl_machine_cred = cred;
> #if defined(CONFIG_NFS_V4_1)
> INIT_LIST_HEAD(&clp->cl_layouts);
> + INIT_LIST_HEAD(&clp->cl_layoutrecalls);
> + rpc_init_wait_queue(&clp->cl_rpcwaitq_recall,
> + "NFS client CB_LAYOUTRECALLS");
> #endif
> nfs_fscache_get_client_cookie(clp);
>
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index fe79872..6223c6a 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -5346,31 +5346,58 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
> struct inode *ino = lgp->args.inode;
> struct nfs_inode *nfsi = NFS_I(ino);
> struct nfs_server *server = NFS_SERVER(ino);
> + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
>
> dprintk("--> %s\n", __func__);
> + spin_lock(&clp->cl_lock);
> + if (matches_outstanding_recall(ino, &lgp->args.range)) {
> + rpc_sleep_on(&clp->cl_rpcwaitq_recall, task, NULL);
> + spin_unlock(&clp->cl_lock);
> + return;
> + }
> + spin_unlock(&clp->cl_lock);
> + /* Note there is a race here, where a CB_LAYOUTRECALL can come in
> + * right now covering the LAYOUTGET we are about to send.
> + * However, that is not so catastrophic, and there seems
> + * to be no way to prevent it completely.
> + */
> spin_lock(&ino->i_lock);
> - if (pnfs_layoutgets_blocked(nfsi->layout)) {
> + if (pnfs_layoutgets_blocked(nfsi->layout, NULL)) {
> rpc_sleep_on(&nfsi->lo_rpcwaitq_stateid, task, NULL);
> spin_unlock(&ino->i_lock);
> return;
> }
> + /* This must happen after the above check, but atomically with it, to properly
> + * serialize openstateid LAYOUTGETs.
> + */
> + nfsi->layout->plh_outstanding++;
> spin_unlock(&ino->i_lock);
> +
> if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
> - &lgp->res.seq_res, 0, task))
> + &lgp->res.seq_res, 0, task)) {
> + spin_lock(&ino->i_lock);
> + nfsi->layout->plh_outstanding--;
> + spin_unlock(&ino->i_lock);
> return;
> + }
> rpc_call_start(task);
> }
>
> static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
> {
> struct nfs4_layoutget *lgp = calldata;
> - struct nfs_server *server = NFS_SERVER(lgp->args.inode);
> + struct inode *ino = lgp->args.inode;
>
> dprintk("--> %s\n", __func__);
>
> - if (!nfs4_sequence_done(task, &lgp->res.seq_res))
> + if (!nfs4_sequence_done(task, &lgp->res.seq_res)) {
> + /* layout code relies on fact that in this case
> + * code falls back to tk_action=call_start, but not
> + * back to rpc_prepare_task, to keep plh_outstanding
> + * correct.
> + */
> return;
> -
> + }
> switch (task->tk_status) {
> case 0:
> break;
> @@ -5379,7 +5406,11 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
> task->tk_status = -NFS4ERR_DELAY;
> /* Fall through */
> default:
> - if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
> + if (nfs4_async_handle_error(task, NFS_SERVER(ino),
> + NULL, NULL) == -EAGAIN) {
> + spin_lock(&ino->i_lock);
> + NFS_I(ino)->layout->plh_outstanding--;
> + spin_unlock(&ino->i_lock);
> rpc_restart_call_prepare(task);
> return;
> }
> @@ -5437,13 +5468,20 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
> if (IS_ERR(task))
> return PTR_ERR(task);
> status = nfs4_wait_for_completion_rpc_task(task);
> - if (status != 0)
> - goto out;
> - status = task->tk_status;
> - if (status != 0)
> - goto out;
> - status = pnfs_layout_process(lgp);
> -out:
> + if (status == 0)
> + status = task->tk_status;
> + if (status == 0)
> + status = pnfs_layout_process(lgp);
> + else {
> + struct inode *ino = lgp->args.inode;
> + struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
> +
> + spin_lock(&ino->i_lock);
> + lo->plh_outstanding--;
> + if (!pnfs_layoutgets_blocked(lo, NULL))
> + rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
> + spin_unlock(&ino->i_lock);
> + }
> rpc_put_task(task);
> dprintk("<-- %s status=%d\n", __func__, status);
> return status;
> @@ -5587,9 +5625,9 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
>
> spin_lock(&lo->inode->i_lock);
> if (lrp->res.lrs_present)
> - pnfs_set_layout_stateid(lo, &lrp->res.stateid);
> + pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
> else
> - pnfs_invalidate_layout_stateid(lo);
> + BUG_ON(!list_empty(&lo->segs));
> spin_unlock(&lo->inode->i_lock);
> }
> dprintk("<-- %s\n", __func__);
> @@ -5606,10 +5644,11 @@ static void nfs4_layoutreturn_release(void *calldata)
>
> spin_lock(&ino->i_lock);
> lo->plh_block_lgets--;
> - if (!pnfs_layoutgets_blocked(lo))
> + lo->plh_outstanding--;
> + if (!pnfs_layoutgets_blocked(lo, NULL))
> rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
> spin_unlock(&ino->i_lock);
> - put_layout_hdr(lrp->args.inode);
> + put_layout_hdr(ino);
> }
> kfree(calldata);
> dprintk("<-- %s\n", __func__);
> @@ -5639,6 +5678,14 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
> int status = 0;
>
> dprintk("--> %s\n", __func__);
> + if (lrp->args.return_type == RETURN_FILE) {
> + struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
> + /* FIXME we should test for BULK here */
> + spin_lock(&lo->inode->i_lock);
> + BUG_ON(lo->plh_block_lgets == 0);
> + lo->plh_outstanding++;
> + spin_unlock(&lo->inode->i_lock);
> + }
> task = rpc_run_task(&task_setup_data);
> if (IS_ERR(task))
> return PTR_ERR(task);
> diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> index 00632f6..ceb0d66 100644
> --- a/fs/nfs/nfs4state.c
> +++ b/fs/nfs/nfs4state.c
> @@ -1560,6 +1560,10 @@ static void nfs4_state_manager(struct nfs_client *clp)
> nfs_client_return_marked_delegations(clp);
> continue;
> }
> + if (test_and_clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) {
> + nfs_client_return_layouts(clp);
> + continue;
> + }
> /* Recall session slots */
> if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)
> && nfs4_has_session(clp)) {
> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> index 328cca5..f530c7e 100644
> --- a/fs/nfs/nfs4xdr.c
> +++ b/fs/nfs/nfs4xdr.c
> @@ -1827,13 +1827,14 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
> hdr->replen += decode_getdeviceinfo_maxsz;
> }
>
> -static void
> +static int
> encode_layoutget(struct xdr_stream *xdr,
> const struct nfs4_layoutget_args *args,
> struct compound_hdr *hdr)
> {
> nfs4_stateid stateid;
> __be32 *p;
> + int status;
>
> p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
> *p++ = cpu_to_be32(OP_LAYOUTGET);
> @@ -1843,8 +1844,11 @@ encode_layoutget(struct xdr_stream *xdr,
> p = xdr_encode_hyper(p, args->range.offset);
> p = xdr_encode_hyper(p, args->range.length);
> p = xdr_encode_hyper(p, args->minlength);
> - pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
> - args->ctx->state);
> + status = pnfs_choose_layoutget_stateid(&stateid,
> + NFS_I(args->inode)->layout,
> + args->ctx->state);
> + if (status)
> + return status;
> p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
> *p = cpu_to_be32(args->maxcount);
>
> @@ -1857,6 +1861,7 @@ encode_layoutget(struct xdr_stream *xdr,
> args->maxcount);
> hdr->nops++;
> hdr->replen += decode_layoutget_maxsz;
> + return 0;
> }
>
> static int
> @@ -2782,12 +2787,15 @@ static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
> struct compound_hdr hdr = {
> .minorversion = nfs4_xdr_minorversion(&args->seq_args),
> };
> + int status;
>
> xdr_init_encode(&xdr, &req->rq_snd_buf, p);
> encode_compound_hdr(&xdr, req, &hdr);
> encode_sequence(&xdr, &args->seq_args, &hdr);
> encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
> - encode_layoutget(&xdr, args, &hdr);
> + status = encode_layoutget(&xdr, args, &hdr);
> + if (status)
> + return status;
> encode_nops(&hdr);
> return 0;
> }
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 07b04e8..2d817be 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
> */
>
> /* Need to hold i_lock if caller does not already hold reference */
> -static void
> +void
> get_layout_hdr(struct pnfs_layout_hdr *lo)
> {
> atomic_inc(&lo->plh_refcount);
> @@ -278,24 +278,29 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
> smp_mb();
> lseg->valid = true;
> lseg->layout = lo;
> + lseg->drain_notification = NULL;
> }
>
> static void
> _put_lseg_common(struct pnfs_layout_segment *lseg)
> {
> + struct inode *ino = lseg->layout->inode;
> +
> BUG_ON(lseg->valid == true);
> list_del(&lseg->fi_list);
> if (list_empty(&lseg->layout->segs)) {
> struct nfs_client *clp;
>
> - clp = NFS_SERVER(lseg->layout->inode)->nfs_client;
> + clp = NFS_SERVER(ino)->nfs_client;
> spin_lock(&clp->cl_lock);
> /* List does not take a reference, so no need for put here */
> list_del_init(&lseg->layout->layouts);
> spin_unlock(&clp->cl_lock);
> - pnfs_invalidate_layout_stateid(lseg->layout);
> + clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->layout->plh_flags);
> + if (!pnfs_layoutgets_blocked(lseg->layout, NULL))
> + rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
> }
> - rpc_wake_up(&NFS_I(lseg->layout->inode)->lo_rpcwaitq);
> + rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq);
> }
>
> /* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
> @@ -325,9 +330,12 @@ put_lseg(struct pnfs_layout_segment *lseg)
> atomic_read(&lseg->pls_refcount), lseg->valid);
> ino = lseg->layout->inode;
> if (atomic_dec_and_lock(&lseg->pls_refcount, &ino->i_lock)) {
> + struct pnfs_cb_lrecall_info *drain_info = lseg->drain_notification;
> +
> _put_lseg_common(lseg);
> spin_unlock(&ino->i_lock);
> NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
> + notify_drained(drain_info);
> /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
> put_layout_hdr(ino);
> }
> @@ -345,7 +353,7 @@ EXPORT_SYMBOL_GPL(put_lseg);
> * READ READ true
> * READ RW false
> */
> -static int
> +bool
> should_free_lseg(struct pnfs_layout_range *lseg_range,
> struct pnfs_layout_range *recall_range)
> {
> @@ -388,16 +396,19 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
> dprintk("%s:Return\n", __func__);
> }
>
> -static void
> +void
> pnfs_free_lseg_list(struct list_head *free_me)
> {
> struct pnfs_layout_segment *lseg, *tmp;
> struct inode *ino;
> + struct pnfs_cb_lrecall_info *drain_info;
>
> list_for_each_entry_safe(lseg, tmp, free_me, fi_list) {
> BUG_ON(atomic_read(&lseg->pls_refcount) != 0);
> ino = lseg->layout->inode;
> + drain_info = lseg->drain_notification;
> NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
> + notify_drained(drain_info);
> /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
> put_layout_hdr(ino);
> }
> @@ -453,40 +464,49 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
> }
> }
>
> -/* update lo->stateid with new if is more recent
> - *
> - * lo->stateid could be the open stateid, in which case we just use what given.
> - */
> +/* update lo->stateid with new if is more recent */
> void
> -pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
> - const nfs4_stateid *new)
> +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
> + bool update_barrier)
> {
> - nfs4_stateid *old = &lo->stateid;
> - bool overwrite = false;
> + u32 oldseq, newseq;
>
> assert_spin_locked(&lo->inode->i_lock);
> - if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags) ||
> - memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
> - overwrite = true;
> - else {
> - u32 oldseq, newseq;
> -
> - oldseq = be32_to_cpu(old->stateid.seqid);
> - newseq = be32_to_cpu(new->stateid.seqid);
> - if ((int)(newseq - oldseq) > 0)
> - overwrite = true;
> + oldseq = be32_to_cpu(lo->stateid.stateid.seqid);
> + newseq = be32_to_cpu(new->stateid.seqid);
> + if ((int)(newseq - oldseq) > 0) {
> + memcpy(&lo->stateid, &new->stateid, sizeof(new->stateid));
> + if (update_barrier)
> + lo->plh_barrier = be32_to_cpu(new->stateid.seqid);
> + else {
> + /* Because of wraparound, we want to keep the barrier
> + * "close" to the current seqids. It needs to be
> + * within 2**31 to count as "behind", so if it
> + * gets too near that limit, give us a little leeway
> + * and bring it to within 2**30.
> + * NOTE - and yes, this is all unsigned arithmetic.
> + */
> + if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
> + lo->plh_barrier = newseq - (1 << 30);
> + }
> }
> - if (overwrite)
> - memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
> }
>
> -void
> -pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
> - struct nfs4_state *open_state)
> +int
> +pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
> + struct nfs4_state *open_state)
> {
> + int status = 0;
> +
> dprintk("--> %s\n", __func__);
> spin_lock(&lo->inode->i_lock);
> - if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags)) {
> + if (lo->plh_block_lgets ||
> + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
> + /* We avoid -EAGAIN, as that has special meaning to
> + * some callers.
> + */
> + status = -NFS4ERR_LAYOUTTRYLATER;
> + } else if (list_empty(&lo->segs)) {
> int seq;
>
> do {
> @@ -494,12 +514,11 @@ pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
> memcpy(dst->data, open_state->stateid.data,
> sizeof(open_state->stateid.data));
> } while (read_seqretry(&open_state->seqlock, seq));
> - set_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
> } else
> - memcpy(dst->data, lo->stateid.data,
> - sizeof(lo->stateid.data));
> + memcpy(dst->data, lo->stateid.data, sizeof(lo->stateid.data));
> spin_unlock(&lo->inode->i_lock);
> dprintk("<-- %s\n", __func__);
> + return status;
> }
>
> /*
> @@ -566,6 +585,28 @@ has_layout_to_return(struct pnfs_layout_hdr *lo,
> return out;
> }
>
> +void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
> + struct pnfs_layout_range *range,
> + struct pnfs_cb_lrecall_info *drain_info,
> + struct list_head *tmp_list)
> +{
> + struct pnfs_layout_segment *lseg, *tmp;
> +
> + assert_spin_locked(&lo->inode->i_lock);

Poor practice. If you want to ensure the caller holds the inode->i_lock,
then just call the function '*_locked'. That is a lot more helpful than
these damned asserts.

> + list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
> + if (should_free_lseg(&lseg->range, range)) {
> + /* FIXME - need to change to something like a
> + * notification bitmap to remove the restriction
> + * of only being able to process a single
> + * CB_LAYOUTRECALL at a time.
> + */
> + BUG_ON(lseg->drain_notification);
> + lseg->drain_notification = drain_info;
> + atomic_inc(&drain_info->pcl_count);
> + mark_lseg_invalid(lseg, tmp_list);
> + }
> +}
> +
> /* Return true if there is layout based io in progress in the given range.
> * Assumes range has already been marked invalid, and layout marked to
> * prevent any new lseg from being inserted.
> @@ -711,14 +752,6 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
> dprintk("%s:Begin\n", __func__);
>
> assert_spin_locked(&lo->inode->i_lock);
> - if (list_empty(&lo->segs)) {
> - struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
> -
> - spin_lock(&clp->cl_lock);
> - BUG_ON(!list_empty(&lo->layouts));
> - list_add_tail(&lo->layouts, &clp->cl_layouts);
> - spin_unlock(&clp->cl_lock);
> - }
> list_for_each_entry(lp, &lo->segs, fi_list) {
> if (cmp_layout(&lp->range, &lseg->range) > 0)
> continue;
> @@ -735,6 +768,9 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
> }
> if (!found) {
> list_add_tail(&lseg->fi_list, &lo->segs);
> + if (list_is_singular(&lo->segs) &&
> + !pnfs_layoutgets_blocked(lo, NULL))
> + rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
> dprintk("%s: inserted lseg %p "
> "iomode %d offset %llu length %llu at tail\n",
> __func__, lseg, lseg->range.iomode,
> @@ -756,6 +792,7 @@ alloc_init_layout_hdr(struct inode *ino)
> atomic_set(&lo->plh_refcount, 1);
> INIT_LIST_HEAD(&lo->layouts);
> INIT_LIST_HEAD(&lo->segs);
> + INIT_LIST_HEAD(&lo->plh_bulk_recall);
> lo->inode = ino;
> return lo;
> }
> @@ -843,6 +880,7 @@ pnfs_update_layout(struct inode *ino,
> .length = NFS4_MAX_UINT64,
> };
> struct nfs_inode *nfsi = NFS_I(ino);
> + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
> struct pnfs_layout_hdr *lo;
> struct pnfs_layout_segment *lseg = NULL;
>
> @@ -878,9 +916,28 @@ pnfs_update_layout(struct inode *ino,
> goto out_unlock;
>
> get_layout_hdr(lo); /* Matched in pnfs_layoutget_release */
> + if (list_empty(&lo->segs)) {
> + /* The lo must be on the clp list if there is any
> + * chance of a CB_LAYOUTRECALL(FILE) coming in.
> + */
> + spin_lock(&clp->cl_lock);
> + BUG_ON(!list_empty(&lo->layouts));
> + list_add_tail(&lo->layouts, &clp->cl_layouts);
> + spin_unlock(&clp->cl_lock);
> + }
> spin_unlock(&ino->i_lock);
>
> lseg = send_layoutget(lo, ctx, &arg);
> + if (!lseg) {
> + spin_lock(&ino->i_lock);
> + if (list_empty(&lo->segs)) {
> + spin_lock(&clp->cl_lock);
> + list_del_init(&lo->layouts);
> + spin_unlock(&clp->cl_lock);
> + clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
> + }
> + spin_unlock(&ino->i_lock);
> + }
> out:
> dprintk("%s end, state 0x%lx lseg %p\n", __func__,
> nfsi->layout->plh_flags, lseg);
> @@ -891,10 +948,15 @@ out_unlock:
> }
>
> bool
> -pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo)
> +pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid)
> {
> assert_spin_locked(&lo->inode->i_lock);
> - return lo->plh_block_lgets;
> + if ((stateid) &&
> + (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
> + return true;
> + return lo->plh_block_lgets ||
> + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
> + (list_empty(&lo->segs) && lo->plh_outstanding);
> }
>
> int
> @@ -904,6 +966,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
> struct nfs4_layoutget_res *res = &lgp->res;
> struct pnfs_layout_segment *lseg;
> struct inode *ino = lo->inode;
> + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
> int status = 0;
>
> /* Inject layout blob into I/O device driver */
> @@ -915,10 +978,25 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
> status = PTR_ERR(lseg);
> dprintk("%s: Could not allocate layout: error %d\n",
> __func__, status);
> + spin_lock(&ino->i_lock);
> goto out;
> }
>
> spin_lock(&ino->i_lock);
> + /* decrement needs to be done before call to pnfs_layoutget_blocked */
> + lo->plh_outstanding--;
> + spin_lock(&clp->cl_lock);
> + if (matches_outstanding_recall(ino, &res->range)) {
> + spin_unlock(&clp->cl_lock);
> + dprintk("%s forget reply due to recall\n", __func__);
> + goto out_forget_reply;
> + }
> + spin_unlock(&clp->cl_lock);
> +
> + if (pnfs_layoutgets_blocked(lo, &res->stateid)) {
> + dprintk("%s forget reply due to state\n", __func__);
> + goto out_forget_reply;
> + }
> init_lseg(lo, lseg);
> lseg->range = res->range;
> get_lseg(lseg);
> @@ -934,10 +1012,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
> }
>
> /* Done processing layoutget. Set the layout stateid */
> - pnfs_set_layout_stateid(lo, &res->stateid);
> - spin_unlock(&ino->i_lock);
> + pnfs_set_layout_stateid(lo, &res->stateid, false);
> out:
> + if (!pnfs_layoutgets_blocked(lo, NULL))
> + rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
> + spin_unlock(&ino->i_lock);
> return status;
> +
> +out_forget_reply:
> + spin_unlock(&ino->i_lock);
> + lseg->layout = lo;
> + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
> + spin_lock(&ino->i_lock);
> + goto out;
> }
>
> void
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 891aeab..7ea121f 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -31,6 +31,7 @@
> #define FS_NFS_PNFS_H
>
> #include <linux/nfs_page.h>
> +#include "callback.h" /* for cb_layoutrecallargs */
>
> struct pnfs_layout_segment {
> struct list_head fi_list;
> @@ -38,6 +39,7 @@ struct pnfs_layout_segment {
> atomic_t pls_refcount;
> bool valid;
> struct pnfs_layout_hdr *layout;
> + struct pnfs_cb_lrecall_info *drain_notification;
> };
>
> enum pnfs_try_status {
> @@ -52,7 +54,7 @@ enum pnfs_try_status {
> enum {
> NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
> NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
> - NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */
> + NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
> NFS_LAYOUT_NEED_LCOMMIT, /* LAYOUTCOMMIT needed */
> };
>
> @@ -94,10 +96,13 @@ struct pnfs_layoutdriver_type {
> struct pnfs_layout_hdr {
> atomic_t plh_refcount;
> struct list_head layouts; /* other client layouts */
> + struct list_head plh_bulk_recall; /* clnt list of bulk recalls */
> struct list_head segs; /* layout segments list */
> int roc_iomode;/* return on close iomode, 0=none */
> nfs4_stateid stateid;
> + unsigned long plh_outstanding; /* number of RPCs out */
> unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
> + u32 plh_barrier; /* ignore lower seqids */
> unsigned long plh_flags;
> struct rpc_cred *cred; /* layoutcommit credential */
> /* DH: These vars keep track of the maximum write range
> @@ -118,6 +123,14 @@ struct pnfs_device {
> unsigned int pglen;
> };
>
> +struct pnfs_cb_lrecall_info {
> + struct list_head pcl_list; /* hook into cl_layoutrecalls list */
> + atomic_t pcl_count;
> + struct nfs_client *pcl_clp;
> + struct inode *pcl_ino;
> + struct cb_layoutrecallargs pcl_args;
> +};
> +
> /*
> * Device ID RCU cache. A device ID is unique per client ID and layout type.
> */
> @@ -176,7 +189,10 @@ extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
> extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
>
> /* pnfs.c */
> +void get_layout_hdr(struct pnfs_layout_hdr *lo);
> void put_lseg(struct pnfs_layout_segment *lseg);
> +bool should_free_lseg(struct pnfs_layout_range *lseg_range,
> + struct pnfs_layout_range *recall_range);
> struct pnfs_layout_segment *
> pnfs_has_layout(struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range);
> struct pnfs_layout_segment *
> @@ -201,15 +217,24 @@ enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *,
> void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
> struct nfs_open_context *, struct list_head *);
> void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
> -bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo);
> +bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid);
> int pnfs_layout_process(struct nfs4_layoutget *lgp);
> +void pnfs_free_lseg_list(struct list_head *tmp_list);
> void pnfs_destroy_layout(struct nfs_inode *);
> void pnfs_destroy_all_layouts(struct nfs_client *);
> void put_layout_hdr(struct inode *inode);
> void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
> - const nfs4_stateid *new);
> -void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
> - struct nfs4_state *open_state);
> + const nfs4_stateid *new,
> + bool update_barrier);
> +int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
> + struct pnfs_layout_hdr *lo,
> + struct nfs4_state *open_state);
> +void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
> + struct pnfs_layout_range *range,
> + struct pnfs_cb_lrecall_info *drain_info,
> + struct list_head *tmp_list);
> +/* FIXME - this should be in callback.h, but pnfs_cb_lrecall_info needs to be there too */
> +extern void notify_drained(struct pnfs_cb_lrecall_info *d);
>
> static inline bool
> has_layout(struct nfs_inode *nfsi)
> @@ -223,12 +248,6 @@ static inline int lo_fail_bit(u32 iomode)
> NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
> }
>
> -static inline void pnfs_invalidate_layout_stateid(struct pnfs_layout_hdr *lo)
> -{
> - assert_spin_locked(&lo->inode->i_lock);
> - clear_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
> -}
> -
> static inline void get_lseg(struct pnfs_layout_segment *lseg)
> {
> atomic_inc(&lseg->pls_refcount);
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 3cae408..80dcc00 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -83,6 +83,10 @@ struct nfs_client {
> u32 cl_exchange_flags;
> struct nfs4_session *cl_session; /* sharred session */
> struct list_head cl_layouts;
> + struct list_head cl_layoutrecalls;
> + unsigned long cl_cb_lrecall_count;
> +#define PNFS_MAX_CB_LRECALLS (1)
> + struct rpc_wait_queue cl_rpcwaitq_recall;
> struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
> #endif /* CONFIG_NFS_V4_1 */
>




2010-11-15 14:51:40

by Fred Isaman

[permalink] [raw]
Subject: Re: [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

On Sun, Nov 14, 2010 at 10:43 AM, Benny Halevy <[email protected]> wrote:
>
> Using the open stateid after forgetting the layout could be a protocol bug,
> or at least it falls into undefined territories.
>
> The RFC says:
>
>   The loga_stateid field specifies a valid stateid.  If a layout is not
>   currently held by the client, the loga_stateid field represents a
>   stateid reflecting the correspondingly valid open, byte-range lock,
>   or delegation stateid.  Once a layout is held on the file by the
>   client, the loga_stateid field MUST be a stateid as returned from a
>   previous LAYOUTGET or LAYOUTRETURN operation or provided by a
>   CB_LAYOUTRECALL operation (see Section 12.5.3).
>
> So the question is does the text above refer to the client view of the state or to
> the server's view.
> In other words, with the forgetful client model, when the client unilaterally forgets
> the layout without letting the server know about it (no LAYOUTRETURN was sent),
> does it mean "a layout is not currently held by the client"?
>

I would argue that yes, this is in fact what it means.

It seems the server has two options when confronted with an
openstateid. It can either interpret this as a declaration by the client
that it has forgotten all previous layouts and behave appropriately
(wipe any layout state assigned to the file and create a new
layoutstateid), or assume this is part of a parallel spew of
LAYOUTGET(openstateid) calls and try to use an existing layout state with
the appropriate (possibly not one) seqid. I argue that, as the spec
stands, the second option is not really a choice, because the first
option exists. If a client using the second option encounters a
server using the first, bad things happen. The client will issue
multiple LAYOUTGET(openstateid) calls; the server will, upon seeing each,
discard any previous state and return a new state with seqid=1, with
the final valid state being that of whichever one was processed last.
The client will see all the OK returns, and not have any easy method
of determining which is the one that the server considers valid.

Thus I claim that, because of the forgetful model, the client must
serialize its LAYOUTGET(openstateid) calls.

> The server will see a LAYOUTGET with an open/lock/deleg stateid in this case
> while it still thinks that the client is holding a layout.
> Since this could normally happen if the client sends multiple LAYOUTGETs in
> parallel before it received any layout stateid the server should allow it
> within the VALID_SEQID_RANGE constraints (see 12.5.5.2.1.4, although it is
> not explicitly called out there), otherwise, it seems like the server is supposed
> to return NFS4ERR_OLD_STATEID.
>
> Strictly reading the spec, the client should use the most recent layout stateid
> even in the forgetful model, until it gets a LAYOUTRETURN reply with lrs_present==false
> or until it replies NFS4ERR_NOMATCHING_LAYOUT to CB_LAYOUTRECALL with
> clora_iomode==LAYOUTIOMODE4_ANY or other values where the client never dropped
> a layout (did I say recently how much I hate the forgetful model which introduces
> more corner cases rather than simplifying the protocol as it was supposed to do? ;-)
>

Strict reading again depends on whose point of view, client or server...

"Once a client has no more layouts on a file, the layout stateid is no
longer valid and MUST NOT be used. Any attempt to use such a layout
stateid will result in NFS4ERR_BAD_STATEID."


Fred

> Benny
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

2010-11-12 08:49:24

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 07/22] SQUASHME: pnfs-submit: fixups for nfsv4.1 callbacks

From: Benny Halevy <[email protected]>

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/callback_proc.c | 52 +++++++++++++++++++++--------------------------
1 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 53a85648..2e62155 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -21,6 +21,12 @@
#define NFSDBG_FACILITY NFSDBG_CALLBACK
#endif

+static struct nfs_client *
+find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
+{
+ return cps->session ? cps->session->clp : nfs_find_client(addr, 4);
+}
+
__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
struct cb_getattrres *res,
struct cb_process_state *cps)
@@ -32,13 +38,9 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,

res->bitmap[0] = res->bitmap[1] = 0;
res->status = htonl(NFS4ERR_BADHANDLE);
- if (cps->session) { /* set in cb_sequence */
- clp = cps->session->clp;
- } else {
- clp = nfs_find_client(args->addr, 4);
- if (clp == NULL)
- goto out;
- }
+ clp = find_client_from_cps(cps, args->addr);
+ if (clp == NULL)
+ goto out;

dprintk("NFS: GETATTR callback request from %s\n",
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
@@ -81,13 +83,9 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
__be32 res;

res = htonl(NFS4ERR_BADHANDLE);
- if (cps->session) { /* set in cb_sequence */
- clp = cps->session->clp;
- } else {
- clp = nfs_find_client(args->addr, 4);
- if (clp == NULL)
- goto out;
- }
+ clp = find_client_from_cps(cps, args->addr);
+ if (clp == NULL)
+ goto out;

dprintk("NFS: RECALL callback request from %s\n",
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
@@ -111,11 +109,11 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
}
iput(inode);
}
- if (!cps->session) {
- clp = nfs_find_client_next(prev);
- nfs_put_client(prev);
- }
- } while (!cps->session && clp != NULL);
+ if (cps->session)
+ break;
+ clp = nfs_find_client_next(prev);
+ nfs_put_client(prev);
+ } while (clp != NULL);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
return res;
@@ -376,11 +374,6 @@ __be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
else
goto out;

- /* the callback must come from the MDS personality */
- res = cpu_to_be32(NFS4ERR_NOTSUPP);
- if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
- goto out;
-
res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
/*
* In the _ALL or _FSID case, we need the inode to get
@@ -579,6 +572,12 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
if (status)
goto out_putclient;

+ /* The callback must come from the MDS personality */
+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) {
+ status = htonl(NFS4ERR_INVAL);
+ goto out_putclient;
+ }
+
/*
* Check for pending referring calls. If a match is found, a
* related callback was received before the response to the original
@@ -658,11 +657,6 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
dprintk("NFS: RECALL_ANY callback request from %s\n",
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));

- /* the callback must come from the MDS personality */
- status = cpu_to_be32(NFS4ERR_NOTSUPP);
- if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
- goto out;
-
status = cpu_to_be32(NFS4ERR_INVAL);
if (!validate_bitmap_values((const unsigned long *)
&args->craa_type_mask))
--
1.7.2.1


2010-11-14 11:44:17

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

On 2010-11-13 11:11, Trond Myklebust wrote:
> On Fri, 2010-11-12 at 03:48 -0500, Fred Isaman wrote:
>> Remove NFS_LAYOUT_STATEID_SET in favor of just checking list_empty(lo->segs).
>>
>> LAYOUTGETs with openstateid are serialized. Waiting on the condition
>> (list_empty(lo->segs) && plh_outstanding>0) both drains outstanding RPCs once
>> the stateid is invalidated and allows only a single LAYOUTGET(openstateid)
>> through at a time.
>>
>> Before sending a LAYOUTRETURN, plh_block_lgets is incremented. It is
>> decremented in the rpc_release function. While set, LAYOUTGETs are
>> paused in their rpc_prepare function, and any responses are
>> forgotten.
>>
>> Callbacks are handled by blocking any matching LAYOUTGETS while processing and
>> initiating drain of IO. A notification system is set up so that when
>> all relevant IO is finished, the state manager thread is invoked, which
>> synchronously sends the final matching LAYOUTRETURN before unblocking
>> LAYOUTGETS.
>>
>> Signed-off-by: Fred Isaman <[email protected]>
>> ---
>> fs/nfs/callback.h | 7 +
>> fs/nfs/callback_proc.c | 466 +++++++++++++++++++++++----------------------
>> fs/nfs/client.c | 3 +
>> fs/nfs/nfs4proc.c | 81 ++++++--
>> fs/nfs/nfs4state.c | 4 +
>> fs/nfs/nfs4xdr.c | 16 ++-
>> fs/nfs/pnfs.c | 177 +++++++++++++-----
>> fs/nfs/pnfs.h | 41 +++-
>> include/linux/nfs_fs_sb.h | 4 +
>> 9 files changed, 497 insertions(+), 302 deletions(-)
>>
>> diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
>> index cea58cc..4a9905b 100644
>> --- a/fs/nfs/callback.h
>> +++ b/fs/nfs/callback.h
>> @@ -163,6 +163,9 @@ struct cb_layoutrecallargs {
>> extern unsigned nfs4_callback_layoutrecall(
>> struct cb_layoutrecallargs *args,
>> void *dummy, struct cb_process_state *cps);
>> +extern bool matches_outstanding_recall(struct inode *ino,
>> + struct pnfs_layout_range *range);
>> +extern void nfs_client_return_layouts(struct nfs_client *clp);
>>
>> static inline void put_session_client(struct nfs4_session *session)
>> {
>> @@ -178,6 +181,10 @@ find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
>>
>> #else
>>
>> +static inline void nfs_client_return_layouts(struct nfs_client *clp)
>> +{
>> +}
>> +
>> static inline struct nfs_client *
>> find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
>> {
>> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
>> index 6e0fc40..af405cf 100644
>> --- a/fs/nfs/callback_proc.c
>> +++ b/fs/nfs/callback_proc.c
>> @@ -124,265 +124,283 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
>> #if defined(CONFIG_NFS_V4_1)
>>
>> static bool
>> -pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo,
>> - const nfs4_stateid stateid)
>> +_recall_matches_lget(struct pnfs_cb_lrecall_info *cb_info,
>> + struct inode *ino, struct pnfs_layout_range *range)
>> {
>> - bool res;
>> - u32 oldseqid, newseqid;
>> -
>> - spin_lock(&lo->inode->i_lock);
>> - {
>> - oldseqid = be32_to_cpu(lo->stateid.stateid.seqid);
>> - newseqid = be32_to_cpu(stateid.stateid.seqid);
>> - res = !memcmp(lo->stateid.stateid.other,
>> - stateid.stateid.other,
>> - NFS4_STATEID_OTHER_SIZE);
>> - if (res) { /* comparing layout stateids */
>> - if (oldseqid == ~0)
>> - res = (newseqid == 1);
>> - else
>> - res = (newseqid == oldseqid + 1);
>> - } else { /* open stateid */
>> - res = !memcmp(lo->stateid.data,
>> - &zero_stateid,
>> - NFS4_STATEID_SIZE);
>> - if (res)
>> - res = (newseqid == 1);
>> - }
>> - }
>> - spin_unlock(&lo->inode->i_lock);
>> + struct cb_layoutrecallargs *cb_args = &cb_info->pcl_args;
>>
>> - return res;
>> + switch (cb_args->cbl_recall_type) {
>> + case RETURN_ALL:
>> + return true;
>> + case RETURN_FSID:
>> + return !memcmp(&NFS_SERVER(ino)->fsid, &cb_args->cbl_fsid,
>> + sizeof(struct nfs_fsid));
>> + case RETURN_FILE:
>> + return (ino == cb_info->pcl_ino) &&
>> + should_free_lseg(range, &cb_args->cbl_range);
>> + default:
>> + BUG();
>
> Why should we BUG() just because the server is screwed up? That's not a
> client bug.
>

Agreed. This should be handled earlier, in nfs4_callback_layoutrecall
or do_callback_layoutrecall, so that we can return NFS4ERR_INVAL.


>> + }
>> }
>>
>> -/*
>> - * Retrieve an inode based on layout recall parameters
>> - *
>> - * Note: caller must iput(inode) to dereference the inode.
>> - */
>> -static struct inode *
>> -nfs_layoutrecall_find_inode(struct nfs_client *clp,
>> - const struct cb_layoutrecallargs *args)
>> +bool
>> +matches_outstanding_recall(struct inode *ino, struct pnfs_layout_range *range)
>> {
>> - struct nfs_inode *nfsi;
>> - struct pnfs_layout_hdr *lo;
>> - struct nfs_server *server;
>> - struct inode *ino = NULL;
>> -
>> - dprintk("%s: Begin recall_type=%d clp %p\n",
>> - __func__, args->cbl_recall_type, clp);
>> -
>> - spin_lock(&clp->cl_lock);
>> - list_for_each_entry(lo, &clp->cl_layouts, layouts) {
>> - nfsi = NFS_I(lo->inode);
>> - if (!nfsi)
>> - continue;
>> -
>> - dprintk("%s: Searching inode=%lu\n",
>> - __func__, nfsi->vfs_inode.i_ino);
>> -
>> - if (args->cbl_recall_type == RETURN_FILE) {
>> - if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh))
>> - continue;
>> - } else if (args->cbl_recall_type == RETURN_FSID) {
>> - server = NFS_SERVER(&nfsi->vfs_inode);
>> - if (server->fsid.major != args->cbl_fsid.major ||
>> - server->fsid.minor != args->cbl_fsid.minor)
>> - continue;
>> + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
>> + struct pnfs_cb_lrecall_info *cb_info;
>> + bool rv = false;
>> +
>> + assert_spin_locked(&clp->cl_lock);
>
> Can we please go easy on the asserts? There is way too much asserting
> going on in the NFSv4.1 code. This isn't a publicly visible interface,
> so just get it right in the debugging process before the merge, and then
> kill these asserts...
>

OK. We can keep them in a DEVONLY patch that lives only in the development tree
(they come in handy whenever changes are made to these code paths).

>> + list_for_each_entry(cb_info, &clp->cl_layoutrecalls, pcl_list) {
>> + if (_recall_matches_lget(cb_info, ino, range)) {
>> + rv = true;
>> + break;
>> }
>> -
>> - /* Make sure client didn't clean up layout without
>> - * telling the server */
>> - if (!has_layout(nfsi))
>> - continue;
>> -
>> - ino = igrab(&nfsi->vfs_inode);
>> - dprintk("%s: Found inode=%p\n", __func__, ino);
>> - break;
>> }
>> - spin_unlock(&clp->cl_lock);
>> - return ino;
>> + return rv;
>> }
>>
>> -struct recall_layout_threadargs {
>> - struct inode *inode;
>> - struct nfs_client *clp;
>> - struct completion started;
>> - struct cb_layoutrecallargs *rl;
>> - int result;
>> -};
>> -
>> -static int pnfs_recall_layout(void *data)
>> +/* Send a synchronous LAYOUTRETURN. By the time this is called, we know
>> + * all IO has been drained, any matching lsegs deleted, and that no
>> + * overlapping LAYOUTGETs will be sent or processed for the duration
>> + * of this call.
>> + * Note that it is possible that when this is called, the stateid has
>> + * been invalidated. But will not be cleared, so can still use.
>> + */
>> +static int
>> +pnfs_send_layoutreturn(struct nfs_client *clp,
>> + struct pnfs_cb_lrecall_info *cb_info)
>> {
>> - struct inode *inode, *ino;
>> - struct nfs_client *clp;
>> - struct cb_layoutrecallargs rl;
>> + struct cb_layoutrecallargs *args = &cb_info->pcl_args;
>> struct nfs4_layoutreturn *lrp;
>> - struct recall_layout_threadargs *args =
>> - (struct recall_layout_threadargs *)data;
>> - int status = 0;
>> -
>> - daemonize("nfsv4-layoutreturn");
>> -
>> - dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n",
>> - __func__, args->rl->cbl_recall_type,
>> - args->rl->cbl_fsid.major, args->rl->cbl_fsid.minor);
>> -
>> - clp = args->clp;
>> - inode = args->inode;
>> - rl = *args->rl;
>> -
>> - /* support whole file layouts only */
>> - rl.cbl_range.offset = 0;
>> - rl.cbl_range.length = NFS4_MAX_UINT64;
>> -
>> - if (rl.cbl_recall_type == RETURN_FILE) {
>> - if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout,
>> - rl.cbl_stateid))
>> - status = pnfs_return_layout(inode, &rl.cbl_range,
>> - &rl.cbl_stateid, RETURN_FILE,
>> - false);
>> - else
>> - status = cpu_to_be32(NFS4ERR_DELAY);
>> - if (status)
>> - dprintk("%s RETURN_FILE error: %d\n", __func__, status);
>> - else
>> - status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
>> - args->result = status;
>> - complete(&args->started);
>> - goto out;
>> - }
>> -
>> - status = cpu_to_be32(NFS4_OK);
>> - args->result = status;
>> - complete(&args->started);
>> - args = NULL;
>> -
>> - /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */
>> - while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) {
>> - /* FIXME: need to check status on pnfs_return_layout */
>> - pnfs_return_layout(ino, &rl.cbl_range, NULL, RETURN_FILE, false);
>> - iput(ino);
>> - }
>>
>> lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
>> - if (!lrp) {
>> - dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n",
>> - __func__);
>> - goto out;
>> - }
>> -
>> - /* send final layoutreturn */
>> + if (!lrp)
>> + return -ENOMEM;
>> lrp->args.reclaim = 0;
>> - lrp->args.layout_type = rl.cbl_layout_type;
>> - lrp->args.return_type = rl.cbl_recall_type;
>> + lrp->args.layout_type = args->cbl_layout_type;
>> + lrp->args.return_type = args->cbl_recall_type;
>> lrp->clp = clp;
>> - lrp->args.range = rl.cbl_range;
>> - lrp->args.inode = inode;
>> - nfs4_proc_layoutreturn(lrp, true);
>> -
>> -out:
>> - clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
>> - nfs_put_client(clp);
>> - module_put_and_exit(0);
>> - dprintk("%s: exit status %d\n", __func__, 0);
>> - return 0;
>> + if (args->cbl_recall_type == RETURN_FILE) {
>> + lrp->args.range = args->cbl_range;
>> + lrp->args.inode = cb_info->pcl_ino;
>> + } else {
>> + lrp->args.range.iomode = IOMODE_ANY;
>> + lrp->args.inode = NULL;
>> + }
>> + return nfs4_proc_layoutreturn(lrp, true);
>> }
>>
>> -/*
>> - * Asynchronous layout recall!
>> +/* Called by state manager to finish CB_LAYOUTRECALLS initiated by
>> + * nfs4_callback_layoutrecall().
>> */
>> -static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
>> - struct cb_layoutrecallargs *rl)
>> +void nfs_client_return_layouts(struct nfs_client *clp)
>> {
>> - struct recall_layout_threadargs data = {
>> - .clp = clp,
>> - .inode = inode,
>> - .rl = rl,
>> - };
>> - struct task_struct *t;
>> - int status = -EAGAIN;
>> + struct pnfs_cb_lrecall_info *cb_info;
>>
>> - dprintk("%s: -->\n", __func__);
>> + spin_lock(&clp->cl_lock);
>> + while (true) {
>> + if (list_empty(&clp->cl_layoutrecalls)) {
>> + spin_unlock(&clp->cl_lock);
>> + break;
>> + }
>> + cb_info = list_first_entry(&clp->cl_layoutrecalls,
>> + struct pnfs_cb_lrecall_info,
>> + pcl_list);
>> + spin_unlock(&clp->cl_lock);
>> + if (atomic_read(&cb_info->pcl_count) != 0)
>> + break;
>> + /* What do on error return? These layoutreturns are
>> + * required by the protocol. So if do not get
>> + * successful reply, probably have to do something
>> + * more drastic.
>> + */
>> + pnfs_send_layoutreturn(clp, cb_info);
>> + spin_lock(&clp->cl_lock);
>> + /* Removing from the list unblocks LAYOUTGETs */
>> + list_del(&cb_info->pcl_list);
>> + clp->cl_cb_lrecall_count--;
>> + rpc_wake_up(&clp->cl_rpcwaitq_recall);
>> + kfree(cb_info);
>> + }
>> +}
>>
>> - /* FIXME: do not allow two concurrent layout recalls */
>> - if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state))
>> - return status;
>> -
>> - init_completion(&data.started);
>> - __module_get(THIS_MODULE);
>> - atomic_inc(&clp->cl_count);
>> -
>> - t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout");
>> - if (IS_ERR(t)) {
>> - printk(KERN_INFO "NFS: Layout recall callback thread failed "
>> - "for client (clientid %08x/%08x)\n",
>> - (unsigned)(clp->cl_clientid >> 32),
>> - (unsigned)(clp->cl_clientid));
>> - status = PTR_ERR(t);
>> - goto out_module_put;
>> +void notify_drained(struct pnfs_cb_lrecall_info *d)
>> +{
>> + if (d && atomic_dec_and_test(&d->pcl_count)) {
>> + set_bit(NFS4CLNT_LAYOUT_RECALL, &d->pcl_clp->cl_state);
>> + nfs4_schedule_state_manager(d->pcl_clp);
>> }
>> - wait_for_completion(&data.started);
>> - return data.result;
>> -out_module_put:
>> - nfs_put_client(clp);
>> - clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
>> - module_put(THIS_MODULE);
>> - return status;
>> }
>>
>> -static int pnfs_recall_all_layouts(struct nfs_client *clp)
>> +static int initiate_layout_draining(struct pnfs_cb_lrecall_info *cb_info)
>> {
>> - struct cb_layoutrecallargs rl;
>> - struct inode *inode;
>> - int status = 0;
>> -
>> - rl.cbl_recall_type = RETURN_ALL;
>> - rl.cbl_range.iomode = IOMODE_ANY;
>> - rl.cbl_range.offset = 0;
>> - rl.cbl_range.length = NFS4_MAX_UINT64;
>> -
>> - /* we need the inode to get the nfs_server struct */
>> - inode = nfs_layoutrecall_find_inode(clp, &rl);
>> - if (!inode)
>> - return status;
>> - status = pnfs_async_return_layout(clp, inode, &rl);
>> - iput(inode);
>> + struct nfs_client *clp = cb_info->pcl_clp;
>> + struct pnfs_layout_hdr *lo;
>> + int rv = NFS4ERR_NOMATCHING_LAYOUT;
>> + struct cb_layoutrecallargs *args = &cb_info->pcl_args;
>> +
>> + if (args->cbl_recall_type == RETURN_FILE) {
>> + LIST_HEAD(free_me_list);
>> +
>> + spin_lock(&clp->cl_lock);
>> + list_for_each_entry(lo, &clp->cl_layouts, layouts) {
>> + if (nfs_compare_fh(&args->cbl_fh,
>> + &NFS_I(lo->inode)->fh))
>> + continue;
>> + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
>> + rv = NFS4ERR_DELAY;
>> + else {
>> + /* FIXME I need to better understand igrab and
>> + * does having a layout ref keep ino around?
>> + * It should.
>> + */
>> + /* We need to hold the reference until any
>> + * potential LAYOUTRETURN is finished.
>> + */
>> + get_layout_hdr(lo);
>> + cb_info->pcl_ino = lo->inode;
>> + rv = NFS4_OK;
>> + }
>> + break;
>> + }
>> + spin_unlock(&clp->cl_lock);
>> +
>> + spin_lock(&lo->inode->i_lock);
>> + if (rv == NFS4_OK) {
>> + lo->plh_block_lgets++;
>> + nfs4_asynch_forget_layouts(lo, &args->cbl_range,
>> + cb_info, &free_me_list);
>> + }
>> + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
>> + spin_unlock(&lo->inode->i_lock);
>> + pnfs_free_lseg_list(&free_me_list);
>> + } else {
>> + struct pnfs_layout_hdr *tmp;
>> + LIST_HEAD(recall_list);
>> + LIST_HEAD(free_me_list);
>> + struct pnfs_layout_range range = {
>> + .iomode = IOMODE_ANY,
>> + .offset = 0,
>> + .length = NFS4_MAX_UINT64,
>> + };
>> +
>> + spin_lock(&clp->cl_lock);
>> + /* Per RFC 5661, 12.5.5.2.1.5, bulk recall must be serialized */
>> + if (!list_is_singular(&clp->cl_layoutrecalls)) {
>> + spin_unlock(&clp->cl_lock);
>> + return NFS4ERR_DELAY;
>> + }
>> + list_for_each_entry(lo, &clp->cl_layouts, layouts) {
>> + if ((args->cbl_recall_type == RETURN_FSID) &&
>> + memcmp(&NFS_SERVER(lo->inode)->fsid,
>> + &args->cbl_fsid, sizeof(struct nfs_fsid)))
>> + continue;
>> + get_layout_hdr(lo);
>> + /* We could list_del(&lo->layouts) here */
>> + BUG_ON(!list_empty(&lo->plh_bulk_recall));
>> + list_add(&lo->plh_bulk_recall, &recall_list);
>> + }
>> + spin_unlock(&clp->cl_lock);
>> + list_for_each_entry_safe(lo, tmp,
>> + &recall_list, plh_bulk_recall) {
>> + spin_lock(&lo->inode->i_lock);
>> + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
>> + nfs4_asynch_forget_layouts(lo, &range, cb_info,
>> + &free_me_list);
>> + list_del_init(&lo->plh_bulk_recall);
>> + spin_unlock(&lo->inode->i_lock);
>> + put_layout_hdr(lo->inode);
>> + rv = NFS4_OK;
>> + }
>> + pnfs_free_lseg_list(&free_me_list);
>> + }
>> + return rv;
>> +}
>> +
>> +static u32 do_callback_layoutrecall(struct nfs_client *clp,
>> + struct cb_layoutrecallargs *args)
>> +{
>> + struct pnfs_cb_lrecall_info *new;
>> + u32 res;
>> +
>> + dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
>> + new = kmalloc(sizeof(*new), GFP_KERNEL);
>> + if (!new) {
>> + res = NFS4ERR_RESOURCE;
>> + goto out;
>> + }
>> + memcpy(&new->pcl_args, args, sizeof(*args));
>> + atomic_set(&new->pcl_count, 1);
>> + new->pcl_clp = clp;
>> + new->pcl_ino = NULL;
>> + spin_lock(&clp->cl_lock);
>> + if (clp->cl_cb_lrecall_count >= PNFS_MAX_CB_LRECALLS) {
>> + kfree(new);
>> + res = NFS4ERR_DELAY;
>> + spin_unlock(&clp->cl_lock);
>> + goto out;
>> + }
>> + clp->cl_cb_lrecall_count++;
>> + /* Adding to the list will block conflicting LGET activity */
>> + list_add_tail(&new->pcl_list, &clp->cl_layoutrecalls);
>> + spin_unlock(&clp->cl_lock);
>> + res = initiate_layout_draining(new);
>> + if (res || atomic_dec_and_test(&new->pcl_count)) {
>> + spin_lock(&clp->cl_lock);
>> + list_del(&new->pcl_list);
>> + clp->cl_cb_lrecall_count--;
>> + rpc_wake_up(&clp->cl_rpcwaitq_recall);
>> + spin_unlock(&clp->cl_lock);
>> + if (res == NFS4_OK) {
>> + if (args->cbl_recall_type == RETURN_FILE) {
>> + struct pnfs_layout_hdr *lo;
>> +
>> + lo = NFS_I(new->pcl_ino)->layout;
>> + spin_lock(&lo->inode->i_lock);
>> + lo->plh_block_lgets--;
>> + if (!pnfs_layoutgets_blocked(lo, NULL))
>> + rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
>> + spin_unlock(&lo->inode->i_lock);
>> + put_layout_hdr(new->pcl_ino);
>> + }
>> + res = NFS4ERR_NOMATCHING_LAYOUT;
>> + }
>> + kfree(new);
>> + }
>> +out:
>> + dprintk("%s returning %i\n", __func__, res);
>> + return res;
>>
>> - return status;
>> }
>>
>> __be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
>> void *dummy, struct cb_process_state *cps)
>> {
>> struct nfs_client *clp;
>> - struct inode *inode = NULL;
>> - __be32 res;
>> - int status;
>> + u32 res;
>>
>> dprintk("%s: -->\n", __func__);
>>
>> - res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
>> - if (cps->session) /* set in cb_sequence */
>> + if (cps->session) { /* set in cb_sequence */
>> clp = cps->session->clp;
>> - else
>> - goto out;
>> + res = do_callback_layoutrecall(clp, args);
>> + } else
>> + res = NFS4ERR_OP_NOT_IN_SESSION;
>>
>> - res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
>> - /*
>> - * In the _ALL or _FSID case, we need the inode to get
>> - * the nfs_server struct.
>> - */
>> - inode = nfs_layoutrecall_find_inode(clp, args);
>> - if (!inode)
>> - goto out;
>> - status = pnfs_async_return_layout(clp, inode, args);
>> - if (status)
>> - res = cpu_to_be32(NFS4ERR_DELAY);
>> - iput(inode);
>> -out:
>> - dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
>> - return res;
>> + dprintk("%s: exit with status = %d\n", __func__, res);
>> + return cpu_to_be32(res);
>> +}
>> +
>> +static void pnfs_recall_all_layouts(struct nfs_client *clp)
>> +{
>> + struct cb_layoutrecallargs args;
>> +
>> + /* Pretend we got a CB_LAYOUTRECALL(ALL) */
>> + memset(&args, 0, sizeof(args));
>> + args.cbl_recall_type = RETURN_ALL;
>> + /* FIXME we ignore errors, what should we do? */
>
> We're a forgetful client: we don't care...
>

Well, CB_RECALL_ANY is generated in order to trim the server's state down
by allowing the client to *return* state it needs less or no longer needs.
Just forgetting this state doesn't help the server at all with this job!
There's no equivalent error to NFS4ERR_NOMATCHING_LAYOUT for CB_RECALL_ANY.

>> + do_callback_layoutrecall(clp, &args);
>> }
>
>
>
>>
>> int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
>> @@ -665,9 +683,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
>> flags |= FMODE_WRITE;
>> if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
>> &args->craa_type_mask))
>> - if (pnfs_recall_all_layouts(clp) == -EAGAIN)
>> - status = cpu_to_be32(NFS4ERR_DELAY);
>> -
>> + pnfs_recall_all_layouts(clp);
>> if (flags)
>> nfs_expire_all_delegation_types(clp, flags);
>> out:
>> diff --git a/fs/nfs/client.c b/fs/nfs/client.c
>> index 3c8c841..dbf43e7 100644
>> --- a/fs/nfs/client.c
>> +++ b/fs/nfs/client.c
>> @@ -158,6 +158,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
>> clp->cl_machine_cred = cred;
>> #if defined(CONFIG_NFS_V4_1)
>> INIT_LIST_HEAD(&clp->cl_layouts);
>> + INIT_LIST_HEAD(&clp->cl_layoutrecalls);
>> + rpc_init_wait_queue(&clp->cl_rpcwaitq_recall,
>> + "NFS client CB_LAYOUTRECALLS");
>> #endif
>> nfs_fscache_get_client_cookie(clp);
>>
>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
>> index fe79872..6223c6a 100644
>> --- a/fs/nfs/nfs4proc.c
>> +++ b/fs/nfs/nfs4proc.c
>> @@ -5346,31 +5346,58 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
>> struct inode *ino = lgp->args.inode;
>> struct nfs_inode *nfsi = NFS_I(ino);
>> struct nfs_server *server = NFS_SERVER(ino);
>> + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
>>
>> dprintk("--> %s\n", __func__);
>> + spin_lock(&clp->cl_lock);
>> + if (matches_outstanding_recall(ino, &lgp->args.range)) {
>> + rpc_sleep_on(&clp->cl_rpcwaitq_recall, task, NULL);
>> + spin_unlock(&clp->cl_lock);
>> + return;
>> + }
>> + spin_unlock(&clp->cl_lock);
>> + /* Note the is a race here, where a CB_LAYOUTRECALL can come in
>> + * right now covering the LAYOUTGET we are about to send.
>> + * However, that is not so catastrophic, and there seems
>> + * to be no way to prevent it completely.
>> + */
>> spin_lock(&ino->i_lock);
>> - if (pnfs_layoutgets_blocked(nfsi->layout)) {
>> + if (pnfs_layoutgets_blocked(nfsi->layout, NULL)) {
>> rpc_sleep_on(&nfsi->lo_rpcwaitq_stateid, task, NULL);
>> spin_unlock(&ino->i_lock);
>> return;
>> }
>> + /* This needs after above check but atomic with it in order to properly
>> + * serialize openstateid LAYOUTGETs.
>> + */
>> + nfsi->layout->plh_outstanding++;
>> spin_unlock(&ino->i_lock);
>> +
>> if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
>> - &lgp->res.seq_res, 0, task))
>> + &lgp->res.seq_res, 0, task)) {
>> + spin_lock(&ino->i_lock);
>> + nfsi->layout->plh_outstanding--;
>> + spin_unlock(&ino->i_lock);
>> return;
>> + }
>> rpc_call_start(task);
>> }
>>
>> static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
>> {
>> struct nfs4_layoutget *lgp = calldata;
>> - struct nfs_server *server = NFS_SERVER(lgp->args.inode);
>> + struct inode *ino = lgp->args.inode;
>>
>> dprintk("--> %s\n", __func__);
>>
>> - if (!nfs4_sequence_done(task, &lgp->res.seq_res))
>> + if (!nfs4_sequence_done(task, &lgp->res.seq_res)) {
>> + /* layout code relies on fact that in this case
>> + * code falls back to tk_action=call_start, but not
>> + * back to rpc_prepare_task, to keep plh_outstanding
>> + * correct.
>> + */
>> return;
>> -
>> + }
>> switch (task->tk_status) {
>> case 0:
>> break;
>> @@ -5379,7 +5406,11 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
>> task->tk_status = -NFS4ERR_DELAY;
>> /* Fall through */
>> default:
>> - if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
>> + if (nfs4_async_handle_error(task, NFS_SERVER(ino),
>> + NULL, NULL) == -EAGAIN) {
>> + spin_lock(&ino->i_lock);
>> + NFS_I(ino)->layout->plh_outstanding--;
>> + spin_unlock(&ino->i_lock);
>> rpc_restart_call_prepare(task);
>> return;
>> }
>> @@ -5437,13 +5468,20 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
>> if (IS_ERR(task))
>> return PTR_ERR(task);
>> status = nfs4_wait_for_completion_rpc_task(task);
>> - if (status != 0)
>> - goto out;
>> - status = task->tk_status;
>> - if (status != 0)
>> - goto out;
>> - status = pnfs_layout_process(lgp);
>> -out:
>> + if (status == 0)
>> + status = task->tk_status;
>> + if (status == 0)
>> + status = pnfs_layout_process(lgp);
>> + else {
>> + struct inode *ino = lgp->args.inode;
>> + struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
>> +
>> + spin_lock(&ino->i_lock);
>> + lo->plh_outstanding--;
>> + if (!pnfs_layoutgets_blocked(lo, NULL))
>> + rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
>> + spin_unlock(&ino->i_lock);
>> + }
>> rpc_put_task(task);
>> dprintk("<-- %s status=%d\n", __func__, status);
>> return status;
>> @@ -5587,9 +5625,9 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
>>
>> spin_lock(&lo->inode->i_lock);
>> if (lrp->res.lrs_present)
>> - pnfs_set_layout_stateid(lo, &lrp->res.stateid);
>> + pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
>> else
>> - pnfs_invalidate_layout_stateid(lo);
>> + BUG_ON(!list_empty(&lo->segs));
>> spin_unlock(&lo->inode->i_lock);
>> }
>> dprintk("<-- %s\n", __func__);
>> @@ -5606,10 +5644,11 @@ static void nfs4_layoutreturn_release(void *calldata)
>>
>> spin_lock(&ino->i_lock);
>> lo->plh_block_lgets--;
>> - if (!pnfs_layoutgets_blocked(lo))
>> + lo->plh_outstanding--;
>> + if (!pnfs_layoutgets_blocked(lo, NULL))
>> rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
>> spin_unlock(&ino->i_lock);
>> - put_layout_hdr(lrp->args.inode);
>> + put_layout_hdr(ino);
>> }
>> kfree(calldata);
>> dprintk("<-- %s\n", __func__);
>> @@ -5639,6 +5678,14 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
>> int status = 0;
>>
>> dprintk("--> %s\n", __func__);
>> + if (lrp->args.return_type == RETURN_FILE) {
>> + struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
>> + /* FIXME we should test for BULK here */
>> + spin_lock(&lo->inode->i_lock);
>> + BUG_ON(lo->plh_block_lgets == 0);
>> + lo->plh_outstanding++;
>> + spin_unlock(&lo->inode->i_lock);
>> + }
>> task = rpc_run_task(&task_setup_data);
>> if (IS_ERR(task))
>> return PTR_ERR(task);
>> diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
>> index 00632f6..ceb0d66 100644
>> --- a/fs/nfs/nfs4state.c
>> +++ b/fs/nfs/nfs4state.c
>> @@ -1560,6 +1560,10 @@ static void nfs4_state_manager(struct nfs_client *clp)
>> nfs_client_return_marked_delegations(clp);
>> continue;
>> }
>> + if (test_and_clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) {
>> + nfs_client_return_layouts(clp);
>> + continue;
>> + }
>> /* Recall session slots */
>> if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)
>> && nfs4_has_session(clp)) {
>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>> index 328cca5..f530c7e 100644
>> --- a/fs/nfs/nfs4xdr.c
>> +++ b/fs/nfs/nfs4xdr.c
>> @@ -1827,13 +1827,14 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
>> hdr->replen += decode_getdeviceinfo_maxsz;
>> }
>>
>> -static void
>> +static int
>> encode_layoutget(struct xdr_stream *xdr,
>> const struct nfs4_layoutget_args *args,
>> struct compound_hdr *hdr)
>> {
>> nfs4_stateid stateid;
>> __be32 *p;
>> + int status;
>>
>> p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
>> *p++ = cpu_to_be32(OP_LAYOUTGET);
>> @@ -1843,8 +1844,11 @@ encode_layoutget(struct xdr_stream *xdr,
>> p = xdr_encode_hyper(p, args->range.offset);
>> p = xdr_encode_hyper(p, args->range.length);
>> p = xdr_encode_hyper(p, args->minlength);
>> - pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
>> - args->ctx->state);
>> + status = pnfs_choose_layoutget_stateid(&stateid,
>> + NFS_I(args->inode)->layout,
>> + args->ctx->state);
>> + if (status)
>> + return status;
>> p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
>> *p = cpu_to_be32(args->maxcount);
>>
>> @@ -1857,6 +1861,7 @@ encode_layoutget(struct xdr_stream *xdr,
>> args->maxcount);
>> hdr->nops++;
>> hdr->replen += decode_layoutget_maxsz;
>> + return 0;
>> }
>>
>> static int
>> @@ -2782,12 +2787,15 @@ static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
>> struct compound_hdr hdr = {
>> .minorversion = nfs4_xdr_minorversion(&args->seq_args),
>> };
>> + int status;
>>
>> xdr_init_encode(&xdr, &req->rq_snd_buf, p);
>> encode_compound_hdr(&xdr, req, &hdr);
>> encode_sequence(&xdr, &args->seq_args, &hdr);
>> encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
>> - encode_layoutget(&xdr, args, &hdr);
>> + status = encode_layoutget(&xdr, args, &hdr);
>> + if (status)
>> + return status;
>> encode_nops(&hdr);
>> return 0;
>> }
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 07b04e8..2d817be 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
>> */
>>
>> /* Need to hold i_lock if caller does not already hold reference */
>> -static void
>> +void
>> get_layout_hdr(struct pnfs_layout_hdr *lo)
>> {
>> atomic_inc(&lo->plh_refcount);
>> @@ -278,24 +278,29 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
>> smp_mb();
>> lseg->valid = true;
>> lseg->layout = lo;
>> + lseg->drain_notification = NULL;
>> }
>>
>> static void
>> _put_lseg_common(struct pnfs_layout_segment *lseg)
>> {
>> + struct inode *ino = lseg->layout->inode;
>> +
>> BUG_ON(lseg->valid == true);
>> list_del(&lseg->fi_list);
>> if (list_empty(&lseg->layout->segs)) {
>> struct nfs_client *clp;
>>
>> - clp = NFS_SERVER(lseg->layout->inode)->nfs_client;
>> + clp = NFS_SERVER(ino)->nfs_client;
>> spin_lock(&clp->cl_lock);
>> /* List does not take a reference, so no need for put here */
>> list_del_init(&lseg->layout->layouts);
>> spin_unlock(&clp->cl_lock);
>> - pnfs_invalidate_layout_stateid(lseg->layout);
>> + clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->layout->plh_flags);
>> + if (!pnfs_layoutgets_blocked(lseg->layout, NULL))
>> + rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
>> }
>> - rpc_wake_up(&NFS_I(lseg->layout->inode)->lo_rpcwaitq);
>> + rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq);
>> }
>>
>> /* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
>> @@ -325,9 +330,12 @@ put_lseg(struct pnfs_layout_segment *lseg)
>> atomic_read(&lseg->pls_refcount), lseg->valid);
>> ino = lseg->layout->inode;
>> if (atomic_dec_and_lock(&lseg->pls_refcount, &ino->i_lock)) {
>> + struct pnfs_cb_lrecall_info *drain_info = lseg->drain_notification;
>> +
>> _put_lseg_common(lseg);
>> spin_unlock(&ino->i_lock);
>> NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
>> + notify_drained(drain_info);
>> /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
>> put_layout_hdr(ino);
>> }
>> @@ -345,7 +353,7 @@ EXPORT_SYMBOL_GPL(put_lseg);
>> * READ READ true
>> * READ RW false
>> */
>> -static int
>> +bool
>> should_free_lseg(struct pnfs_layout_range *lseg_range,
>> struct pnfs_layout_range *recall_range)
>> {
>> @@ -388,16 +396,19 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
>> dprintk("%s:Return\n", __func__);
>> }
>>
>> -static void
>> +void
>> pnfs_free_lseg_list(struct list_head *free_me)
>> {
>> struct pnfs_layout_segment *lseg, *tmp;
>> struct inode *ino;
>> + struct pnfs_cb_lrecall_info *drain_info;
>>
>> list_for_each_entry_safe(lseg, tmp, free_me, fi_list) {
>> BUG_ON(atomic_read(&lseg->pls_refcount) != 0);
>> ino = lseg->layout->inode;
>> + drain_info = lseg->drain_notification;
>> NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
>> + notify_drained(drain_info);
>> /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
>> put_layout_hdr(ino);
>> }
>> @@ -453,40 +464,49 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
>> }
>> }
>>
>> -/* update lo->stateid with new if is more recent
>> - *
>> - * lo->stateid could be the open stateid, in which case we just use what given.
>> - */
>> +/* update lo->stateid with new if is more recent */
>> void
>> -pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
>> - const nfs4_stateid *new)
>> +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
>> + bool update_barrier)
>> {
>> - nfs4_stateid *old = &lo->stateid;
>> - bool overwrite = false;
>> + u32 oldseq, newseq;
>>
>> assert_spin_locked(&lo->inode->i_lock);
>> - if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags) ||
>> - memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
>> - overwrite = true;
>> - else {
>> - u32 oldseq, newseq;
>> -
>> - oldseq = be32_to_cpu(old->stateid.seqid);
>> - newseq = be32_to_cpu(new->stateid.seqid);
>> - if ((int)(newseq - oldseq) > 0)
>> - overwrite = true;
>> + oldseq = be32_to_cpu(lo->stateid.stateid.seqid);
>> + newseq = be32_to_cpu(new->stateid.seqid);
>> + if ((int)(newseq - oldseq) > 0) {
>> + memcpy(&lo->stateid, &new->stateid, sizeof(new->stateid));
>> + if (update_barrier)
>> + lo->plh_barrier = be32_to_cpu(new->stateid.seqid);
>> + else {
>> + /* Because of wraparound, we want to keep the barrier
>> + * "close" to the current seqids. It needs to be
>> + * within 2**31 to count as "behind", so if it
>> + * gets too near that limit, give us a litle leeway
>> + * and bring it to within 2**30.
>> + * NOTE - and yes, this is all unsigned arithmetic.
>> + */
>> + if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
>> + lo->plh_barrier = newseq - (1 << 30);
>> + }
>> }
>> - if (overwrite)
>> - memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
>> }
>>
>> -void
>> -pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
>> - struct nfs4_state *open_state)
>> +int
>> +pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
>> + struct nfs4_state *open_state)
>> {
>> + int status = 0;
>> +
>> dprintk("--> %s\n", __func__);
>> spin_lock(&lo->inode->i_lock);
>> - if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags)) {
>> + if (lo->plh_block_lgets ||
>> + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
>> + /* We avoid -EAGAIN, as that has special meaning to
>> + * some callers.
>> + */
>> + status = -NFS4ERR_LAYOUTTRYLATER;
>> + } else if (list_empty(&lo->segs)) {
>> int seq;
>>
>> do {
>> @@ -494,12 +514,11 @@ pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
>> memcpy(dst->data, open_state->stateid.data,
>> sizeof(open_state->stateid.data));
>> } while (read_seqretry(&open_state->seqlock, seq));
>> - set_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
>> } else
>> - memcpy(dst->data, lo->stateid.data,
>> - sizeof(lo->stateid.data));
>> + memcpy(dst->data, lo->stateid.data, sizeof(lo->stateid.data));
>> spin_unlock(&lo->inode->i_lock);
>> dprintk("<-- %s\n", __func__);
>> + return status;
>> }
>>
>> /*
>> @@ -566,6 +585,28 @@ has_layout_to_return(struct pnfs_layout_hdr *lo,
>> return out;
>> }
>>
>> +void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
>> + struct pnfs_layout_range *range,
>> + struct pnfs_cb_lrecall_info *drain_info,
>> + struct list_head *tmp_list)
>> +{
>> + struct pnfs_layout_segment *lseg, *tmp;
>> +
>> + assert_spin_locked(&lo->inode->i_lock);
>
> Poor practice. If you want to ensure the caller holds the inode->i_lock,
> then just call the function '*_locked'. That is a lot more helpful than
> these damned asserts.
>

That makes sense.

Benny

>> + list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
>> + if (should_free_lseg(&lseg->range, range)) {
>> + /* FIXME - need to change to something like a
>> + * notification bitmap to remove the restriction
>> + * of only being able to process a single
>> + * CB_LAYOUTRECALL at a time.
>> + */
>> + BUG_ON(lseg->drain_notification);
>> + lseg->drain_notification = drain_info;
>> + atomic_inc(&drain_info->pcl_count);
>> + mark_lseg_invalid(lseg, tmp_list);
>> + }
>> +}
>> +
>> /* Return true if there is layout based io in progress in the given range.
>> * Assumes range has already been marked invalid, and layout marked to
>> * prevent any new lseg from being inserted.
>> @@ -711,14 +752,6 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
>> dprintk("%s:Begin\n", __func__);
>>
>> assert_spin_locked(&lo->inode->i_lock);
>> - if (list_empty(&lo->segs)) {
>> - struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
>> -
>> - spin_lock(&clp->cl_lock);
>> - BUG_ON(!list_empty(&lo->layouts));
>> - list_add_tail(&lo->layouts, &clp->cl_layouts);
>> - spin_unlock(&clp->cl_lock);
>> - }
>> list_for_each_entry(lp, &lo->segs, fi_list) {
>> if (cmp_layout(&lp->range, &lseg->range) > 0)
>> continue;
>> @@ -735,6 +768,9 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
>> }
>> if (!found) {
>> list_add_tail(&lseg->fi_list, &lo->segs);
>> + if (list_is_singular(&lo->segs) &&
>> + !pnfs_layoutgets_blocked(lo, NULL))
>> + rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
>> dprintk("%s: inserted lseg %p "
>> "iomode %d offset %llu length %llu at tail\n",
>> __func__, lseg, lseg->range.iomode,
>> @@ -756,6 +792,7 @@ alloc_init_layout_hdr(struct inode *ino)
>> atomic_set(&lo->plh_refcount, 1);
>> INIT_LIST_HEAD(&lo->layouts);
>> INIT_LIST_HEAD(&lo->segs);
>> + INIT_LIST_HEAD(&lo->plh_bulk_recall);
>> lo->inode = ino;
>> return lo;
>> }
>> @@ -843,6 +880,7 @@ pnfs_update_layout(struct inode *ino,
>> .length = NFS4_MAX_UINT64,
>> };
>> struct nfs_inode *nfsi = NFS_I(ino);
>> + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
>> struct pnfs_layout_hdr *lo;
>> struct pnfs_layout_segment *lseg = NULL;
>>
>> @@ -878,9 +916,28 @@ pnfs_update_layout(struct inode *ino,
>> goto out_unlock;
>>
>> get_layout_hdr(lo); /* Matched in pnfs_layoutget_release */
>> + if (list_empty(&lo->segs)) {
>> + /* The lo must be on the clp list if there is any
>> + * chance of a CB_LAYOUTRECALL(FILE) coming in.
>> + */
>> + spin_lock(&clp->cl_lock);
>> + BUG_ON(!list_empty(&lo->layouts));
>> + list_add_tail(&lo->layouts, &clp->cl_layouts);
>> + spin_unlock(&clp->cl_lock);
>> + }
>> spin_unlock(&ino->i_lock);
>>
>> lseg = send_layoutget(lo, ctx, &arg);
>> + if (!lseg) {
>> + spin_lock(&ino->i_lock);
>> + if (list_empty(&lo->segs)) {
>> + spin_lock(&clp->cl_lock);
>> + list_del_init(&lo->layouts);
>> + spin_unlock(&clp->cl_lock);
>> + clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
>> + }
>> + spin_unlock(&ino->i_lock);
>> + }
>> out:
>> dprintk("%s end, state 0x%lx lseg %p\n", __func__,
>> nfsi->layout->plh_flags, lseg);
>> @@ -891,10 +948,15 @@ out_unlock:
>> }
>>
>> bool
>> -pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo)
>> +pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid)
>> {
>> assert_spin_locked(&lo->inode->i_lock);
>> - return lo->plh_block_lgets;
>> + if ((stateid) &&
>> + (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
>> + return true;
>> + return lo->plh_block_lgets ||
>> + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
>> + (list_empty(&lo->segs) && lo->plh_outstanding);
>> }
>>
>> int
>> @@ -904,6 +966,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
>> struct nfs4_layoutget_res *res = &lgp->res;
>> struct pnfs_layout_segment *lseg;
>> struct inode *ino = lo->inode;
>> + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
>> int status = 0;
>>
>> /* Inject layout blob into I/O device driver */
>> @@ -915,10 +978,25 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
>> status = PTR_ERR(lseg);
>> dprintk("%s: Could not allocate layout: error %d\n",
>> __func__, status);
>> + spin_lock(&ino->i_lock);
>> goto out;
>> }
>>
>> spin_lock(&ino->i_lock);
>> + /* decrement needs to be done before call to pnfs_layoutget_blocked */
>> + lo->plh_outstanding--;
>> + spin_lock(&clp->cl_lock);
>> + if (matches_outstanding_recall(ino, &res->range)) {
>> + spin_unlock(&clp->cl_lock);
>> + dprintk("%s forget reply due to recall\n", __func__);
>> + goto out_forget_reply;
>> + }
>> + spin_unlock(&clp->cl_lock);
>> +
>> + if (pnfs_layoutgets_blocked(lo, &res->stateid)) {
>> + dprintk("%s forget reply due to state\n", __func__);
>> + goto out_forget_reply;
>> + }
>> init_lseg(lo, lseg);
>> lseg->range = res->range;
>> get_lseg(lseg);
>> @@ -934,10 +1012,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
>> }
>>
>> /* Done processing layoutget. Set the layout stateid */
>> - pnfs_set_layout_stateid(lo, &res->stateid);
>> - spin_unlock(&ino->i_lock);
>> + pnfs_set_layout_stateid(lo, &res->stateid, false);
>> out:
>> + if (!pnfs_layoutgets_blocked(lo, NULL))
>> + rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
>> + spin_unlock(&ino->i_lock);
>> return status;
>> +
>> +out_forget_reply:
>> + spin_unlock(&ino->i_lock);
>> + lseg->layout = lo;
>> + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
>> + spin_lock(&ino->i_lock);
>> + goto out;
>> }
>>
>> void
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index 891aeab..7ea121f 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -31,6 +31,7 @@
>> #define FS_NFS_PNFS_H
>>
>> #include <linux/nfs_page.h>
>> +#include "callback.h" /* for cb_layoutrecallargs */
>>
>> struct pnfs_layout_segment {
>> struct list_head fi_list;
>> @@ -38,6 +39,7 @@ struct pnfs_layout_segment {
>> atomic_t pls_refcount;
>> bool valid;
>> struct pnfs_layout_hdr *layout;
>> + struct pnfs_cb_lrecall_info *drain_notification;
>> };
>>
>> enum pnfs_try_status {
>> @@ -52,7 +54,7 @@ enum pnfs_try_status {
>> enum {
>> NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
>> NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
>> - NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */
>> + NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
>> NFS_LAYOUT_NEED_LCOMMIT, /* LAYOUTCOMMIT needed */
>> };
>>
>> @@ -94,10 +96,13 @@ struct pnfs_layoutdriver_type {
>> struct pnfs_layout_hdr {
>> atomic_t plh_refcount;
>> struct list_head layouts; /* other client layouts */
>> + struct list_head plh_bulk_recall; /* clnt list of bulk recalls */
>> struct list_head segs; /* layout segments list */
>> int roc_iomode;/* return on close iomode, 0=none */
>> nfs4_stateid stateid;
>> + unsigned long plh_outstanding; /* number of RPCs out */
>> unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
>> + u32 plh_barrier; /* ignore lower seqids */
>> unsigned long plh_flags;
>> struct rpc_cred *cred; /* layoutcommit credential */
>> /* DH: These vars keep track of the maximum write range
>> @@ -118,6 +123,14 @@ struct pnfs_device {
>> unsigned int pglen;
>> };
>>
>> +struct pnfs_cb_lrecall_info {
>> + struct list_head pcl_list; /* hook into cl_layoutrecalls list */
>> + atomic_t pcl_count;
>> + struct nfs_client *pcl_clp;
>> + struct inode *pcl_ino;
>> + struct cb_layoutrecallargs pcl_args;
>> +};
>> +
>> /*
>> * Device ID RCU cache. A device ID is unique per client ID and layout type.
>> */
>> @@ -176,7 +189,10 @@ extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
>> extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
>>
>> /* pnfs.c */
>> +void get_layout_hdr(struct pnfs_layout_hdr *lo);
>> void put_lseg(struct pnfs_layout_segment *lseg);
>> +bool should_free_lseg(struct pnfs_layout_range *lseg_range,
>> + struct pnfs_layout_range *recall_range);
>> struct pnfs_layout_segment *
>> pnfs_has_layout(struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range);
>> struct pnfs_layout_segment *
>> @@ -201,15 +217,24 @@ enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *,
>> void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
>> struct nfs_open_context *, struct list_head *);
>> void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
>> -bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo);
>> +bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid);
>> int pnfs_layout_process(struct nfs4_layoutget *lgp);
>> +void pnfs_free_lseg_list(struct list_head *tmp_list);
>> void pnfs_destroy_layout(struct nfs_inode *);
>> void pnfs_destroy_all_layouts(struct nfs_client *);
>> void put_layout_hdr(struct inode *inode);
>> void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
>> - const nfs4_stateid *new);
>> -void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
>> - struct nfs4_state *open_state);
>> + const nfs4_stateid *new,
>> + bool update_barrier);
>> +int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
>> + struct pnfs_layout_hdr *lo,
>> + struct nfs4_state *open_state);
>> +void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
>> + struct pnfs_layout_range *range,
>> + struct pnfs_cb_lrecall_info *drain_info,
>> + struct list_head *tmp_list);
>> +/* FIXME - this should be in callback.h, but pnfs_cb_lrecall_info needs to be there too */
>> +extern void notify_drained(struct pnfs_cb_lrecall_info *d);
>>
>> static inline bool
>> has_layout(struct nfs_inode *nfsi)
>> @@ -223,12 +248,6 @@ static inline int lo_fail_bit(u32 iomode)
>> NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
>> }
>>
>> -static inline void pnfs_invalidate_layout_stateid(struct pnfs_layout_hdr *lo)
>> -{
>> - assert_spin_locked(&lo->inode->i_lock);
>> - clear_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
>> -}
>> -
>> static inline void get_lseg(struct pnfs_layout_segment *lseg)
>> {
>> atomic_inc(&lseg->pls_refcount);
>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>> index 3cae408..80dcc00 100644
>> --- a/include/linux/nfs_fs_sb.h
>> +++ b/include/linux/nfs_fs_sb.h
>> @@ -83,6 +83,10 @@ struct nfs_client {
>> u32 cl_exchange_flags;
>> struct nfs4_session *cl_session; /* sharred session */
>> struct list_head cl_layouts;
>> + struct list_head cl_layoutrecalls;
>> + unsigned long cl_cb_lrecall_count;
>> +#define PNFS_MAX_CB_LRECALLS (1)
>> + struct rpc_wait_queue cl_rpcwaitq_recall;
>> struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>> #endif /* CONFIG_NFS_V4_1 */
>>
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2010-11-12 08:49:23

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 03/22] pnfs-submit: layoutreturn's rpc_call_op functions need to handle bulk returns

nfs4_proc_layoutreturn and its descendants were assuming that
inode and lo were always available, but that is not true in the
case of a bulk return.

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback_proc.c | 1 +
fs/nfs/nfs4proc.c | 37 ++++++++++++++++++-------------------
fs/nfs/pnfs.c | 4 +++-
include/linux/nfs_xdr.h | 1 +
4 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 6b560ce..4dabc62 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -270,6 +270,7 @@ static int pnfs_recall_layout(void *data)
lrp->args.reclaim = 0;
lrp->args.layout_type = rl.cbl_layout_type;
lrp->args.return_type = rl.cbl_recall_type;
+ lrp->clp = clp;
lrp->args.range = rl.cbl_seg;
lrp->args.inode = inode;
nfs4_proc_layoutreturn(lrp, true);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d01068c..8dbd711 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5553,23 +5553,23 @@ static void
nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutreturn *lrp = calldata;
- struct inode *ino = lrp->args.inode;
- struct nfs_inode *nfsi = NFS_I(ino);
- struct nfs_server *server = NFS_SERVER(ino);

dprintk("--> %s\n", __func__);
- if ((lrp->args.return_type == RETURN_FILE) &&
- pnfs_return_layout_barrier(nfsi, &lrp->args.range)) {
- dprintk("%s: waiting on barrier\n", __func__);
- rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
- return;
+ if (lrp->args.return_type == RETURN_FILE) {
+ struct nfs_inode *nfsi = NFS_I(lrp->args.inode);
+
+ if (pnfs_return_layout_barrier(nfsi, &lrp->args.range)) {
+ dprintk("%s: waiting on barrier\n", __func__);
+ rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
+ return;
+ }
}
if (lrp->stateid) {
/* Forget the layout, without sending the return */
rpc_exit(task, 0);
return;
}
- if (nfs4_setup_sequence(server, NULL, &lrp->args.seq_args,
+ if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
&lrp->res.seq_res, 0, task))
return;
rpc_call_start(task);
@@ -5578,16 +5578,19 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutreturn *lrp = calldata;
- struct inode *ino = lrp->args.inode;
- struct nfs_server *server = NFS_SERVER(ino);
+ struct nfs_server *server;

dprintk("--> %s\n", __func__);

if (!nfs4_sequence_done(task, &lrp->res.seq_res))
return;

- if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
- nfs_restart_rpc(task, server->nfs_client);
+ if (lrp->args.return_type == RETURN_FILE)
+ server = NFS_SERVER(lrp->args.inode);
+ else
+ server = NULL;
+ if (nfs4_async_handle_error(task, server, NULL, lrp->clp) == -EAGAIN)
+ nfs_restart_rpc(task, lrp->clp);

dprintk("<-- %s\n", __func__);
}
@@ -5595,10 +5598,8 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
static void nfs4_layoutreturn_release(void *calldata)
{
struct nfs4_layoutreturn *lrp = calldata;
- struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;

- dprintk("--> %s return_type %d lo %p\n", __func__,
- lrp->args.return_type, lo);
+ dprintk("--> %s return_type %d\n", __func__, lrp->args.return_type);

pnfs_layoutreturn_release(lrp);
kfree(calldata);
@@ -5613,8 +5614,6 @@ static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {

int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
{
- struct inode *ino = lrp->args.inode;
- struct nfs_server *server = NFS_SERVER(ino);
struct rpc_task *task;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
@@ -5622,7 +5621,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
.rpc_resp = &lrp->res,
};
struct rpc_task_setup task_setup_data = {
- .rpc_client = server->client,
+ .rpc_client = lrp->clp->cl_rpcclient,
.rpc_message = &msg,
.callback_ops = &nfs4_layoutreturn_call_ops,
.callback_data = lrp,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 72d7ed3..149f95e 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -582,11 +582,12 @@ pnfs_return_layout_barrier(struct nfs_inode *nfsi,
void
pnfs_layoutreturn_release(struct nfs4_layoutreturn *lrp)
{
- struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
+ struct pnfs_layout_hdr *lo;
LIST_HEAD(tmp_list);

if (lrp->args.return_type != RETURN_FILE)
return;
+ lo = NFS_I(lrp->args.inode)->layout;
spin_lock(&lrp->args.inode->i_lock);
pnfs_clear_lseg_list(lo, &tmp_list, &lrp->args.range);
if (!lrp->res.valid)
@@ -625,6 +626,7 @@ return_layout(struct inode *ino, struct pnfs_layout_range *range,
lrp->args.range = *range;
lrp->args.inode = ino;
lrp->stateid = stateid;
+ lrp->clp = server->nfs_client;

status = nfs4_proc_layoutreturn(lrp, wait);
out:
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 53a4d2f..23a4519 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -279,6 +279,7 @@ struct nfs4_layoutreturn {
struct nfs4_layoutreturn_res res;
struct rpc_cred *cred;
const nfs4_stateid *stateid;
+ struct nfs_client *clp;
int rpc_status;
};

--
1.7.2.1


2010-11-14 11:09:51

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 18/22] pnfs-submit: roc add layoutreturn op to close compound

On 2010-11-12 18:56, Fred Isaman wrote:
> On Fri, Nov 12, 2010 at 11:31 AM, Benny Halevy <[email protected]> wrote:
>> On 2010-11-12 10:48, Fred Isaman wrote:
>>> From: Andy Adamson <[email protected]>
>>> @@ -2143,6 +2145,8 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closea
>>> encode_putfh(&xdr, args->fh, &hdr);
>>> encode_close(&xdr, args, &hdr);
>>> encode_getfattr(&xdr, args->bitmask, &hdr);
>>> + if (args->op_bitmask & NFS4_HAS_LAYOUTRETURN) /* layoutreturn set */
>>> + encode_layoutreturn(&xdr, &args->lr_args, &hdr);
>>
>> Sorry, I just noticed, but if there's no objection I'll move the layoutreturn op
>> before close in the compound.
>>
>> Benny
>>
>
> The reason the LAYOUTRETURN was last was so that we could ignore any
> error on the return. Otherwise an error on the LAYOUTRETURN stops the
> CLOSE from being processed. I'll defer to Andy, but while I see why
> you would want the return first, moving it will require paying careful
> attention to how an error is dealt with. (Actually, we have the same
> issue with the LAYOUTCOMMIT when it is later added.)

That's true, but the whole point behind doing the layoutreturn on close
is doing it before CLOSE :)

As we say in section 18.43:
The logr_return_on_close result field is a directive to return the
layout before closing the file.
^^^^^^^^^^^^^^^^^^^^^^^^

Benny

>
> Fred
>

2010-11-12 08:49:25

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 14/22] pnfs-submit: rename lo->state to lo->plh_flags

Preparing for change in stateid code, and want to avoid name confusion.

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 18 +++++++++---------
fs/nfs/pnfs.h | 6 +++---
2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 353c674..5227d51 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -81,10 +81,10 @@ pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx)
dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx);
spin_lock(&nfsi->vfs_inode.i_lock);
if (has_layout(nfsi) &&
- !test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state)) {
+ !test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags)) {
nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred);
__set_bit(NFS_LAYOUT_NEED_LCOMMIT,
- &nfsi->layout->state);
+ &nfsi->layout->plh_flags);
nfsi->change_attr++;
spin_unlock(&nfsi->vfs_inode.i_lock);
dprintk("%s: Set layoutcommit\n", __func__);
@@ -457,7 +457,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
bool overwrite = false;

assert_spin_locked(&lo->inode->i_lock);
- if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
+ if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags) ||
memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
overwrite = true;
else {
@@ -478,7 +478,7 @@ pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
{
dprintk("--> %s\n", __func__);
spin_lock(&lo->inode->i_lock);
- if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
+ if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags)) {
int seq;

do {
@@ -486,7 +486,7 @@ pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
memcpy(dst->data, open_state->stateid.data,
sizeof(open_state->stateid.data));
} while (read_seqretry(&open_state->seqlock, seq));
- set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+ set_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
} else
memcpy(dst->data, lo->stateid.data,
sizeof(lo->stateid.data));
@@ -534,7 +534,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
nfs4_proc_layoutget(lgp);
if (!lseg) {
/* remember that LAYOUTGET failed and suspend trying */
- set_bit(lo_fail_bit(range->iomode), &lo->state);
+ set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
}
return lseg;
}
@@ -866,7 +866,7 @@ pnfs_update_layout(struct inode *ino,
}

/* if LAYOUTGET already failed once we don't try again */
- if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
+ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
goto out_unlock;

get_layout_hdr_locked(lo); /* Matched in pnfs_layoutget_release */
@@ -875,7 +875,7 @@ pnfs_update_layout(struct inode *ino,
lseg = send_layoutget(lo, ctx, &arg);
out:
dprintk("%s end, state 0x%lx lseg %p\n", __func__,
- nfsi->layout->state, lseg);
+ nfsi->layout->plh_flags, lseg);
return lseg;
out_unlock:
spin_unlock(&ino->i_lock);
@@ -1174,7 +1174,7 @@ pnfs_layoutcommit_inode(struct inode *inode, int sync)
nfsi->layout->write_begin_pos = 0;
nfsi->layout->write_end_pos = 0;
nfsi->layout->cred = NULL;
- __clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state);
+ __clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags);
memcpy(data->args.stateid.data, nfsi->layout->stateid.data,
NFS4_STATEID_SIZE);

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 4f2c541..dabf03e 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -98,7 +98,7 @@ struct pnfs_layout_hdr {
int roc_iomode;/* return on close iomode, 0=none */
nfs4_stateid stateid;
unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
- unsigned long state;
+ unsigned long plh_flags;
struct rpc_cred *cred; /* layoutcommit credential */
/* DH: These vars keep track of the maximum write range
* so the values can be used for layoutcommit.
@@ -226,7 +226,7 @@ static inline int lo_fail_bit(u32 iomode)
static inline void pnfs_invalidate_layout_stateid(struct pnfs_layout_hdr *lo)
{
assert_spin_locked(&lo->inode->i_lock);
- clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+ clear_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
}

static inline void get_lseg(struct pnfs_layout_segment *lseg)
@@ -269,7 +269,7 @@ static inline bool
layoutcommit_needed(struct nfs_inode *nfsi)
{
return has_layout(nfsi) &&
- test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state);
+ test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags);
}

#else /* CONFIG_NFS_V4_1 */
--
1.7.2.1


2010-11-12 08:49:27

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 19/22] pnfs-submit refactor layoutcommit xdr structures

From: Andy Adamson <[email protected]>

Separate the layoutcommit operation args from the layoutcommit compound args
in preparation to add the layoutcommit operation to the close compound
when return-on-close is set and layoutcommit is needed prior to the layoutreturn.

- Move pnfs_layoutcommit_arg inode to pnfs_layoutcommit_data because it is
not needed for encode_layoutcommit.

- Move pnfs_layoutcommit_data rpc_cred to pnfs_layoutcommit_arg so that
pnfs_layoutcommit_setup can be called with pnfs_layoutcommit_arg only.

- Move layoutcommit operation fields from pnfs_layoutcommit_arg to a new
struct nfs_layoutcommit_op_args which is passed to encode_layoutcommit.

This new structure will also be used for embedded layoutcommit calls.

Remove unused fields:
- Remove unused pnfs_layoutcommit_data rpc_task.
- Remove unused pnfs_layoutcommit_arg time_modify_changed and time_modify.
- Remove unused pnfs_layoutcommit_arg void layoutdriver_data which will be
restored for the block layoutdriver.
- Remove unused sizechanged and newsize from pnfs_layoutcommit_res.

Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4proc.c | 23 +++++++++++------------
fs/nfs/nfs4xdr.c | 27 ++++++++++-----------------
fs/nfs/pnfs.c | 21 +++++++++------------
include/linux/nfs_xdr.h | 19 +++++++++----------
4 files changed, 39 insertions(+), 51 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2b47c59..09ed784 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5516,7 +5516,7 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data)
{
struct nfs4_layoutcommit_data *ldata =
(struct nfs4_layoutcommit_data *)data;
- struct nfs_server *server = NFS_SERVER(ldata->args.inode);
+ struct nfs_server *server = NFS_SERVER(ldata->inode);

if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args,
&ldata->res.seq_res, 1, task))
@@ -5529,7 +5529,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutcommit_data *data =
(struct nfs4_layoutcommit_data *)calldata;
- struct nfs_server *server = NFS_SERVER(data->args.inode);
+ struct nfs_server *server = NFS_SERVER(data->inode);

if (!nfs4_sequence_done(task, &data->res.seq_res))
return;
@@ -5546,8 +5546,8 @@ static void nfs4_layoutcommit_release(void *lcdata)
(struct nfs4_layoutcommit_data *)lcdata;

/* Matched by get_layout in pnfs_layoutcommit_inode */
- put_layout_hdr(data->args.inode);
- put_rpccred(data->cred);
+ put_layout_hdr(data->inode);
+ put_rpccred(data->args.cred);
kfree(lcdata);
}

@@ -5565,11 +5565,11 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
.rpc_argp = &data->args,
.rpc_resp = &data->res,
- .rpc_cred = data->cred,
+ .rpc_cred = data->args.cred,
};
struct rpc_task_setup task_setup_data = {
.task = &data->task,
- .rpc_client = NFS_CLIENT(data->args.inode),
+ .rpc_client = NFS_CLIENT(data->inode),
.rpc_message = &msg,
.callback_ops = &nfs4_layoutcommit_ops,
.callback_data = data,
@@ -5578,13 +5578,12 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync)
struct rpc_task *task;
int status = 0;

- dprintk("NFS: %4d initiating layoutcommit call. %llu@%llu lbw: %llu "
+ dprintk("NFS: initiating layoutcommit call. %llu@%llu lbw: %llu "
"type: %d issync %d\n",
- data->task.tk_pid,
- data->args.range.length,
- data->args.range.offset,
- data->args.lastbytewritten,
- data->args.layout_type, issync);
+ data->args.op.range.length,
+ data->args.op.range.offset,
+ data->args.op.lastbytewritten,
+ data->args.op.layout_type, issync);

task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index adb4c47..f11870e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1868,7 +1868,7 @@ encode_layoutget(struct xdr_stream *xdr,

static int
encode_layoutcommit(struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *args,
+ const struct nfs4_layoutcommit_op_args *args,
struct compound_hdr *hdr)
{
__be32 *p;
@@ -1885,14 +1885,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
*p++ = cpu_to_be32(1); /* newoffset = TRUE */
p = xdr_encode_hyper(p, args->lastbytewritten);
- *p = cpu_to_be32(args->time_modify_changed != 0);
- if (args->time_modify_changed) {
- p = reserve_space(xdr, 12);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(args->time_modify.tv_sec);
- *p = cpu_to_be32(args->time_modify.tv_nsec);
- }
-
+ *p = cpu_to_be32(0); /* nt_timechanged = FALSE */
p = reserve_space(xdr, 4);
*p = cpu_to_be32(args->layout_type);

@@ -2819,7 +2812,7 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, uint32_t *p,
encode_compound_hdr(&xdr, req, &hdr);
encode_sequence(&xdr, &args->seq_args, &hdr);
encode_putfh(&xdr, args->fh, &hdr);
- encode_layoutcommit(&xdr, args, &hdr);
+ encode_layoutcommit(&xdr, &args->op, &hdr);
encode_getfattr(&xdr, args->bitmask, &hdr);
encode_nops(&hdr);
return 0;
@@ -5308,10 +5301,10 @@ out_overflow:
return -EIO;
}

-static int decode_layoutcommit(struct xdr_stream *xdr,
- struct rpc_rqst *req,
- struct nfs4_layoutcommit_res *res)
+static int decode_layoutcommit(struct xdr_stream *xdr)
{
+ u32 sizechanged;
+ u64 newsize;
__be32 *p;
int status;

@@ -5322,13 +5315,13 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out_overflow;
- res->sizechanged = be32_to_cpup(p);
+ sizechanged = be32_to_cpup(p);

- if (res->sizechanged) {
+ if (sizechanged) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
goto out_overflow;
- xdr_decode_hyper(p, &res->newsize);
+ xdr_decode_hyper(p, &newsize);
}
return 0;
out_overflow:
@@ -6463,7 +6456,7 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, uint32_t *p,
status = decode_putfh(&xdr);
if (status)
goto out;
- status = decode_layoutcommit(&xdr, rqstp, res);
+ status = decode_layoutcommit(&xdr);
if (status)
goto out;
decode_getfattr(&xdr, res->fattr, res->server,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 76cfb11..b3f1946 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1257,21 +1257,18 @@ pnfs_layoutcommit_setup(struct inode *inode,

dprintk("--> %s\n", __func__);

- data->args.inode = inode;
+ data->inode = inode;
data->args.fh = NFS_FH(inode);
- data->args.layout_type = nfss->pnfs_curr_ld->id;
+ data->args.op.layout_type = nfss->pnfs_curr_ld->id;
data->res.fattr = &data->fattr;
nfs_fattr_init(&data->fattr);

- /* TODO: Need to determine the correct values */
- data->args.time_modify_changed = 0;
-
/* Set values from inode so it can be reset
*/
- data->args.range.iomode = IOMODE_RW;
- data->args.range.offset = write_begin_pos;
- data->args.range.length = write_end_pos - write_begin_pos + 1;
- data->args.lastbytewritten = min(write_end_pos,
+ data->args.op.range.iomode = IOMODE_RW;
+ data->args.op.range.offset = write_begin_pos;
+ data->args.op.range.length = write_end_pos - write_begin_pos + 1;
+ data->args.op.lastbytewritten = min(write_end_pos,
i_size_read(inode) - 1);
data->args.bitmask = nfss->attr_bitmask;
data->res.server = nfss;
@@ -1311,12 +1308,12 @@ pnfs_layoutcommit_inode(struct inode *inode, int sync)
*/
write_begin_pos = nfsi->layout->write_begin_pos;
write_end_pos = nfsi->layout->write_end_pos;
- data->cred = nfsi->layout->cred;
+ data->args.cred = nfsi->layout->cred;
nfsi->layout->write_begin_pos = 0;
nfsi->layout->write_end_pos = 0;
nfsi->layout->cred = NULL;
__clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags);
- memcpy(data->args.stateid.data, nfsi->layout->stateid.data,
+ memcpy(data->args.op.stateid.data, nfsi->layout->stateid.data,
NFS4_STATEID_SIZE);

/* Reference for layoutcommit matched in pnfs_layoutcommit_release */
@@ -1329,7 +1326,7 @@ pnfs_layoutcommit_inode(struct inode *inode, int sync)
write_end_pos);
if (status) {
/* The layout driver failed to setup the layoutcommit */
- put_rpccred(data->cred);
+ put_rpccred(data->args.cred);
put_layout_hdr(inode);
goto out_free;
}
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6c4ba71..851b09f 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -225,25 +225,24 @@ struct nfs4_layoutget {
struct pnfs_layout_segment **lsegpp;
};

-struct nfs4_layoutcommit_args {
+struct nfs4_layoutcommit_op_args {
nfs4_stateid stateid;
__u64 lastbytewritten;
- __u32 time_modify_changed;
- struct timespec time_modify;
- const u32 *bitmask;
- struct nfs_fh *fh;
- struct inode *inode;

/* Values set by layout driver */
struct pnfs_layout_range range;
__u32 layout_type;
- void *layoutdriver_data;
+};
+
+struct nfs4_layoutcommit_args {
+ struct nfs4_layoutcommit_op_args op;
+ const u32 *bitmask;
+ struct nfs_fh *fh;
+ struct rpc_cred *cred;
struct nfs4_sequence_args seq_args;
};

struct nfs4_layoutcommit_res {
- __u32 sizechanged;
- __u64 newsize;
struct nfs_fattr *fattr;
const struct nfs_server *server;
struct nfs4_sequence_res seq_res;
@@ -251,7 +250,7 @@ struct nfs4_layoutcommit_res {

struct nfs4_layoutcommit_data {
struct rpc_task task;
- struct rpc_cred *cred;
+ struct inode *inode;
struct nfs_fattr fattr;
struct nfs4_layoutcommit_args args;
struct nfs4_layoutcommit_res res;
--
1.7.2.1


2010-11-15 19:35:08

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [nfsv4] [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

On 11/15/2010 07:53 PM, Fred Isaman wrote:
> On Mon, Nov 15, 2010 at 11:17 AM, Benny Halevy <[email protected]> wrote:
>> On 2010-11-15 16:51, Fred Isaman wrote:
>>> On Sun, Nov 14, 2010 at 10:43 AM, Benny Halevy <[email protected]> wrote:
>>>>
>>>> Using the open stateid after forgetting the layout could be a protocol bug,
>>>> or at least it falls into undefined territories.
>>>>
>>>> The RFC says:
>>>>
>>>> The loga_stateid field specifies a valid stateid. If a layout is not
>>>> currently held by the client, the loga_stateid field represents a
>>>> stateid reflecting the correspondingly valid open, byte-range lock,
>>>> or delegation stateid. Once a layout is held on the file by the
>>>> client, the loga_stateid field MUST be a stateid as returned from a
>>>> previous LAYOUTGET or LAYOUTRETURN operation or provided by a
>>>> CB_LAYOUTRECALL operation (see Section 12.5.3).
>>>>
>>>> So the question is does the text above refer to the client view of the state or to
>>>> the server's view.
>>>> In other words, with the forgetful client model, when the client unilaterally forgets
>>>> the layout without letting the server know about it (no LAYOUTRETURN was sent),
>>>> does it mean "a layout is not currently held by the client"?
>>>>
>>>
>>> I would argue that yes, this is in fact what it means.
>>>
>>> It seems the server has two options when confronted with an
>>> openstateid. Either interpret this as a declaration by the client
>>> that it has forgotten all previous layouts and behave appropriately
>>> (wipe any layout state assigned to the file and create a new
>>> layoutstateid), or assume this is part of parallel spew of
>>> LAYOUTGET(openstateid) and try to use an existing layout state with
>>> the appropriate (possibly not one) seqid. I argue that, as the spec
>>> stands, the second option is not really a choice, because the first
>>> option exists. If a client using the second option encounters a
>>> server using the first, bad things happen. The client will issue
>>> multiple LAYOUTGET(openstateids), the server will, upon seeing each,
>>> discard any previous state and return a new state with seqid=1, with
>>
>> Is this the specified behavior?
>>
>>> the final valid state being that of whichever one was processed last.
>>> The client will see all the OK returns, and not have any easy method
>>> of determining which is the one that the server considers valid.
>>>
>>> Thus I claim that, because of the forgetful model, the client must
>>> serialize its LAYOUTGET(openstateid) calls.
>>>
>>
>> I disagree. LAYOUTGET(openstateid) should be no different than
>> any other layout stateid and the client should be able to send multiple
>> such LAYOUTGETs *initially* (and only initially). The server can process
>> these as any other LAYOUTGET with the sequenceid rules assuming seqid==0
>> (which is disallowed otherwise)
>>
>>>> The server will see a LAYOUTGET with an open/lock/deleg stateid in this case
>>>> while it still thinks that the client is holding a layout.
>>>> Since this could normally happen if the client sends multiple LAYOUTGETs in
>>>> parallel before it received any layout stateid the server should allow it
>>>> within the VALID_SEQID_RANGE constraints (see 12.5.5.2.1.4, although it is
>>>> not explicitly called out there), otherwise, it seems like the server is supposed
>>>> to return NFS4ERR_OLD_STATEID.
>>>>
>>>> Strictly reading the spec, the client should use the most recent layout stateid
>>>> even in the forgetful model, until it gets a LAYOUTRETURN reply with lrs_present==false
>>>> or until it replies NFS4ERR_NOMATCHING_LAYOUT to CB_LAYOUTRECALL with
>>>> clora_iomode==LAYOUTIOMODE4_ANY or other values where the client never dropped
>>>> a layout (did I say recently how much I hate the forgetful model which introduces
>>>> more corner cases rather than simplifying the protocol as it was supposed to do? ;-)
>>>>
>>>
>>> Strict reading again depends on whose point of view, client or server...
>>>
>>> "Once a client has no more layouts on a file, the layout stateid is no
>>> longer valid and MUST NOT be used. Any attempt to use such a layout
>>> stateid will result in NFS4ERR_BAD_STATEID."
>>
>> In NFSv4.1 the server decides about stateids. It's not up to the client
>> to throw away the stateid and revert to the initial stateid.
>> It must send an appropriate LAYOUTRETURN and get lrs_present==false
>> to do that and then it can be sure its layout state for the file is synchronized
>> with the server's.
>>
>> Benny
>>
>
> I actually agree that your method is better. I merely disagree that
> the spec as is allows it. Another quote:
>
> "When a client has no layout on a file, it MUST present an open stateid...".
>
> The problem is that the spec is currently not clear about how the
> forgetful model interacts with sending openstateids, particularly with
> multiple parallel LAYOUTGETs. If a server implementor assumes the
> client can silently forget its layouts, then later send a
> LAYOUTGET(openstateid),

No, the spec does not say that, and the Server is not to assume a
forgetful client, ever. The first and only time the Server is to encounter
a forgetful client is when NOMATCHING_LAYOUT is returned from a callback.
Until then the Server gave out a layout and assumes the client has it.
If a client sends a LAYOUTGET(openstate) outside the VALID_SEQID_RANGE,
it will be returned an error. So the forgetful client cannot be all that
forgetful: it must remember its stateid, though it is free not to use
these old segments and to ask for new ones (and return NOMATCHING on recalls).

I agree with you that you have exposed the exact logical contradiction
of the forgetful model, and why it is really stupid. (The faster we are
to return NOMATCHING to the "forgetful model", the better off we'll be ;-))

> which seems to be what the spec currently
> says, then we get potential problems that can only be avoided if the
> client serializes the LAYOUTGET(openstate) calls.
>

Given the above, the Server cannot do that; hence the client is now
able to actually take advantage of the concurrency inherent in the STD
and the VALID_SEQID_RANGE model.

> If you want your behavior, where the client is expected to remember
> the layout stateid even after forgetting the layouts, I think an
> errata is needed.
>

I don't think so. Once you realize that there is only a single point
in time at which the server "assumes" forgetfulness, i.e. at
recall=>NOMATCHING, the picture changes.

Boaz
> Fred
>
>
>>>
>>>
>>> Fred
>>>
>>>> Benny

2010-11-12 08:49:25

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 10/22] pnfs-submit: Have LAYOUTGETS wait when lo->plh_block_lgets is set

Preparing for changes to come, change blocking mechanism of LAYOUTGET.

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/inode.c | 3 ++-
fs/nfs/nfs4proc.c | 23 ++++++-----------------
fs/nfs/pnfs.c | 11 +++++++++++
fs/nfs/pnfs.h | 2 ++
include/linux/nfs_fs.h | 1 +
5 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 72f27cc..8727ade 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1459,7 +1459,8 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
nfsi->delegation = NULL;
nfsi->delegation_state = 0;
init_rwsem(&nfsi->rwsem);
- rpc_init_wait_queue(&nfsi->lo_rpcwaitq, "pNFS Layout");
+ rpc_init_wait_queue(&nfsi->lo_rpcwaitq, "pNFS Layoutreturn");
+ rpc_init_wait_queue(&nfsi->lo_rpcwaitq_stateid, "pNFS Layoutstateid");
nfsi->layout = NULL;
#endif
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dbd711..5ccde2a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5346,30 +5346,19 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
struct inode *ino = lgp->args.inode;
struct nfs_inode *nfsi = NFS_I(ino);
struct nfs_server *server = NFS_SERVER(ino);
- struct pnfs_layout_segment *lseg;

dprintk("--> %s\n", __func__);
spin_lock(&ino->i_lock);
- lseg = pnfs_has_layout(nfsi->layout, &lgp->args.range);
- if (likely(!lseg)) {
+ if (pnfs_layoutgets_blocked(nfsi->layout)) {
+ rpc_sleep_on(&nfsi->lo_rpcwaitq_stateid, task, NULL);
spin_unlock(&ino->i_lock);
- dprintk("%s: no lseg found, proceeding\n", __func__);
- if (!nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
- &lgp->res.seq_res, 0, task))
- rpc_call_start(task);
return;
}
- if (!lseg->valid) {
- spin_unlock(&ino->i_lock);
- dprintk("%s: invalid lseg found, waiting\n", __func__);
- rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
- return;
- }
- get_lseg(lseg);
- *lgp->lsegpp = lseg;
spin_unlock(&ino->i_lock);
- dprintk("%s: valid lseg found, no rpc required\n", __func__);
- rpc_exit(task, NFS4_OK);
+ if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
+ &lgp->res.seq_res, 0, task))
+ return;
+ rpc_call_start(task);
}

static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8e699fd..120590b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -565,6 +565,9 @@ pnfs_layoutreturn_release(struct nfs4_layoutreturn *lrp)
lo = NFS_I(lrp->args.inode)->layout;
spin_lock(&lrp->args.inode->i_lock);
pnfs_clear_lseg_list(lo, &tmp_list, &lrp->args.range);
+ lo->plh_block_lgets--;
+ if (!pnfs_layoutgets_blocked(lo))
+ rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
if (!lrp->res.valid)
; /* forgetful model internal release */
else if (!lrp->res.lrs_present)
@@ -638,6 +641,7 @@ _pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range,
goto out;
}

+ lo->plh_block_lgets++;
/* Reference matched in pnfs_layoutreturn_release */
get_layout_hdr_locked(lo);

@@ -870,6 +874,13 @@ out_unlock:
goto out;
}

+bool
+pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo)
+{
+ assert_spin_locked(&lo->inode->i_lock);
+ return lo->plh_block_lgets;
+}
+
int
pnfs_layout_process(struct nfs4_layoutget *lgp)
{
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index ee5a752..a9a3bea 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -97,6 +97,7 @@ struct pnfs_layout_hdr {
struct list_head segs; /* layout segments list */
int roc_iomode;/* return on close iomode, 0=none */
nfs4_stateid stateid;
+ unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
unsigned long state;
struct rpc_cred *cred; /* layoutcommit credential */
/* DH: These vars keep track of the maximum write range
@@ -200,6 +201,7 @@ enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *,
void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
struct nfs_open_context *, struct list_head *);
void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
+bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_layoutreturn_release(struct nfs4_layoutreturn *lpr);
void pnfs_destroy_layout(struct nfs_inode *);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d8bfa42..061d81a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -191,6 +191,7 @@ struct nfs_inode {

/* pNFS layout information */
struct rpc_wait_queue lo_rpcwaitq;
+ struct rpc_wait_queue lo_rpcwaitq_stateid;
struct pnfs_layout_hdr *layout;
#endif /* CONFIG_NFS_V4*/
#ifdef CONFIG_NFS_FSCACHE
--
1.7.2.1


2010-11-12 08:49:25

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 15/22] pnfs-submit: change pnfs_layout_hdr refcount to atomic_t

This is needed because we need to increment the refcount outside
of the i_lock. In particular, we will need to scan cl_layouts
while holding cl_lock, and grab reference of each lo found.

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 50 +++++++++++++++++++++++++++++---------------------
fs/nfs/pnfs.h | 2 +-
2 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 5227d51..07b04e8 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -232,34 +232,42 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
* pNFS client layout cache
*/

+/* Need to hold i_lock if caller does not already hold reference */
static void
-get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+get_layout_hdr(struct pnfs_layout_hdr *lo)
{
- assert_spin_locked(&lo->inode->i_lock);
- lo->refcount++;
+ atomic_inc(&lo->plh_refcount);
+ smp_mb__after_atomic_inc();
+}
+
+static void
+destroy_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ dprintk("%s: freeing layout cache %p\n", __func__, lo);
+ BUG_ON(!list_empty(&lo->layouts));
+ NFS_I(lo->inode)->layout = NULL;
+ kfree(lo);
}

static void
put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
{
assert_spin_locked(&lo->inode->i_lock);
- BUG_ON(lo->refcount == 0);
-
- lo->refcount--;
- if (!lo->refcount) {
- dprintk("%s: freeing layout cache %p\n", __func__, lo);
- BUG_ON(!list_empty(&lo->layouts));
- NFS_I(lo->inode)->layout = NULL;
- kfree(lo);
- }
+ BUG_ON(atomic_read(&lo->plh_refcount) == 0);
+ if (atomic_dec_and_test(&lo->plh_refcount))
+ destroy_layout_hdr(lo);
}

void
put_layout_hdr(struct inode *inode)
{
- spin_lock(&inode->i_lock);
- put_layout_hdr_locked(NFS_I(inode)->layout);
- spin_unlock(&inode->i_lock);
+ struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
+
+ BUG_ON(atomic_read(&lo->plh_refcount) == 0);
+ if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+ destroy_layout_hdr(lo);
+ spin_unlock(&inode->i_lock);
+ }
}

static void
@@ -413,7 +421,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
pnfs_clear_lseg_list(lo, &tmp_list, &range);
WARN_ON(!list_empty(&nfsi->layout->segs));
WARN_ON(!list_empty(&nfsi->layout->layouts));
- WARN_ON(nfsi->layout->refcount != 1);
+ WARN_ON(atomic_read(&nfsi->layout->plh_refcount) != 1);

/* Matched by refcount set to 1 in alloc_init_layout_hdr */
put_layout_hdr_locked(lo);
@@ -651,7 +659,7 @@ _pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range,
if (should_free_lseg(&lseg->range, &arg))
mark_lseg_invalid(lseg, &tmp_list);
/* Reference matched in nfs4_layoutreturn_release */
- get_layout_hdr_locked(lo);
+ get_layout_hdr(lo);
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);

@@ -732,7 +740,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
__func__, lseg, lseg->range.iomode,
lseg->range.offset, lseg->range.length);
}
- get_layout_hdr_locked(lo);
+ get_layout_hdr(lo);

dprintk("%s:Return\n", __func__);
}
@@ -745,7 +753,7 @@ alloc_init_layout_hdr(struct inode *ino)
lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
if (!lo)
return NULL;
- lo->refcount = 1;
+ atomic_set(&lo->plh_refcount, 1);
INIT_LIST_HEAD(&lo->layouts);
INIT_LIST_HEAD(&lo->segs);
lo->inode = ino;
@@ -869,7 +877,7 @@ pnfs_update_layout(struct inode *ino,
if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
goto out_unlock;

- get_layout_hdr_locked(lo); /* Matched in pnfs_layoutget_release */
+ get_layout_hdr(lo); /* Matched in pnfs_layoutget_release */
spin_unlock(&ino->i_lock);

lseg = send_layoutget(lo, ctx, &arg);
@@ -1179,7 +1187,7 @@ pnfs_layoutcommit_inode(struct inode *inode, int sync)
NFS4_STATEID_SIZE);

/* Reference for layoutcommit matched in pnfs_layoutcommit_release */
- get_layout_hdr_locked(NFS_I(inode)->layout);
+ get_layout_hdr(NFS_I(inode)->layout);

spin_unlock(&inode->i_lock);

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index dabf03e..891aeab 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -92,7 +92,7 @@ struct pnfs_layoutdriver_type {
};

struct pnfs_layout_hdr {
- unsigned long refcount;
+ atomic_t plh_refcount;
struct list_head layouts; /* other client layouts */
struct list_head segs; /* layout segments list */
int roc_iomode;/* return on close iomode, 0=none */
--
1.7.2.1


2010-11-12 08:49:24

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 04/22] pnfs-submit: argument to should_free_lseg changed from lseg to range

We only use lseg->range, and we will need the function for
the callback code, where we have only the range, and
not an enclosing lseg.

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 14 +++++++-------
1 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 149f95e..ec291d3 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -316,11 +316,11 @@ EXPORT_SYMBOL_GPL(put_lseg);
* READ RW false
*/
static int
-should_free_lseg(struct pnfs_layout_segment *lseg,
- struct pnfs_layout_range *range)
+should_free_lseg(struct pnfs_layout_range *lseg_range,
+ struct pnfs_layout_range *recall_range)
{
- return (range->iomode == IOMODE_ANY ||
- lseg->range.iomode == range->iomode);
+ return (recall_range->iomode == IOMODE_ANY ||
+ lseg_range->iomode == recall_range->iomode);
}

static bool
@@ -340,7 +340,7 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,

assert_spin_locked(&lo->inode->i_lock);
list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
- if (!should_free_lseg(lseg, range) ||
+ if (!should_free_lseg(&lseg->range, range) ||
!_pnfs_can_return_lseg(lseg))
continue;
dprintk("%s: freeing lseg %p iomode %d "
@@ -546,7 +546,7 @@ has_layout_to_return(struct pnfs_layout_hdr *lo,

assert_spin_locked(&lo->inode->i_lock);
list_for_each_entry(lseg, &lo->segs, fi_list)
- if (should_free_lseg(lseg, range)) {
+ if (should_free_lseg(&lseg->range, range)) {
out = lseg;
break;
}
@@ -564,7 +564,7 @@ pnfs_return_layout_barrier(struct nfs_inode *nfsi,

spin_lock(&nfsi->vfs_inode.i_lock);
list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) {
- if (!should_free_lseg(lseg, range))
+ if (!should_free_lseg(&lseg->range, range))
continue;
lseg->valid = false;
if (!_pnfs_can_return_lseg(lseg)) {
--
1.7.2.1


2010-11-12 08:49:24

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 09/22] pnfs-submit: change pnfs_layout_segment refcounting from kref to atomic_t

Preparing for changes in pnfs_clear_lseg_list

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 37 ++++++++++++++-----------------------
fs/nfs/pnfs.h | 5 +++--
2 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 519055d..8e699fd 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -266,41 +266,32 @@ static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
{
INIT_LIST_HEAD(&lseg->fi_list);
- kref_init(&lseg->kref);
+ atomic_set(&lseg->pls_refcount, 1);
+ smp_mb();
lseg->valid = true;
lseg->layout = lo;
}

-/* Called without i_lock held, as the free_lseg call may sleep */
-static void
-destroy_lseg(struct kref *kref)
-{
- struct pnfs_layout_segment *lseg =
- container_of(kref, struct pnfs_layout_segment, kref);
- struct inode *ino = lseg->layout->inode;
-
- dprintk("--> %s\n", __func__);
- NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
- /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
- put_layout_hdr(ino);
-}
-
void
put_lseg(struct pnfs_layout_segment *lseg)
{
bool do_wake_up;
- struct nfs_inode *nfsi;
+ struct inode *ino;

if (!lseg)
return;

dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
- atomic_read(&lseg->kref.refcount), lseg->valid);
+ atomic_read(&lseg->pls_refcount), lseg->valid);
do_wake_up = !lseg->valid;
- nfsi = NFS_I(lseg->layout->inode);
- kref_put(&lseg->kref, destroy_lseg);
+ ino = lseg->layout->inode;
+ if (atomic_dec_and_test(&lseg->pls_refcount)) {
+ NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
+ put_layout_hdr(ino);
+ }
if (do_wake_up)
- rpc_wake_up(&nfsi->lo_rpcwaitq);
+ rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq);
}
EXPORT_SYMBOL_GPL(put_lseg);

@@ -326,7 +317,7 @@ should_free_lseg(struct pnfs_layout_range *lseg_range,
static bool
_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg)
{
- return atomic_read(&lseg->kref.refcount) == 1;
+ return atomic_read(&lseg->pls_refcount) == 1;
}

static void
@@ -554,7 +545,7 @@ pnfs_return_layout_barrier(struct nfs_inode *nfsi,
if (!_pnfs_can_return_lseg(lseg)) {
dprintk("%s: wait on lseg %p refcount %d\n",
__func__, lseg,
- atomic_read(&lseg->kref.refcount));
+ atomic_read(&lseg->pls_refcount));
ret = true;
}
}
@@ -812,7 +803,7 @@ pnfs_has_layout(struct pnfs_layout_hdr *lo,
}

dprintk("%s:Return lseg %p ref %d valid %d\n",
- __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0,
+ __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0,
ret ? ret->valid : 0);
return ret;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index fdcfb9b..ee5a752 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -35,7 +35,7 @@
struct pnfs_layout_segment {
struct list_head fi_list;
struct pnfs_layout_range range;
- struct kref kref;
+ atomic_t pls_refcount;
bool valid;
struct pnfs_layout_hdr *layout;
};
@@ -228,7 +228,8 @@ static inline void pnfs_invalidate_layout_stateid(struct pnfs_layout_hdr *lo)

static inline void get_lseg(struct pnfs_layout_segment *lseg)
{
- kref_get(&lseg->kref);
+ atomic_inc(&lseg->pls_refcount);
+ smp_mb__after_atomic_inc();
}

/* Return true if a layout driver is being used for this mountpoint */
--
1.7.2.1


2010-11-12 08:49:23

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 02/22] pnfs-submit: remove unnecessary field lgp->status

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4proc.c | 3 +--
include/linux/nfs_xdr.h | 1 -
2 files changed, 1 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ce322e5..d01068c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5395,7 +5395,6 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
return;
}
}
- lgp->status = task->tk_status;
dprintk("<-- %s\n", __func__);
}

@@ -5451,7 +5450,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
status = nfs4_wait_for_completion_rpc_task(task);
if (status != 0)
goto out;
- status = lgp->status;
+ status = task->tk_status;
if (status != 0)
goto out;
status = pnfs_layout_process(lgp);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 1ff6cb0..53a4d2f 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -223,7 +223,6 @@ struct nfs4_layoutget {
struct nfs4_layoutget_args args;
struct nfs4_layoutget_res res;
struct pnfs_layout_segment **lsegpp;
- int status;
};

struct nfs4_layoutcommit_args {
--
1.7.2.1


2010-11-15 17:53:32

by Fred Isaman

[permalink] [raw]
Subject: Re: [nfsv4] [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

On Mon, Nov 15, 2010 at 11:17 AM, Benny Halevy <[email protected]> wrote:
> On 2010-11-15 16:51, Fred Isaman wrote:
>> On Sun, Nov 14, 2010 at 10:43 AM, Benny Halevy <[email protected]> wrote:
>>>
>>> Using the open stateid after forgetting the layout could be a protocol bug,
>>> or at least it falls into undefined territories.
>>>
>>> The RFC says:
>>>
>>>    The loga_stateid field specifies a valid stateid.  If a layout is not
>>>    currently held by the client, the loga_stateid field represents a
>>>    stateid reflecting the correspondingly valid open, byte-range lock,
>>>    or delegation stateid.  Once a layout is held on the file by the
>>>    client, the loga_stateid field MUST be a stateid as returned from a
>>>    previous LAYOUTGET or LAYOUTRETURN operation or provided by a
>>>    CB_LAYOUTRECALL operation (see Section 12.5.3).
>>>
>>> So the question is does the text above refer to the client view of the state or to
>>> the server's view.
>>> In other words, with the forgetful client model, when the client unilaterally forgets
>>> the layout without letting the server know about it (no LAYOUTRETURN was sent),
>>> does it mean "a layout is not currently held by the client"?
>>>
>>
>> I would argue that yes, this is in fact what it means.
>>
>> It seems the server has two options when confronted with an
>> openstateid.  Either interpret this as a declaration by the client
>> that it has forgotten all previous layouts and behave appropriately
>> (wipe any layout state assigned to the file and create a new
>> layoutstateid), or assume this is part of parallel spew of
>> LAYOUTGET(openstateid) and try to use an existing layout state with
>> the appropriate (possibly not one) seqid.  I argue that, as the spec
>> stands, the second option is not really a choice, because the first
>> option exists.  If a client using the second option encounters a
>> server using the first, bad things happen.  The client will issue
>> multiple LAYOUTGET(openstateids), the server will, upon seeing each,
>> discard any previous state and return a new state with seqid=1, with
>
> Is this the specified behavior?
>
>> the final valid state being that of whichever one was processed last.
>> The client will see all the OK returns, and not have any easy method
>> of determining which is the one that the server considers valid.
>>
>> Thus I claim that, because of the forgetful model, the client must
>> serialize its LAYOUTGET(openstateid) calls.
>>
>
> I disagree. LAYOUTGET(openstateid) should be no different than
> any other layout stateid and the client should be able to send multiple
> such LAYOUTGETs *initially* (and only initially).  The server can process
> these as any other LAYOUTGET with the sequenceid rules assuming seqid==0
> (which is disallowed otherwise)
>
>>> The server will see a LAYOUTGET with an open/lock/deleg stateid in this case
>>> while it still thinks that the client is holding a layout.
>>> Since this could normally happen if the client sends multiple LAYOUTGETs in
>>> parallel before it received any layout stateid the server should allow it
>>> within the VALID_SEQID_RANGE constraints (see 12.5.5.2.1.4, although it is
>>> not explicitly called out there), otherwise, it seems like the server is supposed
>>> to return NFS4ERR_OLD_STATEID.
>>>
>>> Strictly reading the spec, the client should use the most recent layout stateid
>>> even in the forgetful model, until it gets a LAYOUTRETURN reply with lrs_present==false
>>> or until it replies NFS4ERR_NOMATCHING_LAYOUT to CB_LAYOUTRECALL with
>>> clora_iomode==LAYOUTIOMODE4_ANY or other values where the client never dropped
>>> a layout (did I say recently how much I hate the forgetful model which introduces
>>> more corner cases rather than simplifying the protocol as it was supposed to do? ;-)
>>>
>>
>> Strict reading again depends on whose point of view, client or server...
>>
>> "Once a client has no more layouts on a file, the layout stateid is no
>> longer valid and MUST NOT be used.  Any attempt to use such a layout
>> stateid will result in NFS4ERR_BAD_STATEID."
>
> In NFSv4.1 the server decides about stateids. It's not up to the client
> to throw away the stateid and revert to the initial stateid.
> It must send an appropriate LAYOUTRETURN and get lrs_present==false
> to do that and then it can be sure its layout state for the file is synchronized
> with the server's.
>
> Benny
>

I actually agree that your method is better. I merely disagree that
the spec as is allows it. Another quote:

"When a client has no layout on a file, it MUST present an open stateid...".

The problem is that the spec is currently not clear about how the
forgetful model interacts with sending openstateids, particularly with
multiple parallel LAYOUTGETs. If a server implementor assumes the
client can silently forget its layouts, then later send a
LAYOUTGET(openstateid), which seems to be what the spec currently
says, then we get potential problems that can only be avoided if the
client serializes the LAYOUTGET(openstate) calls.

If you want your behavior, where the client is expected to remember
the layout stateid even after forgetting the layouts, I think an
errata is needed.

Fred


>>
>>
>> Fred
>>
>>> Benny
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>>> the body of a message to [email protected]
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
> _______________________________________________
> nfsv4 mailing list
> [email protected]
> https://www.ietf.org/mailman/listinfo/nfsv4
>

2010-11-15 15:34:57

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH] SQUASHME: pnfs-submit: encode layoutreturn on close before close

On 2010-11-15 17:02, Fred Isaman wrote:
> On Sun, Nov 14, 2010 at 9:21 AM, Benny Halevy <[email protected]> wrote:
>> And handle errors from layoutcommit and layoutreturn on the reply path.
>>
>> Signed-off-by: Benny Halevy <[email protected]>
>> ---
>> fs/nfs/nfs4xdr.c | 35 ++++++++++++++++++-----------------
>> fs/nfs/pnfs.c | 1 +
>> 2 files changed, 19 insertions(+), 17 deletions(-)
>>
>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>> index 1804f35..0e6e5e4 100644
>> --- a/fs/nfs/nfs4xdr.c
>> +++ b/fs/nfs/nfs4xdr.c
>> @@ -441,17 +441,17 @@ static int nfs4_stat_to_errno(int);
>> #define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \
>> encode_sequence_maxsz + \
>> encode_putfh_maxsz + \
>> - encode_close_maxsz + \
>> - encode_getattr_maxsz + \
>> + encode_layoutcommit_maxsz + \
>> encode_layoutreturn_maxsz + \
>> - encode_layoutcommit_maxsz)
>> + encode_close_maxsz + \
>> + encode_getattr_maxsz)
>> #define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \
>> decode_sequence_maxsz + \
>> decode_putfh_maxsz + \
>> - decode_close_maxsz + \
>> - decode_getattr_maxsz + \
>> + decode_layoutcommit_maxsz + \
>> decode_layoutreturn_maxsz + \
>> - decode_layoutcommit_maxsz)
>> + decode_close_maxsz + \
>> + decode_getattr_maxsz)
>> #define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
>> encode_sequence_maxsz + \
>> encode_putfh_maxsz + \
>> @@ -2160,10 +2160,10 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closea
>> encode_putfh(&xdr, args->fh, &hdr);
>> if (args->op_bitmask & NFS4_HAS_LAYOUTCOMMIT) /* layoutcommit set */
>> encode_layoutcommit(&xdr, &args->lc_args, &hdr);
>> - encode_close(&xdr, args, &hdr);
>> - encode_getfattr(&xdr, args->bitmask, &hdr);
>> if (args->op_bitmask & NFS4_HAS_LAYOUTRETURN) /* layoutreturn set */
>> encode_layoutreturn(&xdr, &args->lr_args, &hdr);
>> + encode_close(&xdr, args, &hdr);
>> + encode_getfattr(&xdr, args->bitmask, &hdr);
>> encode_nops(&hdr);
>> return 0;
>> }
>> @@ -5743,9 +5743,16 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
>> status = decode_putfh(&xdr);
>> if (status)
>> goto out;
>> - /* We pay no attention to the layoutcommit return */
>> - if (res->op_bitmask & NFS4_HAS_LAYOUTCOMMIT)
>> - decode_layoutcommit(&xdr);
>> + if (res->op_bitmask & NFS4_HAS_LAYOUTCOMMIT) {
>> + status = decode_layoutcommit(&xdr);
>> + if (status)
>> + goto out;
>> + }
>> + if (res->op_bitmask & NFS4_HAS_LAYOUTRETURN) {
>> + status = decode_layoutreturn(&xdr, &res->lr_res);
>> + if (status)
>> + goto out;
>
> What prevents infinite loop here? With LAYOUTCOMMIT, the inode data
> is cleared so that on retry it will not be called. I see no
> comparable "pre-cleaning" done for LAYOUTRETURN.
>

pnfs_roc will find no layout segment to return the second time around.

Benny

> Fred
>
>> + }
>> status = decode_close(&xdr, res);
>> if (status != 0)
>> goto out;
>> @@ -5757,12 +5764,6 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
>> */
>> decode_getfattr(&xdr, res->fattr, res->server,
>> !RPC_IS_ASYNC(rqstp->rq_task));
>> - /*
>> - * With the forgetful model, we pay no attention to the
>> - * layoutreturn status.
>> - */
>> - if (res->op_bitmask & NFS4_HAS_LAYOUTRETURN)
>> - decode_layoutreturn(&xdr, &res->lr_res);
>> out:
>> return status;
>> }
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 15673d0..90a868b 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -640,6 +640,7 @@ pnfs_roc(struct nfs4_closedata *data)
>> LIST_HEAD(tmp_list);
>> bool found = false;
>>
>> + data->arg.op_bitmask = data->res.op_bitmask = 0;
>> spin_lock(&data->inode->i_lock);
>> lo = NFS_I(data->inode)->layout;
>> if (!lo || lo->roc_iomode == 0 ||
>> --
>> 1.7.2.3
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>

2010-11-12 08:49:25

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 12/22] pnfs_submit: nfs4_layoutreturn_release should not reference results

Since the release function may be called without sending any RPC,
it must not refer to any of the result fields. This is
better accomplished in the rpc_done function.

In the process, this basically reimplements the commit
"pnfs: do not change layout stateid when dropping layouts."

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4proc.c | 24 ++++++++++++++++++++++--
fs/nfs/nfs4xdr.c | 1 -
fs/nfs/pnfs.c | 26 ++------------------------
fs/nfs/pnfs.h | 3 ++-
include/linux/nfs_xdr.h | 1 -
5 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5ccde2a..fe79872 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5578,9 +5578,20 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
server = NFS_SERVER(lrp->args.inode);
else
server = NULL;
- if (nfs4_async_handle_error(task, server, NULL, lrp->clp) == -EAGAIN)
+ if (nfs4_async_handle_error(task, server, NULL, lrp->clp) == -EAGAIN) {
nfs_restart_rpc(task, lrp->clp);
+ return;
+ }
+ if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE)) {
+ struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;

+ spin_lock(&lo->inode->i_lock);
+ if (lrp->res.lrs_present)
+ pnfs_set_layout_stateid(lo, &lrp->res.stateid);
+ else
+ pnfs_invalidate_layout_stateid(lo);
+ spin_unlock(&lo->inode->i_lock);
+ }
dprintk("<-- %s\n", __func__);
}

@@ -5589,8 +5600,17 @@ static void nfs4_layoutreturn_release(void *calldata)
struct nfs4_layoutreturn *lrp = calldata;

dprintk("--> %s return_type %d\n", __func__, lrp->args.return_type);
+ if (lrp->args.return_type == RETURN_FILE) {
+ struct inode *ino = lrp->args.inode;
+ struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;

- pnfs_layoutreturn_release(lrp);
+ spin_lock(&ino->i_lock);
+ lo->plh_block_lgets--;
+ if (!pnfs_layoutgets_blocked(lo))
+ rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
+ spin_unlock(&ino->i_lock);
+ put_layout_hdr(lrp->args.inode);
+ }
kfree(calldata);
dprintk("<-- %s\n", __func__);
}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 550e457..328cca5 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5287,7 +5287,6 @@ static int decode_layoutreturn(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out_overflow;
- res->valid = true;
res->lrs_present = be32_to_cpup(p);
if (res->lrs_present)
status = decode_stateid(xdr, &res->stateid);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 1147eb3..353c674 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -449,7 +449,7 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
*
* lo->stateid could be the open stateid, in which case we just use what given.
*/
-static void
+void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new)
{
@@ -580,28 +580,6 @@ pnfs_return_layout_barrier(struct nfs_inode *nfsi,
return ret;
}

-void
-pnfs_layoutreturn_release(struct nfs4_layoutreturn *lrp)
-{
- struct pnfs_layout_hdr *lo;
-
- if (lrp->args.return_type != RETURN_FILE)
- return;
- lo = NFS_I(lrp->args.inode)->layout;
- spin_lock(&lrp->args.inode->i_lock);
- lo->plh_block_lgets--;
- if (!pnfs_layoutgets_blocked(lo))
- rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
- if (!lrp->res.valid)
- ; /* forgetful model internal release */
- else if (!lrp->res.lrs_present)
- pnfs_invalidate_layout_stateid(lo);
- else
- pnfs_set_layout_stateid(lo, &lrp->res.stateid);
- put_layout_hdr_locked(lo); /* Matched in _pnfs_return_layout */
- spin_unlock(&lrp->args.inode->i_lock);
-}
-
static int
return_layout(struct inode *ino, struct pnfs_layout_range *range,
enum pnfs_layoutreturn_type type, struct pnfs_layout_hdr *lo,
@@ -672,7 +650,7 @@ _pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range,
list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
if (should_free_lseg(&lseg->range, &arg))
mark_lseg_invalid(lseg, &tmp_list);
- /* Reference matched in pnfs_layoutreturn_release */
+ /* Reference matched in nfs4_layoutreturn_release */
get_layout_hdr_locked(lo);
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index a9a3bea..4f2c541 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -203,10 +203,11 @@ void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
-void pnfs_layoutreturn_release(struct nfs4_layoutreturn *lpr);
void pnfs_destroy_layout(struct nfs_inode *);
void pnfs_destroy_all_layouts(struct nfs_client *);
void put_layout_hdr(struct inode *inode);
+void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *new);
void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
struct nfs4_state *open_state);

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 23a4519..f472405 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -269,7 +269,6 @@ struct nfs4_layoutreturn_args {

struct nfs4_layoutreturn_res {
struct nfs4_sequence_res seq_res;
- bool valid; /* internal, true if received reply */
u32 lrs_present;
nfs4_stateid stateid;
};
--
1.7.2.1


2010-11-14 14:22:03

by Benny Halevy

[permalink] [raw]
Subject: [PATCH] SQUASHME: pnfs-submit: encode layoutreturn on close before close

And handle errors from layoutcommit and layoutreturn on the reply path.

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4xdr.c | 35 ++++++++++++++++++-----------------
fs/nfs/pnfs.c | 1 +
2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1804f35..0e6e5e4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -441,17 +441,17 @@ static int nfs4_stat_to_errno(int);
#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
- encode_close_maxsz + \
- encode_getattr_maxsz + \
+ encode_layoutcommit_maxsz + \
encode_layoutreturn_maxsz + \
- encode_layoutcommit_maxsz)
+ encode_close_maxsz + \
+ encode_getattr_maxsz)
#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_putfh_maxsz + \
- decode_close_maxsz + \
- decode_getattr_maxsz + \
+ decode_layoutcommit_maxsz + \
decode_layoutreturn_maxsz + \
- decode_layoutcommit_maxsz)
+ decode_close_maxsz + \
+ decode_getattr_maxsz)
#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -2160,10 +2160,10 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closea
encode_putfh(&xdr, args->fh, &hdr);
if (args->op_bitmask & NFS4_HAS_LAYOUTCOMMIT) /* layoutcommit set */
encode_layoutcommit(&xdr, &args->lc_args, &hdr);
- encode_close(&xdr, args, &hdr);
- encode_getfattr(&xdr, args->bitmask, &hdr);
if (args->op_bitmask & NFS4_HAS_LAYOUTRETURN) /* layoutreturn set */
encode_layoutreturn(&xdr, &args->lr_args, &hdr);
+ encode_close(&xdr, args, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
encode_nops(&hdr);
return 0;
}
@@ -5743,9 +5743,16 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
status = decode_putfh(&xdr);
if (status)
goto out;
- /* We pay no attention to the layoutcommit return */
- if (res->op_bitmask & NFS4_HAS_LAYOUTCOMMIT)
- decode_layoutcommit(&xdr);
+ if (res->op_bitmask & NFS4_HAS_LAYOUTCOMMIT) {
+ status = decode_layoutcommit(&xdr);
+ if (status)
+ goto out;
+ }
+ if (res->op_bitmask & NFS4_HAS_LAYOUTRETURN) {
+ status = decode_layoutreturn(&xdr, &res->lr_res);
+ if (status)
+ goto out;
+ }
status = decode_close(&xdr, res);
if (status != 0)
goto out;
@@ -5757,12 +5764,6 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
*/
decode_getfattr(&xdr, res->fattr, res->server,
!RPC_IS_ASYNC(rqstp->rq_task));
- /*
- * With the forgetful model, we pay no attention to the
- * layoutreturn status.
- */
- if (res->op_bitmask & NFS4_HAS_LAYOUTRETURN)
- decode_layoutreturn(&xdr, &res->lr_res);
out:
return status;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 15673d0..90a868b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -640,6 +640,7 @@ pnfs_roc(struct nfs4_closedata *data)
LIST_HEAD(tmp_list);
bool found = false;

+ data->arg.op_bitmask = data->res.op_bitmask = 0;
spin_lock(&data->inode->i_lock);
lo = NFS_I(data->inode)->layout;
if (!lo || lo->roc_iomode == 0 ||
--
1.7.2.3


2010-11-15 16:17:58

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

On 2010-11-15 16:51, Fred Isaman wrote:
> On Sun, Nov 14, 2010 at 10:43 AM, Benny Halevy <[email protected]> wrote:
>>
>> Using the open stateid after forgetting the layout could be a protocol bug,
>> or at least it falls into undefined territories.
>>
>> The RFC says:
>>
>> The loga_stateid field specifies a valid stateid. If a layout is not
>> currently held by the client, the loga_stateid field represents a
>> stateid reflecting the correspondingly valid open, byte-range lock,
>> or delegation stateid. Once a layout is held on the file by the
>> client, the loga_stateid field MUST be a stateid as returned from a
>> previous LAYOUTGET or LAYOUTRETURN operation or provided by a
>> CB_LAYOUTRECALL operation (see Section 12.5.3).
>>
>> So the question is does the text above refer to the client view of the state or to
>> the server's view.
>> In other words, with the forgetful client model, when the client unilaterally forgets
>> the layout without letting the server know about it (no LAYOUTRETURN was sent),
>> does it mean "a layout is not currently held by the client"?
>>
>
> I would argue that yes, this is in fact what it means.
>
> It seems the server has two options when confronted with an
> openstateid. Either interpret this as a declaration by the client
> that it has forgotten all previous layouts and behave appropriately
> (wipe any layout state assigned to the file and create a new
> layoutstateid), or assume this is part of parallel spew of
> LAYOUTGET(openstateid) and try to use an existing layout state with
> the appropriate (possibly not one) seqid. I argue that, as the spec
> stands, the second option is not really a choice, because the first
> option exists. If a client using the second option encounters a
> server using the first, bad things happen. The client will issue
> multiple LAYOUTGET(openstateids), the server will, upon seeing each,
> discard any previous state and return a new state with seqid=1, with

Is this the specified behavior?

> the final valid state being that of whichever one was processed last.
> The client will see all the OK returns, and not have any easy method
> of determining which is the one that the server considers valid.
>
> Thus I claim that, because of the forgetful model, the client must
> serialize its LAYOUTGET(openstateid) calls.
>

I disagree. A LAYOUTGET(openstateid) should be treated no differently than
a LAYOUTGET using any other layout stateid, and the client should be able to
send multiple such LAYOUTGETs *initially* (and only initially). The server can
process these as any other LAYOUTGET with the sequenceid rules, assuming seqid==0
(which is disallowed otherwise).

>> The server will see a LAYOUTGET with an open/lock/deleg stateid in this case
>> while it still thinks that the client is holding a layout.
>> Since this could normally happen if the client sends multiple LAYOUTGETs in
>> parallel before it received any layout stateid the server should allow it
>> within the VALID_SEQID_RANGE constraints (see 12.5.5.2.1.4, although it is
>> not explicitly called out there), otherwise, it seems like the server is supposed
>> to return NFS4ERR_OLD_STATEID.
>>
>> Strictly reading the spec, the client should use the most recent layout stateid
>> even in the forgetful model, until it gets a LAYOUTRETURN reply with lrs_present==false
>> or until it replies NFS4ERR_NOMATCHING_LAYOUT to CB_LAYOUTRECALL with
>> clora_iomode==LAYOUTIOMODE4_ANY or other values where the client never dropped
>> a layout (did I say recently how much I hate the forgetful model which introduces
>> more corner cases rather than simplifying the protocol as it was supposed to do? ;-)
>>
>
> Strict reading again depends on whose point of view, client or server...
>
> "Once a client has no more layouts on a file, the layout stateid is no
> longer valid and MUST NOT be used. Any attempt to use such a layout
> stateid will result in NFS4ERR_BAD_STATEID."

In NFSv4.1 the server decides about stateids. It's not up to the client
to throw away the stateid and revert to the initial stateid.
It must send an appropriate LAYOUTRETURN and get lrs_present==false
to do that and then it can be sure its layout state for the file is synchronized
with the server's.

Benny

>
>
> Fred
>
>> Benny
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>

2010-11-12 08:49:27

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 22/22] SQUASHME: make roc patches compile without v4.1

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4proc.c | 55 ++++++++++++++++++++--------------------------------
fs/nfs/nfs4xdr.c | 34 ++++++++++++++++++++++++++++++++
fs/nfs/pnfs.h | 24 +++++++++++++++++++++++
3 files changed, 79 insertions(+), 34 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 09ed784..fabeae2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -74,8 +74,6 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
struct nfs4_state *state);
-static void nfs4_layoutreturn_set_stateid(struct inode *ino,
- struct nfs4_layoutreturn_res *res);

/* Prevent leaks of NFSv4 errors into userland */
static int nfs4_map_errors(int err)
@@ -1832,17 +1830,8 @@ static void nfs4_free_closedata(void *data)
nfs_free_seqid(calldata->arg.seqid);
nfs4_put_state_owner(sp);
path_put(&calldata->path);
- if (calldata->res.op_bitmask & NFS4_HAS_LAYOUTRETURN) {
- struct pnfs_layout_hdr *lo = NFS_I(calldata->inode)->layout;
-
- spin_lock(&lo->inode->i_lock);
- lo->plh_block_lgets--;
- lo->plh_outstanding--;
- if (!pnfs_layoutgets_blocked(lo, NULL))
- rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
- spin_unlock(&lo->inode->i_lock);
- put_layout_hdr(lo->inode);
- }
+ if (calldata->res.op_bitmask & NFS4_HAS_LAYOUTRETURN)
+ nfs4_layoutreturn_file_release(calldata->inode);
kfree(calldata);
}

@@ -1931,18 +1920,13 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
/* Are there layout segments to return on close? */
if (pnfs_roc(calldata)) {
struct nfs_inode *nfsi = NFS_I(calldata->inode);
+
if (pnfs_return_layout_barrier(nfsi,
&calldata->arg.lr_args.range)) {
dprintk("%s: waiting on barrier\n", __func__);
/* FIXME race with wake here */
rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
- spin_lock(&calldata->inode->i_lock);
- nfsi->layout->plh_block_lgets--;
- nfsi->layout->plh_outstanding--;
- if (!pnfs_layoutgets_blocked(nfsi->layout, NULL))
- rpc_wake_up(&nfsi->lo_rpcwaitq_stateid);
- spin_unlock(&calldata->inode->i_lock);
- put_layout_hdr(calldata->inode);
+ nfs4_layoutreturn_file_release(calldata->inode);
return;
}
}
@@ -5627,8 +5611,8 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
rpc_call_start(task);
}

-static void nfs4_layoutreturn_set_stateid(struct inode *ino,
- struct nfs4_layoutreturn_res *res)
+void nfs4_layoutreturn_set_stateid(struct inode *ino,
+ struct nfs4_layoutreturn_res *res)
{
struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;

@@ -5663,23 +5647,26 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
dprintk("<-- %s\n", __func__);
}

+void nfs4_layoutreturn_file_release(struct inode *ino)
+{
+ struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
+
+ spin_lock(&ino->i_lock);
+ lo->plh_block_lgets--;
+ lo->plh_outstanding--;
+ if (!pnfs_layoutgets_blocked(lo, NULL))
+ rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
+ spin_unlock(&ino->i_lock);
+ put_layout_hdr(ino);
+}
+
static void nfs4_layoutreturn_release(void *calldata)
{
struct nfs4_layoutreturn *lrp = calldata;

dprintk("--> %s return_type %d\n", __func__, lrp->args.return_type);
- if (lrp->args.return_type == RETURN_FILE) {
- struct inode *ino = lrp->args.inode;
- struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
-
- spin_lock(&ino->i_lock);
- lo->plh_block_lgets--;
- lo->plh_outstanding--;
- if (!pnfs_layoutgets_blocked(lo, NULL))
- rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
- spin_unlock(&ino->i_lock);
- put_layout_hdr(ino);
- }
+ if (lrp->args.return_type == RETURN_FILE)
+ nfs4_layoutreturn_file_release(lrp->args.inode);
kfree(calldata);
dprintk("<-- %s\n", __func__);
}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b016ec8..1804f35 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -338,6 +338,10 @@ static int nfs4_stat_to_errno(int);
#else /* CONFIG_NFS_V4_1 */
#define encode_sequence_maxsz 0
#define decode_sequence_maxsz 0
+#define encode_layoutcommit_maxsz 0
+#define decode_layoutcommit_maxsz 0
+#define encode_layoutreturn_maxsz 0
+#define decode_layoutreturn_maxsz 0
#endif /* CONFIG_NFS_V4_1 */

#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */
@@ -1929,6 +1933,22 @@ encode_layoutreturn(struct xdr_stream *xdr,
hdr->nops++;
hdr->replen += decode_layoutreturn_maxsz;
}
+#else
+static int
+encode_layoutcommit(struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_op_args *args,
+ struct compound_hdr *hdr)
+{
+ return 0;
+}
+
+static void
+encode_layoutreturn(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct compound_hdr *hdr)
+{
+}
+
#endif /* CONFIG_NFS_V4_1 */

/*
@@ -5332,6 +5352,20 @@ out_overflow:
print_overflow_msg(__func__, xdr);
return -EIO;
}
+
+#else
+
+static int decode_layoutcommit(struct xdr_stream *xdr)
+{
+ return 0;
+}
+
+static int decode_layoutreturn(struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_res *res)
+{
+ return 0;
+}
+
#endif /* CONFIG_NFS_V4_1 */

/*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 4812288..e553311 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -188,6 +188,9 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
int issync);
extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
+extern void nfs4_layoutreturn_file_release(struct inode *ino);
+extern void nfs4_layoutreturn_set_stateid(struct inode *ino,
+ struct nfs4_layoutreturn_res *res);

/* pnfs.c */
void get_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -399,6 +402,27 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
pgio->pg_lseg = NULL;
}

+static inline void nfs4_layoutreturn_file_release(struct inode *ino)
+{
+}
+
+static inline bool pnfs_roc(struct nfs4_closedata *data)
+{
+ return false;
+}
+
+static inline bool pnfs_return_layout_barrier(struct nfs_inode *nfsi,
+ struct pnfs_layout_range *range)
+{
+ BUG();
+ return false;
+}
+
+static inline void nfs4_layoutreturn_set_stateid(struct inode *ino,
+ struct nfs4_layoutreturn_res *res)
+{
+}
+
#endif /* CONFIG_NFS_V4_1 */

#endif /* FS_NFS_PNFS_H */
--
1.7.2.1


2010-11-12 08:49:24

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 06/22] NFSv4.1: Callback share session between ops

From: Andy Adamson <[email protected]>

The NFSv4.1 session found in cb_sequence needs to be shared by other
callback operations in the same cb_compound.
Hold a reference to the session's nfs_client throughout the cb_compound
processing.

Move NFS4ERR_RETRY_UNCACHED_REP processing into nfs4_callback_sequence.

Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback.h | 24 ++++++--
fs/nfs/callback_proc.c | 138 ++++++++++++++++++++++++++++--------------------
fs/nfs/callback_xdr.c | 29 +++++-----
3 files changed, 113 insertions(+), 78 deletions(-)

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 2ce61b8..89fee05 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -34,6 +34,11 @@ enum nfs4_callback_opnum {
OP_CB_ILLEGAL = 10044,
};

+struct cb_process_state {
+ __be32 drc_status;
+ struct nfs4_session *session;
+};
+
struct cb_compound_hdr_arg {
unsigned int taglen;
const char *tag;
@@ -104,7 +109,8 @@ struct cb_sequenceres {
};

extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
- struct cb_sequenceres *res);
+ struct cb_sequenceres *res,
+ struct cb_process_state *cps);

extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
const nfs4_stateid *stateid);
@@ -125,14 +131,17 @@ struct cb_recallanyargs {
uint32_t craa_type_mask;
};

-extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy);
+extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args,
+ void *dummy,
+ struct cb_process_state *cps);

struct cb_recallslotargs {
struct sockaddr *crsa_addr;
uint32_t crsa_target_max_slots;
};
extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args,
- void *dummy);
+ void *dummy,
+ struct cb_process_state *cps);

struct cb_layoutrecallargs {
struct sockaddr *cbl_addr;
@@ -147,12 +156,15 @@ struct cb_layoutrecallargs {

extern unsigned nfs4_callback_layoutrecall(
struct cb_layoutrecallargs *args,
- void *dummy);
+ void *dummy, struct cb_process_state *cps);

#endif /* CONFIG_NFS_V4_1 */

-extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
-extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
+extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
+ struct cb_getattrres *res,
+ struct cb_process_state *cps);
+extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
+ struct cb_process_state *cps);

#ifdef CONFIG_NFS_V4
extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 74a6d6b..53a85648 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -20,8 +20,10 @@
#ifdef NFS_DEBUG
#define NFSDBG_FACILITY NFSDBG_CALLBACK
#endif
-
-__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
+
+__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
+ struct cb_getattrres *res,
+ struct cb_process_state *cps)
{
struct nfs_client *clp;
struct nfs_delegation *delegation;
@@ -30,9 +32,13 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *

res->bitmap[0] = res->bitmap[1] = 0;
res->status = htonl(NFS4ERR_BADHANDLE);
- clp = nfs_find_client(args->addr, 4);
- if (clp == NULL)
- goto out;
+ if (cps->session) { /* set in cb_sequence */
+ clp = cps->session->clp;
+ } else {
+ clp = nfs_find_client(args->addr, 4);
+ if (clp == NULL)
+ goto out;
+ }

dprintk("NFS: GETATTR callback request from %s\n",
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
@@ -60,22 +66,28 @@ out_iput:
rcu_read_unlock();
iput(inode);
out_putclient:
- nfs_put_client(clp);
+ if (!cps->session)
+ nfs_put_client(clp);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
return res->status;
}

-__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
+__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
+ struct cb_process_state *cps)
{
struct nfs_client *clp;
struct inode *inode;
__be32 res;

res = htonl(NFS4ERR_BADHANDLE);
- clp = nfs_find_client(args->addr, 4);
- if (clp == NULL)
- goto out;
+ if (cps->session) { /* set in cb_sequence */
+ clp = cps->session->clp;
+ } else {
+ clp = nfs_find_client(args->addr, 4);
+ if (clp == NULL)
+ goto out;
+ }

dprintk("NFS: RECALL callback request from %s\n",
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
@@ -99,9 +111,11 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
}
iput(inode);
}
- clp = nfs_find_client_next(prev);
- nfs_put_client(prev);
- } while (clp != NULL);
+ if (!cps->session) {
+ clp = nfs_find_client_next(prev);
+ nfs_put_client(prev);
+ }
+ } while (!cps->session && clp != NULL);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
return res;
@@ -347,46 +361,40 @@ static int pnfs_recall_all_layouts(struct nfs_client *clp)
}

__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
- void *dummy)
+ void *dummy, struct cb_process_state *cps)
{
struct nfs_client *clp;
struct inode *inode = NULL;
__be32 res;
int status;
- unsigned int num_client = 0;

dprintk("%s: -->\n", __func__);

res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
- clp = nfs_find_client(args->cbl_addr, 4);
- if (clp == NULL)
+ if (cps->session) /* set in cb_sequence */
+ clp = cps->session->clp;
+ else
goto out;

- res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
- do {
- struct nfs_client *prev = clp;
- num_client++;
- /* the callback must come from the MDS personality */
- if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
- goto loop;
- /* In the _ALL or _FSID case, we need the inode to get
- * the nfs_server struct.
- */
- inode = nfs_layoutrecall_find_inode(clp, args);
- if (!inode)
- goto loop;
- status = pnfs_async_return_layout(clp, inode, args);
- if (status)
- res = cpu_to_be32(NFS4ERR_DELAY);
- iput(inode);
-loop:
- clp = nfs_find_client_next(prev);
- nfs_put_client(prev);
- } while (clp != NULL);
+ /* the callback must come from the MDS personality */
+ res = cpu_to_be32(NFS4ERR_NOTSUPP);
+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
+ goto out;

+ res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
+ /*
+ * In the _ALL or _FSID case, we need the inode to get
+ * the nfs_server struct.
+ */
+ inode = nfs_layoutrecall_find_inode(clp, args);
+ if (!inode)
+ goto out;
+ status = pnfs_async_return_layout(clp, inode, args);
+ if (status)
+ res = cpu_to_be32(NFS4ERR_DELAY);
+ iput(inode);
out:
- dprintk("%s: exit with status = %d numclient %u\n",
- __func__, ntohl(res), num_client);
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
return res;
}

@@ -553,12 +561,15 @@ out:
}

__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
- struct cb_sequenceres *res)
+ struct cb_sequenceres *res,
+ struct cb_process_state *cps)
{
struct nfs_client *clp;
int i;
__be32 status;

+ cps->session = NULL;
+
status = htonl(NFS4ERR_BADSESSION);
clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid);
if (clp == NULL)
@@ -584,21 +595,27 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
res->csr_slotid = args->csa_slotid;
res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+ cps->session = clp->cl_session; /* caller must put nfs_client */

-out_putclient:
- nfs_put_client(clp);
out:
for (i = 0; i < args->csa_nrclists; i++)
kfree(args->csa_rclists[i].rcl_refcalls);
kfree(args->csa_rclists);

- if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP))
+ if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
res->csr_status = 0;
- else
+ cps->drc_status = status;
+ status = 0;
+ } else
res->csr_status = status;
+
dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
ntohl(status), ntohl(res->csr_status));
return status;
+
+out_putclient:
+ nfs_put_client(clp);
+ goto out;
}

static inline bool
@@ -625,24 +642,31 @@ validate_bitmap_values(const unsigned long *mask)
return false;
}

-__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
+__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
+ struct cb_process_state *cps)
{
struct nfs_client *clp;
__be32 status;
fmode_t flags = 0;

status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
- clp = nfs_find_client(args->craa_addr, 4);
- if (clp == NULL)
+ if (cps->session) /* set in cb_sequence */
+ clp = cps->session->clp;
+ else
goto out;

dprintk("NFS: RECALL_ANY callback request from %s\n",
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));

+ /* the callback must come from the MDS personality */
+ status = cpu_to_be32(NFS4ERR_NOTSUPP);
+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
+ goto out;
+
status = cpu_to_be32(NFS4ERR_INVAL);
if (!validate_bitmap_values((const unsigned long *)
&args->craa_type_mask))
- goto out_put;
+ goto out;

status = cpu_to_be32(NFS4_OK);
if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
@@ -658,23 +682,23 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)

if (flags)
nfs_expire_all_delegation_types(clp, flags);
-out_put:
- nfs_put_client(clp);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
}

/* Reduce the fore channel's max_slots to the target value */
-__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy)
+__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
+ struct cb_process_state *cps)
{
struct nfs_client *clp;
struct nfs4_slot_table *fc_tbl;
__be32 status;

status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
- clp = nfs_find_client(args->crsa_addr, 4);
- if (clp == NULL)
+ if (cps->session) /* set in cb_sequence */
+ clp = cps->session->clp;
+ else
goto out;

dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
@@ -686,16 +710,14 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy)
status = htonl(NFS4ERR_BAD_HIGH_SLOT);
if (args->crsa_target_max_slots > fc_tbl->max_slots ||
args->crsa_target_max_slots < 1)
- goto out_putclient;
+ goto out;

status = htonl(NFS4_OK);
if (args->crsa_target_max_slots == fc_tbl->max_slots)
- goto out_putclient;
+ goto out;

fc_tbl->target_max_slots = args->crsa_target_max_slots;
nfs41_handle_recall_slot(clp);
-out_putclient:
- nfs_put_client(clp); /* balance nfs_find_client */
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 63b17d0..1650ab0 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include "nfs4_fs.h"
#include "callback.h"
+#include "internal.h"

#define CB_OP_TAGLEN_MAXSZ (512)
#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
@@ -34,7 +35,8 @@
/* Internal error code */
#define NFS4ERR_RESOURCE_HDR 11050

-typedef __be32 (*callback_process_op_t)(void *, void *);
+typedef __be32 (*callback_process_op_t)(void *, void *,
+ struct cb_process_state *);
typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);

@@ -676,7 +678,8 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
static __be32 process_op(uint32_t minorversion, int nop,
struct svc_rqst *rqstp,
struct xdr_stream *xdr_in, void *argp,
- struct xdr_stream *xdr_out, void *resp, int* drc_status)
+ struct xdr_stream *xdr_out, void *resp,
+ struct cb_process_state *cps)
{
struct callback_op *op = &callback_ops[0];
unsigned int op_nr;
@@ -699,8 +702,8 @@ static __be32 process_op(uint32_t minorversion, int nop,
if (status)
goto encode_hdr;

- if (*drc_status) {
- status = *drc_status;
+ if (cps->drc_status) {
+ status = cps->drc_status;
goto encode_hdr;
}

@@ -708,16 +711,10 @@ static __be32 process_op(uint32_t minorversion, int nop,
if (maxlen > 0 && maxlen < PAGE_SIZE) {
status = op->decode_args(rqstp, xdr_in, argp);
if (likely(status == 0))
- status = op->process_op(argp, resp);
+ status = op->process_op(argp, resp, cps);
} else
status = htonl(NFS4ERR_RESOURCE);

- /* Only set by OP_CB_SEQUENCE processing */
- if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
- *drc_status = status;
- status = 0;
- }
-
encode_hdr:
res = encode_op_hdr(xdr_out, op_nr, status);
if (unlikely(res))
@@ -736,8 +733,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
struct cb_compound_hdr_arg hdr_arg = { 0 };
struct cb_compound_hdr_res hdr_res = { NULL };
struct xdr_stream xdr_in, xdr_out;
- __be32 *p;
- __be32 status, drc_status = 0;
+ __be32 *p, status;
+ struct cb_process_state cps = {
+ .drc_status = 0,
+ };
unsigned int nops = 0;

dprintk("%s: start\n", __func__);
@@ -758,7 +757,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r

while (status == 0 && nops != hdr_arg.nops) {
status = process_op(hdr_arg.minorversion, nops, rqstp,
- &xdr_in, argp, &xdr_out, resp, &drc_status);
+ &xdr_in, argp, &xdr_out, resp, &cps);
nops++;
}

@@ -771,6 +770,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r

*hdr_res.status = status;
*hdr_res.nops = htonl(nops);
+ if (cps.session) /* matched by cb_sequence find_client_with_session */
+ nfs_put_client(cps.session->clp);
dprintk("%s: done, status = %u\n", __func__, ntohl(status));
return rpc_success;
}
--
1.7.2.1


2010-11-12 08:49:24

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 05/22] pnfs-submit: change layout state seqlock to a spinlock

This prepares for future changes, where the layout state needs
to change atomically with several other variables. In particular,
it will need to know if lo->segs is empty. Moreover, the
layoutstateid is not really a read-mostly structure, as it is
written on each LAYOUTGET.

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback_proc.c | 8 +++---
fs/nfs/nfs4xdr.c | 6 +++-
fs/nfs/pnfs.c | 50 ++++++++++++++++-------------------------------
fs/nfs/pnfs.h | 4 +--
4 files changed, 26 insertions(+), 42 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 4dabc62..74a6d6b 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -121,12 +121,11 @@ static bool
pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo,
const nfs4_stateid stateid)
{
- int seqlock;
bool res;
u32 oldseqid, newseqid;

- do {
- seqlock = read_seqbegin(&lo->seqlock);
+ spin_lock(&lo->inode->i_lock);
+ {
oldseqid = be32_to_cpu(lo->stateid.stateid.seqid);
newseqid = be32_to_cpu(stateid.stateid.seqid);
res = !memcmp(lo->stateid.stateid.other,
@@ -144,7 +143,8 @@ pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo,
if (res)
res = (newseqid == 1);
}
- } while (read_seqretry(&lo->seqlock, seqlock));
+ }
+ spin_unlock(&lo->inode->i_lock);

return res;
}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 238eeb2..550e457 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1915,8 +1915,10 @@ encode_layoutreturn(struct xdr_stream *xdr,
p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
p = xdr_encode_hyper(p, args->range.offset);
p = xdr_encode_hyper(p, args->range.length);
- pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
- NULL);
+ spin_lock(&args->inode->i_lock);
+ memcpy(stateid.data, NFS_I(args->inode)->layout->stateid.data,
+ NFS4_STATEID_SIZE);
+ spin_unlock(&args->inode->i_lock);
p = xdr_encode_opaque_fixed(p, &stateid.data,
NFS4_STATEID_SIZE);
p = reserve_space(xdr, 4);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ec291d3..519055d 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -437,7 +437,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
nfs4_stateid *old = &lo->stateid;
bool overwrite = false;

- write_seqlock(&lo->seqlock);
+ assert_spin_locked(&lo->inode->i_lock);
if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
overwrite = true;
@@ -451,43 +451,27 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
}
if (overwrite)
memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
- write_sequnlock(&lo->seqlock);
-}
-
-static void
-pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
- struct nfs4_state *state)
-{
- int seq;
-
- dprintk("--> %s\n", __func__);
- write_seqlock(&lo->seqlock);
- do {
- seq = read_seqbegin(&state->seqlock);
- memcpy(lo->stateid.data, state->stateid.data,
- sizeof(state->stateid.data));
- } while (read_seqretry(&state->seqlock, seq));
- set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
- write_sequnlock(&lo->seqlock);
- dprintk("<-- %s\n", __func__);
}

void
pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
struct nfs4_state *open_state)
{
- int seq;
-
dprintk("--> %s\n", __func__);
- do {
- seq = read_seqbegin(&lo->seqlock);
- if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
- /* This will trigger retry of the read */
- pnfs_layout_from_open_stateid(lo, open_state);
- } else
- memcpy(dst->data, lo->stateid.data,
- sizeof(lo->stateid.data));
- } while (read_seqretry(&lo->seqlock, seq));
+ spin_lock(&lo->inode->i_lock);
+ if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
+ int seq;
+
+ do {
+ seq = read_seqbegin(&open_state->seqlock);
+ memcpy(dst->data, open_state->stateid.data,
+ sizeof(open_state->stateid.data));
+ } while (read_seqretry(&open_state->seqlock, seq));
+ set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+ } else
+ memcpy(dst->data, lo->stateid.data,
+ sizeof(lo->stateid.data));
+ spin_unlock(&lo->inode->i_lock);
dprintk("<-- %s\n", __func__);
}

@@ -761,7 +745,6 @@ alloc_init_layout_hdr(struct inode *ino)
lo->refcount = 1;
INIT_LIST_HEAD(&lo->layouts);
INIT_LIST_HEAD(&lo->segs);
- seqlock_init(&lo->seqlock);
lo->inode = ino;
return lo;
}
@@ -1182,7 +1165,8 @@ pnfs_layoutcommit_inode(struct inode *inode, int sync)
nfsi->layout->write_end_pos = 0;
nfsi->layout->cred = NULL;
__clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state);
- pnfs_get_layout_stateid(&data->args.stateid, nfsi->layout, NULL);
+ memcpy(data->args.stateid.data, nfsi->layout->stateid.data,
+ NFS4_STATEID_SIZE);

/* Reference for layoutcommit matched in pnfs_layoutcommit_release */
get_layout_hdr_locked(NFS_I(inode)->layout);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 7e06437..fdcfb9b 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -96,7 +96,6 @@ struct pnfs_layout_hdr {
struct list_head layouts; /* other client layouts */
struct list_head segs; /* layout segments list */
int roc_iomode;/* return on close iomode, 0=none */
- seqlock_t seqlock; /* Protects the stateid */
nfs4_stateid stateid;
unsigned long state;
struct rpc_cred *cred; /* layoutcommit credential */
@@ -223,9 +222,8 @@ static inline int lo_fail_bit(u32 iomode)

static inline void pnfs_invalidate_layout_stateid(struct pnfs_layout_hdr *lo)
{
- write_seqlock(&lo->seqlock);
+ assert_spin_locked(&lo->inode->i_lock);
clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
- write_sequnlock(&lo->seqlock);
}

static inline void get_lseg(struct pnfs_layout_segment *lseg)
--
1.7.2.1


2010-11-12 08:49:26

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 18/22] pnfs-submit: roc add layoutreturn op to close compound

From: Andy Adamson <[email protected]>

Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/nfs4proc.c | 73 +++++++++++++++++++++++++++++++++-------------
fs/nfs/nfs4state.c | 18 +-----------
fs/nfs/nfs4xdr.c | 14 ++++++++-
fs/nfs/pnfs.c | 64 +++++++++++++++++++++++++++++++++++++----
fs/nfs/pnfs.h | 1 +
include/linux/nfs_xdr.h | 19 ++++++++++++
6 files changed, 143 insertions(+), 46 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6223c6a..2b47c59 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -74,6 +74,8 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
struct nfs4_state *state);
+static void nfs4_layoutreturn_set_stateid(struct inode *ino,
+ struct nfs4_layoutreturn_res *res);

/* Prevent leaks of NFSv4 errors into userland */
static int nfs4_map_errors(int err)
@@ -1821,16 +1823,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
return err;
}

-struct nfs4_closedata {
- struct path path;
- struct inode *inode;
- struct nfs4_state *state;
- struct nfs_closeargs arg;
- struct nfs_closeres res;
- struct nfs_fattr fattr;
- unsigned long timestamp;
-};
-
static void nfs4_free_closedata(void *data)
{
struct nfs4_closedata *calldata = data;
@@ -1840,6 +1832,17 @@ static void nfs4_free_closedata(void *data)
nfs_free_seqid(calldata->arg.seqid);
nfs4_put_state_owner(sp);
path_put(&calldata->path);
+ if (calldata->res.op_bitmask & NFS4_HAS_LAYOUTRETURN) {
+ struct pnfs_layout_hdr *lo = NFS_I(calldata->inode)->layout;
+
+ spin_lock(&lo->inode->i_lock);
+ lo->plh_block_lgets--;
+ lo->plh_outstanding--;
+ if (!pnfs_layoutgets_blocked(lo, NULL))
+ rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
+ spin_unlock(&lo->inode->i_lock);
+ put_layout_hdr(lo->inode);
+ }
kfree(calldata);
}

@@ -1869,6 +1872,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
switch (task->tk_status) {
case 0:
nfs_set_open_stateid(state, &calldata->res.stateid, 0);
+ if (calldata->res.op_bitmask & NFS4_HAS_LAYOUTRETURN)
+ nfs4_layoutreturn_set_stateid(calldata->inode,
+ &calldata->res.lr_res);
renew_lease(server, calldata->timestamp);
nfs4_close_clear_stateid_flags(state,
calldata->arg.fmode);
@@ -1920,8 +1926,27 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
return;
}

- if (calldata->arg.fmode == 0)
+ if (calldata->arg.fmode == 0) {
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
+ /* Are there layout segments to return on close? */
+ if (pnfs_roc(calldata)) {
+ struct nfs_inode *nfsi = NFS_I(calldata->inode);
+ if (pnfs_return_layout_barrier(nfsi,
+ &calldata->arg.lr_args.range)) {
+ dprintk("%s: waiting on barrier\n", __func__);
+ /* FIXME race with wake here */
+ rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
+ spin_lock(&calldata->inode->i_lock);
+ nfsi->layout->plh_block_lgets--;
+ nfsi->layout->plh_outstanding--;
+ if (!pnfs_layoutgets_blocked(nfsi->layout, NULL))
+ rpc_wake_up(&nfsi->lo_rpcwaitq_stateid);
+ spin_unlock(&calldata->inode->i_lock);
+ put_layout_hdr(calldata->inode);
+ return;
+ }
+ }
+ }

nfs_fattr_init(calldata->res.fattr);
calldata->timestamp = jiffies;
@@ -5587,6 +5612,7 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)

if (pnfs_return_layout_barrier(nfsi, &lrp->args.range)) {
dprintk("%s: waiting on barrier\n", __func__);
+ /* FIXME race with wake here */
rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
return;
}
@@ -5602,6 +5628,19 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
rpc_call_start(task);
}

+static void nfs4_layoutreturn_set_stateid(struct inode *ino,
+ struct nfs4_layoutreturn_res *res)
+{
+ struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
+
+ spin_lock(&ino->i_lock);
+ if (res->lrs_present)
+ pnfs_set_layout_stateid(lo, &res->stateid, true);
+ else
+ BUG_ON(!list_empty(&lo->segs));
+ spin_unlock(&ino->i_lock);
+}
+
static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutreturn *lrp = calldata;
@@ -5620,16 +5659,8 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
nfs_restart_rpc(task, lrp->clp);
return;
}
- if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE)) {
- struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
-
- spin_lock(&lo->inode->i_lock);
- if (lrp->res.lrs_present)
- pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
- else
- BUG_ON(!list_empty(&lo->segs));
- spin_unlock(&lo->inode->i_lock);
- }
+ if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE))
+ nfs4_layoutreturn_set_stateid(lrp->args.inode, &lrp->res);
dprintk("<-- %s\n", __func__);
}

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index ceb0d66..784f122 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -601,24 +601,8 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
if (!call_close) {
nfs4_put_open_state(state);
nfs4_put_state_owner(owner);
- } else {
- u32 roc_iomode;
- struct nfs_inode *nfsi = NFS_I(state->inode);
-
- if (has_layout(nfsi) &&
- (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) {
- struct pnfs_layout_range range = {
- .iomode = roc_iomode,
- .offset = 0,
- .length = NFS4_MAX_UINT64,
- };
-
- pnfs_return_layout(state->inode, &range, NULL,
- RETURN_FILE, wait);
- }
-
+ } else
nfs4_do_close(path, state, gfp_mask, wait);
- }
}

void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f530c7e..adb4c47 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -438,12 +438,14 @@ static int nfs4_stat_to_errno(int);
encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_close_maxsz + \
- encode_getattr_maxsz)
+ encode_getattr_maxsz + \
+ encode_layoutreturn_maxsz)
#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_close_maxsz + \
- decode_getattr_maxsz)
+ decode_getattr_maxsz + \
+ decode_layoutreturn_maxsz)
#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -2143,6 +2145,8 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closea
encode_putfh(&xdr, args->fh, &hdr);
encode_close(&xdr, args, &hdr);
encode_getfattr(&xdr, args->bitmask, &hdr);
+ if (args->op_bitmask & NFS4_HAS_LAYOUTRETURN) /* layoutreturn set */
+ encode_layoutreturn(&xdr, &args->lr_args, &hdr);
encode_nops(&hdr);
return 0;
}
@@ -5719,6 +5723,12 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
*/
decode_getfattr(&xdr, res->fattr, res->server,
!RPC_IS_ASYNC(rqstp->rq_task));
+ /*
+ * With the forgetful model, we pay no attention to the
+ * layoutreturn status.
+ */
+ if (res->op_bitmask & NFS4_HAS_LAYOUTRETURN)
+ decode_layoutreturn(&xdr, &res->lr_res);
out:
return status;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 22abf83..76cfb11 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -623,6 +623,63 @@ pnfs_return_layout_barrier(struct nfs_inode *nfsi,
return ret;
}

+/*
+ * Return on close
+ *
+ * No LAYOUTRETURNS can be sent when BULK RECALL flag is set.
+ * FIXME: add layoutcommit operation if layoutcommit_needed is true.
+ */
+bool
+pnfs_roc(struct nfs4_closedata *data)
+{
+ struct nfs4_layoutreturn_args *lr_args = &data->arg.lr_args;
+ struct pnfs_layout_hdr *lo;
+ struct pnfs_layout_segment *lseg, *tmp;
+ struct pnfs_layout_range range = {
+ .length = NFS4_MAX_UINT64,
+ };
+ LIST_HEAD(tmp_list);
+ bool found = false;
+
+ spin_lock(&data->inode->i_lock);
+ lo = NFS_I(data->inode)->layout;
+ if (!lo || lo->roc_iomode == 0 ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+ goto out_nolayout;
+
+ range.iomode = lo->roc_iomode;
+ list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
+ if (should_free_lseg(&lseg->range, &range)) {
+ mark_lseg_invalid(lseg, &tmp_list);
+ found = true;
+ }
+ if (found == false)
+ goto out_nolayout;
+ /* Stop new and drop response to outstanding LAYOUTGETS */
+ lo->plh_block_lgets++;
+ lo->plh_outstanding++;
+ /* Reference matched in pnfs_layoutreturn_release */
+ get_layout_hdr(lo);
+
+ spin_unlock(&data->inode->i_lock);
+
+ pnfs_free_lseg_list(&tmp_list);
+
+ lr_args->reclaim = 0;
+ lr_args->layout_type = NFS_SERVER(data->inode)->pnfs_curr_ld->id;
+ lr_args->return_type = RETURN_FILE;
+ lr_args->range = range;
+ lr_args->inode = data->inode;
+ data->res.op_bitmask |= NFS4_HAS_LAYOUTRETURN;
+ data->arg.op_bitmask |= NFS4_HAS_LAYOUTRETURN;
+
+ return true;
+
+out_nolayout:
+ spin_unlock(&data->inode->i_lock);
+ return false;
+}
+
static int
return_layout(struct inode *ino, struct pnfs_layout_range *range,
enum pnfs_layoutreturn_type type, struct pnfs_layout_hdr *lo,
@@ -997,13 +1054,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
*lgp->lsegpp = lseg;
pnfs_insert_layout(lo, lseg);

- if (res->return_on_close) {
- /* FI: This needs to be re-examined. At lo level,
- * all it needs is a bit indicating whether any of
- * the lsegs in the list have the flags set.
- */
+ if (res->return_on_close)
lo->roc_iomode |= res->range.iomode;
- }

/* Done processing layoutget. Set the layout stateid */
pnfs_set_layout_stateid(lo, &res->stateid, false);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 7fd1f5d..916a057 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -234,6 +234,7 @@ void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
struct pnfs_layout_range *range,
int notify_bit, atomic_t *notify_count,
struct list_head *tmp_list);
+bool pnfs_roc(struct nfs4_closedata *data);

static inline bool
has_layout(struct nfs_inode *nfsi)
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index f472405..6c4ba71 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -351,12 +351,18 @@ struct nfs_open_confirmres {
/*
* Arguments to the close call.
*/
+
+/* op_bitmask bits */
+#define NFS4_HAS_LAYOUTRETURN 0x01
+
struct nfs_closeargs {
struct nfs_fh * fh;
nfs4_stateid * stateid;
struct nfs_seqid * seqid;
fmode_t fmode;
const u32 * bitmask;
+ u32 op_bitmask; /* which optional ops to encode */
+ struct nfs4_layoutreturn_args lr_args; /* optional */
struct nfs4_sequence_args seq_args;
};

@@ -365,8 +371,21 @@ struct nfs_closeres {
struct nfs_fattr * fattr;
struct nfs_seqid * seqid;
const struct nfs_server *server;
+ u32 op_bitmask; /* which optional ops encoded */
+ struct nfs4_layoutreturn_res lr_res; /* optional */
struct nfs4_sequence_res seq_res;
};
+
+struct nfs4_closedata {
+ struct path path;
+ struct inode *inode;
+ struct nfs4_state *state;
+ struct nfs_closeargs arg;
+ struct nfs_closeres res;
+ struct nfs_fattr fattr;
+ unsigned long timestamp;
+};
+
/*
* * Arguments to the lock,lockt, and locku call.
* */
--
1.7.2.1


2010-11-12 08:49:25

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 13/22] pnfs-submit: reorganize struct cb_layoutrecallargs

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback.h | 12 ++++++++----
fs/nfs/callback_proc.c | 16 ++++++++--------
fs/nfs/callback_xdr.c | 21 ++++++++++++---------
3 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 0b1f3c4..cea58cc 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -147,13 +147,17 @@ extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args,

struct cb_layoutrecallargs {
struct sockaddr *cbl_addr;
- struct nfs_fh cbl_fh;
- struct pnfs_layout_range cbl_seg;
- struct nfs_fsid cbl_fsid;
uint32_t cbl_recall_type;
uint32_t cbl_layout_type;
uint32_t cbl_layoutchanged;
- nfs4_stateid cbl_stateid;
+ union {
+ struct {
+ struct nfs_fh cbl_fh;
+ struct pnfs_layout_range cbl_range;
+ nfs4_stateid cbl_stateid;
+ };
+ struct nfs_fsid cbl_fsid;
+ };
};

extern unsigned nfs4_callback_layoutrecall(
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index d02997a..6e0fc40 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -233,13 +233,13 @@ static int pnfs_recall_layout(void *data)
rl = *args->rl;

/* support whole file layouts only */
- rl.cbl_seg.offset = 0;
- rl.cbl_seg.length = NFS4_MAX_UINT64;
+ rl.cbl_range.offset = 0;
+ rl.cbl_range.length = NFS4_MAX_UINT64;

if (rl.cbl_recall_type == RETURN_FILE) {
if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout,
rl.cbl_stateid))
- status = pnfs_return_layout(inode, &rl.cbl_seg,
+ status = pnfs_return_layout(inode, &rl.cbl_range,
&rl.cbl_stateid, RETURN_FILE,
false);
else
@@ -261,7 +261,7 @@ static int pnfs_recall_layout(void *data)
/* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */
while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) {
/* FIXME: need to check status on pnfs_return_layout */
- pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false);
+ pnfs_return_layout(ino, &rl.cbl_range, NULL, RETURN_FILE, false);
iput(ino);
}

@@ -277,7 +277,7 @@ static int pnfs_recall_layout(void *data)
lrp->args.layout_type = rl.cbl_layout_type;
lrp->args.return_type = rl.cbl_recall_type;
lrp->clp = clp;
- lrp->args.range = rl.cbl_seg;
+ lrp->args.range = rl.cbl_range;
lrp->args.inode = inode;
nfs4_proc_layoutreturn(lrp, true);

@@ -338,9 +338,9 @@ static int pnfs_recall_all_layouts(struct nfs_client *clp)
int status = 0;

rl.cbl_recall_type = RETURN_ALL;
- rl.cbl_seg.iomode = IOMODE_ANY;
- rl.cbl_seg.offset = 0;
- rl.cbl_seg.length = NFS4_MAX_UINT64;
+ rl.cbl_range.iomode = IOMODE_ANY;
+ rl.cbl_range.offset = 0;
+ rl.cbl_range.length = NFS4_MAX_UINT64;

/* we need the inode to get the nfs_server struct */
inode = nfs_layoutrecall_find_inode(clp, &rl);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 01688ce..b963c58 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -229,6 +229,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
{
__be32 *p;
__be32 status = 0;
+ uint32_t iomode;

args->cbl_addr = svc_addr(rqstp);
p = read_buf(xdr, 4 * sizeof(uint32_t));
@@ -238,11 +239,15 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
}

args->cbl_layout_type = ntohl(*p++);
- args->cbl_seg.iomode = ntohl(*p++);
+ /* Despite the spec's xdr, iomode really belongs in the FILE switch,
+ * as it is unusable and ignored with the other types.
+ */
+ iomode = ntohl(*p++);
args->cbl_layoutchanged = ntohl(*p++);
args->cbl_recall_type = ntohl(*p++);

if (likely(args->cbl_recall_type == RETURN_FILE)) {
+ args->cbl_range.iomode = iomode;
status = decode_fh(xdr, &args->cbl_fh);
if (unlikely(status != 0))
goto out;
@@ -252,8 +257,8 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
status = htonl(NFS4ERR_BADXDR);
goto out;
}
- p = xdr_decode_hyper(p, &args->cbl_seg.offset);
- p = xdr_decode_hyper(p, &args->cbl_seg.length);
+ p = xdr_decode_hyper(p, &args->cbl_range.offset);
+ p = xdr_decode_hyper(p, &args->cbl_range.length);
status = decode_stateid(xdr, &args->cbl_stateid);
if (unlikely(status != 0))
goto out;
@@ -266,12 +271,10 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
p = xdr_decode_hyper(p, &args->cbl_fsid.major);
p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
}
- dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d "
- "fsid %llx-%llx fhsize %d\n", __func__,
- args->cbl_layout_type, args->cbl_seg.iomode,
- args->cbl_layoutchanged, args->cbl_recall_type,
- args->cbl_fsid.major, args->cbl_fsid.minor,
- args->cbl_fh.size);
+ dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
+ __func__,
+ args->cbl_layout_type, iomode,
+ args->cbl_layoutchanged, args->cbl_recall_type);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
--
1.7.2.1


2010-11-12 08:49:27

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 21/22] pnfs_submit: roc add layoutcommit op to close compound

From: Andy Adamson <[email protected]>

Place the layoutcommit operation prior to the close operation in the close
compound so that the filehandle is still valid.

If the layoutcommit fails, a retry of the close compound, which retries with
rpc_restart_call_prepare and so calls pnfs_roc again, will not include the
layoutcommit operation, as the layoutcommit_needed test will be false having
been satisfied by the failed compound.

Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4xdr.c | 11 +++++++++--
fs/nfs/pnfs.c | 9 ++++++++-
include/linux/nfs_xdr.h | 2 ++
3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f11870e..b016ec8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -439,13 +439,15 @@ static int nfs4_stat_to_errno(int);
encode_putfh_maxsz + \
encode_close_maxsz + \
encode_getattr_maxsz + \
- encode_layoutreturn_maxsz)
+ encode_layoutreturn_maxsz + \
+ encode_layoutcommit_maxsz)
#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_close_maxsz + \
decode_getattr_maxsz + \
- decode_layoutreturn_maxsz)
+ decode_layoutreturn_maxsz + \
+ decode_layoutcommit_maxsz)
#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -2136,6 +2138,8 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closea
encode_compound_hdr(&xdr, req, &hdr);
encode_sequence(&xdr, &args->seq_args, &hdr);
encode_putfh(&xdr, args->fh, &hdr);
+ if (args->op_bitmask & NFS4_HAS_LAYOUTCOMMIT) /* layoutcommit set */
+ encode_layoutcommit(&xdr, &args->lc_args, &hdr);
encode_close(&xdr, args, &hdr);
encode_getfattr(&xdr, args->bitmask, &hdr);
if (args->op_bitmask & NFS4_HAS_LAYOUTRETURN) /* layoutreturn set */
@@ -5705,6 +5709,9 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
status = decode_putfh(&xdr);
if (status)
goto out;
+ /* We pay no attention to the layoutcommit return */
+ if (res->op_bitmask & NFS4_HAS_LAYOUTCOMMIT)
+ decode_layoutcommit(&xdr);
status = decode_close(&xdr, res);
if (status != 0)
goto out;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f2ec773..15673d0 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -627,7 +627,6 @@ pnfs_return_layout_barrier(struct nfs_inode *nfsi,
* Return on close
*
* No LAYOUTRETURNS can be sent when BULK RECALL flag is set.
- * FIXME: add layoutcommit operation if layoutcommit_needed is true.
*/
bool
pnfs_roc(struct nfs4_closedata *data)
@@ -655,6 +654,14 @@ pnfs_roc(struct nfs4_closedata *data)
}
if (found == false)
goto out_nolayout;
+
+ /* Add layoutcommit operation if needed */
+ if (layoutcommit_needed(NFS_I(data->inode))) {
+ pnfs_layoutcommit_setup(data->inode, &data->arg.lc_args, false);
+ data->res.op_bitmask |= NFS4_HAS_LAYOUTCOMMIT;
+ data->arg.op_bitmask |= NFS4_HAS_LAYOUTCOMMIT;
+ }
+
/* Stop new and drop response to outstanding LAYOUTGETS */
lo->plh_block_lgets++;
lo->plh_outstanding++;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 851b09f..d4c4804 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -353,6 +353,7 @@ struct nfs_open_confirmres {

/* op_bitmask bits */
#define NFS4_HAS_LAYOUTRETURN 0x01
+#define NFS4_HAS_LAYOUTCOMMIT 0x02

struct nfs_closeargs {
struct nfs_fh * fh;
@@ -361,6 +362,7 @@ struct nfs_closeargs {
fmode_t fmode;
const u32 * bitmask;
u32 op_bitmask; /* which optional ops to encode */
+ struct nfs4_layoutcommit_op_args lc_args; /* optional */
struct nfs4_layoutreturn_args lr_args; /* optional */
struct nfs4_sequence_args seq_args;
};
--
1.7.2.1


2010-11-12 08:49:26

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 17/22] pnfs-submit: increase number of outstanding CB_LAYOUTRECALLS we can handle

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback.h | 1 +
fs/nfs/callback_proc.c | 35 +++++++++++++++++++++++++++++------
fs/nfs/pnfs.c | 24 +++++++++---------------
fs/nfs/pnfs.h | 7 +++----
include/linux/nfs_fs_sb.h | 3 ++-
5 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 4a9905b..218490c 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -165,6 +165,7 @@ extern unsigned nfs4_callback_layoutrecall(
void *dummy, struct cb_process_state *cps);
extern bool matches_outstanding_recall(struct inode *ino,
struct pnfs_layout_range *range);
+extern void notify_drained(struct nfs_client *clp, u64 mask);
extern void nfs_client_return_layouts(struct nfs_client *clp);

static inline void put_session_client(struct nfs4_session *session)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index af405cf..752b593 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -220,16 +220,28 @@ void nfs_client_return_layouts(struct nfs_client *clp)
/* Removing from the list unblocks LAYOUTGETs */
list_del(&cb_info->pcl_list);
clp->cl_cb_lrecall_count--;
+ clp->cl_drain_notification[1 << cb_info->pcl_notify_bit] = NULL;
rpc_wake_up(&clp->cl_rpcwaitq_recall);
kfree(cb_info);
}
}

-void notify_drained(struct pnfs_cb_lrecall_info *d)
+void notify_drained(struct nfs_client *clp, u64 mask)
{
- if (d && atomic_dec_and_test(&d->pcl_count)) {
- set_bit(NFS4CLNT_LAYOUT_RECALL, &d->pcl_clp->cl_state);
- nfs4_schedule_state_manager(d->pcl_clp);
+ atomic_t **ptr = clp->cl_drain_notification;
+ bool done = false;
+
+ /* clp lock not needed except to remove used up entries */
+ /* Should probably use functions defined in bitmap.h */
+ while (mask) {
+ if ((mask & 1) && (atomic_dec_and_test(*ptr)))
+ done = true;
+ mask >>= 1;
+ ptr++;
+ }
+ if (done) {
+ set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
+ nfs4_schedule_state_manager(clp);
}
}

@@ -270,7 +282,9 @@ static int initiate_layout_draining(struct pnfs_cb_lrecall_info *cb_info)
if (rv == NFS4_OK) {
lo->plh_block_lgets++;
nfs4_asynch_forget_layouts(lo, &args->cbl_range,
- cb_info, &free_me_list);
+ cb_info->pcl_notify_bit,
+ &cb_info->pcl_count,
+ &free_me_list);
}
pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
spin_unlock(&lo->inode->i_lock);
@@ -306,7 +320,9 @@ static int initiate_layout_draining(struct pnfs_cb_lrecall_info *cb_info)
&recall_list, plh_bulk_recall) {
spin_lock(&lo->inode->i_lock);
set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
- nfs4_asynch_forget_layouts(lo, &range, cb_info,
+ nfs4_asynch_forget_layouts(lo, &range,
+ cb_info->pcl_notify_bit,
+ &cb_info->pcl_count,
&free_me_list);
list_del_init(&lo->plh_bulk_recall);
spin_unlock(&lo->inode->i_lock);
@@ -322,6 +338,8 @@ static u32 do_callback_layoutrecall(struct nfs_client *clp,
struct cb_layoutrecallargs *args)
{
struct pnfs_cb_lrecall_info *new;
+ atomic_t **ptr;
+ int bit_num;
u32 res;

dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
@@ -344,12 +362,17 @@ static u32 do_callback_layoutrecall(struct nfs_client *clp,
clp->cl_cb_lrecall_count++;
/* Adding to the list will block conflicting LGET activity */
list_add_tail(&new->pcl_list, &clp->cl_layoutrecalls);
+ for (bit_num = 0, ptr = clp->cl_drain_notification; *ptr; ptr++)
+ bit_num++;
+ *ptr = &new->pcl_count;
+ new->pcl_notify_bit = bit_num;
spin_unlock(&clp->cl_lock);
res = initiate_layout_draining(new);
if (res || atomic_dec_and_test(&new->pcl_count)) {
spin_lock(&clp->cl_lock);
list_del(&new->pcl_list);
clp->cl_cb_lrecall_count--;
+ clp->cl_drain_notification[1 << bit_num] = NULL;
rpc_wake_up(&clp->cl_rpcwaitq_recall);
spin_unlock(&clp->cl_lock);
if (res == NFS4_OK) {
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2d817be..22abf83 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -278,7 +278,7 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
smp_mb();
lseg->valid = true;
lseg->layout = lo;
- lseg->drain_notification = NULL;
+ lseg->pls_notify_mask = 0;
}

static void
@@ -330,12 +330,12 @@ put_lseg(struct pnfs_layout_segment *lseg)
atomic_read(&lseg->pls_refcount), lseg->valid);
ino = lseg->layout->inode;
if (atomic_dec_and_lock(&lseg->pls_refcount, &ino->i_lock)) {
- struct pnfs_cb_lrecall_info *drain_info = lseg->drain_notification;
+ u64 mask = lseg->pls_notify_mask;

_put_lseg_common(lseg);
spin_unlock(&ino->i_lock);
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
- notify_drained(drain_info);
+ notify_drained(NFS_SERVER(ino)->nfs_client, mask);
/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
put_layout_hdr(ino);
}
@@ -401,14 +401,14 @@ pnfs_free_lseg_list(struct list_head *free_me)
{
struct pnfs_layout_segment *lseg, *tmp;
struct inode *ino;
- struct pnfs_cb_lrecall_info *drain_info;
+ u64 mask;

list_for_each_entry_safe(lseg, tmp, free_me, fi_list) {
BUG_ON(atomic_read(&lseg->pls_refcount) != 0);
ino = lseg->layout->inode;
- drain_info = lseg->drain_notification;
+ mask = lseg->pls_notify_mask;
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
- notify_drained(drain_info);
+ notify_drained(NFS_SERVER(ino)->nfs_client, mask);
/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
put_layout_hdr(ino);
}
@@ -587,7 +587,7 @@ has_layout_to_return(struct pnfs_layout_hdr *lo,

void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
struct pnfs_layout_range *range,
- struct pnfs_cb_lrecall_info *drain_info,
+ int notify_bit, atomic_t *notify_count,
struct list_head *tmp_list)
{
struct pnfs_layout_segment *lseg, *tmp;
@@ -595,14 +595,8 @@ void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
assert_spin_locked(&lo->inode->i_lock);
list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
if (should_free_lseg(&lseg->range, range)) {
- /* FIXME - need to change to something like a
- * notification bitmap to remove the restriction
- * of only being able to process a single
- * CB_LAYOUTRECALL at a time.
- */
- BUG_ON(lseg->drain_notification);
- lseg->drain_notification = drain_info;
- atomic_inc(&drain_info->pcl_count);
+ lseg->pls_notify_mask |= (1 << notify_bit);
+ atomic_inc(notify_count);
mark_lseg_invalid(lseg, tmp_list);
}
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 7ea121f..7fd1f5d 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -39,7 +39,7 @@ struct pnfs_layout_segment {
atomic_t pls_refcount;
bool valid;
struct pnfs_layout_hdr *layout;
- struct pnfs_cb_lrecall_info *drain_notification;
+ u64 pls_notify_mask;
};

enum pnfs_try_status {
@@ -126,6 +126,7 @@ struct pnfs_device {
struct pnfs_cb_lrecall_info {
struct list_head pcl_list; /* hook into cl_layoutrecalls list */
atomic_t pcl_count;
+ int pcl_notify_bit;
struct nfs_client *pcl_clp;
struct inode *pcl_ino;
struct cb_layoutrecallargs pcl_args;
@@ -231,10 +232,8 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct nfs4_state *open_state);
void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
struct pnfs_layout_range *range,
- struct pnfs_cb_lrecall_info *drain_info,
+ int notify_bit, atomic_t *notify_count,
struct list_head *tmp_list);
-/* FIXME - this should be in callback.h, but pnfs_cb_lrecall_info needs to be there too */
-extern void notify_drained(struct pnfs_cb_lrecall_info *d);

static inline bool
has_layout(struct nfs_inode *nfsi)
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 80dcc00..295d449 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -85,7 +85,8 @@ struct nfs_client {
struct list_head cl_layouts;
struct list_head cl_layoutrecalls;
unsigned long cl_cb_lrecall_count;
-#define PNFS_MAX_CB_LRECALLS (1)
+#define PNFS_MAX_CB_LRECALLS (64)
+ atomic_t *cl_drain_notification[PNFS_MAX_CB_LRECALLS];
struct rpc_wait_queue cl_rpcwaitq_recall;
struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
#endif /* CONFIG_NFS_V4_1 */
--
1.7.2.1


2010-11-17 17:53:05

by Benny Halevy

[permalink] [raw]
Subject: Re: [nfsv4] [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

On 2010-11-15 19:53, Fred Isaman wrote:
> On Mon, Nov 15, 2010 at 11:17 AM, Benny Halevy <[email protected]> wrote:
>> On 2010-11-15 16:51, Fred Isaman wrote:
>>> On Sun, Nov 14, 2010 at 10:43 AM, Benny Halevy <[email protected]> wrote:
>>>>
>>>> Using the open stateid after forgetting the layout could be a protocol bug,
>>>> or at least it falls into undefined territories.
>>>>
>>>> The RFC says:
>>>>
>>>> The loga_stateid field specifies a valid stateid. If a layout is not
>>>> currently held by the client, the loga_stateid field represents a
>>>> stateid reflecting the correspondingly valid open, byte-range lock,
>>>> or delegation stateid. Once a layout is held on the file by the
>>>> client, the loga_stateid field MUST be a stateid as returned from a
>>>> previous LAYOUTGET or LAYOUTRETURN operation or provided by a
>>>> CB_LAYOUTRECALL operation (see Section 12.5.3).
>>>>
>>>> So the question is does the text above refer to the client view of the state or to
>>>> the server's view.
>>>> In other words, with the forgetful client model, when the client unilaterally forgets
>>>> the layout without letting the server know about it (no LAYOUTRETURN was sent),
>>>> does it mean "a layout is not currently held by the client"?
>>>>
>>>
>>> I would argue that yes, this is in fact what it means.
>>>
>>> It seems the server has two options when confronted with an
>>> openstateid. Either interpret this as a declaration by the client
>>> that it has forgotten all previous layouts and behave appropriately
>>> (wipe any layout state assigned to the file and create a new
>>> layoutstateid), or assume this is part of parallel spew of
>>> LAYOUTGET(openstateid) and try to use an existing layout state with
>>> the appropriate (possibly not one) seqid. I argue that, as the spec
>>> stands, the second option is not really a choice, because the first
>>> option exists. If a client using the second option encounters a
>>> server using the first, bad things happen. The client will issue
>>> multiple LAYOUTGET(openstateids), the server will, upon seeing each,
>>> discard any previous state and return a new state with seqid=1, with
>>
>> Is this the specified behavior?
>>
>>> the final valid state being that of whichever one was processed last.
>>> The client will see all the OK returns, and not have any easy method
>>> of determining which is the one that the server considers valid.
>>>
>>> Thus I claim that, because of the forgetful model, the client must
>>> serialize its LAYOUTGET(openstateid) calls.
>>>
>>
>> I disagree. LAYOUTGET(openstateid) should be no different than
>> any other layout stateid and the client should be able to send multiple
>> such LAYOUTGETs *initially* (and only initially). The server can process
>> these as any other LAYOUTGET with the sequenceid rules assuming seqid==0
>> (which is disallowed otherwise)
>>
>>>> The server will see a LAYOUTGET with an open/lock/deleg stateid in this case
>>>> while it still thinks that the client is holding a layout.
>>>> Since this could normally happen if the client sends multiple LAYOUTGETs in
>>>> parallel before it received any layout stateid the server should allow it
>>>> within the VALID_SEQID_RANGE constraints (see 12.5.5.2.1.4, although it is
>>>> not explicitly called out there), otherwise, it seems like the server is supposed
>>>> to return NFS4ERR_OLD_STATEID.
>>>>
>>>> Strictly reading the spec, the client should use the most recent layout stateid
>>>> even in the forgetful model, until it gets a LAYOUTRETURN reply with lrs_present==false
>>>> or until it replies NFS4ERR_NOMATCHING_LAYOUT to CB_LAYOUTRECALL with
>>>> clora_iomode==LAYOUTIOMODE4_ANY or other values where the client never dropped
>>>> a layout (did I say recently how much I hate the forgetful model which introduces
>>>> more corner cases rather than simplifying the protocol as it was supposed to do? ;-)
>>>>
>>>
>>> Strict reading again depends on whose point of view, client or server...
>>>
>>> "Once a client has no more layouts on a file, the layout stateid is no
>>> longer valid and MUST NOT be used. Any attempt to use such a layout
>>> stateid will result in NFS4ERR_BAD_STATEID."
>>
>> In NFSv4.1 the server decides about stateids. It's not up to the client
>> to throw away the stateid and revert to the initial stateid.
>> It must send an appropriate LAYOUTRETURN and get lrs_present==false
>> to do that and then it can be sure its layout state for the file is synchronized
>> with the server's.
>>
>> Benny
>>
>
> I actually agree that your method is better. I merely disagree that
> the spec as is allows it. Another quote:
>
> "When a client has no layout on a file, it MUST present an open stateid...".
>
> The problem is that the spec is currently not clear about how the
> forgetful model interacts with sending openstateids, particularly with
> multiple parallel LAYOUTGETs. If a server implementor assumes the
> client can silently forget its layouts, then later send a
> LAYOUTGET(openstateid), which seems to be what the spec currently
> says, then we get potential problems that can only be avoided if the
> client serializes the LAYOUTGET(openstate) calls.
>
> If you want your behavior, where the client is expected to remember
> the layout stateid even after forgetting the layouts, I think an
> errata is needed.

Fair enough.
As I heard no other opinions and we two agree on this,
I'll take it on myself to propose one.

Benny

>
> Fred
>
>
>>>
>>>
>>> Fred
>>>
>>>> Benny
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>>>> the body of a message to [email protected]
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>>
>> _______________________________________________
>> nfsv4 mailing list
>> [email protected]
>> https://www.ietf.org/mailman/listinfo/nfsv4
>>

2010-11-12 08:49:27

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 20/22] pnfs-submit refactor pnfs_layoutcommit_setup

From: Andy Adamson <[email protected]>

Prepare for adding the layoutcommit operation to the close compound on
return-on-close.
Have pnfs_layoutcommit_setup handle all the layoutcommit operation setup.
Have pnfs_layoutcommit_inode handle all the layoutcommit compound setup.

Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 106 ++++++++++++++++++++++++++-------------------------------
fs/nfs/pnfs.h | 3 ++
2 files changed, 51 insertions(+), 58 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index b3f1946..f2ec773 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1245,98 +1245,88 @@ pnfs_try_to_commit(struct nfs_write_data *data,
}

/*
- * Set up the argument/result storage required for the RPC call.
+ * Set up the arguments required for the RPC call.
*/
-static int
+void
pnfs_layoutcommit_setup(struct inode *inode,
- struct nfs4_layoutcommit_data *data,
- loff_t write_begin_pos, loff_t write_end_pos)
+ struct nfs4_layoutcommit_op_args *args, bool use_cred)
{
- struct nfs_server *nfss = NFS_SERVER(inode);
- int result = 0;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ loff_t write_begin_pos, write_end_pos;

dprintk("--> %s\n", __func__);

- data->inode = inode;
- data->args.fh = NFS_FH(inode);
- data->args.op.layout_type = nfss->pnfs_curr_ld->id;
- data->res.fattr = &data->fattr;
- nfs_fattr_init(&data->fattr);
+ assert_spin_locked(&inode->i_lock);

- /* Set values from inode so it can be reset
+ /*
+ * Clear layoutcommit properties in the inode so
+ * new layoutcommit info can be generated
*/
- data->args.op.range.iomode = IOMODE_RW;
- data->args.op.range.offset = write_begin_pos;
- data->args.op.range.length = write_end_pos - write_begin_pos + 1;
- data->args.op.lastbytewritten = min(write_end_pos,
- i_size_read(inode) - 1);
- data->args.bitmask = nfss->attr_bitmask;
- data->res.server = nfss;
-
- dprintk("<-- %s Status %d\n", __func__, result);
- return result;
+ write_begin_pos = nfsi->layout->write_begin_pos;
+ write_end_pos = nfsi->layout->write_end_pos;
+ nfsi->layout->write_begin_pos = 0;
+ nfsi->layout->write_end_pos = 0;
+ /* In the true case, caller has passed on the cred to another struct */
+ if (use_cred == false)
+ put_rpccred(nfsi->layout->cred);
+ nfsi->layout->cred = NULL;
+ __clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags);
+ /* FIXME: figure out what to do here */
+ memcpy(args->stateid.data, nfsi->layout->stateid.data,
+ NFS4_STATEID_SIZE);
+
+ args->layout_type = NFS_SERVER(inode)->pnfs_curr_ld->id;
+
+ args->range.iomode = IOMODE_RW;
+ args->range.offset = write_begin_pos;
+ args->range.length = write_end_pos - write_begin_pos + 1;
+ args->lastbytewritten = min(write_end_pos, i_size_read(inode) - 1);
}

-/* Issue a async layoutcommit for an inode.
+/*
+ * Issue a async layoutcommit for an inode.
+ * Returns 0 on success, negative value for error
*/
int
pnfs_layoutcommit_inode(struct inode *inode, int sync)
{
struct nfs4_layoutcommit_data *data;
- struct nfs_inode *nfsi = NFS_I(inode);
- loff_t write_begin_pos;
- loff_t write_end_pos;
-
- int status = 0;
+ int status = -ENOMEM;

dprintk("%s Begin (sync:%d)\n", __func__, sync);

- BUG_ON(!has_layout(nfsi));
-
data = kzalloc(sizeof(*data), GFP_NOFS);
if (!data)
- return -ENOMEM;
+ goto out;

+ status = 0;
spin_lock(&inode->i_lock);
- if (!layoutcommit_needed(nfsi)) {
+ if (!layoutcommit_needed(NFS_I(inode))) {
spin_unlock(&inode->i_lock);
- goto out_free;
+ kfree(data);
+ goto out;
}
+ /* Use the layoutcommit cred */
+ data->args.cred = NFS_I(inode)->layout->cred;

- /* Clear layoutcommit properties in the inode so
- * new lc info can be generated
- */
- write_begin_pos = nfsi->layout->write_begin_pos;
- write_end_pos = nfsi->layout->write_end_pos;
- data->args.cred = nfsi->layout->cred;
- nfsi->layout->write_begin_pos = 0;
- nfsi->layout->write_end_pos = 0;
- nfsi->layout->cred = NULL;
- __clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags);
- memcpy(data->args.op.stateid.data, nfsi->layout->stateid.data,
- NFS4_STATEID_SIZE);
+ /* Set up layoutcommit operation args */
+ pnfs_layoutcommit_setup(inode, &data->args.op, true);

/* Reference for layoutcommit matched in pnfs_layoutcommit_release */
get_layout_hdr(NFS_I(inode)->layout);
-
spin_unlock(&inode->i_lock);

- /* Set up layout commit args */
- status = pnfs_layoutcommit_setup(inode, data, write_begin_pos,
- write_end_pos);
- if (status) {
- /* The layout driver failed to setup the layoutcommit */
- put_rpccred(data->args.cred);
- put_layout_hdr(inode);
- goto out_free;
- }
+ data->args.fh = NFS_FH(inode);
+ data->args.bitmask = NFS_SERVER(inode)->attr_bitmask;
+
+ data->inode = inode;
+ data->res.server = NFS_SERVER(inode);
+ data->res.fattr = &data->fattr;
+ nfs_fattr_init(&data->fattr);
status = nfs4_proc_layoutcommit(data, sync);
out:
dprintk("%s end (err:%d)\n", __func__, status);
return status;
-out_free:
- kfree(data);
- goto out;
}

/*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 916a057..4812288 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -235,6 +235,9 @@ void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
int notify_bit, atomic_t *notify_count,
struct list_head *tmp_list);
bool pnfs_roc(struct nfs4_closedata *data);
+void pnfs_layoutcommit_setup(struct inode *inode,
+ struct nfs4_layoutcommit_op_args *args,
+ bool use_cred);

static inline bool
has_layout(struct nfs_inode *nfsi)
--
1.7.2.1


2010-11-15 20:40:43

by Fred Isaman

[permalink] [raw]
Subject: Re: [nfsv4] [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

On Mon, Nov 15, 2010 at 2:19 PM, Boaz Harrosh <[email protected]> wrote:
> On 11/15/2010 07:53 PM, Fred Isaman wrote:
>> On Mon, Nov 15, 2010 at 11:17 AM, Benny Halevy <[email protected]> wrote:
>>> On 2010-11-15 16:51, Fred Isaman wrote:
>>>> On Sun, Nov 14, 2010 at 10:43 AM, Benny Halevy <[email protected]> wrote:
>>>>>
>>>>> Using the open stateid after forgetting the layout could be a protocol bug,
>>>>> or at least it falls into undefined territories.
>>>>>
>>>>> The RFC says:
>>>>>
>>>>>   The loga_stateid field specifies a valid stateid.  If a layout is not
>>>>>   currently held by the client, the loga_stateid field represents a
>>>>>   stateid reflecting the correspondingly valid open, byte-range lock,
>>>>>   or delegation stateid.  Once a layout is held on the file by the
>>>>>   client, the loga_stateid field MUST be a stateid as returned from a
>>>>>   previous LAYOUTGET or LAYOUTRETURN operation or provided by a
>>>>>   CB_LAYOUTRECALL operation (see Section 12.5.3).
>>>>>
>>>>> So the question is does the text above refer to the client view of the state or to
>>>>> the server's view.
>>>>> In other words, with the forgetful client model, when the client unilaterally forgets
>>>>> the layout without letting the server know about it (no LAYOUTRETURN was sent),
>>>>> does it mean "a layout is not currently held by the client"?
>>>>>
>>>>
>>>> I would argue that yes, this is in fact what it means.
>>>>
>>>> It seems the server has two options when confronted with an
>>>> openstateid.  Either interpret this as a declaration by the client
>>>> that it has forgotten all previous layouts and behave appropriately
>>>> (wipe any layout state assigned to the file and create a new
>>>> layoutstateid), or assume this is part of parallel spew of
>>>> LAYOUTGET(openstateid) and try to use an existing layout state with
>>>> the appropriate (possibly not one) seqid.  I argue that, as the spec
>>>> stands, the second option is not really a choice, because the first
>>>> option exists.  If a client using the second option encounters a
>>>> server using the first, bad things happen.  The client will issue
>>>> multiple LAYOUTGET(openstateids), the server will, upon seeing each,
>>>> discard any previous state and return a new state with seqid=1, with
>>>
>>> Is this the specified behavior?
>>>
>>>> the final valid state being that of whichever one was processed last.
>>>> The client will see all the OK returns, and not have any easy method
>>>> of determining which is the one that the server considers valid.
>>>>
>>>> Thus I claim that, because of the forgetful model, the client must
>>>> serialize its LAYOUTGET(openstateid) calls.
>>>>
>>>
>>> I disagree. LAYOUTGET(openstateid) should be no different than
>>> any other layout stateid and the client should be able to send multiple
>>> such LAYOUTGETs *initially* (and only initially).  The server can process
>>> these as any other LAYOUTGET with the sequenceid rules assuming seqid==0
>>> (which is disallowed otherwise)
>>>
>>>>> The server will see a LAYOUTGET with an open/lock/deleg stateid in this case
>>>>> while it still thinks that the client is holding a layout.
>>>>> Since this could normally happen if the client sends multiple LAYOUTGETs in
>>>>> parallel before it received any layout stateid the server should allow it
>>>>> within the VALID_SEQID_RANGE constraints (see 12.5.5.2.1.4, although it is
>>>>> not explicitly called out there), otherwise, it seems like the server is supposed
>>>>> to return NFS4ERR_OLD_STATEID.
>>>>>
>>>>> Strictly reading the spec, the client should use the most recent layout stateid
>>>>> even in the forgetful model, until it gets a LAYOUTRETURN reply with lrs_present==false
>>>>> or until it replies NFS4ERR_NOMATCHING_LAYOUT to CB_LAYOUTRECALL with
>>>>> clora_iomode==LAYOUTIOMODE4_ANY or other values where the client never dropped
>>>>> a layout (did I say recently how much I hate the forgetful model which introduces
>>>>> more corner cases rather than simplifying the protocol as it was supposed to do? ;-)
>>>>>
>>>>
>>>> Strict reading again depends on whose point of view, client or server...
>>>>
>>>> "Once a client has no more layouts on a file, the layout stateid is no
>>>> longer valid and MUST NOT be used.  Any attempt to use such a layout
>>>> stateid will result in NFS4ERR_BAD_STATEID."
>>>
>>> In NFSv4.1 the server decides about stateids. It's not up to the client
>>> to throw away the stateid and revert to the initial stateid.
>>> It must send an appropriate LAYOUTRETURN and get lrs_present==false
>>> to do that and then it can be sure its layout state for the file is synchronized
>>> with the server's.
>>>
>>> Benny
>>>
>>
>> I actually agree that your method is better.  I merely disagree that
>> the spec as is allows it.  Another quote:
>>
>> "When a client has no layout on a file, it MUST present an open stateid...".
>>
>> The problem is that the spec is currently not clear about how the
>> forgetful model interacts with sending openstateids, particularly with
>> multiple parallel LAYOUTGETs.  If a server implementor assumes the
>> client can silently forget its layouts, then later send a
>> LAYOUTGET(openstateid),
>
> No the spec does not say that, and the Server is not to assume a
> forgetful client ever.


The spec does say that:

"It may be useful for clients to "forget" details about what layouts
and ranges the client actually has."

and

"When a client has no layout on a file, it MUST present an open stateid..."


> The first and only time the Server is to encounter
> a forgetful client is when NOMATCHING_LAYOUT is returned from a callback.
> Until then the Server gave out a layout and assumes the client has it.
> If a client is to send an LAYOUTGET(openstate) outside the VALID_SEQID_RANGE
> it will be returned an error. So the forgetful client cannot be all that
> forgetful it must remember its stateid, though it is free not to use
> these old segments and ask for new ones (And return NOMATCHING on recalls).
>

Now where in the spec does it say that? (Note I agree it *should* say
something similar to your statement, but I don't see where it does
now).

Fred

> I agree with you that you have exposed the exact logical contradiction
> of the forgetful model, And why it is stupid really. (The faster we are
> to return NOMATCHING to the "forgetful model" the better off we'll be ;-))
>
> which seems to be what the spec currently
>> says, then we get potential problems that can only be avoided if the
>> client serializes the LAYOUTGET(openstate) calls.
>>
>
> Given above, that the Server cannot do that, hence the client is now
> able to actually take advantage of the concurrency inherited in the STD
> and the VALID_SEQID_RANGE model.
>
>> If you want your behavior, where the client is expected to remember
>> the layout stateid even after forgetting the layouts, I think an
>> errata is needed.
>>
>
> I don't think so. Once you realize that there is only a single point
> in time the server "assumes" forgetfulness, .i.e at recall=>NOMATCHING
> that picture changes.
>
> Boaz
>> Fred
>>
>>
>>>>
>>>>
>>>> Fred
>>>>
>>>>> Benny
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

2010-11-14 11:50:22

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

On 2010-11-14 13:44, Benny Halevy wrote:
> On 2010-11-13 11:11, Trond Myklebust wrote:
>> On Fri, 2010-11-12 at 03:48 -0500, Fred Isaman wrote:
>>> + switch (cb_args->cbl_recall_type) {
>>> + case RETURN_ALL:
>>> + return true;
>>> + case RETURN_FSID:
>>> + return !memcmp(&NFS_SERVER(ino)->fsid, &cb_args->cbl_fsid,
>>> + sizeof(struct nfs_fsid));
>>> + case RETURN_FILE:
>>> + return (ino == cb_info->pcl_ino) &&
>>> + should_free_lseg(range, &cb_args->cbl_range);
>>> + default:
>>> + BUG();
>>
>> Why should we BUG() just because the server is screwed up? That's not a
>> client bug.
>>
>
> Agreed. This should be handled earlier in nfs4_callback_layoutrecall
> or do_callback_layoutrecall so that we can return NFS4ERR_INVALID.
>

Actually NFS4ERR_BADXDR is the right error to return for a
"value within the input stream that is not valid for the enum"

Benny

2010-11-16 09:54:41

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [nfsv4] [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

On 11/15/2010 10:40 PM, Fred Isaman wrote:
> On Mon, Nov 15, 2010 at 2:19 PM, Boaz Harrosh <[email protected]> wrote:
>> On 11/15/2010 07:53 PM, Fred Isaman wrote:
>>> On Mon, Nov 15, 2010 at 11:17 AM, Benny Halevy <[email protected]> wrote:
>>>> On 2010-11-15 16:51, Fred Isaman wrote:
>>>>> On Sun, Nov 14, 2010 at 10:43 AM, Benny Halevy <[email protected]> wrote:
>>>>>>
>>>>>> Using the open stateid after forgetting the layout could be a protocol bug,
>>>>>> or at least it falls into undefined territories.
>>>>>>
>>>>>> The RFC says:
>>>>>>
>>>>>> The loga_stateid field specifies a valid stateid. If a layout is not
>>>>>> currently held by the client, the loga_stateid field represents a
>>>>>> stateid reflecting the correspondingly valid open, byte-range lock,
>>>>>> or delegation stateid. Once a layout is held on the file by the
>>>>>> client, the loga_stateid field MUST be a stateid as returned from a
>>>>>> previous LAYOUTGET or LAYOUTRETURN operation or provided by a
>>>>>> CB_LAYOUTRECALL operation (see Section 12.5.3).
>>>>>>
>>>>>> So the question is does the text above refer to the client view of the state or to
>>>>>> the server's view.
>>>>>> In other words, with the forgetful client model, when the client unilaterally forgets
>>>>>> the layout without letting the server know about it (no LAYOUTRETURN was sent),
>>>>>> does it mean "a layout is not currently held by the client"?
>>>>>>
>>>>>
>>>>> I would argue that yes, this is in fact what it means.
>>>>>
>>>>> It seems the server has two options when confronted with an
>>>>> openstateid. Either interpret this as a declaration by the client
>>>>> that it has forgotten all previous layouts and behave appropriately
>>>>> (wipe any layout state assigned to the file and create a new
>>>>> layoutstateid), or assume this is part of parallel spew of
>>>>> LAYOUTGET(openstateid) and try to use an existing layout state with
>>>>> the appropriate (possibly not one) seqid. I argue that, as the spec
>>>>> stands, the second option is not really a choice, because the first
>>>>> option exists. If a client using the second option encounters a
>>>>> server using the first, bad things happen. The client will issue
>>>>> multiple LAYOUTGET(openstateids), the server will, upon seeing each,
>>>>> discard any previous state and return a new state with seqid=1, with
>>>>
>>>> Is this the specified behavior?
>>>>
>>>>> the final valid state being that of whichever one was processed last.
>>>>> The client will see all the OK returns, and not have any easy method
>>>>> of determining which is the one that the server considers valid.
>>>>>
>>>>> Thus I claim that, because of the forgetful model, the client must
>>>>> serialize its LAYOUTGET(openstateid) calls.
>>>>>
>>>>
>>>> I disagree. LAYOUTGET(openstateid) should be no different than
>>>> any other layout stateid and the client should be able to send multiple
>>>> such LAYOUTGETs *initially* (and only initially). The server can process
>>>> these as any other LAYOUTGET with the sequenceid rules assuming seqid==0
>>>> (which is disallowed otherwise)
>>>>
>>>>>> The server will see a LAYOUTGET with an open/lock/deleg stateid in this case
>>>>>> while it still thinks that the client is holding a layout.
>>>>>> Since this could normally happen if the client sends multiple LAYOUTGETs in
>>>>>> parallel before it received any layout stateid the server should allow it
>>>>>> within the VALID_SEQID_RANGE constraints (see 12.5.5.2.1.4, although it is
>>>>>> not explicitly called out there), otherwise, it seems like the server is supposed
>>>>>> to return NFS4ERR_OLD_STATEID.
>>>>>>
>>>>>> Strictly reading the spec, the client should use the most recent layout stateid
>>>>>> even in the forgetful model, until it gets a LAYOUTRETURN reply with lrs_present==false
>>>>>> or until it replies NFS4ERR_NOMATCHING_LAYOUT to CB_LAYOUTRECALL with
>>>>>> clora_iomode==LAYOUTIOMODE4_ANY or other values where the client never dropped
>>>>>> a layout (did I say recently how much I hate the forgetful model which introduces
>>>>>> more corner cases rather than simplifying the protocol as it was supposed to do? ;-)
>>>>>>
>>>>>
>>>>> Strict reading again depends on whose point of view, client or server...
>>>>>
>>>>> "Once a client has no more layouts on a file, the layout stateid is no
>>>>> longer valid and MUST NOT be used. Any attempt to use such a layout
>>>>> stateid will result in NFS4ERR_BAD_STATEID."
>>>>
>>>> In NFSv4.1 the server decides about stateids. It's not up to the client
>>>> to throw away the stateid and revert to the initial stateid.
>>>> It must send an appropriate LAYOUTRETURN and get lrs_present==false
>>>> to do that and then it can be sure its layout state for the file is synchronized
>>>> with the server's.
>>>>
>>>> Benny
>>>>
>>>
>>> I actually agree that your method is better. I merely disagree that
>>> the spec as is allows it. Another quote:
>>>
>>> "When a client has no layout on a file, it MUST present an open stateid...".
>>>
>>> The problem is that the spec is currently not clear about how the
>>> forgetful model interacts with sending openstateids, particularly with
>>> multiple parallel LAYOUTGETs. If a server implementor assumes the
>>> client can silently forget its layouts, then later send a
>>> LAYOUTGET(openstateid),
>>
>> No the spec does not say that, and the Server is not to assume a
>> forgetful client ever.
>
>
> The spec does say that:
>
> "It may be useful for clients to "forget" details about what layouts
> and ranges the client actually has."
>

Rrrr, please read.
" "forget" details about what layouts and ranges"
Not about the stateid it has. Only the "details"

> and
>
> "When a client has no layout on a file, it MUST present an open stateid..."

It's not clear cut lawyer proof. But the way I understand this sentence is
that "If the client does not have a layout state eg. he did not request one
or returned all previous layouts"

[It is easy to take random bits of text bring them together out of context
and make them sound like anything. But it does not change the intent
and it does not mean we want to make stupid code because we can, just
because the STD was wrong. (Or contradictory)]

>
>
>> The first and only time the Server is to encounter
>> a forgetful client is when NOMATCHING_LAYOUT is returned from a callback.
>> Until then the Server gave out a layout and assumes the client has it.
>> If a client is to send an LAYOUTGET(openstate) outside the VALID_SEQID_RANGE
>> it will be returned an error. So the forgetful client cannot be all that
>> forgetful it must remember its stateid, though it is free not to use
>> these old segments and ask for new ones (And return NOMATCHING on recalls).
>>
>
> Now where in the spec does it say that? (Note I agree it *should* say
> something similar to your statement, but I don't see where it does
> now).
>

Which part of "That" did you mean. I had 7 lines of text in there?

The STD is full of sections and explanations about VALID_SEQID_RANGE
and the propagation of state and possible parallelism of state.

The only place it mentions "forgetful" is in the NOMATCHING_LAYOUT
return to a callback. (And the fact that a Client might not use
layouts it requested).

So I'm trying to put the only logical meaning and intent I can find
in the text. Otherwise why are there full chapters of state and state
propagation, if the client can at any moment reset the process and
start from scratch. Surely if it "could" there would be a full section
on what a Server is to do when in mid flight the Client decides to go
back to the beginning.

Anywhere in the text it does not say the Client can lose state; all
it says is the small section you quoted above.
""forget" details about"

details, not state.

Boaz
> Fred
>
>> I agree with you that you have exposed the exact logical contradiction
>> of the forgetful model, And why it is stupid really. (The faster we are
>> to return NOMATCHING to the "forgetful model" the better off we'll be ;-))
>>
>> which seems to be what the spec currently
>>> says, then we get potential problems that can only be avoided if the
>>> client serializes the LAYOUTGET(openstate) calls.
>>>
>>
>> Given above, that the Server cannot do that, hence the client is now
>> able to actually take advantage of the concurrency inherited in the STD
>> and the VALID_SEQID_RANGE model.
>>
>>> If you want your behavior, where the client is expected to remember
>>> the layout stateid even after forgetting the layouts, I think an
>>> errata is needed.
>>>
>>
>> I don't think so. Once you realize that there is only a single point
>> in time the server "assumes" forgetfulness, .i.e at recall=>NOMATCHING
>> that picture changes.
>>
>> Boaz
>>> Fred
>>>
>>>
>>>>>
>>>>>
>>>>> Fred
>>>>>
>>>>>> Benny
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>


2010-11-16 11:13:03

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [nfsv4] [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

On 11/16/2010 11:54 AM, Boaz Harrosh wrote:
>
> It's not clear cut lawyer proof. But the way I understand this sentence is
> that "If the client does not have a layout state eg. he did not request one
> or returned all previous layouts"
>

To clarify, above "he did not request one"
should be "did not successfully receive one" with a new layout-stateid
of course.

Thanks
Boaz

2010-11-15 14:28:39

by Fred Isaman

[permalink] [raw]
Subject: Re: [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

On Sun, Nov 14, 2010 at 6:50 AM, Benny Halevy <[email protected]> wrote:
> On 2010-11-14 13:44, Benny Halevy wrote:
>> On 2010-11-13 11:11, Trond Myklebust wrote:
>>> On Fri, 2010-11-12 at 03:48 -0500, Fred Isaman wrote:
>>>> +   switch (cb_args->cbl_recall_type) {
>>>> +   case RETURN_ALL:
>>>> +           return true;
>>>> +   case RETURN_FSID:
>>>> +           return !memcmp(&NFS_SERVER(ino)->fsid, &cb_args->cbl_fsid,
>>>> +                          sizeof(struct nfs_fsid));
>>>> +   case RETURN_FILE:
>>>> +           return (ino == cb_info->pcl_ino) &&
>>>> +                   should_free_lseg(range, &cb_args->cbl_range);
>>>> +   default:
>>>> +           BUG();
>>>
>>> Why should we BUG() just because the server is screwed up? That's not a
>>> client bug.
>>>
>>
>> Agreed.  This should be handled earlier in nfs4_callback_layoutrecall
>> or do_callback_layoutrecall so that we can return NFS4ERR_INVALID.
>>
>
> Actually NFS4ERR_BADXDR is the right error to return for a
> "value within the input stream that is not valid for the enum"
>
> Benny

OK.

Fred

> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

2010-11-15 15:07:28

by Fred Isaman

[permalink] [raw]
Subject: Re: [PATCH 08/22] SQUASHME: allow cb_sequence changes to compile without v4.1

On Sun, Nov 14, 2010 at 7:05 AM, Benny Halevy <[email protected]> wrote:
> On 2010-11-12 10:48, Fred Isaman wrote:
>> Signed-off-by: Fred Isaman <[email protected]>
>> ---
>>  fs/nfs/callback.h      |   26 ++++++++++++++++++++++++++
>>  fs/nfs/callback_proc.c |    6 ------
>>  fs/nfs/callback_xdr.c  |    3 +--
>>  fs/nfs/internal.h      |    4 ++++
>>  4 files changed, 31 insertions(+), 8 deletions(-)
>>
>> diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
>> index 89fee05..0b1f3c4 100644
>> --- a/fs/nfs/callback.h
>> +++ b/fs/nfs/callback.h
>> @@ -8,6 +8,8 @@
>>  #ifndef __LINUX_FS_NFS_CALLBACK_H
>>  #define __LINUX_FS_NFS_CALLBACK_H
>>
>> +#include "internal.h"
>> +
>>  #define NFS4_CALLBACK 0x40000000
>>  #define NFS4_CALLBACK_XDRSIZE 2048
>>  #define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE)
>> @@ -158,6 +160,30 @@ extern unsigned nfs4_callback_layoutrecall(
>>       struct cb_layoutrecallargs *args,
>>       void *dummy, struct cb_process_state *cps);
>>
>> +static inline void put_session_client(struct nfs4_session *session)
>> +{
>> +     if (session)  /* matched by cb_sequence find_client_with_session */
>
> nit: comment out of scope.  belongs to the call site, not here...
>
>> +             nfs_put_client(session->clp);
>> +}
>> +
>> +static inline struct nfs_client *
>> +find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
>> +{
>> +     return cps->session ? cps->session->clp : nfs_find_client(addr, 4);
>> +}
>> +
>> +#else
>
> nit: /* CONFIG_NFS_V4_1 */ comment missing
>
> (I'll fix both in my tree)
>
> Benny

OK, thanks.

Fred

>
>> +
>> +static inline struct nfs_client *
>> +find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
>> +{
>> +     return nfs_find_client(addr, 4);
>> +}
>> +
>> +static inline void put_session_client(struct nfs4_session *session)
>> +{
>> +}
>> +
>>  #endif /* CONFIG_NFS_V4_1 */
>>
>>  extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
>> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
>> index 2e62155..d02997a 100644
>> --- a/fs/nfs/callback_proc.c
>> +++ b/fs/nfs/callback_proc.c
>> @@ -21,12 +21,6 @@
>>  #define NFSDBG_FACILITY NFSDBG_CALLBACK
>>  #endif
>>
>> -static struct nfs_client *
>> -find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
>> -{
>> -     return cps->session ? cps->session->clp : nfs_find_client(addr, 4);
>> -}
>> -
>>  __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
>>                            struct cb_getattrres *res,
>>                            struct cb_process_state *cps)
>> diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
>> index 1650ab0..01688ce 100644
>> --- a/fs/nfs/callback_xdr.c
>> +++ b/fs/nfs/callback_xdr.c
>> @@ -770,8 +770,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
>>
>>       *hdr_res.status = status;
>>       *hdr_res.nops = htonl(nops);
>> -     if (cps.session) /* matched by cb_sequence find_client_with_session */
>> -             nfs_put_client(cps.session->clp);
>> +     put_session_client(cps.session);
>>       dprintk("%s: done, status = %u\n", __func__, ntohl(status));
>>       return rpc_success;
>>  }
>> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
>> index 755e555..6f14089 100644
>> --- a/fs/nfs/internal.h
>> +++ b/fs/nfs/internal.h
>> @@ -2,6 +2,8 @@
>>   * NFS internal definitions
>>   */
>>
>> +#ifndef __LINUX_FS_NFS_INTERNAL_H
>> +#define __LINUX_FS_NFS_INTERNAL_H
>>  #include "nfs4_fs.h"
>>  #include <linux/mount.h>
>>  #include <linux/security.h>
>> @@ -415,3 +417,5 @@ static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client
>>               return rpc_restart_call_prepare(task);
>>       return rpc_restart_call(task);
>>  }
>> +
>> +#endif /* __LINUX_FS_NFS_INTERNAL_H */
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>