Return-Path: Received: from mx2.netapp.com ([216.240.18.37]:10522 "EHLO mx2.netapp.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752737Ab1DTTx7 convert rfc822-to-8bit (ORCPT ); Wed, 20 Apr 2011 15:53:59 -0400 Subject: Re: [RFC 03/27] pnfs: layoutreturn From: Trond Myklebust To: Benny Halevy Cc: linux-nfs@vger.kernel.org In-Reply-To: <1303320392-21068-1-git-send-email-bhalevy@panasas.com> References: <4DAF0DE1.6020609@panasas.com> <1303320392-21068-1-git-send-email-bhalevy@panasas.com> Content-Type: text/plain; charset="UTF-8" Date: Wed, 20 Apr 2011 15:53:41 -0400 Message-ID: <1303329221.23206.22.camel@lade.trondhjem.org> Sender: linux-nfs-owner@vger.kernel.org List-ID: MIME-Version: 1.0 On Wed, 2011-04-20 at 20:26 +0300, Benny Halevy wrote: > Signed-off-by: Alexandros Batsakis > Signed-off-by: Andy Adamson > Signed-off-by: Andy Adamson > Signed-off-by: Dean Hildebrand > Signed-off-by: Fred Isaman > Signed-off-by: Fred Isaman > Signed-off-by: Marc Eshel > Signed-off-by: Zhang Jingwang > Signed-off-by: Benny Halevy > --- > fs/nfs/inode.c | 3 +- > fs/nfs/nfs4proc.c | 97 ++++++++++++++++++++++++++++++++++++++ > fs/nfs/nfs4xdr.c | 118 ++++++++++++++++++++++++++++++++++++++++++++-- > fs/nfs/pnfs.c | 96 ++++++++++++++++++++++++++++++++++++++ > fs/nfs/pnfs.h | 22 +++++++++ > include/linux/nfs4.h | 1 + > include/linux/nfs_xdr.h | 23 +++++++++ > 7 files changed, 353 insertions(+), 7 deletions(-) > > diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c > index 57bb31a..73a2529 100644 > --- a/fs/nfs/inode.c > +++ b/fs/nfs/inode.c > @@ -1424,9 +1424,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) > */ > void nfs4_evict_inode(struct inode *inode) > { > - pnfs_destroy_layout(NFS_I(inode)); > + pnfs_return_layout(inode, NULL, true); Why does this want to come before the call to truncate_inode_pages()? Is there any reason not to put pnfs_return_layout() and pnfs_destroy_layout into a single helper here? > truncate_inode_pages(&inode->i_data, 0); > end_writeback(inode); > + pnfs_destroy_layout(NFS_I(inode)); > /* If we are holding a delegation, return it! */ > nfs_inode_return_delegation_noreclaim(inode); > /* First call standard NFS clear_inode() code */ > diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c > index 9bf41ea..b03defb 100644 > --- a/fs/nfs/nfs4proc.c > +++ b/fs/nfs/nfs4proc.c > @@ -5662,6 +5662,103 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) > return status; > } > > +static void > +nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) > +{ > + struct nfs4_layoutreturn *lrp = calldata; > + > + dprintk("--> %s\n", __func__); > + if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, > + &lrp->res.seq_res, 0, task)) > + return; > + rpc_call_start(task); > +} > + > +static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) > +{ > + struct nfs4_layoutreturn *lrp = calldata; > + struct nfs_server *server; > + > + dprintk("--> %s\n", __func__); > + > + if (!nfs4_sequence_done(task, &lrp->res.seq_res)) > + return; > + > + if (lrp->args.return_type == RETURN_FILE) > + server = NFS_SERVER(lrp->args.inode); > + else > + server = NULL; > + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { > + nfs_restart_rpc(task, lrp->clp); > + return; > + } > + if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE)) { > + struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; > + > + spin_lock(&lo->plh_inode->i_lock); > + if (lrp->res.lrs_present) > + pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); > + else > + BUG_ON(!list_empty(&lo->plh_segs)); > + spin_unlock(&lo->plh_inode->i_lock); > + } > + dprintk("<-- %s\n", __func__); > +} > + > +static void nfs4_layoutreturn_release(void *calldata) > +{ > + struct nfs4_layoutreturn *lrp = calldata; > + > + dprintk("--> %s return_type %d\n", __func__, lrp->args.return_type); > + if (lrp->args.return_type == RETURN_FILE) { > + struct inode *ino = lrp->args.inode; > + struct pnfs_layout_hdr *lo = NFS_I(ino)->layout; > + > + put_layout_hdr(lo); > + } > + kfree(calldata); > + dprintk("<-- %s\n", __func__); > +} > + > +static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { > + .rpc_call_prepare = nfs4_layoutreturn_prepare, > + .rpc_call_done = nfs4_layoutreturn_done, > + .rpc_release = nfs4_layoutreturn_release, > +}; > + > +int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync) Why the 'issync' parameter? > +{ > + struct rpc_task *task; > + struct rpc_message msg = { > + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], > + .rpc_argp = &lrp->args, > + .rpc_resp = &lrp->res, > + }; > + struct rpc_task_setup task_setup_data = { > + .rpc_client = lrp->clp->cl_rpcclient, > + .rpc_message = &msg, > + .callback_ops = &nfs4_layoutreturn_call_ops, > + .callback_data = lrp, > + .flags = RPC_TASK_ASYNC, > + }; > + int status = 0; > + > + dprintk("--> %s\n", __func__); > + task = rpc_run_task(&task_setup_data); > + if (IS_ERR(task)) > + return PTR_ERR(task); > + if (!issync) > + goto out; > + status = nfs4_wait_for_completion_rpc_task(task); > + if (status != 0) > + goto out; > + status = task->tk_status; > +out: > + dprintk("<-- %s\n", __func__); > + rpc_put_task(task); > + return status; > +} > + > static int > _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) > { > diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c > index dddfb57..53ea3e5 100644 > --- a/fs/nfs/nfs4xdr.c > +++ b/fs/nfs/nfs4xdr.c > @@ -338,7 +338,12 @@ static int nfs4_stat_to_errno(int); > 1 /* layoutupdate4 layout type */ + \ > 1 /* NULL filelayout layoutupdate4 payload */) > #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) > - > +#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ > + encode_stateid_maxsz + \ > + 1 /* FIXME: opaque lrf_body always empty at > + *the moment */) > +#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ > + 1 + decode_stateid_maxsz) > #else /* CONFIG_NFS_V4_1 */ > #define encode_sequence_maxsz 0 > #define decode_sequence_maxsz 0 > @@ -760,7 +765,14 @@ static int nfs4_stat_to_errno(int); > decode_putfh_maxsz + \ > decode_layoutcommit_maxsz + \ > decode_getattr_maxsz) > - > +#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ > + encode_sequence_maxsz + \ > + encode_putfh_maxsz + \ > + encode_layoutreturn_maxsz) > +#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ > + decode_sequence_maxsz + \ > + decode_putfh_maxsz + \ > + decode_layoutreturn_maxsz) > > const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + > compound_encode_hdr_maxsz + > @@ -1890,6 +1902,37 @@ encode_layoutcommit(struct xdr_stream *xdr, > hdr->replen += decode_layoutcommit_maxsz; > return 0; > } > + > +static void > +encode_layoutreturn(struct xdr_stream *xdr, > + const struct nfs4_layoutreturn_args *args, > + struct compound_hdr *hdr) > +{ > + nfs4_stateid stateid; > + __be32 *p; > + > + p = reserve_space(xdr, 20); > + *p++ = cpu_to_be32(OP_LAYOUTRETURN); > + *p++ = cpu_to_be32(args->reclaim); > + *p++ = cpu_to_be32(args->layout_type); > + *p++ = cpu_to_be32(args->range.iomode); > + *p = cpu_to_be32(args->return_type); > + if (args->return_type == RETURN_FILE) { > + p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); > + p = xdr_encode_hyper(p, args->range.offset); > + p = xdr_encode_hyper(p, args->range.length); > + spin_lock(&args->inode->i_lock); > + memcpy(stateid.data, NFS_I(args->inode)->layout->plh_stateid.data, > + NFS4_STATEID_SIZE); > + spin_unlock(&args->inode->i_lock); > + p = xdr_encode_opaque_fixed(p, &stateid.data, > + NFS4_STATEID_SIZE); > + p = reserve_space(xdr, 4); > + *p = cpu_to_be32(0); > + } > + hdr->nops++; > + hdr->replen += decode_layoutreturn_maxsz; > +} > #endif /* CONFIG_NFS_V4_1 */ > > /* > @@ -2707,9 +2750,9 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, > /* > * Encode LAYOUTCOMMIT request > */ > -static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, > - struct xdr_stream *xdr, > - struct nfs4_layoutcommit_args *args) > +static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, > + struct xdr_stream *xdr, > + struct nfs4_layoutcommit_args *args) > { > struct compound_hdr hdr = { > .minorversion = nfs4_xdr_minorversion(&args->seq_args), > @@ -2721,7 +2764,24 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, > encode_layoutcommit(xdr, args, &hdr); > encode_getfattr(xdr, args->bitmask, &hdr); > encode_nops(&hdr); > - return 0; > +} > + > +/* > + * Encode LAYOUTRETURN request > + */ > +static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, > + struct xdr_stream *xdr, > + struct nfs4_layoutreturn_args *args) > +{ > + struct compound_hdr hdr = { > + .minorversion = nfs4_xdr_minorversion(&args->seq_args), > + }; > + > + encode_compound_hdr(xdr, req, &hdr); > + encode_sequence(xdr, &args->seq_args, &hdr); > + encode_putfh(xdr, NFS_FH(args->inode), &hdr); > + encode_layoutreturn(xdr, args, &hdr); > + encode_nops(&hdr); > } > #endif /* CONFIG_NFS_V4_1 */ > > @@ -5202,6 +5262,27 @@ out_overflow: > return -EIO; > } > > +static int decode_layoutreturn(struct xdr_stream *xdr, > + struct nfs4_layoutreturn_res *res) > +{ > + __be32 *p; > + int status; > + > + status = decode_op_hdr(xdr, OP_LAYOUTRETURN); > + if (status) > + return status; > + p = xdr_inline_decode(xdr, 4); > + if (unlikely(!p)) > + goto out_overflow; > + res->lrs_present = be32_to_cpup(p); > + if (res->lrs_present) > + status = decode_stateid(xdr, &res->stateid); > + return status; > +out_overflow: > + print_overflow_msg(__func__, xdr); > + return -EIO; > +} > + > static int decode_layoutcommit(struct xdr_stream *xdr, > struct rpc_rqst *req, > struct nfs4_layoutcommit_res *res) > @@ -6319,6 +6400,30 @@ out: > } > > /* > + * Decode LAYOUTRETURN response > + */ > +static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, > + struct xdr_stream *xdr, > + struct nfs4_layoutreturn_res *res) > +{ > + struct compound_hdr hdr; > + int status; > + > + status = decode_compound_hdr(xdr, &hdr); > + if (status) > + goto out; > + status = decode_sequence(xdr, &res->seq_res, rqstp); > + if (status) > + goto out; > + status = decode_putfh(xdr); > + if (status) > + goto out; > + status = decode_layoutreturn(xdr, res); > +out: > + return status; > +} > + > +/* > * Decode LAYOUTCOMMIT response > */ > static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, > @@ -6544,6 +6649,7 @@ struct rpc_procinfo nfs4_procedures[] = { > PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), > PROC(LAYOUTGET, enc_layoutget, dec_layoutget), > PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), > + PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), > #endif /* CONFIG_NFS_V4_1 */ > }; > > diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c > index d9ab972..89e7725 100644 > --- a/fs/nfs/pnfs.c > +++ b/fs/nfs/pnfs.c > @@ -321,6 +321,36 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, > return invalid - removed; > } > > +/* Returns false if there was nothing to do, true otherwise */ > +static bool > +pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, > + struct pnfs_layout_range *range) > +{ > + struct pnfs_layout_segment *lseg, *next; > + bool rv = false; > + > + dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", > + __func__, lo, range->offset, range->length, range->iomode); > + assert_spin_locked(&lo->plh_inode->i_lock); Not needed in the case of nfs4_evict_inode. > + if (list_empty(&lo->plh_segs)) { > + if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) > + put_layout_hdr_locked(lo); > + return 0; > + } > + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) > + if (should_free_lseg(lseg->pls_range.iomode, range->iomode)) { > + dprintk("%s: freeing lseg %p iomode %d " > + "offset %llu length %llu\n", __func__, > + lseg, lseg->pls_range.iomode, > + lseg->pls_range.offset, > + lseg->pls_range.length); > + mark_lseg_invalid(lseg, tmp_list); > + rv = true; > + } > + dprintk("%s:Return %d\n", __func__, rv); > + return rv; > +} > + > /* note free_me must contain lsegs from a single layout_hdr */ > void > pnfs_free_lseg_list(struct list_head *free_me) > @@ -539,6 +569,72 @@ out_err_free: > return NULL; > } > > +static int > +return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait) > +{ > + struct nfs4_layoutreturn *lrp; > + struct nfs_server *server = NFS_SERVER(ino); > + int status = -ENOMEM; > + > + dprintk("--> %s\n", __func__); > + > + lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); > + if (lrp == NULL) { > + put_layout_hdr(NFS_I(ino)->layout); > + goto out; > + } > + lrp->args.reclaim = 0; > + lrp->args.layout_type = server->pnfs_curr_ld->id; > + lrp->args.return_type = RETURN_FILE; > + lrp->args.range = *range; > + lrp->args.inode = ino; > + lrp->clp = server->nfs_client; > + > + status = nfs4_proc_layoutreturn(lrp, wait); > +out: > + dprintk("<-- %s status: %d\n", __func__, status); > + return status; > +} > + > +/* Initiates a LAYOUTRETURN(FILE) */ > +int > +_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait) What are the 'range' and 'wait' parameters for? We don't have any users other than nfs4_evict_inode. > +{ > + struct pnfs_layout_hdr *lo = NULL; > + struct nfs_inode *nfsi = NFS_I(ino); > + struct pnfs_layout_range arg; > + LIST_HEAD(tmp_list); > + int status = 0; > + > + dprintk("--> %s\n", __func__); > + > + arg.iomode = range ? range->iomode : IOMODE_ANY; > + arg.offset = 0; > + arg.length = NFS4_MAX_UINT64; > + > + spin_lock(&ino->i_lock); > + lo = nfsi->layout; > + if (!lo || !pnfs_clear_lseg_list(lo, &tmp_list, &arg)) { > + spin_unlock(&ino->i_lock); > + dprintk("%s: no layout segments to return\n", __func__); > + goto out; > + } > + /* Reference matched in nfs4_layoutreturn_release */ > + get_layout_hdr(lo); > + spin_unlock(&ino->i_lock); > + pnfs_free_lseg_list(&tmp_list); > + > + /* Return layout even if layoutcommit fails */ > + status = pnfs_layoutcommit_inode(ino, wait); Why is this needed? Again, by the time we get to nfs4_evict_inode, the inode is guaranteed to be clean. > + if (status) > + dprintk("%s: layoutcommit failed, status=%d. Returning layout anyway\n", > + __func__, status); > + status = return_layout(ino, &arg, wait); > +out: > + dprintk("<-- %s status: %d\n", __func__, status); > + return status; > +} > + > bool pnfs_roc(struct inode *ino) > { > struct pnfs_layout_hdr *lo; > diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h > index 4cb0a0d..a308f3c 100644 > --- a/fs/nfs/pnfs.h > +++ b/fs/nfs/pnfs.h > @@ -123,6 +123,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); > extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, > struct pnfs_device *dev); > extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); > +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait); > > /* pnfs.c */ > void get_layout_hdr(struct pnfs_layout_hdr *lo); > @@ -158,6 +159,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); > bool pnfs_roc_drain(struct inode *ino, u32 *barrier); > void pnfs_set_layoutcommit(struct nfs_write_data *wdata); > int pnfs_layoutcommit_inode(struct inode *inode, bool sync); > +int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, bool wait); > > static inline int lo_fail_bit(u32 iomode) > { > @@ -226,6 +228,19 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req) > put_lseg(req->wb_commit_lseg); > } > > +static inline int pnfs_return_layout(struct inode *ino, > + struct pnfs_layout_range *range, > + bool wait) > +{ > + struct nfs_inode *nfsi = NFS_I(ino); > + struct nfs_server *nfss = NFS_SERVER(ino); > + > + if (pnfs_enabled_sb(nfss) && nfsi->layout) > + return _pnfs_return_layout(ino, range, wait); > + > + return 0; > +} > + > #else /* CONFIG_NFS_V4_1 */ > > static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) > @@ -267,6 +282,13 @@ pnfs_try_to_write_data(struct nfs_write_data *data, > return PNFS_NOT_ATTEMPTED; > } > > +static inline int pnfs_return_layout(struct inode *ino, > + struct pnfs_layout_range *range, > + bool wait) > +{ > + return 0; > +} > + > static inline bool > pnfs_roc(struct inode *ino) > { > diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h > index 178fafe..9376eaf 100644 > --- a/include/linux/nfs4.h > +++ b/include/linux/nfs4.h > @@ -562,6 +562,7 @@ enum { > NFSPROC4_CLNT_LAYOUTGET, > NFSPROC4_CLNT_GETDEVICEINFO, > NFSPROC4_CLNT_LAYOUTCOMMIT, > + NFSPROC4_CLNT_LAYOUTRETURN, > }; > > /* nfs41 types */ > diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h > index 78b101e..455ddfb 100644 > --- a/include/linux/nfs_xdr.h > +++ b/include/linux/nfs_xdr.h > @@ -266,6 +266,29 @@ struct nfs4_layoutcommit_data { > struct nfs4_layoutcommit_res res; > }; > > +struct nfs4_layoutreturn_args { > + __u32 reclaim; > + __u32 layout_type; > + __u32 return_type; Why do we need a 'return_type'? As far as I can see, this will always be RETURN_FILE. > + struct pnfs_layout_range range; > + struct inode *inode; > + struct nfs4_sequence_args seq_args; > +}; > + > +struct nfs4_layoutreturn_res { > + struct nfs4_sequence_res seq_res; > + u32 lrs_present; > + nfs4_stateid stateid; > +}; > + > +struct nfs4_layoutreturn { > + struct nfs4_layoutreturn_args args; > + struct nfs4_layoutreturn_res res; > + struct rpc_cred *cred; > + struct nfs_client *clp; > + int rpc_status; > +}; > + > /* > * Arguments to the open call. > */ -- Trond Myklebust Linux NFS client maintainer NetApp Trond.Myklebust@netapp.com www.netapp.com