From: "Labiaga, Ricardo" Subject: RE: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling Date: Mon, 1 Jun 2009 17:33:34 -0700 Message-ID: <273FE88A07F5D445824060902F70034405FE3129@SACMVEXC1-PRD.hq.netapp.com> References: <1241136328.15476.124.camel@heimdal.trondhjem.org> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Cc: "Adamson, Andy" , "J. Bruce Fields" , , To: "Trond Myklebust" , "Benny Halevy" Return-path: Received: from mx2.netapp.com ([216.240.18.37]:46962 "EHLO mx2.netapp.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1757968AbZFBAfj convert rfc822-to-8bit (ORCPT ); Mon, 1 Jun 2009 20:35:39 -0400 In-Reply-To: <1241136328.15476.124.camel-rJ7iovZKK19ZJLDQqaL3InhyD016LWXt@public.gmane.org> Sender: linux-nfs-owner@vger.kernel.org List-ID: Trond, Bruce, Alexandros has coded a number of patches that address the issues raised here by Trond. Do you want the fixes squashed into the original patch or do you want them submitted separately. Thanks, - ricardo > -----Original Message----- > From: Trond Myklebust [mailto:trond.myklebust@fys.uio.no] > Sent: Thursday, April 30, 2009 5:05 PM > To: Benny Halevy > Cc: Adamson, Andy; J. Bruce Fields; pnfs@linux-nfs.org; linux- > nfs@vger.kernel.org > Subject: Re: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side > backchannel handling > > On Fri, 2009-05-01 at 02:05 +0300, Benny Halevy wrote: > > From: Rahul Iyer > > > > FIXME: bhalevy: write up commit message > > > > Signed-off-by: Rahul Iyer > > Signed-off-by: Mike Sager > > Signed-off-by: Marc Eshel > > Signed-off-by: Benny Halevy > > > > When the call direction is a reply, copy the xid and call direction into > the > > req->rq_private_buf.head[0].iov_base otherwise rpc_verify_header returns > > rpc_garbage. 
> >
> > Signed-off-by: Andy Adamson
> > Signed-off-by: Benny Halevy
> > [get rid of CONFIG_NFSD_V4_1]
> > Signed-off-by: Benny Halevy
> > ---
> >  include/linux/sunrpc/clnt.h    |    1 +
> >  include/linux/sunrpc/svcsock.h |    1 +
> >  include/linux/sunrpc/xprt.h    |    2 +
> >  net/sunrpc/clnt.c              |    1 +
> >  net/sunrpc/svcsock.c           |   68 ++++++++++-
> >  net/sunrpc/xprt.c              |   41 ++++++-
> >  net/sunrpc/xprtsock.c          |  278 +++++++++++++++++++++++++++++++++++++++-
> >  7 files changed, 381 insertions(+), 11 deletions(-)
> >
> > diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
> > index c39a210..cf9a8ec 100644
> > --- a/include/linux/sunrpc/clnt.h
> > +++ b/include/linux/sunrpc/clnt.h
> > @@ -110,6 +110,7 @@ struct rpc_create_args {
> >  	rpc_authflavor_t	authflavor;
> >  	unsigned long		flags;
> >  	char			*client_name;
> > +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel */
> >  };
> >
> >  /* Values for "flags" field */
> > diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
> > index 8271631..19228f4 100644
> > --- a/include/linux/sunrpc/svcsock.h
> > +++ b/include/linux/sunrpc/svcsock.h
> > @@ -28,6 +28,7 @@ struct svc_sock {
> >  	/* private TCP part */
> >  	u32			sk_reclen;	/* length of record */
> >  	u32			sk_tcplen;	/* current read length */
> > +	struct rpc_xprt		*sk_bc_xprt;	/* NFSv4.1 backchannel xprt */
> >  };
> >
> >  /*
> > diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
> > index 1758d9f..063a6a7 100644
> > --- a/include/linux/sunrpc/xprt.h
> > +++ b/include/linux/sunrpc/xprt.h
> > @@ -174,6 +174,7 @@ struct rpc_xprt {
> >  	spinlock_t		reserve_lock;	/* lock slot table */
> >  	u32			xid;		/* Next XID value to use */
> >  	struct rpc_task *	snd_task;	/* Task blocked in send */
> > +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel */
> >  	struct list_head	recv;
> >
> >  	struct {
> > @@ -197,6 +198,7 @@ struct xprt_create {
> >  	struct sockaddr *	srcaddr;	/* optional local address */
> >  	struct sockaddr *	dstaddr;	/* remote peer address */
> >  	size_t			addrlen;
> > +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel */
> >  };
> >
> >  struct xprt_class {
> > diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> > index 5abab09..3dc847f 100644
> > --- a/net/sunrpc/clnt.c
> > +++ b/net/sunrpc/clnt.c
> > @@ -266,6 +266,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
> >  		.srcaddr = args->saddress,
> >  		.dstaddr = args->address,
> >  		.addrlen = args->addrsize,
> > +		.bc_sock = args->bc_sock,
> >  	};
> >  	char servername[48];
> >
> > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> > index 4e6d406..619764e 100644
> > --- a/net/sunrpc/svcsock.c
> > +++ b/net/sunrpc/svcsock.c
> > @@ -49,6 +49,7 @@
> >  #include
> >  #include
> >  #include
> > +#include
> >
> >  #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
> >
> > @@ -825,6 +826,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
> >  	int		len;
> >  	struct kvec *vec;
> >  	int pnum, vlen;
> > +	struct rpc_rqst *req = NULL;
> >
> >  	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
> >  		svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
> > @@ -891,12 +893,65 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
> >  	len = svsk->sk_reclen;
> >  	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
> >
> > +	/*
> > +	 * We have enough data for the whole tcp record. Let's try and read the
> > +	 * first 8 bytes to get the xid and the call direction. We can use this
> > +	 * to figure out if this is a call or a reply to a callback. If
> > +	 * sk_reclen is < 8 (xid and calldir), then this is a malformed packet.
> > +	 * In that case, don't bother with the calldir and just read the data.
> > +	 * It will be rejected in svc_process.
> > +	 */
> > +
> >  	vec = rqstp->rq_vec;
> >  	vec[0] = rqstp->rq_arg.head[0];
> >  	vlen = PAGE_SIZE;
> > +
> > +	if (len >= 8) {
> > +		u32 *p;
> > +		u32 xid;
> > +		u32 calldir;
> > +
> > +		len = svc_recvfrom(rqstp, vec, 1, 8);
> > +		if (len < 0)
> > +			goto error;
> > +
> > +		p = (u32 *)rqstp->rq_arg.head[0].iov_base;
> > +		xid = *p++;
> > +		calldir = *p;
> > +
> > +		if (calldir) {
> > +			/* REPLY */
> > +			if (svsk->sk_bc_xprt)
> > +				req = xprt_lookup_rqst(svsk->sk_bc_xprt, xid);
> > +			if (req) {
> > +				memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
> > +					sizeof(struct xdr_buf));
> > +				/* copy the xid and call direction */
> > +				memcpy(req->rq_private_buf.head[0].iov_base,
> > +					rqstp->rq_arg.head[0].iov_base, 8);
> > +				vec[0] = req->rq_private_buf.head[0];
> > +			} else
> > +				printk(KERN_NOTICE
> > +					"%s: Got unrecognized reply: "
> > +					"calldir 0x%x sk_bc_xprt %p xid %08x\n",
> > +					__func__, ntohl(calldir),
> > +					svsk->sk_bc_xprt, xid);
> > +		}
> > +
> > +		if (!calldir || !req)
> > +			vec[0] = rqstp->rq_arg.head[0];
> > +
> > +		vec[0].iov_base += 8;
> > +		vec[0].iov_len -= 8;
> > +		len = svsk->sk_reclen - 8;
> > +		vlen -= 8;
> > +	}
> > +
> >  	pnum = 1;
> >  	while (vlen < len) {
> > -		vec[pnum].iov_base = page_address(rqstp->rq_pages[pnum]);
> > +		vec[pnum].iov_base = (req) ?
> > +			page_address(req->rq_private_buf.pages[pnum - 1]) :
> > +			page_address(rqstp->rq_pages[pnum]);
> >  		vec[pnum].iov_len = PAGE_SIZE;
> >  		pnum++;
> >  		vlen += PAGE_SIZE;
> > @@ -908,6 +963,16 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
> >  	if (len < 0)
> >  		goto error;
> >
> > +	/*
> > +	 * Account for the 8 bytes we read earlier
> > +	 */
> > +	len += 8;
> > +
> > +	if (req) {
> > +		xprt_complete_rqst(req->rq_task, len);
> > +		len = 0;
> > +		goto out;
> > +	}
> >  	dprintk("svc: TCP complete record (%d bytes)\n", len);
> >  	rqstp->rq_arg.len = len;
> >  	rqstp->rq_arg.page_base = 0;
> > @@ -921,6 +986,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
> >  	rqstp->rq_xprt_ctxt = NULL;
> >  	rqstp->rq_prot = IPPROTO_TCP;
> >
> > +out:
> >  	/* Reset TCP read info */
> >  	svsk->sk_reclen = 0;
> >  	svsk->sk_tcplen = 0;
> > diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> > index a0bfe53..03f175e 100644
> > --- a/net/sunrpc/xprt.c
> > +++ b/net/sunrpc/xprt.c
> > @@ -1015,6 +1015,27 @@ void xprt_release(struct rpc_task *task)
> >  	spin_unlock(&xprt->reserve_lock);
> >  }
> >
> > +/*
> > + * The autoclose function for the back channel
> > + *
> > + * The callback channel should never close the channel,
> > + * let the forechannel do that.
> > + */
> > +static void bc_autoclose(struct work_struct *work)
> > +{
> > +	return;
> > +}
> > +
> > +
> > +/*
> > + * The autodisconnect routine for the back channel. We never disconnect
> > + */
> > +static void
> > +bc_init_autodisconnect(unsigned long data)
> > +{
> > +	return;
> > +}
> > +
> >  /**
> >  * xprt_create_transport - create an RPC transport
> >  * @args: rpc transport creation arguments
> > @@ -1051,9 +1072,16 @@ found:
> >
> >  	INIT_LIST_HEAD(&xprt->free);
> >  	INIT_LIST_HEAD(&xprt->recv);
> > -	INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
> > -	setup_timer(&xprt->timer, xprt_init_autodisconnect,
> > -			(unsigned long)xprt);
> > +	if (args->bc_sock) {
> > +		INIT_WORK(&xprt->task_cleanup, bc_autoclose);
> > +		setup_timer(&xprt->timer, bc_init_autodisconnect,
> > +				(unsigned long)xprt);
>
> Hrmph... Why do you need dummy routines here?
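For illustration only, one answer to the question above would be to drop
the dummies, install the standard handlers unconditionally, and have
those handlers bail out early for backchannel transports. This is a
sketch, not part of the patch, and it assumes xprt->bc_sock is already
set by the time the timer or work item could fire:

	INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
	setup_timer(&xprt->timer, xprt_init_autodisconnect,
			(unsigned long)xprt);

with a guard such as this in each handler:

static void
xprt_init_autodisconnect(unsigned long data)
{
	struct rpc_xprt *xprt = (struct rpc_xprt *)data;

	/* The backchannel rides on the forechannel's connection and
	 * must never disconnect it; the forechannel owns the socket. */
	if (xprt->bc_sock)
		return;
	/* ... existing autodisconnect logic unchanged ... */
}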
> > +	} else {
> > +		INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
> > +		setup_timer(&xprt->timer, xprt_init_autodisconnect,
> > +				(unsigned long)xprt);
> > +	}
> > +
> >  	xprt->last_used = jiffies;
> >  	xprt->cwnd = RPC_INITCWND;
> >  	xprt->bind_index = 0;
> > @@ -1073,6 +1101,13 @@ found:
> >  	dprintk("RPC:       created transport %p with %u slots\n", xprt,
> >  			xprt->max_reqs);
> >
> > +	/*
> > +	 * Since we don't want connections for the backchannel, we set
> > +	 * the xprt status to connected
> > +	 */
> > +	if (args->bc_sock)
> > +		xprt_set_connected(xprt);
> > +
> >  	return xprt;
> >  }
> >
> > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > index d40ff50..067d205 100644
> > --- a/net/sunrpc/xprtsock.c
> > +++ b/net/sunrpc/xprtsock.c
> > @@ -32,6 +32,7 @@
> >  #include
> >  #include
> >  #include
> > +#include
> >  #include
> >  #include
> >
> > @@ -1966,6 +1967,219 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
> >  			xprt->stat.bklog_u);
> >  }
> >
> > +/*
> > + * The connect worker for the backchannel
> > + * This should never be called as we should never need to connect
> > + */
> > +static void bc_connect_worker(struct work_struct *work)
> > +{
> > +	BUG();
> > +}
> > +
> > +/*
> > + * The set_port routine of the rpc_xprt_ops. This is related to the portmapper
> > + * and should never be called
> > + */
> > +
> > +static void bc_set_port(struct rpc_xprt *xprt, unsigned short port)
> > +{
> > +	BUG();
> > +}
> > +
> > +/*
> > + * The connect routine for the backchannel rpc_xprt ops
> > + * Again, should never be called!
> > + */
> > +
> > +static void bc_connect(struct rpc_task *task)
> > +{
> > +	BUG();
> > +}
> > +
> > +struct rpc_buffer {
> > +	size_t	len;
> > +	char	data[];
> > +};
> > +/*
> > + * Allocate a bunch of pages for a scratch buffer for the rpc code. The reason
> > + * we allocate pages instead of doing a kmalloc like rpc_malloc is because we
> > + * want to use the server side send routines.
> > + */
> > +void *bc_malloc(struct rpc_task *task, size_t size)
> > +{
> > +	struct page *page;
> > +	struct rpc_buffer *buf;
> > +
> > +	BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer));
> > +	page = alloc_page(GFP_KERNEL);
> > +
> > +	if (!page)
> > +		return NULL;
> > +
> > +	buf = page_address(page);
> > +	buf->len = PAGE_SIZE;
> > +
> > +	return buf->data;
> > +}
> > +
>
> __get_free_page()? Why can't you kmalloc() here?
>
> > +/*
> > + * Free the space allocated in the bc_alloc routine
> > + */
> > +void bc_free(void *buffer)
> > +{
> > +	struct rpc_buffer *buf;
> > +
> > +	if (!buffer)
> > +		return;
> > +
> > +	buf = container_of(buffer, struct rpc_buffer, data);
> > +	free_pages((unsigned long)buf, get_order(buf->len));
>
> This looks funky... Why can't you just call free_page()? You already
> know from bc_malloc() that this is an order 0 page allocation.
>
> > +}
> > +
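Along the lines suggested above, a hedged sketch (not part of the patch)
of what the __get_free_page()/free_page() pairing could look like;
bc_malloc() never hands out more than a single order-0 page, so the
get_order()/free_pages() dance is unnecessary:

void *bc_malloc(struct rpc_task *task, size_t size)
{
	struct rpc_buffer *buf;

	BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer));
	/* One order-0 page, page-backed so the server-side sendpage()
	 * based send routines can still be used on it. */
	buf = (struct rpc_buffer *)__get_free_page(GFP_KERNEL);
	if (!buf)
		return NULL;
	buf->len = PAGE_SIZE;
	return buf->data;
}

void bc_free(void *buffer)
{
	if (buffer)
		free_page((unsigned long)container_of(buffer,
					struct rpc_buffer, data));
}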
> > +/*
> > + * Use the svc_sock to send the callback. Must be called with svsk->sk_mutex
> > + * held. Borrows heavily from svc_tcp_sendto and xs_tcp_send_request.
> > + */
> > +static int bc_sendto(struct rpc_rqst *req)
> > +{
> > +	int total_len;
> > +	int len;
> > +	int size;
> > +	int result;
> > +	struct xdr_buf *xbufp = &req->rq_snd_buf;
> > +	struct page **pages = xbufp->pages;
> > +	unsigned int flags = MSG_MORE;
> > +	unsigned int pglen = xbufp->page_len;
> > +	size_t base = xbufp->page_base;
> > +	struct rpc_xprt *xprt = req->rq_xprt;
> > +	struct sock_xprt *transport =
> > +				container_of(xprt, struct sock_xprt, xprt);
> > +	struct socket *sock = transport->sock;
> > +
> > +	total_len = xbufp->len;
> > +
> > +	/*
> > +	 * Set up the rpc header and record marker stuff
> > +	 */
> > +	xs_encode_tcp_record_marker(xbufp);
> > +
> > +	/*
> > +	 * The RPC message is divided into 3 pieces:
> > +	 * - The header: This is what most of the smaller RPC messages consist
> > +	 *   of. Often the whole message is in this.
> > +	 *
> > +	 * - xdr->pages: This is a list of pages that contain data, for
> > +	 *   example in a write request or while using rpcsec gss
> > +	 *
> > +	 * - The tail: This is the rest of the rpc message
> > +	 *
> > +	 * First we send the header, then the pages and then finally the tail.
> > +	 * The code borrows heavily from svc_sendto.
> > +	 */
> > +
> > +	/*
> > +	 * Send the head
> > +	 */
> > +	if (total_len == xbufp->head[0].iov_len)
> > +		flags = 0;
> > +
> > +	len = sock->ops->sendpage(sock, virt_to_page(xbufp->head[0].iov_base),
> > +			(unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK,
> > +			xbufp->head[0].iov_len, flags);
>
> Why do you need to do this? The head iovec is supposed to be reserved
> for kmalloc()ed memory, which cannot be used together with sendpage().
> Somebody, some day is going to mess up and try to put a kmalloced buffer
> in here, and will wonder why the above doesn't work.
>
> If you are sending pages, then please put them in the page list part of
> the xdr_buf. There is no rule that the RPC call _must_ have a non-zero
> head.
>
> > +
> > +	if (len != xbufp->head[0].iov_len)
> > +		goto out;
> > +
> > +	/*
> > +	 * send page data
> > +	 *
> > +	 * Check the amount of data to be sent. If it is less than the
> > +	 * remaining page, then send it else send the current page
> > +	 */
> > +
> > +	size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
> > +	while (pglen > 0) {
> > +		if (total_len == size)
> > +			flags = 0;
> > +		result = sock->ops->sendpage(sock, *pages, base, size, flags);
> > +		if (result > 0)
> > +			len += result;
> > +		if (result != size)
> > +			goto out;
> > +		total_len -= size;
> > +		pglen -= size;
> > +		size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
> > +		base = 0;
> > +		pages++;
> > +	}
> > +	/*
> > +	 * send tail
> > +	 */
> > +	if (xbufp->tail[0].iov_len) {
> > +		result = sock->ops->sendpage(sock,
> > +				xbufp->tail[0].iov_base,
> > +				(unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK,
> > +				xbufp->tail[0].iov_len,
> > +				0);
>
> Ditto.
>
> > +
> > +		if (result > 0)
> > +			len += result;
> > +	}
> > +out:
> > +	if (len != xbufp->len)
> > +		printk(KERN_NOTICE "Error sending entire callback!\n");
>                                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> Then what? Shouldn't you be closing the connection here?
>
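One possible answer, sketched here for illustration only (this is not in
the patch, and it relies on the patch's own xprt->bc_sock back-pointer
to reach the svc_sock the reply went out on): a short send leaves a
truncated record on the wire, so the stream is unrecoverable and the
shared connection should be torn down through the usual server-side
path:

	if (len != xbufp->len) {
		struct svc_xprt *sxprt = &xprt->bc_sock->sk_xprt;

		printk(KERN_NOTICE "Error sending entire callback!\n");
		/* The record marker promised more bytes than were sent;
		 * the TCP stream is now corrupt, so close the connection
		 * and let the client re-establish it. */
		set_bit(XPT_CLOSE, &sxprt->xpt_flags);
		svc_xprt_enqueue(sxprt);
	}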
> > +
> > +	return len;
> > +}
> > +
> > +/*
> > + * The send routine. Borrows from svc_send.
> > + */
> > +static int bc_send_request(struct rpc_task *task)
> > +{
> > +	struct rpc_rqst *req = task->tk_rqstp;
> > +	struct rpc_xprt *bc_xprt = req->rq_xprt;
> > +	struct svc_xprt *xprt;
> > +	struct svc_sock *svsk;
> > +	u32 len;
> > +
> > +	dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
> > +	/*
> > +	 * Get the server socket associated with this callback xprt
> > +	 */
> > +	svsk = bc_xprt->bc_sock;
> > +	xprt = &svsk->sk_xprt;
> > +
> > +	mutex_lock(&xprt->xpt_mutex);
>               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> Eh? What's this, in which patch is it defined, and why is it at all
> needed?
>
> > +	if (test_bit(XPT_DEAD, &xprt->xpt_flags))
>                   ^^^^^^^^^^^^^^^^^^^^^^^^^^
> Where is this defined, and why is it needed? The xprt already has a
> connected/unconnected flag.
>
> > +		len = -ENOTCONN;
> > +	else
> > +		len = bc_sendto(req);
> > +	mutex_unlock(&xprt->xpt_mutex);
> > +
> > +	return 0;
> > +
> > +}
> > +
> > +/*
> > + * The close routine. Since this is client initiated, we do nothing
> > + */
> > +
> > +static void bc_close(struct rpc_xprt *xprt)
> > +{
> > +	return;
> > +}
> > +
> > +/*
> > + * The xprt destroy routine. Again, because this connection is client
> > + * initiated, we do nothing
> > + */
> > +
> > +static void bc_destroy(struct rpc_xprt *xprt)
> > +{
> > +	return;
> > +}
> > +
> >  static struct rpc_xprt_ops xs_udp_ops = {
> >  	.set_buffer_size	= xs_udp_set_buffer_size,
> >  	.reserve_xprt		= xprt_reserve_xprt_cong,
> > @@ -1999,6 +2213,24 @@ static struct rpc_xprt_ops xs_tcp_ops = {
> >  	.print_stats		= xs_tcp_print_stats,
> >  };
> >
> > +/*
> > + * The rpc_xprt_ops for the server backchannel
> > + */
> > +
> > +static struct rpc_xprt_ops bc_tcp_ops = {
> > +	.reserve_xprt		= xprt_reserve_xprt,
> > +	.release_xprt		= xprt_release_xprt,
> > +	.set_port		= bc_set_port,
> > +	.connect		= bc_connect,
> > +	.buf_alloc		= bc_malloc,
> > +	.buf_free		= bc_free,
> > +	.send_request		= bc_send_request,
> > +	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
> > +	.close			= bc_close,
> > +	.destroy		= bc_destroy,
> > +	.print_stats		= xs_tcp_print_stats,
> > +};
> > +
> >  static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
> >  		unsigned int slot_table_size)
> >  {
> > @@ -2131,13 +2363,29 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
> >  	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
> >  	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
> >
> > -	xprt->bind_timeout = XS_BIND_TO;
> > -	xprt->connect_timeout = XS_TCP_CONN_TO;
> > -	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
> > -	xprt->idle_timeout = XS_IDLE_DISC_TO;
> > +	if (args->bc_sock) {
> > +		/* backchannel */
> > +		xprt_set_bound(xprt);
> > +		INIT_DELAYED_WORK(&transport->connect_worker,
> > +				  bc_connect_worker);
>
> Errm.... Is it really such a good idea to tell the RPC layer that it can
> reconnect at any time using a routine that will BUG()?
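A gentler alternative, purely as a sketch (not from the patch): have the
backchannel's connect worker report the shared socket as already
connected instead of crashing, reusing the existing xprt helpers:

static void bc_connect_worker(struct work_struct *work)
{
	struct sock_xprt *transport =
		container_of(work, struct sock_xprt, connect_worker.work);
	struct rpc_xprt *xprt = &transport->xprt;

	/* The forechannel owns the shared connection, so a connect
	 * request on the backchannel is a no-op: mark the transport
	 * connected and wake up any tasks waiting on the connect. */
	xprt_set_connected(xprt);
	xprt_clear_connecting(xprt);
	xprt_wake_pending_tasks(xprt, 0);
}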
> > +		xprt->bind_timeout = 0;
> > +		xprt->connect_timeout = 0;
> > +		xprt->reestablish_timeout = 0;
> > +		xprt->idle_timeout = (~0);
> >
> > -	xprt->ops = &xs_tcp_ops;
> > -	xprt->timeout = &xs_tcp_default_timeout;
> > +		/*
> > +		 * The backchannel uses the same socket connection as the
> > +		 * forechannel
> > +		 */
> > +		xprt->bc_sock = args->bc_sock;
> > +		xprt->bc_sock->sk_bc_xprt = xprt;
> > +		transport->sock = xprt->bc_sock->sk_sock;
> > +		transport->inet = xprt->bc_sock->sk_sk;
> > +
> > +		xprt->ops = &bc_tcp_ops;
> > +
> > +		goto next;
> > +	}
> >
> >  	switch (addr->sa_family) {
> >  	case AF_INET:
> >  		if (((struct sockaddr_in *)addr)->sin_port != htons(0))
> >  			xprt_set_bound(xprt);
> >
> >  		INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker4);
> > -		xs_format_ipv4_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
> >  		break;
> >  	case AF_INET6:
> >  		if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
> >  			xprt_set_bound(xprt);
> >
> >  		INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker6);
> > +		break;
> > +	}
> > +	xprt->bind_timeout = XS_BIND_TO;
> > +	xprt->connect_timeout = XS_TCP_CONN_TO;
> > +	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
> > +	xprt->idle_timeout = XS_IDLE_DISC_TO;
> > +
> > +	xprt->ops = &xs_tcp_ops;
> > +
> > +next:
> > +	xprt->timeout = &xs_tcp_default_timeout;
> > +
> > +	switch (addr->sa_family) {
>
> Why do we suddenly need 2 switch statements here?
>
> > +	case AF_INET:
> > +		xs_format_ipv4_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
> > +		break;
> > +	case AF_INET6:
> >  		xs_format_ipv6_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
> >  		break;
> >  	default:
>
> _______________________________________________
> pNFS mailing list
> pNFS@linux-nfs.org
> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs