2012-05-03 14:59:24

by Ian Campbell

[permalink] [raw]
Subject: [PATCH 9/9] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack.

This prevents an issue where an ACK is delayed, a retransmit is queued (either
at the RPC or TCP level) and the ACK arrives before the retransmission hits the
wire. If this happens to an NFS WRITE RPC then the write() system call
completes and the userspace process can continue, potentially modifying data
referenced by the retransmission before the retransmission occurs.

Signed-off-by: Ian Campbell <[email protected]>
Acked-by: Trond Myklebust <[email protected]>
Cc: "David S. Miller" <[email protected]>
Cc: Neil Brown <[email protected]>
Cc: "J. Bruce Fields" <[email protected]>
Cc: [email protected]
Cc: [email protected]
---
include/linux/sunrpc/xdr.h | 2 ++
include/linux/sunrpc/xprt.h | 5 ++++-
net/sunrpc/clnt.c | 27 ++++++++++++++++++++++-----
net/sunrpc/svcsock.c | 3 ++-
net/sunrpc/xprt.c | 12 ++++++++++++
net/sunrpc/xprtsock.c | 3 ++-
6 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index af70af3..ff1b121 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -16,6 +16,7 @@
#include <asm/byteorder.h>
#include <asm/unaligned.h>
#include <linux/scatterlist.h>
+#include <linux/skbuff.h>

/*
* Buffer adjustment
@@ -57,6 +58,7 @@ struct xdr_buf {
tail[1]; /* Appended after page data */

struct page ** pages; /* Array of contiguous pages */
+ struct skb_frag_destructor *destructor;
unsigned int page_base, /* Start of page data */
page_len, /* Length of page data */
flags; /* Flags for data disposition */
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 77d278d..e8d3f18 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -92,7 +92,10 @@ struct rpc_rqst {
/* A cookie used to track the
state of the transport
connection */
-
+ struct skb_frag_destructor destructor; /* SKB paged fragment
+ * destructor for
+ * transmitted pages*/
+
/*
* Partial send handling
*/
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 6797246..351bf3d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -61,6 +61,7 @@ static void call_reserve(struct rpc_task *task);
static void call_reserveresult(struct rpc_task *task);
static void call_allocate(struct rpc_task *task);
static void call_decode(struct rpc_task *task);
+static void call_complete(struct rpc_task *task);
static void call_bind(struct rpc_task *task);
static void call_bind_status(struct rpc_task *task);
static void call_transmit(struct rpc_task *task);
@@ -1416,6 +1417,8 @@ rpc_xdr_encode(struct rpc_task *task)
(char *)req->rq_buffer + req->rq_callsize,
req->rq_rcvsize);

+ req->rq_snd_buf.destructor = &req->destructor;
+
p = rpc_encode_header(task);
if (p == NULL) {
printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n");
@@ -1581,6 +1584,7 @@ call_connect_status(struct rpc_task *task)
static void
call_transmit(struct rpc_task *task)
{
+ struct rpc_rqst *req = task->tk_rqstp;
dprint_status(task);

task->tk_action = call_status;
@@ -1614,8 +1618,8 @@ call_transmit(struct rpc_task *task)
call_transmit_status(task);
if (rpc_reply_expected(task))
return;
- task->tk_action = rpc_exit_task;
- rpc_wake_up_queued_task(&task->tk_xprt->pending, task);
+ task->tk_action = call_complete;
+ skb_frag_destructor_unref(&req->destructor);
}

/*
@@ -1688,7 +1692,8 @@ call_bc_transmit(struct rpc_task *task)
return;
}

- task->tk_action = rpc_exit_task;
+ task->tk_action = call_complete;
+ skb_frag_destructor_unref(&req->destructor);
if (task->tk_status < 0) {
printk(KERN_NOTICE "RPC: Could not send backchannel reply "
"error: %d\n", task->tk_status);
@@ -1728,7 +1733,6 @@ call_bc_transmit(struct rpc_task *task)
"error: %d\n", task->tk_status);
break;
}
- rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */

@@ -1906,12 +1910,14 @@ call_decode(struct rpc_task *task)
return;
}

- task->tk_action = rpc_exit_task;
+ task->tk_action = call_complete;

if (decode) {
task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
task->tk_msg.rpc_resp);
}
+ rpc_sleep_on(&req->rq_xprt->pending, task, NULL);
+ skb_frag_destructor_unref(&req->destructor);
dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
task->tk_status);
return;
@@ -1926,6 +1932,17 @@ out_retry:
}
}

+/*
+ * 8. Wait for pages to be released by the network stack.
+ */
+static void
+call_complete(struct rpc_task *task)
+{
+ dprintk("RPC: %5u call_complete result %d\n",
+ task->tk_pid, task->tk_status);
+ task->tk_action = rpc_exit_task;
+}
+
static __be32 *
rpc_encode_header(struct rpc_task *task)
{
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index f6d8c73..1145929 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -198,7 +198,8 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
while (pglen > 0) {
if (slen == size)
flags = 0;
- result = kernel_sendpage(sock, *ppage, NULL, base, size, flags);
+ result = kernel_sendpage(sock, *ppage, xdr->destructor,
+ base, size, flags);
if (result > 0)
len += result;
if (result != size)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 6fe2dce..f8418a0 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1108,6 +1108,16 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
xprt->xid = net_random();
}

+static int xprt_complete_skb_pages(struct skb_frag_destructor *destroy)
+{
+ struct rpc_rqst *req =
+ container_of(destroy, struct rpc_rqst, destructor);
+
+ dprintk("RPC: %5u completing skb pages\n", req->rq_task->tk_pid);
+ rpc_wake_up_queued_task(&req->rq_xprt->pending, req->rq_task);
+ return 0;
+}
+
static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
{
struct rpc_rqst *req = task->tk_rqstp;
@@ -1120,6 +1130,8 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
req->rq_xid = xprt_alloc_xid(xprt);
req->rq_release_snd_buf = NULL;
xprt_reset_majortimeo(req);
+ atomic_set(&req->destructor.ref, 1);
+ req->destructor.destroy = &xprt_complete_skb_pages;
dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
req, ntohl(req->rq_xid));
}
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index f1995dc..44e07f3 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -408,7 +408,8 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
remainder -= len;
if (remainder != 0 || more)
flags |= MSG_MORE;
- err = sock->ops->sendpage(sock, *ppage, NULL, base, len, flags);
+ err = sock->ops->sendpage(sock, *ppage, xdr->destructor,
+ base, len, flags);
if (remainder == 0 || err != len)
break;
sent += err;
--
1.7.2.5



2012-05-10 11:19:54

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH 9/9] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack.

On Thu, May 03, 2012 at 03:56:11PM +0100, Ian Campbell wrote:
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index f6d8c73..1145929 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -198,7 +198,8 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
> while (pglen > 0) {
> if (slen == size)
> flags = 0;
> - result = kernel_sendpage(sock, *ppage, NULL, base, size, flags);
> + result = kernel_sendpage(sock, *ppage, xdr->destructor,
> + base, size, flags);
> if (result > 0)
> len += result;
> if (result != size)

So I tried triggering this by simply creating an nfs export on localhost
and copying a large file out with dd, but this never seems to trigger
this code.

Any idea how to test?

--
MST

2012-05-10 13:26:15

by Ian Campbell

[permalink] [raw]
Subject: Re: [PATCH 9/9] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack.

On Thu, 2012-05-10 at 12:19 +0100, Michael S. Tsirkin wrote:
> On Thu, May 03, 2012 at 03:56:11PM +0100, Ian Campbell wrote:
> > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> > index f6d8c73..1145929 100644
> > --- a/net/sunrpc/svcsock.c
> > +++ b/net/sunrpc/svcsock.c
> > @@ -198,7 +198,8 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
> > while (pglen > 0) {
> > if (slen == size)
> > flags = 0;
> > - result = kernel_sendpage(sock, *ppage, NULL, base, size, flags);
> > + result = kernel_sendpage(sock, *ppage, xdr->destructor,
> > + base, size, flags);
> > if (result > 0)
> > len += result;
> > if (result != size)
>
> So I tried triggering this by simply creating an nfs export on localhost
> and copying a large file out with dd, but this never seems to trigger
> this code.
>
> Any idea how to test?

My test code, which is a bit overly complex for this because it also
tries to demonstrate corruption on the wire, is attached.

Using dd I suspect you probably need to increase the block size, and
possibly enable O_DIRECT (conv=direct?)

My typical scenario has been to mount a remote NFS and run
tcpdump -s 4096 -x -ne -v -i eth0 'host $client and ip[184:4] == 0x55555555'
to watch for on-wire corruption.

FYI I've triggered a BUG_ON in my local debug patches with your series
applied, I'm just investigating whether its my debugging or something in
the series which causes it.

After that I'll try it with local NFS and VMs with bridging etc to test
the extra aspects which your series is exercising.

Ian.


Attachments:
blktest3.c (1.06 kB)