2013-11-09 18:20:15

by Myklebust, Trond

Subject: [PATCH] SUNRPC: Fix a data corruption issue when retransmitting RPC calls

The following scenario can cause silent data corruption when doing
NFS writes. It has mainly been observed when doing database writes
using O_DIRECT.

1) The RPC client uses sendpage() to do zero-copy of the page data.
2) Due to networking issues, the reply from the server is delayed,
and so the RPC client times out.

3) The client issues a second sendpage of the page data as part of
an RPC call retransmission.

4) The reply to the first transmission arrives from the server
_before_ the client hardware has emptied the TCP socket send
buffer.
5) After processing the reply, the RPC state machine rules that
the call is done, and triggers the completion callbacks.
6) The application notices the RPC call is done, and reuses the
pages to store something else (e.g. a new write).

7) The client NIC drains the TCP socket send buffer. Since the
page data has now changed, it reads a corrupted version of the
initial RPC call, and puts it on the wire.

This patch fixes the problem in the following manner:

The ordering guarantees of TCP ensure that, by the time the server sends a
reply, the _first_ transmission has completed. Using zero-copy in that
situation is therefore safe.
If a timeout occurs, we instead send the retransmission using sendmsg()
(i.e. no zero-copy). The socket then holds its own full copy of the data,
and so it will retransmit a faithful reproduction even if the RPC call
completes and the application reuses the O_DIRECT buffer in the meantime.
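
For reference, the non-zero-copy path ends up in sock_no_sendpage(), which
copies the page contents into the socket send buffer via kernel_sendmsg()
instead of taking a reference to the page. A rough sketch of that generic
helper (paraphrasing net/core/sock.c; details may differ between kernel
versions):

ssize_t sock_no_sendpage(struct socket *sock, struct page *page,
                         int offset, size_t size, int flags)
{
        struct msghdr msg = { .msg_flags = flags };
        struct kvec iov;
        char *kaddr = kmap(page);
        ssize_t res;

        /* Point an iovec at the (temporarily mapped) page data and hand
         * it to sendmsg(), which copies it into the socket's own buffers
         * before returning. After this, changes to the page no longer
         * affect what the socket will (re)transmit. */
        iov.iov_base = kaddr + offset;
        iov.iov_len = size;
        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
        kunmap(page);
        return res;
}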

Signed-off-by: Trond Myklebust <[email protected]>
Cc: [email protected]
---
net/sunrpc/xprtsock.c | 28 +++++++++++++++++++++-------
1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 17c88928b7db..dd9d295813cf 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -393,8 +393,10 @@ static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen,
return kernel_sendmsg(sock, &msg, NULL, 0, 0);
}

-static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more)
+static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy)
{
+ ssize_t (*do_sendpage)(struct socket *sock, struct page *page,
+ int offset, size_t size, int flags);
struct page **ppage;
unsigned int remainder;
int err, sent = 0;
@@ -403,6 +405,9 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
base += xdr->page_base;
ppage = xdr->pages + (base >> PAGE_SHIFT);
base &= ~PAGE_MASK;
+ do_sendpage = sock->ops->sendpage;
+ if (!zerocopy)
+ do_sendpage = sock_no_sendpage;
for(;;) {
unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder);
int flags = XS_SENDMSG_FLAGS;
@@ -410,7 +415,7 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
remainder -= len;
if (remainder != 0 || more)
flags |= MSG_MORE;
- err = sock->ops->sendpage(sock, *ppage, base, len, flags);
+ err = do_sendpage(sock, *ppage, base, len, flags);
if (remainder == 0 || err != len)
break;
sent += err;
@@ -431,9 +436,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
* @addrlen: UDP only -- length of destination address
* @xdr: buffer containing this request
* @base: starting position in the buffer
+ * @zerocopy: true if it is safe to use sendpage()
*
*/
-static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
+static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy)
{
unsigned int remainder = xdr->len - base;
int err, sent = 0;
@@ -461,7 +467,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
if (base < xdr->page_len) {
unsigned int len = xdr->page_len - base;
remainder -= len;
- err = xs_send_pagedata(sock, xdr, base, remainder != 0);
+ err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy);
if (remainder == 0 || err != len)
goto out;
sent += err;
@@ -564,7 +570,7 @@ static int xs_local_send_request(struct rpc_task *task)
req->rq_svec->iov_base, req->rq_svec->iov_len);

status = xs_sendpages(transport->sock, NULL, 0,
- xdr, req->rq_bytes_sent);
+ xdr, req->rq_bytes_sent, true);
dprintk("RPC: %s(%u) = %d\n",
__func__, xdr->len - req->rq_bytes_sent, status);
if (likely(status >= 0)) {
@@ -620,7 +626,7 @@ static int xs_udp_send_request(struct rpc_task *task)
status = xs_sendpages(transport->sock,
xs_addr(xprt),
xprt->addrlen, xdr,
- req->rq_bytes_sent);
+ req->rq_bytes_sent, true);

dprintk("RPC: xs_udp_send_request(%u) = %d\n",
xdr->len - req->rq_bytes_sent, status);
@@ -693,6 +699,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
struct rpc_xprt *xprt = req->rq_xprt;
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct xdr_buf *xdr = &req->rq_snd_buf;
+ bool zerocopy = true;
int status;

xs_encode_stream_record_marker(&req->rq_snd_buf);
@@ -700,13 +707,20 @@ static int xs_tcp_send_request(struct rpc_task *task)
xs_pktdump("packet data:",
req->rq_svec->iov_base,
req->rq_svec->iov_len);
+ /* Don't use zero copy if this is a resend. If the RPC call
+ * completes while the socket holds a reference to the pages,
+ * then we may end up resending corrupted data.
+ */
+ if (task->tk_flags & RPC_TASK_SENT)
+ zerocopy = false;

/* Continue transmitting the packet/record. We must be careful
* to cope with writespace callbacks arriving _after_ we have
* called sendmsg(). */
while (1) {
status = xs_sendpages(transport->sock,
- NULL, 0, xdr, req->rq_bytes_sent);
+ NULL, 0, xdr, req->rq_bytes_sent,
+ zerocopy);

dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
xdr->len - req->rq_bytes_sent, status);
--
1.8.3.1



2013-11-12 00:15:52

by Malahal Naineni

Subject: Re: [PATCH] SUNRPC: Fix a data corruption issue when retransmitting RPC calls

Myklebust, Trond [[email protected]] wrote:
> On Mon, 2013-11-11 at 17:15 -0600, Malahal Naineni wrote:
> > Trond Myklebust [[email protected]] wrote:
> > > @@ -700,13 +707,20 @@ static int xs_tcp_send_request(struct rpc_task *task)
> > > xs_pktdump("packet data:",
> > > req->rq_svec->iov_base,
> > > req->rq_svec->iov_len);
> > > + /* Don't use zero copy if this is a resend. If the RPC call
> > > + * completes while the socket holds a reference to the pages,
> > > + * then we may end up resending corrupted data.
> > > + */
> > > + if (task->tk_flags & RPC_TASK_SENT)
> >
> > How about RPC_WAS_SENT() ?
>
> <shrug>that evaluates to the same</shrug>

Yes, but what is the point of having a macro then?

Regards, Malahal.
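
For reference, RPC_WAS_SENT() wraps exactly this flag test; assuming the
definition in include/linux/sunrpc/sched.h at the time, the two spellings
are equivalent:

/* include/linux/sunrpc/sched.h (approximate) */
#define RPC_WAS_SENT(t)		((t)->tk_flags & RPC_TASK_SENT)

/* so the check in xs_tcp_send_request() could equally be written as: */
if (RPC_WAS_SENT(task))
	zerocopy = false;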


2013-11-10 22:18:04

by Myklebust, Trond

Subject: Re: [PATCH] SUNRPC: Fix a data corruption issue when retransmitting RPC calls


On Nov 10, 2013, at 15:33, Chuck Lever <[email protected]> wrote:

>
> On Nov 9, 2013, at 1:20 PM, Trond Myklebust <[email protected]> wrote:
>
>> The following scenario can cause silent data corruption when doing
>> NFS writes. It has mainly been observed when doing database writes
>> using O_DIRECT.
>>
>> 1) The RPC client uses sendpage() to do zero-copy of the page data.
>> 2) Due to networking issues, the reply from the server is delayed,
>> and so the RPC client times out.
>>
>> 3) The client issues a second sendpage of the page data as part of
>> an RPC call retransmission.
>>
>> 4) The reply to the first transmission arrives from the server
>> _before_ the client hardware has emptied the TCP socket send
>> buffer.
>> 5) After processing the reply, the RPC state machine rules that
>> the call is done, and triggers the completion callbacks.
>> 6) The application notices the RPC call is done, and reuses the
>> pages to store something else (e.g. a new write).
>>
>> 7) The client NIC drains the TCP socket send buffer. Since the
>> page data has now changed, it reads a corrupted version of the
>> initial RPC call, and puts it on the wire.
>>
>> This patch fixes the problem in the following manner:
>>
>> The ordering guarantees of TCP ensure that, by the time the server sends a
>> reply, the _first_ transmission has completed. Using zero-copy in that
>> situation is therefore safe.
>> If a timeout occurs, we instead send the retransmission using sendmsg()
>> (i.e. no zero-copy). The socket then holds its own full copy of the data,
>> and so it will retransmit a faithful reproduction even if the RPC call
>> completes and the application reuses the O_DIRECT buffer in the meantime.
>
> Clever!
>
> But if wsize is large, the retransmission will require a potentially large contiguous piece of memory to complete a WRITE RPC. In low memory scenarios, that might be hard to come by. Should be safe for direct I/O, but if this I/O is driven by memory reclaim, it could deadlock.

> We see this all the time when using NFS on network devices that do not support scatter/gather (and thus have no sock_sendpage() method to begin with).

On TCP? I thought that the memory reclaim can override the MSG_MORE flag if it needs to.

>> Signed-off-by: Trond Myklebust <[email protected]>
>> Cc: [email protected]
>> ---
>> net/sunrpc/xprtsock.c | 28 +++++++++++++++++++++-------
>> 1 file changed, 21 insertions(+), 7 deletions(-)
>>
>> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
>> index 17c88928b7db..dd9d295813cf 100644
>> --- a/net/sunrpc/xprtsock.c
>> +++ b/net/sunrpc/xprtsock.c
>> @@ -393,8 +393,10 @@ static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen,
>> return kernel_sendmsg(sock, &msg, NULL, 0, 0);
>> }
>>
>> -static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more)
>> +static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy)
>> {
>> + ssize_t (*do_sendpage)(struct socket *sock, struct page *page,
>> + int offset, size_t size, int flags);
>> struct page **ppage;
>> unsigned int remainder;
>> int err, sent = 0;
>> @@ -403,6 +405,9 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
>> base += xdr->page_base;
>> ppage = xdr->pages + (base >> PAGE_SHIFT);
>> base &= ~PAGE_MASK;
>> + do_sendpage = sock->ops->sendpage;
>> + if (!zerocopy)
>> + do_sendpage = sock_no_sendpage;
>> for(;;) {
>> unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder);
>> int flags = XS_SENDMSG_FLAGS;
>> @@ -410,7 +415,7 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
>> remainder -= len;
>> if (remainder != 0 || more)
>> flags |= MSG_MORE;
>> - err = sock->ops->sendpage(sock, *ppage, base, len, flags);
>> + err = do_sendpage(sock, *ppage, base, len, flags);
>> if (remainder == 0 || err != len)
>> break;
>> sent += err;
>> @@ -431,9 +436,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
>> * @addrlen: UDP only -- length of destination address
>> * @xdr: buffer containing this request
>> * @base: starting position in the buffer
>> + * @zerocopy: true if it is safe to use sendpage()
>> *
>> */
>> -static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
>> +static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy)
>> {
>> unsigned int remainder = xdr->len - base;
>> int err, sent = 0;
>> @@ -461,7 +467,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
>> if (base < xdr->page_len) {
>> unsigned int len = xdr->page_len - base;
>> remainder -= len;
>> - err = xs_send_pagedata(sock, xdr, base, remainder != 0);
>> + err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy);
>> if (remainder == 0 || err != len)
>> goto out;
>> sent += err;
>> @@ -564,7 +570,7 @@ static int xs_local_send_request(struct rpc_task *task)
>> req->rq_svec->iov_base, req->rq_svec->iov_len);
>>
>> status = xs_sendpages(transport->sock, NULL, 0,
>> - xdr, req->rq_bytes_sent);
>> + xdr, req->rq_bytes_sent, true);
>> dprintk("RPC: %s(%u) = %d\n",
>> __func__, xdr->len - req->rq_bytes_sent, status);
>> if (likely(status >= 0)) {
>> @@ -620,7 +626,7 @@ static int xs_udp_send_request(struct rpc_task *task)
>> status = xs_sendpages(transport->sock,
>> xs_addr(xprt),
>> xprt->addrlen, xdr,
>> - req->rq_bytes_sent);
>> + req->rq_bytes_sent, true);
>>
>> dprintk("RPC: xs_udp_send_request(%u) = %d\n",
>> xdr->len - req->rq_bytes_sent, status);
>> @@ -693,6 +699,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
>> struct rpc_xprt *xprt = req->rq_xprt;
>> struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
>> struct xdr_buf *xdr = &req->rq_snd_buf;
>> + bool zerocopy = true;
>> int status;
>>
>> xs_encode_stream_record_marker(&req->rq_snd_buf);
>> @@ -700,13 +707,20 @@ static int xs_tcp_send_request(struct rpc_task *task)
>> xs_pktdump("packet data:",
>> req->rq_svec->iov_base,
>> req->rq_svec->iov_len);
>> + /* Don't use zero copy if this is a resend. If the RPC call
>> + * completes while the socket holds a reference to the pages,
>> + * then we may end up resending corrupted data.
>> + */
>> + if (task->tk_flags & RPC_TASK_SENT)
>> + zerocopy = false;
>>
>> /* Continue transmitting the packet/record. We must be careful
>> * to cope with writespace callbacks arriving _after_ we have
>> * called sendmsg(). */
>> while (1) {
>> status = xs_sendpages(transport->sock,
>> - NULL, 0, xdr, req->rq_bytes_sent);
>> + NULL, 0, xdr, req->rq_bytes_sent,
>> + zerocopy);
>>
>> dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
>> xdr->len - req->rq_bytes_sent, status);
>> --
>> 1.8.3.1
>>
>
> --
> Chuck Lever
> chuck[dot]lever[at]oracle[dot]com


2013-11-10 20:33:26

by Chuck Lever III

Subject: Re: [PATCH] SUNRPC: Fix a data corruption issue when retransmitting RPC calls


On Nov 9, 2013, at 1:20 PM, Trond Myklebust <[email protected]> wrote:

> The following scenario can cause silent data corruption when doing
> NFS writes. It has mainly been observed when doing database writes
> using O_DIRECT.
>
> 1) The RPC client uses sendpage() to do zero-copy of the page data.
> 2) Due to networking issues, the reply from the server is delayed,
> and so the RPC client times out.
>
> 3) The client issues a second sendpage of the page data as part of
> an RPC call retransmission.
>
> 4) The reply to the first transmission arrives from the server
> _before_ the client hardware has emptied the TCP socket send
> buffer.
> 5) After processing the reply, the RPC state machine rules that
> the call is done, and triggers the completion callbacks.
> 6) The application notices the RPC call is done, and reuses the
> pages to store something else (e.g. a new write).
>
> 7) The client NIC drains the TCP socket send buffer. Since the
> page data has now changed, it reads a corrupted version of the
> initial RPC call, and puts it on the wire.
>
> This patch fixes the problem in the following manner:
>
> The ordering guarantees of TCP ensure that, by the time the server sends a
> reply, the _first_ transmission has completed. Using zero-copy in that
> situation is therefore safe.
> If a timeout occurs, we instead send the retransmission using sendmsg()
> (i.e. no zero-copy). The socket then holds its own full copy of the data,
> and so it will retransmit a faithful reproduction even if the RPC call
> completes and the application reuses the O_DIRECT buffer in the meantime.

Clever!

But if wsize is large, the retransmission will require a potentially large contiguous piece of memory to complete a WRITE RPC. In low memory scenarios, that might be hard to come by. Should be safe for direct I/O, but if this I/O is driven by memory reclaim, it could deadlock.

We see this all the time when using NFS on network devices that do not support scatter/gather (and thus have no sock_sendpage() method to begin with).



> Signed-off-by: Trond Myklebust <[email protected]>
> Cc: [email protected]
> ---
> net/sunrpc/xprtsock.c | 28 +++++++++++++++++++++-------
> 1 file changed, 21 insertions(+), 7 deletions(-)
>
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index 17c88928b7db..dd9d295813cf 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -393,8 +393,10 @@ static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen,
> return kernel_sendmsg(sock, &msg, NULL, 0, 0);
> }
>
> -static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more)
> +static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy)
> {
> + ssize_t (*do_sendpage)(struct socket *sock, struct page *page,
> + int offset, size_t size, int flags);
> struct page **ppage;
> unsigned int remainder;
> int err, sent = 0;
> @@ -403,6 +405,9 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
> base += xdr->page_base;
> ppage = xdr->pages + (base >> PAGE_SHIFT);
> base &= ~PAGE_MASK;
> + do_sendpage = sock->ops->sendpage;
> + if (!zerocopy)
> + do_sendpage = sock_no_sendpage;
> for(;;) {
> unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder);
> int flags = XS_SENDMSG_FLAGS;
> @@ -410,7 +415,7 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
> remainder -= len;
> if (remainder != 0 || more)
> flags |= MSG_MORE;
> - err = sock->ops->sendpage(sock, *ppage, base, len, flags);
> + err = do_sendpage(sock, *ppage, base, len, flags);
> if (remainder == 0 || err != len)
> break;
> sent += err;
> @@ -431,9 +436,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
> * @addrlen: UDP only -- length of destination address
> * @xdr: buffer containing this request
> * @base: starting position in the buffer
> + * @zerocopy: true if it is safe to use sendpage()
> *
> */
> -static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
> +static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy)
> {
> unsigned int remainder = xdr->len - base;
> int err, sent = 0;
> @@ -461,7 +467,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
> if (base < xdr->page_len) {
> unsigned int len = xdr->page_len - base;
> remainder -= len;
> - err = xs_send_pagedata(sock, xdr, base, remainder != 0);
> + err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy);
> if (remainder == 0 || err != len)
> goto out;
> sent += err;
> @@ -564,7 +570,7 @@ static int xs_local_send_request(struct rpc_task *task)
> req->rq_svec->iov_base, req->rq_svec->iov_len);
>
> status = xs_sendpages(transport->sock, NULL, 0,
> - xdr, req->rq_bytes_sent);
> + xdr, req->rq_bytes_sent, true);
> dprintk("RPC: %s(%u) = %d\n",
> __func__, xdr->len - req->rq_bytes_sent, status);
> if (likely(status >= 0)) {
> @@ -620,7 +626,7 @@ static int xs_udp_send_request(struct rpc_task *task)
> status = xs_sendpages(transport->sock,
> xs_addr(xprt),
> xprt->addrlen, xdr,
> - req->rq_bytes_sent);
> + req->rq_bytes_sent, true);
>
> dprintk("RPC: xs_udp_send_request(%u) = %d\n",
> xdr->len - req->rq_bytes_sent, status);
> @@ -693,6 +699,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
> struct rpc_xprt *xprt = req->rq_xprt;
> struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
> struct xdr_buf *xdr = &req->rq_snd_buf;
> + bool zerocopy = true;
> int status;
>
> xs_encode_stream_record_marker(&req->rq_snd_buf);
> @@ -700,13 +707,20 @@ static int xs_tcp_send_request(struct rpc_task *task)
> xs_pktdump("packet data:",
> req->rq_svec->iov_base,
> req->rq_svec->iov_len);
> + /* Don't use zero copy if this is a resend. If the RPC call
> + * completes while the socket holds a reference to the pages,
> + * then we may end up resending corrupted data.
> + */
> + if (task->tk_flags & RPC_TASK_SENT)
> + zerocopy = false;
>
> /* Continue transmitting the packet/record. We must be careful
> * to cope with writespace callbacks arriving _after_ we have
> * called sendmsg(). */
> while (1) {
> status = xs_sendpages(transport->sock,
> - NULL, 0, xdr, req->rq_bytes_sent);
> + NULL, 0, xdr, req->rq_bytes_sent,
> + zerocopy);
>
> dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
> xdr->len - req->rq_bytes_sent, status);
> --
> 1.8.3.1
>

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com





2013-11-11 23:25:11

by Myklebust, Trond

Subject: Re: [PATCH] SUNRPC: Fix a data corruption issue when retransmitting RPC calls

On Mon, 2013-11-11 at 17:15 -0600, Malahal Naineni wrote:
> Trond Myklebust [[email protected]] wrote:
> > @@ -700,13 +707,20 @@ static int xs_tcp_send_request(struct rpc_task *task)
> > xs_pktdump("packet data:",
> > req->rq_svec->iov_base,
> > req->rq_svec->iov_len);
> > + /* Don't use zero copy if this is a resend. If the RPC call
> > + * completes while the socket holds a reference to the pages,
> > + * then we may end up resending corrupted data.
> > + */
> > + if (task->tk_flags & RPC_TASK_SENT)
>
> How about RPC_WAS_SENT() ?

<shrug>that evaluates to the same</shrug>

--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
http://www.netapp.com

2013-11-11 23:15:46

by Malahal Naineni

Subject: Re: [PATCH] SUNRPC: Fix a data corruption issue when retransmitting RPC calls

Trond Myklebust [[email protected]] wrote:
> The following scenario can cause silent data corruption when doing
> NFS writes. It has mainly been observed when doing database writes
> using O_DIRECT.
>
> 1) The RPC client uses sendpage() to do zero-copy of the page data.
> 2) Due to networking issues, the reply from the server is delayed,
> and so the RPC client times out.
>
> 3) The client issues a second sendpage of the page data as part of
> an RPC call retransmission.
>
> 4) The reply to the first transmission arrives from the server
> _before_ the client hardware has emptied the TCP socket send
> buffer.
> 5) After processing the reply, the RPC state machine rules that
> the call is done, and triggers the completion callbacks.
> 6) The application notices the RPC call is done, and reuses the
> pages to store something else (e.g. a new write).
>
> 7) The client NIC drains the TCP socket send buffer. Since the
> page data has now changed, it reads a corrupted version of the
> initial RPC call, and puts it on the wire.
>
> This patch fixes the problem in the following manner:
>
> The ordering guarantees of TCP ensure that, by the time the server sends a
> reply, the _first_ transmission has completed. Using zero-copy in that
> situation is therefore safe.
> If a timeout occurs, we instead send the retransmission using sendmsg()
> (i.e. no zero-copy). The socket then holds its own full copy of the data,
> and so it will retransmit a faithful reproduction even if the RPC call
> completes and the application reuses the O_DIRECT buffer in the meantime.
>
> Signed-off-by: Trond Myklebust <[email protected]>
> Cc: [email protected]
> ---
> net/sunrpc/xprtsock.c | 28 +++++++++++++++++++++-------
> 1 file changed, 21 insertions(+), 7 deletions(-)
>
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index 17c88928b7db..dd9d295813cf 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -393,8 +393,10 @@ static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen,
> return kernel_sendmsg(sock, &msg, NULL, 0, 0);
> }
>
> -static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more)
> +static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy)
> {
> + ssize_t (*do_sendpage)(struct socket *sock, struct page *page,
> + int offset, size_t size, int flags);
> struct page **ppage;
> unsigned int remainder;
> int err, sent = 0;
> @@ -403,6 +405,9 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
> base += xdr->page_base;
> ppage = xdr->pages + (base >> PAGE_SHIFT);
> base &= ~PAGE_MASK;
> + do_sendpage = sock->ops->sendpage;
> + if (!zerocopy)
> + do_sendpage = sock_no_sendpage;
> for(;;) {
> unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder);
> int flags = XS_SENDMSG_FLAGS;
> @@ -410,7 +415,7 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
> remainder -= len;
> if (remainder != 0 || more)
> flags |= MSG_MORE;
> - err = sock->ops->sendpage(sock, *ppage, base, len, flags);
> + err = do_sendpage(sock, *ppage, base, len, flags);
> if (remainder == 0 || err != len)
> break;
> sent += err;
> @@ -431,9 +436,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
> * @addrlen: UDP only -- length of destination address
> * @xdr: buffer containing this request
> * @base: starting position in the buffer
> + * @zerocopy: true if it is safe to use sendpage()
> *
> */
> -static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
> +static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy)
> {
> unsigned int remainder = xdr->len - base;
> int err, sent = 0;
> @@ -461,7 +467,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
> if (base < xdr->page_len) {
> unsigned int len = xdr->page_len - base;
> remainder -= len;
> - err = xs_send_pagedata(sock, xdr, base, remainder != 0);
> + err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy);
> if (remainder == 0 || err != len)
> goto out;
> sent += err;
> @@ -564,7 +570,7 @@ static int xs_local_send_request(struct rpc_task *task)
> req->rq_svec->iov_base, req->rq_svec->iov_len);
>
> status = xs_sendpages(transport->sock, NULL, 0,
> - xdr, req->rq_bytes_sent);
> + xdr, req->rq_bytes_sent, true);
> dprintk("RPC: %s(%u) = %d\n",
> __func__, xdr->len - req->rq_bytes_sent, status);
> if (likely(status >= 0)) {
> @@ -620,7 +626,7 @@ static int xs_udp_send_request(struct rpc_task *task)
> status = xs_sendpages(transport->sock,
> xs_addr(xprt),
> xprt->addrlen, xdr,
> - req->rq_bytes_sent);
> + req->rq_bytes_sent, true);
>
> dprintk("RPC: xs_udp_send_request(%u) = %d\n",
> xdr->len - req->rq_bytes_sent, status);
> @@ -693,6 +699,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
> struct rpc_xprt *xprt = req->rq_xprt;
> struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
> struct xdr_buf *xdr = &req->rq_snd_buf;
> + bool zerocopy = true;
> int status;
>
> xs_encode_stream_record_marker(&req->rq_snd_buf);
> @@ -700,13 +707,20 @@ static int xs_tcp_send_request(struct rpc_task *task)
> xs_pktdump("packet data:",
> req->rq_svec->iov_base,
> req->rq_svec->iov_len);
> + /* Don't use zero copy if this is a resend. If the RPC call
> + * completes while the socket holds a reference to the pages,
> + * then we may end up resending corrupted data.
> + */
> + if (task->tk_flags & RPC_TASK_SENT)

How about RPC_WAS_SENT() ?

> + zerocopy = false;
>
> /* Continue transmitting the packet/record. We must be careful
> * to cope with writespace callbacks arriving _after_ we have
> * called sendmsg(). */
> while (1) {
> status = xs_sendpages(transport->sock,
> - NULL, 0, xdr, req->rq_bytes_sent);
> + NULL, 0, xdr, req->rq_bytes_sent,
> + zerocopy);
>
> dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
> xdr->len - req->rq_bytes_sent, status);
> --
> 1.8.3.1
>