Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752187AbYLCUMG (ORCPT ); Wed, 3 Dec 2008 15:12:06 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1754197AbYLCTyc (ORCPT ); Wed, 3 Dec 2008 14:54:32 -0500 Received: from kroah.org ([198.145.64.141]:60263 "EHLO coco.kroah.org" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1754497AbYLCTy2 (ORCPT ); Wed, 3 Dec 2008 14:54:28 -0500 Date: Wed, 3 Dec 2008 11:52:46 -0800 From: Greg KH To: linux-kernel@vger.kernel.org, stable@kernel.org Cc: Justin Forbes , Zwane Mwaikambo , "Theodore Ts'o" , Randy Dunlap , Dave Jones , Chuck Wolber , Chris Wedgwood , Michael Krufky , Chuck Ebbert , Domenico Andreoli , Willy Tarreau , Rodrigo Rubira Branco , Jake Edge , Eugene Teo , torvalds@linux-foundation.org, akpm@linux-foundation.org, alan@lxorguk.ukuu.org.uk, Steve French , Jeff Layton , Shirish S Pargaonkar , Steve French Subject: [patch 055/104] cifs: Reduce number of socket retries in large write path Message-ID: <20081203195246.GD8950@kroah.com> References: <20081203193901.715896543@mini.kroah.org> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline; filename="cifs-reduce-number-of-socket-retries-in-large-write-path.patch" In-Reply-To: <20081203194725.GA8950@kroah.com> User-Agent: Mutt/1.5.16 (2007-06-09) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 12472 Lines: 337 2.6.27-stable review patch. If anyone has any objections, please let us know. ------------------ From: Steve French Backport of upstream commit edf1ae403896cb7750800508b14996ba6be39a53 for -stable. [CIFS] Reduce number of socket retries in large write path CIFS in some heavy stress conditions cifs could get EAGAIN repeatedly in smb_send2 which led to repeated retries and eventually failure of large writes which could lead to data corruption. There are three changes that were suggested by various network developers: 1) convert cifs from non-blocking to blocking tcp sendmsg (we left in the retry on failure) 2) change cifs to not set sendbuf and rcvbuf size for the socket (let tcp autotune the buffer sizes since that works much better in the TCP stack now) 3) if we have a partial frame sent in smb_send2, mark the tcp session as invalid (close the socket and reconnect) so we do not corrupt the remaining part of the SMB with the beginning of the next SMB. This does not appear to hurt performance measurably and has been run in various scenarios, but it definately removes a corruption that we were seeing in some high stress test cases. Acked-by: Shirish Pargaonkar Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsglob.h | 2 + fs/cifs/cifsproto.h | 2 - fs/cifs/connect.c | 58 ++++++++++++++++++++++++++++++++++++++-------------- fs/cifs/transport.c | 41 +++++++++++++++++++++++++++--------- 4 files changed, 77 insertions(+), 26 deletions(-) --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -141,6 +141,8 @@ struct TCP_Server_Info { char versionMajor; char versionMinor; bool svlocal:1; /* local server or remote */ + bool noblocksnd; /* use blocking sendmsg */ + bool noautotune; /* do not autotune send buf sizes */ atomic_t socketUseCount; /* number of open cifs sessions on socket */ atomic_t inFlight; /* number of requests on the wire to server */ #ifdef CONFIG_CIFS_STATS2 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -36,7 +36,7 @@ extern void cifs_buf_release(void *); extern struct smb_hdr *cifs_small_buf_get(void); extern void cifs_small_buf_release(void *); extern int smb_send(struct socket *, struct smb_hdr *, - unsigned int /* length */ , struct sockaddr *); + unsigned int /* length */ , struct sockaddr *, bool); extern unsigned int _GetXid(void); extern void _FreeXid(unsigned int); #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current->fsuid)); --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -90,6 +90,8 @@ struct smb_vol { bool nocase:1; /* request case insensitive filenames */ bool nobrl:1; /* disable sending byte range locks to srv */ bool seal:1; /* request transport encryption on share */ + bool noblocksnd:1; + bool noautotune:1; unsigned int rsize; unsigned int wsize; unsigned int sockopt; @@ -100,9 +102,11 @@ struct smb_vol { static int ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, char *netb_name, - char *server_netb_name); + char *server_netb_name, + bool noblocksnd, + bool nosndbuf); /* ipv6 never set sndbuf size */ static int ipv6_connect(struct sockaddr_in6 *psin_server, - struct socket **csocket); + struct socket **csocket, bool noblocksnd); /* @@ -188,12 +192,13 @@ cifs_reconnect(struct TCP_Server_Info *s try_to_freeze(); if (server->protocolType == IPV6) { rc = ipv6_connect(&server->addr.sockAddr6, - &server->ssocket); + &server->ssocket, server->noautotune); } else { rc = ipv4_connect(&server->addr.sockAddr, &server->ssocket, server->workstation_RFC1001_name, - server->server_RFC1001_name); + server->server_RFC1001_name, + server->noblocksnd, server->noautotune); } if (rc) { cFYI(1, ("reconnect error %d", rc)); @@ -409,8 +414,14 @@ incomplete_rcv: msleep(1); /* minimum sleep to prevent looping allowing socket to clear and app threads to set tcpStatus CifsNeedReconnect if server hung */ - if (pdu_length < 4) + if (pdu_length < 4) { + iov.iov_base = (4 - pdu_length) + + (char *)smb_buffer; + iov.iov_len = pdu_length; + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; goto incomplete_rcv; + } else continue; } else if (length <= 0) { @@ -1186,6 +1197,10 @@ cifs_parse_mount_options(char *options, /* ignore */ } else if (strnicmp(data, "rw", 2) == 0) { vol->rw = true; + } else if (strnicmp(data, "noblocksnd", 11) == 0) { + vol->noblocksnd = true; + } else if (strnicmp(data, "noautotune", 10) == 0) { + vol->noautotune = true; } else if ((strnicmp(data, "suid", 4) == 0) || (strnicmp(data, "nosuid", 6) == 0) || (strnicmp(data, "exec", 4) == 0) || @@ -1506,7 +1521,8 @@ static void rfc1002mangle(char *target, static int ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, - char *netbios_name, char *target_name) + char *netbios_name, char *target_name, + bool noblocksnd, bool noautotune) { int rc = 0; int connected = 0; @@ -1578,11 +1594,15 @@ ipv4_connect(struct sockaddr_in *psin_se (*csocket)->sk->sk_sndbuf, (*csocket)->sk->sk_rcvbuf, (*csocket)->sk->sk_rcvtimeo)); (*csocket)->sk->sk_rcvtimeo = 7 * HZ; + if (!noblocksnd) + (*csocket)->sk->sk_sndtimeo = 3 * HZ; /* make the bufsizes depend on wsize/rsize and max requests */ - if ((*csocket)->sk->sk_sndbuf < (200 * 1024)) - (*csocket)->sk->sk_sndbuf = 200 * 1024; - if ((*csocket)->sk->sk_rcvbuf < (140 * 1024)) - (*csocket)->sk->sk_rcvbuf = 140 * 1024; + if (noautotune) { + if ((*csocket)->sk->sk_sndbuf < (200 * 1024)) + (*csocket)->sk->sk_sndbuf = 200 * 1024; + if ((*csocket)->sk->sk_rcvbuf < (140 * 1024)) + (*csocket)->sk->sk_rcvbuf = 140 * 1024; + } /* send RFC1001 sessinit */ if (psin_server->sin_port == htons(RFC1001_PORT)) { @@ -1619,7 +1639,7 @@ ipv4_connect(struct sockaddr_in *psin_se /* sizeof RFC1002_SESSION_REQUEST with no scope */ smb_buf->smb_buf_length = 0x81000044; rc = smb_send(*csocket, smb_buf, 0x44, - (struct sockaddr *)psin_server); + (struct sockaddr *)psin_server, noblocksnd); kfree(ses_init_buf); msleep(1); /* RFC1001 layer in at least one server requires very short break before negprot @@ -1639,7 +1659,8 @@ ipv4_connect(struct sockaddr_in *psin_se } static int -ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket) +ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket, + bool noblocksnd) { int rc = 0; int connected = 0; @@ -1708,6 +1729,8 @@ ipv6_connect(struct sockaddr_in6 *psin_s the default. sock_setsockopt not used because it expects user space buffer */ (*csocket)->sk->sk_rcvtimeo = 7 * HZ; + if (!noblocksnd) + (*csocket)->sk->sk_sndtimeo = 3 * HZ; return rc; } @@ -1961,11 +1984,14 @@ cifs_mount(struct super_block *sb, struc cFYI(1, ("attempting ipv6 connect")); /* BB should we allow ipv6 on port 139? */ /* other OS never observed in Wild doing 139 with v6 */ - rc = ipv6_connect(&sin_server6, &csocket); + rc = ipv6_connect(&sin_server6, &csocket, + volume_info.noblocksnd); } else rc = ipv4_connect(&sin_server, &csocket, - volume_info.source_rfc1001_name, - volume_info.target_rfc1001_name); + volume_info.source_rfc1001_name, + volume_info.target_rfc1001_name, + volume_info.noblocksnd, + volume_info.noautotune); if (rc < 0) { cERROR(1, ("Error connecting to IPv4 socket. " "Aborting operation")); @@ -1980,6 +2006,8 @@ cifs_mount(struct super_block *sb, struc sock_release(csocket); goto out; } else { + srvTcp->noblocksnd = volume_info.noblocksnd; + srvTcp->noautotune = volume_info.noautotune; memcpy(&srvTcp->addr.sockAddr, &sin_server, sizeof(struct sockaddr_in)); atomic_set(&srvTcp->inFlight, 0); --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -162,7 +162,7 @@ void DeleteTconOplockQEntries(struct cif int smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer, - unsigned int smb_buf_length, struct sockaddr *sin) + unsigned int smb_buf_length, struct sockaddr *sin, bool noblocksnd) { int rc = 0; int i = 0; @@ -179,7 +179,10 @@ smb_send(struct socket *ssocket, struct smb_msg.msg_namelen = sizeof(struct sockaddr); smb_msg.msg_control = NULL; smb_msg.msg_controllen = 0; - smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; /* BB add more flags?*/ + if (noblocksnd) + smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; + else + smb_msg.msg_flags = MSG_NOSIGNAL; /* smb header is converted in header_assemble. bcc and rest of SMB word area, and byte area if necessary, is converted to littleendian in @@ -230,8 +233,8 @@ smb_send(struct socket *ssocket, struct } static int -smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec, - struct sockaddr *sin) +smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec, + struct sockaddr *sin, bool noblocksnd) { int rc = 0; int i = 0; @@ -241,6 +244,7 @@ smb_send2(struct socket *ssocket, struct unsigned int total_len; int first_vec = 0; unsigned int smb_buf_length = smb_buffer->smb_buf_length; + struct socket *ssocket = server->ssocket; if (ssocket == NULL) return -ENOTSOCK; /* BB eventually add reconnect code here */ @@ -249,7 +253,10 @@ smb_send2(struct socket *ssocket, struct smb_msg.msg_namelen = sizeof(struct sockaddr); smb_msg.msg_control = NULL; smb_msg.msg_controllen = 0; - smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; /* BB add more flags?*/ + if (noblocksnd) + smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; + else + smb_msg.msg_flags = MSG_NOSIGNAL; /* smb header is converted in header_assemble. bcc and rest of SMB word area, and byte area if necessary, is converted to littleendian in @@ -313,6 +320,16 @@ smb_send2(struct socket *ssocket, struct i = 0; /* in case we get ENOSPC on the next send */ } + if ((total_len > 0) && (total_len != smb_buf_length + 4)) { + cFYI(1, ("partial send (%d remaining), terminating session", + total_len)); + /* If we have only sent part of an SMB then the next SMB + could be taken as the remainder of this one. We need + to kill the socket so the server throws away the partial + SMB */ + server->tcpStatus = CifsNeedReconnect; + } + if (rc < 0) { cERROR(1, ("Error %d sending data on socket to server", rc)); } else @@ -519,8 +536,9 @@ SendReceive2(const unsigned int xid, str #ifdef CONFIG_CIFS_STATS2 atomic_inc(&ses->server->inSend); #endif - rc = smb_send2(ses->server->ssocket, iov, n_vec, - (struct sockaddr *) &(ses->server->addr.sockAddr)); + rc = smb_send2(ses->server, iov, n_vec, + (struct sockaddr *) &(ses->server->addr.sockAddr), + ses->server->noblocksnd); #ifdef CONFIG_CIFS_STATS2 atomic_dec(&ses->server->inSend); midQ->when_sent = jiffies; @@ -712,7 +730,8 @@ SendReceive(const unsigned int xid, stru atomic_inc(&ses->server->inSend); #endif rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, - (struct sockaddr *) &(ses->server->addr.sockAddr)); + (struct sockaddr *) &(ses->server->addr.sockAddr), + ses->server->noblocksnd); #ifdef CONFIG_CIFS_STATS2 atomic_dec(&ses->server->inSend); midQ->when_sent = jiffies; @@ -852,7 +871,8 @@ send_nt_cancel(struct cifsTconInfo *tcon return rc; } rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, - (struct sockaddr *) &(ses->server->addr.sockAddr)); + (struct sockaddr *) &(ses->server->addr.sockAddr), + ses->server->noblocksnd); up(&ses->server->tcpSem); return rc; } @@ -942,7 +962,8 @@ SendReceiveBlockingLock(const unsigned i atomic_inc(&ses->server->inSend); #endif rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, - (struct sockaddr *) &(ses->server->addr.sockAddr)); + (struct sockaddr *) &(ses->server->addr.sockAddr), + ses->server->noblocksnd); #ifdef CONFIG_CIFS_STATS2 atomic_dec(&ses->server->inSend); midQ->when_sent = jiffies; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/