This patch set contains bug fixes for state recovery at the RDS
layer when the underlying transport is TCP and the TCP state at one
of the endpoints is reset.
V2 changes: per DaveM's comments, reduce the memory footprint and
follow the NFS/RPC model where possible. Added test case #3.
Without the changes in this set, when one of the endpoints is reset,
the existing code does not correctly clean up RDS socket state for
stale connections, resulting in unstable, timing-dependent behavior
on the wire, including an endless back-and-forth exchange of TCP
three-way handshakes (3WHs) and the potential for RDS state to never
converge.
Test cases used to verify the changes in this set are:
1. Start RDS client/server applications on two participating nodes,
node1 and node2. After at least one packet has been sent (to establish
the TCP connection), restart the rds_tcp module on the client and
resend packets. tcpdump should show the server sending a FIN for the
"old" client port, and a clean connection establishment/exchange for
the new client port.
2. At the end of step 1, restart the RDS server on node2 and start the
client on node1; verify with tcpdump and 'netstat -an | grep 16385'
that packets flow correctly.
3. Start RDS client/server applications on two participating nodes and
repeat steps 1 and 2, but this time simulate node failure by doing
"ifconfig <intf> down", so that no FIN is sent.
Sowmini Varadhan (2):
RDS-TCP: Always create a new rds_sock for an incoming connection.
RDS-TCP: only initiate reconnect attempt on outgoing TCP socket.
net/rds/connection.c | 17 +++++++++++++++--
net/rds/tcp_connect.c | 1 +
net/rds/tcp_listen.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 62 insertions(+), 2 deletions(-)
When running RDS over TCP, the active (client) side connects to the
listening ("passive") side at RDS_TCP_PORT. After the connection is
established, if the client side reboots (potentially without even
sending a FIN), the server still has a TCP socket in the established
state. If the server side now gets a new SYN from the client with a
different client port, TCP will create a new socket pair, but the RDS
layer will incorrectly pull up the old rds_connection (which is still
associated with the stale t_sock and RDS socket state).
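A simplified sketch of the pre-patch lookup path in
__rds_conn_create() (illustrative only, not the verbatim kernel code)
shows why the stale connection gets reused: the connection hash key is
only (laddr, faddr, transport), which is identical before and after
the client reboot, regardless of the new client port.

	/* Sketch of the pre-patch behavior: the lookup key does not
	 * include the TCP 4-tuple, so a SYN from a rebooted client
	 * (new ephemeral port, same IP addresses) matches the stale
	 * rds_connection still tied to the old t_sock.
	 */
	conn = rds_conn_lookup(head, laddr, faddr, trans);
	if (conn)
		goto out;	/* returns the stale connection */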
This patch corrects this behavior by having rds_tcp_accept_one()
always create a new connection for an incoming TCP SYN.
The RDS and TCP state associated with the old socket pair is cleaned
up via the rds_tcp_state_change() callback. In the typical case, the
client TCP sends a FIN on restart, triggering a transition to the
CLOSE_WAIT state and thus the cleanup. In the rarer event of client
death without a FIN, TCP keepalive probes on the socket will detect
the stale socket, and the TCP transition to the CLOSE state will
trigger the RDS state cleanup.
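For reference, with the keepalive values chosen in rds_tcp_keepalive()
below (keepidle = 5s, keepcnt = 5, and KEEPINTVL reusing keepidle,
following xs_tcp_finish_connecting()), a peer that dies silently is
detected roughly keepidle + keepcnt * keepintvl = 5 + 5 * 5 = 30
seconds after the last data on the socket.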
Signed-off-by: Sowmini Varadhan <[email protected]>
---
v2: per DaveM's comments, make the TCP server-side recovery follow the
patterns in NFS/RPC, and keep things neat without needless memory bloat.
net/rds/connection.c | 4 ++++
net/rds/tcp_connect.c | 1 +
net/rds/tcp_listen.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 51 insertions(+), 0 deletions(-)
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 14f0413..60f0cd6 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -126,7 +126,10 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *loop_trans;
unsigned long flags;
int ret;
+ struct rds_transport *otrans = trans;
+ if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
+ goto new_conn;
rcu_read_lock();
conn = rds_conn_lookup(head, laddr, faddr, trans);
if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
@@ -142,6 +145,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
if (conn)
goto out;
+new_conn:
conn = kmem_cache_zalloc(rds_conn_slab, gfp);
if (!conn) {
conn = ERR_PTR(-ENOMEM);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index f9f564a..973109c 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -62,6 +62,7 @@ void rds_tcp_state_change(struct sock *sk)
case TCP_ESTABLISHED:
rds_connect_complete(conn);
break;
+ case TCP_CLOSE_WAIT:
case TCP_CLOSE:
rds_conn_drop(conn);
default:
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 23ab4dc..0da49e3 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -45,12 +45,45 @@ static void rds_tcp_accept_worker(struct work_struct *work);
static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
static struct socket *rds_tcp_listen_sock;
+static int rds_tcp_keepalive(struct socket *sock)
+{
+ /* values below based on xs_udp_default_timeout */
+ int keepidle = 5; /* send a probe 'keepidle' secs after last data */
+ int keepcnt = 5; /* number of unack'ed probes before declaring dead */
+ int keepalive = 1;
+ int ret = 0;
+
+ ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+ (char *)&keepalive, sizeof(keepalive));
+ if (ret < 0)
+ goto bail;
+
+ ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
+ (char *)&keepcnt, sizeof(keepcnt));
+ if (ret < 0)
+ goto bail;
+
+ ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
+ (char *)&keepidle, sizeof(keepidle));
+ if (ret < 0)
+ goto bail;
+
+ /* KEEPINTVL is the interval between successive probes. We follow
+ * the model in xs_tcp_finish_connecting() and re-use keepidle.
+ */
+ ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
+ (char *)&keepidle, sizeof(keepidle));
+bail:
+ return ret;
+}
+
static int rds_tcp_accept_one(struct socket *sock)
{
struct socket *new_sock = NULL;
struct rds_connection *conn;
int ret;
struct inet_sock *inet;
+ struct rds_tcp_connection *rs_tcp;
ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
sock->sk->sk_protocol, &new_sock);
@@ -63,6 +96,10 @@ static int rds_tcp_accept_one(struct socket *sock)
if (ret < 0)
goto out;
+ ret = rds_tcp_keepalive(new_sock);
+ if (ret < 0)
+ goto out;
+
rds_tcp_tune(new_sock);
inet = inet_sk(new_sock->sk);
@@ -77,6 +114,15 @@ static int rds_tcp_accept_one(struct socket *sock)
ret = PTR_ERR(conn);
goto out;
}
+ /* An incoming SYN request came in, and TCP just accepted it.
+ * We always create a new conn for listen side of TCP, and do not
+ * add it to the c_hash_list.
+ *
+ * If the client reboots, this conn will need to be cleaned up.
+ * rds_tcp_state_change() will do that cleanup
+ */
+ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
+ WARN_ON(!rs_tcp || rs_tcp->t_sock);
/*
* see the comment above rds_queue_delayed_reconnect()
--
1.7.1
When the peer of an RDS-TCP connection restarts, a reconnect
attempt should only be made from the active side of the TCP
connection, i.e., the side that has a transient TCP port
number. Do not add the passive side of the TCP connection
to c_hash_node, thus avoiding triggering rds_queue_reconnect()
for passive RDS connections.
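As a reading aid (illustrative only, not part of the patch), the
gating condition added in the hunk below reduces to "add to
c_hash_node unless this is a passive TCP connection":

	/* Equivalent compact form of the condition in the hunk below:
	 * (is_outgoing && TCP) || !TCP  ==  !(TCP && !is_outgoing),
	 * i.e. only passive (incoming) TCP connections stay off the
	 * c_hash_node list, so rds_queue_reconnect() never runs for
	 * them.
	 */
	if (!(otrans->t_type == RDS_TRANS_TCP && !is_outgoing))
		hlist_add_head_rcu(&conn->c_hash_node, head);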
Signed-off-by: Sowmini Varadhan <[email protected]>
---
net/rds/connection.c | 13 +++++++++++--
1 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 60f0cd6..da6da57 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -234,13 +234,22 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
/* Creating normal conn */
struct rds_connection *found;
- found = rds_conn_lookup(head, laddr, faddr, trans);
+ if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
+ found = NULL;
+ else
+ found = rds_conn_lookup(head, laddr, faddr, trans);
if (found) {
trans->conn_free(conn->c_transport_data);
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
- hlist_add_head_rcu(&conn->c_hash_node, head);
+ if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
+ (otrans->t_type != RDS_TRANS_TCP)) {
+ /* Only the active side should be added to
+ * reconnect list for TCP.
+ */
+ hlist_add_head_rcu(&conn->c_hash_node, head);
+ }
rds_cong_add_conn(conn);
rds_conn_count++;
}
--
1.7.1
From: Sowmini Varadhan <[email protected]>
Date: Tue, 5 May 2015 15:20:50 -0400
>
> This patch set contains bug fixes for state recovery at the RDS
> layer when the underlying transport is TCP and the TCP state at one
> of the endpoints is reset.
>
> V2 changes: per DaveM's comments, reduce the memory footprint and
> follow the NFS/RPC model where possible. Added test case #3.
>
> Without the changes in this set, when one of the endpoints is reset,
> the existing code does not correctly clean up RDS socket state for
> stale connections, resulting in unstable, timing-dependent behavior
> on the wire, including an endless back-and-forth exchange of TCP
> three-way handshakes (3WHs) and the potential for RDS state to never
> converge.
...
Yeah this looks a lot better, series applied, thanks.