2022-03-16 05:18:04

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH 2/3] SUNRPC: Replace internal use of SOCKWQ_ASYNC_NOSPACE

From: Trond Myklebust <[email protected]>

The socket's SOCKWQ_ASYNC_NOSPACE can be cleared by various actors in
the socket layer, so replace it with our own flag in the transport
sock_state field.

Reported-by: Chuck Lever <[email protected]>
Signed-off-by: Trond Myklebust <[email protected]>
---
include/linux/sunrpc/xprtsock.h | 1 +
net/sunrpc/xprtsock.c | 22 ++++------------------
2 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index 8c2a712cb242..121162a95863 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -89,5 +89,6 @@ struct sock_xprt {
#define XPRT_SOCK_WAKE_WRITE (5)
#define XPRT_SOCK_WAKE_PENDING (6)
#define XPRT_SOCK_WAKE_DISCONNECT (7)
+#define XPRT_SOCK_NOSPACE (8)

#endif /* _LINUX_SUNRPC_XPRTSOCK_H */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 786df8c0cda3..23cdb841f056 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -779,14 +779,8 @@ static int xs_nospace(struct rpc_rqst *req, struct sock_xprt *transport)

/* Don't race with disconnect */
if (xprt_connected(xprt)) {
- struct socket_wq *wq;
-
- rcu_read_lock();
- wq = rcu_dereference(sk->sk_wq);
- set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags);
- rcu_read_unlock();
-
/* wait for more buffer space */
+ set_bit(XPRT_SOCK_NOSPACE, &transport->sock_state);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk->sk_write_pending++;
xprt_wait_for_buffer_space(xprt);
@@ -1148,6 +1142,7 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state);
clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state);
clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state);
+ clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state);
}

static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr)
@@ -1494,7 +1489,6 @@ static void xs_tcp_state_change(struct sock *sk)

static void xs_write_space(struct sock *sk)
{
- struct socket_wq *wq;
struct sock_xprt *transport;
struct rpc_xprt *xprt;

@@ -1505,15 +1499,10 @@ static void xs_write_space(struct sock *sk)
if (unlikely(!(xprt = xprt_from_sock(sk))))
return;
transport = container_of(xprt, struct sock_xprt, xprt);
- rcu_read_lock();
- wq = rcu_dereference(sk->sk_wq);
- if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0)
- goto out;
-
+ if (!test_and_clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state))
+ return;
xs_run_error_worker(transport, XPRT_SOCK_WAKE_WRITE);
sk->sk_write_pending--;
-out:
- rcu_read_unlock();
}

/**
@@ -1854,7 +1843,6 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
sk->sk_user_data = xprt;
sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space;
- sock_set_flag(sk, SOCK_FASYNC);
sk->sk_error_report = xs_error_report;

xprt_clear_connected(xprt);
@@ -2048,7 +2036,6 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_user_data = xprt;
sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space;
- sock_set_flag(sk, SOCK_FASYNC);

xprt_set_connected(xprt);

@@ -2215,7 +2202,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_data_ready = xs_data_ready;
sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space;
- sock_set_flag(sk, SOCK_FASYNC);
sk->sk_error_report = xs_error_report;

/* socket options */
--
2.35.1


2022-03-17 04:30:47

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH 3/3] SUNRPC: Improve accuracy of socket ENOBUFS determination

From: Trond Myklebust <[email protected]>

The current code checks for whether or not the socket is in a writeable
state after we get an EAGAIN. That is racy, since we've dropped the
socket lock, so the amount of free buffer may have changed.

Instead, let's check whether the socket is writeable before we try to
write to it. If that was the case, we do expect the message to be at
least partially sent unless we're in a low memory situation.

Signed-off-by: Trond Myklebust <[email protected]>
---
net/sunrpc/xprtsock.c | 53 +++++++++++++++----------------------------
1 file changed, 18 insertions(+), 35 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 23cdb841f056..87cc97d6c677 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -805,13 +805,15 @@ static int xs_sock_nospace(struct rpc_rqst *req)
return ret;
}

-static int xs_stream_nospace(struct rpc_rqst *req)
+static int xs_stream_nospace(struct rpc_rqst *req, bool vm_wait)
{
struct sock_xprt *transport =
container_of(req->rq_xprt, struct sock_xprt, xprt);
struct sock *sk = transport->inet;
int ret = -EAGAIN;

+ if (vm_wait)
+ return -ENOBUFS;
lock_sock(sk);
if (!sk_stream_memory_free(sk))
ret = xs_nospace(req, transport);
@@ -869,6 +871,7 @@ static int xs_local_send_request(struct rpc_rqst *req)
struct msghdr msg = {
.msg_flags = XS_SENDMSG_FLAGS,
};
+ bool vm_wait;
unsigned int sent;
int status;

@@ -881,15 +884,14 @@ static int xs_local_send_request(struct rpc_rqst *req)
xs_pktdump("packet data:",
req->rq_svec->iov_base, req->rq_svec->iov_len);

+ vm_wait = sk_stream_is_writeable(transport->inet) ? true : false;
+
req->rq_xtime = ktime_get();
status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
transport->xmit.offset, rm, &sent);
dprintk("RPC: %s(%u) = %d\n",
__func__, xdr->len - transport->xmit.offset, status);

- if (status == -EAGAIN && sock_writeable(transport->inet))
- status = -ENOBUFS;
-
if (likely(sent > 0) || status == 0) {
transport->xmit.offset += sent;
req->rq_bytes_sent = transport->xmit.offset;
@@ -899,13 +901,12 @@ static int xs_local_send_request(struct rpc_rqst *req)
return 0;
}
status = -EAGAIN;
+ vm_wait = false;
}

switch (status) {
- case -ENOBUFS:
- break;
case -EAGAIN:
- status = xs_stream_nospace(req);
+ status = xs_stream_nospace(req, vm_wait);
break;
default:
dprintk("RPC: sendmsg returned unrecognized error %d\n",
@@ -1023,7 +1024,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
struct msghdr msg = {
.msg_flags = XS_SENDMSG_FLAGS,
};
- bool vm_wait = false;
+ bool vm_wait;
unsigned int sent;
int status;

@@ -1048,7 +1049,10 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
* called sendmsg(). */
req->rq_xtime = ktime_get();
tcp_sock_set_cork(transport->inet, true);
- while (1) {
+
+ vm_wait = sk_stream_is_writeable(transport->inet) ? true : false;
+
+ do {
status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
transport->xmit.offset, rm, &sent);

@@ -1069,31 +1073,10 @@ static int xs_tcp_send_request(struct rpc_rqst *req)

WARN_ON_ONCE(sent == 0 && status == 0);

- if (status == -EAGAIN ) {
- /*
- * Return EAGAIN if we're sure we're hitting the
- * socket send buffer limits.
- */
- if (test_bit(SOCK_NOSPACE, &transport->sock->flags))
- break;
- /*
- * Did we hit a memory allocation failure?
- */
- if (sent == 0) {
- status = -ENOBUFS;
- if (vm_wait)
- break;
- /* Retry, knowing now that we're below the
- * socket send buffer limit
- */
- vm_wait = true;
- }
- continue;
- }
- if (status < 0)
- break;
- vm_wait = false;
- }
+ if (sent > 0)
+ vm_wait = false;
+
+ } while (status == 0);

switch (status) {
case -ENOTSOCK:
@@ -1101,7 +1084,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
/* Should we call xs_close() here? */
break;
case -EAGAIN:
- status = xs_stream_nospace(req);
+ status = xs_stream_nospace(req, vm_wait);
break;
case -ECONNRESET:
case -ECONNREFUSED:
--
2.35.1