Move the TCP data receive loop out of xs_tcp_data_ready(). Doing so
will allow us to move the data receive out of the softirq context in
a set of follow-up patches.
Signed-off-by: Trond Myklebust <[email protected]>
---
net/sunrpc/xprtsock.c | 44 +++++++++++++++++++++++++++++---------------
1 file changed, 29 insertions(+), 15 deletions(-)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 1a85e0ed0b48..fa8d0c15c8cd 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1391,6 +1391,30 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
return len - desc.count;
}
+static void xs_tcp_data_receive(struct sock_xprt *transport)
+{
+ struct rpc_xprt *xprt = &transport->xprt;
+ struct sock *sk;
+ read_descriptor_t rd_desc = {
+ .count = 2*1024*1024,
+ .arg.data = xprt,
+ };
+ unsigned long total = 0;
+ int read = 0;
+
+ sk = transport->inet;
+
+ /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
+ for (;;) {
+ read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
+ if (read <= 0)
+ break;
+ total += read;
+ rd_desc.count = 65536;
+ }
+ trace_xs_tcp_data_ready(xprt, read, total);
+}
+
/**
* xs_tcp_data_ready - "data ready" callback for TCP sockets
* @sk: socket with data to read
@@ -1398,34 +1422,24 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
*/
static void xs_tcp_data_ready(struct sock *sk)
{
+ struct sock_xprt *transport;
struct rpc_xprt *xprt;
- read_descriptor_t rd_desc;
- int read;
- unsigned long total = 0;
dprintk("RPC: xs_tcp_data_ready...\n");
read_lock_bh(&sk->sk_callback_lock);
- if (!(xprt = xprt_from_sock(sk))) {
- read = 0;
+ if (!(xprt = xprt_from_sock(sk)))
goto out;
- }
+ transport = container_of(xprt, struct sock_xprt, xprt);
+
/* Any data means we had a useful conversation, so
* the we don't need to delay the next reconnect
*/
if (xprt->reestablish_timeout)
xprt->reestablish_timeout = 0;
- /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
- rd_desc.arg.data = xprt;
- do {
- rd_desc.count = 65536;
- read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
- if (read > 0)
- total += read;
- } while (read > 0);
+ xs_tcp_data_receive(transport);
out:
- trace_xs_tcp_data_ready(xprt, read, total);
read_unlock_bh(&sk->sk_callback_lock);
}
--
2.4.3
This patch attempts to eke out a couple more iops by allowing the TCP code
to replace bh-safe spinlock with an ordinary one while receiving data. We
still need to use the bh-safe spinlock when completing.
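In outline, the receive path ends up with the following locking pattern
(a simplified sketch of the xs_tcp_read_reply() hunk below, with error
handling elided):

    spin_lock(&xprt->reserve_lock);        /* ordinary lock for the lookup */
    req = xprt_lookup_rqst(xprt, transport->tcp_xid);
    ...
    xs_tcp_read_common(xprt, desc, req);   /* bulk copy, no bh-safe lock held */
    if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) {
            spin_lock_bh(&xprt->transport_lock);   /* bh-safe lock only to complete */
            xprt_complete_rqst(req->rq_task, transport->tcp_copied);
            spin_unlock_bh(&xprt->transport_lock);
    }
    spin_unlock(&xprt->reserve_lock);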
Signed-off-by: Trond Myklebust <[email protected]>
---
net/sunrpc/xprt.c | 4 ++++
net/sunrpc/xprtsock.c | 17 ++++++++++-------
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 2e98f4a243e5..e604bb680bcf 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -951,6 +951,7 @@ void xprt_transmit(struct rpc_task *task)
/*
* Add to the list only if we're expecting a reply
*/
+ spin_lock(&xprt->reserve_lock);
spin_lock_bh(&xprt->transport_lock);
/* Update the softirq receive buffer */
memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
@@ -958,6 +959,7 @@ void xprt_transmit(struct rpc_task *task)
/* Add request to the receive list */
list_add_tail(&req->rq_list, &xprt->recv);
spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->reserve_lock);
xprt_reset_majortimeo(req);
/* Turn off autodisconnect */
del_singleshot_timer_sync(&xprt->timer);
@@ -1278,6 +1280,7 @@ void xprt_release(struct rpc_task *task)
task->tk_ops->rpc_count_stats(task, task->tk_calldata);
else if (task->tk_client)
rpc_count_iostats(task, task->tk_client->cl_metrics);
+ spin_lock(&xprt->reserve_lock);
spin_lock_bh(&xprt->transport_lock);
xprt->ops->release_xprt(xprt, task);
if (xprt->ops->release_request)
@@ -1289,6 +1292,7 @@ void xprt_release(struct rpc_task *task)
mod_timer(&xprt->timer,
xprt->last_used + xprt->idle_timeout);
spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->reserve_lock);
if (req->rq_buffer)
xprt->ops->buf_free(req->rq_buffer);
xprt_inject_disconnect(xprt);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 58dc90ccebb6..063d2eb20d8e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1246,21 +1246,24 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
/* Find and lock the request corresponding to this xid */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->reserve_lock);
req = xprt_lookup_rqst(xprt, transport->tcp_xid);
if (!req) {
dprintk("RPC: XID %08x request not found!\n",
ntohl(transport->tcp_xid));
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->reserve_lock);
return -1;
}
xs_tcp_read_common(xprt, desc, req);
- if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
+ if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) {
+ spin_lock_bh(&xprt->transport_lock);
xprt_complete_rqst(req->rq_task, transport->tcp_copied);
+ spin_unlock_bh(&xprt->transport_lock);
+ }
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->reserve_lock);
return 0;
}
@@ -1280,10 +1283,10 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
struct rpc_rqst *req;
/* Look up and lock the request corresponding to the given XID */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->reserve_lock);
req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
if (req == NULL) {
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->reserve_lock);
printk(KERN_WARNING "Callback slot table overflowed\n");
xprt_force_disconnect(xprt);
return -1;
@@ -1294,7 +1297,7 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
xprt_complete_bc_request(req, transport->tcp_copied);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->reserve_lock);
return 0;
}
--
2.4.3
Stream protocols such as TCP can often build up a backlog of data to be
read due to in-order delivery. Combine this with the fact that
read()-intensive NFS workloads need to receive a lot of data per RPC
call, and it turns out that receiving the data from inside a softirq
context can cause starvation.
The following patch moves the TCP data receive into a workqueue context.
We still end up calling tcp_read_sock(), but we do so from a process
context, meaning that softirqs are enabled for most of the time.
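In outline, the deferral looks like this (simplified from the hunks below;
the new recv_mutex guards against the socket being torn down while the
worker is running):

    static void xs_tcp_data_ready(struct sock *sk)
    {
            struct sock_xprt *transport = ...;  /* from xprt_from_sock(sk), as in the hunk */

            /* softirq context: do no real work, just kick the worker */
            queue_work(rpciod_workqueue, &transport->recv_worker);
    }

    static void xs_tcp_data_receive_workfn(struct work_struct *work)
    {
            struct sock_xprt *transport =
                    container_of(work, struct sock_xprt, recv_worker);
            /* process context: softirqs remain enabled during the copy */
            xs_tcp_data_receive(transport);
    }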
With this patch, I see a doubling of read bandwidth when running a
multi-threaded iozone workload between a virtualized client and server.
Signed-off-by: Trond Myklebust <[email protected]>
---
include/linux/sunrpc/xprtsock.h | 2 ++
net/sunrpc/xprtsock.c | 51 +++++++++++++++++++++++++++++------------
2 files changed, 38 insertions(+), 15 deletions(-)
diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index 357e44c1a46b..0ece4ba06f06 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -44,6 +44,8 @@ struct sock_xprt {
*/
unsigned long sock_state;
struct delayed_work connect_worker;
+ struct work_struct recv_worker;
+ struct mutex recv_mutex;
struct sockaddr_storage srcaddr;
unsigned short srcport;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index fa8d0c15c8cd..58dc90ccebb6 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -823,6 +823,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
kernel_sock_shutdown(sock, SHUT_RDWR);
+ mutex_lock(&transport->recv_mutex);
write_lock_bh(&sk->sk_callback_lock);
transport->inet = NULL;
transport->sock = NULL;
@@ -833,6 +834,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
xprt_clear_connected(xprt);
write_unlock_bh(&sk->sk_callback_lock);
xs_sock_reset_connection_flags(xprt);
+ mutex_unlock(&transport->recv_mutex);
trace_rpc_socket_close(xprt, sock);
sock_release(sock);
@@ -886,6 +888,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
cancel_delayed_work_sync(&transport->connect_worker);
xs_close(xprt);
+ cancel_work_sync(&transport->recv_worker);
xs_xprt_free(xprt);
module_put(THIS_MODULE);
}
@@ -1243,12 +1246,12 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
/* Find and lock the request corresponding to this xid */
- spin_lock(&xprt->transport_lock);
+ spin_lock_bh(&xprt->transport_lock);
req = xprt_lookup_rqst(xprt, transport->tcp_xid);
if (!req) {
dprintk("RPC: XID %08x request not found!\n",
ntohl(transport->tcp_xid));
- spin_unlock(&xprt->transport_lock);
+ spin_unlock_bh(&xprt->transport_lock);
return -1;
}
@@ -1257,7 +1260,7 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
xprt_complete_rqst(req->rq_task, transport->tcp_copied);
- spin_unlock(&xprt->transport_lock);
+ spin_unlock_bh(&xprt->transport_lock);
return 0;
}
@@ -1277,10 +1280,10 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
struct rpc_rqst *req;
/* Look up and lock the request corresponding to the given XID */
- spin_lock(&xprt->transport_lock);
+ spin_lock_bh(&xprt->transport_lock);
req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
if (req == NULL) {
- spin_unlock(&xprt->transport_lock);
+ spin_unlock_bh(&xprt->transport_lock);
printk(KERN_WARNING "Callback slot table overflowed\n");
xprt_force_disconnect(xprt);
return -1;
@@ -1291,7 +1294,7 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
xprt_complete_bc_request(req, transport->tcp_copied);
- spin_unlock(&xprt->transport_lock);
+ spin_unlock_bh(&xprt->transport_lock);
return 0;
}
@@ -1402,19 +1405,33 @@ static void xs_tcp_data_receive(struct sock_xprt *transport)
unsigned long total = 0;
int read = 0;
+ mutex_lock(&transport->recv_mutex);
sk = transport->inet;
+ if (sk == NULL)
+ goto out;
/* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
for (;;) {
+ lock_sock(sk);
read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
+ release_sock(sk);
if (read <= 0)
break;
total += read;
rd_desc.count = 65536;
}
+out:
+ mutex_unlock(&transport->recv_mutex);
trace_xs_tcp_data_ready(xprt, read, total);
}
+static void xs_tcp_data_receive_workfn(struct work_struct *work)
+{
+ struct sock_xprt *transport =
+ container_of(work, struct sock_xprt, recv_worker);
+ xs_tcp_data_receive(transport);
+}
+
/**
* xs_tcp_data_ready - "data ready" callback for TCP sockets
* @sk: socket with data to read
@@ -1437,8 +1454,8 @@ static void xs_tcp_data_ready(struct sock *sk)
*/
if (xprt->reestablish_timeout)
xprt->reestablish_timeout = 0;
+ queue_work(rpciod_workqueue, &transport->recv_worker);
- xs_tcp_data_receive(transport);
out:
read_unlock_bh(&sk->sk_callback_lock);
}
@@ -1840,6 +1857,10 @@ static inline void xs_reclassify_socket(int family, struct socket *sock)
}
#endif
+static void xs_dummy_data_receive_workfn(struct work_struct *work)
+{
+}
+
static void xs_dummy_setup_socket(struct work_struct *work)
{
}
@@ -2664,6 +2685,7 @@ static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
}
new = container_of(xprt, struct sock_xprt, xprt);
+ mutex_init(&new->recv_mutex);
memcpy(&xprt->addr, args->dstaddr, args->addrlen);
xprt->addrlen = args->addrlen;
if (args->srcaddr)
@@ -2717,6 +2739,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
xprt->ops = &xs_local_ops;
xprt->timeout = &xs_local_default_timeout;
+ INIT_WORK(&transport->recv_worker, xs_dummy_data_receive_workfn);
INIT_DELAYED_WORK(&transport->connect_worker,
xs_dummy_setup_socket);
@@ -2788,21 +2811,20 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
xprt->timeout = &xs_udp_default_timeout;
+ INIT_WORK(&transport->recv_worker, xs_dummy_data_receive_workfn);
+ INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_setup_socket);
+
switch (addr->sa_family) {
case AF_INET:
if (((struct sockaddr_in *)addr)->sin_port != htons(0))
xprt_set_bound(xprt);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_udp_setup_socket);
xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP);
break;
case AF_INET6:
if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
xprt_set_bound(xprt);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_udp_setup_socket);
xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6);
break;
default:
@@ -2867,21 +2889,20 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
xprt->ops = &xs_tcp_ops;
xprt->timeout = &xs_tcp_default_timeout;
+ INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
+ INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
+
switch (addr->sa_family) {
case AF_INET:
if (((struct sockaddr_in *)addr)->sin_port != htons(0))
xprt_set_bound(xprt);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_tcp_setup_socket);
xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
break;
case AF_INET6:
if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
xprt_set_bound(xprt);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_tcp_setup_socket);
xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
break;
default:
--
2.4.3
> On Oct 5, 2015, at 7:03 PM, Trond Myklebust <[email protected]> wrote:
>
> This patch attempts to eke out a couple more iops by allowing the TCP code
> to replace bh-safe spinlock with an ordinary one while receiving data. We
> still need to use the bh-safe spinlock when completing.
>
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> net/sunrpc/xprt.c | 4 ++++
> net/sunrpc/xprtsock.c | 17 ++++++++++-------
> 2 files changed, 14 insertions(+), 7 deletions(-)
>
> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> index 2e98f4a243e5..e604bb680bcf 100644
> --- a/net/sunrpc/xprt.c
> +++ b/net/sunrpc/xprt.c
> @@ -951,6 +951,7 @@ void xprt_transmit(struct rpc_task *task)
> /*
> * Add to the list only if we're expecting a reply
> */
> + spin_lock(&xprt->reserve_lock);
> spin_lock_bh(&xprt->transport_lock);
This is painful. spin_lock_bh is a pre-emption point, where
the kernel can handle interrupts while others are spinning
waiting for both of these locks. It can result in waiting
for more than 100us to get the lock.
Was there an increase or decrease in CPU utilization with
this change?
> /* Update the softirq receive buffer */
> memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
> @@ -958,6 +959,7 @@ void xprt_transmit(struct rpc_task *task)
> /* Add request to the receive list */
> list_add_tail(&req->rq_list, &xprt->recv);
> spin_unlock_bh(&xprt->transport_lock);
> + spin_unlock(&xprt->reserve_lock);
> xprt_reset_majortimeo(req);
> /* Turn off autodisconnect */
> del_singleshot_timer_sync(&xprt->timer);
> @@ -1278,6 +1280,7 @@ void xprt_release(struct rpc_task *task)
> task->tk_ops->rpc_count_stats(task, task->tk_calldata);
> else if (task->tk_client)
> rpc_count_iostats(task, task->tk_client->cl_metrics);
> + spin_lock(&xprt->reserve_lock);
> spin_lock_bh(&xprt->transport_lock);
> xprt->ops->release_xprt(xprt, task);
> if (xprt->ops->release_request)
> @@ -1289,6 +1292,7 @@ void xprt_release(struct rpc_task *task)
> mod_timer(&xprt->timer,
> xprt->last_used + xprt->idle_timeout);
> spin_unlock_bh(&xprt->transport_lock);
> + spin_unlock(&xprt->reserve_lock);
> if (req->rq_buffer)
> xprt->ops->buf_free(req->rq_buffer);
> xprt_inject_disconnect(xprt);
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index 58dc90ccebb6..063d2eb20d8e 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -1246,21 +1246,24 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
> dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
>
> /* Find and lock the request corresponding to this xid */
> - spin_lock_bh(&xprt->transport_lock);
> + spin_lock(&xprt->reserve_lock);
> req = xprt_lookup_rqst(xprt, transport->tcp_xid);
> if (!req) {
> dprintk("RPC: XID %08x request not found!\n",
> ntohl(transport->tcp_xid));
> - spin_unlock_bh(&xprt->transport_lock);
> + spin_unlock(&xprt->reserve_lock);
> return -1;
> }
Is a similar change needed for rpcrdma_reply_handler() ? If
that can’t be changed until it runs in a WQ context, I have
that patch right now. It can be ready for 4.4.
> xs_tcp_read_common(xprt, desc, req);
>
> - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
> + if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) {
> + spin_lock_bh(&xprt->transport_lock);
> xprt_complete_rqst(req->rq_task, transport->tcp_copied);
> + spin_unlock_bh(&xprt->transport_lock);
> + }
>
> - spin_unlock_bh(&xprt->transport_lock);
> + spin_unlock(&xprt->reserve_lock);
> return 0;
> }
xprt_complete_rqst() is actually the top source of contention
on transport_lock in my tests, thanks to the locking under
rpc_wake_up_queued_task(). Is there a plan to mitigate locking
costs in the RPC scheduler?
> @@ -1280,10 +1283,10 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
> struct rpc_rqst *req;
>
> /* Look up and lock the request corresponding to the given XID */
> - spin_lock_bh(&xprt->transport_lock);
> + spin_lock(&xprt->reserve_lock);
> req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
> if (req == NULL) {
> - spin_unlock_bh(&xprt->transport_lock);
> + spin_unlock(&xprt->reserve_lock);
> printk(KERN_WARNING "Callback slot table overflowed\n");
> xprt_force_disconnect(xprt);
> return -1;
> @@ -1294,7 +1297,7 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
>
> if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
> xprt_complete_bc_request(req, transport->tcp_copied);
> - spin_unlock_bh(&xprt->transport_lock);
> + spin_unlock(&xprt->reserve_lock);
>
> return 0;
> }
> --
> 2.4.3
>
--
Chuck Lever
On Mon, Oct 5, 2015 at 8:10 PM, Chuck Lever <[email protected]> wrote:
>
>> On Oct 5, 2015, at 7:03 PM, Trond Myklebust <[email protected]> wrote:
>>
>> This patch attempts to eke out a couple more iops by allowing the TCP code
>> to replace bh-safe spinlock with an ordinary one while receiving data. We
>> still need to use the bh-safe spinlock when completing.
>>
>> Signed-off-by: Trond Myklebust <[email protected]>
>> ---
>> net/sunrpc/xprt.c | 4 ++++
>> net/sunrpc/xprtsock.c | 17 ++++++++++-------
>> 2 files changed, 14 insertions(+), 7 deletions(-)
>>
>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>> index 2e98f4a243e5..e604bb680bcf 100644
>> --- a/net/sunrpc/xprt.c
>> +++ b/net/sunrpc/xprt.c
>> @@ -951,6 +951,7 @@ void xprt_transmit(struct rpc_task *task)
>> /*
>> * Add to the list only if we're expecting a reply
>> */
>> + spin_lock(&xprt->reserve_lock);
>> spin_lock_bh(&xprt->transport_lock);
>
> This is painful. spin_lock_bh is a pre-emption point, where
> the kernel can handle interrupts while others are spinning
> waiting for both of these locks. It can result in waiting
> for more than 100us to get the lock.
>
I thought, the problem isn't so much the spin_lock_bh() but rather the
spin_unlock_bh(). If this is the outermost bh-safe lock, then
spin_unlock_bh() may call do_softirq(), which can take a while, and
therefore increase the chances of contention for any outer
(non-bh-safe) locks.
Note, however, that PATCH 2/3 significantly cuts down on the total
amount of time spent in do_softirq(), and therefore explains most of
the performance increase. We can drop patch 3/3 without losing much.
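To illustrate the point (a generic sketch, not code from this series):

    spin_lock(&outer);          /* ordinary, non-bh-safe lock */
    spin_lock_bh(&inner);       /* disables bottom halves */
    ...
    spin_unlock_bh(&inner);     /* outermost bh exit: may run do_softirq() */
    spin_unlock(&outer);        /* waiters on &outer spun through all of that */

The outer lock's hold time then includes whatever do_softirq() does at the
inner unlock.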
> Was there an increase or decrease in CPU utilization with
> this change?
I'm seeing less overall contention.
>> /* Update the softirq receive buffer */
>> memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
>> @@ -958,6 +959,7 @@ void xprt_transmit(struct rpc_task *task)
>> /* Add request to the receive list */
>> list_add_tail(&req->rq_list, &xprt->recv);
>> spin_unlock_bh(&xprt->transport_lock);
>> + spin_unlock(&xprt->reserve_lock);
>> xprt_reset_majortimeo(req);
>> /* Turn off autodisconnect */
>> del_singleshot_timer_sync(&xprt->timer);
>> @@ -1278,6 +1280,7 @@ void xprt_release(struct rpc_task *task)
>> task->tk_ops->rpc_count_stats(task, task->tk_calldata);
>> else if (task->tk_client)
>> rpc_count_iostats(task, task->tk_client->cl_metrics);
>> + spin_lock(&xprt->reserve_lock);
>> spin_lock_bh(&xprt->transport_lock);
>> xprt->ops->release_xprt(xprt, task);
>> if (xprt->ops->release_request)
>> @@ -1289,6 +1292,7 @@ void xprt_release(struct rpc_task *task)
>> mod_timer(&xprt->timer,
>> xprt->last_used + xprt->idle_timeout);
>> spin_unlock_bh(&xprt->transport_lock);
>> + spin_unlock(&xprt->reserve_lock);
>> if (req->rq_buffer)
>> xprt->ops->buf_free(req->rq_buffer);
>> xprt_inject_disconnect(xprt);
>> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
>> index 58dc90ccebb6..063d2eb20d8e 100644
>> --- a/net/sunrpc/xprtsock.c
>> +++ b/net/sunrpc/xprtsock.c
>> @@ -1246,21 +1246,24 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
>> dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
>>
>> /* Find and lock the request corresponding to this xid */
>> - spin_lock_bh(&xprt->transport_lock);
>> + spin_lock(&xprt->reserve_lock);
>> req = xprt_lookup_rqst(xprt, transport->tcp_xid);
>> if (!req) {
>> dprintk("RPC: XID %08x request not found!\n",
>> ntohl(transport->tcp_xid));
>> - spin_unlock_bh(&xprt->transport_lock);
>> + spin_unlock(&xprt->reserve_lock);
>> return -1;
>> }
>
> Is a similar change needed for rpcrdma_reply_handler() ? If
> that can’t be changed until it runs in a WQ context, I have
> that patch right now. It can be ready for 4.4.
>
>
>> xs_tcp_read_common(xprt, desc, req);
>>
>> - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
>> + if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) {
>> + spin_lock_bh(&xprt->transport_lock);
>> xprt_complete_rqst(req->rq_task, transport->tcp_copied);
>> + spin_unlock_bh(&xprt->transport_lock);
>> + }
>>
>> - spin_unlock_bh(&xprt->transport_lock);
>> + spin_unlock(&xprt->reserve_lock);
>> return 0;
>> }
>
> xprt_complete_rqst() is actually the top source of contention
> on transport_lock in my tests, thanks to the locking under
> rpc_wake_up_queued_task(). Is there a plan to mitigate locking
> costs in the RPC scheduler?
I'm not understanding how rpc_wake_up_queued_task() could cause a
problem here. Can you explain?
>> @@ -1280,10 +1283,10 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
>> struct rpc_rqst *req;
>>
>> /* Look up and lock the request corresponding to the given XID */
>> - spin_lock_bh(&xprt->transport_lock);
>> + spin_lock(&xprt->reserve_lock);
>> req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
>> if (req == NULL) {
>> - spin_unlock_bh(&xprt->transport_lock);
>> + spin_unlock(&xprt->reserve_lock);
>> printk(KERN_WARNING "Callback slot table overflowed\n");
>> xprt_force_disconnect(xprt);
>> return -1;
>> @@ -1294,7 +1297,7 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
>>
>> if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
>> xprt_complete_bc_request(req, transport->tcp_copied);
>> - spin_unlock_bh(&xprt->transport_lock);
>> + spin_unlock(&xprt->reserve_lock);
>>
>> return 0;
>> }
>> --
>> 2.4.3
>>
>
> --
> Chuck Lever
>
>
>
> On Oct 5, 2015, at 11:55 PM, Trond Myklebust <[email protected]> wrote:
>
> On Mon, Oct 5, 2015 at 8:10 PM, Chuck Lever <[email protected]> wrote:
>>
>>> On Oct 5, 2015, at 7:03 PM, Trond Myklebust <[email protected]> wrote:
>>>
>>> This patch attempts to eke out a couple more iops by allowing the TCP code
>>> to replace bh-safe spinlock with an ordinary one while receiving data. We
>>> still need to use the bh-safe spinlock when completing.
>>>
>>> Signed-off-by: Trond Myklebust <[email protected]>
>>> ---
>>> net/sunrpc/xprt.c | 4 ++++
>>> net/sunrpc/xprtsock.c | 17 ++++++++++-------
>>> 2 files changed, 14 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>>> index 2e98f4a243e5..e604bb680bcf 100644
>>> --- a/net/sunrpc/xprt.c
>>> +++ b/net/sunrpc/xprt.c
>>> @@ -951,6 +951,7 @@ void xprt_transmit(struct rpc_task *task)
>>> /*
>>> * Add to the list only if we're expecting a reply
>>> */
>>> + spin_lock(&xprt->reserve_lock);
>>> spin_lock_bh(&xprt->transport_lock);
>>
>> This is painful. spin_lock_bh is a pre-emption point, where
>> the kernel can handle interrupts while others are spinning
>> waiting for both of these locks. It can result in waiting
>> for more than 100us to get the lock.
>>
>
> I thought, the problem isn't so much the spin_lock_bh() but rather the
> spin_unlock_bh(). If this is the outermost bh-safe lock, then
> spin_unlock_bh() may call do_softirq(), which can take a while, and
> therefore increase the chances of contention for any outer
> (non-bh-safe) locks.
>
> Note, however, that PATCH 2/3 significantly cuts down on the total
> amount of time spent in do_softirq(), and therefore explains most of
> the performance increase. We can drop patch 3/3 without losing much.
My only quibble is the double locking in xprt_transmit().
Wondering if it is sensible to use llist for the receive
list.
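For reference, the llist primitives in question look roughly like this
(an illustration only; llist has no arbitrary-removal operation, so it
would not be a drop-in replacement for the xid lookup on xprt->recv, and
the wrapper type and helper below are hypothetical):

    struct recv_entry {                 /* hypothetical wrapper */
            struct llist_node node;
            __be32 xid;
    };

    static LLIST_HEAD(recv_llist);

    /* producer: lockless add */
    llist_add(&entry->node, &recv_llist);

    /* consumer: atomically take the whole list, then walk it */
    struct llist_node *batch = llist_del_all(&recv_llist);
    struct recv_entry *e;

    llist_for_each_entry(e, batch, node)
            handle_reply(e);            /* hypothetical helper */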
I actually need to drop locks between xprt_lookup_rqst()
and xprt_complete_rqst() for other reasons. That part of
this patch is desirable!
In fact, I wasn’t aware that it was safe to drop the
lock after xprt_lookup_rqst without first having removed
the found request from the receive list.
>> Was there an increase or decrease in CPU utilization with
>> this change?
>
> I'm seeing less overall contention.
>
>>> /* Update the softirq receive buffer */
>>> memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
>>> @@ -958,6 +959,7 @@ void xprt_transmit(struct rpc_task *task)
>>> /* Add request to the receive list */
>>> list_add_tail(&req->rq_list, &xprt->recv);
>>> spin_unlock_bh(&xprt->transport_lock);
>>> + spin_unlock(&xprt->reserve_lock);
>>> xprt_reset_majortimeo(req);
>>> /* Turn off autodisconnect */
>>> del_singleshot_timer_sync(&xprt->timer);
>>> @@ -1278,6 +1280,7 @@ void xprt_release(struct rpc_task *task)
>>> task->tk_ops->rpc_count_stats(task, task->tk_calldata);
>>> else if (task->tk_client)
>>> rpc_count_iostats(task, task->tk_client->cl_metrics);
>>> + spin_lock(&xprt->reserve_lock);
>>> spin_lock_bh(&xprt->transport_lock);
>>> xprt->ops->release_xprt(xprt, task);
>>> if (xprt->ops->release_request)
>>> @@ -1289,6 +1292,7 @@ void xprt_release(struct rpc_task *task)
>>> mod_timer(&xprt->timer,
>>> xprt->last_used + xprt->idle_timeout);
>>> spin_unlock_bh(&xprt->transport_lock);
>>> + spin_unlock(&xprt->reserve_lock);
>>> if (req->rq_buffer)
>>> xprt->ops->buf_free(req->rq_buffer);
>>> xprt_inject_disconnect(xprt);
>>> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
>>> index 58dc90ccebb6..063d2eb20d8e 100644
>>> --- a/net/sunrpc/xprtsock.c
>>> +++ b/net/sunrpc/xprtsock.c
>>> @@ -1246,21 +1246,24 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
>>> dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
>>>
>>> /* Find and lock the request corresponding to this xid */
>>> - spin_lock_bh(&xprt->transport_lock);
>>> + spin_lock(&xprt->reserve_lock);
>>> req = xprt_lookup_rqst(xprt, transport->tcp_xid);
>>> if (!req) {
>>> dprintk("RPC: XID %08x request not found!\n",
>>> ntohl(transport->tcp_xid));
>>> - spin_unlock_bh(&xprt->transport_lock);
>>> + spin_unlock(&xprt->reserve_lock);
>>> return -1;
>>> }
>>
>> Is a similar change needed for rpcrdma_reply_handler() ? If
>> that can’t be changed until it runs in a WQ context, I have
>> that patch right now. It can be ready for 4.4.
>>
>>
>>> xs_tcp_read_common(xprt, desc, req);
>>>
>>> - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
>>> + if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) {
>>> + spin_lock_bh(&xprt->transport_lock);
>>> xprt_complete_rqst(req->rq_task, transport->tcp_copied);
>>> + spin_unlock_bh(&xprt->transport_lock);
>>> + }
>>>
>>> - spin_unlock_bh(&xprt->transport_lock);
>>> + spin_unlock(&xprt->reserve_lock);
>>> return 0;
>>> }
>>
>> xprt_complete_rqst() is actually the top source of contention
>> on transport_lock in my tests, thanks to the locking under
>> rpc_wake_up_queued_task(). Is there a plan to mitigate locking
>> costs in the RPC scheduler?
>
> I'm not understanding how rpc_wake_up_queued_task() could cause a
> problem here. Can you explain?
xprt_complete_rqst() is holding the transport_lock, and
rpc_wake_up_queued_task is holding the wait queue lock.
During complex workloads like database or multi-threaded software
build, I’ve observed the rpc_make_runnable -> wake_up_bit path
take a very long time while both of these locks are held.
The problem seems similar to the above: pre-emption allows the
kernel to perform housekeeping while RPC-related threads are
spinning on these locks.
There is likewise a problem with xprt_lock_write_next, which
invokes rpc_wake_up_first while the transport_lock is held.
>>> @@ -1280,10 +1283,10 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
>>> struct rpc_rqst *req;
>>>
>>> /* Look up and lock the request corresponding to the given XID */
>>> - spin_lock_bh(&xprt->transport_lock);
>>> + spin_lock(&xprt->reserve_lock);
>>> req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
>>> if (req == NULL) {
>>> - spin_unlock_bh(&xprt->transport_lock);
>>> + spin_unlock(&xprt->reserve_lock);
>>> printk(KERN_WARNING "Callback slot table overflowed\n");
>>> xprt_force_disconnect(xprt);
>>> return -1;
>>> @@ -1294,7 +1297,7 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
>>>
>>> if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
>>> xprt_complete_bc_request(req, transport->tcp_copied);
>>> - spin_unlock_bh(&xprt->transport_lock);
>>> + spin_unlock(&xprt->reserve_lock);
>>>
>>> return 0;
>>> }
>>> --
>>> 2.4.3
>>>
>>
>> --
>> Chuck Lever
>>
>>
>>
--
Chuck Lever
On Tue, Oct 6, 2015 at 9:56 AM, Chuck Lever <[email protected]> wrote:
>
>> On Oct 5, 2015, at 11:55 PM, Trond Myklebust <[email protected]> wrote:
>>
>> On Mon, Oct 5, 2015 at 8:10 PM, Chuck Lever <[email protected]> wrote:
>>>
>>>> On Oct 5, 2015, at 7:03 PM, Trond Myklebust <[email protected]> wrote:
>>>>
>>>> This patch attempts to eke out a couple more iops by allowing the TCP code
>>>> to replace bh-safe spinlock with an ordinary one while receiving data. We
>>>> still need to use the bh-safe spinlock when completing.
>>>>
>>>> Signed-off-by: Trond Myklebust <[email protected]>
>>>> ---
>>>> net/sunrpc/xprt.c | 4 ++++
>>>> net/sunrpc/xprtsock.c | 17 ++++++++++-------
>>>> 2 files changed, 14 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>>>> index 2e98f4a243e5..e604bb680bcf 100644
>>>> --- a/net/sunrpc/xprt.c
>>>> +++ b/net/sunrpc/xprt.c
>>>> @@ -951,6 +951,7 @@ void xprt_transmit(struct rpc_task *task)
>>>> /*
>>>> * Add to the list only if we're expecting a reply
>>>> */
>>>> + spin_lock(&xprt->reserve_lock);
>>>> spin_lock_bh(&xprt->transport_lock);
>>>
>>> This is painful. spin_lock_bh is a pre-emption point, where
>>> the kernel can handle interrupts while others are spinning
>>> waiting for both of these locks. It can result in waiting
>>> for more than 100us to get the lock.
>>>
>>
>> I thought, the problem isn't so much the spin_lock_bh() but rather the
>> spin_unlock_bh(). If this is the outermost bh-safe lock, then
>> spin_unlock_bh() may call do_softirq(), which can take a while, and
>> therefore increase the chances of contention for any outer
>> (non-bh-safe) locks.
>>
>> Note, however, that PATCH 2/3 significantly cuts down on the total
>> amount of time spent in do_softirq(), and therefore explains most of
>> the performance increase. We can drop patch 3/3 without losing much.
>
> My only quibble is the double locking in xprt_transmit().
> Wondering if it is sensible to use llist for the receive
> list.
Possibly.
> I actually need to drop locks between xprt_lookup_rqst()
> and xprt_complete_rqst() for other reasons. That part of
> this patch is desirable!
>
> In fact, I wasn’t aware that it was safe to drop the
> lock after xprt_lookup_rqst without first having removed
> the found request from the receive list.
That's what I'm using the reserve_lock for. We still need something
like it in order to pin the request on the receive list while we're
doing the copy.
As the subject message says, though, this is an RFC series. I'm still
playing with the code in order to figure out what is needed.
>>> Was there an increase or decrease in CPU utilization with
>>> this change?
>>
>> I'm seeing less overall contention.
>>
>>>> /* Update the softirq receive buffer */
>>>> memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
>>>> @@ -958,6 +959,7 @@ void xprt_transmit(struct rpc_task *task)
>>>> /* Add request to the receive list */
>>>> list_add_tail(&req->rq_list, &xprt->recv);
>>>> spin_unlock_bh(&xprt->transport_lock);
>>>> + spin_unlock(&xprt->reserve_lock);
>>>> xprt_reset_majortimeo(req);
>>>> /* Turn off autodisconnect */
>>>> del_singleshot_timer_sync(&xprt->timer);
>>>> @@ -1278,6 +1280,7 @@ void xprt_release(struct rpc_task *task)
>>>> task->tk_ops->rpc_count_stats(task, task->tk_calldata);
>>>> else if (task->tk_client)
>>>> rpc_count_iostats(task, task->tk_client->cl_metrics);
>>>> + spin_lock(&xprt->reserve_lock);
>>>> spin_lock_bh(&xprt->transport_lock);
>>>> xprt->ops->release_xprt(xprt, task);
>>>> if (xprt->ops->release_request)
>>>> @@ -1289,6 +1292,7 @@ void xprt_release(struct rpc_task *task)
>>>> mod_timer(&xprt->timer,
>>>> xprt->last_used + xprt->idle_timeout);
>>>> spin_unlock_bh(&xprt->transport_lock);
>>>> + spin_unlock(&xprt->reserve_lock);
>>>> if (req->rq_buffer)
>>>> xprt->ops->buf_free(req->rq_buffer);
>>>> xprt_inject_disconnect(xprt);
>>>> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
>>>> index 58dc90ccebb6..063d2eb20d8e 100644
>>>> --- a/net/sunrpc/xprtsock.c
>>>> +++ b/net/sunrpc/xprtsock.c
>>>> @@ -1246,21 +1246,24 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
>>>> dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
>>>>
>>>> /* Find and lock the request corresponding to this xid */
>>>> - spin_lock_bh(&xprt->transport_lock);
>>>> + spin_lock(&xprt->reserve_lock);
>>>> req = xprt_lookup_rqst(xprt, transport->tcp_xid);
>>>> if (!req) {
>>>> dprintk("RPC: XID %08x request not found!\n",
>>>> ntohl(transport->tcp_xid));
>>>> - spin_unlock_bh(&xprt->transport_lock);
>>>> + spin_unlock(&xprt->reserve_lock);
>>>> return -1;
>>>> }
>>>
>>> Is a similar change needed for rpcrdma_reply_handler() ? If
>>> that can’t be changed until it runs in a WQ context, I have
>>> that patch right now. It can be ready for 4.4.
>>>
>>>
>>>> xs_tcp_read_common(xprt, desc, req);
>>>>
>>>> - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
>>>> + if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) {
>>>> + spin_lock_bh(&xprt->transport_lock);
>>>> xprt_complete_rqst(req->rq_task, transport->tcp_copied);
>>>> + spin_unlock_bh(&xprt->transport_lock);
>>>> + }
>>>>
>>>> - spin_unlock_bh(&xprt->transport_lock);
>>>> + spin_unlock(&xprt->reserve_lock);
>>>> return 0;
>>>> }
>>>
>>> xprt_complete_rqst() is actually the top source of contention
>>> on transport_lock in my tests, thanks to the locking under
>>> rpc_wake_up_queued_task(). Is there a plan to mitigate locking
>>> costs in the RPC scheduler?
>>
>> I'm not understanding how rpc_wake_up_queued_task() could cause a
>> problem here. Can you explain?
>
> xprt_complete_rqst() is holding the transport_lock, and
> rpc_wake_up_queued_task is holding the wait queue lock.
>
> During complex workloads like database or multi-threaded software
> build, I’ve observed the rpc_make_runnable -> wake_up_bit path
> take a very long time while both of these locks are held.
>
> The problem seems similar to the above: pre-emption allows the
> kernel to perform housekeeping while RPC-related threads are
> spinning on these locks.
>
> There is likewise a problem with xprt_lock_write_next, which
> invokes rpc_wake_up_first while the transport_lock is held.
I'm still not understanding. Apologies if I'm just being dense.
The call to do_softirq() shouldn't happen until after the last bh-safe
lock is released, so I wouldn't normally expect it to be called until
after the transport lock has been released in xprt_complete_rqst(). In
the existing upstream code, because all of this is being called from a
bh-safe context in xs_tcp_data_ready(), the do_softirq() will be
deferred until at least then, and possibly even later, depending on
where in the socket code the ->data_ready() callback originated from.
> On Oct 6, 2015, at 10:17 AM, Trond Myklebust <[email protected]> wrote:
>
> On Tue, Oct 6, 2015 at 9:56 AM, Chuck Lever <[email protected]> wrote:
>>
>>> On Oct 5, 2015, at 11:55 PM, Trond Myklebust <[email protected]> wrote:
>>>
>>> On Mon, Oct 5, 2015 at 8:10 PM, Chuck Lever <[email protected]> wrote:
>>>>
>>>>> On Oct 5, 2015, at 7:03 PM, Trond Myklebust <[email protected]> wrote:
>>>>>
>>>>> This patch attempts to eke out a couple more iops by allowing the TCP code
>>>>> to replace bh-safe spinlock with an ordinary one while receiving data. We
>>>>> still need to use the bh-safe spinlock when completing.
>>>>>
>>>>> Signed-off-by: Trond Myklebust <[email protected]>
>>>>> ---
>>>>> net/sunrpc/xprt.c | 4 ++++
>>>>> net/sunrpc/xprtsock.c | 17 ++++++++++-------
>>>>> 2 files changed, 14 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>>>>> index 2e98f4a243e5..e604bb680bcf 100644
>>>>> --- a/net/sunrpc/xprt.c
>>>>> +++ b/net/sunrpc/xprt.c
>>>>> @@ -951,6 +951,7 @@ void xprt_transmit(struct rpc_task *task)
>>>>> /*
>>>>> * Add to the list only if we're expecting a reply
>>>>> */
>>>>> + spin_lock(&xprt->reserve_lock);
>>>>> spin_lock_bh(&xprt->transport_lock);
>>>>
>>>> This is painful. spin_lock_bh is a pre-emption point, where
>>>> the kernel can handle interrupts while others are spinning
>>>> waiting for both of these locks. It can result in waiting
>>>> for more than 100us to get the lock.
>>>>
>>>
>>> I thought, the problem isn't so much the spin_lock_bh() but rather the
>>> spin_unlock_bh(). If this is the outermost bh-safe lock, then
>>> spin_unlock_bh() may call do_softirq(), which can take a while, and
>>> therefore increase the chances of contention for any outer
>>> (non-bh-safe) locks.
>>>
>>> Note, however, that PATCH 2/3 significantly cuts down on the total
>>> amount of time spent in do_softirq(), and therefore explains most of
>>> the performance increase. We can drop patch 3/3 without losing much.
>>
>> My only quibble is the double locking in xprt_transmit().
>> Wondering if it is sensible to use llist for the receive
>> list.
>
> Possibly.
>
>> I actually need to drop locks between xprt_lookup_rqst()
>> and xprt_complete_rqst() for other reasons. That part of
>> this patch is desirable!
>>
>> In fact, I wasn’t aware that it was safe to drop the
>> lock after xprt_lookup_rqst without first having removed
>> the found request from the receive list.
>
> That's what I'm using the reserve_lock for. We still need something
> like it in order to pin the request on the receive list while we're
> doing the copy.
My feeble attempt at this was to add a separate step that removes
the request from the list once the caller is sure the request is
not going to get dropped. Then the transport_lock is dropped, and
later xprt_complete_rqst() finishes retiring the request via
rpc_wake_up_queued_task().
When split out this way, it is clear that xprt_complete_rqst
is the top contender for the transport_lock.
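A rough sketch of that split (illustrative only: the explicit remove step
and the copy helper named here are hypothetical, not code from this
series):

    spin_lock_bh(&xprt->transport_lock);
    req = xprt_lookup_rqst(xprt, xid);
    if (req)
            list_del_init(&req->rq_list);   /* pin: off the recv list before unlocking */
    spin_unlock_bh(&xprt->transport_lock);

    if (req) {
            copy_reply_data(req, desc);     /* hypothetical: long copy, no locks held */

            spin_lock_bh(&xprt->transport_lock);
            xprt_complete_rqst(req->rq_task, copied);  /* wakes the waiting task */
            spin_unlock_bh(&xprt->transport_lock);
    }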
> As the subject message says, though, this is an RFC series. I'm still
> playing with the code in order to figure out what is needed.
>
>>>> Was there an increase or decrease in CPU utilization with
>>>> this change?
>>>
>>> I'm seeing less overall contention.
>>>
>>>>> /* Update the softirq receive buffer */
>>>>> memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
>>>>> @@ -958,6 +959,7 @@ void xprt_transmit(struct rpc_task *task)
>>>>> /* Add request to the receive list */
>>>>> list_add_tail(&req->rq_list, &xprt->recv);
>>>>> spin_unlock_bh(&xprt->transport_lock);
>>>>> + spin_unlock(&xprt->reserve_lock);
>>>>> xprt_reset_majortimeo(req);
>>>>> /* Turn off autodisconnect */
>>>>> del_singleshot_timer_sync(&xprt->timer);
>>>>> @@ -1278,6 +1280,7 @@ void xprt_release(struct rpc_task *task)
>>>>> task->tk_ops->rpc_count_stats(task, task->tk_calldata);
>>>>> else if (task->tk_client)
>>>>> rpc_count_iostats(task, task->tk_client->cl_metrics);
>>>>> + spin_lock(&xprt->reserve_lock);
>>>>> spin_lock_bh(&xprt->transport_lock);
>>>>> xprt->ops->release_xprt(xprt, task);
>>>>> if (xprt->ops->release_request)
>>>>> @@ -1289,6 +1292,7 @@ void xprt_release(struct rpc_task *task)
>>>>> mod_timer(&xprt->timer,
>>>>> xprt->last_used + xprt->idle_timeout);
>>>>> spin_unlock_bh(&xprt->transport_lock);
>>>>> + spin_unlock(&xprt->reserve_lock);
>>>>> if (req->rq_buffer)
>>>>> xprt->ops->buf_free(req->rq_buffer);
>>>>> xprt_inject_disconnect(xprt);
>>>>> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
>>>>> index 58dc90ccebb6..063d2eb20d8e 100644
>>>>> --- a/net/sunrpc/xprtsock.c
>>>>> +++ b/net/sunrpc/xprtsock.c
>>>>> @@ -1246,21 +1246,24 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
>>>>> dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
>>>>>
>>>>> /* Find and lock the request corresponding to this xid */
>>>>> - spin_lock_bh(&xprt->transport_lock);
>>>>> + spin_lock(&xprt->reserve_lock);
>>>>> req = xprt_lookup_rqst(xprt, transport->tcp_xid);
>>>>> if (!req) {
>>>>> dprintk("RPC: XID %08x request not found!\n",
>>>>> ntohl(transport->tcp_xid));
>>>>> - spin_unlock_bh(&xprt->transport_lock);
>>>>> + spin_unlock(&xprt->reserve_lock);
>>>>> return -1;
>>>>> }
>>>>
>>>> Is a similar change needed for rpcrdma_reply_handler() ? If
>>>> that can’t be changed until it runs in a WQ context, I have
>>>> that patch right now. It can be ready for 4.4.
>>>>
>>>>
>>>>> xs_tcp_read_common(xprt, desc, req);
>>>>>
>>>>> - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
>>>>> + if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) {
>>>>> + spin_lock_bh(&xprt->transport_lock);
>>>>> xprt_complete_rqst(req->rq_task, transport->tcp_copied);
>>>>> + spin_unlock_bh(&xprt->transport_lock);
>>>>> + }
>>>>>
>>>>> - spin_unlock_bh(&xprt->transport_lock);
>>>>> + spin_unlock(&xprt->reserve_lock);
>>>>> return 0;
>>>>> }
>>>>
>>>> xprt_complete_rqst() is actually the top source of contention
>>>> on transport_lock in my tests, thanks to the locking under
>>>> rpc_wake_up_queued_task(). Is there a plan to mitigate locking
>>>> costs in the RPC scheduler?
>>>
>>> I'm not understanding how rpc_wake_up_queued_task() could cause a
>>> problem here. Can you explain?
>>
>> xprt_complete_rqst() is holding the transport_lock, and
>> rpc_wake_up_queued_task is holding the wait queue lock.
>>
>> During complex workloads like database or multi-threaded software
>> build, I’ve observed the rpc_make_runnable -> wake_up_bit path
>> take a very long time while both of these locks are held.
>>
>> The problem seems similar to the above: pre-emption allows the
>> kernel to perform housekeeping while RPC-related threads are
>> spinning on these locks.
>>
>> There is likewise a problem with xprt_lock_write_next, which
>> invokes rpc_wake_up_first while the transport_lock is held.
>
> I'm still not understanding. Apologies if I'm just being dense.
NP, we had a discussion about this at LSF this spring. There
wasn't agreement on exactly what’s going on.
> The call to do_softirq() shouldn't happen until after the last bh-safe
> lock is released, so I wouldn't normally expect it to be called until
> after the transport lock has been released in xprt_complete_rqst(). In
> the existing upstream code, because all of this is being called from a
> bh-safe context in xs_tcp_data_ready(), the do_softirq() will be
> deferred until at least then, and possibly even later, depending on
> where in the socket code the ->data_ready() callback originated from.
I’ve ftraced this (with RPC/RDMA, anyway), and the softIRQ pre-emption
appears to happen at spin_lock_bh time. I’d be very happy to be
incorrect about this, and it’s certainly possible I didn’t understand
what I was looking at.
Even if spin_unlock_bh is where pre-emption occurs, the order of taking
the locks would matter:
1. spin_lock A
2. spin_lock_bh B
3. do_work
4. spin_unlock_bh B -> do_softIRQ
5. spin_unlock A
The thread continues to hold lock A while handling a non-deterministic
amount of other work, forcing those waiting on A to spin. But I think
that’s what you were saying above.
I haven’t been able to nail down exactly why the wake_up_bit outliers
take so long. It is either softIRQ pre-emption or the latency of the
trip through the scheduler that wake_up_bit depends on.
--
Chuck Lever