Hi Anna-
This is the second round of three. This set could be more
controversial: in it, I adjust some code paths so that I can create
rpcrdma-specific alloc_slot and free_slot methods. These clean-ups
make the changes coming in the third round straightforward, and
hopefully add a small scalability boost as well.
---
Chuck Lever (9):
SUNRPC: Move xprt_update_rtt callsite
SUNRPC: Make RTT measurement more precise (Receive)
SUNRPC: Make RTT measurement more precise (Send)
SUNRPC: Make num_reqs a non-atomic integer
SUNRPC: Initialize rpc_rqst outside of xprt->reserve_lock
SUNRPC: Add a ->free_slot transport callout
xprtrdma: Introduce ->alloc_slot call-out for xprtrdma
xprtrdma: Make rpc_rqst part of rpcrdma_req
xprtrdma: Allocate rpcrdma_reps during Receive completion
include/linux/sunrpc/xprt.h | 9 ++-
net/sunrpc/clnt.c | 1
net/sunrpc/xprt.c | 51 +++++++++------
net/sunrpc/xprtrdma/backchannel.c | 94 ++++++++++------------------
net/sunrpc/xprtrdma/rpc_rdma.c | 8 ++
net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 1
net/sunrpc/xprtrdma/transport.c | 61 ++++++++++++++----
net/sunrpc/xprtrdma/verbs.c | 35 ++++++++--
net/sunrpc/xprtrdma/xprt_rdma.h | 13 +---
net/sunrpc/xprtsock.c | 8 ++
10 files changed, 166 insertions(+), 115 deletions(-)
--
Chuck Lever
Some RPC transports have more overhead in their send_request
callouts than others. For example, for RPC-over-RDMA:
- Marshaling an RPC often requires DMA-mapping the RPC arguments
- Registration methods perform memory registration as part of
marshaling
To capture just server and network latencies more precisely: when
sending a Call, capture the rq_xtime timestamp _after_ the transport
header has been marshaled.
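For illustration, a minimal sketch of where the timestamp lands now
(the example_* helpers are hypothetical stand-ins for the
per-transport code in the diff below):

static int example_send_request(struct rpc_task *task)
{
	struct rpc_rqst *req = task->tk_rqstp;
	int rc;

	/* Header marshaling, DMA mapping, and memory registration
	 * all happen before the clock starts, so that overhead is
	 * no longer charged to the measured RTT.
	 */
	rc = example_marshal_req(req);
	if (rc)
		return rc;

	req->rq_xtime = ktime_get();	/* RTT clock starts here */
	return example_post_send(req);
}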
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprt.c | 1 -
net/sunrpc/xprtrdma/transport.c | 1 +
net/sunrpc/xprtsock.c | 3 +++
3 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index cb7784c..73f05a1 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1033,7 +1033,6 @@ void xprt_transmit(struct rpc_task *task)
return;
connect_cookie = xprt->connect_cookie;
- req->rq_xtime = ktime_get();
status = xprt->ops->send_request(task);
trace_xprt_transmit(xprt, req->rq_xid, status);
if (status != 0) {
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 67e4386..cc1aad3 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -696,6 +696,7 @@
/* Must suppress retransmit to maintain credits */
if (rqst->rq_connect_cookie == xprt->connect_cookie)
goto drop_connection;
+ rqst->rq_xtime = ktime_get();
__set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 61df77f..e3c6a3d 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -527,6 +527,7 @@ static int xs_local_send_request(struct rpc_task *task)
xs_pktdump("packet data:",
req->rq_svec->iov_base, req->rq_svec->iov_len);
+ req->rq_xtime = ktime_get();
status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent,
true, &sent);
dprintk("RPC: %s(%u) = %d\n",
@@ -589,6 +590,7 @@ static int xs_udp_send_request(struct rpc_task *task)
if (!xprt_bound(xprt))
return -ENOTCONN;
+ req->rq_xtime = ktime_get();
status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen,
xdr, req->rq_bytes_sent, true, &sent);
@@ -678,6 +680,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
/* Continue transmitting the packet/record. We must be careful
* to cope with writespace callbacks arriving _after_ we have
* called sendmsg(). */
+ req->rq_xtime = ktime_get();
while (1) {
sent = 0;
status = xs_sendpages(transport->sock, NULL, 0, xdr,
Since commit 33849792cbcd ("xprtrdma: Detect unreachable NFS/RDMA
servers more reliably"), the xprtrdma transport has a ->timer
callout, so the generic code in xprt_complete_rqst now updates RTT
data for it as well. But xprtrdma has no use for RTT data; only the
UDP transport consumes it. Move the xprt_update_rtt call into the
UDP transport implementation.
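For background, the smoothing this feeds is the classic
Jacobson/Karels estimator. A rough rendering of the idea (the real
arithmetic lives in net/sunrpc/timer.c; this sketch is illustrative
only, with srtt stored scaled by 8 and sdrtt scaled by 4):

static void example_update_rtt(long *srtt, long *sdrtt, long m)
{
	m -= *srtt >> 3;	/* error of this sample vs. smoothed RTT */
	*srtt += m;		/* srtt moves 1/8 of the way to the sample */
	if (m < 0)
		m = -m;
	m -= *sdrtt >> 2;
	*sdrtt += m;		/* mean deviation moves 1/4 of the way */
}

UDP uses these estimates to choose retransmit timeouts; the other
transports never read them.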
Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/xprt.h | 1 +
net/sunrpc/xprt.c | 11 ++++++++---
net/sunrpc/xprtsock.c | 1 +
3 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 7fad838..ad322ce 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -373,6 +373,7 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *
void xprt_write_space(struct rpc_xprt *xprt);
void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result);
struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid);
+void xprt_update_rtt(struct rpc_task *task);
void xprt_complete_rqst(struct rpc_task *task, int copied);
void xprt_pin_rqst(struct rpc_rqst *req);
void xprt_unpin_rqst(struct rpc_rqst *req);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 8f0ad4f..13fbb48 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -889,7 +889,13 @@ static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req)
}
}
-static void xprt_update_rtt(struct rpc_task *task)
+/**
+ * xprt_update_rtt - Update RPC RTT statistics
+ * @task: RPC request that recently completed
+ *
+ * Caller holds xprt->recv_lock.
+ */
+void xprt_update_rtt(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_rtt *rtt = task->tk_client->cl_rtt;
@@ -902,6 +908,7 @@ static void xprt_update_rtt(struct rpc_task *task)
rpc_set_timeo(rtt, timer, req->rq_ntrans - 1);
}
}
+EXPORT_SYMBOL_GPL(xprt_update_rtt);
/**
* xprt_complete_rqst - called when reply processing is complete
@@ -921,8 +928,6 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
xprt->stat.recvs++;
req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime);
- if (xprt->ops->timer != NULL)
- xprt_update_rtt(task);
list_del_init(&req->rq_list);
req->rq_private_buf.len = copied;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index a6b8c1f..61df77f 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1060,6 +1060,7 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
if (!rovr)
goto out_unlock;
xprt_pin_rqst(rovr);
+ xprt_update_rtt(rovr->rq_task);
spin_unlock(&xprt->recv_lock);
task = rovr->rq_task;
Some RPC transports have more overhead in their reply handlers
than others. For example, for RPC-over-RDMA:
- RPC completion has to wait for memory invalidation, which is
not a part of the server/network round trip
- Recently a context switch was introduced into the reply handler,
which further inflates the measured RPC RTT
To capture just server and network latencies more precisely: when
receiving a reply, compute the RTT as soon as the XID is recognized,
rather than at RPC completion time.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprt.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 13fbb48..cb7784c 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -826,6 +826,7 @@ static void xprt_connect_status(struct rpc_task *task)
* @xprt: transport on which the original request was transmitted
* @xid: RPC XID of incoming reply
*
+ * Caller holds xprt->recv_lock.
*/
struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
{
@@ -834,6 +835,7 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
list_for_each_entry(entry, &xprt->recv, rq_list)
if (entry->rq_xid == xid) {
trace_xprt_lookup_rqst(xprt, xid, 0);
+ entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime);
return entry;
}
@@ -915,7 +917,7 @@ void xprt_update_rtt(struct rpc_task *task)
* @task: RPC request that recently completed
* @copied: actual number of bytes received from the transport
*
- * Caller holds transport lock.
+ * Caller holds xprt->recv_lock.
*/
void xprt_complete_rqst(struct rpc_task *task, int copied)
{
@@ -927,7 +929,6 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
trace_xprt_complete_rqst(xprt, req->rq_xid, copied);
xprt->stat.recvs++;
- req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime);
list_del_init(&req->rq_list);
req->rq_private_buf.len = copied;
If recording xprt->stat.max_slots is moved into xprt_alloc_slot,
then xprt->num_reqs is never manipulated outside
xprt->reserve_lock. There's no longer a need for xprt->num_reqs to
be atomic.
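The invariant is the usual one: a counter that is only ever read and
written under a single lock needs no atomics. A kernel-style
illustration (hypothetical types, not the xprt code itself):

struct slot_table {
	spinlock_t	lock;
	unsigned int	num_reqs;	/* protected by ->lock */
	unsigned int	max_reqs;
};

static bool slot_table_grow(struct slot_table *st)
{
	bool ok = false;

	spin_lock(&st->lock);
	if (st->num_reqs < st->max_reqs) {
		st->num_reqs++;		/* plain increment is safe here */
		ok = true;
	}
	spin_unlock(&st->lock);
	return ok;
}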
Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/xprt.h | 2 +-
net/sunrpc/xprt.c | 17 +++++++++--------
2 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index ad322ce..5fea0fb 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -197,7 +197,7 @@ struct rpc_xprt {
struct list_head free; /* free slots */
unsigned int max_reqs; /* max number of slots */
unsigned int min_reqs; /* min number of slots */
- atomic_t num_reqs; /* total slots */
+ unsigned int num_reqs; /* total slots */
unsigned long state; /* transport state */
unsigned char resvport : 1; /* use a reserved port */
atomic_t swapper; /* we're swapping over this
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 73f05a1..70f0050 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1009,7 +1009,7 @@ void xprt_transmit(struct rpc_task *task)
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
unsigned int connect_cookie;
- int status, numreqs;
+ int status;
dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen);
@@ -1047,9 +1047,6 @@ void xprt_transmit(struct rpc_task *task)
xprt->ops->set_retrans_timeout(task);
- numreqs = atomic_read(&xprt->num_reqs);
- if (numreqs > xprt->stat.max_slots)
- xprt->stat.max_slots = numreqs;
xprt->stat.sends++;
xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
xprt->stat.bklog_u += xprt->backlog.qlen;
@@ -1111,14 +1108,15 @@ static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt)
{
struct rpc_rqst *req = ERR_PTR(-EAGAIN);
- if (!atomic_add_unless(&xprt->num_reqs, 1, xprt->max_reqs))
+ if (xprt->num_reqs >= xprt->max_reqs)
goto out;
+ ++xprt->num_reqs;
spin_unlock(&xprt->reserve_lock);
req = kzalloc(sizeof(struct rpc_rqst), GFP_NOFS);
spin_lock(&xprt->reserve_lock);
if (req != NULL)
goto out;
- atomic_dec(&xprt->num_reqs);
+ --xprt->num_reqs;
req = ERR_PTR(-ENOMEM);
out:
return req;
@@ -1126,7 +1124,8 @@ static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt)
static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
- if (atomic_add_unless(&xprt->num_reqs, -1, xprt->min_reqs)) {
+ if (xprt->num_reqs > xprt->min_reqs) {
+ --xprt->num_reqs;
kfree(req);
return true;
}
@@ -1162,6 +1161,8 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
spin_unlock(&xprt->reserve_lock);
return;
out_init_req:
+ xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots,
+ xprt->num_reqs);
task->tk_status = 0;
task->tk_rqstp = req;
xprt_request_init(task, xprt);
@@ -1229,7 +1230,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size,
else
xprt->max_reqs = num_prealloc;
xprt->min_reqs = num_prealloc;
- atomic_set(&xprt->num_reqs, num_prealloc);
+ xprt->num_reqs = num_prealloc;
return xprt;
alloc_slot is a transport-specific op, but initializing an rpc_rqst
is common to all transports. Move initialization to common code in
preparation for adding a transport-specific alloc_slot to xprtrdma.
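The resulting division of labor, sketched (simplified; see the diff
below for the real code):

static void example_reserve_result(struct rpc_task *task, int status)
{
	if (status >= 0 && task->tk_rqstp) {
		/* ->alloc_slot handed out the rqst while holding
		 * xprt->reserve_lock; it is now owned by this task
		 * alone, so it can be initialized lock-free here.
		 */
		xprt_request_init(task);
		task->tk_action = call_refresh;
	}
}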
Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/xprt.h | 1 +
net/sunrpc/clnt.c | 1 +
net/sunrpc/xprt.c | 12 +++++++-----
3 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 5fea0fb..9784e28 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -324,6 +324,7 @@ struct xprt_class {
struct rpc_xprt *xprt_create_transport(struct xprt_create *args);
void xprt_connect(struct rpc_task *task);
void xprt_reserve(struct rpc_task *task);
+void xprt_request_init(struct rpc_task *task);
void xprt_retry_reserve(struct rpc_task *task);
int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task);
int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 6e432ec..226f558 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1546,6 +1546,7 @@ void rpc_force_rebind(struct rpc_clnt *clnt)
task->tk_status = 0;
if (status >= 0) {
if (task->tk_rqstp) {
+ xprt_request_init(task);
task->tk_action = call_refresh;
return;
}
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 70f0050..a394b46 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -66,7 +66,7 @@
* Local functions
*/
static void xprt_init(struct rpc_xprt *xprt, struct net *net);
-static void xprt_request_init(struct rpc_task *, struct rpc_xprt *);
+static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
static void xprt_connect_status(struct rpc_task *task);
static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *);
@@ -987,6 +987,8 @@ bool xprt_prepare_transmit(struct rpc_task *task)
task->tk_status = -EAGAIN;
goto out_unlock;
}
+ if (likely(!bc_prealloc(req)))
+ req->rq_xid = xprt_alloc_xid(xprt);
ret = true;
out_unlock:
spin_unlock_bh(&xprt->transport_lock);
@@ -1163,10 +1165,10 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
out_init_req:
xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots,
xprt->num_reqs);
+ spin_unlock(&xprt->reserve_lock);
+
task->tk_status = 0;
task->tk_rqstp = req;
- xprt_request_init(task, xprt);
- spin_unlock(&xprt->reserve_lock);
}
EXPORT_SYMBOL_GPL(xprt_alloc_slot);
@@ -1303,8 +1305,9 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
xprt->xid = prandom_u32();
}
-static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
+void xprt_request_init(struct rpc_task *task)
{
+ struct rpc_xprt *xprt = task->tk_xprt;
struct rpc_rqst *req = task->tk_rqstp;
INIT_LIST_HEAD(&req->rq_list);
@@ -1312,7 +1315,6 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
req->rq_task = task;
req->rq_xprt = xprt;
req->rq_buffer = NULL;
- req->rq_xid = xprt_alloc_xid(xprt);
req->rq_connect_cookie = xprt->connect_cookie - 1;
req->rq_bytes_sent = 0;
req->rq_snd_buf.len = 0;
Refactor: xprtrdma needs to have better control over when RPCs are
awoken from the backlog queue, so replace xprt_free_slot with a
transport op callout.
Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/xprt.h | 4 ++++
net/sunrpc/xprt.c | 5 +++--
net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 1 +
net/sunrpc/xprtrdma/transport.c | 1 +
net/sunrpc/xprtsock.c | 4 ++++
5 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 9784e28..706eef1 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -127,6 +127,8 @@ struct rpc_xprt_ops {
int (*reserve_xprt)(struct rpc_xprt *xprt, struct rpc_task *task);
void (*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task);
void (*alloc_slot)(struct rpc_xprt *xprt, struct rpc_task *task);
+ void (*free_slot)(struct rpc_xprt *xprt,
+ struct rpc_rqst *req);
void (*rpcbind)(struct rpc_task *task);
void (*set_port)(struct rpc_xprt *xprt, unsigned short port);
void (*connect)(struct rpc_xprt *xprt, struct rpc_task *task);
@@ -329,6 +331,8 @@ struct xprt_class {
int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task);
int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task);
void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task);
+void xprt_free_slot(struct rpc_xprt *xprt,
+ struct rpc_rqst *req);
void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task);
bool xprt_prepare_transmit(struct rpc_task *task);
void xprt_transmit(struct rpc_task *task);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index a394b46..fb3093d 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1186,7 +1186,7 @@ void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
}
EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot);
-static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
+void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
spin_lock(&xprt->reserve_lock);
if (!xprt_dynamic_free_slot(xprt, req)) {
@@ -1196,6 +1196,7 @@ static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
xprt_wake_up_backlog(xprt);
spin_unlock(&xprt->reserve_lock);
}
+EXPORT_SYMBOL_GPL(xprt_free_slot);
static void xprt_free_all_slots(struct rpc_xprt *xprt)
{
@@ -1375,7 +1376,7 @@ void xprt_release(struct rpc_task *task)
dprintk("RPC: %5u release request %p\n", task->tk_pid, req);
if (likely(!bc_prealloc(req)))
- xprt_free_slot(xprt, req);
+ xprt->ops->free_slot(xprt, req);
else
xprt_free_bc_request(req);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index a73632c..1035516 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -273,6 +273,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
.reserve_xprt = xprt_reserve_xprt_cong,
.release_xprt = xprt_release_xprt_cong,
.alloc_slot = xprt_alloc_slot,
+ .free_slot = xprt_free_slot,
.release_request = xprt_release_rqst_cong,
.buf_alloc = xprt_rdma_bc_allocate,
.buf_free = xprt_rdma_bc_free,
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index cc1aad3..40ff91d 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -780,6 +780,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
.reserve_xprt = xprt_reserve_xprt_cong,
.release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
.alloc_slot = xprt_alloc_slot,
+ .free_slot = xprt_free_slot,
.release_request = xprt_release_rqst_cong, /* ditto */
.set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */
.timer = xprt_rdma_timer,
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e3c6a3d..2511c21 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2764,6 +2764,7 @@ static void bc_destroy(struct rpc_xprt *xprt)
.reserve_xprt = xprt_reserve_xprt,
.release_xprt = xs_tcp_release_xprt,
.alloc_slot = xprt_alloc_slot,
+ .free_slot = xprt_free_slot,
.rpcbind = xs_local_rpcbind,
.set_port = xs_local_set_port,
.connect = xs_local_connect,
@@ -2783,6 +2784,7 @@ static void bc_destroy(struct rpc_xprt *xprt)
.reserve_xprt = xprt_reserve_xprt_cong,
.release_xprt = xprt_release_xprt_cong,
.alloc_slot = xprt_alloc_slot,
+ .free_slot = xprt_free_slot,
.rpcbind = rpcb_getport_async,
.set_port = xs_set_port,
.connect = xs_connect,
@@ -2804,6 +2806,7 @@ static void bc_destroy(struct rpc_xprt *xprt)
.reserve_xprt = xprt_reserve_xprt,
.release_xprt = xs_tcp_release_xprt,
.alloc_slot = xprt_lock_and_alloc_slot,
+ .free_slot = xprt_free_slot,
.rpcbind = rpcb_getport_async,
.set_port = xs_set_port,
.connect = xs_connect,
@@ -2835,6 +2838,7 @@ static void bc_destroy(struct rpc_xprt *xprt)
.reserve_xprt = xprt_reserve_xprt,
.release_xprt = xprt_release_xprt,
.alloc_slot = xprt_alloc_slot,
+ .free_slot = xprt_free_slot,
.buf_alloc = bc_malloc,
.buf_free = bc_free,
.send_request = bc_send_request,
rpcrdma_buffer_get acquires an rpcrdma_req and rep for each RPC.
Currently this is done in the call_allocate action, and sometimes it
can fail if there are many outstanding RPCs.
When call_allocate fails, the RPC task is put on the delayq. It is
awoken a few milliseconds later, but there's no guarantee it will
get a buffer at that time. The RPC task can be repeatedly put back
to sleep or even starved.
The call_allocate action should rarely fail. The delayq mechanism is
not meant to deal with transport congestion.
In the current sunrpc stack, there is a friendlier way to deal with
this situation. These objects are actually tantamount to an RPC
slot (rpc_rqst) and there is a separate FSM action, distinct from
call_allocate, for allocating slot resources. This is the
call_reserve action.
When allocation fails during this action, the RPC is placed on the
transport's backlog queue. The backlog mechanism provides a stronger
guarantee that when the RPC is awoken, a buffer will be available
for it; and backlogged RPCs are awoken one-at-a-time.
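For orientation, a condensed view of the relevant client FSM actions
(names from net/sunrpc/clnt.c; ordering simplified):

	call_reserve       -> xprt->ops->alloc_slot()
	                      on failure: sleep on xprt->backlog and
	                      be woken one-at-a-time as slots free up
	call_reserveresult -> xprt_request_init()
	call_allocate      -> xprt->ops->buf_alloc()
	                      on failure: brief sleep on the delayq,
	                      then retry, with no fairness guarantee
	...
	call_transmit      -> xprt->ops->send_request()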
To make slot resource allocation occur in the call_reserve action,
create special ->alloc_slot and ->free_slot call-outs for xprtrdma.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/transport.c | 52 ++++++++++++++++++++++++++++++++++++++-
1 file changed, 50 insertions(+), 2 deletions(-)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 40ff91d..1dac949 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -537,6 +537,54 @@
}
}
+/**
+ * xprt_rdma_alloc_slot - allocate an rpc_rqst
+ * @xprt: controlling RPC transport
+ * @task: RPC task requesting a fresh rpc_rqst
+ *
+ * tk_status values:
+ * %0 if task->tk_rqstp points to a fresh rpc_rqst
+ * %-EAGAIN if no rpc_rqst is available; queued on backlog
+ */
+static void
+xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ struct rpc_rqst *rqst;
+
+ spin_lock(&xprt->reserve_lock);
+ if (list_empty(&xprt->free))
+ goto out_sleep;
+ rqst = list_first_entry(&xprt->free, struct rpc_rqst, rq_list);
+ list_del(&rqst->rq_list);
+ spin_unlock(&xprt->reserve_lock);
+
+ task->tk_rqstp = rqst;
+ task->tk_status = 0;
+ return;
+
+out_sleep:
+ rpc_sleep_on(&xprt->backlog, task, NULL);
+ spin_unlock(&xprt->reserve_lock);
+ task->tk_status = -EAGAIN;
+}
+
+/**
+ * xprt_rdma_free_slot - release an rpc_rqst
+ * @xprt: controlling RPC transport
+ * @rqst: rpc_rqst to release
+ *
+ */
+static void
+xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst)
+{
+ memset(rqst, 0, sizeof(*rqst));
+
+ spin_lock(&xprt->reserve_lock);
+ list_add(&rqst->rq_list, &xprt->free);
+ rpc_wake_up_next(&xprt->backlog);
+ spin_unlock(&xprt->reserve_lock);
+}
+
static bool
rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
size_t size, gfp_t flags)
@@ -779,8 +827,8 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
static const struct rpc_xprt_ops xprt_rdma_procs = {
.reserve_xprt = xprt_reserve_xprt_cong,
.release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
- .alloc_slot = xprt_alloc_slot,
- .free_slot = xprt_free_slot,
+ .alloc_slot = xprt_rdma_alloc_slot,
+ .free_slot = xprt_rdma_free_slot,
.release_request = xprt_release_rqst_cong, /* ditto */
.set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */
.timer = xprt_rdma_timer,
This simplifies allocation of the generic RPC slot and the
xprtrdma-specific per-RPC resources.
It also makes xprtrdma more like the socket-based transports:
->buf_alloc and ->buf_free are now responsible only for send and
receive buffers.
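The pattern at work is the usual embedded-struct/container_of idiom.
A stand-alone illustration (demo types, not the real ones):

#include <stddef.h>

#define demo_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rpc_rqst_demo { int xid; };

struct rpcrdma_req_demo {
	struct rpc_rqst_demo rl_slot;	/* generic slot, embedded */
	int rl_flags;			/* transport-private state */
};

/* Given a pointer to the embedded generic slot, recover the
 * enclosing transport-specific request without a lookup table.
 */
static struct rpcrdma_req_demo *to_rdma_req(struct rpc_rqst_demo *rqst)
{
	return demo_container_of(rqst, struct rpcrdma_req_demo, rl_slot);
}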
Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/xprt.h | 1
net/sunrpc/xprtrdma/backchannel.c | 77 +++++++++++++++++--------------------
net/sunrpc/xprtrdma/transport.c | 35 ++++-------------
net/sunrpc/xprtrdma/xprt_rdma.h | 9 +---
4 files changed, 46 insertions(+), 76 deletions(-)
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 706eef1..336fd1a 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -84,7 +84,6 @@ struct rpc_rqst {
void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */
struct list_head rq_list;
- void *rq_xprtdata; /* Per-xprt private data */
void *rq_buffer; /* Call XDR encode buffer */
size_t rq_callsize;
void *rq_rbuffer; /* Reply XDR decode buffer */
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 47ebac9..4034788 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -29,29 +29,41 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
spin_unlock(&buf->rb_reqslock);
rpcrdma_destroy_req(req);
-
- kfree(rqst);
}
-static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
- struct rpc_rqst *rqst)
+static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt,
+ unsigned int count)
{
- struct rpcrdma_regbuf *rb;
- struct rpcrdma_req *req;
- size_t size;
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ struct rpc_rqst *rqst;
+ unsigned int i;
+
+ for (i = 0; i < (count << 1); i++) {
+ struct rpcrdma_regbuf *rb;
+ struct rpcrdma_req *req;
+ size_t size;
+
+ req = rpcrdma_create_req(r_xprt);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ rqst = &req->rl_slot;
+
+ rqst->rq_xprt = xprt;
+ INIT_LIST_HEAD(&rqst->rq_list);
+ INIT_LIST_HEAD(&rqst->rq_bc_list);
+ __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
+ spin_unlock_bh(&xprt->bc_pa_lock);
- req = rpcrdma_create_req(r_xprt);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- size = r_xprt->rx_data.inline_rsize;
- rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
- if (IS_ERR(rb))
- goto out_fail;
- req->rl_sendbuf = rb;
- xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base,
- min_t(size_t, size, PAGE_SIZE));
- rpcrdma_set_xprtdata(rqst, req);
+ size = r_xprt->rx_data.inline_rsize;
+ rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
+ if (IS_ERR(rb))
+ goto out_fail;
+ req->rl_sendbuf = rb;
+ xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base,
+ min_t(size_t, size, PAGE_SIZE));
+ }
return 0;
out_fail:
@@ -86,9 +98,6 @@ static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt,
int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
- struct rpc_rqst *rqst;
- unsigned int i;
int rc;
/* The backchannel reply path returns each rpc_rqst to the
@@ -103,25 +112,9 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
if (reqs > RPCRDMA_BACKWARD_WRS >> 1)
goto out_err;
- for (i = 0; i < (reqs << 1); i++) {
- rqst = kzalloc(sizeof(*rqst), GFP_KERNEL);
- if (!rqst)
- goto out_free;
-
- dprintk("RPC: %s: new rqst %p\n", __func__, rqst);
-
- rqst->rq_xprt = &r_xprt->rx_xprt;
- INIT_LIST_HEAD(&rqst->rq_list);
- INIT_LIST_HEAD(&rqst->rq_bc_list);
- __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
-
- if (rpcrdma_bc_setup_rqst(r_xprt, rqst))
- goto out_free;
-
- spin_lock_bh(&xprt->bc_pa_lock);
- list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
- spin_unlock_bh(&xprt->bc_pa_lock);
- }
+ rc = rpcrdma_bc_setup_reqs(r_xprt, reqs);
+ if (rc)
+ goto out_free;
rc = rpcrdma_bc_setup_reps(r_xprt, reqs);
if (rc)
@@ -131,7 +124,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
if (rc)
goto out_free;
- buffer->rb_bc_srv_max_requests = reqs;
+ r_xprt->rx_buf.rb_bc_srv_max_requests = reqs;
request_module("svcrdma");
trace_xprtrdma_cb_setup(r_xprt, reqs);
return 0;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 1dac949..e0ab2b2 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -330,9 +330,7 @@
return ERR_PTR(-EBADF);
}
- xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt),
- xprt_rdma_slot_table_entries,
- xprt_rdma_slot_table_entries);
+ xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, 0);
if (xprt == NULL) {
dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n",
__func__);
@@ -364,7 +362,7 @@
xprt_set_bound(xprt);
xprt_rdma_format_addresses(xprt, sap);
- cdata.max_requests = xprt->max_reqs;
+ cdata.max_requests = xprt_rdma_slot_table_entries;
cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
@@ -549,22 +547,18 @@
static void
xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
{
- struct rpc_rqst *rqst;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_req *req;
- spin_lock(&xprt->reserve_lock);
- if (list_empty(&xprt->free))
+ req = rpcrdma_buffer_get(&r_xprt->rx_buf);
+ if (!req)
goto out_sleep;
- rqst = list_first_entry(&xprt->free, struct rpc_rqst, rq_list);
- list_del(&rqst->rq_list);
- spin_unlock(&xprt->reserve_lock);
-
- task->tk_rqstp = rqst;
+ task->tk_rqstp = &req->rl_slot;
task->tk_status = 0;
return;
out_sleep:
rpc_sleep_on(&xprt->backlog, task, NULL);
- spin_unlock(&xprt->reserve_lock);
task->tk_status = -EAGAIN;
}
@@ -578,11 +572,8 @@
xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst)
{
memset(rqst, 0, sizeof(*rqst));
-
- spin_lock(&xprt->reserve_lock);
- list_add(&rqst->rq_list, &xprt->free);
+ rpcrdma_buffer_put(rpcr_to_rdmar(rqst));
rpc_wake_up_next(&xprt->backlog);
- spin_unlock(&xprt->reserve_lock);
}
static bool
@@ -655,13 +646,9 @@
{
struct rpc_rqst *rqst = task->tk_rqstp;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
- struct rpcrdma_req *req;
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
gfp_t flags;
- req = rpcrdma_buffer_get(&r_xprt->rx_buf);
- if (req == NULL)
- goto out_get;
-
flags = RPCRDMA_DEF_GFP;
if (RPC_IS_SWAPPER(task))
flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
@@ -671,15 +658,12 @@
if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
goto out_fail;
- rpcrdma_set_xprtdata(rqst, req);
rqst->rq_buffer = req->rl_sendbuf->rg_base;
rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
trace_xprtrdma_allocate(task, req);
return 0;
out_fail:
- rpcrdma_buffer_put(req);
-out_get:
trace_xprtrdma_allocate(task, NULL);
return -ENOMEM;
}
@@ -700,7 +684,6 @@
if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags))
rpcrdma_release_rqst(r_xprt, req);
trace_xprtrdma_rpc_done(task, req);
- rpcrdma_buffer_put(req);
}
/**
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 3d3b423..b35d80b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -334,6 +334,7 @@ enum {
struct rpcrdma_buffer;
struct rpcrdma_req {
struct list_head rl_list;
+ struct rpc_rqst rl_slot;
struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;
struct xdr_stream rl_stream;
@@ -356,16 +357,10 @@ enum {
RPCRDMA_REQ_F_TX_RESOURCES,
};
-static inline void
-rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req)
-{
- rqst->rq_xprtdata = req;
-}
-
static inline struct rpcrdma_req *
rpcr_to_rdmar(const struct rpc_rqst *rqst)
{
- return rqst->rq_xprtdata;
+ return container_of(rqst, struct rpcrdma_req, rl_slot);
}
static inline void
Receive completion for a CQ runs on one CPU core only. Ensure that
Receive buffers are allocated on the same CPU core where Receive
completions are handled. This guarantees that a transport's Receive
buffers are on the NUMA node that is local to the device no matter
where the transport was created.
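A sketch of the mechanism being relied on (simplified; the real call
is the rpcrdma_create_rep(r_xprt, false) added to the reply handler
below): an allocation with no explicit NUMA node is served from the
node of the CPU it runs on, and a CQ's completions are steered to
one CPU, so buffers allocated in the completion path land on the
device-local node.

static struct rpcrdma_rep *example_refill_rep(struct rpcrdma_xprt *r_xprt)
{
	/* Runs in Receive completion context: kzalloc() with no
	 * node argument prefers the local node of this CPU, which
	 * is the node nearest the RDMA device.
	 */
	return kzalloc(sizeof(struct rpcrdma_rep), GFP_KERNEL);
}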
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/backchannel.c | 21 ---------------------
net/sunrpc/xprtrdma/rpc_rdma.c | 8 ++++++++
net/sunrpc/xprtrdma/verbs.c | 35 ++++++++++++++++++++++++++---------
net/sunrpc/xprtrdma/xprt_rdma.h | 4 +++-
4 files changed, 37 insertions(+), 31 deletions(-)
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 4034788..6b21fb8 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -71,23 +71,6 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt,
return -ENOMEM;
}
-/* Allocate and add receive buffers to the rpcrdma_buffer's
- * existing list of rep's. These are released when the
- * transport is destroyed.
- */
-static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt,
- unsigned int count)
-{
- int rc = 0;
-
- while (count--) {
- rc = rpcrdma_create_rep(r_xprt);
- if (rc)
- break;
- }
- return rc;
-}
-
/**
* xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests
* @xprt: transport associated with these backchannel resources
@@ -116,10 +99,6 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
if (rc)
goto out_free;
- rc = rpcrdma_bc_setup_reps(r_xprt, reqs);
- if (rc)
- goto out_free;
-
rc = rpcrdma_ep_post_extra_recv(r_xprt, reqs);
if (rc)
goto out_free;
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e8adad3..d15aa27 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1331,8 +1331,16 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
u32 credits;
+ int total;
__be32 *p;
+ total = buf->rb_max_requests + (buf->rb_bc_srv_max_requests << 1);
+ total -= buf->rb_reps;
+ if (total > 0)
+ while (total--)
+ if (!rpcrdma_create_rep(r_xprt, false))
+ break;
+
if (rep->rr_hdrbuf.head[0].iov_len == 0)
goto out_badstatus;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 6a7a5a2..af74953 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1095,11 +1095,12 @@ struct rpcrdma_req *
/**
* rpcrdma_create_rep - Allocate an rpcrdma_rep object
* @r_xprt: controlling transport
+ * @temp: destroy rep upon release
*
* Returns 0 on success or a negative errno on failure.
*/
int
-rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
+rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp)
{
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
@@ -1127,9 +1128,11 @@ struct rpcrdma_req *
rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
rep->rr_recv_wr.num_sge = 1;
+ rep->rr_temp = temp;
spin_lock(&buf->rb_lock);
list_add(&rep->rr_list, &buf->rb_recv_bufs);
+ ++buf->rb_reps;
spin_unlock(&buf->rb_lock);
return 0;
@@ -1179,11 +1182,9 @@ struct rpcrdma_req *
}
INIT_LIST_HEAD(&buf->rb_recv_bufs);
- for (i = 0; i <= buf->rb_max_requests; i++) {
- rc = rpcrdma_create_rep(r_xprt);
- if (rc)
- goto out;
- }
+ rc = rpcrdma_create_rep(r_xprt, true);
+ if (rc)
+ goto out;
rc = rpcrdma_sendctxs_create(r_xprt);
if (rc)
@@ -1220,8 +1221,14 @@ struct rpcrdma_req *
static void
rpcrdma_destroy_rep(struct rpcrdma_rep *rep)
{
+ struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf;
+
rpcrdma_free_regbuf(rep->rr_rdmabuf);
kfree(rep);
+
+ spin_lock(&buf->rb_lock);
+ --buf->rb_reps;
+ spin_unlock(&buf->rb_lock);
}
void
@@ -1417,12 +1424,17 @@ struct rpcrdma_req *
spin_lock(&buffers->rb_lock);
buffers->rb_send_count--;
- list_add_tail(&req->rl_list, &buffers->rb_send_bufs);
+ list_add(&req->rl_list, &buffers->rb_send_bufs);
if (rep) {
buffers->rb_recv_count--;
- list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
+ if (!rep->rr_temp) {
+ list_add(&rep->rr_list, &buffers->rb_recv_bufs);
+ rep = NULL;
+ }
}
spin_unlock(&buffers->rb_lock);
+ if (rep)
+ rpcrdma_destroy_rep(rep);
}
/*
@@ -1450,8 +1462,13 @@ struct rpcrdma_req *
spin_lock(&buffers->rb_lock);
buffers->rb_recv_count--;
- list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
+ if (!rep->rr_temp) {
+ list_add(&rep->rr_list, &buffers->rb_recv_bufs);
+ rep = NULL;
+ }
spin_unlock(&buffers->rb_lock);
+ if (rep)
+ rpcrdma_destroy_rep(rep);
}
/**
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index b35d80b..5f069c7 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -196,6 +196,7 @@ struct rpcrdma_rep {
__be32 rr_proc;
int rr_wc_flags;
u32 rr_inv_rkey;
+ bool rr_temp;
struct rpcrdma_regbuf *rr_rdmabuf;
struct rpcrdma_xprt *rr_rxprt;
struct work_struct rr_work;
@@ -401,6 +402,7 @@ struct rpcrdma_buffer {
struct list_head rb_recv_bufs;
u32 rb_max_requests;
u32 rb_credits; /* most recent credit grant */
+ unsigned int rb_reps;
u32 rb_bc_srv_max_requests;
spinlock_t rb_reqslock; /* protect rb_allreqs */
@@ -563,7 +565,7 @@ int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
*/
struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
void rpcrdma_destroy_req(struct rpcrdma_req *);
-int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt);
+int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp);
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf);
> On Mar 6, 2018, at 5:02 PM, Anna Schumaker <[email protected]> wrote:
>
> Hi Chuck,
>
> I'm seeing a huge performance hit with this patch. I'm just running cthon over TCP, and it goes from finishing in 22 seconds to taking well over 5 minutes. I seem to only see this on the read and write tests, such as basic test5 taking a minute to finish:
>
> ./test5: read and write
> wrote 1048576 byte file 10 times in 60.35 seconds (173737 bytes/sec)
> read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
> ./test5 ok.
OK. This looks like write is impacted, but this test doesn't
actually perform any reads on the wire. Try iozone with -I,
maybe? That would show results for both read and write.
> I haven't dug into this too deeply, but my best guess is that maybe it's due to adding a call to xprt_request_init() in net/sunrpc/clnt.c:call_reserveresult()
It wasn't added there, it was moved from xprt_alloc_slot. So,
it's not new work per-RPC.
Any additional information would be appreciated!
> Thoughts?
> Anna
>
> On 03/05/2018 03:13 PM, Chuck Lever wrote:
>> alloc_slot is a transport-specific op, but initializing an rpc_rqst
>> is common to all transports. Move initialization to common code in
>> preparation for adding a transport-specific alloc_slot to xprtrdma.
>>
>> Signed-off-by: Chuck Lever <[email protected]>
>> ---
>> include/linux/sunrpc/xprt.h | 1 +
>> net/sunrpc/clnt.c | 1 +
>> net/sunrpc/xprt.c | 12 +++++++-----
>> 3 files changed, 9 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
>> index 5fea0fb..9784e28 100644
>> --- a/include/linux/sunrpc/xprt.h
>> +++ b/include/linux/sunrpc/xprt.h
>> @@ -324,6 +324,7 @@ struct xprt_class {
>> struct rpc_xprt *xprt_create_transport(struct xprt_create *args);
>> void xprt_connect(struct rpc_task *task);
>> void xprt_reserve(struct rpc_task *task);
>> +void xprt_request_init(struct rpc_task *task);
>> void xprt_retry_reserve(struct rpc_task *task);
>> int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task);
>> int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task);
>> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
>> index 6e432ec..226f558 100644
>> --- a/net/sunrpc/clnt.c
>> +++ b/net/sunrpc/clnt.c
>> @@ -1546,6 +1546,7 @@ void rpc_force_rebind(struct rpc_clnt *clnt)
>> task->tk_status = 0;
>> if (status >= 0) {
>> if (task->tk_rqstp) {
>> + xprt_request_init(task);
>> task->tk_action = call_refresh;
>> return;
>> }
>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>> index 70f0050..a394b46 100644
>> --- a/net/sunrpc/xprt.c
>> +++ b/net/sunrpc/xprt.c
>> @@ -66,7 +66,7 @@
>> * Local functions
>> */
>> static void xprt_init(struct rpc_xprt *xprt, struct net *net);
>> -static void xprt_request_init(struct rpc_task *, struct rpc_xprt *);
>> +static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
>> static void xprt_connect_status(struct rpc_task *task);
>> static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
>> static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *);
>> @@ -987,6 +987,8 @@ bool xprt_prepare_transmit(struct rpc_task *task)
>> task->tk_status = -EAGAIN;
>> goto out_unlock;
>> }
>> + if (likely(!bc_prealloc(req)))
>> + req->rq_xid = xprt_alloc_xid(xprt);
>> ret = true;
>> out_unlock:
>> spin_unlock_bh(&xprt->transport_lock);
>> @@ -1163,10 +1165,10 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
>> out_init_req:
>> xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots,
>> xprt->num_reqs);
>> + spin_unlock(&xprt->reserve_lock);
>> +
>> task->tk_status = 0;
>> task->tk_rqstp = req;
>> - xprt_request_init(task, xprt);
>> - spin_unlock(&xprt->reserve_lock);
>> }
>> EXPORT_SYMBOL_GPL(xprt_alloc_slot);
>>
>> @@ -1303,8 +1305,9 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
>> xprt->xid = prandom_u32();
>> }
>>
>> -static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
>> +void xprt_request_init(struct rpc_task *task)
>> {
>> + struct rpc_xprt *xprt = task->tk_xprt;
>> struct rpc_rqst *req = task->tk_rqstp;
>>
>> INIT_LIST_HEAD(&req->rq_list);
>> @@ -1312,7 +1315,6 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
>> req->rq_task = task;
>> req->rq_xprt = xprt;
>> req->rq_buffer = NULL;
>> - req->rq_xid = xprt_alloc_xid(xprt);
>> req->rq_connect_cookie = xprt->connect_cookie - 1;
>> req->rq_bytes_sent = 0;
>> req->rq_snd_buf.len = 0;
>>
--
Chuck Lever
Hi Chuck,
I'm seeing a huge performance hit with this patch. I'm just running cthon over TCP, and it goes from finishing in 22 seconds to taking well over 5 minutes. I seem to only see this on the read and write tests, such as basic test5 taking a minute to finish:
./test5: read and write
wrote 1048576 byte file 10 times in 60.35 seconds (173737 bytes/sec)
read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
./test5 ok.
I haven't dug into this too deeply, but my best guess is that maybe it's due to adding a call to xprt_request_init() in net/sunrpc/clnt.c:call_reserveresult()
Thoughts?
Anna
On 03/05/2018 03:13 PM, Chuck Lever wrote:
> alloc_slot is a transport-specific op, but initializing an rpc_rqst
> is common to all transports. Move initialization to common code in
> preparation for adding a transport-specific alloc_slot to xprtrdma.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> include/linux/sunrpc/xprt.h | 1 +
> net/sunrpc/clnt.c | 1 +
> net/sunrpc/xprt.c | 12 +++++++-----
> 3 files changed, 9 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
> index 5fea0fb..9784e28 100644
> --- a/include/linux/sunrpc/xprt.h
> +++ b/include/linux/sunrpc/xprt.h
> @@ -324,6 +324,7 @@ struct xprt_class {
> struct rpc_xprt *xprt_create_transport(struct xprt_create *args);
> void xprt_connect(struct rpc_task *task);
> void xprt_reserve(struct rpc_task *task);
> +void xprt_request_init(struct rpc_task *task);
> void xprt_retry_reserve(struct rpc_task *task);
> int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task);
> int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task);
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> index 6e432ec..226f558 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -1546,6 +1546,7 @@ void rpc_force_rebind(struct rpc_clnt *clnt)
> task->tk_status = 0;
> if (status >= 0) {
> if (task->tk_rqstp) {
> + xprt_request_init(task);
> task->tk_action = call_refresh;
> return;
> }
> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> index 70f0050..a394b46 100644
> --- a/net/sunrpc/xprt.c
> +++ b/net/sunrpc/xprt.c
> @@ -66,7 +66,7 @@
> * Local functions
> */
> static void xprt_init(struct rpc_xprt *xprt, struct net *net);
> -static void xprt_request_init(struct rpc_task *, struct rpc_xprt *);
> +static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
> static void xprt_connect_status(struct rpc_task *task);
> static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
> static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *);
> @@ -987,6 +987,8 @@ bool xprt_prepare_transmit(struct rpc_task *task)
> task->tk_status = -EAGAIN;
> goto out_unlock;
> }
> + if (likely(!bc_prealloc(req)))
> + req->rq_xid = xprt_alloc_xid(xprt);
> ret = true;
> out_unlock:
> spin_unlock_bh(&xprt->transport_lock);
> @@ -1163,10 +1165,10 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
> out_init_req:
> xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots,
> xprt->num_reqs);
> + spin_unlock(&xprt->reserve_lock);
> +
> task->tk_status = 0;
> task->tk_rqstp = req;
> - xprt_request_init(task, xprt);
> - spin_unlock(&xprt->reserve_lock);
> }
> EXPORT_SYMBOL_GPL(xprt_alloc_slot);
>
> @@ -1303,8 +1305,9 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
> xprt->xid = prandom_u32();
> }
>
> -static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
> +void xprt_request_init(struct rpc_task *task)
> {
> + struct rpc_xprt *xprt = task->tk_xprt;
> struct rpc_rqst *req = task->tk_rqstp;
>
> INIT_LIST_HEAD(&req->rq_list);
> @@ -1312,7 +1315,6 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
> req->rq_task = task;
> req->rq_xprt = xprt;
> req->rq_buffer = NULL;
> - req->rq_xid = xprt_alloc_xid(xprt);
> req->rq_connect_cookie = xprt->connect_cookie - 1;
> req->rq_bytes_sent = 0;
> req->rq_snd_buf.len = 0;
>
> On Mar 6, 2018, at 5:07 PM, Chuck Lever <[email protected]> wrote:
>
>
>
>> On Mar 6, 2018, at 5:02 PM, Anna Schumaker <[email protected]> wrote:
>>
>> Hi Chuck,
>>
>> I'm seeing a huge performance hit with this patch. I'm just running cthon over TCP, and it goes from finishing in 22 seconds to taking well over 5 minutes. I seem to only see this on the read and write tests, such as basic test5 taking a minute to finish:
>>
>> ./test5: read and write
>> wrote 1048576 byte file 10 times in 60.35 seconds (173737 bytes/sec)
>> read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
>> ./test5 ok.
>
> OK. This looks like write is impacted, but this test doesn't
> actually perform any reads on the wire. Try iozone with -I,
> maybe? That would show results for both read and write.

Hum.

Stock v4.16-rc4:

./test5: read and write
	wrote 1048576 byte file 10 times in 0.2 seconds (350811642 bytes/sec)
	read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
./test5 ok.

v4.16-rc4 with my full set of patches:

./test5: read and write
	wrote 1048576 byte file 10 times in 0.2 seconds (354236681 bytes/sec)
	read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
./test5 ok.

I don't see a regression here. Let me know how it goes!

>> I haven't dug into this too deeply, but my best guess is that maybe it's due to adding a call to xprt_request_init() in net/sunrpc/clnt.c:call_reserveresult()
>
> It wasn't added there, it was moved from xprt_alloc_slot. So,
> it's not new work per-RPC.
>
> Any additional information would be appreciated!
>
>
>> Thoughts?
>> Anna
>>
>> On 03/05/2018 03:13 PM, Chuck Lever wrote:
>>> alloc_slot is a transport-specific op, but initializing an rpc_rqst
>>> is common to all transports. Move initialization to common code in
>>> preparation for adding a transport-specific alloc_slot to xprtrdma.
>>>
>>> Signed-off-by: Chuck Lever <[email protected]>
>>> ---
>>> include/linux/sunrpc/xprt.h | 1 +
>>> net/sunrpc/clnt.c | 1 +
>>> net/sunrpc/xprt.c | 12 +++++++-----
>>> 3 files changed, 9 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
>>> index 5fea0fb..9784e28 100644
>>> --- a/include/linux/sunrpc/xprt.h
>>> +++ b/include/linux/sunrpc/xprt.h
>>> @@ -324,6 +324,7 @@ struct xprt_class {
>>> struct rpc_xprt *xprt_create_transport(struct xprt_create *args);
>>> void xprt_connect(struct rpc_task *task);
>>> void xprt_reserve(struct rpc_task *task);
>>> +void xprt_request_init(struct rpc_task *task);
>>> void xprt_retry_reserve(struct rpc_task *task);
>>> int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task);
>>> int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task);
>>> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
>>> index 6e432ec..226f558 100644
>>> --- a/net/sunrpc/clnt.c
>>> +++ b/net/sunrpc/clnt.c
>>> @@ -1546,6 +1546,7 @@ void rpc_force_rebind(struct rpc_clnt *clnt)
>>> task->tk_status = 0;
>>> if (status >= 0) {
>>> if (task->tk_rqstp) {
>>> + xprt_request_init(task);
>>> task->tk_action = call_refresh;
>>> return;
>>> }
>>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>>> index 70f0050..a394b46 100644
>>> --- a/net/sunrpc/xprt.c
>>> +++ b/net/sunrpc/xprt.c
>>> @@ -66,7 +66,7 @@
>>> * Local functions
>>> */
>>> static void xprt_init(struct rpc_xprt *xprt, struct net *net);
>>> -static void xprt_request_init(struct rpc_task *, struct rpc_xprt *);
>>> +static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
>>> static void xprt_connect_status(struct rpc_task *task);
>>> static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
>>> static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *);
>>> @@ -987,6 +987,8 @@ bool xprt_prepare_transmit(struct rpc_task *task)
>>> task->tk_status = -EAGAIN;
>>> goto out_unlock;
>>> }
>>> + if (likely(!bc_prealloc(req)))
>>> + req->rq_xid = xprt_alloc_xid(xprt);
>>> ret = true;
>>> out_unlock:
>>> spin_unlock_bh(&xprt->transport_lock);
>>> @@ -1163,10 +1165,10 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
>>> out_init_req:
>>> xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots,
>>> xprt->num_reqs);
>>> + spin_unlock(&xprt->reserve_lock);
>>> +
>>> task->tk_status = 0;
>>> task->tk_rqstp = req;
>>> - xprt_request_init(task, xprt);
>>> - spin_unlock(&xprt->reserve_lock);
>>> }
>>> EXPORT_SYMBOL_GPL(xprt_alloc_slot);
>>>
>>> @@ -1303,8 +1305,9 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
>>> xprt->xid = prandom_u32();
>>> }
>>>
>>> -static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
>>> +void xprt_request_init(struct rpc_task *task)
>>> {
>>> + struct rpc_xprt *xprt = task->tk_xprt;
>>> struct rpc_rqst *req = task->tk_rqstp;
>>>
>>> INIT_LIST_HEAD(&req->rq_list);
>>> @@ -1312,7 +1315,6 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
>>> req->rq_task = task;
>>> req->rq_xprt = xprt;
>>> req->rq_buffer = NULL;
>>> - req->rq_xid = xprt_alloc_xid(xprt);
>>> req->rq_connect_cookie = xprt->connect_cookie - 1;
>>> req->rq_bytes_sent = 0;
>>> req->rq_snd_buf.len = 0;
>>>
>
> --
> Chuck Lever
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

--
Chuck Lever
On 03/06/2018 05:30 PM, Chuck Lever wrote:
>
>
>> On Mar 6, 2018, at 5:07 PM, Chuck Lever <[email protected]> wrote:
>>
>>
>>
>>> On Mar 6, 2018, at 5:02 PM, Anna Schumaker <[email protected]> wrote:
>>>
>>> Hi Chuck,
>>>
>>> I'm seeing a huge performance hit with this patch. I'm just running cthon over TCP, and it goes from finishing in 22 seconds to taking well over 5 minutes. I seem to only see this on the read and write tests, such as basic test5 taking a minute to finish:
>>>
>>> ./test5: read and write
>>> wrote 1048576 byte file 10 times in 60.35 seconds (173737 bytes/sec)
>>> read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
>>> ./test5 ok.
>>
>> OK. This looks like write is impacted, but this test doesn't
>> actually perform any reads on the wire. Try iozone with -I,
>> maybe? That would show results for both read and write.
>
> Hum.
>
> Stock v4.16-rc4:
>
> ./test5: read and write
> wrote 1048576 byte file 10 times in 0.2 seconds (350811642 bytes/sec)
> read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
> ./test5 ok.
>
>
> v4.16-rc4 with my full set of patches:
>
> ./test5: read and write
> wrote 1048576 byte file 10 times in 0.2 seconds (354236681 bytes/sec)
> read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
> ./test5 ok.
>
> I don't see a regression here. Let me know how it goes!
I'm using rc4 too, so maybe it's something different in my setup? Making this change fixes the issue for me:
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index a394b4635f8e..273847f7e455 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -987,8 +987,6 @@ bool xprt_prepare_transmit(struct rpc_task *task)
task->tk_status = -EAGAIN;
goto out_unlock;
}
- if (likely(!bc_prealloc(req)))
- req->rq_xid = xprt_alloc_xid(xprt);
ret = true;
out_unlock:
spin_unlock_bh(&xprt->transport_lock);
@@ -1315,6 +1313,7 @@ void xprt_request_init(struct rpc_task *task)
req->rq_task = task;
req->rq_xprt = xprt;
req->rq_buffer = NULL;
+ req->rq_xid = xprt_alloc_xid(xprt);
req->rq_connect_cookie = xprt->connect_cookie - 1;
req->rq_bytes_sent = 0;
req->rq_snd_buf.len = 0;
Anna
>
>
>>> I haven't dug into this too deeply, but my best guess is that maybe it's due to adding a call to xprt_request_init() in net/sunrpc/clnt.c:call_reserveresult()
>>
>> It wasn't added there, it was moved from xprt_alloc_slot. So,
>> it's not new work per-RPC.
>>
>> Any additional information would be appreciated!
>>
>>
>>> Thoughts?
>>> Anna
>>>
>>> On 03/05/2018 03:13 PM, Chuck Lever wrote:
>>>> alloc_slot is a transport-specific op, but initializing an rpc_rqst
>>>> is common to all transports. Move initialization to common code in
>>>> preparation for adding a transport-specific alloc_slot to xprtrdma.
>>>>
>>>> Signed-off-by: Chuck Lever <[email protected]>
>>>> ---
>>>> include/linux/sunrpc/xprt.h | 1 +
>>>> net/sunrpc/clnt.c | 1 +
>>>> net/sunrpc/xprt.c | 12 +++++++-----
>>>> 3 files changed, 9 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
>>>> index 5fea0fb..9784e28 100644
>>>> --- a/include/linux/sunrpc/xprt.h
>>>> +++ b/include/linux/sunrpc/xprt.h
>>>> @@ -324,6 +324,7 @@ struct xprt_class {
>>>> struct rpc_xprt *xprt_create_transport(struct xprt_create *args);
>>>> void xprt_connect(struct rpc_task *task);
>>>> void xprt_reserve(struct rpc_task *task);
>>>> +void xprt_request_init(struct rpc_task *task);
>>>> void xprt_retry_reserve(struct rpc_task *task);
>>>> int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task);
>>>> int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task);
>>>> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
>>>> index 6e432ec..226f558 100644
>>>> --- a/net/sunrpc/clnt.c
>>>> +++ b/net/sunrpc/clnt.c
>>>> @@ -1546,6 +1546,7 @@ void rpc_force_rebind(struct rpc_clnt *clnt)
>>>> task->tk_status = 0;
>>>> if (status >= 0) {
>>>> if (task->tk_rqstp) {
>>>> + xprt_request_init(task);
>>>> task->tk_action = call_refresh;
>>>> return;
>>>> }
>>>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>>>> index 70f0050..a394b46 100644
>>>> --- a/net/sunrpc/xprt.c
>>>> +++ b/net/sunrpc/xprt.c
>>>> @@ -66,7 +66,7 @@
>>>> * Local functions
>>>> */
>>>> static void xprt_init(struct rpc_xprt *xprt, struct net *net);
>>>> -static void xprt_request_init(struct rpc_task *, struct rpc_xprt *);
>>>> +static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
>>>> static void xprt_connect_status(struct rpc_task *task);
>>>> static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
>>>> static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *);
>>>> @@ -987,6 +987,8 @@ bool xprt_prepare_transmit(struct rpc_task *task)
>>>> task->tk_status = -EAGAIN;
>>>> goto out_unlock;
>>>> }
>>>> + if (likely(!bc_prealloc(req)))
>>>> + req->rq_xid = xprt_alloc_xid(xprt);
>>>> ret = true;
>>>> out_unlock:
>>>> spin_unlock_bh(&xprt->transport_lock);
>>>> @@ -1163,10 +1165,10 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
>>>> out_init_req:
>>>> xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots,
>>>> xprt->num_reqs);
>>>> + spin_unlock(&xprt->reserve_lock);
>>>> +
>>>> task->tk_status = 0;
>>>> task->tk_rqstp = req;
>>>> - xprt_request_init(task, xprt);
>>>> - spin_unlock(&xprt->reserve_lock);
>>>> }
>>>> EXPORT_SYMBOL_GPL(xprt_alloc_slot);
>>>>
>>>> @@ -1303,8 +1305,9 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
>>>> xprt->xid = prandom_u32();
>>>> }
>>>>
>>>> -static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
>>>> +void xprt_request_init(struct rpc_task *task)
>>>> {
>>>> + struct rpc_xprt *xprt = task->tk_xprt;
>>>> struct rpc_rqst *req = task->tk_rqstp;
>>>>
>>>> INIT_LIST_HEAD(&req->rq_list);
>>>> @@ -1312,7 +1315,6 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
>>>> req->rq_task = task;
>>>> req->rq_xprt = xprt;
>>>> req->rq_buffer = NULL;
>>>> - req->rq_xid = xprt_alloc_xid(xprt);
>>>> req->rq_connect_cookie = xprt->connect_cookie - 1;
>>>> req->rq_bytes_sent = 0;
>>>> req->rq_snd_buf.len = 0;
>>>>
>>
>> --
>> Chuck Lever
>>
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
> --
> Chuck Lever
>
>
>
> On Mar 7, 2018, at 3:00 PM, Anna Schumaker <[email protected]> wrote:
>
>
>
> On 03/06/2018 05:30 PM, Chuck Lever wrote:
>>
>>
>>> On Mar 6, 2018, at 5:07 PM, Chuck Lever <[email protected]> wrote:
>>>
>>>
>>>
>>>> On Mar 6, 2018, at 5:02 PM, Anna Schumaker <[email protected]> wrote:
>>>>
>>>> Hi Chuck,
>>>>
>>>> I'm seeing a huge performance hit with this patch. I'm just running cthon over TCP, and it goes from finishing in 22 seconds to taking well over 5 minutes. I seem to only see this on the read and write tests, such as basic test5 taking a minute to finish:
>>>>
>>>> ./test5: read and write
>>>> wrote 1048576 byte file 10 times in 60.35 seconds (173737 bytes/sec)
>>>> read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
>>>> ./test5 ok.
>>>
>>> OK. This looks like write is impacted, but this test doesn't
>>> actually perform any reads on the wire. Try iozone with -I,
>>> maybe? That would show results for both read and write.
>>
>> Hum.
>>
>> Stock v4.16-rc4:
>>
>> ./test5: read and write
>> wrote 1048576 byte file 10 times in 0.2 seconds (350811642 bytes/sec)
>> read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
>> ./test5 ok.
>>
>>
>> v4.16-rc4 with my full set of patches:
>>
>> ./test5: read and write
>> wrote 1048576 byte file 10 times in 0.2 seconds (354236681 bytes/sec)
>> read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
>> ./test5 ok.
>>
>> I don't see a regression here. Let me know how it goes!
>
> I'm using rc4 too, so maybe it's something different in my setup?
What is your setup, exactly? I assume your client is maybe a
single CPU guest, and the server is the same, and both are
hosted on one system?
> Making this change fixes the issue for me:
>=20
> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> index a394b4635f8e..273847f7e455 100644
> --- a/net/sunrpc/xprt.c
> +++ b/net/sunrpc/xprt.c
> @@ -987,8 +987,6 @@ bool xprt_prepare_transmit(struct rpc_task *task)
> task->tk_status =3D -EAGAIN;
> goto out_unlock;
> }
> - if (likely(!bc_prealloc(req)))
> - req->rq_xid =3D xprt_alloc_xid(xprt);
> ret =3D true;
> out_unlock:
> spin_unlock_bh(&xprt->transport_lock);
> @@ -1315,6 +1313,7 @@ void xprt_request_init(struct rpc_task *task)
> req->rq_task =3D task;
> req->rq_xprt =3D xprt;
> req->rq_buffer =3D NULL;
> + req->rq_xid =3D xprt_alloc_xid(xprt);
xprt_alloc_xid is just=20
1299 static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt)
1300 {
1301 return (__force __be32)xprt->xid++;
1302 }
I don't believe the new call site for xprt_request_init is
adequately serialized for this to be safe in general. That's why
I'm calling xprt_alloc_xid in xprt_prepare_transmit, behind the
transport_lock.
However, I think we need to explain why that helps your performance
issue, because it doesn't make sense to me that this would make a
difference. Why did you think to try this change? Is there evidence
on the wire of XID re-use, for example?
> req->rq_connect_cookie =3D xprt->connect_cookie - 1;
> req->rq_bytes_sent =3D 0;
> req->rq_snd_buf.len =3D 0;
>=20
>=20
> Anna
>=20
>>=20
>>=20
>>>> I haven't dug into this too deeply, but my best guess is that maybe =
it's due to adding a call to xprt_request_init() in =
net/sunrpc/clnt.c:call_reserveresult()
>>>=20
>>> It wasn't added there, it was moved from xprt_alloc_slot. So,
>>> it's not new work per-RPC.
>>>=20
>>> Any additional information would be appreciated!
>>>=20
>>>=20
>>>> Thoughts?
>>>> Anna
>>>>=20
>>>> On 03/05/2018 03:13 PM, Chuck Lever wrote:
>>>>> alloc_slot is a transport-specific op, but initializing an =
rpc_rqst
>>>>> is common to all transports. Move initialization to common code in
>>>>> preparation for adding a transport-specific alloc_slot to =
xprtrdma.
>>>>>=20
>>>>> Signed-off-by: Chuck Lever <[email protected]>
>>>>> ---
>>>>> include/linux/sunrpc/xprt.h | 1 +
>>>>> net/sunrpc/clnt.c | 1 +
>>>>> net/sunrpc/xprt.c | 12 +++++++-----
>>>>> 3 files changed, 9 insertions(+), 5 deletions(-)
>>>>>=20
>>>>> diff --git a/include/linux/sunrpc/xprt.h =
b/include/linux/sunrpc/xprt.h
>>>>> index 5fea0fb..9784e28 100644
>>>>> --- a/include/linux/sunrpc/xprt.h
>>>>> +++ b/include/linux/sunrpc/xprt.h
>>>>> @@ -324,6 +324,7 @@ struct xprt_class {
>>>>> struct rpc_xprt *xprt_create_transport(struct =
xprt_create *args);
>>>>> void xprt_connect(struct rpc_task *task);
>>>>> void xprt_reserve(struct rpc_task *task);
>>>>> +void xprt_request_init(struct rpc_task =
*task);
>>>>> void xprt_retry_reserve(struct rpc_task =
*task);
>>>>> int xprt_reserve_xprt(struct rpc_xprt *xprt, =
struct rpc_task *task);
>>>>> int xprt_reserve_xprt_cong(struct rpc_xprt =
*xprt, struct rpc_task *task);
>>>>> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
>>>>> index 6e432ec..226f558 100644
>>>>> --- a/net/sunrpc/clnt.c
>>>>> +++ b/net/sunrpc/clnt.c
>>>>> @@ -1546,6 +1546,7 @@ void rpc_force_rebind(struct rpc_clnt *clnt)
>>>>> task->tk_status =3D 0;
>>>>> if (status >=3D 0) {
>>>>> if (task->tk_rqstp) {
>>>>> + xprt_request_init(task);
>>>>> task->tk_action =3D call_refresh;
>>>>> return;
>>>>> }
>>>>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>>>>> index 70f0050..a394b46 100644
>>>>> --- a/net/sunrpc/xprt.c
>>>>> +++ b/net/sunrpc/xprt.c
>>>>> @@ -66,7 +66,7 @@
>>>>> * Local functions
>>>>> */
>>>>> static void xprt_init(struct rpc_xprt *xprt, struct net =
*net);
>>>>> -static void xprt_request_init(struct rpc_task *, struct =
rpc_xprt *);
>>>>> +static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
>>>>> static void xprt_connect_status(struct rpc_task *task);
>>>>> static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task =
*);
>>>>> static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst =
*);
>>>>> @@ -987,6 +987,8 @@ bool xprt_prepare_transmit(struct rpc_task =
*task)
>>>>> task->tk_status =3D -EAGAIN;
>>>>> goto out_unlock;
>>>>> }
>>>>> + if (likely(!bc_prealloc(req)))
>>>>> + req->rq_xid =3D xprt_alloc_xid(xprt);
>>>>> ret =3D true;
>>>>> out_unlock:
>>>>> spin_unlock_bh(&xprt->transport_lock);
>>>>> @@ -1163,10 +1165,10 @@ void xprt_alloc_slot(struct rpc_xprt =
*xprt, struct rpc_task *task)
>>>>> out_init_req:
>>>>> xprt->stat.max_slots =3D max_t(unsigned int, =
xprt->stat.max_slots,
>>>>> xprt->num_reqs);
>>>>> + spin_unlock(&xprt->reserve_lock);
>>>>> +
>>>>> task->tk_status =3D 0;
>>>>> task->tk_rqstp =3D req;
>>>>> - xprt_request_init(task, xprt);
>>>>> - spin_unlock(&xprt->reserve_lock);
>>>>> }
>>>>> EXPORT_SYMBOL_GPL(xprt_alloc_slot);
>>>>>=20
>>>>> @@ -1303,8 +1305,9 @@ static inline void xprt_init_xid(struct =
rpc_xprt *xprt)
>>>>> xprt->xid =3D prandom_u32();
>>>>> }
>>>>>=20
>>>>> -static void xprt_request_init(struct rpc_task *task, struct =
rpc_xprt *xprt)
>>>>> +void xprt_request_init(struct rpc_task *task)
>>>>> {
>>>>> + struct rpc_xprt *xprt =3D task->tk_xprt;
>>>>> struct rpc_rqst *req =3D task->tk_rqstp;
>>>>>=20
>>>>> INIT_LIST_HEAD(&req->rq_list);
>>>>> @@ -1312,7 +1315,6 @@ static void xprt_request_init(struct =
rpc_task *task, struct rpc_xprt *xprt)
>>>>> req->rq_task =3D task;
>>>>> req->rq_xprt =3D xprt;
>>>>> req->rq_buffer =3D NULL;
>>>>> - req->rq_xid =3D xprt_alloc_xid(xprt);
>>>>> req->rq_connect_cookie =3D xprt->connect_cookie - 1;
>>>>> req->rq_bytes_sent =3D 0;
>>>>> req->rq_snd_buf.len =3D 0;
>>>>>=20
>>>=20
>>> --
>>> Chuck Lever
>>>=20
>>>=20
>>>=20
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe =
linux-rdma" in
>>> the body of a message to [email protected]
>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>=20
>> --
>> Chuck Lever
--
Chuck Lever
On 03/07/2018 03:23 PM, Chuck Lever wrote:
>
>
>> On Mar 7, 2018, at 3:00 PM, Anna Schumaker <[email protected]> wrote:
>>
>>
>>
>> On 03/06/2018 05:30 PM, Chuck Lever wrote:
>>>
>>>
>>>> On Mar 6, 2018, at 5:07 PM, Chuck Lever <[email protected]> wrote:
>>>>
>>>>
>>>>
>>>>> On Mar 6, 2018, at 5:02 PM, Anna Schumaker <[email protected]> wrote:
>>>>>
>>>>> Hi Chuck,
>>>>>
>>>>> I'm seeing a huge performance hit with this patch. I'm just running cthon over TCP, and it goes from finishing in 22 seconds to taking well over 5 minutes. I seem to only see this on the read and write tests, such as basic test5 taking a minute to finish:
>>>>>
>>>>> ./test5: read and write
>>>>> wrote 1048576 byte file 10 times in 60.35 seconds (173737 bytes/sec)
>>>>> read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
>>>>> ./test5 ok.
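
The negative read rate above is a reporting artifact, not a measurement: the reads are served from the client's page cache, so the elapsed time rounds to 0.0 seconds. Assuming the test computes bytes/elapsed in floating point and casts the result to a 32-bit int (a guess about cthon's internals, not confirmed from its source), the out-of-range conversion yields INT_MIN on x86. A sketch of that arithmetic, with invented variable names:

#include <stdio.h>

int main(void)
{
	double bytes = 10.0 * 1048576;	/* ten 1MB reads, all from cache */
	double elapsed = 0.0;		/* timer granularity rounds to zero */

	/* bytes / 0.0 is +inf; converting an out-of-range double to int
	 * is undefined in C, and on x86 the conversion yields INT_MIN. */
	int rate = (int)(bytes / elapsed);

	printf("%d bytes/sec\n", rate);	/* prints -2147483648 */
	return 0;
}
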
>>>>
>>>> OK. This looks like write is impacted, but this test doesn't
>>>> actually perform any reads on the wire. Try iozone with -I,
>>>> maybe? That would show results for both read and write.
>>>
>>> Hum.
>>>
>>> Stock v4.16-rc4:
>>>
>>> ./test5: read and write
>>> wrote 1048576 byte file 10 times in 0.2 seconds (350811642 bytes/sec)
>>> read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
>>> ./test5 ok.
>>>
>>>
>>> v4.16-rc4 with my full set of patches:
>>>
>>> ./test5: read and write
>>> wrote 1048576 byte file 10 times in 0.2 seconds (354236681 bytes/sec)
>>> read 1048576 byte file 10 times in 0.0 seconds (-2147483648 bytes/sec)
>>> ./test5 ok.
>>>
>>> I don't see a regression here. Let me know how it goes!
>>
>> I'm using rc4 too, so maybe it's something different in my setup?
>
> What is your setup, exactly? I assume your client is maybe a
> single CPU guest, and the server is the same, and both are
> hosted on one system?
The client is a single-CPU kvm guest with 1 GB of RAM; the server is also a kvm guest on the same system, with 2 CPUs and 4 GB of RAM.
>
>
>> Making this change fixes the issue for me:
>>
>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>> index a394b4635f8e..273847f7e455 100644
>> --- a/net/sunrpc/xprt.c
>> +++ b/net/sunrpc/xprt.c
>> @@ -987,8 +987,6 @@ bool xprt_prepare_transmit(struct rpc_task *task)
>> task->tk_status = -EAGAIN;
>> goto out_unlock;
>> }
>> - if (likely(!bc_prealloc(req)))
>> - req->rq_xid = xprt_alloc_xid(xprt);
>> ret = true;
>> out_unlock:
>> spin_unlock_bh(&xprt->transport_lock);
>> @@ -1315,6 +1313,7 @@ void xprt_request_init(struct rpc_task *task)
>> req->rq_task = task;
>> req->rq_xprt = xprt;
>> req->rq_buffer = NULL;
>> + req->rq_xid = xprt_alloc_xid(xprt);
>
> xprt_alloc_xid is just
>
> static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt)
> {
> 	return (__force __be32)xprt->xid++;
> }
>
> I don't believe the new call site for xprt_request_init is
> adequately serialized for this to be safe in general. That's why
> I'm calling xprt_alloc_xid in xprt_prepare_transmit, behind the
> transport_lock.
This makes sense.
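
The race in question is a plain unserialized read-modify-write on xprt->xid. A minimal user-space sketch of it, illustrative only -- the names are invented and this is not the kernel code:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t xid;			/* stands in for xprt->xid */

/* Each "task" draws an XID with an unserialized read-modify-write,
 * as xprt->xid++ would be if called outside the transport_lock. */
static void *draw_xid(void *result)
{
	*(uint32_t *)result = xid++;
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;
	uint32_t a, b;

	pthread_create(&t1, NULL, draw_xid, &a);
	pthread_create(&t2, NULL, draw_xid, &b);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);

	/* Without a lock, both threads can read the same counter value
	 * and hand two outstanding RPCs the same XID. */
	printf("xid1=%u xid2=%u%s\n", a, b,
	       a == b ? " (collision)" : "");
	return 0;
}

Build with cc -pthread; a collision is rare in just two draws, but the window is real, which is why the patch takes the XID in xprt_prepare_transmit, under xprt->transport_lock.
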
>
> However, I think we need to explain why that helps your performance
> issue, because it doesn't make sense to me that this would make a
> difference. Why did you think to try this change? Is there evidence
> on the wire of XID re-use, for example?
I selectively reverted parts of your original patch until I found the parts that kill my performance.
>
>
>> req->rq_connect_cookie = xprt->connect_cookie - 1;
>> req->rq_bytes_sent = 0;
>> req->rq_snd_buf.len = 0;
>>
>>
>> Anna
>
> --
> Chuck Lever

> On Mar 7, 2018, at 3:32 PM, Anna Schumaker <[email protected]> wrote:
>
>>
>> However, I think we need to explain why that helps your performance
>> issue, because it doesn't make sense to me that this would make a
>> difference. Why did you think to try this change? Is there evidence
>> on the wire of XID re-use, for example?
>
> I selectively reverted parts of your original patch until I found the parts that kill my performance.
Fair enough, but that doesn't explain why your change helps. :-)
Since I can't reproduce the regression here, try this (a consolidated transcript follows the steps):
0. Build a kernel with the regression
1. On your client: # trace-cmd record -e sunrpc:* -e rpcrdma:*
2. Run the cthon04 basic tests
3. ^C the trace-cmd
4. Rename trace.dat
5. Repeat steps 1-4 with stock v4.16-rc4
6. tar and gzip the .dat files and send them to me
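
The same procedure as a shell transcript; the trace-cmd invocation is the one from step 1, and the .dat file names are illustrative:

# Step 1: start capturing (leave running in its own shell)
trace-cmd record -e sunrpc:* -e rpcrdma:*

# Steps 2-3: run the cthon04 basic tests, then ^C the trace-cmd above

# Step 4: set the capture aside
mv trace.dat trace-regression.dat

# Step 5: boot stock v4.16-rc4, repeat steps 1-4, then:
mv trace.dat trace-stock.dat

# Step 6: package both captures
tar czf traces.tar.gz trace-regression.dat trace-stock.dat
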
--
Chuck Lever