Move the code in svc_sock_enqueue() that checks for available
write space on the socket into a new sko_has_wspace method
in svc_sock_ops.
Signed-off-by: Greg Banks <[email protected]>
Signed-off-by: Peter Leckie <[email protected]>
---
 include/linux/sunrpc/svcsock.h |    4 +
 net/sunrpc/svcsock.c           |   74 +++++++++++++++++-------------
 2 files changed, 48 insertions(+), 30 deletions(-)
Index: linux/net/sunrpc/svcsock.c
===================================================================
--- linux.orig/net/sunrpc/svcsock.c 2007-05-17 00:36:20.381217499 +1000
+++ linux/net/sunrpc/svcsock.c 2007-05-17 00:49:50.820303639 +1000
@@ -204,22 +204,6 @@ svc_release_skb(struct svc_rqst *rqstp)
}
/*
- * Any space to write?
- */
-static inline unsigned long
-svc_sock_wspace(struct svc_sock *svsk)
-{
- int wspace;
-
- if (svsk->sk_sock->type == SOCK_STREAM)
- wspace = sk_stream_wspace(svsk->sk_sk);
- else
- wspace = sock_wspace(svsk->sk_sk);
-
- return wspace;
-}
-
-/*
* Queue up a socket with data pending. If there are idle nfsd
* processes, wake 'em up.
*
@@ -268,21 +252,15 @@ svc_sock_enqueue(struct svc_sock *svsk)
BUG_ON(svsk->sk_pool != NULL);
svsk->sk_pool = pool;
- set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
- if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2
- > svc_sock_wspace(svsk))
+ if (svsk->sk_ops->sko_has_wspace
&& !test_bit(SK_CLOSE, &svsk->sk_flags)
&& !test_bit(SK_CONN, &svsk->sk_flags)) {
- /* Don't enqueue while not enough space for reply */
- dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n",
- svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg,
- svc_sock_wspace(svsk));
- svsk->sk_pool = NULL;
- clear_bit(SK_BUSY, &svsk->sk_flags);
- goto out_unlock;
+ if (!svsk->sk_ops->sko_has_wspace(svsk)) {
+ svsk->sk_pool = NULL;
+ clear_bit(SK_BUSY, &svsk->sk_flags);
+ goto out_unlock;
+ }
}
- clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
-
if (!list_empty(&pool->sp_threads)) {
rqstp = list_entry(pool->sp_threads.next,
@@ -904,13 +882,42 @@ svc_tcpip_prepare_reply(struct svc_rqst
return 0;
}
+/**
+ * svc_sock_has_write_space - Checks if there is enough space
+ * to send the reply on the socket.
+ * @svsk: the svc_sock to write on
+ * @wspace: the number of bytes available for writing
+ */
+static int svc_sock_has_write_space(struct svc_sock *svsk, int wspace)
+{
+ struct svc_serv *serv = svsk->sk_server;
+ int required = atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg;
+
+ set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+ if (required*2 > wspace) {
+ /* Don't enqueue while not enough space for reply */
+ dprintk("svc: socket %p no space, %d*2 > %d, not enqueued\n",
+ svsk->sk_sk, required, wspace);
+ return 0;
+ }
+ clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+ return 1;
+}
+
+static int
+svc_udp_has_wspace(struct svc_sock *svsk)
+{
+ return svc_sock_has_write_space(svsk, sock_wspace(svsk->sk_sk));
+}
+
static const struct svc_sock_ops svc_udp_ops = {
.sko_name = "udp",
.sko_recvfrom = svc_udp_recvfrom,
.sko_sendto = svc_udp_sendto,
.sko_detach = svc_tcpip_detach,
.sko_free = svc_tcpip_free,
- .sko_prepare_reply = svc_tcpip_prepare_reply
+ .sko_prepare_reply = svc_tcpip_prepare_reply,
+ .sko_has_wspace = svc_udp_has_wspace
};
static void
@@ -1365,13 +1372,20 @@ svc_tcp_prepare_reply(struct svc_rqst *r
return 0;
}
+static int
+svc_tcp_has_wspace(struct svc_sock *svsk)
+{
+ return svc_sock_has_write_space(svsk, sk_stream_wspace(svsk->sk_sk));
+}
+
static const struct svc_sock_ops svc_tcp_ops = {
.sko_name = "tcp",
.sko_recvfrom = svc_tcp_recvfrom,
.sko_sendto = svc_tcp_sendto,
.sko_detach = svc_tcpip_detach,
.sko_free = svc_tcpip_free,
- .sko_prepare_reply = svc_tcp_prepare_reply
+ .sko_prepare_reply = svc_tcp_prepare_reply,
+ .sko_has_wspace = svc_tcp_has_wspace
};
static void
Index: linux/include/linux/sunrpc/svcsock.h
===================================================================
--- linux.orig/include/linux/sunrpc/svcsock.h 2007-05-17 00:36:20.553194321 +1000
+++ linux/include/linux/sunrpc/svcsock.h 2007-05-17 00:46:47.944827892 +1000
@@ -33,6 +33,10 @@ struct svc_sock_ops {
* fail, e.g. due to memory allocation.
*/
int (*sko_prepare_reply)(struct svc_rqst *);
+ /*
+ * Return 1 if sufficient space to write reply to network.
+ */
+ int (*sko_has_wspace)(struct svc_sock *);
};
/*
--
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
Apparently, I'm Bedevere. Which MPHG character are you?
I don't speak for SGI.
On Fri, May 18, 2007 at 08:33:54AM -0500, Tom Tucker wrote:
> On Fri, 2007-05-18 at 14:05 +1000, Greg Banks wrote:
> > On Thu, May 17, 2007 at 08:30:39PM +1000, Neil Brown wrote:
> > > Does that mean that RDMA will never
> > > reject a write due to lack of space?
> >
> > No, it means that the current RDMA send code will block waiting
> > for space to become available. That's right, nfsd threads block on
> > the network. Steel yourself, there's worse to come.
> >
>
> Uh... Not really. The queue depths are designed to match credits to
> worst case reply sizes. In the normal case, it should never have to
> wait. The wait is to catch the margins in the same way that a kmalloc
> will wait for memory to become available.
This is news to me, but then I just read that code, not write it ;-)
I just poked around in your GIT tree and AFAICT the software queue
depth limits are set by this logic:
743 static int
744 svc_rdma_accept(struct svc_rqst *rqstp)
745 {
...
775 ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
...
788 newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
789 (size_t)svcrdma_max_requests);
790 newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
54 unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
143 #define RPCRDMA_SQ_DEPTH_MULT 8
145 #define RPCRDMA_MAX_REQUESTS 16
A brief experiment shows that devattr.max_qp_wr = 16384 on recent
Mellanox cards, so the actual sc_sq_depth value will be ruled
by svcrdma_max_requests, and be 128.
The worst case is a 4K page machine writing a 1MB reply to a READ rpc,
which if I understand the code correctly uses a WR per page plus one
for the reply header, or 1024/4+1 = 257 WRs. Now imagine all the nfsd
threads(*) trying to do that to the same QP. SGI runs with 128 threads.
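Spelling the arithmetic out (a standalone sketch with the numbers
quoted above baked in, purely for illustration -- this is not
svcrdma code):

#include <stdio.h>

int main(void)
{
        int page_size     = 4096;          /* 4K page machine */
        int reply_size    = 1024 * 1024;   /* 1MB reply to a READ rpc */
        int sq_depth_mult = 8;             /* RPCRDMA_SQ_DEPTH_MULT */
        int max_requests  = 16;            /* RPCRDMA_MAX_REQUESTS */

        /* one WR per page of the reply, plus one for the reply header */
        int wrs_per_reply = reply_size / page_size + 1;
        int sc_sq_depth   = sq_depth_mult * max_requests;

        /* prints 257 vs 128: one worst-case reply alone overfills the SQ */
        printf("worst-case WRs %d, sc_sq_depth %d\n",
               wrs_per_reply, sc_sq_depth);
        return 0;
}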
In this scenario, every call to svc_rdma_send() stands a good chance
of blocking. The wait is not interruptible and has no timeout. If
the client's HCA has a conniption under this load, the server will
have some large fraction of the nfsds unkillably blocked until the
server's HCA gives up and reports an error (I presume it does this?).
The wspace management code in knfsd is designed to avoid having
nfsds ever block in the network stack; I don't see how the NFS/RDMA
code achieves that. Or is there something I've missed?
* I would expect there to be a client-side limit on the number of
outstanding READs so this doesn't normally happen, but part of the
reason for knfsd's wspace management is to avoid DoS attacks from
malicious or broken clients.
> There's actually a stat kept by the transport that counts the number of
> times it waits.
Ah, that would be rdma_stat_sq_starve? It's been added since the
November code drop so I hadn't noticed it. BTW is there a reason
for emitting statistics to userspace as sysctls?
> There is a place that a wait is done in the "normal" case and that's for
> the completion of an RDMA_READ in the process of gathering the data for
> an RPC on receive. That wait happens _every_ time.
Yes indeed, and this is the "worse" I was referring to. I have
a crash dump in which 122 of the 128 nfsd threads are doing this:
0 schedule+0x249c [0xa00000010052799c]
1 schedule_timeout+0x1ac [0xa00000010052958c]
2 rdma_read_xdr+0xf2c [0xa0000002216bcd8c]
3 svc_rdma_recvfrom+0x1e5c [0xa0000002216bed7c]
4 svc_recv+0xc8c [0xa00000022169f0ac]
5 nfsd+0x1ec [0xa000000221d7504c]
6 __svc_create_thread_tramp+0x30c [0xa0000002216976ac]
7 kernel_thread_helper+0xcc [0xa00000010001290c]
8 start_kernel_thread+0x1c [0xa0000001000094bc]
That doesn't leave a lot of threads to do real work, like waiting for
the filesystem. And every now and again something goes awry in
IB land and each thread ends up waiting for 6 seconds or more.
Greg.
On Tue, 2007-05-22 at 21:16 +1000, Greg Banks wrote:
> On Fri, May 18, 2007 at 08:33:54AM -0500, Tom Tucker wrote:
> > On Fri, 2007-05-18 at 14:05 +1000, Greg Banks wrote:
> > > On Thu, May 17, 2007 at 08:30:39PM +1000, Neil Brown wrote:
> > > > Does that mean that RDMA will never
> > > > reject a write due to lack of space?
> > >
> > > No, it means that the current RDMA send code will block waiting
> > > for space to become available. That's right, nfsd threads block on
> > > the network. Steel yourself, there's worse to come.
> > >
> >
> > Uh... Not really. The queue depths are designed to match credits to
> > worst case reply sizes. In the normal case, it should never have to
> > wait. The wait is to catch the margins in the same way that a kmalloc
> > will wait for memory to become available.
>
> This is news to me, but then I just read that code, not write it ;-)
Uh oh, I'm about to get schooled, aren't I...
> I just poked around in your GIT tree and AFAICT the software queue
> depth limits are set by this logic:
>
> 743 static int
> 744 svc_rdma_accept(struct svc_rqst *rqstp)
> 745 {
> ...
> 775 ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
> ...
> 788 newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
> 789 (size_t)svcrdma_max_requests);
> 790 newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
>
>
> 54 unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
>
> 143 #define RPCRDMA_SQ_DEPTH_MULT 8
> 145 #define RPCRDMA_MAX_REQUESTS 16
>
> A brief experiment shows that devattr.max_qp_wr = 16384 on recent
> Mellanox cards, so the actual sc_sq_depth value will be ruled
> by svcrdma_max_requests, and be 128.
>
> The worst case is a 4K page machine writing a 1MB reply to a READ rpc,
> which if I understand the code correctly uses a WR per page plus one
> for the reply header, or 1024/4+1 = 257 WRs. Now imagine all the nfsd
> threads(*) trying to do that to the same QP. SGI runs with 128 threads.
Fair enough, big data fills the SQ.
>
> In this scenario, every call to svc_rdma_send() stands a good chance
> of blocking. The wait is not interruptible and has no timeout. If
> the client's HCA has a conniption under this load, the server will
> have some large fraction of the nfsds unkillably blocked until the
> server's HCA gives up and reports an error (I presume it does this?).
I consider the client's hardware failing as outside the common
optimization case. But I think your argument still holds for big data.
>
> The wspace management code in knfsd is designed to avoid having
> nfsds ever block in the network stack; I don't see how the NFS/RDMA
> code achieves that. Or is there something I've missed?
>
You're dead on. We should implement wspace properly. I'll look into
fixing this.
> * I would expect there to be a client-side limit on the number of
> outstanding READs so this doesn't normally happen, but part of the
> reason for knfsd's wspace management is to avoid DoS attacks from
> malicious or broken clients.
>
> > There's actually a stat kept by the transport that counts the number of
> > times it waits.
>
> Ah, that would be rdma_stat_sq_starve? It's been added since the
> November code drop so I hadn't noticed it. BTW is there a reason
> for emitting statistics to userspace as sysctls?
The bulk of the stats were exported in order to evaluate different I/O
models. For example, do we poll some number of times on an SQ stall? How
about an RQ stall, etc... I think many of these stats should be removed.
I'd appreciate feedback on which ones we should retain.
>
> > There is a place that a wait is done in the "normal" case and that's for
> > the completion of an RDMA_READ in the process of gathering the data for
> > an RPC on receive. That wait happens _every_ time.
>
> Yes indeed, and this is the "worse" I was referring to. I have
> a crash dump in which 122 of the 128 nfsd threads are doing this:
> 0 schedule+0x249c [0xa00000010052799c]
> 1 schedule_timeout+0x1ac [0xa00000010052958c]
> 2 rdma_read_xdr+0xf2c [0xa0000002216bcd8c]
> 3 svc_rdma_recvfrom+0x1e5c [0xa0000002216bed7c]
> 4 svc_recv+0xc8c [0xa00000022169f0ac]
> 5 nfsd+0x1ec [0xa000000221d7504c]
> 6 __svc_create_thread_tramp+0x30c [0xa0000002216976ac]
> 7 kernel_thread_helper+0xcc [0xa00000010001290c]
> 8 start_kernel_thread+0x1c [0xa0000001000094bc]
>
>
>
> That doesn't leave a lot of threads to do real work, like waiting for
> the filesystem.
Hmm. I'd quibble with your example, but point taken. Other threads could
be doing work on another mount point for another client.
What's happening here is that you're pounding the server with NFS WRITE
and recvfrom is dutifully queuing another thread to process the next RPC
in the RQ, which happens to require another RDMA READ and another wait
and so on. As you point out, this is not productive and quickly consumes
all your threads -- Ugh.
A simple optimization to the current design for this case is that if the
RPC contains a read-list don't call svc_sock_enqueue until _after_ the
RDMA READ completes. The current thread will wait, but the remaining
threads (127 in your case) will be free to do other work for other mount
points.
This only falls over in the limit of 128 mounts all pounding the server
with writes. Comments?
> And every now and again something goes awry in
> IB land and each thread ends up waiting for 6 seconds or more.
Yes. Note that when this happens, the connection gets terminated.
>
> Greg.
On Tue, May 22, 2007 at 12:34:23PM -0500, Tom Tucker wrote:
> On Tue, 2007-05-22 at 21:16 +1000, Greg Banks wrote:
> >
> > The wspace management code in knfsd is designed to avoid having
> > nfsds ever block in the network stack; I don't see how the NFS/RDMA
> > code achieves that. Or is there something I've missed?
> >
>
> You're dead on. We should implement wspace properly. I'll look into
> fixing this.
Great.
> The bulk of the stats were exported in order to evaluate different I/O
> models. For example, do we poll some number of times on an SQ stall? How
> about an RQ stall, etc... I think many of these stats should be removed.
> I'd appreciate feedback on which ones we should retain.
Actually I quite like the stats, although the reap_[sr]q_at_* counters
don't seem to be used. For rdma_stat_rq_prod, I would have had a
counter increment where wc.status reports an error, thus futzing with
the global cacheline only once in the normal success path.
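I.e. something shaped like this (a sketch only: I'm guessing at the
sc_rq_cq field name, and assuming a plain integer counter is
acceptable, per the ++ discussion below):

static void rq_cq_reap_sketch(struct svcxprt_rdma *xprt)
{
        struct ib_wc wc;

        while (ib_poll_cq(xprt->sc_rq_cq, 1, &wc) > 0) {
                if (wc.status != IB_WC_SUCCESS) {
                        /* only the error path touches the shared counter */
                        rdma_stat_rq_prod++;
                        continue;
                }
                /* normal success path: no global cacheline traffic */
        }
}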
But I think it would be better to report them through the existing
/proc/net/rpc/nfsd mechanism, i.e. add an RDMA-specific call to
svc_seq_show() and add your new counters to struct svc_stat.
> What's happening here is that you're pounding the server with NFS WRITE
> and recvfrom is dutifully queuing another thread to process the next RPC
> in the RQ, which happens to require another RDMA READ and another wait
> and so on. As you point out, this is not productive and quickly consumes
> all your threads -- Ugh.
Exactly.
> A simple optimization to the current design for this case is that if the
> RPC contains a read-list don't call svc_sock_enqueue until _after_ the
> RDMA READ completes.
I assume you meant "don't release the SK_BUSY bit", because
svc_sock_enqueue will be called from the dto tasklet every time
a new call arrives whether or not we've finished the RDMA READ.
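Concretely, I'd imagine something like this in the recvfrom path
(illustrative only -- rpc_has_read_list() and svc_rdma_do_reads()
are names I've just invented):

        if (rpc_has_read_list(rmsgp)) {
                /*
                 * Keep SK_BUSY set so the dto tasklet's
                 * svc_sock_enqueue() calls leave this socket alone
                 * until the RDMA READs have completed.
                 */
                svc_rdma_do_reads(rdma_xprt, rmsgp); /* blocks this thread */
        }
        /* only now may other threads be woken for this socket */
        clear_bit(SK_BUSY, &svsk->sk_flags);
        svc_sock_enqueue(svsk);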
> The current thread will wait, but the remaining
> threads (127 in your case) will be free to do other work for other mount
> points.
>
> This only falls over in the limit of 128 mounts all pounding the server
> with writes. Comments?
A common deployment scenario I would expect to see would have at
least twice as many clients as threads, and SGI have successfully
tested scenarios with a 16:1 client:thread ratio. So I don't think
we can solve the problem in any way which ties down a thread for
up to 6 seconds per client, because there won't be enough threads
to go around.
The other case that needs to work is a single fat client trying to
throw as many bits as possible down the wire. For this case your
proposal would limit the server to effectively serving WRITE RPCs
serially, with what I presume would be a catastrophic effect on
write performance. What you want is for knfsd to be able to throw
as many threads at the single fat client as possible.
The knfsd design actually handles both these extreme cases quite
well today with TCP sockets. The features which I think are key
to providing this flexibility are:
* a pool of threads whose size is only loosely tied to the number
of clients
* threads are not tied to clients, they compete for individual
incoming RPCs
* threads take care to avoid blocking on the network
The problem we have achieving the last point for NFS/RDMA is that
RDMA READs are fundamentally neither a per-svc_sock nor a per-thread
concept. To achieve parallelism you want multiple sequences of RDMA
READs proceeding per svc_sock. To avoid blocking threads you want a
thread to be able to abandon a sequence of RDMA READs partway through
and go back to its main loop. Doing both implies a new object to hold
the state for a single sequence of RDMA READs for a single WRITE RPC.
So I think the "right" way to handle RDMA READs is to define a
new struct, for argument's sake "struct svc_rdma_read", containing
some large fraction of the auto variables currently in the function
rdma_read_xdr() and some from its caller. When an RPC with a read
list arrives, allocate a struct svc_rdma_read and add it to a list
of such structs on the corresponding struct svcxprt_rdma. Issue some
READ WRs, until doing so would block. When the completions for issued
WRs arrive, mark the struct svc_rdma_read, instead of waking the
nfsd as you do now, set SK_DATA on the svcxprt_rdma and enqueue it.
When an nfsd later calls svc_rdma_recvfrom(), first check whether any
of the struct svc_rdma_reads in flight have progressed, and depending
on whether they're complete either issue some more READ WRs or finalise
the struct svc_rdma_read by setting up the thread's page array.
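For concreteness, the state object might look vaguely like this
(every field name here is invented for illustration; the real
contents would be lifted from rdma_read_xdr()'s locals and its
caller):

struct svc_rdma_read {
        struct list_head  sr_list;      /* on the svcxprt_rdma's in-flight list */
        u32               sr_xid;       /* the rpc whose read list this is */
        int               sr_issued;    /* READ WRs posted so far */
        int               sr_completed; /* READ WR completions seen */
        int               sr_total;     /* WRs needed for the whole read list */
        struct page      *sr_pages[RPCSVC_MAXPAGES]; /* pages being filled */
};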
In other words, invert the rdma_read_xdr() logic so nfsds return
to the main loop instead of blocking.
Unfortunately it's kind of a major change. Thoughts?
> > And every now and again something goes awry in
> > IB land and each thread ends up waiting for 6 seconds or more.
>
> Yes. Note that when this happens, the connection gets terminated.
Indeed. Meanwhile, for the last 6 seconds all the nfsds are
blocked waiting for the one client, and none of the other clients
are getting any service.
Greg.
On 5/22/07 9:32 PM, "Greg Banks" <[email protected]> wrote:
> On Tue, May 22, 2007 at 12:34:23PM -0500, Tom Tucker wrote:
>> On Tue, 2007-05-22 at 21:16 +1000, Greg Banks wrote:
>>>
>>> The wspace management code in knfsd is designed to avoid having
>>> nfsds ever block in the network stack; I don't see how the NFS/RDMA
>>> code achieves that. Or is there something I've missed?
>>>
>>
>> You're dead on. We should implement wspace properly. I'll look into
>> fixing this.
>
> Great.
>
>> The bulk of the stats were exported in order to evaluate different I/O
>> models. For example, do we poll some number of times on an SQ stall? How
>> about an RQ stall, etc... I think many of these stats should be removed.
>> I'd appreciate feedback on which ones we should retain.
>
> Actually I quite like the stats, although the reap_[sr]q_at_* counters
> don't seem to be used. For rdma_stat_rq_prod, I would have had a
> counter increment where wc.status reports an error, thus futzing with
> the global cacheline only once in the normal success path.
>
> But I think it would be better to report them through the existing
> /proc/net/rpc/nfsd mechanism, i.e. add an RDMA-specific call to
> svc_seq_show() and add your new counters to struct svc_stat.
>
I need to do a little clean up (e.g. atomic_inc()) and then I'll add them as
suggested. If there are dissenters, please speak now...
>
>> What's happening here is that you're pounding the server with NFS WRITE
>> and recvfrom is dutifully queuing another thread to process the next RPC
>> in the RQ, which happens to require another RDMA READ and another wait
>> and so on. As you point out, this is not productive and quickly consumes
>> all your threads -- Ugh.
>
> Exactly.
>
>> A simple optimization to the current design for this case is that if the
>> RPC contains a read-list don't call svc_sock_enqueue until _after_ the
>> RDMA READ completes.
>
> I assume you meant "don't release the SK_BUSY bit", because
> svc_sock_enqueue will be called from the dto tasklet every time
> a new call arrives whether or not we've finished the RDMA READ.
>
Right. Good point. I might have spent a few hours debugging that one ;-)
>> The current thread will wait, but the remaining
>> threads (127 in your case) will be free to do other work for other mount
>> points.
>>
>> This only falls over in the limit of 128 mounts all pounding the server
>> with writes. Comments?
>
> A common deployment scenario I would expect to see would have at
> least twice as many clients as threads, and SGI have successfully
> tested scenarios with a 16:1 client:thread ratio. So I don't think
> we can solve the problem in any way which ties down a thread for
> up to 6 seconds per client, because there won't be enough threads
> to go around.
Agreed. But isn't that the 100 dead adapter scenario? Someone has to wait.
Who?
>
> The other case that needs to work is a single fat client trying to
> throw as many bits as possible down the wire. For this case your
> proposal would limit the server to effectively serving WRITE RPCs
> serially,
Not entirely. The approach would overlap the RPC fetch of the next WRITE
with the response to the previous.
>... with what I presume would be a catastrophic effect on
> write performance. What you want is for knfsd to be able to throw
> as many threads at the single fat client as possible.
>
> The knfsd design actually handles both these extreme cases quite
> well today with TCP sockets. The features which I think are key
> to providing this flexibility are:
>
> * a pool of threads whose size is only loosely tied to the number
> of clients
>
> * threads are not tied to clients, they compete for individual
> incoming RPCs
>
> * threads take care to avoid blocking on the network
>
> The problem we have achieving the last point for NFS/RDMA is that
> RDMA READs are fundamentally neither a per-svc_sock nor a per-thread
> concept. To achieve parallelism you want multiple sequences of RDMA
> READs proceeding per svc_sock. To avoid blocking threads you want a
> thread to be able to abandon a sequence of RDMA READs partway through
> and go back to its main loop. Doing both implies a new object to hold
> the state for a single sequence of RDMA READs for a single WRITE RPC.
>
> So I think the "right" way to handle RDMA READs is to define a
> new struct, for argument's sake "struct svc_rdma_read", containing
> some large fraction of the auto variables currently in the function
> rdma_read_xdr() and some from its caller. When an RPC with a read
> list arrives, allocate a struct svc_rdma_read and add it to a list
> of such structs on the corresponding struct svcxprt_rdma. Issue some
> READ WRs, until doing so would block. When the completions for issued
> WRs arrive, mark the struct svc_rdma_read, instead of waking the
> nfsd as you do now, set SK_DATA on the svcxprt_rdma and enqueue it.
> When an nfsd later calls svc_rdma_recvfrom(), first check whether any
> of the struct svc_rdma_reads in flight have progressed, and depending
> on whether they're complete either issue some more READ WRs or finalise
> the struct svc_rdma_read by setting up the thread's page array.
>
> In other words, invert the rdma_read_xdr() logic so nfsds return
> to the main loop instead of blocking.
>
> Unfortunately it's kind of a major change. Thoughts?
>
Ok, I love it and I hate it :-) This is the consolidated waiter strategy
that solves all the problems... except... does this expose any read/write
ordering issues at the client? Couldn't the client issue a write followed by
a read and get the original data?
That's the hate it part. If we need to decide which requests are allowed to
proceed on a QP with outstanding READ WR, things get messy quick.
Is there no such requirement?
>>> And every now and again something goes awry in
>>> IB land and each thread ends up waiting for 6 seconds or more.
>>
>> Yes. Note that when this happens, the connection gets terminated.
>
> Indeed. Meanwhile, for the last 6 seconds all the nfsds are
> blocked waiting for the one client, and none of the other clients
> are getting any service.
We need to think about this. A dead adapter causes havoc.
>
> Greg.
On Wed, May 23, 2007 at 12:22:44AM -0500, Tom Tucker wrote:
>
>
>
> On 5/22/07 9:32 PM, "Greg Banks" <[email protected]> wrote:
>
> > On Tue, May 22, 2007 at 12:34:23PM -0500, Tom Tucker wrote:
>
> I need to do a little clean up (e.g. atomic_inc()) and then I'll add them as
> suggested. If there are dissenters, please speak now...
You should be able to get away with just ++. They're only stats,
it's not like you have logic depending on them being exact.
> >> The current thread will wait, but the remaining
> >> threads (127 in your case) will be free to do other work for other mount
> >> points.
> >>
> >> This only falls over in the limit of 128 mounts all pounding the server
> >> with writes. Comments?
> >
> > A common deployment scenario I would expect to see would have at
> > least twice as many clients as threads, and SGI have successfully
> > tested scenarios with a 16:1 client:thread ratio. So I don't think
> > we can solve the problem in any way which ties down a thread for
> > up to 6 seconds per client, because there won't be enough threads
> > to go around.
>
> Agreed. But isn't that the 100 dead adapter scenario?
Umm?
> Someone has to wait.
> Who?
> > In other words, invert the rdma_read_xdr() logic so nfsds return
> > to the main loop instead of blocking.
> >
> > Unfortunately it's kind of a major change. Thoughts?
> >
>
> Ok, I love it and I hate it :-) This is the consolidated waiter strategy
> that solves all the problems...except ...does this expose any read/write
> ordering issues at the client? Couldn't the client issue a write followed by
> a read and get the original data?
The server doesn't guarantee that the order of completion of calls
in the filesystem is the same as the order of emission of those calls
by the client. On UDP, the calls might not even arrive at the server
in the same order they left the client.
If the client needs to ensure that a READ happens after a WRITE,
it needs to wait for the WRITE reply before emitting the READ; that
should still be true with this change.
> That's the hate it part. If we need to decide which requests are allowed to
> proceed on a QP with outstanding READ WR, things get messy quick.
>
> Is there no such requirement?
I don't think so.
You might want to limit the number of RDMA READ streams in flight.
> >>> And every now and again something goes awry in
> >>> IB land and each thread ends up waiting for 6 seconds or more.
> >>
> >> Yes. Note that when this happens, the connection gets terminated.
> >
> > Indeed. Meanwhile, for the last 6 seconds all the nfsds are
> > blocked waiting for the one client, and none of the other clients
> > are getting any service.
>
> We need to think about this. A dead adapter causes havoc.
A dead adaptor on the server should cause havoc, at least for as long
as it takes HA to kick in. A dead adaptor on a client should affect
only that client and no other clients.
Greg.
On Wed, May 23, 2007 at 09:36:55AM -0400, Chuck Lever wrote:
> Greg Banks wrote:
> >On Wed, May 23, 2007 at 12:22:44AM -0500, Tom Tucker wrote:
> >>
> >>
> >>On 5/22/07 9:32 PM, "Greg Banks" <[email protected]> wrote:
> >>
> >>>On Tue, May 22, 2007 at 12:34:23PM -0500, Tom Tucker wrote:
> >>I need to do a little clean up (e.g. atomic_inc()) and then I'll add them
> >>as
> >>suggested. If there are dissenters, please speak now...
> >
> >You should be able to get away with just ++. They're only stats,
> >it's not like you have logic depending on them being exact.
>
> The usual way to do this is to allocate separate stat arrays for each
> CPU. All the per-CPU arrays are summed only when something wants to
> display stat totals.
>
> That way you can use ++, be fairly sure you will get accurate
> statistics, and not have to be concerned about CPU cache effects or lock
> contention.
Here in SGI, home of the multiple-thousand CPU Linux boxes, making
data structures per-cpu tends to be done...thoughtfully.
In any case, the remainder of svc_stat gets no such treatment today.
Greg.
On Thu, May 17, 2007 at 05:22:11AM +1000, Greg Banks wrote:
> @@ -268,21 +252,15 @@ svc_sock_enqueue(struct svc_sock *svsk)
> BUG_ON(svsk->sk_pool != NULL);
> svsk->sk_pool = pool;
>
> - set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> - if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2
> - > svc_sock_wspace(svsk))
> + if (svsk->sk_ops->sko_has_wspace
> && !test_bit(SK_CLOSE, &svsk->sk_flags)
> && !test_bit(SK_CONN, &svsk->sk_flags)) {
> - /* Don't enqueue while not enough space for reply */
> - dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n",
> - svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg,
> - svc_sock_wspace(svsk));
> - svsk->sk_pool = NULL;
> - clear_bit(SK_BUSY, &svsk->sk_flags);
> - goto out_unlock;
> + if (!svsk->sk_ops->sko_has_wspace(svsk)) {
> + svsk->sk_pool = NULL;
> + clear_bit(SK_BUSY, &svsk->sk_flags);
> + goto out_unlock;
> + }
> }
> - clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> -
>
> if (!list_empty(&pool->sp_threads)) {
> rqstp = list_entry(pool->sp_threads.next,
> @@ -904,13 +882,42 @@ svc_tcpip_prepare_reply(struct svc_rqst
> return 0;
> }
>
> +/**
> + * svc_sock_has_write_space - Checks if there is enough space
> + * to send the reply on the socket.
> + * @svsk: the svc_sock to write on
> + * @wspace: the number of bytes available for writing
> + */
> +static int svc_sock_has_write_space(struct svc_sock *svsk, int wspace)
> +{
> + struct svc_serv *serv = svsk->sk_server;
> + int required = atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg;
> +
> + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> + if (required*2 > wspace) {
> + /* Don't enqueue while not enough space for reply */
> + dprintk("svc: socket %p no space, %d*2 > %d, not enqueued\n",
> + svsk->sk_sk, required, wspace);
> + return 0;
> + }
> + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> + return 1;
> +}
So, this is just my ignorance--why do the set and clear of SOCK_NOSPACE
need to be ordered in the way they are? (Why not just set once inside
the if clause?)
--b.
On Wed, May 16, 2007 at 05:10:53PM -0400, J. Bruce Fields wrote:
> On Thu, May 17, 2007 at 05:22:11AM +1000, Greg Banks wrote:
> > + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> > + if (required*2 > wspace) {
> > + /* Don't enqueue while not enough space for reply */
> > + dprintk("svc: socket %p no space, %d*2 > %d, not enqueued\n",
> > + svsk->sk_sk, required, wspace);
> > + return 0;
> > + }
> > + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> > + return 1;
> > +}
>
> So, this is just my ignorance--why do the set and clear of SOCK_NOSPACE
> need to be ordered in the way they are? (Why not just set once inside
> the if clause?)
I can't see a good reason for it, but I'm trying to minimise
perturbations to the logic.
Greg.
On Thursday May 17, [email protected] wrote:
> On Wed, May 16, 2007 at 05:10:53PM -0400, J. Bruce Fields wrote:
> > On Thu, May 17, 2007 at 05:22:11AM +1000, Greg Banks wrote:
> > > + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> > > + if (required*2 > wspace) {
> > > + /* Don't enqueue while not enough space for reply */
> > > + dprintk("svc: socket %p no space, %d*2 > %d, not enqueued\n",
> > > + svsk->sk_sk, required, wspace);
> > > + return 0;
> > > + }
> > > + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> > > + return 1;
> > > +}
> >
> > So, this is just my ignorance--why do the set and clear of SOCK_NOSPACE
> > need to be ordered in the way they are? (Why not just set once inside
> > the if clause?)
>
> I can't see a good reason for it, but I'm trying to minimise
> perturbations to the logic.
Unfortunately, you actually perturbed the important bit... Or at
least, the bit that I thought was important when I wrote it.
Previously, sk_stream_wspace(), or sock_wspace() would be called *after*
SOCK_NOSPACE was set. With your patch it is called *before*.
It is a fairly improbable race, but if the output queue flushed
completely between calling XX_wspace and setting SOCK_NOSPACE, the
sk_write_space callback might never get called.
I wonder if we need a memory barrier to ensure that this actually works?
So I think that code needs to be rearranged a bit, and maybe
commented, in case I don't remember next time :-)
And I gather by the fact that you test "->sko_has_wspace" that RDMA
doesn't have such a function? Does that mean that RDMA will never
reject a write due to lack of space? That seems unlikely.
I would rather assume that every transport has a sko_has_wspace
function...
NeilBrown
At 06:30 AM 5/17/2007, Neil Brown wrote:
>And I gather by the fact that you test "->sko_has_wspace" that RDMA
>doesn't have such a function? Does that mean that RDMA will never
>reject a write due to lack of space? That seems unlikely.
RDMA uses credits, which are preallocated per-connection resources in
the hardware. RDMA transports don't generally support flow control; that
is left to the upper layer. As such, if the client is obeying the protocol, the
send will succeed. And if it's not, then the client only loses since the failure
will cause the connection to be lost.
Someday we might want to support this kind of test. There are shared
resource schemes in RDMA, more for the receive side, but it's not out of the
question for sends. As a default though, a NULL has_wspace makes sense.
Tom.
On Thursday May 17, [email protected] wrote:
> At 06:30 AM 5/17/2007, Neil Brown wrote:
> >And I gather by the fact that you test "->sko_has_wspace" that RDMA
> >doesn't have such a function? Does that mean that RDMA will never
> >reject a write due to lack of space? That seems unlikely.
>
> RDMA uses credits, which are preallocated per-connection resources in
> the hardware. RDMA transports don't generally support flow control; that
> is left to the upper layer. As such, if the client is obeying the protocol, the
> send will succeed. And if it's not, then the client only loses since the failure
> will cause the connection to be lost.
>
> Someday we might want to support this kind of test. There are shared
> resource schemes in RDMA, more for the receive side, but it's not out of the
> question for sends. As a default though, a NULL has_wspace makes sense.
Thanks for the explanation.
So presumably the server communicates to the client how much buffer
space it has via these credits, so that we can be sure a write will
never fail (as long as the protocol is being honoured). That sounds
reasonable.
I think I would rather
int rdma_has_wspace(struct svc_sock *svsk) { return 1; }
than NULL. It removes special cases in the generic code, and gives an
obvious place for a comment explaining that RDMA doesn't need flow
control because it uses credits.
NeilBrown
On Thu, May 17, 2007 at 08:30:39PM +1000, Neil Brown wrote:
> On Thursday May 17, [email protected] wrote:
> > On Wed, May 16, 2007 at 05:10:53PM -0400, J. Bruce Fields wrote:
> > > On Thu, May 17, 2007 at 05:22:11AM +1000, Greg Banks wrote:
> > > > + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> > > > + if (required*2 > wspace) {
> > > > + /* Don't enqueue while not enough space for reply */
> > > > + dprintk("svc: socket %p no space, %d*2 > %d, not enqueued\n",
> > > > + svsk->sk_sk, required, wspace);
> > > > + return 0;
> > > > + }
> > > > + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> > > > + return 1;
> > > > +}
> > >
> > > So, this is just my ignorance--why do the set and clear of SOCK_NOSPACE
> > > need to be ordered in the way they are? (Why not just set once inside
> > > the if clause?)
> >
> > I can't see a good reason for it, but I'm trying to minimise
> > perturbations to the logic.
>
> Unfortunately, you actually perturbed the important bit... Or at
> least, the bit that I thought was important when I wrote it.
>
> Previously, sk_stream_wspace(), or sock_wspace() would be called *after*
> SOCK_NOSPACE was set. With your patch it is called *before*.
>
> It is a fairly improbable race, but if the output queue flushed
> completely between calling XX_wspace and setting SOCK_NOSPACE, the
> sk_write_space callback might never get called.
Woops. I'll fix that.
> And I gather by the fact that you test "->sko_has_wspace" that RDMA
> doesn't have such a function?
You gather correctly.
> Does that mean that RDMA will never
> reject a write due to lack of space?
No, it means that the current RDMA send code will block waiting
for space to become available. That's right, nfsd threads block on
the network. Steel yourself, there's worse to come.
> That seems unlikely.
> I would rather assume that every transport has a sko_has_wspace
> function...
Ok, but for today the RDMA one will be
static int svc_rdma_has_wspace(struct svc_sock *svsk)
{
return 1;
}
We might be able to pull the condition in the blocking logic out
of svc_rdma_send() to implement an sko_has_wspace, but there's
something of an impedance mismatch. The RDMA queue limit is expressed
in Work Requests not bytes, and the mapping between the two is not
precisely visible at the point when has_wspace is called. I guess
we'd have to use an upper bound.
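If we did, it might look vaguely like this (a sketch under stated
assumptions: svc_rdma_xprt() as the container accessor and
sc_sq_count as the in-flight WR counter are both my inventions):

static int svc_rdma_has_wspace(struct svc_sock *svsk)
{
        struct svcxprt_rdma *rdma = svc_rdma_xprt(svsk);
        struct svc_serv *serv = svsk->sk_server;
        /* upper bound: one WR per page of the largest reply, plus
         * one for the reply header */
        int wrs_needed = serv->sv_max_mesg / PAGE_SIZE + 1;
        int sq_free = rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count);

        return wrs_needed <= sq_free;
}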
Greg.
On Thu, May 17, 2007 at 08:30:39PM +1000, Neil Brown wrote:
> On Thursday May 17, [email protected] wrote:
> > On Wed, May 16, 2007 at 05:10:53PM -0400, J. Bruce Fields wrote:
> > > On Thu, May 17, 2007 at 05:22:11AM +1000, Greg Banks wrote:
> > > > + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> > > > + if (required*2 > wspace) {
> > > > + /* Don't enqueue while not enough space for reply */
> > > > + dprintk("svc: socket %p no space, %d*2 > %d, not enqueued\n",
> > > > + svsk->sk_sk, required, wspace);
> > > > + return 0;
> > > > + }
> > > > + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> > > > + return 1;
> > > > +}
> > >
> > > So, this is just my ignorance--why do the set and clear of SOCK_NOSPACE
> > > need to be ordered in the way they are? (Why not just set once inside
> > > the if clause?)
> >
> > I can't see a good reason for it, but I'm trying to minimise
> > perturbations to the logic.
>
> Unfortunately, you actually perturbed the important bit... Or at
> least, the bit that I thought was important when I wrote it.
>
> Previously, sk_stream_wspace(), or sock_wspace() would be called *after*
> SOCK_NOSPACE was set. With your patch it is called *before*.
Latest patch does this:
+/**
+ * svc_udp_has_wspace - Checks if there is enough space
+ * to send the reply on the socket without blocking.
+ * @svsk: the svc_sock to write on
+ * @required: the number of bytes which should be available
+ */
+static int
+svc_udp_has_wspace(struct svc_sock *svsk, int required)
+{
+ int wspace;
+
+ /*
+ * Neil Brown explains the gymnastics with SOCK_NOSPACE thus:
+ * It is a fairly improbable race, but if the output queue
+ * flushed completely between calling sk_stream_wspace() or
+ * sock_wspace() and setting SOCK_NOSPACE, the sk_write_space
+ * callback might never get called.
+ */
+ set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+ wspace = sock_wspace(svsk->sk_sk);
+ if (required*2 > wspace) {
+ /* Don't enqueue while not enough space for reply */
+ dprintk("svc: socket %p no space, %d*2 > %d, not enqueued\n",
+ svsk->sk_sk, required, wspace);
+ return 0;
+ }
+ clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+ return 1;
+}
+
And the same gymnastics are repeated in svc_tcp_has_wspace() which
now has no common code with the UDP case. I also calculate `required'
in generic code and pass it in.
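For reference, the TCP twin then reads like the UDP version with
sk_stream_wspace() substituted (reconstructed from the description
above rather than pasted from the patch):

static int
svc_tcp_has_wspace(struct svc_sock *svsk, int required)
{
        int wspace;

        /* same SOCK_NOSPACE-before-wspace ordering as the UDP case */
        set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
        wspace = sk_stream_wspace(svsk->sk_sk);
        if (required*2 > wspace) {
                /* Don't enqueue while not enough space for reply */
                dprintk("svc: socket %p no space, %d*2 > %d, not enqueued\n",
                        svsk->sk_sk, required, wspace);
                return 0;
        }
        clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
        return 1;
}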
Greg.
On Friday May 18, [email protected] wrote:
> >
> > Previously, sk_stream_wspace(), or sock_wspace() would be called *after*
> > SOCK_NOSPACE was set. With your patch it is called *before*.
>
> Latest patch does this:
>
Looks good, thanks.
NeilBrown
On Fri, 2007-05-18 at 14:05 +1000, Greg Banks wrote:
> On Thu, May 17, 2007 at 08:30:39PM +1000, Neil Brown wrote:
> > On Thursday May 17, [email protected] wrote:
> > > On Wed, May 16, 2007 at 05:10:53PM -0400, J. Bruce Fields wrote:
> > > > On Thu, May 17, 2007 at 05:22:11AM +1000, Greg Banks wrote:
> > > > > + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> > > > > + if (required*2 > wspace) {
> > > > > + /* Don't enqueue while not enough space for reply */
> > > > > + dprintk("svc: socket %p no space, %d*2 > %d, not enqueued\n",
> > > > > + svsk->sk_sk, required, wspace);
> > > > > + return 0;
> > > > > + }
> > > > > + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> > > > > + return 1;
> > > > > +}
> > > >
> > > > So, this is just my ignorance--why do the set and clear of SOCK_NOSPACE
> > > > need to be ordered in the way they are? (Why not just set once inside
> > > > the if clause?)
> > >
> > > I can't see a good reason for it, but I'm trying to minimise
> > > perturbations to the logic.
> >
> > Unfortunately, you actually perturbed the important bit... Or at
> > least, the bit that I thought was important when I wrote it.
> >
> > Previously, sk_stream_wspace(), or sock_wspace() would be called *after*
> > SOCK_NOSPACE was set. With your patch it is called *before*.
> >
> > It is a fairly improbable race, but if the output queue flushed
> > completely between calling XX_wspace and setting SOCK_NOSPACE, the
> > sk_write_space callback might never get called.
>
> Woops. I'll fix that.
>
> > And I gather by the fact that you test "->sko_has_wspace" that RDMA
> > doesn't have such a function?
>
> You gather correctly.
>
> > > Does that mean that RDMA will never
> > reject a write due to lack of space?
>
> No, it means that the current RDMA send code will block waiting
> for space to become available. That's right, nfsd threads block on
> the network. Steel yourself, there's worse to come.
>
Uh... Not really. The queue depths are designed to match credits to
worst case reply sizes. In the normal case, it should never have to
wait. The wait is to catch the margins in the same way that a kmalloc
will wait for memory to become available.
There's actually a stat kept by the transport that counts the number of
times it waits.
There is a place that a wait is done in the "normal" case and that's for
the completion of an RDMA_READ in the process of gathering the data for
an RPC on receive. That wait happens _every_ time.
> > That seems unlikely.
> > I would rather assume that every transport has a sko_has_wspace
> > function...
>
> Ok, but for today the RDMA one will be
>
> static int svc_rdma_has_wspace(struct svc_sock *svsk)
> {
> return 1;
> }
>
> We might be able to pull the condition in the blocking logic out
> of svc_rdma_send() to implement an sko_has_wspace, but there's
> something of an impedance mismatch. The RDMA queue limit is expressed
> in Work Requests not bytes, and the mapping between the two is not
> precisely visible at the point when has_wspace is called. I guess
> we'd have to use an upper bound.
> Greg.
On Fri, 2007-05-18 at 08:33 -0500, Tom Tucker wrote:
> On Fri, 2007-05-18 at 14:05 +1000, Greg Banks wrote:
> > On Thu, May 17, 2007 at 08:30:39PM +1000, Neil Brown wrote:
> > > On Thursday May 17, [email protected] wrote:
> > > > On Wed, May 16, 2007 at 05:10:53PM -0400, J. Bruce Fields wrote:
> > > > > On Thu, May 17, 2007 at 05:22:11AM +1000, Greg Banks wrote:
> > > > > > + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> > > > > > + if (required*2 > wspace) {
> > > > > > + /* Don't enqueue while not enough space for reply */
> > > > > > + dprintk("svc: socket %p no space, %d*2 > %d, not enqueued\n",
> > > > > > + svsk->sk_sk, required, wspace);
> > > > > > + return 0;
> > > > > > + }
> > > > > > + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
> > > > > > + return 1;
> > > > > > +}
> > > > >
> > > > > So, this is just my ignorance--why do the set and clear of SOCK_NOSPACE
> > > > > need to be ordered in the way they are? (Why not just set once inside
> > > > > the if clause?)
> > > >
> > > > I can't see a good reason for it, but I'm trying to minimise
> > > > perturbations to the logic.
> > >
> > > Unfortunately, you actually perturbed the important bit... Or at
> > > least, the bit that I thought was important when I wrote it.
> > >
> > > Previously, sk_stream_wspace(), or sock_wspace() would be called *after*
> > > SOCK_NOSPACE was set. With your patch it is called *before*.
> > >
> > > It is a fairly improbable race, but if the output queue
> > > completely between calling XX_wspace and setting SOCK_NOSPACE, the
> > > sk_write_space callback might never get called.
> >
> > Woops. I'll fix that.
> >
> > > And I gather by the fact that you test "->sko_has_wspace" that RDMA
> > > doesn't have such a function?
> >
> > You gather correctly.
> >
> > > Does that mean that RDMA will never
> > > reject a write due to lack of space?
> >
> > No, it means that the current RDMA send code will block waiting
> > for space to become available. That's right, nfsd threads block on
> > the network. Steel yourself, there's worse to come.
> >
>
> Uh... Not really. The queue depths are designed to match credits to
> worst case reply sizes. In the normal case, it should never have to
> wait. The wait is to catch the margins in the same way that a kmalloc
> will wait for memory to become available.
What I mean here is that the server code will wait when a kmalloc fails,
not that kmalloc itself will wait.
>
> There's actually a stat kept by the transport that counts the number of
> times it waits.
>
> There is a place that a wait is done in the "normal" case and that's for
> the completion of an RDMA_READ in the process of gathering the data for
> an RPC on receive. That wait happens _every_ time.
>
> > > That seems unlikely.
> > > I would rather assume that every transport has a sko_has_wspace
> > > function...
> >
> > Ok, but for today the RDMA one will be
> >
> > static int svc_rdma_has_wspace(struct svc_sock *svsk)
> > {
> > return 1;
> > }
> >
> > We might be able to pull the condition in the blocking logic out
> > of svc_rdma_send() to implement an sko_has_wspace, but there's
> > something of an impedance mismatch. The RDMA queue limit is expressed
> > in Work Requests not bytes, and the mapping between the two is not
> > precisely visible at the point when has_wspace is called. I guess
> > we'd have to use an upper bound.
> > Greg.
At 12:05 AM 5/18/2007, Greg Banks wrote:
>> Does that mean that RDMA will never
>> reject a write due to lack of space?
>
>No, it means that the current RDMA send code will block waiting
>for space to become available. That's right, nfsd threads block on
>the network. Steel yourself, there's worse to come.
Huh? RDMA Sends never block, except to wait for the handshake as the
hardware picks up the work request.
What's the "worse" part you refer to?
Tom.