We can get -EAGAIN when writing to a socket for one of two reasons.
1/ The accounting done by the network layer reports that the socket has
used all the buffer space it is allowed
2/ kmalloc failed.
In the first case we are certain to get a ->sk_write_space callback
when space becomes available. In the second case there is no such
guarantee and we need to wait a short time.
sk_stream_wait_memory does exactly this calculating 'vm_wait' with
a magic formula which this patch copies.
The current code will always check if the socket is writeable after
preparing to receive the ->sk_write_space callback. In the kmalloc-failed
case this is usually true so the task is immediately woken, so it spins until
kmalloc succeeds, causing CPU wastage and soft-lockup messages.
So when calling xs_nospace() we check if the socket is actually writeable.
If it is, we assume a kmalloc failure (which may be inaccurate) and
set a timer for a short random wait after which we try again. We
always wait in this case, possibly longer than needed, but not very long.
If the socket is not writeable, we follow the current path which will cause
the task to sleep until the socket is writable.
Fixes: 06ea0bfe6e60 ("SUNRPC: Fix races in xs_nospace()")
Signed-off-by: NeilBrown <[email protected]>
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 8b93ef53df3c..146917ab1bcb 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -336,7 +336,7 @@ int xprt_load_transport(const char *);
void xprt_set_retrans_timeout_def(struct rpc_task *task);
void xprt_set_retrans_timeout_rtt(struct rpc_task *task);
void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status);
-void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action);
+void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action, int writeable);
void xprt_write_space(struct rpc_xprt *xprt);
void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result);
struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 826fecf19350..094f13f908d8 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -492,12 +492,17 @@ EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks);
* we don't in general want to force a socket disconnection due to
* an incomplete RPC call transmission.
*/
-void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action)
+void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action,
+ int writeable)
{
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0;
+ if (writeable)
+ /* waiting due to kmalloc failure */
+ /* See sk_stream_wait_memory */
+ task->tk_timeout = (prandom_u32() % (HZ / 5)) + 2;
rpc_sleep_on(&xprt->pending, task, action);
}
EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 66891e32c5e3..5a1bd21f5957 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -443,12 +443,22 @@ static void xs_nospace_callback(struct rpc_task *task)
clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
}
+static void xs_nomem_callback(struct rpc_task *task)
+{
+ struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt);
+
+ if (task->tk_status == -ETIMEDOUT)
+ task->tk_status = 0;
+ transport->inet->sk_write_pending--;
+ clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+}
+
/**
* xs_nospace - place task on wait queue if transmit was incomplete
* @task: task to put to sleep
*
*/
-static int xs_nospace(struct rpc_task *task)
+static int xs_nospace(struct rpc_task *task, int writeable)
{
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
@@ -473,7 +483,12 @@ static int xs_nospace(struct rpc_task *task)
set_bit(SOCK_NOSPACE, &transport->sock->flags);
sk->sk_write_pending++;
/* ...and wait for more buffer space */
- xprt_wait_for_buffer_space(task, xs_nospace_callback);
+ if (writeable)
+ xprt_wait_for_buffer_space(
+ task, xs_nomem_callback, 1);
+ else
+ xprt_wait_for_buffer_space(
+ task, xs_nospace_callback, 0);
}
} else {
clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
@@ -483,7 +498,8 @@ static int xs_nospace(struct rpc_task *task)
spin_unlock_bh(&xprt->transport_lock);
/* Race breaker in case memory is freed before above code is called */
- sk->sk_write_space(sk);
+ if (!writeable)
+ sk->sk_write_space(sk);
return ret;
}
@@ -540,7 +556,7 @@ static int xs_local_send_request(struct rpc_task *task)
switch (status) {
case -ENOBUFS:
case -EAGAIN:
- status = xs_nospace(task);
+ status = xs_nospace(task, sock_writeable(transport->inet));
break;
default:
dprintk("RPC: sendmsg returned unrecognized error %d\n",
@@ -604,7 +620,7 @@ process_status:
/* Should we call xs_close() here? */
break;
case -EAGAIN:
- status = xs_nospace(task);
+ status = xs_nospace(task, sock_writeable(transport->inet));
break;
default:
dprintk("RPC: sendmsg returned unrecognized error %d\n",
@@ -712,7 +728,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
break;
case -ENOBUFS:
case -EAGAIN:
- status = xs_nospace(task);
+ status = xs_nospace(task, sk_stream_is_writeable(transport->inet));
break;
default:
dprintk("RPC: sendmsg returned unrecognized error %d\n",
Hi Neil,
There should already be a handler for ENOBUFS in call_status, but
I can see that it has a couple of flaws. What say we try to fix that
instead?
Cheers
Trond
--
2.4.3
ENOBUFS means that memory allocations are failing due to an actual
low memory situation. It should not be confused with being out of
socket buffer space.
Handle the problem by just punting to the delay in call_status.
Reported-by: Neil Brown <[email protected]>
Signed-off-by: Trond Myklebust <[email protected]>
---
net/sunrpc/xprtsock.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index ee359fc7af16..44c1927b68c7 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -539,6 +539,7 @@ static int xs_local_send_request(struct rpc_task *task)
switch (status) {
case -ENOBUFS:
+ break;
case -EAGAIN:
status = xs_nospace(task);
break;
@@ -692,7 +693,6 @@ static int xs_tcp_send_request(struct rpc_task *task)
status = -ENOTCONN;
/* Should we call xs_close() here? */
break;
- case -ENOBUFS:
case -EAGAIN:
status = xs_nospace(task);
break;
@@ -703,6 +703,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
case -ECONNREFUSED:
case -ENOTCONN:
case -EADDRINUSE:
+ case -ENOBUFS:
case -EPIPE:
clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
}
--
2.4.3
If we're running out of buffer memory when transmitting data, then
we want to just delay for a moment, and then continue transmitting
the remainder of the message.
Signed-off-by: Trond Myklebust <[email protected]>
---
net/sunrpc/clnt.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index cbc6af923dd1..23608eb0ded2 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1902,6 +1902,7 @@ call_transmit_status(struct rpc_task *task)
switch (task->tk_status) {
case -EAGAIN:
+ case -ENOBUFS:
break;
default:
dprint_status(task);
@@ -1928,7 +1929,6 @@ call_transmit_status(struct rpc_task *task)
case -ECONNABORTED:
case -EADDRINUSE:
case -ENOTCONN:
- case -ENOBUFS:
case -EPIPE:
rpc_task_force_reencode(task);
}
@@ -2057,12 +2057,13 @@ call_status(struct rpc_task *task)
case -ECONNABORTED:
rpc_force_rebind(clnt);
case -EADDRINUSE:
- case -ENOBUFS:
rpc_delay(task, 3*HZ);
case -EPIPE:
case -ENOTCONN:
task->tk_action = call_bind;
break;
+ case -ENOBUFS:
+ rpc_delay(task, HZ>>2);
case -EAGAIN:
task->tk_action = call_transmit;
break;
--
2.4.3
On Fri, 3 Jul 2015 09:49:37 -0400 Trond Myklebust
<[email protected]> wrote:
> Hi Neil,
>
> There should already be a handler for ENOBUFS in call_status, but
> I can see that it has a couple of flaws. What say we try to fix that
> instead?
>
> Cheers
> Trond
>
Hi Trond,
your patches make sense I think, but they are only part of a solution.
In the problem case the error comes from sk_stream_wait_memory, and
that returns EAGAIN, never ENOBUFS. So fixing the handling of ENOBUFS
won't be enough.
The call path is
xs_tcp_send_request -> xs_sendpages -> xs_send_pagedata ->
(sock->ops->sendpage == tcp_sendpage) -> do_tcp_sendpage ->
sk_stream_wait_memory
and EAGAIN travels all the way from bottom to top unmolested.
We could possibly change sk_stream_wait_memory to return ENOBUFS if
vm_wait is != 0, but:
- that could have other consequences so needs to go through netdev
and probably isn't a quick fix
- there could be other paths that don't return ENOBUFS - it really
don't seem that ENOBUFS appears all that much in 'net/' in places
where it would need to...
Maybe we could check and translate the error in xs_sendpages:
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 66891e32c5e3..8474d79ec2b2 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -431,6 +431,14 @@ out:
if (err > 0) {
*sent_p += err;
err = 0;
+ } else if (err == -EAGAIN) {
+ /* Might be wrong error code. */
+ if (sock->sk->sk_write_space == xs_tcp_write_space &&
+ sk_stream_is_writeable(sock->sk))
+ err = -ENOBUFS;
+ if (sock->sk->sk_write_space == xs_udp_write_space &&
+ sock_writeable(sock->sk))
+ err = -ENOBUFS;
}
return err;
}
Though that is a bit of a hack. If/when net-dev gets the correct error
returns, we can then remove that.
Though I'm beginning to wonder if ENOBUFS is the correct error code
anyway. "man 2 send" suggests ENOMEM, with ENOBUFS meaning:
The output queue for a network interface was full. This
generally indicates that the interface has stopped send-
ing, but may be caused by transient congestion. (Nor-
mally, this does not occur in Linux. Packets are just
silently dropped when a device queue overflows.)
So I'm not sure I feel to comfortable about relying on the exact error
code.
What do you think?
Thanks,
NeilBrown
Hi Neil,
On Sun, Jul 5, 2015 at 9:26 PM, NeilBrown <[email protected]> wrote:
> On Fri, 3 Jul 2015 09:49:37 -0400 Trond Myklebust
> <[email protected]> wrote:
>
>> Hi Neil,
>>
>> There should already be a handler for ENOBUFS in call_status, but
>> I can see that it has a couple of flaws. What say we try to fix that
>> instead?
>>
>> Cheers
>> Trond
>>
>
> Hi Trond,
> your patches make sense I think, but they are only part of a solution.
> In the problem case the error comes from sk_stream_wait_memory, and
> that returns EAGAIN, never ENOBUFS. So fixing the handling of ENOBUFS
> won't be enough.
>
> The call path is
> xs_tcp_send_request -> xs_sendpages -> xs_send_pagedata ->
> (sock->ops->sendpage == tcp_sendpage) -> do_tcp_sendpage ->
> sk_stream_wait_memory
>
> and EAGAIN travels all the way from bottom to top unmolested.
>
> We could possibly change sk_stream_wait_memory to return ENOBUFS if
> vm_wait is != 0, but:
> - that could have other consequences so needs to go through netdev
> and probably isn't a quick fix
> - there could be other paths that don't return ENOBUFS - it really
> don't seem that ENOBUFS appears all that much in 'net/' in places
> where it would need to...
>
> Maybe we could check and translate the error in xs_sendpages:
>
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index 66891e32c5e3..8474d79ec2b2 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -431,6 +431,14 @@ out:
> if (err > 0) {
> *sent_p += err;
> err = 0;
> + } else if (err == -EAGAIN) {
> + /* Might be wrong error code. */
> + if (sock->sk->sk_write_space == xs_tcp_write_space &&
> + sk_stream_is_writeable(sock->sk))
> + err = -ENOBUFS;
> + if (sock->sk->sk_write_space == xs_udp_write_space &&
> + sock_writeable(sock->sk))
> + err = -ENOBUFS;
> }
> return err;
> }
>
>
> Though that is a bit of a hack. If/when net-dev gets the correct error
> returns, we can then remove that.
>
> Though I'm beginning to wonder if ENOBUFS is the correct error code
> anyway. "man 2 send" suggests ENOMEM, with ENOBUFS meaning:
>
> The output queue for a network interface was full. This
> generally indicates that the interface has stopped send-
> ing, but may be caused by transient congestion. (Nor-
> mally, this does not occur in Linux. Packets are just
> silently dropped when a device queue overflows.)
>
> So I'm not sure I feel to comfortable about relying on the exact error
> code.
>
> What do you think?
The latest POSIX base specifications at
http://pubs.opengroup.org/onlinepubs/9699919799/functions/sendmsg.html
state that sendmsg() "may fail" if:
[ENOBUFS]
Insufficient resources were available in the system to perform the operation.
[ENOMEM]
Insufficient memory was available to fulfill the request.
which implies that there is no actual standard here. I therefore agree
that we should just massage those error messages into something that
works for us until the networking community figures out how they want
to standardise.
Cheers
Trond
On Mon, 6 Jul 2015 09:36:38 -0400 Trond Myklebust
<[email protected]> wrote:
> Hi Neil,
>
> On Sun, Jul 5, 2015 at 9:26 PM, NeilBrown <[email protected]> wrote:
> > On Fri, 3 Jul 2015 09:49:37 -0400 Trond Myklebust
> > <[email protected]> wrote:
> >
> >> Hi Neil,
> >>
> >> There should already be a handler for ENOBUFS in call_status, but
> >> I can see that it has a couple of flaws. What say we try to fix that
> >> instead?
> >>
> >> Cheers
> >> Trond
> >>
> >
> > Hi Trond,
> > your patches make sense I think, but they are only part of a solution.
> > In the problem case the error comes from sk_stream_wait_memory, and
> > that returns EAGAIN, never ENOBUFS. So fixing the handling of ENOBUFS
> > won't be enough.
> >
> > The call path is
> > xs_tcp_send_request -> xs_sendpages -> xs_send_pagedata ->
> > (sock->ops->sendpage == tcp_sendpage) -> do_tcp_sendpage ->
> > sk_stream_wait_memory
> >
> > and EAGAIN travels all the way from bottom to top unmolested.
> >
> > We could possibly change sk_stream_wait_memory to return ENOBUFS if
> > vm_wait is != 0, but:
> > - that could have other consequences so needs to go through netdev
> > and probably isn't a quick fix
> > - there could be other paths that don't return ENOBUFS - it really
> > don't seem that ENOBUFS appears all that much in 'net/' in places
> > where it would need to...
> >
> > Maybe we could check and translate the error in xs_sendpages:
> >
> > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > index 66891e32c5e3..8474d79ec2b2 100644
> > --- a/net/sunrpc/xprtsock.c
> > +++ b/net/sunrpc/xprtsock.c
> > @@ -431,6 +431,14 @@ out:
> > if (err > 0) {
> > *sent_p += err;
> > err = 0;
> > + } else if (err == -EAGAIN) {
> > + /* Might be wrong error code. */
> > + if (sock->sk->sk_write_space == xs_tcp_write_space &&
> > + sk_stream_is_writeable(sock->sk))
> > + err = -ENOBUFS;
> > + if (sock->sk->sk_write_space == xs_udp_write_space &&
> > + sock_writeable(sock->sk))
> > + err = -ENOBUFS;
> > }
> > return err;
> > }
> >
> >
> > Though that is a bit of a hack. If/when net-dev gets the correct error
> > returns, we can then remove that.
> >
> > Though I'm beginning to wonder if ENOBUFS is the correct error code
> > anyway. "man 2 send" suggests ENOMEM, with ENOBUFS meaning:
> >
> > The output queue for a network interface was full. This
> > generally indicates that the interface has stopped send-
> > ing, but may be caused by transient congestion. (Nor-
> > mally, this does not occur in Linux. Packets are just
> > silently dropped when a device queue overflows.)
> >
> > So I'm not sure I feel to comfortable about relying on the exact error
> > code.
> >
> > What do you think?
>
> The latest POSIX base specifications at
>
> http://pubs.opengroup.org/onlinepubs/9699919799/functions/sendmsg.html
>
> state that sendmsg() "may fail" if:
>
> [ENOBUFS]
> Insufficient resources were available in the system to perform the operation.
> [ENOMEM]
> Insufficient memory was available to fulfill the request.
>
> which implies that there is no actual standard here. I therefore agree
> that we should just massage those error messages into something that
> works for us until the networking community figures out how they want
> to standardise.
>
Works for me - thanks.
I'll create some fault-injection so I can experiment with this code
path properly and let you know how I go - probably based on the patches
you sent.
Thanks,
NeilBrown