2008-06-17 19:00:52

by Rainer Weikusat

[permalink] [raw]
Subject: [PATCH 2.6.25.7] af_unix: fix 'poll for write'/ connected DGRAM sockets

From: Rainer Weikusat <[email protected]>

The unix_dgram_sendmsg routine implements a (somewhat crude)
form of receiver-imposed flow control by comparing the length of the
receive queue of the 'peer socket' with the max_ack_backlog value
stored in the corresponding sock structure, either blocking
the thread which caused the send-routine to be called or returning
EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET
sockets. The poll-implementation for these socket types is
datagram_poll from core/datagram.c. A socket is deemed to be writeable
by this routine when the memory presently consumed by datagrams
owned by it is less than the configured socket send buffer size. This
is always wrong for connected PF_UNIX non-stream sockets when the
abovementioned receive queue is currently considered to be full.
'poll' will then return, indicating that the socket is writeable, but
a subsequent write will result in EAGAIN, effectively causing a
(typical) application to 'poll for writeability by repeated send requests
with O_NONBLOCK set' until it has consumed its time quantum.

The change below uses a suitably modified variant of the datagram_poll
routines for both types of PF_UNIX sockets, which tests if the
recv-queue of the peer a socket is connected to is presently
considered to be 'full' as part of the 'is this socket
writeable'-checking code. The socket being polled is additionally
put onto the peer_wait wait queue associated with its peer, because the
unix_dgram_sendmsg routine does a wake up on this queue after a
datagram was received and the 'other wakeup call' is done implicitly
as part of skb destruction, meaning, a process blocked in poll
because of a full peer receive queue could otherwise sleep forever
if no datagram owned by its socket was already sitting on this queue.
Among this change is a small (inline) helper routine named
'unix_recvq_full', which consolidates the actual testing code (in three
different places) into a single location.

Signed-off-by: <[email protected]>
---
diff -pru linux-2.6.25.7/net/unix/af_unix.c linux-2.6.25.7-patched/net/unix/af_unix.c
--- linux-2.6.25.7/net/unix/af_unix.c 2008-04-17 04:49:44.000000000 +0200
+++ linux-2.6.25.7-patched/net/unix/af_unix.c 2008-06-17 20:13:48.000000000 +0200
@@ -169,6 +169,11 @@ static inline int unix_may_send(struct s
return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
}

+static inline int unix_recvq_full(struct sock const *sk)
+{
+ return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
+}
+
static struct sock *unix_peer_get(struct sock *s)
{
struct sock *peer;
@@ -482,6 +487,8 @@ static int unix_socketpair(struct socket
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
+static unsigned int unix_datagram_poll(struct file *, struct socket *,
+ poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
@@ -527,7 +534,7 @@ static const struct proto_ops unix_dgram
.socketpair = unix_socketpair,
.accept = sock_no_accept,
.getname = unix_getname,
- .poll = datagram_poll,
+ .poll = unix_datagram_poll,
.ioctl = unix_ioctl,
.listen = sock_no_listen,
.shutdown = unix_shutdown,
@@ -548,7 +555,7 @@ static const struct proto_ops unix_seqpa
.socketpair = unix_socketpair,
.accept = unix_accept,
.getname = unix_getname,
- .poll = datagram_poll,
+ .poll = unix_datagram_poll,
.ioctl = unix_ioctl,
.listen = unix_listen,
.shutdown = unix_shutdown,
@@ -979,8 +986,7 @@ static long unix_wait_for_peer(struct so

sched = !sock_flag(other, SOCK_DEAD) &&
!(other->sk_shutdown & RCV_SHUTDOWN) &&
- (skb_queue_len(&other->sk_receive_queue) >
- other->sk_max_ack_backlog);
+ unix_recvq_full(other);

unix_state_unlock(other);

@@ -1054,8 +1060,7 @@ restart:
if (other->sk_state != TCP_LISTEN)
goto out_unlock;

- if (skb_queue_len(&other->sk_receive_queue) >
- other->sk_max_ack_backlog) {
+ if (unix_recvq_full(other)) {
err = -EAGAIN;
if (!timeo)
goto out_unlock;
@@ -1424,9 +1429,7 @@ restart:
goto out_unlock;
}

- if (unix_peer(other) != sk &&
- (skb_queue_len(&other->sk_receive_queue) >
- other->sk_max_ack_backlog)) {
+ if (unix_peer(other) != sk && unix_recvq_full(other)) {
if (!timeo) {
err = -EAGAIN;
goto out_unlock;
@@ -1987,6 +1990,64 @@ static unsigned int unix_poll(struct fil
return mask;
}

+static unsigned int unix_datagram_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk, *peer;
+ unsigned int mask;
+
+ poll_wait(file, sk->sk_sleep, wait);
+
+ peer = unix_peer_get(sk);
+ if (peer) {
+ if (peer != sk)
+ /*
+ * Writability of a connected socket additionally
+ * depends on the state of the receive queue of the
+ * peer.
+ */
+ poll_wait(file, &unix_sk(peer)->peer_wait, wait);
+ else {
+ sock_put(peer);
+ peer = NULL;
+ }
+ }
+
+ mask = 0;
+
+ /* exceptional events? */
+ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ mask |= POLLERR;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= POLLHUP;
+
+ /* readable? */
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ (sk->sk_shutdown & RCV_SHUTDOWN))
+ mask |= POLLIN | POLLRDNORM;
+
+ /* Connection-based need to check for termination and startup */
+ if (sk->sk_type == SOCK_SEQPACKET) {
+ if (sk->sk_state == TCP_CLOSE)
+ mask |= POLLHUP;
+ /* connection hasn't started yet? */
+ if (sk->sk_state == TCP_SYN_SENT)
+ return mask;
+ }
+
+ /* writable? */
+ if (unix_writable(sk) && !(peer && unix_recvq_full(peer)))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ else
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ if (peer)
+ sock_put(peer);
+
+ return mask;
+}

#ifdef CONFIG_PROC_FS
static struct sock *first_unix_socket(int *i)


2008-06-18 04:56:38

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 2.6.25.7] af_unix: fix 'poll for write'/ connected DGRAM sockets

From: Rainer Weikusat <[email protected]>
Date: Tue, 17 Jun 2008 20:47:02 +0200

> The unix_dgram_sendmsg routine implements a (somewhat crude)
> form of receiver-imposed flow control by comparing the length of the
> receive queue of the 'peer socket' with the max_ack_backlog value
> stored in the corresponding sock structure, either blocking
> the thread which caused the send-routine to be called or returning
> EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET
> sockets. The poll-implementation for these socket types is
> datagram_poll from core/datagram.c. A socket is deemed to be writeable
> by this routine when the memory presently consumed by datagrams
> owned by it is less than the configured socket send buffer size. This
> is always wrong for connected PF_UNIX non-stream sockets when the
> abovementioned receive queue is currently considered to be full.
> 'poll' will then return, indicating that the socket is writeable, but
> a subsequent write result in EAGAIN, effectively causing an
> (usual) application to 'poll for writeability by repeated send request
> with O_NONBLOCK set' until it has consumed its time quantum.
>
> The change below uses a suitably modified variant of the datagram_poll
> routines for both type of PF_UNIX sockets, which tests if the
> recv-queue of the peer a socket is connected to is presently
> considered to be 'full' as part of the 'is this socket
> writeable'-checking code. The socket being polled is additionally
> put onto the peer_wait wait queue associated with its peer, because the
> unix_dgram_sendmsg routine does a wake up on this queue after a
> datagram was received and the 'other wakeup call' is done implicitly
> as part of skb destruction, meaning, a process blocked in poll
> because of a full peer receive queue could otherwise sleep forever
> if no datagram owned by its socket was already sitting on this queue.
> Among this change is a small (inline) helper routine named
> 'unix_recvq_full', which consolidates the actual testing code (in three
> different places) into a single location.
>
> Signed-off-by: <[email protected]>

Thank you for fixing this bug.

I'm going to review the logic in the new poll routine a little
bit more, then apply it to net-2.6 unless I find some problems.

Thanks again.

2008-06-18 12:31:45

by Rainer Weikusat

[permalink] [raw]
Subject: Re: [PATCH 2.6.25.7] af_unix: fix 'poll for write'/ connected DGRAM sockets

David Miller <[email protected]> writes:
> From: Rainer Weikusat <[email protected]>
> Date: Tue, 17 Jun 2008 20:47:02 +0200
>> The unix_dgram_sendmsg routine implements

[...]

> I'm going to review the logic in the new poll routing a little
> bit more, then apply it to net-2.6 unless I find some problems.

Thank you for having a look at this. I have found a couple of problems
myself in the meantime:

- I misread the check in unix_dgram_sendmsg, which guards the
'receive queue' test: That actually tests if the peer of the
other socket is not the sending socket itself, ie it is true
if the sending socket is either unconnected or the destination
socket is the 1 in an n:1 setup (the well-known example of
this would be /dev/log. This happened to be my use
scenario, too).

- Assuming the socket wasn't writeable because it still owns
too many datagrams, it shouldn't be put onto the peer_wait
queue of the peer. If the destination socket consumes one of
the datagrams credited to the sending socket, the thread
will be woken up 'as usual'. Otherwise, there is no point in
waking it at all.

- Splitting the 'check for writeable' code in two parts, one
above and one below the other checks, was a stupid idea: The
only requirement is that the thread is added to the other
wait queue before it checks the state of the peer's
sk_receive_queue and I think having all this code together
makes it easier to understand.

- The naming is inconsistent with the other datagram socket
routines.

I will submit an updated patch which addresses this.

A couple of related-but-different oddities:

- What happens if a thread is blocked in poll and another
thread re-connects the socket to someone else? This should
presumably cause the other thread to be woken up if it is
presently queued on the peer wait queue.

- What happens if someone connects the other socket? Presently
(insofar I understand the code correctly),
nothing. unix_dgram_connect just sets the peer of a socket
which was unconnected before, despite its receive queue
may still contain packets sent to it before the connect,
which it (according to the comment above _disconnect),
shouldn't receive anymore (I have tested that this actually
happens with the help of perl).

2008-06-18 21:48:38

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 2.6.25.7] af_unix: fix 'poll for write'/ connected DGRAM sockets

From: Rainer Weikusat <[email protected]>
Date: Wed, 18 Jun 2008 14:31:07 +0200

> I will submit an updated patch which addresses this.

Please submit a relative patch with the fixups, your
original change is already in my tree.

2008-06-19 14:15:13

by Rainer Weikusat

[permalink] [raw]
Subject: [PATCH 2.6.25.7 v1-v2] af_unix: fix 'poll for write'/connected DGRAM sockets

From: Rainer Weikusat <[email protected]>

For n:1 'datagram connections' (eg /dev/log), the unix_dgram_sendmsg
routine implements a form of receiver-imposed flow control by
comparing the length of the receive queue of the 'peer socket' with
the max_ack_backlog value stored in the corresponding sock structure,
either blocking the thread which caused the send-routine to be called
or returning EAGAIN. This routine is used by both SOCK_DGRAM and
SOCK_SEQPACKET sockets. The poll-implementation for these socket types
is datagram_poll from core/datagram.c. A socket is deemed to be
writeable by this routine when the memory presently consumed by
datagrams owned by it is less than the configured socket send buffer
size. This is always wrong for PF_UNIX non-stream sockets connected to
server sockets dealing with (potentially) multiple clients if the
abovementioned receive queue is currently considered to be full.
'poll' will then return, indicating that the socket is writeable, but
a subsequent write will result in EAGAIN, effectively causing a
(typical) application to 'poll for writeability by repeated send requests
with O_NONBLOCK set' until it has consumed its time quantum.

The change below uses a suitably modified variant of the datagram_poll
routines for both types of PF_UNIX sockets, which tests if the
recv-queue of the peer a socket is connected to is presently
considered to be 'full' as part of the 'is this socket
writeable'-checking code. The socket being polled is additionally
put onto the peer_wait wait queue associated with its peer, because the
unix_dgram_recvmsg routine does a wake up on this queue after a
datagram was received and the 'other wakeup call' is done implicitly
as part of skb destruction, meaning, a process blocked in poll
because of a full peer receive queue could otherwise sleep forever
if no datagram owned by its socket was already sitting on this queue.
Among this change is a small (inline) helper routine named
'unix_recvq_full', which consolidates the actual testing code (in three
different places) into a single location.

Signed-off-by: <[email protected]>
---
diff -pru linux-2.6.25.7-old/net/unix/af_unix.c linux-2.6.25.7-new/net/unix/af_unix.c
--- linux-2.6.25.7-old/net/unix/af_unix.c 2008-06-19 11:31:36.000000000 +0200
+++ linux-2.6.25.7-new/net/unix/af_unix.c 2008-06-19 11:32:05.000000000 +0200
@@ -487,7 +487,7 @@ static int unix_socketpair(struct socket
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
-static unsigned int unix_datagram_poll(struct file *, struct socket *,
+static unsigned int unix_dgram_poll(struct file *, struct socket *,
poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
@@ -534,7 +534,7 @@ static const struct proto_ops unix_dgram
.socketpair = unix_socketpair,
.accept = sock_no_accept,
.getname = unix_getname,
- .poll = unix_datagram_poll,
+ .poll = unix_dgram_poll,
.ioctl = unix_ioctl,
.listen = sock_no_listen,
.shutdown = unix_shutdown,
@@ -555,7 +555,7 @@ static const struct proto_ops unix_seqpa
.socketpair = unix_socketpair,
.accept = unix_accept,
.getname = unix_getname,
- .poll = unix_datagram_poll,
+ .poll = unix_dgram_poll,
.ioctl = unix_ioctl,
.listen = unix_listen,
.shutdown = unix_shutdown,
@@ -1990,29 +1990,13 @@ static unsigned int unix_poll(struct fil
return mask;
}

-static unsigned int unix_datagram_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
{
- struct sock *sk = sock->sk, *peer;
- unsigned int mask;
+ struct sock *sk = sock->sk, *other;
+ unsigned int mask, writable;

poll_wait(file, sk->sk_sleep, wait);
-
- peer = unix_peer_get(sk);
- if (peer) {
- if (peer != sk)
- /*
- * Writability of a connected socket additionally
- * depends on the state of the receive queue of the
- * peer.
- */
- poll_wait(file, &unix_sk(peer)->peer_wait, wait);
- else {
- sock_put(peer);
- peer = NULL;
- }
- }
-
mask = 0;

/* exceptional events? */
@@ -2038,14 +2022,26 @@ static unsigned int unix_datagram_poll(s
}

/* writable? */
- if (unix_writable(sk) && !(peer && unix_recvq_full(peer)))
+ writable = unix_writable(sk);
+ if (writable) {
+ other = unix_peer_get(sk);
+ if (other) {
+ if (unix_peer(other) != sk) {
+ poll_wait(file, &unix_sk(other)->peer_wait,
+ wait);
+ if (unix_recvq_full(other))
+ writable = 0;
+ }
+
+ sock_put(other);
+ }
+ }
+
+ if (writable)
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

- if (peer)
- sock_put(peer);
-
return mask;
}

2008-06-20 13:35:49

by Rainer Weikusat

[permalink] [raw]
Subject: Re: [PATCH 2.6.25.7 v1-v2] af_unix: fix 'poll for write'/connected DGRAM sockets

From: Rainer Weikusat <[email protected]>

For n:1 'datagram connections' (eg /dev/log), the unix_dgram_sendmsg
routine implements a form of receiver-imposed flow control by
comparing the length of the receive queue of the 'peer socket' with
the max_ack_backlog value stored in the corresponding sock structure,
either blocking the thread which caused the send-routine to be called
or returning EAGAIN. This routine is used by both SOCK_DGRAM and
SOCK_SEQPACKET sockets. The poll-implementation for these socket types
is datagram_poll from core/datagram.c. A socket is deemed to be
writeable by this routine when the memory presently consumed by
datagrams owned by it is less than the configured socket send buffer
size. This is always wrong for PF_UNIX non-stream sockets connected to
server sockets dealing with (potentially) multiple clients if the
abovementioned receive queue is currently considered to be full.
'poll' will then return, indicating that the socket is writeable, but
a subsequent write will result in EAGAIN, effectively causing a
(typical) application to 'poll for writeability by repeated send requests
with O_NONBLOCK set' until it has consumed its time quantum.

The change below uses a suitably modified variant of the datagram_poll
routines for both types of PF_UNIX sockets, which tests if the
recv-queue of the peer a socket is connected to is presently
considered to be 'full' as part of the 'is this socket
writeable'-checking code. The socket being polled is additionally
put onto the peer_wait wait queue associated with its peer, because the
unix_dgram_recvmsg routine does a wake up on this queue after a
datagram was received and the 'other wakeup call' is done implicitly
as part of skb destruction, meaning, a process blocked in poll
because of a full peer receive queue could otherwise sleep forever
if no datagram owned by its socket was already sitting on this queue.
Among this change is a small (inline) helper routine named
'unix_recvq_full', which consolidates the actual testing code (in three
different places) into a single location.

Signed-off-by: Rainer Weikusat <[email protected]>
---
--- linux-2.6/net/unix/af_unix.c 2008-06-20 12:43:08.000000000 +0200
+++ linux-2.6-new/net/unix/af_unix.c 2008-06-20 13:03:07.000000000 +0200
@@ -487,8 +487,8 @@ static int unix_socketpair(struct socket
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
-static unsigned int unix_datagram_poll(struct file *, struct socket *,
- poll_table *);
+static unsigned int unix_dgram_poll(struct file *, struct socket *,
+ poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
@@ -534,7 +534,7 @@ static const struct proto_ops unix_dgram
.socketpair = unix_socketpair,
.accept = sock_no_accept,
.getname = unix_getname,
- .poll = unix_datagram_poll,
+ .poll = unix_dgram_poll,
.ioctl = unix_ioctl,
.listen = sock_no_listen,
.shutdown = unix_shutdown,
@@ -555,7 +555,7 @@ static const struct proto_ops unix_seqpa
.socketpair = unix_socketpair,
.accept = unix_accept,
.getname = unix_getname,
- .poll = unix_datagram_poll,
+ .poll = unix_dgram_poll,
.ioctl = unix_ioctl,
.listen = unix_listen,
.shutdown = unix_shutdown,
@@ -1994,29 +1994,13 @@ static unsigned int unix_poll(struct fil
return mask;
}

-static unsigned int unix_datagram_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
{
- struct sock *sk = sock->sk, *peer;
- unsigned int mask;
+ struct sock *sk = sock->sk, *other;
+ unsigned int mask, writable;

poll_wait(file, sk->sk_sleep, wait);
-
- peer = unix_peer_get(sk);
- if (peer) {
- if (peer != sk) {
- /*
- * Writability of a connected socket additionally
- * depends on the state of the receive queue of the
- * peer.
- */
- poll_wait(file, &unix_sk(peer)->peer_wait, wait);
- } else {
- sock_put(peer);
- peer = NULL;
- }
- }
-
mask = 0;

/* exceptional events? */
@@ -2042,14 +2026,26 @@ static unsigned int unix_datagram_poll(s
}

/* writable? */
- if (unix_writable(sk) && !(peer && unix_recvq_full(peer)))
+ writable = unix_writable(sk);
+ if (writable) {
+ other = unix_peer_get(sk);
+ if (other) {
+ if (unix_peer(other) != sk) {
+ poll_wait(file, &unix_sk(other)->peer_wait,
+ wait);
+ if (unix_recvq_full(other))
+ writable = 0;
+ }
+
+ sock_put(other);
+ }
+ }
+
+ if (writable)
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

- if (peer)
- sock_put(peer);
-
return mask;
}

2008-06-28 02:34:45

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 2.6.25.7 v1-v2] af_unix: fix 'poll for write'/connected DGRAM sockets

From: Rainer Weikusat <[email protected]>
Date: Fri, 20 Jun 2008 15:35:25 +0200

> For n:1 'datagram connections' (eg /dev/log), the unix_dgram_sendmsg
> routine implements a form of receiver-imposed flow control by
> comparing the length of the receive queue of the 'peer socket' with
> the max_ack_backlog value stored in the corresponding sock structure,
> either blocking the thread which caused the send-routine to be called
> or returning EAGAIN. This routine is used by both SOCK_DGRAM and
> SOCK_SEQPACKET sockets. The poll-implementation for these socket types
> is datagram_poll from core/datagram.c. A socket is deemed to be
> writeable by this routine when the memory presently consumed by
> datagrams owned by it is less than the configured socket send buffer
> size. This is always wrong for PF_UNIX non-stream sockets connected to
> server sockets dealing with (potentially) multiple clients if the
> abovementioned receive queue is currently considered to be full.
> 'poll' will then return, indicating that the socket is writeable, but
> a subsequent write result in EAGAIN, effectively causing an (usual)
> application to 'poll for writeability by repeated send request with
> O_NONBLOCK set' until it has consumed its time quantum.
>
> The change below uses a suitably modified variant of the datagram_poll
> routines for both type of PF_UNIX sockets, which tests if the
> recv-queue of the peer a socket is connected to is presently
> considered to be 'full' as part of the 'is this socket
> writeable'-checking code. The socket being polled is additionally
> put onto the peer_wait wait queue associated with its peer, because the
> unix_dgram_recvmsg routine does a wake up on this queue after a
> datagram was received and the 'other wakeup call' is done implicitly
> as part of skb destruction, meaning, a process blocked in poll
> because of a full peer receive queue could otherwise sleep forever
> if no datagram owned by its socket was already sitting on this queue.
> Among this change is a small (inline) helper routine named
> 'unix_recvq_full', which consolidates the actual testing code (in three
> different places) into a single location.
>
> Signed-off-by: Rainer Weikusat <[email protected]>

Applied, thanks a lot Rainer.