Adding a memory barrier to the __pollwait function, paired with barriers
in the receive callbacks. The smp_mb__after_lock define is added, since
{read|write|spin}_lock() on x86 are already full memory barriers and the
barrier can be a no-op there.
The race fires when the following code paths meet and the tp->rcv_nxt and
__add_wait_queue updates stay in the CPU caches.
CPU1                          CPU2

sys_select                    receive packet
  ...                           ...
  __add_wait_queue              update tp->rcv_nxt
  ...                           ...
  tp->rcv_nxt check             sock_def_readable
  ...                           {
  schedule                        ...
                                  if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                                          wake_up_interruptible(sk->sk_sleep)
                                  ...
                                }
If there were no caching, the code would work correctly, since the wait_queue
and rcv_nxt accesses happen in the opposite order on each CPU.
Meaning that once tp->rcv_nxt is updated by CPU2, CPU1 has either already
passed the tp->rcv_nxt check and gone to sleep, or it will see the new value
of tp->rcv_nxt and return with the new data mask.
In both cases the process (CPU1) has already been added to the wait queue, so
the waitqueue_active (CPU2) check cannot miss it and will wake up CPU1.
The bad case is when the __add_wait_queue changes done by CPU1 stay in its
cache, and so does the tp->rcv_nxt update on the CPU2 side. CPU1 will then
end up calling schedule and sleeping forever if no more data arrive on the
socket.
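
In barrier terms, the two sides end up paired roughly like this (a simplified
sketch, not the literal patched code; the readable test is reduced to a
schematic rcv_nxt/copied_seq comparison):

        /* CPU1: __pollwait() plus its ->poll() caller (sleeper side) */
        add_wait_queue(sk->sk_sleep, &entry->wait);
        smp_mb();                               /* added in __pollwait()         */
        if (tp->rcv_nxt == tp->copied_seq)      /* no data visible yet           */
                schedule();                     /* safe: CPU2 sees the sleeper   */

        /* CPU2: receive path plus sock_def_readable() (waker side) */
        tp->rcv_nxt = end_seq;                  /* new data queued               */
        read_lock(&sk->sk_callback_lock);
        smp_mb__after_lock();                   /* real smp_mb() unless the lock */
                                                /* is already a full barrier     */
        if (waitqueue_active(sk->sk_sleep))     /* safe: sees the queued sleeper */
                wake_up_interruptible(sk->sk_sleep);
        read_unlock(&sk->sk_callback_lock);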
wbr,
jirka
Signed-off-by: Jiri Olsa <[email protected]>
---
arch/x86/include/asm/spinlock.h | 3 +++
fs/select.c | 4 ++++
include/linux/spinlock.h | 5 +++++
include/net/sock.h | 18 ++++++++++++++++++
net/atm/common.c | 4 ++--
net/core/sock.c | 8 ++++----
net/dccp/output.c | 2 +-
net/iucv/af_iucv.c | 2 +-
net/rxrpc/af_rxrpc.c | 2 +-
net/unix/af_unix.c | 2 +-
10 files changed, 40 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index b7e5db8..39ecc5f 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -302,4 +302,7 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
#define _raw_read_relax(lock) cpu_relax()
#define _raw_write_relax(lock) cpu_relax()
+/* The {read|write|spin}_lock() on x86 are full memory barriers. */
+#define smp_mb__after_lock() do { } while (0)
+
#endif /* _ASM_X86_SPINLOCK_H */
diff --git a/fs/select.c b/fs/select.c
index d870237..c4bd5f0 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -219,6 +219,10 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
init_waitqueue_func_entry(&entry->wait, pollwake);
entry->wait.private = pwq;
add_wait_queue(wait_address, &entry->wait);
+
+ /* This memory barrier is paired with the smp_mb__after_lock
+ * in the sk_has_sleeper. */
+ smp_mb();
}
int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 252b245..ae053bd 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -132,6 +132,11 @@ do { \
#endif /*__raw_spin_is_contended*/
#endif
+/* The lock does not imply full memory barrier. */
+#ifndef smp_mb__after_lock
+#define smp_mb__after_lock() smp_mb()
+#endif
+
/**
* spin_unlock_wait - wait until the spinlock gets unlocked
* @lock: the spinlock in question.
diff --git a/include/net/sock.h b/include/net/sock.h
index 352f06b..7fbb143 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1241,6 +1241,24 @@ static inline int sk_has_allocations(const struct sock *sk)
return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk);
}
+/**
+ * sk_has_sleeper - check if there are any waiting processes
+ * @sk: socket
+ *
+ * Returns true if socket has waiting processes
+ */
+static inline int sk_has_sleeper(struct sock *sk)
+{
+ /*
+ * We need to be sure we are in sync with the
+ * add_wait_queue modifications to the wait queue.
+ *
+ * This memory barrier is paired in the __pollwait.
+ */
+ smp_mb__after_lock();
+ return sk->sk_sleep && waitqueue_active(sk->sk_sleep);
+}
+
/*
* Queue a received datagram if it will fit. Stream and sequenced
* protocols can't normally use this as they need to fit buffers in
diff --git a/net/atm/common.c b/net/atm/common.c
index c1c9793..67a8642 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -92,7 +92,7 @@ static void vcc_sock_destruct(struct sock *sk)
static void vcc_def_wakeup(struct sock *sk)
{
read_lock(&sk->sk_callback_lock);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ if (sk_has_sleeper(sk))
wake_up(sk->sk_sleep);
read_unlock(&sk->sk_callback_lock);
}
@@ -110,7 +110,7 @@ static void vcc_write_space(struct sock *sk)
read_lock(&sk->sk_callback_lock);
if (vcc_writable(sk)) {
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ if (sk_has_sleeper(sk))
wake_up_interruptible(sk->sk_sleep);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
diff --git a/net/core/sock.c b/net/core/sock.c
index b0ba569..6354863 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1715,7 +1715,7 @@ EXPORT_SYMBOL(sock_no_sendpage);
static void sock_def_wakeup(struct sock *sk)
{
read_lock(&sk->sk_callback_lock);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ if (sk_has_sleeper(sk))
wake_up_interruptible_all(sk->sk_sleep);
read_unlock(&sk->sk_callback_lock);
}
@@ -1723,7 +1723,7 @@ static void sock_def_wakeup(struct sock *sk)
static void sock_def_error_report(struct sock *sk)
{
read_lock(&sk->sk_callback_lock);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ if (sk_has_sleeper(sk))
wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
read_unlock(&sk->sk_callback_lock);
@@ -1732,7 +1732,7 @@ static void sock_def_error_report(struct sock *sk)
static void sock_def_readable(struct sock *sk, int len)
{
read_lock(&sk->sk_callback_lock);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ if (sk_has_sleeper(sk))
wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
POLLRDNORM | POLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
@@ -1747,7 +1747,7 @@ static void sock_def_write_space(struct sock *sk)
* progress. --DaveM
*/
if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ if (sk_has_sleeper(sk))
wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
POLLWRNORM | POLLWRBAND);
diff --git a/net/dccp/output.c b/net/dccp/output.c
index c0e88c1..c96119f 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -196,7 +196,7 @@ void dccp_write_space(struct sock *sk)
{
read_lock(&sk->sk_callback_lock);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ if (sk_has_sleeper(sk))
wake_up_interruptible(sk->sk_sleep);
/* Should agree with poll, otherwise some programs break */
if (sock_writeable(sk))
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 6be5f92..ba0149d 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -306,7 +306,7 @@ static inline int iucv_below_msglim(struct sock *sk)
static void iucv_sock_wake_msglim(struct sock *sk)
{
read_lock(&sk->sk_callback_lock);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ if (sk_has_sleeper(sk))
wake_up_interruptible_all(sk->sk_sleep);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
read_unlock(&sk->sk_callback_lock);
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index eac5e7b..60e0e38 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -63,7 +63,7 @@ static void rxrpc_write_space(struct sock *sk)
_enter("%p", sk);
read_lock(&sk->sk_callback_lock);
if (rxrpc_writable(sk)) {
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ if (sk_has_sleeper(sk))
wake_up_interruptible(sk->sk_sleep);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 36d4e44..143143a 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -315,7 +315,7 @@ static void unix_write_space(struct sock *sk)
{
read_lock(&sk->sk_callback_lock);
if (unix_writable(sk)) {
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ if (sk_has_sleeper(sk))
wake_up_interruptible_sync(sk->sk_sleep);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
Can't really comment this patch, except this all looks reasonable to me.
Add more CCs.
On 06/25, Jiri Olsa wrote:
>
> Adding a memory barrier to the __pollwait function, paired with barriers
> in the receive callbacks. [...]
Jiri Olsa wrote:
>
> Adding a memory barrier to the __pollwait function, paired with barriers
> in the receive callbacks. [...]
>
> Signed-off-by: Jiri Olsa <[email protected]>
Patch seems fine to me, thanks a lot Jiri!
Signed-off-by: Eric Dumazet <[email protected]>
On Thu, 25 Jun 2009, Oleg Nesterov wrote:
> Can't really comment this patch, except this all looks reasonable to me.
> Add more CCs.
While this can work, IMO it'd be cleaner to have the smp_mb() moved from
fs/select.c to the ->poll() function.
Having a barrier that matches another one in another subsystem, because
of the special locking logic of that subsystem, is not too shiny IMHO.
- Davide
Hello,
Jiri Olsa wrote:
> Adding a memory barrier to the __pollwait function, paired with barriers
> in the receive callbacks. [...]
>
> The bad case is when the __add_wait_queue changes done by CPU1 stay in its
> cache, and so does the tp->rcv_nxt update on the CPU2 side. CPU1 will then
> end up calling schedule and sleeping forever if no more data arrive on the
> socket.
So, the problem is the half-barrier semantics of spin_lock on CPU1's
side (the read for the tp->rcv_nxt check can creep above the wait queue
list update) and the lack of any barrier in the waitqueue_active() check
on CPU2's side (the read of the wait queue list can creep above the
tp->rcv_nxt update). Am I understanding it right?
This is a little bit scary. The interface kind of suggests that they
have strong enough barrier semantics (well, I would assume that). I
wonder whether there are more places where this kind of race condition
exists.
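
Reduced to the two accesses that matter on each side, the unpatched code is
essentially that pattern (a schematic sketch, with the surrounding locking
and the real mask computation stripped away):

        /* CPU1 (poller), unpatched, schematic */
        __add_wait_queue(sk->sk_sleep, &entry->wait);   /* store: queue the sleeper    */
        seq = tp->rcv_nxt;                              /* load: may be satisfied      */
                                                        /* before the store above is   */
                                                        /* visible to CPU2             */

        /* CPU2 (receiver), unpatched, schematic */
        tp->rcv_nxt = end_seq;                          /* store: new data available   */
        active = waitqueue_active(sk->sk_sleep);        /* load: may be satisfied      */
                                                        /* before the store above is   */
                                                        /* visible to CPU1             */

If both loads observe the old values, CPU1 goes to sleep and CPU2 skips the
wake-up.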
> @@ -219,6 +219,10 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
> init_waitqueue_func_entry(&entry->wait, pollwake);
> entry->wait.private = pwq;
> add_wait_queue(wait_address, &entry->wait);
> +
> + /* This memory barrier is paired with the smp_mb__after_lock
> + * in the sk_has_sleeper. */
> + smp_mb();
I'm not entirely sure this is the correct place to do it while the mb
for the other side lives in the network code. Wouldn't it be better to
move this memory barrier to the network select code? It's strange for an
API to have only a single side of a barrier pair and leave the other to
the API user.
Also, maybe we need smp_mb__after_unlock() too? Maybe
add_wait_queue_mb() possibly paired with wait_queue_active_mb() is
better? On x86, it wouldn't make any difference tho.
One more thing, can you please use fully winged style for multiline
comments?
Thanks.
--
tejun
Davide Libenzi wrote:
> On Thu, 25 Jun 2009, Oleg Nesterov wrote:
>
>> Can't really comment this patch, except this all looks reasonable to me.
>> Add more CCs.
>
> While this can work, IMO it'd be cleaner to have the smp_mb() moved from
> fs/select.c to the ->poll() function.
> Having a barrier that matches another one in another susbsystem, because
> of the special locking logic of such subsystem, is not too shiny IMHO.
>
Yes, but the barrier is necessary only if add_wait_queue() was actually
called, and __pollwait() is where that call happens.
Adding a plain smp_mb() in tcp_poll(), for example, would slow down
select()/poll() with a NULL timeout.
Adding a conditional test before the smp_mb() in tcp_poll() (and the other
->poll() functions) would be a little bit of overkill too...
I believe this race did not exist in the past because spin_unlock() had a
memory barrier, and we changed this to a plain memory write at some point...
Most add_wait_queue() calls are followed by a call to set_current_state(),
so a proper smp_mb() is explicitly included.
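
The classic open-coded wait loop in question looks like this (a generic
sketch, not any specific caller); the barrier comes for free from
set_current_state():

        add_wait_queue(&wq, &wait);
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE); /* implies smp_mb(): orders the */
                                                       /* queueing and the state store */
                                                       /* against the check below      */
                if (condition)
                        break;
                schedule();
        }
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&wq, &wait);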
On Fri, 26 Jun 2009, Eric Dumazet wrote:
> Davide Libenzi wrote:
> > On Thu, 25 Jun 2009, Oleg Nesterov wrote:
> >
> >> Can't really comment this patch, except this all looks reasonable to me.
> >> Add more CCs.
> >
> > While this can work, IMO it'd be cleaner to have the smp_mb() moved from
> > fs/select.c to the ->poll() function.
> > Having a barrier that matches another one in another susbsystem, because
> > of the special locking logic of such subsystem, is not too shiny IMHO.
> >
>
> Yes but barrier is necessary only if add_wait_queue() was actually called, and __pollwait()
> does this call.
>
> Adding a plain smp_mb() in tcp_poll() for example would slowdown select()/poll() with NULL
> timeout.
Do you think of it as good design adding an MB on a subsystem, because of
the special locking logic of another one?
The (eventual) slowdown, IMO can be argued sideways, by saying that
non-socket users will pay the price for their polls.
- Davide
From: Davide Libenzi <[email protected]>
Date: Thu, 25 Jun 2009 19:04:04 -0700 (PDT)
> On Fri, 26 Jun 2009, Eric Dumazet wrote:
>
>> Adding a plain smp_mb() in tcp_poll() for example would slowdown select()/poll() with NULL
>> timeout.
>
> Do you think of it as good design adding an MB on a subsystem, because of
> the special locking logic of another one?
> The (eventual) slowdown, IMO can be argued sideways, by saying that
> non-socket users will pay the price for their polls.
Perhaps every performance argument is moot since spin_unlock() used to
have the barrier :-)
Davide Libenzi wrote:
> On Fri, 26 Jun 2009, Eric Dumazet wrote:
>
>> Davide Libenzi wrote:
>>> On Thu, 25 Jun 2009, Oleg Nesterov wrote:
>>>
>>>> Can't really comment this patch, except this all looks reasonable to me.
>>>> Add more CCs.
>>> While this can work, IMO it'd be cleaner to have the smp_mb() moved from
>>> fs/select.c to the ->poll() function.
>>> Having a barrier that matches another one in another susbsystem, because
>>> of the special locking logic of such subsystem, is not too shiny IMHO.
>>>
>> Yes but barrier is necessary only if add_wait_queue() was actually called, and __pollwait()
>> does this call.
>>
>> Adding a plain smp_mb() in tcp_poll() for example would slowdown select()/poll() with NULL
>> timeout.
>
> Do you think of it as good design adding an MB on a subsystem, because of
> the special locking logic of another one?
> The (eventual) slowdown, IMO can be argued sideways, by saying that
> non-socket users will pay the price for their polls.
>
I won't argue with you David, just trying to correct bugs.
fs/ext4/ioctl.c line 182
set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
schedule();
Another example of a missing barrier after add_wait_queue().
Because add_wait_queue() lacks a barrier, we have to add one after each call.
Maybe it would be safer to add the barrier in add_wait_queue() itself, not in __pollwait().
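
That would look roughly like this (a sketch only; the real add_wait_queue()
simply clears the exclusive flag and queues the entry under q->lock, without
the trailing barrier):

        void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
        {
                unsigned long flags;

                wait->flags &= ~WQ_FLAG_EXCLUSIVE;
                spin_lock_irqsave(&q->lock, flags);
                __add_wait_queue(q, wait);
                spin_unlock_irqrestore(&q->lock, flags);
                smp_mb();       /* hypothetical: order the queueing against     */
                                /* whatever lockless check the caller does next */
        }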
On Fri, 26 Jun 2009, Eric Dumazet wrote:
> I wont argue with you David, just try to correct bugs.
>
> fs/ext4/ioctl.c line 182
>
> set_current_state(TASK_INTERRUPTIBLE);
> add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
> if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
> schedule();
>
> Another example of missing barrier after add_wait_queue()
>
> Because add_wait_queue() misses a barrier, we have to add one after each call.
>
> Maybe it would be safer to add barrier in add_wait_queue() itself, not in _pollwait().
Not all the code that uses add_wait_queue() needs to have the MB,
like code that follows the most common pattern:
xxx_poll(...) {
poll_wait(...);
lock();
flags = calc_flags(->status);
unlock();
return flags;
}
xxx_update(...) {
lock();
->status = ...;
unlock();
if (waitqueue_active())
wake_up();
}
It's the code that does the lockless flags calculation in ->poll that
might need it.
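
A sketch of that lockless variant, in the same pseudo style, showing where
the two barriers would have to pair up:

        xxx_poll(...) {
                poll_wait(...);                 /* queues the waiter under q->lock */
                smp_mb();                       /* order the queueing against the  */
                                                /* lockless ->status read below    */
                flags = calc_flags(->status);   /* no lock taken here              */
                return flags;
        }

        xxx_update(...) {
                ->status = ...;                 /* no lock taken here               */
                smp_mb();                       /* order the ->status write against */
                                                /* the waitqueue_active() read      */
                if (waitqueue_active())
                        wake_up();
        }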
I dunno what the amount of changes are, but cross-matching MB across
subsystems does not look nice.
IMHO that's a detail of the subsystem locking, and should be confined
inside the subsystem itself.
No?
- Davide
On 26-06-2009 05:14, Davide Libenzi wrote:
> On Fri, 26 Jun 2009, Eric Dumazet wrote:
>
>> I wont argue with you David, just try to correct bugs.
>>
>> fs/ext4/ioctl.c line 182
>>
>> set_current_state(TASK_INTERRUPTIBLE);
>> add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
>> if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
>> schedule();
>>
>> Another example of missing barrier after add_wait_queue()
>>
>> Because add_wait_queue() misses a barrier, we have to add one after each call.
>>
>> Maybe it would be safer to add barrier in add_wait_queue() itself, not in _pollwait().
>
> Not all the code that uses add_wait_queue() does need to have the MB,
> like code that does the most common pattern:
>
> xxx_poll(...) {
> poll_wait(...);
> lock();
> flags = calc_flags(->status);
> unlock();
> return flags;
> }
>
> xxx_update(...) {
> lock();
> ->status = ...;
> unlock();
> if (waitqueue_active())
> wake_up();
> }
>
> It's the code that does the lockless flags calculation in ->poll that
> might need it.
> I dunno what the amount of changes are, but cross-matching MB across
> subsystems does not look nice.
> IMHO that's a detail of the subsystem locking, and should be confined
> inside the subsystem itself.
> No?
How about poll_wait_mb() and waitqueue_active_mb() (with an mb and an
additional NULL check of the wait_queue_head)?
Jarek P.
On Fri, Jun 26, 2009 at 05:42:30AM +0000, Jarek Poplawski wrote:
>
> How about poll_wait_mb() and waitqueue_active_mb() (with mb and
> additional check for NULL of wait_queue_head)?
Hmm... But considering Eric's arguments I see it would be hard/impossible
to do it with the current API, so let's forget it.
Jarek P.
On 06/26, Eric Dumazet wrote:
>
> Davide Libenzi wrote:
> >
> > Do you think of it as good design adding an MB on a subsystem, because of
> > the special locking logic of another one?
> > The (eventual) slowdown, IMO can be argued sideways, by saying that
> > non-socket users will pay the price for their polls.
>
> I wont argue with you David, just try to correct bugs.
I must admit, I agree with David.
> fs/ext4/ioctl.c line 182
>
> set_current_state(TASK_INTERRUPTIBLE);
> add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
> if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
> schedule();
>
> Another example of missing barrier after add_wait_queue()
Assuming that ->turn_ro_timer does wake_up(->ro_wait_queue) everything
is OK, we do not need a barrier.
Oleg.
On 06/25, Davide Libenzi wrote:
>
> Not all the code that uses add_wait_queue() does need to have the MB,
> like code that does the most common pattern:
>
> xxx_poll(...) {
> poll_wait(...);
> lock();
> flags = calc_flags(->status);
> unlock();
> return flags;
> }
>
> xxx_update(...) {
> lock();
> ->status = ...;
> unlock();
> if (waitqueue_active())
> wake_up();
> }
>
> It's the code that does the lockless flags calculation in ->poll that
> might need it.
And if we remove waitqueue_active() in xxx_update(), then the lock/unlock is
not needed either.
If xxx_poll() takes q->lock first, it can safely miss the changes in ->status
and schedule(): xxx_update() will take q->lock, notice the sleeper and wake
it up (ok, it will set ->triggered but this doesn't matter).
If xxx_update() takes q->lock first, xxx_poll() must see the changes in
status after poll_wait()->unlock(&q->lock) (in fact, after lock, not unlock).
Oleg.
On Fri, 26 Jun 2009, Oleg Nesterov wrote:
> And if we remove waitqueue_active() in xxx_update(), then lock/unlock is
> not needed too.
>
> If xxx_poll() takes q->lock first, it can safely miss the changes in ->status
> and schedule(): xxx_update() will take q->lock, notice the sleeper and wake
> it up (ok, it will set ->triggered but this doesn't matter).
>
> If xxx_update() takes q->lock first, xxx_poll() must see the changes in
> status after poll_wait()->unlock(&q->lock) (in fact, after lock, not unlock).
Sure. The snippet above was just to show what typically the code does, not
a suggestion on how to solve the socket case.
But yeah, the problem in this case is the waitqueue_active() call. Without
that, the wait queue lock/unlock in poll_wait() and the one in wake_up()
guarantee the necessary barriers.
Some might argue about the costs of the lock/unlock of q->lock, and wonder if
MBs are a more efficient solution. This is something I'm not going into.
To me, it just looked not right having cross-matching MB in different
subsystems.
- Davide
On 06/26, Davide Libenzi wrote:
>
> On Fri, 26 Jun 2009, Oleg Nesterov wrote:
>
> > And if we remove waitqueue_active() in xxx_update(), then lock/unlock is
> > not needed too.
> >
> > If xxx_poll() takes q->lock first, it can safely miss the changes in ->status
> > and schedule(): xxx_update() will take q->lock, notice the sleeper and wake
> > it up (ok, it will set ->triggered but this doesn't matter).
> >
> > If xxx_update() takes q->lock first, xxx_poll() must see the changes in
> > status after poll_wait()->unlock(&q->lock) (in fact, after lock, not unlock).
>
> Sure. The snippet above was just to show what typically the code does, not
> a suggestion on how to solve the socket case.
Yes, yes. I just meant you are right imho, we shouldn't add mb() into
add_wait_queue().
> But yeah, the problem in this case is the waitqueue_active() call. Without
> that, the wait queue lock/unlock in poll_wait() and the one in wake_up()
> guarantees the necessary barriers.
> Some might argue the costs of the lock/unlock of q->lock, and wonder if
> MBs are a more efficient solution. This is something I'm not going into.
> To me, it just looked not right having cross-matching MB in different
> subsystems.
This is subjective and thus up to maintainers, but personally I think you
are very, very right.
Perhaps we can add
void sock_poll_wait(struct file *file, struct sock *sk, poll_table *pt)
{
        if (pt) {
                poll_wait(file, sk->sk_sleep, pt);
                /*
                 * fat comment
                 */
                smp_mb(); // or smp_mb__after_unlock();
        }
}
Oleg.
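A caller would then look roughly like this (a sketch assuming the helper
keeps the signature above; the tcp_poll() body is abbreviated):

static unsigned int tcp_poll(struct file *file, struct socket *sock,
                             poll_table *wait)
{
        struct sock *sk = sock->sk;
        unsigned int mask = 0;

        /*
         * Replaces poll_wait(file, sk->sk_sleep, wait); the smp_mb() in
         * sock_poll_wait() orders the __add_wait_queue() against the
         * lockless tp->rcv_nxt checks further down.
         */
        sock_poll_wait(file, sk, wait);

        /* ... the existing lockless mask computation stays as it is ... */
        return mask;
}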
On Fri, 26 Jun 2009, Oleg Nesterov wrote:
> Perhaps we can add
>
> void sock_poll_wait(struct file *file, struct sock *sk, poll_table *pt)
> {
> if (pt) {
> poll_wait(file, sk->sk_sleep, pt);
> /*
> * fat comment
> */
> smp_mb(); // or smp_mb__after_unlock();
> }
> }
That'd be fine IMHO. Are DaveM and Eric OK?
- Davide
From: Davide Libenzi <[email protected]>
Date: Fri, 26 Jun 2009 11:12:15 -0700 (PDT)
> On Fri, 26 Jun 2009, Oleg Nesterov wrote:
>
>> Perhaps we can add
>>
>> void sock_poll_wait(struct file *file, struct sock *sk, poll_table *pt)
>> {
>> if (pt) {
>> poll_wait(file, sk->sk_sleep, pt);
>> /*
>> * fat comment
>> */
>> smp_mb(); // or smp_mb__after_unlock();
>> }
>> }
>
> That'd be fine IMHO. Are DaveM and Eric OK?
No objections from me.
David Miller wrote:
> From: Davide Libenzi <[email protected]>
> Date: Fri, 26 Jun 2009 11:12:15 -0700 (PDT)
>
>> On Fri, 26 Jun 2009, Oleg Nesterov wrote:
>>
>>> Perhaps we can add
>>>
>>> void sock_poll_wait(struct file *file, struct sock *sk, poll_table *pt)
>>> {
>>> if (pt) {
>>> poll_wait(file, sk->sk_sleep, pt);
>>> /*
>>> * fat comment
>>> */
>>> smp_mb(); // or smp_mb__after_unlock();
>>> }
>>> }
>> That'd be fine IMHO. Are DaveM and Eric OK?
>
> No objections from me.
Very good :)
Jiri, please respin a patch with this idea from Oleg
(We'll have to check all calls to poll_wait() in net tree)
Thanks everybody
Oleg Nesterov wrote, On 06/26/2009 04:50 PM:
> On 06/26, Davide Libenzi wrote:
>> On Fri, 26 Jun 2009, Oleg Nesterov wrote:
>>
>>> And if we remove waitqueue_active() in xxx_update(), then lock/unlock is
>>> not needed too.
>>>
>>> If xxx_poll() takes q->lock first, it can safely miss the changes in ->status
>>> and schedule(): xxx_update() will take q->lock, notice the sleeper and wake
>>> it up (ok, it will set ->triggered but this doesn't matter).
>>>
>>> If xxx_update() takes q->lock first, xxx_poll() must see the changes in
>>> status after poll_wait()->unlock(&q->lock) (in fact, after lock, not unlock).
>> Sure. The snippet above was just to show what typically the code does, not
>> a suggestion on how to solve the socket case.
>
> Yes, yes. I just meant you are right imho, we shouldn't add mb() into
> add_wait_queue().
>
>> But yeah, the problem in this case is the waitqueue_active() call. Without
>> that, the wait queue lock/unlock in poll_wait() and the one in wake_up()
>> guarantees the necessary barriers.
>> Some might argue the costs of the lock/unlock of q->lock, and wonder if
>> MBs are a more efficient solution. This is something I'm not going into.
>> To me, it just looked not right having cross-matching MB in different
>> subsystems.
>
> This is subjective and thus up to maintainers, but personally I think you
> are very, very right.
>
> Perhaps we can add
>
> void sock_poll_wait(struct file *file, struct sock *sk, poll_table *pt)
> {
> if (pt) {
> poll_wait(file, sk->sk_sleep, pt);
> /*
> * fat comment
> */
> smp_mb(); // or smp_mb__after_unlock();
> }
> }
>
> Oleg.
Maybe 'a bit' further?:
static inline void __poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
        p->qproc(filp, wait_address, p);
}

static inline void poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
        if (p && wait_address)
                __poll_wait(filp, wait_address, p);
}

static inline void sock_poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
        if (p && wait_address) {
                __poll_wait(filp, wait_address, p);
                /*
                 * fat comment
                 */
                smp_mb(); // or smp_mb__after_unlock();
        }
}
Jarek P.
Jarek Poplawski wrote, On 06/28/2009 01:10 PM:
> Oleg Nesterov wrote, On 06/26/2009 04:50 PM:
>
>> On 06/26, Davide Libenzi wrote:
>>> On Fri, 26 Jun 2009, Oleg Nesterov wrote:
>>>
>>>> And if we remove waitqueue_active() in xxx_update(), then lock/unlock is
>>>> not needed too.
>>>>
>>>> If xxx_poll() takes q->lock first, it can safely miss the changes in ->status
>>>> and schedule(): xxx_update() will take q->lock, notice the sleeper and wake
>>>> it up (ok, it will set ->triggered but this doesn't matter).
>>>>
>>>> If xxx_update() takes q->lock first, xxx_poll() must see the changes in
>>>> status after poll_wait()->unlock(&q->lock) (in fact, after lock, not unlock).
>>> Sure. The snippet above was just to show what typically the code does, not
>>> a suggestion on how to solve the socket case.
>> Yes, yes. I just meant you are right imho, we shouldn't add mb() into
>> add_wait_queue().
>>
>>> But yeah, the problem in this case is the waitqueue_active() call. Without
>>> that, the wait queue lock/unlock in poll_wait() and the one in wake_up()
>>> guarantees the necessary barriers.
>>> Some might argue the costs of the lock/unlock of q->lock, and wonder if
>>> MBs are a more efficient solution. This is something I'm not going into.
>>> To me, it just looked not right having cross-matching MB in different
>>> subsystems.
>> This is subjective and thus up to maintainers, but personally I think you
>> are very, very right.
>>
>> Perhaps we can add
>>
>> void sock_poll_wait(struct file *file, struct sock *sk, poll_table *pt)
>> {
>> if (pt) {
>> poll_wait(file, sk->sk_sleep, pt);
>> /*
>> * fat comment
>> */
>> smp_mb(); // or smp_mb__after_unlock();
>> }
>> }
>>
>> Oleg.
>
>
> Maybe 'a bit' further?:
>
> static inline void __poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
> {
> p->qproc(filp, wait_address, p);
> }
>
> static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
> {
> if (p && wait_address)
> __poll_wait(filp, wait_address, p);
> }
>
> static inline void sock_poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
> {
> if (p && wait_address) {
> __poll_wait(filp, wait_address, p);
> /*
> * fat comment
> */
> smp_mb(); // or smp_mb__after_unlock();
> }
> }
>
Hmm... of course:
static inline void sock_poll_wait(struct file *filp, struct sock *sk, poll_table *p)
{
        if (p && sk->sk_sleep) {
                __poll_wait(filp, sk->sk_sleep, p);
                /*
                 * fat comment
                 */
                smp_mb(); // or smp_mb__after_unlock();
        }
}
Jarek P.
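The receive side this smp_mb() pairs with would be the sk_has_sleeper()
check in the wakeup callbacks, roughly like the sketch below (the exact
callback body may differ in the final patch):

static void sock_def_readable(struct sock *sk, int len)
{
        read_lock(&sk->sk_callback_lock);
        /*
         * sk_has_sleeper() issues smp_mb__after_lock() before looking at
         * sk->sk_sleep, pairing with the smp_mb() in sock_poll_wait().
         */
        if (sk_has_sleeper(sk))
                wake_up_interruptible(sk->sk_sleep);
        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
        read_unlock(&sk->sk_callback_lock);
}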
On 06/28, Jarek Poplawski wrote:
>
> Jarek Poplawski wrote, On 06/28/2009 01:10 PM:
>
> > Oleg Nesterov wrote, On 06/26/2009 04:50 PM:
> >
> >> Perhaps we can add
> >>
> >> void sock_poll_wait(struct file *file, struct sock *sk, poll_table *pt)
> >> {
> >> if (pt) {
> >> poll_wait(file, sk->sk_sleep, pt);
> >> /*
> >> * fat comment
> >> */
> >> smp_mb(); // or smp_mb__after_unlock();
> >> }
> >> }
> >>
> >> Oleg.
> >
> >
> > Maybe 'a bit' further?:
> >
> > static inline void __poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
> > {
> > p->qproc(filp, wait_address, p);
> > }
> >
> > static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
> > {
> > if (p && wait_address)
> > __poll_wait(filp, wait_address, p);
> > }
> >
> > static inline void sock_poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
> > {
> > if (p && wait_address) {
> > __poll_wait(filp, wait_address, p);
> > /*
> > * fat comment
> > */
> > smp_mb(); // or smp_mb__after_unlock();
> > }
> > }
> >
>
>
> Hmm... of course:
>
> static inline void sock_poll_wait(struct file * filp, struct sock *sk, poll_table *p)
> {
> if (p && sk->sk_sleep) {
> __poll_wait(filp, sk->sk_sleep, p);
> /*
> * fat comment
> */
> smp_mb(); // or smp_mb__after_unlock();
> }
> }
Perhaps it makes sense to check ->sk_sleep != NULL in sock_poll_wait(), but
I don't think we need __poll_wait(). poll_wait() is inline, I think gcc
will optimize out "if (p && wait_address)" check if poll_wait() is called
from sock_poll_wait().
This all is up to Jiri of course. But speaking about cosmetic changes, I
think it is better to make 2 patches. The first one fixes the problem using
smp_mb(), another introduces smp_mb__xxx_lock() to optimize the code.
Oleg.
On Sun, Jun 28, 2009 at 08:04:12PM +0200, Oleg Nesterov wrote:
> On 06/28, Jarek Poplawski wrote:
...
> > Hmm... of course:
> >
> > static inline void sock_poll_wait(struct file * filp, struct sock *sk, poll_table *p)
> > {
> > if (p && sk->sk_sleep) {
> > __poll_wait(filp, sk->sk_sleep, p);
> > /*
> > * fat comment
> > */
> > smp_mb(); // or smp_mb__after_unlock();
> > }
> > }
>
> Perhaps it makes sense to check ->sk_sleep != NULL in sock_poll_wait(), but
> I don't think we need __poll_wait(). poll_wait() is inline, I think gcc
> will optimize out "if (p && wait_address)" check if poll_wait() is called
> from sock_poll_wait().
Sure, to me it looks a bit more readable, but let Jiri choose.;-)
Cheers,
Jarek P.
Jiri Olsa <[email protected]> writes:
> Adding memory barrier to the __pollwait function paired with
> receive callbacks. The smp_mb__after_lock define is added,
> since {read|write|spin}_lock() on x86 are full memory barriers.
I was wondering did you see that race actually happening in practice?
If yes on which system?
At least on x86 I can't see how it happens. mb() is only a compile
time barrier and the compiler doesn't optimize over indirect callbacks
like __pollwait() anyways.
It might be still needed on some weaker ordered architectures, but did you
actually see it there?
-Andi
--
[email protected] -- Speaking for myself only.
On Mon, Jun 29, 2009 at 11:12:33AM +0200, Andi Kleen wrote:
> Jiri Olsa <[email protected]> writes:
>
> > Adding memory barrier to the __pollwait function paired with
> > receive callbacks. The smp_mb__after_lock define is added,
> > since {read|write|spin}_lock() on x86 are full memory barriers.
>
> I was wondering did you see that race actually happening in practice?
> If yes on which system?
>
> At least on x86 I can't see how it happens. mb() is only a compile
> time barrier and the compiler doesn't optimize over indirect callbacks
> like __pollwait() anyways.
>
> It might be still needed on some weaker ordered architectures, but did you
> actually see it there?
>
> -Andi
yes, we have a customer that has been able to reproduce this problem on
x86_64, CPU model Xeon E5345 * 2, but they did not reproduce it on XEON MV, for example.
they were able to capture a backtrace when the race happened:
https://bugzilla.redhat.com/show_bug.cgi?id=494404#c1
jirka
>
> --
> [email protected] -- Speaking for myself only.
On Sun, Jun 28, 2009 at 11:48:46PM +0200, Jarek Poplawski wrote:
> On Sun, Jun 28, 2009 at 08:04:12PM +0200, Oleg Nesterov wrote:
> > On 06/28, Jarek Poplawski wrote:
> ...
> > > Hmm... of course:
> > >
> > > static inline void sock_poll_wait(struct file * filp, struct sock *sk, poll_table *p)
> > > {
> > > if (p && sk->sk_sleep) {
> > > __poll_wait(filp, sk->sk_sleep, p);
> > > /*
> > > * fat comment
> > > */
> > > smp_mb(); // or smp_mb__after_unlock();
> > > }
> > > }
> >
> > Perhaps it makes sense to check ->sk_sleep != NULL in sock_poll_wait(), but
> > I don't think we need __poll_wait(). poll_wait() is inline, I think gcc
> > will optimize out "if (p && wait_address)" check if poll_wait() is called
> > from sock_poll_wait().
>
> Sure, to me it looks a bit more readable, but let Jiri choose.;-)
>
> Cheers,
> Jarek P.
yes :) I like Jarek's way more.. and I'll send a separate patch for the
smp_mb__after_lock change.
thanks,
jirka
On Fri, Jun 26, 2009 at 09:35:28PM +0200, Eric Dumazet wrote:
> David Miller wrote:
> > From: Davide Libenzi <[email protected]>
> > Date: Fri, 26 Jun 2009 11:12:15 -0700 (PDT)
> >
> >> On Fri, 26 Jun 2009, Oleg Nesterov wrote:
> >>
> >>> Perhaps we can add
> >>>
> >>> void sock_poll_wait(struct file *file, struct sock *sk, poll_table *pt)
> >>> {
> >>> if (pt) {
> >>> poll_wait(file, sk->sk_sleep, pt);
> >>> /*
> >>> * fat comment
> >>> */
> >>> smp_mb(); // or smp_mb__after_unlock();
> >>> }
> >>> }
> >> That'd be fine IMHO. Are DaveM and Eric OK?
> >
> > No objections from me.
>
> Very good :)
>
> Jiri, please respin a patch with this idea from Oleg
>
> (We'll have to check all calls to poll_wait() in net tree)
>
> Thanks everybody
>
thanks a lot! :) I'll send out a new patch shortly..
I'll include all the poll_wait() call changes I'm reasonably sure of,
and a list of the others I'm not, so we can discuss them.
jirka
Andi Kleen wrote:
> At least on x86 I can't see how it happens. mb() is only a compile
> time barrier and the compiler doesn't optimize over indirect callbacks
> like __pollwait() anyways.
Just a note about this. That used to be true, that GCC didn't optimize
indirect calls. However, see -findirect-inlining in GCC 4.4.
I am not saying that it applies here, but it is something to remember.
--
Zan Lynx
[email protected]
"Knowledge is Power. Power Corrupts. Study Hard. Be Evil."
On Mon, Jun 29, 2009 at 10:59:57AM -0600, Zan Lynx wrote:
> Andi Kleen wrote:
> >At least on x86 I can't see how it happens. mb() is only a compile
> >time barrier and the compiler doesn't optimize over indirect callbacks
> >like __pollwait() anyways.
>
> Just a note about this. That used to be true, that GCC didn't optimize
> indirect calls. However, see -findirect-inlining in GCC 4.4.
I am aware of that, but ...
>
> I am not saying that it applies here, but it is something to remember.
... it doesn't apply here because it works only for a few very simple
cases that don't apply to this code. Besides, the kernel builds with -O2,
which only inlines functions called once or very small functions, which
also rules this out.
-Andi
--
[email protected] -- Speaking for myself only.
Andi Kleen <[email protected]> wrote:
>
> At least on x86 I can't see how it happens. mb() is only a compile
> time barrier and the compiler doesn't optimize over indirect callbacks
> like __pollwait() anyways.
mb() does more than just a compiler barrier, it also issues mfence.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Wed, Jul 01, 2009 at 11:39:24AM +0800, Herbert Xu wrote:
> Andi Kleen <[email protected]> wrote:
> >
> > At least on x86 I can't see how it happens. mb() is only a compile
> > time barrier and the compiler doesn't optimize over indirect callbacks
> > like __pollwait() anyways.
>
> mb() does more than just a compiler barrier, it also issues mfence.
mfence is not needed for normal C code (not using non temporal
stores) in the Linux memory model on x86 and is a no-op. Only the compile
time barrier matters.
-Andi
--
[email protected] -- Speaking for myself only.
On Wed, Jul 01, 2009 at 08:27:32AM +0200, Andi Kleen wrote:
>
> mfence is not needed for normal C code (not using non temporal
> stores) in the Linux memory model on x86 and is a no-op. Only the compile
> time barrier matters.
In that case this bug needs to be dug into more deeply regardless of
this patch.
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Wed, Jul 01, 2009 at 03:03:32PM +0800, Herbert Xu wrote:
> On Wed, Jul 01, 2009 at 08:27:32AM +0200, Andi Kleen wrote:
> >
> > mfence is not needed for normal C code (not using non temporal
> > stores) in the Linux memory model on x86 and is a no-op. Only the compile
> > time barrier matters.
>
> In that case this bug needs to be dug into more deeply regardless of
> this patch.
Agreed.
I suspect the reordering of the wake queue tests might make a difference, but
in this case to ensure they are always tested in the proper order by
the compiler would need more smp_rmb()s
-Andi
--
[email protected] -- Speaking for myself only.
On Wed, Jul 01, 2009 at 09:22:26AM +0200, Andi Kleen wrote:
>
> Agreed.
>
> I suspect the reordering of the wake queue tests might make a difference, but
> in this case to ensure they are always tested in the proper order by
> the compiler would need more smp_rmb()s
I just looked up the original bug report and it's against RHEL4
which is a 2.6.9 kernel. Jiri, have we reproduced this on a more
recent kernel?
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Wed, Jul 01, 2009 at 04:31:44PM +0800, Herbert Xu wrote:
> On Wed, Jul 01, 2009 at 09:22:26AM +0200, Andi Kleen wrote:
> >
> > Agreed.
> >
> > I suspect the reordering of the wake queue tests might make a difference, but
> > in this case to ensure they are always tested in the proper order by
> > the compiler would need more smp_rmb()s
>
> I just looked up the original bug report and it's against RHEL4
> which is a 2.6.9 kernel. Jiri, have we reproduced this on a more
> recent kernel?
no, only RHEL 4..
it is very hard to reproduce. I was successful once
on RHEL4 after 8 days, never on upstream though.
jirka
>
> Thanks,
> --
> Visit Openswan at http://www.openswan.org/
> Email: Herbert Xu <[email protected]>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Wed, Jul 01, 2009 at 10:44:30AM +0200, Jiri Olsa wrote:
>
> no, only RHEL 4..
>
> it is very hard to reproduce. I was successful once
> on RHEL4 after 8 days, never on upstream though.
It would be good to know whether this bug occurs on the upstream
kernel because as it stands, the patch is a no-op for the upstream
kernel on x86-64.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Wed, Jul 01, 2009 at 06:58:52PM +0800, Herbert Xu wrote:
>
> It would be good to know whether this bug occurs on the upstream
> kernel because as it stands, the patch is a no-op for the upstream
> kernel on x86-64.
I take that back. Andi, please look at section 8.2.3.4 of the
IA-32 Architectures Software Developer's Manual Volume 3A, "Loads
May Be Reordered with Earlier Stores to Different Locations".
This seems to be exactly the scenario that we have here, and shows
why mfence is required.
In our case, we're doing
CPU1                            CPU2
Write data ready                Add ourselves to wait queue
Read wait queue to notify       Check data ready
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
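Reduced to the two memory operations on each side, the scenario looks like
the sketch below (the data-ready helpers are hypothetical). Section 8.2.3.4
allows each CPU to execute its load before its earlier store is globally
visible, so without a full barrier both loads can see stale values: no
wakeup is issued and the poller sleeps forever.

/* CPU2: poll/select side */
static void poll_side(struct sock *sk, wait_queue_t *wait)
{
        add_wait_queue(sk->sk_sleep, wait);     /* store: wait queue update */
        smp_mb();                               /* store->load barrier, mfence on x86 */
        if (!xxx_data_ready(sk))                /* load: tp->rcv_nxt check */
                schedule();
}

/* CPU1: receive/wakeup side */
static void receive_side(struct sock *sk)
{
        xxx_update_rcv_nxt(sk);                 /* store: new data arrived */
        smp_mb();                               /* in the patch this ordering comes from
                                                 * read_lock() + smp_mb__after_lock() */
        if (waitqueue_active(sk->sk_sleep))     /* load: sleeper check */
                wake_up_interruptible(sk->sk_sleep);
}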