Hi,

I found a possible bug in reqsk_queue_hash_req(): it seems that the
"req->dl_next = lopt->syn_table[hash];" statement should be moved into
the write-lock-protected scope that follows it.

As far as I can tell from browsing the source, this function is only
called on the rx path, where it is protected by a spin lock on the
struct sock. However, its caller, inet_csk_reqsk_queue_hash_add(), is a
GPL-exported symbol, so I think we had best move this statement into
the write-lock-protected scope below.

Below is a patch illustrating the change; please do not apply it, it
is just for show.

Thanks.

Yu
--- include/net/request_sock.h 2010-04-09 15:27:14.000000000 +0800
+++ include/net/request_sock.h 2010-04-20 18:11:32.000000000 +0800
@@ -247,9 +247,9 @@ static inline void reqsk_queue_hash_req(
req->expires = jiffies + timeout;
req->retrans = 0;
req->sk = NULL;
- req->dl_next = lopt->syn_table[hash];

write_lock(&queue->syn_wait_lock);
+ req->dl_next = lopt->syn_table[hash];
lopt->syn_table[hash] = req;
write_unlock(&queue->syn_wait_lock);
}
On Tuesday 20 April 2010 at 18:35 +0800, Li Yu wrote:
> Hi,
>
> I found a possible bug in reqsk_queue_hash_req(): it seems that the
> "req->dl_next = lopt->syn_table[hash];" statement should be moved into
> the write-lock-protected scope that follows it.
>
> As far as I can tell from browsing the source, this function is only
> called on the rx path, where it is protected by a spin lock on the
> struct sock. However, its caller, inet_csk_reqsk_queue_hash_add(), is a
> GPL-exported symbol, so I think we had best move this statement into
> the write-lock-protected scope below.
>
> Below is a patch illustrating the change; please do not apply it, it
> is just for show.
>
> Thanks.
>
> Yu
>
> --- include/net/request_sock.h 2010-04-09 15:27:14.000000000 +0800
> +++ include/net/request_sock.h 2010-04-20 18:11:32.000000000 +0800
> @@ -247,9 +247,9 @@ static inline void reqsk_queue_hash_req(
> req->expires = jiffies + timeout;
> req->retrans = 0;
> req->sk = NULL;
> - req->dl_next = lopt->syn_table[hash];
>
> write_lock(&queue->syn_wait_lock);
> + req->dl_next = lopt->syn_table[hash];
> lopt->syn_table[hash] = req;
> write_unlock(&queue->syn_wait_lock);
> }
I believe it's not really necessary, because we are the only possible
writer at this stage.

The write_lock() ... write_unlock() is there only to enforce
synchronisation with readers.

All callers of this reqsk_queue_hash_req() must have the socket locked.
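
To make the pattern concrete, here is a minimal standalone sketch
(userspace, pthreads; all names are invented for illustration, this is
not the kernel code). The writer is already serialized by an outer
per-object lock, like the socket lock here, so only the store that
publishes the node needs the write lock; that store is what keeps
readers holding the read side from ever seeing a half-linked entry.

#include <pthread.h>

struct node {
	int key;
	struct node *next;
};

static struct node *table[16];
static pthread_rwlock_t wait_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Writer: assumed to run under an outer per-object lock, so there is
 * never a concurrent writer. The node is fully set up, including its
 * next pointer, before the rwlock is taken; the write lock covers
 * only the single store that makes the node reachable. */
static void hash_insert(struct node *n, unsigned int hash)
{
	n->next = table[hash];	/* safe: we are the only writer */
	pthread_rwlock_wrlock(&wait_lock);
	table[hash] = n;	/* publish */
	pthread_rwlock_unlock(&wait_lock);
}

/* A reader that does not hold the outer lock (like the proc dumpers)
 * takes the read side, so it never observes the publish half done. */
static int hash_walk(unsigned int hash)
{
	struct node *n;
	int count = 0;

	pthread_rwlock_rdlock(&wait_lock);
	for (n = table[hash]; n; n = n->next)
		count++;
	pthread_rwlock_unlock(&wait_lock);
	return count;
}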
From: Eric Dumazet <[email protected]>
Date: Tue, 20 Apr 2010 13:06:51 +0200
> I believe it's not really necessary, because we are the only possible
> writer at this stage.
>
> The write_lock() ... write_unlock() is there only to enforce
> synchronisation with readers.
>
> All callers of this reqsk_queue_hash_req() must have the socket locked.
Right.

In fact there are quite a few snippets around the networking code
where we use this trick of only locking around the single
pointer assignment that puts the object into the list.

And they were all written by Alexey Kuznetsov, so they must
be correct :-)
On Tuesday 20 April 2010 at 14:24 -0700, David Miller wrote:
> From: Eric Dumazet <[email protected]>
> Date: Tue, 20 Apr 2010 13:06:51 +0200
>
> > I believe it's not really necessary, because we are the only possible
> > writer at this stage.
> >
> > The write_lock() ... write_unlock() is there only to enforce
> > synchronisation with readers.
> >
> > All callers of this reqsk_queue_hash_req() must have the socket locked.
>
> Right.
>
> In fact there are quite a few snippets around the networking code
> where we use this trick of only locking around the single
> pointer assignment that puts the object into the list.
>
> And they were all written by Alexey Kuznetsov, so they must
> be correct :-)
We could convert this to RCU, but given that this read lock is only
taken from slow paths (inet_diag and the listening seq_file), it's not
really necessary.

Something like the following, but I have no plan to complete this work.
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index b6d3b55..5f3373a 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -278,7 +278,9 @@ static inline void inet_csk_reqsk_queue_added(struct sock *sk,
static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
{
- return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
+ struct listen_sock *lopt = reqsk_lopt_get(&inet_csk(sk)->icsk_accept_queue, sk);
+
+ return reqsk_queue_len(lopt);
}
static inline int inet_csk_reqsk_queue_young(const struct sock *sk)
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 99e6e19..b665103 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -65,6 +65,7 @@ struct request_sock {
struct sock *sk;
u32 secid;
u32 peer_secid;
+ struct rcu_head rcu;
};
static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
@@ -77,10 +78,7 @@ static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *op
return req;
}
-static inline void __reqsk_free(struct request_sock *req)
-{
- kmem_cache_free(req->rsk_ops->slab, req);
-}
+extern void __reqsk_free(struct request_sock *req);
static inline void reqsk_free(struct request_sock *req)
{
@@ -110,23 +108,13 @@ struct listen_sock {
* @rskq_accept_head - FIFO head of established children
* @rskq_accept_tail - FIFO tail of established children
* @rskq_defer_accept - User waits for some data after accept()
- * @syn_wait_lock - serializer
- *
- * %syn_wait_lock is necessary only to avoid proc interface having to grab the main
- * lock sock while browsing the listening hash (otherwise it's deadlock prone).
*
- * This lock is acquired in read mode only from listening_get_next() seq_file
- * op and it's acquired in write mode _only_ from code that is actively
- * changing rskq_accept_head. All readers that are holding the master sock lock
- * don't need to grab this lock in read mode too as rskq_accept_head. writes
- * are always protected from the main sock lock.
*/
struct request_sock_queue {
struct request_sock *rskq_accept_head;
struct request_sock *rskq_accept_tail;
- rwlock_t syn_wait_lock;
u8 rskq_defer_accept;
- /* 3 bytes hole, try to pack */
+ /* 3-7 bytes hole, try to pack */
struct listen_sock *listen_opt;
};
@@ -154,9 +142,7 @@ static inline void reqsk_queue_unlink(struct request_sock_queue *queue,
struct request_sock *req,
struct request_sock **prev_req)
{
- write_lock(&queue->syn_wait_lock);
- *prev_req = req->dl_next;
- write_unlock(&queue->syn_wait_lock);
+ rcu_assign_pointer(*prev_req, req->dl_next);
}
static inline void reqsk_queue_add(struct request_sock_queue *queue,
@@ -167,13 +153,13 @@ static inline void reqsk_queue_add(struct request_sock_queue *queue,
req->sk = child;
sk_acceptq_added(parent);
+ req->dl_next = NULL;
if (queue->rskq_accept_head == NULL)
queue->rskq_accept_head = req;
else
queue->rskq_accept_tail->dl_next = req;
queue->rskq_accept_tail = req;
- req->dl_next = NULL;
}
static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue)
@@ -223,9 +209,16 @@ static inline int reqsk_queue_added(struct request_sock_queue *queue)
return prev_qlen;
}
-static inline int reqsk_queue_len(const struct request_sock_queue *queue)
+static inline struct listen_sock *reqsk_lopt_get(const struct request_sock_queue *queue, const struct sock *sk)
{
- return queue->listen_opt != NULL ? queue->listen_opt->qlen : 0;
+ return rcu_dereference_check(queue->listen_opt,
+ rcu_read_lock_bh_held() ||
+ sock_owned_by_user(sk));
+}
+
+static inline int reqsk_queue_len(struct listen_sock *lopt)
+{
+ return lopt != NULL ? lopt->qlen : 0;
}
static inline int reqsk_queue_len_young(const struct request_sock_queue *queue)
@@ -247,11 +240,9 @@ static inline void reqsk_queue_hash_req(struct request_sock_queue *queue,
req->expires = jiffies + timeout;
req->retrans = 0;
req->sk = NULL;
- req->dl_next = lopt->syn_table[hash];
- write_lock(&queue->syn_wait_lock);
- lopt->syn_table[hash] = req;
- write_unlock(&queue->syn_wait_lock);
+ req->dl_next = lopt->syn_table[hash];
+ rcu_assign_pointer(lopt->syn_table[hash], req);
}
#endif /* _REQUEST_SOCK_H */
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 7552495..e4c333e 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -58,13 +58,10 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
lopt->max_qlen_log++);
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
- rwlock_init(&queue->syn_wait_lock);
queue->rskq_accept_head = NULL;
lopt->nr_table_entries = nr_table_entries;
- write_lock_bh(&queue->syn_wait_lock);
- queue->listen_opt = lopt;
- write_unlock_bh(&queue->syn_wait_lock);
+ rcu_assign_pointer(queue->listen_opt, lopt);
return 0;
}
@@ -94,10 +91,8 @@ static inline struct listen_sock *reqsk_queue_yank_listen_sk(
{
struct listen_sock *lopt;
- write_lock_bh(&queue->syn_wait_lock);
lopt = queue->listen_opt;
queue->listen_opt = NULL;
- write_unlock_bh(&queue->syn_wait_lock);
return lopt;
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 8da6429..2b2cb80 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -49,6 +49,19 @@ void inet_get_local_port_range(int *low, int *high)
}
EXPORT_SYMBOL(inet_get_local_port_range);
+static void reqsk_rcu_free(struct rcu_head *head)
+{
+ struct request_sock *req = container_of(head, struct request_sock, rcu);
+
+ kmem_cache_free(req->rsk_ops->slab, req);
+}
+
+void __reqsk_free(struct request_sock *req)
+{
+ call_rcu_bh(&req->rcu, reqsk_rcu_free);
+}
+EXPORT_SYMBOL(__reqsk_free);
+
int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb)
{
@@ -420,7 +433,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
ireq->loc_addr == laddr &&
AF_INET_FAMILY(req->rsk_ops->family)) {
WARN_ON(req->sk);
- *prevp = prev;
+ rcu_assign_pointer(*prevp, prev);
break;
}
}
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e5fa2dd..4da8365 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -632,9 +632,9 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
entry.family = sk->sk_family;
- read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ rcu_read_lock_bh();
- lopt = icsk->icsk_accept_queue.listen_opt;
+ lopt = rcu_dereference_bh(icsk->icsk_accept_queue.listen_opt);
if (!lopt || !lopt->qlen)
goto out;
@@ -648,7 +648,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
struct request_sock *req, *head = lopt->syn_table[j];
reqnum = 0;
- for (req = head; req; reqnum++, req = req->dl_next) {
+ for (req = head; req; reqnum++, req = rcu_dereference_bh(req->dl_next)) {
struct inet_request_sock *ireq = inet_rsk(req);
if (reqnum < s_reqnum)
@@ -691,7 +691,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
}
out:
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ rcu_read_unlock_bh();
return err;
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ad08392..cd6a042 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1985,6 +1985,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
struct inet_listen_hashbucket *ilb;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
+ struct listen_sock *lopt;
if (!sk) {
st->bucket = 0;
@@ -2009,20 +2010,22 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
}
req = req->dl_next;
}
- if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
+ lopt = rcu_dereference_bh(icsk->icsk_accept_queue.listen_opt);
+ if (++st->sbucket >= lopt->nr_table_entries)
break;
get_req:
- req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
+ req = lopt->syn_table[st->sbucket];
}
sk = sk_next(st->syn_wait_sk);
st->state = TCP_SEQ_STATE_LISTENING;
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ rcu_read_unlock_bh();
} else {
icsk = inet_csk(sk);
- read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&icsk->icsk_accept_queue))
+ rcu_read_lock_bh();
+ lopt = rcu_dereference_bh(icsk->icsk_accept_queue.listen_opt);
+ if (reqsk_queue_len(lopt))
goto start_req;
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ rcu_read_unlock_bh();
sk = sk_next(sk);
}
get_sk:
@@ -2032,8 +2034,9 @@ get_sk:
goto out;
}
icsk = inet_csk(sk);
- read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
+ rcu_read_lock_bh();
+ lopt = rcu_dereference_bh(icsk->icsk_accept_queue.listen_opt);
+ if (reqsk_queue_len(lopt)) {
start_req:
st->uid = sock_i_uid(sk);
st->syn_wait_sk = sk;
@@ -2041,7 +2044,7 @@ start_req:
st->sbucket = 0;
goto get_req;
}
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ rcu_read_unlock_bh();
}
spin_unlock_bh(&ilb->lock);
if (++st->bucket < INET_LHTABLE_SIZE) {
@@ -2235,10 +2238,8 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
switch (st->state) {
case TCP_SEQ_STATE_OPENREQ:
- if (v) {
- struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- }
+ if (v)
+ rcu_read_unlock_bh();
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
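
To spell out the lifetime rule the conversion depends on, here is a
distilled kernel-style sketch of the pattern used above (not a
standalone program; inspect_req() is a made-up stand-in for whatever a
reader does with each entry). Updaters publish with
rcu_assign_pointer() and free through call_rcu_bh(), while readers
that do not own the socket bracket their walk with rcu_read_lock_bh()
/ rcu_read_unlock_bh() and load pointers with rcu_dereference_bh(), so
a request_sock cannot be freed while such a reader might still reach it.

#include <linux/rcupdate.h>
#include <net/request_sock.h>

static void inspect_req(struct request_sock *req);	/* made-up consumer */

/* Updater: serialized by the socket lock, so the plain load of *slot
 * is fine; rcu_assign_pointer() orders the initialisation of req
 * before the store that makes it reachable. */
static void publish_req(struct request_sock **slot, struct request_sock *req)
{
	req->dl_next = *slot;
	rcu_assign_pointer(*slot, req);
}

/* Reader: runs without the socket lock (inet_diag, the seq_file). */
static void walk_reqs(struct request_sock **slot)
{
	struct request_sock *req;

	rcu_read_lock_bh();
	for (req = rcu_dereference_bh(*slot); req;
	     req = rcu_dereference_bh(req->dl_next))
		inspect_req(req);
	rcu_read_unlock_bh();
}

/* Teardown: unlink under the socket lock, then __reqsk_free() above
 * lets call_rcu_bh() delay kmem_cache_free() until every current _bh
 * read-side section has completed. */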
Eric Dumazet wrote:
> On Tuesday 20 April 2010 at 21:21 +0800, Li Yu wrote:
>
>> In my understanding, a write lock also means mutual exclusion among
>> all writers, is that right?
>>
>
> Yes, generally speaking.
>
> But not in this use case.
>
> This is documented in an include file; search for syn_wait_lock.
>
>>> All callers of this reqsk_queue_hash_req() must have the socket locked.
>> I see. If we always assume the caller must hold the socket lock
>> first, this is not a bug, but I think we had better add a comment
>> in the header file.
>
> It is documented, as a matter of fact :)
>
>
Great, this isn't a bug; you are right here :)

I just found those comments about syn_wait_lock. It seems we need
cross-referenced documentation for the kernel API; a newbie like me
can easily get confused by problems like this.
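
For other newbies who go looking: the comment in question sits right
above syn_wait_lock in include/net/request_sock.h (quoted verbatim
from the tree this discussion is based on):

 * @syn_wait_lock - serializer
 *
 * %syn_wait_lock is necessary only to avoid proc interface having to grab the main
 * lock sock while browsing the listening hash (otherwise it's deadlock prone).
 *
 * This lock is acquired in read mode only from listening_get_next() seq_file
 * op and it's acquired in write mode _only_ from code that is actively
 * changing rskq_accept_head. All readers that are holding the master sock lock
 * don't need to grab this lock in read mode too as rskq_accept_head. writes
 * are always protected from the main sock lock.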