KCSAN reported a race between writing req->status in p9_client_cb and
accessing it in p9_client_rpc's wait_event.
Accesses to req itself are protected by the data barriers (writing req
fields, write barrier, writing status // reading status, read barrier,
reading other req fields), but the status accesses themselves apparently
also must be annotated properly with WRITE_ONCE/READ_ONCE when we
access them without locks.
It follows that:
- error paths writing status in various threads can all notify
p9_client_rpc, so these all also need WRITE_ONCE
- there's a similar read loop in trans_virtio for the zc case that also
needs READ_ONCE
- other reads in trans_fd should be protected by the trans_fd lock and
the lists' state machine, as the corresponding writers are all within
trans_fd and should be under the same lock. If KCSAN complains about
them we likely have something else to fix as well, so it's better to
leave them unmarked and look again if required.
Reported-by: Naresh Kamboju <[email protected]>
Suggested-by: Marco Elver <[email protected]>
Signed-off-by: Dominique Martinet <[email protected]>
---
net/9p/client.c | 15 ++++++++-------
net/9p/trans_fd.c | 12 ++++++------
net/9p/trans_rdma.c | 4 ++--
net/9p/trans_virtio.c | 9 +++++----
net/9p/trans_xen.c | 4 ++--
5 files changed, 23 insertions(+), 21 deletions(-)
diff --git a/net/9p/client.c b/net/9p/client.c
index b554f8357f96..b5aa25f82b78 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -443,7 +443,7 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
* the status change is visible to another thread
*/
smp_wmb();
- req->status = status;
+ WRITE_ONCE(req->status, status);
wake_up(&req->wq);
p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag);
@@ -605,7 +605,7 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
/* if we haven't received a response for oldreq,
* remove it from the list
*/
- if (oldreq->status == REQ_STATUS_SENT) {
+ if (READ_ONCE(oldreq->status) == REQ_STATUS_SENT) {
if (c->trans_mod->cancelled)
c->trans_mod->cancelled(c, oldreq);
}
@@ -702,7 +702,8 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
}
again:
/* Wait for the response */
- err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD);
+ err = wait_event_killable(req->wq,
+ READ_ONCE(req->status) >= REQ_STATUS_RCVD);
/* Make sure our req is coherent with regard to updates in other
* threads - echoes to wmb() in the callback
@@ -716,7 +717,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
goto again;
}
- if (req->status == REQ_STATUS_ERROR) {
+ if (READ_ONCE(req->status) == REQ_STATUS_ERROR) {
p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
err = req->t_err;
}
@@ -729,7 +730,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
p9_client_flush(c, req);
/* if we received the response anyway, don't signal error */
- if (req->status == REQ_STATUS_RCVD)
+ if (READ_ONCE(req->status) == REQ_STATUS_RCVD)
err = 0;
}
recalc_sigpending:
@@ -798,7 +799,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
if (err != -ERESTARTSYS)
goto recalc_sigpending;
}
- if (req->status == REQ_STATUS_ERROR) {
+ if (READ_ONCE(req->status) == REQ_STATUS_ERROR) {
p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
err = req->t_err;
}
@@ -811,7 +812,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
p9_client_flush(c, req);
/* if we received the response anyway, don't signal error */
- if (req->status == REQ_STATUS_RCVD)
+ if (READ_ONCE(req->status) == REQ_STATUS_RCVD)
err = 0;
}
recalc_sigpending:
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 06ec9f7d3318..f8899745571c 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -201,11 +201,11 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
list_move(&req->req_list, &cancel_list);
- req->status = REQ_STATUS_ERROR;
+ WRITE_ONCE(req->status, REQ_STATUS_ERROR);
}
list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
list_move(&req->req_list, &cancel_list);
- req->status = REQ_STATUS_ERROR;
+ WRITE_ONCE(req->status, REQ_STATUS_ERROR);
}
spin_unlock(&m->req_lock);
@@ -466,7 +466,7 @@ static void p9_write_work(struct work_struct *work)
req = list_entry(m->unsent_req_list.next, struct p9_req_t,
req_list);
- req->status = REQ_STATUS_SENT;
+ WRITE_ONCE(req->status, REQ_STATUS_SENT);
p9_debug(P9_DEBUG_TRANS, "move req %p\n", req);
list_move_tail(&req->req_list, &m->req_list);
@@ -675,7 +675,7 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req)
return m->err;
spin_lock(&m->req_lock);
- req->status = REQ_STATUS_UNSENT;
+ WRITE_ONCE(req->status, REQ_STATUS_UNSENT);
list_add_tail(&req->req_list, &m->unsent_req_list);
spin_unlock(&m->req_lock);
@@ -702,7 +702,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
if (req->status == REQ_STATUS_UNSENT) {
list_del(&req->req_list);
- req->status = REQ_STATUS_FLSHD;
+ WRITE_ONCE(req->status, REQ_STATUS_FLSHD);
p9_req_put(client, req);
ret = 0;
}
@@ -731,7 +731,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
* remove it from the list.
*/
list_del(&req->req_list);
- req->status = REQ_STATUS_FLSHD;
+ WRITE_ONCE(req->status, REQ_STATUS_FLSHD);
spin_unlock(&m->req_lock);
p9_req_put(client, req);
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 33a9ac6f2d55..83f9100d46bf 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -506,7 +506,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
* because doing if after could erase the REQ_STATUS_RCVD
* status in case of a very fast reply.
*/
- req->status = REQ_STATUS_SENT;
+ WRITE_ONCE(req->status, REQ_STATUS_SENT);
err = ib_post_send(rdma->qp, &wr, NULL);
if (err)
goto send_error;
@@ -516,7 +516,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
/* Handle errors that happened during or while preparing the send: */
send_error:
- req->status = REQ_STATUS_ERROR;
+ WRITE_ONCE(req->status, REQ_STATUS_ERROR);
kfree(c);
p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err);
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 19bccfa0d593..3c27ffb781e3 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -262,7 +262,7 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n");
- req->status = REQ_STATUS_SENT;
+ WRITE_ONCE(req->status, REQ_STATUS_SENT);
req_retry:
spin_lock_irqsave(&chan->lock, flags);
@@ -468,7 +468,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
inlen = n;
}
}
- req->status = REQ_STATUS_SENT;
+ WRITE_ONCE(req->status, REQ_STATUS_SENT);
req_retry_pinned:
spin_lock_irqsave(&chan->lock, flags);
@@ -531,9 +531,10 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
spin_unlock_irqrestore(&chan->lock, flags);
kicked = 1;
p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n");
- err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD);
+ err = wait_event_killable(req->wq,
+ READ_ONCE(req->status) >= REQ_STATUS_RCVD);
// RERROR needs reply (== error string) in static data
- if (req->status == REQ_STATUS_RCVD &&
+ if (READ_ONCE(req->status) == REQ_STATUS_RCVD &&
unlikely(req->rc.sdata[4] == P9_RERROR))
handle_rerror(req, in_hdr_len, offs, in_pages);
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index aaa5fd364691..cf1b89ba522b 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -157,7 +157,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req)
&masked_prod, masked_cons,
XEN_9PFS_RING_SIZE(ring));
- p9_req->status = REQ_STATUS_SENT;
+ WRITE_ONCE(p9_req->status, REQ_STATUS_SENT);
virt_wmb(); /* write ring before updating pointer */
prod += size;
ring->intf->out_prod = prod;
@@ -212,7 +212,7 @@ static void p9_xen_response(struct work_struct *work)
dev_warn(&priv->dev->dev,
"requested packet size too big: %d for tag %d with capacity %zd\n",
h.size, h.tag, req->rc.capacity);
- req->status = REQ_STATUS_ERROR;
+ WRITE_ONCE(req->status, REQ_STATUS_ERROR);
goto recv_error;
}
--
2.38.1
On Monday, December 5, 2022 1:47:56 PM CET Dominique Martinet wrote:
> KCSAN reported a race between writing req->status in p9_client_cb and
> accessing it in p9_client_rpc's wait_event.
>
> Accesses to req itself are protected by the data barriers (writing req
> fields, write barrier, writing status // reading status, read barrier,
> reading other req fields), but the status accesses themselves apparently
> also must be annotated properly with WRITE_ONCE/READ_ONCE when we
> access them without locks.
>
> It follows that:
> - error paths writing status in various threads can all notify
> p9_client_rpc, so these all also need WRITE_ONCE
> - there's a similar read loop in trans_virtio for the zc case that also
> needs READ_ONCE
> - other reads in trans_fd should be protected by the trans_fd lock and
> the lists' state machine, as the corresponding writers are all within
> trans_fd and should be under the same lock. If KCSAN complains about
> them we likely have something else to fix as well, so it's better to
> leave them unmarked and look again if required.
>
> Reported-by: Naresh Kamboju <[email protected]>
> Suggested-by: Marco Elver <[email protected]>
> Signed-off-by: Dominique Martinet <[email protected]>
I must have missed the prior discussion, but looking at the suggested
solution: if there is no lock, then adding READ_ONCE() and WRITE_ONCE() would
not fix cross-CPU issues, as those would not have a memory barrier in that
case.
Shouldn't that therefore rather be at least smp_load_acquire() and
smp_store_release() at such places instead?
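To illustrate (only a rough sketch of the callback/waiter pair, not a
tested change), what I have in mind is something like:

    /* p9_client_cb(): the release store orders all earlier writes to
     * req before the status update becomes visible */
    smp_store_release(&req->status, status);
    wake_up(&req->wq);

    /* p9_client_rpc(): the acquire load pairs with the release above,
     * so req fields read afterwards are at least as new as the status
     * value seen here */
    err = wait_event_killable(req->wq,
                smp_load_acquire(&req->status) >= REQ_STATUS_RCVD);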
Best regards,
Christian Schoenebeck
Christian Schoenebeck wrote on Mon, Dec 05, 2022 at 04:19:01PM +0100:
> I must have missed the prior discussion, but looking at the suggested
Good point, I'll add a link to the report as well...
It's this thread:
https://lkml.kernel.org/r/CA+G9fYsK5WUxs6p9NaE4e3p7ew_+s0SdW0+FnBgiLWdYYOvoMg@mail.gmail.com
> solution: if there is no lock, then adding READ_ONCE() and WRITE_ONCE() would
> not fix cross-CPU issues, as those would not have a memory barrier in that
> case.
>
> Shouldn't that therefore rather be at least smp_load_acquire() and
> smp_store_release() at such places instead?
The barrier is here -- I think we're just protecting against compiler
reordering or if on some arch the store isn't actually atomic.
This code path actually was broken before I added the barrier a while
ago (2b6e72ed747f68a03), as I was observing some rare but very real
errors on a big server, so I'm fairly confident that at least for x86_64
the generated code isn't too bad, but if KCSAN helps catch stuff I
won't complain.
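Condensed, the pairing looks roughly like this (paraphrasing the code
around the hunks above, with the new annotations):

    /* p9_client_cb(), writer: */
    /* ... fill in req->rc / req->t_err ... */
    smp_wmb();                      /* payload before status */
    WRITE_ONCE(req->status, status);
    wake_up(&req->wq);

    /* p9_client_rpc(), reader: */
    err = wait_event_killable(req->wq,
                READ_ONCE(req->status) >= REQ_STATUS_RCVD);
    smp_rmb();                      /* pairs with the wmb in the callback */
    /* only now look at req->rc / req->t_err */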
--
Dominique
On Monday, December 5, 2022 11:27:48 PM CET Dominique Martinet wrote:
> Christian Schoenebeck wrote on Mon, Dec 05, 2022 at 04:19:01PM +0100:
> > I must have missed the prior discussion, but looking at the suggested
>
> Good point, I'll add a link to the report as well...
> It's this thread:
> https://lkml.kernel.org/r/CA+G9fYsK5WUxs6p9NaE4e3p7ew_+s0SdW0+FnBgiLWdYYOvoMg@mail.gmail.com
>
> > solution: if there is no lock, then adding READ_ONCE() and WRITE_ONCE() would
> > not fix cross-CPU issues, as those would not have a memory barrier in that
> > case.
> >
> > Shouldn't that therefore rather be at least smp_load_acquire() and
> > smp_store_release() at such places instead?
>
> The barrier is here --
Right, looks like most of it should be fine. Maybe p9_client_zc_rpc() needs a
barrier as well?
> I think we're just protecting against compiler
> reordering or if on some arch the store isn't actually atomic.
And access order within the same thread.
> This code path actually was broken before I added the barrier a while
> ago (2b6e72ed747f68a03), as I was observing some rare but very real
> errors on a big server so I'm fairly confident that for at least x86_64
> the generated code isn't too bad, but if KCSAN helps catching stuff I
> won't complain.
What about p9_tag_alloc()?
Christian Schoenebeck wrote on Thu, Dec 08, 2022 at 04:51:27PM +0100:
> Right, looks like most of it should be fine. Maybe p9_client_zc_rpc() needs a
> barrier as well?
Good point, the request is used without any other lock after the
wait_event on req->status in trans_virtio.c;
I'll send a separate patch for it later today.
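(Presumably the same smp_rmb() as in p9_client_rpc right after the
wait_event, roughly like this -- untested sketch, the actual patch may
differ:)

    err = wait_event_killable(req->wq,
                READ_ONCE(req->status) >= REQ_STATUS_RCVD);
    /* make sure req is coherent -- pairs with the wmb in p9_client_cb */
    smp_rmb();
    // RERROR needs reply (== error string) in static data
    if (READ_ONCE(req->status) == REQ_STATUS_RCVD &&
        unlikely(req->rc.sdata[4] == P9_RERROR))
            handle_rerror(req, in_hdr_len, offs, in_pages);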
> > I think we're just protecting against compiler
> > reordering or if on some arch the store isn't actually atomic.
>
> And access order within the same thread.
In this case afaik the barrier also does that? There would be no point
if a write barrier allowed a write placed before the barrier to be
reordered after it...
> > This code path actually was broken before I added the barrier a while
> > ago (2b6e72ed747f68a03), as I was observing some rare but very real
> > errors on a big server so I'm fairly confident that for at least x86_64
> > the generated code isn't too bad, but if KCSAN helps catching stuff I
> > won't complain.
>
> What about p9_tag_alloc()?
I think that one's ok: it happens during the allocation before the
request is enqueued in the idr, so it should be race-free by definition.
tools/memory-model/Documentation/access-marking.txt says
"Initialization-time and cleanup-time accesses" should use plain
C-language accesses, so I stuck to that.
cheers,
--
Dominique
On Friday, December 9, 2022 12:50:46 AM CET Dominique Martinet wrote:
> Christian Schoenebeck wrote on Thu, Dec 08, 2022 at 04:51:27PM +0100:
> > Right, looks like most of it should be fine. Maybe p9_client_zc_rpc() needs a
> > barrier as well?
>
> Good point, the request is used without any other lock after the
> wait_event on req->status in trans_virtio.c;
> I'll send a separate patch for it later today.
>
>
> > > I think we're just protecting against compiler
> > > reordering or if on some arch the store isn't actually atomic.
> >
> > And access order within the same thread.
>
> In this case afaik the barrier also does that? There would be no point
> if a write barrier allowed a write placed before the barrier to be
> reordered after it...
If it's about a single access, right. However, when there are multiple
accesses (e.g. within an expression) and plain C accesses are used, the
compiler is still free to reorder those accesses relative to the order
they were coded in.
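A classic illustration of what plain accesses permit (in the spirit of
the access-marking.txt examples, not our actual code): a plain load in
a loop condition may be hoisted out of the loop entirely:

    /* plain access: the compiler may load req->status once and then
     * spin on the stale value forever ("load fusing") */
    while (req->status < REQ_STATUS_RCVD)
        cpu_relax();

    /* READ_ONCE() forces a fresh load on every iteration */
    while (READ_ONCE(req->status) < REQ_STATUS_RCVD)
        cpu_relax();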
> > > This code path actually was broken before I added the barrier a while
> > > ago (2b6e72ed747f68a03), as I was observing some rare but very real
> > > errors on a big server so I'm fairly confident that for at least x86_64
> > > the generated code isn't too bad, but if KCSAN helps catching stuff I
> > > won't complain.
> >
> > What about p9_tag_alloc()?
>
> I think that one's ok: it happens during the allocation before the
> request is enqueued in the idr, so it should be race-free by definition.
>
> tools/memory-model/Documentation/access-marking.txt says
> "Initialization-time and cleanup-time accesses" should use plain
> C-language accesses, so I stuck to that.
When it is freshly allocated it is safe, but the object may also come from
a pool here. It's probably not going to cause an issue here, just saying.
Christian Schoenebeck wrote on Fri, Dec 09, 2022 at 02:45:51PM +0100:
> > > What about p9_tag_alloc()?
> >
> > I think that one's ok: it happens during the allocation before the
> > > request is enqueued in the idr, so it should be race-free by definition.
> >
> > tools/memory-model/Documentation/access-marking.txt says
> > "Initialization-time and cleanup-time accesses" should use plain
> > C-language accesses, so I stuck to that.
>
> When it is freshly allocated it is safe, but the object may also come from
> a pool here. It's probably not going to cause an issue here, just saying.
If it comes from the pool then it is gated by the refcount... But that
would require a similar barrier indeed (init stuff, wmb, init refcount
// get req + check refcount, rmb, read stuff e.g. tag); just a
write_once would not help.
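i.e. the required pattern would be roughly (abstract sketch of what
would be needed, not what the code currently does):

    /* alloc/reuse side: */
    req->tc.tag = tag;
    /* ... other init ... */
    smp_wmb();                          /* init before refcount */
    refcount_set(&req->refcount, 2);

    /* lookup side: */
    if (!p9_req_try_get(req))           /* grabs a ref iff refcount != 0 */
        goto again;
    smp_rmb();                          /* pairs with the wmb above */
    if (req->tc.tag != tag) { /* ... */ }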
For the init side I assume unlocking c->lock acts as a write barrier
after tag is set, which is conveniently the last step, but we'd need a
read barrier here in tag lookup:
--------
diff --git a/net/9p/client.c b/net/9p/client.c
index fef6516a0639..68585ad9003c 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -363,6 +363,7 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag)
*/
if (!p9_req_try_get(req))
goto again;
+ smp_rmb();
if (req->tc.tag != tag) {
p9_req_put(c, req);
goto again;
--------
OTOH this cannot happen with a normal server (a req should only be looked
up after it has been sent to the server and comes back, which involves a
few round trips and a few locks in the recv paths for tcp); but if syzbot
tries hard enough I guess that could be hit...
I don't have a strong opinion on this: I don't think anything really bad
can happen here as long as the refcount is correct (status is read under
lock when it matters before extra decrements of the refcount, and writes
to the buffer itself are safe from a memory pov), even if it's obviously
not correct strictly speaking.
(And I have no way of measuring what impact that extra barrier would have
tbh; for virtio at least lookup is actually never used...)
--
Dominique
On Friday, December 9, 2022 10:12:41 PM CET Dominique Martinet wrote:
> Christian Schoenebeck wrote on Fri, Dec 09, 2022 at 02:45:51PM +0100:
> > > > What about p9_tag_alloc()?
> > >
> > > I think that one's ok: it happens during the allocation before the
> > > request is enqueued in the idr, so it should be race-free by defition.
> > >
> > > tools/memory-model/Documentation/access-marking.txt says
> > > "Initialization-time and cleanup-time accesses" should use plain
> > > C-language accesses, so I stuck to that.
> >
> > When it is allocated then it is safe, but the object may also come from a pool
> > here. It's probably not likely to cause an issue here, just saying.
>
> If it comes from the pool then it is gated by the refcount... But that
> would require a similar barrier indeed (init stuff, wmb, init refcount
> // get req + check refcount, rmb, read stuff e.g. tag); just a
> write_once would not help.
>
> For the init side I assume unlocking c->lock acts as a write barrier
> after tag is set, which is conveniently the last step, but we'd need a
> read barrier here in tag lookup:
> --------
> diff --git a/net/9p/client.c b/net/9p/client.c
> index fef6516a0639..68585ad9003c 100644
> --- a/net/9p/client.c
> +++ b/net/9p/client.c
> @@ -363,6 +363,7 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag)
> */
> if (!p9_req_try_get(req))
> goto again;
> + smp_rmb();
> if (req->tc.tag != tag) {
> p9_req_put(c, req);
> goto again;
> --------
>
> OTOH this cannot happen with a normal server (a req should only be looked
> up after it has been sent to the server and comes back, which involves a
> few round trip and a few locks in the recv paths for tcp); but if syzbot
> tries hard enough I guess that could be hit...
> I don't have a strong opinion on this: I don't think anything really bad
> can happen here as long as the refcount is correct (status is read under
> lock when it matters before extra decrements of the refcount, and writes
> to the buffer itself are safe from a memory pov), even if it's obviously
> not correct strictly speaking.
> (And I have no way of measuring what impact that extra barrier would have
> tbh; for virtio at least lookup is actually never used...)
Yeah agreed, this was more of a theoretical issue. With the other memory
barrier patch you already posted:
Reviewed-by: Christian Schoenebeck <[email protected]>