I've been working on async RPCs for a while and never had time to debug
the last issues this had, but by limiting the async clunks to failures
the impact is drastically smaller and I've not been able to reproduce
any more bug so far.
This will require some more testing and I'm tempted to say this is not
worth rushing this into the merge window next week-ish; the new problem
Jens reported with task_work isn't really new and I'd rather get this
right than rush new bugs in given the sour experience I've had with this
patch series... Hopefully it'll get in this time.
With that in mind I plan to take the patches in to my -next branch after
the merge window, so this has time to get reviewed first.
I'd like to measure impact on performance as well, but really spent way
more time on this than I already have, so that'll have to wait a bit.
The only good thing here is that this shouldn't conflict with Eric's
rework...
A few problems I ran into today:
- not doing async clunks for retries led to massive fid leaks as soon
as I started doing async flush; I've described that in the clunk patch
but basically all the servers we tested with always replied with the
clunk before the flush, so the first clunk was never an error, so there
wasn't a need to retry at all... Because if it had it'd fall with
ERESTARTSYS immediately again, and it didn't.
This isn't perfect, but hopefully should be good enough to avoid too
many problems.
- for flush itself, the waiting for all in-flight flushes before
sending new rpc isn't great, but I don't have any better idea.
I think in the general case we could get away with not waiting at all
most of the time (check if there are any pending flush sent by the
current tid?), but the current approach by making the thread not
killable at all (!) is much more conservative, so I feel like we
should try this much even if it costs a bit.
Anyway, here goes nothing. Comments please!
Dominique Martinet (5):
9p/net: move code in preparation of async rpc
9p/net: share pooled receive buffers size exception in p9_tag_alloc
9p/net: implement asynchronous rpc skeleton
9p/net: add async clunk for retries
9p/net: make flush asynchronous
include/net/9p/client.h | 15 +-
net/9p/client.c | 508 +++++++++++++++++++++++++---------------
2 files changed, 339 insertions(+), 184 deletions(-)
--
2.39.1
This commit containers no code change:
- move p9_fid_* higher in code as p9_fid_destroy will be used
in async callback
- move p9_client_flush as it will no longer call p9_client_rpc
and can simplify forward declaration a bit later
This just simplifies the next commits to make diffs cleaner.
Signed-off-by: Dominique Martinet <[email protected]>
---
net/9p/client.c | 198 ++++++++++++++++++++++++------------------------
1 file changed, 99 insertions(+), 99 deletions(-)
diff --git a/net/9p/client.c b/net/9p/client.c
index 622ec6a586ee..53ebd07c36ee 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -428,6 +428,66 @@ static void p9_tag_cleanup(struct p9_client *c)
rcu_read_unlock();
}
+static struct p9_fid *p9_fid_create(struct p9_client *clnt)
+{
+ int ret;
+ struct p9_fid *fid;
+
+ p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt);
+ fid = kzalloc(sizeof(*fid), GFP_KERNEL);
+ if (!fid)
+ return NULL;
+
+ fid->mode = -1;
+ fid->uid = current_fsuid();
+ fid->clnt = clnt;
+ refcount_set(&fid->count, 1);
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&clnt->lock);
+ ret = idr_alloc_u32(&clnt->fids, fid, &fid->fid, P9_NOFID - 1,
+ GFP_NOWAIT);
+ spin_unlock_irq(&clnt->lock);
+ idr_preload_end();
+ if (!ret) {
+ trace_9p_fid_ref(fid, P9_FID_REF_CREATE);
+ return fid;
+ }
+
+ kfree(fid);
+ return NULL;
+}
+
+static void p9_fid_destroy(struct p9_fid *fid)
+{
+ struct p9_client *clnt;
+ unsigned long flags;
+
+ p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid);
+ trace_9p_fid_ref(fid, P9_FID_REF_DESTROY);
+ clnt = fid->clnt;
+ spin_lock_irqsave(&clnt->lock, flags);
+ idr_remove(&clnt->fids, fid->fid);
+ spin_unlock_irqrestore(&clnt->lock, flags);
+ kfree(fid->rdir);
+ kfree(fid);
+}
+
+/* We also need to export tracepoint symbols for tracepoint_enabled() */
+EXPORT_TRACEPOINT_SYMBOL(9p_fid_ref);
+
+void do_trace_9p_fid_get(struct p9_fid *fid)
+{
+ trace_9p_fid_ref(fid, P9_FID_REF_GET);
+}
+EXPORT_SYMBOL(do_trace_9p_fid_get);
+
+void do_trace_9p_fid_put(struct p9_fid *fid)
+{
+ trace_9p_fid_ref(fid, P9_FID_REF_PUT);
+}
+EXPORT_SYMBOL(do_trace_9p_fid_put);
+
/**
* p9_client_cb - call back from transport to client
* @c: client state
@@ -570,6 +630,45 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
return err;
}
+static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
+ int8_t type, uint t_size, uint r_size,
+ const char *fmt, va_list ap)
+{
+ int err;
+ struct p9_req_t *req;
+ va_list apc;
+
+ p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type);
+
+ /* we allow for any status other than disconnected */
+ if (c->status == Disconnected)
+ return ERR_PTR(-EIO);
+
+ /* if status is begin_disconnected we allow only clunk request */
+ if (c->status == BeginDisconnect && type != P9_TCLUNK)
+ return ERR_PTR(-EIO);
+
+ va_copy(apc, ap);
+ req = p9_tag_alloc(c, type, t_size, r_size, fmt, apc);
+ va_end(apc);
+ if (IS_ERR(req))
+ return req;
+
+ /* marshall the data */
+ p9pdu_prepare(&req->tc, req->tc.tag, type);
+ err = p9pdu_vwritef(&req->tc, c->proto_version, fmt, ap);
+ if (err)
+ goto reterr;
+ p9pdu_finalize(c, &req->tc);
+ trace_9p_client_req(c, type, req->tc.tag);
+ return req;
+reterr:
+ p9_req_put(c, req);
+ /* We have to put also the 2nd reference as it won't be used */
+ p9_req_put(c, req);
+ return ERR_PTR(err);
+}
+
static struct p9_req_t *
p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...);
@@ -613,45 +712,6 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
return 0;
}
-static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
- int8_t type, uint t_size, uint r_size,
- const char *fmt, va_list ap)
-{
- int err;
- struct p9_req_t *req;
- va_list apc;
-
- p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type);
-
- /* we allow for any status other than disconnected */
- if (c->status == Disconnected)
- return ERR_PTR(-EIO);
-
- /* if status is begin_disconnected we allow only clunk request */
- if (c->status == BeginDisconnect && type != P9_TCLUNK)
- return ERR_PTR(-EIO);
-
- va_copy(apc, ap);
- req = p9_tag_alloc(c, type, t_size, r_size, fmt, apc);
- va_end(apc);
- if (IS_ERR(req))
- return req;
-
- /* marshall the data */
- p9pdu_prepare(&req->tc, req->tc.tag, type);
- err = p9pdu_vwritef(&req->tc, c->proto_version, fmt, ap);
- if (err)
- goto reterr;
- p9pdu_finalize(c, &req->tc);
- trace_9p_client_req(c, type, req->tc.tag);
- return req;
-reterr:
- p9_req_put(c, req);
- /* We have to put also the 2nd reference as it won't be used */
- p9_req_put(c, req);
- return ERR_PTR(err);
-}
-
/**
* p9_client_rpc - issue a request and wait for a response
* @c: client session
@@ -838,66 +898,6 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
return ERR_PTR(safe_errno(err));
}
-static struct p9_fid *p9_fid_create(struct p9_client *clnt)
-{
- int ret;
- struct p9_fid *fid;
-
- p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt);
- fid = kzalloc(sizeof(*fid), GFP_KERNEL);
- if (!fid)
- return NULL;
-
- fid->mode = -1;
- fid->uid = current_fsuid();
- fid->clnt = clnt;
- refcount_set(&fid->count, 1);
-
- idr_preload(GFP_KERNEL);
- spin_lock_irq(&clnt->lock);
- ret = idr_alloc_u32(&clnt->fids, fid, &fid->fid, P9_NOFID - 1,
- GFP_NOWAIT);
- spin_unlock_irq(&clnt->lock);
- idr_preload_end();
- if (!ret) {
- trace_9p_fid_ref(fid, P9_FID_REF_CREATE);
- return fid;
- }
-
- kfree(fid);
- return NULL;
-}
-
-static void p9_fid_destroy(struct p9_fid *fid)
-{
- struct p9_client *clnt;
- unsigned long flags;
-
- p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid);
- trace_9p_fid_ref(fid, P9_FID_REF_DESTROY);
- clnt = fid->clnt;
- spin_lock_irqsave(&clnt->lock, flags);
- idr_remove(&clnt->fids, fid->fid);
- spin_unlock_irqrestore(&clnt->lock, flags);
- kfree(fid->rdir);
- kfree(fid);
-}
-
-/* We also need to export tracepoint symbols for tracepoint_enabled() */
-EXPORT_TRACEPOINT_SYMBOL(9p_fid_ref);
-
-void do_trace_9p_fid_get(struct p9_fid *fid)
-{
- trace_9p_fid_ref(fid, P9_FID_REF_GET);
-}
-EXPORT_SYMBOL(do_trace_9p_fid_get);
-
-void do_trace_9p_fid_put(struct p9_fid *fid)
-{
- trace_9p_fid_ref(fid, P9_FID_REF_PUT);
-}
-EXPORT_SYMBOL(do_trace_9p_fid_put);
-
static int p9_client_version(struct p9_client *c)
{
int err = 0;
--
2.39.1
The async RPC code will also use an automatic size and this bit of code
can be shared, as p9_tag_alloc still knows what client we alloc for.
Signed-off-by: Dominique Martinet <[email protected]>
---
net/9p/client.c | 23 +++++++++++------------
1 file changed, 11 insertions(+), 12 deletions(-)
diff --git a/net/9p/client.c b/net/9p/client.c
index 53ebd07c36ee..4e5238db4a7a 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -282,8 +282,15 @@ p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size,
t_size ?: p9_msg_buf_size(c, type, fmt, apc));
va_end(apc);
- alloc_rsize = min_t(size_t, c->msize,
- r_size ?: p9_msg_buf_size(c, type + 1, fmt, ap));
+ /* Currently RDMA transport is excluded from response message size
+ * optimization, as it cannot cope with it due to its pooled response
+ * buffers (this has no impact on request size)
+ */
+ if (r_size == 0 && c->trans_mod->pooled_rbuffers)
+ alloc_rsize = c->msize;
+ else
+ alloc_rsize = min_t(size_t, c->msize,
+ r_size ?: p9_msg_buf_size(c, type + 1, fmt, ap));
if (!req)
return ERR_PTR(-ENOMEM);
@@ -728,18 +735,10 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
int sigpending, err;
unsigned long flags;
struct p9_req_t *req;
- /* Passing zero for tsize/rsize to p9_client_prepare_req() tells it to
- * auto determine an appropriate (small) request/response size
- * according to actual message data being sent. Currently RDMA
- * transport is excluded from this response message size optimization,
- * as it would not cope with it, due to its pooled response buffers
- * (using an optimized request size for RDMA as well though).
- */
- const uint tsize = 0;
- const uint rsize = c->trans_mod->pooled_rbuffers ? c->msize : 0;
va_start(ap, fmt);
- req = p9_client_prepare_req(c, type, tsize, rsize, fmt, ap);
+ /* auto determine an appropriate request/response size */
+ req = p9_client_prepare_req(c, type, 0, 0, fmt, ap);
va_end(ap);
if (IS_ERR(req))
return req;
--
2.39.1
Add p9_client_async_rpc that will let us send an rpc without waiting
for the reply, as well as an async callback hook.
The callback is called, but nothing is ever put in the list at this
point and p9_client_async_rpc() is unused.
Previous version of this patch here[1] used to implement the async check
separately from the callback, but we will want to be notified when flush
has been processed.
Link: http://lkml.kernel.org/r/[email protected] (v1)
Signed-off-by: Dominique Martinet <[email protected]>
---
include/net/9p/client.h | 9 ++++-
net/9p/client.c | 76 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 84 insertions(+), 1 deletion(-)
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 78ebcf782ce5..348ea191ac66 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -72,7 +72,8 @@ enum p9_req_status_t {
* @wq: wait_queue for the client to block on for this request
* @tc: the request fcall structure
* @rc: the response fcall structure
- * @req_list: link for higher level objects to chain requests
+ * @req_list: link for transports to chain requests (used by trans_fd)
+ * @async_list: link used to check on async requests
*/
struct p9_req_t {
int status;
@@ -82,6 +83,7 @@ struct p9_req_t {
struct p9_fcall tc;
struct p9_fcall rc;
struct list_head req_list;
+ struct list_head async_list;
};
/**
@@ -92,6 +94,9 @@ struct p9_req_t {
* @trans_mod: module API instantiated with this client
* @status: connection state
* @trans: tranport instance state and API
+ * @fcall_cache: kmem cache for client request data (msize-specific)
+ * @async_req_lock: protects @async_req_list
+ * @async_req_list: list of requests waiting a reply
* @fids: All active FID handles
* @reqs: All active requests.
* @name: node name used as client id
@@ -107,6 +112,8 @@ struct p9_client {
enum p9_trans_status status;
void *trans;
struct kmem_cache *fcall_cache;
+ spinlock_t async_req_lock;
+ struct list_head async_req_list;
union {
struct {
diff --git a/net/9p/client.c b/net/9p/client.c
index 4e5238db4a7a..3235543c1884 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -311,6 +311,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size,
refcount_set(&req->refcount, 0);
init_waitqueue_head(&req->wq);
INIT_LIST_HEAD(&req->req_list);
+ INIT_LIST_HEAD(&req->async_list);
idr_preload(GFP_NOFS);
spin_lock_irq(&c->lock);
@@ -495,6 +496,27 @@ void do_trace_9p_fid_put(struct p9_fid *fid)
}
EXPORT_SYMBOL(do_trace_9p_fid_put);
+/**
+ * p9_client_async_cb -- handle async requests in cb
+ * @c: client state
+ * @req: request received
+ */
+static void p9_client_async_cb(struct p9_client *c, struct p9_req_t *req)
+{
+ unsigned long flags;
+
+ /* Nothing to do for non-async requests */
+ if (likely(list_empty(&req->async_list)))
+ return;
+
+ WARN(1, "Async request received with tc.id %d\n", req->tc.id);
+
+ spin_lock_irqsave(&c->async_req_lock, flags);
+ list_del(&req->async_list);
+ spin_unlock_irqrestore(&c->async_req_lock, flags);
+ p9_tag_remove(c, req);
+}
+
/**
* p9_client_cb - call back from transport to client
* @c: client state
@@ -506,6 +528,8 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
{
p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc.tag);
+ p9_client_async_cb(c, req);
+
/* This barrier is needed to make sure any change made to req before
* the status change is visible to another thread
*/
@@ -676,6 +700,54 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
return ERR_PTR(err);
}
+static struct p9_req_t *
+p9_client_async_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
+{
+ va_list ap;
+ int err;
+ struct p9_req_t *req;
+
+ va_start(ap, fmt);
+ /* auto determine an appropriate request/response size */
+ req = p9_client_prepare_req(c, type, 0, 0, fmt, ap);
+ va_end(ap);
+ if (IS_ERR(req))
+ return req;
+
+ err = c->trans_mod->request(c, req);
+ if (err < 0) {
+ /* bail out without flushing... */
+ p9_req_put(c, req);
+ p9_tag_remove(c, req);
+ if (err != -ERESTARTSYS && err != -EFAULT)
+ c->status = Disconnected;
+ return ERR_PTR(safe_errno(err));
+ }
+
+ return req;
+}
+
+static void p9_client_cleanup_async(struct p9_client *c)
+{
+ struct p9_req_t *req, *nreq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&c->async_req_lock, flags);
+ list_for_each_entry_safe(req, nreq, &c->async_req_list, async_list) {
+ if (req->status < REQ_STATUS_RCVD) {
+ p9_debug(P9_DEBUG_ERROR,
+ "final async handler found req tag %d type %d status %d\n",
+ req->tc.tag, req->tc.id, req->status);
+ if (c->trans_mod->cancelled)
+ c->trans_mod->cancelled(c, req);
+ /* won't receive reply now */
+ p9_req_put(c, req);
+ }
+ p9_client_async_cb(c, req);
+ }
+ spin_unlock_irqrestore(&c->async_req_lock, flags);
+}
+
static struct p9_req_t *
p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...);
@@ -983,6 +1055,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
memcpy(clnt->name, client_id, strlen(client_id) + 1);
spin_lock_init(&clnt->lock);
+ spin_lock_init(&clnt->async_req_lock);
+ INIT_LIST_HEAD(&clnt->async_req_list);
idr_init(&clnt->fids);
idr_init(&clnt->reqs);
@@ -1059,6 +1133,8 @@ void p9_client_destroy(struct p9_client *clnt)
v9fs_put_trans(clnt->trans_mod);
+ p9_client_cleanup_async(clnt);
+
idr_for_each_entry(&clnt->fids, fid, id) {
pr_info("Found fid %d not clunked\n", fid->fid);
p9_fid_destroy(fid);
--
2.39.1
clunks in 9p are absolute: once the request is sent to the server,
the local end should consider the fid freed.
Unfortunately our retry logic is hazardous at best, if we got
ERESTARTSYS and we call p9_client_rpc again odds are we'll just
notice the same signal is pending again and fail, leaking the fd.
This used to work, because we never actually got to second retry with
the flush logic, as most servers ignore the flush request and send the
clunk reply first, in which case p9_client_rpc() will return
successfully even if it got interrupted.
Once async flush come into play we will start leaking fids everytime a
task is interrupted, avoid this by resending the flush and handling it
asynchronously.
Note that it's possible that the server got the clunk twice, but it will
be a clunk for a fid that is not used (client controls fid number
allocation), and the async cb does not consider the client return value.
We do not make all clunks asynchronous because some tests failed when
this was tried in [1] ; clunk in 9p should not act like a barrier like
close() does (e.g. flushing contents) but I have no time to debug.
This might still be a problem with the retries, but we are still better
than the current code which gives up and leaks the fid.
What should probably be done next is to make all calls to p9_fid_put
check for errors, at which point the failed clunk can be returned as an
error to userspace which could then notice the error. (Not like most
programs check for close() return value in the first place...)
Link: http://lkml.kernel.org/r/[email protected] [1]
Signed-off-by: Dominique Martinet <[email protected]>
---
include/net/9p/client.h | 4 +++
net/9p/client.c | 58 ++++++++++++++++++++++++++++++++---------
2 files changed, 49 insertions(+), 13 deletions(-)
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 348ea191ac66..dd493f7b8059 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -74,6 +74,7 @@ enum p9_req_status_t {
* @rc: the response fcall structure
* @req_list: link for transports to chain requests (used by trans_fd)
* @async_list: link used to check on async requests
+ * @clunked_fid: for clunk, points to fid
*/
struct p9_req_t {
int status;
@@ -84,6 +85,9 @@ struct p9_req_t {
struct p9_fcall rc;
struct list_head req_list;
struct list_head async_list;
+ union {
+ struct p9_fid *clunked_fid;
+ };
};
/**
diff --git a/net/9p/client.c b/net/9p/client.c
index 3235543c1884..f4b85c33c465 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -509,7 +509,13 @@ static void p9_client_async_cb(struct p9_client *c, struct p9_req_t *req)
if (likely(list_empty(&req->async_list)))
return;
- WARN(1, "Async request received with tc.id %d\n", req->tc.id);
+ if (req->tc.id == P9_TCLUNK) {
+ p9_debug(P9_DEBUG_MUX, "async destroying fid %d\n",
+ req->clunked_fid->fid);
+ p9_fid_destroy(req->clunked_fid);
+ } else {
+ WARN(1, "Async request received with tc.id %d\n", req->tc.id);
+ }
spin_lock_irqsave(&c->async_req_lock, flags);
list_del(&req->async_list);
@@ -1497,16 +1503,35 @@ int p9_client_fsync(struct p9_fid *fid, int datasync)
}
EXPORT_SYMBOL(p9_client_fsync);
+static int p9_client_async_clunk(struct p9_fid *fid)
+{
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ p9_debug(P9_DEBUG_9P, ">>> async TCLUNK fid %d\n", fid->fid);
+ clnt = fid->clnt;
+
+ req = p9_client_async_rpc(clnt, P9_TCLUNK, "d", fid->fid);
+ if (IS_ERR(req)) {
+ /* leak fid until umount... */
+ return PTR_ERR(req);
+ }
+
+ req->clunked_fid = fid;
+ spin_lock_irq(&clnt->lock);
+ list_add(&req->async_list, &clnt->async_req_list);
+ spin_unlock_irq(&clnt->lock);
+
+ return 0;
+}
+
int p9_client_clunk(struct p9_fid *fid)
{
int err;
struct p9_client *clnt;
struct p9_req_t *req;
- int retries = 0;
-again:
- p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n",
- fid->fid, retries);
+ p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d\n", fid->fid);
err = 0;
clnt = fid->clnt;
@@ -1520,16 +1545,23 @@ int p9_client_clunk(struct p9_fid *fid)
p9_req_put(clnt, req);
error:
- /* Fid is not valid even after a failed clunk
- * If interrupted, retry once then give up and
- * leak fid until umount.
+ /* Fid is not valid even after a failed clunk, but we do not
+ * know if we successfully sent the request so send again
+ * asynchronously ('err' cannot differentiate between:
+ * failure on the client side before send, interrupted
+ * before we sent the request, interrupted after we sent
+ * the request and error from the server)
+ * In doubt it's better to try to ask again (for the
+ * 'interrupted before we sent the request' case), other
+ * patterns will just ignore again.
+ * Return the original error though.
*/
- if (err == -ERESTARTSYS) {
- if (retries++ == 0)
- goto again;
- } else {
- p9_fid_destroy(fid);
+ if (err) {
+ p9_client_async_clunk(fid);
+ return err;
}
+
+ p9_fid_destroy(fid);
return err;
}
EXPORT_SYMBOL(p9_client_clunk);
--
2.39.1
Make the client flush asynchronous so we don't need to ignore signals in
rpc functions:
- on signal, send a flush request as we previously did but use the new
asynchronous helper and return immediately
- when the reply has been received or connection is destroyed, free
both tags in the handler
Since the 9p client has no way of knowing if a request has been replayed
after flush, we also need to wait for all flush to have been processed
in order to avoid the following example scenario:
1. Send write#1 [0; 1MB[
2. Receive signal and interrupt the write, code sends flush and return
3. Resend write as write#2 [0; 1MB[
4. write#2 is handled
5. Send write#3 [1MB-4k; 1MB[
6. write#3 is handled
7. write#1 is handled and write#3's content overwritten
(flush is later ack'd)
In practice, all transports use a connection-oriented pipe (e.g.
messages are received in order), so requests are guaranteed to be
received by the server in the order they were sent; but the server has
no way of knowing that the second write is a replay of the first and
could be dispatched to another thread leading to such scenario.
This means the connection is effectively put on hold after any flush has
been sent, but hopefully flush are rare enough to not make a difference
in normal workloads.
Not looping forever in p9_client_rpc and its zc counterpart has been
a goal for a long time, but will also in particular fix a new hang with
the TWA_SIGNAL based task_work as reported by Pengfei Xu and Jens Axboe
recently.
Reported-by: Dmitry Vyukov <[email protected]>
Reported-by: Pengfei Xu <[email protected]>
Reported-by: Jens Axboe <[email protected]>
Link: http://lkml.kernel.org/r/[email protected] (v1)
Signed-off-by: Dominique Martinet <[email protected]>
---
include/net/9p/client.h | 2 +
net/9p/client.c | 187 ++++++++++++++++++++++++----------------
2 files changed, 113 insertions(+), 76 deletions(-)
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index dd493f7b8059..9eec67108096 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -75,6 +75,7 @@ enum p9_req_status_t {
* @req_list: link for transports to chain requests (used by trans_fd)
* @async_list: link used to check on async requests
* @clunked_fid: for clunk, points to fid
+ * @flushed_req: for flush, points to matching flushed req
*/
struct p9_req_t {
int status;
@@ -87,6 +88,7 @@ struct p9_req_t {
struct list_head async_list;
union {
struct p9_fid *clunked_fid;
+ struct p9_req_t *flushed_req;
};
};
diff --git a/net/9p/client.c b/net/9p/client.c
index f4b85c33c465..4c328021dc98 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -503,6 +503,8 @@ EXPORT_SYMBOL(do_trace_9p_fid_put);
*/
static void p9_client_async_cb(struct p9_client *c, struct p9_req_t *req)
{
+ /* extra req to put e.g. flush target */
+ struct p9_req_t *extra_req = NULL;
unsigned long flags;
/* Nothing to do for non-async requests */
@@ -513,14 +515,31 @@ static void p9_client_async_cb(struct p9_client *c, struct p9_req_t *req)
p9_debug(P9_DEBUG_MUX, "async destroying fid %d\n",
req->clunked_fid->fid);
p9_fid_destroy(req->clunked_fid);
+ } else if (req->tc.id == P9_TFLUSH) {
+ p9_debug(P9_DEBUG_MUX,
+ "flushing oldreq tag %d status %d, flush_req tag %d\n",
+ req->flushed_req->tc.tag,
+ req->flushed_req->status,
+ req->tc.tag);
+ if (req->flushed_req->status < REQ_STATUS_RCVD) {
+ /* won't receive reply now */
+ if (c->trans_mod->cancelled)
+ c->trans_mod->cancelled(c, req);
+ wake_up_all(&req->flushed_req->wq);
+ p9_req_put(c, req->flushed_req);
+ }
+ extra_req = req->flushed_req;
} else {
WARN(1, "Async request received with tc.id %d\n", req->tc.id);
}
-
+ /* Put old refs whatever reqs actually returned */
spin_lock_irqsave(&c->async_req_lock, flags);
list_del(&req->async_list);
spin_unlock_irqrestore(&c->async_req_lock, flags);
p9_tag_remove(c, req);
+
+ if (extra_req)
+ p9_req_put(c, extra_req);
}
/**
@@ -542,7 +561,7 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
smp_wmb();
WRITE_ONCE(req->status, status);
- wake_up(&req->wq);
+ wake_up_all(&req->wq);
p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag);
p9_req_put(c, req);
}
@@ -754,9 +773,6 @@ static void p9_client_cleanup_async(struct p9_client *c)
spin_unlock_irqrestore(&c->async_req_lock, flags);
}
-static struct p9_req_t *
-p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...);
-
/**
* p9_client_flush - flush (cancel) a request
* @c: client state
@@ -772,28 +788,68 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...);
static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
{
struct p9_req_t *req;
- s16 oldtag;
- int err;
-
- err = p9_parse_header(&oldreq->tc, NULL, NULL, &oldtag, 1);
- if (err)
- return err;
+ s16 oldtag = oldreq->tc.tag;
p9_debug(P9_DEBUG_9P, ">>> TFLUSH tag %d\n", oldtag);
-
- req = p9_client_rpc(c, P9_TFLUSH, "w", oldtag);
+ req = p9_client_async_rpc(c, P9_TFLUSH, "w", oldtag);
if (IS_ERR(req))
return PTR_ERR(req);
- /* if we haven't received a response for oldreq,
- * remove it from the list
- */
- if (READ_ONCE(oldreq->status) == REQ_STATUS_SENT) {
- if (c->trans_mod->cancelled)
- c->trans_mod->cancelled(c, oldreq);
- }
+ p9_debug(P9_DEBUG_MUX, "sent flush for oldreq %d type %d with flush tag %d\n",
+ oldtag, oldreq->tc.id, req->tc.tag);
+ req->flushed_req = oldreq;
+ spin_lock_irq(&c->lock);
+ list_add(&req->async_list, &c->async_req_list);
+ spin_unlock_irq(&c->lock);
- p9_req_put(c, req);
+ return 0;
+}
+
+/**
+ * p9_client_wait_flush - wait for any in flight flush
+ * @c: client state
+ *
+ * Since we cannot know if a request is a replay for a request that was
+ * previous flushed, we need to wait for all in flight flush to send
+ * new requests to avoid racing with a previously cancelled flush.
+ */
+
+static int p9_client_wait_flush(struct p9_client *c)
+{
+ struct p9_req_t *req;
+ unsigned long flags;
+ int err;
+
+ /* As an optimization check if list is empty without lock first;
+ * that should be the case most of the time, and if we see it
+ * as empty we are not a thread that recently issued a flush
+ */
+ if (list_empty(&c->async_req_list))
+ return 0;
+
+again:
+ spin_lock_irqsave(&c->async_req_lock, flags);
+ list_for_each_entry(req, &c->async_req_list, async_list) {
+ /* received request will be deleted shortly and can be ignored */
+ if (req->status >= REQ_STATUS_RCVD)
+ continue;
+
+ if (req->tc.id == P9_TFLUSH) {
+ /* also ok if its target has been ack'd */
+ if (req->flushed_req->status >= REQ_STATUS_RCVD)
+ continue;
+ /* take ref on flushed req, drop lock and wait for it... */
+ p9_req_get(req->flushed_req);
+ spin_unlock_irqrestore(&c->async_req_lock, flags);
+ err = wait_event_interruptible(req->flushed_req->wq,
+ READ_ONCE(req->status) >= REQ_STATUS_RCVD);
+ p9_req_put(c, req->flushed_req);
+ if (err < 0)
+ return err;
+ goto again;
+ }
+ }
+ spin_unlock_irqrestore(&c->async_req_lock, flags);
return 0;
}
@@ -810,8 +866,8 @@ static struct p9_req_t *
p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
{
va_list ap;
- int sigpending, err;
- unsigned long flags;
+ int err;
+ int flushing = 0;
struct p9_req_t *req;
va_start(ap, fmt);
@@ -825,21 +881,21 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
req->rc.zc = false;
if (signal_pending(current)) {
- sigpending = 1;
- clear_thread_flag(TIF_SIGPENDING);
- } else {
- sigpending = 0;
+ err = -ERESTARTSYS;
+ goto not_sent;
}
+ err = p9_client_wait_flush(c);
+ if (err < 0)
+ goto not_sent;
+
err = c->trans_mod->request(c, req);
if (err < 0) {
- /* write won't happen */
- p9_req_put(c, req);
if (err != -ERESTARTSYS && err != -EFAULT)
c->status = Disconnected;
- goto recalc_sigpending;
+ goto not_sent;
}
-again:
+
/* Wait for the response */
err = wait_event_killable(req->wq,
READ_ONCE(req->status) >= REQ_STATUS_RCVD);
@@ -849,34 +905,15 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
*/
smp_rmb();
- if (err == -ERESTARTSYS && c->status == Connected &&
- type == P9_TFLUSH) {
- sigpending = 1;
- clear_thread_flag(TIF_SIGPENDING);
- goto again;
- }
-
- if (READ_ONCE(req->status) == REQ_STATUS_ERROR) {
+ if (req->status == REQ_STATUS_ERROR) {
p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
err = req->t_err;
}
if (err == -ERESTARTSYS && c->status == Connected) {
p9_debug(P9_DEBUG_MUX, "flushing\n");
- sigpending = 1;
- clear_thread_flag(TIF_SIGPENDING);
- if (c->trans_mod->cancel(c, req))
- p9_client_flush(c, req);
-
- /* if we received the response anyway, don't signal error */
- if (READ_ONCE(req->status) == REQ_STATUS_RCVD)
- err = 0;
- }
-recalc_sigpending:
- if (sigpending) {
- spin_lock_irqsave(¤t->sighand->siglock, flags);
- recalc_sigpending();
- spin_unlock_irqrestore(¤t->sighand->siglock, flags);
+ if (c->trans_mod->cancel(c, req) && !p9_client_flush(c, req))
+ flushing = 1;
}
if (err < 0)
goto reterr;
@@ -885,8 +922,12 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
trace_9p_client_res(c, type, req->rc.tag, err);
if (!err)
return req;
-reterr:
+ goto reterr;
+not_sent:
p9_req_put(c, req);
+reterr:
+ if (!flushing)
+ p9_req_put(c, req);
return ERR_PTR(safe_errno(err));
}
@@ -910,8 +951,8 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
const char *fmt, ...)
{
va_list ap;
- int sigpending, err;
- unsigned long flags;
+ int err;
+ int flushing = 0;
struct p9_req_t *req;
va_start(ap, fmt);
@@ -927,19 +968,21 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
req->rc.zc = true;
if (signal_pending(current)) {
- sigpending = 1;
- clear_thread_flag(TIF_SIGPENDING);
- } else {
- sigpending = 0;
+ err = -ERESTARTSYS;
+ goto not_sent;
}
+ err = p9_client_wait_flush(c);
+ if (err < 0)
+ goto not_sent;
+
err = c->trans_mod->zc_request(c, req, uidata, uodata,
inlen, olen, in_hdrlen);
if (err < 0) {
if (err == -EIO)
c->status = Disconnected;
if (err != -ERESTARTSYS)
- goto recalc_sigpending;
+ goto not_sent;
}
if (READ_ONCE(req->status) == REQ_STATUS_ERROR) {
p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
@@ -947,21 +990,9 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
}
if (err == -ERESTARTSYS && c->status == Connected) {
p9_debug(P9_DEBUG_MUX, "flushing\n");
- sigpending = 1;
- clear_thread_flag(TIF_SIGPENDING);
- if (c->trans_mod->cancel(c, req))
- p9_client_flush(c, req);
-
- /* if we received the response anyway, don't signal error */
- if (READ_ONCE(req->status) == REQ_STATUS_RCVD)
- err = 0;
- }
-recalc_sigpending:
- if (sigpending) {
- spin_lock_irqsave(¤t->sighand->siglock, flags);
- recalc_sigpending();
- spin_unlock_irqrestore(¤t->sighand->siglock, flags);
+ if (c->trans_mod->cancel(c, req) && !p9_client_flush(c, req))
+ flushing = 1;
}
if (err < 0)
goto reterr;
@@ -970,8 +1001,12 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
trace_9p_client_res(c, type, req->rc.tag, err);
if (!err)
return req;
-reterr:
+ goto reterr;
+not_sent:
p9_req_put(c, req);
+reterr:
+ if (!flushing)
+ p9_req_put(c, req);
return ERR_PTR(safe_errno(err));
}
--
2.39.1
Dominique Martinet wrote on Sat, Feb 11, 2023 at 04:50:18PM +0900:
> A few problems I ran into today:
> [...]
Ah, one more thing I forgot: I'm still convinced that
p9_virtio_zc_request is wrong in its current (before these patches), and
adding this on top doesn't make it any better.
We'll likely want to fix that too before merging this, giving another
reason to dealy this for 6.4.
p9_virtio_zc_request shouldn't unpin the pages if wait_event_killable is
interrupted, and the flush async cb should call back in the transport to
unpin at that point or something?
I'm probably repeating myself here but I really don't have time in the
day for everything, so that too will have to wait...
(and there's probably more I forgot, but that'll do for now)
--
Dominique
On Saturday, February 11, 2023 8:50:19 AM CET Dominique Martinet wrote:
> This commit containers no code change:
> - move p9_fid_* higher in code as p9_fid_destroy will be used
> in async callback
I would have just added p9_fid_destroy()'s prototype earlier in code instead
of moving stuff around:
static void p9_fid_destroy(struct p9_fid *fid);
Because that would not mess with 'git blame' history. But anyway, it's just
restructuring, no behaviour change, so:
Reviewed-by: Christian Schoenebeck <[email protected]>
> - move p9_client_flush as it will no longer call p9_client_rpc
> and can simplify forward declaration a bit later
>
> This just simplifies the next commits to make diffs cleaner.
>
> Signed-off-by: Dominique Martinet <[email protected]>
> ---
> net/9p/client.c | 198 ++++++++++++++++++++++++------------------------
> 1 file changed, 99 insertions(+), 99 deletions(-)
>
> diff --git a/net/9p/client.c b/net/9p/client.c
> index 622ec6a586ee..53ebd07c36ee 100644
> --- a/net/9p/client.c
> +++ b/net/9p/client.c
> @@ -428,6 +428,66 @@ static void p9_tag_cleanup(struct p9_client *c)
> rcu_read_unlock();
> }
>
> +static struct p9_fid *p9_fid_create(struct p9_client *clnt)
> +{
> + int ret;
> + struct p9_fid *fid;
> +
> + p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt);
> + fid = kzalloc(sizeof(*fid), GFP_KERNEL);
> + if (!fid)
> + return NULL;
> +
> + fid->mode = -1;
> + fid->uid = current_fsuid();
> + fid->clnt = clnt;
> + refcount_set(&fid->count, 1);
> +
> + idr_preload(GFP_KERNEL);
> + spin_lock_irq(&clnt->lock);
> + ret = idr_alloc_u32(&clnt->fids, fid, &fid->fid, P9_NOFID - 1,
> + GFP_NOWAIT);
> + spin_unlock_irq(&clnt->lock);
> + idr_preload_end();
> + if (!ret) {
> + trace_9p_fid_ref(fid, P9_FID_REF_CREATE);
> + return fid;
> + }
> +
> + kfree(fid);
> + return NULL;
> +}
> +
> +static void p9_fid_destroy(struct p9_fid *fid)
> +{
> + struct p9_client *clnt;
> + unsigned long flags;
> +
> + p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid);
> + trace_9p_fid_ref(fid, P9_FID_REF_DESTROY);
> + clnt = fid->clnt;
> + spin_lock_irqsave(&clnt->lock, flags);
> + idr_remove(&clnt->fids, fid->fid);
> + spin_unlock_irqrestore(&clnt->lock, flags);
> + kfree(fid->rdir);
> + kfree(fid);
> +}
> +
> +/* We also need to export tracepoint symbols for tracepoint_enabled() */
> +EXPORT_TRACEPOINT_SYMBOL(9p_fid_ref);
> +
> +void do_trace_9p_fid_get(struct p9_fid *fid)
> +{
> + trace_9p_fid_ref(fid, P9_FID_REF_GET);
> +}
> +EXPORT_SYMBOL(do_trace_9p_fid_get);
> +
> +void do_trace_9p_fid_put(struct p9_fid *fid)
> +{
> + trace_9p_fid_ref(fid, P9_FID_REF_PUT);
> +}
> +EXPORT_SYMBOL(do_trace_9p_fid_put);
> +
> /**
> * p9_client_cb - call back from transport to client
> * @c: client state
> @@ -570,6 +630,45 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
> return err;
> }
>
> +static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
> + int8_t type, uint t_size, uint r_size,
> + const char *fmt, va_list ap)
> +{
> + int err;
> + struct p9_req_t *req;
> + va_list apc;
> +
> + p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type);
> +
> + /* we allow for any status other than disconnected */
> + if (c->status == Disconnected)
> + return ERR_PTR(-EIO);
> +
> + /* if status is begin_disconnected we allow only clunk request */
> + if (c->status == BeginDisconnect && type != P9_TCLUNK)
> + return ERR_PTR(-EIO);
> +
> + va_copy(apc, ap);
> + req = p9_tag_alloc(c, type, t_size, r_size, fmt, apc);
> + va_end(apc);
> + if (IS_ERR(req))
> + return req;
> +
> + /* marshall the data */
> + p9pdu_prepare(&req->tc, req->tc.tag, type);
> + err = p9pdu_vwritef(&req->tc, c->proto_version, fmt, ap);
> + if (err)
> + goto reterr;
> + p9pdu_finalize(c, &req->tc);
> + trace_9p_client_req(c, type, req->tc.tag);
> + return req;
> +reterr:
> + p9_req_put(c, req);
> + /* We have to put also the 2nd reference as it won't be used */
> + p9_req_put(c, req);
> + return ERR_PTR(err);
> +}
> +
> static struct p9_req_t *
> p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...);
>
> @@ -613,45 +712,6 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
> return 0;
> }
>
> -static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
> - int8_t type, uint t_size, uint r_size,
> - const char *fmt, va_list ap)
> -{
> - int err;
> - struct p9_req_t *req;
> - va_list apc;
> -
> - p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type);
> -
> - /* we allow for any status other than disconnected */
> - if (c->status == Disconnected)
> - return ERR_PTR(-EIO);
> -
> - /* if status is begin_disconnected we allow only clunk request */
> - if (c->status == BeginDisconnect && type != P9_TCLUNK)
> - return ERR_PTR(-EIO);
> -
> - va_copy(apc, ap);
> - req = p9_tag_alloc(c, type, t_size, r_size, fmt, apc);
> - va_end(apc);
> - if (IS_ERR(req))
> - return req;
> -
> - /* marshall the data */
> - p9pdu_prepare(&req->tc, req->tc.tag, type);
> - err = p9pdu_vwritef(&req->tc, c->proto_version, fmt, ap);
> - if (err)
> - goto reterr;
> - p9pdu_finalize(c, &req->tc);
> - trace_9p_client_req(c, type, req->tc.tag);
> - return req;
> -reterr:
> - p9_req_put(c, req);
> - /* We have to put also the 2nd reference as it won't be used */
> - p9_req_put(c, req);
> - return ERR_PTR(err);
> -}
> -
> /**
> * p9_client_rpc - issue a request and wait for a response
> * @c: client session
> @@ -838,66 +898,6 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
> return ERR_PTR(safe_errno(err));
> }
>
> -static struct p9_fid *p9_fid_create(struct p9_client *clnt)
> -{
> - int ret;
> - struct p9_fid *fid;
> -
> - p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt);
> - fid = kzalloc(sizeof(*fid), GFP_KERNEL);
> - if (!fid)
> - return NULL;
> -
> - fid->mode = -1;
> - fid->uid = current_fsuid();
> - fid->clnt = clnt;
> - refcount_set(&fid->count, 1);
> -
> - idr_preload(GFP_KERNEL);
> - spin_lock_irq(&clnt->lock);
> - ret = idr_alloc_u32(&clnt->fids, fid, &fid->fid, P9_NOFID - 1,
> - GFP_NOWAIT);
> - spin_unlock_irq(&clnt->lock);
> - idr_preload_end();
> - if (!ret) {
> - trace_9p_fid_ref(fid, P9_FID_REF_CREATE);
> - return fid;
> - }
> -
> - kfree(fid);
> - return NULL;
> -}
> -
> -static void p9_fid_destroy(struct p9_fid *fid)
> -{
> - struct p9_client *clnt;
> - unsigned long flags;
> -
> - p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid);
> - trace_9p_fid_ref(fid, P9_FID_REF_DESTROY);
> - clnt = fid->clnt;
> - spin_lock_irqsave(&clnt->lock, flags);
> - idr_remove(&clnt->fids, fid->fid);
> - spin_unlock_irqrestore(&clnt->lock, flags);
> - kfree(fid->rdir);
> - kfree(fid);
> -}
> -
> -/* We also need to export tracepoint symbols for tracepoint_enabled() */
> -EXPORT_TRACEPOINT_SYMBOL(9p_fid_ref);
> -
> -void do_trace_9p_fid_get(struct p9_fid *fid)
> -{
> - trace_9p_fid_ref(fid, P9_FID_REF_GET);
> -}
> -EXPORT_SYMBOL(do_trace_9p_fid_get);
> -
> -void do_trace_9p_fid_put(struct p9_fid *fid)
> -{
> - trace_9p_fid_ref(fid, P9_FID_REF_PUT);
> -}
> -EXPORT_SYMBOL(do_trace_9p_fid_put);
> -
> static int p9_client_version(struct p9_client *c)
> {
> int err = 0;
>
On Saturday, February 11, 2023 8:50:20 AM CET Dominique Martinet wrote:
> The async RPC code will also use an automatic size and this bit of code
> can be shared, as p9_tag_alloc still knows what client we alloc for.
>
> Signed-off-by: Dominique Martinet <[email protected]>
Reviewed-by: Christian Schoenebeck <[email protected]>
> ---
> net/9p/client.c | 23 +++++++++++------------
> 1 file changed, 11 insertions(+), 12 deletions(-)
>
> diff --git a/net/9p/client.c b/net/9p/client.c
> index 53ebd07c36ee..4e5238db4a7a 100644
> --- a/net/9p/client.c
> +++ b/net/9p/client.c
> @@ -282,8 +282,15 @@ p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size,
> t_size ?: p9_msg_buf_size(c, type, fmt, apc));
> va_end(apc);
>
> - alloc_rsize = min_t(size_t, c->msize,
> - r_size ?: p9_msg_buf_size(c, type + 1, fmt, ap));
> + /* Currently RDMA transport is excluded from response message size
> + * optimization, as it cannot cope with it due to its pooled response
> + * buffers (this has no impact on request size)
> + */
> + if (r_size == 0 && c->trans_mod->pooled_rbuffers)
> + alloc_rsize = c->msize;
> + else
> + alloc_rsize = min_t(size_t, c->msize,
> + r_size ?: p9_msg_buf_size(c, type + 1, fmt, ap));
>
> if (!req)
> return ERR_PTR(-ENOMEM);
> @@ -728,18 +735,10 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
> int sigpending, err;
> unsigned long flags;
> struct p9_req_t *req;
> - /* Passing zero for tsize/rsize to p9_client_prepare_req() tells it to
> - * auto determine an appropriate (small) request/response size
> - * according to actual message data being sent. Currently RDMA
> - * transport is excluded from this response message size optimization,
> - * as it would not cope with it, due to its pooled response buffers
> - * (using an optimized request size for RDMA as well though).
> - */
> - const uint tsize = 0;
> - const uint rsize = c->trans_mod->pooled_rbuffers ? c->msize : 0;
>
> va_start(ap, fmt);
> - req = p9_client_prepare_req(c, type, tsize, rsize, fmt, ap);
> + /* auto determine an appropriate request/response size */
> + req = p9_client_prepare_req(c, type, 0, 0, fmt, ap);
> va_end(ap);
> if (IS_ERR(req))
> return req;
>
On Saturday, February 11, 2023 8:50:18 AM CET Dominique Martinet wrote:
> I've been working on async RPCs for a while and never had time to debug
> the last issues this had, but by limiting the async clunks to failures
> the impact is drastically smaller and I've not been able to reproduce
> any more bug so far.
>
> This will require some more testing and I'm tempted to say this is not
> worth rushing this into the merge window next week-ish; the new problem
> Jens reported with task_work isn't really new and I'd rather get this
> right than rush new bugs in given the sour experience I've had with this
> patch series... Hopefully it'll get in this time.
> With that in mind I plan to take the patches in to my -next branch after
> the merge window, so this has time to get reviewed first.
>
> I'd like to measure impact on performance as well, but really spent way
> more time on this than I already have, so that'll have to wait a bit.
I have not observed performance degradation, nor any (new) errors. So for
this entire series:
Tested-by: Christian Schoenebeck <[email protected]>
On Monday, February 13, 2023 7:26:54 PM CET Christian Schoenebeck wrote:
> On Saturday, February 11, 2023 8:50:18 AM CET Dominique Martinet wrote:
> > I've been working on async RPCs for a while and never had time to debug
> > the last issues this had, but by limiting the async clunks to failures
> > the impact is drastically smaller and I've not been able to reproduce
> > any more bug so far.
> >
> > This will require some more testing and I'm tempted to say this is not
> > worth rushing this into the merge window next week-ish; the new problem
> > Jens reported with task_work isn't really new and I'd rather get this
> > right than rush new bugs in given the sour experience I've had with this
> > patch series... Hopefully it'll get in this time.
> > With that in mind I plan to take the patches in to my -next branch after
> > the merge window, so this has time to get reviewed first.
> >
> > I'd like to measure impact on performance as well, but really spent way
> > more time on this than I already have, so that'll have to wait a bit.
>
> I have not observed performance degradation, nor any (new) errors. So for
> this entire series:
>
> Tested-by: Christian Schoenebeck <[email protected]>
Sorry, strike that. :-/
When unmounting the 9p file system it would fail (hang/timeout) to kill 9p
processes:
...
[ OK ] Reached target Unmount All Filesystems.
[ OK ] Stopped Create Static Device Nodes in /dev.
[ OK ] Stopped Create System Users.
[ OK ] Stopped Remount Root and Kernel File Systems.
[ OK ] Reached target Shutdown.
[ OK ] Reached target Final Step.
[ OK ] Finished Power-Off.
[ OK ] Reached target Power-Off.
[ 323.019620] systemd-shutdown[1]: Syncing filesystems and block devices.
[ 353.023371] systemd-shutdown[1]: Syncing filesystems and block devices - timed out, issuing SIGKILL to PID 10124.
[ 353.028633] systemd-shutdown[1]: Sending SIGTERM to remaining processes...
[ 363.033509] systemd-shutdown[1]: Waiting for process: (sd-sync), systemd-journal
[ 443.037702] systemd-shutdown[1]: Sending SIGKILL to remaining processes...
[ 443.045199] systemd-shutdown[1]: Sending SIGKILL to PID 237 (systemd-journal).
[ 443.046891] systemd-shutdown[1]: Sending SIGKILL to PID 10124 ((sd-sync)).
[ 453.048972] systemd-shutdown[1]: Waiting for process: (sd-sync), systemd-journal
[ 484.756639] INFO: task systemd-journal:237 blocked for more than 120 seconds.
[ 484.761954] Tainted: G E 6.2.0-rc6+ #62
[ 484.762940] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 484.764377] task:systemd-journal state:D stack:0 pid:237 ppid:1 flags:0x00004004
[ 484.765820] Call Trace:
[ 484.766299] <TASK>
[ 484.766667] __schedule (kernel/sched/core.c:5293 kernel/sched/core.c:6606)
[ 484.767314] schedule (kernel/sched/core.c:6683 (discriminator 1))
[ 484.767901] io_schedule (kernel/sched/core.c:8843 kernel/sched/core.c:8869)
[ 484.768486] folio_wait_bit_common (mm/filemap.c:1289)
[ 484.769240] ? find_get_pages_range_tag (mm/filemap.c:2341)
[ 484.770103] ? __pfx_wake_page_function (mm/filemap.c:1079)
[ 484.770987] folio_wait_writeback (./arch/x86/include/asm/bitops.h:207 ./arch/x86/include/asm/bitops.h:239 ./include/asm-generic/bitops/instrumented-non-atomic.h:142 ./include/linux/page-flags.h:518 mm/page-writeback.c:3146)
[ 484.771773] __filemap_fdatawait_range (./include/linux/page-flags.h:251 (discriminator 3) ./include/linux/page-flags.h:473 (discriminator 3) mm/filemap.c:522 (discriminator 3))
[ 484.772630] ? filemap_fdatawrite_wbc (mm/filemap.c:391)
[ 484.773447] ? __filemap_fdatawrite_range (mm/filemap.c:422)
[ 484.774350] filemap_write_and_wait_range (mm/filemap.c:685 mm/filemap.c:665)
[ 484.775173] v9fs_vfs_setattr_dotl (./include/linux/pagemap.h:60 fs/9p/vfs_inode_dotl.c:586) 9p
[ 484.775997] notify_change (fs/attr.c:486)
[ 484.776657] ? __vfs_getxattr (fs/xattr.c:425)
[ 484.777316] ? do_truncate (./include/linux/fs.h:761 fs/open.c:66)
[ 484.777928] do_truncate (./include/linux/fs.h:761 fs/open.c:66)
[ 484.778515] do_sys_ftruncate (fs/open.c:193)
[ 484.779192] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
[ 484.779790] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
[ 484.780643] RIP: 0033:0x7fb8c0f6bb77
[ 484.781256] RSP: 002b:00007ffe74e6a5e8 EFLAGS: 00000206 ORIG_RAX: 000000000000004d
[ 484.782526] RAX: ffffffffffffffda RBX: 00007ffe74e6a630 RCX: 00007fb8c0f6bb77
[ 484.783700] RDX: 0000561d6c28b0b0 RSI: 0000000001000000 RDI: 0000000000000014
[ 484.784905] RBP: 0000561d6c294bd0 R08: 0000000000000001 R09: 0000561d6c294c54
[ 484.786129] R10: 0000000000000010 R11: 0000000000000206 R12: 00007ffe74e6a628
[ 484.787323] R13: 0000561d6c284670 R14: 0000000000000703 R15: 0000000000000000
[ 484.788619] </TASK>
[ 484.789152] INFO: task (sd-sync):10124 blocked for more than 120 seconds.
[ 484.790622] Tainted: G E 6.2.0-rc6+ #62
[ 484.791660] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 484.793009] task:(sd-sync) state:D stack:0 pid:10124 ppid:1 flags:0x00000004
[ 484.794431] Call Trace:
[ 484.794889] <TASK>
[ 484.795312] __schedule (kernel/sched/core.c:5293 kernel/sched/core.c:6606)
[ 484.795937] ? __pfx_sync_inodes_one_sb (fs/sync.c:75)
[ 484.796751] schedule (kernel/sched/core.c:6683 (discriminator 1))
[ 484.797313] wb_wait_for_completion (fs/fs-writeback.c:192 fs/fs-writeback.c:189)
[ 484.798042] ? __pfx_autoremove_wake_function (kernel/sched/wait.c:418)
[ 484.798997] sync_inodes_sb (fs/fs-writeback.c:369 (discriminator 3) fs/fs-writeback.c:2726 (discriminator 3))
[ 484.799644] ? __pfx_sync_inodes_one_sb (fs/sync.c:75)
[ 484.800486] iterate_supers (fs/super.c:725)
[ 484.801184] ksys_sync (fs/sync.c:103)
[ 484.801754] __do_sys_sync (fs/sync.c:115)
[ 484.802372] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
[ 484.802975] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
[ 484.803868] RIP: 0033:0x7f41734a1447
[ 484.804496] RSP: 002b:00007ffc7f4de6a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a2
[ 484.805892] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f41734a1447
[ 484.807216] RDX: 00000000ffffffff RSI: 00000000ffffffff RDI: 0000000000000003
[ 484.808464] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[ 484.809701] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffc7f4dff2b
[ 484.810857] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 484.812090] </TASK>
[ 533.055563] systemd-shutdown[1]: Unmounting file systems.
[ 533.059617] [10125]: Remounting '/' read-only in with options 'loose,access=client,msize=512000,trans=virtio'.
[ 605.587527] INFO: task systemd-journal:237 blocked for more than 241 seconds.
[ 605.592716] Tainted: G E 6.2.0-rc6+ #62
[ 605.593674] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 605.594956] task:systemd-journal state:D stack:0 pid:237 ppid:1 flags:0x00004004
[ 605.596415] Call Trace:
[ 605.596884] <TASK>
[ 605.597274] __schedule (kernel/sched/core.c:5293 kernel/sched/core.c:6606)
[ 605.597875] schedule (kernel/sched/core.c:6683 (discriminator 1))
[ 605.598424] io_schedule (kernel/sched/core.c:8843 kernel/sched/core.c:8869)
[ 605.599034] folio_wait_bit_common (mm/filemap.c:1289)
[ 605.599830] ? find_get_pages_range_tag (mm/filemap.c:2341)
[ 605.600750] ? __pfx_wake_page_function (mm/filemap.c:1079)
[ 605.601607] folio_wait_writeback (./arch/x86/include/asm/bitops.h:207 ./arch/x86/include/asm/bitops.h:239 ./include/asm-generic/bitops/instrumented-non-atomic.h:142 ./include/linux/page-flags.h:518 mm/page-writeback.c:3146)
[ 605.602314] __filemap_fdatawait_range (./include/linux/page-flags.h:251 (discriminator 3) ./include/linux/page-flags.h:473 (discriminator 3) mm/filemap.c:522 (discriminator 3))
[ 605.603217] ? filemap_fdatawrite_wbc (mm/filemap.c:391)
[ 605.604060] ? __filemap_fdatawrite_range (mm/filemap.c:422)
[ 605.604877] filemap_write_and_wait_range (mm/filemap.c:685 mm/filemap.c:665)
[ 605.605737] v9fs_vfs_setattr_dotl (./include/linux/pagemap.h:60 fs/9p/vfs_inode_dotl.c:586) 9p
[ 605.606560] notify_change (fs/attr.c:486)
[ 605.607206] ? __vfs_getxattr (fs/xattr.c:425)
[ 605.607871] ? do_truncate (./include/linux/fs.h:761 fs/open.c:66)
[ 605.608510] do_truncate (./include/linux/fs.h:761 fs/open.c:66)
[ 605.609118] do_sys_ftruncate (fs/open.c:193)
[ 605.609867] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
[ 605.610506] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
[ 605.611401] RIP: 0033:0x7fb8c0f6bb77
[ 605.612038] RSP: 002b:00007ffe74e6a5e8 EFLAGS: 00000206 ORIG_RAX: 000000000000004d
[ 605.613280] RAX: ffffffffffffffda RBX: 00007ffe74e6a630 RCX: 00007fb8c0f6bb77
[ 605.614449] RDX: 0000561d6c28b0b0 RSI: 0000000001000000 RDI: 0000000000000014
[ 605.615763] RBP: 0000561d6c294bd0 R08: 0000000000000001 R09: 0000561d6c294c54
[ 605.616952] R10: 0000000000000010 R11: 0000000000000206 R12: 00007ffe74e6a628
[ 605.618206] R13: 0000561d6c284670 R14: 0000000000000703 R15: 0000000000000000
[ 605.619547] </TASK>
[ 605.619939] INFO: task (sd-sync):10124 blocked for more than 241 seconds.
[ 605.621187] Tainted: G E 6.2.0-rc6+ #62
[ 605.622229] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 605.623518] task:(sd-sync) state:D stack:0 pid:10124 ppid:1 flags:0x00000004
[ 605.624907] Call Trace:
[ 605.625340] <TASK>
[ 605.625758] __schedule (kernel/sched/core.c:5293 kernel/sched/core.c:6606)
[ 605.626397] ? __pfx_sync_inodes_one_sb (fs/sync.c:75)
[ 605.627263] schedule (kernel/sched/core.c:6683 (discriminator 1))
[ 605.627802] wb_wait_for_completion (fs/fs-writeback.c:192 fs/fs-writeback.c:189)
[ 605.628530] ? __pfx_autoremove_wake_function (kernel/sched/wait.c:418)
[ 605.629380] sync_inodes_sb (fs/fs-writeback.c:369 (discriminator 3) fs/fs-writeback.c:2726 (discriminator 3))
[ 605.630014] ? __pfx_sync_inodes_one_sb (fs/sync.c:75)
[ 605.630789] iterate_supers (fs/super.c:725)
[ 605.631428] ksys_sync (fs/sync.c:103)
[ 605.631999] __do_sys_sync (fs/sync.c:115)
[ 605.632669] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
[ 605.633293] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
[ 605.634188] RIP: 0033:0x7f41734a1447
[ 605.634870] RSP: 002b:00007ffc7f4de6a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a2
[ 605.636181] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f41734a1447
[ 605.637444] RDX: 00000000ffffffff RSI: 00000000ffffffff RDI: 0000000000000003
[ 605.638750] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[ 605.640032] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffc7f4dff2b
[ 605.641313] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 605.642465] </TASK>
[ 623.059026] systemd-shutdown[1]: Remounting '/' timed out, issuing SIGKILL to PID 10125.
[ 623.064583] systemd-shutdown[1]: Not all file systems unmounted, 1 left.
[ 623.066046] systemd-shutdown[1]: Deactivating swaps.
[ 623.067274] systemd-shutdown[1]: All swaps deactivated.
[ 623.068436] systemd-shutdown[1]: Detaching loop devices.
[ 623.070645] systemd-shutdown[1]: All loop devices detached.
[ 623.071918] systemd-shutdown[1]: Stopping MD devices.
[ 623.073602] systemd-shutdown[1]: All MD devices stopped.
[ 623.074785] systemd-shutdown[1]: Detaching DM devices.
[ 623.076416] systemd-shutdown[1]: All DM devices detached.
[ 623.077981] systemd-shutdown[1]: Unmounting file systems.
[ 623.080197] [10126]: Remounting '/' read-only in with options 'loose,access=client,msize=512000,trans=virtio'.
[ 713.079672] systemd-shutdown[1]: Remounting '/' timed out, issuing SIGKILL to PID 10126.
[ 713.085323] systemd-shutdown[1]: Not all file systems unmounted, 1 left.
[ 713.086770] systemd-shutdown[1]: Cannot finalize remaining file systems, continuing.
[ 713.092483] systemd-shutdown[1]: Failed to finalize file systems, ignoring.
[ 713.094816] systemd-shutdown[1]: Syncing filesystems and block devices.
[ 726.418246] INFO: task systemd-journal:237 blocked for more than 362 seconds.
[ 726.423366] Tainted: G E 6.2.0-rc6+ #62
[ 726.424320] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 726.425672] task:systemd-journal state:D stack:0 pid:237 ppid:1 flags:0x00004004
[ 726.427132] Call Trace:
[ 726.427583] <TASK>
[ 726.427968] __schedule (kernel/sched/core.c:5293 kernel/sched/core.c:6606)
[ 726.428586] schedule (kernel/sched/core.c:6683 (discriminator 1))
[ 726.429145] io_schedule (kernel/sched/core.c:8843 kernel/sched/core.c:8869)
[ 726.429753] folio_wait_bit_common (mm/filemap.c:1289)
[ 726.430562] ? find_get_pages_range_tag (mm/filemap.c:2341)
[ 726.431427] ? __pfx_wake_page_function (mm/filemap.c:1079)
[ 726.432254] folio_wait_writeback (./arch/x86/include/asm/bitops.h:207 ./arch/x86/include/asm/bitops.h:239 ./include/asm-generic/bitops/instrumented-non-atomic.h:142 ./include/linux/page-flags.h:518 mm/page-writeback.c:3146)
[ 726.432987] __filemap_fdatawait_range (./include/linux/page-flags.h:251 (discriminator 3) ./include/linux/page-flags.h:473 (discriminator 3) mm/filemap.c:522 (discriminator 3))
[ 726.433815] ? filemap_fdatawrite_wbc (mm/filemap.c:391)
[ 726.434625] ? __filemap_fdatawrite_range (mm/filemap.c:422)
[ 726.435470] filemap_write_and_wait_range (mm/filemap.c:685 mm/filemap.c:665)
[ 726.436313] v9fs_vfs_setattr_dotl (./include/linux/pagemap.h:60 fs/9p/vfs_inode_dotl.c:586) 9p
[ 726.437168] notify_change (fs/attr.c:486)
[ 726.437791] ? __vfs_getxattr (fs/xattr.c:425)
[ 726.438483] ? do_truncate (./include/linux/fs.h:761 fs/open.c:66)
[ 726.439120] do_truncate (./include/linux/fs.h:761 fs/open.c:66)
[ 726.439723] do_sys_ftruncate (fs/open.c:193)
[ 726.440407] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
[ 726.441051] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
[ 726.441882] RIP: 0033:0x7fb8c0f6bb77
[ 726.442517] RSP: 002b:00007ffe74e6a5e8 EFLAGS: 00000206 ORIG_RAX: 000000000000004d
[ 726.443826] RAX: ffffffffffffffda RBX: 00007ffe74e6a630 RCX: 00007fb8c0f6bb77
[ 726.445068] RDX: 0000561d6c28b0b0 RSI: 0000000001000000 RDI: 0000000000000014
[ 726.446318] RBP: 0000561d6c294bd0 R08: 0000000000000001 R09: 0000561d6c294c54
[ 726.447541] R10: 0000000000000010 R11: 0000000000000206 R12: 00007ffe74e6a628
[ 726.448776] R13: 0000561d6c284670 R14: 0000000000000703 R15: 0000000000000000
[ 726.449968] </TASK>
[ 726.450373] INFO: task (sd-sync):10124 blocked for more than 362 seconds.
[ 726.451534] Tainted: G E 6.2.0-rc6+ #62
[ 726.452537] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 726.453896] task:(sd-sync) state:D stack:0 pid:10124 ppid:1 flags:0x00000004
[ 726.455307] Call Trace:
[ 726.455747] <TASK>
[ 726.456132] __schedule (kernel/sched/core.c:5293 kernel/sched/core.c:6606)
[ 726.456756] ? __pfx_sync_inodes_one_sb (fs/sync.c:75)
[ 726.457584] schedule (kernel/sched/core.c:6683 (discriminator 1))
[ 726.458157] wb_wait_for_completion (fs/fs-writeback.c:192 fs/fs-writeback.c:189)
[ 726.458932] ? __pfx_autoremove_wake_function (kernel/sched/wait.c:418)
[ 726.459847] sync_inodes_sb (fs/fs-writeback.c:369 (discriminator 3) fs/fs-writeback.c:2726 (discriminator 3))
[ 726.460513] ? __pfx_sync_inodes_one_sb (fs/sync.c:75)
[ 726.461340] iterate_supers (fs/super.c:725)
[ 726.462001] ksys_sync (fs/sync.c:103)
[ 726.462567] __do_sys_sync (fs/sync.c:115)
[ 726.463188] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
[ 726.463825] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
[ 726.464683] RIP: 0033:0x7f41734a1447
[ 726.465256] RSP: 002b:00007ffc7f4de6a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a2
[ 726.466581] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f41734a1447
[ 726.467795] RDX: 00000000ffffffff RSI: 00000000ffffffff RDI: 0000000000000003
[ 726.469025] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[ 726.470273] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffc7f4dff2b
[ 726.471510] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 726.472746] </TASK>
[ 726.473147] INFO: task (sd-remount):10125 blocked for more than 120 seconds.
[ 726.474313] Tainted: G E 6.2.0-rc6+ #62
[ 726.475318] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 726.476664] task:(sd-remount) state:D stack:0 pid:10125 ppid:1 flags:0x00000004
[ 726.478116] Call Trace:
[ 726.478523] <TASK>
[ 726.478884] __schedule (kernel/sched/core.c:5293 kernel/sched/core.c:6606)
[ 726.479479] ? memcg_slab_post_alloc_hook (./include/linux/percpu-refcount.h:174 ./include/linux/percpu-refcount.h:332 ./include/linux/percpu-refcount.h:351 ./include/linux/memcontrol.h:794 mm/slab.h:553)
[ 726.480340] schedule (kernel/sched/core.c:6683 (discriminator 1))
[ 726.480851] rwsem_down_write_slowpath (kernel/locking/rwsem.c:1192)
[ 726.481668] ? alloc_fs_context (fs/fs_context.c:291)
[ 726.482390] path_mount (fs/namespace.c:2806 fs/namespace.c:3467)
[ 726.483011] __x64_sys_mount (fs/namespace.c:3489 fs/namespace.c:3697 fs/namespace.c:3674 fs/namespace.c:3674)
[ 726.483698] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
[ 726.484326] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
[ 726.485191] RIP: 0033:0x7f41734ab62a
[ 726.485764] RSP: 002b:00007ffc7f4de7c8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
[ 726.487081] RAX: ffffffffffffffda RBX: 000055c5c844aa10 RCX: 00007f41734ab62a
[ 726.488247] RDX: 0000000000000000 RSI: 000055c5c844aa50 RDI: 0000000000000000
[ 726.489475] RBP: 0000000000000000 R08: 000055c5c844a9d0 R09: 000055c5c844a9d0
[ 726.490718] R10: 00000000003000a1 R11: 0000000000000246 R12: 00007ffc7f4de900
[ 726.491900] R13: 0000000000000000 R14: 00007ffc7f4de848 R15: 0000000000000000
[ 726.493003] </TASK>
[ 743.099066] systemd-shutdown[1]: Syncing filesystems and block devices - timed out, issuing SIGKILL to PID 10129.
[ 743.104366] systemd-shutdown[1]: Powering off.
[ 743.213283] ACPI: PM: Preparing to enter system sleep state S5
[ 743.214425] reboot: Power down
Christian Schoenebeck wrote on Mon, Feb 13, 2023 at 07:45:34PM +0100:
> On Monday, February 13, 2023 7:26:54 PM CET Christian Schoenebeck wrote:
> > On Saturday, February 11, 2023 8:50:18 AM CET Dominique Martinet wrote:
> > > I've been working on async RPCs for a while and never had time to debug
> > > the last issues this had, but by limiting the async clunks to failures
> > > the impact is drastically smaller and I've not been able to reproduce
> > > any more bug so far.
> > >
> > > This will require some more testing and I'm tempted to say this is not
> > > worth rushing this into the merge window next week-ish; the new problem
> > > Jens reported with task_work isn't really new and I'd rather get this
> > > right than rush new bugs in given the sour experience I've had with this
> > > patch series... Hopefully it'll get in this time.
> > > With that in mind I plan to take the patches in to my -next branch after
> > > the merge window, so this has time to get reviewed first.
> > >
> > > I'd like to measure impact on performance as well, but really spent way
> > > more time on this than I already have, so that'll have to wait a bit.
> >
> > I have not observed performance degradation, nor any (new) errors. So for
> > this entire series:
> >
> > Tested-by: Christian Schoenebeck <[email protected]>
>
> Sorry, strike that. :-/
Ugh! thanks for the test, I'm not making much sense of the trace (folio
wait bit means some background write isn't happening when flushing
before the setattr?) but I'll try to reproduce and have a look
today/tomorrow
--
Dominique
On Monday, February 13, 2023 11:37:50 PM CET Dominique Martinet wrote:
> Christian Schoenebeck wrote on Mon, Feb 13, 2023 at 07:45:34PM +0100:
> > On Monday, February 13, 2023 7:26:54 PM CET Christian Schoenebeck wrote:
> > > On Saturday, February 11, 2023 8:50:18 AM CET Dominique Martinet wrote:
> > > > I've been working on async RPCs for a while and never had time to debug
> > > > the last issues this had, but by limiting the async clunks to failures
> > > > the impact is drastically smaller and I've not been able to reproduce
> > > > any more bug so far.
> > > >
> > > > This will require some more testing and I'm tempted to say this is not
> > > > worth rushing this into the merge window next week-ish; the new problem
> > > > Jens reported with task_work isn't really new and I'd rather get this
> > > > right than rush new bugs in given the sour experience I've had with this
> > > > patch series... Hopefully it'll get in this time.
> > > > With that in mind I plan to take the patches in to my -next branch after
> > > > the merge window, so this has time to get reviewed first.
> > > >
> > > > I'd like to measure impact on performance as well, but really spent way
> > > > more time on this than I already have, so that'll have to wait a bit.
> > >
> > > I have not observed performance degradation, nor any (new) errors. So for
> > > this entire series:
> > >
> > > Tested-by: Christian Schoenebeck <[email protected]>
> >
> > Sorry, strike that. :-/
>
> Ugh! thanks for the test, I'm not making much sense of the trace (folio
> wait bit means some background write isn't happening when flushing
> before the setattr?) but I'll try to reproduce and have a look
> today/tomorrow
Yes, apparently it tries to write dirty pages of the mapped file and keeps
hanging there [fs/9p/vfs_inode_dotl.c:586]:
int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
struct dentry *dentry, struct iattr *iattr)
{
...
/* Write all dirty data */
if (S_ISREG(inode->i_mode))
filemap_write_and_wait(inode->i_mapping); <-- Hang
retval = p9_client_setattr(fid, &p9attr);
...
}
Christian Schoenebeck wrote on Tue, Feb 14, 2023 at 10:34:36AM +0100:
> > Ugh! thanks for the test, I'm not making much sense of the trace (folio
> > wait bit means some background write isn't happening when flushing
> > before the setattr?) but I'll try to reproduce and have a look
> > today/tomorrow
>
> Yes, apparently it tries to write dirty pages of the mapped file and keeps
> hanging there [fs/9p/vfs_inode_dotl.c:586]:
Yeah, it'd help to get the trace of the thread actually trying to do the
IO, if it still exists.
I had some hangs in the check that there are no flush in flight at some
point, and I thought I fixed that, but I can't really see anywhere else
that'd start hanging with this... it'll be clearer if I can reproduce.
Anyway, I found another bug, just running ./configure on a random project
(picked coreutils tarball) fails with interrupted system call ?!
checking for gawk... gawk
checking whether make sets $(MAKE)... yes
checking whether make supports nested variables... yes
checking whether make supports nested variables... (cached) yes
checking for gcc... gcc
./configure: line 6385: can't create conftest.c: Interrupted system call
checking whether the C compiler works... no
sed: can't read conftest.c: No such file or directory
configure: error: in `/mnt/coreutils-9.1':
configure: error: C compiler cannot create executables
See `config.log' for more details
This obviously doesn't happen if I run with `sh -x configure`, but
through execsnoop I caught it's just calling 'gcc conftest.c' and
there's nothing special in here... But it doesn't happen without these
patches, so I get something to dig at first.
With a bit of luck it's the same root cause...
--
Dominique
It's been a while but I didn't forget...
Dominique Martinet wrote on Tue, Feb 14, 2023 at 08:16:38PM +0900:
> > Yes, apparently it tries to write dirty pages of the mapped file and keeps
> > hanging there [fs/9p/vfs_inode_dotl.c:586]:
>
> Yeah, it'd help to get the trace of the thread actually trying to do the
> IO, if it still exists.
> I had some hangs in the check that there are no flush in flight at some
> point, and I thought I fixed that, but I can't really see anywhere else
> that'd start hanging with this... it'll be clearer if I can reproduce.
I couldn't reproduce this one, but manually inspecting
p9_client_wait_flush again I noticed the wait_event_interruptible was
waiting on req->flushed_req->wq but looking at req->status in the
condition; that was an error.
Also, we have a ref on req->flushed_req but not on req, so
req->flushed_req wasn't safe.
I've changed the code to add a variable directly on req->flushed_req and
use it consistently, I'm not sure that's the problem you ran into but it
might help.
It's been a while but do you remember if that hang was consistently
happening on shutdown, or was it a one time thing?
Either way, I'd appreciate if you could try my 9p-test branch again:
https://github.com/martinetd/linux/commits/9p-test
With that said, I expect that p9_client_wait_req will cause hangs on
broken servers.
If connection drops hopefully the reqs will just be marked as error and
free the thread, but I can see syzbot complaining about yet another
thread stuck.. Well it's interruptible at least, and bails out on
ERESTARTSYS.
> Anyway, I found another bug, just running ./configure on a random project
> (picked coreutils tarball) fails with interrupted system call ?!
That other bug was weird, I could reproduce it reliably until I rebooted
the host because of an unrelated nfs bug on the host, and after reboot I
couldn't reproduce anymore.
I'll chalk it down to buggy host/weird happenstance, but something to
watch for if random EINTR happen again :/
--
Dominique
On Sunday, March 19, 2023 12:53:46 PM CET Dominique Martinet wrote:
> It's been a while but I didn't forget...
>
> Dominique Martinet wrote on Tue, Feb 14, 2023 at 08:16:38PM +0900:
> > > Yes, apparently it tries to write dirty pages of the mapped file and keeps
> > > hanging there [fs/9p/vfs_inode_dotl.c:586]:
> >
> > Yeah, it'd help to get the trace of the thread actually trying to do the
> > IO, if it still exists.
> > I had some hangs in the check that there are no flush in flight at some
> > point, and I thought I fixed that, but I can't really see anywhere else
> > that'd start hanging with this... it'll be clearer if I can reproduce.
>
> I couldn't reproduce this one, but manually inspecting
> p9_client_wait_flush again I noticed the wait_event_interruptible was
> waiting on req->flushed_req->wq but looking at req->status in the
> condition; that was an error.
> Also, we have a ref on req->flushed_req but not on req, so
> req->flushed_req wasn't safe.
>
> I've changed the code to add a variable directly on req->flushed_req and
> use it consistently, I'm not sure that's the problem you ran into but it
> might help.
> It's been a while but do you remember if that hang was consistently
> happening on shutdown, or was it a one time thing?
>
> Either way, I'd appreciate if you could try my 9p-test branch again:
> https://github.com/martinetd/linux/commits/9p-test
>
>
> With that said, I expect that p9_client_wait_req will cause hangs on
> broken servers.
> If connection drops hopefully the reqs will just be marked as error and
> free the thread, but I can see syzbot complaining about yet another
> thread stuck.. Well it's interruptible at least, and bails out on
> ERESTARTSYS.
I gave your current test branch some spins today.
And yes, the problem did not happen reliably each time, but consistently
enough for me to reproduce it.
With your latest test branch it appears to have mitigated the problem. Once in
a while on shutdown I see it blocking for few minutes, but recovering:
[** ] (2 of 3) A stop job is running for …ave Random Seed (2min 36s / 10min)
[ 484.986388] INFO: task systemd-user-ru:8782 blocked for more than 120 seconds.
[ 484.990150] Tainted: G E 6.3.0-rc2+ #63
[ 484.992553] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 484.993657] task:systemd-user-ru state:D stack:0 pid:8782 ppid:1 flags:0x00000004
[ 484.994863] Call Trace:
[ 484.995398] <TASK>
[ 484.995866] __schedule (kernel/sched/core.c:5304 kernel/sched/core.c:6622)
[ 484.996614] schedule (kernel/sched/core.c:6699 (discriminator 1))
[ 484.997283] d_alloc_parallel (./include/linux/spinlock.h:350 fs/dcache.c:2626 fs/dcache.c:2707)
[ 484.998150] ? __pfx_default_wake_function (kernel/sched/core.c:6944)
[ 484.999213] __lookup_slow (fs/namei.c:1671)
[ 485.000006] walk_component (./include/linux/fs.h:773 fs/namei.c:1704 fs/namei.c:1994)
[ 485.000805] path_lookupat (fs/namei.c:2451 fs/namei.c:2475)
[ 485.001594] filename_lookup (fs/namei.c:2504)
[ 485.002452] ? __check_object_size (mm/usercopy.c:196 mm/usercopy.c:251 mm/usercopy.c:213)
[ 485.003523] ? strncpy_from_user (./include/asm-generic/access_ok.h:40 ./arch/x86/include/asm/uaccess.h:551 lib/strncpy_from_user.c:138)
[ 485.004537] user_path_at_empty (fs/namei.c:2879)
[ 485.005508] do_faccessat (fs/open.c:484)
[ 485.006410] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
[ 485.007281] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
[ 485.008495] RIP: 0033:0x7f7b4527d8f7
[ 485.009380] RSP: 002b:00007ffc9ee73c08 EFLAGS: 00000246 ORIG_RAX: 0000000000000015
[ 485.011118] RAX: ffffffffffffffda RBX: 00007f7b44ee2608 RCX: 00007f7b4527d8f7
[ 485.012831] RDX: 000000000000001c RSI: 0000000000000000 RDI: 00007f7b44eda86b
[ 485.014560] RBP: 0000000000000003 R08: 0000000000000000 R09: 00007f7b45360be0
[ 485.016246] R10: 00005571ade756c0 R11: 0000000000000246 R12: 00007ffc9ee73cf8
[ 485.017937] R13: 00007ffc9ee73d18 R14: 00007f7b44ee2610 R15: 0000000000000000
[ 485.019669] </TASK>
However that happens now before unmount rule is reached on shutdown?
Previously it was hanging after the shutdown rule was reached. Does this make
sense?
> > Anyway, I found another bug, just running ./configure on a random project
> > (picked coreutils tarball) fails with interrupted system call ?!
>
> That other bug was weird, I could reproduce it reliably until I rebooted
> the host because of an unrelated nfs bug on the host, and after reboot I
> couldn't reproduce anymore.
> I'll chalk it down to buggy host/weird happenstance, but something to
> watch for if random EINTR happen again :/
On Thursday, March 23, 2023 4:58:51 PM CET Christian Schoenebeck wrote:
> On Sunday, March 19, 2023 12:53:46 PM CET Dominique Martinet wrote:
> > It's been a while but I didn't forget...
> >
> > Dominique Martinet wrote on Tue, Feb 14, 2023 at 08:16:38PM +0900:
> > > > Yes, apparently it tries to write dirty pages of the mapped file and keeps
> > > > hanging there [fs/9p/vfs_inode_dotl.c:586]:
> > >
> > > Yeah, it'd help to get the trace of the thread actually trying to do the
> > > IO, if it still exists.
> > > I had some hangs in the check that there are no flush in flight at some
> > > point, and I thought I fixed that, but I can't really see anywhere else
> > > that'd start hanging with this... it'll be clearer if I can reproduce.
> >
> > I couldn't reproduce this one, but manually inspecting
> > p9_client_wait_flush again I noticed the wait_event_interruptible was
> > waiting on req->flushed_req->wq but looking at req->status in the
> > condition; that was an error.
> > Also, we have a ref on req->flushed_req but not on req, so
> > req->flushed_req wasn't safe.
> >
> > I've changed the code to add a variable directly on req->flushed_req and
> > use it consistently, I'm not sure that's the problem you ran into but it
> > might help.
> > It's been a while but do you remember if that hang was consistently
> > happening on shutdown, or was it a one time thing?
> >
> > Either way, I'd appreciate if you could try my 9p-test branch again:
> > https://github.com/martinetd/linux/commits/9p-test
> >
> >
> > With that said, I expect that p9_client_wait_req will cause hangs on
> > broken servers.
> > If connection drops hopefully the reqs will just be marked as error and
> > free the thread, but I can see syzbot complaining about yet another
> > thread stuck.. Well it's interruptible at least, and bails out on
> > ERESTARTSYS.
>
> I gave your current test branch some spins today.
>
> And yes, the problem did not happen reliably each time, but consistently
> enough for me to reproduce it.
>
> With your latest test branch it appears to have mitigated the problem. Once in
> a while on shutdown I see it blocking for few minutes, but recovering:
>
> [** ] (2 of 3) A stop job is running for …ave Random Seed (2min 36s / 10min)
> [ 484.986388] INFO: task systemd-user-ru:8782 blocked for more than 120 seconds.
> [ 484.990150] Tainted: G E 6.3.0-rc2+ #63
> [ 484.992553] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 484.993657] task:systemd-user-ru state:D stack:0 pid:8782 ppid:1 flags:0x00000004
> [ 484.994863] Call Trace:
> [ 484.995398] <TASK>
> [ 484.995866] __schedule (kernel/sched/core.c:5304 kernel/sched/core.c:6622)
> [ 484.996614] schedule (kernel/sched/core.c:6699 (discriminator 1))
> [ 484.997283] d_alloc_parallel (./include/linux/spinlock.h:350 fs/dcache.c:2626 fs/dcache.c:2707)
> [ 484.998150] ? __pfx_default_wake_function (kernel/sched/core.c:6944)
> [ 484.999213] __lookup_slow (fs/namei.c:1671)
> [ 485.000006] walk_component (./include/linux/fs.h:773 fs/namei.c:1704 fs/namei.c:1994)
> [ 485.000805] path_lookupat (fs/namei.c:2451 fs/namei.c:2475)
> [ 485.001594] filename_lookup (fs/namei.c:2504)
> [ 485.002452] ? __check_object_size (mm/usercopy.c:196 mm/usercopy.c:251 mm/usercopy.c:213)
> [ 485.003523] ? strncpy_from_user (./include/asm-generic/access_ok.h:40 ./arch/x86/include/asm/uaccess.h:551 lib/strncpy_from_user.c:138)
> [ 485.004537] user_path_at_empty (fs/namei.c:2879)
> [ 485.005508] do_faccessat (fs/open.c:484)
> [ 485.006410] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
> [ 485.007281] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
> [ 485.008495] RIP: 0033:0x7f7b4527d8f7
> [ 485.009380] RSP: 002b:00007ffc9ee73c08 EFLAGS: 00000246 ORIG_RAX: 0000000000000015
> [ 485.011118] RAX: ffffffffffffffda RBX: 00007f7b44ee2608 RCX: 00007f7b4527d8f7
> [ 485.012831] RDX: 000000000000001c RSI: 0000000000000000 RDI: 00007f7b44eda86b
> [ 485.014560] RBP: 0000000000000003 R08: 0000000000000000 R09: 00007f7b45360be0
> [ 485.016246] R10: 00005571ade756c0 R11: 0000000000000246 R12: 00007ffc9ee73cf8
> [ 485.017937] R13: 00007ffc9ee73d18 R14: 00007f7b44ee2610 R15: 0000000000000000
> [ 485.019669] </TASK>
>
> However that happens now before unmount rule is reached on shutdown?
> Previously it was hanging after the shutdown rule was reached. Does this make
> sense?
Correction: I still get the previous issue on unmount with your latest version
as well. :/
Call trace looks slightly different, but it seems to be the same issue:
[ OK ] Reached target Unmount All Filesystems.
[* ] A stop job is running for Load/Save Random Seed (3min 27s / 10min)
[ 243.031668] INFO: task systemd-journal:199 blocked for more than 120 seconds.
[ 243.036460] Tainted: G E 6.3.0-rc2+ #63
[ 243.037666] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 243.039004] task:systemd-journal state:D stack:0 pid:199 ppid:1 flags:0x00004004
[ 243.040434] Call Trace:
[ 243.040878] <TASK>
[ 243.041265] __schedule (kernel/sched/core.c:5304 kernel/sched/core.c:6622)
[ 243.041887] ? __pagevec_release (./include/linux/pagevec.h:38 mm/swap.c:1063)
[ 243.042594] schedule (kernel/sched/core.c:6699 (discriminator 1))
[ 243.043150] io_schedule (kernel/sched/core.c:8859 kernel/sched/core.c:8885)
[ 243.043751] folio_wait_bit_common (mm/filemap.c:1293)
[ 243.044519] ? __pfx_wake_page_function (mm/filemap.c:1083)
[ 243.045321] folio_wait_writeback (./arch/x86/include/asm/bitops.h:207 ./arch/x86/include/asm/bitops.h:239 ./include/asm-generic/bitops/instrumented-non-atomic.h:142 ./include/linux/page-flags.h:518 mm/page-writeback.c:3125)
[ 243.046043] __filemap_fdatawait_range (./arch/x86/include/asm/bitops.h:75 (discriminator 3) ./include/asm-generic/bitops/instrumented-atomic.h:42 (discriminator 3) ./include/linux/page-flags.h:473 (discriminator 3) mm/filemap.c:526 (discriminator 3))
[ 243.046828] ? filemap_fdatawrite_wbc (mm/filemap.c:393)
[ 243.047622] ? __filemap_fdatawrite_range (mm/filemap.c:424)
[ 243.048531] filemap_write_and_wait_range (mm/filemap.c:689 mm/filemap.c:669)
[ 243.049358] v9fs_vfs_setattr_dotl (./include/linux/pagemap.h:60 fs/9p/vfs_inode_dotl.c:587) 9p
[ 243.050229] notify_change (fs/attr.c:486)
[ 243.050924] ? __vfs_getxattr (fs/xattr.c:426)
[ 243.051571] ? do_truncate (./include/linux/fs.h:763 fs/open.c:67)
[ 243.052224] do_truncate (./include/linux/fs.h:763 fs/open.c:67)
[ 243.052816] do_sys_ftruncate (fs/open.c:194)
[ 243.053548] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
[ 243.054187] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
[ 243.055057] RIP: 0033:0x7f6db123ab77
[ 243.055706] RSP: 002b:00007ffc4dda0ca8 EFLAGS: 00000206 ORIG_RAX: 000000000000004d
[ 243.056978] RAX: ffffffffffffffda RBX: 00007ffc4dda0cf0 RCX: 00007f6db123ab77
[ 243.058198] RDX: 000055715f9b2a00 RSI: 0000000001000000 RDI: 0000000000000018
[ 243.059478] RBP: 000055715f9b2bf0 R08: 0000000000000001 R09: 000055715f9b2c74
[ 243.060716] R10: 0000000000000000 R11: 0000000000000206 R12: 00007ffc4dda0ce8
I just realized I don't have to do anything fancy to reproduce this. I just
have to boot and shutdown guest few times to trigger it.