This series has only been lightly tested, but it seems to do the trick... how
does it look to you? The commit messages could be better, but I'm not familiar
with the right 9p terminology for some things.
-- >8 --
An upcoming patch is going to require passing the client through
p9_req_put() -> p9_req_free(), but that's awkward with the kref
indirection - so this patch switches to using refcount_t directly.
Signed-off-by: Kent Overstreet <[email protected]>
Cc: Eric Van Hensbergen <[email protected]>
Cc: Latchesar Ionkov <[email protected]>
Cc: Dominique Martinet <[email protected]>
---
include/net/9p/client.h | 6 +++---
net/9p/client.c | 19 ++++++++-----------
2 files changed, 11 insertions(+), 14 deletions(-)
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index ec1d1706f4..c038c2d73d 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -76,7 +76,7 @@ enum p9_req_status_t {
struct p9_req_t {
int status;
int t_err;
- struct kref refcount;
+ refcount_t refcount;
wait_queue_head_t wq;
struct p9_fcall tc;
struct p9_fcall rc;
@@ -227,12 +227,12 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag);
static inline void p9_req_get(struct p9_req_t *r)
{
- kref_get(&r->refcount);
+ refcount_inc(&r->refcount);
}
static inline int p9_req_try_get(struct p9_req_t *r)
{
- return kref_get_unless_zero(&r->refcount);
+ return refcount_inc_not_zero(&r->refcount);
}
int p9_req_put(struct p9_req_t *r);
diff --git a/net/9p/client.c b/net/9p/client.c
index 8bba0d9cf9..0ee48e8b72 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -305,7 +305,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
* callback), so p9_client_cb eats the second ref there
* as the pointer is duplicated directly by virtqueue_add_sgs()
*/
- refcount_set(&req->refcount.refcount, 2);
+ refcount_set(&req->refcount, 2);
return req;
@@ -370,18 +370,15 @@ static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r)
return p9_req_put(r);
}
-static void p9_req_free(struct kref *ref)
-{
- struct p9_req_t *r = container_of(ref, struct p9_req_t, refcount);
-
- p9_fcall_fini(&r->tc);
- p9_fcall_fini(&r->rc);
- kmem_cache_free(p9_req_cache, r);
-}
-
int p9_req_put(struct p9_req_t *r)
{
- return kref_put(&r->refcount, p9_req_free);
+ if (refcount_dec_and_test(&r->refcount)) {
+ p9_fcall_fini(&r->tc);
+ p9_fcall_fini(&r->rc);
+ kmem_cache_free(p9_req_cache, r);
+ return 1;
+ }
+ return 0;
}
EXPORT_SYMBOL(p9_req_put);
--
2.36.1
This is to aid in adding mempools, in the next patch.
Signed-off-by: Kent Overstreet <[email protected]>
Cc: Eric Van Hensbergen <[email protected]>
Cc: Latchesar Ionkov <[email protected]>
Cc: Dominique Martinet <[email protected]>
---
include/net/9p/client.h | 2 +-
net/9p/client.c | 12 ++++++------
net/9p/trans_fd.c | 12 ++++++------
net/9p/trans_rdma.c | 2 +-
net/9p/trans_virtio.c | 4 ++--
net/9p/trans_xen.c | 2 +-
6 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index c038c2d73d..cb78e0e333 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -235,7 +235,7 @@ static inline int p9_req_try_get(struct p9_req_t *r)
return refcount_inc_not_zero(&r->refcount);
}
-int p9_req_put(struct p9_req_t *r);
+int p9_req_put(struct p9_client *c, struct p9_req_t *r);
void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status);
diff --git a/net/9p/client.c b/net/9p/client.c
index 0ee48e8b72..a36a40137c 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -341,7 +341,7 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag)
if (!p9_req_try_get(req))
goto again;
if (req->tc.tag != tag) {
- p9_req_put(req);
+ p9_req_put(c, req);
goto again;
}
}
@@ -367,10 +367,10 @@ static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r)
spin_lock_irqsave(&c->lock, flags);
idr_remove(&c->reqs, tag);
spin_unlock_irqrestore(&c->lock, flags);
- return p9_req_put(r);
+ return p9_req_put(c, r);
}
-int p9_req_put(struct p9_req_t *r)
+int p9_req_put(struct p9_client *c, struct p9_req_t *r)
{
if (refcount_dec_and_test(&r->refcount)) {
p9_fcall_fini(&r->tc);
@@ -423,7 +423,7 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
wake_up(&req->wq);
p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag);
- p9_req_put(req);
+ p9_req_put(c, req);
}
EXPORT_SYMBOL(p9_client_cb);
@@ -706,7 +706,7 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
reterr:
p9_tag_remove(c, req);
/* We have to put also the 2nd reference as it won't be used */
- p9_req_put(req);
+ p9_req_put(c, req);
return ERR_PTR(err);
}
@@ -743,7 +743,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
err = c->trans_mod->request(c, req);
if (err < 0) {
/* write won't happen */
- p9_req_put(req);
+ p9_req_put(c, req);
if (err != -ERESTARTSYS && err != -EFAULT)
c->status = Disconnected;
goto recalc_sigpending;
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 8f8f95e39b..007c3f45fe 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -378,7 +378,7 @@ static void p9_read_work(struct work_struct *work)
m->rc.sdata = NULL;
m->rc.offset = 0;
m->rc.capacity = 0;
- p9_req_put(m->rreq);
+ p9_req_put(m->client, m->rreq);
m->rreq = NULL;
}
@@ -492,7 +492,7 @@ static void p9_write_work(struct work_struct *work)
m->wpos += err;
if (m->wpos == m->wsize) {
m->wpos = m->wsize = 0;
- p9_req_put(m->wreq);
+ p9_req_put(m->client, m->wreq);
m->wreq = NULL;
}
@@ -695,7 +695,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
if (req->status == REQ_STATUS_UNSENT) {
list_del(&req->req_list);
req->status = REQ_STATUS_FLSHD;
- p9_req_put(req);
+ p9_req_put(client, req);
ret = 0;
}
spin_unlock(&client->lock);
@@ -722,7 +722,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
list_del(&req->req_list);
req->status = REQ_STATUS_FLSHD;
spin_unlock(&client->lock);
- p9_req_put(req);
+ p9_req_put(client, req);
return 0;
}
@@ -883,12 +883,12 @@ static void p9_conn_destroy(struct p9_conn *m)
p9_mux_poll_stop(m);
cancel_work_sync(&m->rq);
if (m->rreq) {
- p9_req_put(m->rreq);
+ p9_req_put(m->client, m->rreq);
m->rreq = NULL;
}
cancel_work_sync(&m->wq);
if (m->wreq) {
- p9_req_put(m->wreq);
+ p9_req_put(m->client, m->wreq);
m->wreq = NULL;
}
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 88e5638266..d817d37452 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -350,7 +350,7 @@ send_done(struct ib_cq *cq, struct ib_wc *wc)
c->busa, c->req->tc.size,
DMA_TO_DEVICE);
up(&rdma->sq_sem);
- p9_req_put(c->req);
+ p9_req_put(client, c->req);
kfree(c);
}
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index b24a4fb0f0..147972bf2e 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -199,7 +199,7 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req)
/* Reply won't come, so drop req ref */
static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req)
{
- p9_req_put(req);
+ p9_req_put(client, req);
return 0;
}
@@ -523,7 +523,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
kvfree(out_pages);
if (!kicked) {
/* reply won't come */
- p9_req_put(req);
+ p9_req_put(client, req);
}
return err;
}
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index 77883b6788..4cf0c78d4d 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -163,7 +163,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req)
ring->intf->out_prod = prod;
spin_unlock_irqrestore(&ring->lock, flags);
notify_remote_via_irq(ring->irq);
- p9_req_put(p9_req);
+ p9_req_put(client, p9_req);
return 0;
}
--
2.36.1
Signed-off-by: Kent Overstreet <[email protected]>
Cc: Eric Van Hensbergen <[email protected]>
Cc: Latchesar Ionkov <[email protected]>
Cc: Dominique Martinet <[email protected]>
---
include/net/9p/9p.h | 6 ++++-
include/net/9p/client.h | 5 +++-
net/9p/client.c | 58 ++++++++++++++++++++++++++++++-----------
net/9p/trans_rdma.c | 2 +-
4 files changed, 53 insertions(+), 18 deletions(-)
diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
index 24a509f559..c0d59b53c1 100644
--- a/include/net/9p/9p.h
+++ b/include/net/9p/9p.h
@@ -539,12 +539,16 @@ struct p9_rstatfs {
struct p9_fcall {
u32 size;
u8 id;
+ enum p9_fcall_alloc {
+ P9_FCALL_KMEM_CACHE,
+ P9_FCALL_KMALLOC,
+ P9_FCALL_MEMPOOL,
+ } allocated;
u16 tag;
size_t offset;
size_t capacity;
- struct kmem_cache *cache;
u8 *sdata;
};
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index cb78e0e333..6517ca36bf 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -9,6 +9,7 @@
#ifndef NET_9P_CLIENT_H
#define NET_9P_CLIENT_H
+#include <linux/mempool.h>
#include <linux/utsname.h>
#include <linux/idr.h>
@@ -106,6 +107,7 @@ struct p9_client {
enum p9_trans_status status;
void *trans;
struct kmem_cache *fcall_cache;
+ mempool_t pools[2];
union {
struct {
@@ -222,7 +224,8 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
kgid_t gid, struct p9_qid *qid);
int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status);
int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl);
-void p9_fcall_fini(struct p9_fcall *fc);
+void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc,
+ int fc_idx);
struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag);
static inline void p9_req_get(struct p9_req_t *r)
diff --git a/net/9p/client.c b/net/9p/client.c
index a36a40137c..82061c49cb 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -219,22 +219,34 @@ static int parse_opts(char *opts, struct p9_client *clnt)
}
static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
- int alloc_msize)
+ int fc_idx, unsigned alloc_msize)
{
+ WARN(alloc_msize > c->msize, "alloc_mize %u client msize %u",
+ alloc_msize, c->msize);
+
if (likely(c->fcall_cache) && alloc_msize == c->msize) {
fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
- fc->cache = c->fcall_cache;
+ fc->allocated = P9_FCALL_KMEM_CACHE;
} else {
fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
- fc->cache = NULL;
+ fc->allocated = P9_FCALL_KMALLOC;
}
- if (!fc->sdata)
+
+ if (!fc->sdata >> alloc_msize > c->msize)
return -ENOMEM;
+
+ if (!fc->sdata) {
+ fc->sdata = mempool_alloc(&c->pools[fc_idx], GFP_NOFS);
+ fc->allocated = P9_FCALL_MEMPOOL;
+ alloc_msize = c->msize;
+ }
+
fc->capacity = alloc_msize;
return 0;
}
-void p9_fcall_fini(struct p9_fcall *fc)
+void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc,
+ int fc_idx)
{
/* sdata can be NULL for interrupted requests in trans_rdma,
* and kmem_cache_free does not do NULL-check for us
@@ -242,10 +254,17 @@ void p9_fcall_fini(struct p9_fcall *fc)
if (unlikely(!fc->sdata))
return;
- if (fc->cache)
- kmem_cache_free(fc->cache, fc->sdata);
- else
+ switch (fc->allocated) {
+ case P9_FCALL_KMEM_CACHE:
+ kmem_cache_free(c->fcall_cache, fc->sdata);
+ break;
+ case P9_FCALL_KMALLOC:
kfree(fc->sdata);
+ break;
+ case P9_FCALL_MEMPOOL:
+ mempool_free(fc->sdata, &c->pools[fc_idx]);
+ break;
+ }
}
EXPORT_SYMBOL(p9_fcall_fini);
@@ -270,9 +289,9 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
if (!req)
return ERR_PTR(-ENOMEM);
- if (p9_fcall_init(c, &req->tc, alloc_msize))
+ if (p9_fcall_init(c, &req->tc, 0, alloc_msize))
goto free_req;
- if (p9_fcall_init(c, &req->rc, alloc_msize))
+ if (p9_fcall_init(c, &req->rc, 1, alloc_msize))
goto free;
p9pdu_reset(&req->tc);
@@ -310,8 +329,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
return req;
free:
- p9_fcall_fini(&req->tc);
- p9_fcall_fini(&req->rc);
+ p9_fcall_fini(c, &req->tc, 0);
+ p9_fcall_fini(c, &req->rc, 1);
free_req:
kmem_cache_free(p9_req_cache, req);
return ERR_PTR(-ENOMEM);
@@ -373,8 +392,8 @@ static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r)
int p9_req_put(struct p9_client *c, struct p9_req_t *r)
{
if (refcount_dec_and_test(&r->refcount)) {
- p9_fcall_fini(&r->tc);
- p9_fcall_fini(&r->rc);
+ p9_fcall_fini(c, &r->tc, 0);
+ p9_fcall_fini(c, &r->rc, 1);
kmem_cache_free(p9_req_cache, r);
return 1;
}
@@ -999,7 +1018,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
char *client_id;
err = 0;
- clnt = kmalloc(sizeof(*clnt), GFP_KERNEL);
+ clnt = kzalloc(sizeof(*clnt), GFP_KERNEL);
if (!clnt)
return ERR_PTR(-ENOMEM);
@@ -1063,6 +1082,11 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
clnt->msize - (P9_HDRSZ + 4),
NULL);
+ err = mempool_init_kmalloc_pool(&clnt->pools[0], 4, clnt->msize) ?:
+ mempool_init_kmalloc_pool(&clnt->pools[1], 4, clnt->msize);
+ if (err)
+ goto close_trans;
+
return clnt;
close_trans:
@@ -1070,6 +1094,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
put_trans:
v9fs_put_trans(clnt->trans_mod);
free_client:
+ mempool_exit(&clnt->pools[1]);
+ mempool_exit(&clnt->pools[0]);
kfree(clnt);
return ERR_PTR(err);
}
@@ -1094,6 +1120,8 @@ void p9_client_destroy(struct p9_client *clnt)
p9_tag_cleanup(clnt);
+ mempool_exit(&clnt->pools[1]);
+ mempool_exit(&clnt->pools[0]);
kmem_cache_destroy(clnt->fcall_cache);
kfree(clnt);
}
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index d817d37452..99d878d70d 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -431,7 +431,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
/* Got one! */
- p9_fcall_fini(&req->rc);
+ p9_fcall_fini(client, &req->rc, 1);
req->rc.sdata = NULL;
goto dont_need_post_recv;
} else {
--
2.36.1
Thanks for the patches!
first two patches look good, couple of comments below for this one
Kent Overstreet wrote on Sun, Jul 03, 2022 at 09:42:43PM -0400:
> Signed-off-by: Kent Overstreet <[email protected]>
> Cc: Eric Van Hensbergen <[email protected]>
> Cc: Latchesar Ionkov <[email protected]>
> Cc: Dominique Martinet <[email protected]>
> ---
> include/net/9p/9p.h | 6 ++++-
> include/net/9p/client.h | 5 +++-
> net/9p/client.c | 58 ++++++++++++++++++++++++++++++-----------
> net/9p/trans_rdma.c | 2 +-
> 4 files changed, 53 insertions(+), 18 deletions(-)
>
> diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
> index 24a509f559..c0d59b53c1 100644
> --- a/include/net/9p/9p.h
> +++ b/include/net/9p/9p.h
> @@ -539,12 +539,16 @@ struct p9_rstatfs {
> struct p9_fcall {
> u32 size;
> u8 id;
> + enum p9_fcall_alloc {
> + P9_FCALL_KMEM_CACHE,
> + P9_FCALL_KMALLOC,
> + P9_FCALL_MEMPOOL,
> + } allocated;
> u16 tag;
>
> size_t offset;
> size_t capacity;
>
> - struct kmem_cache *cache;
> u8 *sdata;
> };
>
> diff --git a/include/net/9p/client.h b/include/net/9p/client.h
> index cb78e0e333..6517ca36bf 100644
> --- a/include/net/9p/client.h
> +++ b/include/net/9p/client.h
> @@ -9,6 +9,7 @@
> #ifndef NET_9P_CLIENT_H
> #define NET_9P_CLIENT_H
>
> +#include <linux/mempool.h>
> #include <linux/utsname.h>
> #include <linux/idr.h>
>
> @@ -106,6 +107,7 @@ struct p9_client {
> enum p9_trans_status status;
> void *trans;
> struct kmem_cache *fcall_cache;
> + mempool_t pools[2];
>
> union {
> struct {
> @@ -222,7 +224,8 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
> kgid_t gid, struct p9_qid *qid);
> int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status);
> int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl);
> -void p9_fcall_fini(struct p9_fcall *fc);
> +void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc,
> + int fc_idx);
> struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag);
>
> static inline void p9_req_get(struct p9_req_t *r)
> diff --git a/net/9p/client.c b/net/9p/client.c
> index a36a40137c..82061c49cb 100644
> --- a/net/9p/client.c
> +++ b/net/9p/client.c
> @@ -219,22 +219,34 @@ static int parse_opts(char *opts, struct p9_client *clnt)
> }
>
> static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
> - int alloc_msize)
> + int fc_idx, unsigned alloc_msize)
> {
> + WARN(alloc_msize > c->msize, "alloc_mize %u client msize %u",
> + alloc_msize, c->msize);
> +
> if (likely(c->fcall_cache) && alloc_msize == c->msize) {
> fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
> - fc->cache = c->fcall_cache;
> + fc->allocated = P9_FCALL_KMEM_CACHE;
> } else {
> fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
> - fc->cache = NULL;
> + fc->allocated = P9_FCALL_KMALLOC;
> }
> - if (!fc->sdata)
> +
> + if (!fc->sdata >> alloc_msize > c->msize)
> return -ENOMEM;
probably meant && instead of >> ?
I'd also move this alloc_msize > c->msize check just below the warn to
keep it early if you want to keep it, but if we want to warn here it
really should be in p9_tag_alloc, which already cuts the user argument
short with a `min(c->msize, max_size)`
We shouldn't have any user calling with more at this point (the
user-provided size comes from p9_client_prepare_req arguments and it's
either msize or header size constants); and it probably makes sense to
check and error out rather than cap it.
> +
> + if (!fc->sdata) {
> + fc->sdata = mempool_alloc(&c->pools[fc_idx], GFP_NOFS);
> + fc->allocated = P9_FCALL_MEMPOOL;
> + alloc_msize = c->msize;
hm, so you try with the kmalloc/kmem_cache first and only fallback to
mempool if that failed?
What's the point of keeping the kmem cache in this case, instead of
routing all size-appropriate requests to the mempool?
(honest question)
> + }
> +
> fc->capacity = alloc_msize;
> return 0;
> }
>
> -void p9_fcall_fini(struct p9_fcall *fc)
> +void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc,
> + int fc_idx)
> {
> /* sdata can be NULL for interrupted requests in trans_rdma,
> * and kmem_cache_free does not do NULL-check for us
> @@ -242,10 +254,17 @@ void p9_fcall_fini(struct p9_fcall *fc)
> if (unlikely(!fc->sdata))
> return;
>
> - if (fc->cache)
> - kmem_cache_free(fc->cache, fc->sdata);
> - else
> + switch (fc->allocated) {
> + case P9_FCALL_KMEM_CACHE:
> + kmem_cache_free(c->fcall_cache, fc->sdata);
> + break;
> + case P9_FCALL_KMALLOC:
> kfree(fc->sdata);
> + break;
> + case P9_FCALL_MEMPOOL:
> + mempool_free(fc->sdata, &c->pools[fc_idx]);
> + break;
> + }
> }
> EXPORT_SYMBOL(p9_fcall_fini);
>
> @@ -270,9 +289,9 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
> if (!req)
> return ERR_PTR(-ENOMEM);
>
> - if (p9_fcall_init(c, &req->tc, alloc_msize))
> + if (p9_fcall_init(c, &req->tc, 0, alloc_msize))
> goto free_req;
> - if (p9_fcall_init(c, &req->rc, alloc_msize))
> + if (p9_fcall_init(c, &req->rc, 1, alloc_msize))
given the two rc/tc buffers are of same size I don't see the point of
using two caches either, you could just double the min number of
elements to the same effect?
> goto free;
>
> p9pdu_reset(&req->tc);
> @@ -310,8 +329,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
> return req;
>
> free:
> - p9_fcall_fini(&req->tc);
> - p9_fcall_fini(&req->rc);
> + p9_fcall_fini(c, &req->tc, 0);
> + p9_fcall_fini(c, &req->rc, 1);
> free_req:
> kmem_cache_free(p9_req_cache, req);
> return ERR_PTR(-ENOMEM);
> @@ -373,8 +392,8 @@ static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r)
> int p9_req_put(struct p9_client *c, struct p9_req_t *r)
> {
> if (refcount_dec_and_test(&r->refcount)) {
> - p9_fcall_fini(&r->tc);
> - p9_fcall_fini(&r->rc);
> + p9_fcall_fini(c, &r->tc, 0);
> + p9_fcall_fini(c, &r->rc, 1);
> kmem_cache_free(p9_req_cache, r);
> return 1;
> }
> @@ -999,7 +1018,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
> char *client_id;
>
> err = 0;
> - clnt = kmalloc(sizeof(*clnt), GFP_KERNEL);
> + clnt = kzalloc(sizeof(*clnt), GFP_KERNEL);
yes, thanks. Can simplify exit path a bit more with that but I'll do it.
> if (!clnt)
> return ERR_PTR(-ENOMEM);
>
> @@ -1063,6 +1082,11 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
> clnt->msize - (P9_HDRSZ + 4),
> NULL);
>
> + err = mempool_init_kmalloc_pool(&clnt->pools[0], 4, clnt->msize) ?:
> + mempool_init_kmalloc_pool(&clnt->pools[1], 4, clnt->msize);
I was thinking of using the slab helpers when I looked at it earlier,
e.g.
mempool_init_slab_pool(XYZ, clnt->fcall_cache);
Are there any real differences between the two?
(that also made me notice the create/init difference; I agree init is
probably better than create here)
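For context, the two calls would look roughly like this (just a sketch
reusing the names from the patch):

/* kmalloc-backed: reserved elements come from kmalloc(clnt->msize),
 * and anything beyond the reserve is freed back through kfree() */
err = mempool_init_kmalloc_pool(&clnt->pools[0], 4, clnt->msize);

/* slab-backed: reserved elements come from kmem_cache_alloc() on the
 * existing fcall_cache, so it has to be set up after that cache */
err = mempool_init_slab_pool(&clnt->pools[0], 4, clnt->fcall_cache);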
--
Dominique
> + if (err)
> + goto close_trans;
> +
> return clnt;
>
> close_trans:
> @@ -1070,6 +1094,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
> put_trans:
> v9fs_put_trans(clnt->trans_mod);
> free_client:
> + mempool_exit(&clnt->pools[1]);
> + mempool_exit(&clnt->pools[0]);
> kfree(clnt);
> return ERR_PTR(err);
> }
> @@ -1094,6 +1120,8 @@ void p9_client_destroy(struct p9_client *clnt)
>
> p9_tag_cleanup(clnt);
>
> + mempool_exit(&clnt->pools[1]);
> + mempool_exit(&clnt->pools[0]);
> kmem_cache_destroy(clnt->fcall_cache);
> kfree(clnt);
> }
> diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
> index d817d37452..99d878d70d 100644
> --- a/net/9p/trans_rdma.c
> +++ b/net/9p/trans_rdma.c
> @@ -431,7 +431,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
> if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
> if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
> /* Got one! */
> - p9_fcall_fini(&req->rc);
> + p9_fcall_fini(client, &req->rc, 1);
> req->rc.sdata = NULL;
> goto dont_need_post_recv;
> } else {
On Mon, Jul 04, 2022 at 11:22:42AM +0900, Dominique Martinet wrote:
> Thanks for the patches!
>
> first two patches look good, couple of comments below for this one
>
> Kent Overstreet wrote on Sun, Jul 03, 2022 at 09:42:43PM -0400:
> > Signed-off-by: Kent Overstreet <[email protected]>
> > Cc: Eric Van Hensbergen <[email protected]>
> > Cc: Latchesar Ionkov <[email protected]>
> > Cc: Dominique Martinet <[email protected]>
> > ---
> > include/net/9p/9p.h | 6 ++++-
> > include/net/9p/client.h | 5 +++-
> > net/9p/client.c | 58 ++++++++++++++++++++++++++++++-----------
> > net/9p/trans_rdma.c | 2 +-
> > 4 files changed, 53 insertions(+), 18 deletions(-)
> >
> > diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
> > index 24a509f559..c0d59b53c1 100644
> > --- a/include/net/9p/9p.h
> > +++ b/include/net/9p/9p.h
> > @@ -539,12 +539,16 @@ struct p9_rstatfs {
> > struct p9_fcall {
> > u32 size;
> > u8 id;
> > + enum p9_fcall_alloc {
> > + P9_FCALL_KMEM_CACHE,
> > + P9_FCALL_KMALLOC,
> > + P9_FCALL_MEMPOOL,
> > + } allocated;
> > u16 tag;
> >
> > size_t offset;
> > size_t capacity;
> >
> > - struct kmem_cache *cache;
> > u8 *sdata;
> > };
> >
> > diff --git a/include/net/9p/client.h b/include/net/9p/client.h
> > index cb78e0e333..6517ca36bf 100644
> > --- a/include/net/9p/client.h
> > +++ b/include/net/9p/client.h
> > @@ -9,6 +9,7 @@
> > #ifndef NET_9P_CLIENT_H
> > #define NET_9P_CLIENT_H
> >
> > +#include <linux/mempool.h>
> > #include <linux/utsname.h>
> > #include <linux/idr.h>
> >
> > @@ -106,6 +107,7 @@ struct p9_client {
> > enum p9_trans_status status;
> > void *trans;
> > struct kmem_cache *fcall_cache;
> > + mempool_t pools[2];
> >
> > union {
> > struct {
> > @@ -222,7 +224,8 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
> > kgid_t gid, struct p9_qid *qid);
> > int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status);
> > int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl);
> > -void p9_fcall_fini(struct p9_fcall *fc);
> > +void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc,
> > + int fc_idx);
> > struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag);
> >
> > static inline void p9_req_get(struct p9_req_t *r)
> > diff --git a/net/9p/client.c b/net/9p/client.c
> > index a36a40137c..82061c49cb 100644
> > --- a/net/9p/client.c
> > +++ b/net/9p/client.c
> > @@ -219,22 +219,34 @@ static int parse_opts(char *opts, struct p9_client *clnt)
> > }
> >
> > static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
> > - int alloc_msize)
> > + int fc_idx, unsigned alloc_msize)
> > {
> > + WARN(alloc_msize > c->msize, "alloc_mize %u client msize %u",
> > + alloc_msize, c->msize);
> > +
> > if (likely(c->fcall_cache) && alloc_msize == c->msize) {
> > fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
> > - fc->cache = c->fcall_cache;
> > + fc->allocated = P9_FCALL_KMEM_CACHE;
> > } else {
> > fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
> > - fc->cache = NULL;
> > + fc->allocated = P9_FCALL_KMALLOC;
> > }
> > - if (!fc->sdata)
> > +
> > + if (!fc->sdata >> alloc_msize > c->msize)
> > return -ENOMEM;
>
> probably meant && instead of >> ?
Thanks, good catch
> I'd also move this alloc_msize > c->msize check just below the warn to
> keep it early if you want to keep it, but if we want to warn here it
> really should be in p9_tag_alloc, which already cuts the user argument
> short with a `min(c->msize, max_size)`
>
> We shouldn't have any user calling with more at this point (the
> user-provided size comes from p9_client_prepare_req arguments and it's
> either msize or header size constants); and it probably makes sense to
> check and error out rather than cap it.
If that's the case I think we should just switch the warning to a BUG_ON() - I
just wasn't sure from reading the code if that was really guarded against.
> hm, so you try with the kmalloc/kmem_cache first and only fallback to
> mempool if that failed?
>
> What's the point of keeping the kmem cache in this case, instead of
> routing all size-appropriate requests to the mempool?
> (honest question)
Actually none, and I should've made it a kmem_cache mempool, not a kmalloc pool.
>
> > + }
> > +
> > fc->capacity = alloc_msize;
> > return 0;
> > }
> >
> > -void p9_fcall_fini(struct p9_fcall *fc)
> > +void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc,
> > + int fc_idx)
> > {
> > /* sdata can be NULL for interrupted requests in trans_rdma,
> > * and kmem_cache_free does not do NULL-check for us
> > @@ -242,10 +254,17 @@ void p9_fcall_fini(struct p9_fcall *fc)
> > if (unlikely(!fc->sdata))
> > return;
> >
> > - if (fc->cache)
> > - kmem_cache_free(fc->cache, fc->sdata);
> > - else
> > + switch (fc->allocated) {
> > + case P9_FCALL_KMEM_CACHE:
> > + kmem_cache_free(c->fcall_cache, fc->sdata);
> > + break;
> > + case P9_FCALL_KMALLOC:
> > kfree(fc->sdata);
> > + break;
> > + case P9_FCALL_MEMPOOL:
> > + mempool_free(fc->sdata, &c->pools[fc_idx]);
> > + break;
> > + }
> > }
> > EXPORT_SYMBOL(p9_fcall_fini);
> >
> > @@ -270,9 +289,9 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
> > if (!req)
> > return ERR_PTR(-ENOMEM);
> >
> > - if (p9_fcall_init(c, &req->tc, alloc_msize))
> > + if (p9_fcall_init(c, &req->tc, 0, alloc_msize))
> > goto free_req;
> > - if (p9_fcall_init(c, &req->rc, alloc_msize))
> > + if (p9_fcall_init(c, &req->rc, 1, alloc_msize))
>
> given the two rc/tc buffers are of same size I don't see the point of
> using two caches either, you could just double the min number of
> elements to the same effect?
You can't double allocate from the same mempool, that will deadlock if multiple
threads need the last element at the same time - I should've left a comment for
that.
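To spell out the worst case (just an illustration, not code from the series):

/*
 * One pool with N reserved elements, N tasks each needing two buffers
 * (tc and rc) per request:
 *
 *   task 0..N-1:  tc->sdata = mempool_alloc(pool);  // reserve drained
 *   task 0..N-1:  rc->sdata = mempool_alloc(pool);  // all N sleep,
 *                 // waiting for a free that only one of the sleeping
 *                 // tasks could do
 *
 * With two pools (one for tc, one for rc) no task ever holds two
 * elements from the same pool, so someone can always complete a request
 * and free its buffers.
 */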
Here's an updated patch - also up in git at
https://evilpiepirate.org/git/bcachefs.git/log/?h=9p_mempool
-- >8 --
Subject: [PATCH] 9p: Add mempools for RPCs
Signed-off-by: Kent Overstreet <[email protected]>
Cc: Eric Van Hensbergen <[email protected]>
Cc: Latchesar Ionkov <[email protected]>
Cc: Dominique Martinet <[email protected]>
---
include/net/9p/9p.h | 2 +-
include/net/9p/client.h | 12 ++++++-
net/9p/client.c | 70 ++++++++++++++++++++++++-----------------
net/9p/trans_rdma.c | 2 +-
4 files changed, 54 insertions(+), 32 deletions(-)
diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
index 24a509f559..0b20ee6854 100644
--- a/include/net/9p/9p.h
+++ b/include/net/9p/9p.h
@@ -539,12 +539,12 @@ struct p9_rstatfs {
struct p9_fcall {
u32 size;
u8 id;
+ bool used_mempool;
u16 tag;
size_t offset;
size_t capacity;
- struct kmem_cache *cache;
u8 *sdata;
};
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index cb78e0e333..832dcc866a 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -9,6 +9,7 @@
#ifndef NET_9P_CLIENT_H
#define NET_9P_CLIENT_H
+#include <linux/mempool.h>
#include <linux/utsname.h>
#include <linux/idr.h>
@@ -107,6 +108,14 @@ struct p9_client {
void *trans;
struct kmem_cache *fcall_cache;
+ /*
+ * We need two identical mempools because it's not safe to allocate
+ * multiple elements from the same pool (without freeing the first);
+ * that will deadlock if multiple threads need the last element at the
+ * same time.
+ */
+ mempool_t pools[2];
+
union {
struct {
int rfd;
@@ -222,7 +231,8 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
kgid_t gid, struct p9_qid *qid);
int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status);
int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl);
-void p9_fcall_fini(struct p9_fcall *fc);
+void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc,
+ int fc_idx);
struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag);
static inline void p9_req_get(struct p9_req_t *r)
diff --git a/net/9p/client.c b/net/9p/client.c
index a36a40137c..e14074d031 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -218,23 +218,29 @@ static int parse_opts(char *opts, struct p9_client *clnt)
return ret;
}
-static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
- int alloc_msize)
+static void p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
+ int fc_idx, unsigned alloc_msize)
{
- if (likely(c->fcall_cache) && alloc_msize == c->msize) {
- fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
- fc->cache = c->fcall_cache;
- } else {
- fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
- fc->cache = NULL;
- }
- if (!fc->sdata)
- return -ENOMEM;
+ gfp_t gfp = GFP_NOFS|__GFP_NOWARN;
+
+ BUG_ON(alloc_msize > c->msize);
+
+ fc->sdata = NULL;
+ fc->used_mempool = false;
fc->capacity = alloc_msize;
- return 0;
+
+ if (alloc_msize < c->msize)
+ fc->sdata = kmalloc(alloc_msize, gfp);
+
+ if (!fc->sdata) {
+ fc->sdata = mempool_alloc(&c->pools[fc_idx], gfp);
+ fc->used_mempool = true;
+ fc->capacity = c->msize;
+ }
}
-void p9_fcall_fini(struct p9_fcall *fc)
+void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc,
+ int fc_idx)
{
/* sdata can be NULL for interrupted requests in trans_rdma,
* and kmem_cache_free does not do NULL-check for us
@@ -242,8 +248,8 @@ void p9_fcall_fini(struct p9_fcall *fc)
if (unlikely(!fc->sdata))
return;
- if (fc->cache)
- kmem_cache_free(fc->cache, fc->sdata);
+ if (fc->used_mempool)
+ mempool_free(fc->sdata, &c->pools[fc_idx]);
else
kfree(fc->sdata);
}
@@ -270,10 +276,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
if (!req)
return ERR_PTR(-ENOMEM);
- if (p9_fcall_init(c, &req->tc, alloc_msize))
- goto free_req;
- if (p9_fcall_init(c, &req->rc, alloc_msize))
- goto free;
+ p9_fcall_init(c, &req->tc, 0, alloc_msize);
+ p9_fcall_init(c, &req->rc, 1, alloc_msize);
p9pdu_reset(&req->tc);
p9pdu_reset(&req->rc);
@@ -310,9 +314,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
return req;
free:
- p9_fcall_fini(&req->tc);
- p9_fcall_fini(&req->rc);
-free_req:
+ p9_fcall_fini(c, &req->tc, 0);
+ p9_fcall_fini(c, &req->rc, 1);
kmem_cache_free(p9_req_cache, req);
return ERR_PTR(-ENOMEM);
}
@@ -373,8 +376,8 @@ static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r)
int p9_req_put(struct p9_client *c, struct p9_req_t *r)
{
if (refcount_dec_and_test(&r->refcount)) {
- p9_fcall_fini(&r->tc);
- p9_fcall_fini(&r->rc);
+ p9_fcall_fini(c, &r->tc, 0);
+ p9_fcall_fini(c, &r->rc, 1);
kmem_cache_free(p9_req_cache, r);
return 1;
}
@@ -999,7 +1002,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
char *client_id;
err = 0;
- clnt = kmalloc(sizeof(*clnt), GFP_KERNEL);
+ clnt = kzalloc(sizeof(*clnt), GFP_KERNEL);
if (!clnt)
return ERR_PTR(-ENOMEM);
@@ -1050,10 +1053,6 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
goto close_trans;
}
- err = p9_client_version(clnt);
- if (err)
- goto close_trans;
-
/* P9_HDRSZ + 4 is the smallest packet header we can have that is
* followed by data accessed from userspace by read
*/
@@ -1063,6 +1062,15 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
clnt->msize - (P9_HDRSZ + 4),
NULL);
+ err = mempool_init_slab_pool(&clnt->pools[0], 4, clnt->fcall_cache) ?:
+ mempool_init_slab_pool(&clnt->pools[1], 4, clnt->fcall_cache);
+ if (err)
+ goto close_trans;
+
+ err = p9_client_version(clnt);
+ if (err)
+ goto close_trans;
+
return clnt;
close_trans:
@@ -1070,6 +1078,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
put_trans:
v9fs_put_trans(clnt->trans_mod);
free_client:
+ mempool_exit(&clnt->pools[1]);
+ mempool_exit(&clnt->pools[0]);
kfree(clnt);
return ERR_PTR(err);
}
@@ -1094,6 +1104,8 @@ void p9_client_destroy(struct p9_client *clnt)
p9_tag_cleanup(clnt);
+ mempool_exit(&clnt->pools[1]);
+ mempool_exit(&clnt->pools[0]);
kmem_cache_destroy(clnt->fcall_cache);
kfree(clnt);
}
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index d817d37452..99d878d70d 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -431,7 +431,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
/* Got one! */
- p9_fcall_fini(&req->rc);
+ p9_fcall_fini(client, &req->rc, 1);
req->rc.sdata = NULL;
goto dont_need_post_recv;
} else {
--
2.36.1
+Christian, sorry I just noticed you weren't in Ccs again --
the patches are currently there if you want a look:
https://evilpiepirate.org/git/bcachefs.git/log/?h=9p_mempool
I think it'll conflict a bit with your 8k non-read/write RPCs but I'll
take care of that when checking it this weekend.
Kent Overstreet wrote on Sun, Jul 03, 2022 at 11:05:57PM -0400:
> > We shouldn't have any user calling with more at this point (the
> > user-provided size comes from p9_client_prepare_req arguments and it's
> > either msize or header size constants); and it probably makes sense to
> > check and error out rather than cap it.
>
> If that's the case I think we should just switch the warning to a BUG_ON() - I
> just wasn't sure from reading the code if that was really guarded against.
yes, BUG_ON is good for me.
> > > - if (p9_fcall_init(c, &req->tc, alloc_msize))
> > > + if (p9_fcall_init(c, &req->tc, 0, alloc_msize))
> > > goto free_req;
> > > - if (p9_fcall_init(c, &req->rc, alloc_msize))
> > > + if (p9_fcall_init(c, &req->rc, 1, alloc_msize))
> >
> > given the two rc/tc buffers are of same size I don't see the point of
> > using two caches either, you could just double the min number of
> > elements to the same effect?
>
> You can't double allocate from the same mempool, that will deadlock if multiple
> threads need the last element at the same time - I should've left a comment for
> that.
hmm, looking at the code as long as min elements is big enough the
deadlock becomes increasingly difficult to hit -- but I guess there's no
guarantee we won't get 8 threads each getting their first item from the
pool and starving each other on the second... Fair enough, thank you for
the comment.
> @@ -270,10 +276,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
> if (!req)
> return ERR_PTR(-ENOMEM);
>
> - if (p9_fcall_init(c, &req->tc, alloc_msize))
> - goto free_req;
> - if (p9_fcall_init(c, &req->rc, alloc_msize))
> - goto free;
> + p9_fcall_init(c, &req->tc, 0, alloc_msize);
> + p9_fcall_init(c, &req->rc, 1, alloc_msize);
mempool allocation never fails, correct?
(don't think this needs a comment, just making sure here)
This all looks good to me, will queue it up in my -next branch after
running some tests next weekend and hopefully submit when 5.20 opens
with the code making smaller allocs more common.
--
Dominique
On Mon, Jul 04, 2022 at 12:38:46PM +0900, Dominique Martinet wrote:
> > @@ -270,10 +276,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
> > if (!req)
> > return ERR_PTR(-ENOMEM);
> >
> > - if (p9_fcall_init(c, &req->tc, alloc_msize))
> > - goto free_req;
> > - if (p9_fcall_init(c, &req->rc, alloc_msize))
> > - goto free;
> > + p9_fcall_init(c, &req->tc, 0, alloc_msize);
> > + p9_fcall_init(c, &req->rc, 1, alloc_msize);
>
>
> mempool allocation never fails, correct?
>
> (don't think this needs a comment, just making sure here)
As long as GFP_WAIT is included, yes
> This all looks good to me, will queue it up in my -next branch after
> running some tests next weekend and hopefully submit when 5.20 opens
> with the code making smaller allocs more common.
Sounds good!
On Montag, 4. Juli 2022 05:38:46 CEST Dominique Martinet wrote:
> +Christian, sorry I just noticed you weren't in Ccs again --
> the patches are currently there if you want a look:
> https://evilpiepirate.org/git/bcachefs.git/log/?h=9p_mempool
I wonder whether it would make sense to update 9p section in MAINTAINERS to
better reflect current reality, at least in a way such that contributors would
CC me right away?
Eric, Latchesar, what do you think?
> > @@ -270,10 +276,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type,
> > unsigned int max_size)>
> > if (!req)
> >
> > return ERR_PTR(-ENOMEM);
> >
> > - if (p9_fcall_init(c, &req->tc, alloc_msize))
> > - goto free_req;
> > - if (p9_fcall_init(c, &req->rc, alloc_msize))
> > - goto free;
> > + p9_fcall_init(c, &req->tc, 0, alloc_msize);
> > + p9_fcall_init(c, &req->rc, 1, alloc_msize);
>
> mempool allocation never fails, correct?
>
> (don't think this needs a comment, just making sure here)
>
> This all looks good to me, will queue it up in my -next branch after
> running some tests next weekend and hopefully submit when 5.20 opens
> with the code making smaller allocs more common.
Hoo, Dominique, please hold your horses. I currently can't keep up with
reviewing and testing all pending 9p patches right now.
Personally I would hold these patches back for now. They would make sense on
current situation on master, because ATM basically all 9p requests simply
allocate exactly 'msize' for any 9p request.
However that's exactly what I was going to address with my already posted
patches (relevant patches regarding this issue here being 9..12):
https://lore.kernel.org/all/[email protected]/
And in the cover letter (section "STILL TODO" ... "3.") I was suggesting to
subsequently subdivide kmem_cache_alloc() into e.g. 4 allocation size
categories? Because that's what my already posted patches do anyway.
How about I address the already discussed issues and post a v5 of those
patches this week and then we can continue from there?
Best regards,
Christian Schoenebeck
Christian Schoenebeck wrote on Mon, Jul 04, 2022 at 01:12:51PM +0200:
> On Montag, 4. Juli 2022 05:38:46 CEST Dominique Martinet wrote:
> > +Christian, sorry I just noticed you weren't in Ccs again --
> > the patches are currently there if you want a look:
> > https://evilpiepirate.org/git/bcachefs.git/log/?h=9p_mempool
>
> I wonder whether it would make sense to update 9p section in MAINTAINERS to
> better reflect current reality, at least in a way such that contributors would
> CC me right away?
I almost suggested that, but Al Viro didn't cc Eric/Latchesar on the
previous series so I'm not sure how much people rely on MAINTAINERS and
how much of it is muscle memory...
Well, can't hurt to try at least -- giving Eric and Latchesar a chance
to reply.
Replying out of order
> However that's exactly what I was going to address with my already posted
> patches (relevant patches regarding this issue here being 9..12):
> https://lore.kernel.org/all/[email protected]/
> And in the cover letter (section "STILL TODO" ... "3.") I was suggesting to
> subsequently subdivide kmem_cache_alloc() into e.g. 4 allocation size
> categories? Because that's what my already posted patches do anyway.
Yes, I hinted at that by asking if it'd be worth a second mempool for 8k
buffers, but I'm not sure it is -- I think kmalloc will just be as fast
for these in practice? That would need checking.
But I also took a fresh look just now and didn't remember we had so many
different cases there, and that msize is no longer really used -- now
this is just a gut feeling, but I think we'd benefit from rounding up to
some pooled sizes e.g. I assume it'll be faster to allocate 1MB from the
cache three times than try to get 500k/600k/1MB from kmalloc.
That's a lot of assuming though and this is going to need checking...
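As a sketch of that idea (the helper and the size classes are made up,
nothing like this is in the posted patches):

/* round a buffer request up to a small set of pooled sizes so most
 * allocations hit a dedicated cache instead of plain kmalloc */
static unsigned int p9_pooled_size(unsigned int size, unsigned int msize)
{
	static const unsigned int classes[] = { 4096, 16384, 65536 };
	int i;

	for (i = 0; i < ARRAY_SIZE(classes); i++)
		if (size <= classes[i] && classes[i] < msize)
			return classes[i];
	return msize;	/* anything bigger gets the full-size buffer */
}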
> Hoo, Dominique, please hold your horses. I currently can't keep up with
> reviewing and testing all pending 9p patches right now.
>
> Personally I would hold these patches back for now. They would make sense on
> current situation on master, because ATM basically all 9p requests simply
> allocate exactly 'msize' for any 9p request.
So I think they're orthogonal really:
what mempool does is that it'll reserve the minimum amount of memory
required for x allocations (whatever min is set during init, so here 4
parallel RPCs) -- if normal allocation goes through it'll go through
normal slab allocation first, and if that fails we'll get a buffer from
the pool instead, and if there is none left it'll wait for a previous
request to be freed up possibly throttling the number of parallel
requests down but never failing like we currently do.
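Roughly, as a sketch of what the mempool API guarantees (not the 9p code
itself):

/* four full-size buffers are reserved up front */
mempool_init_slab_pool(&pool, 4, fcall_cache);

buf = mempool_alloc(&pool, GFP_NOFS);
/* 1) tries kmem_cache_alloc() on fcall_cache first
 * 2) on failure, hands out one of the reserved buffers
 * 3) if the reserve is empty, sleeps until mempool_free() returns one,
 *    so it throttles callers instead of failing */

mempool_free(buf, &pool);
/* refills the reserve first, only then frees back to the slab */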
What will happen with your patches is we'll get fewer large buffers
allocated so we can reduce the reserved amount of buffers, but there
will still be some, so Kent is still going to be just as likely to see
high order allocation failures. The memory pressure and difficulty to
allocate high order pages don't necessarily come from other 9p
requests, so just having fewer msize-sized buffers might help a bit but I
don't think it'll be enough (I remember some similar failures with
lustre and 256k allocs, and it's just buffer cache and what
not... because we're doing these allocs with NOFS these can't be
reclaimed)
With this the worst that can happen is some RPCs will be delayed, and
the patch already falls back to allocating a msize buffer from the pool even
if less is requested if that kmalloc failed, so I think it should work
out ok as a first iteration.
(I appreciate the need for testing, but this feels much less risky than
the iovec series we've had recently... Famous last words?)
For later iterations we might want to optimize with multiple sizes of
pools and pick the closest majoring size or something, but I think
that'll be tricky to get right so I'd rather not rush such an
optimization.
> How about I address the already discussed issues and post a v5 of those
> patches this week and then we can continue from there?
I would have been happy to rebase your patches 9..12 on top of Kent's
this weekend but if you want to refresh them this week we can continue
from there, sure.
--
Dominique
On Mon, Jul 04, 2022 at 01:12:51PM +0200, Christian Schoenebeck wrote:
> On Montag, 4. Juli 2022 05:38:46 CEST Dominique Martinet wrote:
> > +Christian, sorry I just noticed you weren't in Ccs again --
> > the patches are currently there if you want a look:
> > https://evilpiepirate.org/git/bcachefs.git/log/?h=9p_mempool
>
> I wonder whether it would make sense to update 9p section in MAINTAINERS to
> better reflect current reality, at least in a way such that contributors would
> CC me right away?
>
> Eric, Latchesar, what do you think?
>
> > > @@ -270,10 +276,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type,
> > > unsigned int max_size)>
> > > if (!req)
> > >
> > > return ERR_PTR(-ENOMEM);
> > >
> > > - if (p9_fcall_init(c, &req->tc, alloc_msize))
> > > - goto free_req;
> > > - if (p9_fcall_init(c, &req->rc, alloc_msize))
> > > - goto free;
> > > + p9_fcall_init(c, &req->tc, 0, alloc_msize);
> > > + p9_fcall_init(c, &req->rc, 1, alloc_msize);
> >
> > mempool allocation never fails, correct?
> >
> > (don't think this needs a comment, just making sure here)
> >
> > This all looks good to me, will queue it up in my -next branch after
> > running some tests next weekend and hopefully submit when 5.20 opens
> > with the code making smaller allocs more common.
>
> Hoo, Dominique, please hold your horses. I currently can't keep up with
> reviewing and testing all pending 9p patches right now.
>
> Personally I would hold these patches back for now. They would make sense on
> current situation on master, because ATM basically all 9p requests simply
> allocate exactly 'msize' for any 9p request.
Err, why?
These patches are pretty simple, and they fix a bug that's affecting users right
now (and has been for ages)
> However that's exactly what I was going to address with my already posted
> patches (relevant patches regarding this issue here being 9..12):
> https://lore.kernel.org/all/[email protected]/
> And in the cover letter (section "STILL TODO" ... "3.") I was suggesting to
> subsequently subdivide kmem_cache_alloc() into e.g. 4 allocation size
> categories? Because that's what my already posted patches do anyway.
Yeah that sounds like you're just reimplementing kmalloc.
On Montag, 4. Juli 2022 15:06:00 CEST Dominique Martinet wrote:
> Christian Schoenebeck wrote on Mon, Jul 04, 2022 at 01:12:51PM +0200:
> > On Montag, 4. Juli 2022 05:38:46 CEST Dominique Martinet wrote:
[...]
> > However that's exactly what I was going to address with my already posted
> > patches (relevant patches regarding this issue here being 9..12):
> > https://lore.kernel.org/all/[email protected]/
> > And in the cover letter (section "STILL TODO" ... "3.") I was suggesting
> > to
> > subsequently subdivide kmem_cache_alloc() into e.g. 4 allocation size
> > categories? Because that's what my already posted patches do anyway.
>
> Yes, I hinted at that by asking if it'd be worth a second mempool for 8k
> buffers, but I'm not sure it is -- I think kmalloc will just be as fast
> for these in practice? That would need checking.
>
> But I also took a fresh look just now and didn't remember we had so many
> different cases there, and that msize is no longer really used -- now
> this is just a gut feeling, but I think we'd benefit from rounding up to
> some pooled sizes e.g. I assume it'll be faster to allocate 1MB from the
> cache three times than try to get 500k/600k/1MB from kmalloc.
>
> That's a lot of assuming though and this is going to need checking...
Yeah, that's the reason why I omitted this aspect so far, because I also thought
it deserves actual benchmarking of how much cache granularity really makes sense,
instead of blindly subdividing them into random separate cache size
categories.
> > Hoo, Dominique, please hold your horses. I currently can't keep up with
> > reviewing and testing all pending 9p patches right now.
> >
> > Personally I would hold these patches back for now. They would make sense
> > on current situation on master, because ATM basically all 9p requests
> > simply allocate exactly 'msize' for any 9p request.
>
> So I think they're orthogonal really:
> what mempool does is that it'll reserve the minimum amount of memory
> required for x allocations (whatever min is set during init, so here 4
> parallel RPCs) -- if normal allocation goes through it'll go through
> normal slab allocation first, and if that fails we'll get a buffer from
> the pool instead, and if there is none left it'll wait for a previous
> request to be freed up possibly throttling the number of parallel
> requests down but never failing like we currently do.
Understood.
> With this the worst that can happen is some RPCs will be delayed, and
> the patch already falls back to allocating a msize buffer from the pool even
> if less is requested if that kmalloc failed, so I think it should work
> out ok as a first iteration.
>
> (I appreciate the need for testing, but this feels much less risky than
> the iovec series we've had recently... Famous last words?)
Got it, consider my famous last words dropped. ;-)
> For later iterations we might want to optimize with multiple sizes of
> pools and pick the closest majoring size or something, but I think
> that'll be tricky to get right so I'd rather not rush such an
> optimization.
>
> > How about I address the already discussed issues and post a v5 of those
> > patches this week and then we can continue from there?
>
> I would have been happy to rebase your patches 9..12 on top of Kent's
> this weekend but if you want to refresh them this week we can continue
> from there, sure.
I'll rebase them on master and address what we discussed so far. Then we'll
see.
Best regards,
Christian Schoenebeck
On Montag, 4. Juli 2022 15:06:31 CEST Kent Overstreet wrote:
> On Mon, Jul 04, 2022 at 01:12:51PM +0200, Christian Schoenebeck wrote:
> > On Montag, 4. Juli 2022 05:38:46 CEST Dominique Martinet wrote:
> > > +Christian, sorry I just noticed you weren't in Ccs again --
> > > the patches are currently there if you want a look:
> > > https://evilpiepirate.org/git/bcachefs.git/log/?h=9p_mempool
> >
> > I wonder whether it would make sense to update 9p section in MAINTAINERS
> > to
> > better reflect current reality, at least in a way such that contributors
> > would CC me right away?
> >
> > Eric, Latchesar, what do you think?
> >
> > > > @@ -270,10 +276,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type,
> > > > unsigned int max_size)>
> > > >
> > > > if (!req)
> > > >
> > > > return ERR_PTR(-ENOMEM);
> > > >
> > > > - if (p9_fcall_init(c, &req->tc, alloc_msize))
> > > > - goto free_req;
> > > > - if (p9_fcall_init(c, &req->rc, alloc_msize))
> > > > - goto free;
> > > > + p9_fcall_init(c, &req->tc, 0, alloc_msize);
> > > > + p9_fcall_init(c, &req->rc, 1, alloc_msize);
> > >
> > > mempool allocation never fails, correct?
> > >
> > > (don't think this needs a comment, just making sure here)
> > >
> > > This all looks good to me, will queue it up in my -next branch after
> > > running some tests next weekend and hopefully submit when 5.20 opens
> > > with the code making smaller allocs more common.
> >
> > Hoo, Dominique, please hold your horses. I currently can't keep up with
> > reviewing and testing all pending 9p patches right now.
> >
> > Personally I would hold these patches back for now. They would make sense
> > on current situation on master, because ATM basically all 9p requests
> > simply allocate exactly 'msize' for any 9p request.
>
> Err, why?
>
> These patches are pretty simple, and they fix a bug that's affecting users
> right now (and has been for ages)
So simple that it already had one obvious bug (at least). But as it seems that
Dominique already supports your patch, I refrain from enumerating more
reasons.
> > However that's exactly what I was going to address with my already posted
> > patches (relevant patches regarding this issue here being 9..12):
> > https://lore.kernel.org/all/[email protected]/
> > And in the cover letter (section "STILL TODO" ... "3.") I was suggesting
> > to
> > subsequently subdivide kmem_cache_alloc() into e.g. 4 allocation size
> > categories? Because that's what my already posted patches do anyway.
>
> Yeah that sounds like you're just reimplementing kmalloc.
Quite exaggerated statement.
Best regards,
Christian Schoenebeck
On Mon, Jul 04, 2022 at 03:39:32PM +0200, Christian Schoenebeck wrote:
> So simple that it already had one obvious bug (at least). But as it seems that
> Dominique already supports your patch, I refrain from enumerating more
> reasons.
So snippy.
>
> > > However that's exactly what I was going to address with my already posted
> > > patches (relevant patches regarding this issue here being 9..12):
> > > https://lore.kernel.org/all/[email protected]/
> > > And in the cover letter (section "STILL TODO" ... "3.") I was suggesting
> > > to
> > > subsequently subdivide kmem_cache_alloc() into e.g. 4 allocation size
> > > categories? Because that's what my already posted patches do anyway.
> >
> > Yeah that sounds like you're just reimplementing kmalloc.
>
> Quite exaggerated statement.
I'm just pointing out that kmalloc() is just a frontend around
kmem_cache_alloc() that picks the cache based on the size parameter... so...
still sounds like you are?
Not that there's never a legitimate reason to do so, but it does raise an
eyebrow.
On Montag, 4. Juli 2022 16:19:46 CEST Kent Overstreet wrote:
> On Mon, Jul 04, 2022 at 03:39:32PM +0200, Christian Schoenebeck wrote:
> > So simple that it already had one obvious bug (at least). But as it seems
> > that Dominique already supports your patch, I refrain from enumerating
> > more reasons.
>
> So snippy.
Yeah, the tone makes the music. If you adjust yours, then I will, too.
> > > > However that's exactly what I was going to address with my already
> > > > posted
> > > > patches (relevant patches regarding this issue here being 9..12):
> > > > https://lore.kernel.org/all/[email protected]
> > > > om/
> > > > And in the cover letter (section "STILL TODO" ... "3.") I was
> > > > suggesting
> > > > to
> > > > subsequently subdivide kmem_cache_alloc() into e.g. 4 allocation size
> > > > categories? Because that's what my already posted patches do anyway.
> > >
> > > Yeah that sounds like you're just reimplementing kmalloc.
> >
> > Quite exaggerated statement.
>
> I'm just pointing out that kmalloc() is just a frontend around
> kmem_cache_alloc() that picks the cache based on the size parameter... so...
> still sounds like you are?
>
> Not that there's never a legitimate reason to do so, but it does raise an
> eyebrow.
So you are saying this change was useless as well then?
https://github.com/torvalds/linux/commit/91a76be37ff89795526c452a6799576b03bec501
As already discussed in the other email, I omitted those cache size
granularity changes for good reasons, until proven by benchmark that they
would actually help.
Best regards,
Christian Schoenebeck
I've taken the mempool patches to 9p-next
Christian Schoenebeck wrote on Mon, Jul 04, 2022 at 03:56:55PM +0200:
>> (I appreciate the need for testing, but this feels much less risky than
>> the iovec series we've had recently... Famous last words?)
>
> Got it, consider my famous last words dropped. ;-)
Ok, so I think you won this one...
Well -- when testing normally it obviously works well, performance wise
is roughly the same (obviously since it tries to allocate from slab
first and in normal case that will work)
When I tried gaming it with very low memory though I thought it worked
well, but I managed to get a bunch of processes stuck in mempool_alloc
with no obvious tid waiting for a reply.
I had the bright idea of using fio with io_uring and interestingly the
uring worker doesn't show up in ps or /proc/<pid>, but with qemu's gdb
and lx-ps I could find a bunch of iou-wrk-<pid> that are all with
similar stacks
1 │ [<0>] mempool_alloc+0x136/0x180
2 │ [<0>] p9_fcall_init+0x63/0x80 [9pnet]
3 │ [<0>] p9_client_prepare_req+0xa9/0x290 [9pnet]
4 │ [<0>] p9_client_rpc+0x64/0x610 [9pnet]
5 │ [<0>] p9_client_write+0xcb/0x210 [9pnet]
6 │ [<0>] v9fs_file_write_iter+0x4d/0xc0 [9p]
7 │ [<0>] io_write+0x129/0x2c0
8 │ [<0>] io_issue_sqe+0xa1/0x25b0
9 │ [<0>] io_wq_submit_work+0x90/0x190
10 │ [<0>] io_worker_handle_work+0x211/0x550
11 │ [<0>] io_wqe_worker+0x2c5/0x340
12 │ [<0>] ret_from_fork+0x1f/0x30
or, and that's the interesting part
1 │ [<0>] mempool_alloc+0x136/0x180
2 │ [<0>] p9_fcall_init+0x63/0x80 [9pnet]
3 │ [<0>] p9_client_prepare_req+0xa9/0x290 [9pnet]
4 │ [<0>] p9_client_rpc+0x64/0x610 [9pnet]
5 │ [<0>] p9_client_flush+0x81/0xc0 [9pnet]
6 │ [<0>] p9_client_rpc+0x591/0x610 [9pnet]
7 │ [<0>] p9_client_write+0xcb/0x210 [9pnet]
8 │ [<0>] v9fs_file_write_iter+0x4d/0xc0 [9p]
9 │ [<0>] io_write+0x129/0x2c0
10 │ [<0>] io_issue_sqe+0xa1/0x25b0
11 │ [<0>] io_wq_submit_work+0x90/0x190
12 │ [<0>] io_worker_handle_work+0x211/0x550
13 │ [<0>] io_wqe_worker+0x2c5/0x340
14 │ [<0>] ret_from_fork+0x1f/0x30
The problem is these flushes: the same task is holding a buffer for the
original rpc and tries to get a new one, but waits for someone to free
one and... obviously there isn't anyone (I counted 11 flushes pending, so more
than the minimum number of buffers we'd expect from the mempool, and I
don't think we missed any free)
Now I'm not sure what's best here.
The best thing to do would probably be to just tell the client it can't use
the mempools for flushes -- flushes are rare and will use small
buffers with your smaller allocations patch; I bet I wouldn't be able to
reproduce that anymore, but it should probably still forbid the mempool
just in case.
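Something along these lines is what I mean -- just a sketch, the extra
argument is hypothetical and not in what I queued:

static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
			 int fc_idx, unsigned int alloc_msize,
			 bool can_use_mempool)
{
	fc->used_mempool = false;
	fc->capacity = alloc_msize;
	fc->sdata = kmalloc(alloc_msize, GFP_NOFS | __GFP_NOWARN);

	/* Tflush would pass can_use_mempool = false, so a task that
	 * already holds a pool buffer for the original RPC never blocks
	 * here waiting on the same pools. */
	if (!fc->sdata && can_use_mempool) {
		fc->sdata = mempool_alloc(&c->pools[fc_idx], GFP_NOFS);
		fc->used_mempool = true;
		fc->capacity = c->msize;
	}
	return fc->sdata ? 0 : -ENOMEM;
}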
Anyway, I'm not comfortable with this patch right now, a hang is worse
than an allocation failure warning.
> > > How about I address the already discussed issues and post a v5 of those
> > > patches this week and then we can continue from there?
> >
> > I would have been happy to rebase your patches 9..12 on top of Kent's
> > this weekend but if you want to refresh them this week we can continue
> > from there, sure.
>
> I'll rebase them on master and address what we discussed so far. Then we'll
> see.
FWIW, and regarding the other thread about virtio queue sizes, I was only
considering the later patches with small RPCs for this merge window.
Shall we try to focus on that first, and then revisit the virtio and
mempool patches once that's done?
--
Dominique
On Samstag, 9. Juli 2022 09:43:47 CEST Dominique Martinet wrote:
> I've taken the mempool patches to 9p-next
>
> Christian Schoenebeck wrote on Mon, Jul 04, 2022 at 03:56:55PM +0200:
> >> (I appreciate the need for testing, but this feels much less risky than
> >> the iovec series we've had recently... Famous last words?)
> >
> > Got it, consider my famous last words dropped. ;-)
>
> Ok, so I think you won this one...
>
> Well -- when testing normally it obviously works well, performance wise
> is roughly the same (obviously since it tries to allocate from slab
> first and in normal case that will work)
>
> When I tried gaming it with very low memory though I thought it worked
> well, but I managed to get a bunch of processes stuck in mempool_alloc
> with no obvious tid waiting for a reply.
> I had the bright idea of using fio with io_uring and interestingly the
> uring worker doesn't show up in ps or /proc/<pid>, but with qemu's gdb
> and lx-ps I could find a bunch of iou-wrk-<pid> that are all with
> similar stacks
> 1 │ [<0>] mempool_alloc+0x136/0x180
> 2 │ [<0>] p9_fcall_init+0x63/0x80 [9pnet]
> 3 │ [<0>] p9_client_prepare_req+0xa9/0x290 [9pnet]
> 4 │ [<0>] p9_client_rpc+0x64/0x610 [9pnet]
> 5 │ [<0>] p9_client_write+0xcb/0x210 [9pnet]
> 6 │ [<0>] v9fs_file_write_iter+0x4d/0xc0 [9p]
> 7 │ [<0>] io_write+0x129/0x2c0
> 8 │ [<0>] io_issue_sqe+0xa1/0x25b0
> 9 │ [<0>] io_wq_submit_work+0x90/0x190
> 10 │ [<0>] io_worker_handle_work+0x211/0x550
> 11 │ [<0>] io_wqe_worker+0x2c5/0x340
> 12 │ [<0>] ret_from_fork+0x1f/0x30
>
> or, and that's the interesting part
> 1 │ [<0>] mempool_alloc+0x136/0x180
> 2 │ [<0>] p9_fcall_init+0x63/0x80 [9pnet]
> 3 │ [<0>] p9_client_prepare_req+0xa9/0x290 [9pnet]
> 4 │ [<0>] p9_client_rpc+0x64/0x610 [9pnet]
> 5 │ [<0>] p9_client_flush+0x81/0xc0 [9pnet]
> 6 │ [<0>] p9_client_rpc+0x591/0x610 [9pnet]
> 7 │ [<0>] p9_client_write+0xcb/0x210 [9pnet]
> 8 │ [<0>] v9fs_file_write_iter+0x4d/0xc0 [9p]
> 9 │ [<0>] io_write+0x129/0x2c0
> 10 │ [<0>] io_issue_sqe+0xa1/0x25b0
> 11 │ [<0>] io_wq_submit_work+0x90/0x190
> 12 │ [<0>] io_worker_handle_work+0x211/0x550
> 13 │ [<0>] io_wqe_worker+0x2c5/0x340
> 14 │ [<0>] ret_from_fork+0x1f/0x30
>
> The problem is these flushes : the same task is holding a buffer for the
> original rpc and tries to get a new one, but waits for someone to free
> and.. obviously there isn't anyone (I cound 11 flushes pending, so more
> than the minimum number of buffers we'd expect from the mempool, and I
> don't think we missed any free)
>
> Now I'm not sure what's best here.
> The best thing to do would probably to just tell the client it can't use
> the mempools for flushes -- the flushes are rare and will use small
> buffers with your smaller allocations patch; I bet I wouldn't be able to
> reproduce that anymore but it should probably just forbid the mempool
> just in case.
So the problem is that one task ends up with more than one request at a time,
and the buffer is allocated and associated per request, not per task. If I am
not missing something, then this scenario (>1 request simultaneously per task)
currently may actually only happen with p9_client_flush() calls, which
simplifies the problem.
So probably the best way would be to simply flip the call order such that
p9_tag_remove() is called before p9_client_flush(), similar to how it's
already done with p9_client_clunk() calls?
> Anyway, I'm not comfortable with this patch right now, a hang is worse
> than an allocation failure warning.
As you already mentioned, with the pending 'net/9p: allocate appropriate
reduced message buffers' patch those hangs should not happen, as Tflush would
then just kmalloc() a small buffer. But I would probably still fix this issue
here nevertheless, as it might hurt in other ways in the future. Shouldn't be
too much noise to swap the call order, right?
> > > > How about I address the already discussed issues and post a v5 of
> > > > those
> > > > patches this week and then we can continue from there?
> > >
> > > I would have been happy to rebase your patches 9..12 on top of Kent's
> > > this weekend but if you want to refresh them this week we can continue
> > > from there, sure.
> >
> > I'll rebase them on master and address what we discussed so far. Then
> > we'll
> > see.
>
> FWIW and regarding the other thread with virito queue sizes, I was only
> considering the later patches with small RPCs for this merge window.
I would also recommend to leave out the virtio patches, yes.
> Shall we try to focus on that first, and then revisit the virtio and
> mempool patches once that's done?
Your call. I think both ways are viable.
Best regards,
Christian Schoenebeck
Christian Schoenebeck wrote on Sat, Jul 09, 2022 at 04:21:46PM +0200:
> > The best thing to do would probably to just tell the client it can't use
> > the mempools for flushes -- the flushes are rare and will use small
> > buffers with your smaller allocations patch; I bet I wouldn't be able to
> > reproduce that anymore but it should probably just forbid the mempool
> > just in case.
>
> So the problem is that one task ends up with more than 1 request at a time,
> and the buffer is allocated and associated per request, not per task. If I am
> not missing something, then this scenario (>1 request simultaniously per task)
> currently may actually only happen with p9_client_flush() calls. Which
> simplifies the problem.
Yes that should be the only case where this happens.
> So probably the best way would be to simply flip the call order such that
> p9_tag_remove() is called before p9_client_flush(), similar to how it's
> already done with p9_client_clunk() calls?
I don't think we can do that safely without some extra work - because
until we get the reply from the flush, the legitimate reply to the
original request can still come. It's perfectly possible that by the
time we send the flush the server will have sent the normal reply to our
original request -- with the flush stuck there it's actually almost
certain it has...
For trans_fd, for example, the reads happen in a worker thread; if that
buffer disappears too early it'll fail with EIO and the whole mount will
break down, as I think that'll just kill the read worker...
(actually how does that even work, it checks for rreq->status !=
REQ_STATUS_SENT but it should be FLSHD at this point .... erm)
In theory we can probably adjust the cancel() callback to make sure that
we never use the recv/send buffers from there on but it might be tricky,
still for tcp that's this 'm->rc.sdata' that will be read into and can
point to a recv buffer if we're mid-reading (e.g. got header for that
reply but it wasn't done in a single read() call and waiting for more
data); and that operates without the client lock, so we can't just take
it away easily...
Well, that'll need careful consideration... I think it'll be much
simpler to say flush calls are allocated differently and don't use the
mempool, even if it's another bit of legacy where it'll be hard to
understand why we did that down the road...
> > Anyway, I'm not comfortable with this patch right now, a hang is worse
> > than an allocation failure warning.
>
> As you already mentioned, with the pending 'net/9p: allocate appropriate
> reduced message buffers' patch those hangs should not happen, as Tflush would
> then just kmalloc() a small buffer. But I would probably still fix this issue
> here nevertheless, as it might hurt in other ways in future. Shouldn't be too
> much noise to swap the call order, right?
I definitely want to fix this even with your patches, yes.
--
Dominique
On Samstag, 9. Juli 2022 16:42:53 CEST Dominique Martinet wrote:
> Christian Schoenebeck wrote on Sat, Jul 09, 2022 at 04:21:46PM +0200:
> > > The best thing to do would probably to just tell the client it can't use
> > > the mempools for flushes -- the flushes are rare and will use small
> > > buffers with your smaller allocations patch; I bet I wouldn't be able to
> > > reproduce that anymore but it should probably just forbid the mempool
> > > just in case.
> >
> > So the problem is that one task ends up with more than 1 request at a
> > time,
> > and the buffer is allocated and associated per request, not per task. If I
> > am not missing something, then this scenario (>1 request simultaniously
> > per task) currently may actually only happen with p9_client_flush()
> > calls. Which simplifies the problem.
>
> Yes that should be the only case where this happens.
>
> > So probably the best way would be to simply flip the call order such that
> > p9_tag_remove() is called before p9_client_flush(), similar to how it's
> > already done with p9_client_clunk() calls?
>
> I don't think we can do that safely without some extra work - because
> until we get the reply from the flush, the legitimate reply to the
> original request can still come. It's perfectly possible that by the
> time we sent the flush the server will have sent the normal reply to our
> original request -- actually with flush stuck there it's actually almost
> certain it has...
Mmm, I "think" that wouldn't be something new. There is no guarantee that
client would not get a late response delivery by server of a request that
client has already thrown away.
What happens on server side is: requests come in sequentially, and are started
to be processed exactly in that order. But then they are actually running in
parallel on worker threads, dispatched back and forth between threads several
times. And Tflush itself is really just another request. So there is no
guarantee that the response order corresponds to the order of requests
originally sent by client, and if client sent a Tflush, it might still get a
response to its causal, abolished "normal" request.
Best regards,
Christian Schoenebeck
Christian Schoenebeck wrote on Sat, Jul 09, 2022 at 08:08:41PM +0200:
> Mmm, I "think" that wouldn't be something new. There is no guarantee that
> client would not get a late response delivery by server of a request that
> client has already thrown away.
No. Well, it shouldn't -- responding to tflush should guarantee that the
associated request is thrown away by the server
https://9fans.github.io/plan9port/man/man9/flush.html
Order is not explicit, but I read this:
> If it recognizes oldtag as the tag of a pending transaction, it should
> abort any pending response and discard that tag.
late replies to the oldtag are no longer allowed once rflush has been
sent.
But I guess that also depends on the transport being sequential -- that
is the case for TCP but is it true for virtio as well? e.g. if a server
replies something and immediately replies rflush are we guaranteed
rflush is received second by the client?
There's also this bit:
> When the client sends a Tflush, it must wait to receive the
> corresponding Rflush before reusing oldtag for subsequent messages
if we free the request at this point we'd reuse the tag immediately,
which would definitely lead to trouble.
> What happens on server side is: requests come in sequentially, and are started
> to be processed exactly in that order. But then they are actually running in
> parallel on worker threads, dispatched back and forth between threads several
> times. And Tflush itself is really just another request. So there is no
> guarantee that the response order corresponds to the order of requests
> originally sent by client, and if client sent a Tflush, it might still get a
> response to its causal, abolished "normal" request.
Yes, and processing flush ought to take a lock or something and look for
oldtag.
Looking at the qemu code it does it right: processing flush finds the old
request and marks it as cancelled, then it waits for the request to
finish (and possibly get discarded), at which point (pdu_complete) it'll
wake the flush up; so spurious replies for a tag after flush should not
be possible.
--
Dominique
Christian Schoenebeck wrote on Sun, Jul 10, 2022 at 02:57:58PM +0200:
> On Samstag, 9. Juli 2022 22:50:30 CEST Dominique Martinet wrote:
> > Christian Schoenebeck wrote on Sat, Jul 09, 2022 at 08:08:41PM +0200:
> > > Mmm, I "think" that wouldn't be something new. There is no guarantee that
> > > client would not get a late response delivery by server of a request that
> > > client has already thrown away.
> >
> > No. Well, it shouldn't -- responding to tflush should guarantee that the
> > associated request is thrown away by the server
> >
> > https://9fans.github.io/plan9port/man/man9/flush.html
>
> Yes, but that's another aspect of Tflush, its main purpose actually: client
> tells server that it no longer cares of previously sent request with oldtag=X.
> That prevents the server routines from hanging for good on things that client
> no longer cares for anyway, which otherwise evntually might lead to a complete
> server lockup on certain setups.
I think we're saying the same thing :)
> > Order is not explicit, but I read this:
> > > If it recognizes oldtag as the tag of a pending transaction, it should
> > > abort any pending response and discard that tag.
> >
> > late replies to the oldtag are no longer allowed once rflush has been
> > sent.
>
> That's not quite correct, it also explicitly says this:
>
> "The server may respond to the pending request before responding to the
> Tflush."
>
> And independent of what the 9p2000 spec says, consider this:
>
> 1. client sends a huge Twrite request
> 2. server starts to perform that write but it takes very long
> 3.A impatient client sends a Tflush to abort it
> 3.B server finally responds to Twrite with a normal Rwrite
>
> These last two actions 3.A and 3.B may happen concurrently within the same
> transport time frame, or "at the same time" if you will. There is no way to
> prevent that from happening.
Yes, and that is precisely why we cannot free the buffers from the
Twrite until we get the Rflush.
Until the Rflush comes, an Rwrite can still come at any time, so we cannot
just free these resources.
In theory it'd be possible to free the buffers for some protocols and
throw the data out with the bathwater, but the man page says that in this
case we should ignore the flush and behave as if the request completed
properly because of side-effects, e.g. even if you try to interrupt an
unlink() call, if the server says it removed it, well, it's removed, so we
should tell userspace that.
Conversely, once the Rflush has come, we can be sure that the reply to the
original request will never arrive and we can free stuff / reuse the tag.
> > But I guess that also depends on the transport being sequential -- that
> > is the case for TCP but is it true for virtio as well? e.g. if a server
> > replies something and immediately replies rflush are we guaranteed
> > rflush is received second by the client?
>
> That's more a higher level 9p server controller portion issue, not a low level
> transport one:
>
> In the scenario described above, QEMU server would always send Rflush response
> second, yes. So client would receive:
>
> 1. Rwrite or R(l)error
> 2. Rflush
>
> If the same assumption could be made for any 9p server implementation though,
> I could not say.
I'm arguing that it's a mandatory requirement -- the 9p client wouldn't
work if servers don't respect this.
> As for transport: virtio itself is really just two FIFO ringbuffers (one
> ringbuffer client -> server, one ringbuffer server -> client). Once either
> side placed their request/response message there, it is there, standing in the
> queue line and waiting for being pulled by the other side, no way back. Both
> sides pull out messages from their FIFO one by one, no look ahead. And a
> significant large time may pass for either side to pull the respective next
> message. Order of messages received on one side, always corresponds to order
> of messages being sent by other side, but that only applies to one ringbuffer
> (direction). The two ringbuffers (message directions) are completely
> independent from each other though, so no assumption can be made between them.
Sounds good though, order within a direction is guaranteed -- that's what I
wanted to read :)
We're using RC (reliable connected) mode for RDMA, so I think it's the
same -- as I remember it behaves like TCP -- which only leaves xen, but I'll
pretend it works there as well and not worry about it...
> > > When the client sends a Tflush, it must wait to receive the
> > > corresponding Rflush before reusing oldtag for subsequent messages
> >
> > if we free the request at this point we'd reuse the tag immediately,
> > which definitely lead to troubles.
>
> Yes, that's the point I never understood why this is done by Linux client. I
> find it problematic to recycle IDs in a distributed system within a short time
> window. Additionally it also makes 9p protocol debugging more difficult, as
> you often look at tag numbers in logs and think, "does this reference the
> previous request, or is it about a new one now?"
I can definitely agree with that.
We need to keep track of used tags, but we don't need to pick the lowest
tag available -- maybe the IDR code that allocates tags can be configured
to endlessly increment and loop around, only avoiding duplicates?
Ah, here it is, from Documentation/core-api/idr.rst:
If you need to allocate IDs sequentially, you can use
idr_alloc_cyclic(). The IDR becomes less efficient when dealing
with larger IDs, so using this function comes at a slight cost.
That would be another "easy change", if you'd like to check that cost at
some point...
(until we notice that some server has a static array for tags and stop
working once you use a tag > 64 or something...)
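For reference, a hedged sketch of what cyclic tag allocation could look like;
the idr, lock and function below are illustrative stand-ins, not the client's
actual fields:

#include <linux/idr.h>
#include <linux/spinlock.h>

static DEFINE_IDR(tag_idr);		/* stand-in for the client's request idr */
static DEFINE_SPINLOCK(tag_lock);

/* Hand out tags cyclically so a freed tag is not reused right away;
 * range is [0, 65535), i.e. everything below P9_NOTAG. */
static int alloc_tag_cyclic(void *req)
{
	int tag;

	spin_lock_irq(&tag_lock);
	tag = idr_alloc_cyclic(&tag_idr, req, 0, 65535, GFP_NOWAIT);
	spin_unlock_irq(&tag_lock);

	return tag;	/* negative errno if allocation failed */
}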
Anyway, this is getting off-topic -- the point is that we need to keep
resources around for the original reply when we send a tflush, so we
can't just free that buffer first unless you're really good with it.
It'd be tempting to just steal its buffers but these might still be
useful, if e.g. both replies come in parallel.
(speaking of which, why do we need two buffers? Do we ever re-use the
sent buffer once the reply comes?... this all looks sequential to me...)
So instead of arguing here I'd say let's first finish your smaller reqs
patches and redo the mempool on top of that, with a failsafe just for
flush buffers so they never fall back on the mempool; I think that'll be
easier to do in this order.
--
Dominique
Greg on CC: please correct me on false assumptions on QEMU side ...
On Samstag, 9. Juli 2022 22:50:30 CEST Dominique Martinet wrote:
> Christian Schoenebeck wrote on Sat, Jul 09, 2022 at 08:08:41PM +0200:
> > Mmm, I "think" that wouldn't be something new. There is no guarantee that
> > client would not get a late response delivery by server of a request that
> > client has already thrown away.
>
> No. Well, it shouldn't -- responding to tflush should guarantee that the
> associated request is thrown away by the server
>
> https://9fans.github.io/plan9port/man/man9/flush.html
Yes, but that's another aspect of Tflush, its main purpose actually: the client
tells the server that it no longer cares about the previously sent request with
oldtag=X. That prevents the server routines from hanging for good on things the
client no longer cares about anyway, which otherwise might eventually lead to a
complete server lockup on certain setups.
On the QEMU side we have a dedicated 'synth' fs driver test case to ensure that
this really works (a simulated fs I/O call that never returns -> Tflush aborts
it -> Test Passed):
https://github.com/qemu/qemu/blob/master/tests/qtest/virtio-9p-test.c#L1234
> Order is not explicit, but I read this:
> > If it recognizes oldtag as the tag of a pending transaction, it should
> > abort any pending response and discard that tag.
>
> late replies to the oldtag are no longer allowed once rflush has been
> sent.
That's not quite correct, it also explicitly says this:
"The server may respond to the pending request before responding to the
Tflush."
And independent of what the 9p2000 spec says, consider this:
1. client sends a huge Twrite request
2. server starts to perform that write but it takes very long
3.A impatient client sends a Tflush to abort it
3.B server finally responds to Twrite with a normal Rwrite
These last two actions 3.A and 3.B may happen concurrently within the same
transport time frame, or "at the same time" if you will. There is no way to
prevent that from happening.
> But I guess that also depends on the transport being sequential -- that
> is the case for TCP but is it true for virtio as well? e.g. if a server
> replies something and immediately replies rflush are we guaranteed
> rflush is received second by the client?
That's more of an issue for the higher-level 9p server controller portion, not
a low-level transport one:
In the scenario described above, QEMU server would always send Rflush response
second, yes. So client would receive:
1. Rwrite or R(l)error
2. Rflush
Whether the same assumption can be made for any 9p server implementation,
though, I could not say.
As for transport: virtio itself is really just two FIFO ringbuffers (one
ringbuffer client -> server, one ringbuffer server -> client). Once either
side has placed its request/response message there, it is there, standing in
the queue and waiting to be pulled by the other side, no way back. Both
sides pull messages from their FIFO one by one, with no look-ahead. And a
significantly long time may pass before either side pulls the respective next
message. The order of messages received on one side always corresponds to the
order of messages sent by the other side, but that only applies to one
ringbuffer (direction). The two ringbuffers (message directions) are completely
independent of each other though, so no assumption can be made between them.
> There's also this bit:
> > When the client sends a Tflush, it must wait to receive the
> > corresponding Rflush before reusing oldtag for subsequent messages
>
> if we free the request at this point we'd reuse the tag immediately,
> which definitely lead to troubles.
Yes, that's a point I never understood about the Linux client: why it does
this. I find it problematic to recycle IDs in a distributed system within a
short time window. Additionally it also makes 9p protocol debugging more
difficult, as you often look at tag numbers in logs and think, "does this
reference the previous request, or is it about a new one now?"
> > What happens on server side is: requests come in sequentially, and are
> > started to be processed exactly in that order. But then they are actually
> > running in parallel on worker threads, dispatched back and forth between
> > threads several times. And Tflush itself is really just another request.
> > So there is no guarantee that the response order corresponds to the order
> > of requests originally sent by client, and if client sent a Tflush, it
> > might still get a response to its causal, abolished "normal" request.
>
> yes and processing flush ought to get a lock or something and look for
> oldtag.
> Looking at qemu code it does it right: processing flush find the old
> request and marks it as cancelled, then it waits for the request to
> finish (and possibly get discarded) during which (pdu_complete) it'll
> wake the flush up; so spurrious replies of a tag after flush should not
> be possible.
>
> --
> Dominique
On Sonntag, 10. Juli 2022 15:19:56 CEST Dominique Martinet wrote:
> Christian Schoenebeck wrote on Sun, Jul 10, 2022 at 02:57:58PM +0200:
> > On Samstag, 9. Juli 2022 22:50:30 CEST Dominique Martinet wrote:
> > > Christian Schoenebeck wrote on Sat, Jul 09, 2022 at 08:08:41PM +0200:
[...]
> > > late replies to the oldtag are no longer allowed once rflush has been
> > > sent.
> >
> > That's not quite correct, it also explicitly says this:
> >
> > "The server may respond to the pending request before responding to the
> > Tflush."
> >
> > And independent of what the 9p2000 spec says, consider this:
> >
> > 1. client sends a huge Twrite request
> > 2. server starts to perform that write but it takes very long
> > 3.A impatient client sends a Tflush to abort it
> > 3.B server finally responds to Twrite with a normal Rwrite
> >
> > These last two actions 3.A and 3.B may happen concurrently within the same
> > transport time frame, or "at the same time" if you will. There is no way
> > to
> > prevent that from happening.
>
> Yes, and that is precisely why we cannot free the buffers from the
> Twrite until we got the Rflush.
> Until the Rflush comes, a Rwrite can still come at any time so we cannot
> just free these resources.
With the current client version, agreed, as it might then incorrectly look up
a wrong (new) request under the already recycled tag number. With consecutive
tag numbers this would not happen: client lookup with the old tag number would
fail -> ignore reply. However ...
> In theory it'd be possible to free the buffers for some protocol and
> throw the data with the bathwater, but the man page says that in this
> case we should ignore the flush and behave as if the request behaved
> properly because of side-effects e.g. even if you try to interrupt an
> unlink() call if the server says it removed it, well, it's removed so we
> should tell userspace that.
... good point! I was probably thinking too much about Twrite/Tread examples,
so indeed I hadn't considered that case.
> > > > When the client sends a Tflush, it must wait to receive the
> > > > corresponding Rflush before reusing oldtag for subsequent messages
> > >
> > > if we free the request at this point we'd reuse the tag immediately,
> > > which definitely lead to troubles.
> >
> > Yes, that's the point I never understood why this is done by Linux client.
> > I find it problematic to recycle IDs in a distributed system within a
> > short time window. Additionally it also makes 9p protocol debugging more
> > difficult, as you often look at tag numbers in logs and think, "does this
> > reference the previous request, or is it about a new one now?"
>
> I can definitely agree with that.
> We need to keep track of used tags, but we don't need to pick the lowest
> tag available -- maybe the IDR code that allocates tag can be configured
> to endlessly increment and loop around, only avoiding duplicates?
>
> Ah, here it is, from Documentation/core-api/idr.rst:
>
> If you need to allocate IDs sequentially, you can use
> idr_alloc_cyclic(). The IDR becomes less efficient when dealing
> with larger IDs, so using this function comes at a slight cost.
>
>
> That would be another "easy change", if you'd like to check that cost at
> some point...
Nice! I'll definitely give this a whirl and will report back!
> (until we notice that some server has a static array for tags and stop
> working once you use a tag > 64 or something...)
That would be an incorrect server implementation then, a.k.a. a bug. The spec
is clear that tag numbers are generated by the client, and it does not mandate
any sequential structure.
> Anyway, this is getting off-topic -- the point is that we need to keep
> resources around for the original reply when we send a tflush, so we
> can't just free that buffer first unless you're really good with it.
>
> It'd be tempting to just steal its buffers but these might still be
> useful, if e.g. both replies come in parallel.
> (speaking of which, why do we need two buffers? Do we ever re-use the
> sent buffer once the reply comes?... this all looks sequential to me...)
Yep, I was thinking the exact same, but for now I would leave it this way.
> So instead of arguing here I'd say let's first finish your smaller reqs
> patches and make mempool again on top of that with a failsafe just for
> flush buffers to never fallback on mempool; I think that'll be easier to
> do in this order.
OK then, fine with me!
No time today, but I hope to post a new version next week.
Best regards,
Christian Schoenebeck
TFLUSH is called while the thread still holds memory for the
request we're trying to flush, so mempool alloc can deadlock
there. With the p9_msg_buf_size() rework the flush allocation is
small, so just make it fail if allocation failed; all that does
is potentially leak the request we're flushing until its reply
finally does come... or, if it never does, until umount.
Signed-off-by: Dominique Martinet <[email protected]>
---
Here's a concrete version of what I had in mind: literally just make
allocation fail if the initial alloc failed.
I can't reproduce any bad hang with a sane server here, but we still
risk hanging with a bad server that ignores flushes as these are still
unkillable (someday I'll finish my async requests work...)
So ultimately there are two things I'm not so happy about with mempools:
- this real possibility of client hangs if a server mishandles some
replies -- this might make fuzzing difficult in particular, I think it's
easier to deal with failed IO (as long as it fails all the way back to
userspace) than to hang forever.
I'm sure there are others who prefer to wait instead, but I think this
should at least have a timeout or something.
- One of the reasons I wanted to drop the old request cache before is
that these caches are per mount/connection. If you have a dozen
mounts that each cache 4 requests' worth as here, with msize=1MB and two
buffers per request we're locking down 8 * 12 = 96 MB of ram just for
mounting (see the sketch below).
That being said, as long as hanging is a real risk I'm not comfortable
sharing the mempools between all the clients either, so I'm not sure
what to suggest.
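To illustrate where that per-mount reserve comes from, here is a minimal
sketch assuming two mempools per client, each preallocating 4 msize-sized
buffers (the helper name and the use of a kmalloc-backed pool are assumptions
for illustration, not the actual patch):

#include <linux/mempool.h>
#include <linux/slab.h>

#define P9_POOL_MIN_NR 4	/* assumed preallocated buffers per pool */

/* One pool for tc buffers, one for rc buffers: with msize = 1MB this
 * pins 2 * 4 * 1MB = 8MB per mount, hence 96MB for a dozen mounts. */
static int p9_init_pools_sketch(mempool_t pools[2], size_t msize)
{
	int err;

	err = mempool_init_kmalloc_pool(&pools[0], P9_POOL_MIN_NR, msize);
	if (err)
		return err;

	err = mempool_init_kmalloc_pool(&pools[1], P9_POOL_MIN_NR, msize);
	if (err)
		mempool_exit(&pools[0]);

	return err;
}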
Anyway, we're getting close to the next merge window and I don't have
time to look deeper into this; I'll be putting the mempool patches on
hold for next cycle at least and we can discuss again if Kent still
encounters allocation troubles with Christian's smaller buffers
optimization first.
These will very likely get in this cycle unless something bad happens,
I've finished retesting a bit without trouble here.
net/9p/client.c | 23 +++++++++++++++++++----
1 file changed, 19 insertions(+), 4 deletions(-)
diff --git a/net/9p/client.c b/net/9p/client.c
index 928c9f88f204..f9c17fb79f35 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -218,7 +218,7 @@ static int parse_opts(char *opts, struct p9_client *clnt)
return ret;
}
-static void p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
+static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
int fc_idx, unsigned alloc_msize)
{
gfp_t gfp = GFP_NOFS|__GFP_NOWARN;
@@ -232,11 +232,13 @@ static void p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
if (alloc_msize < c->msize)
fc->sdata = kmalloc(alloc_msize, gfp);
- if (!fc->sdata) {
+ if (!fc->sdata && fc_idx >= 0) {
fc->sdata = mempool_alloc(&c->pools[fc_idx], gfp);
fc->used_mempool = true;
fc->capacity = c->msize;
}
+
+ return fc->sdata == NULL;
}
void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc,
@@ -280,6 +282,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size,
struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS);
int alloc_tsize;
int alloc_rsize;
+ int fc_idx = 0;
int tag;
va_list apc;
@@ -294,8 +297,19 @@ p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size,
if (!req)
return ERR_PTR(-ENOMEM);
- p9_fcall_init(c, &req->tc, 0, alloc_tsize);
- p9_fcall_init(c, &req->rc, 1, alloc_rsize);
+ /* We cannot use the mempool for TFLUSH because flushes can be
+ * allocated when the thread still holds memory for the request
+ * we're flushing. A negative fc_idx will make p9_fcall_init
+ * error out.
+ */
+ if (type == P9_TFLUSH) {
+ fc_idx = -2;
+ }
+
+ if (p9_fcall_init(c, &req->tc, fc_idx, alloc_tsize))
+ goto free_req;
+ if (p9_fcall_init(c, &req->rc, fc_idx + 1, alloc_rsize))
+ goto free;
p9pdu_reset(&req->tc);
p9pdu_reset(&req->rc);
@@ -334,6 +348,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size,
free:
p9_fcall_fini(c, &req->tc, 0);
p9_fcall_fini(c, &req->rc, 1);
+free_req:
kmem_cache_free(p9_req_cache, req);
return ERR_PTR(-ENOMEM);
}
--
2.35.1
On 7/13/22 00:17, Dominique Martinet wrote:
> TFLUSH is called while the thread still holds memory for the
> request we're trying to flush, so mempool alloc can deadlock
> there. With p9_msg_buf_size() rework the flush allocation is
> small so just make it fail if allocation failed; all that does
> is potentially leak the request we're flushing until its reply
> finally does come.. or if it never does until umount.
Why not just add separate mempools for flushes? We don't have to
allocate memory for big payloads so they won't cost much, and then the
IO paths will be fully mempool-ified :)
Kent Overstreet wrote on Wed, Jul 13, 2022 at 02:39:06AM -0400:
> On 7/13/22 00:17, Dominique Martinet wrote:
> > TFLUSH is called while the thread still holds memory for the
> > request we're trying to flush, so mempool alloc can deadlock
> > there. With p9_msg_buf_size() rework the flush allocation is
> > small so just make it fail if allocation failed; all that does
> > is potentially leak the request we're flushing until its reply
> > finally does come.. or if it never does until umount.
>
> Why not just add separate mempools for flushes? We don't have to allocate
> memory for big payloads so they won't cost much, and then the IO paths will
> be fully mempool-ified :)
I don't think it really matters either way -- I'm much more worried
about the two points I gave in the commit comment section: mempools not
being shared leading to increased memory usage with many mostly-idle
mounts (I know users who need that), and more importantly: mempool
waiting being uninterruptible/unfailable might be "nice" from the
mempool user's side, but I'd really prefer users to at least be able to
^C out of a mount made on a bad server getting stuck in mempool_alloc.
It looked good before I realized all the ways this could hang, but now I
just think for something like 9p it makes more sense to fail the
allocation and the IO than to bounce forever trying to allocate memory
we don't have.
Let's first see if you still see high order allocation failures when
these are made much less likely with Christian's patch.
What I intend to push this cycle is in
https://github.com/martinetd/linux/commits/9p-test
up to 'net/9p: allocate appropriate reduced message buffers'; if you can
easily reproduce them I'd appreciate it if you could confirm whether it helps.
(just waiting for Christian's confirmation + adjusting the strcmp for
rdma before I push it to 9p-next)
--
Dominique
On 7/13/22 03:12, Dominique Martinet wrote:
> Kent Overstreet wrote on Wed, Jul 13, 2022 at 02:39:06AM -0400:
>> On 7/13/22 00:17, Dominique Martinet wrote:
>>> TFLUSH is called while the thread still holds memory for the
>>> request we're trying to flush, so mempool alloc can deadlock
>>> there. With p9_msg_buf_size() rework the flush allocation is
>>> small so just make it fail if allocation failed; all that does
>>> is potentially leak the request we're flushing until its reply
>>> finally does come.. or if it never does until umount.
>>
>> Why not just add separate mempools for flushes? We don't have to allocate
>> memory for big payloads so they won't cost much, and then the IO paths will
>> be fully mempool-ified :)
>
> I don't think it really matters either way -- I'm much more worried
> about the two points I gave in the commit comment section: mempools not
> being shared leading to increased memory usage when many mostly-idle
> mounts (I know users who need that), and more importantly that the
> mempool waiting is uninterruptible/non-failible might be "nice" from the
> using mempool side but I'd really prefer users to be able to ^C out of
> a mount made on a bad server getting stuck in mempool_alloc at least.
We should never get stuck allocating memory - if that happens, game
over, system can no longer make forward progress.
(oh, that does give me an idea: Suren just implemented a code tagging
mechanism for tracking memory allocations by callsite, and I was talking
about using it for tracking latency. Memory allocation latency would be
a great thing to measure, it's something we care about and we haven't
had a good way of measuring it before).
> It looked good before I realized all the ways this could hang, but now I
> just think for something like 9p it makes more sense to fail the
> allocation and the IO than to bounce forever trying to allocate memory
> we don't have.
A filesystem that randomly fails IOs is, fundamentally, not a filesystem
that _works_. This whole thing started because 9pfs failing IOs has been
breaking my xfstests runs - and 9pfs isn't the thing I'm trying to test!
Local filesystems and the local IO stack have always had this
understanding - that IO needs to _just work_ and be able to make forward
progress without allocating additional memory, otherwise everything
falls over because memory reclaim requires doing IO. It's fundamentally
no different with network filesystems, the cultural expectation just
hasn't been there historically and not for any good technical reason -
it's just that in -net land dropping packets is generally a fine thing
to do when you have to - but it's really not in filesystem land, not if
you want to make something that's reliable under memory pressure!
> Let's first see if you still see if you still see high order allocation
> failures when these are made much less likely with Chritisan's patch.
Which patch is that? Unless you're talking about my mempool patch?
> What I intend to push this cycle is in
> https://github.com/martinetd/linux/commits/9p-test
> up to 'net/9p: allocate appropriate reduced message buffers'; if you can
> easily produce them I'd appreciate if you could confirm if it helps.
>
> (just waiting for Chritian's confirmation + adjusting the strcmp for
> rdma before I push it to 9p-next)
> --
> Dominique
>
Kent Overstreet wrote on Wed, Jul 13, 2022 at 03:40:10AM -0400:
>> I don't think it really matters either way -- I'm much more worried
>> about the two points I gave in the commit comment section: mempools not
>> being shared leading to increased memory usage when many mostly-idle
>> mounts (I know users who need that), and more importantly that the
>> mempool waiting is uninterruptible/non-failible might be "nice" from the
>> using mempool side but I'd really prefer users to be able to ^C out of
>> a mount made on a bad server getting stuck in mempool_alloc at least.
>
> We should never get stuck allocating memory - if that happens, game over,
> system can no longer make forward progress.
Well, that boils down to how much you want to trust your server.
The qemu server has been working rather well, but I just pulled
nfs-ganesha back from its grave for trans_fd tests earlier and got it to
crash on fsstress in 5 minutes, because upstream hasn't had anyone
working on the 9p side of things lately.
In that particular case the socket closes and the client gets out of its
requests, but if the server just stops replying you'll get requests
stuck, as 9p cannot get out of flushes -- another long-standing bug, this
one is even worse as it's unkillable.
And as long as that waits it won't free up buffers.
Even if that gets fixed, the buffers won't be freed until the server
replies, which can be arbitrarily slow -- ^C on a process that waits
for a free buffer must work.
> > It looked good before I realized all the ways this could hang, but now I
> > just think for something like 9p it makes more sense to fail the
> > allocation and the IO than to bounce forever trying to allocate memory
> > we don't have.
>
> A filesystem that randomly fails IOs is, fundamentally, not a filesystem
> that _works_. This whole thing started because 9pfs failing IOs has been
> breaking my xfstests runs - and 9pfs isn't the thing I'm trying to test!
I can agree with that, and I probably wasn't clear: what I'm talking
about here is interruptibility.
If all works well I agree some amount of waiting makes sense, but for
networked stuff some fallback also is important.
(if I could be greedy, some timeout would also make sense to me, but we
can agree to disagree on this one and I'll live without one)
> Local filesystems and the local IO stack have always had this understanding
> - that IO needs to _just work_ and be able to make forward progress without
> allocating additional memory, otherwise everything falls over because memory
> reclaim requires doing IO. It's fundamentally no different with network
> filesystems, the cultural expectation just hasn't been there historically
> and not for any good technical reason - it's just that in -net land dropping
> packets is generally a fine thing to do when you have to - but it's really
> not in filesystem land, not if you want to make something that's reliable
> under memory pressure!
Well... I agree memory pressure isn't a good reason and am happy to
spend some precious free time improving this, but while network
reliability shouldn't be a problem (9p only works on reliable
transports), I wouldn't be trusting 9p with too much.
For virtio this isn't much of a problem, but to give an example: if the
connection drops (server restart, or, if over WAN, some stateful firewall
restart) then the client gets lost and never recovers unless it's
remounted.
For this particular case, I know of a company that did an implementation
to remember and rebuild state (reopen fids etc.) from the dentry paths, but
that was never made public nor would it be applicable as is anyway.
Priorities also aren't the same for everyone: while it probably is
perfectly acceptable to put aside 2MB of memory in your case (single
mount, 4x2x256KB msize), I've seen a usecase which requires many mounts
(three per user logged in on the machine, possibly dozens) with large
RPCs for performance (msize=1MB); clogging up 24MB of memory per user
for filesystems they might not even use doesn't seem right to me.
Well, frankly I've just been advising to get away from 9p whenever I
could, it's simple so it's nice for testing, but I'm always surprised it
keeps getting new users... So I probably shouldn't be maintaining this :)
Anyway, for qemu it's probably possible to improve things some more:
> > Let's first see if you still see if you still see high order allocation
> > failures when these are made much less likely with Chritisan's patch.
>
> Which patch is that? Unless you're talking about my mempool patch?
You'll need his 4 patches to apply cleanly -- they're what we were talking
about for not allocating such large buffers for all requests.
aa32658b1ca1 net/9p: split message size argument into 't_size' and 'r_size' pair
86b277a492be 9p: add P9_ERRMAX for 9p2000 and 9p2000.u
17c59311f84d net/9p: add p9_msg_buf_size()
4b9eca5829ed net/9p: allocate appropriate reduced message buffers
The first three prepare for it and the last one enables it. Most requests will
allocate much smaller buffers, so if you have parallel IOs in particular
it should be much less likely to hit allocation failures.
They'll still happen for the larger messages (read/readdir/write) if you
can't allocate a single contiguous segment, but even these would only
need one such buffer instead of the current two, so that still helps a
bit.
--
Dominique
On Mittwoch, 13. Juli 2022 06:17:01 CEST Dominique Martinet wrote:
> TFLUSH is called while the thread still holds memory for the
> request we're trying to flush, so mempool alloc can deadlock
> there. With p9_msg_buf_size() rework the flush allocation is
> small so just make it fail if allocation failed; all that does
> is potentially leak the request we're flushing until its reply
> finally does come.. or if it never does until umount.
>
> Signed-off-by: Dominique Martinet <[email protected]>
> ---
Patch looks fine on first impression, but I'll postpone testing this. And yes,
I also think that exempting Tflush should be fair. If 4k (soon) cannot be
allocated, then you probably have worse problems than that.
> Here's a concrete version of what I had in mind: literally just make
> allocation fail if the initial alloc failed.
>
> I can't reproduce any bad hang with a sane server here, but we still
> risk hanging with a bad server that ignores flushes as these are still
> unkillable (someday I'll finish my async requests work...)
>
> So ultimately there are two things I'm not so happy about with mempools:
> - this real possibility of client hangs if a server mishandles some
> replies -- this might make fuzzing difficult in particular, I think it's
Concrete example of such a mishap?
> easier to deal with failed IO (as long as it fails all the way back to
> userspace) than to hang forever.
> I'm sure there are others who prefer to wait instead, but I think this
> should at least have a timeout or something.
Not sure if it would be easy to pick an appropriate timeout value. I've seen
things slowing down extremely with 9p after a while. But to be fair, these
were on production machines with ancient kernel versions, so maybe already
fixed.
A proc interface would be useful though, to be able to identify things like
too many fids piling up and other performance-related numbers.
> - One of the reasons I wanted to drop the old request cache before is
> that these caches are per mount/connection. If you have a dozen of
> mounts that each cache 4 requests worth as here, with msize=1MB and two
> buffers per request we're locking down 8 * 12 = 96 MB of ram just for
> mounting.
> That being said, as long as hanging is a real risk I'm not comfortable
> sharing the mempools between all the clients either, so I'm not sure
> what to suggest.
Why would a shared mempool increase the chance of a hang or worsen its
outcome?
> Anyway, we're getting close to the next merge request and I don't have
> time to look deeper into this; I'll be putting the mempool patches on
> hold for next cycle at least and we can discuss again if Kent still
> encounters allocation troubles with Christian's smaller buffers
> optimization first.
> These will very likely get in this cycle unless something bad happens,
> I've finished retesting a bit without trouble here.
>
>
> net/9p/client.c | 23 +++++++++++++++++++----
> 1 file changed, 19 insertions(+), 4 deletions(-)
>
> diff --git a/net/9p/client.c b/net/9p/client.c
> index 928c9f88f204..f9c17fb79f35 100644
> --- a/net/9p/client.c
> +++ b/net/9p/client.c
> @@ -218,7 +218,7 @@ static int parse_opts(char *opts, struct p9_client
> *clnt) return ret;
> }
>
> -static void p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
> +static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
> int fc_idx, unsigned alloc_msize)
> {
> gfp_t gfp = GFP_NOFS|__GFP_NOWARN;
> @@ -232,11 +232,13 @@ static void p9_fcall_init(struct p9_client *c, struct
> p9_fcall *fc, if (alloc_msize < c->msize)
> fc->sdata = kmalloc(alloc_msize, gfp);
>
> - if (!fc->sdata) {
> + if (!fc->sdata && fc_idx >= 0) {
> fc->sdata = mempool_alloc(&c->pools[fc_idx], gfp);
> fc->used_mempool = true;
> fc->capacity = c->msize;
> }
> +
> + return fc->sdata == NULL;
> }
>
> void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc,
> @@ -280,6 +282,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, uint
> t_size, uint r_size, struct p9_req_t *req = kmem_cache_alloc(p9_req_cache,
> GFP_NOFS); int alloc_tsize;
> int alloc_rsize;
> + int fc_idx = 0;
> int tag;
> va_list apc;
>
> @@ -294,8 +297,19 @@ p9_tag_alloc(struct p9_client *c, int8_t type, uint
> t_size, uint r_size, if (!req)
> return ERR_PTR(-ENOMEM);
>
> - p9_fcall_init(c, &req->tc, 0, alloc_tsize);
> - p9_fcall_init(c, &req->rc, 1, alloc_rsize);
> + /* We cannot use the mempool for TFLUSH because flushes can be
> + * allocated when the thread still holds memory for the request
> + * we're flushing. A negative fc_idx will make p9_fcall_init
> + * error out.
> + */
> + if (type == P9_TFLUSH) {
> + fc_idx = -2;
> + }
> +
> + if (p9_fcall_init(c, &req->tc, fc_idx, alloc_tsize))
> + goto free_req;
> + if (p9_fcall_init(c, &req->rc, fc_idx + 1, alloc_rsize))
> + goto free;
>
> p9pdu_reset(&req->tc);
> p9pdu_reset(&req->rc);
> @@ -334,6 +348,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, uint
> t_size, uint r_size, free:
> p9_fcall_fini(c, &req->tc, 0);
> p9_fcall_fini(c, &req->rc, 1);
> +free_req:
> kmem_cache_free(p9_req_cache, req);
> return ERR_PTR(-ENOMEM);
> }
Christian Schoenebeck wrote on Thu, Jul 14, 2022 at 09:16:14PM +0200:
> Patch looks fine on first impression, but I'll postpone testing this. And yes,
> I also think that exempting Tflush should be fair. If 4k (soon) cannot be
> allocated, then you probably have worse problems than that.
Yes, would be for a later cycle anyway -- no hurry.
> > Here's a concrete version of what I had in mind: literally just make
> > allocation fail if the initial alloc failed.
> >
> > I can't reproduce any bad hang with a sane server here, but we still
> > risk hanging with a bad server that ignores flushes as these are still
> > unkillable (someday I'll finish my async requests work...)
> >
> > So ultimately there are two things I'm not so happy about with mempools:
> > - this real possibility of client hangs if a server mishandles some
> > replies -- this might make fuzzing difficult in particular, I think it's
>
> Concrete example of such a mishap?
The example I gave Kent in another reply is just a server misbehaving -- I
have had a history of trouble with ganesha in the past -- but even with
low-memory qemu and fio and ^C it felt more likely to get stuck? It
looked killable, e.g. pkill -9 fio would get me out of it with this
patch, but ^C ought to work in my opinion.
In particular with the io_uring engine some of the workers aren't
visible at all from userspace (I only found out about them through
qemu's gdb and lx-ps), so it's really hard to see we're stuck on an
allocation if that ever happens...
Ultimately I think mempool is great for short-lived allocations,
e.g. temporary buffers, where we can be sure the memory will be freed up
after a short bounded time, but it might just not be a great fit for 9p.
I'm not sure what to suggest instead, though; this is really worst-case
thinking, and just having ^C work, e.g. making mempool_alloc interruptible
and failable, would probably be enough to convince me.
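For what it's worth, a hand-wavy sketch of what such an interruptible/failable
allocation could look like (this helper does not exist in the kernel; it just
polls the pool with a non-sleeping gfp mask and bails out on a fatal signal):

#include <linux/mempool.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>

static void *mempool_alloc_killable(mempool_t *pool, gfp_t gfp)
{
	void *p;

	for (;;) {
		/* without __GFP_DIRECT_RECLAIM, mempool_alloc() returns
		 * NULL instead of sleeping when the pool is exhausted */
		p = mempool_alloc(pool, (gfp & ~__GFP_DIRECT_RECLAIM) |
					__GFP_NOWARN);
		if (p)
			return p;
		if (fatal_signal_pending(current))
			return NULL;
		msleep_interruptible(20);
	}
}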
> > easier to deal with failed IO (as long as it fails all the way back to
> > userspace) than to hang forever.
> > I'm sure there are others who prefer to wait instead, but I think this
> > should at least have a timeout or something.
>
> Not sure if it was easy to pick an appropriate timeout value. I've seen things
> slowing down extremely with 9p after a while. But to be fair, these were on
> production machines with ancient kernel versions, so maybe already fixed.
I'm not sure what kind of timeframe you're thinking of, for exmple
lustre has 5 minutes timeouts in some places -- although depending on
the failure some things will also wait forever.
I was thining something similar, but realistically this isn't going to
happen anyway, at least not here, so let's not waste too much time on
this point...
> A proc interface would be useful though to be able to identify things like
> piling up too many fids and other performance related numbers.
That would be nice, yes.
We can probably pull in some stats from either idr (requests for tags
and fids) quite easily -- that might be a nice side project if someone
wants to do this.
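As a rough sketch of the idea, assuming the stats would just walk the client's
existing idrs (c->reqs for in-flight tags, c->fids for fids) -- the helper
names are made up and locking against concurrent allocation is left out:

#include <linux/idr.h>
#include <net/9p/client.h>

static int p9_idr_count_one(int id, void *p, void *data)
{
	(*(unsigned long *)data)++;
	return 0;
}

/* Number of entries currently in an idr; a proc or debugfs show
 * function could print this for both c->reqs and c->fids. */
static unsigned long p9_idr_count(struct idr *idr)
{
	unsigned long n = 0;

	idr_for_each(idr, p9_idr_count_one, &n);
	return n;
}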
> > - One of the reasons I wanted to drop the old request cache before is
> > that these caches are per mount/connection. If you have a dozen of
> > mounts that each cache 4 requests worth as here, with msize=1MB and two
> > buffers per request we're locking down 8 * 12 = 96 MB of ram just for
> > mounting.
> > That being said, as long as hanging is a real risk I'm not comfortable
> > sharing the mempools between all the clients either, so I'm not sure
> > what to suggest.
>
> Why would a shared mempool increase the chance of a hang or worsen its
> outcome?
In the tcp/really remote server case, if a server stops responding,
these requests won't get freed for a while (until the user gives
up, possibly never); if the client is also under enough memory pressure
to just fail a single alloc, then on a still-working mount we'll
potentially have no buffer to give it and get that other working mount
stuck.
Another example would be re-exporting a network filesystem, e.g. at a
previous job we'd run a 9p server re-exporting multiple lustre
mountpoints on different servers, so one could get stuck while others
still work. That this would impact the 9p client on different mounts on
the client side is a bit of a bummer.
Anyway, this is just looking for trouble and I'm sure most users would
only see improvements with this -- we're talking about some allocations
that would have failed anyway which should be rare enough in the first
place, and in the nominal case we'd get this to work when it doesn't.
I'm just not fully comfortable with the failure mode at this point.
--
Dominique
On Freitag, 15. Juli 2022 00:31:54 CEST Dominique Martinet wrote:
> Christian Schoenebeck wrote on Thu, Jul 14, 2022 at 09:16:14PM +0200:
> > Patch looks fine on first impression, but I'll postpone testing this. And
> > yes, I also think that exempting Tflush should be fair. If 4k (soon)
> > cannot be allocated, then you probably have worse problems than that.
>
> Yes, would be for a later cycle anyway -- no hurry.
>
> > > Here's a concrete version of what I had in mind: literally just make
> > > allocation fail if the initial alloc failed.
> > >
> > > I can't reproduce any bad hang with a sane server here, but we still
> > > risk hanging with a bad server that ignores flushes as these are still
> > > unkillable (someday I'll finish my async requests work...)
> > >
> > > So ultimately there are two things I'm not so happy about with mempools:
> > > - this real possibility of client hangs if a server mishandles some
> > >
> > > replies -- this might make fuzzing difficult in particular, I think it's
> >
> > Concrete example of such a mishap?
>
> The example I gave Kent in another reply is just server misbehaving -- I
> have had histories of troubles with ganesha in the past -- but even with
> low memory qemu and fio and ^C it felt more likely to get stuck? it
> looked killable e.g. pkill -9 fio would get me out of it with this
> patch, but ^C ought to work in my opinion.
Ah, I thought you had concrete message sequences in mind or something. NP
> In particular with the io_uring engine some of the workers aren't
> visible at all from userspace (I only found out about them through
> qemu's gdb and lx-ps), so it's really hard to see we're stuck on an
> allocation if that ever happens...
So also a candidate for showing the number of workers et al. in a future proc
entry.
> Ultimately I think mempool is great for short-lived allocations
> e.g. temporary buffers, where we can be sure the memory will be freed up
> after a short bounded time, but it might just not be a great fit for 9p.
> I'm not sure what to suggest instead, though; this is really worst-case
> thinking and just having ^c work e.g. make mempool_alloc interruptible
> and failible would probably be enough to convince me.
>
> > > easier to deal with failed IO (as long as it fails all the way back to
> > > userspace) than to hang forever.
> > > I'm sure there are others who prefer to wait instead, but I think this
> > > should at least have a timeout or something.
> >
> > Not sure if it was easy to pick an appropriate timeout value. I've seen
> > things slowing down extremely with 9p after a while. But to be fair,
> > these were on production machines with ancient kernel versions, so maybe
> > already fixed.
> I'm not sure what kind of timeframe you're thinking of, for exmple
> lustre has 5 minutes timeouts in some places -- although depending on
> the failure some things will also wait forever.
> I was thining something similar, but realistically this isn't going to
> happen anyway, at least not here, so let's not waste too much time on
> this point...
I think these were in the scope of around 10s? But given that I only
observed that after running it for a while, it might also increase over
time.
But OTOH, if that issue really still exists (not certain), then it would make
more sense for me to find out why that happens in the first place. Because a
latency of several seconds on a local system is simply odd.
> > A proc interface would be useful though to be able to identify things like
> > piling up too many fids and other performance related numbers.
>
> That would be nice, yes.
> We can probably pull in some stats from either idr (requests for tags
> and fids) quite easily -- that might be a nice side project if someone
> wants to do this.
Already on my todo list. :)
> > > - One of the reasons I wanted to drop the old request cache before is
> > >
> > > that these caches are per mount/connection. If you have a dozen of
> > > mounts that each cache 4 requests worth as here, with msize=1MB and two
> > > buffers per request we're locking down 8 * 12 = 96 MB of ram just for
> > > mounting.
> > > That being said, as long as hanging is a real risk I'm not comfortable
> > > sharing the mempools between all the clients either, so I'm not sure
> > > what to suggest.
> >
> > Why would a shared mempool increase the chance of a hang or worsen its
> > outcome?
>
> In the tcp/really remote server case, if a server stops responding
> that'll get these requests not freed for a while (until the user gives
> up, possibly never); if the client also is under enough memory pressure
> to just fail a single alloc then on a still working mount we'll
> potentially have no buffer to give it and get that other working mount
> stuck.
OK, I got that, but if the mempools were shared among multiple 9p mounts, then
it would also be less likely to trigger the case as less memory is used
overall, no?
> Another example would be re-exporting a network filesystem, e.g. at
> previous job we'd run a 9p server re-exporting multiple lustre
> mountpoints on different servers, so one could get stuck while others
> still work. That this would impact the 9p client on different mounts on
> the client side is a bit of a bummer.
OK, that would indeed be a scenario making a difference, as one mount could
tear down the other ones.
> Anyway, this is just looking for trouble and I'm sure most users would
> only see improvements with this -- we're talking about some allocations
> that would have failed anyway which should be rare enough in the first
> place, and in the nominal case we'd get this to work when it doesn't.
> I'm just not fully comfortable with the failure mode at this point.
> --
> Dominique