From: Trond Myklebust <[email protected]>
We'd like to avoid GFP_NOWAIT whenever possible, because it has no
fallback reclaim strategy for dealing with a failure of the initial
allocation.
At the same time, memory pools appear to be suboptimal as an allocation
strategy when used with anything other than GFP_NOWAIT, since a task
that blocks in mempool_alloc() can end up waiting forever for objects
to be returned to the pool.
This patch series therefore aims to demote the mempools to being a
strategy of last resort, with the primary allocation strategy being to
use the underlying slabs.
While we're at it, we want to ensure that rpciod, xprtiod and nfsiod all
use the same allocation strategy, so that the latter two don't thwart
our ability to complete writeback RPC calls by getting blocked in the mm
layer.
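In outline, the pattern the series converges on looks like this (a
minimal illustrative sketch with hypothetical names, not code taken
from the patches themselves):

#include <linux/mempool.h>
#include <linux/slab.h>

/* Sketch: slab first, emergency pool as a non-blocking last resort. */
static void *io_object_alloc(struct kmem_cache *cachep, mempool_t *pool)
{
	void *obj;

	/*
	 * Primary attempt: the underlying slab. From rpciod/xprtiod/
	 * nfsiod this may kick off direct reclaim, but __GFP_NORETRY
	 * means it fails rather than looping in the mm layer.
	 */
	obj = kmem_cache_alloc(cachep,
			       GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
	if (obj)
		return obj;
	/* Last resort: dip into the reserved mempool without sleeping. */
	return mempool_alloc(pool, GFP_NOWAIT);
}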
Trond Myklebust (9):
NFS: Fix memory allocation in rpc_malloc()
NFS: Fix memory allocation in rpc_alloc_task()
SUNRPC: Fix unx_lookup_cred() allocation
SUNRPC: Make the rpciod and xprtiod slab allocation modes consistent
NFS: nfsiod should not block forever in mempool_alloc()
NFS: Avoid writeback threads getting stuck in mempool_alloc()
NFSv4/pnfs: Ensure pNFS allocation modes are consistent with nfsiod
pNFS/flexfiles: Ensure pNFS allocation modes are consistent with
nfsiod
pNFS/files: Ensure pNFS allocation modes are consistent with nfsiod
fs/nfs/filelayout/filelayout.c | 2 +-
fs/nfs/flexfilelayout/flexfilelayout.c | 50 +++++++++++---------------
fs/nfs/internal.h | 7 ++++
fs/nfs/nfs42proc.c | 2 +-
fs/nfs/pagelist.c | 10 +++---
fs/nfs/pnfs.c | 39 +++++++++-----------
fs/nfs/pnfs_nfs.c | 8 +++--
fs/nfs/write.c | 34 +++++++++---------
include/linux/nfs_fs.h | 2 +-
include/linux/sunrpc/sched.h | 1 +
net/sunrpc/auth_unix.c | 18 +++++-----
net/sunrpc/backchannel_rqst.c | 8 ++---
net/sunrpc/rpcb_clnt.c | 4 +--
net/sunrpc/sched.c | 31 ++++++++++------
net/sunrpc/socklib.c | 3 +-
net/sunrpc/xprt.c | 5 +--
net/sunrpc/xprtsock.c | 11 +++---
17 files changed, 123 insertions(+), 112 deletions(-)
--
2.35.1
From: Trond Myklebust <[email protected]>
When in a low-memory situation, we do want rpciod to kick off direct
reclaim where that helps; however, we don't want it looping forever in
mempool_alloc().
So first try allocating from the slab using GFP_KERNEL | __GFP_NORETRY,
and then fall back to a GFP_NOWAIT allocation from the mempool.
Ditto for rpc_alloc_task().
Signed-off-by: Trond Myklebust <[email protected]>
---
include/linux/sunrpc/sched.h | 1 +
net/sunrpc/sched.c | 21 ++++++++++++++-------
2 files changed, 15 insertions(+), 7 deletions(-)
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 56710f8056d3..1d7a3e51b795 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -262,6 +262,7 @@ void rpc_destroy_mempool(void);
extern struct workqueue_struct *rpciod_workqueue;
extern struct workqueue_struct *xprtiod_workqueue;
void rpc_prepare_task(struct rpc_task *task);
+gfp_t rpc_task_gfp_mask(void);
static inline int rpc_wait_for_completion_task(struct rpc_task *task)
{
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 7c8f87ebdbc0..d59a033820be 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -57,6 +57,13 @@ struct workqueue_struct *rpciod_workqueue __read_mostly;
struct workqueue_struct *xprtiod_workqueue __read_mostly;
EXPORT_SYMBOL_GPL(xprtiod_workqueue);
+gfp_t rpc_task_gfp_mask(void)
+{
+ if (current->flags & PF_WQ_WORKER)
+ return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+ return GFP_KERNEL;
+}
+
unsigned long
rpc_task_timeout(const struct rpc_task *task)
{
@@ -1030,15 +1037,15 @@ int rpc_malloc(struct rpc_task *task)
struct rpc_rqst *rqst = task->tk_rqstp;
size_t size = rqst->rq_callsize + rqst->rq_rcvsize;
struct rpc_buffer *buf;
- gfp_t gfp = GFP_KERNEL;
-
- if (RPC_IS_ASYNC(task))
- gfp = GFP_NOWAIT | __GFP_NOWARN;
+ gfp_t gfp = rpc_task_gfp_mask();
size += sizeof(struct rpc_buffer);
- if (size <= RPC_BUFFER_MAXSIZE)
- buf = mempool_alloc(rpc_buffer_mempool, gfp);
- else
+ if (size <= RPC_BUFFER_MAXSIZE) {
+ buf = kmem_cache_alloc(rpc_buffer_slabp, gfp);
+ /* Reach for the mempool if dynamic allocation fails */
+ if (!buf && RPC_IS_ASYNC(task))
+ buf = mempool_alloc(rpc_buffer_mempool, GFP_NOWAIT);
+ } else
buf = kmalloc(size, gfp);
if (!buf)
--
2.35.1
From: Trond Myklebust <[email protected]>
As with rpc_malloc(), we first try allocating from the slab, then fall
back to a non-waiting allocation from the mempool.
Signed-off-by: Trond Myklebust <[email protected]>
---
net/sunrpc/sched.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index d59a033820be..b258b87a3ec2 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1108,10 +1108,14 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
rpc_init_task_statistics(task);
}
-static struct rpc_task *
-rpc_alloc_task(void)
+static struct rpc_task *rpc_alloc_task(void)
{
- return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_KERNEL);
+ struct rpc_task *task;
+
+ task = kmem_cache_alloc(rpc_task_slabp, rpc_task_gfp_mask());
+ if (task)
+ return task;
+ return mempool_alloc(rpc_task_mempool, GFP_NOWAIT);
}
/*
--
2.35.1
From: Trond Myklebust <[email protected]>
Default to the same mempool allocation strategy as for rpc_malloc().
Signed-off-by: Trond Myklebust <[email protected]>
---
net/sunrpc/auth_unix.c | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index c629d366030e..1e091d3fa607 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -40,17 +40,19 @@ unx_destroy(struct rpc_auth *auth)
/*
* Lookup AUTH_UNIX creds for current process
*/
-static struct rpc_cred *
-unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+static struct rpc_cred *unx_lookup_cred(struct rpc_auth *auth,
+ struct auth_cred *acred, int flags)
{
- gfp_t gfp = GFP_KERNEL;
struct rpc_cred *ret;
- if (flags & RPCAUTH_LOOKUP_ASYNC)
- gfp = GFP_NOWAIT | __GFP_NOWARN;
- ret = mempool_alloc(unix_pool, gfp);
- if (!ret)
- return ERR_PTR(-ENOMEM);
+ ret = kmalloc(sizeof(*ret), rpc_task_gfp_mask());
+ if (!ret) {
+ if (!(flags & RPCAUTH_LOOKUP_ASYNC))
+ return ERR_PTR(-ENOMEM);
+ ret = mempool_alloc(unix_pool, GFP_NOWAIT);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+ }
rpcauth_init_cred(ret, acred, auth, &unix_credops);
ret->cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
return ret;
--
2.35.1
From: Trond Myklebust <[email protected]>
Make sure that rpciod and xprtiod are always using the same slab
allocation modes.
Signed-off-by: Trond Myklebust <[email protected]>
---
net/sunrpc/backchannel_rqst.c | 8 ++++----
net/sunrpc/rpcb_clnt.c | 4 ++--
net/sunrpc/socklib.c | 3 ++-
net/sunrpc/xprt.c | 5 +----
net/sunrpc/xprtsock.c | 11 ++++++-----
5 files changed, 15 insertions(+), 16 deletions(-)
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 22a2c235abf1..5a6b61dcdf2d 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -75,9 +75,9 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
return 0;
}
-static
-struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags)
+static struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt)
{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
struct rpc_rqst *req;
/* Pre-allocate one backchannel rpc_rqst */
@@ -154,7 +154,7 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
INIT_LIST_HEAD(&tmp_list);
for (i = 0; i < min_reqs; i++) {
/* Pre-allocate one backchannel rpc_rqst */
- req = xprt_alloc_bc_req(xprt, GFP_KERNEL);
+ req = xprt_alloc_bc_req(xprt);
if (req == NULL) {
printk(KERN_ERR "Failed to create bc rpc_rqst\n");
goto out_free;
@@ -343,7 +343,7 @@ struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid)
break;
} else if (req)
break;
- new = xprt_alloc_bc_req(xprt, GFP_KERNEL);
+ new = xprt_alloc_bc_req(xprt);
} while (new);
return req;
}
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 0fdeb8666bfd..5a8e6d46809a 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -714,7 +714,7 @@ void rpcb_getport_async(struct rpc_task *task)
goto bailout_nofree;
}
- map = kzalloc(sizeof(struct rpcbind_args), GFP_KERNEL);
+ map = kzalloc(sizeof(struct rpcbind_args), rpc_task_gfp_mask());
if (!map) {
status = -ENOMEM;
goto bailout_release_client;
@@ -730,7 +730,7 @@ void rpcb_getport_async(struct rpc_task *task)
case RPCBVERS_4:
case RPCBVERS_3:
map->r_netid = xprt->address_strings[RPC_DISPLAY_NETID];
- map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL);
+ map->r_addr = rpc_sockaddr2uaddr(sap, rpc_task_gfp_mask());
if (!map->r_addr) {
status = -ENOMEM;
goto bailout_free_args;
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index d52313af82bc..05b38bf68316 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -15,6 +15,7 @@
#include <linux/pagemap.h>
#include <linux/udp.h>
#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/xdr.h>
#include <linux/export.h>
@@ -222,7 +223,7 @@ static int xprt_send_pagedata(struct socket *sock, struct msghdr *msg,
{
int err;
- err = xdr_alloc_bvec(xdr, GFP_KERNEL);
+ err = xdr_alloc_bvec(xdr, rpc_task_gfp_mask());
if (err < 0)
return err;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index bbe913121f43..744c6c1d536f 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1679,15 +1679,12 @@ static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task
static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt)
{
struct rpc_rqst *req = ERR_PTR(-EAGAIN);
- gfp_t gfp_mask = GFP_KERNEL;
if (xprt->num_reqs >= xprt->max_reqs)
goto out;
++xprt->num_reqs;
spin_unlock(&xprt->reserve_lock);
- if (current->flags & PF_WQ_WORKER)
- gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
- req = kzalloc(sizeof(*req), gfp_mask);
+ req = kzalloc(sizeof(*req), rpc_task_gfp_mask());
spin_lock(&xprt->reserve_lock);
if (req != NULL)
goto out;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 8909c768fe71..b52eaa8a0cda 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -428,9 +428,9 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags,
offset += want;
}
- want = xs_alloc_sparse_pages(buf,
- min_t(size_t, count - offset, buf->page_len),
- GFP_KERNEL);
+ want = xs_alloc_sparse_pages(
+ buf, min_t(size_t, count - offset, buf->page_len),
+ GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
if (seek < want) {
ret = xs_read_bvec(sock, msg, flags, buf->bvec,
xdr_buf_pagecount(buf),
@@ -826,7 +826,8 @@ static void
xs_stream_prepare_request(struct rpc_rqst *req)
{
xdr_free_bvec(&req->rq_rcv_buf);
- req->rq_task->tk_status = xdr_alloc_bvec(&req->rq_rcv_buf, GFP_KERNEL);
+ req->rq_task->tk_status = xdr_alloc_bvec(
+ &req->rq_rcv_buf, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
}
/*
@@ -2487,7 +2488,7 @@ static int bc_malloc(struct rpc_task *task)
return -EINVAL;
}
- page = alloc_page(GFP_KERNEL);
+ page = alloc_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
if (!page)
return -ENOMEM;
--
2.35.1
From: Trond Myklebust <[email protected]>
The concern is that since nfsiod is sometimes required to kick off a
commit, it can get locked up waiting forever in mempool_alloc() instead
of failing gracefully and leaving the commit until later.
Try to allocate from the slab first, with GFP_KERNEL | __GFP_NORETRY,
then fall back to a non-blocking attempt to allocate from the memory
pool.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/internal.h | 7 +++++++
fs/nfs/pnfs_nfs.c | 8 ++++++--
fs/nfs/write.c | 24 +++++++++---------------
include/linux/nfs_fs.h | 2 +-
4 files changed, 23 insertions(+), 18 deletions(-)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 194840a97e3a..57b0497105c8 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -587,6 +587,13 @@ nfs_write_match_verf(const struct nfs_writeverf *verf,
!nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier);
}
+static inline gfp_t nfs_io_gfp_mask(void)
+{
+ if (current->flags & PF_WQ_WORKER)
+ return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+ return GFP_KERNEL;
+}
+
/* unlink.c */
extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 316f68f96e57..657c242a18ff 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -419,7 +419,7 @@ static struct nfs_commit_data *
pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket,
struct nfs_commit_info *cinfo)
{
- struct nfs_commit_data *data = nfs_commitdata_alloc(false);
+ struct nfs_commit_data *data = nfs_commitdata_alloc();
if (!data)
return NULL;
@@ -515,7 +515,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
unsigned int nreq = 0;
if (!list_empty(mds_pages)) {
- data = nfs_commitdata_alloc(true);
+ data = nfs_commitdata_alloc();
+ if (!data) {
+ nfs_retry_commit(mds_pages, NULL, cinfo, -1);
+ return -ENOMEM;
+ }
data->ds_commit_index = -1;
list_splice_init(mds_pages, &data->pages);
list_add_tail(&data->list, &list);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 599a82406d38..ef47e3700e4b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -70,27 +70,17 @@ static mempool_t *nfs_wdata_mempool;
static struct kmem_cache *nfs_cdata_cachep;
static mempool_t *nfs_commit_mempool;
-struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail)
+struct nfs_commit_data *nfs_commitdata_alloc(void)
{
struct nfs_commit_data *p;
- if (never_fail)
- p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
- else {
- /* It is OK to do some reclaim, not no safe to wait
- * for anything to be returned to the pool.
- * mempool_alloc() cannot handle that particular combination,
- * so we need two separate attempts.
- */
+ p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask());
+ if (!p) {
p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
- if (!p)
- p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO |
- __GFP_NOWARN | __GFP_NORETRY);
if (!p)
return NULL;
+ memset(p, 0, sizeof(*p));
}
-
- memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&p->pages);
return p;
}
@@ -1826,7 +1816,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
if (list_empty(head))
return 0;
- data = nfs_commitdata_alloc(true);
+ data = nfs_commitdata_alloc();
+ if (!data) {
+ nfs_retry_commit(head, NULL, cinfo, -1);
+ return -ENOMEM;
+ }
/* Set up the argument struct */
nfs_init_commit(data, head, NULL, cinfo);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c47c448befc8..db305abafc9e 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -580,7 +580,7 @@ extern int nfs_wb_all(struct inode *inode);
extern int nfs_wb_page(struct inode *inode, struct page *page);
extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
extern int nfs_commit_inode(struct inode *, int);
-extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail);
+extern struct nfs_commit_data *nfs_commitdata_alloc(void);
extern void nfs_commit_free(struct nfs_commit_data *data);
bool nfs_commit_end(struct nfs_mds_commit_info *cinfo);
--
2.35.1
On Thu, 2022-03-24 at 09:57 +1100, NeilBrown wrote:
> On Tue, 22 Mar 2022, [email protected] wrote:
> > From: Trond Myklebust <[email protected]>
> >
> > We'd like to avoid GFP_NOWAIT whenever possible, because it has no
> > fallback reclaim strategy for dealing with a failure of the initial
> > allocation.
>
> I'm not sure I entirely agree with that. GFP_NOWAIT will ensure
> kswapd runs on failure, so waiting briefly and retrying (which sunrpc
> does on -ENOMEM, at least in call_refreshresult) is a valid fallback.
>
> However, I do like the new rpc_task_gfp_mask() and the fact that you
> have used it quite widely.
>
> So: looks good to me. I haven't carefully reviewed each patch enough
> to say Reviewed-by, but I didn't see any problems.
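(For reference, the fallback Neil describes would look roughly like the
sketch below; the helper is hypothetical, not actual SUNRPC code. A
failed GFP_NOWAIT allocation wakes kswapd, so a brief sleep and retry
can succeed once background reclaim has made progress.)

#include <linux/sched.h>
#include <linux/slab.h>

/* Hypothetical sketch of retrying a GFP_NOWAIT allocation. */
static void *alloc_nowait_retry(struct kmem_cache *cachep, int max_tries)
{
	void *p;

	while (max_tries--) {
		p = kmem_cache_alloc(cachep, GFP_NOWAIT | __GFP_NOWARN);
		if (p)
			return p;
		/* The failed attempt woke kswapd; give it a moment. */
		schedule_timeout_uninterruptible(HZ >> 4);
	}
	return NULL;
}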
Thanks Neil!
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]