LinuxLists.cc - [RFC PATCH v2 08/15] NFS: for localio don't call filesystem read() and write() routines directly

2024-06-12 03:08:13

Subject: [RFC PATCH v2 08/15] NFS: for localio don't call filesystem read() and write() routines directly

From: Trond Myklebust <[email protected]>

Some filesystem writeback routines can end up taking up a lot of stack
space (particularly xfs). Instead of risking running over due to the
extra overhead from the NFS stack, we should just call these routines
from a workqueue job.

Signed-off-by: Trond Myklebust <[email protected]>
Signed-off-by: Mike Snitzer <[email protected]>
---
fs/nfs/inode.c | 57 +++++++++++++++++---------
fs/nfs/internal.h | 1 +
fs/nfs/localio.c | 102 +++++++++++++++++++++++++++++++++++-----------
3 files changed, 118 insertions(+), 42 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 4f88b860494f..f5a618887c6f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2394,35 +2394,54 @@ static void nfs_destroy_inodecache(void)
kmem_cache_destroy(nfs_inode_cachep);
}

+struct workqueue_struct *nfslocaliod_workqueue;
struct workqueue_struct *nfsiod_workqueue;
EXPORT_SYMBOL_GPL(nfsiod_workqueue);

/*
- * start up the nfsiod workqueue
- */
-static int nfsiod_start(void)
-{
- struct workqueue_struct *wq;
- dprintk("RPC: creating workqueue nfsiod\n");
- wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
- if (wq == NULL)
- return -ENOMEM;
- nfsiod_workqueue = wq;
- return 0;
-}
-
-/*
- * Destroy the nfsiod workqueue
+ * Destroy the nfsiod workqueues
*/
static void nfsiod_stop(void)
{
struct workqueue_struct *wq;

wq = nfsiod_workqueue;
- if (wq == NULL)
- return;
- nfsiod_workqueue = NULL;
- destroy_workqueue(wq);
+ if (wq != NULL) {
+ nfsiod_workqueue = NULL;
+ destroy_workqueue(wq);
+ }
+#if defined(CONFIG_NFS_V3_LOCALIO) || defined(CONFIG_NFS_V4_LOCALIO)
+ wq = nfslocaliod_workqueue;
+ if (wq != NULL) {
+ nfslocaliod_workqueue = NULL;
+ destroy_workqueue(wq);
+ }
+#endif /* CONFIG_NFS_V3_LOCALIO || CONFIG_NFS_V4_LOCALIO */
+}
+
+/*
+ * Start the nfsiod workqueues
+ */
+static int nfsiod_start(void)
+{
+ dprintk("RPC: creating workqueue nfsiod\n");
+ nfsiod_workqueue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+ if (nfsiod_workqueue == NULL)
+ return -ENOMEM;
+#if defined(CONFIG_NFS_V3_LOCALIO) || defined(CONFIG_NFS_V4_LOCALIO)
+ /*
+ * localio writes need to use a normal (non-memreclaim) workqueue.
+ * When we start getting low on space, XFS goes and calls flush_work() on
+ * a non-memreclaim work queue, which causes a priority inversion problem.
+ */
+ dprintk("RPC: creating workqueue nfslocaliod\n");
+ nfslocaliod_workqueue = alloc_workqueue("nfslocaliod", WQ_UNBOUND, 0);
+ if (unlikely(nfslocaliod_workqueue == NULL)) {
+ nfsiod_stop();
+ return -ENOMEM;
+ }
+#endif /* CONFIG_NFS_V3_LOCALIO || CONFIG_NFS_V4_LOCALIO */
+ return 0;
}

unsigned int nfs_net_id;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 62909e5594b1..c299b1bfae9b 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -440,6 +440,7 @@ int nfs_check_flags(int);

/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
+extern struct workqueue_struct *nfslocaliod_workqueue;
extern struct inode *nfs_alloc_inode(struct super_block *sb);
extern void nfs_free_inode(struct inode *);
extern int nfs_write_inode(struct inode *, struct writeback_control *);
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index b3996bf9c45d..00832c9be9f5 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -64,6 +64,12 @@ struct nfs_local_fsync_ctx {
};
static void nfs_local_fsync_work(struct work_struct *work);

+struct nfs_local_io_args {
+ struct nfs_local_kiocb *iocb;
+ struct work_struct work;
+ struct completion *done;
+};
+
/*
* We need to translate between nfs status return values and
* the local errno values which may not be the same.
@@ -421,21 +427,38 @@ nfs_local_read_aio_complete(struct kiocb *kiocb, long ret)
nfs_local_pgio_complete(iocb);
}

-static int
-nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
- const struct rpc_call_ops *call_ops)
+static void nfs_local_call_read(struct work_struct *work)
{
- struct nfs_local_kiocb *iocb;
+ struct nfs_local_io_args *args =
+ container_of(work, struct nfs_local_io_args, work);
+ struct nfs_local_kiocb *iocb = args->iocb;
+ struct file *filp = iocb->kiocb.ki_filp;
struct iov_iter iter;
ssize_t status;

+ nfs_local_iter_init(&iter, iocb, READ);
+
+ status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+ if (status != -EIOCBQUEUED) {
+ nfs_local_read_done(iocb, status);
+ nfs_local_pgio_release(iocb);
+ }
+ complete(args->done);
+}
+
+static int nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
+ const struct rpc_call_ops *call_ops)
+{
+ struct nfs_local_io_args args;
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct nfs_local_kiocb *iocb;
+
dprintk("%s: vfs_read count=%u pos=%llu\n",
__func__, hdr->args.count, hdr->args.offset);

iocb = nfs_local_iocb_alloc(hdr, filp, GFP_KERNEL);
if (iocb == NULL)
return -ENOMEM;
- nfs_local_iter_init(&iter, iocb, READ);

nfs_local_pgio_init(hdr, call_ops);
hdr->res.eof = false;
@@ -445,11 +468,18 @@ nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
}

- status = filp->f_op->read_iter(&iocb->kiocb, &iter);
- if (status != -EIOCBQUEUED) {
- nfs_local_read_done(iocb, status);
- nfs_local_pgio_release(iocb);
- }
+ /*
+ * Don't call filesystem read() routines directly.
+ * In order to avoid issues with stack overflow,
+ * call the read routines from a workqueue job.
+ */
+ args.iocb = iocb;
+ args.done = &done;
+ INIT_WORK_ONSTACK(&args.work, nfs_local_call_read);
+ queue_work(nfslocaliod_workqueue, &args.work);
+ wait_for_completion(&done);
+ destroy_work_on_stack(&args.work);
+
return 0;
}

@@ -559,14 +589,35 @@ nfs_local_write_aio_complete(struct kiocb *kiocb, long ret)
nfs_local_pgio_complete(iocb);
}

-static int
-nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
- const struct rpc_call_ops *call_ops)
+static void nfs_local_call_write(struct work_struct *work)
{
- struct nfs_local_kiocb *iocb;
+ struct nfs_local_io_args *args =
+ container_of(work, struct nfs_local_io_args, work);
+ struct nfs_local_kiocb *iocb = args->iocb;
+ struct file *filp = iocb->kiocb.ki_filp;
struct iov_iter iter;
ssize_t status;

+ nfs_local_iter_init(&iter, iocb, WRITE);
+
+ file_start_write(filp);
+ status = filp->f_op->write_iter(&iocb->kiocb, &iter);
+ file_end_write(filp);
+ if (status != -EIOCBQUEUED) {
+ nfs_local_write_done(iocb, status);
+ nfs_get_vfs_attr(filp, iocb->hdr->res.fattr);
+ nfs_local_pgio_release(iocb);
+ }
+ complete(args->done);
+}
+
+static int nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
+ const struct rpc_call_ops *call_ops)
+{
+ struct nfs_local_io_args args;
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct nfs_local_kiocb *iocb;
+
dprintk("%s: vfs_write count=%u pos=%llu %s\n",
__func__, hdr->args.count, hdr->args.offset,
(hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable");
@@ -574,7 +625,6 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
iocb = nfs_local_iocb_alloc(hdr, filp, GFP_NOIO);
if (iocb == NULL)
return -ENOMEM;
- nfs_local_iter_init(&iter, iocb, WRITE);

switch (hdr->args.stable) {
default:
@@ -594,14 +644,20 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,

nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);

- file_start_write(filp);
- status = filp->f_op->write_iter(&iocb->kiocb, &iter);
- file_end_write(filp);
- if (status != -EIOCBQUEUED) {
- nfs_local_write_done(iocb, status);
- nfs_get_vfs_attr(filp, hdr->res.fattr);
- nfs_local_pgio_release(iocb);
- }
+ /*
+ * Don't call filesystem write() routines directly.
+ * Some filesystem writeback routines can end up taking up a lot of
+ * stack (particularly xfs). Instead of risking running over due to
+ * the extra overhead from the NFS stack, call these write routines
+ * from a workqueue job.
+ */
+ args.iocb = iocb;
+ args.done = &done;
+ INIT_WORK_ONSTACK(&args.work, nfs_local_call_write);
+ queue_work(nfslocaliod_workqueue, &args.work);
+ wait_for_completion(&done);
+ destroy_work_on_stack(&args.work);
+
return 0;
}

--
2.44.0

2024-06-12 04:39:32

by NeilBrown

[permalink] [raw]

Subject: Re: [RFC PATCH v2 08/15] NFS: for localio don't call filesystem read() and write() routines directly

On Wed, 12 Jun 2024, Mike Snitzer wrote:
> From: Trond Myklebust <[email protected]>
>
> Some filesystem writeback routines can end up taking up a lot of stack
> space (particularly xfs). Instead of risking running over due to the
> extra overhead from the NFS stack, we should just call these routines
> from a workqueue job.
>

Do we really need this today? Kernel stack is a lot bigger than it once
was.

If we do need it, do we need a dedicated workqueue? Why not just use
system_unbound_wq ??

NeilBrown