2023-12-15 01:21:20

by NeilBrown

[permalink] [raw]
Subject: [PATCH 0/2 v2] nfsd: fully close files in the nfsd threads

This is a revised version of my recent series to have nfsd close files
directly and not leave work to an async work queue.

This version takes a more cautious approach and only uses __fput_sync()
where there is a reasonable case that it could make a practical
difference.



2023-12-15 01:21:31

by NeilBrown

[permalink] [raw]
Subject: [PATCH 1/2] nfsd: Don't leave work of closing files to a work queue.

The work of closing a file can have non-trivial cost. Doing it in a
separate work queue thread means that cost isn't imposed on the nfsd
threads and an imbalance can be created. This can result in files being
queued for the work queue more quickly than the work queue can process
them, resulting in unbounded growth of the queue and memory exhaustion.

To avoid this work imbalance that exhausts memory, this patch moves all
closing of files into the nfsd threads. This means that when the work
imposes a cost, that cost appears where it would be expected - in the
work of the nfsd thread. A subsequent patch will ensure the final
__fput() is called in the same (nfsd) thread which calls filp_close().

Files opened for NFSv3 are never explicitly closed by the client and are
kept open by the server in the "filecache", which responds to memory
pressure, is garbage collected even when there is no pressure, and
sometimes closes files when there is particular need such as for rename.
These files currently have filp_close() called in a dedicated work
queue, so their __fput() can have no effect on nfsd threads.

This patch discards the work queue and instead has each nfsd thread call
filp_close() on as many as 8 files from the filecache each time it acts
on a client request (or finds there are no pending client requests). If
there are more to be closed, more threads are woken. This spreads the
work of __fput() over multiple threads and imposes any cost on those
threads.

The number 8 is somewhat arbitrary. It needs to be greater than 1 to
ensure that files are closed more quickly than they can be added to the
cache. It needs to be small enough to limit the per-request delays that
will be imposed on clients when all threads are busy closing files.

Signed-off-by: NeilBrown <[email protected]>
---
fs/nfsd/filecache.c | 67 +++++++++++++++++++++------------------------
fs/nfsd/filecache.h | 1 +
fs/nfsd/nfssvc.c | 2 ++
3 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ef063f93fde9..2521379d28ec 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -61,13 +61,10 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age);
static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);

struct nfsd_fcache_disposal {
- struct work_struct work;
spinlock_t lock;
struct list_head freeme;
};

-static struct workqueue_struct *nfsd_filecache_wq __read_mostly;
-
static struct kmem_cache *nfsd_file_slab;
static struct kmem_cache *nfsd_file_mark_slab;
static struct list_lru nfsd_file_lru;
@@ -421,7 +418,37 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
spin_lock(&l->lock);
list_move_tail(&nf->nf_lru, &l->freeme);
spin_unlock(&l->lock);
- queue_work(nfsd_filecache_wq, &l->work);
+ svc_wake_up(nn->nfsd_serv);
+ }
+}
+
+/**
+ * nfsd_file_net_dispose - deal with nfsd_files waiting to be disposed.
+ * @nn: nfsd_net in which to find files to be disposed.
+ *
+ * When files held open for NFSv3 are removed from the filecache, whether
+ * due to memory pressure or garbage collection, they are queued to
+ * a per-net-ns queue. This function completes the disposal, either
+ * directly or by waking another nfsd thread to help with the work.
+ */
+void nfsd_file_net_dispose(struct nfsd_net *nn)
+{
+ struct nfsd_fcache_disposal *l = nn->fcache_disposal;
+
+ if (!list_empty(&l->freeme)) {
+ LIST_HEAD(dispose);
+ int i;
+
+ spin_lock(&l->lock);
+ for (i = 0; i < 8 && !list_empty(&l->freeme); i++)
+ list_move(l->freeme.next, &dispose);
+ spin_unlock(&l->lock);
+ if (!list_empty(&l->freeme))
+ /* Wake up another thread to share the work
+ * *before* doing any actual disposing.
+ */
+ svc_wake_up(nn->nfsd_serv);
+ nfsd_file_dispose_list(&dispose);
}
}

@@ -634,27 +661,6 @@ nfsd_file_close_inode_sync(struct inode *inode)
flush_delayed_fput();
}

-/**
- * nfsd_file_delayed_close - close unused nfsd_files
- * @work: dummy
- *
- * Scrape the freeme list for this nfsd_net, and then dispose of them
- * all.
- */
-static void
-nfsd_file_delayed_close(struct work_struct *work)
-{
- LIST_HEAD(head);
- struct nfsd_fcache_disposal *l = container_of(work,
- struct nfsd_fcache_disposal, work);
-
- spin_lock(&l->lock);
- list_splice_init(&l->freeme, &head);
- spin_unlock(&l->lock);
-
- nfsd_file_dispose_list(&head);
-}
-
static int
nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
void *data)
@@ -717,10 +723,6 @@ nfsd_file_cache_init(void)
return ret;

ret = -ENOMEM;
- nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0);
- if (!nfsd_filecache_wq)
- goto out;
-
nfsd_file_slab = kmem_cache_create("nfsd_file",
sizeof(struct nfsd_file), 0, 0, NULL);
if (!nfsd_file_slab) {
@@ -735,7 +737,6 @@ nfsd_file_cache_init(void)
goto out_err;
}

-
ret = list_lru_init(&nfsd_file_lru);
if (ret) {
pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret);
@@ -785,8 +786,6 @@ nfsd_file_cache_init(void)
nfsd_file_slab = NULL;
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- destroy_workqueue(nfsd_filecache_wq);
- nfsd_filecache_wq = NULL;
rhltable_destroy(&nfsd_file_rhltable);
goto out;
}
@@ -832,7 +831,6 @@ nfsd_alloc_fcache_disposal(void)
l = kmalloc(sizeof(*l), GFP_KERNEL);
if (!l)
return NULL;
- INIT_WORK(&l->work, nfsd_file_delayed_close);
spin_lock_init(&l->lock);
INIT_LIST_HEAD(&l->freeme);
return l;
@@ -841,7 +839,6 @@ nfsd_alloc_fcache_disposal(void)
static void
nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l)
{
- cancel_work_sync(&l->work);
nfsd_file_dispose_list(&l->freeme);
kfree(l);
}
@@ -910,8 +907,6 @@ nfsd_file_cache_shutdown(void)
fsnotify_wait_marks_destroyed();
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- destroy_workqueue(nfsd_filecache_wq);
- nfsd_filecache_wq = NULL;
rhltable_destroy(&nfsd_file_rhltable);

for_each_possible_cpu(i) {
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index e54165a3224f..c61884def906 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -56,6 +56,7 @@ void nfsd_file_cache_shutdown_net(struct net *net);
void nfsd_file_put(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
void nfsd_file_close_inode_sync(struct inode *inode);
+void nfsd_file_net_dispose(struct nfsd_net *nn);
bool nfsd_file_is_cached(struct inode *inode);
__be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **nfp);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index fe61d9bbcc1f..26e203f42edd 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -956,6 +956,8 @@ nfsd(void *vrqstp)

svc_recv(rqstp);
validate_process_creds();
+
+ nfsd_file_net_dispose(nn);
}

atomic_dec(&nfsdstats.th_cnt);
--
2.43.0


2023-12-15 01:21:34

by NeilBrown

[permalink] [raw]
Subject: [PATCH 2/2] nfsd: use __fput_sync() to avoid delayed closing of files.

Calling fput() directly or though filp_close() from a kernel thread like
nfsd causes the final __fput() (if necessary) to be called from a
workqueue. This means that nfsd is not forced to wait for any work to
complete. If the ->release or ->destroy_inode function is slow for any
reason, this can result in nfsd closing files more quickly than the
workqueue can complete the close and the queue of pending closes can
grow without bound (30 million has been seen at one customer site,
though this was in part due to a slowness in xfs which has since been
fixed).

nfsd does not need this. It is quite appropriate and safe for nfsd to
do its own close work. There is no reason that close should ever wait
for nfsd, so no deadlock can occur.

It should be safe and sensible to change all fput() calls to
__fput_sync(). However in the interests of caution this patch only
changes two - the two that can be most directly affected by client
behaviour and could occur at high frequency.

- the fput() implicit in filp_close() is changed to __fput_sync()
by calling get_file() first to ensure filp_close() doesn't do
the final fput() itself. This is where files opened for IO are closed.

- the fput() in nfsd_readdir() is also changed. This is where directories
opened for readdir are closed.

This ensures that minimal fput work is queued to the workqueue.

This removes the need for the flush_delayed_fput() call in
nfsd_file_close_inode_sync()

Signed-off-by: NeilBrown <[email protected]>
---
fs/nfsd/filecache.c | 3 ++-
fs/nfsd/vfs.c | 10 +++++-----
2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 2521379d28ec..f9da4b0c4d42 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -280,7 +280,9 @@ nfsd_file_free(struct nfsd_file *nf)
nfsd_file_mark_put(nf->nf_mark);
if (nf->nf_file) {
nfsd_file_check_write_error(nf);
+ get_file(nf->nf_file);
filp_close(nf->nf_file, NULL);
+ __fput_sync(nf->nf_file);
}

/*
@@ -658,7 +660,6 @@ nfsd_file_close_inode_sync(struct inode *inode)
list_del_init(&nf->nf_lru);
nfsd_file_free(nf);
}
- flush_delayed_fput();
}

static int
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fbbea7498f02..998f9ba0e168 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1884,10 +1884,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
fh_drop_write(ffhp);

/*
- * If the target dentry has cached open files, then we need to try to
- * close them prior to doing the rename. Flushing delayed fput
- * shouldn't be done with locks held however, so we delay it until this
- * point and then reattempt the whole shebang.
+ * If the target dentry has cached open files, then we need to
+ * try to close them prior to doing the rename. Final fput
+ * shouldn't be done with locks held however, so we delay it
+ * until this point and then reattempt the whole shebang.
*/
if (close_cached) {
close_cached = false;
@@ -2141,7 +2141,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
if (err == nfserr_eof || err == nfserr_toosmall)
err = nfs_ok; /* can still be found in ->err */
out_close:
- fput(file);
+ __fput_sync(file);
out:
return err;
}
--
2.43.0


2023-12-15 08:27:30

by NeilBrown

[permalink] [raw]
Subject: [PATCH 3/2 SQUASH] nfsd: use __fput_sync() to avoid delayed closing of files.


After posting I remembered that you suggested a separate function would
be a good place for relevant documentation.

Please squash this into 2/2.

Thanks.

Signed-off-by: NeilBrown <[email protected]>
---
fs/nfsd/filecache.c | 4 +---
fs/nfsd/vfs.c | 34 +++++++++++++++++++++++++++++++++-
fs/nfsd/vfs.h | 2 ++
3 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index f9da4b0c4d42..9a6ff8fcd12e 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -280,9 +280,7 @@ nfsd_file_free(struct nfsd_file *nf)
nfsd_file_mark_put(nf->nf_mark);
if (nf->nf_file) {
nfsd_file_check_write_error(nf);
- get_file(nf->nf_file);
- filp_close(nf->nf_file, NULL);
- __fput_sync(nf->nf_file);
+ nfsd_filp_close(nf->nf_file);
}

/*
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 998f9ba0e168..f365c613e39e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2141,11 +2141,43 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
if (err == nfserr_eof || err == nfserr_toosmall)
err = nfs_ok; /* can still be found in ->err */
out_close:
- __fput_sync(file);
+ nfsd_filp_close(file);
out:
return err;
}

+/**
+ * nfsd_filp_close - close a file synchronously
+ * @fp: the file to close
+ *
+ * nfsd_filp_close() is similar in behaviour to filp_close().
+ * The difference is that if this is the final close on the
+ * file, then the finalisation happens immediately, rather than
+ * being handed over to a work queue, as is the case for
+ * filp_close().
+ * When a user-space process closes a file (even when using
+ * filp_close()) the finalisation happens before returning to
+ * userspace, so it is effectively synchronous. When a kernel thread
+ * uses filp_close(), on the other hand, the handling is completely
+ * asynchronous. This means that any cost imposed by that finalisation
+ * is not imposed on the nfsd thread, and nfsd could potentially
+ * close files more quickly than the work queue finalises the close,
+ * which would lead to unbounded growth in the queue.
+ *
+ * In some contexts it is not safe to synchronously wait for
+ * close finalisation (see comment for __fput_sync()), but nfsd
+ * does not match those contexts. In particular it does not, at the
+ * time that this function is called, hold any locks and no finalisation
+ * of any file, socket, or device driver would have any cause to wait
+ * for nfsd to make progress.
+ */
+void nfsd_filp_close(struct file *fp)
+{
+ get_file(fp);
+ filp_close(fp, NULL);
+ __fput_sync(fp);
+}
+
/*
* Get file system stats
* N.B. After this call fhp needs an fh_put
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index e3c29596f4df..f76b5c6b4706 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -147,6 +147,8 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
struct dentry *, int);

+void nfsd_filp_close(struct file *fp);
+
static inline int fh_want_write(struct svc_fh *fh)
{
int ret;
--
2.43.0


2023-12-15 11:01:53

by Jeffrey Layton

[permalink] [raw]
Subject: Re: [PATCH 0/2 v2] nfsd: fully close files in the nfsd threads

On Fri, 2023-12-15 at 12:18 +1100, NeilBrown wrote:
> This is a revised version of my recent series to have nfs close files
> directly and not leave work to an async work queue.
>
> This version takes a more cautious approach and only using __fput_sync()
> where there is a reasonable case that it could make a practical
> difference.
>

Reviewed-by: Jeff Layton <[email protected]>

2023-12-15 11:02:38

by Jeffrey Layton

[permalink] [raw]
Subject: Re: [PATCH 3/2 SQUASH] nfsd: use __fput_sync() to avoid delayed closing of files.

On Fri, 2023-12-15 at 19:27 +1100, NeilBrown wrote:
> After posting I remembered that you suggested a separate function would
> be a good place for relevant documentation.
>
> Please squash this into 2/2.
>
> Thanks.
>
> Signed-off-by: NeilBrown <[email protected]>
> ---
> fs/nfsd/filecache.c | 4 +---
> fs/nfsd/vfs.c | 34 +++++++++++++++++++++++++++++++++-
> fs/nfsd/vfs.h | 2 ++
> 3 files changed, 36 insertions(+), 4 deletions(-)
>
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index f9da4b0c4d42..9a6ff8fcd12e 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -280,9 +280,7 @@ nfsd_file_free(struct nfsd_file *nf)
> nfsd_file_mark_put(nf->nf_mark);
> if (nf->nf_file) {
> nfsd_file_check_write_error(nf);
> - get_file(nf->nf_file);
> - filp_close(nf->nf_file, NULL);
> - __fput_sync(nf->nf_file);
> + nfsd_filp_close(nf->nf_file);
> }
>
> /*
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 998f9ba0e168..f365c613e39e 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -2141,11 +2141,43 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
> if (err == nfserr_eof || err == nfserr_toosmall)
> err = nfs_ok; /* can still be found in ->err */
> out_close:
> - __fput_sync(file);
> + nfsd_filp_close(file);
> out:
> return err;
> }
>
> +/**
> + * nfsd_filp_close: close a file synchronously
> + * @fp: the file to close
> + *
> + * nfsd_filp_close() is similar in behaviour to filp_close().
> + * The difference is that if this is the final close on the
> + * file, the that finalisation happens immediately, rather then
> + * being handed over to a work_queue, as it the case for
> + * filp_close().
> + * When a user-space process closes a file (even when using
> + * filp_close() the finalisation happens before returning to
> + * userspace, so it is effectively synchronous. When a kernel thread
> + * uses file_close(), on the other hand, the handling is completely
> + * asynchronous. This means that any cost imposed by that finalisation
> + * is not imposed on the nfsd thread, and nfsd could potentually
> + * close files more quickly than the work queue finalises the close,
> + * which would lead to unbounded growth in the queue.
> + *
> + * In some contexts is it not safe to synchronously wait for
> + * close finalisation (see comment for __fput_sync()), but nfsd
> + * does not match those contexts. In partcilarly it does not, at the
> + * time that this function is called, hold and locks and no finalisation
> + * of any file, socket, or device driver would have any cause to wait
> + * for nfsd to make progress.
> + */
> +void nfsd_filp_close(struct file *fp)
> +{
> + get_file(fp);
> + filp_close(fp, NULL);
> + __fput_sync(fp);
> +}
> +
> /*
> * Get file system stats
> * N.B. After this call fhp needs an fh_put
> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> index e3c29596f4df..f76b5c6b4706 100644
> --- a/fs/nfsd/vfs.h
> +++ b/fs/nfsd/vfs.h
> @@ -147,6 +147,8 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
> __be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
> struct dentry *, int);
>
> +void nfsd_filp_close(struct file *fp);
> +
> static inline int fh_want_write(struct svc_fh *fh)
> {
> int ret;

Reviewed-by: Jeff Layton <[email protected]>

2023-12-15 15:52:46

by Chuck Lever

[permalink] [raw]
Subject: Re: [PATCH 0/2 v2] nfsd: fully close files in the nfsd threads

On Fri, Dec 15, 2023 at 12:18:29PM +1100, NeilBrown wrote:
> This is a revised version of my recent series to have nfs close files
> directly and not leave work to an async work queue.
>
> This version takes a more cautious approach and only using __fput_sync()
> where there is a reasonable case that it could make a practical
> difference.

Hi Neil, thanks for the revisions and documentation.

Jeff and I would like to see this set spend some time in linux-next
before getting merged. We're just about on the cusp of closing out
nfsd-next for v6.8, so my plan is to apply these to nfsd-next just
as soon as v6.9 opens to give testers and bots plenty of time to
exercise them.


--
Chuck Lever