This patchset should bring the filecache into closer alignment with what
Neil had mentioned here. The first patch cleans up the refcounting, and
the latter two optimize the handling of "gc" entries.
This should apply cleanly on top of Chuck's current for-next branch.
Jeff Layton (3):
nfsd: rework refcounting in filecache
nfsd: only keep unused entries on the LRU
nfsd: start non-blocking writeback after adding nfsd_file to the LRU
fs/nfsd/filecache.c | 331 ++++++++++++++++++++++++--------------------
fs/nfsd/trace.h | 5 +-
2 files changed, 185 insertions(+), 151 deletions(-)
--
2.37.3
Currently, nfsd_files live on the LRU once they are added until they are
unhashed. There's no need to keep ones that are actively in use there.
Before incrementing the refcount, do a lockless check for nf_lru being
empty. If it's not then attempt to remove the entry from the LRU. If
that's successful, claim the LRU reference and return it. If the removal
fails (or if the list_head was empty), then just increment the counter
as we normally would.
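In pseudocode, the new nfsd_file_get() flow is roughly (helper names as in
the hunk below; untested sketch):

	if (!list_empty(&nf->nf_lru) &&		/* lockless hint */
	    nfsd_file_lru_remove(nf))		/* locked list_lru_del */
		return nf;			/* claim the LRU's reference */
	return __nfsd_file_get(nf);		/* else refcount_inc_not_zero */

Callers are unchanged: they still get back a referenced nfsd_file (or NULL)
and drop that reference with nfsd_file_put() as before.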
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/filecache.c | 23 ++++++++++++++++++++---
1 file changed, 20 insertions(+), 3 deletions(-)
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index e63534f4b9f8..d2bbded805d4 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -420,14 +420,31 @@ nfsd_file_unhash(struct nfsd_file *nf)
return false;
}
-struct nfsd_file *
-nfsd_file_get(struct nfsd_file *nf)
+static struct nfsd_file *
+__nfsd_file_get(struct nfsd_file *nf)
{
if (likely(refcount_inc_not_zero(&nf->nf_ref)))
return nf;
return NULL;
}
+struct nfsd_file *
+nfsd_file_get(struct nfsd_file *nf)
+{
+ /*
+ * Do a lockless list_empty check first, before attempting to
+ * remove it, so we can avoid the spinlock when it's not on the
+ * list.
+ *
+ * If we successfully remove it from the LRU, then we can just
+ * claim the LRU reference and return it. Otherwise, we need to
+ * bump the counter the old-fashioned way.
+ */
+ if (!list_empty(&nf->nf_lru) && nfsd_file_lru_remove(nf))
+ return nf;
+ return __nfsd_file_get(nf);
+}
+
/**
* nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
* @nf: nfsd_file to be unhashed and queued
@@ -449,7 +466,7 @@ nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
* to take a reference. If that fails, just ignore
* the file altogether.
*/
- if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
+ if (!nfsd_file_lru_remove(nf) && !__nfsd_file_get(nf))
return false;
list_add(&nf->nf_lru, dispose);
return true;
--
2.37.3
The filecache refcounting is a bit non-standard for something searchable
by RCU, in that we maintain a sentinel reference while it's hashed. This
in turn requires that we have to do things differently in the "put"
depending on whether it's hashed, which we believe to have led to races.
There are other problems in here too. nfsd_file_close_inode_sync can end
up freeing an nfsd_file while there are still outstanding references to
it, and the handling
Rework the code so that the refcount is what drives the lifecycle. When
the refcount goes to zero, then unhash and rcu free the object.
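Condensed, the resulting put path looks roughly like this (helper names as
in the patch below; trace points omitted; untested sketch):

	void nfsd_file_put(struct nfsd_file *nf)
	{
		/* GC (v2/v3) files try to park their last reference on the LRU */
		if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
		    (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf)))
			return;

		/* otherwise the refcount alone drives teardown */
		if (refcount_dec_and_test(&nf->nf_ref)) {
			nfsd_file_unhash(nf);
			nfsd_file_free(nf);	/* fsync, filp_close, call_rcu */
		}
	}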
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/filecache.c | 291 +++++++++++++++++++++-----------------------
fs/nfsd/trace.h | 5 +-
2 files changed, 144 insertions(+), 152 deletions(-)
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 98c6b5f51bc8..e63534f4b9f8 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -1,6 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
/*
* The NFSD open file cache.
+ *
+ * Each nfsd_file is created in response to client activity -- either regular
+ * file I/O for v2/v3, or opening a file for v4. Files opened via v4 are
+ * cleaned up as soon as their refcount goes to 0. Entries for v2/v3 are
+ * flagged with NFSD_FILE_GC. On their last put, they are added to the LRU for
+ * eventual disposal if they aren't used again within a short time period.
*/
#include <linux/hash.h>
@@ -302,31 +308,43 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
if (key->gc)
__set_bit(NFSD_FILE_GC, &nf->nf_flags);
nf->nf_inode = key->inode;
- /* nf_ref is pre-incremented for hash table */
- refcount_set(&nf->nf_ref, 2);
+ refcount_set(&nf->nf_ref, 1);
nf->nf_may = key->need;
nf->nf_mark = NULL;
}
return nf;
}
-static bool
+static void
+nfsd_file_flush(struct nfsd_file *nf)
+{
+ struct file *file = nf->nf_file;
+
+ if (!file || !(file->f_mode & FMODE_WRITE))
+ return;
+ this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
+ if (vfs_fsync(file, 1) != 0)
+ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
+}
+
+static void
nfsd_file_free(struct nfsd_file *nf)
{
s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
- bool flush = false;
+
+ trace_nfsd_file_free(nf);
this_cpu_inc(nfsd_file_releases);
this_cpu_add(nfsd_file_total_age, age);
- trace_nfsd_file_put_final(nf);
+ nfsd_file_flush(nf);
+
if (nf->nf_mark)
nfsd_file_mark_put(nf->nf_mark);
if (nf->nf_file) {
get_file(nf->nf_file);
filp_close(nf->nf_file, NULL);
fput(nf->nf_file);
- flush = true;
}
/*
@@ -334,10 +352,9 @@ nfsd_file_free(struct nfsd_file *nf)
* WARN and leak it to preserve system stability.
*/
if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
- return flush;
+ return;
call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
- return flush;
}
static bool
@@ -363,29 +380,23 @@ nfsd_file_check_write_error(struct nfsd_file *nf)
return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
}
-static void
-nfsd_file_flush(struct nfsd_file *nf)
-{
- struct file *file = nf->nf_file;
-
- if (!file || !(file->f_mode & FMODE_WRITE))
- return;
- this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
- if (vfs_fsync(file, 1) != 0)
- nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
-}
-
-static void nfsd_file_lru_add(struct nfsd_file *nf)
+static bool nfsd_file_lru_add(struct nfsd_file *nf)
{
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
+ if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
trace_nfsd_file_lru_add(nf);
+ return true;
+ }
+ return false;
}
-static void nfsd_file_lru_remove(struct nfsd_file *nf)
+static bool nfsd_file_lru_remove(struct nfsd_file *nf)
{
- if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
+ if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
trace_nfsd_file_lru_del(nf);
+ return true;
+ }
+ return false;
}
static void
@@ -409,94 +420,89 @@ nfsd_file_unhash(struct nfsd_file *nf)
return false;
}
-static void
-nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
+struct nfsd_file *
+nfsd_file_get(struct nfsd_file *nf)
{
- trace_nfsd_file_unhash_and_dispose(nf);
+ if (likely(refcount_inc_not_zero(&nf->nf_ref)))
+ return nf;
+ return NULL;
+}
+
+/**
+ * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
+ * @nf: nfsd_file to be unhashed and queued
+ * @dispose: list to which it should be queued
+ *
+ * Attempt to unhash a nfsd_file and queue it to the given list. Each file
+ * will have a reference held on behalf of the list. That reference may come
+ * from the LRU, or we may need to take one. If we can't get a reference,
+ * ignore it altogether.
+ */
+static bool
+nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
+{
+ trace_nfsd_file_unhash_and_queue(nf);
if (nfsd_file_unhash(nf)) {
- /* caller must call nfsd_file_dispose_list() later */
- nfsd_file_lru_remove(nf);
+ /*
+ * If we remove it from the LRU, then just use that
+ * reference for the dispose list. Otherwise, we need
+ * to take a reference. If that fails, just ignore
+ * the file altogether.
+ */
+ if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
+ return false;
list_add(&nf->nf_lru, dispose);
+ return true;
}
+ return false;
}
-static void
-nfsd_file_put_noref(struct nfsd_file *nf)
+static bool
+__nfsd_file_put(struct nfsd_file *nf)
{
- trace_nfsd_file_put(nf);
-
if (refcount_dec_and_test(&nf->nf_ref)) {
- WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
- nfsd_file_lru_remove(nf);
+ nfsd_file_unhash(nf);
nfsd_file_free(nf);
+ return true;
}
+ return false;
}
-static void
-nfsd_file_unhash_and_put(struct nfsd_file *nf)
-{
- if (nfsd_file_unhash(nf))
- nfsd_file_put_noref(nf);
-}
-
+/**
+ * nfsd_file_put - put the reference to a nfsd_file
+ * @nf: nfsd_file of which to put the reference
+ *
+ * Put a reference to a nfsd_file. In the v4 case, we just put the
+ * reference immediately. In the v2/3 case, if the reference would be
+ * the last one, then put it on the LRU instead to be cleaned up later.
+ */
void
nfsd_file_put(struct nfsd_file *nf)
{
- might_sleep();
-
- if (test_bit(NFSD_FILE_GC, &nf->nf_flags))
- nfsd_file_lru_add(nf);
- else if (refcount_read(&nf->nf_ref) == 2)
- nfsd_file_unhash_and_put(nf);
-
- if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- nfsd_file_flush(nf);
- nfsd_file_put_noref(nf);
- } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
- nfsd_file_put_noref(nf);
- nfsd_file_schedule_laundrette();
- } else
- nfsd_file_put_noref(nf);
-}
-
-struct nfsd_file *
-nfsd_file_get(struct nfsd_file *nf)
-{
- if (likely(refcount_inc_not_zero(&nf->nf_ref)))
- return nf;
- return NULL;
-}
-
-static void
-nfsd_file_dispose_list(struct list_head *dispose)
-{
- struct nfsd_file *nf;
+ trace_nfsd_file_put(nf);
- while(!list_empty(dispose)) {
- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del_init(&nf->nf_lru);
- nfsd_file_flush(nf);
- nfsd_file_put_noref(nf);
+ if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
+ /*
+ * If this is the last reference (nf_ref == 1), then transfer
+ * it to the LRU. If the add to the LRU fails, just put it as
+ * usual.
+ */
+ if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
+ return;
}
+ __nfsd_file_put(nf);
}
static void
-nfsd_file_dispose_list_sync(struct list_head *dispose)
+nfsd_file_dispose_list(struct list_head *dispose)
{
- bool flush = false;
struct nfsd_file *nf;
while(!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
list_del_init(&nf->nf_lru);
- nfsd_file_flush(nf);
- if (!refcount_dec_and_test(&nf->nf_ref))
- continue;
- if (nfsd_file_free(nf))
- flush = true;
+ nfsd_file_free(nf);
}
- if (flush)
- flush_delayed_fput();
}
static void
@@ -566,21 +572,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
struct list_head *head = arg;
struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
- /*
- * Do a lockless refcount check. The hashtable holds one reference, so
- * we look to see if anything else has a reference, or if any have
- * been put since the shrinker last ran. Those don't get unhashed and
- * released.
- *
- * Note that in the put path, we set the flag and then decrement the
- * counter. Here we check the counter and then test and clear the flag.
- * That order is deliberate to ensure that we can do this locklessly.
- */
- if (refcount_read(&nf->nf_ref) > 1) {
- list_lru_isolate(lru, &nf->nf_lru);
- trace_nfsd_file_gc_in_use(nf);
- return LRU_REMOVED;
- }
+ /* We should only be dealing with v2/3 entries here */
+ WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
/*
* Don't throw out files that are still undergoing I/O or
@@ -591,40 +584,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
return LRU_SKIP;
}
+ /* If it was recently added to the list, skip it */
if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
trace_nfsd_file_gc_referenced(nf);
return LRU_ROTATE;
}
- if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- trace_nfsd_file_gc_hashed(nf);
- return LRU_SKIP;
+ /*
+ * Put the reference held on behalf of the LRU. If it wasn't the last
+ * one, then just remove it from the LRU and ignore it.
+ */
+ if (!refcount_dec_and_test(&nf->nf_ref)) {
+ trace_nfsd_file_gc_in_use(nf);
+ list_lru_isolate(lru, &nf->nf_lru);
+ return LRU_REMOVED;
}
+ /* Refcount went to zero. Unhash it and queue it to the dispose list */
+ nfsd_file_unhash(nf);
list_lru_isolate_move(lru, &nf->nf_lru, head);
this_cpu_inc(nfsd_file_evictions);
trace_nfsd_file_gc_disposed(nf);
return LRU_REMOVED;
}
-/*
- * Unhash items on @dispose immediately, then queue them on the
- * disposal workqueue to finish releasing them in the background.
- *
- * cel: Note that between the time list_lru_shrink_walk runs and
- * now, these items are in the hash table but marked unhashed.
- * Why release these outside of lru_cb ? There's no lock ordering
- * problem since lru_cb currently takes no lock.
- */
-static void nfsd_file_gc_dispose_list(struct list_head *dispose)
-{
- struct nfsd_file *nf;
-
- list_for_each_entry(nf, dispose, nf_lru)
- nfsd_file_hash_remove(nf);
- nfsd_file_dispose_list_delayed(dispose);
-}
-
static void
nfsd_file_gc(void)
{
@@ -634,7 +617,7 @@ nfsd_file_gc(void)
ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
&dispose, list_lru_count(&nfsd_file_lru));
trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
- nfsd_file_gc_dispose_list(&dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
}
static void
@@ -659,7 +642,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
nfsd_file_lru_cb, &dispose);
trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
- nfsd_file_gc_dispose_list(&dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
return ret;
}
@@ -670,8 +653,11 @@ static struct shrinker nfsd_file_shrinker = {
};
/*
- * Find all cache items across all net namespaces that match @inode and
- * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
+ * Find all cache items across all net namespaces that match @inode, unhash
+ * them, take references and then put them on @dispose if that was successful.
+ *
+ * The nfsd_file objects on the list will be unhashed, and each will have a
+ * reference taken.
*/
static unsigned int
__nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
@@ -689,52 +675,58 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
nfsd_file_rhash_params);
if (!nf)
break;
- nfsd_file_unhash_and_dispose(nf, dispose);
- count++;
+
+ if (nfsd_file_unhash_and_queue(nf, dispose))
+ count++;
} while (1);
rcu_read_unlock();
return count;
}
/**
- * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Unhash and put, then flush and fput all cache items associated with @inode.
+ * Unhash and put all cache item associated with @inode.
*/
-void
-nfsd_file_close_inode_sync(struct inode *inode)
+static unsigned int
+nfsd_file_close_inode(struct inode *inode)
{
- LIST_HEAD(dispose);
+ struct nfsd_file *nf;
unsigned int count;
+ LIST_HEAD(dispose);
count = __nfsd_file_close_inode(inode, &dispose);
- trace_nfsd_file_close_inode_sync(inode, count);
- nfsd_file_dispose_list_sync(&dispose);
+ trace_nfsd_file_close_inode(inode, count);
+ if (count) {
+ while(!list_empty(&dispose)) {
+ nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
+ list_del_init(&nf->nf_lru);
+ trace_nfsd_file_closing(nf);
+ __nfsd_file_put(nf);
+ }
+ }
+ return count;
}
/**
- * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
+ * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Unhash and put all cache item associated with @inode.
+ * Unhash and put, then flush and fput all cache items associated with @inode.
*/
-static void
-nfsd_file_close_inode(struct inode *inode)
+void
+nfsd_file_close_inode_sync(struct inode *inode)
{
- LIST_HEAD(dispose);
- unsigned int count;
-
- count = __nfsd_file_close_inode(inode, &dispose);
- trace_nfsd_file_close_inode(inode, count);
- nfsd_file_dispose_list_delayed(&dispose);
+ if (nfsd_file_close_inode(inode))
+ flush_delayed_fput();
}
/**
* nfsd_file_delayed_close - close unused nfsd_files
* @work: dummy
*
- * Walk the LRU list and close any entries that have not been used since
+ * Walk the LRU list and destroy any entries that have not been used since
* the last scan.
*/
static void
@@ -892,7 +884,7 @@ __nfsd_file_cache_purge(struct net *net)
while (!IS_ERR_OR_NULL(nf)) {
if (net && nf->nf_net != net)
continue;
- nfsd_file_unhash_and_dispose(nf, &dispose);
+ nfsd_file_unhash_and_queue(nf, &dispose);
nf = rhashtable_walk_next(&iter);
}
@@ -1093,11 +1085,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
}
open_retry = false;
- nfsd_file_put_noref(nf);
+ __nfsd_file_put(nf);
goto retry;
}
- nfsd_file_lru_remove(nf);
this_cpu_inc(nfsd_file_cache_hits);
status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
@@ -1107,7 +1098,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
this_cpu_inc(nfsd_file_acquisitions);
*pnf = nf;
} else {
- nfsd_file_put(nf);
+ __nfsd_file_put(nf);
nf = NULL;
}
@@ -1134,7 +1125,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
* then unhash.
*/
if (status != nfs_ok || key.inode->i_nlink == 0)
- nfsd_file_unhash_and_put(nf);
+ nfsd_file_unhash(nf);
clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
smp_mb__after_atomic();
wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index b09ab4f92d43..a44ded06af87 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -903,10 +903,11 @@ DEFINE_EVENT(nfsd_file_class, name, \
TP_PROTO(struct nfsd_file *nf), \
TP_ARGS(nf))
-DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
-DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
TRACE_EVENT(nfsd_file_alloc,
TP_PROTO(
--
2.37.3
When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
so that we can be ready to close it out when the time comes.
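IOW, when the last reference is successfully handed off to the LRU, the put
path now does roughly (sketch of the hunk below):

	if (nfsd_file_lru_add(nf)) {
		/* kick off WB_SYNC_NONE writeback; don't wait for it */
		nfsd_file_flush(nf);	/* filemap_flush(file->f_mapping) */
		return;
	}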
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
1 file changed, 31 insertions(+), 6 deletions(-)
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index d2bbded805d4..491d3d9a1870 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
}
static void
-nfsd_file_flush(struct nfsd_file *nf)
+nfsd_file_fsync(struct nfsd_file *nf)
{
struct file *file = nf->nf_file;
@@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
}
+static void
+nfsd_file_flush(struct nfsd_file *nf)
+{
+ struct file *file = nf->nf_file;
+ unsigned long nrpages;
+
+ if (!file || !(file->f_mode & FMODE_WRITE))
+ return;
+
+ nrpages = file->f_mapping->nrpages;
+ if (nrpages) {
+ this_cpu_add(nfsd_file_pages_flushed, nrpages);
+ filemap_flush(file->f_mapping);
+ }
+}
+
static void
nfsd_file_free(struct nfsd_file *nf)
{
@@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
this_cpu_inc(nfsd_file_releases);
this_cpu_add(nfsd_file_total_age, age);
- nfsd_file_flush(nf);
+ nfsd_file_fsync(nf);
if (nf->nf_mark)
nfsd_file_mark_put(nf->nf_mark);
@@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
/*
- * If this is the last reference (nf_ref == 1), then transfer
- * it to the LRU. If the add to the LRU fails, just put it as
- * usual.
+ * If this is the last reference (nf_ref == 1), then try
+ * to transfer it to the LRU.
+ */
+ if (refcount_dec_not_one(&nf->nf_ref))
+ return;
+
+ /*
+ * If the add to the list succeeds, try to kick off SYNC_NONE
+ * writeback. If the add fails, then just fall through to
+ * decrement as usual.
*/
- if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
+ if (nfsd_file_lru_add(nf)) {
+ nfsd_file_flush(nf);
return;
+ }
}
__nfsd_file_put(nf);
}
--
2.37.3
On Fri, 28 Oct 2022, Jeff Layton wrote:
> Currently, nfsd_files live on the LRU once they are added until they are
> unhashed. There's no need to keep ones that are actively in use there.
Is that true?
nfsd_file_do_acquire() calls nfsd_file_lru_remove()
Isn't that enough to keep the file off the lru while it is active?
Thanks,
NeilBrown
>
> Before incrementing the refcount, do a lockless check for nf_lru being
> empty. If it's not then attempt to remove the entry from the LRU. If
> that's successful, claim the LRU reference and return it. If the removal
> fails (or if the list_head was empty), then just increment the counter
> as we normally would.
>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/nfsd/filecache.c | 23 ++++++++++++++++++++---
> 1 file changed, 20 insertions(+), 3 deletions(-)
>
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index e63534f4b9f8..d2bbded805d4 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -420,14 +420,31 @@ nfsd_file_unhash(struct nfsd_file *nf)
> return false;
> }
>
> -struct nfsd_file *
> -nfsd_file_get(struct nfsd_file *nf)
> +static struct nfsd_file *
> +__nfsd_file_get(struct nfsd_file *nf)
> {
> if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> return nf;
> return NULL;
> }
>
> +struct nfsd_file *
> +nfsd_file_get(struct nfsd_file *nf)
> +{
> + /*
> + * Do a lockless list_empty check first, before attempting to
> + * remove it, so we can avoid the spinlock when it's not on the
> + * list.
> + *
> + * If we successfully remove it from the LRU, then we can just
> + * claim the LRU reference and return it. Otherwise, we need to
> + * bump the counter the old-fashioned way.
> + */
> + if (!list_empty(&nf->nf_lru) && nfsd_file_lru_remove(nf))
> + return nf;
> + return __nfsd_file_get(nf);
> +}
> +
> /**
> * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
> * @nf: nfsd_file to be unhashed and queued
> @@ -449,7 +466,7 @@ nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
> * to take a reference. If that fails, just ignore
> * the file altogether.
> */
> - if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
> + if (!nfsd_file_lru_remove(nf) && !__nfsd_file_get(nf))
> return false;
> list_add(&nf->nf_lru, dispose);
> return true;
> --
> 2.37.3
>
>
On Fri, 28 Oct 2022, Jeff Layton wrote:
> When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> so that we can be ready to close it out when the time comes.
>
> Signed-off-by: Jeff Layton <[email protected]>
This looks sensible.
Reviewed-by: NeilBrown <[email protected]>
Thanks,
NeilBrown
> ---
> fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
> 1 file changed, 31 insertions(+), 6 deletions(-)
>
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index d2bbded805d4..491d3d9a1870 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> }
>
> static void
> -nfsd_file_flush(struct nfsd_file *nf)
> +nfsd_file_fsync(struct nfsd_file *nf)
> {
> struct file *file = nf->nf_file;
>
> @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
> nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> }
>
> +static void
> +nfsd_file_flush(struct nfsd_file *nf)
> +{
> + struct file *file = nf->nf_file;
> + unsigned long nrpages;
> +
> + if (!file || !(file->f_mode & FMODE_WRITE))
> + return;
> +
> + nrpages = file->f_mapping->nrpages;
> + if (nrpages) {
> + this_cpu_add(nfsd_file_pages_flushed, nrpages);
> + filemap_flush(file->f_mapping);
> + }
> +}
> +
> static void
> nfsd_file_free(struct nfsd_file *nf)
> {
> @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
> this_cpu_inc(nfsd_file_releases);
> this_cpu_add(nfsd_file_total_age, age);
>
> - nfsd_file_flush(nf);
> + nfsd_file_fsync(nf);
>
> if (nf->nf_mark)
> nfsd_file_mark_put(nf->nf_mark);
> @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
>
> if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> /*
> - * If this is the last reference (nf_ref == 1), then transfer
> - * it to the LRU. If the add to the LRU fails, just put it as
> - * usual.
> + * If this is the last reference (nf_ref == 1), then try
> + * to transfer it to the LRU.
> + */
> + if (refcount_dec_not_one(&nf->nf_ref))
> + return;
> +
> + /*
> + * If the add to the list succeeds, try to kick off SYNC_NONE
> + * writeback. If the add fails, then just fall through to
> + * decrement as usual.
> */
> - if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> + if (nfsd_file_lru_add(nf)) {
> + nfsd_file_flush(nf);
> return;
> + }
> }
> __nfsd_file_put(nf);
> }
> --
> 2.37.3
>
>
On Fri, 28 Oct 2022, Jeff Layton wrote:
> The filecache refcounting is a bit non-standard for something searchable
> by RCU, in that we maintain a sentinel reference while it's hashed. This
> in turn requires that we have to do things differently in the "put"
> depending on whether it's hashed, which we believe to have led to races.
>
> There are other problems in here too. nfsd_file_close_inode_sync can end
> up freeing an nfsd_file while there are still outstanding references to
> it, and the handling
-EINTR ??? (you got interrupted and didn't finish the sentence?)
>
> Rework the code so that the refcount is what drives the lifecycle. When
> the refcount goes to zero, then unhash and rcu free the object.
>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/nfsd/filecache.c | 291 +++++++++++++++++++++-----------------------
> fs/nfsd/trace.h | 5 +-
> 2 files changed, 144 insertions(+), 152 deletions(-)
>
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index 98c6b5f51bc8..e63534f4b9f8 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -1,6 +1,12 @@
> // SPDX-License-Identifier: GPL-2.0
> /*
> * The NFSD open file cache.
> + *
> + * Each nfsd_file is created in response to client activity -- either regular
> + * file I/O for v2/v3, or opening a file for v4. Files opened via v4 are
> + * cleaned up as soon as their refcount goes to 0. Entries for v2/v3 are
> + * flagged with NFSD_FILE_GC. On their last put, they are added to the LRU for
> + * eventual disposal if they aren't used again within a short time period.
> */
>
> #include <linux/hash.h>
> @@ -302,31 +308,43 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> if (key->gc)
> __set_bit(NFSD_FILE_GC, &nf->nf_flags);
> nf->nf_inode = key->inode;
> - /* nf_ref is pre-incremented for hash table */
> - refcount_set(&nf->nf_ref, 2);
> + refcount_set(&nf->nf_ref, 1);
> nf->nf_may = key->need;
> nf->nf_mark = NULL;
> }
> return nf;
> }
>
> -static bool
> +static void
> +nfsd_file_flush(struct nfsd_file *nf)
> +{
> + struct file *file = nf->nf_file;
> +
> + if (!file || !(file->f_mode & FMODE_WRITE))
> + return;
> + this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
> + if (vfs_fsync(file, 1) != 0)
> + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> +}
> +
> +static void
> nfsd_file_free(struct nfsd_file *nf)
> {
> s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
> - bool flush = false;
> +
> + trace_nfsd_file_free(nf);
>
> this_cpu_inc(nfsd_file_releases);
> this_cpu_add(nfsd_file_total_age, age);
>
> - trace_nfsd_file_put_final(nf);
> + nfsd_file_flush(nf);
> +
> if (nf->nf_mark)
> nfsd_file_mark_put(nf->nf_mark);
> if (nf->nf_file) {
> get_file(nf->nf_file);
> filp_close(nf->nf_file, NULL);
> fput(nf->nf_file);
> - flush = true;
> }
>
> /*
> @@ -334,10 +352,9 @@ nfsd_file_free(struct nfsd_file *nf)
> * WARN and leak it to preserve system stability.
> */
> if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
> - return flush;
> + return;
>
> call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
> - return flush;
> }
>
> static bool
> @@ -363,29 +380,23 @@ nfsd_file_check_write_error(struct nfsd_file *nf)
> return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
> }
>
> -static void
> -nfsd_file_flush(struct nfsd_file *nf)
> -{
> - struct file *file = nf->nf_file;
> -
> - if (!file || !(file->f_mode & FMODE_WRITE))
> - return;
> - this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
> - if (vfs_fsync(file, 1) != 0)
> - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> -}
> -
> -static void nfsd_file_lru_add(struct nfsd_file *nf)
> +static bool nfsd_file_lru_add(struct nfsd_file *nf)
> {
> set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
> + if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> trace_nfsd_file_lru_add(nf);
> + return true;
> + }
> + return false;
> }
>
> -static void nfsd_file_lru_remove(struct nfsd_file *nf)
> +static bool nfsd_file_lru_remove(struct nfsd_file *nf)
> {
> - if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
> + if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
> trace_nfsd_file_lru_del(nf);
> + return true;
> + }
> + return false;
> }
>
> static void
> @@ -409,94 +420,89 @@ nfsd_file_unhash(struct nfsd_file *nf)
> return false;
> }
>
> -static void
> -nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> +struct nfsd_file *
> +nfsd_file_get(struct nfsd_file *nf)
> {
> - trace_nfsd_file_unhash_and_dispose(nf);
> + if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> + return nf;
> + return NULL;
> +}
> +
> +/**
> + * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
> + * @nf: nfsd_file to be unhashed and queued
> + * @dispose: list to which it should be queued
> + *
> + * Attempt to unhash a nfsd_file and queue it to the given list. Each file
> + * will have a reference held on behalf of the list. That reference may come
> + * from the LRU, or we may need to take one. If we can't get a reference,
> + * ignore it altogether.
> + */
> +static bool
> +nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
> +{
> + trace_nfsd_file_unhash_and_queue(nf);
> if (nfsd_file_unhash(nf)) {
> - /* caller must call nfsd_file_dispose_list() later */
> - nfsd_file_lru_remove(nf);
> + /*
> + * If we remove it from the LRU, then just use that
> + * reference for the dispose list. Otherwise, we need
> + * to take a reference. If that fails, just ignore
> + * the file altogether.
> + */
> + if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
> + return false;
> list_add(&nf->nf_lru, dispose);
> + return true;
> }
> + return false;
> }
>
> -static void
> -nfsd_file_put_noref(struct nfsd_file *nf)
> +static bool
> +__nfsd_file_put(struct nfsd_file *nf)
The return value of this function is never tested.
Maybe it should return void.
Further, I don't think this is a useful abstraction.
I would rather move the refcount_dec_and_test to the caller, and move
the lru_remove and unhash into nfsd_file_free.
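Something like this (untested):

	static void
	nfsd_file_free(struct nfsd_file *nf)
	{
		nfsd_file_lru_remove(nf);
		nfsd_file_unhash(nf);
		/* ... existing flush/filp_close/call_rcu body ... */
	}

and then callers only need:

	if (refcount_dec_and_test(&nf->nf_ref))
		nfsd_file_free(nf);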
> {
> - trace_nfsd_file_put(nf);
> -
> if (refcount_dec_and_test(&nf->nf_ref)) {
> - WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
> - nfsd_file_lru_remove(nf);
> + nfsd_file_unhash(nf);
> nfsd_file_free(nf);
> + return true;
> }
> + return false;
> }
>
> -static void
> -nfsd_file_unhash_and_put(struct nfsd_file *nf)
> -{
> - if (nfsd_file_unhash(nf))
> - nfsd_file_put_noref(nf);
> -}
> -
> +/**
> + * nfsd_file_put - put the reference to a nfsd_file
> + * @nf: nfsd_file of which to put the reference
> + *
> + * Put a reference to a nfsd_file. In the v4 case, we just put the
> + * reference immediately. In the v2/3 case, if the reference would be
> + * the last one, then put it on the LRU instead to be cleaned up later.
> + */
> void
> nfsd_file_put(struct nfsd_file *nf)
> {
> - might_sleep();
> -
> - if (test_bit(NFSD_FILE_GC, &nf->nf_flags))
> - nfsd_file_lru_add(nf);
> - else if (refcount_read(&nf->nf_ref) == 2)
> - nfsd_file_unhash_and_put(nf);
> -
> - if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> - nfsd_file_flush(nf);
> - nfsd_file_put_noref(nf);
> - } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> - nfsd_file_put_noref(nf);
> - nfsd_file_schedule_laundrette();
> - } else
> - nfsd_file_put_noref(nf);
> -}
> -
> -struct nfsd_file *
> -nfsd_file_get(struct nfsd_file *nf)
> -{
> - if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> - return nf;
> - return NULL;
> -}
> -
> -static void
> -nfsd_file_dispose_list(struct list_head *dispose)
> -{
> - struct nfsd_file *nf;
> + trace_nfsd_file_put(nf);
>
> - while(!list_empty(dispose)) {
> - nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> - list_del_init(&nf->nf_lru);
> - nfsd_file_flush(nf);
> - nfsd_file_put_noref(nf);
> + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
I would prefer this included a test on NFSD_FILE_HASHED as well so that
if the file isn't hashed, we don't consider it for the lru.
This would mean we can simply call nfsd_file_put() for things on the
dispose list, rather than needing __nfsd_file_put().
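i.e. something like (untested):

	/* only hashed GC files are candidates for the LRU */
	if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
	    test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
		/* ... transfer the last reference to the LRU as below ... */
	}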
> + /*
> + * If this is the last reference (nf_ref == 1), then transfer
> + * it to the LRU. If the add to the LRU fails, just put it as
> + * usual.
> + */
> + if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> + return;
> }
> + __nfsd_file_put(nf);
As suggested above, this would become
if (refcount_dec_and_test(&nf->nf_ref))
nfsd_file_free(nf);
> }
>
> static void
> -nfsd_file_dispose_list_sync(struct list_head *dispose)
> +nfsd_file_dispose_list(struct list_head *dispose)
> {
> - bool flush = false;
> struct nfsd_file *nf;
>
> while(!list_empty(dispose)) {
> nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> list_del_init(&nf->nf_lru);
> - nfsd_file_flush(nf);
> - if (!refcount_dec_and_test(&nf->nf_ref))
> - continue;
> - if (nfsd_file_free(nf))
> - flush = true;
> + nfsd_file_free(nf);
> }
> - if (flush)
> - flush_delayed_fput();
> }
>
> static void
> @@ -566,21 +572,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> struct list_head *head = arg;
> struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
>
> - /*
> - * Do a lockless refcount check. The hashtable holds one reference, so
> - * we look to see if anything else has a reference, or if any have
> - * been put since the shrinker last ran. Those don't get unhashed and
> - * released.
> - *
> - * Note that in the put path, we set the flag and then decrement the
> - * counter. Here we check the counter and then test and clear the flag.
> - * That order is deliberate to ensure that we can do this locklessly.
> - */
> - if (refcount_read(&nf->nf_ref) > 1) {
> - list_lru_isolate(lru, &nf->nf_lru);
> - trace_nfsd_file_gc_in_use(nf);
> - return LRU_REMOVED;
> - }
> + /* We should only be dealing with v2/3 entries here */
> + WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
>
> /*
> * Don't throw out files that are still undergoing I/O or
> @@ -591,40 +584,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> return LRU_SKIP;
> }
>
> + /* If it was recently added to the list, skip it */
> if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
> trace_nfsd_file_gc_referenced(nf);
> return LRU_ROTATE;
> }
>
> - if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> - trace_nfsd_file_gc_hashed(nf);
> - return LRU_SKIP;
> + /*
> + * Put the reference held on behalf of the LRU. If it wasn't the last
> + * one, then just remove it from the LRU and ignore it.
> + */
> + if (!refcount_dec_and_test(&nf->nf_ref)) {
> + trace_nfsd_file_gc_in_use(nf);
> + list_lru_isolate(lru, &nf->nf_lru);
> + return LRU_REMOVED;
> }
>
> + /* Refcount went to zero. Unhash it and queue it to the dispose list */
> + nfsd_file_unhash(nf);
> list_lru_isolate_move(lru, &nf->nf_lru, head);
> this_cpu_inc(nfsd_file_evictions);
> trace_nfsd_file_gc_disposed(nf);
> return LRU_REMOVED;
> }
>
> -/*
> - * Unhash items on @dispose immediately, then queue them on the
> - * disposal workqueue to finish releasing them in the background.
> - *
> - * cel: Note that between the time list_lru_shrink_walk runs and
> - * now, these items are in the hash table but marked unhashed.
> - * Why release these outside of lru_cb ? There's no lock ordering
> - * problem since lru_cb currently takes no lock.
> - */
> -static void nfsd_file_gc_dispose_list(struct list_head *dispose)
> -{
> - struct nfsd_file *nf;
> -
> - list_for_each_entry(nf, dispose, nf_lru)
> - nfsd_file_hash_remove(nf);
> - nfsd_file_dispose_list_delayed(dispose);
> -}
> -
> static void
> nfsd_file_gc(void)
> {
> @@ -634,7 +617,7 @@ nfsd_file_gc(void)
> ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
> &dispose, list_lru_count(&nfsd_file_lru));
> trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
> - nfsd_file_gc_dispose_list(&dispose);
> + nfsd_file_dispose_list_delayed(&dispose);
> }
>
> static void
> @@ -659,7 +642,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
> ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
> nfsd_file_lru_cb, &dispose);
> trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
> - nfsd_file_gc_dispose_list(&dispose);
> + nfsd_file_dispose_list_delayed(&dispose);
> return ret;
> }
>
> @@ -670,8 +653,11 @@ static struct shrinker nfsd_file_shrinker = {
> };
>
> /*
> - * Find all cache items across all net namespaces that match @inode and
> - * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
> + * Find all cache items across all net namespaces that match @inode, unhash
> + * them, take references and then put them on @dispose if that was successful.
> + *
> + * The nfsd_file objects on the list will be unhashed, and each will have a
> + * reference taken.
> */
> static unsigned int
> __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> @@ -689,52 +675,58 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> nfsd_file_rhash_params);
> if (!nf)
> break;
> - nfsd_file_unhash_and_dispose(nf, dispose);
> - count++;
> +
> + if (nfsd_file_unhash_and_queue(nf, dispose))
> + count++;
> } while (1);
> rcu_read_unlock();
> return count;
> }
>
> /**
> - * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> * @inode: inode of the file to attempt to remove
> *
> - * Unhash and put, then flush and fput all cache items associated with @inode.
> + * Unhash and put all cache item associated with @inode.
> */
> -void
> -nfsd_file_close_inode_sync(struct inode *inode)
> +static unsigned int
> +nfsd_file_close_inode(struct inode *inode)
> {
> - LIST_HEAD(dispose);
> + struct nfsd_file *nf;
> unsigned int count;
> + LIST_HEAD(dispose);
>
> count = __nfsd_file_close_inode(inode, &dispose);
> - trace_nfsd_file_close_inode_sync(inode, count);
> - nfsd_file_dispose_list_sync(&dispose);
> + trace_nfsd_file_close_inode(inode, count);
> + if (count) {
> + while(!list_empty(&dispose)) {
> + nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
> + list_del_init(&nf->nf_lru);
> + trace_nfsd_file_closing(nf);
> + __nfsd_file_put(nf);
If nfsd_file_put() didn't add unhashed files to the lru, this could just
be nfsd_file_put().
> + }
> + }
> + return count;
> }
>
> /**
> - * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> * @inode: inode of the file to attempt to remove
> *
> - * Unhash and put all cache item associated with @inode.
> + * Unhash and put, then flush and fput all cache items associated with @inode.
> */
> -static void
> -nfsd_file_close_inode(struct inode *inode)
> +void
> +nfsd_file_close_inode_sync(struct inode *inode)
> {
> - LIST_HEAD(dispose);
> - unsigned int count;
> -
> - count = __nfsd_file_close_inode(inode, &dispose);
> - trace_nfsd_file_close_inode(inode, count);
> - nfsd_file_dispose_list_delayed(&dispose);
> + if (nfsd_file_close_inode(inode))
> + flush_delayed_fput();
> }
>
> /**
> * nfsd_file_delayed_close - close unused nfsd_files
> * @work: dummy
> *
> - * Walk the LRU list and close any entries that have not been used since
> + * Walk the LRU list and destroy any entries that have not been used since
> * the last scan.
> */
> static void
> @@ -892,7 +884,7 @@ __nfsd_file_cache_purge(struct net *net)
> while (!IS_ERR_OR_NULL(nf)) {
> if (net && nf->nf_net != net)
> continue;
> - nfsd_file_unhash_and_dispose(nf, &dispose);
> + nfsd_file_unhash_and_queue(nf, &dispose);
> nf = rhashtable_walk_next(&iter);
> }
>
> @@ -1093,11 +1085,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> goto out;
> }
> open_retry = false;
> - nfsd_file_put_noref(nf);
> + __nfsd_file_put(nf);
This nf is not hashed, and I think it has no other reference. So we
could use nfsd_file_free() - but nfsd_file_put() would be just as good
and safer.
> goto retry;
> }
>
> - nfsd_file_lru_remove(nf);
Hmmm... why not remove it from the lru? I guess this justifies patch 2/3,
but it might be cleaner to make this
if (nfsd_file_lru_remove(nf))
	nfsd_file_put(nf);
??
> this_cpu_inc(nfsd_file_cache_hits);
>
> status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
> @@ -1107,7 +1098,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> this_cpu_inc(nfsd_file_acquisitions);
> *pnf = nf;
> } else {
> - nfsd_file_put(nf);
> + __nfsd_file_put(nf);
I don't see the justification for this change.
If status == nfserr_jukebox, then it is OK.
If status is whatever we might get from break_lease(), then it seems
wrong.
If we modify nfsd_file_put() as I suggest, it will handle both cases.
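i.e. keep the plain put here (untested):

		nfsd_file_put(nf);	/* copes with both jukebox and lease-break errors */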
> nf = NULL;
> }
>
> @@ -1134,7 +1125,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> * then unhash.
> */
> if (status != nfs_ok || key.inode->i_nlink == 0)
> - nfsd_file_unhash_and_put(nf);
> + nfsd_file_unhash(nf);
> clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
> smp_mb__after_atomic();
> wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index b09ab4f92d43..a44ded06af87 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -903,10 +903,11 @@ DEFINE_EVENT(nfsd_file_class, name, \
> TP_PROTO(struct nfsd_file *nf), \
> TP_ARGS(nf))
>
> -DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
> +DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
> DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
> DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
> -DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose);
> +DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
> +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
>
> TRACE_EVENT(nfsd_file_alloc,
> TP_PROTO(
> --
> 2.37.3
>
>
Thanks,
NeilBrown
On Fri, 2022-10-28 at 09:20 +1100, NeilBrown wrote:
> On Fri, 28 Oct 2022, Jeff Layton wrote:
> > Currently, nfsd_files live on the LRU once they are added until they are
> > unhashed. There's no need to keep ones that are actively in use there.
>
> Is that true?
> nfsd_file_do_acquire() calls nfsd_file_lru_remove()
> Isn't that enough to keep the file off the lru while it is active?
>
> Thanks,
> NeilBrown
>
After patch #1, it doesn't call that anymore. That's probably a (minor)
regression then.
After patch #1, the LRU holds a reference. If you successfully remove it
from the LRU, you need to transfer or put that reference. Doing the LRU
handling in the get and put routines seems more natural, I think.
Maybe I just need to squash this patch into #1?
>
> >
> > Before incrementing the refcount, do a lockless check for nf_lru being
> > empty. If it's not then attempt to remove the entry from the LRU. If
> > that's successful, claim the LRU reference and return it. If the removal
> > fails (or if the list_head was empty), then just increment the counter
> > as we normally would.
> >
> > Signed-off-by: Jeff Layton <[email protected]>
> > ---
> > fs/nfsd/filecache.c | 23 ++++++++++++++++++++---
> > 1 file changed, 20 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index e63534f4b9f8..d2bbded805d4 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -420,14 +420,31 @@ nfsd_file_unhash(struct nfsd_file *nf)
> > return false;
> > }
> >
> > -struct nfsd_file *
> > -nfsd_file_get(struct nfsd_file *nf)
> > +static struct nfsd_file *
> > +__nfsd_file_get(struct nfsd_file *nf)
> > {
> > if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> > return nf;
> > return NULL;
> > }
> >
> > +struct nfsd_file *
> > +nfsd_file_get(struct nfsd_file *nf)
> > +{
> > + /*
> > + * Do a lockless list_empty check first, before attempting to
> > + * remove it, so we can avoid the spinlock when it's not on the
> > + * list.
> > + *
> > + * If we successfully remove it from the LRU, then we can just
> > + * claim the LRU reference and return it. Otherwise, we need to
> > + * bump the counter the old-fashioned way.
> > + */
> > + if (!list_empty(&nf->nf_lru) && nfsd_file_lru_remove(nf))
> > + return nf;
> > + return __nfsd_file_get(nf);
> > +}
> > +
> > /**
> > * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
> > * @nf: nfsd_file to be unhashed and queued
> > @@ -449,7 +466,7 @@ nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
> > * to take a reference. If that fails, just ignore
> > * the file altogether.
> > */
> > - if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
> > + if (!nfsd_file_lru_remove(nf) && !__nfsd_file_get(nf))
> > return false;
> > list_add(&nf->nf_lru, dispose);
> > return true;
> > --
> > 2.37.3
> >
> >
--
Jeff Layton <[email protected]>
On Fri, 28 Oct 2022, Jeff Layton wrote:
> On Fri, 2022-10-28 at 09:20 +1100, NeilBrown wrote:
> > On Fri, 28 Oct 2022, Jeff Layton wrote:
> > > Currently, nfsd_files live on the LRU once they are added until they are
> > > unhashed. There's no need to keep ones that are actively in use there.
> >
> > Is that true?
> > nfsd_file_do_acquire() calls nfsd_file_lru_remove()
> > Isn't that enough to keep the file off the lru while it is active?
> >
> > Thanks,
> > NeilBrown
> >
>
> After patch #1, it doesn't call that anymore. That's probably a (minor)
> regression then.
Yes, I eventually found that - thanks.
>
> After patch #1, the LRU holds a reference. If you successfully remove it
> from the LRU, you need to transfer or put that reference. Doing the LRU
> handling in the get and put routines seems more natural, I think.
Maybe. But then you need a __get as well as a get.
Though it might seem asymmetric, I would prefer removing from the lru in
'acquire' and adding to the lru in put.
>
> Maybe I just need to squash this patch into #1?
Or do the "put" if lru_remove succeeds in the first patch. Then revise
it all in the second.
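e.g. in nfsd_file_do_acquire() (untested):

	if (nfsd_file_lru_remove(nf))
		nfsd_file_put(nf);	/* drop the reference the LRU was holding */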
Thanks,
NeilBrown
>
> >
> > >
> > > Before incrementing the refcount, do a lockless check for nf_lru being
> > > empty. If it's not then attempt to remove the entry from the LRU. If
> > > that's successful, claim the LRU reference and return it. If the removal
> > > fails (or if the list_head was empty), then just increment the counter
> > > as we normally would.
> > >
> > > Signed-off-by: Jeff Layton <[email protected]>
> > > ---
> > > fs/nfsd/filecache.c | 23 ++++++++++++++++++++---
> > > 1 file changed, 20 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > > index e63534f4b9f8..d2bbded805d4 100644
> > > --- a/fs/nfsd/filecache.c
> > > +++ b/fs/nfsd/filecache.c
> > > @@ -420,14 +420,31 @@ nfsd_file_unhash(struct nfsd_file *nf)
> > > return false;
> > > }
> > >
> > > -struct nfsd_file *
> > > -nfsd_file_get(struct nfsd_file *nf)
> > > +static struct nfsd_file *
> > > +__nfsd_file_get(struct nfsd_file *nf)
> > > {
> > > if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> > > return nf;
> > > return NULL;
> > > }
> > >
> > > +struct nfsd_file *
> > > +nfsd_file_get(struct nfsd_file *nf)
> > > +{
> > > + /*
> > > + * Do a lockless list_empty check first, before attempting to
> > > + * remove it, so we can avoid the spinlock when it's not on the
> > > + * list.
> > > + *
> > > + * If we successfully remove it from the LRU, then we can just
> > > + * claim the LRU reference and return it. Otherwise, we need to
> > > + * bump the counter the old-fashioned way.
> > > + */
> > > + if (!list_empty(&nf->nf_lru) && nfsd_file_lru_remove(nf))
> > > + return nf;
> > > + return __nfsd_file_get(nf);
> > > +}
> > > +
> > > /**
> > > * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
> > > * @nf: nfsd_file to be unhashed and queued
> > > @@ -449,7 +466,7 @@ nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
> > > * to take a reference. If that fails, just ignore
> > > * the file altogether.
> > > */
> > > - if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
> > > + if (!nfsd_file_lru_remove(nf) && !__nfsd_file_get(nf))
> > > return false;
> > > list_add(&nf->nf_lru, dispose);
> > > return true;
> > > --
> > > 2.37.3
> > >
> > >
>
> --
> Jeff Layton <[email protected]>
>
> On Oct 27, 2022, at 6:55 PM, NeilBrown <[email protected]> wrote:
>
> On Fri, 28 Oct 2022, Jeff Layton wrote:
>> On Fri, 2022-10-28 at 09:20 +1100, NeilBrown wrote:
>>> On Fri, 28 Oct 2022, Jeff Layton wrote:
>>>> Currently, nfsd_files live on the LRU once they are added until they are
>>>> unhashed. There's no need to keep ones that are actively in use there.
>>>
>>> Is that true?
>>> nfsd_file_do_acquire() calls nfsd_file_lru_remove()
>>> Isn't that enough to keep the file off the lru while it is active?
>>>
>>> Thanks,
>>> NeilBrown
>>>
>>
>> After patch #1, it doesn't call that anymore. That's probably a (minor)
>> regression then.
>
> Yes, I eventually found that - thanks.
>
>>
>> After patch #1, the LRU holds a reference. If you successfully remove it
>> from the LRU, you need to transfer or put that reference. Doing the LRU
>> handling in the get and put routines seems more natural, I think.
>
> Maybe. But then you need a __get as well as a get.
> Though it might seem asymmetric, I would prefer removing from the lru in
> 'acquire' and adding to the lru in put.
That's exactly the design introduced by commit 4a0e73e635e3
("NFSD: Leave open files out of the filecache LRU"). I also
would like to keep that behavior -- that's what a real LRU
is for.
>> Maybe I just need to squash this patch into #1?
>
> Or do the "put" if lru_remove succeeds in the first patch. Then revise
> it all in the second.
>
> Thanks,
> NeilBrown
>
>
>>
>>>
>>>>
>>>> Before incrementing the refcount, do a lockless check for nf_lru being
>>>> empty. If it's not then attempt to remove the entry from the LRU. If
>>>> that's successful, claim the LRU reference and return it. If the removal
>>>> fails (or if the list_head was empty), then just increment the counter
>>>> as we normally would.
>>>>
>>>> Signed-off-by: Jeff Layton <[email protected]>
>>>> ---
>>>> fs/nfsd/filecache.c | 23 ++++++++++++++++++++---
>>>> 1 file changed, 20 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
>>>> index e63534f4b9f8..d2bbded805d4 100644
>>>> --- a/fs/nfsd/filecache.c
>>>> +++ b/fs/nfsd/filecache.c
>>>> @@ -420,14 +420,31 @@ nfsd_file_unhash(struct nfsd_file *nf)
>>>> return false;
>>>> }
>>>>
>>>> -struct nfsd_file *
>>>> -nfsd_file_get(struct nfsd_file *nf)
>>>> +static struct nfsd_file *
>>>> +__nfsd_file_get(struct nfsd_file *nf)
>>>> {
>>>> if (likely(refcount_inc_not_zero(&nf->nf_ref)))
>>>> return nf;
>>>> return NULL;
>>>> }
>>>>
>>>> +struct nfsd_file *
>>>> +nfsd_file_get(struct nfsd_file *nf)
>>>> +{
>>>> + /*
>>>> + * Do a lockless list_empty check first, before attempting to
>>>> + * remove it, so we can avoid the spinlock when it's not on the
>>>> + * list.
>>>> + *
>>>> + * If we successfully remove it from the LRU, then we can just
>>>> + * claim the LRU reference and return it. Otherwise, we need to
>>>> + * bump the counter the old-fashioned way.
>>>> + */
>>>> + if (!list_empty(&nf->nf_lru) && nfsd_file_lru_remove(nf))
>>>> + return nf;
>>>> + return __nfsd_file_get(nf);
>>>> +}
>>>> +
>>>> /**
>>>> * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
>>>> * @nf: nfsd_file to be unhashed and queued
>>>> @@ -449,7 +466,7 @@ nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
>>>> * to take a reference. If that fails, just ignore
>>>> * the file altogether.
>>>> */
>>>> - if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
>>>> + if (!nfsd_file_lru_remove(nf) && !__nfsd_file_get(nf))
>>>> return false;
>>>> list_add(&nf->nf_lru, dispose);
>>>> return true;
>>>> --
>>>> 2.37.3
>>>>
>>>>
>>
>> --
>> Jeff Layton <[email protected]>
>>
--
Chuck Lever
On Fri, 2022-10-28 at 09:51 +1100, NeilBrown wrote:
> On Fri, 28 Oct 2022, Jeff Layton wrote:
> > The filecache refcounting is a bit non-standard for something searchable
> > by RCU, in that we maintain a sentinel reference while it's hashed. This
> > in turn requires that we have to do things differently in the "put"
> > depending on whether it's hashed, which we believe to have led to races.
> >
> > There are other problems in here too. nfsd_file_close_inode_sync can end
> > up freeing an nfsd_file while there are still outstanding references to
> > it, and the handling
>
> -EINTR ??? (you got interrupted and didn't finish the sentence?)
>
Yes, I meant to go back and flesh that out, and forgot before posting.
> >
> > Rework the code so that the refcount is what drives the lifecycle. When
> > the refcount goes to zero, then unhash and rcu free the object.
> >
> > Signed-off-by: Jeff Layton <[email protected]>
> > ---
> > fs/nfsd/filecache.c | 291 +++++++++++++++++++++-----------------------
> > fs/nfsd/trace.h | 5 +-
> > 2 files changed, 144 insertions(+), 152 deletions(-)
> >
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index 98c6b5f51bc8..e63534f4b9f8 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -1,6 +1,12 @@
> > // SPDX-License-Identifier: GPL-2.0
> > /*
> > * The NFSD open file cache.
> > + *
> > + * Each nfsd_file is created in response to client activity -- either regular
> > + * file I/O for v2/v3, or opening a file for v4. Files opened via v4 are
> > + * cleaned up as soon as their refcount goes to 0. Entries for v2/v3 are
> > + * flagged with NFSD_FILE_GC. On their last put, they are added to the LRU for
> > + * eventual disposal if they aren't used again within a short time period.
> > */
> >
> > #include <linux/hash.h>
> > @@ -302,31 +308,43 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > if (key->gc)
> > __set_bit(NFSD_FILE_GC, &nf->nf_flags);
> > nf->nf_inode = key->inode;
> > - /* nf_ref is pre-incremented for hash table */
> > - refcount_set(&nf->nf_ref, 2);
> > + refcount_set(&nf->nf_ref, 1);
> > nf->nf_may = key->need;
> > nf->nf_mark = NULL;
> > }
> > return nf;
> > }
> >
> > -static bool
> > +static void
> > +nfsd_file_flush(struct nfsd_file *nf)
> > +{
> > + struct file *file = nf->nf_file;
> > +
> > + if (!file || !(file->f_mode & FMODE_WRITE))
> > + return;
> > + this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
> > + if (vfs_fsync(file, 1) != 0)
> > + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > +}
> > +
> > +static void
> > nfsd_file_free(struct nfsd_file *nf)
> > {
> > s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
> > - bool flush = false;
> > +
> > + trace_nfsd_file_free(nf);
> >
> > this_cpu_inc(nfsd_file_releases);
> > this_cpu_add(nfsd_file_total_age, age);
> >
> > - trace_nfsd_file_put_final(nf);
> > + nfsd_file_flush(nf);
> > +
> > if (nf->nf_mark)
> > nfsd_file_mark_put(nf->nf_mark);
> > if (nf->nf_file) {
> > get_file(nf->nf_file);
> > filp_close(nf->nf_file, NULL);
> > fput(nf->nf_file);
> > - flush = true;
> > }
> >
> > /*
> > @@ -334,10 +352,9 @@ nfsd_file_free(struct nfsd_file *nf)
> > * WARN and leak it to preserve system stability.
> > */
> > if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
> > - return flush;
> > + return;
> >
> > call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
> > - return flush;
> > }
> >
> > static bool
> > @@ -363,29 +380,23 @@ nfsd_file_check_write_error(struct nfsd_file *nf)
> > return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
> > }
> >
> > -static void
> > -nfsd_file_flush(struct nfsd_file *nf)
> > -{
> > - struct file *file = nf->nf_file;
> > -
> > - if (!file || !(file->f_mode & FMODE_WRITE))
> > - return;
> > - this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
> > - if (vfs_fsync(file, 1) != 0)
> > - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > -}
> > -
> > -static void nfsd_file_lru_add(struct nfsd_file *nf)
> > +static bool nfsd_file_lru_add(struct nfsd_file *nf)
> > {
> > set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> > - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
> > + if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> > trace_nfsd_file_lru_add(nf);
> > + return true;
> > + }
> > + return false;
> > }
> >
> > -static void nfsd_file_lru_remove(struct nfsd_file *nf)
> > +static bool nfsd_file_lru_remove(struct nfsd_file *nf)
> > {
> > - if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
> > + if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
> > trace_nfsd_file_lru_del(nf);
> > + return true;
> > + }
> > + return false;
> > }
> >
> > static void
> > @@ -409,94 +420,89 @@ nfsd_file_unhash(struct nfsd_file *nf)
> > return false;
> > }
> >
> > -static void
> > -nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> > +struct nfsd_file *
> > +nfsd_file_get(struct nfsd_file *nf)
> > {
> > - trace_nfsd_file_unhash_and_dispose(nf);
> > + if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> > + return nf;
> > + return NULL;
> > +}
> > +
> > +/**
> > + * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
> > + * @nf: nfsd_file to be unhashed and queued
> > + * @dispose: list to which it should be queued
> > + *
> > + * Attempt to unhash a nfsd_file and queue it to the given list. Each file
> > + * will have a reference held on behalf of the list. That reference may come
> > + * from the LRU, or we may need to take one. If we can't get a reference,
> > + * ignore it altogether.
> > + */
> > +static bool
> > +nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
> > +{
> > + trace_nfsd_file_unhash_and_queue(nf);
> > if (nfsd_file_unhash(nf)) {
> > - /* caller must call nfsd_file_dispose_list() later */
> > - nfsd_file_lru_remove(nf);
> > + /*
> > + * If we remove it from the LRU, then just use that
> > + * reference for the dispose list. Otherwise, we need
> > + * to take a reference. If that fails, just ignore
> > + * the file altogether.
> > + */
> > + if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
> > + return false;
> > list_add(&nf->nf_lru, dispose);
> > + return true;
> > }
> > + return false;
> > }
> >
> > -static void
> > -nfsd_file_put_noref(struct nfsd_file *nf)
> > +static bool
> > +__nfsd_file_put(struct nfsd_file *nf)
>
> The return value of this function is never tested.
> Maybe it should return void.
>
> Further, I don't think this is a useful abstraction.
> I would rather move the refcount_dec_and_test to the caller, and move
> the lru_remove and unhash into nfsd_file_free.
>
Ok, sounds reasonable.
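Something like this on the free side then (untested sketch, using the
existing helper names; the rest of the teardown stays as-is):

	static void
	nfsd_file_free(struct nfsd_file *nf)
	{
		/* now done here instead of in the put path */
		nfsd_file_lru_remove(nf);
		nfsd_file_unhash(nf);

		/* ...followed by the existing teardown: fsync, mark put,
		 * filp_close/fput, call_rcu... */
	}

...with the callers doing the refcount_dec_and_test() themselves.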
> > {
> > - trace_nfsd_file_put(nf);
> > -
> > if (refcount_dec_and_test(&nf->nf_ref)) {
> > - WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
> > - nfsd_file_lru_remove(nf);
> > + nfsd_file_unhash(nf);
> > nfsd_file_free(nf);
> > + return true;
> > }
> > + return false;
> > }
> >
> > -static void
> > -nfsd_file_unhash_and_put(struct nfsd_file *nf)
> > -{
> > - if (nfsd_file_unhash(nf))
> > - nfsd_file_put_noref(nf);
> > -}
> > -
> > +/**
> > + * nfsd_file_put - put the reference to a nfsd_file
> > + * @nf: nfsd_file of which to put the reference
> > + *
> > + * Put a reference to a nfsd_file. In the v4 case, we just put the
> > + * reference immediately. In the v2/3 case, if the reference would be
> > + * the last one, then put it on the LRU instead to be cleaned up later.
> > + */
> > void
> > nfsd_file_put(struct nfsd_file *nf)
> > {
> > - might_sleep();
> > -
> > - if (test_bit(NFSD_FILE_GC, &nf->nf_flags))
> > - nfsd_file_lru_add(nf);
> > - else if (refcount_read(&nf->nf_ref) == 2)
> > - nfsd_file_unhash_and_put(nf);
> > -
> > - if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > - nfsd_file_flush(nf);
> > - nfsd_file_put_noref(nf);
> > - } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > - nfsd_file_put_noref(nf);
> > - nfsd_file_schedule_laundrette();
> > - } else
> > - nfsd_file_put_noref(nf);
> > -}
> > -
> > -struct nfsd_file *
> > -nfsd_file_get(struct nfsd_file *nf)
> > -{
> > - if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> > - return nf;
> > - return NULL;
> > -}
> > -
> > -static void
> > -nfsd_file_dispose_list(struct list_head *dispose)
> > -{
> > - struct nfsd_file *nf;
> > + trace_nfsd_file_put(nf);
> >
> > - while(!list_empty(dispose)) {
> > - nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > - list_del_init(&nf->nf_lru);
> > - nfsd_file_flush(nf);
> > - nfsd_file_put_noref(nf);
> > + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
>
> I would prefer this included a test on NFSD_FILE_HASHED as well so that
> if the file isn't hashed, we don't consider it for the lru.
> This would mean we can simply call nfsd_file_put() for things on the
> dispose list, rather than needing __nfsd_file_put()
>
I had an incorrectly reversed test for that in the previous version in
nfsd_file_lru_add and you mentioned that it was racy. Why would that not
be the case here?
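For reference, I take it you mean something like this in nfsd_file_put()
(untested sketch):

	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
	    test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
			return;
	}
	__nfsd_file_put(nf);

...and my worry is that the file could still be unhashed in between the
NFSD_FILE_HASHED test and the LRU add.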
> > + /*
> > + * If this is the last reference (nf_ref == 1), then transfer
> > + * it to the LRU. If the add to the LRU fails, just put it as
> > + * usual.
> > + */
> > + if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > + return;
> > }
> > + __nfsd_file_put(nf);
>
> As suggested above, this would become
> if (refcount_dec_and_test(&nf->nf_ref))
> nfsd_file_free(nf);
>
Ok.
> > }
> >
> > static void
> > -nfsd_file_dispose_list_sync(struct list_head *dispose)
> > +nfsd_file_dispose_list(struct list_head *dispose)
> > {
> > - bool flush = false;
> > struct nfsd_file *nf;
> >
> > while(!list_empty(dispose)) {
> > nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > list_del_init(&nf->nf_lru);
> > - nfsd_file_flush(nf);
> > - if (!refcount_dec_and_test(&nf->nf_ref))
> > - continue;
> > - if (nfsd_file_free(nf))
> > - flush = true;
> > + nfsd_file_free(nf);
> > }
> > - if (flush)
> > - flush_delayed_fput();
> > }
> >
> > static void
> > @@ -566,21 +572,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > struct list_head *head = arg;
> > struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
> >
> > - /*
> > - * Do a lockless refcount check. The hashtable holds one reference, so
> > - * we look to see if anything else has a reference, or if any have
> > - * been put since the shrinker last ran. Those don't get unhashed and
> > - * released.
> > - *
> > - * Note that in the put path, we set the flag and then decrement the
> > - * counter. Here we check the counter and then test and clear the flag.
> > - * That order is deliberate to ensure that we can do this locklessly.
> > - */
> > - if (refcount_read(&nf->nf_ref) > 1) {
> > - list_lru_isolate(lru, &nf->nf_lru);
> > - trace_nfsd_file_gc_in_use(nf);
> > - return LRU_REMOVED;
> > - }
> > + /* We should only be dealing with v2/3 entries here */
> > + WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
> >
> > /*
> > * Don't throw out files that are still undergoing I/O or
> > @@ -591,40 +584,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > return LRU_SKIP;
> > }
> >
> > + /* If it was recently added to the list, skip it */
> > if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
> > trace_nfsd_file_gc_referenced(nf);
> > return LRU_ROTATE;
> > }
> >
> > - if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > - trace_nfsd_file_gc_hashed(nf);
> > - return LRU_SKIP;
> > + /*
> > + * Put the reference held on behalf of the LRU. If it wasn't the last
> > + * one, then just remove it from the LRU and ignore it.
> > + */
> > + if (!refcount_dec_and_test(&nf->nf_ref)) {
> > + trace_nfsd_file_gc_in_use(nf);
> > + list_lru_isolate(lru, &nf->nf_lru);
> > + return LRU_REMOVED;
> > }
> >
> > + /* Refcount went to zero. Unhash it and queue it to the dispose list */
> > + nfsd_file_unhash(nf);
> > list_lru_isolate_move(lru, &nf->nf_lru, head);
> > this_cpu_inc(nfsd_file_evictions);
> > trace_nfsd_file_gc_disposed(nf);
> > return LRU_REMOVED;
> > }
> >
> > -/*
> > - * Unhash items on @dispose immediately, then queue them on the
> > - * disposal workqueue to finish releasing them in the background.
> > - *
> > - * cel: Note that between the time list_lru_shrink_walk runs and
> > - * now, these items are in the hash table but marked unhashed.
> > - * Why release these outside of lru_cb ? There's no lock ordering
> > - * problem since lru_cb currently takes no lock.
> > - */
> > -static void nfsd_file_gc_dispose_list(struct list_head *dispose)
> > -{
> > - struct nfsd_file *nf;
> > -
> > - list_for_each_entry(nf, dispose, nf_lru)
> > - nfsd_file_hash_remove(nf);
> > - nfsd_file_dispose_list_delayed(dispose);
> > -}
> > -
> > static void
> > nfsd_file_gc(void)
> > {
> > @@ -634,7 +617,7 @@ nfsd_file_gc(void)
> > ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
> > &dispose, list_lru_count(&nfsd_file_lru));
> > trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
> > - nfsd_file_gc_dispose_list(&dispose);
> > + nfsd_file_dispose_list_delayed(&dispose);
> > }
> >
> > static void
> > @@ -659,7 +642,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
> > ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
> > nfsd_file_lru_cb, &dispose);
> > trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
> > - nfsd_file_gc_dispose_list(&dispose);
> > + nfsd_file_dispose_list_delayed(&dispose);
> > return ret;
> > }
> >
> > @@ -670,8 +653,11 @@ static struct shrinker nfsd_file_shrinker = {
> > };
> >
> > /*
> > - * Find all cache items across all net namespaces that match @inode and
> > - * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
> > + * Find all cache items across all net namespaces that match @inode, unhash
> > + * them, take references and then put them on @dispose if that was successful.
> > + *
> > + * The nfsd_file objects on the list will be unhashed, and each will have a
> > + * reference taken.
> > */
> > static unsigned int
> > __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > @@ -689,52 +675,58 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > nfsd_file_rhash_params);
> > if (!nf)
> > break;
> > - nfsd_file_unhash_and_dispose(nf, dispose);
> > - count++;
> > +
> > + if (nfsd_file_unhash_and_queue(nf, dispose))
> > + count++;
> > } while (1);
> > rcu_read_unlock();
> > return count;
> > }
> >
> > /**
> > - * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> > + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> > * @inode: inode of the file to attempt to remove
> > *
> > - * Unhash and put, then flush and fput all cache items associated with @inode.
> > + * Unhash and put all cache items associated with @inode.
> > */
> > -void
> > -nfsd_file_close_inode_sync(struct inode *inode)
> > +static unsigned int
> > +nfsd_file_close_inode(struct inode *inode)
> > {
> > - LIST_HEAD(dispose);
> > + struct nfsd_file *nf;
> > unsigned int count;
> > + LIST_HEAD(dispose);
> >
> > count = __nfsd_file_close_inode(inode, &dispose);
> > - trace_nfsd_file_close_inode_sync(inode, count);
> > - nfsd_file_dispose_list_sync(&dispose);
> > + trace_nfsd_file_close_inode(inode, count);
> > + if (count) {
> > + while(!list_empty(&dispose)) {
> > + nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
> > + list_del_init(&nf->nf_lru);
> > + trace_nfsd_file_closing(nf);
> > + __nfsd_file_put(nf);
>
> If nfsd_file_put() didn't add unhashed files to the lru, this can be
> nfsd_file_put().
>
> > + }
> > + }
> > + return count;
> > }
> >
> > /**
> > - * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> > + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> > * @inode: inode of the file to attempt to remove
> > *
> > - * Unhash and put all cache item associated with @inode.
> > + * Unhash and put, then flush and fput all cache items associated with @inode.
> > */
> > -static void
> > -nfsd_file_close_inode(struct inode *inode)
> > +void
> > +nfsd_file_close_inode_sync(struct inode *inode)
> > {
> > - LIST_HEAD(dispose);
> > - unsigned int count;
> > -
> > - count = __nfsd_file_close_inode(inode, &dispose);
> > - trace_nfsd_file_close_inode(inode, count);
> > - nfsd_file_dispose_list_delayed(&dispose);
> > + if (nfsd_file_close_inode(inode))
> > + flush_delayed_fput();
> > }
> >
> > /**
> > * nfsd_file_delayed_close - close unused nfsd_files
> > * @work: dummy
> > *
> > - * Walk the LRU list and close any entries that have not been used since
> > + * Walk the LRU list and destroy any entries that have not been used since
> > * the last scan.
> > */
> > static void
> > @@ -892,7 +884,7 @@ __nfsd_file_cache_purge(struct net *net)
> > while (!IS_ERR_OR_NULL(nf)) {
> > if (net && nf->nf_net != net)
> > continue;
> > - nfsd_file_unhash_and_dispose(nf, &dispose);
> > + nfsd_file_unhash_and_queue(nf, &dispose);
> > nf = rhashtable_walk_next(&iter);
> > }
> >
> > @@ -1093,11 +1085,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > goto out;
> > }
> > open_retry = false;
> > - nfsd_file_put_noref(nf);
> > + __nfsd_file_put(nf);
>
> This nf is not hashed, and I think it has no other reference. So we
> could use nfsd_file_free() - but nfsd_file_put() would be just as good
> and safer.
>
> > goto retry;
> > }
> >
> > - nfsd_file_lru_remove(nf);
>
> Hmmm... why not remove from the lru. I guess this justifies patch 2/3,
> but it might be cleaner to make this
>
> if (nfsd_file_lru_remove(nf))
> nfsd_file_put(nf);
> ??
>
Removing from the LRU means putting a reference now. The last "put" of a
nfsd_file can be rather expensive (you might need to flush data, and
issue a close()).
In this particular codepath, that's not so much a danger, but avoiding
excess "put" calls is still a good thing to do. That's the main reason
I've tried to "transfer" references to and from the LRU where possible.
> > this_cpu_inc(nfsd_file_cache_hits);
> >
> > status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
> > @@ -1107,7 +1098,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > this_cpu_inc(nfsd_file_acquisitions);
> > *pnf = nf;
> > } else {
> > - nfsd_file_put(nf);
> > + __nfsd_file_put(nf);
>
> I don't see the justification for this change.
> If status == nfserr_jukebox, then it is OK.
> If status is whatever we might get from break_lease(), then it seems
> wrong.
> If we modify nfsd_file_put() as I suggest, it will handle both cases.
>
>
The justification is that when we're dealing with an error from an open,
we don't want to put the nfsd_file onto the LRU. So, a direct call to
__nfsd_file_put is what's needed here.
I'll plan to open-code those like you suggest in the next iteration.
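IOW, at those call sites, instead of calling __nfsd_file_put(nf):

	if (refcount_dec_and_test(&nf->nf_ref))
		nfsd_file_free(nf);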
> > nf = NULL;
> > }
> >
> > @@ -1134,7 +1125,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > * then unhash.
> > */
> > if (status != nfs_ok || key.inode->i_nlink == 0)
> > - nfsd_file_unhash_and_put(nf);
> > + nfsd_file_unhash(nf);
> > clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
> > smp_mb__after_atomic();
> > wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index b09ab4f92d43..a44ded06af87 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -903,10 +903,11 @@ DEFINE_EVENT(nfsd_file_class, name, \
> > TP_PROTO(struct nfsd_file *nf), \
> > TP_ARGS(nf))
> >
> > -DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
> > +DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
> > DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
> > DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
> > -DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose);
> > +DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
> > +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
> >
> > TRACE_EVENT(nfsd_file_alloc,
> > TP_PROTO(
> > --
> > 2.37.3
> >
> >
>
> Thanks,
> NeilBrown
--
Jeff Layton <[email protected]>
On Fri, 2022-10-28 at 09:25 +1100, NeilBrown wrote:
> On Fri, 28 Oct 2022, Jeff Layton wrote:
> > When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> > so that we can be ready to close it out when the time comes.
> >
> > Signed-off-by: Jeff Layton <[email protected]>
>
> This looks sensible.
> Reviewed-by: NeilBrown <[email protected]>
>
> Thanks,
> NeilBrown
>
>
> > ---
> > fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
> > 1 file changed, 31 insertions(+), 6 deletions(-)
> >
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index d2bbded805d4..491d3d9a1870 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > }
> >
> > static void
> > -nfsd_file_flush(struct nfsd_file *nf)
> > +nfsd_file_fsync(struct nfsd_file *nf)
> > {
> > struct file *file = nf->nf_file;
> >
> > @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
> > nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > }
> >
> > +static void
> > +nfsd_file_flush(struct nfsd_file *nf)
> > +{
> > + struct file *file = nf->nf_file;
> > + unsigned long nrpages;
> > +
> > + if (!file || !(file->f_mode & FMODE_WRITE))
> > + return;
> > +
> > + nrpages = file->f_mapping->nrpages;
> > + if (nrpages) {
> >
I may change this to:
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
I'm not sure here... Does nrpages count all of the pages in the mapping,
or just the dirty ones? I'm wondering if we're overcounting in
nfsd_file_pages_flushed?
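So roughly (sketch; with this check, nrpages would only be an upper
bound on what actually gets written back):

	static void
	nfsd_file_flush(struct nfsd_file *nf)
	{
		struct file *file = nf->nf_file;
		struct address_space *mapping;

		if (!file || !(file->f_mode & FMODE_WRITE))
			return;

		mapping = file->f_mapping;
		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
			this_cpu_add(nfsd_file_pages_flushed, mapping->nrpages);
			filemap_flush(mapping);
		}
	}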
> > + this_cpu_add(nfsd_file_pages_flushed, nrpages);
> > + filemap_flush(file->f_mapping);
> > + }
> > +}
> > +
> > static void
> > nfsd_file_free(struct nfsd_file *nf)
> > {
> > @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
> > this_cpu_inc(nfsd_file_releases);
> > this_cpu_add(nfsd_file_total_age, age);
> >
> > - nfsd_file_flush(nf);
> > + nfsd_file_fsync(nf);
> >
> > if (nf->nf_mark)
> > nfsd_file_mark_put(nf->nf_mark);
> > @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
> >
> > if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > /*
> > - * If this is the last reference (nf_ref == 1), then transfer
> > - * it to the LRU. If the add to the LRU fails, just put it as
> > - * usual.
> > + * If this is the last reference (nf_ref == 1), then try
> > + * to transfer it to the LRU.
> > + */
> > + if (refcount_dec_not_one(&nf->nf_ref))
> > + return;
> > +
> > + /*
> > + * If the add to the list succeeds, try to kick off SYNC_NONE
> > + * writeback. If the add fails, then just fall through to
> > + * decrement as usual.
> > */
> > - if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > + if (nfsd_file_lru_add(nf)) {
> > + nfsd_file_flush(nf);
> > return;
> > + }
> > }
> > __nfsd_file_put(nf);
> > }
> > --
> > 2.37.3
> >
> >
--
Jeff Layton <[email protected]>
> On Oct 27, 2022, at 5:52 PM, Jeff Layton <[email protected]> wrote:
>
> When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> so that we can be ready to close it out when the time comes.
For a large file, a background flush still has to walk the file's
pages to see if they are dirty, and that consumes time, CPU, and
memory bandwidth. We're talking hundreds of microseconds for a
large file.
Then the final flush does all that again.
Basically, two (or more!) passes through the file for exactly the
same amount of work. Is there any measured improvement in latency
or throughput?
And then... for a GC file, no-one is waiting on data persistence
during nfsd_file_put() so I'm not sure what is gained by taking
control of the flushing process away from the underlying filesystem.
Remind me why the filecache is flushing? Shouldn't NFSD rely on
COMMIT operations for that? (It's not obvious reading the code,
maybe there should be a documenting comment somewhere that
explains this arrangement).
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
> 1 file changed, 31 insertions(+), 6 deletions(-)
>
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index d2bbded805d4..491d3d9a1870 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> }
>
> static void
> -nfsd_file_flush(struct nfsd_file *nf)
> +nfsd_file_fsync(struct nfsd_file *nf)
> {
> struct file *file = nf->nf_file;
>
> @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
> nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> }
>
> +static void
> +nfsd_file_flush(struct nfsd_file *nf)
> +{
> + struct file *file = nf->nf_file;
> + unsigned long nrpages;
> +
> + if (!file || !(file->f_mode & FMODE_WRITE))
> + return;
> +
> + nrpages = file->f_mapping->nrpages;
> + if (nrpages) {
> + this_cpu_add(nfsd_file_pages_flushed, nrpages);
> + filemap_flush(file->f_mapping);
> + }
> +}
> +
> static void
> nfsd_file_free(struct nfsd_file *nf)
> {
> @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
> this_cpu_inc(nfsd_file_releases);
> this_cpu_add(nfsd_file_total_age, age);
>
> - nfsd_file_flush(nf);
> + nfsd_file_fsync(nf);
>
> if (nf->nf_mark)
> nfsd_file_mark_put(nf->nf_mark);
> @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
>
> if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> /*
> - * If this is the last reference (nf_ref == 1), then transfer
> - * it to the LRU. If the add to the LRU fails, just put it as
> - * usual.
> + * If this is the last reference (nf_ref == 1), then try
> + * to transfer it to the LRU.
> + */
> + if (refcount_dec_not_one(&nf->nf_ref))
> + return;
> +
> + /*
> + * If the add to the list succeeds, try to kick off SYNC_NONE
> + * writeback. If the add fails, then just fall through to
> + * decrement as usual.
These comments simply repeat what the code does, so they seem
redundant to me. Could they instead explain why?
> */
> - if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> + if (nfsd_file_lru_add(nf)) {
> + nfsd_file_flush(nf);
> return;
> + }
> }
> __nfsd_file_put(nf);
> }
> --
> 2.37.3
>
--
Chuck Lever
On Fri, 2022-10-28 at 13:16 +0000, Chuck Lever III wrote:
>
> > On Oct 27, 2022, at 5:52 PM, Jeff Layton <[email protected]> wrote:
> >
> > When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> > so that we can be ready to close it out when the time comes.
>
> For a large file, a background flush still has to walk the file's
> pages to see if they are dirty, and that consumes time, CPU, and
> memory bandwidth. We're talking hundreds of microseconds for a
> large file.
>
> Then the final flush does all that again.
>
> Basically, two (or more!) passes through the file for exactly the
> same amount of work. Is there any measured improvement in latency
> or throughput?
>
> And then... for a GC file, no-one is waiting on data persistence
> during nfsd_file_put() so I'm not sure what is gained by taking
> control of the flushing process away from the underlying filesystem.
>
>
> Remind me why the filecache is flushing? Shouldn't NFSD rely on
> COMMIT operations for that? (It's not obvious reading the code,
> maybe there should be a documenting comment somewhere that
> explains this arrangement).
>
Fair point. I was trying to replicate the behaviors introduced in these
patches:
b6669305d35a nfsd: Reduce the number of calls to nfsd_file_gc()
6b8a94332ee4 nfsd: Fix a write performance regression
AFAICT, the fsync is there to catch writeback errors so that we can
reset the write verifiers. The rationale for that is described
here:
055b24a8f230 nfsd: Don't garbage collect files that might contain write errors
The problem with not calling vfs_fsync is that we might miss writeback
errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
nfsd would eventually reopen the file but it could miss seeing the error
if it got opened locally in the interim.
I'm not sure we need to worry about that so much for v4 though. Maybe we
should just do this for GC files?
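Something along these lines (sketch), i.e. only scrape the error and
reset the write verifier when reaping GC (v2/v3) files:

	static void
	nfsd_file_fsync(struct nfsd_file *nf)
	{
		struct file *file = nf->nf_file;

		if (!test_bit(NFSD_FILE_GC, &nf->nf_flags))
			return;
		if (!file || !(file->f_mode & FMODE_WRITE))
			return;
		if (vfs_fsync(file, 1) != 0)
			nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
	}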
>
> > Signed-off-by: Jeff Layton <[email protected]>
> > ---
> > fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
> > 1 file changed, 31 insertions(+), 6 deletions(-)
> >
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index d2bbded805d4..491d3d9a1870 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > }
> >
> > static void
> > -nfsd_file_flush(struct nfsd_file *nf)
> > +nfsd_file_fsync(struct nfsd_file *nf)
> > {
> > struct file *file = nf->nf_file;
> >
> > @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
> > nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > }
> >
> > +static void
> > +nfsd_file_flush(struct nfsd_file *nf)
> > +{
> > + struct file *file = nf->nf_file;
> > + unsigned long nrpages;
> > +
> > + if (!file || !(file->f_mode & FMODE_WRITE))
> > + return;
> > +
> > + nrpages = file->f_mapping->nrpages;
> > + if (nrpages) {
> > + this_cpu_add(nfsd_file_pages_flushed, nrpages);
> > + filemap_flush(file->f_mapping);
> > + }
> > +}
> > +
> > static void
> > nfsd_file_free(struct nfsd_file *nf)
> > {
> > @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
> > this_cpu_inc(nfsd_file_releases);
> > this_cpu_add(nfsd_file_total_age, age);
> >
> > - nfsd_file_flush(nf);
> > + nfsd_file_fsync(nf);
> >
> > if (nf->nf_mark)
> > nfsd_file_mark_put(nf->nf_mark);
> > @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
> >
> > if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > /*
> > - * If this is the last reference (nf_ref == 1), then transfer
> > - * it to the LRU. If the add to the LRU fails, just put it as
> > - * usual.
> > + * If this is the last reference (nf_ref == 1), then try
> > + * to transfer it to the LRU.
> > + */
> > + if (refcount_dec_not_one(&nf->nf_ref))
> > + return;
> > +
> > + /*
> > + * If the add to the list succeeds, try to kick off SYNC_NONE
> > + * writeback. If the add fails, then just fall through to
> > + * decrement as usual.
>
> These comments simply repeat what the code does, so they seem
> redundant to me. Could they instead explain why?
>
>
> > */
> > - if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > + if (nfsd_file_lru_add(nf)) {
> > + nfsd_file_flush(nf);
> > return;
> > + }
> > }
> > __nfsd_file_put(nf);
> > }
> > --
> > 2.37.3
> >
>
> --
> Chuck Lever
>
>
>
--
Jeff Layton <[email protected]>
> On Oct 28, 2022, at 11:05 AM, Jeff Layton <[email protected]> wrote:
>
> On Fri, 2022-10-28 at 13:16 +0000, Chuck Lever III wrote:
>>
>>> On Oct 27, 2022, at 5:52 PM, Jeff Layton <[email protected]> wrote:
>>>
>>> When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
>>> so that we can be ready to close it out when the time comes.
>>
>> For a large file, a background flush still has to walk the file's
>> pages to see if they are dirty, and that consumes time, CPU, and
>> memory bandwidth. We're talking hundreds of microseconds for a
>> large file.
>>
>> Then the final flush does all that again.
>>
>> Basically, two (or more!) passes through the file for exactly the
>> same amount of work. Is there any measured improvement in latency
>> or throughput?
>>
>> And then... for a GC file, no-one is waiting on data persistence
>> during nfsd_file_put() so I'm not sure what is gained by taking
>> control of the flushing process away from the underlying filesystem.
>>
>>
>> Remind me why the filecache is flushing? Shouldn't NFSD rely on
>> COMMIT operations for that? (It's not obvious reading the code,
>> maybe there should be a documenting comment somewhere that
>> explains this arrangement).
>>
>
>
> Fair point. I was trying to replicate the behaviors introduced in these
> patches:
>
> b6669305d35a nfsd: Reduce the number of calls to nfsd_file_gc()
> 6b8a94332ee4 nfsd: Fix a write performance regression
>
> AFAICT, the fsync is there to catch writeback errors so that we can
> reset the write verifiers (AFAICT). The rationale for that is described
> here:
>
> 055b24a8f230 nfsd: Don't garbage collect files that might contain write errors
Yes, I've been confused about this since then :-)
So, the patch description says:
If a file may contain unstable writes that can error out, then we want
to avoid garbage collecting the struct nfsd_file that may be
tracking those errors.
That doesn't explain why that's a problem, it just says what we plan to
do about it.
> The problem with not calling vfs_fsync is that we might miss writeback
> errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
> nfsd would eventually reopen the file but it could miss seeing the error
> if it got opened locally in the interim.
That helps. So we're surfacing writeback errors for local writers?
I guess I would like this flushing to interfere as little as possible
with the server's happy zone, since it's not something clients need to
wait for, and an error is exceptionally rare.
But also, we can't let writeback errors hold onto a bunch of memory
indefinitely. How much nfsd_file and page cache memory might be
pinned by a writeback error, and for how long?
> I'm not sure we need to worry about that so much for v4 though. Maybe we
> should just do this for GC files?
I'm not caffeinated yet. Why is it not a problem for v4? Is it because
an open or delegation stateid will prevent the nfsd_file from going
away?
Sorry for the noise. It's all a little subtle.
>>> Signed-off-by: Jeff Layton <[email protected]>
>>> ---
>>> fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
>>> 1 file changed, 31 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
>>> index d2bbded805d4..491d3d9a1870 100644
>>> --- a/fs/nfsd/filecache.c
>>> +++ b/fs/nfsd/filecache.c
>>> @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
>>> }
>>>
>>> static void
>>> -nfsd_file_flush(struct nfsd_file *nf)
>>> +nfsd_file_fsync(struct nfsd_file *nf)
>>> {
>>> struct file *file = nf->nf_file;
>>>
>>> @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
>>> nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
>>> }
>>>
>>> +static void
>>> +nfsd_file_flush(struct nfsd_file *nf)
>>> +{
>>> + struct file *file = nf->nf_file;
>>> + unsigned long nrpages;
>>> +
>>> + if (!file || !(file->f_mode & FMODE_WRITE))
>>> + return;
>>> +
>>> + nrpages = file->f_mapping->nrpages;
>>> + if (nrpages) {
>>> + this_cpu_add(nfsd_file_pages_flushed, nrpages);
>>> + filemap_flush(file->f_mapping);
>>> + }
>>> +}
>>> +
>>> static void
>>> nfsd_file_free(struct nfsd_file *nf)
>>> {
>>> @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
>>> this_cpu_inc(nfsd_file_releases);
>>> this_cpu_add(nfsd_file_total_age, age);
>>>
>>> - nfsd_file_flush(nf);
>>> + nfsd_file_fsync(nf);
>>>
>>> if (nf->nf_mark)
>>> nfsd_file_mark_put(nf->nf_mark);
>>> @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
>>>
>>> if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
>>> /*
>>> - * If this is the last reference (nf_ref == 1), then transfer
>>> - * it to the LRU. If the add to the LRU fails, just put it as
>>> - * usual.
>>> + * If this is the last reference (nf_ref == 1), then try
>>> + * to transfer it to the LRU.
>>> + */
>>> + if (refcount_dec_not_one(&nf->nf_ref))
>>> + return;
>>> +
>>> + /*
>>> + * If the add to the list succeeds, try to kick off SYNC_NONE
>>> + * writeback. If the add fails, then just fall through to
>>> + * decrement as usual.
>>
>> These comments simply repeat what the code does, so they seem
>> redundant to me. Could they instead explain why?
>>
>>
>>> */
>>> - if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
>>> + if (nfsd_file_lru_add(nf)) {
>>> + nfsd_file_flush(nf);
>>> return;
>>> + }
>>> }
>>> __nfsd_file_put(nf);
>>> }
>>> --
>>> 2.37.3
>>>
>>
>> --
>> Chuck Lever
>>
>>
>>
>
> --
> Jeff Layton <[email protected]>
--
Chuck Lever
On Fri, 2022-10-28 at 15:29 +0000, Chuck Lever III wrote:
>
> > On Oct 28, 2022, at 11:05 AM, Jeff Layton <[email protected]> wrote:
> >
> > On Fri, 2022-10-28 at 13:16 +0000, Chuck Lever III wrote:
> > >
> > > > On Oct 27, 2022, at 5:52 PM, Jeff Layton <[email protected]> wrote:
> > > >
> > > > When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> > > > so that we can be ready to close it out when the time comes.
> > >
> > > For a large file, a background flush still has to walk the file's
> > > pages to see if they are dirty, and that consumes time, CPU, and
> > > memory bandwidth. We're talking hundreds of microseconds for a
> > > large file.
> > >
> > > Then the final flush does all that again.
> > >
> > > Basically, two (or more!) passes through the file for exactly the
> > > same amount of work. Is there any measured improvement in latency
> > > or throughput?
> > >
> > > And then... for a GC file, no-one is waiting on data persistence
> > > during nfsd_file_put() so I'm not sure what is gained by taking
> > > control of the flushing process away from the underlying filesystem.
> > >
> > >
> > > Remind me why the filecache is flushing? Shouldn't NFSD rely on
> > > COMMIT operations for that? (It's not obvious reading the code,
> > > maybe there should be a documenting comment somewhere that
> > > explains this arrangement).
> > >
> >
> >
> > Fair point. I was trying to replicate the behaviors introduced in these
> > patches:
> >
> > b6669305d35a nfsd: Reduce the number of calls to nfsd_file_gc()
> > 6b8a94332ee4 nfsd: Fix a write performance regression
> >
> > AFAICT, the fsync is there to catch writeback errors so that we can
> > reset the write verifiers (AFAICT). The rationale for that is described
> > here:
> >
> > 055b24a8f230 nfsd: Don't garbage collect files that might contain write errors
>
> Yes, I've been confused about this since then :-)
>
> So, the patch description says:
>
> If a file may contain unstable writes that can error out, then we want
> to avoid garbage collecting the struct nfsd_file that may be
> tracking those errors.
>
> That doesn't explain why that's a problem, it just says what we plan to
> do about it.
>
>
> > The problem with not calling vfs_fsync is that we might miss writeback
> > errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
> > nfsd would eventually reopen the file but it could miss seeing the error
> > if it got opened locally in the interim.
>
> That helps. So we're surfacing writeback errors for local writers?
>
Well for non-v3 writers anyway. I suppose you could hit the same
scenario with a mixed v3 and v4 workload if you were unlucky enough, or
mixed v3 and ksmbd workload, etc...
> I guess I would like this flushing to interfere as little as possible
> with the server's happy zone, since it's not something clients need to
> wait for, and an error is exceptionally rare.
>
> But also, we can't let writeback errors hold onto a bunch of memory
> indefinitely. How much nfsd_file and page cache memory might be
> pinned by a writeback error, and for how long?
>
You mean if we were to stop trying to fsync out when closing? We don't
keep files in the cache indefinitely, even if they have writeback
errors.
In general, the kernel attempts to write things out, and if it fails it
sets a writeback error in the mapping and marks the pages clean. So if
we're talking about files that are no longer being used (since they're
being GC'ed), we only block reaping them for as long as writeback is in
progress.
Once writeback ends and it's eligible for reaping, we'll call vfs_fsync
a final time, grab the error and reset the write verifier when it's
non-zero.
If we stop doing fsyncs, then that model sort of breaks down. I'm not
clear on what you'd like to do instead.
>
> > I'm not sure we need to worry about that so much for v4 though. Maybe we
> > should just do this for GC files?
>
> I'm not caffeinated yet. Why is it not a problem for v4? Is it because
> an open or delegation stateid will prevent the nfsd_file from going
> away?
>
Yeah, more or less.
I think that for an error to be lost with v4, it would require the client
to have an application access pattern that would expose it to that
possibility on a local filesystem as well. I don't think we have any
obligation to do more there.
Maybe that's a false assumption though.
> Sorry for the noise. It's all a little subtle.
>
Very subtle. The more we can get this fleshed out into comments the
better, so I welcome the questions.
>
> > > > Signed-off-by: Jeff Layton <[email protected]>
> > > > ---
> > > > fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
> > > > 1 file changed, 31 insertions(+), 6 deletions(-)
> > > >
> > > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > > > index d2bbded805d4..491d3d9a1870 100644
> > > > --- a/fs/nfsd/filecache.c
> > > > +++ b/fs/nfsd/filecache.c
> > > > @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > > > }
> > > >
> > > > static void
> > > > -nfsd_file_flush(struct nfsd_file *nf)
> > > > +nfsd_file_fsync(struct nfsd_file *nf)
> > > > {
> > > > struct file *file = nf->nf_file;
> > > >
> > > > @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
> > > > nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > > > }
> > > >
> > > > +static void
> > > > +nfsd_file_flush(struct nfsd_file *nf)
> > > > +{
> > > > + struct file *file = nf->nf_file;
> > > > + unsigned long nrpages;
> > > > +
> > > > + if (!file || !(file->f_mode & FMODE_WRITE))
> > > > + return;
> > > > +
> > > > + nrpages = file->f_mapping->nrpages;
> > > > + if (nrpages) {
> > > > + this_cpu_add(nfsd_file_pages_flushed, nrpages);
> > > > + filemap_flush(file->f_mapping);
> > > > + }
> > > > +}
> > > > +
> > > > static void
> > > > nfsd_file_free(struct nfsd_file *nf)
> > > > {
> > > > @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
> > > > this_cpu_inc(nfsd_file_releases);
> > > > this_cpu_add(nfsd_file_total_age, age);
> > > >
> > > > - nfsd_file_flush(nf);
> > > > + nfsd_file_fsync(nf);
> > > >
> > > > if (nf->nf_mark)
> > > > nfsd_file_mark_put(nf->nf_mark);
> > > > @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
> > > >
> > > > if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > > > /*
> > > > - * If this is the last reference (nf_ref == 1), then transfer
> > > > - * it to the LRU. If the add to the LRU fails, just put it as
> > > > - * usual.
> > > > + * If this is the last reference (nf_ref == 1), then try
> > > > + * to transfer it to the LRU.
> > > > + */
> > > > + if (refcount_dec_not_one(&nf->nf_ref))
> > > > + return;
> > > > +
> > > > + /*
> > > > + * If the add to the list succeeds, try to kick off SYNC_NONE
> > > > + * writeback. If the add fails, then just fall through to
> > > > + * decrement as usual.
> > >
> > > These comments simply repeat what the code does, so they seem
> > > redundant to me. Could they instead explain why?
> > >
> > >
> > > > */
> > > > - if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > > > + if (nfsd_file_lru_add(nf)) {
> > > > + nfsd_file_flush(nf);
> > > > return;
> > > > + }
> > > > }
> > > > __nfsd_file_put(nf);
> > > > }
> > > > --
> > > > 2.37.3
> > > >
> > >
> > > --
> > > Chuck Lever
> > >
> > >
> > >
> >
> > --
> > Jeff Layton <[email protected]>
>
> --
> Chuck Lever
>
>
>
--
Jeff Layton <[email protected]>
> On Oct 28, 2022, at 11:51 AM, Jeff Layton <[email protected]> wrote:
>
> On Fri, 2022-10-28 at 15:29 +0000, Chuck Lever III wrote:
>>
>>> On Oct 28, 2022, at 11:05 AM, Jeff Layton <[email protected]> wrote:
>>>
>>> On Fri, 2022-10-28 at 13:16 +0000, Chuck Lever III wrote:
>>>>
>>>>> On Oct 27, 2022, at 5:52 PM, Jeff Layton <[email protected]> wrote:
>>>>>
>>>>> When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
>>>>> so that we can be ready to close it out when the time comes.
>>>>
>>>> For a large file, a background flush still has to walk the file's
>>>> pages to see if they are dirty, and that consumes time, CPU, and
>>>> memory bandwidth. We're talking hundreds of microseconds for a
>>>> large file.
>>>>
>>>> Then the final flush does all that again.
>>>>
>>>> Basically, two (or more!) passes through the file for exactly the
>>>> same amount of work. Is there any measured improvement in latency
>>>> or throughput?
>>>>
>>>> And then... for a GC file, no-one is waiting on data persistence
>>>> during nfsd_file_put() so I'm not sure what is gained by taking
>>>> control of the flushing process away from the underlying filesystem.
>>>>
>>>>
>>>> Remind me why the filecache is flushing? Shouldn't NFSD rely on
>>>> COMMIT operations for that? (It's not obvious reading the code,
>>>> maybe there should be a documenting comment somewhere that
>>>> explains this arrangement).
>>>>
>>>
>>>
>>> Fair point. I was trying to replicate the behaviors introduced in these
>>> patches:
>>>
>>> b6669305d35a nfsd: Reduce the number of calls to nfsd_file_gc()
>>> 6b8a94332ee4 nfsd: Fix a write performance regression
>>>
>>> AFAICT, the fsync is there to catch writeback errors so that we can
>>> reset the write verifiers (AFAICT). The rationale for that is described
>>> here:
>>>
>>> 055b24a8f230 nfsd: Don't garbage collect files that might contain write errors
>>
>> Yes, I've been confused about this since then :-)
>>
>> So, the patch description says:
>>
>> If a file may contain unstable writes that can error out, then we want
>> to avoid garbage collecting the struct nfsd_file that may be
>> tracking those errors.
>>
>> That doesn't explain why that's a problem, it just says what we plan to
>> do about it.
>>
>>
>>> The problem with not calling vfs_fsync is that we might miss writeback
>>> errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
>>> nfsd would eventually reopen the file but it could miss seeing the error
>>> if it got opened locally in the interim.
>>
>> That helps. So we're surfacing writeback errors for local writers?
>>
>
> Well for non-v3 writers anyway. I suppose you could hit the same
> scenario with a mixed v3 and v4 workload if you were unlucky enough, or
> mixed v3 and ksmbd workload, etc...
>
>> I guess I would like this flushing to interfere as little as possible
>> with the server's happy zone, since it's not something clients need to
>> wait for, and an error is exceptionally rare.
>>
>> But also, we can't let writeback errors hold onto a bunch of memory
>> indefinitely. How much nfsd_file and page cache memory might be
>> pinned by a writeback error, and for how long?
>>
>
> You mean if we were to stop trying to fsync out when closing? We don't
> keep files in the cache indefinitely, even if they have writeback
> errors.
>
> In general, the kernel attempts to write things out, and if it fails it
> sets a writeback error in the mapping and marks the pages clean. So if
> we're talking about files that are no longer being used (since they're
> being GC'ed), we only block reaping them for as long as writeback is in
> progress.
>
> Once writeback ends and it's eligible for reaping, we'll call vfs_fsync
> a final time, grab the error and reset the write verifier when it's
> non-zero.
>
> If we stop doing fsyncs, then that model sort of breaks down. I'm not
> clear on what you'd like to do instead.
I'm not clear either. I think I just have some hand-wavy requirements.
I think keeping the flushes in the nfsd threads and away from single-
threaded garbage collection makes sense. Keep I/O in nfsd context, not
in the filecache garbage collector. I'm not sure that's guaranteed
if the garbage collection thread does an nfsd_file_put() that flushes.
And, we need to ensure that an nfsd_file isn't pinned forever -- the
flush has to make forward progress so that the nfsd_file is eventually
released. Otherwise, writeback errors become a DoS vector.
But, back to the topic of this patch: my own experiments with background
syncing showed that it introduces significant overhead and it wasn't
really worth the trouble. Basically, on intensive workloads, the garbage
collector must not stall or live-lock because it's walking through
millions of pages trying to figure out which ones are dirty.
>>> I'm not sure we need to worry about that so much for v4 though. Maybe we
>>> should just do this for GC files?
>>
>> I'm not caffeinated yet. Why is it not a problem for v4? Is it because
>> an open or delegation stateid will prevent the nfsd_file from going
>> away?
>>
>
>
> Yeah, more or less.
>
> I think that for an error to be lost with v4, it would require the client
> to have an application access pattern that would expose it to that
> possibility on a local filesystem as well. I don't think we have any
> obligation to do more there.
>
> Maybe that's a false assumption though.
>
>> Sorry for the noise. It's all a little subtle.
>>
>
> Very subtle. The more we can get this fleshed out into comments the
> better, so I welcome the questions.
>
>>
>>>>> Signed-off-by: Jeff Layton <[email protected]>
>>>>> ---
>>>>> fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
>>>>> 1 file changed, 31 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
>>>>> index d2bbded805d4..491d3d9a1870 100644
>>>>> --- a/fs/nfsd/filecache.c
>>>>> +++ b/fs/nfsd/filecache.c
>>>>> @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
>>>>> }
>>>>>
>>>>> static void
>>>>> -nfsd_file_flush(struct nfsd_file *nf)
>>>>> +nfsd_file_fsync(struct nfsd_file *nf)
>>>>> {
>>>>> struct file *file = nf->nf_file;
>>>>>
>>>>> @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
>>>>> nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
>>>>> }
>>>>>
>>>>> +static void
>>>>> +nfsd_file_flush(struct nfsd_file *nf)
>>>>> +{
>>>>> + struct file *file = nf->nf_file;
>>>>> + unsigned long nrpages;
>>>>> +
>>>>> + if (!file || !(file->f_mode & FMODE_WRITE))
>>>>> + return;
>>>>> +
>>>>> + nrpages = file->f_mapping->nrpages;
>>>>> + if (nrpages) {
>>>>> + this_cpu_add(nfsd_file_pages_flushed, nrpages);
>>>>> + filemap_flush(file->f_mapping);
>>>>> + }
>>>>> +}
>>>>> +
>>>>> static void
>>>>> nfsd_file_free(struct nfsd_file *nf)
>>>>> {
>>>>> @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
>>>>> this_cpu_inc(nfsd_file_releases);
>>>>> this_cpu_add(nfsd_file_total_age, age);
>>>>>
>>>>> - nfsd_file_flush(nf);
>>>>> + nfsd_file_fsync(nf);
>>>>>
>>>>> if (nf->nf_mark)
>>>>> nfsd_file_mark_put(nf->nf_mark);
>>>>> @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
>>>>>
>>>>> if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
>>>>> /*
>>>>> - * If this is the last reference (nf_ref == 1), then transfer
>>>>> - * it to the LRU. If the add to the LRU fails, just put it as
>>>>> - * usual.
>>>>> + * If this is the last reference (nf_ref == 1), then try
>>>>> + * to transfer it to the LRU.
>>>>> + */
>>>>> + if (refcount_dec_not_one(&nf->nf_ref))
>>>>> + return;
>>>>> +
>>>>> + /*
>>>>> + * If the add to the list succeeds, try to kick off SYNC_NONE
>>>>> + * writeback. If the add fails, then just fall through to
>>>>> + * decrement as usual.
>>>>
>>>> These comments simply repeat what the code does, so they seem
>>>> redundant to me. Could they instead explain why?
>>>>
>>>>
>>>>> */
>>>>> - if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
>>>>> + if (nfsd_file_lru_add(nf)) {
>>>>> + nfsd_file_flush(nf);
>>>>> return;
>>>>> + }
>>>>> }
>>>>> __nfsd_file_put(nf);
>>>>> }
>>>>> --
>>>>> 2.37.3
>>>>>
>>>>
>>>> --
>>>> Chuck Lever
>>>>
>>>>
>>>>
>>>
>>> --
>>> Jeff Layton <[email protected]>
>>
>> --
>> Chuck Lever
>>
>>
>>
>
> --
> Jeff Layton <[email protected]>
--
Chuck Lever
On Fri, 2022-10-28 at 17:21 +0000, Chuck Lever III wrote:
>
> > On Oct 28, 2022, at 11:51 AM, Jeff Layton <[email protected]> wrote:
> >
> > On Fri, 2022-10-28 at 15:29 +0000, Chuck Lever III wrote:
> > >
> > > > On Oct 28, 2022, at 11:05 AM, Jeff Layton <[email protected]> wrote:
> > > >
> > > > On Fri, 2022-10-28 at 13:16 +0000, Chuck Lever III wrote:
> > > > >
> > > > > > On Oct 27, 2022, at 5:52 PM, Jeff Layton <[email protected]> wrote:
> > > > > >
> > > > > > When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> > > > > > so that we can be ready to close it out when the time comes.
> > > > >
> > > > > For a large file, a background flush still has to walk the file's
> > > > > pages to see if they are dirty, and that consumes time, CPU, and
> > > > > memory bandwidth. We're talking hundreds of microseconds for a
> > > > > large file.
> > > > >
> > > > > Then the final flush does all that again.
> > > > >
> > > > > Basically, two (or more!) passes through the file for exactly the
> > > > > same amount of work. Is there any measured improvement in latency
> > > > > or throughput?
> > > > >
> > > > > And then... for a GC file, no-one is waiting on data persistence
> > > > > during nfsd_file_put() so I'm not sure what is gained by taking
> > > > > control of the flushing process away from the underlying filesystem.
> > > > >
> > > > >
> > > > > Remind me why the filecache is flushing? Shouldn't NFSD rely on
> > > > > COMMIT operations for that? (It's not obvious reading the code,
> > > > > maybe there should be a documenting comment somewhere that
> > > > > explains this arrangement).
> > > > >
> > > >
> > > >
> > > > Fair point. I was trying to replicate the behaviors introduced in these
> > > > patches:
> > > >
> > > > b6669305d35a nfsd: Reduce the number of calls to nfsd_file_gc()
> > > > 6b8a94332ee4 nfsd: Fix a write performance regression
> > > >
> > > > AFAICT, the fsync is there to catch writeback errors so that we can
> > > > reset the write verifiers (AFAICT). The rationale for that is described
> > > > here:
> > > >
> > > > 055b24a8f230 nfsd: Don't garbage collect files that might contain write errors
> > >
> > > Yes, I've been confused about this since then :-)
> > >
> > > So, the patch description says:
> > >
> > > If a file may contain unstable writes that can error out, then we want
> > > to avoid garbage collecting the struct nfsd_file that may be
> > > tracking those errors.
> > >
> > > That doesn't explain why that's a problem, it just says what we plan to
> > > do about it.
> > >
> > >
> > > > The problem with not calling vfs_fsync is that we might miss writeback
> > > > errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
> > > > nfsd would eventually reopen the file but it could miss seeing the error
> > > > if it got opened locally in the interim.
> > >
> > > That helps. So we're surfacing writeback errors for local writers?
> > >
> >
> > Well for non-v3 writers anyway. I suppose you could hit the same
> > scenario with a mixed v3 and v4 workload if you were unlucky enough, or
> > mixed v3 and ksmbd workload, etc...
> >
> > > I guess I would like this flushing to interfere as little as possible
> > > with the server's happy zone, since it's not something clients need to
> > > wait for, and an error is exceptionally rare.
> > >
> > > But also, we can't let writeback errors hold onto a bunch of memory
> > > indefinitely. How much nfsd_file and page cache memory might be
> > > pinned by a writeback error, and for how long?
> > >
> >
> > You mean if we were to stop trying to fsync out when closing? We don't
> > keep files in the cache indefinitely, even if they have writeback
> > errors.
> >
> > In general, the kernel attempts to write things out, and if it fails it
> > sets a writeback error in the mapping and marks the pages clean. So if
> > we're talking about files that are no longer being used (since they're
> > being GC'ed), we only block reaping them for as long as writeback is in
> > progress.
> >
> > Once writeback ends and it's eligible for reaping, we'll call vfs_fsync
> > a final time, grab the error and reset the write verifier when it's
> > non-zero.
> >
> > If we stop doing fsyncs, then that model sort of breaks down. I'm not
> > clear on what you'd like to do instead.
>
> I'm not clear either. I think I just have some hand-wavy requirements.
>
> I think keeping the flushes in the nfsd threads and away from single-
> threaded garbage collection makes sense. Keep I/O in nfsd context, not
> in the filecache garbage collector. I'm not sure that's guaranteed
> if the garbage collection thread does an nfsd_file_put() that flushes.
>
The garbage collector doesn't call nfsd_file_put directly, though it
will call nfsd_file_free, which now does a vfs_fsync.
>
> And, we need to ensure that an nfsd_file isn't pinned forever -- the
> flush has to make forward progress so that the nfsd_file is eventually
> released. Otherwise, writeback errors become a DoS vector.
>
IDGI. An outright writeback _failure_ won't block anything. Stuck (hung)
writeback could cause a nfsd_file to be pinned indefinitely, but that's
really no different than the situation with stuck read requests.
> But, back to the topic of this patch: my own experiments with background
> syncing showed that it introduces significant overhead and it wasn't
> really worth the trouble. Basically, on intensive workloads, the garbage
> collector must not stall or live-lock because it's walking through
> millions of pages trying to figure out which ones are dirty.
>
If this is what you want, then kicking off SYNC_NONE writeback when we
put it on the LRU is the right thing to do.
We want to ensure that when the thing is reaped from the LRU, the
final vfs_fsync has nothing left to write back. The penultimate put that adds
it to the LRU is almost certainly going to come in the context of an
nfsd thread, so kicking off background writeback at that point seems
reasonable.
Only files that aren't touched again get reaped off the LRU eventually,
so there should be no danger of nfsd redirtying it again. By the time we
get to reaping it, everything should be written back and the inode will
be ready to close with little delay.
>
> > > > I'm not sure we need to worry about that so much for v4 though. Maybe we
> > > > should just do this for GC files?
> > >
> > > I'm not caffeinated yet. Why is it not a problem for v4? Is it because
> > > an open or delegation stateid will prevent the nfsd_file from going
> > > away?
> > >
> >
> >
> > Yeah, more or less.
> >
> > I think that for an error to be lost with v4, it would require the client
> > to have an application access pattern that would expose it to that
> > possibility on a local filesystem as well. I don't think we have any
> > obligation to do more there.
> >
> > Maybe that's a false assumption though.
> >
> > > Sorry for the noise. It's all a little subtle.
> > >
> >
> > Very subtle. The more we can get this fleshed out into comments the
> > better, so I welcome the questions.
> >
> > >
> > > > > > Signed-off-by: Jeff Layton <[email protected]>
> > > > > > ---
> > > > > > fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
> > > > > > 1 file changed, 31 insertions(+), 6 deletions(-)
> > > > > >
> > > > > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > > > > > index d2bbded805d4..491d3d9a1870 100644
> > > > > > --- a/fs/nfsd/filecache.c
> > > > > > +++ b/fs/nfsd/filecache.c
> > > > > > @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > > > > > }
> > > > > >
> > > > > > static void
> > > > > > -nfsd_file_flush(struct nfsd_file *nf)
> > > > > > +nfsd_file_fsync(struct nfsd_file *nf)
> > > > > > {
> > > > > > struct file *file = nf->nf_file;
> > > > > >
> > > > > > @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
> > > > > > nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > > > > > }
> > > > > >
> > > > > > +static void
> > > > > > +nfsd_file_flush(struct nfsd_file *nf)
> > > > > > +{
> > > > > > + struct file *file = nf->nf_file;
> > > > > > + unsigned long nrpages;
> > > > > > +
> > > > > > + if (!file || !(file->f_mode & FMODE_WRITE))
> > > > > > + return;
> > > > > > +
> > > > > > + nrpages = file->f_mapping->nrpages;
> > > > > > + if (nrpages) {
> > > > > > + this_cpu_add(nfsd_file_pages_flushed, nrpages);
> > > > > > + filemap_flush(file->f_mapping);
> > > > > > + }
> > > > > > +}
> > > > > > +
> > > > > > static void
> > > > > > nfsd_file_free(struct nfsd_file *nf)
> > > > > > {
> > > > > > @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
> > > > > > this_cpu_inc(nfsd_file_releases);
> > > > > > this_cpu_add(nfsd_file_total_age, age);
> > > > > >
> > > > > > - nfsd_file_flush(nf);
> > > > > > + nfsd_file_fsync(nf);
> > > > > >
> > > > > > if (nf->nf_mark)
> > > > > > nfsd_file_mark_put(nf->nf_mark);
> > > > > > @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
> > > > > >
> > > > > > if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > > > > > /*
> > > > > > - * If this is the last reference (nf_ref == 1), then transfer
> > > > > > - * it to the LRU. If the add to the LRU fails, just put it as
> > > > > > - * usual.
> > > > > > + * If this is the last reference (nf_ref == 1), then try
> > > > > > + * to transfer it to the LRU.
> > > > > > + */
> > > > > > + if (refcount_dec_not_one(&nf->nf_ref))
> > > > > > + return;
> > > > > > +
> > > > > > + /*
> > > > > > + * If the add to the list succeeds, try to kick off SYNC_NONE
> > > > > > + * writeback. If the add fails, then just fall through to
> > > > > > + * decrement as usual.
> > > > >
> > > > > These comments simply repeat what the code does, so they seem
> > > > > redundant to me. Could they instead explain why?
> > > > >
> > > > >
> > > > > > */
> > > > > > - if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > > > > > + if (nfsd_file_lru_add(nf)) {
> > > > > > + nfsd_file_flush(nf);
> > > > > > return;
> > > > > > + }
> > > > > > }
> > > > > > __nfsd_file_put(nf);
> > > > > > }
> > > > > > --
> > > > > > 2.37.3
> > > > > >
> > > > >
> > > > > --
> > > > > Chuck Lever
> > > > >
> > > > >
> > > > >
> > > >
> > > > --
> > > > Jeff Layton <[email protected]>
> > >
> > > --
> > > Chuck Lever
> > >
> > >
> > >
> >
> > --
> > Jeff Layton <[email protected]>
>
> --
> Chuck Lever
>
>
>
--
Jeff Layton <[email protected]>
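For reference while following the replies below, here is a condensed sketch of
the v2/v3 ("GC") put path as posted in this series. Helper names match the
hunks quoted above; this is a sketch of the proposal, not the final code.

void
nfsd_file_put(struct nfsd_file *nf)
{
        trace_nfsd_file_put(nf);

        if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
                /* Not the last reference: just drop it and return. */
                if (refcount_dec_not_one(&nf->nf_ref))
                        return;

                /*
                 * Last reference: try to transfer it to the LRU and kick
                 * off non-blocking (SYNC_NONE) writeback so that the
                 * eventual reap mostly just harvests the writeback error.
                 */
                if (nfsd_file_lru_add(nf)) {
                        nfsd_file_flush(nf);
                        return;
                }
        }
        /* v4 entries and failed LRU adds drop the reference as usual. */
        __nfsd_file_put(nf);
}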
> On Oct 28, 2022, at 1:43 PM, Jeff Layton <[email protected]> wrote:
>
> On Fri, 2022-10-28 at 17:21 +0000, Chuck Lever III wrote:
>>
>>> On Oct 28, 2022, at 11:51 AM, Jeff Layton <[email protected]> wrote:
>>>
>>> On Fri, 2022-10-28 at 15:29 +0000, Chuck Lever III wrote:
>>>>
>>>>> On Oct 28, 2022, at 11:05 AM, Jeff Layton <[email protected]> wrote:
>>>>>
>>>>> The problem with not calling vfs_fsync is that we might miss writeback
>>>>> errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
>>>>> nfsd would eventually reopen the file but it could miss seeing the error
>>>>> if it got opened locally in the interim.
>>>>
>>>> That helps. So we're surfacing writeback errors for local writers?
>>>>
>>>
>>> Well for non-v3 writers anyway. I suppose you could hit the same
>>> scenario with a mixed v3 and v4 workload if you were unlucky enough, or
>>> mixed v3 and ksmbd workload, etc...
>>>
>>>> I guess I would like this flushing to interfere as little as possible
>>>> with the server's happy zone, since it's not something clients need to
>>>> wait for, and an error is exceptionally rare.
>>>>
>>>> But also, we can't let writeback errors hold onto a bunch of memory
>>>> indefinitely. How much nfsd_file and page cache memory might be
>>>> pinned by a writeback error, and for how long?
>>>>
>>>
>>> You mean if we were to stop trying to fsync out when closing? We don't
>>> keep files in the cache indefinitely, even if they have writeback
>>> errors.
>>>
>>> In general, the kernel attempts to write things out, and if it fails it
>>> sets a writeback error in the mapping and marks the pages clean. So if
>>> we're talking about files that are no longer being used (since they're
>>> being GC'ed), we only block reaping them for as long as writeback is in
>>> progress.
>>>
>>> Once writeback ends and it's eligible for reaping, we'll call vfs_fsync
>>> a final time, grab the error and reset the write verifier when it's
>>> non-zero.
>>>
>>> If we stop doing fsyncs, then that model sort of breaks down. I'm not
>>> clear on what you'd like to do instead.
>>
>> I'm not clear either. I think I just have some hand-wavy requirements.
>>
>> I think keeping the flushes in the nfsd threads and away from single-
>> threaded garbage collection makes sense. Keep I/O in nfsd context, not
>> in the filecache garbage collector. I'm not sure that's guaranteed
>> if the garbage collection thread does an nfsd_file_put() that flushes.
>>
>
> The garbage collector doesn't call nfsd_file_put directly, though it
> will call nfsd_file_free, which now does a vfs_fsync.
OK, thought I saw some email fly by that suggested using nfsd_file_put
in the garbage collector. But... the vfs_fsync you mention can possibly
trigger I/O and wait for it (it's not the SYNC_NONE flush) in the GC
thread. Rare, but I'd rather not have even that possibility if we can
avoid it.
>> But, back to the topic of this patch: my own experiments with background
>> syncing showed that it introduces significant overhead and it wasn't
>> really worth the trouble. Basically, on intensive workloads, the garbage
>> collector must not stall or live-lock because it's walking through
>> millions of pages trying to figure out which ones are dirty.
>>
>
> If this is what you want, then kicking off SYNC_NONE writeback when we
> put it on the LRU is the right thing to do.
>
> We want to ensure that when the thing is reaped from the LRU, that the
> final vfs_fsync has to write nothing back. The penultimate put that adds
> it to the LRU is almost certainly going to come in the context of an
> nfsd thread, so kicking off background writeback at that point seems
> reasonable.
IIUC the opposing idea is to do a synchronous writeback in nfsd_file_put
and then nothing in nfsd_file_free. Why isn't that adequate to achieve
the same result?
Thinking aloud:
- Suppose a client does some UNSTABLE NFSv3 WRITEs to a file
- The client then waits long enough for the nfsd_file to get aged out
of the filecache
- A local writer on the server triggers a writeback error on the file
- The error is cleared by other activity
- The client sends a COMMIT
Wouldn't the server miss the chance to bump its write verifier in that
case?
> Only files that aren't touched again get reaped off the LRU eventually,
> so there should be no danger of nfsd redirtying it again.
At the risk of rat-holing... IIUC the only case we should care about
is if there are outstanding NFSv3 WRITEs that haven't been COMMITed.
Seems like NFSv3-specific code, and not the filecache, should deal
with that case, and leave nfsd_file_put/free out of it...? Again, no
clear idea how it would, but just thinking about the layering here.
> By the time we
> get to reaping it, everything should be written back and the inode will
> be ready to close with little delay.
--
Chuck Lever
On Fri, 2022-10-28 at 18:53 +0000, Chuck Lever III wrote:
>
> > On Oct 28, 2022, at 1:43 PM, Jeff Layton <[email protected]> wrote:
> >
> > On Fri, 2022-10-28 at 17:21 +0000, Chuck Lever III wrote:
> > >
> > > > On Oct 28, 2022, at 11:51 AM, Jeff Layton <[email protected]> wrote:
> > > >
> > > > On Fri, 2022-10-28 at 15:29 +0000, Chuck Lever III wrote:
> > > > >
> > > > > > On Oct 28, 2022, at 11:05 AM, Jeff Layton <[email protected]> wrote:
> > > > > >
> > > > > > The problem with not calling vfs_fsync is that we might miss writeback
> > > > > > errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
> > > > > > nfsd would eventually reopen the file but it could miss seeing the error
> > > > > > if it got opened locally in the interim.
> > > > >
> > > > > That helps. So we're surfacing writeback errors for local writers?
> > > > >
> > > >
> > > > Well for non-v3 writers anyway. I suppose you could hit the same
> > > > scenario with a mixed v3 and v4 workload if you were unlucky enough, or
> > > > mixed v3 and ksmbd workload, etc...
> > > >
> > > > > I guess I would like this flushing to interfere as little as possible
> > > > > with the server's happy zone, since it's not something clients need to
> > > > > wait for, and an error is exceptionally rare.
> > > > >
> > > > > But also, we can't let writeback errors hold onto a bunch of memory
> > > > > indefinitely. How much nfsd_file and page cache memory might be
> > > > > pinned by a writeback error, and for how long?
> > > > >
> > > >
> > > > You mean if we were to stop trying to fsync out when closing? We don't
> > > > keep files in the cache indefinitely, even if they have writeback
> > > > errors.
> > > >
> > > > In general, the kernel attempts to write things out, and if it fails it
> > > > sets a writeback error in the mapping and marks the pages clean. So if
> > > > we're talking about files that are no longer being used (since they're
> > > > being GC'ed), we only block reaping them for as long as writeback is in
> > > > progress.
> > > >
> > > > Once writeback ends and it's eligible for reaping, we'll call vfs_fsync
> > > > a final time, grab the error and reset the write verifier when it's
> > > > non-zero.
> > > >
> > > > If we stop doing fsyncs, then that model sort of breaks down. I'm not
> > > > clear on what you'd like to do instead.
> > >
> > > I'm not clear either. I think I just have some hand-wavy requirements.
> > >
> > > I think keeping the flushes in the nfsd threads and away from single-
> > > threaded garbage collection makes sense. Keep I/O in nfsd context, not
> > > in the filecache garbage collector. I'm not sure that's guaranteed
> > > if the garbage collection thread does an nfsd_file_put() that flushes.
> > >
> >
> > The garbage collector doesn't call nfsd_file_put directly, though it
> > will call nfsd_file_free, which now does a vfs_fsync.
>
> OK, thought I saw some email fly by that suggested using nfsd_file_put
> in the garbage collector. But... the vfs_fsync you mention can possibly
> trigger I/O and wait for it (it's not the SYNC_NONE flush) in the GC
> thread. Rare, but I'd rather not have even that possibility if we can
> avoid it.
>
>
> > > But, back to the topic of this patch: my own experiments with background
> > > syncing showed that it introduces significant overhead and it wasn't
> > > really worth the trouble. Basically, on intensive workloads, the garbage
> > > collector must not stall or live-lock because it's walking through
> > > millions of pages trying to figure out which ones are dirty.
> > >
> >
> > If this is what you want, then kicking off SYNC_NONE writeback when we
> > put it on the LRU is the right thing to do.
> >
> > We want to ensure that when the thing is reaped from the LRU, that the
> > final vfs_fsync has to write nothing back. The penultimate put that adds
> > it to the LRU is almost certainly going to come in the context of an
> > nfsd thread, so kicking off background writeback at that point seems
> > reasonable.
>
> IIUC the opposing idea is to do a synchronous writeback in nfsd_file_put
> and then nothing in nfsd_file_free. Why isn't that adequate to achieve
> the same result?
>
To make sure I understand:
For the GC case (v3), you basically want to do a vfs_fsync after we put
it onto the LRU list? We'd also do a vfs_fsync after the refcount goes
to 0 in nfsd_file_put.
That seems...worse than what I'm proposing. We'll end up with a bunch of
blocked nfsd threads for no reason. The writeback in most cases could
proceed asynchronously, and we'll be idling an nfsd thread for the sole
purpose of getting the result of that writeback.
I see no need to block an nfsd thread for this. If we kick off
WB_SYNC_NONE writeback when we put it on the list, then by the time we
get around to calling vfs_fsync for reaping the thing, it'll basically
be a no-op. PAGECACHE_TAG_DIRTY should be clear and vfs_fsync will just
scrape the wb error code and return without walking anything.
I get the goal of not idling the garbage collector for too long, but
some of that may just be unavoidable. Tearing down a nfsd_file can just
take a significant amount of time, between flushing data and close.
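To make the split concrete, a sketch of the two flush points under discussion,
condensed from the hunks quoted earlier (per-cpu counters omitted):

/* At LRU-add time: start WB_SYNC_NONE writeback, don't wait for it. */
static void
nfsd_file_flush(struct nfsd_file *nf)
{
        struct file *file = nf->nf_file;

        if (!file || !(file->f_mode & FMODE_WRITE))
                return;
        if (file->f_mapping->nrpages)
                filemap_flush(file->f_mapping);
}

/*
 * At reap time: the pages should already be clean, so this mostly just
 * scrapes the writeback error and resets the verifier when it's non-zero.
 */
static void
nfsd_file_fsync(struct nfsd_file *nf)
{
        struct file *file = nf->nf_file;

        if (!file || !(file->f_mode & FMODE_WRITE))
                return;
        if (vfs_fsync(file, 1) != 0)
                nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
}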
> Thinking aloud:
>
> - Suppose a client does some UNSTABLE NFSv3 WRITEs to a file
> - The client then waits long enough for the nfsd_file to get aged out
> of the filecache
> - A local writer on the server triggers a writeback error on the file
> - The error is cleared by other activity
> - The client sends a COMMIT
>
> Wouldn't the server miss the chance to bump its write verifier in that
> case?
>
Yep. That is the danger.
>
> > Only files that aren't touched again get reaped off the LRU eventually,
> > so there should be no danger of nfsd redirtying it again.
>
> At the risk of rat-holing... IIUC the only case we should care about
> is if there are outstanding NFSv3 WRITEs that haven't been COMMITed.
> Seems like NFSv3-specific code, and not the filecache, should deal
> with that case, and leave nfsd_file_put/free out of it...? Again, no
> clear idea how it would, but just thinking about the layering here.
>
No idea how we'd do that. The filecache is what gives us persistent
"state" across disparate RPCs with v3. I think this is where the
solution has to be handled.
>
> > By the time we
> > get to reaping it, everything should be written back and the inode will
> > be ready to close with little delay.
>
>
>
> --
> Chuck Lever
>
>
>
--
Jeff Layton <[email protected]>
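The danger acknowledged above comes down to the errseq_t snapshot that the
nfsd_file holds: while the entry lives, a later COMMIT can compare the
mapping's error cursor against the file's f_wb_err and notice an error raised
by another writer. A minimal sketch of that check (only the return statement
appears in the hunks above; the guard is an assumption modelled on the other
helpers):

static bool
nfsd_file_check_write_error(struct nfsd_file *nf)
{
        struct file *file = nf->nf_file;

        if (!file || !(file->f_mode & FMODE_WRITE))
                return false;
        /* Has the mapping seen a writeback error since this file was opened? */
        return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
}

Once the nfsd_file is reaped that snapshot is gone, which is why the final
fsync-and-reset-verifier is the last chance to surface the error to a v3
client.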
On Fri, 28 Oct 2022, Jeff Layton wrote:
> On Fri, 2022-10-28 at 09:51 +1100, NeilBrown wrote:
> > On Fri, 28 Oct 2022, Jeff Layton wrote:
> > > The filecache refcounting is a bit non-standard for something searchable
> > > by RCU, in that we maintain a sentinel reference while it's hashed. This
> > > in turn requires that we have to do things differently in the "put"
> > > depending on whether its hashed, which we believe to have led to races.
> > >
> > > There are other problems in here too. nfsd_file_close_inode_sync can end
> > > up freeing an nfsd_file while there are still outstanding references to
> > > it, and the handling
> >
> > -EINTR ??? (you got interrupted and didn't finish the sentence?)
> >
>
> Yes, I meant to go back and flesh that out, and forgot before posting.
>
> > >
> > > Rework the code so that the refcount is what drives the lifecycle. When
> > > the refcount goes to zero, then unhash and rcu free the object.
> > >
> > > Signed-off-by: Jeff Layton <[email protected]>
> > > ---
> > > fs/nfsd/filecache.c | 291 +++++++++++++++++++++-----------------------
> > > fs/nfsd/trace.h | 5 +-
> > > 2 files changed, 144 insertions(+), 152 deletions(-)
> > >
> > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > > index 98c6b5f51bc8..e63534f4b9f8 100644
> > > --- a/fs/nfsd/filecache.c
> > > +++ b/fs/nfsd/filecache.c
> > > @@ -1,6 +1,12 @@
> > > // SPDX-License-Identifier: GPL-2.0
> > > /*
> > > * The NFSD open file cache.
> > > + *
> > > + * Each nfsd_file is created in response to client activity -- either regular
> > > + * file I/O for v2/v3, or opening a file for v4. Files opened via v4 are
> > > + * cleaned up as soon as their refcount goes to 0. Entries for v2/v3 are
> > > + * flagged with NFSD_FILE_GC. On their last put, they are added to the LRU for
> > > + * eventual disposal if they aren't used again within a short time period.
> > > */
> > >
> > > #include <linux/hash.h>
> > > @@ -302,31 +308,43 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > > if (key->gc)
> > > __set_bit(NFSD_FILE_GC, &nf->nf_flags);
> > > nf->nf_inode = key->inode;
> > > - /* nf_ref is pre-incremented for hash table */
> > > - refcount_set(&nf->nf_ref, 2);
> > > + refcount_set(&nf->nf_ref, 1);
> > > nf->nf_may = key->need;
> > > nf->nf_mark = NULL;
> > > }
> > > return nf;
> > > }
> > >
> > > -static bool
> > > +static void
> > > +nfsd_file_flush(struct nfsd_file *nf)
> > > +{
> > > + struct file *file = nf->nf_file;
> > > +
> > > + if (!file || !(file->f_mode & FMODE_WRITE))
> > > + return;
> > > + this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
> > > + if (vfs_fsync(file, 1) != 0)
> > > + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > > +}
> > > +
> > > +static void
> > > nfsd_file_free(struct nfsd_file *nf)
> > > {
> > > s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
> > > - bool flush = false;
> > > +
> > > + trace_nfsd_file_free(nf);
> > >
> > > this_cpu_inc(nfsd_file_releases);
> > > this_cpu_add(nfsd_file_total_age, age);
> > >
> > > - trace_nfsd_file_put_final(nf);
> > > + nfsd_file_flush(nf);
> > > +
> > > if (nf->nf_mark)
> > > nfsd_file_mark_put(nf->nf_mark);
> > > if (nf->nf_file) {
> > > get_file(nf->nf_file);
> > > filp_close(nf->nf_file, NULL);
> > > fput(nf->nf_file);
> > > - flush = true;
> > > }
> > >
> > > /*
> > > @@ -334,10 +352,9 @@ nfsd_file_free(struct nfsd_file *nf)
> > > * WARN and leak it to preserve system stability.
> > > */
> > > if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
> > > - return flush;
> > > + return;
> > >
> > > call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
> > > - return flush;
> > > }
> > >
> > > static bool
> > > @@ -363,29 +380,23 @@ nfsd_file_check_write_error(struct nfsd_file *nf)
> > > return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
> > > }
> > >
> > > -static void
> > > -nfsd_file_flush(struct nfsd_file *nf)
> > > -{
> > > - struct file *file = nf->nf_file;
> > > -
> > > - if (!file || !(file->f_mode & FMODE_WRITE))
> > > - return;
> > > - this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
> > > - if (vfs_fsync(file, 1) != 0)
> > > - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > > -}
> > > -
> > > -static void nfsd_file_lru_add(struct nfsd_file *nf)
> > > +static bool nfsd_file_lru_add(struct nfsd_file *nf)
> > > {
> > > set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> > > - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
> > > + if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> > > trace_nfsd_file_lru_add(nf);
> > > + return true;
> > > + }
> > > + return false;
> > > }
> > >
> > > -static void nfsd_file_lru_remove(struct nfsd_file *nf)
> > > +static bool nfsd_file_lru_remove(struct nfsd_file *nf)
> > > {
> > > - if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
> > > + if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
> > > trace_nfsd_file_lru_del(nf);
> > > + return true;
> > > + }
> > > + return false;
> > > }
> > >
> > > static void
> > > @@ -409,94 +420,89 @@ nfsd_file_unhash(struct nfsd_file *nf)
> > > return false;
> > > }
> > >
> > > -static void
> > > -nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> > > +struct nfsd_file *
> > > +nfsd_file_get(struct nfsd_file *nf)
> > > {
> > > - trace_nfsd_file_unhash_and_dispose(nf);
> > > + if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> > > + return nf;
> > > + return NULL;
> > > +}
> > > +
> > > +/**
> > > + * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
> > > + * @nf: nfsd_file to be unhashed and queued
> > > + * @dispose: list to which it should be queued
> > > + *
> > > + * Attempt to unhash a nfsd_file and queue it to the given list. Each file
> > > + * will have a reference held on behalf of the list. That reference may come
> > > + * from the LRU, or we may need to take one. If we can't get a reference,
> > > + * ignore it altogether.
> > > + */
> > > +static bool
> > > +nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
> > > +{
> > > + trace_nfsd_file_unhash_and_queue(nf);
> > > if (nfsd_file_unhash(nf)) {
> > > - /* caller must call nfsd_file_dispose_list() later */
> > > - nfsd_file_lru_remove(nf);
> > > + /*
> > > + * If we remove it from the LRU, then just use that
> > > + * reference for the dispose list. Otherwise, we need
> > > + * to take a reference. If that fails, just ignore
> > > + * the file altogether.
> > > + */
> > > + if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
> > > + return false;
> > > list_add(&nf->nf_lru, dispose);
> > > + return true;
> > > }
> > > + return false;
> > > }
> > >
> > > -static void
> > > -nfsd_file_put_noref(struct nfsd_file *nf)
> > > +static bool
> > > +__nfsd_file_put(struct nfsd_file *nf)
> >
> > The return value of this function is never tested.
> > Maybe it should return void.
> >
> > Further, I don't think this is a useful abstraction.
> > I would rather move the refcount_dec_and_test to the caller, and move
> > the lru_remove and unash into nfsd_file_free.
> >
>
> Ok, sounds reasonable.
>
> > > {
> > > - trace_nfsd_file_put(nf);
> > > -
> > > if (refcount_dec_and_test(&nf->nf_ref)) {
> > > - WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
> > > - nfsd_file_lru_remove(nf);
> > > + nfsd_file_unhash(nf);
> > > nfsd_file_free(nf);
> > > + return true;
> > > }
> > > + return false;
> > > }
> > >
> > > -static void
> > > -nfsd_file_unhash_and_put(struct nfsd_file *nf)
> > > -{
> > > - if (nfsd_file_unhash(nf))
> > > - nfsd_file_put_noref(nf);
> > > -}
> > > -
> > > +/**
> > > + * nfsd_file_put - put the reference to a nfsd_file
> > > + * @nf: nfsd_file of which to put the reference
> > > + *
> > > + * Put a reference to a nfsd_file. In the v4 case, we just put the
> > > + * reference immediately. In the v2/3 case, if the reference would be
> > > + * the last one, the put it on the LRU instead to be cleaned up later.
> > > + */
> > > void
> > > nfsd_file_put(struct nfsd_file *nf)
> > > {
> > > - might_sleep();
> > > -
> > > - if (test_bit(NFSD_FILE_GC, &nf->nf_flags))
> > > - nfsd_file_lru_add(nf);
> > > - else if (refcount_read(&nf->nf_ref) == 2)
> > > - nfsd_file_unhash_and_put(nf);
> > > -
> > > - if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > > - nfsd_file_flush(nf);
> > > - nfsd_file_put_noref(nf);
> > > - } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > > - nfsd_file_put_noref(nf);
> > > - nfsd_file_schedule_laundrette();
> > > - } else
> > > - nfsd_file_put_noref(nf);
> > > -}
> > > -
> > > -struct nfsd_file *
> > > -nfsd_file_get(struct nfsd_file *nf)
> > > -{
> > > - if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> > > - return nf;
> > > - return NULL;
> > > -}
> > > -
> > > -static void
> > > -nfsd_file_dispose_list(struct list_head *dispose)
> > > -{
> > > - struct nfsd_file *nf;
> > > + trace_nfsd_file_put(nf);
> > >
> > > - while(!list_empty(dispose)) {
> > > - nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > > - list_del_init(&nf->nf_lru);
> > > - nfsd_file_flush(nf);
> > > - nfsd_file_put_noref(nf);
> > > + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> >
> > I would prefer this included a test on NFSD_FILE_HASHED as well so that
> > if the file isn't hashed, we don't consider it for the lru.
> > This would mean we can simply call nfsd_file_put() for things on the
> > dispose list, rather than needing __nfsd_file_put()
> >
>
> I had an incorrectly reversed test for that in the previous version in
> nfsd_file_lru_add and you mentioned that it was racy. Why would that not
> be the case here?
I accept there is an apparent hypocrisy there :-)
This proposed test isn't racy because of the intent.
The intent isn't to ensure unhashed files never go onto the lru.
The intent is to ensure that if I unhash a file and then call put(),
then the file won't be put on the LRU.
Any code that calls nfsd_file_unhash() will either hold a reference, or
has just dropped the last reference. In either case it can be certain
that no other thread will drop the last reference, so no other thread
can cause the file to be added to the lru.
So in actual fact it is not racy - I was wrong before.
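Putting the two suggestions together, the put path Neil is describing would
look roughly like this (a sketch of the suggestion, not posted code):

void
nfsd_file_put(struct nfsd_file *nf)
{
        trace_nfsd_file_put(nf);

        /*
         * Only consider the LRU transfer while the file is still hashed,
         * so that an unhash-then-put never parks the entry on the LRU.
         */
        if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
            test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
                if (refcount_dec_not_one(&nf->nf_ref) ||
                    nfsd_file_lru_add(nf))
                        return;
        }
        /* Last reference (or the LRU add failed): free it directly. */
        if (refcount_dec_and_test(&nf->nf_ref))
                nfsd_file_free(nf);
}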
>
> > > + /*
> > > + * If this is the last reference (nf_ref == 1), then transfer
> > > + * it to the LRU. If the add to the LRU fails, just put it as
> > > + * usual.
> > > + */
> > > + if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > > + return;
> > > }
> > > + __nfsd_file_put(nf);
> >
> > As suggested above, this would become
> > if (refcount_dec_and_test(&nf->nf_ref))
> > nfsd_file_free(nf);
> >
>
> Ok.
>
> > > }
> > >
> > > static void
> > > -nfsd_file_dispose_list_sync(struct list_head *dispose)
> > > +nfsd_file_dispose_list(struct list_head *dispose)
> > > {
> > > - bool flush = false;
> > > struct nfsd_file *nf;
> > >
> > > while(!list_empty(dispose)) {
> > > nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > > list_del_init(&nf->nf_lru);
> > > - nfsd_file_flush(nf);
> > > - if (!refcount_dec_and_test(&nf->nf_ref))
> > > - continue;
> > > - if (nfsd_file_free(nf))
> > > - flush = true;
> > > + nfsd_file_free(nf);
> > > }
> > > - if (flush)
> > > - flush_delayed_fput();
> > > }
> > >
> > > static void
> > > @@ -566,21 +572,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > > struct list_head *head = arg;
> > > struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
> > >
> > > - /*
> > > - * Do a lockless refcount check. The hashtable holds one reference, so
> > > - * we look to see if anything else has a reference, or if any have
> > > - * been put since the shrinker last ran. Those don't get unhashed and
> > > - * released.
> > > - *
> > > - * Note that in the put path, we set the flag and then decrement the
> > > - * counter. Here we check the counter and then test and clear the flag.
> > > - * That order is deliberate to ensure that we can do this locklessly.
> > > - */
> > > - if (refcount_read(&nf->nf_ref) > 1) {
> > > - list_lru_isolate(lru, &nf->nf_lru);
> > > - trace_nfsd_file_gc_in_use(nf);
> > > - return LRU_REMOVED;
> > > - }
> > > + /* We should only be dealing with v2/3 entries here */
> > > + WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
> > >
> > > /*
> > > * Don't throw out files that are still undergoing I/O or
> > > @@ -591,40 +584,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > > return LRU_SKIP;
> > > }
> > >
> > > + /* If it was recently added to the list, skip it */
> > > if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
> > > trace_nfsd_file_gc_referenced(nf);
> > > return LRU_ROTATE;
> > > }
> > >
> > > - if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > > - trace_nfsd_file_gc_hashed(nf);
> > > - return LRU_SKIP;
> > > + /*
> > > + * Put the reference held on behalf of the LRU. If it wasn't the last
> > > + * one, then just remove it from the LRU and ignore it.
> > > + */
> > > + if (!refcount_dec_and_test(&nf->nf_ref)) {
> > > + trace_nfsd_file_gc_in_use(nf);
> > > + list_lru_isolate(lru, &nf->nf_lru);
> > > + return LRU_REMOVED;
> > > }
> > >
> > > + /* Refcount went to zero. Unhash it and queue it to the dispose list */
> > > + nfsd_file_unhash(nf);
> > > list_lru_isolate_move(lru, &nf->nf_lru, head);
> > > this_cpu_inc(nfsd_file_evictions);
> > > trace_nfsd_file_gc_disposed(nf);
> > > return LRU_REMOVED;
> > > }
> > >
> > > -/*
> > > - * Unhash items on @dispose immediately, then queue them on the
> > > - * disposal workqueue to finish releasing them in the background.
> > > - *
> > > - * cel: Note that between the time list_lru_shrink_walk runs and
> > > - * now, these items are in the hash table but marked unhashed.
> > > - * Why release these outside of lru_cb ? There's no lock ordering
> > > - * problem since lru_cb currently takes no lock.
> > > - */
> > > -static void nfsd_file_gc_dispose_list(struct list_head *dispose)
> > > -{
> > > - struct nfsd_file *nf;
> > > -
> > > - list_for_each_entry(nf, dispose, nf_lru)
> > > - nfsd_file_hash_remove(nf);
> > > - nfsd_file_dispose_list_delayed(dispose);
> > > -}
> > > -
> > > static void
> > > nfsd_file_gc(void)
> > > {
> > > @@ -634,7 +617,7 @@ nfsd_file_gc(void)
> > > ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
> > > &dispose, list_lru_count(&nfsd_file_lru));
> > > trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
> > > - nfsd_file_gc_dispose_list(&dispose);
> > > + nfsd_file_dispose_list_delayed(&dispose);
> > > }
> > >
> > > static void
> > > @@ -659,7 +642,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
> > > ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
> > > nfsd_file_lru_cb, &dispose);
> > > trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
> > > - nfsd_file_gc_dispose_list(&dispose);
> > > + nfsd_file_dispose_list_delayed(&dispose);
> > > return ret;
> > > }
> > >
> > > @@ -670,8 +653,11 @@ static struct shrinker nfsd_file_shrinker = {
> > > };
> > >
> > > /*
> > > - * Find all cache items across all net namespaces that match @inode and
> > > - * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
> > > + * Find all cache items across all net namespaces that match @inode, unhash
> > > + * them, take references and then put them on @dispose if that was successful.
> > > + *
> > > + * The nfsd_file objects on the list will be unhashed, and each will have a
> > > + * reference taken.
> > > */
> > > static unsigned int
> > > __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > > @@ -689,52 +675,58 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > > nfsd_file_rhash_params);
> > > if (!nf)
> > > break;
> > > - nfsd_file_unhash_and_dispose(nf, dispose);
> > > - count++;
> > > +
> > > + if (nfsd_file_unhash_and_queue(nf, dispose))
> > > + count++;
> > > } while (1);
> > > rcu_read_unlock();
> > > return count;
> > > }
> > >
> > > /**
> > > - * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> > > + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> > > * @inode: inode of the file to attempt to remove
> > > *
> > > - * Unhash and put, then flush and fput all cache items associated with @inode.
> > > + * Unhash and put all cache item associated with @inode.
> > > */
> > > -void
> > > -nfsd_file_close_inode_sync(struct inode *inode)
> > > +static unsigned int
> > > +nfsd_file_close_inode(struct inode *inode)
> > > {
> > > - LIST_HEAD(dispose);
> > > + struct nfsd_file *nf;
> > > unsigned int count;
> > > + LIST_HEAD(dispose);
> > >
> > > count = __nfsd_file_close_inode(inode, &dispose);
> > > - trace_nfsd_file_close_inode_sync(inode, count);
> > > - nfsd_file_dispose_list_sync(&dispose);
> > > + trace_nfsd_file_close_inode(inode, count);
> > > + if (count) {
> > > + while(!list_empty(&dispose)) {
> > > + nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
> > > + list_del_init(&nf->nf_lru);
> > > + trace_nfsd_file_closing(nf);
> > > + __nfsd_file_put(nf);
> >
> > If nfsd_file_put() didn't add unhashed files to the lru, this can be
> > nfsd_file_put().
> >
> > > + }
> > > + }
> > > + return count;
> > > }
> > >
> > > /**
> > > - * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> > > + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> > > * @inode: inode of the file to attempt to remove
> > > *
> > > - * Unhash and put all cache item associated with @inode.
> > > + * Unhash and put, then flush and fput all cache items associated with @inode.
> > > */
> > > -static void
> > > -nfsd_file_close_inode(struct inode *inode)
> > > +void
> > > +nfsd_file_close_inode_sync(struct inode *inode)
> > > {
> > > - LIST_HEAD(dispose);
> > > - unsigned int count;
> > > -
> > > - count = __nfsd_file_close_inode(inode, &dispose);
> > > - trace_nfsd_file_close_inode(inode, count);
> > > - nfsd_file_dispose_list_delayed(&dispose);
> > > + if (nfsd_file_close_inode(inode))
> > > + flush_delayed_fput();
> > > }
> > >
> > > /**
> > > * nfsd_file_delayed_close - close unused nfsd_files
> > > * @work: dummy
> > > *
> > > - * Walk the LRU list and close any entries that have not been used since
> > > + * Walk the LRU list and destroy any entries that have not been used since
> > > * the last scan.
> > > */
> > > static void
> > > @@ -892,7 +884,7 @@ __nfsd_file_cache_purge(struct net *net)
> > > while (!IS_ERR_OR_NULL(nf)) {
> > > if (net && nf->nf_net != net)
> > > continue;
> > > - nfsd_file_unhash_and_dispose(nf, &dispose);
> > > + nfsd_file_unhash_and_queue(nf, &dispose);
> > > nf = rhashtable_walk_next(&iter);
> > > }
> > >
> > > @@ -1093,11 +1085,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > goto out;
> > > }
> > > open_retry = false;
> > > - nfsd_file_put_noref(nf);
> > > + __nfsd_file_put(nf);
> >
> > This nf is not hashed, and I think it has no other reference. So we
> > could use nfsd_file_free() - but nfsd_file_put() would be just as good
> > and safer.
> >
> > > goto retry;
> > > }
> > >
> > > - nfsd_file_lru_remove(nf);
> >
> > Hmmm... why not remove from the lru. I guess this justifies patch 2/3,
> > but it might be cleaner to make this
> >
> > if (nfsd_file_lru_remove(nf))
> > nfsd_file_put(nf);
> > ??
> >
>
> Removing from the LRU means putting a reference now. The last "put" of a
> nfsd_file can be rather expensive (you might need to flush data, and
> issue a close()).
True, but irrelevant. nfsd_file_do_acquire() already holds a reference.
If it succeeds at removing from the LRU, it now holds 2 references. If
it puts one, then it won't be that last "put", and so will be cheap.
I don't object to the way you have done it - if ! lru_remove then get -
but it isn't necessary. You can just do the get - then if lru_remove,
do a put.
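Concretely, the cache-hit path could read like this (sketch only; the lookup
has already taken one reference on nf at this point):

        /*
         * We already hold a reference, so if we manage to pull the entry
         * off the LRU, dropping the LRU's reference here can never be the
         * expensive last put.
         */
        if (nfsd_file_lru_remove(nf))
                nfsd_file_put(nf);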
>
> In this particular codepath, that's not so much a danger, but avoiding
> excess "put" calls is still a good thing to do. That's the main reason
> I've tried to "transfer" references to and from the LRU where possible.
>
> > > this_cpu_inc(nfsd_file_cache_hits);
> > >
> > > status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
> > > @@ -1107,7 +1098,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > this_cpu_inc(nfsd_file_acquisitions);
> > > *pnf = nf;
> > > } else {
> > > - nfsd_file_put(nf);
> > > + __nfsd_file_put(nf);
> >
> > I don't see the justification for this change.
> > If status == nfserr_jukebox, then it is OK.
> > If status is whatever we might get from break_lease(), then it seems
> > wrong.
> > If we modify nfsd_file_put() as I suggest, it will handle both cases.
> >
> >
>
> The justification is that when we're dealing with an error from an open,
> we don't want to put the nfsd_file onto the LRU. So, a direct call to
> __nfsd_file_put is what's needed here.
Maybe... I guess my concern arises from the fact that I'm unclear on how
break_lease() might fail. If it is a transitory failure then dropping
from the lru doesn't seem appropriate. Maybe I should refresh my
understanding of break_lease() failure modes.
>
> I'll plan to open-code those like you suggest in the next iteration.
>
> > > nf = NULL;
> > > }
> > >
> > > @@ -1134,7 +1125,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > * then unhash.
> > > */
> > > if (status != nfs_ok || key.inode->i_nlink == 0)
> > > - nfsd_file_unhash_and_put(nf);
> > > + nfsd_file_unhash(nf);
> > > clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
> > > smp_mb__after_atomic();
> > > wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
> > > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > > index b09ab4f92d43..a44ded06af87 100644
> > > --- a/fs/nfsd/trace.h
> > > +++ b/fs/nfsd/trace.h
> > > @@ -903,10 +903,11 @@ DEFINE_EVENT(nfsd_file_class, name, \
> > > TP_PROTO(struct nfsd_file *nf), \
> > > TP_ARGS(nf))
> > >
> > > -DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
> > > +DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
> > > DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
> > > DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
> > > -DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose);
> > > +DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
> > > +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
> > >
> > > TRACE_EVENT(nfsd_file_alloc,
> > > TP_PROTO(
> > > --
> > > 2.37.3
> > >
> > >
> >
> > Thanks,
> > NeilBrown
>
> --
> Jeff Layton <[email protected]>
>
Thanks a lot,
I quite like your latest version.
NeilBrown
On Sat, 29 Oct 2022, NeilBrown wrote:
> On Fri, 28 Oct 2022, Jeff Layton wrote:
> > On Fri, 2022-10-28 at 09:51 +1100, NeilBrown wrote:
> > > On Fri, 28 Oct 2022, Jeff Layton wrote:
> > > > The filecache refcounting is a bit non-standard for something searchable
> > > > by RCU, in that we maintain a sentinel reference while it's hashed. This
> > > > in turn requires that we have to do things differently in the "put"
> > > > depending on whether its hashed, which we believe to have led to races.
> > > >
> > > > There are other problems in here too. nfsd_file_close_inode_sync can end
> > > > up freeing an nfsd_file while there are still outstanding references to
> > > > it, and the handling
> > >
> > > -EINTR ??? (you got interrupted and didn't finish the sentence?)
> > >
> >
> > Yes, I meant to go back and flesh that out, and forgot before posting.
> >
> > > >
> > > > Rework the code so that the refcount is what drives the lifecycle. When
> > > > the refcount goes to zero, then unhash and rcu free the object.
> > > >
> > > > Signed-off-by: Jeff Layton <[email protected]>
> > > > ---
> > > > fs/nfsd/filecache.c | 291 +++++++++++++++++++++-----------------------
> > > > fs/nfsd/trace.h | 5 +-
> > > > 2 files changed, 144 insertions(+), 152 deletions(-)
> > > >
> > > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > > > index 98c6b5f51bc8..e63534f4b9f8 100644
> > > > --- a/fs/nfsd/filecache.c
> > > > +++ b/fs/nfsd/filecache.c
> > > > @@ -1,6 +1,12 @@
> > > > // SPDX-License-Identifier: GPL-2.0
> > > > /*
> > > > * The NFSD open file cache.
> > > > + *
> > > > + * Each nfsd_file is created in response to client activity -- either regular
> > > > + * file I/O for v2/v3, or opening a file for v4. Files opened via v4 are
> > > > + * cleaned up as soon as their refcount goes to 0. Entries for v2/v3 are
> > > > + * flagged with NFSD_FILE_GC. On their last put, they are added to the LRU for
> > > > + * eventual disposal if they aren't used again within a short time period.
> > > > */
> > > >
> > > > #include <linux/hash.h>
> > > > @@ -302,31 +308,43 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > > > if (key->gc)
> > > > __set_bit(NFSD_FILE_GC, &nf->nf_flags);
> > > > nf->nf_inode = key->inode;
> > > > - /* nf_ref is pre-incremented for hash table */
> > > > - refcount_set(&nf->nf_ref, 2);
> > > > + refcount_set(&nf->nf_ref, 1);
> > > > nf->nf_may = key->need;
> > > > nf->nf_mark = NULL;
> > > > }
> > > > return nf;
> > > > }
> > > >
> > > > -static bool
> > > > +static void
> > > > +nfsd_file_flush(struct nfsd_file *nf)
> > > > +{
> > > > + struct file *file = nf->nf_file;
> > > > +
> > > > + if (!file || !(file->f_mode & FMODE_WRITE))
> > > > + return;
> > > > + this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
> > > > + if (vfs_fsync(file, 1) != 0)
> > > > + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > > > +}
> > > > +
> > > > +static void
> > > > nfsd_file_free(struct nfsd_file *nf)
> > > > {
> > > > s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
> > > > - bool flush = false;
> > > > +
> > > > + trace_nfsd_file_free(nf);
> > > >
> > > > this_cpu_inc(nfsd_file_releases);
> > > > this_cpu_add(nfsd_file_total_age, age);
> > > >
> > > > - trace_nfsd_file_put_final(nf);
> > > > + nfsd_file_flush(nf);
> > > > +
> > > > if (nf->nf_mark)
> > > > nfsd_file_mark_put(nf->nf_mark);
> > > > if (nf->nf_file) {
> > > > get_file(nf->nf_file);
> > > > filp_close(nf->nf_file, NULL);
> > > > fput(nf->nf_file);
> > > > - flush = true;
> > > > }
> > > >
> > > > /*
> > > > @@ -334,10 +352,9 @@ nfsd_file_free(struct nfsd_file *nf)
> > > > * WARN and leak it to preserve system stability.
> > > > */
> > > > if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
> > > > - return flush;
> > > > + return;
> > > >
> > > > call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
> > > > - return flush;
> > > > }
> > > >
> > > > static bool
> > > > @@ -363,29 +380,23 @@ nfsd_file_check_write_error(struct nfsd_file *nf)
> > > > return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
> > > > }
> > > >
> > > > -static void
> > > > -nfsd_file_flush(struct nfsd_file *nf)
> > > > -{
> > > > - struct file *file = nf->nf_file;
> > > > -
> > > > - if (!file || !(file->f_mode & FMODE_WRITE))
> > > > - return;
> > > > - this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
> > > > - if (vfs_fsync(file, 1) != 0)
> > > > - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > > > -}
> > > > -
> > > > -static void nfsd_file_lru_add(struct nfsd_file *nf)
> > > > +static bool nfsd_file_lru_add(struct nfsd_file *nf)
> > > > {
> > > > set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> > > > - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
> > > > + if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> > > > trace_nfsd_file_lru_add(nf);
> > > > + return true;
> > > > + }
> > > > + return false;
> > > > }
> > > >
> > > > -static void nfsd_file_lru_remove(struct nfsd_file *nf)
> > > > +static bool nfsd_file_lru_remove(struct nfsd_file *nf)
> > > > {
> > > > - if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
> > > > + if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
> > > > trace_nfsd_file_lru_del(nf);
> > > > + return true;
> > > > + }
> > > > + return false;
> > > > }
> > > >
> > > > static void
> > > > @@ -409,94 +420,89 @@ nfsd_file_unhash(struct nfsd_file *nf)
> > > > return false;
> > > > }
> > > >
> > > > -static void
> > > > -nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> > > > +struct nfsd_file *
> > > > +nfsd_file_get(struct nfsd_file *nf)
> > > > {
> > > > - trace_nfsd_file_unhash_and_dispose(nf);
> > > > + if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> > > > + return nf;
> > > > + return NULL;
> > > > +}
> > > > +
> > > > +/**
> > > > + * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
> > > > + * @nf: nfsd_file to be unhashed and queued
> > > > + * @dispose: list to which it should be queued
> > > > + *
> > > > + * Attempt to unhash a nfsd_file and queue it to the given list. Each file
> > > > + * will have a reference held on behalf of the list. That reference may come
> > > > + * from the LRU, or we may need to take one. If we can't get a reference,
> > > > + * ignore it altogether.
> > > > + */
> > > > +static bool
> > > > +nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
> > > > +{
> > > > + trace_nfsd_file_unhash_and_queue(nf);
> > > > if (nfsd_file_unhash(nf)) {
> > > > - /* caller must call nfsd_file_dispose_list() later */
> > > > - nfsd_file_lru_remove(nf);
> > > > + /*
> > > > + * If we remove it from the LRU, then just use that
> > > > + * reference for the dispose list. Otherwise, we need
> > > > + * to take a reference. If that fails, just ignore
> > > > + * the file altogether.
> > > > + */
> > > > + if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
> > > > + return false;
> > > > list_add(&nf->nf_lru, dispose);
> > > > + return true;
> > > > }
> > > > + return false;
> > > > }
> > > >
> > > > -static void
> > > > -nfsd_file_put_noref(struct nfsd_file *nf)
> > > > +static bool
> > > > +__nfsd_file_put(struct nfsd_file *nf)
> > >
> > > The return value of this function is never tested.
> > > Maybe it should return void.
> > >
> > > Further, I don't think this is a useful abstraction.
> > > I would rather move the refcount_dec_and_test to the caller, and move
> > > the lru_remove and unash into nfsd_file_free.
> > >
> >
> > Ok, sounds reasonable.
> >
> > > > {
> > > > - trace_nfsd_file_put(nf);
> > > > -
> > > > if (refcount_dec_and_test(&nf->nf_ref)) {
> > > > - WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
> > > > - nfsd_file_lru_remove(nf);
> > > > + nfsd_file_unhash(nf);
> > > > nfsd_file_free(nf);
> > > > + return true;
> > > > }
> > > > + return false;
> > > > }
> > > >
> > > > -static void
> > > > -nfsd_file_unhash_and_put(struct nfsd_file *nf)
> > > > -{
> > > > - if (nfsd_file_unhash(nf))
> > > > - nfsd_file_put_noref(nf);
> > > > -}
> > > > -
> > > > +/**
> > > > + * nfsd_file_put - put the reference to a nfsd_file
> > > > + * @nf: nfsd_file of which to put the reference
> > > > + *
> > > > + * Put a reference to a nfsd_file. In the v4 case, we just put the
> > > > + * reference immediately. In the v2/3 case, if the reference would be
> > > > + * the last one, the put it on the LRU instead to be cleaned up later.
> > > > + */
> > > > void
> > > > nfsd_file_put(struct nfsd_file *nf)
> > > > {
> > > > - might_sleep();
> > > > -
> > > > - if (test_bit(NFSD_FILE_GC, &nf->nf_flags))
> > > > - nfsd_file_lru_add(nf);
> > > > - else if (refcount_read(&nf->nf_ref) == 2)
> > > > - nfsd_file_unhash_and_put(nf);
> > > > -
> > > > - if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > > > - nfsd_file_flush(nf);
> > > > - nfsd_file_put_noref(nf);
> > > > - } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > > > - nfsd_file_put_noref(nf);
> > > > - nfsd_file_schedule_laundrette();
> > > > - } else
> > > > - nfsd_file_put_noref(nf);
> > > > -}
> > > > -
> > > > -struct nfsd_file *
> > > > -nfsd_file_get(struct nfsd_file *nf)
> > > > -{
> > > > - if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> > > > - return nf;
> > > > - return NULL;
> > > > -}
> > > > -
> > > > -static void
> > > > -nfsd_file_dispose_list(struct list_head *dispose)
> > > > -{
> > > > - struct nfsd_file *nf;
> > > > + trace_nfsd_file_put(nf);
> > > >
> > > > - while(!list_empty(dispose)) {
> > > > - nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > > > - list_del_init(&nf->nf_lru);
> > > > - nfsd_file_flush(nf);
> > > > - nfsd_file_put_noref(nf);
> > > > + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > >
> > > I would prefer this included a test on NFSD_FILE_HASHED as well so that
> > > if the file isn't hashed, we don't consider it for the lru.
> > > This would mean we can simply call nfsd_file_put() for things on the
> > > dispose list, rather than needing __nfsd_file_put()
> > >
> >
> > I had an incorrectly reversed test for that in the previous version in
> > nfsd_file_lru_add and you mentioned that it was racy. Why would that not
> > be the case here?
>
> I accept there is an apparent hypocrisy there :-)
> This proposed test isn't racy because of the intent.
> The intent isn't to ensure unhashed files never go onto the lru.
> The intent is to ensure that if I unhash a file and then call put(),
> then the file won't be put on the LRU.
>
> Any code that calls nfsd_file_unhash() will either hold a reference, or
> has just dropped the last reference. In either case it can be certain
> that no other thread will drop the last reference, so no other thread
> can cause the file to be added to the lru.
This last bit is wrong. The logic would only hold if the test on HASHED
was performed after refcount_dec_and_test has succeeded. As we test
before, I think it could still race.
I don't think that race is important though. Maybe something will get
added to the lru after it has been unhashed. But any code that wants it
unhashed and gone from the lru will explicitly do that and it will
succeed.
In the worst case, an unhashed file will remain on the lru for a while,
then get discarded.
Thanks,
NeilBrown
>
> So in actual fact it is not racy - I was wrong before.
>
> >
> > > > + /*
> > > > + * If this is the last reference (nf_ref == 1), then transfer
> > > > + * it to the LRU. If the add to the LRU fails, just put it as
> > > > + * usual.
> > > > + */
> > > > + if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > > > + return;
> > > > }
> > > > + __nfsd_file_put(nf);
> > >
> > > As suggested above, this would become
> > > if (refcount_dec_and_test(&nf->nf_ref))
> > > nfsd_file_free(nf);
> > >
> >
> > Ok.
> >
> > > > }
> > > >
> > > > static void
> > > > -nfsd_file_dispose_list_sync(struct list_head *dispose)
> > > > +nfsd_file_dispose_list(struct list_head *dispose)
> > > > {
> > > > - bool flush = false;
> > > > struct nfsd_file *nf;
> > > >
> > > > while(!list_empty(dispose)) {
> > > > nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > > > list_del_init(&nf->nf_lru);
> > > > - nfsd_file_flush(nf);
> > > > - if (!refcount_dec_and_test(&nf->nf_ref))
> > > > - continue;
> > > > - if (nfsd_file_free(nf))
> > > > - flush = true;
> > > > + nfsd_file_free(nf);
> > > > }
> > > > - if (flush)
> > > > - flush_delayed_fput();
> > > > }
> > > >
> > > > static void
> > > > @@ -566,21 +572,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > > > struct list_head *head = arg;
> > > > struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
> > > >
> > > > - /*
> > > > - * Do a lockless refcount check. The hashtable holds one reference, so
> > > > - * we look to see if anything else has a reference, or if any have
> > > > - * been put since the shrinker last ran. Those don't get unhashed and
> > > > - * released.
> > > > - *
> > > > - * Note that in the put path, we set the flag and then decrement the
> > > > - * counter. Here we check the counter and then test and clear the flag.
> > > > - * That order is deliberate to ensure that we can do this locklessly.
> > > > - */
> > > > - if (refcount_read(&nf->nf_ref) > 1) {
> > > > - list_lru_isolate(lru, &nf->nf_lru);
> > > > - trace_nfsd_file_gc_in_use(nf);
> > > > - return LRU_REMOVED;
> > > > - }
> > > > + /* We should only be dealing with v2/3 entries here */
> > > > + WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
> > > >
> > > > /*
> > > > * Don't throw out files that are still undergoing I/O or
> > > > @@ -591,40 +584,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > > > return LRU_SKIP;
> > > > }
> > > >
> > > > + /* If it was recently added to the list, skip it */
> > > > if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
> > > > trace_nfsd_file_gc_referenced(nf);
> > > > return LRU_ROTATE;
> > > > }
> > > >
> > > > - if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > > > - trace_nfsd_file_gc_hashed(nf);
> > > > - return LRU_SKIP;
> > > > + /*
> > > > + * Put the reference held on behalf of the LRU. If it wasn't the last
> > > > + * one, then just remove it from the LRU and ignore it.
> > > > + */
> > > > + if (!refcount_dec_and_test(&nf->nf_ref)) {
> > > > + trace_nfsd_file_gc_in_use(nf);
> > > > + list_lru_isolate(lru, &nf->nf_lru);
> > > > + return LRU_REMOVED;
> > > > }
> > > >
> > > > + /* Refcount went to zero. Unhash it and queue it to the dispose list */
> > > > + nfsd_file_unhash(nf);
> > > > list_lru_isolate_move(lru, &nf->nf_lru, head);
> > > > this_cpu_inc(nfsd_file_evictions);
> > > > trace_nfsd_file_gc_disposed(nf);
> > > > return LRU_REMOVED;
> > > > }
> > > >
> > > > -/*
> > > > - * Unhash items on @dispose immediately, then queue them on the
> > > > - * disposal workqueue to finish releasing them in the background.
> > > > - *
> > > > - * cel: Note that between the time list_lru_shrink_walk runs and
> > > > - * now, these items are in the hash table but marked unhashed.
> > > > - * Why release these outside of lru_cb ? There's no lock ordering
> > > > - * problem since lru_cb currently takes no lock.
> > > > - */
> > > > -static void nfsd_file_gc_dispose_list(struct list_head *dispose)
> > > > -{
> > > > - struct nfsd_file *nf;
> > > > -
> > > > - list_for_each_entry(nf, dispose, nf_lru)
> > > > - nfsd_file_hash_remove(nf);
> > > > - nfsd_file_dispose_list_delayed(dispose);
> > > > -}
> > > > -
> > > > static void
> > > > nfsd_file_gc(void)
> > > > {
> > > > @@ -634,7 +617,7 @@ nfsd_file_gc(void)
> > > > ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
> > > > &dispose, list_lru_count(&nfsd_file_lru));
> > > > trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
> > > > - nfsd_file_gc_dispose_list(&dispose);
> > > > + nfsd_file_dispose_list_delayed(&dispose);
> > > > }
> > > >
> > > > static void
> > > > @@ -659,7 +642,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
> > > > ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
> > > > nfsd_file_lru_cb, &dispose);
> > > > trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
> > > > - nfsd_file_gc_dispose_list(&dispose);
> > > > + nfsd_file_dispose_list_delayed(&dispose);
> > > > return ret;
> > > > }
> > > >
> > > > @@ -670,8 +653,11 @@ static struct shrinker nfsd_file_shrinker = {
> > > > };
> > > >
> > > > /*
> > > > - * Find all cache items across all net namespaces that match @inode and
> > > > - * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
> > > > + * Find all cache items across all net namespaces that match @inode, unhash
> > > > + * them, take references and then put them on @dispose if that was successful.
> > > > + *
> > > > + * The nfsd_file objects on the list will be unhashed, and each will have a
> > > > + * reference taken.
> > > > */
> > > > static unsigned int
> > > > __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > > > @@ -689,52 +675,58 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > > > nfsd_file_rhash_params);
> > > > if (!nf)
> > > > break;
> > > > - nfsd_file_unhash_and_dispose(nf, dispose);
> > > > - count++;
> > > > +
> > > > + if (nfsd_file_unhash_and_queue(nf, dispose))
> > > > + count++;
> > > > } while (1);
> > > > rcu_read_unlock();
> > > > return count;
> > > > }
> > > >
> > > > /**
> > > > - * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> > > > + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> > > > * @inode: inode of the file to attempt to remove
> > > > *
> > > > - * Unhash and put, then flush and fput all cache items associated with @inode.
> > > > + * Unhash and put all cache item associated with @inode.
> > > > */
> > > > -void
> > > > -nfsd_file_close_inode_sync(struct inode *inode)
> > > > +static unsigned int
> > > > +nfsd_file_close_inode(struct inode *inode)
> > > > {
> > > > - LIST_HEAD(dispose);
> > > > + struct nfsd_file *nf;
> > > > unsigned int count;
> > > > + LIST_HEAD(dispose);
> > > >
> > > > count = __nfsd_file_close_inode(inode, &dispose);
> > > > - trace_nfsd_file_close_inode_sync(inode, count);
> > > > - nfsd_file_dispose_list_sync(&dispose);
> > > > + trace_nfsd_file_close_inode(inode, count);
> > > > + if (count) {
> > > > + while(!list_empty(&dispose)) {
> > > > + nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
> > > > + list_del_init(&nf->nf_lru);
> > > > + trace_nfsd_file_closing(nf);
> > > > + __nfsd_file_put(nf);
> > >
> > > If nfsd_file_put() didn't add unhashed files to the lru, this can be
> > > nfsd_file_put().
> > >
> > > > + }
> > > > + }
> > > > + return count;
> > > > }
> > > >
> > > > /**
> > > > - * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> > > > + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> > > > * @inode: inode of the file to attempt to remove
> > > > *
> > > > - * Unhash and put all cache item associated with @inode.
> > > > + * Unhash and put, then flush and fput all cache items associated with @inode.
> > > > */
> > > > -static void
> > > > -nfsd_file_close_inode(struct inode *inode)
> > > > +void
> > > > +nfsd_file_close_inode_sync(struct inode *inode)
> > > > {
> > > > - LIST_HEAD(dispose);
> > > > - unsigned int count;
> > > > -
> > > > - count = __nfsd_file_close_inode(inode, &dispose);
> > > > - trace_nfsd_file_close_inode(inode, count);
> > > > - nfsd_file_dispose_list_delayed(&dispose);
> > > > + if (nfsd_file_close_inode(inode))
> > > > + flush_delayed_fput();
> > > > }
> > > >
> > > > /**
> > > > * nfsd_file_delayed_close - close unused nfsd_files
> > > > * @work: dummy
> > > > *
> > > > - * Walk the LRU list and close any entries that have not been used since
> > > > + * Walk the LRU list and destroy any entries that have not been used since
> > > > * the last scan.
> > > > */
> > > > static void
> > > > @@ -892,7 +884,7 @@ __nfsd_file_cache_purge(struct net *net)
> > > > while (!IS_ERR_OR_NULL(nf)) {
> > > > if (net && nf->nf_net != net)
> > > > continue;
> > > > - nfsd_file_unhash_and_dispose(nf, &dispose);
> > > > + nfsd_file_unhash_and_queue(nf, &dispose);
> > > > nf = rhashtable_walk_next(&iter);
> > > > }
> > > >
> > > > @@ -1093,11 +1085,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > goto out;
> > > > }
> > > > open_retry = false;
> > > > - nfsd_file_put_noref(nf);
> > > > + __nfsd_file_put(nf);
> > >
> > > This nf is not hashed, and I think it has no other reference. So we
> > > could use nfsd_file_free() - but nfsd_file_put() would be just as good
> > > and safer.
> > >
> > > > goto retry;
> > > > }
> > > >
> > > > - nfsd_file_lru_remove(nf);
> > >
> > > Hmmm... why not remove from the lru. I guess this justifies patch 2/3,
> > > but it might be cleaner to make this
> > >
> > > if (nfsd_file_lru_remove(nf))
> > > nfsd_file_put(nf);
> > > ??
> > >
> >
> > Removing from the LRU means putting a reference now. The last "put" of a
> > nfsd_file can be rather expensive (you might need to flush data, and
> > issue a close()).
>
> True, but irrelevant. nfsd_file_do_acquire() already holds a reference.
> If it succeeds at removing from the LRU, it now holds 2 references. If
> it puts one, then it won't be that last "put", and so will be cheap.
>
> I don't object to the way you have done it - if ! lru_remove then get -
> but it isn't necessary. You can just do the get - then if lru_remove,
> do a put.
>
> >
> > In this particular codepath, that's not so much a danger, but avoiding
> > excess "put" calls is still a good thing to do. That's the main reason
> > I've tried to "transfer" references to and from the LRU where possible.
> >
> > > > this_cpu_inc(nfsd_file_cache_hits);
> > > >
> > > > status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
> > > > @@ -1107,7 +1098,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > this_cpu_inc(nfsd_file_acquisitions);
> > > > *pnf = nf;
> > > > } else {
> > > > - nfsd_file_put(nf);
> > > > + __nfsd_file_put(nf);
> > >
> > > I don't see the justification for this change.
> > > If status == nfserr_jukebox, then it is OK.
> > > If status is whatever we might get from break_lease(), then it seems
> > > wrong.
> > > If we modify nfsd_file_put() as I suggest, it will handle both cases.
> > >
> > >
> >
> > The justification is that when we're dealing with an error from an open,
> > we don't want to put the nfsd_file onto the LRU. So, a direct call to
> > __nfsd_file_put is what's needed here.
>
> Maybe... I guess my concern arises from the fact that I'm unclear on how
> break_lease() might fail. If it is a transitory failure then dropping
> from the lru doesn't seem appropriate. Maybe I should refresh my
> understanding of break_lease() failure modes.
>
> >
> > I'll plan to open-code those like you suggest in the next iteration.
> >
> > > > nf = NULL;
> > > > }
> > > >
> > > > @@ -1134,7 +1125,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > * then unhash.
> > > > */
> > > > if (status != nfs_ok || key.inode->i_nlink == 0)
> > > > - nfsd_file_unhash_and_put(nf);
> > > > + nfsd_file_unhash(nf);
> > > > clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
> > > > smp_mb__after_atomic();
> > > > wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
> > > > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > > > index b09ab4f92d43..a44ded06af87 100644
> > > > --- a/fs/nfsd/trace.h
> > > > +++ b/fs/nfsd/trace.h
> > > > @@ -903,10 +903,11 @@ DEFINE_EVENT(nfsd_file_class, name, \
> > > > TP_PROTO(struct nfsd_file *nf), \
> > > > TP_ARGS(nf))
> > > >
> > > > -DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
> > > > +DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
> > > > DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
> > > > DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
> > > > -DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose);
> > > > +DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
> > > > +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
> > > >
> > > > TRACE_EVENT(nfsd_file_alloc,
> > > > TP_PROTO(
> > > > --
> > > > 2.37.3
> > > >
> > > >
> > >
> > > Thanks,
> > > NeilBrown
> >
> > --
> > Jeff Layton <[email protected]>
> >
>
> Thanks a lot,
> I quite like your latest version.
>
> NeilBrown
>
On Sat, 2022-10-29 at 09:14 +1100, NeilBrown wrote:
> On Fri, 28 Oct 2022, Jeff Layton wrote:
> > On Fri, 2022-10-28 at 09:51 +1100, NeilBrown wrote:
> > > On Fri, 28 Oct 2022, Jeff Layton wrote:
> > > > The filecache refcounting is a bit non-standard for something searchable
> > > > by RCU, in that we maintain a sentinel reference while it's hashed. This
> > > > in turn requires that we have to do things differently in the "put"
> > > > depending on whether its hashed, which we believe to have led to races.
> > > >
> > > > There are other problems in here too. nfsd_file_close_inode_sync can end
> > > > up freeing an nfsd_file while there are still outstanding references to
> > > > it, and the handling
> > >
> > > -EINTR ??? (you got interrupted and didn't finish the sentence?)
> > >
> >
> > Yes, I meant to go back and flesh that out, and forgot before posting.
> >
> > > >
> > > > Rework the code so that the refcount is what drives the lifecycle. When
> > > > the refcount goes to zero, then unhash and rcu free the object.
> > > >
> > > > Signed-off-by: Jeff Layton <[email protected]>
> > > > ---
> > > > fs/nfsd/filecache.c | 291 +++++++++++++++++++++-----------------------
> > > > fs/nfsd/trace.h | 5 +-
> > > > 2 files changed, 144 insertions(+), 152 deletions(-)
> > > >
> > > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > > > index 98c6b5f51bc8..e63534f4b9f8 100644
> > > > --- a/fs/nfsd/filecache.c
> > > > +++ b/fs/nfsd/filecache.c
> > > > @@ -1,6 +1,12 @@
> > > > // SPDX-License-Identifier: GPL-2.0
> > > > /*
> > > > * The NFSD open file cache.
> > > > + *
> > > > + * Each nfsd_file is created in response to client activity -- either regular
> > > > + * file I/O for v2/v3, or opening a file for v4. Files opened via v4 are
> > > > + * cleaned up as soon as their refcount goes to 0. Entries for v2/v3 are
> > > > + * flagged with NFSD_FILE_GC. On their last put, they are added to the LRU for
> > > > + * eventual disposal if they aren't used again within a short time period.
> > > > */
> > > >
> > > > #include <linux/hash.h>
> > > > @@ -302,31 +308,43 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > > > if (key->gc)
> > > > __set_bit(NFSD_FILE_GC, &nf->nf_flags);
> > > > nf->nf_inode = key->inode;
> > > > - /* nf_ref is pre-incremented for hash table */
> > > > - refcount_set(&nf->nf_ref, 2);
> > > > + refcount_set(&nf->nf_ref, 1);
> > > > nf->nf_may = key->need;
> > > > nf->nf_mark = NULL;
> > > > }
> > > > return nf;
> > > > }
> > > >
> > > > -static bool
> > > > +static void
> > > > +nfsd_file_flush(struct nfsd_file *nf)
> > > > +{
> > > > + struct file *file = nf->nf_file;
> > > > +
> > > > + if (!file || !(file->f_mode & FMODE_WRITE))
> > > > + return;
> > > > + this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
> > > > + if (vfs_fsync(file, 1) != 0)
> > > > + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > > > +}
> > > > +
> > > > +static void
> > > > nfsd_file_free(struct nfsd_file *nf)
> > > > {
> > > > s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
> > > > - bool flush = false;
> > > > +
> > > > + trace_nfsd_file_free(nf);
> > > >
> > > > this_cpu_inc(nfsd_file_releases);
> > > > this_cpu_add(nfsd_file_total_age, age);
> > > >
> > > > - trace_nfsd_file_put_final(nf);
> > > > + nfsd_file_flush(nf);
> > > > +
> > > > if (nf->nf_mark)
> > > > nfsd_file_mark_put(nf->nf_mark);
> > > > if (nf->nf_file) {
> > > > get_file(nf->nf_file);
> > > > filp_close(nf->nf_file, NULL);
> > > > fput(nf->nf_file);
> > > > - flush = true;
> > > > }
> > > >
> > > > /*
> > > > @@ -334,10 +352,9 @@ nfsd_file_free(struct nfsd_file *nf)
> > > > * WARN and leak it to preserve system stability.
> > > > */
> > > > if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
> > > > - return flush;
> > > > + return;
> > > >
> > > > call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
> > > > - return flush;
> > > > }
> > > >
> > > > static bool
> > > > @@ -363,29 +380,23 @@ nfsd_file_check_write_error(struct nfsd_file *nf)
> > > > return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
> > > > }
> > > >
> > > > -static void
> > > > -nfsd_file_flush(struct nfsd_file *nf)
> > > > -{
> > > > - struct file *file = nf->nf_file;
> > > > -
> > > > - if (!file || !(file->f_mode & FMODE_WRITE))
> > > > - return;
> > > > - this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
> > > > - if (vfs_fsync(file, 1) != 0)
> > > > - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > > > -}
> > > > -
> > > > -static void nfsd_file_lru_add(struct nfsd_file *nf)
> > > > +static bool nfsd_file_lru_add(struct nfsd_file *nf)
> > > > {
> > > > set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> > > > - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
> > > > + if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> > > > trace_nfsd_file_lru_add(nf);
> > > > + return true;
> > > > + }
> > > > + return false;
> > > > }
> > > >
> > > > -static void nfsd_file_lru_remove(struct nfsd_file *nf)
> > > > +static bool nfsd_file_lru_remove(struct nfsd_file *nf)
> > > > {
> > > > - if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
> > > > + if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
> > > > trace_nfsd_file_lru_del(nf);
> > > > + return true;
> > > > + }
> > > > + return false;
> > > > }
> > > >
> > > > static void
> > > > @@ -409,94 +420,89 @@ nfsd_file_unhash(struct nfsd_file *nf)
> > > > return false;
> > > > }
> > > >
> > > > -static void
> > > > -nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> > > > +struct nfsd_file *
> > > > +nfsd_file_get(struct nfsd_file *nf)
> > > > {
> > > > - trace_nfsd_file_unhash_and_dispose(nf);
> > > > + if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> > > > + return nf;
> > > > + return NULL;
> > > > +}
> > > > +
> > > > +/**
> > > > + * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
> > > > + * @nf: nfsd_file to be unhashed and queued
> > > > + * @dispose: list to which it should be queued
> > > > + *
> > > > + * Attempt to unhash a nfsd_file and queue it to the given list. Each file
> > > > + * will have a reference held on behalf of the list. That reference may come
> > > > + * from the LRU, or we may need to take one. If we can't get a reference,
> > > > + * ignore it altogether.
> > > > + */
> > > > +static bool
> > > > +nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
> > > > +{
> > > > + trace_nfsd_file_unhash_and_queue(nf);
> > > > if (nfsd_file_unhash(nf)) {
> > > > - /* caller must call nfsd_file_dispose_list() later */
> > > > - nfsd_file_lru_remove(nf);
> > > > + /*
> > > > + * If we remove it from the LRU, then just use that
> > > > + * reference for the dispose list. Otherwise, we need
> > > > + * to take a reference. If that fails, just ignore
> > > > + * the file altogether.
> > > > + */
> > > > + if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
> > > > + return false;
> > > > list_add(&nf->nf_lru, dispose);
> > > > + return true;
> > > > }
> > > > + return false;
> > > > }
> > > >
> > > > -static void
> > > > -nfsd_file_put_noref(struct nfsd_file *nf)
> > > > +static bool
> > > > +__nfsd_file_put(struct nfsd_file *nf)
> > >
> > > The return value of this function is never tested.
> > > Maybe it should return void.
> > >
> > > Further, I don't think this is a useful abstraction.
> > > I would rather move the refcount_dec_and_test to the caller, and move
> > > the lru_remove and unhash into nfsd_file_free.
> > >
> >
> > Ok, sounds reasonable.
> >
> > > > {
> > > > - trace_nfsd_file_put(nf);
> > > > -
> > > > if (refcount_dec_and_test(&nf->nf_ref)) {
> > > > - WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
> > > > - nfsd_file_lru_remove(nf);
> > > > + nfsd_file_unhash(nf);
> > > > nfsd_file_free(nf);
> > > > + return true;
> > > > }
> > > > + return false;
> > > > }
> > > >
> > > > -static void
> > > > -nfsd_file_unhash_and_put(struct nfsd_file *nf)
> > > > -{
> > > > - if (nfsd_file_unhash(nf))
> > > > - nfsd_file_put_noref(nf);
> > > > -}
> > > > -
> > > > +/**
> > > > + * nfsd_file_put - put the reference to a nfsd_file
> > > > + * @nf: nfsd_file of which to put the reference
> > > > + *
> > > > + * Put a reference to a nfsd_file. In the v4 case, we just put the
> > > > + * reference immediately. In the v2/3 case, if the reference would be
> > > > + * the last one, the put it on the LRU instead to be cleaned up later.
> > > > + */
> > > > void
> > > > nfsd_file_put(struct nfsd_file *nf)
> > > > {
> > > > - might_sleep();
> > > > -
> > > > - if (test_bit(NFSD_FILE_GC, &nf->nf_flags))
> > > > - nfsd_file_lru_add(nf);
> > > > - else if (refcount_read(&nf->nf_ref) == 2)
> > > > - nfsd_file_unhash_and_put(nf);
> > > > -
> > > > - if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > > > - nfsd_file_flush(nf);
> > > > - nfsd_file_put_noref(nf);
> > > > - } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > > > - nfsd_file_put_noref(nf);
> > > > - nfsd_file_schedule_laundrette();
> > > > - } else
> > > > - nfsd_file_put_noref(nf);
> > > > -}
> > > > -
> > > > -struct nfsd_file *
> > > > -nfsd_file_get(struct nfsd_file *nf)
> > > > -{
> > > > - if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> > > > - return nf;
> > > > - return NULL;
> > > > -}
> > > > -
> > > > -static void
> > > > -nfsd_file_dispose_list(struct list_head *dispose)
> > > > -{
> > > > - struct nfsd_file *nf;
> > > > + trace_nfsd_file_put(nf);
> > > >
> > > > - while(!list_empty(dispose)) {
> > > > - nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > > > - list_del_init(&nf->nf_lru);
> > > > - nfsd_file_flush(nf);
> > > > - nfsd_file_put_noref(nf);
> > > > + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > >
> > > I would prefer this included a test on NFSD_FILE_HASHED as well so that
> > > if the file isn't hashed, we don't consider it for the lru.
> > > This would mean we can simply call nfsd_file_put() for things on the
> > > dispose list, rather than needing __nfsd_file_put()
> > >
> >
> > I had an incorrectly reversed test for that in the previous version in
> > nfsd_file_lru_add and you mentioned that it was racy. Why would that not
> > be the case here?
>
> I accept there is an apparent hypocrisy there :-)
> This proposed test isn't racy because of the intent.
> The intent isn't to ensure unhashed files never go onto the lru.
> The intent is to ensure that if I unhash a file and then call put(),
> then the file won't be put on the LRU.
>
> Any code that calls nfsd_file_unhash() will either hold a reference, or
> has just dropped the last reference. In either case it can be certain
> that no other thread will drop the last reference, so no other thread
> can cause the file to be added to the lru.
>
> So in actual fact it is not racy - I was wrong before.
>
> >
> > > > + /*
> > > > + * If this is the last reference (nf_ref == 1), then transfer
> > > > + * it to the LRU. If the add to the LRU fails, just put it as
> > > > + * usual.
> > > > + */
> > > > + if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > > > + return;
> > > > }
> > > > + __nfsd_file_put(nf);
> > >
> > > As suggested above, this would become
> > > if (refcount_dec_and_test(&nf->nf_ref))
> > > nfsd_file_free(nf);
> > >
> >
> > Ok.
> >
> > > > }
> > > >
> > > > static void
> > > > -nfsd_file_dispose_list_sync(struct list_head *dispose)
> > > > +nfsd_file_dispose_list(struct list_head *dispose)
> > > > {
> > > > - bool flush = false;
> > > > struct nfsd_file *nf;
> > > >
> > > > while(!list_empty(dispose)) {
> > > > nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > > > list_del_init(&nf->nf_lru);
> > > > - nfsd_file_flush(nf);
> > > > - if (!refcount_dec_and_test(&nf->nf_ref))
> > > > - continue;
> > > > - if (nfsd_file_free(nf))
> > > > - flush = true;
> > > > + nfsd_file_free(nf);
> > > > }
> > > > - if (flush)
> > > > - flush_delayed_fput();
> > > > }
> > > >
> > > > static void
> > > > @@ -566,21 +572,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > > > struct list_head *head = arg;
> > > > struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
> > > >
> > > > - /*
> > > > - * Do a lockless refcount check. The hashtable holds one reference, so
> > > > - * we look to see if anything else has a reference, or if any have
> > > > - * been put since the shrinker last ran. Those don't get unhashed and
> > > > - * released.
> > > > - *
> > > > - * Note that in the put path, we set the flag and then decrement the
> > > > - * counter. Here we check the counter and then test and clear the flag.
> > > > - * That order is deliberate to ensure that we can do this locklessly.
> > > > - */
> > > > - if (refcount_read(&nf->nf_ref) > 1) {
> > > > - list_lru_isolate(lru, &nf->nf_lru);
> > > > - trace_nfsd_file_gc_in_use(nf);
> > > > - return LRU_REMOVED;
> > > > - }
> > > > + /* We should only be dealing with v2/3 entries here */
> > > > + WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
> > > >
> > > > /*
> > > > * Don't throw out files that are still undergoing I/O or
> > > > @@ -591,40 +584,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > > > return LRU_SKIP;
> > > > }
> > > >
> > > > + /* If it was recently added to the list, skip it */
> > > > if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
> > > > trace_nfsd_file_gc_referenced(nf);
> > > > return LRU_ROTATE;
> > > > }
> > > >
> > > > - if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > > > - trace_nfsd_file_gc_hashed(nf);
> > > > - return LRU_SKIP;
> > > > + /*
> > > > + * Put the reference held on behalf of the LRU. If it wasn't the last
> > > > + * one, then just remove it from the LRU and ignore it.
> > > > + */
> > > > + if (!refcount_dec_and_test(&nf->nf_ref)) {
> > > > + trace_nfsd_file_gc_in_use(nf);
> > > > + list_lru_isolate(lru, &nf->nf_lru);
> > > > + return LRU_REMOVED;
> > > > }
> > > >
> > > > + /* Refcount went to zero. Unhash it and queue it to the dispose list */
> > > > + nfsd_file_unhash(nf);
> > > > list_lru_isolate_move(lru, &nf->nf_lru, head);
> > > > this_cpu_inc(nfsd_file_evictions);
> > > > trace_nfsd_file_gc_disposed(nf);
> > > > return LRU_REMOVED;
> > > > }
> > > >
> > > > -/*
> > > > - * Unhash items on @dispose immediately, then queue them on the
> > > > - * disposal workqueue to finish releasing them in the background.
> > > > - *
> > > > - * cel: Note that between the time list_lru_shrink_walk runs and
> > > > - * now, these items are in the hash table but marked unhashed.
> > > > - * Why release these outside of lru_cb ? There's no lock ordering
> > > > - * problem since lru_cb currently takes no lock.
> > > > - */
> > > > -static void nfsd_file_gc_dispose_list(struct list_head *dispose)
> > > > -{
> > > > - struct nfsd_file *nf;
> > > > -
> > > > - list_for_each_entry(nf, dispose, nf_lru)
> > > > - nfsd_file_hash_remove(nf);
> > > > - nfsd_file_dispose_list_delayed(dispose);
> > > > -}
> > > > -
> > > > static void
> > > > nfsd_file_gc(void)
> > > > {
> > > > @@ -634,7 +617,7 @@ nfsd_file_gc(void)
> > > > ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
> > > > &dispose, list_lru_count(&nfsd_file_lru));
> > > > trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
> > > > - nfsd_file_gc_dispose_list(&dispose);
> > > > + nfsd_file_dispose_list_delayed(&dispose);
> > > > }
> > > >
> > > > static void
> > > > @@ -659,7 +642,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
> > > > ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
> > > > nfsd_file_lru_cb, &dispose);
> > > > trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
> > > > - nfsd_file_gc_dispose_list(&dispose);
> > > > + nfsd_file_dispose_list_delayed(&dispose);
> > > > return ret;
> > > > }
> > > >
> > > > @@ -670,8 +653,11 @@ static struct shrinker nfsd_file_shrinker = {
> > > > };
> > > >
> > > > /*
> > > > - * Find all cache items across all net namespaces that match @inode and
> > > > - * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
> > > > + * Find all cache items across all net namespaces that match @inode, unhash
> > > > + * them, take references and then put them on @dispose if that was successful.
> > > > + *
> > > > + * The nfsd_file objects on the list will be unhashed, and each will have a
> > > > + * reference taken.
> > > > */
> > > > static unsigned int
> > > > __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > > > @@ -689,52 +675,58 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > > > nfsd_file_rhash_params);
> > > > if (!nf)
> > > > break;
> > > > - nfsd_file_unhash_and_dispose(nf, dispose);
> > > > - count++;
> > > > +
> > > > + if (nfsd_file_unhash_and_queue(nf, dispose))
> > > > + count++;
> > > > } while (1);
> > > > rcu_read_unlock();
> > > > return count;
> > > > }
> > > >
> > > > /**
> > > > - * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> > > > + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> > > > * @inode: inode of the file to attempt to remove
> > > > *
> > > > - * Unhash and put, then flush and fput all cache items associated with @inode.
> > > > + * Unhash and put all cache item associated with @inode.
> > > > */
> > > > -void
> > > > -nfsd_file_close_inode_sync(struct inode *inode)
> > > > +static unsigned int
> > > > +nfsd_file_close_inode(struct inode *inode)
> > > > {
> > > > - LIST_HEAD(dispose);
> > > > + struct nfsd_file *nf;
> > > > unsigned int count;
> > > > + LIST_HEAD(dispose);
> > > >
> > > > count = __nfsd_file_close_inode(inode, &dispose);
> > > > - trace_nfsd_file_close_inode_sync(inode, count);
> > > > - nfsd_file_dispose_list_sync(&dispose);
> > > > + trace_nfsd_file_close_inode(inode, count);
> > > > + if (count) {
> > > > + while(!list_empty(&dispose)) {
> > > > + nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
> > > > + list_del_init(&nf->nf_lru);
> > > > + trace_nfsd_file_closing(nf);
> > > > + __nfsd_file_put(nf);
> > >
> > > If nfsd_file_put() didn't add unhashed files to the lru, this can be
> > > nfsd_file_put().
> > >
> > > > + }
> > > > + }
> > > > + return count;
> > > > }
> > > >
> > > > /**
> > > > - * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> > > > + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> > > > * @inode: inode of the file to attempt to remove
> > > > *
> > > > - * Unhash and put all cache item associated with @inode.
> > > > + * Unhash and put, then flush and fput all cache items associated with @inode.
> > > > */
> > > > -static void
> > > > -nfsd_file_close_inode(struct inode *inode)
> > > > +void
> > > > +nfsd_file_close_inode_sync(struct inode *inode)
> > > > {
> > > > - LIST_HEAD(dispose);
> > > > - unsigned int count;
> > > > -
> > > > - count = __nfsd_file_close_inode(inode, &dispose);
> > > > - trace_nfsd_file_close_inode(inode, count);
> > > > - nfsd_file_dispose_list_delayed(&dispose);
> > > > + if (nfsd_file_close_inode(inode))
> > > > + flush_delayed_fput();
> > > > }
> > > >
> > > > /**
> > > > * nfsd_file_delayed_close - close unused nfsd_files
> > > > * @work: dummy
> > > > *
> > > > - * Walk the LRU list and close any entries that have not been used since
> > > > + * Walk the LRU list and destroy any entries that have not been used since
> > > > * the last scan.
> > > > */
> > > > static void
> > > > @@ -892,7 +884,7 @@ __nfsd_file_cache_purge(struct net *net)
> > > > while (!IS_ERR_OR_NULL(nf)) {
> > > > if (net && nf->nf_net != net)
> > > > continue;
> > > > - nfsd_file_unhash_and_dispose(nf, &dispose);
> > > > + nfsd_file_unhash_and_queue(nf, &dispose);
> > > > nf = rhashtable_walk_next(&iter);
> > > > }
> > > >
> > > > @@ -1093,11 +1085,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > goto out;
> > > > }
> > > > open_retry = false;
> > > > - nfsd_file_put_noref(nf);
> > > > + __nfsd_file_put(nf);
> > >
> > > This nf is not hashed, and I think it has no other reference. So we
> > > could use nfsd_file_free() - but nfsd_file_put() would be just as good
> > > and safer.
> > >
> > > > goto retry;
> > > > }
> > > >
> > > > - nfsd_file_lru_remove(nf);
> > >
> > > Hmmm... why not remove from the lru. I guess this justifies patch 2/3,
> > > but it might be cleaner to make this
> > >
> > > if (nfsd_file_lru_remove(nf))
> > > nfsd_file_put(nf);
> > > ??
> > >
> >
> > Removing from the LRU means putting a reference now. The last "put" of a
> > nfsd_file can be rather expensive (you might need to flush data, and
> > issue a close()).
>
> True, but irrelevant. nfsd_file_do_acquire() already holds a reference.
> If it succeeds at removing from the LRU, it now holds 2 references. If
> it puts one, then it won't be that last "put", and so will be cheap.
>
> > I don't object to the way you have done it - if ! lru_remove then get -
> > but it isn't necessary. You can just do the get - then if lru_remove,
> > do a put.
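
IIUC, the ordering you're describing would be roughly this (sketch only; "nf"
is the entry we just looked up in nfsd_file_do_acquire()):

        /* we already hold a reference on nf at this point */
        if (nfsd_file_lru_remove(nf))
                /* drop the LRU's ref; ours is still held, so this put is cheap */
                nfsd_file_put(nf);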
>
> >
> > In this particular codepath, that's not so much a danger, but avoiding
> > excess "put" calls is still a good thing to do. That's the main reason
> > I've tried to "transfer" references to and from the LRU where possible.
> >
> > > > this_cpu_inc(nfsd_file_cache_hits);
> > > >
> > > > status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
> > > > @@ -1107,7 +1098,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > this_cpu_inc(nfsd_file_acquisitions);
> > > > *pnf = nf;
> > > > } else {
> > > > - nfsd_file_put(nf);
> > > > + __nfsd_file_put(nf);
> > >
> > > I don't see the justification for this change.
> > > If status == nfserr_jukebox, then it is OK.
> > > If status is whatever we might get from break_lease(), then it seems
> > > wrong.
> > > If we modify nfsd_file_put() as I suggest, it will handle both cases.
> > >
> > >
> >
> > The justification is that when we're dealing with an error from an open,
> > we don't want to put the nfsd_file onto the LRU. So, a direct call to
> > __nfsd_file_put is what's needed here.
>
> Maybe... I guess my concern arises from the fact that I'm unclear on how
> break_lease() might fail. If it is a transitory failure then dropping
> from the lru doesn't seem appropriate. Maybe I should refresh my
> understanding of break_lease() failure modes.
>
nfsd_open_break_lease calls break_lease with O_NONBLOCK, so it can fail
with -EWOULDBLOCK, which should be a transient error. So we might end up
tearing down a file unnecessarily just because it has an outstanding
lease.
That's probably worth fixing, but I'd rather not do that in the context
of this set. With this set, I want to focus on fixing up the issues with
refcounting that are causing crashes in the field.
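
For reference, the lease-break path is roughly the following (paraphrasing
from memory, so the details may be slightly off):

        int
        nfsd_open_break_lease(struct inode *inode, int may_flags)
        {
                unsigned int mode;

                if (may_flags & NFSD_MAY_NOT_BREAK_LEASE)
                        return 0;
                mode = (may_flags & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY;
                /* O_NONBLOCK: don't wait for the lease holder to respond;
                 * return -EWOULDBLOCK if the lease needs to be broken */
                return break_lease(inode, mode | O_NONBLOCK);
        }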
> >
> > I'll plan to open-code those like you suggest in the next iteration.
> >
> > > > nf = NULL;
> > > > }
> > > >
> > > > @@ -1134,7 +1125,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > * then unhash.
> > > > */
> > > > if (status != nfs_ok || key.inode->i_nlink == 0)
> > > > - nfsd_file_unhash_and_put(nf);
> > > > + nfsd_file_unhash(nf);
> > > > clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
> > > > smp_mb__after_atomic();
> > > > wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
> > > > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > > > index b09ab4f92d43..a44ded06af87 100644
> > > > --- a/fs/nfsd/trace.h
> > > > +++ b/fs/nfsd/trace.h
> > > > @@ -903,10 +903,11 @@ DEFINE_EVENT(nfsd_file_class, name, \
> > > > TP_PROTO(struct nfsd_file *nf), \
> > > > TP_ARGS(nf))
> > > >
> > > > -DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
> > > > +DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
> > > > DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
> > > > DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
> > > > -DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose);
> > > > +DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
> > > > +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
> > > >
> > > > TRACE_EVENT(nfsd_file_alloc,
> > > > TP_PROTO(
> > > > --
> > > > 2.37.3
> > > >
> > > >
> > >
> > > Thanks,
> > > NeilBrown
> >
> > --
> > Jeff Layton <[email protected]>
> >
>
> Thanks a lot,
> I quite like your latest version.
>
Thanks! I think we're getting much closer.
--
Jeff Layton <[email protected]>
On Mon, 2022-10-31 at 08:29 +1100, NeilBrown wrote:
> On Sat, 29 Oct 2022, NeilBrown wrote:
> > On Fri, 28 Oct 2022, Jeff Layton wrote:
> > > On Fri, 2022-10-28 at 09:51 +1100, NeilBrown wrote:
> > > > On Fri, 28 Oct 2022, Jeff Layton wrote:
> > > > > The filecache refcounting is a bit non-standard for something searchable
> > > > > by RCU, in that we maintain a sentinel reference while it's hashed. This
> > > > > in turn requires that we have to do things differently in the "put"
> > > > > depending on whether its hashed, which we believe to have led to races.
> > > > >
> > > > > There are other problems in here too. nfsd_file_close_inode_sync can end
> > > > > up freeing an nfsd_file while there are still outstanding references to
> > > > > it, and the handling
> > > >
> > > > -EINTR ??? (you got interrupted and didn't finish the sentence?)
> > > >
> > >
> > > Yes, I meant to go back and flesh that out, and forgot before posting.
> > >
> > > > >
> > > > > Rework the code so that the refcount is what drives the lifecycle. When
> > > > > the refcount goes to zero, then unhash and rcu free the object.
> > > > >
> > > > > Signed-off-by: Jeff Layton <[email protected]>
> > > > > ---
> > > > > fs/nfsd/filecache.c | 291 +++++++++++++++++++++-----------------------
> > > > > fs/nfsd/trace.h | 5 +-
> > > > > 2 files changed, 144 insertions(+), 152 deletions(-)
> > > > >
> > > > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > > > > index 98c6b5f51bc8..e63534f4b9f8 100644
> > > > > --- a/fs/nfsd/filecache.c
> > > > > +++ b/fs/nfsd/filecache.c
> > > > > @@ -1,6 +1,12 @@
> > > > > // SPDX-License-Identifier: GPL-2.0
> > > > > /*
> > > > > * The NFSD open file cache.
> > > > > + *
> > > > > + * Each nfsd_file is created in response to client activity -- either regular
> > > > > + * file I/O for v2/v3, or opening a file for v4. Files opened via v4 are
> > > > > + * cleaned up as soon as their refcount goes to 0. Entries for v2/v3 are
> > > > > + * flagged with NFSD_FILE_GC. On their last put, they are added to the LRU for
> > > > > + * eventual disposal if they aren't used again within a short time period.
> > > > > */
> > > > >
> > > > > #include <linux/hash.h>
> > > > > @@ -302,31 +308,43 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > > > > if (key->gc)
> > > > > __set_bit(NFSD_FILE_GC, &nf->nf_flags);
> > > > > nf->nf_inode = key->inode;
> > > > > - /* nf_ref is pre-incremented for hash table */
> > > > > - refcount_set(&nf->nf_ref, 2);
> > > > > + refcount_set(&nf->nf_ref, 1);
> > > > > nf->nf_may = key->need;
> > > > > nf->nf_mark = NULL;
> > > > > }
> > > > > return nf;
> > > > > }
> > > > >
> > > > > -static bool
> > > > > +static void
> > > > > +nfsd_file_flush(struct nfsd_file *nf)
> > > > > +{
> > > > > + struct file *file = nf->nf_file;
> > > > > +
> > > > > + if (!file || !(file->f_mode & FMODE_WRITE))
> > > > > + return;
> > > > > + this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
> > > > > + if (vfs_fsync(file, 1) != 0)
> > > > > + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > nfsd_file_free(struct nfsd_file *nf)
> > > > > {
> > > > > s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
> > > > > - bool flush = false;
> > > > > +
> > > > > + trace_nfsd_file_free(nf);
> > > > >
> > > > > this_cpu_inc(nfsd_file_releases);
> > > > > this_cpu_add(nfsd_file_total_age, age);
> > > > >
> > > > > - trace_nfsd_file_put_final(nf);
> > > > > + nfsd_file_flush(nf);
> > > > > +
> > > > > if (nf->nf_mark)
> > > > > nfsd_file_mark_put(nf->nf_mark);
> > > > > if (nf->nf_file) {
> > > > > get_file(nf->nf_file);
> > > > > filp_close(nf->nf_file, NULL);
> > > > > fput(nf->nf_file);
> > > > > - flush = true;
> > > > > }
> > > > >
> > > > > /*
> > > > > @@ -334,10 +352,9 @@ nfsd_file_free(struct nfsd_file *nf)
> > > > > * WARN and leak it to preserve system stability.
> > > > > */
> > > > > if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
> > > > > - return flush;
> > > > > + return;
> > > > >
> > > > > call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
> > > > > - return flush;
> > > > > }
> > > > >
> > > > > static bool
> > > > > @@ -363,29 +380,23 @@ nfsd_file_check_write_error(struct nfsd_file *nf)
> > > > > return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
> > > > > }
> > > > >
> > > > > -static void
> > > > > -nfsd_file_flush(struct nfsd_file *nf)
> > > > > -{
> > > > > - struct file *file = nf->nf_file;
> > > > > -
> > > > > - if (!file || !(file->f_mode & FMODE_WRITE))
> > > > > - return;
> > > > > - this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
> > > > > - if (vfs_fsync(file, 1) != 0)
> > > > > - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > > > > -}
> > > > > -
> > > > > -static void nfsd_file_lru_add(struct nfsd_file *nf)
> > > > > +static bool nfsd_file_lru_add(struct nfsd_file *nf)
> > > > > {
> > > > > set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> > > > > - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
> > > > > + if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> > > > > trace_nfsd_file_lru_add(nf);
> > > > > + return true;
> > > > > + }
> > > > > + return false;
> > > > > }
> > > > >
> > > > > -static void nfsd_file_lru_remove(struct nfsd_file *nf)
> > > > > +static bool nfsd_file_lru_remove(struct nfsd_file *nf)
> > > > > {
> > > > > - if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
> > > > > + if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
> > > > > trace_nfsd_file_lru_del(nf);
> > > > > + return true;
> > > > > + }
> > > > > + return false;
> > > > > }
> > > > >
> > > > > static void
> > > > > @@ -409,94 +420,89 @@ nfsd_file_unhash(struct nfsd_file *nf)
> > > > > return false;
> > > > > }
> > > > >
> > > > > -static void
> > > > > -nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> > > > > +struct nfsd_file *
> > > > > +nfsd_file_get(struct nfsd_file *nf)
> > > > > {
> > > > > - trace_nfsd_file_unhash_and_dispose(nf);
> > > > > + if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> > > > > + return nf;
> > > > > + return NULL;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
> > > > > + * @nf: nfsd_file to be unhashed and queued
> > > > > + * @dispose: list to which it should be queued
> > > > > + *
> > > > > + * Attempt to unhash a nfsd_file and queue it to the given list. Each file
> > > > > + * will have a reference held on behalf of the list. That reference may come
> > > > > + * from the LRU, or we may need to take one. If we can't get a reference,
> > > > > + * ignore it altogether.
> > > > > + */
> > > > > +static bool
> > > > > +nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
> > > > > +{
> > > > > + trace_nfsd_file_unhash_and_queue(nf);
> > > > > if (nfsd_file_unhash(nf)) {
> > > > > - /* caller must call nfsd_file_dispose_list() later */
> > > > > - nfsd_file_lru_remove(nf);
> > > > > + /*
> > > > > + * If we remove it from the LRU, then just use that
> > > > > + * reference for the dispose list. Otherwise, we need
> > > > > + * to take a reference. If that fails, just ignore
> > > > > + * the file altogether.
> > > > > + */
> > > > > + if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
> > > > > + return false;
> > > > > list_add(&nf->nf_lru, dispose);
> > > > > + return true;
> > > > > }
> > > > > + return false;
> > > > > }
> > > > >
> > > > > -static void
> > > > > -nfsd_file_put_noref(struct nfsd_file *nf)
> > > > > +static bool
> > > > > +__nfsd_file_put(struct nfsd_file *nf)
> > > >
> > > > The return value of this function is never tested.
> > > > Maybe it should return void.
> > > >
> > > > Further, I don't think this is a useful abstraction.
> > > > I would rather move the refcount_dec_and_test to the caller, and move
> > > > the lru_remove and unhash into nfsd_file_free.
> > > >
> > >
> > > Ok, sounds reasonable.
> > >
> > > > > {
> > > > > - trace_nfsd_file_put(nf);
> > > > > -
> > > > > if (refcount_dec_and_test(&nf->nf_ref)) {
> > > > > - WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
> > > > > - nfsd_file_lru_remove(nf);
> > > > > + nfsd_file_unhash(nf);
> > > > > nfsd_file_free(nf);
> > > > > + return true;
> > > > > }
> > > > > + return false;
> > > > > }
> > > > >
> > > > > -static void
> > > > > -nfsd_file_unhash_and_put(struct nfsd_file *nf)
> > > > > -{
> > > > > - if (nfsd_file_unhash(nf))
> > > > > - nfsd_file_put_noref(nf);
> > > > > -}
> > > > > -
> > > > > +/**
> > > > > + * nfsd_file_put - put the reference to a nfsd_file
> > > > > + * @nf: nfsd_file of which to put the reference
> > > > > + *
> > > > > + * Put a reference to a nfsd_file. In the v4 case, we just put the
> > > > > + * reference immediately. In the v2/3 case, if the reference would be
> > > > > + * the last one, the put it on the LRU instead to be cleaned up later.
> > > > > + */
> > > > > void
> > > > > nfsd_file_put(struct nfsd_file *nf)
> > > > > {
> > > > > - might_sleep();
> > > > > -
> > > > > - if (test_bit(NFSD_FILE_GC, &nf->nf_flags))
> > > > > - nfsd_file_lru_add(nf);
> > > > > - else if (refcount_read(&nf->nf_ref) == 2)
> > > > > - nfsd_file_unhash_and_put(nf);
> > > > > -
> > > > > - if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > > > > - nfsd_file_flush(nf);
> > > > > - nfsd_file_put_noref(nf);
> > > > > - } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > > > > - nfsd_file_put_noref(nf);
> > > > > - nfsd_file_schedule_laundrette();
> > > > > - } else
> > > > > - nfsd_file_put_noref(nf);
> > > > > -}
> > > > > -
> > > > > -struct nfsd_file *
> > > > > -nfsd_file_get(struct nfsd_file *nf)
> > > > > -{
> > > > > - if (likely(refcount_inc_not_zero(&nf->nf_ref)))
> > > > > - return nf;
> > > > > - return NULL;
> > > > > -}
> > > > > -
> > > > > -static void
> > > > > -nfsd_file_dispose_list(struct list_head *dispose)
> > > > > -{
> > > > > - struct nfsd_file *nf;
> > > > > + trace_nfsd_file_put(nf);
> > > > >
> > > > > - while(!list_empty(dispose)) {
> > > > > - nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > > > > - list_del_init(&nf->nf_lru);
> > > > > - nfsd_file_flush(nf);
> > > > > - nfsd_file_put_noref(nf);
> > > > > + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > > >
> > > > I would prefer this included a test on NFSD_FILE_HASHED as well so that
> > > > if the file isn't hashed, we don't consider it for the lru.
> > > > This would mean we can simply call nfsd_file_put() for things on the
> > > > dispose list, rather than needing __nfsd_file_put()
> > > >
> > >
> > > I had an incorrectly reversed test for that in the previous version in
> > > nfsd_file_lru_add and you mentioned that it was racy. Why would that not
> > > be the case here?
> >
> > I accept there is an apparent hypocrisy there :-)
> > This proposed test isn't racy because of the intent.
> > The intent isn't to ensure unhashed files never go onto the lru.
> > The intent is to ensure that if I unhash a file and then call put(),
> > then the file won't be put on the LRU.
> >
> > Any code that calls nfsd_file_unhash() will either hold a reference, or
> > has just dropped the last reference. In either case it can be certain
> > that no other thread will drop the last reference, so no other thread
> > can cause the file to be added to the lru.
>
> This last bit is wrong. The logic would only hold if the test on HASHED
> was performed after refcount_dec_and_test has succeeded. As we test
> before, I think it could still race.
>
> I don't think that race is important though. Maybe something will get
> added to the lru after it has been unhashed. But any code that wants it
> unhashed and gone from the lru will explicitly do that and it will
> succeed.
> In the worst case, an unhashed file will remain on the lru for a while,
> then get discarded.
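
Just to make sure I'm picturing the same window (with the HASHED test done
before the decrement), something like this, illustrative only:

        /*
         *      CPU 1: nfsd_file_put()                  CPU 2
         *      ----------------------                  ----------------------
         *      test_bit(NFSD_FILE_HASHED) => true
         *                                              unhashes the file, takes
         *                                              a ref for its dispose
         *                                              list, then drops that ref
         *      refcount_dec_not_one() fails (ref == 1)
         *      nfsd_file_lru_add()
         *              => an unhashed file is now sitting on the LRU
         */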
>
>
Right, I think that was the case originally too with the version where you
pointed out the race.
That said, leaving it sitting on the LRU could still be problematic, as
it will likely have an open file description still attached that could
block other activity. So, I think we do want to take extra steps to keep
those files off the LRU if we can at all help it.
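
Something along these lines, maybe -- just a sketch, assuming the unhash and
lru_remove calls get folded into nfsd_file_free() as you suggested:

        void
        nfsd_file_put(struct nfsd_file *nf)
        {
                trace_nfsd_file_put(nf);

                if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
                    test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
                        /*
                         * If this is the last reference, try to transfer it
                         * to the LRU. If the LRU add fails, fall through and
                         * put it as usual.
                         */
                        if (refcount_dec_not_one(&nf->nf_ref) ||
                            nfsd_file_lru_add(nf))
                                return;
                }
                if (refcount_dec_and_test(&nf->nf_ref))
                        nfsd_file_free(nf);
        }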
>
>
> >
> > So in actual fact it is not racy - I was wrong before.
> >
> > >
> > > > > + /*
> > > > > + * If this is the last reference (nf_ref == 1), then transfer
> > > > > + * it to the LRU. If the add to the LRU fails, just put it as
> > > > > + * usual.
> > > > > + */
> > > > > + if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > > > > + return;
> > > > > }
> > > > > + __nfsd_file_put(nf);
> > > >
> > > > As suggested above, this would become
> > > > if (refcount_dec_and_test(&nf->nf_ref))
> > > > nfsd_file_free(nf);
> > > >
> > >
> > > Ok.
> > >
> > > > > }
> > > > >
> > > > > static void
> > > > > -nfsd_file_dispose_list_sync(struct list_head *dispose)
> > > > > +nfsd_file_dispose_list(struct list_head *dispose)
> > > > > {
> > > > > - bool flush = false;
> > > > > struct nfsd_file *nf;
> > > > >
> > > > > while(!list_empty(dispose)) {
> > > > > nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > > > > list_del_init(&nf->nf_lru);
> > > > > - nfsd_file_flush(nf);
> > > > > - if (!refcount_dec_and_test(&nf->nf_ref))
> > > > > - continue;
> > > > > - if (nfsd_file_free(nf))
> > > > > - flush = true;
> > > > > + nfsd_file_free(nf);
> > > > > }
> > > > > - if (flush)
> > > > > - flush_delayed_fput();
> > > > > }
> > > > >
> > > > > static void
> > > > > @@ -566,21 +572,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > > > > struct list_head *head = arg;
> > > > > struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
> > > > >
> > > > > - /*
> > > > > - * Do a lockless refcount check. The hashtable holds one reference, so
> > > > > - * we look to see if anything else has a reference, or if any have
> > > > > - * been put since the shrinker last ran. Those don't get unhashed and
> > > > > - * released.
> > > > > - *
> > > > > - * Note that in the put path, we set the flag and then decrement the
> > > > > - * counter. Here we check the counter and then test and clear the flag.
> > > > > - * That order is deliberate to ensure that we can do this locklessly.
> > > > > - */
> > > > > - if (refcount_read(&nf->nf_ref) > 1) {
> > > > > - list_lru_isolate(lru, &nf->nf_lru);
> > > > > - trace_nfsd_file_gc_in_use(nf);
> > > > > - return LRU_REMOVED;
> > > > > - }
> > > > > + /* We should only be dealing with v2/3 entries here */
> > > > > + WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
> > > > >
> > > > > /*
> > > > > * Don't throw out files that are still undergoing I/O or
> > > > > @@ -591,40 +584,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > > > > return LRU_SKIP;
> > > > > }
> > > > >
> > > > > + /* If it was recently added to the list, skip it */
> > > > > if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
> > > > > trace_nfsd_file_gc_referenced(nf);
> > > > > return LRU_ROTATE;
> > > > > }
> > > > >
> > > > > - if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > > > > - trace_nfsd_file_gc_hashed(nf);
> > > > > - return LRU_SKIP;
> > > > > + /*
> > > > > + * Put the reference held on behalf of the LRU. If it wasn't the last
> > > > > + * one, then just remove it from the LRU and ignore it.
> > > > > + */
> > > > > + if (!refcount_dec_and_test(&nf->nf_ref)) {
> > > > > + trace_nfsd_file_gc_in_use(nf);
> > > > > + list_lru_isolate(lru, &nf->nf_lru);
> > > > > + return LRU_REMOVED;
> > > > > }
> > > > >
> > > > > + /* Refcount went to zero. Unhash it and queue it to the dispose list */
> > > > > + nfsd_file_unhash(nf);
> > > > > list_lru_isolate_move(lru, &nf->nf_lru, head);
> > > > > this_cpu_inc(nfsd_file_evictions);
> > > > > trace_nfsd_file_gc_disposed(nf);
> > > > > return LRU_REMOVED;
> > > > > }
> > > > >
> > > > > -/*
> > > > > - * Unhash items on @dispose immediately, then queue them on the
> > > > > - * disposal workqueue to finish releasing them in the background.
> > > > > - *
> > > > > - * cel: Note that between the time list_lru_shrink_walk runs and
> > > > > - * now, these items are in the hash table but marked unhashed.
> > > > > - * Why release these outside of lru_cb ? There's no lock ordering
> > > > > - * problem since lru_cb currently takes no lock.
> > > > > - */
> > > > > -static void nfsd_file_gc_dispose_list(struct list_head *dispose)
> > > > > -{
> > > > > - struct nfsd_file *nf;
> > > > > -
> > > > > - list_for_each_entry(nf, dispose, nf_lru)
> > > > > - nfsd_file_hash_remove(nf);
> > > > > - nfsd_file_dispose_list_delayed(dispose);
> > > > > -}
> > > > > -
> > > > > static void
> > > > > nfsd_file_gc(void)
> > > > > {
> > > > > @@ -634,7 +617,7 @@ nfsd_file_gc(void)
> > > > > ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
> > > > > &dispose, list_lru_count(&nfsd_file_lru));
> > > > > trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
> > > > > - nfsd_file_gc_dispose_list(&dispose);
> > > > > + nfsd_file_dispose_list_delayed(&dispose);
> > > > > }
> > > > >
> > > > > static void
> > > > > @@ -659,7 +642,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
> > > > > ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
> > > > > nfsd_file_lru_cb, &dispose);
> > > > > trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
> > > > > - nfsd_file_gc_dispose_list(&dispose);
> > > > > + nfsd_file_dispose_list_delayed(&dispose);
> > > > > return ret;
> > > > > }
> > > > >
> > > > > @@ -670,8 +653,11 @@ static struct shrinker nfsd_file_shrinker = {
> > > > > };
> > > > >
> > > > > /*
> > > > > - * Find all cache items across all net namespaces that match @inode and
> > > > > - * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
> > > > > + * Find all cache items across all net namespaces that match @inode, unhash
> > > > > + * them, take references and then put them on @dispose if that was successful.
> > > > > + *
> > > > > + * The nfsd_file objects on the list will be unhashed, and each will have a
> > > > > + * reference taken.
> > > > > */
> > > > > static unsigned int
> > > > > __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > > > > @@ -689,52 +675,58 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > > > > nfsd_file_rhash_params);
> > > > > if (!nf)
> > > > > break;
> > > > > - nfsd_file_unhash_and_dispose(nf, dispose);
> > > > > - count++;
> > > > > +
> > > > > + if (nfsd_file_unhash_and_queue(nf, dispose))
> > > > > + count++;
> > > > > } while (1);
> > > > > rcu_read_unlock();
> > > > > return count;
> > > > > }
> > > > >
> > > > > /**
> > > > > - * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> > > > > + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> > > > > * @inode: inode of the file to attempt to remove
> > > > > *
> > > > > - * Unhash and put, then flush and fput all cache items associated with @inode.
> > > > > + * Unhash and put all cache item associated with @inode.
> > > > > */
> > > > > -void
> > > > > -nfsd_file_close_inode_sync(struct inode *inode)
> > > > > +static unsigned int
> > > > > +nfsd_file_close_inode(struct inode *inode)
> > > > > {
> > > > > - LIST_HEAD(dispose);
> > > > > + struct nfsd_file *nf;
> > > > > unsigned int count;
> > > > > + LIST_HEAD(dispose);
> > > > >
> > > > > count = __nfsd_file_close_inode(inode, &dispose);
> > > > > - trace_nfsd_file_close_inode_sync(inode, count);
> > > > > - nfsd_file_dispose_list_sync(&dispose);
> > > > > + trace_nfsd_file_close_inode(inode, count);
> > > > > + if (count) {
> > > > > + while(!list_empty(&dispose)) {
> > > > > + nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
> > > > > + list_del_init(&nf->nf_lru);
> > > > > + trace_nfsd_file_closing(nf);
> > > > > + __nfsd_file_put(nf);
> > > >
> > > > If nfsd_file_put() didn't add unhashed files to the lru, this can be
> > > > nfsd_file_put().
> > > >
> > > > > + }
> > > > > + }
> > > > > + return count;
> > > > > }
> > > > >
> > > > > /**
> > > > > - * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> > > > > + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> > > > > * @inode: inode of the file to attempt to remove
> > > > > *
> > > > > - * Unhash and put all cache item associated with @inode.
> > > > > + * Unhash and put, then flush and fput all cache items associated with @inode.
> > > > > */
> > > > > -static void
> > > > > -nfsd_file_close_inode(struct inode *inode)
> > > > > +void
> > > > > +nfsd_file_close_inode_sync(struct inode *inode)
> > > > > {
> > > > > - LIST_HEAD(dispose);
> > > > > - unsigned int count;
> > > > > -
> > > > > - count = __nfsd_file_close_inode(inode, &dispose);
> > > > > - trace_nfsd_file_close_inode(inode, count);
> > > > > - nfsd_file_dispose_list_delayed(&dispose);
> > > > > + if (nfsd_file_close_inode(inode))
> > > > > + flush_delayed_fput();
> > > > > }
> > > > >
> > > > > /**
> > > > > * nfsd_file_delayed_close - close unused nfsd_files
> > > > > * @work: dummy
> > > > > *
> > > > > - * Walk the LRU list and close any entries that have not been used since
> > > > > + * Walk the LRU list and destroy any entries that have not been used since
> > > > > * the last scan.
> > > > > */
> > > > > static void
> > > > > @@ -892,7 +884,7 @@ __nfsd_file_cache_purge(struct net *net)
> > > > > while (!IS_ERR_OR_NULL(nf)) {
> > > > > if (net && nf->nf_net != net)
> > > > > continue;
> > > > > - nfsd_file_unhash_and_dispose(nf, &dispose);
> > > > > + nfsd_file_unhash_and_queue(nf, &dispose);
> > > > > nf = rhashtable_walk_next(&iter);
> > > > > }
> > > > >
> > > > > @@ -1093,11 +1085,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > > goto out;
> > > > > }
> > > > > open_retry = false;
> > > > > - nfsd_file_put_noref(nf);
> > > > > + __nfsd_file_put(nf);
> > > >
> > > > This nf is not hashed, and I think it has no other reference. So we
> > > > could use nfsd_file_free() - but nfsd_file_put() would be just as good
> > > > and safer.
> > > >
> > > > > goto retry;
> > > > > }
> > > > >
> > > > > - nfsd_file_lru_remove(nf);
> > > >
> > > > Hmmm... why not remove from the lru. I guess this justifies patch 2/3,
> > > > but it might be cleaner to make this
> > > >
> > > > if (nfsd_file_lru_remove(nf))
> > > > nfsd_file_put(nf);
> > > > ??
> > > >
> > >
> > > Removing from the LRU means putting a reference now. The last "put" of a
> > > nfsd_file can be rather expensive (you might need to flush data, and
> > > issue a close()).
> >
> > True, but irrelevant. nfsd_file_do_acquire() already holds a reference.
> > If it succeeds at removing from the LRU, it now holds 2 references. If
> > it puts one, then it won't be that last "put", and so will be cheap.
> >
> > I don't object to the way you have done it - if ! lru_remove then get -
> > but it isn't necessary. You can just do the get - then if lru_remove,
> > do a put.
> >
> > >
> > > In this particular codepath, that's not so much a danger, but avoiding
> > > excess "put" calls is still a good thing to do. That's the main reason
> > > I've tried to "transfer" references to and from the LRU where possible.
> > >
> > > > > this_cpu_inc(nfsd_file_cache_hits);
> > > > >
> > > > > status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
> > > > > @@ -1107,7 +1098,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > > this_cpu_inc(nfsd_file_acquisitions);
> > > > > *pnf = nf;
> > > > > } else {
> > > > > - nfsd_file_put(nf);
> > > > > + __nfsd_file_put(nf);
> > > >
> > > > I don't see the justification for this change.
> > > > If status == nfserr_jukebox, then it is OK.
> > > > If status is whatever we might get from break_lease(), then it seems
> > > > wrong.
> > > > If we modify nfsd_file_put() as I suggest, it will handle both cases.
> > > >
> > > >
> > >
> > > The justification is that when we're dealing with an error from an open,
> > > we don't want to put the nfsd_file onto the LRU. So, a direct call to
> > > __nfsd_file_put is what's needed here.
> >
> > Maybe... I guess my concern arises from the fact that I'm unclear on how
> > break_lease() might fail. If it is a transitory failure then dropping
> > from the lru doesn't seem appropriate. Maybe I should refresh my
> > understanding of break_lease() failure modes.
> >
> > >
> > > I'll plan to open-code those like you suggest in the next iteration.
> > >
> > > > > nf = NULL;
> > > > > }
> > > > >
> > > > > @@ -1134,7 +1125,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > > * then unhash.
> > > > > */
> > > > > if (status != nfs_ok || key.inode->i_nlink == 0)
> > > > > - nfsd_file_unhash_and_put(nf);
> > > > > + nfsd_file_unhash(nf);
> > > > > clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
> > > > > smp_mb__after_atomic();
> > > > > wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
> > > > > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > > > > index b09ab4f92d43..a44ded06af87 100644
> > > > > --- a/fs/nfsd/trace.h
> > > > > +++ b/fs/nfsd/trace.h
> > > > > @@ -903,10 +903,11 @@ DEFINE_EVENT(nfsd_file_class, name, \
> > > > > TP_PROTO(struct nfsd_file *nf), \
> > > > > TP_ARGS(nf))
> > > > >
> > > > > -DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
> > > > > +DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
> > > > > DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
> > > > > DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
> > > > > -DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose);
> > > > > +DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
> > > > > +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
> > > > >
> > > > > TRACE_EVENT(nfsd_file_alloc,
> > > > > TP_PROTO(
> > > > > --
> > > > > 2.37.3
> > > > >
> > > > >
> > > >
> > > > Thanks,
> > > > NeilBrown
> > >
> > > --
> > > Jeff Layton <[email protected]>
> > >
> >
> > Thanks a lot,
> > I quite like your latest version.
> >
> > NeilBrown
> >
--
Jeff Layton <[email protected]>