2022-10-31 11:40:10

by Jeff Layton

Subject: [PATCH v4 0/5] nfsd: clean up refcounting in the filecache

This is a revision of the set that I posted late on Friday. The only
real difference is the introduction of a "reorganization" patch that
shuffles some of the code in filecache.c around without any functional
changes. That should make the functional changes I'm proposing in patch
#3 more evident.

Jeff Layton (5):
nfsd: remove the pages_flushed statistic from filecache
nfsd: reorganize filecache.c
nfsd: rework refcounting in filecache
nfsd: close race between unhashing and LRU addition
nfsd: start non-blocking writeback after adding nfsd_file to the LRU

fs/nfsd/filecache.c | 385 +++++++++++++++++++++++---------------------
fs/nfsd/filecache.h | 1 +
fs/nfsd/trace.h | 5 +-
3 files changed, 207 insertions(+), 184 deletions(-)

--
2.38.1



2022-10-31 11:43:04

by Jeff Layton

Subject: [PATCH v4 1/5] nfsd: remove the pages_flushed statistic from filecache

We're counting mapping->nrpages, but not all of those are necessarily
dirty. We don't really have a simple way to count just the dirty pages,
so just remove this stat since it's not accurate.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/filecache.c | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 98c6b5f51bc8..f8ebbf7daa18 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -32,7 +32,6 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
static DEFINE_PER_CPU(unsigned long, nfsd_file_releases);
static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age);
-static DEFINE_PER_CPU(unsigned long, nfsd_file_pages_flushed);
static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);

struct nfsd_fcache_disposal {
@@ -370,7 +369,6 @@ nfsd_file_flush(struct nfsd_file *nf)

if (!file || !(file->f_mode & FMODE_WRITE))
return;
- this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
if (vfs_fsync(file, 1) != 0)
nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
}
@@ -998,7 +996,6 @@ nfsd_file_cache_shutdown(void)
per_cpu(nfsd_file_acquisitions, i) = 0;
per_cpu(nfsd_file_releases, i) = 0;
per_cpu(nfsd_file_total_age, i) = 0;
- per_cpu(nfsd_file_pages_flushed, i) = 0;
per_cpu(nfsd_file_evictions, i) = 0;
}
}
@@ -1212,7 +1209,7 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
*/
int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
{
- unsigned long releases = 0, pages_flushed = 0, evictions = 0;
+ unsigned long releases = 0, evictions = 0;
unsigned long hits = 0, acquisitions = 0;
unsigned int i, count = 0, buckets = 0;
unsigned long lru = 0, total_age = 0;
@@ -1240,7 +1237,6 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
releases += per_cpu(nfsd_file_releases, i);
total_age += per_cpu(nfsd_file_total_age, i);
evictions += per_cpu(nfsd_file_evictions, i);
- pages_flushed += per_cpu(nfsd_file_pages_flushed, i);
}

seq_printf(m, "total entries: %u\n", count);
@@ -1254,6 +1250,5 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
seq_printf(m, "mean age (ms): %ld\n", total_age / releases);
else
seq_printf(m, "mean age (ms): -\n");
- seq_printf(m, "pages flushed: %lu\n", pages_flushed);
return 0;
}
--
2.38.1


2022-10-31 11:43:06

by Jeff Layton

Subject: [PATCH v4 4/5] nfsd: close race between unhashing and LRU addition

The list_lru_add and list_lru_del functions use list_empty checks to see
whether the object is already on the LRU. That's fine in most cases, but
we occasionally repurpose nf_lru after unhashing. It's possible for an
LRU removal to remove it from a different list altogether if we lose a
race.
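
One way this can bite, roughly (illustrative interleaving only, not a
specific reproducer):

    task A (unhash path)                  task B (e.g. lookup/put path)
    --------------------                  -----------------------------
    nfsd_file_unhash_and_queue()
      nfsd_file_lru_remove()  <- off LRU
      list_add(&nf->nf_lru, dispose)
                                          nfsd_file_lru_remove()
                                            list_lru_del() sees nf_lru
                                            non-empty and unlinks it from
                                            the dispose list instead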

Add a new NFSD_FILE_LRU flag, which indicates that the object actually
resides on the LRU and not some other list. Use that when adding and
removing it from the LRU instead of just relying on list_empty checks.

Add an extra HASHED check after adding the entry to the LRU. If it's now
clear, remove the entry from the LRU again and, if that removal succeeds,
put the reference.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/filecache.c | 44 +++++++++++++++++++++++++++++---------------
fs/nfsd/filecache.h | 1 +
2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index d928c5e38eeb..47cdc6129a7b 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -403,18 +403,22 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
static bool nfsd_file_lru_add(struct nfsd_file *nf)
{
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
- trace_nfsd_file_lru_add(nf);
- return true;
+ if (!test_and_set_bit(NFSD_FILE_LRU, &nf->nf_flags)) {
+ if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
+ trace_nfsd_file_lru_add(nf);
+ return true;
+ }
}
return false;
}

static bool nfsd_file_lru_remove(struct nfsd_file *nf)
{
- if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
- trace_nfsd_file_lru_del(nf);
- return true;
+ if (test_and_clear_bit(NFSD_FILE_LRU, &nf->nf_flags)) {
+ if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
+ trace_nfsd_file_lru_del(nf);
+ return true;
+ }
}
return false;
}
@@ -469,20 +473,30 @@ nfsd_file_put(struct nfsd_file *nf)
{
trace_nfsd_file_put(nf);

- /*
- * The HASHED check is racy. We may end up with the occasional
- * unhashed entry on the LRU, but they should get cleaned up
- * like any other.
- */
if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
/*
- * If this is the last reference (nf_ref == 1), then transfer
- * it to the LRU. If the add to the LRU fails, just put it as
- * usual.
+ * If this is the last reference (nf_ref == 1), then try to
+ * transfer it to the LRU.
*/
- if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
+ if (refcount_dec_not_one(&nf->nf_ref))
return;
+
+ /* Try to add it to the LRU. If that fails, decrement. */
+ if (nfsd_file_lru_add(nf)) {
+ /* If it's still hashed, we're done */
+ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
+ return;
+
+ /*
+ * We're racing with unhashing, so try to remove it from
+ * the LRU. If removal fails, then someone else already
+ * has our reference and we're done. If it succeeds,
+ * fall through to decrement.
+ */
+ if (!nfsd_file_lru_remove(nf))
+ return;
+ }
}
if (refcount_dec_and_test(&nf->nf_ref))
nfsd_file_free(nf);
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index b7efb2c3ddb1..e52ab7d5a44c 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -39,6 +39,7 @@ struct nfsd_file {
#define NFSD_FILE_PENDING (1)
#define NFSD_FILE_REFERENCED (2)
#define NFSD_FILE_GC (3)
+#define NFSD_FILE_LRU (4) /* file is on LRU */
unsigned long nf_flags;
struct inode *nf_inode; /* don't deref */
refcount_t nf_ref;
--
2.38.1


2022-10-31 11:43:06

by Jeff Layton

Subject: [PATCH v4 2/5] nfsd: reorganize filecache.c

In a coming patch, we're going to rework how the filecache refcounting
works. Move some code around in the file to reduce the churn in the
later patches, and rename some of the functions with (hopefully) clearer
names. This should introduce no functional changes.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/filecache.c | 134 ++++++++++++++++++++++----------------------
fs/nfsd/trace.h | 4 +-
2 files changed, 69 insertions(+), 69 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index f8ebbf7daa18..ce80b6a80ffc 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -309,6 +309,48 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
return nf;
}

+static void
+nfsd_file_fsync(struct nfsd_file *nf)
+{
+ struct file *file = nf->nf_file;
+
+ if (!file || !(file->f_mode & FMODE_WRITE))
+ return;
+ if (vfs_fsync(file, 1) != 0)
+ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
+}
+
+static int
+nfsd_file_check_write_error(struct nfsd_file *nf)
+{
+ struct file *file = nf->nf_file;
+
+ if (!file || !(file->f_mode & FMODE_WRITE))
+ return 0;
+ return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
+}
+
+static void
+nfsd_file_hash_remove(struct nfsd_file *nf)
+{
+ trace_nfsd_file_unhash(nf);
+
+ if (nfsd_file_check_write_error(nf))
+ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
+ rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash,
+ nfsd_file_rhash_params);
+}
+
+static bool
+nfsd_file_unhash(struct nfsd_file *nf)
+{
+ if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ nfsd_file_hash_remove(nf);
+ return true;
+ }
+ return false;
+}
+
static bool
nfsd_file_free(struct nfsd_file *nf)
{
@@ -318,7 +360,7 @@ nfsd_file_free(struct nfsd_file *nf)
this_cpu_inc(nfsd_file_releases);
this_cpu_add(nfsd_file_total_age, age);

- trace_nfsd_file_put_final(nf);
+ trace_nfsd_file_free(nf);
if (nf->nf_mark)
nfsd_file_mark_put(nf->nf_mark);
if (nf->nf_file) {
@@ -352,27 +394,6 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
}

-static int
-nfsd_file_check_write_error(struct nfsd_file *nf)
-{
- struct file *file = nf->nf_file;
-
- if (!file || !(file->f_mode & FMODE_WRITE))
- return 0;
- return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
-}
-
-static void
-nfsd_file_flush(struct nfsd_file *nf)
-{
- struct file *file = nf->nf_file;
-
- if (!file || !(file->f_mode & FMODE_WRITE))
- return;
- if (vfs_fsync(file, 1) != 0)
- nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
-}
-
static void nfsd_file_lru_add(struct nfsd_file *nf)
{
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
@@ -386,31 +407,18 @@ static void nfsd_file_lru_remove(struct nfsd_file *nf)
trace_nfsd_file_lru_del(nf);
}

-static void
-nfsd_file_hash_remove(struct nfsd_file *nf)
-{
- trace_nfsd_file_unhash(nf);
-
- if (nfsd_file_check_write_error(nf))
- nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
- rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash,
- nfsd_file_rhash_params);
-}
-
-static bool
-nfsd_file_unhash(struct nfsd_file *nf)
+struct nfsd_file *
+nfsd_file_get(struct nfsd_file *nf)
{
- if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- nfsd_file_hash_remove(nf);
- return true;
- }
- return false;
+ if (likely(refcount_inc_not_zero(&nf->nf_ref)))
+ return nf;
+ return NULL;
}

static void
-nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
+nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
{
- trace_nfsd_file_unhash_and_dispose(nf);
+ trace_nfsd_file_unhash_and_queue(nf);
if (nfsd_file_unhash(nf)) {
/* caller must call nfsd_file_dispose_list() later */
nfsd_file_lru_remove(nf);
@@ -448,7 +456,7 @@ nfsd_file_put(struct nfsd_file *nf)
nfsd_file_unhash_and_put(nf);

if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- nfsd_file_flush(nf);
+ nfsd_file_fsync(nf);
nfsd_file_put_noref(nf);
} else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
nfsd_file_put_noref(nf);
@@ -457,14 +465,6 @@ nfsd_file_put(struct nfsd_file *nf)
nfsd_file_put_noref(nf);
}

-struct nfsd_file *
-nfsd_file_get(struct nfsd_file *nf)
-{
- if (likely(refcount_inc_not_zero(&nf->nf_ref)))
- return nf;
- return NULL;
-}
-
static void
nfsd_file_dispose_list(struct list_head *dispose)
{
@@ -473,7 +473,7 @@ nfsd_file_dispose_list(struct list_head *dispose)
while(!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
list_del_init(&nf->nf_lru);
- nfsd_file_flush(nf);
+ nfsd_file_fsync(nf);
nfsd_file_put_noref(nf);
}
}
@@ -487,7 +487,7 @@ nfsd_file_dispose_list_sync(struct list_head *dispose)
while(!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
list_del_init(&nf->nf_lru);
- nfsd_file_flush(nf);
+ nfsd_file_fsync(nf);
if (!refcount_dec_and_test(&nf->nf_ref))
continue;
if (nfsd_file_free(nf))
@@ -687,7 +687,7 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
nfsd_file_rhash_params);
if (!nf)
break;
- nfsd_file_unhash_and_dispose(nf, dispose);
+ nfsd_file_unhash_and_queue(nf, dispose);
count++;
} while (1);
rcu_read_unlock();
@@ -695,37 +695,37 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
}

/**
- * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Unhash and put, then flush and fput all cache items associated with @inode.
+ * Unhash and put all cache item associated with @inode.
*/
-void
-nfsd_file_close_inode_sync(struct inode *inode)
+static void
+nfsd_file_close_inode(struct inode *inode)
{
LIST_HEAD(dispose);
unsigned int count;

count = __nfsd_file_close_inode(inode, &dispose);
- trace_nfsd_file_close_inode_sync(inode, count);
- nfsd_file_dispose_list_sync(&dispose);
+ trace_nfsd_file_close_inode(inode, count);
+ nfsd_file_dispose_list_delayed(&dispose);
}

/**
- * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
+ * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Unhash and put all cache item associated with @inode.
+ * Unhash and put, then flush and fput all cache items associated with @inode.
*/
-static void
-nfsd_file_close_inode(struct inode *inode)
+void
+nfsd_file_close_inode_sync(struct inode *inode)
{
LIST_HEAD(dispose);
unsigned int count;

count = __nfsd_file_close_inode(inode, &dispose);
- trace_nfsd_file_close_inode(inode, count);
- nfsd_file_dispose_list_delayed(&dispose);
+ trace_nfsd_file_close_inode_sync(inode, count);
+ nfsd_file_dispose_list_sync(&dispose);
}

/**
@@ -890,7 +890,7 @@ __nfsd_file_cache_purge(struct net *net)
while (!IS_ERR_OR_NULL(nf)) {
if (net && nf->nf_net != net)
continue;
- nfsd_file_unhash_and_dispose(nf, &dispose);
+ nfsd_file_unhash_and_queue(nf, &dispose);
nf = rhashtable_walk_next(&iter);
}

diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index b09ab4f92d43..940252482fd4 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -903,10 +903,10 @@ DEFINE_EVENT(nfsd_file_class, name, \
TP_PROTO(struct nfsd_file *nf), \
TP_ARGS(nf))

-DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
-DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);

TRACE_EVENT(nfsd_file_alloc,
TP_PROTO(
--
2.38.1


2022-10-31 11:43:46

by Jeff Layton

Subject: [PATCH v4 5/5] nfsd: start non-blocking writeback after adding nfsd_file to the LRU

When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
so that we can be ready to close it when the time comes. This should
help minimize delays when freeing objects reaped from the LRU.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/filecache.c | 23 +++++++++++++++++++++--
1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 47cdc6129a7b..c43b6cff03e2 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -325,6 +325,20 @@ nfsd_file_fsync(struct nfsd_file *nf)
nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
}

+static void
+nfsd_file_flush(struct nfsd_file *nf)
+{
+ struct file *file = nf->nf_file;
+ struct address_space *mapping;
+
+ if (!file || !(file->f_mode & FMODE_WRITE))
+ return;
+
+ mapping = file->f_mapping;
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ filemap_flush(mapping);
+}
+
static int
nfsd_file_check_write_error(struct nfsd_file *nf)
{
@@ -484,9 +498,14 @@ nfsd_file_put(struct nfsd_file *nf)

/* Try to add it to the LRU. If that fails, decrement. */
if (nfsd_file_lru_add(nf)) {
- /* If it's still hashed, we're done */
- if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
+ /*
+ * If it's still hashed, we can just return now,
+ * after kicking off SYNC_NONE writeback.
+ */
+ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ nfsd_file_flush(nf);
return;
+ }

/*
* We're racing with unhashing, so try to remove it from
--
2.38.1


2022-10-31 11:43:47

by Jeff Layton

Subject: [PATCH v4 3/5] nfsd: rework refcounting in filecache

The filecache refcounting is a bit non-standard for something searchable
by RCU, in that we maintain a sentinel reference while it's hashed. This
in turn requires that we do things differently in the "put" depending on
whether it's hashed, which we believe has led to races.

There are other problems in here too. nfsd_file_close_inode_sync can end
up freeing an nfsd_file while there are still outstanding references to
it, and there are a number of subtle ToC/ToU races.

Rework the code so that the refcount is what drives the lifecycle. When
the refcount goes to zero, then unhash and rcu free the object.

With this change, the LRU carries a reference. Take special care to
deal with it when removing an entry from the list.
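
With that, the model is the usual one for RCU-searchable objects; very
roughly (sketch only, the details are in the patch below):

    /* lookup side: only pin objects that still have a live reference */
    rcu_read_lock();
    nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, nfsd_file_rhash_params);
    if (nf && !refcount_inc_not_zero(&nf->nf_ref))
        nf = NULL;
    rcu_read_unlock();

    /* put side: the last reference unhashes and defers freeing to RCU */
    if (refcount_dec_and_test(&nf->nf_ref)) {
        nfsd_file_unhash(nf);
        call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
    }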

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/filecache.c | 247 +++++++++++++++++++++-----------------------
fs/nfsd/trace.h | 1 +
2 files changed, 121 insertions(+), 127 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ce80b6a80ffc..d928c5e38eeb 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -1,6 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
/*
* The NFSD open file cache.
+ *
+ * Each nfsd_file is created in response to client activity -- either regular
+ * file I/O for v2/v3, or opening a file for v4. Files opened via v4 are
+ * cleaned up as soon as their refcount goes to 0. Entries for v2/v3 are
+ * flagged with NFSD_FILE_GC. On their last put, they are added to the LRU for
+ * eventual disposal if they aren't used again within a short time period.
*/

#include <linux/hash.h>
@@ -301,8 +307,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
if (key->gc)
__set_bit(NFSD_FILE_GC, &nf->nf_flags);
nf->nf_inode = key->inode;
- /* nf_ref is pre-incremented for hash table */
- refcount_set(&nf->nf_ref, 2);
+ refcount_set(&nf->nf_ref, 1);
nf->nf_may = key->need;
nf->nf_mark = NULL;
}
@@ -351,23 +356,25 @@ nfsd_file_unhash(struct nfsd_file *nf)
return false;
}

-static bool
+static void
nfsd_file_free(struct nfsd_file *nf)
{
s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
- bool flush = false;
+
+ trace_nfsd_file_free(nf);

this_cpu_inc(nfsd_file_releases);
this_cpu_add(nfsd_file_total_age, age);

- trace_nfsd_file_free(nf);
+ nfsd_file_unhash(nf);
+ nfsd_file_fsync(nf);
+
if (nf->nf_mark)
nfsd_file_mark_put(nf->nf_mark);
if (nf->nf_file) {
get_file(nf->nf_file);
filp_close(nf->nf_file, NULL);
fput(nf->nf_file);
- flush = true;
}

/*
@@ -375,10 +382,9 @@ nfsd_file_free(struct nfsd_file *nf)
* WARN and leak it to preserve system stability.
*/
if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
- return flush;
+ return;

call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
- return flush;
}

static bool
@@ -394,17 +400,23 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
}

-static void nfsd_file_lru_add(struct nfsd_file *nf)
+static bool nfsd_file_lru_add(struct nfsd_file *nf)
{
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
+ if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
trace_nfsd_file_lru_add(nf);
+ return true;
+ }
+ return false;
}

-static void nfsd_file_lru_remove(struct nfsd_file *nf)
+static bool nfsd_file_lru_remove(struct nfsd_file *nf)
{
- if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
+ if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
trace_nfsd_file_lru_del(nf);
+ return true;
+ }
+ return false;
}

struct nfsd_file *
@@ -415,86 +427,77 @@ nfsd_file_get(struct nfsd_file *nf)
return NULL;
}

-static void
+/**
+ * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
+ * @nf: nfsd_file to be unhashed and queued
+ * @dispose: list to which it should be queued
+ *
+ * Attempt to unhash a nfsd_file and queue it to the given list. Each file
+ * will have a reference held on behalf of the list. That reference may come
+ * from the LRU, or we may need to take one. If we can't get a reference,
+ * ignore it altogether.
+ */
+static bool
nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
{
trace_nfsd_file_unhash_and_queue(nf);
if (nfsd_file_unhash(nf)) {
- /* caller must call nfsd_file_dispose_list() later */
- nfsd_file_lru_remove(nf);
+ /*
+ * If we remove it from the LRU, then just use that
+ * reference for the dispose list. Otherwise, we need
+ * to take a reference. If that fails, just ignore
+ * the file altogether.
+ */
+ if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
+ return false;
list_add(&nf->nf_lru, dispose);
+ return true;
}
+ return false;
}

-static void
-nfsd_file_put_noref(struct nfsd_file *nf)
-{
- trace_nfsd_file_put(nf);
-
- if (refcount_dec_and_test(&nf->nf_ref)) {
- WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
- nfsd_file_lru_remove(nf);
- nfsd_file_free(nf);
- }
-}
-
-static void
-nfsd_file_unhash_and_put(struct nfsd_file *nf)
-{
- if (nfsd_file_unhash(nf))
- nfsd_file_put_noref(nf);
-}
-
+/**
+ * nfsd_file_put - put the reference to a nfsd_file
+ * @nf: nfsd_file of which to put the reference
+ *
+ * Put a reference to a nfsd_file. In the v4 case, we just put the
+ * reference immediately. In the v2/3 case, if the reference would be
+ * the last one, the put it on the LRU instead to be cleaned up later.
+ */
void
nfsd_file_put(struct nfsd_file *nf)
{
- might_sleep();
-
- if (test_bit(NFSD_FILE_GC, &nf->nf_flags))
- nfsd_file_lru_add(nf);
- else if (refcount_read(&nf->nf_ref) == 2)
- nfsd_file_unhash_and_put(nf);
-
- if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- nfsd_file_fsync(nf);
- nfsd_file_put_noref(nf);
- } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
- nfsd_file_put_noref(nf);
- nfsd_file_schedule_laundrette();
- } else
- nfsd_file_put_noref(nf);
-}
-
-static void
-nfsd_file_dispose_list(struct list_head *dispose)
-{
- struct nfsd_file *nf;
+ trace_nfsd_file_put(nf);

- while(!list_empty(dispose)) {
- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del_init(&nf->nf_lru);
- nfsd_file_fsync(nf);
- nfsd_file_put_noref(nf);
+ /*
+ * The HASHED check is racy. We may end up with the occasional
+ * unhashed entry on the LRU, but they should get cleaned up
+ * like any other.
+ */
+ if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
+ test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ /*
+ * If this is the last reference (nf_ref == 1), then transfer
+ * it to the LRU. If the add to the LRU fails, just put it as
+ * usual.
+ */
+ if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
+ return;
}
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
}

static void
-nfsd_file_dispose_list_sync(struct list_head *dispose)
+nfsd_file_dispose_list(struct list_head *dispose)
{
- bool flush = false;
struct nfsd_file *nf;

while(!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
list_del_init(&nf->nf_lru);
- nfsd_file_fsync(nf);
- if (!refcount_dec_and_test(&nf->nf_ref))
- continue;
- if (nfsd_file_free(nf))
- flush = true;
+ nfsd_file_free(nf);
}
- if (flush)
- flush_delayed_fput();
}

static void
@@ -564,21 +567,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
struct list_head *head = arg;
struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);

- /*
- * Do a lockless refcount check. The hashtable holds one reference, so
- * we look to see if anything else has a reference, or if any have
- * been put since the shrinker last ran. Those don't get unhashed and
- * released.
- *
- * Note that in the put path, we set the flag and then decrement the
- * counter. Here we check the counter and then test and clear the flag.
- * That order is deliberate to ensure that we can do this locklessly.
- */
- if (refcount_read(&nf->nf_ref) > 1) {
- list_lru_isolate(lru, &nf->nf_lru);
- trace_nfsd_file_gc_in_use(nf);
- return LRU_REMOVED;
- }
+ /* We should only be dealing with v2/3 entries here */
+ WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));

/*
* Don't throw out files that are still undergoing I/O or
@@ -589,40 +579,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
return LRU_SKIP;
}

+ /* If it was recently added to the list, skip it */
if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
trace_nfsd_file_gc_referenced(nf);
return LRU_ROTATE;
}

- if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- trace_nfsd_file_gc_hashed(nf);
- return LRU_SKIP;
+ /*
+ * Put the reference held on behalf of the LRU. If it wasn't the last
+ * one, then just remove it from the LRU and ignore it.
+ */
+ if (!refcount_dec_and_test(&nf->nf_ref)) {
+ trace_nfsd_file_gc_in_use(nf);
+ list_lru_isolate(lru, &nf->nf_lru);
+ return LRU_REMOVED;
}

+ /* Refcount went to zero. Unhash it and queue it to the dispose list */
+ nfsd_file_unhash(nf);
list_lru_isolate_move(lru, &nf->nf_lru, head);
this_cpu_inc(nfsd_file_evictions);
trace_nfsd_file_gc_disposed(nf);
return LRU_REMOVED;
}

-/*
- * Unhash items on @dispose immediately, then queue them on the
- * disposal workqueue to finish releasing them in the background.
- *
- * cel: Note that between the time list_lru_shrink_walk runs and
- * now, these items are in the hash table but marked unhashed.
- * Why release these outside of lru_cb ? There's no lock ordering
- * problem since lru_cb currently takes no lock.
- */
-static void nfsd_file_gc_dispose_list(struct list_head *dispose)
-{
- struct nfsd_file *nf;
-
- list_for_each_entry(nf, dispose, nf_lru)
- nfsd_file_hash_remove(nf);
- nfsd_file_dispose_list_delayed(dispose);
-}
-
static void
nfsd_file_gc(void)
{
@@ -632,7 +612,7 @@ nfsd_file_gc(void)
ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
&dispose, list_lru_count(&nfsd_file_lru));
trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
- nfsd_file_gc_dispose_list(&dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
}

static void
@@ -657,7 +637,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
nfsd_file_lru_cb, &dispose);
trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
- nfsd_file_gc_dispose_list(&dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
return ret;
}

@@ -668,8 +648,11 @@ static struct shrinker nfsd_file_shrinker = {
};

/*
- * Find all cache items across all net namespaces that match @inode and
- * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
+ * Find all cache items across all net namespaces that match @inode, unhash
+ * them, take references and then put them on @dispose if that was successful.
+ *
+ * The nfsd_file objects on the list will be unhashed, and each will have a
+ * reference taken.
*/
static unsigned int
__nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
@@ -687,8 +670,9 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
nfsd_file_rhash_params);
if (!nf)
break;
- nfsd_file_unhash_and_queue(nf, dispose);
- count++;
+
+ if (nfsd_file_unhash_and_queue(nf, dispose))
+ count++;
} while (1);
rcu_read_unlock();
return count;
@@ -700,15 +684,25 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
*
* Unhash and put all cache item associated with @inode.
*/
-static void
+static unsigned int
nfsd_file_close_inode(struct inode *inode)
{
- LIST_HEAD(dispose);
+ struct nfsd_file *nf;
unsigned int count;
+ LIST_HEAD(dispose);

count = __nfsd_file_close_inode(inode, &dispose);
trace_nfsd_file_close_inode(inode, count);
- nfsd_file_dispose_list_delayed(&dispose);
+ if (count) {
+ while(!list_empty(&dispose)) {
+ nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
+ list_del_init(&nf->nf_lru);
+ trace_nfsd_file_closing(nf);
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
+ }
+ }
+ return count;
}

/**
@@ -720,19 +714,15 @@ nfsd_file_close_inode(struct inode *inode)
void
nfsd_file_close_inode_sync(struct inode *inode)
{
- LIST_HEAD(dispose);
- unsigned int count;
-
- count = __nfsd_file_close_inode(inode, &dispose);
- trace_nfsd_file_close_inode_sync(inode, count);
- nfsd_file_dispose_list_sync(&dispose);
+ if (nfsd_file_close_inode(inode))
+ flush_delayed_fput();
}

/**
* nfsd_file_delayed_close - close unused nfsd_files
* @work: dummy
*
- * Walk the LRU list and close any entries that have not been used since
+ * Walk the LRU list and destroy any entries that have not been used since
* the last scan.
*/
static void
@@ -1054,8 +1044,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
rcu_read_lock();
nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
nfsd_file_rhash_params);
- if (nf)
- nf = nfsd_file_get(nf);
+ if (nf) {
+ if (!nfsd_file_lru_remove(nf))
+ nf = nfsd_file_get(nf);
+ }
rcu_read_unlock();
if (nf)
goto wait_for_construction;
@@ -1090,11 +1082,11 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
}
open_retry = false;
- nfsd_file_put_noref(nf);
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
goto retry;
}

- nfsd_file_lru_remove(nf);
this_cpu_inc(nfsd_file_cache_hits);

status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
@@ -1104,7 +1096,8 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
this_cpu_inc(nfsd_file_acquisitions);
*pnf = nf;
} else {
- nfsd_file_put(nf);
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
nf = NULL;
}

@@ -1131,7 +1124,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
* then unhash.
*/
if (status != nfs_ok || key.inode->i_nlink == 0)
- nfsd_file_unhash_and_put(nf);
+ nfsd_file_unhash(nf);
clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
smp_mb__after_atomic();
wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 940252482fd4..a44ded06af87 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -906,6 +906,7 @@ DEFINE_EVENT(nfsd_file_class, name, \
DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);

TRACE_EVENT(nfsd_file_alloc,
--
2.38.1


2022-10-31 21:03:53

by Chuck Lever III

Subject: Re: [PATCH v4 3/5] nfsd: rework refcounting in filecache



> On Oct 31, 2022, at 7:37 AM, Jeff Layton <[email protected]> wrote:
>
> The filecache refcounting is a bit non-standard for something searchable
> by RCU, in that we maintain a sentinel reference while it's hashed. This
> in turn requires that we have to do things differently in the "put"
> depending on whether its hashed, which we believe to have led to races.
>
> There are other problems in here too. nfsd_file_close_inode_sync can end
> up freeing an nfsd_file while there are still outstanding references to
> it, and there are a number of subtle ToC/ToU races.
>
> Rework the code so that the refcount is what drives the lifecycle. When
> the refcount goes to zero, then unhash and rcu free the object.
>
> With this change, the LRU carries a reference. Take special care to
> deal with it when removing an entry from the list.

....removing an entry from the dispose list, or the LRU?

I spent today reviewing and testing this series, and
comparing its behavior to the behavior of the current
code base. A few general comments below:


> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/nfsd/filecache.c | 247 +++++++++++++++++++++-----------------------
> fs/nfsd/trace.h | 1 +
> 2 files changed, 121 insertions(+), 127 deletions(-)
>
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index ce80b6a80ffc..d928c5e38eeb 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -1,6 +1,12 @@
> // SPDX-License-Identifier: GPL-2.0
> /*
> * The NFSD open file cache.
> + *
> + * Each nfsd_file is created in response to client activity -- either regular
> + * file I/O for v2/v3, or opening a file for v4. Files opened via v4 are
> + * cleaned up as soon as their refcount goes to 0. Entries for v2/v3 are
> + * flagged with NFSD_FILE_GC. On their last put, they are added to the LRU for
> + * eventual disposal if they aren't used again within a short time period.

Trond and I discussed intentionally not using the terms "v4-only"
and "v2/3" in the filecache comments and function names. That's
why the flag is called GC and not, say, NFSV3. Can the new comments
stick with gc and nogc?


> */
>
> #include <linux/hash.h>
> @@ -301,8 +307,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> if (key->gc)
> __set_bit(NFSD_FILE_GC, &nf->nf_flags);
> nf->nf_inode = key->inode;
> - /* nf_ref is pre-incremented for hash table */
> - refcount_set(&nf->nf_ref, 2);
> + refcount_set(&nf->nf_ref, 1);
> nf->nf_may = key->need;
> nf->nf_mark = NULL;
> }
> @@ -351,23 +356,25 @@ nfsd_file_unhash(struct nfsd_file *nf)
> return false;
> }
>

There's still no explanation in the code of why writeback is
needed for GC files. I can write a patch to add that, say,
right here as a documenting comment for nfsd_file_free, if
you agree.


> -static bool
> +static void
> nfsd_file_free(struct nfsd_file *nf)
> {
> s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
> - bool flush = false;
> +
> + trace_nfsd_file_free(nf);

Maybe move this tracepoint in 2/5 instead? Or just leave it
where it was. Up to you.


>
> this_cpu_inc(nfsd_file_releases);
> this_cpu_add(nfsd_file_total_age, age);
>
> - trace_nfsd_file_free(nf);
> + nfsd_file_unhash(nf);
> + nfsd_file_fsync(nf);
> +
> if (nf->nf_mark)
> nfsd_file_mark_put(nf->nf_mark);
> if (nf->nf_file) {
> get_file(nf->nf_file);
> filp_close(nf->nf_file, NULL);
> fput(nf->nf_file);
> - flush = true;
> }
>
> /*
> @@ -375,10 +382,9 @@ nfsd_file_free(struct nfsd_file *nf)
> * WARN and leak it to preserve system stability.
> */
> if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
> - return flush;
> + return;
>
> call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
> - return flush;
> }
>
> static bool
> @@ -394,17 +400,23 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
> mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
> }
>
> -static void nfsd_file_lru_add(struct nfsd_file *nf)
> +static bool nfsd_file_lru_add(struct nfsd_file *nf)
> {
> set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
> + if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> trace_nfsd_file_lru_add(nf);
> + return true;
> + }
> + return false;
> }
>
> -static void nfsd_file_lru_remove(struct nfsd_file *nf)
> +static bool nfsd_file_lru_remove(struct nfsd_file *nf)
> {
> - if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
> + if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
> trace_nfsd_file_lru_del(nf);
> + return true;
> + }
> + return false;
> }
>
> struct nfsd_file *
> @@ -415,86 +427,77 @@ nfsd_file_get(struct nfsd_file *nf)
> return NULL;
> }
>
> -static void
> +/**
> + * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
> + * @nf: nfsd_file to be unhashed and queued
> + * @dispose: list to which it should be queued
> + *
> + * Attempt to unhash a nfsd_file and queue it to the given list. Each file
> + * will have a reference held on behalf of the list. That reference may come
> + * from the LRU, or we may need to take one. If we can't get a reference,
> + * ignore it altogether.
> + */
> +static bool
> nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
> {
> trace_nfsd_file_unhash_and_queue(nf);
> if (nfsd_file_unhash(nf)) {
> - /* caller must call nfsd_file_dispose_list() later */
> - nfsd_file_lru_remove(nf);
> + /*
> + * If we remove it from the LRU, then just use that
> + * reference for the dispose list. Otherwise, we need
> + * to take a reference. If that fails, just ignore
> + * the file altogether.
> + */
> + if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
> + return false;
> list_add(&nf->nf_lru, dispose);
> + return true;
> }
> + return false;
> }
>
> -static void
> -nfsd_file_put_noref(struct nfsd_file *nf)
> -{
> - trace_nfsd_file_put(nf);
> -
> - if (refcount_dec_and_test(&nf->nf_ref)) {
> - WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
> - nfsd_file_lru_remove(nf);
> - nfsd_file_free(nf);
> - }
> -}
> -
> -static void
> -nfsd_file_unhash_and_put(struct nfsd_file *nf)
> -{
> - if (nfsd_file_unhash(nf))
> - nfsd_file_put_noref(nf);
> -}
> -
> +/**
> + * nfsd_file_put - put the reference to a nfsd_file
> + * @nf: nfsd_file of which to put the reference
> + *
> + * Put a reference to a nfsd_file. In the v4 case, we just put the
> + * reference immediately. In the v2/3 case, if the reference would be
> + * the last one, the put it on the LRU instead to be cleaned up later.
> + */
> void
> nfsd_file_put(struct nfsd_file *nf)
> {
> - might_sleep();

The might_sleep() annotation should be added back to nfsd_file_fsync()
or its callers. It might give more useful warnings if you were to
annotate the external API functions (as was done here) rather than
internal helper functions.


> -
> - if (test_bit(NFSD_FILE_GC, &nf->nf_flags))
> - nfsd_file_lru_add(nf);
> - else if (refcount_read(&nf->nf_ref) == 2)
> - nfsd_file_unhash_and_put(nf);
> -
> - if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> - nfsd_file_fsync(nf);
> - nfsd_file_put_noref(nf);
> - } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> - nfsd_file_put_noref(nf);
> - nfsd_file_schedule_laundrette();

This might be the only caller of nfsd_file_schedule_laundrette.
How does the laundrette get scheduled now?


> - } else
> - nfsd_file_put_noref(nf);
> -}
> -
> -static void
> -nfsd_file_dispose_list(struct list_head *dispose)
> -{
> - struct nfsd_file *nf;
> + trace_nfsd_file_put(nf);
>
> - while(!list_empty(dispose)) {
> - nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> - list_del_init(&nf->nf_lru);
> - nfsd_file_fsync(nf);
> - nfsd_file_put_noref(nf);
> + /*
> + * The HASHED check is racy. We may end up with the occasional
> + * unhashed entry on the LRU, but they should get cleaned up
> + * like any other.
> + */
> + if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
> + test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> + /*
> + * If this is the last reference (nf_ref == 1), then transfer
> + * it to the LRU. If the add to the LRU fails, just put it as
> + * usual.
> + */
> + if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> + return;
> }
> + if (refcount_dec_and_test(&nf->nf_ref))
> + nfsd_file_free(nf);
> }
>
> static void
> -nfsd_file_dispose_list_sync(struct list_head *dispose)
> +nfsd_file_dispose_list(struct list_head *dispose)
> {
> - bool flush = false;
> struct nfsd_file *nf;
>
> while(!list_empty(dispose)) {
> nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> list_del_init(&nf->nf_lru);
> - nfsd_file_fsync(nf);
> - if (!refcount_dec_and_test(&nf->nf_ref))
> - continue;
> - if (nfsd_file_free(nf))
> - flush = true;
> + nfsd_file_free(nf);
> }
> - if (flush)
> - flush_delayed_fput();
> }
>
> static void
> @@ -564,21 +567,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> struct list_head *head = arg;
> struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
>
> - /*
> - * Do a lockless refcount check. The hashtable holds one reference, so
> - * we look to see if anything else has a reference, or if any have
> - * been put since the shrinker last ran. Those don't get unhashed and
> - * released.
> - *
> - * Note that in the put path, we set the flag and then decrement the
> - * counter. Here we check the counter and then test and clear the flag.
> - * That order is deliberate to ensure that we can do this locklessly.
> - */
> - if (refcount_read(&nf->nf_ref) > 1) {
> - list_lru_isolate(lru, &nf->nf_lru);
> - trace_nfsd_file_gc_in_use(nf);
> - return LRU_REMOVED;
> - }
> + /* We should only be dealing with v2/3 entries here */
> + WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
>
> /*
> * Don't throw out files that are still undergoing I/O or
> @@ -589,40 +579,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> return LRU_SKIP;
> }
>
> + /* If it was recently added to the list, skip it */
> if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
> trace_nfsd_file_gc_referenced(nf);
> return LRU_ROTATE;
> }
>
> - if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> - trace_nfsd_file_gc_hashed(nf);
> - return LRU_SKIP;
> + /*
> + * Put the reference held on behalf of the LRU. If it wasn't the last
> + * one, then just remove it from the LRU and ignore it.
> + */
> + if (!refcount_dec_and_test(&nf->nf_ref)) {
> + trace_nfsd_file_gc_in_use(nf);
> + list_lru_isolate(lru, &nf->nf_lru);
> + return LRU_REMOVED;
> }
>
> + /* Refcount went to zero. Unhash it and queue it to the dispose list */
> + nfsd_file_unhash(nf);
> list_lru_isolate_move(lru, &nf->nf_lru, head);
> this_cpu_inc(nfsd_file_evictions);
> trace_nfsd_file_gc_disposed(nf);
> return LRU_REMOVED;
> }
>
> -/*
> - * Unhash items on @dispose immediately, then queue them on the
> - * disposal workqueue to finish releasing them in the background.
> - *
> - * cel: Note that between the time list_lru_shrink_walk runs and
> - * now, these items are in the hash table but marked unhashed.
> - * Why release these outside of lru_cb ? There's no lock ordering
> - * problem since lru_cb currently takes no lock.
> - */
> -static void nfsd_file_gc_dispose_list(struct list_head *dispose)
> -{
> - struct nfsd_file *nf;
> -
> - list_for_each_entry(nf, dispose, nf_lru)
> - nfsd_file_hash_remove(nf);
> - nfsd_file_dispose_list_delayed(dispose);
> -}
> -
> static void
> nfsd_file_gc(void)
> {
> @@ -632,7 +612,7 @@ nfsd_file_gc(void)
> ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
> &dispose, list_lru_count(&nfsd_file_lru));
> trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
> - nfsd_file_gc_dispose_list(&dispose);
> + nfsd_file_dispose_list_delayed(&dispose);
> }
>
> static void
> @@ -657,7 +637,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
> ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
> nfsd_file_lru_cb, &dispose);
> trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
> - nfsd_file_gc_dispose_list(&dispose);
> + nfsd_file_dispose_list_delayed(&dispose);
> return ret;
> }
>
> @@ -668,8 +648,11 @@ static struct shrinker nfsd_file_shrinker = {
> };
>
> /*
> - * Find all cache items across all net namespaces that match @inode and
> - * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
> + * Find all cache items across all net namespaces that match @inode, unhash
> + * them, take references and then put them on @dispose if that was successful.
> + *
> + * The nfsd_file objects on the list will be unhashed, and each will have a
> + * reference taken.
> */
> static unsigned int
> __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> @@ -687,8 +670,9 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> nfsd_file_rhash_params);
> if (!nf)
> break;
> - nfsd_file_unhash_and_queue(nf, dispose);
> - count++;
> +
> + if (nfsd_file_unhash_and_queue(nf, dispose))
> + count++;
> } while (1);
> rcu_read_unlock();
> return count;
> @@ -700,15 +684,25 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> *
> * Unhash and put all cache item associated with @inode.
> */
> -static void
> +static unsigned int
> nfsd_file_close_inode(struct inode *inode)
> {
> - LIST_HEAD(dispose);
> + struct nfsd_file *nf;
> unsigned int count;
> + LIST_HEAD(dispose);
>
> count = __nfsd_file_close_inode(inode, &dispose);
> trace_nfsd_file_close_inode(inode, count);
> - nfsd_file_dispose_list_delayed(&dispose);
> + if (count) {

Why is it necessary to check @count? I would think that
the empty list check would be sufficient.


> + while(!list_empty(&dispose)) {
> + nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
> + list_del_init(&nf->nf_lru);
> + trace_nfsd_file_closing(nf);
> + if (refcount_dec_and_test(&nf->nf_ref))
> + nfsd_file_free(nf);
> + }
> + }
> + return count;
> }
>
> /**
> @@ -720,19 +714,15 @@ nfsd_file_close_inode(struct inode *inode)
> void
> nfsd_file_close_inode_sync(struct inode *inode)
> {
> - LIST_HEAD(dispose);
> - unsigned int count;
> -
> - count = __nfsd_file_close_inode(inode, &dispose);
> - trace_nfsd_file_close_inode_sync(inode, count);
> - nfsd_file_dispose_list_sync(&dispose);
> + if (nfsd_file_close_inode(inode))
> + flush_delayed_fput();
> }
>
> /**
> * nfsd_file_delayed_close - close unused nfsd_files
> * @work: dummy
> *
> - * Walk the LRU list and close any entries that have not been used since
> + * Walk the LRU list and destroy any entries that have not been used since
> * the last scan.
> */
> static void
> @@ -1054,8 +1044,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> rcu_read_lock();
> nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
> nfsd_file_rhash_params);
> - if (nf)
> - nf = nfsd_file_get(nf);
> + if (nf) {
> + if (!nfsd_file_lru_remove(nf))
> + nf = nfsd_file_get(nf);
> + }
> rcu_read_unlock();
> if (nf)
> goto wait_for_construction;
> @@ -1090,11 +1082,11 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> goto out;
> }
> open_retry = false;
> - nfsd_file_put_noref(nf);
> + if (refcount_dec_and_test(&nf->nf_ref))
> + nfsd_file_free(nf);
> goto retry;
> }
>
> - nfsd_file_lru_remove(nf);
> this_cpu_inc(nfsd_file_cache_hits);
>
> status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
> @@ -1104,7 +1096,8 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> this_cpu_inc(nfsd_file_acquisitions);
> *pnf = nf;
> } else {
> - nfsd_file_put(nf);
> + if (refcount_dec_and_test(&nf->nf_ref))
> + nfsd_file_free(nf);
> nf = NULL;
> }
>
> @@ -1131,7 +1124,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> * then unhash.
> */
> if (status != nfs_ok || key.inode->i_nlink == 0)
> - nfsd_file_unhash_and_put(nf);
> + nfsd_file_unhash(nf);
> clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
> smp_mb__after_atomic();
> wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index 940252482fd4..a44ded06af87 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -906,6 +906,7 @@ DEFINE_EVENT(nfsd_file_class, name, \
> DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
> DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
> DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
> +DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
> DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
>
> TRACE_EVENT(nfsd_file_alloc,
> --
> 2.38.1
>

--
Chuck Lever




2022-10-31 21:18:39

by Chuck Lever III

Subject: Re: [PATCH v4 5/5] nfsd: start non-blocking writeback after adding nfsd_file to the LRU



> On Oct 31, 2022, at 7:37 AM, Jeff Layton <[email protected]> wrote:
>
> When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> so that we can be ready to close it when the time comes. This should
> help minimize delays when freeing objects reaped from the LRU.

Tested against a btrfs export.

So, the current code does indeed do a synchronous fsync when
garbage collecting files (via nfsd_file_delayed_close()).
That indicates that it's at least safe to do, and 3/5 isn't
changing the safety of the filecache by moving the vfs_fsync()
call into nfsd_file_free(). These calls take between 10 and
20 milliseconds each.

But I see the filemap_flush() call added here taking dozens of
milliseconds on my system for large files. This is done before
the WRITE reply is sent to the client, so it adds significant
latency to large UNSTABLE WRITEs. In the current code, the
vfs_fsync() in nfsd_file_put() does not seem to fire except in
very rare circumstances, so it doesn't seem to have much if any
impact.

My scalability concerns therefore are with code that pre-dates
this patch series. I can deal with that later in a separate
series. Do we need to keep this one?


> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/nfsd/filecache.c | 23 +++++++++++++++++++++--
> 1 file changed, 21 insertions(+), 2 deletions(-)
>
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index 47cdc6129a7b..c43b6cff03e2 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -325,6 +325,20 @@ nfsd_file_fsync(struct nfsd_file *nf)
> nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> }
>
> +static void
> +nfsd_file_flush(struct nfsd_file *nf)
> +{
> + struct file *file = nf->nf_file;
> + struct address_space *mapping;
> +
> + if (!file || !(file->f_mode & FMODE_WRITE))
> + return;
> +
> + mapping = file->f_mapping;
> + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
> + filemap_flush(mapping);
> +}
> +
> static int
> nfsd_file_check_write_error(struct nfsd_file *nf)
> {
> @@ -484,9 +498,14 @@ nfsd_file_put(struct nfsd_file *nf)
>
> /* Try to add it to the LRU. If that fails, decrement. */
> if (nfsd_file_lru_add(nf)) {
> - /* If it's still hashed, we're done */
> - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
> + /*
> + * If it's still hashed, we can just return now,
> + * after kicking off SYNC_NONE writeback.
> + */
> + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> + nfsd_file_flush(nf);
> return;
> + }
>
> /*
> * We're racing with unhashing, so try to remove it from
> --
> 2.38.1
>

--
Chuck Lever




2022-11-01 11:01:24

by Jeff Layton

Subject: Re: [PATCH v4 3/5] nfsd: rework refcounting in filecache

On Mon, 2022-10-31 at 20:45 +0000, Chuck Lever III wrote:
>
> > On Oct 31, 2022, at 7:37 AM, Jeff Layton <[email protected]> wrote:
> >
> > The filecache refcounting is a bit non-standard for something searchable
> > by RCU, in that we maintain a sentinel reference while it's hashed. This
> > in turn requires that we have to do things differently in the "put"
> > depending on whether its hashed, which we believe to have led to races.
> >
> > There are other problems in here too. nfsd_file_close_inode_sync can end
> > up freeing an nfsd_file while there are still outstanding references to
> > it, and there are a number of subtle ToC/ToU races.
> >
> > Rework the code so that the refcount is what drives the lifecycle. When
> > the refcount goes to zero, then unhash and rcu free the object.
> >
> > With this change, the LRU carries a reference. Take special care to
> > deal with it when removing an entry from the list.
>
> ....removing an entry from the dispose list, or the LRU?
>
> I spent today reviewing and testing this series, and
> comparing its behavior to the behavior of the current
> code base. A few general comments below:
>

Thanks for the review!

>
> > Signed-off-by: Jeff Layton <[email protected]>
> > ---
> > fs/nfsd/filecache.c | 247 +++++++++++++++++++++-----------------------
> > fs/nfsd/trace.h | 1 +
> > 2 files changed, 121 insertions(+), 127 deletions(-)
> >
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index ce80b6a80ffc..d928c5e38eeb 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -1,6 +1,12 @@
> > // SPDX-License-Identifier: GPL-2.0
> > /*
> > * The NFSD open file cache.
> > + *
> > + * Each nfsd_file is created in response to client activity -- either regular
> > + * file I/O for v2/v3, or opening a file for v4. Files opened via v4 are
> > + * cleaned up as soon as their refcount goes to 0. Entries for v2/v3 are
> > + * flagged with NFSD_FILE_GC. On their last put, they are added to the LRU for
> > + * eventual disposal if they aren't used again within a short time period.
>
> Trond and I discussed intentionally not using the terms "v4-only"
> and "v2/3" in the filecache comments and function names. That's
> why the flag is called GC and not, say, NFSV3. Can the new comments
> stick with gc and nogc?
>

This comment is explaining that v2/v3 _are_ GC entries. I can just
remove this comment I guess, but converting all the mentions to gc/non-
gc in this comment will make it nonsensical.

>
> > */
> >
> > #include <linux/hash.h>
> > @@ -301,8 +307,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > if (key->gc)
> > __set_bit(NFSD_FILE_GC, &nf->nf_flags);
> > nf->nf_inode = key->inode;
> > - /* nf_ref is pre-incremented for hash table */
> > - refcount_set(&nf->nf_ref, 2);
> > + refcount_set(&nf->nf_ref, 1);
> > nf->nf_may = key->need;
> > nf->nf_mark = NULL;
> > }
> > @@ -351,23 +356,25 @@ nfsd_file_unhash(struct nfsd_file *nf)
> > return false;
> > }
> >
>
> There's still no explanation in the code of why writeback is
> needed for GC files. I can write a patch to add that, say,
> right here as a documenting comment for nfsd_file_free, if
> you agree.
>
>

I'll add something here.
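
Maybe something like this above the fsync call (wording is just a rough
sketch):

    /*
     * Dirty data is flushed here because this may be the last reference,
     * and for GC entries there is no client open state behind the file.
     * This is our last chance to notice a writeback error and reset the
     * write verifier before the struct file goes away.
     */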

> > -static bool
> > +static void
> > nfsd_file_free(struct nfsd_file *nf)
> > {
> > s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
> > - bool flush = false;
> > +
> > + trace_nfsd_file_free(nf);
>
> Maybe move this tracepoint in 2/5 instead? Or just leave it
> where it was. Up to you.
>

Sure, done.

>
> >
> > this_cpu_inc(nfsd_file_releases);
> > this_cpu_add(nfsd_file_total_age, age);
> >
> > - trace_nfsd_file_free(nf);
> > + nfsd_file_unhash(nf);
> > + nfsd_file_fsync(nf);
> > +
> > if (nf->nf_mark)
> > nfsd_file_mark_put(nf->nf_mark);
> > if (nf->nf_file) {
> > get_file(nf->nf_file);
> > filp_close(nf->nf_file, NULL);
> > fput(nf->nf_file);
> > - flush = true;
> > }
> >
> > /*
> > @@ -375,10 +382,9 @@ nfsd_file_free(struct nfsd_file *nf)
> > * WARN and leak it to preserve system stability.
> > */
> > if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
> > - return flush;
> > + return;
> >
> > call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
> > - return flush;
> > }
> >
> > static bool
> > @@ -394,17 +400,23 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
> > mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
> > }
> >
> > -static void nfsd_file_lru_add(struct nfsd_file *nf)
> > +static bool nfsd_file_lru_add(struct nfsd_file *nf)
> > {
> > set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> > - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
> > + if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> > trace_nfsd_file_lru_add(nf);
> > + return true;
> > + }
> > + return false;
> > }
> >
> > -static void nfsd_file_lru_remove(struct nfsd_file *nf)
> > +static bool nfsd_file_lru_remove(struct nfsd_file *nf)
> > {
> > - if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
> > + if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
> > trace_nfsd_file_lru_del(nf);
> > + return true;
> > + }
> > + return false;
> > }
> >
> > struct nfsd_file *
> > @@ -415,86 +427,77 @@ nfsd_file_get(struct nfsd_file *nf)
> > return NULL;
> > }
> >
> > -static void
> > +/**
> > + * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
> > + * @nf: nfsd_file to be unhashed and queued
> > + * @dispose: list to which it should be queued
> > + *
> > + * Attempt to unhash a nfsd_file and queue it to the given list. Each file
> > + * will have a reference held on behalf of the list. That reference may come
> > + * from the LRU, or we may need to take one. If we can't get a reference,
> > + * ignore it altogether.
> > + */
> > +static bool
> > nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
> > {
> > trace_nfsd_file_unhash_and_queue(nf);
> > if (nfsd_file_unhash(nf)) {
> > - /* caller must call nfsd_file_dispose_list() later */
> > - nfsd_file_lru_remove(nf);
> > + /*
> > + * If we remove it from the LRU, then just use that
> > + * reference for the dispose list. Otherwise, we need
> > + * to take a reference. If that fails, just ignore
> > + * the file altogether.
> > + */
> > + if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
> > + return false;
> > list_add(&nf->nf_lru, dispose);
> > + return true;
> > }
> > + return false;
> > }
> >
> > -static void
> > -nfsd_file_put_noref(struct nfsd_file *nf)
> > -{
> > - trace_nfsd_file_put(nf);
> > -
> > - if (refcount_dec_and_test(&nf->nf_ref)) {
> > - WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
> > - nfsd_file_lru_remove(nf);
> > - nfsd_file_free(nf);
> > - }
> > -}
> > -
> > -static void
> > -nfsd_file_unhash_and_put(struct nfsd_file *nf)
> > -{
> > - if (nfsd_file_unhash(nf))
> > - nfsd_file_put_noref(nf);
> > -}
> > -
> > +/**
> > + * nfsd_file_put - put the reference to a nfsd_file
> > + * @nf: nfsd_file of which to put the reference
> > + *
> > + * Put a reference to a nfsd_file. In the v4 case, we just put the
> > + * reference immediately. In the v2/3 case, if the reference would be
> > + * the last one, then put it on the LRU instead to be cleaned up later.
> > + */
> > void
> > nfsd_file_put(struct nfsd_file *nf)
> > {
> > - might_sleep();
>
> The might_sleep() annotation should be added back to nfsd_file_fsync()
> or its callers. It might give more useful warnings if you were to
> annotate the external API functions (as was done here) rather than
> internal helper functions.
>

Actually, it's probably better in nfsd_file_put(). That'll give us more
coverage.
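
Roughly like this (untested sketch on top of this patch), so that every
caller of nfsd_file_put() trips the annotation:

void
nfsd_file_put(struct nfsd_file *nf)
{
	/* nfsd_file_free() can fsync and close, so callers must be able to sleep */
	might_sleep();
	trace_nfsd_file_put(nf);

	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
	    test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
		/* If this would be the last reference, transfer it to the LRU */
		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
			return;
	}
	if (refcount_dec_and_test(&nf->nf_ref))
		nfsd_file_free(nf);
}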

>
> > -
> > - if (test_bit(NFSD_FILE_GC, &nf->nf_flags))
> > - nfsd_file_lru_add(nf);
> > - else if (refcount_read(&nf->nf_ref) == 2)
> > - nfsd_file_unhash_and_put(nf);
> > -
> > - if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > - nfsd_file_fsync(nf);
> > - nfsd_file_put_noref(nf);
> > - } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > - nfsd_file_put_noref(nf);
> > - nfsd_file_schedule_laundrette();
>
> This might be the only caller of nfsd_file_schedule_laundrette.
> How does the laundrette get scheduled now?
>

Good point. I think we need to schedule it any time an entry is added to
the LRU.
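
Something along these lines (untested) -- kick the laundrette from the
one place where an entry can actually land on the LRU:

static bool
nfsd_file_lru_add(struct nfsd_file *nf)
{
	set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
	if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
		trace_nfsd_file_lru_add(nf);
		/* ensure the GC workqueue will eventually reap this entry */
		nfsd_file_schedule_laundrette();
		return true;
	}
	return false;
}

...or the nfsd_file_schedule_laundrette() call could go in nfsd_file_put()
after a successful LRU add, which amounts to the same thing for this path.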

> > - } else
> > - nfsd_file_put_noref(nf);
> > -}
> > -
> > -static void
> > -nfsd_file_dispose_list(struct list_head *dispose)
> > -{
> > - struct nfsd_file *nf;
> > + trace_nfsd_file_put(nf);
> >
> > - while(!list_empty(dispose)) {
> > - nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > - list_del_init(&nf->nf_lru);
> > - nfsd_file_fsync(nf);
> > - nfsd_file_put_noref(nf);
> > + /*
> > + * The HASHED check is racy. We may end up with the occasional
> > + * unhashed entry on the LRU, but they should get cleaned up
> > + * like any other.
> > + */
> > + if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
> > + test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > + /*
> > + * If this is the last reference (nf_ref == 1), then transfer
> > + * it to the LRU. If the add to the LRU fails, just put it as
> > + * usual.
> > + */
> > + if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > + return;
> > }
> > + if (refcount_dec_and_test(&nf->nf_ref))
> > + nfsd_file_free(nf);
> > }
> >
> > static void
> > -nfsd_file_dispose_list_sync(struct list_head *dispose)
> > +nfsd_file_dispose_list(struct list_head *dispose)
> > {
> > - bool flush = false;
> > struct nfsd_file *nf;
> >
> > while(!list_empty(dispose)) {
> > nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > list_del_init(&nf->nf_lru);
> > - nfsd_file_fsync(nf);
> > - if (!refcount_dec_and_test(&nf->nf_ref))
> > - continue;
> > - if (nfsd_file_free(nf))
> > - flush = true;
> > + nfsd_file_free(nf);
> > }
> > - if (flush)
> > - flush_delayed_fput();
> > }
> >
> > static void
> > @@ -564,21 +567,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > struct list_head *head = arg;
> > struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
> >
> > - /*
> > - * Do a lockless refcount check. The hashtable holds one reference, so
> > - * we look to see if anything else has a reference, or if any have
> > - * been put since the shrinker last ran. Those don't get unhashed and
> > - * released.
> > - *
> > - * Note that in the put path, we set the flag and then decrement the
> > - * counter. Here we check the counter and then test and clear the flag.
> > - * That order is deliberate to ensure that we can do this locklessly.
> > - */
> > - if (refcount_read(&nf->nf_ref) > 1) {
> > - list_lru_isolate(lru, &nf->nf_lru);
> > - trace_nfsd_file_gc_in_use(nf);
> > - return LRU_REMOVED;
> > - }
> > + /* We should only be dealing with v2/3 entries here */
> > + WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
> >
> > /*
> > * Don't throw out files that are still undergoing I/O or
> > @@ -589,40 +579,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > return LRU_SKIP;
> > }
> >
> > + /* If it was recently added to the list, skip it */
> > if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
> > trace_nfsd_file_gc_referenced(nf);
> > return LRU_ROTATE;
> > }
> >
> > - if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > - trace_nfsd_file_gc_hashed(nf);
> > - return LRU_SKIP;
> > + /*
> > + * Put the reference held on behalf of the LRU. If it wasn't the last
> > + * one, then just remove it from the LRU and ignore it.
> > + */
> > + if (!refcount_dec_and_test(&nf->nf_ref)) {
> > + trace_nfsd_file_gc_in_use(nf);
> > + list_lru_isolate(lru, &nf->nf_lru);
> > + return LRU_REMOVED;
> > }
> >
> > + /* Refcount went to zero. Unhash it and queue it to the dispose list */
> > + nfsd_file_unhash(nf);
> > list_lru_isolate_move(lru, &nf->nf_lru, head);
> > this_cpu_inc(nfsd_file_evictions);
> > trace_nfsd_file_gc_disposed(nf);
> > return LRU_REMOVED;
> > }
> >
> > -/*
> > - * Unhash items on @dispose immediately, then queue them on the
> > - * disposal workqueue to finish releasing them in the background.
> > - *
> > - * cel: Note that between the time list_lru_shrink_walk runs and
> > - * now, these items are in the hash table but marked unhashed.
> > - * Why release these outside of lru_cb ? There's no lock ordering
> > - * problem since lru_cb currently takes no lock.
> > - */
> > -static void nfsd_file_gc_dispose_list(struct list_head *dispose)
> > -{
> > - struct nfsd_file *nf;
> > -
> > - list_for_each_entry(nf, dispose, nf_lru)
> > - nfsd_file_hash_remove(nf);
> > - nfsd_file_dispose_list_delayed(dispose);
> > -}
> > -
> > static void
> > nfsd_file_gc(void)
> > {
> > @@ -632,7 +612,7 @@ nfsd_file_gc(void)
> > ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
> > &dispose, list_lru_count(&nfsd_file_lru));
> > trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
> > - nfsd_file_gc_dispose_list(&dispose);
> > + nfsd_file_dispose_list_delayed(&dispose);
> > }
> >
> > static void
> > @@ -657,7 +637,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
> > ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
> > nfsd_file_lru_cb, &dispose);
> > trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
> > - nfsd_file_gc_dispose_list(&dispose);
> > + nfsd_file_dispose_list_delayed(&dispose);
> > return ret;
> > }
> >
> > @@ -668,8 +648,11 @@ static struct shrinker nfsd_file_shrinker = {
> > };
> >
> > /*
> > - * Find all cache items across all net namespaces that match @inode and
> > - * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
> > + * Find all cache items across all net namespaces that match @inode, unhash
> > + * them, take references and then put them on @dispose if that was successful.
> > + *
> > + * The nfsd_file objects on the list will be unhashed, and each will have a
> > + * reference taken.
> > */
> > static unsigned int
> > __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > @@ -687,8 +670,9 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > nfsd_file_rhash_params);
> > if (!nf)
> > break;
> > - nfsd_file_unhash_and_queue(nf, dispose);
> > - count++;
> > +
> > + if (nfsd_file_unhash_and_queue(nf, dispose))
> > + count++;
> > } while (1);
> > rcu_read_unlock();
> > return count;
> > @@ -700,15 +684,25 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > *
> > * Unhash and put all cache item associated with @inode.
> > */
> > -static void
> > +static unsigned int
> > nfsd_file_close_inode(struct inode *inode)
> > {
> > - LIST_HEAD(dispose);
> > + struct nfsd_file *nf;
> > unsigned int count;
> > + LIST_HEAD(dispose);
> >
> > count = __nfsd_file_close_inode(inode, &dispose);
> > trace_nfsd_file_close_inode(inode, count);
> > - nfsd_file_dispose_list_delayed(&dispose);
> > + if (count) {
>
> Why is it necessary to check @count? I would think that
> the empty list check would be sufficient.
>
>

It's not necessary. I'll change it.
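
IOW, drop the count check and just let the list_empty() test drive the
loop (untested):

static unsigned int
nfsd_file_close_inode(struct inode *inode)
{
	struct nfsd_file *nf;
	unsigned int count;
	LIST_HEAD(dispose);

	count = __nfsd_file_close_inode(inode, &dispose);
	trace_nfsd_file_close_inode(inode, count);
	while (!list_empty(&dispose)) {
		nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
		list_del_init(&nf->nf_lru);
		trace_nfsd_file_closing(nf);
		if (refcount_dec_and_test(&nf->nf_ref))
			nfsd_file_free(nf);
	}
	return count;
}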

> > + while(!list_empty(&dispose)) {
> > + nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
> > + list_del_init(&nf->nf_lru);
> > + trace_nfsd_file_closing(nf);
> > + if (refcount_dec_and_test(&nf->nf_ref))
> > + nfsd_file_free(nf);
> > + }
> > + }
> > + return count;
> > }
> >
> > /**
> > @@ -720,19 +714,15 @@ nfsd_file_close_inode(struct inode *inode)
> > void
> > nfsd_file_close_inode_sync(struct inode *inode)
> > {
> > - LIST_HEAD(dispose);
> > - unsigned int count;
> > -
> > - count = __nfsd_file_close_inode(inode, &dispose);
> > - trace_nfsd_file_close_inode_sync(inode, count);
> > - nfsd_file_dispose_list_sync(&dispose);
> > + if (nfsd_file_close_inode(inode))
> > + flush_delayed_fput();
> > }
> >
> > /**
> > * nfsd_file_delayed_close - close unused nfsd_files
> > * @work: dummy
> > *
> > - * Walk the LRU list and close any entries that have not been used since
> > + * Walk the LRU list and destroy any entries that have not been used since
> > * the last scan.
> > */
> > static void
> > @@ -1054,8 +1044,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > rcu_read_lock();
> > nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
> > nfsd_file_rhash_params);
> > - if (nf)
> > - nf = nfsd_file_get(nf);
> > + if (nf) {
> > + if (!nfsd_file_lru_remove(nf))
> > + nf = nfsd_file_get(nf);
> > + }
> > rcu_read_unlock();
> > if (nf)
> > goto wait_for_construction;
> > @@ -1090,11 +1082,11 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > goto out;
> > }
> > open_retry = false;
> > - nfsd_file_put_noref(nf);
> > + if (refcount_dec_and_test(&nf->nf_ref))
> > + nfsd_file_free(nf);
> > goto retry;
> > }
> >
> > - nfsd_file_lru_remove(nf);
> > this_cpu_inc(nfsd_file_cache_hits);
> >
> > status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
> > @@ -1104,7 +1096,8 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > this_cpu_inc(nfsd_file_acquisitions);
> > *pnf = nf;
> > } else {
> > - nfsd_file_put(nf);
> > + if (refcount_dec_and_test(&nf->nf_ref))
> > + nfsd_file_free(nf);
> > nf = NULL;
> > }
> >
> > @@ -1131,7 +1124,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > * then unhash.
> > */
> > if (status != nfs_ok || key.inode->i_nlink == 0)
> > - nfsd_file_unhash_and_put(nf);
> > + nfsd_file_unhash(nf);
> > clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
> > smp_mb__after_atomic();
> > wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index 940252482fd4..a44ded06af87 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -906,6 +906,7 @@ DEFINE_EVENT(nfsd_file_class, name, \
> > DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
> > DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
> > DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
> > +DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
> > DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
> >
> > TRACE_EVENT(nfsd_file_alloc,
> > --
> > 2.38.1
> >

Thanks again for the review!
--
Jeff Layton <[email protected]>

2022-11-01 11:14:04

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH v4 5/5] nfsd: start non-blocking writeback after adding nfsd_file to the LRU

On Mon, 2022-10-31 at 21:00 +0000, Chuck Lever III wrote:
>
> > On Oct 31, 2022, at 7:37 AM, Jeff Layton <[email protected]> wrote:
> >
> > When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> > so that we can be ready to close it when the time comes. This should
> > help minimize delays when freeing objects reaped from the LRU.
>
> Tested against a btrfs export.
>
> So, the current code does indeed do a synchronous fsync when
> garbage collecting files (via nfsd_file_delayed_close()).
> That indicates that it's at least safe to do, and 3/5 isn't
> changing the safety of the filecache by moving the vfs_fsync()
> call into nfsd_file_free(). These calls take between 10 and
> 20 milliseconds each.
>
> But I see the filemap_flush() call added here taking dozens of
> milliseconds on my system for large files. This is done before
> the WRITE reply is sent to the client, so it adds significant
> latency to large UNSTABLE WRITEs. In the current code, the
> vfs_fsync() in nfsd_file_put() does not seem to fire except in
> very rare circumstances, so it doesn't seem to have much, if any,
> impact.
>
> My scalability concerns therefore are with code that pre-dates
> this patch series. I can deal with that later in a separate
> series. Do we need to keep this one?
>

In the interest of getting the fixes in this series merged, let's just
drop this one for now. We can debate how to best handle this in a
follow-on series.

>
> > Signed-off-by: Jeff Layton <[email protected]>
> > ---
> > fs/nfsd/filecache.c | 23 +++++++++++++++++++++--
> > 1 file changed, 21 insertions(+), 2 deletions(-)
> >
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index 47cdc6129a7b..c43b6cff03e2 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -325,6 +325,20 @@ nfsd_file_fsync(struct nfsd_file *nf)
> > nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > }
> >
> > +static void
> > +nfsd_file_flush(struct nfsd_file *nf)
> > +{
> > + struct file *file = nf->nf_file;
> > + struct address_space *mapping;
> > +
> > + if (!file || !(file->f_mode & FMODE_WRITE))
> > + return;
> > +
> > + mapping = file->f_mapping;
> > + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
> > + filemap_flush(mapping);
> > +}
> > +
> > static int
> > nfsd_file_check_write_error(struct nfsd_file *nf)
> > {
> > @@ -484,9 +498,14 @@ nfsd_file_put(struct nfsd_file *nf)
> >
> > /* Try to add it to the LRU. If that fails, decrement. */
> > if (nfsd_file_lru_add(nf)) {
> > - /* If it's still hashed, we're done */
> > - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
> > + /*
> > + * If it's still hashed, we can just return now,
> > + * after kicking off SYNC_NONE writeback.
> > + */
> > + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > + nfsd_file_flush(nf);
> > return;
> > + }
> >
> > /*
> > * We're racing with unhashing, so try to remove it from
> > --
> > 2.38.1
> >
>
> --
> Chuck Lever
>
>
>

--
Jeff Layton <[email protected]>