The filecache refcounting is a bit non-standard for something searchable
by RCU, in that we maintain a sentinel reference while it's hashed.
This in turn requires that we have to do things differently in the "put"
depending on whether it's hashed, etc.
Another issue: nfsd_file_close_inode_sync can end up freeing an
nfsd_file while there are still outstanding references to it.
Rework the code so that the refcount is what drives the lifecycle. When
the refcount goes to zero, then unhash and rcu free the object.
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/filecache.c | 202 ++++++++++++++++++++------------------------
1 file changed, 92 insertions(+), 110 deletions(-)
This passes some basic smoke testing and I think closes a number of
races in this code. I also think the result is a bit simpler and easier
to follow now.
I looked for some ways to break this up into multiple patches, but I
didn't find any. This changes the underlying rules of how the
refcounting works, and I didn't see a way to split that up and still
have it remain bisectable.
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 918d67cec1ad..6c2f4f2c56a6 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -1,7 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * Open file cache.
- *
- * (c) 2015 - Jeff Layton <[email protected]>
+ * The NFSD open file cache.
*/
#include <linux/hash.h>
@@ -303,8 +302,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
if (key->gc)
__set_bit(NFSD_FILE_GC, &nf->nf_flags);
nf->nf_inode = key->inode;
- /* nf_ref is pre-incremented for hash table */
- refcount_set(&nf->nf_ref, 2);
+ refcount_set(&nf->nf_ref, 1);
nf->nf_may = key->need;
nf->nf_mark = NULL;
}
@@ -376,11 +374,15 @@ nfsd_file_flush(struct nfsd_file *nf)
nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
}
-static void nfsd_file_lru_add(struct nfsd_file *nf)
+static bool nfsd_file_lru_add(struct nfsd_file *nf)
{
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
+ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
+ list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
trace_nfsd_file_lru_add(nf);
+ return true;
+ }
+ return false;
}
static void nfsd_file_lru_remove(struct nfsd_file *nf)
@@ -410,7 +412,7 @@ nfsd_file_unhash(struct nfsd_file *nf)
return false;
}
-static void
+static bool
nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
{
trace_nfsd_file_unhash_and_dispose(nf);
@@ -418,46 +420,48 @@ nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
/* caller must call nfsd_file_dispose_list() later */
nfsd_file_lru_remove(nf);
list_add(&nf->nf_lru, dispose);
+ return true;
}
+ return false;
}
-static void
-nfsd_file_put_noref(struct nfsd_file *nf)
+static bool
+__nfsd_file_put(struct nfsd_file *nf)
{
- trace_nfsd_file_put(nf);
-
+ /* v4 case: don't wait for GC */
if (refcount_dec_and_test(&nf->nf_ref)) {
- WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
+ nfsd_file_unhash(nf);
nfsd_file_lru_remove(nf);
nfsd_file_free(nf);
+ return true;
}
+ return false;
}
-static void
-nfsd_file_unhash_and_put(struct nfsd_file *nf)
-{
- if (nfsd_file_unhash(nf))
- nfsd_file_put_noref(nf);
-}
-
+/**
+ * nfsd_file_put - put the reference to a nfsd_file
+ * @nf: nfsd_file of which to put the reference
+ *
+ * Put a reference to a nfsd_file. In the v4 case, we just put the
+ * reference immediately. In the v2/3 case, if the reference would be
+ * the last one, then put it on the LRU instead to be cleaned up later.
+ */
void
nfsd_file_put(struct nfsd_file *nf)
{
- might_sleep();
-
- if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1)
- nfsd_file_lru_add(nf);
- else if (refcount_read(&nf->nf_ref) == 2)
- nfsd_file_unhash_and_put(nf);
+ trace_nfsd_file_put(nf);
- if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
- nfsd_file_flush(nf);
- nfsd_file_put_noref(nf);
- } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1) {
- nfsd_file_put_noref(nf);
- nfsd_file_schedule_laundrette();
- } else
- nfsd_file_put_noref(nf);
+ /* NFSv2/3 case */
+ if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
+ /*
+ * If this is the last reference (nf_ref == 1), then transfer
+ * it to the LRU. If the add to the LRU fails, just put it as
+ * usual.
+ */
+ if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
+ return;
+ }
+ __nfsd_file_put(nf);
}
struct nfsd_file *
@@ -477,27 +481,8 @@ nfsd_file_dispose_list(struct list_head *dispose)
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
list_del_init(&nf->nf_lru);
nfsd_file_flush(nf);
- nfsd_file_put_noref(nf);
- }
-}
-
-static void
-nfsd_file_dispose_list_sync(struct list_head *dispose)
-{
- bool flush = false;
- struct nfsd_file *nf;
-
- while(!list_empty(dispose)) {
- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del_init(&nf->nf_lru);
- nfsd_file_flush(nf);
- if (!refcount_dec_and_test(&nf->nf_ref))
- continue;
- if (nfsd_file_free(nf))
- flush = true;
+ nfsd_file_free(nf);
}
- if (flush)
- flush_delayed_fput();
}
static void
@@ -567,21 +552,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
struct list_head *head = arg;
struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
- /*
- * Do a lockless refcount check. The hashtable holds one reference, so
- * we look to see if anything else has a reference, or if any have
- * been put since the shrinker last ran. Those don't get unhashed and
- * released.
- *
- * Note that in the put path, we set the flag and then decrement the
- * counter. Here we check the counter and then test and clear the flag.
- * That order is deliberate to ensure that we can do this locklessly.
- */
- if (refcount_read(&nf->nf_ref) > 1) {
- list_lru_isolate(lru, &nf->nf_lru);
- trace_nfsd_file_gc_in_use(nf);
- return LRU_REMOVED;
- }
+ /* We should only be dealing with v2/3 entries here */
+ WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
/*
* Don't throw out files that are still undergoing I/O or
@@ -592,40 +564,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
return LRU_SKIP;
}
+ /* If it was recently referenced, then skip it */
if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
trace_nfsd_file_gc_referenced(nf);
return LRU_ROTATE;
}
- if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- trace_nfsd_file_gc_hashed(nf);
- return LRU_SKIP;
+ /*
+ * Put the LRU reference. If it wasn't the last one, then something
+ * took a reference to it recently (or REFERENCED would have
+ * been set). Just remove it from the LRU and ignore it.
+ */
+ if (!refcount_dec_and_test(&nf->nf_ref)) {
+ trace_nfsd_file_gc_in_use(nf);
+ list_lru_isolate(lru, &nf->nf_lru);
+ return LRU_REMOVED;
}
+ /* Refcount went to zero. Queue it to the dispose list */
list_lru_isolate_move(lru, &nf->nf_lru, head);
this_cpu_inc(nfsd_file_evictions);
trace_nfsd_file_gc_disposed(nf);
return LRU_REMOVED;
}
-/*
- * Unhash items on @dispose immediately, then queue them on the
- * disposal workqueue to finish releasing them in the background.
- *
- * cel: Note that between the time list_lru_shrink_walk runs and
- * now, these items are in the hash table but marked unhashed.
- * Why release these outside of lru_cb ? There's no lock ordering
- * problem since lru_cb currently takes no lock.
- */
-static void nfsd_file_gc_dispose_list(struct list_head *dispose)
-{
- struct nfsd_file *nf;
-
- list_for_each_entry(nf, dispose, nf_lru)
- nfsd_file_hash_remove(nf);
- nfsd_file_dispose_list_delayed(dispose);
-}
-
static void
nfsd_file_gc(void)
{
@@ -635,7 +597,7 @@ nfsd_file_gc(void)
ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
&dispose, list_lru_count(&nfsd_file_lru));
trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
- nfsd_file_gc_dispose_list(&dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
}
static void
@@ -660,7 +622,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
nfsd_file_lru_cb, &dispose);
trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
- nfsd_file_gc_dispose_list(&dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
return ret;
}
@@ -671,8 +633,11 @@ static struct shrinker nfsd_file_shrinker = {
};
/*
- * Find all cache items across all net namespaces that match @inode and
- * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
+ * Find all cache items across all net namespaces that match @inode, unhash
+ * them, take references and then put them on @dispose if that was successful.
+ *
+ * The nfsd_file objects on the list will be unhashed but holding a reference
+ * to them. The caller must ensure that the references clean things up.
*/
static unsigned int
__nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
@@ -690,45 +655,62 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
nfsd_file_rhash_params);
if (!nf)
break;
- nfsd_file_unhash_and_dispose(nf, dispose);
- count++;
+
+ /* Ignore it if it's already unhashed */
+ if (!nfsd_file_unhash_and_dispose(nf, dispose))
+ continue;
+
+ /* Ignore it if we can't get a reference */
+ if (nfsd_file_get(nf))
+ count++;
+ else
+ list_del_init(&nf->nf_lru);
} while (1);
rcu_read_unlock();
return count;
}
/**
- * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Unhash and put, then flush and fput all cache items associated with @inode.
+ * Unhash and put all cache items associated with @inode.
*/
-void
-nfsd_file_close_inode_sync(struct inode *inode)
+static void
+nfsd_file_close_inode(struct inode *inode)
{
LIST_HEAD(dispose);
unsigned int count;
count = __nfsd_file_close_inode(inode, &dispose);
- trace_nfsd_file_close_inode_sync(inode, count);
- nfsd_file_dispose_list_sync(&dispose);
+ trace_nfsd_file_close_inode(inode, count);
+ nfsd_file_dispose_list_delayed(&dispose);
}
/**
- * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
+ * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Unhash and put all cache item associated with @inode.
+ * Unhash and put, then flush and fput all cache items associated with @inode.
*/
-static void
-nfsd_file_close_inode(struct inode *inode)
+void
+nfsd_file_close_inode_sync(struct inode *inode)
{
+ struct nfsd_file *nf;
LIST_HEAD(dispose);
unsigned int count;
count = __nfsd_file_close_inode(inode, &dispose);
- trace_nfsd_file_close_inode(inode, count);
- nfsd_file_dispose_list_delayed(&dispose);
+ trace_nfsd_file_close_inode_sync(inode, count);
+ if (!count)
+ return;
+ while(!list_empty(&dispose)) {
+ nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
+ list_del_init(&nf->nf_lru);
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
+ }
+ flush_delayed_fput();
}
/**
@@ -1094,7 +1076,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
}
open_retry = false;
- nfsd_file_put_noref(nf);
+ nfsd_file_put(nf);
goto retry;
}
@@ -1135,7 +1117,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
* then unhash.
*/
if (status != nfs_ok || key.inode->i_nlink == 0)
- nfsd_file_unhash_and_put(nf);
+ nfsd_file_put(nf);
clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
smp_mb__after_atomic();
wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
--
2.37.3
On Wed, 2022-10-26 at 04:15 -0400, Jeff Layton wrote:
> The filecache refcounting is a bit non-standard for something searchable
> by RCU, in that we maintain a sentinel reference while it's hashed.
> This in turn requires that we have to do things differently in the "put"
> depending on whether its hashed, etc.
>
> Another issue: nfsd_file_close_inode_sync can end up freeing an
> nfsd_file while there are still outstanding references to it.
>
> Rework the code so that the refcount is what drives the lifecycle. When
> the refcount goes to zero, then unhash and rcu free the object.
>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/nfsd/filecache.c | 202 ++++++++++++++++++++------------------------
> 1 file changed, 92 insertions(+), 110 deletions(-)
>
> This passes some basic smoke testing and I think closes a number of
> races in this code. I also think the result is a bit simpler and easier
> to follow now.
>
> I looked for some ways to break this up into multiple patches, but I
> didn't find any. This changes the underlying rules of how the
> refcounting works, and I didn't see a way to split that up and still
> have it remain bisectable.
>
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index 918d67cec1ad..6c2f4f2c56a6 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -1,7 +1,6 @@
> +// SPDX-License-Identifier: GPL-2.0
> /*
> - * Open file cache.
> - *
> - * (c) 2015 - Jeff Layton <[email protected]>
> + * The NFSD open file cache.
> */
>
> #include <linux/hash.h>
> @@ -303,8 +302,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> if (key->gc)
> __set_bit(NFSD_FILE_GC, &nf->nf_flags);
> nf->nf_inode = key->inode;
> - /* nf_ref is pre-incremented for hash table */
> - refcount_set(&nf->nf_ref, 2);
> + refcount_set(&nf->nf_ref, 1);
> nf->nf_may = key->need;
> nf->nf_mark = NULL;
> }
> @@ -376,11 +374,15 @@ nfsd_file_flush(struct nfsd_file *nf)
> nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> }
>
> -static void nfsd_file_lru_add(struct nfsd_file *nf)
> +static bool nfsd_file_lru_add(struct nfsd_file *nf)
> {
> set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
> + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
The above condition is inverted, I think. We only want to add the object to
the LRU if it _is_ hashed, so the test_bit() check should not be negated.
> + list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> trace_nfsd_file_lru_add(nf);
> + return true;
> + }
> + return false;
> }
>
> static void nfsd_file_lru_remove(struct nfsd_file *nf)
> @@ -410,7 +412,7 @@ nfsd_file_unhash(struct nfsd_file *nf)
> return false;
> }
>
> -static void
> +static bool
> nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> {
> trace_nfsd_file_unhash_and_dispose(nf);
> @@ -418,46 +420,48 @@ nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> /* caller must call nfsd_file_dispose_list() later */
> nfsd_file_lru_remove(nf);
> list_add(&nf->nf_lru, dispose);
> + return true;
> }
> + return false;
> }
>
> -static void
> -nfsd_file_put_noref(struct nfsd_file *nf)
> +static bool
> +__nfsd_file_put(struct nfsd_file *nf)
> {
> - trace_nfsd_file_put(nf);
> -
> + /* v4 case: don't wait for GC */
> if (refcount_dec_and_test(&nf->nf_ref)) {
> - WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
> + nfsd_file_unhash(nf);
> nfsd_file_lru_remove(nf);
> nfsd_file_free(nf);
> + return true;
> }
> + return false;
> }
>
> -static void
> -nfsd_file_unhash_and_put(struct nfsd_file *nf)
> -{
> - if (nfsd_file_unhash(nf))
> - nfsd_file_put_noref(nf);
> -}
> -
> +/**
> + * nfsd_file_put - put the reference to a nfsd_file
> + * @nf: nfsd_file of which to put the reference
> + *
> + * Put a reference to a nfsd_file. In the v4 case, we just put the
> + * reference immediately. In the v2/3 case, if the reference would be
> + * the last one, the put it on the LRU instead to be cleaned up later.
> + */
> void
> nfsd_file_put(struct nfsd_file *nf)
> {
> - might_sleep();
> -
> - if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1)
> - nfsd_file_lru_add(nf);
> - else if (refcount_read(&nf->nf_ref) == 2)
> - nfsd_file_unhash_and_put(nf);
> + trace_nfsd_file_put(nf);
>
> - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
> - nfsd_file_flush(nf);
> - nfsd_file_put_noref(nf);
> - } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1) {
> - nfsd_file_put_noref(nf);
> - nfsd_file_schedule_laundrette();
> - } else
> - nfsd_file_put_noref(nf);
> + /* NFSv2/3 case */
> + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> + /*
> + * If this is the last reference (nf_ref == 1), then transfer
> + * it to the LRU. If the add to the LRU fails, just put it as
> + * usual.
> + */
> + if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> + return;
> + }
> + __nfsd_file_put(nf);
> }
>
> struct nfsd_file *
> @@ -477,27 +481,8 @@ nfsd_file_dispose_list(struct list_head *dispose)
> nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> list_del_init(&nf->nf_lru);
> nfsd_file_flush(nf);
> - nfsd_file_put_noref(nf);
> - }
> -}
> -
> -static void
> -nfsd_file_dispose_list_sync(struct list_head *dispose)
> -{
> - bool flush = false;
> - struct nfsd_file *nf;
> -
> - while(!list_empty(dispose)) {
> - nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> - list_del_init(&nf->nf_lru);
> - nfsd_file_flush(nf);
> - if (!refcount_dec_and_test(&nf->nf_ref))
> - continue;
> - if (nfsd_file_free(nf))
> - flush = true;
> + nfsd_file_free(nf);
> }
> - if (flush)
> - flush_delayed_fput();
> }
>
> static void
> @@ -567,21 +552,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> struct list_head *head = arg;
> struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
>
> - /*
> - * Do a lockless refcount check. The hashtable holds one reference, so
> - * we look to see if anything else has a reference, or if any have
> - * been put since the shrinker last ran. Those don't get unhashed and
> - * released.
> - *
> - * Note that in the put path, we set the flag and then decrement the
> - * counter. Here we check the counter and then test and clear the flag.
> - * That order is deliberate to ensure that we can do this locklessly.
> - */
> - if (refcount_read(&nf->nf_ref) > 1) {
> - list_lru_isolate(lru, &nf->nf_lru);
> - trace_nfsd_file_gc_in_use(nf);
> - return LRU_REMOVED;
> - }
> + /* We should only be dealing with v2/3 entries here */
> + WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
>
> /*
> * Don't throw out files that are still undergoing I/O or
> @@ -592,40 +564,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> return LRU_SKIP;
> }
>
> + /* If it was recently referenced, then skip it */
> if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
> trace_nfsd_file_gc_referenced(nf);
> return LRU_ROTATE;
> }
>
> - if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> - trace_nfsd_file_gc_hashed(nf);
> - return LRU_SKIP;
> + /*
> + * Put the LRU reference. If it wasn't the last one, then something
> + * took a reference to it recently (or REFERENCED would have
> + * been set). Just remove it from the LRU and ignore it.
> + */
> + if (!refcount_dec_and_test(&nf->nf_ref)) {
> + trace_nfsd_file_gc_in_use(nf);
> + list_lru_isolate(lru, &nf->nf_lru);
> + return LRU_REMOVED;
> }
>
> + /* Refcount went to zero. Queue it to the dispose list */
> list_lru_isolate_move(lru, &nf->nf_lru, head);
> this_cpu_inc(nfsd_file_evictions);
> trace_nfsd_file_gc_disposed(nf);
> return LRU_REMOVED;
> }
>
> -/*
> - * Unhash items on @dispose immediately, then queue them on the
> - * disposal workqueue to finish releasing them in the background.
> - *
> - * cel: Note that between the time list_lru_shrink_walk runs and
> - * now, these items are in the hash table but marked unhashed.
> - * Why release these outside of lru_cb ? There's no lock ordering
> - * problem since lru_cb currently takes no lock.
> - */
> -static void nfsd_file_gc_dispose_list(struct list_head *dispose)
> -{
> - struct nfsd_file *nf;
> -
> - list_for_each_entry(nf, dispose, nf_lru)
> - nfsd_file_hash_remove(nf);
> - nfsd_file_dispose_list_delayed(dispose);
> -}
> -
> static void
> nfsd_file_gc(void)
> {
> @@ -635,7 +597,7 @@ nfsd_file_gc(void)
> ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
> &dispose, list_lru_count(&nfsd_file_lru));
> trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
> - nfsd_file_gc_dispose_list(&dispose);
> + nfsd_file_dispose_list_delayed(&dispose);
> }
>
> static void
> @@ -660,7 +622,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
> ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
> nfsd_file_lru_cb, &dispose);
> trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
> - nfsd_file_gc_dispose_list(&dispose);
> + nfsd_file_dispose_list_delayed(&dispose);
> return ret;
> }
>
> @@ -671,8 +633,11 @@ static struct shrinker nfsd_file_shrinker = {
> };
>
> /*
> - * Find all cache items across all net namespaces that match @inode and
> - * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
> + * Find all cache items across all net namespaces that match @inode, unhash
> + * them, take references and then put them on @dispose if that was successful.
> + *
> + * The nfsd_file objects on the list will be unhashed but holding a reference
> + * to them. The caller must ensure that the references clean things up.
> */
> static unsigned int
> __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> @@ -690,45 +655,62 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> nfsd_file_rhash_params);
> if (!nf)
> break;
> - nfsd_file_unhash_and_dispose(nf, dispose);
> - count++;
> +
> + /* Ignore it if it's already unhashed */
> + if (!nfsd_file_unhash_and_dispose(nf, dispose))
> + continue;
> +
> + /* Ignore it if we can't get a reference */
> + if (nfsd_file_get(nf))
> + count++;
> + else
> + list_del_init(&nf->nf_lru);
> } while (1);
> rcu_read_unlock();
> return count;
> }
>
> /**
> - * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> * @inode: inode of the file to attempt to remove
> *
> - * Unhash and put, then flush and fput all cache items associated with @inode.
> + * Unhash and put all cache item associated with @inode.
> */
> -void
> -nfsd_file_close_inode_sync(struct inode *inode)
> +static void
> +nfsd_file_close_inode(struct inode *inode)
> {
> LIST_HEAD(dispose);
> unsigned int count;
>
> count = __nfsd_file_close_inode(inode, &dispose);
> - trace_nfsd_file_close_inode_sync(inode, count);
> - nfsd_file_dispose_list_sync(&dispose);
> + trace_nfsd_file_close_inode(inode, count);
> + nfsd_file_dispose_list_delayed(&dispose);
> }
>
> /**
> - * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> * @inode: inode of the file to attempt to remove
> *
> - * Unhash and put all cache item associated with @inode.
> + * Unhash and put, then flush and fput all cache items associated with @inode.
> */
> -static void
> -nfsd_file_close_inode(struct inode *inode)
> +void
> +nfsd_file_close_inode_sync(struct inode *inode)
> {
> + struct nfsd_file *nf;
> LIST_HEAD(dispose);
> unsigned int count;
>
> count = __nfsd_file_close_inode(inode, &dispose);
> - trace_nfsd_file_close_inode(inode, count);
> - nfsd_file_dispose_list_delayed(&dispose);
> + trace_nfsd_file_close_inode_sync(inode, count);
> + if (!count)
> + return;
> + while(!list_empty(&dispose)) {
> + nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
> + list_del_init(&nf->nf_lru);
> + if (refcount_dec_and_test(&nf->nf_ref))
> + nfsd_file_free(nf);
> + }
> + flush_delayed_fput();
> }
>
> /**
> @@ -1094,7 +1076,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> goto out;
> }
> open_retry = false;
> - nfsd_file_put_noref(nf);
> + nfsd_file_put(nf);
> goto retry;
> }
>
> @@ -1135,7 +1117,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> * then unhash.
> */
> if (status != nfs_ok || key.inode->i_nlink == 0)
> - nfsd_file_unhash_and_put(nf);
> + nfsd_file_put(nf);
> clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
> smp_mb__after_atomic();
> wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
--
Jeff Layton <[email protected]>
> On Oct 26, 2022, at 4:15 AM, Jeff Layton <[email protected]> wrote:
>
> The filecache refcounting is a bit non-standard for something searchable
> by RCU, in that we maintain a sentinel reference while it's hashed.
> This in turn requires that we have to do things differently in the "put"
> depending on whether its hashed, etc.
>
> Another issue: nfsd_file_close_inode_sync can end up freeing an
> nfsd_file while there are still outstanding references to it.
>
> Rework the code so that the refcount is what drives the lifecycle. When
> the refcount goes to zero, then unhash and rcu free the object.
>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/nfsd/filecache.c | 202 ++++++++++++++++++++------------------------
> 1 file changed, 92 insertions(+), 110 deletions(-)
>
> This passes some basic smoke testing and I think closes a number of
> races in this code. I also think the result is a bit simpler and easier
> to follow now.
>
> I looked for some ways to break this up into multiple patches, but I
> didn't find any. This changes the underlying rules of how the
> refcounting works, and I didn't see a way to split that up and still
> have it remain bisectable.
>
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index 918d67cec1ad..6c2f4f2c56a6 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -1,7 +1,6 @@
> +// SPDX-License-Identifier: GPL-2.0
> /*
> - * Open file cache.
> - *
> - * (c) 2015 - Jeff Layton <[email protected]>
> + * The NFSD open file cache.
> */
>
> #include <linux/hash.h>
Let's put the license change in a separate commit. The description
might want to point out that you are the same Jeff Layton as the
one who was at PD, but that's up to you.
Otherwise, I agree: one patch for the rest is probably best.
Browsing this, it looks like a good direction.
> @@ -303,8 +302,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> if (key->gc)
> __set_bit(NFSD_FILE_GC, &nf->nf_flags);
> nf->nf_inode = key->inode;
> - /* nf_ref is pre-incremented for hash table */
> - refcount_set(&nf->nf_ref, 2);
> + refcount_set(&nf->nf_ref, 1);
> nf->nf_may = key->need;
> nf->nf_mark = NULL;
> }
> @@ -376,11 +374,15 @@ nfsd_file_flush(struct nfsd_file *nf)
> nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> }
>
> -static void nfsd_file_lru_add(struct nfsd_file *nf)
> +static bool nfsd_file_lru_add(struct nfsd_file *nf)
> {
> set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
> + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
> + list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> trace_nfsd_file_lru_add(nf);
> + return true;
> + }
> + return false;
> }
>
> static void nfsd_file_lru_remove(struct nfsd_file *nf)
> @@ -410,7 +412,7 @@ nfsd_file_unhash(struct nfsd_file *nf)
> return false;
> }
>
> -static void
> +static bool
> nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> {
> trace_nfsd_file_unhash_and_dispose(nf);
> @@ -418,46 +420,48 @@ nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> /* caller must call nfsd_file_dispose_list() later */
> nfsd_file_lru_remove(nf);
> list_add(&nf->nf_lru, dispose);
> + return true;
> }
> + return false;
> }
>
> -static void
> -nfsd_file_put_noref(struct nfsd_file *nf)
> +static bool
> +__nfsd_file_put(struct nfsd_file *nf)
> {
> - trace_nfsd_file_put(nf);
> -
> + /* v4 case: don't wait for GC */
> if (refcount_dec_and_test(&nf->nf_ref)) {
> - WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
> + nfsd_file_unhash(nf);
> nfsd_file_lru_remove(nf);
> nfsd_file_free(nf);
> + return true;
> }
> + return false;
> }
>
> -static void
> -nfsd_file_unhash_and_put(struct nfsd_file *nf)
> -{
> - if (nfsd_file_unhash(nf))
> - nfsd_file_put_noref(nf);
> -}
> -
> +/**
> + * nfsd_file_put - put the reference to a nfsd_file
> + * @nf: nfsd_file of which to put the reference
> + *
> + * Put a reference to a nfsd_file. In the v4 case, we just put the
> + * reference immediately. In the v2/3 case, if the reference would be
> + * the last one, the put it on the LRU instead to be cleaned up later.
> + */
> void
> nfsd_file_put(struct nfsd_file *nf)
> {
> - might_sleep();
> -
> - if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1)
> - nfsd_file_lru_add(nf);
> - else if (refcount_read(&nf->nf_ref) == 2)
> - nfsd_file_unhash_and_put(nf);
> + trace_nfsd_file_put(nf);
>
> - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
> - nfsd_file_flush(nf);
> - nfsd_file_put_noref(nf);
> - } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1) {
> - nfsd_file_put_noref(nf);
> - nfsd_file_schedule_laundrette();
> - } else
> - nfsd_file_put_noref(nf);
> + /* NFSv2/3 case */
> + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> + /*
> + * If this is the last reference (nf_ref == 1), then transfer
> + * it to the LRU. If the add to the LRU fails, just put it as
> + * usual.
> + */
> + if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> + return;
> + }
> + __nfsd_file_put(nf);
> }
>
> struct nfsd_file *
> @@ -477,27 +481,8 @@ nfsd_file_dispose_list(struct list_head *dispose)
> nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> list_del_init(&nf->nf_lru);
> nfsd_file_flush(nf);
> - nfsd_file_put_noref(nf);
> - }
> -}
> -
> -static void
> -nfsd_file_dispose_list_sync(struct list_head *dispose)
> -{
> - bool flush = false;
> - struct nfsd_file *nf;
> -
> - while(!list_empty(dispose)) {
> - nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> - list_del_init(&nf->nf_lru);
> - nfsd_file_flush(nf);
> - if (!refcount_dec_and_test(&nf->nf_ref))
> - continue;
> - if (nfsd_file_free(nf))
> - flush = true;
> + nfsd_file_free(nf);
> }
> - if (flush)
> - flush_delayed_fput();
> }
>
> static void
> @@ -567,21 +552,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> struct list_head *head = arg;
> struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
>
> - /*
> - * Do a lockless refcount check. The hashtable holds one reference, so
> - * we look to see if anything else has a reference, or if any have
> - * been put since the shrinker last ran. Those don't get unhashed and
> - * released.
> - *
> - * Note that in the put path, we set the flag and then decrement the
> - * counter. Here we check the counter and then test and clear the flag.
> - * That order is deliberate to ensure that we can do this locklessly.
> - */
> - if (refcount_read(&nf->nf_ref) > 1) {
> - list_lru_isolate(lru, &nf->nf_lru);
> - trace_nfsd_file_gc_in_use(nf);
> - return LRU_REMOVED;
> - }
> + /* We should only be dealing with v2/3 entries here */
> + WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
>
> /*
> * Don't throw out files that are still undergoing I/O or
> @@ -592,40 +564,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> return LRU_SKIP;
> }
>
> + /* If it was recently referenced, then skip it */
> if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
> trace_nfsd_file_gc_referenced(nf);
> return LRU_ROTATE;
> }
>
> - if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> - trace_nfsd_file_gc_hashed(nf);
> - return LRU_SKIP;
> + /*
> + * Put the LRU reference. If it wasn't the last one, then something
> + * took a reference to it recently (or REFERENCED would have
> + * been set). Just remove it from the LRU and ignore it.
> + */
> + if (!refcount_dec_and_test(&nf->nf_ref)) {
> + trace_nfsd_file_gc_in_use(nf);
> + list_lru_isolate(lru, &nf->nf_lru);
> + return LRU_REMOVED;
> }
>
> + /* Refcount went to zero. Queue it to the dispose list */
> list_lru_isolate_move(lru, &nf->nf_lru, head);
> this_cpu_inc(nfsd_file_evictions);
> trace_nfsd_file_gc_disposed(nf);
> return LRU_REMOVED;
> }
>
> -/*
> - * Unhash items on @dispose immediately, then queue them on the
> - * disposal workqueue to finish releasing them in the background.
> - *
> - * cel: Note that between the time list_lru_shrink_walk runs and
> - * now, these items are in the hash table but marked unhashed.
> - * Why release these outside of lru_cb ? There's no lock ordering
> - * problem since lru_cb currently takes no lock.
> - */
> -static void nfsd_file_gc_dispose_list(struct list_head *dispose)
> -{
> - struct nfsd_file *nf;
> -
> - list_for_each_entry(nf, dispose, nf_lru)
> - nfsd_file_hash_remove(nf);
> - nfsd_file_dispose_list_delayed(dispose);
> -}
> -
> static void
> nfsd_file_gc(void)
> {
> @@ -635,7 +597,7 @@ nfsd_file_gc(void)
> ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
> &dispose, list_lru_count(&nfsd_file_lru));
> trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
> - nfsd_file_gc_dispose_list(&dispose);
> + nfsd_file_dispose_list_delayed(&dispose);
> }
>
> static void
> @@ -660,7 +622,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
> ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
> nfsd_file_lru_cb, &dispose);
> trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
> - nfsd_file_gc_dispose_list(&dispose);
> + nfsd_file_dispose_list_delayed(&dispose);
> return ret;
> }
>
> @@ -671,8 +633,11 @@ static struct shrinker nfsd_file_shrinker = {
> };
>
> /*
> - * Find all cache items across all net namespaces that match @inode and
> - * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
> + * Find all cache items across all net namespaces that match @inode, unhash
> + * them, take references and then put them on @dispose if that was successful.
> + *
> + * The nfsd_file objects on the list will be unhashed but holding a reference
> + * to them. The caller must ensure that the references clean things up.
> */
> static unsigned int
> __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> @@ -690,45 +655,62 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> nfsd_file_rhash_params);
> if (!nf)
> break;
> - nfsd_file_unhash_and_dispose(nf, dispose);
> - count++;
> +
> + /* Ignore it if it's already unhashed */
> + if (!nfsd_file_unhash_and_dispose(nf, dispose))
> + continue;
> +
> + /* Ignore it if we can't get a reference */
> + if (nfsd_file_get(nf))
> + count++;
> + else
> + list_del_init(&nf->nf_lru);
> } while (1);
> rcu_read_unlock();
> return count;
> }
>
> /**
> - * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> * @inode: inode of the file to attempt to remove
> *
> - * Unhash and put, then flush and fput all cache items associated with @inode.
> + * Unhash and put all cache item associated with @inode.
> */
> -void
> -nfsd_file_close_inode_sync(struct inode *inode)
> +static void
> +nfsd_file_close_inode(struct inode *inode)
> {
> LIST_HEAD(dispose);
> unsigned int count;
>
> count = __nfsd_file_close_inode(inode, &dispose);
> - trace_nfsd_file_close_inode_sync(inode, count);
> - nfsd_file_dispose_list_sync(&dispose);
> + trace_nfsd_file_close_inode(inode, count);
> + nfsd_file_dispose_list_delayed(&dispose);
> }
>
> /**
> - * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
> + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
> * @inode: inode of the file to attempt to remove
> *
> - * Unhash and put all cache item associated with @inode.
> + * Unhash and put, then flush and fput all cache items associated with @inode.
> */
> -static void
> -nfsd_file_close_inode(struct inode *inode)
> +void
> +nfsd_file_close_inode_sync(struct inode *inode)
> {
> + struct nfsd_file *nf;
> LIST_HEAD(dispose);
> unsigned int count;
>
> count = __nfsd_file_close_inode(inode, &dispose);
> - trace_nfsd_file_close_inode(inode, count);
> - nfsd_file_dispose_list_delayed(&dispose);
> + trace_nfsd_file_close_inode_sync(inode, count);
> + if (!count)
> + return;
> + while(!list_empty(&dispose)) {
> + nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
> + list_del_init(&nf->nf_lru);
> + if (refcount_dec_and_test(&nf->nf_ref))
> + nfsd_file_free(nf);
> + }
> + flush_delayed_fput();
> }
>
> /**
> @@ -1094,7 +1076,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> goto out;
> }
> open_retry = false;
> - nfsd_file_put_noref(nf);
> + nfsd_file_put(nf);
> goto retry;
> }
>
> @@ -1135,7 +1117,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> * then unhash.
> */
> if (status != nfs_ok || key.inode->i_nlink == 0)
> - nfsd_file_unhash_and_put(nf);
> + nfsd_file_put(nf);
> clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
> smp_mb__after_atomic();
> wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
> --
> 2.37.3
>
--
Chuck Lever
On Wed, 26 Oct 2022, Jeff Layton wrote:
> The filecache refcounting is a bit non-standard for something searchable
> by RCU, in that we maintain a sentinel reference while it's hashed.
> This in turn requires that we have to do things differently in the "put"
> depending on whether it's hashed, etc.
>
> Another issue: nfsd_file_close_inode_sync can end up freeing an
> nfsd_file while there are still outstanding references to it.
>
> Rework the code so that the refcount is what drives the lifecycle. When
> the refcount goes to zero, then unhash and rcu free the object.
I think the LRU reference needs to be counted. Otherwise the nfsd_file
will be freed (and removed from the LRU) as soon as no active request
is referencing it (in the NFSv3 case). This makes the LRU ineffective.
Looking more closely at the patch, it seems to sometimes treat the LRU
reference as counted, but sometimes not.
>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/nfsd/filecache.c | 202 ++++++++++++++++++++------------------------
> 1 file changed, 92 insertions(+), 110 deletions(-)
>
> This passes some basic smoke testing and I think closes a number of
> races in this code. I also think the result is a bit simpler and easier
> to follow now.
>
> I looked for some ways to break this up into multiple patches, but I
> didn't find any. This changes the underlying rules of how the
> refcounting works, and I didn't see a way to split that up and still
> have it remain bisectable.
>
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index 918d67cec1ad..6c2f4f2c56a6 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -1,7 +1,6 @@
> +// SPDX-License-Identifier: GPL-2.0
> /*
> - * Open file cache.
> - *
> - * (c) 2015 - Jeff Layton <[email protected]>
> + * The NFSD open file cache.
> */
>
> #include <linux/hash.h>
> @@ -303,8 +302,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> if (key->gc)
> __set_bit(NFSD_FILE_GC, &nf->nf_flags);
> nf->nf_inode = key->inode;
> - /* nf_ref is pre-incremented for hash table */
> - refcount_set(&nf->nf_ref, 2);
> + refcount_set(&nf->nf_ref, 1);
> nf->nf_may = key->need;
> nf->nf_mark = NULL;
> }
> @@ -376,11 +374,15 @@ nfsd_file_flush(struct nfsd_file *nf)
> nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> }
>
> -static void nfsd_file_lru_add(struct nfsd_file *nf)
> +static bool nfsd_file_lru_add(struct nfsd_file *nf)
> {
> set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
> + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
Not only is this test inverted (as you already noted) but it is racy.
The file could get unhashed immediately after the test. That possibly
gets handled somewhere. If not it should be. If so, this test isn't
really needed.
> + list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> trace_nfsd_file_lru_add(nf);
> + return true;
> + }
> + return false;
> }
>
> static void nfsd_file_lru_remove(struct nfsd_file *nf)
> @@ -410,7 +412,7 @@ nfsd_file_unhash(struct nfsd_file *nf)
> return false;
> }
>
> -static void
> +static bool
> nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> {
> trace_nfsd_file_unhash_and_dispose(nf);
> @@ -418,46 +420,48 @@ nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> /* caller must call nfsd_file_dispose_list() later */
> nfsd_file_lru_remove(nf);
> list_add(&nf->nf_lru, dispose);
> + return true;
> }
> + return false;
> }
>
> -static void
> -nfsd_file_put_noref(struct nfsd_file *nf)
> +static bool
> +__nfsd_file_put(struct nfsd_file *nf)
> {
> - trace_nfsd_file_put(nf);
> -
> + /* v4 case: don't wait for GC */
This comment suggests this code is only for the v4 case ....
> if (refcount_dec_and_test(&nf->nf_ref)) {
> - WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
> + nfsd_file_unhash(nf);
> nfsd_file_lru_remove(nf);
but v4 doesn't use the lru, so this line is pointless.
And if the LRU does hold a counted reference, nf cannot possibly be on
the LRU at this point.
In fact, this function can be called for the v3 case, so the comment is
just misleading .... or maybe the code is poorly structured.
> nfsd_file_free(nf);
> + return true;
> }
> + return false;
> }
>
> -static void
> -nfsd_file_unhash_and_put(struct nfsd_file *nf)
> -{
> - if (nfsd_file_unhash(nf))
> - nfsd_file_put_noref(nf);
> -}
> -
> +/**
> + * nfsd_file_put - put the reference to a nfsd_file
> + * @nf: nfsd_file of which to put the reference
> + *
> + * Put a reference to a nfsd_file. In the v4 case, we just put the
> + * reference immediately. In the v2/3 case, if the reference would be
> + * the last one, the put it on the LRU instead to be cleaned up later.
> + */
> void
> nfsd_file_put(struct nfsd_file *nf)
> {
> - might_sleep();
> -
> - if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1)
> - nfsd_file_lru_add(nf);
> - else if (refcount_read(&nf->nf_ref) == 2)
> - nfsd_file_unhash_and_put(nf);
> + trace_nfsd_file_put(nf);
>
> - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
> - nfsd_file_flush(nf);
> - nfsd_file_put_noref(nf);
> - } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1) {
> - nfsd_file_put_noref(nf);
> - nfsd_file_schedule_laundrette();
> - } else
> - nfsd_file_put_noref(nf);
> + /* NFSv2/3 case */
> + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> + /*
> + * If this is the last reference (nf_ref == 1), then transfer
> + * it to the LRU. If the add to the LRU fails, just put it as
> + * usual.
> + */
> + if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> + return;
This is one place that looks like you are refcounting entries on the LRU.
If this is the last reference and you can transfer it to the LRU, then
you don't need to drop the reference.
> + }
> + __nfsd_file_put(nf);
> }
>
NeilBrown
On Thu, 2022-10-27 at 09:32 +1100, NeilBrown wrote:
> On Wed, 26 Oct 2022, Jeff Layton wrote:
> > The filecache refcounting is a bit non-standard for something searchable
> > by RCU, in that we maintain a sentinel reference while it's hashed.
> > This in turn requires that we have to do things differently in the "put"
> > depending on whether it's hashed, etc.
> >
> > Another issue: nfsd_file_close_inode_sync can end up freeing an
> > nfsd_file while there are still outstanding references to it.
> >
> > Rework the code so that the refcount is what drives the lifecycle. When
> > the refcount goes to zero, then unhash and rcu free the object.
>
> I think the LRU reference needs to be counted. Otherwise the nfsd_file
> will be freed (and removed from the LRU) as soon as no active request
> is referencing it (in the NFSv3 case). This makes the LRU ineffective.
>
> Looking more closely at the patch, it seems to sometimes treat the LRU
> reference as counted, but sometimes not.
>
> >
> > Signed-off-by: Jeff Layton <[email protected]>
> > ---
> > fs/nfsd/filecache.c | 202 ++++++++++++++++++++------------------------
> > 1 file changed, 92 insertions(+), 110 deletions(-)
> >
> > This passes some basic smoke testing and I think closes a number of
> > races in this code. I also think the result is a bit simpler and easier
> > to follow now.
> >
> > I looked for some ways to break this up into multiple patches, but I
> > didn't find any. This changes the underlying rules of how the
> > refcounting works, and I didn't see a way to split that up and still
> > have it remain bisectable.
> >
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index 918d67cec1ad..6c2f4f2c56a6 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -1,7 +1,6 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > /*
> > - * Open file cache.
> > - *
> > - * (c) 2015 - Jeff Layton <[email protected]>
> > + * The NFSD open file cache.
> > */
> >
> > #include <linux/hash.h>
> > @@ -303,8 +302,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > if (key->gc)
> > __set_bit(NFSD_FILE_GC, &nf->nf_flags);
> > nf->nf_inode = key->inode;
> > - /* nf_ref is pre-incremented for hash table */
> > - refcount_set(&nf->nf_ref, 2);
> > + refcount_set(&nf->nf_ref, 1);
> > nf->nf_may = key->need;
> > nf->nf_mark = NULL;
> > }
> > @@ -376,11 +374,15 @@ nfsd_file_flush(struct nfsd_file *nf)
> > nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > }
> >
> > -static void nfsd_file_lru_add(struct nfsd_file *nf)
> > +static bool nfsd_file_lru_add(struct nfsd_file *nf)
> > {
> > set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> > - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
> > + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
>
>
Yes, I think this is just wrong. We do need to be able to put hashed
entries on the LRU.
>
> > + list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> > trace_nfsd_file_lru_add(nf);
> > + return true;
> > + }
> > + return false;
> > }
> >
> > static void nfsd_file_lru_remove(struct nfsd_file *nf)
> > @@ -410,7 +412,7 @@ nfsd_file_unhash(struct nfsd_file *nf)
> > return false;
> > }
> >
> > -static void
> > +static bool
> > nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> > {
> > trace_nfsd_file_unhash_and_dispose(nf);
> > @@ -418,46 +420,48 @@ nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> > /* caller must call nfsd_file_dispose_list() later */
> > nfsd_file_lru_remove(nf);
> > list_add(&nf->nf_lru, dispose);
> > + return true;
> > }
> > + return false;
> > }
> >
> > -static void
> > -nfsd_file_put_noref(struct nfsd_file *nf)
> > +static bool
> > +__nfsd_file_put(struct nfsd_file *nf)
> > {
> > - trace_nfsd_file_put(nf);
> > -
> > + /* v4 case: don't wait for GC */
>
> This comment suggests this code is only for the v4 case ....
>
> > if (refcount_dec_and_test(&nf->nf_ref)) {
> > - WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
> > + nfsd_file_unhash(nf);
> > nfsd_file_lru_remove(nf);
>
> but v4 doesn't use the lru, so this line is pointless.
> And if the LRU does hold a counted reference, nf cannot possibly be on
> the LRU at this point.
>
> In fact, this function can be called for the v3 case, so the comment is
> just misleading .... or maybe the code is poorly structured.
>
Ok. I'll clean up the comments. We have the "normal" refcount put
codepath, and the v2/3 one where we add it to the LRU if the reference
is the last one.
> > nfsd_file_free(nf);
> > + return true;
> > }
> > + return false;
> > }
> >
> > -static void
> > -nfsd_file_unhash_and_put(struct nfsd_file *nf)
> > -{
> > - if (nfsd_file_unhash(nf))
> > - nfsd_file_put_noref(nf);
> > -}
> > -
> > +/**
> > + * nfsd_file_put - put the reference to a nfsd_file
> > + * @nf: nfsd_file of which to put the reference
> > + *
> > + * Put a reference to a nfsd_file. In the v4 case, we just put the
> > + * reference immediately. In the v2/3 case, if the reference would be
> > + * the last one, the put it on the LRU instead to be cleaned up later.
> > + */
> > void
> > nfsd_file_put(struct nfsd_file *nf)
> > {
> > - might_sleep();
> > -
> > - if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1)
> > - nfsd_file_lru_add(nf);
> > - else if (refcount_read(&nf->nf_ref) == 2)
> > - nfsd_file_unhash_and_put(nf);
> > + trace_nfsd_file_put(nf);
> >
> > - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
> > - nfsd_file_flush(nf);
> > - nfsd_file_put_noref(nf);
> > - } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1) {
> > - nfsd_file_put_noref(nf);
> > - nfsd_file_schedule_laundrette();
> > - } else
> > - nfsd_file_put_noref(nf);
> > + /* NFSv2/3 case */
> > + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > + /*
> > + * If this is the last reference (nf_ref == 1), then transfer
> > + * it to the LRU. If the add to the LRU fails, just put it as
> > + * usual.
> > + */
> > + if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > + return;
>
> This is one place that looks like you are refcounting entries on the LRU.
> If this is the last reference and you can transfer it to the LRU, then
> you don't need to drop the reference.
>
This is the only call to nfsd_file_lru_add. We only put an entry on the
LRU if it was a "gc" entry and the reference was the last one.
> > + }
> > + __nfsd_file_put(nf);
> > }
> >
>
> NeilBrown
--
Jeff Layton <[email protected]>