2022-09-30 19:16:21

by Jeff Layton

[permalink] [raw]
Subject: [PATCH 3/3] nfsd: fix nfsd_file_unhash_and_dispose

This function is called two reasons:

We're either shutting down and purging the filecache, or we've gotten a
notification about a file delete, so we want to go ahead and unhash it
so that it'll get cleaned up when we close.

We're either walking the hashtable or doing a lookup in it and we
don't take a reference in either case. What we want to do in both cases
is to try and unhash the object and put it on the dispose list if that
was successful. If it's no longer hashed, then we don't want to touch
it, with the assumption being that something else is already cleaning
up the sentinel reference.

Instead of trying to selectively decrement the refcount in this
function, just unhash it, and if that was successful, move it to the
dispose list. Then, the disposal routine will just clean that up as
usual.

Also, just make this a void function, drop the WARN_ON_ONCE, and the
comments about deadlocking since the nature of the purported deadlock
is no longer clear.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/filecache.c | 32 ++++++--------------------------
1 file changed, 6 insertions(+), 26 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 58f4d9267f4a..16bd71a3894e 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -408,19 +408,14 @@ nfsd_file_unhash(struct nfsd_file *nf)
/*
* Return true if the file was unhashed.
*/
-static bool
+static void
nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
{
trace_nfsd_file_unhash_and_dispose(nf);
- if (!nfsd_file_unhash(nf))
- return false;
- /* keep final reference for nfsd_file_lru_dispose */
- if (refcount_dec_not_one(&nf->nf_ref))
- return true;
-
- nfsd_file_lru_remove(nf);
- list_add(&nf->nf_lru, dispose);
- return true;
+ if (nfsd_file_unhash(nf)) {
+ nfsd_file_lru_remove(nf);
+ list_add(&nf->nf_lru, dispose);
+ }
}

static void
@@ -564,8 +559,6 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
* @lock: LRU list lock (unused)
* @arg: dispose list
*
- * Note this can deadlock with nfsd_file_cache_purge.
- *
* Return values:
* %LRU_REMOVED: @item was removed from the LRU
* %LRU_ROTATE: @item is to be moved to the LRU tail
@@ -750,8 +743,6 @@ nfsd_file_close_inode(struct inode *inode)
*
* Walk the LRU list and close any entries that have not been used since
* the last scan.
- *
- * Note this can deadlock with nfsd_file_cache_purge.
*/
static void
nfsd_file_delayed_close(struct work_struct *work)
@@ -893,16 +884,12 @@ nfsd_file_cache_init(void)
goto out;
}

-/*
- * Note this can deadlock with nfsd_file_lru_cb.
- */
static void
__nfsd_file_cache_purge(struct net *net)
{
struct rhashtable_iter iter;
struct nfsd_file *nf;
LIST_HEAD(dispose);
- bool del;

rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter);
do {
@@ -912,14 +899,7 @@ __nfsd_file_cache_purge(struct net *net)
while (!IS_ERR_OR_NULL(nf)) {
if (net && nf->nf_net != net)
continue;
- del = nfsd_file_unhash_and_dispose(nf, &dispose);
-
- /*
- * Deadlock detected! Something marked this entry as
- * unhased, but hasn't removed it from the hash list.
- */
- WARN_ON_ONCE(!del);
-
+ nfsd_file_unhash_and_dispose(nf, &dispose);
nf = rhashtable_walk_next(&iter);
}

--
2.37.3


2022-09-30 19:31:58

by Chuck Lever

[permalink] [raw]
Subject: Re: [PATCH 3/3] nfsd: fix nfsd_file_unhash_and_dispose



> On Sep 30, 2022, at 3:15 PM, Jeff Layton <[email protected]> wrote:
>
> This function is called two reasons:
>
> We're either shutting down and purging the filecache, or we've gotten a
> notification about a file delete, so we want to go ahead and unhash it
> so that it'll get cleaned up when we close.
>
> We're either walking the hashtable or doing a lookup in it and we
> don't take a reference in either case. What we want to do in both cases
> is to try and unhash the object and put it on the dispose list if that
> was successful. If it's no longer hashed, then we don't want to touch
> it, with the assumption being that something else is already cleaning
> up the sentinel reference.
>
> Instead of trying to selectively decrement the refcount in this
> function, just unhash it, and if that was successful, move it to the
> dispose list. Then, the disposal routine will just clean that up as
> usual.
>
> Also, just make this a void function, drop the WARN_ON_ONCE, and the
> comments about deadlocking since the nature of the purported deadlock
> is no longer clear.
>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/nfsd/filecache.c | 32 ++++++--------------------------
> 1 file changed, 6 insertions(+), 26 deletions(-)
>
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index 58f4d9267f4a..16bd71a3894e 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -408,19 +408,14 @@ nfsd_file_unhash(struct nfsd_file *nf)
> /*
> * Return true if the file was unhashed.
> */

If you're changing the function to return void, the above
comment is now stale.

> -static bool
> +static void
> nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> {
> trace_nfsd_file_unhash_and_dispose(nf);
> - if (!nfsd_file_unhash(nf))
> - return false;
> - /* keep final reference for nfsd_file_lru_dispose */

This comment has been stale since nfsd_file_lru_dispose() was
renamed or removed. The only trouble I have is there isn't a
comment left that explains why we're not decrementing the hash
table reference here. ("don't have to" is enough to say about
it, but there should be something).


> - if (refcount_dec_not_one(&nf->nf_ref))
> - return true;
> -
> - nfsd_file_lru_remove(nf);
> - list_add(&nf->nf_lru, dispose);
> - return true;
> + if (nfsd_file_unhash(nf)) {
> + nfsd_file_lru_remove(nf);
> + list_add(&nf->nf_lru, dispose);
> + }
> }
>
> static void
> @@ -564,8 +559,6 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
> * @lock: LRU list lock (unused)
> * @arg: dispose list
> *
> - * Note this can deadlock with nfsd_file_cache_purge.
> - *
> * Return values:
> * %LRU_REMOVED: @item was removed from the LRU
> * %LRU_ROTATE: @item is to be moved to the LRU tail
> @@ -750,8 +743,6 @@ nfsd_file_close_inode(struct inode *inode)
> *
> * Walk the LRU list and close any entries that have not been used since
> * the last scan.
> - *
> - * Note this can deadlock with nfsd_file_cache_purge.
> */
> static void
> nfsd_file_delayed_close(struct work_struct *work)
> @@ -893,16 +884,12 @@ nfsd_file_cache_init(void)
> goto out;
> }
>
> -/*
> - * Note this can deadlock with nfsd_file_lru_cb.
> - */
> static void
> __nfsd_file_cache_purge(struct net *net)
> {
> struct rhashtable_iter iter;
> struct nfsd_file *nf;
> LIST_HEAD(dispose);
> - bool del;
>
> rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter);
> do {
> @@ -912,14 +899,7 @@ __nfsd_file_cache_purge(struct net *net)
> while (!IS_ERR_OR_NULL(nf)) {
> if (net && nf->nf_net != net)
> continue;
> - del = nfsd_file_unhash_and_dispose(nf, &dispose);
> -
> - /*
> - * Deadlock detected! Something marked this entry as
> - * unhased, but hasn't removed it from the hash list.
> - */
> - WARN_ON_ONCE(!del);
> -
> + nfsd_file_unhash_and_dispose(nf, &dispose);
> nf = rhashtable_walk_next(&iter);
> }
>
> --
> 2.37.3
>

--
Chuck Lever



2022-09-30 19:55:38

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 3/3] nfsd: fix nfsd_file_unhash_and_dispose

On Fri, 2022-09-30 at 19:29 +0000, Chuck Lever III wrote:
>
> > On Sep 30, 2022, at 3:15 PM, Jeff Layton <[email protected]> wrote:
> >
> > This function is called two reasons:
> >
> > We're either shutting down and purging the filecache, or we've gotten a
> > notification about a file delete, so we want to go ahead and unhash it
> > so that it'll get cleaned up when we close.
> >
> > We're either walking the hashtable or doing a lookup in it and we
> > don't take a reference in either case. What we want to do in both cases
> > is to try and unhash the object and put it on the dispose list if that
> > was successful. If it's no longer hashed, then we don't want to touch
> > it, with the assumption being that something else is already cleaning
> > up the sentinel reference.
> >
> > Instead of trying to selectively decrement the refcount in this
> > function, just unhash it, and if that was successful, move it to the
> > dispose list. Then, the disposal routine will just clean that up as
> > usual.
> >
> > Also, just make this a void function, drop the WARN_ON_ONCE, and the
> > comments about deadlocking since the nature of the purported deadlock
> > is no longer clear.
> >
> > Signed-off-by: Jeff Layton <[email protected]>
> > ---
> > fs/nfsd/filecache.c | 32 ++++++--------------------------
> > 1 file changed, 6 insertions(+), 26 deletions(-)
> >
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index 58f4d9267f4a..16bd71a3894e 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -408,19 +408,14 @@ nfsd_file_unhash(struct nfsd_file *nf)
> > /*
> > * Return true if the file was unhashed.
> > */
>
> If you're changing the function to return void, the above
> comment is now stale.
>
> > -static bool
> > +static void
> > nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
> > {
> > trace_nfsd_file_unhash_and_dispose(nf);
> > - if (!nfsd_file_unhash(nf))
> > - return false;
> > - /* keep final reference for nfsd_file_lru_dispose */
>
> This comment has been stale since nfsd_file_lru_dispose() was
> renamed or removed. The only trouble I have is there isn't a
> comment left that explains why we're not decrementing the hash
> table reference here. ("don't have to" is enough to say about
> it, but there should be something).
>
>

How about this?

+ if (nfsd_file_unhash(nf)) {
+ /*
+ * Unhashing was successful. Transfer it to the dispose list
+ * so that we can put the sentinel reference for it later.
+ */
+ nfsd_file_lru_remove(nf);
+ list_add(&nf->nf_lru, dispose);
+ }


In this case, we're basically transferring the sentinel reference to the
"dispose" list. Later, we'll call nfsd_file_dispose_list and drop it.

Now that we don't have such onerous spinlocking in this code, we might
be able to just put each reference as we go instead of deferring it to a
list and putting them all at the end. That's probably best done later in
a separate patch however.


> > - if (refcount_dec_not_one(&nf->nf_ref))
> > - return true;
> > -
> > - nfsd_file_lru_remove(nf);
> > - list_add(&nf->nf_lru, dispose);
> > - return true;
> > + if (nfsd_file_unhash(nf)) {
> > + nfsd_file_lru_remove(nf);
> > + list_add(&nf->nf_lru, dispose);
> > + }
> > }
> >
> > static void
> > @@ -564,8 +559,6 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
> > * @lock: LRU list lock (unused)
> > * @arg: dispose list
> > *
> > - * Note this can deadlock with nfsd_file_cache_purge.
> > - *
> > * Return values:
> > * %LRU_REMOVED: @item was removed from the LRU
> > * %LRU_ROTATE: @item is to be moved to the LRU tail
> > @@ -750,8 +743,6 @@ nfsd_file_close_inode(struct inode *inode)
> > *
> > * Walk the LRU list and close any entries that have not been used since
> > * the last scan.
> > - *
> > - * Note this can deadlock with nfsd_file_cache_purge.
> > */
> > static void
> > nfsd_file_delayed_close(struct work_struct *work)
> > @@ -893,16 +884,12 @@ nfsd_file_cache_init(void)
> > goto out;
> > }
> >
> > -/*
> > - * Note this can deadlock with nfsd_file_lru_cb.
> > - */
> > static void
> > __nfsd_file_cache_purge(struct net *net)
> > {
> > struct rhashtable_iter iter;
> > struct nfsd_file *nf;
> > LIST_HEAD(dispose);
> > - bool del;
> >
> > rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter);
> > do {
> > @@ -912,14 +899,7 @@ __nfsd_file_cache_purge(struct net *net)
> > while (!IS_ERR_OR_NULL(nf)) {
> > if (net && nf->nf_net != net)
> > continue;
> > - del = nfsd_file_unhash_and_dispose(nf, &dispose);
> > -
> > - /*
> > - * Deadlock detected! Something marked this entry as
> > - * unhased, but hasn't removed it from the hash list.
> > - */
> > - WARN_ON_ONCE(!del);
> > -
> > + nfsd_file_unhash_and_dispose(nf, &dispose);
> > nf = rhashtable_walk_next(&iter);
> > }
> >
> > --
> > 2.37.3
> >
>
> --
> Chuck Lever
>
>
>

--
Jeff Layton <[email protected]>

2022-09-30 20:30:04

by Chuck Lever

[permalink] [raw]
Subject: Re: [PATCH 3/3] nfsd: fix nfsd_file_unhash_and_dispose



> On Sep 30, 2022, at 3:42 PM, Jeff Layton <[email protected]> wrote:
>
> On Fri, 2022-09-30 at 19:29 +0000, Chuck Lever III wrote:
>>
>>> On Sep 30, 2022, at 3:15 PM, Jeff Layton <[email protected]> wrote:
>>>
>>> This function is called two reasons:
>>>
>>> We're either shutting down and purging the filecache, or we've gotten a
>>> notification about a file delete, so we want to go ahead and unhash it
>>> so that it'll get cleaned up when we close.
>>>
>>> We're either walking the hashtable or doing a lookup in it and we
>>> don't take a reference in either case. What we want to do in both cases
>>> is to try and unhash the object and put it on the dispose list if that
>>> was successful. If it's no longer hashed, then we don't want to touch
>>> it, with the assumption being that something else is already cleaning
>>> up the sentinel reference.
>>>
>>> Instead of trying to selectively decrement the refcount in this
>>> function, just unhash it, and if that was successful, move it to the
>>> dispose list. Then, the disposal routine will just clean that up as
>>> usual.
>>>
>>> Also, just make this a void function, drop the WARN_ON_ONCE, and the
>>> comments about deadlocking since the nature of the purported deadlock
>>> is no longer clear.
>>>
>>> Signed-off-by: Jeff Layton <[email protected]>
>>> ---
>>> fs/nfsd/filecache.c | 32 ++++++--------------------------
>>> 1 file changed, 6 insertions(+), 26 deletions(-)
>>>
>>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
>>> index 58f4d9267f4a..16bd71a3894e 100644
>>> --- a/fs/nfsd/filecache.c
>>> +++ b/fs/nfsd/filecache.c
>>> @@ -408,19 +408,14 @@ nfsd_file_unhash(struct nfsd_file *nf)
>>> /*
>>> * Return true if the file was unhashed.
>>> */
>>
>> If you're changing the function to return void, the above
>> comment is now stale.
>>
>>> -static bool
>>> +static void
>>> nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
>>> {
>>> trace_nfsd_file_unhash_and_dispose(nf);
>>> - if (!nfsd_file_unhash(nf))
>>> - return false;
>>> - /* keep final reference for nfsd_file_lru_dispose */
>>
>> This comment has been stale since nfsd_file_lru_dispose() was
>> renamed or removed. The only trouble I have is there isn't a
>> comment left that explains why we're not decrementing the hash
>> table reference here. ("don't have to" is enough to say about
>> it, but there should be something).
>>
>>
>
> How about this?
>
> + if (nfsd_file_unhash(nf)) {
> + /*
> + * Unhashing was successful. Transfer it to the dispose list
> + * so that we can put the sentinel reference for it later.
> + */

Right idea, but I would say nothing more than "nfsd_file_dispose_list()
will put the sentinel reference later."


> + nfsd_file_lru_remove(nf);
> + list_add(&nf->nf_lru, dispose);
> + }

I was staring at this earlier today and thinking it needed clean
up. This looks right to me.


> In this case, we're basically transferring the sentinel reference to the
> "dispose" list. Later, we'll call nfsd_file_dispose_list and drop it.
>
> Now that we don't have such onerous spinlocking in this code, we might
> be able to just put each reference as we go instead of deferring it to a
> list and putting them all at the end. That's probably best done later in
> a separate patch however.
>
>
>>> - if (refcount_dec_not_one(&nf->nf_ref))
>>> - return true;
>>> -
>>> - nfsd_file_lru_remove(nf);
>>> - list_add(&nf->nf_lru, dispose);
>>> - return true;
>>> + if (nfsd_file_unhash(nf)) {
>>> + nfsd_file_lru_remove(nf);
>>> + list_add(&nf->nf_lru, dispose);
>>> + }
>>> }
>>>
>>> static void
>>> @@ -564,8 +559,6 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
>>> * @lock: LRU list lock (unused)
>>> * @arg: dispose list
>>> *
>>> - * Note this can deadlock with nfsd_file_cache_purge.
>>> - *
>>> * Return values:
>>> * %LRU_REMOVED: @item was removed from the LRU
>>> * %LRU_ROTATE: @item is to be moved to the LRU tail
>>> @@ -750,8 +743,6 @@ nfsd_file_close_inode(struct inode *inode)
>>> *
>>> * Walk the LRU list and close any entries that have not been used since
>>> * the last scan.
>>> - *
>>> - * Note this can deadlock with nfsd_file_cache_purge.
>>> */
>>> static void
>>> nfsd_file_delayed_close(struct work_struct *work)
>>> @@ -893,16 +884,12 @@ nfsd_file_cache_init(void)
>>> goto out;
>>> }
>>>
>>> -/*
>>> - * Note this can deadlock with nfsd_file_lru_cb.
>>> - */
>>> static void
>>> __nfsd_file_cache_purge(struct net *net)
>>> {
>>> struct rhashtable_iter iter;
>>> struct nfsd_file *nf;
>>> LIST_HEAD(dispose);
>>> - bool del;
>>>
>>> rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter);
>>> do {
>>> @@ -912,14 +899,7 @@ __nfsd_file_cache_purge(struct net *net)
>>> while (!IS_ERR_OR_NULL(nf)) {
>>> if (net && nf->nf_net != net)
>>> continue;
>>> - del = nfsd_file_unhash_and_dispose(nf, &dispose);
>>> -
>>> - /*
>>> - * Deadlock detected! Something marked this entry as
>>> - * unhased, but hasn't removed it from the hash list.
>>> - */
>>> - WARN_ON_ONCE(!del);
>>> -
>>> + nfsd_file_unhash_and_dispose(nf, &dispose);
>>> nf = rhashtable_walk_next(&iter);
>>> }
>>>
>>> --
>>> 2.37.3
>>>
>>
>> --
>> Chuck Lever
>>
>>
>>
>
> --
> Jeff Layton <[email protected]>

--
Chuck Lever