John Hubbard reports seeing the following stack trace:
nfs4_do_reclaim
rcu_read_lock /* we are now in_atomic() and must not sleep */
nfs4_purge_state_owners
nfs4_free_state_owner
nfs4_destroy_seqid_counter
rpc_destroy_wait_queue
cancel_delayed_work_sync
__cancel_work_timer
__flush_work
start_flush_work
might_sleep:
(kernel/workqueue.c:2975: BUG)
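Put differently, the pre-patch reclaim loop has roughly the shape shown below (abbreviated; the full context is in the diff further down). Freeing a state owner from inside the RCU read-side critical section eventually reaches cancel_delayed_work_sync(), which may block:

	rcu_read_lock();	/* sleeping is forbidden until rcu_read_unlock() */
	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
		/*
		 * Before this patch, nfs4_purge_state_owners() also freed the
		 * owners it unlinked. Freeing one goes through
		 * nfs4_destroy_seqid_counter() -> rpc_destroy_wait_queue() ->
		 * cancel_delayed_work_sync(), which may sleep.
		 */
		nfs4_purge_state_owners(server);
	}
	rcu_read_unlock();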
The solution is to separate out the freeing of the state owners
from nfs4_purge_state_owners(), and perform that outside the atomic
context.
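The fix follows a common kernel idiom: while in the atomic section, only unlink the objects onto a caller-provided list, then free them once sleeping is allowed again. Below is a minimal sketch of that idiom; the struct and function names (foo, foo_purge, foo_free_all) are hypothetical stand-ins, not the real NFS code:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* Hypothetical object kept on a lock-protected LRU list. */
struct foo {
	struct list_head lru;
};

static LIST_HEAD(foo_lru);
static DEFINE_SPINLOCK(foo_lock);

/* Phase 1: safe in atomic context; only unlinks entries, never sleeps. */
static void foo_purge(struct list_head *freeme)
{
	struct foo *f, *tmp;

	spin_lock(&foo_lock);
	list_for_each_entry_safe(f, tmp, &foo_lru, lru)
		list_move(&f->lru, freeme);
	spin_unlock(&foo_lock);
}

/* Phase 2: called from process context, where sleeping is allowed. */
static void foo_free_all(struct list_head *freeme)
{
	struct foo *f, *tmp;

	list_for_each_entry_safe(f, tmp, freeme, lru) {
		list_del(&f->lru);
		kfree(f);	/* the NFS equivalent, nfs4_free_state_owner(), may sleep */
	}
}

The caller declares a local LIST_HEAD(freeme), calls foo_purge(&freeme) wherever convenient (even under rcu_read_lock()), and calls foo_free_all(&freeme) once it can sleep again, which is exactly the shape the hunks below give nfs4_destroy_server() and nfs4_do_reclaim().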
Reported-by: John Hubbard <[email protected]>
Fixes: 0aaaf5c424c7f ("NFS: Cache state owners after files are closed")
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/nfs4_fs.h | 3 ++-
fs/nfs/nfs4client.c | 5 ++++-
fs/nfs/nfs4state.c | 27 ++++++++++++++++++++++-----
3 files changed, 28 insertions(+), 7 deletions(-)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index d778dad9a75e..3564da1ba8a1 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -465,7 +465,8 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session,
extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, const struct cred *, gfp_t);
extern void nfs4_put_state_owner(struct nfs4_state_owner *);
-extern void nfs4_purge_state_owners(struct nfs_server *);
+extern void nfs4_purge_state_owners(struct nfs_server *, struct list_head *);
+extern void nfs4_free_state_owners(struct list_head *head);
extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
extern void nfs4_put_open_state(struct nfs4_state *);
extern void nfs4_close_state(struct nfs4_state *, fmode_t);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 616393a01c06..da6204025a2d 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -758,9 +758,12 @@ int nfs41_walk_client_list(struct nfs_client *new,
static void nfs4_destroy_server(struct nfs_server *server)
{
+ LIST_HEAD(freeme);
+
nfs_server_return_all_delegations(server);
unset_pnfs_layoutdriver(server);
- nfs4_purge_state_owners(server);
+ nfs4_purge_state_owners(server, &freeme);
+ nfs4_free_state_owners(&freeme);
}
/*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index d03b9cf42bd0..a4e866b2b43b 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -624,24 +624,39 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
/**
* nfs4_purge_state_owners - Release all cached state owners
* @server: nfs_server with cached state owners to release
+ * @head: resulting list of state owners
*
* Called at umount time. Remaining state owners will be on
* the LRU with ref count of zero.
+ * Note that the state owners are not freed, but are added
+ * to the list @head, which can later be used as an argument
+ * to nfs4_free_state_owners.
*/
-void nfs4_purge_state_owners(struct nfs_server *server)
+void nfs4_purge_state_owners(struct nfs_server *server, struct list_head *head)
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state_owner *sp, *tmp;
- LIST_HEAD(doomed);
spin_lock(&clp->cl_lock);
list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
- list_move(&sp->so_lru, &doomed);
+ list_move(&sp->so_lru, head);
nfs4_remove_state_owner_locked(sp);
}
spin_unlock(&clp->cl_lock);
+}
- list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+/**
+ * nfs4_free_state_owners - Release a list of state owners
+ * @head: list of state owners to free
+ *
+ * Frees the list of state owners that was generated by a previous
+ * call to nfs4_purge_state_owners().
+ */
+void nfs4_free_state_owners(struct list_head *head)
+{
+ struct nfs4_state_owner *sp, *tmp;
+
+ list_for_each_entry_safe(sp, tmp, head, so_lru) {
list_del(&sp->so_lru);
nfs4_free_state_owner(sp);
}
@@ -1865,12 +1880,13 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
struct nfs4_state_owner *sp;
struct nfs_server *server;
struct rb_node *pos;
+ LIST_HEAD(freeme);
int status = 0;
restart:
rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- nfs4_purge_state_owners(server);
+ nfs4_purge_state_owners(server, &freeme);
spin_lock(&clp->cl_lock);
for (pos = rb_first(&server->state_owners);
pos != NULL;
@@ -1899,6 +1915,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
spin_unlock(&clp->cl_lock);
}
rcu_read_unlock();
+ nfs4_free_state_owners(&freeme);
return 0;
}
--
2.21.0
On 8/3/19 7:40 AM, Trond Myklebust wrote:
> John Hubbard reports seeing the following stack trace:
>
> nfs4_do_reclaim
> rcu_read_lock /* we are now in_atomic() and must not sleep */
> nfs4_purge_state_owners
> nfs4_free_state_owner
> nfs4_destroy_seqid_counter
> rpc_destroy_wait_queue
> cancel_delayed_work_sync
> __cancel_work_timer
> __flush_work
> start_flush_work
> might_sleep:
> (kernel/workqueue.c:2975: BUG)
>
> The solution is to separate out the freeing of the state owners
> from nfs4_purge_state_owners(), and perform that outside the atomic
> context.
>
All better now--this definitely fixes it. I can reboot the server, and
of course that backtrace is gone. Then the client mounts hang, so I do
a mount -a -o remount, and after about 1 minute, the client mounts
start working again, with no indication of problems. I assume that the
pause is by design--timing out somewhere, to recover from the server
going missing for a while. If so, then all is well.
thanks,
--
John Hubbard
NVIDIA
On Sat, 2019-08-03 at 12:07 -0700, John Hubbard wrote:
> On 8/3/19 7:40 AM, Trond Myklebust wrote:
> > John Hubbard reports seeing the following stack trace:
> >
> > nfs4_do_reclaim
> > rcu_read_lock /* we are now in_atomic() and must not sleep */
> > nfs4_purge_state_owners
> > nfs4_free_state_owner
> > nfs4_destroy_seqid_counter
> > rpc_destroy_wait_queue
> > cancel_delayed_work_sync
> > __cancel_work_timer
> > __flush_work
> > start_flush_work
> > might_sleep:
> > (kernel/workqueue.c:2975: BUG)
> >
> > The solution is to separate out the freeing of the state owners
> > from nfs4_purge_state_owners(), and perform that outside the atomic
> > context.
> >
>
> All better now--this definitely fixes it. I can reboot the server, and
> of course that backtrace is gone. Then the client mounts hang, so I do
> a mount -a -o remount, and after about 1 minute, the client mounts
> start working again, with no indication of problems. I assume that the
> pause is by design--timing out somewhere, to recover from the server
> going missing for a while. If so, then all is well.
>
Thanks very much for the report, and for testing!
With regards to the 1 minute delay, I strongly suspect that what you
are seeing is the NFSv4 "grace period".
After an NFSv4.x server reboot, the clients are given a certain amount
of time in which to recover the file open state and lock state that
they may have held before the reboot. All non-recovery opens, locks and
all I/O are stopped while this recovery process is happening to ensure
that locking conflicts do not occur. This ensures that all locks can
survive server reboots without any loss of atomicity.
With NFSv4.1 and NFSv4.2, the server can determine when all the clients
have finished recovering state and end the grace period early; however,
I've recently seen cases where that was not happening. I'm not sure yet
if that is a real server regression.
Cheers
Trond
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On 8/3/19 3:22 PM, Trond Myklebust wrote:
> Thanks very much for the report, and for testing!
>
> With regards to the 1 minute delay, I strongly suspect that what you
> are seeing is the NFSv4 "grace period".
>
> After an NFSv4.x server reboot, the clients are given a certain amount
> of time in which to recover the file open state and lock state that
> they may have held before the reboot. All non-recovery opens, locks and
> all I/O are stopped while this recovery process is happening to ensure
> that locking conflicts do not occur. This ensures that all locks can
> survive server reboots without any loss of atomicity.
>
> With NFSv4.1 and NFSv4.2, the server can determine when all the clients
> have finished recovering state and end the grace period early; however,
> I've recently seen cases where that was not happening. I'm not sure yet
> if that is a real server regression.
>
Aha, thanks for explaining. It's great to see such refined behavior now
from NFS, definitely enjoying v4! :)
thanks,
--
John Hubbard
NVIDIA