2024-01-26 17:47:40

by Chuck Lever

[permalink] [raw]
Subject: [PATCH 2 03/14] NFSD: Reschedule CB operations when backchannel rpc_clnt is shut down

From: Chuck Lever <[email protected]>

As part of managing a client disconnect, NFSD closes down and
replaces the backchannel rpc_clnt.

If a callback operation is pending when the backchannel rpc_clnt is
shut down, currently nfsd4_run_cb_work() just discards that
callback. But there are multiple cases to deal with here:

o The client's lease is getting destroyed. Throw the CB away.

o The client disconnected. It might be forcing a retransmit of
  CB operations, or it could have disconnected for other reasons.
  Reschedule the CB so it is retransmitted when the client
  reconnects.

Since callback operations can now be rescheduled, ensure that
cb_ops->prepare can be called only once by moving the
cb_ops->prepare paragraph down to just before the rpc_call_async()
call.

Fixes: 2bbfed98a4d8 ("nfsd: Fix races between nfsd4_cb_release() and nfsd4_shutdown_callback()")
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfs4callback.c | 32 +++++++++++++++++++++++---------
1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 1ed2512b3648..389d05985c52 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -890,6 +890,13 @@ static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
return queue_delayed_work(callback_wq, &cb->cb_work, 0);
}

+static void nfsd4_queue_cb_delayed(struct nfsd4_callback *cb,
+ unsigned long msecs)
+{
+ queue_delayed_work(callback_wq, &cb->cb_work,
+ msecs_to_jiffies(msecs));
+}
+
static void nfsd41_cb_inflight_begin(struct nfs4_client *clp)
{
atomic_inc(&clp->cl_cb_inflight);
@@ -1375,20 +1382,21 @@ nfsd4_run_cb_work(struct work_struct *work)
struct rpc_clnt *clnt;
int flags;

- if (cb->cb_need_restart) {
- cb->cb_need_restart = false;
- } else {
- if (cb->cb_ops && cb->cb_ops->prepare)
- cb->cb_ops->prepare(cb);
- }
-
if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
nfsd4_process_cb_update(cb);

clnt = clp->cl_cb_client;
if (!clnt) {
- /* Callback channel broken, or client killed; give up: */
- nfsd41_destroy_cb(cb);
+ if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
+ nfsd41_destroy_cb(cb);
+ else {
+ /*
+ * XXX: Ideally, we could wait for the client to
+ * reconnect, but I haven't figured out how
+ * to do that yet.
+ */
+ nfsd4_queue_cb_delayed(cb, 25);
+ }
return;
}

@@ -1401,6 +1409,12 @@ nfsd4_run_cb_work(struct work_struct *work)
return;
}

+ if (cb->cb_need_restart) {
+ cb->cb_need_restart = false;
+ } else {
+ if (cb->cb_ops && cb->cb_ops->prepare)
+ cb->cb_ops->prepare(cb);
+ }
cb->cb_msg.rpc_cred = clp->cl_cb_cred;
flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,




2024-01-26 19:10:38

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 2 03/14] NFSD: Reschedule CB operations when backchannel rpc_clnt is shut down

On Fri, 2024-01-26 at 12:45 -0500, Chuck Lever wrote:
> From: Chuck Lever <[email protected]>
>
> As part of managing a client disconnect, NFSD closes down and
> replaces the backchannel rpc_clnt.
>
> If a callback operation is pending when the backchannel rpc_clnt is
> shut down, currently nfsd4_run_cb_work() just discards that
> callback. But there are multiple cases to deal with here:
>
> o The client's lease is getting destroyed. Throw the CB away.
>
> o The client disconnected. It might be forcing a retransmit of
> CB operations, or it could have disconnected for other reasons.
> Reschedule the CB so it is retransmitted when the client
> reconnects.
>
> Since callback operations can now be rescheduled, ensure that
> cb_ops->prepare can be called only once by moving the
> cb_ops->prepare paragraph down to just before the rpc_call_async()
> call.
>
> Fixes: 2bbfed98a4d8 ("nfsd: Fix races between nfsd4_cb_release() and nfsd4_shutdown_callback()")
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> fs/nfsd/nfs4callback.c | 32 +++++++++++++++++++++++---------
> 1 file changed, 23 insertions(+), 9 deletions(-)
>
> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index 1ed2512b3648..389d05985c52 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c
> @@ -890,6 +890,13 @@ static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
> return queue_delayed_work(callback_wq, &cb->cb_work, 0);
> }
>
> +static void nfsd4_queue_cb_delayed(struct nfsd4_callback *cb,
> + unsigned long msecs)
> +{
> + queue_delayed_work(callback_wq, &cb->cb_work,
> + msecs_to_jiffies(msecs));
> +}
> +
> static void nfsd41_cb_inflight_begin(struct nfs4_client *clp)
> {
> atomic_inc(&clp->cl_cb_inflight);
> @@ -1375,20 +1382,21 @@ nfsd4_run_cb_work(struct work_struct *work)
> struct rpc_clnt *clnt;
> int flags;
>
> - if (cb->cb_need_restart) {
> - cb->cb_need_restart = false;
> - } else {
> - if (cb->cb_ops && cb->cb_ops->prepare)
> - cb->cb_ops->prepare(cb);
> - }
> -
> if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
> nfsd4_process_cb_update(cb);
>
> clnt = clp->cl_cb_client;
> if (!clnt) {
> - /* Callback channel broken, or client killed; give up: */
> - nfsd41_destroy_cb(cb);
> + if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
> + nfsd41_destroy_cb(cb);
> + else {
> + /*
> + * XXX: Ideally, we could wait for the client to
> + * reconnect, but I haven't figured out how
> + * to do that yet.
> + */
> + nfsd4_queue_cb_delayed(cb, 25);
> + }
> return;
> }
>
> @@ -1401,6 +1409,12 @@ nfsd4_run_cb_work(struct work_struct *work)
> return;
> }
>
> + if (cb->cb_need_restart) {
> + cb->cb_need_restart = false;
> + } else {
> + if (cb->cb_ops && cb->cb_ops->prepare)
> + cb->cb_ops->prepare(cb);
> + }
> cb->cb_msg.rpc_cred = clp->cl_cb_cred;
> flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
> rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,
>
>
>

Reviewed-by: Jeff Layton <[email protected]>