2023-02-10 08:12:44

by Dai Ngo

[permalink] [raw]
Subject: [RFC PATCH 1/1] SUNRPC: increase max timeout for rebind to handle NFS server restart

Occasionally NLM lock and unlock requests fail with EIO and ENOLCK,
respectively. This usually happens when the NFS server is restarted
while an NLM lock test is running.

Currently there is a 9-second limit for retrying the bind operation.
If the server is under load, the port mapper might take more than 9
seconds to become ready after the NFS server is restarted.

This patch increases the timeout for rebind from 9 to 30 seconds
allowing a bit more time for the port mapper to become ready.

Signed-off-by: Dai Ngo <[email protected]>
---
include/linux/sunrpc/clnt.h | 3 +++
include/linux/sunrpc/sched.h | 4 ++--
net/sunrpc/clnt.c | 2 +-
net/sunrpc/sched.c | 3 ++-
4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 770ef2cb5775..7f2dee56c121 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -162,6 +162,9 @@ struct rpc_add_xprt_test {
#define RPC_CLNT_CREATE_REUSEPORT (1UL << 11)
#define RPC_CLNT_CREATE_CONNECTED (1UL << 12)

+#define RPC_CLNT_REBIND_DELAY 3
+#define RPC_CLNT_REBIND_MAX_TIMEOUT 30
+
struct rpc_clnt *rpc_create(struct rpc_create_args *args);
struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *,
const struct rpc_program *, u32);
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index b8ca3ecaf8d7..e9dc142f10bb 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -90,8 +90,8 @@ struct rpc_task {
#endif
unsigned char tk_priority : 2,/* Task priority */
tk_garb_retry : 2,
- tk_cred_retry : 2,
- tk_rebind_retry : 2;
+ tk_cred_retry : 2;
+ unsigned char tk_rebind_retry;
};

typedef void (*rpc_action)(struct rpc_task *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0b0b9f1eed46..6c89a1fa40bf 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2053,7 +2053,7 @@ call_bind_status(struct rpc_task *task)
if (task->tk_rebind_retry == 0)
break;
task->tk_rebind_retry--;
- rpc_delay(task, 3*HZ);
+ rpc_delay(task, RPC_CLNT_REBIND_DELAY * HZ);
goto retry_timeout;
case -ENOBUFS:
rpc_delay(task, HZ >> 2);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index be587a308e05..5c18a35752aa 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -817,7 +817,8 @@ rpc_init_task_statistics(struct rpc_task *task)
/* Initialize retry counters */
task->tk_garb_retry = 2;
task->tk_cred_retry = 2;
- task->tk_rebind_retry = 2;
+ task->tk_rebind_retry = RPC_CLNT_REBIND_MAX_TIMEOUT /
+ RPC_CLNT_REBIND_DELAY;

/* starting timestamp */
task->tk_start = ktime_get();
--
2.9.5



2023-02-17 18:22:42

by Dai Ngo

[permalink] [raw]
Subject: Re: [RFC PATCH 1/1] SUNRPC: increase max timeout for rebind to handle NFS server restart

Hi Trond,

Could you please let me know your opinion on this patch?

Thanks,
-Dai

On 2/10/23 12:10 AM, Dai Ngo wrote:
> Occasionally NLM lock and unlock requests fail with EIO and ENOLCK,
> respectively. This usually happens when the NFS server is restarted
> while an NLM lock test is running.
>
> Currently there is a 9-second limit for retrying the bind operation.
> If the server is under load, the port mapper might take more than 9
> seconds to become ready after the NFS server is restarted.
>
> This patch increases the timeout for rebind from 9 to 30 seconds
> allowing a bit more time for the port mapper to become ready.
>
> Signed-off-by: Dai Ngo <[email protected]>
> ---
> include/linux/sunrpc/clnt.h | 3 +++
> include/linux/sunrpc/sched.h | 4 ++--
> net/sunrpc/clnt.c | 2 +-
> net/sunrpc/sched.c | 3 ++-
> 4 files changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
> index 770ef2cb5775..7f2dee56c121 100644
> --- a/include/linux/sunrpc/clnt.h
> +++ b/include/linux/sunrpc/clnt.h
> @@ -162,6 +162,9 @@ struct rpc_add_xprt_test {
> #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11)
> #define RPC_CLNT_CREATE_CONNECTED (1UL << 12)
>
> +#define RPC_CLNT_REBIND_DELAY 3
> +#define RPC_CLNT_REBIND_MAX_TIMEOUT 30
> +
> struct rpc_clnt *rpc_create(struct rpc_create_args *args);
> struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *,
> const struct rpc_program *, u32);
> diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
> index b8ca3ecaf8d7..e9dc142f10bb 100644
> --- a/include/linux/sunrpc/sched.h
> +++ b/include/linux/sunrpc/sched.h
> @@ -90,8 +90,8 @@ struct rpc_task {
> #endif
> unsigned char tk_priority : 2,/* Task priority */
> tk_garb_retry : 2,
> - tk_cred_retry : 2,
> - tk_rebind_retry : 2;
> + tk_cred_retry : 2;
> + unsigned char tk_rebind_retry;
> };
>
> typedef void (*rpc_action)(struct rpc_task *);
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> index 0b0b9f1eed46..6c89a1fa40bf 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -2053,7 +2053,7 @@ call_bind_status(struct rpc_task *task)
> if (task->tk_rebind_retry == 0)
> break;
> task->tk_rebind_retry--;
> - rpc_delay(task, 3*HZ);
> + rpc_delay(task, RPC_CLNT_REBIND_DELAY * HZ);
> goto retry_timeout;
> case -ENOBUFS:
> rpc_delay(task, HZ >> 2);
> diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
> index be587a308e05..5c18a35752aa 100644
> --- a/net/sunrpc/sched.c
> +++ b/net/sunrpc/sched.c
> @@ -817,7 +817,8 @@ rpc_init_task_statistics(struct rpc_task *task)
> /* Initialize retry counters */
> task->tk_garb_retry = 2;
> task->tk_cred_retry = 2;
> - task->tk_rebind_retry = 2;
> + task->tk_rebind_retry = RPC_CLNT_REBIND_MAX_TIMEOUT /
> + RPC_CLNT_REBIND_DELAY;
>
> /* starting timestamp */
> task->tk_start = ktime_get();

2023-02-23 05:40:47

by Dai Ngo

[permalink] [raw]
Subject: Re: [RFC PATCH 1/1] SUNRPC: increase max timeout for rebind to handle NFS server restart

Hi Anna,

Just a reminder that this patch is still waiting for a review.

Thanks,
-Dai

On 2/17/23 10:22 AM, [email protected] wrote:
> Hi Trond,
>
> Could you please let me know your opinion on this patch?
>
> Thanks,
> -Dai
>
> On 2/10/23 12:10 AM, Dai Ngo wrote:
>> Occasionally NLM lock and unlock requests fail with EIO and ENOLCK,
>> respectively. This usually happens when the NFS server is restarted
>> while an NLM lock test is running.
>>
>> Currently there is a 9-second limit for retrying the bind operation.
>> If the server is under load, the port mapper might take more than 9
>> seconds to become ready after the NFS server is restarted.
>>
>> This patch increases the timeout for rebind from 9 to 30 seconds
>> allowing a bit more time for the port mapper to become ready.
>>
>> Signed-off-by: Dai Ngo <[email protected]>
>> ---
>>   include/linux/sunrpc/clnt.h  | 3 +++
>>   include/linux/sunrpc/sched.h | 4 ++--
>>   net/sunrpc/clnt.c            | 2 +-
>>   net/sunrpc/sched.c           | 3 ++-
>>   4 files changed, 8 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
>> index 770ef2cb5775..7f2dee56c121 100644
>> --- a/include/linux/sunrpc/clnt.h
>> +++ b/include/linux/sunrpc/clnt.h
>> @@ -162,6 +162,9 @@ struct rpc_add_xprt_test {
>>   #define RPC_CLNT_CREATE_REUSEPORT    (1UL << 11)
>>   #define RPC_CLNT_CREATE_CONNECTED    (1UL << 12)
>>
>> +#define    RPC_CLNT_REBIND_DELAY        3
>> +#define    RPC_CLNT_REBIND_MAX_TIMEOUT    30
>> +
>>   struct rpc_clnt *rpc_create(struct rpc_create_args *args);
>>   struct rpc_clnt    *rpc_bind_new_program(struct rpc_clnt *,
>>                   const struct rpc_program *, u32);
>> diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
>> index b8ca3ecaf8d7..e9dc142f10bb 100644
>> --- a/include/linux/sunrpc/sched.h
>> +++ b/include/linux/sunrpc/sched.h
>> @@ -90,8 +90,8 @@ struct rpc_task {
>>   #endif
>>       unsigned char        tk_priority : 2,/* Task priority */
>>                   tk_garb_retry : 2,
>> -                tk_cred_retry : 2,
>> -                tk_rebind_retry : 2;
>> +                tk_cred_retry : 2;
>> +    unsigned char        tk_rebind_retry;
>>   };
>>
>>   typedef void            (*rpc_action)(struct rpc_task *);
>> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
>> index 0b0b9f1eed46..6c89a1fa40bf 100644
>> --- a/net/sunrpc/clnt.c
>> +++ b/net/sunrpc/clnt.c
>> @@ -2053,7 +2053,7 @@ call_bind_status(struct rpc_task *task)
>>           if (task->tk_rebind_retry == 0)
>>               break;
>>           task->tk_rebind_retry--;
>> -        rpc_delay(task, 3*HZ);
>> +        rpc_delay(task, RPC_CLNT_REBIND_DELAY * HZ);
>>           goto retry_timeout;
>>       case -ENOBUFS:
>>           rpc_delay(task, HZ >> 2);
>> diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
>> index be587a308e05..5c18a35752aa 100644
>> --- a/net/sunrpc/sched.c
>> +++ b/net/sunrpc/sched.c
>> @@ -817,7 +817,8 @@ rpc_init_task_statistics(struct rpc_task *task)
>>       /* Initialize retry counters */
>>       task->tk_garb_retry = 2;
>>       task->tk_cred_retry = 2;
>> -    task->tk_rebind_retry = 2;
>> +    task->tk_rebind_retry = RPC_CLNT_REBIND_MAX_TIMEOUT /
>> +                    RPC_CLNT_REBIND_DELAY;
>>
>>       /* starting timestamp */
>>       task->tk_start = ktime_get();