An NFS4ERR_RECALLCONFLICT is returned by the server from a GET_LAYOUT
only when the server sent a RECALL due to that GET_LAYOUT, or
the RECALL and GET_LAYOUT crossed on the wire.
Either way, this means we want to wait at most until in-flight IO
is finished and the RECALL can be satisfied.
So a proper wait here is more like 1/10 of a second, not 15 seconds
like we have now. (We use NFS4_POLL_RETRY_MIN here)
Current code totally craps out performance of very large files on
most pnfs-objects layouts, because of how the map changes when the
file has grown and spills into the next raid group.
CC: Stable Tree <[email protected]>
Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/nfs/nfs4proc.c | 22 +++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d53d678..3264fca 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7058,7 +7058,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
struct nfs4_state *state = NULL;
unsigned long timeo, giveup;
- dprintk("--> %s\n", __func__);
+ dprintk("--> %s tk_status => %d\n", __func__, task->tk_status);
if (!nfs41_sequence_done(task, &lgp->res.seq_res))
goto out;
@@ -7067,11 +7067,27 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
case 0:
goto out;
case -NFS4ERR_LAYOUTTRYLATER:
+ /* NFS4ERR_RECALLCONFLICT is always a minimal delay (conflict with
+ * self)
+ * TODO: NFS4ERR_LAYOUTTRYLATER is a conflict with another client
+ * (or clients). What we should do is randomize a short delay like on a
+ * network broadcast burst, and raise the random max every failure.
+ * For now leave it stateless and do this polling.
+ */
case -NFS4ERR_RECALLCONFLICT:
timeo = rpc_get_timeout(task->tk_client);
giveup = lgp->args.timestamp + timeo;
- if (time_after(giveup, jiffies))
- task->tk_status = -NFS4ERR_DELAY;
+ if (time_after(giveup, jiffies)) {
+ /* Do a minimum delay, We are actually waiting for our
+ * own IO to finish (In most cases)
+ */
+ dprintk("%s: NFS4ERR_RECALLCONFLICT waiting\n",
+ __func__);
+ rpc_delay(task, NFS4_POLL_RETRY_MIN);
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+ goto out; /* Do not call nfs4_async_handle_error() */
+ }
break;
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
--
1.7.11.7
Trond hi
The subject needs to read "pnfs:" and not "pnfs-obj:"
On 01/14/2014 04:45 PM, Boaz Harrosh wrote:
>
> An NFS4ERR_RECALLCONFLICT is returned by server from a GET_LAYOUT
> only when a Server Sent a RECALL do to that GET_LAYOUT, or
> the RECALL and GET_LAYOUT crossed on the wire.
> In any way this means we want to wait at most until in-flight IO
> is finished and the RECALL can be satisfied.
>
> So a proper wait here is more like 1/10 of a second, not 15 seconds
> like we have now. (We use NFS4_POLL_RETRY_MIN here)
>
> Current code totally craps out performance of very large files on
> most pnfs-objects layouts, because of how the map changes when the
> file has grown and spills into the next raid group.
>
> CC: Stable Tree <[email protected]>
And please fix the email here to <[email protected]>
(OK I'll send a new one sorry)
Thanks
Boaz
> Signed-off-by: Boaz Harrosh <[email protected]>
> ---
> fs/nfs/nfs4proc.c | 22 +++++++++++++++++++---
> 1 file changed, 19 insertions(+), 3 deletions(-)
>
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index d53d678..3264fca 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -7058,7 +7058,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
> struct nfs4_state *state = NULL;
> unsigned long timeo, giveup;
>
> - dprintk("--> %s\n", __func__);
> + dprintk("--> %s tk_status => %d\n", __func__, task->tk_status);
>
> if (!nfs41_sequence_done(task, &lgp->res.seq_res))
> goto out;
> @@ -7067,11 +7067,27 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
> case 0:
> goto out;
> case -NFS4ERR_LAYOUTTRYLATER:
> + /* NFS4ERR_RECALLCONFLICT is always a minimal delay (conflict with
> + * self)
> + * TODO: NFS4ERR_LAYOUTTRYLATER is a conflict with another client
> + * (or clients). What we should do is randomize a short delay like on a
> + * network broadcast burst, and raise the random max every failure.
> + * For now leave it stateless and do this polling.
> + */
> case -NFS4ERR_RECALLCONFLICT:
> timeo = rpc_get_timeout(task->tk_client);
> giveup = lgp->args.timestamp + timeo;
> - if (time_after(giveup, jiffies))
> - task->tk_status = -NFS4ERR_DELAY;
> + if (time_after(giveup, jiffies)) {
> + /* Do a minimum delay, We are actually waiting for our
> + * own IO to finish (In most cases)
> + */
> + dprintk("%s: NFS4ERR_RECALLCONFLICT waiting\n",
> + __func__);
> + rpc_delay(task, NFS4_POLL_RETRY_MIN);
> + task->tk_status = 0;
> + rpc_restart_call_prepare(task);
> + goto out; /* Do not call nfs4_async_handle_error() */
> + }
> break;
> case -NFS4ERR_EXPIRED:
> case -NFS4ERR_BAD_STATEID:
>
On 01/14/2014 04:58 PM, Boaz Harrosh wrote:
> Sorry forgot to CC Stable Tree <[email protected]>
>
> Greg hi
>
> In Linux v3.9 there is a conflict around this area exactly do to change:
> [30005121] NFSv4.1: LAYOUTGET EDELAY loops timeout to the MDS
>
> If there are stables below 3.9 please tell me I will send you a patch
> for these.
>
> Thanks
> Boaz
>
<>
> Subject: [PATCH] pnfs-obj: Proper delay for NFS4ERR_RECALLCONFLICT in layout_get_done
>
>
> An NFS4ERR_RECALLCONFLICT is returned by server from a GET_LAYOUT
> only when a Server Sent a RECALL do to that GET_LAYOUT, or
> the RECALL and GET_LAYOUT crossed on the wire.
> In any way this means we want to wait at most until in-flight IO
> is finished and the RECALL can be satisfied.
>
> So a proper wait here is more like 1/10 of a second, not 15 seconds
> like we have now. (We use NFS4_POLL_RETRY_MIN here)
>
> Current code totally craps out performance of very large files on
> most pnfs-objects layouts, because of how the map changes when the
> file has grown and spills into the next raid group.
>
> CC: Stable Tree <[email protected]>
> Signed-off-by: Boaz Harrosh <[email protected]>
Trond hi
I have been sitting on this bug for over 6 months now. I completely forgot about
it until QA moved to a new Fedora and a vanilla kernel, which is missing this
fix.
This is a real bummer for objects in the case of large clusters, for example
a Panasas cluster with two shelves and up. On big clusters performance
should be better, but without this fix it is miserably, unacceptably slow.
Thanks
Boaz
Sorry forgot to CC Stable Tree <[email protected]>
Greg hi
In Linux v3.9 there is a conflict in exactly this area, due to the change:
[30005121] NFSv4.1: LAYOUTGET EDELAY loops timeout to the MDS
If there are stables below 3.9 please tell me I will send you a patch
for these.
Thanks
Boaz
-------- Original Message --------
Date: Tue, 14 Jan 2014 16:45:27 +0200
From: Boaz Harrosh <[email protected]>
To: Trond Myklebust <[email protected]>, NFS list <[email protected]>, open-osd <[email protected]>
Subject: [PATCH] pnfs-obj: Proper delay for NFS4ERR_RECALLCONFLICT in layout_get_done
An NFS4ERR_RECALLCONFLICT is returned by the server from a GET_LAYOUT
only when the server sent a RECALL due to that GET_LAYOUT, or
the RECALL and GET_LAYOUT crossed on the wire.
Either way, this means we want to wait at most until in-flight IO
is finished and the RECALL can be satisfied.
So a proper wait here is more like 1/10 of a second, not 15 seconds
like we have now. (We use NFS4_POLL_RETRY_MIN here)
Current code totally craps out performance of very large files on
most pnfs-objects layouts, because of how the map changes when the
file has grown and spills into the next raid group.
CC: Stable Tree <[email protected]>
Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/nfs/nfs4proc.c | 22 +++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d53d678..3264fca 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7058,7 +7058,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
struct nfs4_state *state = NULL;
unsigned long timeo, giveup;
- dprintk("--> %s\n", __func__);
+ dprintk("--> %s tk_status => %d\n", __func__, task->tk_status);
if (!nfs41_sequence_done(task, &lgp->res.seq_res))
goto out;
@@ -7067,11 +7067,27 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
case 0:
goto out;
case -NFS4ERR_LAYOUTTRYLATER:
+ /* NFS4ERR_RECALLCONFLICT is always a minimal delay (conflict with
+ * self)
+ * TODO: NFS4ERR_LAYOUTTRYLATER is a conflict with another client
+ * (or clients). What we should do is randomize a short delay like on a
+ * network broadcast burst, and raise the random max every failure.
+ * For now leave it stateless and do this polling.
+ */
case -NFS4ERR_RECALLCONFLICT:
timeo = rpc_get_timeout(task->tk_client);
giveup = lgp->args.timestamp + timeo;
- if (time_after(giveup, jiffies))
- task->tk_status = -NFS4ERR_DELAY;
+ if (time_after(giveup, jiffies)) {
+ /* Do a minimum delay, We are actually waiting for our
+ * own IO to finish (In most cases)
+ */
+ dprintk("%s: NFS4ERR_RECALLCONFLICT waiting\n",
+ __func__);
+ rpc_delay(task, NFS4_POLL_RETRY_MIN);
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+ goto out; /* Do not call nfs4_async_handle_error() */
+ }
break;
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
--
1.7.11.7
_______________________________________________
osd-dev mailing list
[email protected]
http://mailman.open-osd.org/mailman/listinfo/osd-dev