2012-04-13 11:25:17

by Jeff Layton

[permalink] [raw]
Subject: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

ESTALE errors are a source of pain for many users of NFS. Often they
occur when a lookup is satisfied out of the dcache, but the associated
inode has been removed on the server. In the case where a file is
renamed on top of another, the call would ordinarily have succeeded had
we issued an on the wire lookup instead. There are also other situations
that can lead to ESTALE errors, and the remedy is usually to retry the
lookup and call. If this is handled correctly in the kernel, then
userspace should only rarely see an ESTALE error.

This is an RFC patch showing the general approach to addressing this
problem that I'm suggesting. In order to solve this completely we'll
need to fix most of the path-based syscalls in a similar way.

This is not the first attempt at fixing this. Peter Staubach made a
similar attempt several years ago:

https://lkml.org/lkml/2008/3/10/267

This patch attempts to address the problem by having the VFS retry
the lookup (with LOOKUP_REVAL set) and the getattr call whenever the
getattr returns ESTALE.

A new fs flag is specified that allows filesystems to opt-in to
this behavior. That should address many of the concerns about whether
it's ok to retry indefinitely, but it does mean that we're only
going to do retry the call when the lookup succeeds.

However, basing this on a flag does give us a bit of a discrepancy in
retry behavior between the lookup and other operations. The path-walking
code already will reattempt the lookup once on an ESTALE error. The
proposed patch here however will make it retry indefinitely when the
getattr call returns ESTALE.

Since fixing this means touching a lot of code, it's pretty important
that we get the semantics correct here. I'm quite open to suggestion
about how we should make this behave as long as it resolves most of
these sorts of problems.

The main questions I have are:

1) should we retry these calls on all filesystems, or attempt to have
them "opt-in" in some fashion? This patch adds a flag for that, but
we could just treat all filesystems the same way.

2) How many times should we retry on an ESTALE error? Once?
Indefinitely? Some amount in between? Retrying once would probably
fix the bulk of the real world problems with this, but there will
still be cases where that's not sufficient.

3) Should we consider also changing the path-walking code to retry
more than once on an ESTALE error?

Once we nail down some acceptable semantics for this, I should be
able to make a more comprehensive patch that fixes this for all
path-based syscalls. I don't really want to dive into that work
however until we've come to some agreement about it.

Thanks to Michael Brantley for his initial work and patch for this
problem.

Reported-by: Michael Brantley <[email protected]>
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfs/super.c | 14 +++++++-------
fs/stat.c | 9 ++++++++-
include/linux/fs.h | 3 +++
3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 37412f7..b22aaba 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -296,7 +296,7 @@ static struct file_system_type nfs_fs_type = {
.name = "nfs",
.mount = nfs_fs_mount,
.kill_sb = nfs_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_RETRY_ESTALE,
};

struct file_system_type nfs_xdev_fs_type = {
@@ -304,7 +304,7 @@ struct file_system_type nfs_xdev_fs_type = {
.name = "nfs",
.mount = nfs_xdev_mount,
.kill_sb = nfs_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_RETRY_ESTALE,
};

static const struct super_operations nfs_sops = {
@@ -344,7 +344,7 @@ static struct file_system_type nfs4_fs_type = {
.name = "nfs4",
.mount = nfs4_mount,
.kill_sb = nfs4_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_RETRY_ESTALE,
};

static struct file_system_type nfs4_remote_fs_type = {
@@ -352,7 +352,7 @@ static struct file_system_type nfs4_remote_fs_type = {
.name = "nfs4",
.mount = nfs4_remote_mount,
.kill_sb = nfs4_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_RETRY_ESTALE,
};

struct file_system_type nfs4_xdev_fs_type = {
@@ -360,7 +360,7 @@ struct file_system_type nfs4_xdev_fs_type = {
.name = "nfs4",
.mount = nfs4_xdev_mount,
.kill_sb = nfs4_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_RETRY_ESTALE,
};

static struct file_system_type nfs4_remote_referral_fs_type = {
@@ -368,7 +368,7 @@ static struct file_system_type nfs4_remote_referral_fs_type = {
.name = "nfs4",
.mount = nfs4_remote_referral_mount,
.kill_sb = nfs4_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_RETRY_ESTALE,
};

struct file_system_type nfs4_referral_fs_type = {
@@ -376,7 +376,7 @@ struct file_system_type nfs4_referral_fs_type = {
.name = "nfs4",
.mount = nfs4_referral_mount,
.kill_sb = nfs4_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_RETRY_ESTALE,
};

static const struct super_operations nfs4_sops = {
diff --git a/fs/stat.c b/fs/stat.c
index c733dc5..02ce590 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -73,7 +73,8 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
{
struct path path;
int error = -EINVAL;
- int lookup_flags = 0;
+ unsigned int lookup_flags = 0;
+ bool should_retry;

if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH)) != 0)
@@ -84,12 +85,18 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
if (flag & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;

+retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
goto out;

error = vfs_getattr(path.mnt, path.dentry, stat);
+ should_retry = error == -ESTALE ? retry_estale(path.dentry) : false;
path_put(&path);
+ if (should_retry) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
return error;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8de6755..36fd4fa 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -181,11 +181,14 @@ struct inodes_stat_t {
#define FS_REQUIRES_DEV 1
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE 4
+#define FS_RETRY_ESTALE 8 /* retry operation on ESTALE error */
#define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move()
* during rename() internally.
*/

+#define retry_estale(dentry) (dentry->d_sb->s_type->fs_flags & FS_RETRY_ESTALE)
+
/*
* These are the fs-independent mount-flags: up to 32 flags are supported
*/
--
1.7.7.6



2012-04-16 11:36:19

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Sun, 15 Apr 2012 21:03:23 +0200
Bernd Schubert <[email protected]> wrote:

> On 04/13/2012 05:42 PM, Jeff Layton wrote:
> > (note: please don't trim the CC list!)
> >
> > Indefinitely does make some sense (as Peter articulated in his original
> > set). It's possible you could race several times in a row, or a server
> > misconfiguration or something has happened and you have a transient
> > error that will eventually recover. His assertion was that any limit on
> > the number of retries is by definition wrong. For NFS, a fatal signal
> > ought to interrupt things as well, so retrying indefinitely has some
> > appeal there.
> >
> > OTOH, we do have to contend with filesystems that might return ESTALE
> > persistently for other reasons and that might not respond to signals.
> > Miklos pointed out that some FUSE fs' do this in his review of Peter's
> > set.
> >
> > As a purely defensive coding measure, limiting the number of retries to
> > something finite makes sense. If we're going to do that though, I'd
> > probably recommend that we set the number of retries be something
> > higher just so that this is more resilient in the face of multiple
> > races. Those other fs' might "spin" a bit in that case but it is an
> > error condition and IMO resiliency trumps performance -- at least in
> this case.
>
> I am definitely voting against an infinite number of retries. I'm
> working on FhGFS, which supports distributed meta data servers. So when
> a file is moved around between directories, its file handle, which
> contains the meta-data target id might become invalid. As NFSv3 is
> stateless we cannot inform the client about that and must return ESTALE
> then. NFSv4 is better, but I'm not sure how well invalidating a file
> handle works. So retrying once on ESTALE might be a good idea, but
> retrying forever is not.

It's important to note that I'm only proposing to wrap syscalls this
way that take a pathname argument. We can't do anything about those
that don't since at that point we have no way to retry the lookup.

So, I'm not sure this patch would affect the case you're concerned
about one way or another. If you move the file to a different
directory, then it's pathname would also change, and at that point
you'd end up with an ENOENT error or something on the next retry.

If the file was open and you were (for instance) reading or writing to
it from a client when you moved it, then we can't retry the lookup at
that point. The open is long since done and the pathname is now gone.
You'll get an ESTALE back in userspace regardless.

> Also, what about asymmetric HA servers? I believe to remember that also
> resulted in ESTALE. So for example server1 exports /home and /scratch,
> but on failure server2 can only take over /home and denies access to
> /scratch.
>

That sounds like a broken cluster configuration. Still...

Presumably at some point in the future, a sysadmin would intervene and
fix the situation such that /scratch is available again. Is it better
to return an error to the application at that point, or simply allow it
to keep retrying until the problem has been fixed?

The person with the long running job that's doing operations
in /scratch would probably prefer the latter. If not, then they could
always send the program a fatal signal to stop it altogether.

--
Jeff Layton <[email protected]>

2012-04-22 05:40:58

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Fri, Apr 20, 2012 at 11:13 PM, Jeff Layton <[email protected]> wrote:
> On Fri, 20 Apr 2012 15:37:26 -0500
> Malahal Naineni <[email protected]> wrote:
>
>> Steve Dickson [[email protected]] wrote:
>> > > 2) if we assume that it is fairly representative of one, how can we
>> > > achieve retrying indefinitely with NFS, or at least some large finite
>> > > amount?
>> > The amount of looping would be peer speculation. If the problem can
>> > not be handled by one simple retry I would say we simply pass the
>> > error up to the app... Its an application issue...
>>
>> As someone said, ESTALE is an incorrect errno for a path based call.
>> How about turning ESTALE into ENOENT after a retry or few retries?
>>
>
> It's not really the same thing. One could envision an application
> that's repeatedly renaming a new file on top of another one. The file
> is never missing from the namespace of the server, but you could still
> end up getting an ESTALE.
>
> That would break other atomicity guarantees in an even worse way, IMO...

For directory operations ESTALE *is* equivalent to ENOENT if already
retrying with LOOKUP_REVAL. Think about it. Atomic replacement by
another directory with rename(2) is not an excuse here actually.
Local filesystems too can end up with IS_DEAD directory after lookup
in that case.

For non directories we basically have getattr and setattr. NFSv4 can
handle both without retries if we supply the name instead of the
handle (i.e. i_op->getattr_by_name, i_op->setattr_by_name). Other
protocols can do whatever they want, exponential backoff with limited
number of retries, whatever.

No looping required in the VFS.

Thanks,
Miklos

2012-04-17 16:00:44

by Steve Dickson

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call



On 04/17/2012 11:02 AM, Jeff Layton wrote:
> On Tue, 17 Apr 2012 16:27:16 +0200
> Miklos Szeredi <[email protected]> wrote:
>
>> Steve Dickson <[email protected]> writes:
>>
>>> True, but even so... Giving file systems an opt-out option with the
>>> default being out, maybe still have some merit... Making file systems
>>> enable this new type of functionality would cut down on any of the
>>> "surprise" that might occur with this redo ;-)
>>
>> I've been arguing for something slightly different for quite some time:
>> I never liked errno values which have side effects in the kernel yet
>> might be visible to userspace.
>>
>> So why not introduce ERETRYSTALE, a *kernel internal* errno value that
>> userspace will never see and filesystems never accidentally set. The
>> VFS can turn this into ESTALE if it doesn't retry for some reason
>> (e.g. already retried).
>>
>
> That's possible but it's certainly a lot more invasive. It's also far
> more difficult for filesystems to opt-in to this sort of behavior.
>
> All of the places that call vfs_getattr, for instance will need to be
> fixed up to deal with that sort of error. That will also make it messy
> to do this in any sort of piecemeal fashion. We can't (for instance)
> convert NFSERR_STALE to -ESTALERETRY universally. We'll need to take
> great care only to return that to codepaths that are equipped to deal
> with it.
>
> Personally, I'd prefer not to foist so much code churn on any filesystem
> that might want to do this sort of retry, but I'll live with it if
> that's the consensus opinion...
>
I agree with Jeff.... Introducing a new errno is a bit of overkill.. IMHO...

steved.

2012-04-24 15:54:38

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

Jeff Layton <[email protected]> writes:

> On Mon, 23 Apr 2012 16:38:00 -0400
> Peter Staubach <[email protected]> wrote:
>
>> I don't really like the idea of introducing another errno as well. It seems like too much complexity and represents complexity that no one has really justified needing.
>>
>
> I tend to agree here. Miklos, can you elaborate a bit on what fuse
> filesystems you're particularly concerned about here? Which ones return
> ESTALE and under what conditions. Maybe we can try to tailor this
> solution to avoid the complexity without impacting them.

It is not just fuse I'm concerned about. Grep for -ESTALE, there are
about 120 hits about 20 of which come from NFS. There's no guarantee
that any of those ESTALE errors will go away on retry, which for an
unlimited retry means a hung OS. If you limit the number of retries
then in the best case it's just lots of wasted CPU cycles.

And an audit would still not ensure safety against future additions of
ESTALE.

And a simple audit won't find things like fuse, where the error comes
from outside the kernel. Fixing that is not trivial either. Turning
ESTALE into some other error prevents looping but breaks the return
value.

Thanks,
Miklos

2012-04-17 13:10:46

by Steve Dickson

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call



On 04/16/2012 07:05 PM, Jeff Layton wrote:
> On Mon, 16 Apr 2012 20:25:06 +0000
> "Myklebust, Trond" <[email protected]> wrote:
>
>> On Mon, 2012-04-16 at 15:43 -0400, Jeff Layton wrote:
>>> On Mon, 16 Apr 2012 19:33:05 +0000
>>> "Myklebust, Trond" <[email protected]> wrote:
>>>
>>>> On Mon, 2012-04-16 at 13:46 -0400, Jeff Layton wrote:
>>>>> The question about looping indefinitely really comes down to:
>>>>>
>>>>> 1) is a persistent ESTALE in conjunction with a successful lookup a
>>>>> situation that we expect to be temporary. i.e. will the admin at some
>>>>> point be able to do something about it? If not, then there's no point
>>>>> in continuing to retry. Again, this is a situation that *really* should
>>>>> not happen if the filesystem is doing the right thing.
>>>>>
>>>>> 2) If the admin can't do anything about it, is it reasonable to expect
>>>>> that users can send a fatal signal to hung applications if this
>>>>> situation occurs.
>>>>>
>>>>> We expect that that's ok in other situations to resolve hung
>>>>> applications, so I'm not sure I understand why it wouldn't be
>>>>> acceptable here...
>>>>
>>>> There are definitely potentially persistent pathological situations that
>>>> the filesystem can't do anything about. If the point of origin for your
>>>> pathname (for instance your current directory in the case of a relative
>>>> pathname) is stale, then no amount of looping is going to help you to
>>>> recover.
>>>>
>>>
>>> Ok -- Peter pretty much said something similar. Retrying indefnitely
>>> when the lookup returns ESTALE probably won't help. I'm ok with
>>> basically letting the VFS continue to do what it does there already. If
>>> it gets an ESTALE, it tries again with LOOKUP_REVAL set and then gives
>>> up if that doesn't work.
>>>
>>> If however, the operation itself keeps returning ESTALE, are we OK to
>>> retry indefinitely assuming that we'll break out of the loop on fatal
>>> signals?
>>>
>>> For example, something like the v2 patch I sent a little while ago?
>>
>>
>> Won't something like fstatat(AT_FDCWD, "", &stat, AT_EMPTY_PATH) risk
>> looping forever there, or am I missing something?
>>
>
> To make sure I understand, that should be "shortcut" for a lookup of the
> cwd?
>
> So I guess the concern is that you'd do the above and get a successful
> lookup since you're just going to get back the cwd. At that point,
> you'd attempt the getattr and get ESTALE back. Then, you'd redo the
> lookup with LOOKUP_REVAL set -- but since we're operating on the
> cwd, we don't have a way to redo the lookup since we don't have a
> pathname that we can look up again...
>
> So yeah, I guess if you're sitting in a stale directory, something like
> that could loop eternally.
>
> Do you think the proposed check for fatal_signal_pending is enough to
> mitigate such a problem? Or do we need to limit the number of retries
> to address those sorts of loops?
>
I think your version 2 patch is definitely more clever than v1, but I'm
thinking Trond's point is no matter what type of looping is done or how
long its done its going to fail... So why loop at all?

Again I think the safest and simply way, from the VFS point of view,
is do the looping once if the file system as registered for it
through the fs_flags. This will catch %99 of the issues, failing
on the %1 of the corner cases...

steved.





2012-04-23 17:50:28

by Peter Staubach

[permalink] [raw]
Subject: RE: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

The situation of stale current directories and/or stale root directories for mounted file systems is handled. In these cases, recovery is not possible and so, are not looped for.

Thanx...

ps


-----Original Message-----
From: Miklos Szeredi [mailto:[email protected]]
Sent: Monday, April 23, 2012 11:24 AM
To: Chuck Lever
Cc: J. Bruce Fields; Jeff Layton; Malahal Naineni; Steve Dickson; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; Peter Staubach; [email protected]; [email protected]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

Chuck Lever <[email protected]> writes:

> On Apr 23, 2012, at 10:51 AM, Miklos Szeredi wrote:
>
>> "J. Bruce Fields" <[email protected]> writes:
>>
>>>
>>> I also wonder whether it would be making too many assumptions about
>>> the server or filesystem: just because ordinary posix interfaces
>>> don't allow atomic replacement of a whole directory tree doesn't
>>> mean the server might not have some way to do it.
>>
>> Exactly because posix limits the atomic replacement to empty
>> directories is that this feature is not useful and is why linux can
>> get away with the dead directory behavior in this case. And thinking
>> about fixing this in NFS is completely pointless since no one will
>> rely on the atomic replacement behavior. Fixing local filesystems is
>> also pointless for the same reason.
>>
>> Atomic replacement of whole directory trees would indeed be more
>> useful, but it's highly unlikely to be used anywhere since
>> applications relying on this feature would be limited to special filesystems that allow this.
>
> The cases I can think of have to do with file system restore, file
> system and block device snapshots, and so on. This type of use case
> may not practical on today's Linux server, but they are a reality for
> anyone using high-end NFS storage.

Problem with this is: if some directory file handles are stale (e.g. due to directories being removed, recreated, moved around) and on the client there are cwd-s referring to these handles, then they are going to become stale no matter what you do, even if the same path does exist after the restore on the server.

Thanks,
Miklos

2012-04-23 13:13:25

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Mon, 23 Apr 2012 09:00:09 -0400
"J. Bruce Fields" <[email protected]> wrote:

> On Mon, Apr 23, 2012 at 08:00:12AM -0400, Jeff Layton wrote:
> > On Sun, 22 Apr 2012 07:40:57 +0200
> > Miklos Szeredi <[email protected]> wrote:
> >
> > > On Fri, Apr 20, 2012 at 11:13 PM, Jeff Layton <[email protected]> wrote:
> > > > On Fri, 20 Apr 2012 15:37:26 -0500
> > > > Malahal Naineni <[email protected]> wrote:
> > > >
> > > >> Steve Dickson [[email protected]] wrote:
> > > >> > > 2) if we assume that it is fairly representative of one, how can we
> > > >> > > achieve retrying indefinitely with NFS, or at least some large finite
> > > >> > > amount?
> > > >> > The amount of looping would be peer speculation. If the problem can
> > > >> > not be handled by one simple retry I would say we simply pass the
> > > >> > error up to the app... Its an application issue...
> > > >>
> > > >> As someone said, ESTALE is an incorrect errno for a path based call.
> > > >> How about turning ESTALE into ENOENT after a retry or few retries?
> > > >>
> > > >
> > > > It's not really the same thing. One could envision an application
> > > > that's repeatedly renaming a new file on top of another one. The file
> > > > is never missing from the namespace of the server, but you could still
> > > > end up getting an ESTALE.
> > > >
> > > > That would break other atomicity guarantees in an even worse way, IMO...
> > >
> > > For directory operations ESTALE *is* equivalent to ENOENT if already
> > > retrying with LOOKUP_REVAL. Think about it. Atomic replacement by
> > > another directory with rename(2) is not an excuse here actually.
> > > Local filesystems too can end up with IS_DEAD directory after lookup
> > > in that case.
> > >
> >
> > Doesn't that violate POSIX? rename(2) is supposed to be atomic, and I
> > can't see where there's any exception for that for directories.
>
> Hm, but that only allows atomic replacement of the last component of a
> path.
>
> Suppose you're looking up a path, you've so far reached intermediate
> directory "D", and the next step of the lookup (of some entry in D)
> returns ESTALE. Then either:
>
> - D has since been unlinked, and ENOENT is obviously right.
> - D was unlinked and then replaced by something else, in which
> case there was still a moment when ENOENT was correct.
> - D was replaced atomically by a rename. But for the rename to
> work it must have been replacing an empty directory, so there
> was still a moment when ENOENT would have been correct.

I don't think so...D should always exist in the namespace, so ENOENT
would not be correct. Just because it was empty doesn't mean that it
didn't exist...

> (Exception: if D was actually a regular file or some other
> non-directory object, then ENOTDIR would be the right error:
> but if you're able to get at least object type atomically with
> a lookup, then you should have noticed this already on lookup
> of D.)
>
> I think that's what Miklos meant?
>
> --b.

Here's an example -- suppose we have two directories: /foo
and /bar. /bar is empty. We call:

rename("/foo","/bar");

...and at the same time, someone is calling:

stat("/bar");

...the calls race and in this condition the stat() gets ESTALE back
-- /bar got replaced after we did the lookup.

According to POSIX, the name "/bar" should never be absent from the
namespace in this situation, so I'm not sure I understand why returning
ENOENT here would be acceptable.

--
Jeff Layton <[email protected]>

2012-04-17 15:02:00

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Tue, 17 Apr 2012 16:27:16 +0200
Miklos Szeredi <[email protected]> wrote:

> Steve Dickson <[email protected]> writes:
>
> > True, but even so... Giving file systems an opt-out option with the
> > default being out, maybe still have some merit... Making file systems
> > enable this new type of functionality would cut down on any of the
> > "surprise" that might occur with this redo ;-)
>
> I've been arguing for something slightly different for quite some time:
> I never liked errno values which have side effects in the kernel yet
> might be visible to userspace.
>
> So why not introduce ERETRYSTALE, a *kernel internal* errno value that
> userspace will never see and filesystems never accidentally set. The
> VFS can turn this into ESTALE if it doesn't retry for some reason
> (e.g. already retried).
>

That's possible but it's certainly a lot more invasive. It's also far
more difficult for filesystems to opt-in to this sort of behavior.

All of the places that call vfs_getattr, for instance will need to be
fixed up to deal with that sort of error. That will also make it messy
to do this in any sort of piecemeal fashion. We can't (for instance)
convert NFSERR_STALE to -ESTALERETRY universally. We'll need to take
great care only to return that to codepaths that are equipped to deal
with it.

Personally, I'd prefer not to foist so much code churn on any filesystem
that might want to do this sort of retry, but I'll live with it if
that's the consensus opinion...

--
Jeff Layton <[email protected]>

2012-04-23 17:43:26

by Peter Staubach

[permalink] [raw]
Subject: RE: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

The test program runs and expects many, many ENOENTS to be returned. It just doesn't expect ESTALE to be returned. It doesn't see ESTALE from local file systems.

To answer a question asked earlier -- the test program does not mimic any particular application behavior, except in the extreme. It is designed to create as stressful a situation as might be ever seen.

Has anyone explained why the full solution won't work from a technical viewpoint?

Thanx...

ps


-----Original Message-----
From: Steve Dickson [mailto:[email protected]]
Sent: Monday, April 23, 2012 10:55 AM
To: Jeff Layton
Cc: [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; Peter Staubach; [email protected]; [email protected]; [email protected]; [email protected]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call



On 04/20/2012 05:13 PM, Jeff Layton wrote:
> On Fri, 20 Apr 2012 16:18:37 -0400
> Steve Dickson <[email protected]> wrote:
>
>> On 04/20/2012 10:40 AM, Jeff Layton wrote:
>>> I guess the questions at this point is:
>>>
>>> 1) How representative is Peter's mkdir_test() of a real-world workload?
>> Reading your email I had to wonder the same thing... What application
>> removes hierarchy of directories in a loop from two different clients?
>> I would suspect not many, if any... esp over NFS...
>>
>
> Peter's test just happens to demonstrate the problem well, but one
> could envision someone removing a heirarchy of directories on the
> server while we're trying to do other operations in it. At that point,
> we can easily end up hitting an ESTALE twice while doing the lookup
> and returning ESTALE back to userspace.
Just curious, what happens when you run Peter's mkdir_test() on a local file system? Any errors returned?

I would think removing hierarchy of directories while they are being accessed has to even cause local fs some type of havoc

>
>>>
>>> 2) if we assume that it is fairly representative of one, how can we
>>> achieve retrying indefinitely with NFS, or at least some large
>>> finite amount?
>> The amount of looping would be peer speculation. If the problem can
>> not be handled by one simple retry I would say we simply pass the
>> error up to the app... Its an application issue...
>>
>
> It's not an application issue. The application just asked the kernel
> to do an operation on a pathname. The only reason you're getting an
> ESTALE back in this situation is a shortcoming of the implementation.
>
> We passed it a pathname after all, not a filehandle. ESTALE really has
> no place as a return code in that situation...
We'll have to agree to disagree... I think any application that is removing hierarchies of file and directory w/out taking any precautionary locking is a shortcoming of the application implementation.

>
>>>
>>> I have my doubts as to whether it would really be as big a problem
>>> for other filesystems as Miklos and others have asserted, but I'll
>>> take their word for it at the moment. What's the best way to contain
>>> this behavior to just those filesystems that want to retry
>>> indefinitely when they get an ESTALE? Would we need to go with an
>>> entirely new ESTALERETRY after all?
>>>
>> Introducing a new errno to handle this problem would be overkill IMHO...
>>
>> If we have to go to the looping approach, I would strong suggest we
>> make the file systems register for this type of behavior...
>>
>
> Returning ESTALERETRY would be registering for it in a way and it is
> somewhat cleaner than having to go all the way back up to the fstype
> to figure out whether you want to retry it or not.
How would legacy apps handle this new errno, esp if they have logic to take care of ESTALE errors?

steved.

2012-04-17 14:27:04

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

Steve Dickson <[email protected]> writes:

> True, but even so... Giving file systems an opt-out option with the
> default being out, maybe still have some merit... Making file systems
> enable this new type of functionality would cut down on any of the
> "surprise" that might occur with this redo ;-)

I've been arguing for something slightly different for quite some time:
I never liked errno values which have side effects in the kernel yet
might be visible to userspace.

So why not introduce ERETRYSTALE, a *kernel internal* errno value that
userspace will never see and filesystems never accidentally set. The
VFS can turn this into ESTALE if it doesn't retry for some reason
(e.g. already retried).

Thanks,
Miklos

2012-04-18 11:52:12

by Jeff Layton

[permalink] [raw]
Subject: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

ESTALE errors are a source of pain for many users of NFS. Usually they
occur when a file is removed from the server after a successful lookup
against it.

Luckily, the remedy in these cases is usually simple. We should just
redo the lookup, forcing revalidations all the way in and then retry the
call. We of course cannot do this for syscalls that do not involve a
path, but for path-based syscalls we can and should attempt to recover
from an ESTALE.

This patch implements this by having the VFS reattempt the lookup (with
LOOKUP_REVAL set) and call exactly once when it would ordinarily return
ESTALE. This should catch the bulk of these cases under normal usage,
without unduly inconveniencing other filesystems that return ESTALE on
path-based syscalls.

Note that it's possible to hit this race more than once, but a single
retry should catch the bulk of these cases under normal circumstances.

This patch is just an example. We'll alter most path-based syscalls in a
similar fashion to fix this correctly. At this point, I'm just trying to
ensure that the retry semantics are acceptable before I being that work.

Does anyone have strong objections to this patch? I'm aware that the
retry mechanism is not as robust as many (e.g. Peter) would like, but it
should at least improve the current situation.

If no one has a strong objection, then I'll start going through and
adding similar code to the other syscalls. And we can hopefully we can
get at least some of them in for 3.5.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/stat.c | 9 ++++++++-
1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/fs/stat.c b/fs/stat.c
index c733dc5..0ee9cb4 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -73,7 +73,8 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
{
struct path path;
int error = -EINVAL;
- int lookup_flags = 0;
+ bool retried = false;
+ unsigned int lookup_flags = 0;

if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH)) != 0)
@@ -84,12 +85,18 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
if (flag & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;

+retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
goto out;

error = vfs_getattr(path.mnt, path.dentry, stat);
path_put(&path);
+ if (error == -ESTALE && !retried) {
+ retried = true;
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
return error;
}
--
1.7.7.6


2012-04-23 11:21:27

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Sun, 22 Apr 2012 09:46:32 +0530
Ric Wheeler <[email protected]> wrote:

> On 04/20/2012 08:10 PM, Jeff Layton wrote:
> > On Wed, 18 Apr 2012 07:52:07 -0400
> > Jeff Layton<[email protected]> wrote:
> >
> >> ESTALE errors are a source of pain for many users of NFS. Usually they
> >> occur when a file is removed from the server after a successful lookup
> >> against it.
> >>
> >> Luckily, the remedy in these cases is usually simple. We should just
> >> redo the lookup, forcing revalidations all the way in and then retry the
> >> call. We of course cannot do this for syscalls that do not involve a
> >> path, but for path-based syscalls we can and should attempt to recover
> >> from an ESTALE.
> >>
> >> This patch implements this by having the VFS reattempt the lookup (with
> >> LOOKUP_REVAL set) and call exactly once when it would ordinarily return
> >> ESTALE. This should catch the bulk of these cases under normal usage,
> >> without unduly inconveniencing other filesystems that return ESTALE on
> >> path-based syscalls.
> >>
> >> Note that it's possible to hit this race more than once, but a single
> >> retry should catch the bulk of these cases under normal circumstances.
> >>
> >> This patch is just an example. We'll alter most path-based syscalls in a
> >> similar fashion to fix this correctly. At this point, I'm just trying to
> >> ensure that the retry semantics are acceptable before I being that work.
> >>
> >> Does anyone have strong objections to this patch? I'm aware that the
> >> retry mechanism is not as robust as many (e.g. Peter) would like, but it
> >> should at least improve the current situation.
> >>
> >> If no one has a strong objection, then I'll start going through and
> >> adding similar code to the other syscalls. And we can hopefully we can
> >> get at least some of them in for 3.5.
> >>
> >> Signed-off-by: Jeff Layton<[email protected]>
> >> ---
> >> fs/stat.c | 9 ++++++++-
> >> 1 files changed, 8 insertions(+), 1 deletions(-)
> >>
> >> diff --git a/fs/stat.c b/fs/stat.c
> >> index c733dc5..0ee9cb4 100644
> >> --- a/fs/stat.c
> >> +++ b/fs/stat.c
> >> @@ -73,7 +73,8 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
> >> {
> >> struct path path;
> >> int error = -EINVAL;
> >> - int lookup_flags = 0;
> >> + bool retried = false;
> >> + unsigned int lookup_flags = 0;
> >>
> >> if ((flag& ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
> >> AT_EMPTY_PATH)) != 0)
> >> @@ -84,12 +85,18 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
> >> if (flag& AT_EMPTY_PATH)
> >> lookup_flags |= LOOKUP_EMPTY;
> >>
> >> +retry:
> >> error = user_path_at(dfd, filename, lookup_flags,&path);
> >> if (error)
> >> goto out;
> >>
> >> error = vfs_getattr(path.mnt, path.dentry, stat);
> >> path_put(&path);
> >> + if (error == -ESTALE&& !retried) {
> >> + retried = true;
> >> + lookup_flags |= LOOKUP_REVAL;
> >> + goto retry;
> >> + }
> >> out:
> >> return error;
> >> }
> > Apologies for replying to myself here. Just to beat on the deceased
> > equine a little longer, I should note that the above approach does
> > *not* fix Peter's reproducer in his original email. It fails rather
> > quickly when run simultaneously on the client and server.
> >
> > At least one of the tests in it creates and removes a hierarchy of
> > directories in a loop. During that, the lookup from the client can
> > easily fail more than once with ESTALE. Since we give up after a single
> > retry, that causes the call to return ESTALE.
> >
> > While testing this approach with mkdir and fstatat, I modified the
> > patch to retry multiple times, also retry when the lookup thows ESTALE
> > and to throw a printk when the number of retries was> 1 with the
> > number of retries that the call did and the eventual error code.
> >
> > With Peter's (admittedly synthetic) test, we can get an answer of sorts
> > to Trond's question from earlier in the thread as to how many retries
> > is "enough":
> >
> > [ 45.023665] sys_mkdirat: retries=33 error=-2
> > [ 47.889300] sys_mkdirat: retries=51 error=-2
> > [ 49.172746] sys_mkdirat: retries=27 error=-2
> > [ 52.325723] sys_mkdirat: retries=10 error=-2
> > [ 58.082576] sys_mkdirat: retries=33 error=-2
> > [ 59.810461] sys_mkdirat: retries=5 error=-2
> > [ 63.387914] sys_mkdirat: retries=14 error=-2
> > [ 63.630785] sys_mkdirat: retries=4 error=-2
> > [ 68.268903] sys_mkdirat: retries=6 error=-2
> > [ 71.124173] sys_mkdirat: retries=99 error=-2
> > [ 75.657649] sys_mkdirat: retries=123 error=-2
> > [ 76.903814] sys_mkdirat: retries=26 error=-2
> > [ 82.009463] sys_mkdirat: retries=30 error=-2
> > [ 84.807731] sys_mkdirat: retries=67 error=-2
> > [ 89.825529] sys_mkdirat: retries=166 error=-2
> > [ 91.599104] sys_mkdirat: retries=8 error=-2
> > [ 95.621855] sys_mkdirat: retries=44 error=-2
> > [ 98.164588] sys_mkdirat: retries=61 error=-2
> > [ 99.783347] sys_mkdirat: retries=11 error=-2
> > [ 105.593980] sys_mkdirat: retries=104 error=-2
> > [ 110.348861] sys_mkdirat: retries=8 error=-2
> > [ 112.087966] sys_mkdirat: retries=46 error=-2
> > [ 117.613316] sys_mkdirat: retries=90 error=-2
> > [ 120.117550] sys_mkdirat: retries=2 error=-2
> > [ 122.624330] sys_mkdirat: retries=15 error=-2
> >
> > So, now I'm having buyers remorse of sorts about proposing to just
> > retry once as that may not be strong enough to fix some of the cases
> > we're interested in.
> >
> > I guess the questions at this point is:
> >
> > 1) How representative is Peter's mkdir_test() of a real-world workload?
> >
> > 2) if we assume that it is fairly representative of one, how can we
> > achieve retrying indefinitely with NFS, or at least some large finite
> > amount?
> >
> > I have my doubts as to whether it would really be as big a problem for
> > other filesystems as Miklos and others have asserted, but I'll take
> > their word for it at the moment. What's the best way to contain this
> > behavior to just those filesystems that want to retry indefinitely when
> > they get an ESTALE? Would we need to go with an entirely new
> > ESTALERETRY after all?
> >
>
> Maybe we should have a default of a single loop and a tunable to allow clients
> to crank it up?
>

While I generally don't like adding tunables, this seems like it might
be a reasonable place for one. There's been a lot of debate about what
the correct behavior should be on a retry.. A tunable would allow us to
experiment with different values.

That said, I'm warming up to Miklos' suggestion about adding an
ESTALERETRY error code. If we go that route then it's reasonably simple
to retry indefinitely on that return from the lower fs.

--
Jeff Layton <[email protected]>

2012-04-16 19:33:22

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

T24gTW9uLCAyMDEyLTA0LTE2IGF0IDEzOjQ2IC0wNDAwLCBKZWZmIExheXRvbiB3cm90ZToNCj4g
VGhlIHF1ZXN0aW9uIGFib3V0IGxvb3BpbmcgaW5kZWZpbml0ZWx5IHJlYWxseSBjb21lcyBkb3du
IHRvOg0KPiANCj4gMSkgaXMgYSBwZXJzaXN0ZW50IEVTVEFMRSBpbiBjb25qdW5jdGlvbiB3aXRo
IGEgc3VjY2Vzc2Z1bCBsb29rdXAgYQ0KPiBzaXR1YXRpb24gdGhhdCB3ZSBleHBlY3QgdG8gYmUg
dGVtcG9yYXJ5LiBpLmUuIHdpbGwgdGhlIGFkbWluIGF0IHNvbWUNCj4gcG9pbnQgYmUgYWJsZSB0
byBkbyBzb21ldGhpbmcgYWJvdXQgaXQ/IElmIG5vdCwgdGhlbiB0aGVyZSdzIG5vIHBvaW50DQo+
IGluIGNvbnRpbnVpbmcgdG8gcmV0cnkuIEFnYWluLCB0aGlzIGlzIGEgc2l0dWF0aW9uIHRoYXQg
KnJlYWxseSogc2hvdWxkDQo+IG5vdCBoYXBwZW4gaWYgdGhlIGZpbGVzeXN0ZW0gaXMgZG9pbmcg
dGhlIHJpZ2h0IHRoaW5nLg0KPiANCj4gMikgSWYgdGhlIGFkbWluIGNhbid0IGRvIGFueXRoaW5n
IGFib3V0IGl0LCBpcyBpdCByZWFzb25hYmxlIHRvIGV4cGVjdA0KPiB0aGF0IHVzZXJzIGNhbiBz
ZW5kIGEgZmF0YWwgc2lnbmFsIHRvIGh1bmcgYXBwbGljYXRpb25zIGlmIHRoaXMNCj4gc2l0dWF0
aW9uIG9jY3Vycy4NCj4gDQo+IFdlIGV4cGVjdCB0aGF0IHRoYXQncyBvayBpbiBvdGhlciBzaXR1
YXRpb25zIHRvIHJlc29sdmUgaHVuZw0KPiBhcHBsaWNhdGlvbnMsIHNvIEknbSBub3Qgc3VyZSBJ
IHVuZGVyc3RhbmQgd2h5IGl0IHdvdWxkbid0IGJlDQo+IGFjY2VwdGFibGUgaGVyZS4uLg0KDQpU
aGVyZSBhcmUgZGVmaW5pdGVseSBwb3RlbnRpYWxseSBwZXJzaXN0ZW50IHBhdGhvbG9naWNhbCBz
aXR1YXRpb25zIHRoYXQNCnRoZSBmaWxlc3lzdGVtIGNhbid0IGRvIGFueXRoaW5nIGFib3V0LiBJ
ZiB0aGUgcG9pbnQgb2Ygb3JpZ2luIGZvciB5b3VyDQpwYXRobmFtZSAoZm9yIGluc3RhbmNlIHlv
dXIgY3VycmVudCBkaXJlY3RvcnkgaW4gdGhlIGNhc2Ugb2YgYSByZWxhdGl2ZQ0KcGF0aG5hbWUp
IGlzIHN0YWxlLCB0aGVuIG5vIGFtb3VudCBvZiBsb29waW5nIGlzIGdvaW5nIHRvIGhlbHAgeW91
IHRvDQpyZWNvdmVyLg0KDQotLSANClRyb25kIE15a2xlYnVzdA0KTGludXggTkZTIGNsaWVudCBt
YWludGFpbmVyDQoNCk5ldEFwcA0KVHJvbmQuTXlrbGVidXN0QG5ldGFwcC5jb20NCnd3dy5uZXRh
cHAuY29tDQoNCg==

2012-04-17 14:04:36

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

T24gVHVlLCAyMDEyLTA0LTE3IGF0IDA5OjMyIC0wNDAwLCBKZWZmIExheXRvbiB3cm90ZToNCj4g
T24gVHVlLCAxNyBBcHIgMjAxMiAxNToxMjoyMCArMDIwMA0KPiBNaWtsb3MgU3plcmVkaSA8bWlr
bG9zQHN6ZXJlZGkuaHU+IHdyb3RlOg0KDQo+IFRvIGRvIHRoYXQgd291bGQgcmVxdWlyZSBwcm90
b2NvbCBzdXBwb3J0IHRoYXQgd2Ugc2ltcGx5IGRvbid0IGhhdmUuIFdlDQo+IGRvbid0IGhhdmUg
YSB3YXkgdG8gKGZvciBpbnN0YW5jZSkgc2F5IHZpYSBORlMgImdpdmUgbWUgdGhlIGF0dHJpYnV0
ZXMNCj4gZm9yIHRoaXMgZmlsZW5hbWUiLiBXZWxsLCBhdCBsZWFzdCBub3QgZm9yIE5GU3YzLi4u
DQoNCldoYXQncyB3cm9uZyB3aXRoIExPT0tVUD8NCg0KPiBXaXRoIHY0IHlvdSBjb3VsZCB0aGVv
cmV0aWNhbGx5IGNvbnN0cnVjdCBhIGNvbXBvdW5kIHRoYXQgZG9lcyB0aGF0LA0KPiBidXQgeW91
J2QgaGF2ZSB0byBhc3N1bWUgdGhhdCB0aGUgc2VydmVyIHdvbid0IHJlbGVhc2UgdGhlIHJlZmVy
ZW5jZSB0bw0KPiB0aGUgaW5vZGUgbWlkd2F5IHRocm91Z2ggdGhlIGNvbXBvdW5kLiBUaGF0J3Mg
YSByZWFzb25hYmx5IHNhZmUNCj4gYXNzdW1wdGlvbi4NCg0KQWN0dWFsbHksIE5GU3Y0IGlzIHRo
ZSBvbmUgdGhhdCBoYXMgdGhlIHByb2JsZW06IHRoZXJlIGFyZSBubyBhdG9taWNpdHkNCmd1YXJh
bnRlZXMgd2l0aGluIGNvbXBvdW5kcywgc28geW91IGNvdWxkIHRoZW9yZXRpY2FsbHkgZ2V0IGFu
IEVTVEFMRSBpbg0KdGhlIEdFVEFUVFIgcGFydCBvZiBvdXIgbG9va3VwIGNvbXBvdW5kLg0KDQot
LSANClRyb25kIE15a2xlYnVzdA0KTGludXggTkZTIGNsaWVudCBtYWludGFpbmVyDQoNCk5ldEFw
cA0KVHJvbmQuTXlrbGVidXN0QG5ldGFwcC5jb20NCnd3dy5uZXRhcHAuY29tDQoNCg==

2012-04-14 01:14:57

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Fri, 2012-04-13 at 13:34 -0400, Peter Staubach wrote:
> I still think that returning ESTALE to the application is just exposing a short coming in the implementation. From a path based system like stat(), the application should see either ENOENT or some sort valid return.
>
> I also look at the looping from the other side. While possible, of course, I'd like to see someone construct a situation where it really happens. By this, I don't mean a thought experiment, but a real running situation.
>
> We already have evidence, in the form of the Solaris NFS client, that infinite looping does not happen in nature.

Could we turn that statistic around? Has the Solaris client ever seen a
loop of more than, say, 5 or 6 retries?

I'm just trying to limit the scope of the problem...

Cheers
Trond


2012-04-23 15:28:00

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

Jeff Layton <[email protected]> writes:

>
> Ok, but again, that only applies to the lookup. It has no bearing on
> the subsequent operation. For instance, if we're doing:
>
> rename("/foo", "/bar");
>
> ...and another client is simultaneously doing:
>
> creat("/bar/baz", 0600);
>
> ...and we get back ESTALE from the server on the create because the
> "old" /bar got replaced after the lookup of it. Then it seems like
> returning -ENOENT would not be correct since there was never a time
> where /bar didn't exist...

It may not be "correct" according to some standard. But it's what Linux
does since day one on *all* filesystems. And probably other OS's that
have remotely scalable lookup routines.

Thanks,
Miklos

2012-04-23 15:32:48

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Mon, 23 Apr 2012 10:55:24 -0400
Steve Dickson <[email protected]> wrote:

>
>
> On 04/20/2012 05:13 PM, Jeff Layton wrote:
> > On Fri, 20 Apr 2012 16:18:37 -0400
> > Steve Dickson <[email protected]> wrote:
> >
> >> On 04/20/2012 10:40 AM, Jeff Layton wrote:
> >>> I guess the questions at this point is:
> >>>
> >>> 1) How representative is Peter's mkdir_test() of a real-world workload?
> >> Reading your email I had to wonder the same thing... What application
> >> removes hierarchy of directories in a loop from two different clients?
> >> I would suspect not many, if any... esp over NFS...
> >>
> >
> > Peter's test just happens to demonstrate the problem well, but one
> > could envision someone removing a heirarchy of directories on the
> > server while we're trying to do other operations in it. At that point,
> > we can easily end up hitting an ESTALE twice while doing the lookup and
> > returning ESTALE back to userspace.
> Just curious, what happens when you run Peter's mkdir_test() on a
> local file system? Any errors returned?
>
> I would think removing hierarchy of directories while they are being
> accessed has to even cause local fs some type of havoc
>

Peter's test only treats an ESTALE error as a failure since it was
specifically designed to ensure that those didn't make it in to
userspace.

If you run 2 copies on the same local fs and strace it, then you'll see
the syscalls get back things like ENOENT or EEXIST as they step on each
others' toes in the mkdir()/rmdir() calls.

> >
> >>>
> >>> 2) if we assume that it is fairly representative of one, how can we
> >>> achieve retrying indefinitely with NFS, or at least some large finite
> >>> amount?
> >> The amount of looping would be peer speculation. If the problem can
> >> not be handled by one simple retry I would say we simply pass the
> >> error up to the app... Its an application issue...
> >>
> >
> > It's not an application issue. The application just asked the kernel
> > to do an operation on a pathname. The only reason you're getting an
> > ESTALE back in this situation is a shortcoming of the implementation.
> >
> > We passed it a pathname after all, not a filehandle. ESTALE really has
> > no place as a return code in that situation...
> We'll have to agree to disagree... I think any application that is
> removing hierarchies of file and directory w/out taking any
> precautionary locking is a shortcoming of the application
> implementation.
>

I'm not saying they should never get an error in that situation. I'm
just saying that an ESTALE return in this situation is wrong (or at
least not helpful) since the syscall was provided a pathname not a
filehandle or open fd or anything. When we still have the pathname,
then we have the ability to reattempt on an ESTALE, and it would be
preferable to do so.

> >
> >>>
> >>> I have my doubts as to whether it would really be as big a problem for
> >>> other filesystems as Miklos and others have asserted, but I'll take
> >>> their word for it at the moment. What's the best way to contain this
> >>> behavior to just those filesystems that want to retry indefinitely when
> >>> they get an ESTALE? Would we need to go with an entirely new
> >>> ESTALERETRY after all?
> >>>
> >> Introducing a new errno to handle this problem would be overkill IMHO...
> >>
> >> If we have to go to the looping approach, I would strong suggest we
> >> make the file systems register for this type of behavior...
> >>
> >
> > Returning ESTALERETRY would be registering for it in a way and it is
> > somewhat cleaner than having to go all the way back up to the fstype to
> > figure out whether you want to retry it or not.
> How would legacy apps handle this new errno, esp if they have logic
> to take care of ESTALE errors?
>

Userspace should never see that error. The idea is that this would be a
kernel-internal error code that indicates to the VFS that it should
retry the lookup and operation. If the kernel decides to give up after
the FS returns ESTALERETRY, then we'd have to convert that error
into ESTALE.

It'd be preferable to me if we didn't require a new error code, but if
different filesystems require different semantics from the VFS on an
ESTALE return, then that is one way to achieve it.

--
Jeff Layton <[email protected]>

2012-04-17 14:02:55

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

Jeff Layton <[email protected]> writes:

>> The retry is needed when when we discover during ->getattr() that the
>> cached lookup returned a stale file handle.
>>
>> If the lookup wasn't cached or if there was no lookup at all
>> (stat(".") and friends) then retrying will not gain anything.
>>
>
> That's not necessarily the case, at least not with NFS. It's easily
> possible for you to do a full-fledged lookup over the wire, and then
> for that inode to be removed prior to issuing a call against the FH that
> you got back.
>
>> And that also means that retrying multiple times is pointless, since
>> after the first retry we are sure to have up-to-date attributes.
>>
>
> Again, it's not pointless. It's possible (though somewhat pathological)
> for you to hit the race above more than once in the same operation.
> Granted, it's an unlikely race but it is possible.

->lookup() should (and does AFAICS) leave a fully initialized inode with
up-to-date atributes in there and at that moment there's not a lot of
sense in fetching the attributes from the server *again*.

How that lookup is implemented is another question, I can see in
nfs3_proc_lookup() that it may or may not be atomic, and retrying (and
looping) there might make sense. But it doesn't retry and doesn't loop,
so the fact is, it will either return ESTALE (and the path lookup will
retry once with LOOKUP_REVAL) or the inode *will* be up-to-date.

So I think I'm still right that it's pointless to do retry after a
non-cached lookup.

>> Unfortunately it's impossible for the filesystem to know whether a
>> ->getattr (or other inode operation) was perfromed after a cached or a
>> non-cached lookup.
>>
>> I'm not sure what the right interface for this would be. One would be
>> to just pass the "cached-or-not" information as a flag. That works for
>> getattr() but not for other operations.
>>
>> Another is to introduce atomic lookup+foo variants of these operations
>> just like for open. E.g. the lookup+getattr is called if the cached
>> lookup fails or if the cached lookup succeeds and the plain ->getattr
>> call returns ESTALE.
>>
>
> To do that would require protocol support that we simply don't have. We
> don't have a way to (for instance) say via NFS "give me the attributes
> for this filename". Well, at least not for NFSv3...
>
> With v4 you could theoretically construct a compound that does that,
> but you'd have to assume that the server won't release the reference to
> the inode midway through the compound. That's a reasonably safe
> assumption.
>
> While it's nice to consider new atomic ops like this, it's not really
> possible with earlier versions of NFS.

It's just an interface question: where you put the retry logic for those
non-atomic protocols. I think it's cleaner if the retry logic is in the
filesystem and not in the VFS. Then NFS can do whatever it wants
without having to impose policy in the generic filesystem layer.

Thanks,
Miklos

2012-04-17 14:15:18

by Steve Dickson

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call



On 04/17/2012 09:36 AM, Jeff Layton wrote:
> On Tue, 17 Apr 2012 07:46:19 -0400
> Steve Dickson <[email protected]> wrote:
>
>>
>>
>> On 04/16/2012 07:05 PM, Jeff Layton wrote:
>>> On Mon, 16 Apr 2012 20:25:06 +0000
>>> "Myklebust, Trond" <[email protected]> wrote:
>>>
>>>> On Mon, 2012-04-16 at 15:43 -0400, Jeff Layton wrote:
>>>>> On Mon, 16 Apr 2012 19:33:05 +0000
>>>>> "Myklebust, Trond" <[email protected]> wrote:
>>>>>
>>>>>> On Mon, 2012-04-16 at 13:46 -0400, Jeff Layton wrote:
>>>>>>> The question about looping indefinitely really comes down to:
>>>>>>>
>>>>>>> 1) is a persistent ESTALE in conjunction with a successful lookup a
>>>>>>> situation that we expect to be temporary. i.e. will the admin at some
>>>>>>> point be able to do something about it? If not, then there's no point
>>>>>>> in continuing to retry. Again, this is a situation that *really* should
>>>>>>> not happen if the filesystem is doing the right thing.
>>>>>>>
>>>>>>> 2) If the admin can't do anything about it, is it reasonable to expect
>>>>>>> that users can send a fatal signal to hung applications if this
>>>>>>> situation occurs.
>>>>>>>
>>>>>>> We expect that that's ok in other situations to resolve hung
>>>>>>> applications, so I'm not sure I understand why it wouldn't be
>>>>>>> acceptable here...
>>>>>>
>>>>>> There are definitely potentially persistent pathological situations that
>>>>>> the filesystem can't do anything about. If the point of origin for your
>>>>>> pathname (for instance your current directory in the case of a relative
>>>>>> pathname) is stale, then no amount of looping is going to help you to
>>>>>> recover.
>>>>>>
>>>>>
>>>>> Ok -- Peter pretty much said something similar. Retrying indefnitely
>>>>> when the lookup returns ESTALE probably won't help. I'm ok with
>>>>> basically letting the VFS continue to do what it does there already. If
>>>>> it gets an ESTALE, it tries again with LOOKUP_REVAL set and then gives
>>>>> up if that doesn't work.
>>>>>
>>>>> If however, the operation itself keeps returning ESTALE, are we OK to
>>>>> retry indefinitely assuming that we'll break out of the loop on fatal
>>>>> signals?
>>>>>
>>>>> For example, something like the v2 patch I sent a little while ago?
>>>>
>>>>
>>>> Won't something like fstatat(AT_FDCWD, "", &stat, AT_EMPTY_PATH) risk
>>>> looping forever there, or am I missing something?
>>>>
>>>
>>> To make sure I understand, that should be "shortcut" for a lookup of the
>>> cwd?
>>>
>>> So I guess the concern is that you'd do the above and get a successful
>>> lookup since you're just going to get back the cwd. At that point,
>>> you'd attempt the getattr and get ESTALE back. Then, you'd redo the
>>> lookup with LOOKUP_REVAL set -- but since we're operating on the
>>> cwd, we don't have a way to redo the lookup since we don't have a
>>> pathname that we can look up again...
>>>
>>> So yeah, I guess if you're sitting in a stale directory, something like
>>> that could loop eternally.
>>>
>>> Do you think the proposed check for fatal_signal_pending is enough to
>>> mitigate such a problem? Or do we need to limit the number of retries
>>> to address those sorts of loops?
>>>
>> I think your version 2 patch is definitely more clever than v1, but I'm
>> thinking Trond's point is no matter what type of looping is done or how
>> long its done its going to fail... So why loop at all?
>>
>
> Because we can't distinguish between that case and a case where
> retrying may help. If we can avoid failing altogether, isn't that
> preferable?
>
>> Again I think the safest and simply way, from the VFS point of view,
>> is do the looping once if the file system as registered for it
>> through the fs_flags. This will catch %99 of the issues, failing
>> on the %1 of the corner cases...
>>
>
> If we're just going to retry once, then I see no need to bother with a
> fs flag. Retrying once should be harmless for all filesystems. The main
> reason to bother with a per-fs flag is to deal with those that would
> have problems looping indefinitely on an ESTALE return from the
> operation.
True, but even so... Giving file systems an opt-out option with the
default being out, maybe still have some merit... Making file systems
enable this new type of functionality would cut down on any of the
"surprise" that might occur with this redo ;-)

>
> So, to give your proposal some "legs", you'd prefer to see something
> like this?
>
> ---------------------------------[snip]-------------------------------
>
> [PATCH] vfs: make fstatat retry once on ESTALE errors from getattr call
>
> A third patch. This one takes the minimalist approach. It only retries
> once per syscall.
>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/stat.c | 9 ++++++++-
> 1 files changed, 8 insertions(+), 1 deletions(-)
>
> diff --git a/fs/stat.c b/fs/stat.c
> index c733dc5..0ee9cb4 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -73,7 +73,8 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
> {
> struct path path;
> int error = -EINVAL;
> - int lookup_flags = 0;
> + bool retried = false;
> + unsigned int lookup_flags = 0;
>
> if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
> AT_EMPTY_PATH)) != 0)
> @@ -84,12 +85,18 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
> if (flag & AT_EMPTY_PATH)
> lookup_flags |= LOOKUP_EMPTY;
>
> +retry:
> error = user_path_at(dfd, filename, lookup_flags, &path);
> if (error)
> goto out;
>
> error = vfs_getattr(path.mnt, path.dentry, stat);
> path_put(&path);
> + if (error == -ESTALE && !retried) {
> + retried = true;
> + lookup_flags |= LOOKUP_REVAL;
> + goto retry;
> + }
> out:
> return error;
> }
Yes, with the exception of the fs_flag check. If this proves out to
handle 99% of the cases... all but the very few corner cases.. then
why does it have to be more complicated?

steved.


2012-04-16 20:25:23

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

T24gTW9uLCAyMDEyLTA0LTE2IGF0IDE1OjQzIC0wNDAwLCBKZWZmIExheXRvbiB3cm90ZToNCj4g
T24gTW9uLCAxNiBBcHIgMjAxMiAxOTozMzowNSArMDAwMA0KPiAiTXlrbGVidXN0LCBUcm9uZCIg
PFRyb25kLk15a2xlYnVzdEBuZXRhcHAuY29tPiB3cm90ZToNCj4gDQo+ID4gT24gTW9uLCAyMDEy
LTA0LTE2IGF0IDEzOjQ2IC0wNDAwLCBKZWZmIExheXRvbiB3cm90ZToNCj4gPiA+IFRoZSBxdWVz
dGlvbiBhYm91dCBsb29waW5nIGluZGVmaW5pdGVseSByZWFsbHkgY29tZXMgZG93biB0bzoNCj4g
PiA+IA0KPiA+ID4gMSkgaXMgYSBwZXJzaXN0ZW50IEVTVEFMRSBpbiBjb25qdW5jdGlvbiB3aXRo
IGEgc3VjY2Vzc2Z1bCBsb29rdXAgYQ0KPiA+ID4gc2l0dWF0aW9uIHRoYXQgd2UgZXhwZWN0IHRv
IGJlIHRlbXBvcmFyeS4gaS5lLiB3aWxsIHRoZSBhZG1pbiBhdCBzb21lDQo+ID4gPiBwb2ludCBi
ZSBhYmxlIHRvIGRvIHNvbWV0aGluZyBhYm91dCBpdD8gSWYgbm90LCB0aGVuIHRoZXJlJ3Mgbm8g
cG9pbnQNCj4gPiA+IGluIGNvbnRpbnVpbmcgdG8gcmV0cnkuIEFnYWluLCB0aGlzIGlzIGEgc2l0
dWF0aW9uIHRoYXQgKnJlYWxseSogc2hvdWxkDQo+ID4gPiBub3QgaGFwcGVuIGlmIHRoZSBmaWxl
c3lzdGVtIGlzIGRvaW5nIHRoZSByaWdodCB0aGluZy4NCj4gPiA+IA0KPiA+ID4gMikgSWYgdGhl
IGFkbWluIGNhbid0IGRvIGFueXRoaW5nIGFib3V0IGl0LCBpcyBpdCByZWFzb25hYmxlIHRvIGV4
cGVjdA0KPiA+ID4gdGhhdCB1c2VycyBjYW4gc2VuZCBhIGZhdGFsIHNpZ25hbCB0byBodW5nIGFw
cGxpY2F0aW9ucyBpZiB0aGlzDQo+ID4gPiBzaXR1YXRpb24gb2NjdXJzLg0KPiA+ID4gDQo+ID4g
PiBXZSBleHBlY3QgdGhhdCB0aGF0J3Mgb2sgaW4gb3RoZXIgc2l0dWF0aW9ucyB0byByZXNvbHZl
IGh1bmcNCj4gPiA+IGFwcGxpY2F0aW9ucywgc28gSSdtIG5vdCBzdXJlIEkgdW5kZXJzdGFuZCB3
aHkgaXQgd291bGRuJ3QgYmUNCj4gPiA+IGFjY2VwdGFibGUgaGVyZS4uLg0KPiA+IA0KPiA+IFRo
ZXJlIGFyZSBkZWZpbml0ZWx5IHBvdGVudGlhbGx5IHBlcnNpc3RlbnQgcGF0aG9sb2dpY2FsIHNp
dHVhdGlvbnMgdGhhdA0KPiA+IHRoZSBmaWxlc3lzdGVtIGNhbid0IGRvIGFueXRoaW5nIGFib3V0
LiBJZiB0aGUgcG9pbnQgb2Ygb3JpZ2luIGZvciB5b3VyDQo+ID4gcGF0aG5hbWUgKGZvciBpbnN0
YW5jZSB5b3VyIGN1cnJlbnQgZGlyZWN0b3J5IGluIHRoZSBjYXNlIG9mIGEgcmVsYXRpdmUNCj4g
PiBwYXRobmFtZSkgaXMgc3RhbGUsIHRoZW4gbm8gYW1vdW50IG9mIGxvb3BpbmcgaXMgZ29pbmcg
dG8gaGVscCB5b3UgdG8NCj4gPiByZWNvdmVyLg0KPiA+IA0KPiANCj4gT2sgLS0gUGV0ZXIgcHJl
dHR5IG11Y2ggc2FpZCBzb21ldGhpbmcgc2ltaWxhci4gUmV0cnlpbmcgaW5kZWZuaXRlbHkNCj4g
d2hlbiB0aGUgbG9va3VwIHJldHVybnMgRVNUQUxFIHByb2JhYmx5IHdvbid0IGhlbHAuIEknbSBv
ayB3aXRoDQo+IGJhc2ljYWxseSBsZXR0aW5nIHRoZSBWRlMgY29udGludWUgdG8gZG8gd2hhdCBp
dCBkb2VzIHRoZXJlIGFscmVhZHkuIElmDQo+IGl0IGdldHMgYW4gRVNUQUxFLCBpdCB0cmllcyBh
Z2FpbiB3aXRoIExPT0tVUF9SRVZBTCBzZXQgYW5kIHRoZW4gZ2l2ZXMNCj4gdXAgaWYgdGhhdCBk
b2Vzbid0IHdvcmsuDQo+IA0KPiBJZiBob3dldmVyLCB0aGUgb3BlcmF0aW9uIGl0c2VsZiBrZWVw
cyByZXR1cm5pbmcgRVNUQUxFLCBhcmUgd2UgT0sgdG8NCj4gcmV0cnkgaW5kZWZpbml0ZWx5IGFz
c3VtaW5nIHRoYXQgd2UnbGwgYnJlYWsgb3V0IG9mIHRoZSBsb29wIG9uIGZhdGFsDQo+IHNpZ25h
bHM/DQo+DQo+IEZvciBleGFtcGxlLCBzb21ldGhpbmcgbGlrZSB0aGUgdjIgcGF0Y2ggSSBzZW50
IGEgbGl0dGxlIHdoaWxlIGFnbz8NCg0KDQpXb24ndCBzb21ldGhpbmcgbGlrZSBmc3RhdGF0KEFU
X0ZEQ1dELCAiIiwgJnN0YXQsIEFUX0VNUFRZX1BBVEgpIHJpc2sNCmxvb3BpbmcgZm9yZXZlciB0
aGVyZSwgb3IgYW0gSSBtaXNzaW5nIHNvbWV0aGluZz8NCg0KLS0gDQpUcm9uZCBNeWtsZWJ1c3QN
CkxpbnV4IE5GUyBjbGllbnQgbWFpbnRhaW5lcg0KDQpOZXRBcHANClRyb25kLk15a2xlYnVzdEBu
ZXRhcHAuY29tDQp3d3cubmV0YXBwLmNvbQ0KDQo=

2012-04-13 23:00:39

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Fri, 13 Apr 2012 13:34:51 -0400
Peter Staubach <[email protected]> wrote:

> I still think that returning ESTALE to the application is just exposing a short coming in the implementation. From a path based system like stat(), the application should see either ENOENT or some sort valid return.
>

Agreed, but I'm willing to live with a solution that addresses most of
these situations, even if we can't fix them all. If retrying
indefinitely may be a problem for other filesystems then we can't
just ignore that...

> I also look at the looping from the other side. While possible, of course, I'd like to see someone construct a situation where it really happens. By this, I don't mean a thought experiment, but a real running situation.
>
> We already have evidence, in the form of the Solaris NFS client, that infinite looping does not happen in nature.
>

I'm fairly certainly that looping indefinitely would be just fine for
NFS. My main concern is those FUSE fs' that Miklos alluded to when he
reviewed your earlier set.

He said that some can return ESTALE indefinitely, and they don't
necessarily respect signals. If limiting the number of retries helps
prevent problems with those, then that's still better than the current
situation.

--
Jeff Layton <[email protected]>

2012-04-13 17:37:20

by Peter Staubach

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

I still think that returning ESTALE to the application is just exposing a short coming in the implementation. From a path based system like stat(), the application should see either ENOENT or some sort valid return.

I also look at the looping from the other side. While possible, of course, I'd like to see someone construct a situation where it really happens. By this, I don't mean a thought experiment, but a real running situation.

We already have evidence, in the form of the Solaris NFS client, that infinite looping does not happen in nature.

Thanx...

ps


Sent from my iPhone

On Apr 13, 2012, at 12:07 PM, "Steve Dickson" <[email protected]> wrote:

>
>
> On 04/13/2012 11:42 AM, Jeff Layton wrote:
>> On Fri, 13 Apr 2012 10:05:18 -0500
>> Malahal Naineni <[email protected]> wrote:
>>
>>> Jeff Layton [[email protected]] wrote:
>>>> 1) should we retry these calls on all filesystems, or attempt to have
>>>> them "opt-in" in some fashion? This patch adds a flag for that, but
>>>> we could just treat all filesystems the same way.
>>>
>>> I don't know any cases where a retry on ESTALE would hurt. I would say
>>> retry on all file systems the same way.
>>>
>>>> 2) How many times should we retry on an ESTALE error? Once?
>>>> Indefinitely? Some amount in between? Retrying once would probably
>>>> fix the bulk of the real world problems with this, but there will
>>>> still be cases where that's not sufficient.
>>>
>>> As you say 1 retry should work in most cases. Indefinitely doesn't make
>>> sense, I would rather let my application fail! How about 3 retries (3 is
>>> a nice number! :-) )
>>>
>>
>> (note: please don't trim the CC list!)
>>
>> Indefinitely does make some sense (as Peter articulated in his original
>> set). It's possible you could race several times in a row, or a server
>> misconfiguration or something has happened and you have a transient
>> error that will eventually recover. His assertion was that any limit on
>> the number of retries is by definition wrong. For NFS, a fatal signal
>> ought to interrupt things as well, so retrying indefinitely has some
>> appeal there.
>>
>> OTOH, we do have to contend with filesystems that might return ESTALE
>> persistently for other reasons and that might not respond to signals.
>> Miklos pointed out that some FUSE fs' do this in his review of Peter's
>> set.
>>
>> As a purely defensive coding measure, limiting the number of retries to
>> something finite makes sense. If we're going to do that though, I'd
>> probably recommend that we set the number of retries be something
>> higher just so that this is more resilient in the face of multiple
>> races. Those other fs' might "spin" a bit in that case but it is an
>> error condition and IMO resiliency trumps performance -- at least in
>> this case.
> I'm of the opinion retry more than once has the potential of
> doing more harm than good... Why introduce looping when there
> is no solid evidence its even needed.
>
> I would think 99% of the time the one try would solve the problem.
> That 1% probably due two apps that have gone wild fight over the same
> file or the FUSE case. In those cases the error should be returned
> IMHO...
>
> steved.
>
>>
>> Of course, if we're going to do this for all fs', then we probably
>> ought to try to handle ESTALEs that are encountered in the pathwalking
>> code in a similar way. That may mean changing do_path_lookup and
>> do_filp_open_* to reattempt several times on an ESTALE error.
>>

2012-04-13 15:06:20

by Malahal Naineni

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

Jeff Layton [[email protected]] wrote:
> 1) should we retry these calls on all filesystems, or attempt to have
> them "opt-in" in some fashion? This patch adds a flag for that, but
> we could just treat all filesystems the same way.

I don't know any cases where a retry on ESTALE would hurt. I would say
retry on all file systems the same way.

> 2) How many times should we retry on an ESTALE error? Once?
> Indefinitely? Some amount in between? Retrying once would probably
> fix the bulk of the real world problems with this, but there will
> still be cases where that's not sufficient.

As you say 1 retry should work in most cases. Indefinitely doesn't make
sense, I would rather let my application fail! How about 3 retries (3 is
a nice number! :-) )

Regards, Malahal.


2012-04-23 14:56:21

by Steve Dickson

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call



On 04/20/2012 05:13 PM, Jeff Layton wrote:
> On Fri, 20 Apr 2012 16:18:37 -0400
> Steve Dickson <[email protected]> wrote:
>
>> On 04/20/2012 10:40 AM, Jeff Layton wrote:
>>> I guess the questions at this point is:
>>>
>>> 1) How representative is Peter's mkdir_test() of a real-world workload?
>> Reading your email I had to wonder the same thing... What application
>> removes hierarchy of directories in a loop from two different clients?
>> I would suspect not many, if any... esp over NFS...
>>
>
> Peter's test just happens to demonstrate the problem well, but one
> could envision someone removing a heirarchy of directories on the
> server while we're trying to do other operations in it. At that point,
> we can easily end up hitting an ESTALE twice while doing the lookup and
> returning ESTALE back to userspace.
Just curious, what happens when you run Peter's mkdir_test() on a
local file system? Any errors returned?

I would think removing hierarchy of directories while they are being
accessed has to even cause local fs some type of havoc

>
>>>
>>> 2) if we assume that it is fairly representative of one, how can we
>>> achieve retrying indefinitely with NFS, or at least some large finite
>>> amount?
>> The amount of looping would be peer speculation. If the problem can
>> not be handled by one simple retry I would say we simply pass the
>> error up to the app... Its an application issue...
>>
>
> It's not an application issue. The application just asked the kernel
> to do an operation on a pathname. The only reason you're getting an
> ESTALE back in this situation is a shortcoming of the implementation.
>
> We passed it a pathname after all, not a filehandle. ESTALE really has
> no place as a return code in that situation...
We'll have to agree to disagree... I think any application that is
removing hierarchies of file and directory w/out taking any
precautionary locking is a shortcoming of the application
implementation.

>
>>>
>>> I have my doubts as to whether it would really be as big a problem for
>>> other filesystems as Miklos and others have asserted, but I'll take
>>> their word for it at the moment. What's the best way to contain this
>>> behavior to just those filesystems that want to retry indefinitely when
>>> they get an ESTALE? Would we need to go with an entirely new
>>> ESTALERETRY after all?
>>>
>> Introducing a new errno to handle this problem would be overkill IMHO...
>>
>> If we have to go to the looping approach, I would strong suggest we
>> make the file systems register for this type of behavior...
>>
>
> Returning ESTALERETRY would be registering for it in a way and it is
> somewhat cleaner than having to go all the way back up to the fstype to
> figure out whether you want to retry it or not.
How would legacy apps handle this new errno, esp if they have logic
to take care of ESTALE errors?

steved.

2012-04-23 20:39:06

by Peter Staubach

[permalink] [raw]
Subject: RE: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

I don't really like the idea of introducing another errno as well. It seems like too much complexity and represents complexity that no one has really justified needing.

This is a situation which we know happens in nature. We should fix it and fix it correctly and not for just "part of the time". The changes are pretty simple and straightforward, so complexity isn't even an argument.

A tunable sounds good, until it is needed, and when it is needed, it is too late. The system should just work correctly on its own, so I don't think that this is such a good idea either.

Thanx...

ps


-----Original Message-----
From: Steve Dickson [mailto:[email protected]]
Sent: Monday, April 23, 2012 2:07 PM
To: Jeff Layton
Cc: [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; Peter Staubach; [email protected]; [email protected]; [email protected]; [email protected]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call



On 04/23/2012 11:32 AM, Jeff Layton wrote:
> On Mon, 23 Apr 2012 10:55:24 -0400
> Steve Dickson <[email protected]> wrote:
>
>>
>>
>> On 04/20/2012 05:13 PM, Jeff Layton wrote:
>>> On Fri, 20 Apr 2012 16:18:37 -0400
>>> Steve Dickson <[email protected]> wrote:
>>>
>>>> On 04/20/2012 10:40 AM, Jeff Layton wrote:
>>>>> I guess the questions at this point is:
>>>>>
>>>>> 1) How representative is Peter's mkdir_test() of a real-world workload?
>>>> Reading your email I had to wonder the same thing... What
>>>> application removes hierarchy of directories in a loop from two different clients?
>>>> I would suspect not many, if any... esp over NFS...
>>>>
>>>
>>> Peter's test just happens to demonstrate the problem well, but one
>>> could envision someone removing a heirarchy of directories on the
>>> server while we're trying to do other operations in it. At that
>>> point, we can easily end up hitting an ESTALE twice while doing the
>>> lookup and returning ESTALE back to userspace.
>> Just curious, what happens when you run Peter's mkdir_test() on a
>> local file system? Any errors returned?
>>
>> I would think removing hierarchy of directories while they are being
>> accessed has to even cause local fs some type of havoc
>>
>
> Peter's test only treats an ESTALE error as a failure since it was
> specifically designed to ensure that those didn't make it in to
> userspace.
>
> If you run 2 copies on the same local fs and strace it, then you'll
> see the syscalls get back things like ENOENT or EEXIST as they step on
> each others' toes in the mkdir()/rmdir() calls.
I figured as much... I just don't see any real world applications remove directory hierarchies without some type of synchronization locking...

>
>>>
>>>>>
>>>>> 2) if we assume that it is fairly representative of one, how can
>>>>> we achieve retrying indefinitely with NFS, or at least some large
>>>>> finite amount?
>>>> The amount of looping would be peer speculation. If the problem can
>>>> not be handled by one simple retry I would say we simply pass the
>>>> error up to the app... Its an application issue...
>>>>
>>>
>>> It's not an application issue. The application just asked the kernel
>>> to do an operation on a pathname. The only reason you're getting an
>>> ESTALE back in this situation is a shortcoming of the implementation.
>>>
>>> We passed it a pathname after all, not a filehandle. ESTALE really
>>> has no place as a return code in that situation...
>> We'll have to agree to disagree... I think any application that is
>> removing hierarchies of file and directory w/out taking any
>> precautionary locking is a shortcoming of the application
>> implementation.
>>
>
> I'm not saying they should never get an error in that situation. I'm
> just saying that an ESTALE return in this situation is wrong (or at
> least not helpful) since the syscall was provided a pathname not a
> filehandle or open fd or anything. When we still have the pathname,
> then we have the ability to reattempt on an ESTALE, and it would be
> preferable to do so.
Point. But if the reestablishment can not be done in one try, the I say we punt...

>
>>>
>>>>>
>>>>> I have my doubts as to whether it would really be as big a problem
>>>>> for other filesystems as Miklos and others have asserted, but I'll
>>>>> take their word for it at the moment. What's the best way to
>>>>> contain this behavior to just those filesystems that want to retry
>>>>> indefinitely when they get an ESTALE? Would we need to go with an
>>>>> entirely new ESTALERETRY after all?
>>>>>
>>>> Introducing a new errno to handle this problem would be overkill IMHO...
>>>>
>>>> If we have to go to the looping approach, I would strong suggest we
>>>> make the file systems register for this type of behavior...
>>>>
>>>
>>> Returning ESTALERETRY would be registering for it in a way and it is
>>> somewhat cleaner than having to go all the way back up to the fstype
>>> to figure out whether you want to retry it or not.
>> How would legacy apps handle this new errno, esp if they have logic
>> to take care of ESTALE errors?
>>
>
> Userspace should never see that error.
Why do you say this? ESTALE the errno has been around forever...
Its defined in the errno man page "ESTALE - Stale file handle (POSIX.1)"

> The idea is that this would be a
> kernel-internal error code that indicates to the VFS that it should
> retry the lookup and operation. If the kernel decides to give up after
> the FS returns ESTALERETRY, then we'd have to convert that error into
> ESTALE.
Yeah... I understand the idea... I just don't think another error code is needed to handle this problem...

>
> It'd be preferable to me if we didn't require a new error code, but if
> different filesystems require different semantics from the VFS on an
> ESTALE return, then that is one way to achieve it.
>
Well I thought the use of the fs_flags to register for this type of semantics was a good one...

steved.

2012-04-16 14:38:34

by Bernd Schubert

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On 04/15/2012 09:27 PM, J. Bruce Fields wrote:
> On Sun, Apr 15, 2012 at 09:03:23PM +0200, Bernd Schubert wrote:
>> On 04/13/2012 05:42 PM, Jeff Layton wrote:
>>> (note: please don't trim the CC list!)
>>>
>>> Indefinitely does make some sense (as Peter articulated in his original
>>> set). It's possible you could race several times in a row, or a server
>>> misconfiguration or something has happened and you have a transient
>>> error that will eventually recover. His assertion was that any limit on
>>> the number of retries is by definition wrong. For NFS, a fatal signal
>>> ought to interrupt things as well, so retrying indefinitely has some
>>> appeal there.
>>>
>>> OTOH, we do have to contend with filesystems that might return ESTALE
>>> persistently for other reasons and that might not respond to signals.
>>> Miklos pointed out that some FUSE fs' do this in his review of Peter's
>>> set.
>>>
>>> As a purely defensive coding measure, limiting the number of retries to
>>> something finite makes sense. If we're going to do that though, I'd
>>> probably recommend that we set the number of retries be something
>>> higher just so that this is more resilient in the face of multiple
>>> races. Those other fs' might "spin" a bit in that case but it is an
>>> error condition and IMO resiliency trumps performance -- at least in
>> this case.
>>
>> I am definitely voting against an infinite number of retries. I'm
>> working on FhGFS, which supports distributed meta data servers. So when
>> a file is moved around between directories, its file handle, which
>> contains the meta-data target id might become invalid. As NFSv3 is
>> stateless we cannot inform the client about that and must return ESTALE
>> then.
>
> Note we're not talking about retrying the operation that returned ESTALE
> with the same filehandle--probably any server would return ESTALE again
> in that case.
>
> We're talking about re-looking up the path (in the case where we're
> implementing a system call that takes a path as an argument), and then
> retrying the operation with the newly looked-up filehandle.
>

Oh, sorry my mistake. Somehow I missed that it is really _only_ about
path lookups and not already opened files.

Thanks,
Bernd

2012-04-23 15:23:36

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

Chuck Lever <[email protected]> writes:

> On Apr 23, 2012, at 10:51 AM, Miklos Szeredi wrote:
>
>> "J. Bruce Fields" <[email protected]> writes:
>>
>>>
>>> I also wonder whether it would be making too many assumptions about the
>>> server or filesystem: just because ordinary posix interfaces don't allow
>>> atomic replacement of a whole directory tree doesn't mean the server
>>> might not have some way to do it.
>>
>> Exactly because posix limits the atomic replacement to empty directories
>> is that this feature is not useful and is why linux can get away with
>> the dead directory behavior in this case. And thinking about fixing
>> this in NFS is completely pointless since no one will rely on the atomic
>> replacement behavior. Fixing local filesystems is also pointless for
>> the same reason.
>>
>> Atomic replacement of whole directory trees would indeed be more useful,
>> but it's highly unlikely to be used anywhere since applications relying
>> on this feature would be limited to special filesystems that allow this.
>
> The cases I can think of have to do with file system restore, file
> system and block device snapshots, and so on. This type of use case
> may not practical on today's Linux server, but they are a reality for
> anyone using high-end NFS storage.

Problem with this is: if some directory file handles are stale (e.g. due
to directories being removed, recreated, moved around) and on the client
there are cwd-s referring to these handles, then they are going to
become stale no matter what you do, even if the same path does exist
after the restore on the server.

Thanks,
Miklos

2012-04-23 13:00:16

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Mon, Apr 23, 2012 at 08:00:12AM -0400, Jeff Layton wrote:
> On Sun, 22 Apr 2012 07:40:57 +0200
> Miklos Szeredi <[email protected]> wrote:
>
> > On Fri, Apr 20, 2012 at 11:13 PM, Jeff Layton <[email protected]> wrote:
> > > On Fri, 20 Apr 2012 15:37:26 -0500
> > > Malahal Naineni <[email protected]> wrote:
> > >
> > >> Steve Dickson [[email protected]] wrote:
> > >> > > 2) if we assume that it is fairly representative of one, how can we
> > >> > > achieve retrying indefinitely with NFS, or at least some large finite
> > >> > > amount?
> > >> > The amount of looping would be peer speculation. If the problem can
> > >> > not be handled by one simple retry I would say we simply pass the
> > >> > error up to the app... Its an application issue...
> > >>
> > >> As someone said, ESTALE is an incorrect errno for a path based call.
> > >> How about turning ESTALE into ENOENT after a retry or few retries?
> > >>
> > >
> > > It's not really the same thing. One could envision an application
> > > that's repeatedly renaming a new file on top of another one. The file
> > > is never missing from the namespace of the server, but you could still
> > > end up getting an ESTALE.
> > >
> > > That would break other atomicity guarantees in an even worse way, IMO...
> >
> > For directory operations ESTALE *is* equivalent to ENOENT if already
> > retrying with LOOKUP_REVAL. Think about it. Atomic replacement by
> > another directory with rename(2) is not an excuse here actually.
> > Local filesystems too can end up with IS_DEAD directory after lookup
> > in that case.
> >
>
> Doesn't that violate POSIX? rename(2) is supposed to be atomic, and I
> can't see where there's any exception for that for directories.

Hm, but that only allows atomic replacement of the last component of a
path.

Suppose you're looking up a path, you've so far reached intermediate
directory "D", and the next step of the lookup (of some entry in D)
returns ESTALE. Then either:

- D has since been unlinked, and ENOENT is obviously right.
- D was unlinked and then replaced by something else, in which
case there was still a moment when ENOENT was correct.
- D was replaced atomically by a rename. But for the rename to
work it must have been replacing an empty directory, so there
was still a moment when ENOENT would have been correct.
(Exception: if D was actually a regular file or some other
non-directory object, then ENOTDIR would be the right error:
but if you're able to get at least object type atomically with
a lookup, then you should have noticed this already on lookup
of D.)

I think that's what Miklos meant?

--b.

2012-04-16 19:42:42

by Scott Lovenberg

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On 4/16/2012 1:46 PM, Jeff Layton wrote:
> NFS will generally return a different error if the process catches a
> fatal signal, so a soft mount should not be necessary and is not
> recommended anyway...
>
> In any case, we loop indefinitely now in the NFS code when (for
> instance) there's a loss of communication. Users are not generally
> happy if that causes an error, since their applications start dying.
>
From the peanut gallery, I've always set an infinite loop with an
exponential backoff on the loss of communication. IE, in some code I
wrote for S3backer (a FUSE file system on top of Amazon EC3) a few years
ago (committed by Archie Cobbs).

The trade off is that your applications will try to submit requests if
you don't tell them "leave me alone, I can't service you now". The more
I think about it, the more it seems like failing silently. Isn't the
rule supposed to be "if you must fail, do it loudly and as soon as
possible"? Just my $0.02. Take it as you will.

2012-04-13 12:09:48

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Fri, 13 Apr 2012 08:02:32 -0400
Jim Rees <[email protected]> wrote:

> Jeff Layton wrote:
>
> +retry:
> error = user_path_at(dfd, filename, lookup_flags, &path);
> if (error)
> goto out;
>
> error = vfs_getattr(path.mnt, path.dentry, stat);
> + should_retry = error == -ESTALE ? retry_estale(path.dentry) : false;
> path_put(&path);
> + if (should_retry) {
> + lookup_flags |= LOOKUP_REVAL;
> + goto retry;
> + }
> out:
> return error;
> }
>
> This is starting to look like FORTRAN. Maybe turn this into a "do while"?
> Then you could make the "goto out" into a break and get rid of them both.

Sure, that's doable. If we want to do a finite number of retries that
may be preferable anyway.

--
Jeff Layton <[email protected]>

2012-04-13 12:02:43

by Jim Rees

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

Jeff Layton wrote:

+retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
goto out;

error = vfs_getattr(path.mnt, path.dentry, stat);
+ should_retry = error == -ESTALE ? retry_estale(path.dentry) : false;
path_put(&path);
+ if (should_retry) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
return error;
}

This is starting to look like FORTRAN. Maybe turn this into a "do while"?
Then you could make the "goto out" into a break and get rid of them both.

2012-04-17 15:45:58

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Tue, Apr 17, 2012 at 10:20:35AM -0400, Jeff Layton wrote:
> Well, it's possible, but it seems pathological to me for a server to do
> that...
>
> Bruce and I were discussing this the other day. It would be good to add
> something like this to the RFCs:
>
> "On a PUTFH, a server SHOULD hold a reference to the filehandle such

For "filehandle" I'd specify "current and saved filehandle".

> that it does not go stale over the life of the compound."

And that's *much* less of a burden on the server than requiring that the
compound execute atomically.

> ...or something along those lines. That's a different matter though and
> not directly related to this. :)

Yes.

--b.

2012-04-24 14:52:24

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Mon, 23 Apr 2012 16:38:00 -0400
Peter Staubach <[email protected]> wrote:

> I don't really like the idea of introducing another errno as well. It seems like too much complexity and represents complexity that no one has really justified needing.
>

I tend to agree here. Miklos, can you elaborate a bit on what fuse
filesystems you're particularly concerned about here? Which ones return
ESTALE and under what conditions. Maybe we can try to tailor this
solution to avoid the complexity without impacting them.

> This is a situation which we know happens in nature. We should fix it and fix it correctly and not for just "part of the time". The changes are pretty simple and straightforward, so complexity isn't even an argument.
>

That was the case in the original patch, yes. One thing I see that will
be tricky in forward porting all of that work is that that set also had
some checks to make sure that lookups were making forward progress
before retrying. Trying to add that to the current lookup code may be
more difficult.

do_path_lookup (for instance) does this currently:

---------------[snip]---------------
retval = path_lookupat(dfd, name, flags, nd);
if (unlikely(retval == -ESTALE))
retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
---------------[snip]---------------

The trivial thing to do to make that retry is to turn that ESTALE check
into a while loop (optionally with some sort of limit on the number of
retries). But...we don't have a way to know the pointer to the last
dentry that we successfully looked up.

One possibility there is to try and have the error paths that matter
set the nd->path.dentry to the current one (without taking any
references). Then we could compare that to the last one on each pass of
the while loop and ensure that it's different and just not try to
dereference it.

> A tunable sounds good, until it is needed, and when it is needed, it is too late. The system should just work correctly on its own, so I don't think that this is such a good idea either.
>

Yeah, it's not ideal, but if there are concerns about the number of
retries, then that's one way to alleviate them. Eventually we could
eliminate the tunable when/if we found a default that worked for
everyone, or just decide that retrying indefinitely is OK.

--
Jeff Layton <[email protected]>

2012-04-22 04:16:43

by Ric Wheeler

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On 04/20/2012 08:10 PM, Jeff Layton wrote:
> On Wed, 18 Apr 2012 07:52:07 -0400
> Jeff Layton<[email protected]> wrote:
>
>> ESTALE errors are a source of pain for many users of NFS. Usually they
>> occur when a file is removed from the server after a successful lookup
>> against it.
>>
>> Luckily, the remedy in these cases is usually simple. We should just
>> redo the lookup, forcing revalidations all the way in and then retry the
>> call. We of course cannot do this for syscalls that do not involve a
>> path, but for path-based syscalls we can and should attempt to recover
>> from an ESTALE.
>>
>> This patch implements this by having the VFS reattempt the lookup (with
>> LOOKUP_REVAL set) and call exactly once when it would ordinarily return
>> ESTALE. This should catch the bulk of these cases under normal usage,
>> without unduly inconveniencing other filesystems that return ESTALE on
>> path-based syscalls.
>>
>> Note that it's possible to hit this race more than once, but a single
>> retry should catch the bulk of these cases under normal circumstances.
>>
>> This patch is just an example. We'll alter most path-based syscalls in a
>> similar fashion to fix this correctly. At this point, I'm just trying to
>> ensure that the retry semantics are acceptable before I being that work.
>>
>> Does anyone have strong objections to this patch? I'm aware that the
>> retry mechanism is not as robust as many (e.g. Peter) would like, but it
>> should at least improve the current situation.
>>
>> If no one has a strong objection, then I'll start going through and
>> adding similar code to the other syscalls. And we can hopefully we can
>> get at least some of them in for 3.5.
>>
>> Signed-off-by: Jeff Layton<[email protected]>
>> ---
>> fs/stat.c | 9 ++++++++-
>> 1 files changed, 8 insertions(+), 1 deletions(-)
>>
>> diff --git a/fs/stat.c b/fs/stat.c
>> index c733dc5..0ee9cb4 100644
>> --- a/fs/stat.c
>> +++ b/fs/stat.c
>> @@ -73,7 +73,8 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
>> {
>> struct path path;
>> int error = -EINVAL;
>> - int lookup_flags = 0;
>> + bool retried = false;
>> + unsigned int lookup_flags = 0;
>>
>> if ((flag& ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
>> AT_EMPTY_PATH)) != 0)
>> @@ -84,12 +85,18 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
>> if (flag& AT_EMPTY_PATH)
>> lookup_flags |= LOOKUP_EMPTY;
>>
>> +retry:
>> error = user_path_at(dfd, filename, lookup_flags,&path);
>> if (error)
>> goto out;
>>
>> error = vfs_getattr(path.mnt, path.dentry, stat);
>> path_put(&path);
>> + if (error == -ESTALE&& !retried) {
>> + retried = true;
>> + lookup_flags |= LOOKUP_REVAL;
>> + goto retry;
>> + }
>> out:
>> return error;
>> }
> Apologies for replying to myself here. Just to beat on the deceased
> equine a little longer, I should note that the above approach does
> *not* fix Peter's reproducer in his original email. It fails rather
> quickly when run simultaneously on the client and server.
>
> At least one of the tests in it creates and removes a hierarchy of
> directories in a loop. During that, the lookup from the client can
> easily fail more than once with ESTALE. Since we give up after a single
> retry, that causes the call to return ESTALE.
>
> While testing this approach with mkdir and fstatat, I modified the
> patch to retry multiple times, also retry when the lookup thows ESTALE
> and to throw a printk when the number of retries was> 1 with the
> number of retries that the call did and the eventual error code.
>
> With Peter's (admittedly synthetic) test, we can get an answer of sorts
> to Trond's question from earlier in the thread as to how many retries
> is "enough":
>
> [ 45.023665] sys_mkdirat: retries=33 error=-2
> [ 47.889300] sys_mkdirat: retries=51 error=-2
> [ 49.172746] sys_mkdirat: retries=27 error=-2
> [ 52.325723] sys_mkdirat: retries=10 error=-2
> [ 58.082576] sys_mkdirat: retries=33 error=-2
> [ 59.810461] sys_mkdirat: retries=5 error=-2
> [ 63.387914] sys_mkdirat: retries=14 error=-2
> [ 63.630785] sys_mkdirat: retries=4 error=-2
> [ 68.268903] sys_mkdirat: retries=6 error=-2
> [ 71.124173] sys_mkdirat: retries=99 error=-2
> [ 75.657649] sys_mkdirat: retries=123 error=-2
> [ 76.903814] sys_mkdirat: retries=26 error=-2
> [ 82.009463] sys_mkdirat: retries=30 error=-2
> [ 84.807731] sys_mkdirat: retries=67 error=-2
> [ 89.825529] sys_mkdirat: retries=166 error=-2
> [ 91.599104] sys_mkdirat: retries=8 error=-2
> [ 95.621855] sys_mkdirat: retries=44 error=-2
> [ 98.164588] sys_mkdirat: retries=61 error=-2
> [ 99.783347] sys_mkdirat: retries=11 error=-2
> [ 105.593980] sys_mkdirat: retries=104 error=-2
> [ 110.348861] sys_mkdirat: retries=8 error=-2
> [ 112.087966] sys_mkdirat: retries=46 error=-2
> [ 117.613316] sys_mkdirat: retries=90 error=-2
> [ 120.117550] sys_mkdirat: retries=2 error=-2
> [ 122.624330] sys_mkdirat: retries=15 error=-2
>
> So, now I'm having buyers remorse of sorts about proposing to just
> retry once as that may not be strong enough to fix some of the cases
> we're interested in.
>
> I guess the questions at this point is:
>
> 1) How representative is Peter's mkdir_test() of a real-world workload?
>
> 2) if we assume that it is fairly representative of one, how can we
> achieve retrying indefinitely with NFS, or at least some large finite
> amount?
>
> I have my doubts as to whether it would really be as big a problem for
> other filesystems as Miklos and others have asserted, but I'll take
> their word for it at the moment. What's the best way to contain this
> behavior to just those filesystems that want to retry indefinitely when
> they get an ESTALE? Would we need to go with an entirely new
> ESTALERETRY after all?
>

Maybe we should have a default of a single loop and a tunable to allow clients
to crank it up?

Ric


2012-04-23 13:55:03

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Mon, Apr 23, 2012 at 09:50:21AM -0400, Jeff Layton wrote:
> On Mon, 23 Apr 2012 09:34:12 -0400
> "J. Bruce Fields" <[email protected]> wrote:
>
> > On Mon, Apr 23, 2012 at 09:12:55AM -0400, Jeff Layton wrote:
> > > Here's an example -- suppose we have two directories: /foo
> > > and /bar. /bar is empty. We call:
> > >
> > > rename("/foo","/bar");
> > >
> > > ...and at the same time, someone is calling:
> > >
> > > stat("/bar");
> > >
> > > ...the calls race and in this condition the stat() gets ESTALE back
> > > -- /bar got replaced after we did the lookup.
> > >
> > > According to POSIX, the name "/bar" should never be absent from the
> > > namespace in this situation, so I'm not sure I understand why returning
> > > ENOENT here would be acceptable.
> >
> > Yes, agreed, my assertion was just that an ESTALE on a lookup of a
> > non-final component is probably equivalent to ENOENT.
> >
> > I'm not sure if that's what Miklos meant.
> >
>
> Ahh ok, sorry I misunderstood. Yeah in that case I suppose it would
> be ok to replace ESTALE with ENOENT. Ok, so to illustrate...
>
> Suppose we're trying to stat("/bar/baz") instead in the above example.
> Then we could just return ENOENT instead on an ESTALE return for the
> reasons that Bruce outlined. If the dir was stale, then there was a
> at least one point in time where we *know* that "baz" didn't exist.
>
> That doesn't seem like it'll work as a general solution though since it
> wouldn't apply to an ESTALE on the last component. For that we'd need
> to do something different -- retry the operation in some form, but it
> might be potential optimization in the path walking code to avoid
> retrying in some cases.

I also wonder whether it would be making too many assumptions about the
server or filesystem: just because ordinary posix interfaces don't allow
atomic replacement of a whole directory tree doesn't mean the server
might not have some way to do it.

--b.

2012-04-23 15:17:14

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Mon, 23 Apr 2012 16:51:04 +0200
Miklos Szeredi <[email protected]> wrote:

> "J. Bruce Fields" <[email protected]> writes:
>
> >
> > I also wonder whether it would be making too many assumptions about the
> > server or filesystem: just because ordinary posix interfaces don't allow
> > atomic replacement of a whole directory tree doesn't mean the server
> > might not have some way to do it.
>
> Exactly because posix limits the atomic replacement to empty directories
> is that this feature is not useful and is why linux can get away with
> the dead directory behavior in this case. And thinking about fixing
> this in NFS is completely pointless since no one will rely on the atomic
> replacement behavior. Fixing local filesystems is also pointless for
> the same reason.
>
> Atomic replacement of whole directory trees would indeed be more useful,
> but it's highly unlikely to be used anywhere since applications relying
> on this feature would be limited to special filesystems that allow this.
>
> So my statement is "ENOENT is equivalent to ESTALE if already retrying
> path lookup with LOOKUP_REVAL on any operation that takes an parent
> directory and a name (lookup, create, link, unlink, symlink, mkdir,
> rmdir, mknod, rename)."
>

Ok, but again, that only applies to the lookup. It has no bearing on
the subsequent operation. For instance, if we're doing:

rename("/foo", "/bar");

...and another client is simultaneously doing:

creat("/bar/baz", 0600);

...and we get back ESTALE from the server on the create because the
"old" /bar got replaced after the lookup of it. Then it seems like
returning -ENOENT would not be correct since there was never a time
where /bar didn't exist...

It might eventually be nice to add that optimization to the path lookup
code. OTOH, it only solves a very specific problem that's not really
applicable in a lot of the cases I'm interested in fixing.

--
Jeff Layton <[email protected]>

2012-04-23 14:50:47

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

"J. Bruce Fields" <[email protected]> writes:

>
> I also wonder whether it would be making too many assumptions about the
> server or filesystem: just because ordinary posix interfaces don't allow
> atomic replacement of a whole directory tree doesn't mean the server
> might not have some way to do it.

Exactly because posix limits the atomic replacement to empty directories
is that this feature is not useful and is why linux can get away with
the dead directory behavior in this case. And thinking about fixing
this in NFS is completely pointless since no one will rely on the atomic
replacement behavior. Fixing local filesystems is also pointless for
the same reason.

Atomic replacement of whole directory trees would indeed be more useful,
but it's highly unlikely to be used anywhere since applications relying
on this feature would be limited to special filesystems that allow this.

So my statement is "ENOENT is equivalent to ESTALE if already retrying
path lookup with LOOKUP_REVAL on any operation that takes an parent
directory and a name (lookup, create, link, unlink, symlink, mkdir,
rmdir, mknod, rename)."

This equivalence is in the sense that it doesn't change behavior
compared to local filesystems.

For other operations ENOENT is not equivalent to ESTALE.

Thanks,
Miklos

2012-04-16 13:41:01

by Peter Staubach

[permalink] [raw]
Subject: RE: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

There seems to be a lot of, perhaps, misinformation here.

The looping occurs when the file system on the server returns a valid file handle during a pathname traversal and then returns ESTALE on a subsequent operation.

The client should retry the pathname traversal, which should either return a valid file handle or ENOENT. If a subsequent operation returns ESTALE, then start over again at the original pathname traversal.

The client should not loop when 1) there is a signal pending which would cause the system call to terminate with EINTR or when it can't recover from the ESTALE return. For example, if lookup("./foo") returns ESTALE, then clearly the current directory has become stale and there is no way for the client to recover. One could make the argument that the client can recover by relooking up the current directory, which is fine, but eventually if the file handle for the root of the mounted file system is also stale, then there is no further recovery possible, at least for NFSv[23] and perhaps even for NFSv4 depending upon how the file system was mounted.

I think that engaging again with Miklos and/or Bernd to understand why the underlying file system would return valid file identification information, but then be unable to use it in a subsequent operation. The definition of ESTALE is that a file handle, which referred to a valid file, no longer refers to a valid file. Why should FUSE impose any special requirements in this regard? If FUSE has attempted to expand the definition of ESTALE, then we should consider this situation and perhaps make recommendations for a more appropriate errno to use.

The bottom line is that for pathname based system calls, the application should never see an ESTALE error. The system call should either succeed or fail on its own merits or fail with ENOENT.

Thanx...

ps


-----Original Message-----
From: Jeff Layton [mailto:[email protected]]
Sent: Monday, April 16, 2012 7:37 AM
To: Bernd Schubert
Cc: Malahal Naineni; [email protected]; [email protected]; [email protected]; Peter Staubach; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Sun, 15 Apr 2012 21:03:23 +0200
Bernd Schubert <[email protected]> wrote:

> On 04/13/2012 05:42 PM, Jeff Layton wrote:
> > (note: please don't trim the CC list!)
> >
> > Indefinitely does make some sense (as Peter articulated in his
> > original set). It's possible you could race several times in a row,
> > or a server misconfiguration or something has happened and you have
> > a transient error that will eventually recover. His assertion was
> > that any limit on the number of retries is by definition wrong. For
> > NFS, a fatal signal ought to interrupt things as well, so retrying
> > indefinitely has some appeal there.
> >
> > OTOH, we do have to contend with filesystems that might return
> > ESTALE persistently for other reasons and that might not respond to signals.
> > Miklos pointed out that some FUSE fs' do this in his review of
> > Peter's set.
> >
> > As a purely defensive coding measure, limiting the number of retries
> > to something finite makes sense. If we're going to do that though,
> > I'd probably recommend that we set the number of retries be
> > something higher just so that this is more resilient in the face of
> > multiple races. Those other fs' might "spin" a bit in that case but
> > it is an error condition and IMO resiliency trumps performance -- at
> > least in
> this case.
>
> I am definitely voting against an infinite number of retries. I'm
> working on FhGFS, which supports distributed meta data servers. So
> when a file is moved around between directories, its file handle,
> which contains the meta-data target id might become invalid. As NFSv3
> is stateless we cannot inform the client about that and must return
> ESTALE then. NFSv4 is better, but I'm not sure how well invalidating a
> file handle works. So retrying once on ESTALE might be a good idea,
> but retrying forever is not.

It's important to note that I'm only proposing to wrap syscalls this way that take a pathname argument. We can't do anything about those that don't since at that point we have no way to retry the lookup.

So, I'm not sure this patch would affect the case you're concerned about one way or another. If you move the file to a different directory, then it's pathname would also change, and at that point you'd end up with an ENOENT error or something on the next retry.

If the file was open and you were (for instance) reading or writing to it from a client when you moved it, then we can't retry the lookup at that point. The open is long since done and the pathname is now gone.
You'll get an ESTALE back in userspace regardless.

> Also, what about asymmetric HA servers? I believe to remember that
> also resulted in ESTALE. So for example server1 exports /home and
> /scratch, but on failure server2 can only take over /home and denies
> access to /scratch.
>

That sounds like a broken cluster configuration. Still...

Presumably at some point in the future, a sysadmin would intervene and fix the situation such that /scratch is available again. Is it better to return an error to the application at that point, or simply allow it to keep retrying until the problem has been fixed?

The person with the long running job that's doing operations in /scratch would probably prefer the latter. If not, then they could always send the program a fatal signal to stop it altogether.

--
Jeff Layton <[email protected]>

2012-04-16 16:56:06

by Jeff Layton

[permalink] [raw]
Subject: [PATCH RFC v2] vfs: make fstatat retry on ESTALE errors from getattr call

Here's another RFC patch that incorporates some of the suggestions so
far:

1) it converts the "goto" construct into a do-while loop

2) This patch drops the FS_* flag so all filesystems would be affected.

3) It adds a backoff timeout mechanism to the loop with an arbitrary 5s cap
on the sleep between attempts (this is certainly negotiable).

4) Also, if the process catches a fatal signal, then we'd break out of the
loop and return the ESTALE to userspace. Not that it would care at that
point since the program is going to die anyway.

Clearly this inlined function will need to go to a header and the cap on
the sleep is probably negotiable. This is just a proof of concept thing
at this point.

Also note that this patch still doesn't incorporate any hard cap on the
number of retries. I could add that too, but perhaps that's not
necessary since we're handling the fatal_signal case here?

Opinions welcome...

Signed-off-by: Jeff Layton <[email protected]>
---
fs/stat.c | 35 +++++++++++++++++++++++++++++------
1 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/fs/stat.c b/fs/stat.c
index c733dc5..71f2f5b 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -68,12 +68,25 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
}
EXPORT_SYMBOL(vfs_fstat);

+/* max retry timeout of 5s */
+#define MAX_ESTALE_RETRY_TIMEOUT (5 * HZ)
+
+static inline bool
+estale_retry(int error, long timeout)
+{
+ if (likely(error != -ESTALE))
+ return false;
+
+ return !schedule_timeout_killable(timeout);
+}
+
int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
int flag)
{
struct path path;
int error = -EINVAL;
- int lookup_flags = 0;
+ long timeout = 1;
+ unsigned int lookup_flags = 0;

if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH)) != 0)
@@ -84,12 +97,22 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
if (flag & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;

- error = user_path_at(dfd, filename, lookup_flags, &path);
- if (error)
- goto out;
+ do {
+ /*
+ * if lookup fails with ESTALE, then that implies that the
+ * root of the mount went stale. Give up at that point...
+ */
+ error = user_path_at(dfd, filename, lookup_flags, &path);
+ if (error)
+ break;

- error = vfs_getattr(path.mnt, path.dentry, stat);
- path_put(&path);
+ error = vfs_getattr(path.mnt, path.dentry, stat);
+ path_put(&path);
+ lookup_flags |= LOOKUP_REVAL;
+ timeout <<= 1;
+ if (timeout > MAX_ESTALE_RETRY_TIMEOUT)
+ timeout = MAX_ESTALE_RETRY_TIMEOUT;
+ } while (estale_retry(error, timeout));
out:
return error;
}
--
1.7.7.6


2012-04-25 09:41:34

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

Jeff Layton <[email protected]> writes:

>> And an audit would still not ensure safety against future additions of
>> ESTALE.
>>
>
> Well, nothing is safe from the future. It's incumbent upon us to review
> patches and such before such breakage goes in.

Good interfaces are safe against stupidity, and that very much applies
to the kernel too. Review is not an excuse for bad interfaces.

> But, let's say for the purposes of argument that we do have a fs (FUSE
> or otherwise) that is persistently returning ESTALE on a lookup. Why
> was Peter's check that we were making forward progress not enough to
> guard against this problem?
>
> In particular, I'm talking about the code he added to link_path_walk in
> this patch to check that the value of nd->path.dentry was changing:
>
> https://lkml.org/lkml/2008/3/10/266
>
> It seems like that ought to be enough to alleviate your fears on this.
> We could also check for fatal signals on each pass and that would allow
> users to break out of the loop even when the underlying fs doesn't
> handle signals properly.

AFAICS that doesn't ensure progress, only change. It helps those cases
which persistently return ESTALE, but not cases where there is change
but no progress. E.g. it doesn't prevent DoS by a client doing renames
over a directory in a tight loop.

Thanks,
Miklos

2012-04-13 17:10:50

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Fri, 13 Apr 2012 12:07:14 -0400
Steve Dickson <[email protected]> wrote:

>
>
> On 04/13/2012 11:42 AM, Jeff Layton wrote:
> > On Fri, 13 Apr 2012 10:05:18 -0500
> > Malahal Naineni <[email protected]> wrote:
> >
> >> Jeff Layton [[email protected]] wrote:
> >>> 1) should we retry these calls on all filesystems, or attempt to have
> >>> them "opt-in" in some fashion? This patch adds a flag for that, but
> >>> we could just treat all filesystems the same way.
> >>
> >> I don't know any cases where a retry on ESTALE would hurt. I would say
> >> retry on all file systems the same way.
> >>
> >>> 2) How many times should we retry on an ESTALE error? Once?
> >>> Indefinitely? Some amount in between? Retrying once would probably
> >>> fix the bulk of the real world problems with this, but there will
> >>> still be cases where that's not sufficient.
> >>
> >> As you say 1 retry should work in most cases. Indefinitely doesn't make
> >> sense, I would rather let my application fail! How about 3 retries (3 is
> >> a nice number! :-) )
> >>
> >
> > (note: please don't trim the CC list!)
> >
> > Indefinitely does make some sense (as Peter articulated in his original
> > set). It's possible you could race several times in a row, or a server
> > misconfiguration or something has happened and you have a transient
> > error that will eventually recover. His assertion was that any limit on
> > the number of retries is by definition wrong. For NFS, a fatal signal
> > ought to interrupt things as well, so retrying indefinitely has some
> > appeal there.
> >
> > OTOH, we do have to contend with filesystems that might return ESTALE
> > persistently for other reasons and that might not respond to signals.
> > Miklos pointed out that some FUSE fs' do this in his review of Peter's
> > set.
> >
> > As a purely defensive coding measure, limiting the number of retries to
> > something finite makes sense. If we're going to do that though, I'd
> > probably recommend that we set the number of retries be something
> > higher just so that this is more resilient in the face of multiple
> > races. Those other fs' might "spin" a bit in that case but it is an
> > error condition and IMO resiliency trumps performance -- at least in
> > this case.
> I'm of the opinion retry more than once has the potential of
> doing more harm than good... Why introduce looping when there
> is no solid evidence its even needed.
>
> I would think 99% of the time the one try would solve the problem.
> That 1% probably due two apps that have gone wild fight over the same
> file or the FUSE case. In those cases the error should be returned
> IMHO...
>
> steved.
>

My only real concern with doing that is that again we're going to have
to alter every path-based syscall. If we decide later that retrying
once isn't sufficient, then we'll be stuck doing it again.

That said, it would jive better with the existing ESTALE retry in the
lookup code.

--
Jeff Layton <[email protected]>

2012-04-17 13:31:53

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Tue, 17 Apr 2012 15:12:20 +0200
Miklos Szeredi <[email protected]> wrote:

> Jeff Layton <[email protected]> writes:
>
> >>
> >> Won't something like fstatat(AT_FDCWD, "", &stat, AT_EMPTY_PATH) risk
> >> looping forever there, or am I missing something?
> >>
> >
> > To make sure I understand, that should be "shortcut" for a lookup of the
> > cwd?
> >
> > So I guess the concern is that you'd do the above and get a successful
> > lookup since you're just going to get back the cwd. At that point,
> > you'd attempt the getattr and get ESTALE back. Then, you'd redo the
> > lookup with LOOKUP_REVAL set -- but since we're operating on the
> > cwd, we don't have a way to redo the lookup since we don't have a
> > pathname that we can look up again...
> >
> > So yeah, I guess if you're sitting in a stale directory, something like
> > that could loop eternally.
> >
> > Do you think the proposed check for fatal_signal_pending is enough to
> > mitigate such a problem? Or do we need to limit the number of retries
> > to address those sorts of loops?
>
> Lets step back a bit.
>
> The retry is needed when when we discover during ->getattr() that the
> cached lookup returned a stale file handle.
>
> If the lookup wasn't cached or if there was no lookup at all
> (stat(".") and friends) then retrying will not gain anything.
>

That's not necessarily the case, at least not with NFS. It's easily
possible for you to do a full-fledged lookup over the wire, and then
for that inode to be removed prior to issuing a call against the FH that
you got back.

> And that also means that retrying multiple times is pointless, since
> after the first retry we are sure to have up-to-date attributes.
>

Again, it's not pointless. It's possible (though somewhat pathological)
for you to hit the race above more than once in the same operation.
Granted, it's an unlikely race but it is possible.

> Unfortunately it's impossible for the filesystem to know whether a
> ->getattr (or other inode operation) was perfromed after a cached or a
> non-cached lookup.
>
> I'm not sure what the right interface for this would be. One would be
> to just pass the "cached-or-not" information as a flag. That works for
> getattr() but not for other operations.
>
> Another is to introduce atomic lookup+foo variants of these operations
> just like for open. E.g. the lookup+getattr is called if the cached
> lookup fails or if the cached lookup succeeds and the plain ->getattr
> call returns ESTALE.
>

To do that would require protocol support that we simply don't have. We
don't have a way to (for instance) say via NFS "give me the attributes
for this filename". Well, at least not for NFSv3...

With v4 you could theoretically construct a compound that does that,
but you'd have to assume that the server won't release the reference to
the inode midway through the compound. That's a reasonably safe
assumption.

While it's nice to consider new atomic ops like this, it's not really
possible with earlier versions of NFS.

--
Jeff Layton <[email protected]>

2012-04-15 19:03:33

by Bernd Schubert

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On 04/13/2012 05:42 PM, Jeff Layton wrote:
> (note: please don't trim the CC list!)
>
> Indefinitely does make some sense (as Peter articulated in his original
> set). It's possible you could race several times in a row, or a server
> misconfiguration or something has happened and you have a transient
> error that will eventually recover. His assertion was that any limit on
> the number of retries is by definition wrong. For NFS, a fatal signal
> ought to interrupt things as well, so retrying indefinitely has some
> appeal there.
>
> OTOH, we do have to contend with filesystems that might return ESTALE
> persistently for other reasons and that might not respond to signals.
> Miklos pointed out that some FUSE fs' do this in his review of Peter's
> set.
>
> As a purely defensive coding measure, limiting the number of retries to
> something finite makes sense. If we're going to do that though, I'd
> probably recommend that we set the number of retries be something
> higher just so that this is more resilient in the face of multiple
> races. Those other fs' might "spin" a bit in that case but it is an
> error condition and IMO resiliency trumps performance -- at least in
this case.

I am definitely voting against an infinite number of retries. I'm
working on FhGFS, which supports distributed meta data servers. So when
a file is moved around between directories, its file handle, which
contains the meta-data target id might become invalid. As NFSv3 is
stateless we cannot inform the client about that and must return ESTALE
then. NFSv4 is better, but I'm not sure how well invalidating a file
handle works. So retrying once on ESTALE might be a good idea, but
retrying forever is not.
Also, what about asymmetric HA servers? I believe to remember that also
resulted in ESTALE. So for example server1 exports /home and /scratch,
but on failure server2 can only take over /home and denies access to
/scratch.


Thanks,
Bernd

2012-04-17 13:36:15

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Tue, 17 Apr 2012 07:46:19 -0400
Steve Dickson <[email protected]> wrote:

>
>
> On 04/16/2012 07:05 PM, Jeff Layton wrote:
> > On Mon, 16 Apr 2012 20:25:06 +0000
> > "Myklebust, Trond" <[email protected]> wrote:
> >
> >> On Mon, 2012-04-16 at 15:43 -0400, Jeff Layton wrote:
> >>> On Mon, 16 Apr 2012 19:33:05 +0000
> >>> "Myklebust, Trond" <[email protected]> wrote:
> >>>
> >>>> On Mon, 2012-04-16 at 13:46 -0400, Jeff Layton wrote:
> >>>>> The question about looping indefinitely really comes down to:
> >>>>>
> >>>>> 1) is a persistent ESTALE in conjunction with a successful lookup a
> >>>>> situation that we expect to be temporary. i.e. will the admin at some
> >>>>> point be able to do something about it? If not, then there's no point
> >>>>> in continuing to retry. Again, this is a situation that *really* should
> >>>>> not happen if the filesystem is doing the right thing.
> >>>>>
> >>>>> 2) If the admin can't do anything about it, is it reasonable to expect
> >>>>> that users can send a fatal signal to hung applications if this
> >>>>> situation occurs.
> >>>>>
> >>>>> We expect that that's ok in other situations to resolve hung
> >>>>> applications, so I'm not sure I understand why it wouldn't be
> >>>>> acceptable here...
> >>>>
> >>>> There are definitely potentially persistent pathological situations that
> >>>> the filesystem can't do anything about. If the point of origin for your
> >>>> pathname (for instance your current directory in the case of a relative
> >>>> pathname) is stale, then no amount of looping is going to help you to
> >>>> recover.
> >>>>
> >>>
> >>> Ok -- Peter pretty much said something similar. Retrying indefnitely
> >>> when the lookup returns ESTALE probably won't help. I'm ok with
> >>> basically letting the VFS continue to do what it does there already. If
> >>> it gets an ESTALE, it tries again with LOOKUP_REVAL set and then gives
> >>> up if that doesn't work.
> >>>
> >>> If however, the operation itself keeps returning ESTALE, are we OK to
> >>> retry indefinitely assuming that we'll break out of the loop on fatal
> >>> signals?
> >>>
> >>> For example, something like the v2 patch I sent a little while ago?
> >>
> >>
> >> Won't something like fstatat(AT_FDCWD, "", &stat, AT_EMPTY_PATH) risk
> >> looping forever there, or am I missing something?
> >>
> >
> > To make sure I understand, that should be "shortcut" for a lookup of the
> > cwd?
> >
> > So I guess the concern is that you'd do the above and get a successful
> > lookup since you're just going to get back the cwd. At that point,
> > you'd attempt the getattr and get ESTALE back. Then, you'd redo the
> > lookup with LOOKUP_REVAL set -- but since we're operating on the
> > cwd, we don't have a way to redo the lookup since we don't have a
> > pathname that we can look up again...
> >
> > So yeah, I guess if you're sitting in a stale directory, something like
> > that could loop eternally.
> >
> > Do you think the proposed check for fatal_signal_pending is enough to
> > mitigate such a problem? Or do we need to limit the number of retries
> > to address those sorts of loops?
> >
> I think your version 2 patch is definitely more clever than v1, but I'm
> thinking Trond's point is no matter what type of looping is done or how
> long its done its going to fail... So why loop at all?
>

Because we can't distinguish between that case and a case where
retrying may help. If we can avoid failing altogether, isn't that
preferable?

> Again I think the safest and simply way, from the VFS point of view,
> is do the looping once if the file system as registered for it
> through the fs_flags. This will catch %99 of the issues, failing
> on the %1 of the corner cases...
>

If we're just going to retry once, then I see no need to bother with a
fs flag. Retrying once should be harmless for all filesystems. The main
reason to bother with a per-fs flag is to deal with those that would
have problems looping indefinitely on an ESTALE return from the
operation.

So, to give your proposal some "legs", you'd prefer to see something
like this?

---------------------------------[snip]-------------------------------

[PATCH] vfs: make fstatat retry once on ESTALE errors from getattr call

A third patch. This one takes the minimalist approach. It only retries
once per syscall.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/stat.c | 9 ++++++++-
1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/fs/stat.c b/fs/stat.c
index c733dc5..0ee9cb4 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -73,7 +73,8 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
{
struct path path;
int error = -EINVAL;
- int lookup_flags = 0;
+ bool retried = false;
+ unsigned int lookup_flags = 0;

if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH)) != 0)
@@ -84,12 +85,18 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
if (flag & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;

+retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
goto out;

error = vfs_getattr(path.mnt, path.dentry, stat);
path_put(&path);
+ if (error == -ESTALE && !retried) {
+ retried = true;
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
return error;
}
--
1.7.7.6


2012-04-17 14:48:53

by Peter Staubach

[permalink] [raw]
Subject: RE: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

SSBkb24ndCB0aGluayB0aGF0IHRoZSBjb2RlIGVuZHMgdXAgYmVpbmcgYWxsIHRoYXQgY29tcGxl
eCwgYWN0dWFsbHkuICBJIHdpbGwgaGF2ZSB0byBkaWcgb3V0IHRoZSBwYXRjaGVzIHRoYXQgSSBo
YWQgcHJldmlvdXNseSB0byBsb29rIHRvIHNlZSB3aGF0IHRoZXkgZGlkLiAgSSBhbSBwcmV0dHkg
c3VyZSB0aGF0IHRoZXkgaGFuZGxlZCBhbGwgb2YgdGhlc2UgY2FzZXMuICBJIGFsc28gaGFkIHNv
bWUgdGVzdHMgd2hpY2ggZXhlcmNpc2VkIHRoZSBtb2RpZmllZCBwYXRoIGJhc2VkIHN5c3RlbSBj
YWxscyBhbmQgYXQgbGVhc3QsIGF0IG9uZSBwb2ludCBpbiB0aW1lLCB0aGV5IHdvdWxkIHJ1biB3
aXRob3V0IHJldHVybmluZyBFU1RBTEUgdG8gdGhlIHVzZXIgbGV2ZWwuDQoNCkxldCBtZSBzZWUg
d2hhdCBJIGNhbiBmaW5kLiAgU3RldmVELCB5b3Ugd291bGRuJ3QgaGF2ZSBzcXVpcnJlbGxlZCBh
d2F5IGEgY29weSBvZiBteSBzdHVmZiBmcm9tIFJlZCBIYXQsIHdvdWxkIHlvdT8NCg0KCVRoYW54
Li4uDQoNCgkJcHMNCg0KDQotLS0tLU9yaWdpbmFsIE1lc3NhZ2UtLS0tLQ0KRnJvbTogTXlrbGVi
dXN0LCBUcm9uZCBbbWFpbHRvOlRyb25kLk15a2xlYnVzdEBuZXRhcHAuY29tXSANClNlbnQ6IFR1
ZXNkYXksIEFwcmlsIDE3LCAyMDEyIDEwOjA5IEFNDQpUbzogUGV0ZXIgU3RhdWJhY2gNCkNjOiBK
ZWZmIExheXRvbjsgQmVybmQgU2NodWJlcnQ7IE1hbGFoYWwgTmFpbmVuaTsgbGludXgtbmZzQHZn
ZXIua2VybmVsLm9yZzsgbGludXgtZnNkZXZlbEB2Z2VyLmtlcm5lbC5vcmc7IGxpbnV4LWtlcm5l
bEB2Z2VyLmtlcm5lbC5vcmc7IG1pa2xvc0BzemVyZWRpLmh1OyB2aXJvQFplbklWLmxpbnV4Lm9y
Zy51azsgaGNoQGluZnJhZGVhZC5vcmc7IG1pY2hhZWwuYnJhbnRsZXlAZGVzaGF3LmNvbTsgc3Zl
bi5icmV1bmVyQGl0d20uZnJhdW5ob2Zlci5kZQ0KU3ViamVjdDogUkU6IFtQQVRDSCBSRkNdIHZm
czogbWFrZSBmc3RhdGF0IHJldHJ5IG9uIEVTVEFMRSBlcnJvcnMgZnJvbSBnZXRhdHRyIGNhbGwN
Cg0KT24gVHVlLCAyMDEyLTA0LTE3IGF0IDA5OjM5IC0wNDAwLCBQZXRlciBTdGF1YmFjaCB3cm90
ZToNCj4gSGkuDQo+IA0KPiBUaGVyZSBkb2VzIG5lZWQgdG8gYmUgc3VwcG9ydCBpbiB0aGUgbG9v
a3VwIHBhdGggdG8gaGFuZGxlIHN0YWxlIGRpcmVjdG9yaWVzLiAgRHVyaW5nIGEgbXVsdGktY29t
cG9uZW50IGxvb2t1cCwgaXQgaXMgcG9zc2libGUgZm9yIG9uZSBjb21wb25lbnQgdG8gYmUgbG9v
a2VkIHVwLCBidXQgdGhlbiBiZWNvbWUgc3RhbGUgYmVmb3JlIGJlaW5nIHVzZWQuICBUaGUgc3Vw
cG9ydCBuZWVkcyB0byBnbyBpbnRvIG1vcmUgdGhhbiBqdXN0IHRoZSBzZXQgb2Ygc3lzdGVtIGNh
bGxzLCBidXQgYWxzbyBpbnRvIHRoZSBsb29rdXAgY29kZS4NCj4gDQo+IEkgc3VzcGVjdCB0aGF0
IHNpbmNlIHRoZSAiIiBjYXNlIGlzIGhhbmRsZWQgc3BlY2lhbGx5IGFueXdheSwgY29kZSBjb3Vs
ZCBiZSBhZGRlZCB0byBkbyBzb21ldGhpbmcgbGlrZSBpc3N1ZSBhIEdFVEFUVFIgdG8gdmVyaWZ5
IHRoYXQgdGhlIGRpcmVjdG9yeSBpcyBzdGlsbCB2YWxpZC4gIFNpbmNlICIiIGlzIHByb2JhYmx5
IHVzZWQgbWluaW1hbGx5LCB0aGUgcGVyZm9ybWFuY2UgaW1wYWN0IHRvIHRoZSBzeXN0ZW0gc2hv
dWxkIGJlIGFsc28gYmUgbWluaW1hbC4NCg0KVGhlIHByb2JsZW0gd2l0aCB0aGF0IGlzIHRoYXQg
ZXZlbiBpZiB5b3VyIGN1cnJlbnQgZGlyZWN0b3J5IGlzIHN0YWxlLCBhIHBhdGggb2YgdGhlIGZv
cm0gIi4uLy4uLy4uL2JsYWgiIGNhbiBzdGlsbCBiZSB2YWxpZC4gSSBzdXNwZWN0IHRoYXQgYW55
IGNvZGUgdGhhdCB0cmllcyB0byBsb29wIGZvcmV2ZXIgY2FuIHF1aWNrbHkgZ2V0IHZlcnkgY29t
cGxpY2F0ZWQgYXMgd2Ugc3RhcnQgYWRkaW5nIGFsbCB0aGVzZSBjb3JuZXIgY2FzZXMuDQoNClRo
ZSB0aGluZyBhYm91dCBzdGF0KCkgaXMgdGhhdCBpbiA5OS45OSUgb2YgY2FzZXMgd2hlbiB0aGUg
cGF0aCByZXZhbGlkYXRpb24gY29kZSB3b3JrcywgdGhlbiB0aGUgTkZTIGNsaWVudCB3b24ndCBo
YXZlIHRvIGFjdHVhbGx5IGlzc3VlIGEgR0VUQVRUUiBpbiB0aGUgY2FsbCB0byB2ZnNfZ2V0c3Rh
dCgpIHNpbmNlIHRoZSBMT09LVVAgb2YgdGhhdCBmaW5hbCBwYXRoIGNvbXBvbmVudCB3aWxsIHJl
dmFsaWRhdGUgdGhvc2UgYXR0cmlidXRlcyBhbnl3YXkuDQoNClRoZSBvbmx5IGV4Y2VwdGlvbiB0
byB0aGF0IHJ1bGUgd2lsbCBiZSB0aGUgJ25vYWMnIG1vdW50cywgd2hpY2ggcmVhbGx5IGNhbiBl
bmQgdXAgbG9vcGluZyBmb3JldmVyLg0KDQpDaGVlcnMNCiAgVHJvbmQNCg0KPiAJCXBzDQo+IA0K
PiANCj4gLS0tLS1PcmlnaW5hbCBNZXNzYWdlLS0tLS0NCj4gRnJvbTogSmVmZiBMYXl0b24gW21h
aWx0bzpqbGF5dG9uQHJlZGhhdC5jb21dDQo+IFNlbnQ6IE1vbmRheSwgQXByaWwgMTYsIDIwMTIg
NzowNiBQTQ0KPiBUbzogTXlrbGVidXN0LCBUcm9uZA0KPiBDYzogQmVybmQgU2NodWJlcnQ7IE1h
bGFoYWwgTmFpbmVuaTsgbGludXgtbmZzQHZnZXIua2VybmVsLm9yZzsgDQo+IGxpbnV4LWZzZGV2
ZWxAdmdlci5rZXJuZWwub3JnOyBsaW51eC1rZXJuZWxAdmdlci5rZXJuZWwub3JnOyBQZXRlciAN
Cj4gU3RhdWJhY2g7IG1pa2xvc0BzemVyZWRpLmh1OyB2aXJvQFplbklWLmxpbnV4Lm9yZy51azsg
DQo+IGhjaEBpbmZyYWRlYWQub3JnOyBtaWNoYWVsLmJyYW50bGV5QGRlc2hhdy5jb207IA0KPiBz
dmVuLmJyZXVuZXJAaXR3bS5mcmF1bmhvZmVyLmRlDQo+IFN1YmplY3Q6IFJlOiBbUEFUQ0ggUkZD
XSB2ZnM6IG1ha2UgZnN0YXRhdCByZXRyeSBvbiBFU1RBTEUgZXJyb3JzIGZyb20gDQo+IGdldGF0
dHIgY2FsbA0KPiANCj4gT24gTW9uLCAxNiBBcHIgMjAxMiAyMDoyNTowNiArMDAwMA0KPiAiTXlr
bGVidXN0LCBUcm9uZCIgPFRyb25kLk15a2xlYnVzdEBuZXRhcHAuY29tPiB3cm90ZToNCj4gDQo+
ID4gT24gTW9uLCAyMDEyLTA0LTE2IGF0IDE1OjQzIC0wNDAwLCBKZWZmIExheXRvbiB3cm90ZToN
Cj4gPiA+IE9uIE1vbiwgMTYgQXByIDIwMTIgMTk6MzM6MDUgKzAwMDAgIk15a2xlYnVzdCwgVHJv
bmQiIA0KPiA+ID4gPFRyb25kLk15a2xlYnVzdEBuZXRhcHAuY29tPiB3cm90ZToNCj4gPiA+IA0K
PiA+ID4gPiBPbiBNb24sIDIwMTItMDQtMTYgYXQgMTM6NDYgLTA0MDAsIEplZmYgTGF5dG9uIHdy
b3RlOg0KPiA+ID4gPiA+IFRoZSBxdWVzdGlvbiBhYm91dCBsb29waW5nIGluZGVmaW5pdGVseSBy
ZWFsbHkgY29tZXMgZG93biB0bzoNCj4gPiA+ID4gPiANCj4gPiA+ID4gPiAxKSBpcyBhIHBlcnNp
c3RlbnQgRVNUQUxFIGluIGNvbmp1bmN0aW9uIHdpdGggYSBzdWNjZXNzZnVsIA0KPiA+ID4gPiA+
IGxvb2t1cCBhIHNpdHVhdGlvbiB0aGF0IHdlIGV4cGVjdCB0byBiZSB0ZW1wb3JhcnkuIGkuZS4g
d2lsbCANCj4gPiA+ID4gPiB0aGUgYWRtaW4gYXQgc29tZSBwb2ludCBiZSBhYmxlIHRvIGRvIHNv
bWV0aGluZyBhYm91dCBpdD8gSWYgDQo+ID4gPiA+ID4gbm90LCB0aGVuIHRoZXJlJ3Mgbm8gcG9p
bnQgaW4gY29udGludWluZyB0byByZXRyeS4gQWdhaW4sIHRoaXMgDQo+ID4gPiA+ID4gaXMgYSBz
aXR1YXRpb24gdGhhdCAqcmVhbGx5KiBzaG91bGQgbm90IGhhcHBlbiBpZiB0aGUgZmlsZXN5c3Rl
bSBpcyBkb2luZyB0aGUgcmlnaHQgdGhpbmcuDQo+ID4gPiA+ID4gDQo+ID4gPiA+ID4gMikgSWYg
dGhlIGFkbWluIGNhbid0IGRvIGFueXRoaW5nIGFib3V0IGl0LCBpcyBpdCByZWFzb25hYmxlIA0K
PiA+ID4gPiA+IHRvIGV4cGVjdCB0aGF0IHVzZXJzIGNhbiBzZW5kIGEgZmF0YWwgc2lnbmFsIHRv
IGh1bmcgDQo+ID4gPiA+ID4gYXBwbGljYXRpb25zIGlmIHRoaXMgc2l0dWF0aW9uIG9jY3Vycy4N
Cj4gPiA+ID4gPiANCj4gPiA+ID4gPiBXZSBleHBlY3QgdGhhdCB0aGF0J3Mgb2sgaW4gb3RoZXIg
c2l0dWF0aW9ucyB0byByZXNvbHZlIGh1bmcgDQo+ID4gPiA+ID4gYXBwbGljYXRpb25zLCBzbyBJ
J20gbm90IHN1cmUgSSB1bmRlcnN0YW5kIHdoeSBpdCB3b3VsZG4ndCBiZSANCj4gPiA+ID4gPiBh
Y2NlcHRhYmxlIGhlcmUuLi4NCj4gPiA+ID4gDQo+ID4gPiA+IFRoZXJlIGFyZSBkZWZpbml0ZWx5
IHBvdGVudGlhbGx5IHBlcnNpc3RlbnQgcGF0aG9sb2dpY2FsIA0KPiA+ID4gPiBzaXR1YXRpb25z
IHRoYXQgdGhlIGZpbGVzeXN0ZW0gY2FuJ3QgZG8gYW55dGhpbmcgYWJvdXQuIElmIHRoZSANCj4g
PiA+ID4gcG9pbnQgb2Ygb3JpZ2luIGZvciB5b3VyIHBhdGhuYW1lIChmb3IgaW5zdGFuY2UgeW91
ciBjdXJyZW50IA0KPiA+ID4gPiBkaXJlY3RvcnkgaW4gdGhlIGNhc2Ugb2YgYSByZWxhdGl2ZQ0K
PiA+ID4gPiBwYXRobmFtZSkgaXMgc3RhbGUsIHRoZW4gbm8gYW1vdW50IG9mIGxvb3BpbmcgaXMg
Z29pbmcgdG8gaGVscCANCj4gPiA+ID4geW91IHRvIHJlY292ZXIuDQo+ID4gPiA+IA0KPiA+ID4g
DQo+ID4gPiBPayAtLSBQZXRlciBwcmV0dHkgbXVjaCBzYWlkIHNvbWV0aGluZyBzaW1pbGFyLiBS
ZXRyeWluZyANCj4gPiA+IGluZGVmbml0ZWx5IHdoZW4gdGhlIGxvb2t1cCByZXR1cm5zIEVTVEFM
RSBwcm9iYWJseSB3b24ndCBoZWxwLiANCj4gPiA+IEknbSBvayB3aXRoIGJhc2ljYWxseSBsZXR0
aW5nIHRoZSBWRlMgY29udGludWUgdG8gZG8gd2hhdCBpdCBkb2VzIHRoZXJlIGFscmVhZHkuDQo+
ID4gPiBJZiBpdCBnZXRzIGFuIEVTVEFMRSwgaXQgdHJpZXMgYWdhaW4gd2l0aCBMT09LVVBfUkVW
QUwgc2V0IGFuZCANCj4gPiA+IHRoZW4gZ2l2ZXMgdXAgaWYgdGhhdCBkb2Vzbid0IHdvcmsuDQo+
ID4gPiANCj4gPiA+IElmIGhvd2V2ZXIsIHRoZSBvcGVyYXRpb24gaXRzZWxmIGtlZXBzIHJldHVy
bmluZyBFU1RBTEUsIGFyZSB3ZSBPSyANCj4gPiA+IHRvIHJldHJ5IGluZGVmaW5pdGVseSBhc3N1
bWluZyB0aGF0IHdlJ2xsIGJyZWFrIG91dCBvZiB0aGUgbG9vcCBvbiANCj4gPiA+IGZhdGFsIHNp
Z25hbHM/DQo+ID4gPg0KPiA+ID4gRm9yIGV4YW1wbGUsIHNvbWV0aGluZyBsaWtlIHRoZSB2MiBw
YXRjaCBJIHNlbnQgYSBsaXR0bGUgd2hpbGUgYWdvPw0KPiA+IA0KPiA+IA0KPiA+IFdvbid0IHNv
bWV0aGluZyBsaWtlIGZzdGF0YXQoQVRfRkRDV0QsICIiLCAmc3RhdCwgQVRfRU1QVFlfUEFUSCkg
DQo+ID4gcmlzayBsb29waW5nIGZvcmV2ZXIgdGhlcmUsIG9yIGFtIEkgbWlzc2luZyBzb21ldGhp
bmc/DQo+ID4gDQo+IA0KPiBUbyBtYWtlIHN1cmUgSSB1bmRlcnN0YW5kLCB0aGF0IHNob3VsZCBi
ZSAic2hvcnRjdXQiIGZvciBhIGxvb2t1cCBvZiB0aGUgY3dkPw0KPiANCj4gU28gSSBndWVzcyB0
aGUgY29uY2VybiBpcyB0aGF0IHlvdSdkIGRvIHRoZSBhYm92ZSBhbmQgZ2V0IGEgc3VjY2Vzc2Z1
bCBsb29rdXAgc2luY2UgeW91J3JlIGp1c3QgZ29pbmcgdG8gZ2V0IGJhY2sgdGhlIGN3ZC4gQXQg
dGhhdCBwb2ludCwgeW91J2QgYXR0ZW1wdCB0aGUgZ2V0YXR0ciBhbmQgZ2V0IEVTVEFMRSBiYWNr
LiBUaGVuLCB5b3UnZCByZWRvIHRoZSBsb29rdXAgd2l0aCBMT09LVVBfUkVWQUwgc2V0IC0tIGJ1
dCBzaW5jZSB3ZSdyZSBvcGVyYXRpbmcgb24gdGhlIGN3ZCwgd2UgZG9uJ3QgaGF2ZSBhIHdheSB0
byByZWRvIHRoZSBsb29rdXAgc2luY2Ugd2UgZG9uJ3QgaGF2ZSBhIHBhdGhuYW1lIHRoYXQgd2Ug
Y2FuIGxvb2sgdXAgYWdhaW4uLi4NCj4gDQo+IFNvIHllYWgsIEkgZ3Vlc3MgaWYgeW91J3JlIHNp
dHRpbmcgaW4gYSBzdGFsZSBkaXJlY3RvcnksIHNvbWV0aGluZyBsaWtlIHRoYXQgY291bGQgbG9v
cCBldGVybmFsbHkuDQo+IA0KPiBEbyB5b3UgdGhpbmsgdGhlIHByb3Bvc2VkIGNoZWNrIGZvciBm
YXRhbF9zaWduYWxfcGVuZGluZyBpcyBlbm91Z2ggdG8gbWl0aWdhdGUgc3VjaCBhIHByb2JsZW0/
IE9yIGRvIHdlIG5lZWQgdG8gbGltaXQgdGhlIG51bWJlciBvZiByZXRyaWVzIHRvIGFkZHJlc3Mg
dGhvc2Ugc29ydHMgb2YgbG9vcHM/DQo+IA0KPiAtLQ0KPiBKZWZmIExheXRvbiA8amxheXRvbkBy
ZWRoYXQuY29tPg0KDQotLQ0KVHJvbmQgTXlrbGVidXN0DQpMaW51eCBORlMgY2xpZW50IG1haW50
YWluZXINCg0KTmV0QXBwDQpUcm9uZC5NeWtsZWJ1c3RAbmV0YXBwLmNvbQ0Kd3d3Lm5ldGFwcC5j
b20NCg0K

2012-04-23 15:03:16

by Chuck Lever III

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call


On Apr 23, 2012, at 10:51 AM, Miklos Szeredi wrote:

> "J. Bruce Fields" <[email protected]> writes:
>
>>
>> I also wonder whether it would be making too many assumptions about the
>> server or filesystem: just because ordinary posix interfaces don't allow
>> atomic replacement of a whole directory tree doesn't mean the server
>> might not have some way to do it.
>
> Exactly because posix limits the atomic replacement to empty directories
> is that this feature is not useful and is why linux can get away with
> the dead directory behavior in this case. And thinking about fixing
> this in NFS is completely pointless since no one will rely on the atomic
> replacement behavior. Fixing local filesystems is also pointless for
> the same reason.
>
> Atomic replacement of whole directory trees would indeed be more useful,
> but it's highly unlikely to be used anywhere since applications relying
> on this feature would be limited to special filesystems that allow this.

The cases I can think of have to do with file system restore, file system and block device snapshots, and so on. This type of use case may not practical on today's Linux server, but they are a reality for anyone using high-end NFS storage.

> So my statement is "ENOENT is equivalent to ESTALE if already retrying
> path lookup with LOOKUP_REVAL on any operation that takes an parent
> directory and a name (lookup, create, link, unlink, symlink, mkdir,
> rmdir, mknod, rename)."
>
> This equivalence is in the sense that it doesn't change behavior
> compared to local filesystems.
>
> For other operations ENOENT is not equivalent to ESTALE.
>
> Thanks,
> Miklos
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com





2012-04-16 19:43:14

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Mon, 16 Apr 2012 19:33:05 +0000
"Myklebust, Trond" <[email protected]> wrote:

> On Mon, 2012-04-16 at 13:46 -0400, Jeff Layton wrote:
> > The question about looping indefinitely really comes down to:
> >
> > 1) is a persistent ESTALE in conjunction with a successful lookup a
> > situation that we expect to be temporary. i.e. will the admin at some
> > point be able to do something about it? If not, then there's no point
> > in continuing to retry. Again, this is a situation that *really* should
> > not happen if the filesystem is doing the right thing.
> >
> > 2) If the admin can't do anything about it, is it reasonable to expect
> > that users can send a fatal signal to hung applications if this
> > situation occurs.
> >
> > We expect that that's ok in other situations to resolve hung
> > applications, so I'm not sure I understand why it wouldn't be
> > acceptable here...
>
> There are definitely potentially persistent pathological situations that
> the filesystem can't do anything about. If the point of origin for your
> pathname (for instance your current directory in the case of a relative
> pathname) is stale, then no amount of looping is going to help you to
> recover.
>

Ok -- Peter pretty much said something similar. Retrying indefnitely
when the lookup returns ESTALE probably won't help. I'm ok with
basically letting the VFS continue to do what it does there already. If
it gets an ESTALE, it tries again with LOOKUP_REVAL set and then gives
up if that doesn't work.

If however, the operation itself keeps returning ESTALE, are we OK to
retry indefinitely assuming that we'll break out of the loop on fatal
signals?

For example, something like the v2 patch I sent a little while ago?

--
Jeff Layton <[email protected]>

2012-04-16 23:05:42

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Mon, 16 Apr 2012 20:25:06 +0000
"Myklebust, Trond" <[email protected]> wrote:

> On Mon, 2012-04-16 at 15:43 -0400, Jeff Layton wrote:
> > On Mon, 16 Apr 2012 19:33:05 +0000
> > "Myklebust, Trond" <[email protected]> wrote:
> >
> > > On Mon, 2012-04-16 at 13:46 -0400, Jeff Layton wrote:
> > > > The question about looping indefinitely really comes down to:
> > > >
> > > > 1) is a persistent ESTALE in conjunction with a successful lookup a
> > > > situation that we expect to be temporary. i.e. will the admin at some
> > > > point be able to do something about it? If not, then there's no point
> > > > in continuing to retry. Again, this is a situation that *really* should
> > > > not happen if the filesystem is doing the right thing.
> > > >
> > > > 2) If the admin can't do anything about it, is it reasonable to expect
> > > > that users can send a fatal signal to hung applications if this
> > > > situation occurs.
> > > >
> > > > We expect that that's ok in other situations to resolve hung
> > > > applications, so I'm not sure I understand why it wouldn't be
> > > > acceptable here...
> > >
> > > There are definitely potentially persistent pathological situations that
> > > the filesystem can't do anything about. If the point of origin for your
> > > pathname (for instance your current directory in the case of a relative
> > > pathname) is stale, then no amount of looping is going to help you to
> > > recover.
> > >
> >
> > Ok -- Peter pretty much said something similar. Retrying indefnitely
> > when the lookup returns ESTALE probably won't help. I'm ok with
> > basically letting the VFS continue to do what it does there already. If
> > it gets an ESTALE, it tries again with LOOKUP_REVAL set and then gives
> > up if that doesn't work.
> >
> > If however, the operation itself keeps returning ESTALE, are we OK to
> > retry indefinitely assuming that we'll break out of the loop on fatal
> > signals?
> >
> > For example, something like the v2 patch I sent a little while ago?
>
>
> Won't something like fstatat(AT_FDCWD, "", &stat, AT_EMPTY_PATH) risk
> looping forever there, or am I missing something?
>

To make sure I understand, that should be "shortcut" for a lookup of the
cwd?

So I guess the concern is that you'd do the above and get a successful
lookup since you're just going to get back the cwd. At that point,
you'd attempt the getattr and get ESTALE back. Then, you'd redo the
lookup with LOOKUP_REVAL set -- but since we're operating on the
cwd, we don't have a way to redo the lookup since we don't have a
pathname that we can look up again...

So yeah, I guess if you're sitting in a stale directory, something like
that could loop eternally.

Do you think the proposed check for fatal_signal_pending is enough to
mitigate such a problem? Or do we need to limit the number of retries
to address those sorts of loops?

--
Jeff Layton <[email protected]>

2012-04-17 15:49:55

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

Jeff Layton <[email protected]> writes:

> On Tue, 17 Apr 2012 16:27:16 +0200
> Miklos Szeredi <[email protected]> wrote:
>
>> Steve Dickson <[email protected]> writes:
>>
>> > True, but even so... Giving file systems an opt-out option with the
>> > default being out, maybe still have some merit... Making file systems
>> > enable this new type of functionality would cut down on any of the
>> > "surprise" that might occur with this redo ;-)
>>
>> I've been arguing for something slightly different for quite some time:
>> I never liked errno values which have side effects in the kernel yet
>> might be visible to userspace.
>>
>> So why not introduce ERETRYSTALE, a *kernel internal* errno value that
>> userspace will never see and filesystems never accidentally set. The
>> VFS can turn this into ESTALE if it doesn't retry for some reason
>> (e.g. already retried).
>>
>
> That's possible but it's certainly a lot more invasive. It's also far
> more difficult for filesystems to opt-in to this sort of behavior.

As a first step all that is needed to opt in is to convert -ESTALE to
-ESTALERETRY on those operations that allow retrying. Which is lookup,
at the moment.

If we decide to make more ops retryable (I'm not convinced that's the
best approach, but it might be) then those can be converted without too
much pain.

Thanks,
Miklos


diff --git a/fs/namei.c b/fs/namei.c
index 0062dd1..66b7d06 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1776,9 +1776,14 @@ static int do_path_lookup(int dfd, const char *name,
int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
if (unlikely(retval == -ECHILD))
retval = path_lookupat(dfd, name, flags, nd);
- if (unlikely(retval == -ESTALE))
+ if (unlikely(retval == -ERETRYLOOKUP)) {
retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);

+ /* retry only once */
+ if (retval == -ERETRYLOOKUP)
+ retval = -ESTALE;
+ }
+
if (likely(!retval)) {
if (unlikely(!audit_dummy_context())) {
if (nd->path.dentry && nd->inode)
@@ -2433,8 +2438,12 @@ struct file *do_filp_open(int dfd, const char *pathname,
filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
if (unlikely(filp == ERR_PTR(-ECHILD)))
filp = path_openat(dfd, pathname, &nd, op, flags);
- if (unlikely(filp == ERR_PTR(-ESTALE)))
+ if (unlikely(filp == ERR_PTR(-ERETRYLOOKUP))) {
filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
+ if (filp == ERR_PTR(-ERETRYLOOKUP))
+ filp = ERR_PTR(-ESTALE);
+ }
+
return filp;
}

@@ -2455,8 +2464,11 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
if (unlikely(file == ERR_PTR(-ECHILD)))
file = path_openat(-1, name, &nd, op, flags);
- if (unlikely(file == ERR_PTR(-ESTALE)))
+ if (unlikely(file == ERR_PTR(-ESTALE))) {
file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
+ if (file == ERR_PTR(-ERETRYLOOKUP))
+ file = ERR_PTR(-ESTALE);
+ }
return file;
}

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4aaf031..cd35da3 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1303,6 +1303,8 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
if (error == -ENOENT)
goto no_entry;
if (error < 0) {
+ if (error == -ESTALE)
+ error = -ERETRYLOOKUP;
res = ERR_PTR(error);
goto out_unblock_sillyrename;
}
diff --git a/include/linux/errno.h b/include/linux/errno.h
index 2d09bfa..cfaedb8 100644
--- a/include/linux/errno.h
+++ b/include/linux/errno.h
@@ -17,6 +17,7 @@
#define ENOIOCTLCMD 515 /* No ioctl command */
#define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */
#define EPROBE_DEFER 517 /* Driver requests probe retry */
+#define ERETRYLOOKUP 519 /* Retry lookup */

/* Defined for the NFSv3 protocol */
#define EBADHANDLE 521 /* Illegal NFS file handle */




2012-04-17 16:03:50

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Tue, 17 Apr 2012 17:50:06 +0200
Miklos Szeredi <[email protected]> wrote:

> Jeff Layton <[email protected]> writes:
>
> > On Tue, 17 Apr 2012 16:27:16 +0200
> > Miklos Szeredi <[email protected]> wrote:
> >
> >> Steve Dickson <[email protected]> writes:
> >>
> >> > True, but even so... Giving file systems an opt-out option with the
> >> > default being out, maybe still have some merit... Making file systems
> >> > enable this new type of functionality would cut down on any of the
> >> > "surprise" that might occur with this redo ;-)
> >>
> >> I've been arguing for something slightly different for quite some time:
> >> I never liked errno values which have side effects in the kernel yet
> >> might be visible to userspace.
> >>
> >> So why not introduce ERETRYSTALE, a *kernel internal* errno value that
> >> userspace will never see and filesystems never accidentally set. The
> >> VFS can turn this into ESTALE if it doesn't retry for some reason
> >> (e.g. already retried).
> >>
> >
> > That's possible but it's certainly a lot more invasive. It's also far
> > more difficult for filesystems to opt-in to this sort of behavior.
>
> As a first step all that is needed to opt in is to convert -ESTALE to
> -ESTALERETRY on those operations that allow retrying. Which is lookup,
> at the moment.
>
> If we decide to make more ops retryable (I'm not convinced that's the
> best approach, but it might be) then those can be converted without too
> much pain.
>

We will need to make more operations retryable. The main problem is
that inodes can go stale between the lookup and the actual operation
that we want the syscall to do. Mostly, that happens when a lookup
is satisfied out of the dcache, but it can happen even after an
on-the-wire lookup too.

Just having an opt-in to to retry when the lookup fails won't
materially help that situation.

The more I look at it, the less fond I am of this new error code. The
patch below just increases the complexity of the lookup codepath by
introducing ERETRYLOOKUP, which basically means the same thing as
ESTALE in most situations. It doesn't really add much benefit that I
can see.

Again, what's the problem with just basing this decision on the existing
ESTALE return? Very few filesystems return ESTALE at all, and most of them
just do it in their export-ops codepaths which won't be involved here.
Do we really need to jump through these sorts of hoops in order to
avoid a single retry in an ESTALE codepath?

>
> diff --git a/fs/namei.c b/fs/namei.c
> index 0062dd1..66b7d06 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -1776,9 +1776,14 @@ static int do_path_lookup(int dfd, const char *name,
> int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
> if (unlikely(retval == -ECHILD))
> retval = path_lookupat(dfd, name, flags, nd);
> - if (unlikely(retval == -ESTALE))
> + if (unlikely(retval == -ERETRYLOOKUP)) {
> retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
>
> + /* retry only once */
> + if (retval == -ERETRYLOOKUP)
> + retval = -ESTALE;
> + }
> +
> if (likely(!retval)) {
> if (unlikely(!audit_dummy_context())) {
> if (nd->path.dentry && nd->inode)
> @@ -2433,8 +2438,12 @@ struct file *do_filp_open(int dfd, const char *pathname,
> filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
> if (unlikely(filp == ERR_PTR(-ECHILD)))
> filp = path_openat(dfd, pathname, &nd, op, flags);
> - if (unlikely(filp == ERR_PTR(-ESTALE)))
> + if (unlikely(filp == ERR_PTR(-ERETRYLOOKUP))) {
> filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
> + if (filp == ERR_PTR(-ERETRYLOOKUP))
> + filp = ERR_PTR(-ESTALE);
> + }
> +
> return filp;
> }
>
> @@ -2455,8 +2464,11 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
> file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
> if (unlikely(file == ERR_PTR(-ECHILD)))
> file = path_openat(-1, name, &nd, op, flags);
> - if (unlikely(file == ERR_PTR(-ESTALE)))
> + if (unlikely(file == ERR_PTR(-ESTALE))) {
> file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
> + if (file == ERR_PTR(-ERETRYLOOKUP))
> + file = ERR_PTR(-ESTALE);
> + }
> return file;
> }
>
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index 4aaf031..cd35da3 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -1303,6 +1303,8 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
> if (error == -ENOENT)
> goto no_entry;
> if (error < 0) {
> + if (error == -ESTALE)
> + error = -ERETRYLOOKUP;
> res = ERR_PTR(error);
> goto out_unblock_sillyrename;
> }
> diff --git a/include/linux/errno.h b/include/linux/errno.h
> index 2d09bfa..cfaedb8 100644
> --- a/include/linux/errno.h
> +++ b/include/linux/errno.h
> @@ -17,6 +17,7 @@
> #define ENOIOCTLCMD 515 /* No ioctl command */
> #define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */
> #define EPROBE_DEFER 517 /* Driver requests probe retry */
> +#define ERETRYLOOKUP 519 /* Retry lookup */
>
> /* Defined for the NFSv3 protocol */
> #define EBADHANDLE 521 /* Illegal NFS file handle */
>
>
>


--
Jeff Layton <[email protected]>

2012-04-23 12:00:52

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Sun, 22 Apr 2012 07:40:57 +0200
Miklos Szeredi <[email protected]> wrote:

> On Fri, Apr 20, 2012 at 11:13 PM, Jeff Layton <[email protected]> wrote:
> > On Fri, 20 Apr 2012 15:37:26 -0500
> > Malahal Naineni <[email protected]> wrote:
> >
> >> Steve Dickson [[email protected]] wrote:
> >> > > 2) if we assume that it is fairly representative of one, how can we
> >> > > achieve retrying indefinitely with NFS, or at least some large finite
> >> > > amount?
> >> > The amount of looping would be peer speculation. If the problem can
> >> > not be handled by one simple retry I would say we simply pass the
> >> > error up to the app... Its an application issue...
> >>
> >> As someone said, ESTALE is an incorrect errno for a path based call.
> >> How about turning ESTALE into ENOENT after a retry or few retries?
> >>
> >
> > It's not really the same thing. One could envision an application
> > that's repeatedly renaming a new file on top of another one. The file
> > is never missing from the namespace of the server, but you could still
> > end up getting an ESTALE.
> >
> > That would break other atomicity guarantees in an even worse way, IMO...
>
> For directory operations ESTALE *is* equivalent to ENOENT if already
> retrying with LOOKUP_REVAL. Think about it. Atomic replacement by
> another directory with rename(2) is not an excuse here actually.
> Local filesystems too can end up with IS_DEAD directory after lookup
> in that case.
>

Doesn't that violate POSIX? rename(2) is supposed to be atomic, and I
can't see where there's any exception for that for directories.

Seems like it ought to be possible to eliminate that race for other
filesystems as well, by turning those into an ESTALE return and
retrying again.

> For non directories we basically have getattr and setattr. NFSv4 can
> handle both without retries if we supply the name instead of the
> handle (i.e. i_op->getattr_by_name, i_op->setattr_by_name). Other
> protocols can do whatever they want, exponential backoff with limited
> number of retries, whatever.
>
> No looping required in the VFS.
>

Per-name operations for things like getattr and setattr would be nice.
It would also make things cleaner on CIFS since there'd be less
conversion from dentry to path.

Note that we would still need to do a lookup. We have to update the
inode attributes with these operations too. But, that would likely
avoid the ESTALE problems with NFS since we could retry within
the lower fs itself.

That said, there's more than just those two operations involved. We'd
need similar ones for other inode operations too:

readlink
permission
*xattr

...and probably others, especially if you want to allow for more
resilient create/delete in the face of a stale directory.

--
Jeff Layton <[email protected]>

2012-04-17 16:02:06

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

"J. Bruce Fields" <[email protected]> writes:

> On Tue, Apr 17, 2012 at 10:20:35AM -0400, Jeff Layton wrote:
>> Well, it's possible, but it seems pathological to me for a server to do
>> that...
>>
>> Bruce and I were discussing this the other day. It would be good to add
>> something like this to the RFCs:
>>
>> "On a PUTFH, a server SHOULD hold a reference to the filehandle such
>
> For "filehandle" I'd specify "current and saved filehandle".
>
>> that it does not go stale over the life of the compound."
>
> And that's *much* less of a burden on the server than requiring that the
> compound execute atomically.
>
>> ...or something along those lines. That's a different matter though and
>> not directly related to this. :)
>
> Yes.

It's only related, because it proves that it's theoretically possible to
deal with this problem without introducing infinite retries. And that
applies to all operations, not just getattr.

As for atomicity, the VFS doesn't have any atomicity guarantees here
either. So for example getattr("foo") may end up with st_nlink == 0
with a concurrent rename("bar", "foo"). Whether this is permitted by
the standards is another matter, but it's not something that appears to
bother anybody.

Thanks,
Miklos

2012-04-16 16:03:45

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Mon, 16 Apr 2012 08:54:02 -0400
Peter Staubach <[email protected]> wrote:

> There seems to be a lot of, perhaps, misinformation here.
>
> The looping occurs when the file system on the server returns a valid file handle during a pathname traversal and then returns ESTALE on a subsequent operation.
>
> The client should retry the pathname traversal, which should either return a valid file handle or ENOENT. If a subsequent operation returns ESTALE, then start over again at the original pathname traversal.
>
> The client should not loop when 1) there is a signal pending which would cause the system call to terminate with EINTR or when it can't recover from the ESTALE return. For example, if lookup("./foo") returns ESTALE, then clearly the current directory has become stale and there is no way for the client to recover.

I think it's reasonable to make the syscall "wrapper" break out of the
loop if there's a _fatal_ signal pending. At that point, we don't
necessarily care what the return was since the program is going to be
dying soon anyway.

Some filesystems (e.g. NFS) would already return a different error code
in that situation, but dealing with it here seems like a reasonable way
to mitigate problems from filesystems that do not deal with signals
themselves.

>One could make the argument that the client can recover by relooking up the current directory, which is fine, but eventually if the file handle for the root of the mounted file system is also stale, then there is no further recovery possible, at least for NFSv[23] and perhaps even for NFSv4 depending upon how the file system was mounted.

If it gets an ESTALE on the initial lookup, then the VFS will already
attempt to retry the lookup again with LOOKUP_REVAL set. IIUC, that
makes it revalidate all the way back up to the root. It currently does
this regardless of whether LOOKUP_REVAL was set on the initial lookup
attempt.

It sounds here like you're suggesting that we should just go ahead and
give up and return ESTALE to userspace if the root of the mount went
stale. So, if someone mistakenly unexports the fs on the server, these
operations would still fail with ESTALE.

If so, I'm OK with that since I'm not necessarily interested in making
that situation more recoverable. It would be a nice to have, but it's
not as essential as fixing these other situations where the ESTALE is
more easily recovered.

--
Jeff Layton <[email protected]>

2012-04-23 19:02:23

by Steve Dickson

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call



On 04/23/2012 11:32 AM, Jeff Layton wrote:
> On Mon, 23 Apr 2012 10:55:24 -0400
> Steve Dickson <[email protected]> wrote:
>
>>
>>
>> On 04/20/2012 05:13 PM, Jeff Layton wrote:
>>> On Fri, 20 Apr 2012 16:18:37 -0400
>>> Steve Dickson <[email protected]> wrote:
>>>
>>>> On 04/20/2012 10:40 AM, Jeff Layton wrote:
>>>>> I guess the questions at this point is:
>>>>>
>>>>> 1) How representative is Peter's mkdir_test() of a real-world workload?
>>>> Reading your email I had to wonder the same thing... What application
>>>> removes hierarchy of directories in a loop from two different clients?
>>>> I would suspect not many, if any... esp over NFS...
>>>>
>>>
>>> Peter's test just happens to demonstrate the problem well, but one
>>> could envision someone removing a heirarchy of directories on the
>>> server while we're trying to do other operations in it. At that point,
>>> we can easily end up hitting an ESTALE twice while doing the lookup and
>>> returning ESTALE back to userspace.
>> Just curious, what happens when you run Peter's mkdir_test() on a
>> local file system? Any errors returned?
>>
>> I would think removing hierarchy of directories while they are being
>> accessed has to even cause local fs some type of havoc
>>
>
> Peter's test only treats an ESTALE error as a failure since it was
> specifically designed to ensure that those didn't make it in to
> userspace.
>
> If you run 2 copies on the same local fs and strace it, then you'll see
> the syscalls get back things like ENOENT or EEXIST as they step on each
> others' toes in the mkdir()/rmdir() calls.
I figured as much... I just don't see any real world applications remove
directory hierarchies without some type of synchronization locking...

>
>>>
>>>>>
>>>>> 2) if we assume that it is fairly representative of one, how can we
>>>>> achieve retrying indefinitely with NFS, or at least some large finite
>>>>> amount?
>>>> The amount of looping would be peer speculation. If the problem can
>>>> not be handled by one simple retry I would say we simply pass the
>>>> error up to the app... Its an application issue...
>>>>
>>>
>>> It's not an application issue. The application just asked the kernel
>>> to do an operation on a pathname. The only reason you're getting an
>>> ESTALE back in this situation is a shortcoming of the implementation.
>>>
>>> We passed it a pathname after all, not a filehandle. ESTALE really has
>>> no place as a return code in that situation...
>> We'll have to agree to disagree... I think any application that is
>> removing hierarchies of file and directory w/out taking any
>> precautionary locking is a shortcoming of the application
>> implementation.
>>
>
> I'm not saying they should never get an error in that situation. I'm
> just saying that an ESTALE return in this situation is wrong (or at
> least not helpful) since the syscall was provided a pathname not a
> filehandle or open fd or anything. When we still have the pathname,
> then we have the ability to reattempt on an ESTALE, and it would be
> preferable to do so.
Point. But if the reestablishment can not be done in one try, the
I say we punt...

>
>>>
>>>>>
>>>>> I have my doubts as to whether it would really be as big a problem for
>>>>> other filesystems as Miklos and others have asserted, but I'll take
>>>>> their word for it at the moment. What's the best way to contain this
>>>>> behavior to just those filesystems that want to retry indefinitely when
>>>>> they get an ESTALE? Would we need to go with an entirely new
>>>>> ESTALERETRY after all?
>>>>>
>>>> Introducing a new errno to handle this problem would be overkill IMHO...
>>>>
>>>> If we have to go to the looping approach, I would strong suggest we
>>>> make the file systems register for this type of behavior...
>>>>
>>>
>>> Returning ESTALERETRY would be registering for it in a way and it is
>>> somewhat cleaner than having to go all the way back up to the fstype to
>>> figure out whether you want to retry it or not.
>> How would legacy apps handle this new errno, esp if they have logic
>> to take care of ESTALE errors?
>>
>
> Userspace should never see that error.
Why do you say this? ESTALE the errno has been around forever...
Its defined in the errno man page "ESTALE - Stale file handle (POSIX.1)"

> The idea is that this would be a
> kernel-internal error code that indicates to the VFS that it should
> retry the lookup and operation. If the kernel decides to give up after
> the FS returns ESTALERETRY, then we'd have to convert that error
> into ESTALE.
Yeah... I understand the idea... I just don't think another error
code is needed to handle this problem...

>
> It'd be preferable to me if we didn't require a new error code, but if
> different filesystems require different semantics from the VFS on an
> ESTALE return, then that is one way to achieve it.
>
Well I thought the use of the fs_flags to register for this type
of semantics was a good one...

steved.

2012-04-15 19:27:21

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Sun, Apr 15, 2012 at 09:03:23PM +0200, Bernd Schubert wrote:
> On 04/13/2012 05:42 PM, Jeff Layton wrote:
> > (note: please don't trim the CC list!)
> >
> > Indefinitely does make some sense (as Peter articulated in his original
> > set). It's possible you could race several times in a row, or a server
> > misconfiguration or something has happened and you have a transient
> > error that will eventually recover. His assertion was that any limit on
> > the number of retries is by definition wrong. For NFS, a fatal signal
> > ought to interrupt things as well, so retrying indefinitely has some
> > appeal there.
> >
> > OTOH, we do have to contend with filesystems that might return ESTALE
> > persistently for other reasons and that might not respond to signals.
> > Miklos pointed out that some FUSE fs' do this in his review of Peter's
> > set.
> >
> > As a purely defensive coding measure, limiting the number of retries to
> > something finite makes sense. If we're going to do that though, I'd
> > probably recommend that we set the number of retries be something
> > higher just so that this is more resilient in the face of multiple
> > races. Those other fs' might "spin" a bit in that case but it is an
> > error condition and IMO resiliency trumps performance -- at least in
> this case.
>
> I am definitely voting against an infinite number of retries. I'm
> working on FhGFS, which supports distributed meta data servers. So when
> a file is moved around between directories, its file handle, which
> contains the meta-data target id might become invalid. As NFSv3 is
> stateless we cannot inform the client about that and must return ESTALE
> then.

Note we're not talking about retrying the operation that returned ESTALE
with the same filehandle--probably any server would return ESTALE again
in that case.

We're talking about re-looking up the path (in the case where we're
implementing a system call that takes a path as an argument), and then
retrying the operation with the newly looked-up filehandle.

--b.

> NFSv4 is better, but I'm not sure how well invalidating a file
> handle works. So retrying once on ESTALE might be a good idea, but
> retrying forever is not.
> Also, what about asymmetric HA servers? I believe to remember that also
> resulted in ESTALE. So for example server1 exports /home and /scratch,
> but on failure server2 can only take over /home and denies access to
> /scratch.
>
>
> Thanks,
> Bernd
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2012-04-17 14:20:09

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Tue, 17 Apr 2012 14:04:34 +0000
"Myklebust, Trond" <[email protected]> wrote:

> On Tue, 2012-04-17 at 09:32 -0400, Jeff Layton wrote:
> > On Tue, 17 Apr 2012 15:12:20 +0200
> > Miklos Szeredi <[email protected]> wrote:
>
> > To do that would require protocol support that we simply don't have. We
> > don't have a way to (for instance) say via NFS "give me the attributes
> > for this filename". Well, at least not for NFSv3...
>
> What's wrong with LOOKUP?
>

Some of this is due to my decision to use fstatat as an example, but we
should bear in mind that I've just been using that as an example. We'll
eventually have to do something similar for all path based syscalls in
order to fix tihs comprehensively.

While it may be possible to handle stat type calls atomically with a
simple LOOKUP call, other operations may require more than one round
trip. Consider chmod(), for instance...

> > With v4 you could theoretically construct a compound that does that,
> > but you'd have to assume that the server won't release the reference to
> > the inode midway through the compound. That's a reasonably safe
> > assumption.
>
> Actually, NFSv4 is the one that has the problem: there are no atomicity
> guarantees within compounds, so you could theoretically get an ESTALE in
> the GETATTR part of our lookup compound.
>

Well, it's possible, but it seems pathological to me for a server to do
that...

Bruce and I were discussing this the other day. It would be good to add
something like this to the RFCs:

"On a PUTFH, a server SHOULD hold a reference to the filehandle such
that it does not go stale over the life of the compound."

...or something along those lines. That's a different matter though and
not directly related to this. :)

--
Jeff Layton <[email protected]>

2012-04-15 19:58:00

by Chuck Lever III

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call


On Apr 15, 2012, at 3:03 PM, Bernd Schubert wrote:

> On 04/13/2012 05:42 PM, Jeff Layton wrote:
>> (note: please don't trim the CC list!)
>>
>> Indefinitely does make some sense (as Peter articulated in his original
>> set). It's possible you could race several times in a row, or a server
>> misconfiguration or something has happened and you have a transient
>> error that will eventually recover. His assertion was that any limit on
>> the number of retries is by definition wrong. For NFS, a fatal signal
>> ought to interrupt things as well, so retrying indefinitely has some
>> appeal there.
>>
>> OTOH, we do have to contend with filesystems that might return ESTALE
>> persistently for other reasons and that might not respond to signals.
>> Miklos pointed out that some FUSE fs' do this in his review of Peter's
>> set.
>>
>> As a purely defensive coding measure, limiting the number of retries to
>> something finite makes sense. If we're going to do that though, I'd
>> probably recommend that we set the number of retries be something
>> higher just so that this is more resilient in the face of multiple
>> races. Those other fs' might "spin" a bit in that case but it is an
>> error condition and IMO resiliency trumps performance -- at least in
> this case.
>
> I am definitely voting against an infinite number of retries. I'm
> working on FhGFS, which supports distributed meta data servers. So when
> a file is moved around between directories, its file handle, which
> contains the meta-data target id might become invalid.

Doesn't Jeff's recovery mechanism resolve this situation? The client does a fresh lookup, so shouldn't it get the new FH at that point? If there is no possible way for the client to discover the new FH, then ESTALE recovery should probably be finite.

> As NFSv3 is
> stateless we cannot inform the client about that and must return ESTALE
> then. NFSv4 is better, but I'm not sure how well invalidating a file
> handle works. So retrying once on ESTALE might be a good idea, but
> retrying forever is not.
> Also, what about asymmetric HA servers? I believe to remember that also
> resulted in ESTALE. So for example server1 exports /home and /scratch,
> but on failure server2 can only take over /home and denies access to
> /scratch.

Retrying forever is bad only if we think there are cases where there is no possible recovery action for the client, or the ESTALE signals a condition that is not temporary.

It is temporary, for instance, when an administrator takes an exported volume offline; at some point, that volume will be brought back online. Maybe it is better generally for the client to retry indefinitely instead of causing applications to fail.

Retrying forever is exactly what we do for "hard" NFS mounts, for example, and for many types of NFSv4 state recovery. Philosophically, how is this situation different?

It would be reasonable, at least, to insert a backoff delay in the retry logic.

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com





2012-04-23 13:34:18

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Mon, Apr 23, 2012 at 09:12:55AM -0400, Jeff Layton wrote:
> On Mon, 23 Apr 2012 09:00:09 -0400
> "J. Bruce Fields" <[email protected]> wrote:
>
> > On Mon, Apr 23, 2012 at 08:00:12AM -0400, Jeff Layton wrote:
> > > On Sun, 22 Apr 2012 07:40:57 +0200
> > > Miklos Szeredi <[email protected]> wrote:
> > >
> > > > On Fri, Apr 20, 2012 at 11:13 PM, Jeff Layton <[email protected]> wrote:
> > > > > On Fri, 20 Apr 2012 15:37:26 -0500
> > > > > Malahal Naineni <[email protected]> wrote:
> > > > >
> > > > >> Steve Dickson [[email protected]] wrote:
> > > > >> > > 2) if we assume that it is fairly representative of one, how can we
> > > > >> > > achieve retrying indefinitely with NFS, or at least some large finite
> > > > >> > > amount?
> > > > >> > The amount of looping would be peer speculation. If the problem can
> > > > >> > not be handled by one simple retry I would say we simply pass the
> > > > >> > error up to the app... Its an application issue...
> > > > >>
> > > > >> As someone said, ESTALE is an incorrect errno for a path based call.
> > > > >> How about turning ESTALE into ENOENT after a retry or few retries?
> > > > >>
> > > > >
> > > > > It's not really the same thing. One could envision an application
> > > > > that's repeatedly renaming a new file on top of another one. The file
> > > > > is never missing from the namespace of the server, but you could still
> > > > > end up getting an ESTALE.
> > > > >
> > > > > That would break other atomicity guarantees in an even worse way, IMO...
> > > >
> > > > For directory operations ESTALE *is* equivalent to ENOENT if already
> > > > retrying with LOOKUP_REVAL. Think about it. Atomic replacement by
> > > > another directory with rename(2) is not an excuse here actually.
> > > > Local filesystems too can end up with IS_DEAD directory after lookup
> > > > in that case.
> > > >
> > >
> > > Doesn't that violate POSIX? rename(2) is supposed to be atomic, and I
> > > can't see where there's any exception for that for directories.
> >
> > Hm, but that only allows atomic replacement of the last component of a
> > path.
> >
> > Suppose you're looking up a path, you've so far reached intermediate
> > directory "D", and the next step of the lookup (of some entry in D)
> > returns ESTALE. Then either:
> >
> > - D has since been unlinked, and ENOENT is obviously right.
> > - D was unlinked and then replaced by something else, in which
> > case there was still a moment when ENOENT was correct.
> > - D was replaced atomically by a rename. But for the rename to
> > work it must have been replacing an empty directory, so there
> > was still a moment when ENOENT would have been correct.
>
> I don't think so...D should always exist in the namespace, so ENOENT
> would not be correct.

The operation above is a lookup in D, not a lookup of D.

> Just because it was empty doesn't mean that it
> didn't exist...
>
> > (Exception: if D was actually a regular file or some other
> > non-directory object, then ENOTDIR would be the right error:
> > but if you're able to get at least object type atomically with
> > a lookup, then you should have noticed this already on lookup
> > of D.)
> >
> > I think that's what Miklos meant?
> >
> > --b.
>
> Here's an example -- suppose we have two directories: /foo
> and /bar. /bar is empty. We call:
>
> rename("/foo","/bar");
>
> ...and at the same time, someone is calling:
>
> stat("/bar");
>
> ...the calls race and in this condition the stat() gets ESTALE back
> -- /bar got replaced after we did the lookup.
>
> According to POSIX, the name "/bar" should never be absent from the
> namespace in this situation, so I'm not sure I understand why returning
> ENOENT here would be acceptable.

Yes, agreed, my assertion was just that an ESTALE on a lookup of a
non-final component is probably equivalent to ENOENT.

I'm not sure if that's what Miklos meant.

--b.

2012-04-13 15:43:21

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Fri, 13 Apr 2012 10:05:18 -0500
Malahal Naineni <[email protected]> wrote:

> Jeff Layton [[email protected]] wrote:
> > 1) should we retry these calls on all filesystems, or attempt to have
> > them "opt-in" in some fashion? This patch adds a flag for that, but
> > we could just treat all filesystems the same way.
>
> I don't know any cases where a retry on ESTALE would hurt. I would say
> retry on all file systems the same way.
>
> > 2) How many times should we retry on an ESTALE error? Once?
> > Indefinitely? Some amount in between? Retrying once would probably
> > fix the bulk of the real world problems with this, but there will
> > still be cases where that's not sufficient.
>
> As you say 1 retry should work in most cases. Indefinitely doesn't make
> sense, I would rather let my application fail! How about 3 retries (3 is
> a nice number! :-) )
>

(note: please don't trim the CC list!)

Indefinitely does make some sense (as Peter articulated in his original
set). It's possible you could race several times in a row, or a server
misconfiguration or something has happened and you have a transient
error that will eventually recover. His assertion was that any limit on
the number of retries is by definition wrong. For NFS, a fatal signal
ought to interrupt things as well, so retrying indefinitely has some
appeal there.

OTOH, we do have to contend with filesystems that might return ESTALE
persistently for other reasons and that might not respond to signals.
Miklos pointed out that some FUSE fs' do this in his review of Peter's
set.

As a purely defensive coding measure, limiting the number of retries to
something finite makes sense. If we're going to do that though, I'd
probably recommend that we set the number of retries be something
higher just so that this is more resilient in the face of multiple
races. Those other fs' might "spin" a bit in that case but it is an
error condition and IMO resiliency trumps performance -- at least in
this case.

Of course, if we're going to do this for all fs', then we probably
ought to try to handle ESTALEs that are encountered in the pathwalking
code in a similar way. That may mean changing do_path_lookup and
do_filp_open_* to reattempt several times on an ESTALE error.

--
Jeff Layton <[email protected]>

2012-04-17 14:08:59

by Myklebust, Trond

[permalink] [raw]
Subject: RE: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

T24gVHVlLCAyMDEyLTA0LTE3IGF0IDA5OjM5IC0wNDAwLCBQZXRlciBTdGF1YmFjaCB3cm90ZToN
Cj4gSGkuDQo+IA0KPiBUaGVyZSBkb2VzIG5lZWQgdG8gYmUgc3VwcG9ydCBpbiB0aGUgbG9va3Vw
IHBhdGggdG8gaGFuZGxlIHN0YWxlIGRpcmVjdG9yaWVzLiAgRHVyaW5nIGEgbXVsdGktY29tcG9u
ZW50IGxvb2t1cCwgaXQgaXMgcG9zc2libGUgZm9yIG9uZSBjb21wb25lbnQgdG8gYmUgbG9va2Vk
IHVwLCBidXQgdGhlbiBiZWNvbWUgc3RhbGUgYmVmb3JlIGJlaW5nIHVzZWQuICBUaGUgc3VwcG9y
dCBuZWVkcyB0byBnbyBpbnRvIG1vcmUgdGhhbiBqdXN0IHRoZSBzZXQgb2Ygc3lzdGVtIGNhbGxz
LCBidXQgYWxzbyBpbnRvIHRoZSBsb29rdXAgY29kZS4NCj4gDQo+IEkgc3VzcGVjdCB0aGF0IHNp
bmNlIHRoZSAiIiBjYXNlIGlzIGhhbmRsZWQgc3BlY2lhbGx5IGFueXdheSwgY29kZSBjb3VsZCBi
ZSBhZGRlZCB0byBkbyBzb21ldGhpbmcgbGlrZSBpc3N1ZSBhIEdFVEFUVFIgdG8gdmVyaWZ5IHRo
YXQgdGhlIGRpcmVjdG9yeSBpcyBzdGlsbCB2YWxpZC4gIFNpbmNlICIiIGlzIHByb2JhYmx5IHVz
ZWQgbWluaW1hbGx5LCB0aGUgcGVyZm9ybWFuY2UgaW1wYWN0IHRvIHRoZSBzeXN0ZW0gc2hvdWxk
IGJlIGFsc28gYmUgbWluaW1hbC4NCg0KVGhlIHByb2JsZW0gd2l0aCB0aGF0IGlzIHRoYXQgZXZl
biBpZiB5b3VyIGN1cnJlbnQgZGlyZWN0b3J5IGlzIHN0YWxlLCBhDQpwYXRoIG9mIHRoZSBmb3Jt
ICIuLi8uLi8uLi9ibGFoIiBjYW4gc3RpbGwgYmUgdmFsaWQuIEkgc3VzcGVjdCB0aGF0IGFueQ0K
Y29kZSB0aGF0IHRyaWVzIHRvIGxvb3AgZm9yZXZlciBjYW4gcXVpY2tseSBnZXQgdmVyeSBjb21w
bGljYXRlZCBhcyB3ZQ0Kc3RhcnQgYWRkaW5nIGFsbCB0aGVzZSBjb3JuZXIgY2FzZXMuDQoNClRo
ZSB0aGluZyBhYm91dCBzdGF0KCkgaXMgdGhhdCBpbiA5OS45OSUgb2YgY2FzZXMgd2hlbiB0aGUg
cGF0aA0KcmV2YWxpZGF0aW9uIGNvZGUgd29ya3MsIHRoZW4gdGhlIE5GUyBjbGllbnQgd29uJ3Qg
aGF2ZSB0byBhY3R1YWxseQ0KaXNzdWUgYSBHRVRBVFRSIGluIHRoZSBjYWxsIHRvIHZmc19nZXRz
dGF0KCkgc2luY2UgdGhlIExPT0tVUCBvZiB0aGF0DQpmaW5hbCBwYXRoIGNvbXBvbmVudCB3aWxs
IHJldmFsaWRhdGUgdGhvc2UgYXR0cmlidXRlcyBhbnl3YXkuDQoNClRoZSBvbmx5IGV4Y2VwdGlv
biB0byB0aGF0IHJ1bGUgd2lsbCBiZSB0aGUgJ25vYWMnIG1vdW50cywgd2hpY2ggcmVhbGx5DQpj
YW4gZW5kIHVwIGxvb3BpbmcgZm9yZXZlci4NCg0KQ2hlZXJzDQogIFRyb25kDQoNCj4gCQlwcw0K
PiANCj4gDQo+IC0tLS0tT3JpZ2luYWwgTWVzc2FnZS0tLS0tDQo+IEZyb206IEplZmYgTGF5dG9u
IFttYWlsdG86amxheXRvbkByZWRoYXQuY29tXSANCj4gU2VudDogTW9uZGF5LCBBcHJpbCAxNiwg
MjAxMiA3OjA2IFBNDQo+IFRvOiBNeWtsZWJ1c3QsIFRyb25kDQo+IENjOiBCZXJuZCBTY2h1YmVy
dDsgTWFsYWhhbCBOYWluZW5pOyBsaW51eC1uZnNAdmdlci5rZXJuZWwub3JnOyBsaW51eC1mc2Rl
dmVsQHZnZXIua2VybmVsLm9yZzsgbGludXgta2VybmVsQHZnZXIua2VybmVsLm9yZzsgUGV0ZXIg
U3RhdWJhY2g7IG1pa2xvc0BzemVyZWRpLmh1OyB2aXJvQFplbklWLmxpbnV4Lm9yZy51azsgaGNo
QGluZnJhZGVhZC5vcmc7IG1pY2hhZWwuYnJhbnRsZXlAZGVzaGF3LmNvbTsgc3Zlbi5icmV1bmVy
QGl0d20uZnJhdW5ob2Zlci5kZQ0KPiBTdWJqZWN0OiBSZTogW1BBVENIIFJGQ10gdmZzOiBtYWtl
IGZzdGF0YXQgcmV0cnkgb24gRVNUQUxFIGVycm9ycyBmcm9tIGdldGF0dHIgY2FsbA0KPiANCj4g
T24gTW9uLCAxNiBBcHIgMjAxMiAyMDoyNTowNiArMDAwMA0KPiAiTXlrbGVidXN0LCBUcm9uZCIg
PFRyb25kLk15a2xlYnVzdEBuZXRhcHAuY29tPiB3cm90ZToNCj4gDQo+ID4gT24gTW9uLCAyMDEy
LTA0LTE2IGF0IDE1OjQzIC0wNDAwLCBKZWZmIExheXRvbiB3cm90ZToNCj4gPiA+IE9uIE1vbiwg
MTYgQXByIDIwMTIgMTk6MzM6MDUgKzAwMDANCj4gPiA+ICJNeWtsZWJ1c3QsIFRyb25kIiA8VHJv
bmQuTXlrbGVidXN0QG5ldGFwcC5jb20+IHdyb3RlOg0KPiA+ID4gDQo+ID4gPiA+IE9uIE1vbiwg
MjAxMi0wNC0xNiBhdCAxMzo0NiAtMDQwMCwgSmVmZiBMYXl0b24gd3JvdGU6DQo+ID4gPiA+ID4g
VGhlIHF1ZXN0aW9uIGFib3V0IGxvb3BpbmcgaW5kZWZpbml0ZWx5IHJlYWxseSBjb21lcyBkb3du
IHRvOg0KPiA+ID4gPiA+IA0KPiA+ID4gPiA+IDEpIGlzIGEgcGVyc2lzdGVudCBFU1RBTEUgaW4g
Y29uanVuY3Rpb24gd2l0aCBhIHN1Y2Nlc3NmdWwgDQo+ID4gPiA+ID4gbG9va3VwIGEgc2l0dWF0
aW9uIHRoYXQgd2UgZXhwZWN0IHRvIGJlIHRlbXBvcmFyeS4gaS5lLiB3aWxsIHRoZSANCj4gPiA+
ID4gPiBhZG1pbiBhdCBzb21lIHBvaW50IGJlIGFibGUgdG8gZG8gc29tZXRoaW5nIGFib3V0IGl0
PyBJZiBub3QsIA0KPiA+ID4gPiA+IHRoZW4gdGhlcmUncyBubyBwb2ludCBpbiBjb250aW51aW5n
IHRvIHJldHJ5LiBBZ2FpbiwgdGhpcyBpcyBhIA0KPiA+ID4gPiA+IHNpdHVhdGlvbiB0aGF0ICpy
ZWFsbHkqIHNob3VsZCBub3QgaGFwcGVuIGlmIHRoZSBmaWxlc3lzdGVtIGlzIGRvaW5nIHRoZSBy
aWdodCB0aGluZy4NCj4gPiA+ID4gPiANCj4gPiA+ID4gPiAyKSBJZiB0aGUgYWRtaW4gY2FuJ3Qg
ZG8gYW55dGhpbmcgYWJvdXQgaXQsIGlzIGl0IHJlYXNvbmFibGUgdG8gDQo+ID4gPiA+ID4gZXhw
ZWN0IHRoYXQgdXNlcnMgY2FuIHNlbmQgYSBmYXRhbCBzaWduYWwgdG8gaHVuZyBhcHBsaWNhdGlv
bnMgDQo+ID4gPiA+ID4gaWYgdGhpcyBzaXR1YXRpb24gb2NjdXJzLg0KPiA+ID4gPiA+IA0KPiA+
ID4gPiA+IFdlIGV4cGVjdCB0aGF0IHRoYXQncyBvayBpbiBvdGhlciBzaXR1YXRpb25zIHRvIHJl
c29sdmUgaHVuZyANCj4gPiA+ID4gPiBhcHBsaWNhdGlvbnMsIHNvIEknbSBub3Qgc3VyZSBJIHVu
ZGVyc3RhbmQgd2h5IGl0IHdvdWxkbid0IGJlIA0KPiA+ID4gPiA+IGFjY2VwdGFibGUgaGVyZS4u
Lg0KPiA+ID4gPiANCj4gPiA+ID4gVGhlcmUgYXJlIGRlZmluaXRlbHkgcG90ZW50aWFsbHkgcGVy
c2lzdGVudCBwYXRob2xvZ2ljYWwgDQo+ID4gPiA+IHNpdHVhdGlvbnMgdGhhdCB0aGUgZmlsZXN5
c3RlbSBjYW4ndCBkbyBhbnl0aGluZyBhYm91dC4gSWYgdGhlIA0KPiA+ID4gPiBwb2ludCBvZiBv
cmlnaW4gZm9yIHlvdXIgcGF0aG5hbWUgKGZvciBpbnN0YW5jZSB5b3VyIGN1cnJlbnQgDQo+ID4g
PiA+IGRpcmVjdG9yeSBpbiB0aGUgY2FzZSBvZiBhIHJlbGF0aXZlDQo+ID4gPiA+IHBhdGhuYW1l
KSBpcyBzdGFsZSwgdGhlbiBubyBhbW91bnQgb2YgbG9vcGluZyBpcyBnb2luZyB0byBoZWxwIHlv
dSANCj4gPiA+ID4gdG8gcmVjb3Zlci4NCj4gPiA+ID4gDQo+ID4gPiANCj4gPiA+IE9rIC0tIFBl
dGVyIHByZXR0eSBtdWNoIHNhaWQgc29tZXRoaW5nIHNpbWlsYXIuIFJldHJ5aW5nIGluZGVmbml0
ZWx5IA0KPiA+ID4gd2hlbiB0aGUgbG9va3VwIHJldHVybnMgRVNUQUxFIHByb2JhYmx5IHdvbid0
IGhlbHAuIEknbSBvayB3aXRoIA0KPiA+ID4gYmFzaWNhbGx5IGxldHRpbmcgdGhlIFZGUyBjb250
aW51ZSB0byBkbyB3aGF0IGl0IGRvZXMgdGhlcmUgYWxyZWFkeS4gDQo+ID4gPiBJZiBpdCBnZXRz
IGFuIEVTVEFMRSwgaXQgdHJpZXMgYWdhaW4gd2l0aCBMT09LVVBfUkVWQUwgc2V0IGFuZCB0aGVu
IA0KPiA+ID4gZ2l2ZXMgdXAgaWYgdGhhdCBkb2Vzbid0IHdvcmsuDQo+ID4gPiANCj4gPiA+IElm
IGhvd2V2ZXIsIHRoZSBvcGVyYXRpb24gaXRzZWxmIGtlZXBzIHJldHVybmluZyBFU1RBTEUsIGFy
ZSB3ZSBPSyANCj4gPiA+IHRvIHJldHJ5IGluZGVmaW5pdGVseSBhc3N1bWluZyB0aGF0IHdlJ2xs
IGJyZWFrIG91dCBvZiB0aGUgbG9vcCBvbiANCj4gPiA+IGZhdGFsIHNpZ25hbHM/DQo+ID4gPg0K
PiA+ID4gRm9yIGV4YW1wbGUsIHNvbWV0aGluZyBsaWtlIHRoZSB2MiBwYXRjaCBJIHNlbnQgYSBs
aXR0bGUgd2hpbGUgYWdvPw0KPiA+IA0KPiA+IA0KPiA+IFdvbid0IHNvbWV0aGluZyBsaWtlIGZz
dGF0YXQoQVRfRkRDV0QsICIiLCAmc3RhdCwgQVRfRU1QVFlfUEFUSCkgcmlzayANCj4gPiBsb29w
aW5nIGZvcmV2ZXIgdGhlcmUsIG9yIGFtIEkgbWlzc2luZyBzb21ldGhpbmc/DQo+ID4gDQo+IA0K
PiBUbyBtYWtlIHN1cmUgSSB1bmRlcnN0YW5kLCB0aGF0IHNob3VsZCBiZSAic2hvcnRjdXQiIGZv
ciBhIGxvb2t1cCBvZiB0aGUgY3dkPw0KPiANCj4gU28gSSBndWVzcyB0aGUgY29uY2VybiBpcyB0
aGF0IHlvdSdkIGRvIHRoZSBhYm92ZSBhbmQgZ2V0IGEgc3VjY2Vzc2Z1bCBsb29rdXAgc2luY2Ug
eW91J3JlIGp1c3QgZ29pbmcgdG8gZ2V0IGJhY2sgdGhlIGN3ZC4gQXQgdGhhdCBwb2ludCwgeW91
J2QgYXR0ZW1wdCB0aGUgZ2V0YXR0ciBhbmQgZ2V0IEVTVEFMRSBiYWNrLiBUaGVuLCB5b3UnZCBy
ZWRvIHRoZSBsb29rdXAgd2l0aCBMT09LVVBfUkVWQUwgc2V0IC0tIGJ1dCBzaW5jZSB3ZSdyZSBv
cGVyYXRpbmcgb24gdGhlIGN3ZCwgd2UgZG9uJ3QgaGF2ZSBhIHdheSB0byByZWRvIHRoZSBsb29r
dXAgc2luY2Ugd2UgZG9uJ3QgaGF2ZSBhIHBhdGhuYW1lIHRoYXQgd2UgY2FuIGxvb2sgdXAgYWdh
aW4uLi4NCj4gDQo+IFNvIHllYWgsIEkgZ3Vlc3MgaWYgeW91J3JlIHNpdHRpbmcgaW4gYSBzdGFs
ZSBkaXJlY3RvcnksIHNvbWV0aGluZyBsaWtlIHRoYXQgY291bGQgbG9vcCBldGVybmFsbHkuDQo+
IA0KPiBEbyB5b3UgdGhpbmsgdGhlIHByb3Bvc2VkIGNoZWNrIGZvciBmYXRhbF9zaWduYWxfcGVu
ZGluZyBpcyBlbm91Z2ggdG8gbWl0aWdhdGUgc3VjaCBhIHByb2JsZW0/IE9yIGRvIHdlIG5lZWQg
dG8gbGltaXQgdGhlIG51bWJlciBvZiByZXRyaWVzIHRvIGFkZHJlc3MgdGhvc2Ugc29ydHMgb2Yg
bG9vcHM/DQo+IA0KPiAtLQ0KPiBKZWZmIExheXRvbiA8amxheXRvbkByZWRoYXQuY29tPg0KDQot
LSANClRyb25kIE15a2xlYnVzdA0KTGludXggTkZTIGNsaWVudCBtYWludGFpbmVyDQoNCk5ldEFw
cA0KVHJvbmQuTXlrbGVidXN0QG5ldGFwcC5jb20NCnd3dy5uZXRhcHAuY29tDQoNCg==

2012-04-17 13:12:10

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

Jeff Layton <[email protected]> writes:

>>
>> Won't something like fstatat(AT_FDCWD, "", &stat, AT_EMPTY_PATH) risk
>> looping forever there, or am I missing something?
>>
>
> To make sure I understand, that should be "shortcut" for a lookup of the
> cwd?
>
> So I guess the concern is that you'd do the above and get a successful
> lookup since you're just going to get back the cwd. At that point,
> you'd attempt the getattr and get ESTALE back. Then, you'd redo the
> lookup with LOOKUP_REVAL set -- but since we're operating on the
> cwd, we don't have a way to redo the lookup since we don't have a
> pathname that we can look up again...
>
> So yeah, I guess if you're sitting in a stale directory, something like
> that could loop eternally.
>
> Do you think the proposed check for fatal_signal_pending is enough to
> mitigate such a problem? Or do we need to limit the number of retries
> to address those sorts of loops?

Lets step back a bit.

The retry is needed when when we discover during ->getattr() that the
cached lookup returned a stale file handle.

If the lookup wasn't cached or if there was no lookup at all
(stat(".") and friends) then retrying will not gain anything.

And that also means that retrying multiple times is pointless, since
after the first retry we are sure to have up-to-date attributes.

Unfortunately it's impossible for the filesystem to know whether a
->getattr (or other inode operation) was perfromed after a cached or a
non-cached lookup.

I'm not sure what the right interface for this would be. One would be
to just pass the "cached-or-not" information as a flag. That works for
getattr() but not for other operations.

Another is to introduce atomic lookup+foo variants of these operations
just like for open. E.g. the lookup+getattr is called if the cached
lookup fails or if the cached lookup succeeds and the plain ->getattr
call returns ESTALE.

Thanks,
Miklos

2012-04-16 14:44:13

by Bernd Schubert

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

>> I am definitely voting against an infinite number of retries. I'm
>> working on FhGFS, which supports distributed meta data servers. So when
>> a file is moved around between directories, its file handle, which
>> contains the meta-data target id might become invalid. As NFSv3 is
>> stateless we cannot inform the client about that and must return ESTALE
>> then. NFSv4 is better, but I'm not sure how well invalidating a file
>> handle works. So retrying once on ESTALE might be a good idea, but
>> retrying forever is not.
>
> It's important to note that I'm only proposing to wrap syscalls this
> way that take a pathname argument. We can't do anything about those
> that don't since at that point we have no way to retry the lookup.
>
> So, I'm not sure this patch would affect the case you're concerned
> about one way or another. If you move the file to a different
> directory, then it's pathname would also change, and at that point
> you'd end up with an ENOENT error or something on the next retry.
>
> If the file was open and you were (for instance) reading or writing to
> it from a client when you moved it, then we can't retry the lookup at
> that point. The open is long since done and the pathname is now gone.
> You'll get an ESTALE back in userspace regardless.

Yes, sorry, I should have read the patch and its description more carefully.

>
>> Also, what about asymmetric HA servers? I believe to remember that also
>> resulted in ESTALE. So for example server1 exports /home and /scratch,
>> but on failure server2 can only take over /home and denies access to
>> /scratch.
>>
>
> That sounds like a broken cluster configuration. Still...

Simply budget and safety. I had to do that in the past, /home was
mirrored via drbd, but /scratch was not. And although /scratch was on an
external raid system, I simply did not setup a connection to the
failover system. HA software is not reliable and extensive testing
revealed possible split brain situations with corrupting double mounts.
Nowadays there are ext4 + enabled MMP, but in 2004 there was no such
additional protection.

>
> Presumably at some point in the future, a sysadmin would intervene and
> fix the situation such that /scratch is available again. Is it better
> to return an error to the application at that point, or simply allow it
> to keep retrying until the problem has been fixed?
>
> The person with the long running job that's doing operations
> in /scratch would probably prefer the latter. If not, then they could
> always send the program a fatal signal to stop it altogether.
>

That was not a compute cluster, but the diskless desktop environment.
And things get difficult if desktop evironments start to hang. I'm not
sure if a soft mount is the solution then, as users do not like to kill
their running desktops. And with kde and gnome and their habit to
monitor everything it might not be easy to kill their proccesses
monitoring /scratch without killing the entire desktop.
But then I'm not sure if anyone still is doing async HA clusters and how
applications react to that nowadays. I just wonder if it is always a
good idea to loop forever in ESTALE in such situations.


Cheers,
Bernd

2012-04-16 11:22:39

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Sun, 15 Apr 2012 15:57:32 -0400
Chuck Lever <[email protected]> wrote:

>
> On Apr 15, 2012, at 3:03 PM, Bernd Schubert wrote:
>
> > On 04/13/2012 05:42 PM, Jeff Layton wrote:
> >> (note: please don't trim the CC list!)
> >>
> >> Indefinitely does make some sense (as Peter articulated in his original
> >> set). It's possible you could race several times in a row, or a server
> >> misconfiguration or something has happened and you have a transient
> >> error that will eventually recover. His assertion was that any limit on
> >> the number of retries is by definition wrong. For NFS, a fatal signal
> >> ought to interrupt things as well, so retrying indefinitely has some
> >> appeal there.
> >>
> >> OTOH, we do have to contend with filesystems that might return ESTALE
> >> persistently for other reasons and that might not respond to signals.
> >> Miklos pointed out that some FUSE fs' do this in his review of Peter's
> >> set.
> >>
> >> As a purely defensive coding measure, limiting the number of retries to
> >> something finite makes sense. If we're going to do that though, I'd
> >> probably recommend that we set the number of retries be something
> >> higher just so that this is more resilient in the face of multiple
> >> races. Those other fs' might "spin" a bit in that case but it is an
> >> error condition and IMO resiliency trumps performance -- at least in
> > this case.
> >
> > I am definitely voting against an infinite number of retries. I'm
> > working on FhGFS, which supports distributed meta data servers. So when
> > a file is moved around between directories, its file handle, which
> > contains the meta-data target id might become invalid.
>
> Doesn't Jeff's recovery mechanism resolve this situation? The client does a fresh lookup, so shouldn't it get the new FH at that point? If there is no possible way for the client to discover the new FH, then ESTALE recovery should probably be finite.
>
> > As NFSv3 is
> > stateless we cannot inform the client about that and must return ESTALE
> > then. NFSv4 is better, but I'm not sure how well invalidating a file
> > handle works. So retrying once on ESTALE might be a good idea, but
> > retrying forever is not.
> > Also, what about asymmetric HA servers? I believe to remember that also
> > resulted in ESTALE. So for example server1 exports /home and /scratch,
> > but on failure server2 can only take over /home and denies access to
> > /scratch.
>
> Retrying forever is bad only if we think there are cases where there is no possible recovery action for the client, or the ESTALE signals a condition that is not temporary.
>
> It is temporary, for instance, when an administrator takes an exported volume offline; at some point, that volume will be brought back online. Maybe it is better generally for the client to retry indefinitely instead of causing applications to fail.
>
> Retrying forever is exactly what we do for "hard" NFS mounts, for example, and for many types of NFSv4 state recovery. Philosophically, how is this situation different?
>
> It would be reasonable, at least, to insert a backoff delay in the retry logic.
>

Good idea. If we go with an infinite retry or a large number of
attempts, then an exponential backoff would be good. Probably not
worthwhile though if we're only going to retry a dozen times or so.

We might also want to consider having this code bail out of the loop on
fatal_signal_pending() too. That would help cover the case of
filesystems that don't handle signals appropriately themselves.

There are 3 possible "problem" situations that I can see with an
infinite retry:

1) you have a filesystem that persistently returns ESTALE on a lookup
for some reason. That situation will never resolve itself if we retry
indefinitely without outside intervention.

2) a filesystem has a successful lookup but is handing out bogus inodes
such that the operation consistently fails with ESTALE. Again, that
will probably never resolve itself, and is probably indicative that
something is broken in the fs.

3) you have a situation where the results of the lookup consistently go
stale before the actual operation, and the operation returns ESTALE.
With NFS, this could happen if you had a job on the server renaming a
new file on top of an old one very rapidly while trying to access it in
some fashion from the other client. If timed just right, this could end
up in a livelock of sorts.

To answer your question above, I don't see any major difference
philosophically between retrying on ESTALE and retrying a v4 operation
on an OLD_STATEID error or something. That said, I'm not worried about
NFS here. I'm pretty sure it can cope in some fashion with all of the
above situations.

The big questions are whether that would cause problems with other
filesystems and if so, how best would we deal with it?

--
Jeff Layton <[email protected]>

2012-04-23 13:50:55

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Mon, 23 Apr 2012 09:34:12 -0400
"J. Bruce Fields" <[email protected]> wrote:

> On Mon, Apr 23, 2012 at 09:12:55AM -0400, Jeff Layton wrote:
> > On Mon, 23 Apr 2012 09:00:09 -0400
> > "J. Bruce Fields" <[email protected]> wrote:
> >
> > > On Mon, Apr 23, 2012 at 08:00:12AM -0400, Jeff Layton wrote:
> > > > On Sun, 22 Apr 2012 07:40:57 +0200
> > > > Miklos Szeredi <[email protected]> wrote:
> > > >
> > > > > On Fri, Apr 20, 2012 at 11:13 PM, Jeff Layton <[email protected]> wrote:
> > > > > > On Fri, 20 Apr 2012 15:37:26 -0500
> > > > > > Malahal Naineni <[email protected]> wrote:
> > > > > >
> > > > > >> Steve Dickson [[email protected]] wrote:
> > > > > >> > > 2) if we assume that it is fairly representative of one, how can we
> > > > > >> > > achieve retrying indefinitely with NFS, or at least some large finite
> > > > > >> > > amount?
> > > > > >> > The amount of looping would be peer speculation. If the problem can
> > > > > >> > not be handled by one simple retry I would say we simply pass the
> > > > > >> > error up to the app... Its an application issue...
> > > > > >>
> > > > > >> As someone said, ESTALE is an incorrect errno for a path based call.
> > > > > >> How about turning ESTALE into ENOENT after a retry or few retries?
> > > > > >>
> > > > > >
> > > > > > It's not really the same thing. One could envision an application
> > > > > > that's repeatedly renaming a new file on top of another one. The file
> > > > > > is never missing from the namespace of the server, but you could still
> > > > > > end up getting an ESTALE.
> > > > > >
> > > > > > That would break other atomicity guarantees in an even worse way, IMO...
> > > > >
> > > > > For directory operations ESTALE *is* equivalent to ENOENT if already
> > > > > retrying with LOOKUP_REVAL. Think about it. Atomic replacement by
> > > > > another directory with rename(2) is not an excuse here actually.
> > > > > Local filesystems too can end up with IS_DEAD directory after lookup
> > > > > in that case.
> > > > >
> > > >
> > > > Doesn't that violate POSIX? rename(2) is supposed to be atomic, and I
> > > > can't see where there's any exception for that for directories.
> > >
> > > Hm, but that only allows atomic replacement of the last component of a
> > > path.
> > >
> > > Suppose you're looking up a path, you've so far reached intermediate
> > > directory "D", and the next step of the lookup (of some entry in D)
> > > returns ESTALE. Then either:
> > >
> > > - D has since been unlinked, and ENOENT is obviously right.
> > > - D was unlinked and then replaced by something else, in which
> > > case there was still a moment when ENOENT was correct.
> > > - D was replaced atomically by a rename. But for the rename to
> > > work it must have been replacing an empty directory, so there
> > > was still a moment when ENOENT would have been correct.
> >
> > I don't think so...D should always exist in the namespace, so ENOENT
> > would not be correct.
>
> The operation above is a lookup in D, not a lookup of D.
>
> > Just because it was empty doesn't mean that it
> > didn't exist...
> >
> > > (Exception: if D was actually a regular file or some other
> > > non-directory object, then ENOTDIR would be the right error:
> > > but if you're able to get at least object type atomically with
> > > a lookup, then you should have noticed this already on lookup
> > > of D.)
> > >
> > > I think that's what Miklos meant?
> > >
> > > --b.
> >
> > Here's an example -- suppose we have two directories: /foo
> > and /bar. /bar is empty. We call:
> >
> > rename("/foo","/bar");
> >
> > ...and at the same time, someone is calling:
> >
> > stat("/bar");
> >
> > ...the calls race and in this condition the stat() gets ESTALE back
> > -- /bar got replaced after we did the lookup.
> >
> > According to POSIX, the name "/bar" should never be absent from the
> > namespace in this situation, so I'm not sure I understand why returning
> > ENOENT here would be acceptable.
>
> Yes, agreed, my assertion was just that an ESTALE on a lookup of a
> non-final component is probably equivalent to ENOENT.
>
> I'm not sure if that's what Miklos meant.
>

Ahh ok, sorry I misunderstood. Yeah in that case I suppose it would
be ok to replace ESTALE with ENOENT. Ok, so to illustrate...

Suppose we're trying to stat("/bar/baz") instead in the above example.
Then we could just return ENOENT instead on an ESTALE return for the
reasons that Bruce outlined. If the dir was stale, then there was a
at least one point in time where we *know* that "baz" didn't exist.

That doesn't seem like it'll work as a general solution though since it
wouldn't apply to an ESTALE on the last component. For that we'd need
to do something different -- retry the operation in some form, but it
might be potential optimization in the path walking code to avoid
retrying in some cases.

--
Jeff Layton <[email protected]>

2012-04-20 21:12:44

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Fri, 20 Apr 2012 15:37:26 -0500
Malahal Naineni <[email protected]> wrote:

> Steve Dickson [[email protected]] wrote:
> > > 2) if we assume that it is fairly representative of one, how can we
> > > achieve retrying indefinitely with NFS, or at least some large finite
> > > amount?
> > The amount of looping would be peer speculation. If the problem can
> > not be handled by one simple retry I would say we simply pass the
> > error up to the app... Its an application issue...
>
> As someone said, ESTALE is an incorrect errno for a path based call.
> How about turning ESTALE into ENOENT after a retry or few retries?
>

It's not really the same thing. One could envision an application
that's repeatedly renaming a new file on top of another one. The file
is never missing from the namespace of the server, but you could still
end up getting an ESTALE.

That would break other atomicity guarantees in an even worse way, IMO...

--
Jeff Layton <[email protected]>

2012-04-16 18:11:57

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Mon, 16 Apr 2012 16:44:06 +0200
Bernd Schubert <[email protected]> wrote:

> >> I am definitely voting against an infinite number of retries. I'm
> >> working on FhGFS, which supports distributed meta data servers. So when
> >> a file is moved around between directories, its file handle, which
> >> contains the meta-data target id might become invalid. As NFSv3 is
> >> stateless we cannot inform the client about that and must return ESTALE
> >> then. NFSv4 is better, but I'm not sure how well invalidating a file
> >> handle works. So retrying once on ESTALE might be a good idea, but
> >> retrying forever is not.
> >
> > It's important to note that I'm only proposing to wrap syscalls this
> > way that take a pathname argument. We can't do anything about those
> > that don't since at that point we have no way to retry the lookup.
> >
> > So, I'm not sure this patch would affect the case you're concerned
> > about one way or another. If you move the file to a different
> > directory, then it's pathname would also change, and at that point
> > you'd end up with an ENOENT error or something on the next retry.
> >
> > If the file was open and you were (for instance) reading or writing to
> > it from a client when you moved it, then we can't retry the lookup at
> > that point. The open is long since done and the pathname is now gone.
> > You'll get an ESTALE back in userspace regardless.
>
> Yes, sorry, I should have read the patch and its description more carefully.
>
> >
> >> Also, what about asymmetric HA servers? I believe to remember that also
> >> resulted in ESTALE. So for example server1 exports /home and /scratch,
> >> but on failure server2 can only take over /home and denies access to
> >> /scratch.
> >>
> >
> > That sounds like a broken cluster configuration. Still...
>
> Simply budget and safety. I had to do that in the past, /home was
> mirrored via drbd, but /scratch was not. And although /scratch was on an
> external raid system, I simply did not setup a connection to the
> failover system. HA software is not reliable and extensive testing
> revealed possible split brain situations with corrupting double mounts.
> Nowadays there are ext4 + enabled MMP, but in 2004 there was no such
> additional protection.
>
> >
> > Presumably at some point in the future, a sysadmin would intervene and
> > fix the situation such that /scratch is available again. Is it better
> > to return an error to the application at that point, or simply allow it
> > to keep retrying until the problem has been fixed?
> >
> > The person with the long running job that's doing operations
> > in /scratch would probably prefer the latter. If not, then they could
> > always send the program a fatal signal to stop it altogether.
> >
>
> That was not a compute cluster, but the diskless desktop environment.
> And things get difficult if desktop evironments start to hang. I'm not
> sure if a soft mount is the solution then, as users do not like to kill
> their running desktops. And with kde and gnome and their habit to
> monitor everything it might not be easy to kill their proccesses
> monitoring /scratch without killing the entire desktop.
> But then I'm not sure if anyone still is doing async HA clusters and how
> applications react to that nowadays. I just wonder if it is always a
> good idea to loop forever in ESTALE in such situations.
>

NFS will generally return a different error if the process catches a
fatal signal, so a soft mount should not be necessary and is not
recommended anyway...

In any case, we loop indefinitely now in the NFS code when (for
instance) there's a loss of communication. Users are not generally
happy if that causes an error, since their applications start dying.

While an ESTALE return is different, there are some parallels.

The question about looping indefinitely really comes down to:

1) is a persistent ESTALE in conjunction with a successful lookup a
situation that we expect to be temporary. i.e. will the admin at some
point be able to do something about it? If not, then there's no point
in continuing to retry. Again, this is a situation that *really* should
not happen if the filesystem is doing the right thing.

2) If the admin can't do anything about it, is it reasonable to expect
that users can send a fatal signal to hung applications if this
situation occurs.

We expect that that's ok in other situations to resolve hung
applications, so I'm not sure I understand why it wouldn't be
acceptable here...

--
Jeff Layton <[email protected]>

2012-04-20 21:12:35

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Fri, 20 Apr 2012 16:18:37 -0400
Steve Dickson <[email protected]> wrote:

> On 04/20/2012 10:40 AM, Jeff Layton wrote:
> > I guess the questions at this point is:
> >
> > 1) How representative is Peter's mkdir_test() of a real-world workload?
> Reading your email I had to wonder the same thing... What application
> removes hierarchy of directories in a loop from two different clients?
> I would suspect not many, if any... esp over NFS...
>

Peter's test just happens to demonstrate the problem well, but one
could envision someone removing a heirarchy of directories on the
server while we're trying to do other operations in it. At that point,
we can easily end up hitting an ESTALE twice while doing the lookup and
returning ESTALE back to userspace.

> >
> > 2) if we assume that it is fairly representative of one, how can we
> > achieve retrying indefinitely with NFS, or at least some large finite
> > amount?
> The amount of looping would be peer speculation. If the problem can
> not be handled by one simple retry I would say we simply pass the
> error up to the app... Its an application issue...
>

It's not an application issue. The application just asked the kernel
to do an operation on a pathname. The only reason you're getting an
ESTALE back in this situation is a shortcoming of the implementation.

We passed it a pathname after all, not a filehandle. ESTALE really has
no place as a return code in that situation...

> >
> > I have my doubts as to whether it would really be as big a problem for
> > other filesystems as Miklos and others have asserted, but I'll take
> > their word for it at the moment. What's the best way to contain this
> > behavior to just those filesystems that want to retry indefinitely when
> > they get an ESTALE? Would we need to go with an entirely new
> > ESTALERETRY after all?
> >
> Introducing a new errno to handle this problem would be overkill IMHO...
>
> If we have to go to the looping approach, I would strong suggest we
> make the file systems register for this type of behavior...
>

Returning ESTALERETRY would be registering for it in a way and it is
somewhat cleaner than having to go all the way back up to the fstype to
figure out whether you want to retry it or not.

There's also the non-trivial matter of needing to retry during the
lookup itself. If the lookup just returns ESTALE, then you don't have
any way to know whether the underlying fs wanted you to retry it or
not...

I'm not thrilled with having to do all of this ESTALE to ESTALERETRY
conversion and back, but it does give us a way to neatly deal with and
ESTALE during the lookup.

--
Jeff Layton <[email protected]>

2012-04-17 14:22:25

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Tue, 17 Apr 2012 16:03:06 +0200
Miklos Szeredi <[email protected]> wrote:

> Jeff Layton <[email protected]> writes:
>
> >> The retry is needed when when we discover during ->getattr() that the
> >> cached lookup returned a stale file handle.
> >>
> >> If the lookup wasn't cached or if there was no lookup at all
> >> (stat(".") and friends) then retrying will not gain anything.
> >>
> >
> > That's not necessarily the case, at least not with NFS. It's easily
> > possible for you to do a full-fledged lookup over the wire, and then
> > for that inode to be removed prior to issuing a call against the FH that
> > you got back.
> >
> >> And that also means that retrying multiple times is pointless, since
> >> after the first retry we are sure to have up-to-date attributes.
> >>
> >
> > Again, it's not pointless. It's possible (though somewhat pathological)
> > for you to hit the race above more than once in the same operation.
> > Granted, it's an unlikely race but it is possible.
>
> ->lookup() should (and does AFAICS) leave a fully initialized inode with
> up-to-date atributes in there and at that moment there's not a lot of
> sense in fetching the attributes from the server *again*.
>
> How that lookup is implemented is another question, I can see in
> nfs3_proc_lookup() that it may or may not be atomic, and retrying (and
> looping) there might make sense. But it doesn't retry and doesn't loop,
> so the fact is, it will either return ESTALE (and the path lookup will
> retry once with LOOKUP_REVAL) or the inode *will* be up-to-date.
>
> So I think I'm still right that it's pointless to do retry after a
> non-cached lookup.
>

Ok, I'll grant that for stat() type calls, but I've just been focusing
on that as an example here. We'll need to do something similar for all
path based syscalls. What about things that require more than one round
trip -- e.g. chmod()?

> >> Unfortunately it's impossible for the filesystem to know whether a
> >> ->getattr (or other inode operation) was perfromed after a cached or a
> >> non-cached lookup.
> >>
> >> I'm not sure what the right interface for this would be. One would be
> >> to just pass the "cached-or-not" information as a flag. That works for
> >> getattr() but not for other operations.
> >>
> >> Another is to introduce atomic lookup+foo variants of these operations
> >> just like for open. E.g. the lookup+getattr is called if the cached
> >> lookup fails or if the cached lookup succeeds and the plain ->getattr
> >> call returns ESTALE.
> >>
> >
> > To do that would require protocol support that we simply don't have. We
> > don't have a way to (for instance) say via NFS "give me the attributes
> > for this filename". Well, at least not for NFSv3...
> >
> > With v4 you could theoretically construct a compound that does that,
> > but you'd have to assume that the server won't release the reference to
> > the inode midway through the compound. That's a reasonably safe
> > assumption.
> >
> > While it's nice to consider new atomic ops like this, it's not really
> > possible with earlier versions of NFS.
>
> It's just an interface question: where you put the retry logic for those
> non-atomic protocols. I think it's cleaner if the retry logic is in the
> filesystem and not in the VFS. Then NFS can do whatever it wants
> without having to impose policy in the generic filesystem layer.
>

So essentially you want to push all of the LOOKUP handling down into
the filesystems such that all ops are atomic_* or have an atomic_*
variant. That sounds like a very heavy-handed way to solve this.

--
Jeff Layton <[email protected]>

2012-04-20 20:37:41

by Malahal Naineni

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

Steve Dickson [[email protected]] wrote:
> > 2) if we assume that it is fairly representative of one, how can we
> > achieve retrying indefinitely with NFS, or at least some large finite
> > amount?
> The amount of looping would be peer speculation. If the problem can
> not be handled by one simple retry I would say we simply pass the
> error up to the app... Its an application issue...

As someone said, ESTALE is an incorrect errno for a path based call.
How about turning ESTALE into ENOENT after a retry or few retries?

Regards, Malahal.


2012-04-13 16:07:35

by Steve Dickson

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call



On 04/13/2012 11:42 AM, Jeff Layton wrote:
> On Fri, 13 Apr 2012 10:05:18 -0500
> Malahal Naineni <[email protected]> wrote:
>
>> Jeff Layton [[email protected]] wrote:
>>> 1) should we retry these calls on all filesystems, or attempt to have
>>> them "opt-in" in some fashion? This patch adds a flag for that, but
>>> we could just treat all filesystems the same way.
>>
>> I don't know any cases where a retry on ESTALE would hurt. I would say
>> retry on all file systems the same way.
>>
>>> 2) How many times should we retry on an ESTALE error? Once?
>>> Indefinitely? Some amount in between? Retrying once would probably
>>> fix the bulk of the real world problems with this, but there will
>>> still be cases where that's not sufficient.
>>
>> As you say 1 retry should work in most cases. Indefinitely doesn't make
>> sense, I would rather let my application fail! How about 3 retries (3 is
>> a nice number! :-) )
>>
>
> (note: please don't trim the CC list!)
>
> Indefinitely does make some sense (as Peter articulated in his original
> set). It's possible you could race several times in a row, or a server
> misconfiguration or something has happened and you have a transient
> error that will eventually recover. His assertion was that any limit on
> the number of retries is by definition wrong. For NFS, a fatal signal
> ought to interrupt things as well, so retrying indefinitely has some
> appeal there.
>
> OTOH, we do have to contend with filesystems that might return ESTALE
> persistently for other reasons and that might not respond to signals.
> Miklos pointed out that some FUSE fs' do this in his review of Peter's
> set.
>
> As a purely defensive coding measure, limiting the number of retries to
> something finite makes sense. If we're going to do that though, I'd
> probably recommend that we set the number of retries be something
> higher just so that this is more resilient in the face of multiple
> races. Those other fs' might "spin" a bit in that case but it is an
> error condition and IMO resiliency trumps performance -- at least in
> this case.
I'm of the opinion retry more than once has the potential of
doing more harm than good... Why introduce looping when there
is no solid evidence its even needed.

I would think 99% of the time the one try would solve the problem.
That 1% probably due two apps that have gone wild fight over the same
file or the FUSE case. In those cases the error should be returned
IMHO...

steved.

>
> Of course, if we're going to do this for all fs', then we probably
> ought to try to handle ESTALEs that are encountered in the pathwalking
> code in a similar way. That may mean changing do_path_lookup and
> do_filp_open_* to reattempt several times on an ESTALE error.
>

2012-04-25 12:04:34

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Wed, 25 Apr 2012 11:41:52 +0200
Miklos Szeredi <[email protected]> wrote:

> Jeff Layton <[email protected]> writes:
>
> >> And an audit would still not ensure safety against future additions of
> >> ESTALE.
> >>
> >
> > Well, nothing is safe from the future. It's incumbent upon us to review
> > patches and such before such breakage goes in.
>
> Good interfaces are safe against stupidity, and that very much applies
> to the kernel too. Review is not an excuse for bad interfaces.
>

Exactly. The current behavior/interface is that we return ESTALE to
userspace, even when we have a pathname and can retry. That's a bad
interface that certainly isn't helpful to users.

ESTALE has a specific meaning -- stale NFS filehandle. If we expand
that definition a bit, then we can rephrase that to mean that the
results of the lookup now point to something that doesn't exist.

If FUSE or other filesystems are using ESTALE to mean something else,
then they're not returning the right error code. Is there something
documented in FUSE that told them to expect some specific behavior from
an ESTALE return?

> > But, let's say for the purposes of argument that we do have a fs (FUSE
> > or otherwise) that is persistently returning ESTALE on a lookup. Why
> > was Peter's check that we were making forward progress not enough to
> > guard against this problem?
> >
> > In particular, I'm talking about the code he added to link_path_walk in
> > this patch to check that the value of nd->path.dentry was changing:
> >
> > https://lkml.org/lkml/2008/3/10/266
> >
> > It seems like that ought to be enough to alleviate your fears on this.
> > We could also check for fatal signals on each pass and that would allow
> > users to break out of the loop even when the underlying fs doesn't
> > handle signals properly.
>
> AFAICS that doesn't ensure progress, only change. It helps those cases
> which persistently return ESTALE, but not cases where there is change
> but no progress. E.g. it doesn't prevent DoS by a client doing renames
> over a directory in a tight loop.
>

Retrying indefinitely in that situation is sort of the point. What you
consider to be a DoS here could also be a completely valid use-case in
other situations. At least in the case of NFS, we have to expect "nice"
behavior out of the server. There are plenty of ways that the server
or another client can screw things up for you...

In any case, I think we can also minimize the impact on the system in
the event that we do have DoS. We can allow the user to break out of
the loop with a fatal signal. I also think we should take Chuck's
advice from earlier and add a backoff delay between retries to minimize
the impact while looping.

Here's sort of what I'm thinking. I'm still in the process of testing
this approach to make sure it does the right thing, so consider this
another RFC:

-------------------------------------------------------------------------------
vfs-add-an-retry_lookup-functi
-------------------------------------------------------------------------------
vfs: add an retry_estale function to handle retries on ESTALE

From: Jeff Layton <[email protected]>

Signed-off-by: Jeff Layton <[email protected]>
---

fs/namei.c | 32 ++++++++++++++++++++++++++++++++
include/linux/fs.h | 2 ++
2 files changed, 34 insertions(+), 0 deletions(-)


diff --git a/fs/namei.c b/fs/namei.c
index 0062dd1..0555819 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -116,6 +116,38 @@
* POSIX.1 2.4: an empty pathname is invalid (ENOENT).
* PATH_MAX includes the nul terminator --RR.
*/
+
+/**
+ * retry_estale - determine whether the caller should retry an operation
+ *
+ * @error: the error we'll be returning
+ * @try: number of retries already performed
+ *
+ * See if the error code is -ESTALE. If so, then calculate the amount of
+ * delay that we want based on the number of tries already done, and go into
+ * TASK_KILLABLE sleep for that long.
+ *
+ * Returns true if the caller should try again, false if not.
+ */
+bool
+retry_estale(const long error, const unsigned int try)
+{
+ long timeout;
+
+ if (likely(error != -ESTALE))
+ return false;
+
+ /* don't try to schedule if the timeout will be 0 */
+ if (try == 0)
+ return true;
+
+ /* arbitrarily cap backoff delay at 1s */
+ timeout = try * 2;
+ timeout = timeout > HZ ? HZ : timeout;
+
+ return !schedule_timeout_killable(timeout);
+}
+
static int do_getname(const char __user *filename, char *page)
{
int retval;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8de6755..c9b6684 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2026,6 +2026,8 @@ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
extern int filp_close(struct file *, fl_owner_t id);
extern char * getname(const char __user *);

+extern bool retry_estale(const long error, const unsigned int retry);
+
/* fs/ioctl.c */

extern int ioctl_preallocate(struct file *filp, void __user *argp);
-------------------------------------------------------------------------------
vfs-retry-lookups-indefinitely
-------------------------------------------------------------------------------
vfs: retry lookups indefinitely on an ESTALE error

From: Jeff Layton <[email protected]>

...but at the same time, attempt to ensure that we're making forward
progress by ensuring that the nd->path.dentry pointer is changing on
each attempt. If it isn't then give up and return -ENOENT instead.

Signed-off-by: Jeff Layton <[email protected]>
---

fs/namei.c | 39 ++++++++++++++++++++++++++++++++++++---
1 files changed, 36 insertions(+), 3 deletions(-)


diff --git a/fs/namei.c b/fs/namei.c
index 0555819..57822a7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1805,11 +1805,22 @@ static int path_lookupat(int dfd, const char *name,
static int do_path_lookup(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
{
+ unsigned int try = 0;
+ struct dentry *saved = NULL;
int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
if (unlikely(retval == -ECHILD))
retval = path_lookupat(dfd, name, flags, nd);
- if (unlikely(retval == -ESTALE))
+ while (unlikely(retry_estale(retval, try++))) {
+ /*
+ * Note that the dentry pointers may be bogus by now. We do
+ * not attempt to dereference them, but simply use them to
+ * ensure that we're making progress on the lookup attempt.
+ */
+ if (saved == nd->path.dentry)
+ break;
+ saved = nd->path.dentry;
retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
+ }

if (likely(!retval)) {
if (unlikely(!audit_dummy_context())) {
@@ -2459,20 +2470,33 @@ out_filp:
struct file *do_filp_open(int dfd, const char *pathname,
const struct open_flags *op, int flags)
{
+ unsigned int try = 0;
+ struct dentry *saved = NULL;
struct nameidata nd;
struct file *filp;

filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
if (unlikely(filp == ERR_PTR(-ECHILD)))
filp = path_openat(dfd, pathname, &nd, op, flags);
- if (unlikely(filp == ERR_PTR(-ESTALE)))
+ while (unlikely(retry_estale(PTR_ERR(filp), try++))) {
+ /*
+ * Note that the dentry pointers may be bogus by now. We do
+ * not attempt to dereference them, but simply use them to
+ * ensure that we're making progress on the lookup attempt.
+ */
+ if (saved == nd.path.dentry)
+ break;
+ saved = nd.path.dentry;
filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
+ }
return filp;
}

struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
const char *name, const struct open_flags *op, int flags)
{
+ unsigned int try = 0;
+ struct dentry *saved = NULL;
struct nameidata nd;
struct file *file;

@@ -2487,8 +2511,17 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
if (unlikely(file == ERR_PTR(-ECHILD)))
file = path_openat(-1, name, &nd, op, flags);
- if (unlikely(file == ERR_PTR(-ESTALE)))
+ while (unlikely(retry_estale(PTR_ERR(file), try++))) {
+ /*
+ * Note that the dentry pointers may be bogus by now. We do
+ * not attempt to dereference them, but simply use them to
+ * ensure that we're making progress on the lookup attempt.
+ */
+ if (saved == nd.path.dentry)
+ break;
+ saved = nd.path.dentry;
file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
+ }
return file;
}


2012-04-20 20:19:18

by Steve Dickson

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On 04/20/2012 10:40 AM, Jeff Layton wrote:
> I guess the questions at this point is:
>
> 1) How representative is Peter's mkdir_test() of a real-world workload?
Reading your email I had to wonder the same thing... What application
removes hierarchy of directories in a loop from two different clients?
I would suspect not many, if any... esp over NFS...

>
> 2) if we assume that it is fairly representative of one, how can we
> achieve retrying indefinitely with NFS, or at least some large finite
> amount?
The amount of looping would be peer speculation. If the problem can
not be handled by one simple retry I would say we simply pass the
error up to the app... Its an application issue...

>
> I have my doubts as to whether it would really be as big a problem for
> other filesystems as Miklos and others have asserted, but I'll take
> their word for it at the moment. What's the best way to contain this
> behavior to just those filesystems that want to retry indefinitely when
> they get an ESTALE? Would we need to go with an entirely new
> ESTALERETRY after all?
>
Introducing a new errno to handle this problem would be overkill IMHO...

If we have to go to the looping approach, I would strong suggest we
make the file systems register for this type of behavior...

steved.

2012-04-17 11:54:22

by Steve Dickson

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call



On 04/15/2012 03:57 PM, Chuck Lever wrote:
>
> On Apr 15, 2012, at 3:03 PM, Bernd Schubert wrote:
>
>> On 04/13/2012 05:42 PM, Jeff Layton wrote:
>>> (note: please don't trim the CC list!)
>>>
>>> Indefinitely does make some sense (as Peter articulated in his original
>>> set). It's possible you could race several times in a row, or a server
>>> misconfiguration or something has happened and you have a transient
>>> error that will eventually recover. His assertion was that any limit on
>>> the number of retries is by definition wrong. For NFS, a fatal signal
>>> ought to interrupt things as well, so retrying indefinitely has some
>>> appeal there.
>>>
>>> OTOH, we do have to contend with filesystems that might return ESTALE
>>> persistently for other reasons and that might not respond to signals.
>>> Miklos pointed out that some FUSE fs' do this in his review of Peter's
>>> set.
>>>
>>> As a purely defensive coding measure, limiting the number of retries to
>>> something finite makes sense. If we're going to do that though, I'd
>>> probably recommend that we set the number of retries be something
>>> higher just so that this is more resilient in the face of multiple
>>> races. Those other fs' might "spin" a bit in that case but it is an
>>> error condition and IMO resiliency trumps performance -- at least in
>> this case.
>>
>> I am definitely voting against an infinite number of retries. I'm
>> working on FhGFS, which supports distributed meta data servers. So when
>> a file is moved around between directories, its file handle, which
>> contains the meta-data target id might become invalid.
>
> Doesn't Jeff's recovery mechanism resolve this situation? The client does a fresh lookup, so shouldn't it get the new FH at that point? If there is no possible way for the client to discover the new FH, then ESTALE recovery should probably be finite.
>
>> As NFSv3 is
>> stateless we cannot inform the client about that and must return ESTALE
>> then. NFSv4 is better, but I'm not sure how well invalidating a file
>> handle works. So retrying once on ESTALE might be a good idea, but
>> retrying forever is not.
>> Also, what about asymmetric HA servers? I believe to remember that also
>> resulted in ESTALE. So for example server1 exports /home and /scratch,
>> but on failure server2 can only take over /home and denies access to
>> /scratch.
>
> Retrying forever is bad only if we think there are cases where there is no possible recovery action for the client, or the ESTALE signals a condition that is not temporary.
>
> It is temporary, for instance, when an administrator takes an exported volume offline; at some point, that volume will be brought back online. Maybe it is better generally for the client to retry indefinitely instead of causing applications to fail.
>
> Retrying forever is exactly what we do for "hard" NFS mounts, for example, and for many types of NFSv4 state recovery. Philosophically, how is this situation different?
The analogy of mount looping and ESTATE loop is a bit of a stretch IMHO...
With mount looping the server has not responded so it make sense to wait for a response (aka the first response). With ESTALE looping the server has actively remove something the client has been working with... I see those as two different events so they need to be handled differently...

>
> It would be reasonable, at least, to insert a backoff delay in the retry logic.
If looping is going to be involved in ESTALE handling then a time based loop does make more sense than a simple for loop.. IMHO... Similar to Jeff's version 2 patch...

steved.


2012-04-23 19:06:29

by Malahal Naineni

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

Steve Dickson [[email protected]] wrote:
> > Returning ESTALERETRY would be registering for it in a way and it is
> > somewhat cleaner than having to go all the way back up to the fstype to
> > figure out whether you want to retry it or not.
> How would legacy apps handle this new errno, esp if they have logic
> to take care of ESTALE errors?
>
> steved.

As I understand, there is no new errno for the apps. This new
ESTALERETRY is produced and consumed by kernel only.


2012-04-18 15:16:25

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Tue, 17 Apr 2012 10:48:42 -0400
Peter Staubach <[email protected]> wrote:

> I don't think that the code ends up being all that complex, actually. I will have to dig out the patches that I had previously to look to see what they did. I am pretty sure that they handled all of these cases. I also had some tests which exercised the modified path based system calls and at least, at one point in time, they would run without returning ESTALE to the user level.
>
> Let me see what I can find. SteveD, you wouldn't have squirrelled away a copy of my stuff from Red Hat, would you?
>
> Thanx...
>
> ps
>

Yes, your original set was very comprehensive, AFAICT. I'm not sure if
it'll apply to mainline well at this point, but I was definitely using
it as a guideline for what to do here.

The cover letter for the patchset that you sent out a few years ago had
a testcase in it. I was planning to use that to test once I start
broading this to other syscalls:

https://lkml.org/lkml/2008/3/10/267

--
Jeff Layton <[email protected]>

2012-04-17 13:39:55

by Peter Staubach

[permalink] [raw]
Subject: RE: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

Hi.

There does need to be support in the lookup path to handle stale directories. During a multi-component lookup, it is possible for one component to be looked up, but then become stale before being used. The support needs to go into more than just the set of system calls, but also into the lookup code.

I suspect that since the "" case is handled specially anyway, code could be added to do something like issue a GETATTR to verify that the directory is still valid. Since "" is probably used minimally, the performance impact to the system should be also be minimal.

ps


-----Original Message-----
From: Jeff Layton [mailto:[email protected]]
Sent: Monday, April 16, 2012 7:06 PM
To: Myklebust, Trond
Cc: Bernd Schubert; Malahal Naineni; [email protected]; [email protected]; [email protected]; Peter Staubach; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]
Subject: Re: [PATCH RFC] vfs: make fstatat retry on ESTALE errors from getattr call

On Mon, 16 Apr 2012 20:25:06 +0000
"Myklebust, Trond" <[email protected]> wrote:

> On Mon, 2012-04-16 at 15:43 -0400, Jeff Layton wrote:
> > On Mon, 16 Apr 2012 19:33:05 +0000
> > "Myklebust, Trond" <[email protected]> wrote:
> >
> > > On Mon, 2012-04-16 at 13:46 -0400, Jeff Layton wrote:
> > > > The question about looping indefinitely really comes down to:
> > > >
> > > > 1) is a persistent ESTALE in conjunction with a successful
> > > > lookup a situation that we expect to be temporary. i.e. will the
> > > > admin at some point be able to do something about it? If not,
> > > > then there's no point in continuing to retry. Again, this is a
> > > > situation that *really* should not happen if the filesystem is doing the right thing.
> > > >
> > > > 2) If the admin can't do anything about it, is it reasonable to
> > > > expect that users can send a fatal signal to hung applications
> > > > if this situation occurs.
> > > >
> > > > We expect that that's ok in other situations to resolve hung
> > > > applications, so I'm not sure I understand why it wouldn't be
> > > > acceptable here...
> > >
> > > There are definitely potentially persistent pathological
> > > situations that the filesystem can't do anything about. If the
> > > point of origin for your pathname (for instance your current
> > > directory in the case of a relative
> > > pathname) is stale, then no amount of looping is going to help you
> > > to recover.
> > >
> >
> > Ok -- Peter pretty much said something similar. Retrying indefnitely
> > when the lookup returns ESTALE probably won't help. I'm ok with
> > basically letting the VFS continue to do what it does there already.
> > If it gets an ESTALE, it tries again with LOOKUP_REVAL set and then
> > gives up if that doesn't work.
> >
> > If however, the operation itself keeps returning ESTALE, are we OK
> > to retry indefinitely assuming that we'll break out of the loop on
> > fatal signals?
> >
> > For example, something like the v2 patch I sent a little while ago?
>
>
> Won't something like fstatat(AT_FDCWD, "", &stat, AT_EMPTY_PATH) risk
> looping forever there, or am I missing something?
>

To make sure I understand, that should be "shortcut" for a lookup of the cwd?

So I guess the concern is that you'd do the above and get a successful lookup since you're just going to get back the cwd. At that point, you'd attempt the getattr and get ESTALE back. Then, you'd redo the lookup with LOOKUP_REVAL set -- but since we're operating on the cwd, we don't have a way to redo the lookup since we don't have a pathname that we can look up again...

So yeah, I guess if you're sitting in a stale directory, something like that could loop eternally.

Do you think the proposed check for fatal_signal_pending is enough to mitigate such a problem? Or do we need to limit the number of retries to address those sorts of loops?

--
Jeff Layton <[email protected]>

2012-04-24 16:35:02

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Tue, 24 Apr 2012 17:54:56 +0200
Miklos Szeredi <[email protected]> wrote:

> Jeff Layton <[email protected]> writes:
>
> > On Mon, 23 Apr 2012 16:38:00 -0400
> > Peter Staubach <[email protected]> wrote:
> >
> >> I don't really like the idea of introducing another errno as well. It seems like too much complexity and represents complexity that no one has really justified needing.
> >>
> >
> > I tend to agree here. Miklos, can you elaborate a bit on what fuse
> > filesystems you're particularly concerned about here? Which ones return
> > ESTALE and under what conditions. Maybe we can try to tailor this
> > solution to avoid the complexity without impacting them.
>
> It is not just fuse I'm concerned about. Grep for -ESTALE, there are
> about 120 hits about 20 of which come from NFS. There's no guarantee
> that any of those ESTALE errors will go away on retry, which for an
> unlimited retry means a hung OS. If you limit the number of retries
> then in the best case it's just lots of wasted CPU cycles.
>

Yes, that's one of the first things I did when I went to look at this
problem. Almost all of those are in export_operations related code and
would never be returned on a path based syscall.

Others are not in fs-related code at all (another subsystem has
repurposed the error code), or are in fs-related code but are using it
internally for other purposes (JFS seems to have some of this).

While I don't like to waste CPU cycles, this is an error condition and
I don't think we're well served in optimizing for it.

> And an audit would still not ensure safety against future additions of
> ESTALE.
>

Well, nothing is safe from the future. It's incumbent upon us to review
patches and such before such breakage goes in.

> And a simple audit won't find things like fuse, where the error comes
> from outside the kernel. Fixing that is not trivial either. Turning
> ESTALE into some other error prevents looping but breaks the return
> value.
>

Then that fs is just plain returning the wrong error, IMO. We're not
breaking any kABI guarantees with this -- they're still able to return
ESTALE, it's just that the behavior on such a return is more resilient
if we reattempt.

But, let's say for the purposes of argument that we do have a fs (FUSE
or otherwise) that is persistently returning ESTALE on a lookup. Why
was Peter's check that we were making forward progress not enough to
guard against this problem?

In particular, I'm talking about the code he added to link_path_walk in
this patch to check that the value of nd->path.dentry was changing:

https://lkml.org/lkml/2008/3/10/266

It seems like that ought to be enough to alleviate your fears on this.
We could also check for fatal signals on each pass and that would allow
users to break out of the loop even when the underlying fs doesn't
handle signals properly.

--
Jeff Layton <[email protected]>

2012-04-23 18:35:10

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Mon, 23 Apr 2012 14:06:46 -0400
Steve Dickson <[email protected]> wrote:


> >
> > It'd be preferable to me if we didn't require a new error code, but if
> > different filesystems require different semantics from the VFS on an
> > ESTALE return, then that is one way to achieve it.
> >
> Well I thought the use of the fs_flags to register for this type
> of semantics was a good one...
>

The problem with that is that you don't have a way to know whether the
fs wants you to retry or not on a lookup. When you get back an ESTALE
on a lookup, then that's all you have. You don't know what fs was
involved. The ERETRYLOOKUP return solves that case nicely.

The problem with the ERETRYLOOKUP idea in other codepaths is that we'll
have a bunch of this kind of crap all over the place in the vfs code:

return rc == -ERETRYLOOKUP ? -ESTALE : rc;

...in order to deal with those codepaths that might call down into the
fs, get that error back and then not actually be able to retry. For
instance, vfs_getattr is called from both vfs_fstat and vfs_fstatat.
The latter can and should retry on an ESTALE, but the former cannot.

That's a relatively simple case though. The real PITA will be stuff
like the inode_permission codepaths which branch out very widely. It'll
be hard to ensure that ERETRYLOOKUP doesn't leak out from there.

Another possibility is a hybrid approach. Have only the lookup codepath
return ERETRYLOOKUP to indicate that the fs wants to retry more than
once. Once you have a pointer to the sb, then you can look at the FS_*
flag and use that to indicate that you want to retry on an ESTALE
return.

--
Jeff Layton <[email protected]>

2012-04-20 14:41:29

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Wed, 18 Apr 2012 07:52:07 -0400
Jeff Layton <[email protected]> wrote:

> ESTALE errors are a source of pain for many users of NFS. Usually they
> occur when a file is removed from the server after a successful lookup
> against it.
>
> Luckily, the remedy in these cases is usually simple. We should just
> redo the lookup, forcing revalidations all the way in and then retry the
> call. We of course cannot do this for syscalls that do not involve a
> path, but for path-based syscalls we can and should attempt to recover
> from an ESTALE.
>
> This patch implements this by having the VFS reattempt the lookup (with
> LOOKUP_REVAL set) and call exactly once when it would ordinarily return
> ESTALE. This should catch the bulk of these cases under normal usage,
> without unduly inconveniencing other filesystems that return ESTALE on
> path-based syscalls.
>
> Note that it's possible to hit this race more than once, but a single
> retry should catch the bulk of these cases under normal circumstances.
>
> This patch is just an example. We'll alter most path-based syscalls in a
> similar fashion to fix this correctly. At this point, I'm just trying to
> ensure that the retry semantics are acceptable before I being that work.
>
> Does anyone have strong objections to this patch? I'm aware that the
> retry mechanism is not as robust as many (e.g. Peter) would like, but it
> should at least improve the current situation.
>
> If no one has a strong objection, then I'll start going through and
> adding similar code to the other syscalls. And we can hopefully we can
> get at least some of them in for 3.5.
>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/stat.c | 9 ++++++++-
> 1 files changed, 8 insertions(+), 1 deletions(-)
>
> diff --git a/fs/stat.c b/fs/stat.c
> index c733dc5..0ee9cb4 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -73,7 +73,8 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
> {
> struct path path;
> int error = -EINVAL;
> - int lookup_flags = 0;
> + bool retried = false;
> + unsigned int lookup_flags = 0;
>
> if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
> AT_EMPTY_PATH)) != 0)
> @@ -84,12 +85,18 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
> if (flag & AT_EMPTY_PATH)
> lookup_flags |= LOOKUP_EMPTY;
>
> +retry:
> error = user_path_at(dfd, filename, lookup_flags, &path);
> if (error)
> goto out;
>
> error = vfs_getattr(path.mnt, path.dentry, stat);
> path_put(&path);
> + if (error == -ESTALE && !retried) {
> + retried = true;
> + lookup_flags |= LOOKUP_REVAL;
> + goto retry;
> + }
> out:
> return error;
> }

Apologies for replying to myself here. Just to beat on the deceased
equine a little longer, I should note that the above approach does
*not* fix Peter's reproducer in his original email. It fails rather
quickly when run simultaneously on the client and server.

At least one of the tests in it creates and removes a hierarchy of
directories in a loop. During that, the lookup from the client can
easily fail more than once with ESTALE. Since we give up after a single
retry, that causes the call to return ESTALE.

While testing this approach with mkdir and fstatat, I modified the
patch to retry multiple times, also retry when the lookup thows ESTALE
and to throw a printk when the number of retries was > 1 with the
number of retries that the call did and the eventual error code.

With Peter's (admittedly synthetic) test, we can get an answer of sorts
to Trond's question from earlier in the thread as to how many retries
is "enough":

[ 45.023665] sys_mkdirat: retries=33 error=-2
[ 47.889300] sys_mkdirat: retries=51 error=-2
[ 49.172746] sys_mkdirat: retries=27 error=-2
[ 52.325723] sys_mkdirat: retries=10 error=-2
[ 58.082576] sys_mkdirat: retries=33 error=-2
[ 59.810461] sys_mkdirat: retries=5 error=-2
[ 63.387914] sys_mkdirat: retries=14 error=-2
[ 63.630785] sys_mkdirat: retries=4 error=-2
[ 68.268903] sys_mkdirat: retries=6 error=-2
[ 71.124173] sys_mkdirat: retries=99 error=-2
[ 75.657649] sys_mkdirat: retries=123 error=-2
[ 76.903814] sys_mkdirat: retries=26 error=-2
[ 82.009463] sys_mkdirat: retries=30 error=-2
[ 84.807731] sys_mkdirat: retries=67 error=-2
[ 89.825529] sys_mkdirat: retries=166 error=-2
[ 91.599104] sys_mkdirat: retries=8 error=-2
[ 95.621855] sys_mkdirat: retries=44 error=-2
[ 98.164588] sys_mkdirat: retries=61 error=-2
[ 99.783347] sys_mkdirat: retries=11 error=-2
[ 105.593980] sys_mkdirat: retries=104 error=-2
[ 110.348861] sys_mkdirat: retries=8 error=-2
[ 112.087966] sys_mkdirat: retries=46 error=-2
[ 117.613316] sys_mkdirat: retries=90 error=-2
[ 120.117550] sys_mkdirat: retries=2 error=-2
[ 122.624330] sys_mkdirat: retries=15 error=-2

So, now I'm having buyers remorse of sorts about proposing to just
retry once as that may not be strong enough to fix some of the cases
we're interested in.

I guess the questions at this point is:

1) How representative is Peter's mkdir_test() of a real-world workload?

2) if we assume that it is fairly representative of one, how can we
achieve retrying indefinitely with NFS, or at least some large finite
amount?

I have my doubts as to whether it would really be as big a problem for
other filesystems as Miklos and others have asserted, but I'll take
their word for it at the moment. What's the best way to contain this
behavior to just those filesystems that want to retry indefinitely when
they get an ESTALE? Would we need to go with an entirely new
ESTALERETRY after all?

--
Jeff Layton <[email protected]>

2012-04-23 19:00:07

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH RFC v3] vfs: make fstatat retry once on ESTALE errors from getattr call

On Mon, 23 Apr 2012 17:28:19 +0200
Miklos Szeredi <[email protected]> wrote:

> Jeff Layton <[email protected]> writes:
>
> >
> > Ok, but again, that only applies to the lookup. It has no bearing on
> > the subsequent operation. For instance, if we're doing:
> >
> > rename("/foo", "/bar");
> >
> > ...and another client is simultaneously doing:
> >
> > creat("/bar/baz", 0600);
> >
> > ...and we get back ESTALE from the server on the create because the
> > "old" /bar got replaced after the lookup of it. Then it seems like
> > returning -ENOENT would not be correct since there was never a time
> > where /bar didn't exist...
>
> It may not be "correct" according to some standard. But it's what Linux
> does since day one on *all* filesystems. And probably other OS's that
> have remotely scalable lookup routines.
>
> Thanks,
> Miklos

The race window with local filesystems is very small though. I'm not
sure I've ever heard of anyone hitting this on one in practice. For NFS
OTOH, it's quite large and it's easy to hit an ESTALE without even
trying very hard.

On a related note, here's a first pass at making the getattr codepaths
handle ERETRYLOOKUP. It assumes that we don't need to fix the devtmpfs
callers or block/loop.c.

The really ugly part is that we have to deal with other callers that
don't have the ability to retry. So we have to convert ERETRYLOOKUP
back to ESTALE in those callers.

Dealing with vfs_getattr is actually fairly trivial. A tougher one would
be the inode_permission codepaths. Those go all over the place and it
would be tough to ensure that we don't leak this new error code to
userspace.

Personally, I think this approach is just too ugly to live in these
other codepaths, but it seems like a reasonable approach to allow us to
retry indefinitely on an ESTALE from a lookup.

-------------------------------------------------------------------------------
vfs-add-an-retry_lookup-functi
-------------------------------------------------------------------------------
vfs: add an retry_lookup function to handle retries on ERETRYLOOKUP

From: Jeff Layton <[email protected]>

Signed-off-by: Jeff Layton <[email protected]>
---

fs/namei.c | 23 +++++++++++++++++++++++
include/linux/fs.h | 2 ++
2 files changed, 25 insertions(+), 0 deletions(-)


diff --git a/fs/namei.c b/fs/namei.c
index fef6a0d..a781b0e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -116,6 +116,29 @@
* POSIX.1 2.4: an empty pathname is invalid (ENOENT).
* PATH_MAX includes the nul terminator --RR.
*/
+
+/**
+ * retry_lookup - determine whether the caller should retry a lookup and op
+ *
+ * @error: the error we'll be returning
+ * @retry: what retry attempt we're on
+ *
+ */
+bool
+retry_lookup(const long error, const unsigned int retry)
+{
+ long timeout;
+
+ if (likely(error != -ERETRYLOOKUP))
+ return false;
+
+ /* arbitrarily cap backoff delay at 1s */
+ timeout = retry * 2;
+ timeout = timeout > HZ ? HZ : timeout;
+
+ return !schedule_timeout_killable(timeout);
+}
+
static int do_getname(const char __user *filename, char *page)
{
int retval;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8de6755..5cc03c3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2026,6 +2026,8 @@ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
extern int filp_close(struct file *, fl_owner_t id);
extern char * getname(const char __user *);

+extern bool retry_lookup(const long error, const unsigned int retry);
+
/* fs/ioctl.c */

extern int ioctl_preallocate(struct file *filp, void __user *argp);
-------------------------------------------------------------------------------
vfs-make-fstatat-retry-on-esta
-------------------------------------------------------------------------------
vfs: make fstatat retry on ESTALE errors from getattr call

From: Jeff Layton <[email protected]>

Have NFS return ERETRYLOOKUP to the vfs when it gets an ESTALE back on a
GETATTR call. The VFS will then retry the lookup and call when it sees
that error.

Unfortunately, we do need to deal with the potential for "leakage" of
this error when we can't retry, so we must check for it in *all* callers
of vfs_getattr and swap -ESTALE for it when it occurs.

Signed-off-by: Jeff Layton <[email protected]>
---

fs/ecryptfs/inode.c | 2 +-
fs/nfs/inode.c | 2 +-
fs/nfsd/nfsproc.c | 1 +
fs/stat.c | 20 ++++++++++++--------
4 files changed, 15 insertions(+), 10 deletions(-)


diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ab35b11..d4b5f15 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1068,7 +1068,7 @@ int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
generic_fillattr(dentry->d_inode, stat);
stat->blocks = lower_stat.blocks;
}
- return rc;
+ return rc == -ERETRYLOOKUP ? -ESTALE : rc;
}

int
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e8bbfa5..2916e87 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -541,7 +541,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
}
out:
- return err;
+ return err == -ESTALE ? -ERETRYLOOKUP : err;
}

static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index e15dc45..5dee967 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -742,6 +742,7 @@ nfserrno (int errno)
{ nfserr_notsupp, -EOPNOTSUPP },
{ nfserr_toosmall, -ETOOSMALL },
{ nfserr_serverfault, -ESERVERFAULT },
+ { nfserr_stale, -ERETRYLOOKUP },
};
int i;

diff --git a/fs/stat.c b/fs/stat.c
index c733dc5..fb4cf29 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -64,7 +64,7 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
fput(f);
}
- return error;
+ return error == -ERETRYLOOKUP ? -ESTALE : error;
}
EXPORT_SYMBOL(vfs_fstat);

@@ -73,7 +73,8 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
{
struct path path;
int error = -EINVAL;
- int lookup_flags = 0;
+ unsigned int try = 0;
+ unsigned int lookup_flags = 0;

if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH)) != 0)
@@ -84,12 +85,15 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
if (flag & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;

- error = user_path_at(dfd, filename, lookup_flags, &path);
- if (error)
- goto out;
-
- error = vfs_getattr(path.mnt, path.dentry, stat);
- path_put(&path);
+ do {
+ error = user_path_at(dfd, filename, lookup_flags, &path);
+ if (!error) {
+ error = vfs_getattr(path.mnt, path.dentry, stat);
+ path_put(&path);
+ break;
+ }
+ lookup_flags |= LOOKUP_REVAL;
+ } while (retry_lookup(error, try++));
out:
return error;
}