2005-10-06 14:38:37

by Miklos Szeredi

[permalink] [raw]
Subject: [RFC] atomic create+open

Currently it's impossible to implement a network filesystem that is

a) served by an unprivileged userspace process

b) supports "strange" open semantics, e.g.:

open("foo", O_WRONLY | O_CREAT, 0400);

c) not overly "hacky"

The basic problem is that because of a) permission checking cannot be
separated from the actual operations.

By the time the ->open method is called, the file has been created
with a read-only mode, and the server won't be able to open it in
write mode.

Several hacks come to mind for solving this, but these have severe
problems and are excluded.

One approach for solving this properly is to add a new atomic
create+open method to inode operations. The prototype would be a
merger of ->create() and ->open, like this:

int (*create_open) (struct inode *dir, struct dentry *dentry, int mode,
struct file *filp);

The below patch (against -mm) is a first try at implementing the VFS
part of this. But at this stage I'm more interested in opinions about
the interface, rather than about the implementation.

A different approach is to extend the open_intents structure and allow
the filesystem to do the open from ->create(), but this is a much less
clean interface. Trond Myklebust has a patch that does this (no longer
applies):

http://client.linux-nfs.org/Linux-2.6.x/2.6.12/linux-2.6.12-63-open_file_intents.dif

Comments about either solution are appreciated.

Thanks,
Miklos

Index: linux/fs/namei.c
===================================================================
--- linux.orig/fs/namei.c 2005-10-05 16:59:46.000000000 +0200
+++ linux/fs/namei.c 2005-10-06 15:18:18.000000000 +0200
@@ -28,6 +28,7 @@
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
+#include <linux/file.h>
#include <asm/namei.h>
#include <asm/uaccess.h>

@@ -1394,6 +1395,91 @@ int may_open(struct nameidata *nd, int a
return 0;
}

+static int vfs_create_open(struct inode *dir, struct dentry *dentry, int flag,
+ int mode, struct nameidata *nd, struct file *f)
+{
+ struct inode *inode;
+ struct nameidata tmpnd;
+ int error = may_create(dir, dentry, nd);
+ if (error)
+ return error;
+
+ mode &= S_IALLUGO;
+ mode |= S_IFREG;
+ error = security_inode_create(dir, dentry, mode);
+ if (error)
+ return error;
+
+ f->f_mode = ((f->f_flags+1) & O_ACCMODE) | FMODE_LSEEK |
+ FMODE_PREAD | FMODE_PWRITE;
+
+ f->f_mapping = NULL;
+ f->f_dentry = dget(dentry);
+ f->f_vfsmnt = mntget(nd->mnt);
+ f->f_pos = 0;
+ f->f_op = NULL;
+ file_move(f, &dir->i_sb->s_files);
+
+ DQUOT_INIT(dir);
+ error = dir->i_op->create_open(dir, dentry, mode, f);
+ if (error)
+ goto out_dput;
+
+ fsnotify_create(dir, dentry->d_name.name);
+ inode = dentry->d_inode;
+ if (!f->f_mapping)
+ f->f_mapping = inode->i_mapping;
+ if (!f->f_op)
+ f->f_op = fops_get(inode->i_fop);
+
+ flag &= ~O_TRUNC;
+ tmpnd = *nd;
+ tmpnd.dentry = dentry;
+
+ /* Shouldn't fail, just go though the motions (security hook, ...) */
+ error = may_open(&tmpnd, 0, flag);
+ if (error)
+ goto out_release;
+
+ if (f->f_mode & FMODE_WRITE) {
+ /* Should not fail */
+ error = get_write_access(inode);
+ if (error)
+ goto out_release;
+ }
+
+ f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+
+ file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
+
+ /* NB: we're sure to have correct a_ops only after f_op->open */
+ if (f->f_flags & O_DIRECT) {
+ if (!f->f_mapping->a_ops ||
+ ((!f->f_mapping->a_ops->direct_IO) &&
+ (!f->f_mapping->a_ops->get_xip_page))) {
+ error = -EINVAL;
+ goto out_put_write_access;
+ }
+ }
+ return 0;
+
+ out_put_write_access:
+ if (f->f_mode & FMODE_WRITE)
+ put_write_access(inode);
+ out_release:
+ if (f->f_op && f->f_op->release)
+ f->f_op->release(inode, f);
+
+ fops_put(f->f_op);
+ file_kill(f);
+ out_dput:
+ dput(f->f_dentry);
+ mntput(f->f_vfsmnt);
+ f->f_dentry = NULL;
+ f->f_vfsmnt = NULL;
+ return error;
+}
+
/*
* open_namei()
*
@@ -1408,7 +1494,8 @@ int may_open(struct nameidata *nd, int a
* for symlinks (where the permissions are checked later).
* SMP-safe
*/
-int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
+static int open_namei(const char * pathname, int flag, int mode,
+ struct nameidata *nd, struct file *f)
{
int acc_mode, error = 0;
struct path path;
@@ -1469,6 +1556,16 @@ do_last:
if (!path.dentry->d_inode) {
if (!IS_POSIXACL(dir->d_inode))
mode &= ~current->fs->umask;
+
+ if (dir->d_inode->i_op && dir->d_inode->i_op->create_open) {
+ error = vfs_create_open(dir->d_inode, path.dentry,
+ flag, mode, nd, f);
+ up(&dir->d_inode->i_sem);
+ dput(nd->dentry);
+ nd->dentry = path.dentry;
+ goto exit;
+ }
+
error = vfs_create(dir->d_inode, path.dentry, mode, nd);
up(&dir->d_inode->i_sem);
dput(nd->dentry);
@@ -1509,7 +1606,7 @@ ok:
error = may_open(nd, acc_mode, flag);
if (error)
goto exit;
- return 0;
+ return __dentry_open(nd->dentry, nd->mnt, f);

exit_dput:
dput_path(&path, nd);
@@ -1561,6 +1658,59 @@ do_link:
goto do_last;
}

+/*
+ * Note that while the flag value (low two bits) for sys_open means:
+ * 00 - read-only
+ * 01 - write-only
+ * 10 - read-write
+ * 11 - special
+ * it is changed into
+ * 00 - no permissions needed
+ * 01 - read-permission
+ * 10 - write-permission
+ * 11 - read-write
+ * for the internal routines (ie open_namei()/follow_link() etc). 00 is
+ * used by symlinks.
+ */
+struct file *filp_open(const char * filename, int flags, int mode)
+{
+ int namei_flags, error;
+ struct nameidata nd;
+ struct file *f;
+ static int warned;
+
+ /*
+ * Access mode of 3 had some old uses, that are probably not
+ * applicable anymore. For now just warn about deprecation.
+ * Later it can be changed to return -EINVAL.
+ */
+ if ((flags & O_ACCMODE) == 3 && warned < 5) {
+ warned++;
+ printk(KERN_WARNING "Warning: '%s' (pid=%i) uses deprecated "
+ "open flags, please report!\n",
+ current->comm, current->tgid);
+ }
+ namei_flags = flags;
+ if ((namei_flags+1) & O_ACCMODE)
+ namei_flags++;
+ if (namei_flags & O_TRUNC)
+ namei_flags |= 2;
+
+ error = -ENFILE;
+ f = get_empty_filp();
+ if (f == NULL)
+ return ERR_PTR(error);
+
+ f->f_flags = flags;
+ error = open_namei(filename, namei_flags, mode, &nd, f);
+ if (!error)
+ return f;
+
+ put_filp(f);
+ return ERR_PTR(error);
+}
+EXPORT_SYMBOL(filp_open);
+
/**
* lookup_create - lookup a dentry, creating it if it doesn't exist
* @nd: nameidata info
Index: linux/include/linux/fs.h
===================================================================
--- linux.orig/include/linux/fs.h 2005-10-05 16:59:46.000000000 +0200
+++ linux/include/linux/fs.h 2005-10-05 18:03:22.000000000 +0200
@@ -1002,6 +1002,8 @@ struct inode_operations {
ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*removexattr) (struct dentry *, const char *);
+ int (*create_open) (struct inode *, struct dentry *, int,
+ struct file *);
};

struct seq_file;
@@ -1432,7 +1434,8 @@ static inline void allow_write_access(st
}
extern int do_pipe(int *);

-extern int open_namei(const char *, int, int, struct nameidata *);
+extern int __dentry_open(struct dentry *dentry, struct vfsmount *mnt,
+ struct file *f);
extern int may_open(struct nameidata *, int, int);

extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
Index: linux/fs/open.c
===================================================================
--- linux.orig/fs/open.c 2005-10-05 16:59:46.000000000 +0200
+++ linux/fs/open.c 2005-10-05 17:40:22.000000000 +0200
@@ -738,14 +738,12 @@ asmlinkage long sys_fchown(unsigned int
return error;
}

-static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
- int flags, struct file *f)
+int __dentry_open(struct dentry *dentry, struct vfsmount *mnt, struct file *f)
{
struct inode *inode;
int error;

- f->f_flags = flags;
- f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK |
+ f->f_mode = ((f->f_flags+1) & O_ACCMODE) | FMODE_LSEEK |
FMODE_PREAD | FMODE_PWRITE;
inode = dentry->d_inode;
if (f->f_mode & FMODE_WRITE) {
@@ -776,11 +774,11 @@ static struct file *__dentry_open(struct
((!f->f_mapping->a_ops->direct_IO) &&
(!f->f_mapping->a_ops->get_xip_page))) {
fput(f);
- f = ERR_PTR(-EINVAL);
+ return -EINVAL;
}
}

- return f;
+ return 0;

cleanup_all:
fops_put(f->f_op);
@@ -790,76 +788,29 @@ cleanup_all:
f->f_dentry = NULL;
f->f_vfsmnt = NULL;
cleanup_file:
- put_filp(f);
dput(dentry);
mntput(mnt);
- return ERR_PTR(error);
+ return error;
}

-/*
- * Note that while the flag value (low two bits) for sys_open means:
- * 00 - read-only
- * 01 - write-only
- * 10 - read-write
- * 11 - special
- * it is changed into
- * 00 - no permissions needed
- * 01 - read-permission
- * 10 - write-permission
- * 11 - read-write
- * for the internal routines (ie open_namei()/follow_link() etc). 00 is
- * used by symlinks.
- */
-struct file *filp_open(const char * filename, int flags, int mode)
+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
{
- int namei_flags, error;
- struct nameidata nd;
+ int error;
struct file *f;
- static int warned;
-
- /*
- * Access mode of 3 had some old uses, that are probably not
- * applicable anymore. For now just warn about deprecation.
- * Later it can be changed to return -EINVAL.
- */
- if ((flags & O_ACCMODE) == 3 && warned < 5) {
- warned++;
- printk(KERN_WARNING "Warning: '%s' (pid=%i) uses deprecated "
- "open flags, please report!\n",
- current->comm, current->tgid);
- }
- namei_flags = flags;
- if ((namei_flags+1) & O_ACCMODE)
- namei_flags++;
- if (namei_flags & O_TRUNC)
- namei_flags |= 2;

error = -ENFILE;
f = get_empty_filp();
if (f == NULL)
return ERR_PTR(error);

- error = open_namei(filename, namei_flags, mode, &nd);
+ f->f_flags = flags;
+ error = __dentry_open(dentry, mnt, f);
if (!error)
- return __dentry_open(nd.dentry, nd.mnt, flags, f);
+ return f;

put_filp(f);
return ERR_PTR(error);
}
-EXPORT_SYMBOL(filp_open);
-
-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
-{
- int error;
- struct file *f;
-
- error = -ENFILE;
- f = get_empty_filp();
- if (f == NULL)
- return ERR_PTR(error);
-
- return __dentry_open(dentry, mnt, flags, f);
-}
EXPORT_SYMBOL(dentry_open);

/*


2005-10-06 16:41:37

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

to den 06.10.2005 Klokka 16:38 (+0200) skreiv Miklos Szeredi:
> Currently it's impossible to implement a network filesystem that is
>
> a) served by an unprivileged userspace process
>
> b) supports "strange" open semantics, e.g.:
>
> open("foo", O_WRONLY | O_CREAT, 0400);
>
> c) not overly "hacky"
>
> The basic problem is that because of a) permission checking cannot be
> separated from the actual operations.
>
> By the time the ->open method is called, the file has been created
> with a read-only mode, and the server won't be able to open it in
> write mode.
>
> Several hacks come to mind for solving this, but these have severe
> problems and are excluded.
>
> One approach for solving this properly is to add a new atomic
> create+open method to inode operations. The prototype would be a
> merger of ->create() and ->open, like this:
>
> int (*create_open) (struct inode *dir, struct dentry *dentry, int mode,
> struct file *filp);

The reason why we do it as a lookup intent is because this has to be
atomic lookup+create+open in order to be at all useful to NFS.

Just doing create+open atomically is worthless since it leaves you with
a bunch of races where someone on the server can create, say, a symlink
between the RPC call to lookup and the RPC call that creates the file.

> A different approach is to extend the open_intents structure and allow
> the filesystem to do the open from ->create(), but this is a much less
> clean interface. Trond Myklebust has a patch that does this (no longer
> applies):
>
> http://client.linux-nfs.org/Linux-2.6.x/2.6.12/linux-2.6.12-63-open_file_intents.dif

I'm going to fix this up.

Cheers,
Trond

2005-10-06 17:04:45

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> The reason why we do it as a lookup intent is because this has to be
> atomic lookup+create+open in order to be at all useful to NFS.
>
> Just doing create+open atomically is worthless since it leaves you with
> a bunch of races where someone on the server can create, say, a symlink
> between the RPC call to lookup and the RPC call that creates the file.

That's easy to solve: filesystem returns -EAGAIN, namei_open() redoes
the lookup and continues with the resolving. There would have to be
some safeguard counter to avoid infinite loops.

Filesystem could even populate the dentry with the symlink in
->open_create() to optimize away the relookup.

Do you see a problem with that?

Miklos

2005-10-06 17:07:40

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

to den 06.10.2005 Klokka 19:02 (+0200) skreiv Miklos Szeredi:
> > The reason why we do it as a lookup intent is because this has to be
> > atomic lookup+create+open in order to be at all useful to NFS.
> >
> > Just doing create+open atomically is worthless since it leaves you with
> > a bunch of races where someone on the server can create, say, a symlink
> > between the RPC call to lookup and the RPC call that creates the file.
>
> That's easy to solve: filesystem returns -EAGAIN, namei_open() redoes
> the lookup and continues with the resolving. There would have to be
> some safeguard counter to avoid infinite loops.

Yech.

> Filesystem could even populate the dentry with the symlink in
> ->open_create() to optimize away the relookup.

Better.

> Do you see a problem with that?

No, but what value does an extra function call add then when you already
have lookup intents?

Cheers,
Trond

2005-10-06 17:08:06

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> The reason why we do it as a lookup intent is because this has to be
> atomic lookup+create+open in order to be at all useful to NFS.

Oh, and btw there's a problem with atomic lookup+create+open: mounts.
Do you want to follow mounts inside ->lookup(). Ugly.

Miklos


2005-10-06 17:25:53

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

to den 06.10.2005 Klokka 19:06 (+0200) skreiv Miklos Szeredi:
> > The reason why we do it as a lookup intent is because this has to be
> > atomic lookup+create+open in order to be at all useful to NFS.
>
> Oh, and btw there's a problem with atomic lookup+create+open: mounts.
> Do you want to follow mounts inside ->lookup(). Ugly.

No. Why do you think you would need to? The VFS is supposed to protect
you against races with mount and other local objects (dcache races,
inode races,...). The problem is remote objects.

Cheers,
Trond


2005-10-06 17:25:31

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> No, but what value does an extra function call add then when you already
> have lookup intents?

Just to provide a proper interface, and not have to extend open
intents further.

Earlier you said, that intents are meant to be optional, so this
atomicity requirement is getting further from the "intent" concept.

Miklos

2005-10-06 17:32:15

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> > > The reason why we do it as a lookup intent is because this has to be
> > > atomic lookup+create+open in order to be at all useful to NFS.
> >
> > Oh, and btw there's a problem with atomic lookup+create+open: mounts.
> > Do you want to follow mounts inside ->lookup(). Ugly.
>
> No. Why do you think you would need to? The VFS is supposed to protect
> you against races with mount and other local objects (dcache races,
> inode races,...). The problem is remote objects.

Say you have NFS mount on /mnt and a bind mount over the regular file
/mnt/foo. You do open("/mnt/foo", O_RDWR | O_CREAT, 0644). How do
you solve the atomic open case.

If you open in ->lookup("foo") you will be opening the wrong file,
unless you want to follow mounts inside ->lookup.

Miklos

2005-10-06 17:36:50

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

to den 06.10.2005 Klokka 19:23 (+0200) skreiv Miklos Szeredi:
> > No, but what value does an extra function call add then when you already
> > have lookup intents?
>
> Just to provide a proper interface, and not have to extend open
> intents further.

Why do you consider an extra function call to be a more "proper
interface"?

Does it fix more races? Does it allow you to do new things more
elegantly? Does it offer a better abstraction?

Just trying to understand why you are trying to dump an interface that
has been agreed upon for several years and replace it with one that was
rejected.

> Earlier you said, that intents are meant to be optional, so this
> atomicity requirement is getting further from the "intent" concept.

No it is not. It is bang right in the middle of the intent concept.

Intents are there in order to allow the filesystem to determine which
operation is calling lookup so that it can optimise for that particular
operation.
If your filesystem is able to do lookup+create+open more efficiently
than the VFS can, then that is what it is designed to allow you to do.

Cheers,
Trond

2005-10-06 17:42:18

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

to den 06.10.2005 Klokka 19:30 (+0200) skreiv Miklos Szeredi:

> Say you have NFS mount on /mnt and a bind mount over the regular file
> /mnt/foo. You do open("/mnt/foo", O_RDWR | O_CREAT, 0644). How do
> you solve the atomic open case.

> If you open in ->lookup("foo") you will be opening the wrong file,
> unless you want to follow mounts inside ->lookup.

Firstly, if that is the case, then you will have dentries for both the
covered and the covering copies of /mnt/foo. A simple test of
have_submounts() on the dentry will suffice to tell the filesystem.
whether or not it should open the file.

Secondly, Linux doesn't actually allow bind mounts on top of regular
files.

Either one of these two points suffices to resolve the "race".

Cheers,
Trond

2005-10-06 17:49:13

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> > > No, but what value does an extra function call add then when you already
> > > have lookup intents?
> >
> > Just to provide a proper interface, and not have to extend open
> > intents further.
>
> Why do you consider an extra function call to be a more "proper
> interface"?
>
> Does it fix more races? Does it allow you to do new things more
> elegantly? Does it offer a better abstraction?

Extending the open intent with 'struct file *' in itself is pretty
lame. How do you initilaize intent.open.file from within ->lookup and
->create? Do you call dentry_open(), that doesn't cut it with the
reorganized filp_open() thing. And anyway it would be rather ugly
that it would recurse into the filesystems ->open() method.

See the problems I have with the intent.open.file thing?

> Just trying to understand why you are trying to dump an interface that
> has been agreed upon for several years and replace it with one that was
> rejected.
>
> > Earlier you said, that intents are meant to be optional, so this
> > atomicity requirement is getting further from the "intent" concept.
>
> No it is not. It is bang right in the middle of the intent concept.
>
> Intents are there in order to allow the filesystem to determine which
> operation is calling lookup so that it can optimise for that particular
> operation.
> If your filesystem is able to do lookup+create+open more efficiently
> than the VFS can, then that is what it is designed to allow you to do.

Yes, the problem is trying to fit too many things into this intent
thing.

But if you post a patch, that implements an equivalent (or wider)
interface to the open_create() method, only with intents, and it
doesn't get too convoluted, it's OK by me.

I tried to do that, but the ->open_create() interface just seemed
cleaner.

Miklos

2005-10-06 17:53:07

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> > Say you have NFS mount on /mnt and a bind mount over the regular file
> > /mnt/foo. You do open("/mnt/foo", O_RDWR | O_CREAT, 0644). How do
> > you solve the atomic open case.
>
> > If you open in ->lookup("foo") you will be opening the wrong file,
> > unless you want to follow mounts inside ->lookup.
>
> Firstly, if that is the case, then you will have dentries for both the
> covered and the covering copies of /mnt/foo. A simple test of
> have_submounts() on the dentry will suffice to tell the filesystem.
> whether or not it should open the file.

And if the bind is umounted after NFS determined not to open the file,
and at the same time it changes to a symlink on the server (not very
likely I agree, but possible nonetheless), then shit happens.

> Secondly, Linux doesn't actually allow bind mounts on top of regular
> files.

It does. Try it.

Miklos

2005-10-06 17:59:49

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

Trond Myklebust wrote:
> > Earlier you said, that intents are meant to be optional, so this
> > atomicity requirement is getting further from the "intent" concept.
>
> No it is not. It is bang right in the middle of the intent concept.
>
> Intents are there in order to allow the filesystem to determine which
> operation is calling lookup so that it can optimise for that particular
> operation.

I think Miklos' point is that it's not an "optimisation" because it's
not optional. Optimisations are things where if you don't do them,
the behaviour is still correct but slower.

As far as I can tell from this discussion, the atomic lookup+create is
a non-optional requirement.

-- Jamie

2005-10-06 18:00:16

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

Miklos Szeredi wrote:
> > Secondly, Linux doesn't actually allow bind mounts on top of regular
> > files.
>
> It does. Try it.

The feature is even useful from time to time.

-- Jamie

2005-10-06 18:15:28

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> I think Miklos' point is that it's not an "optimisation" because it's
> not optional. Optimisations are things where if you don't do them,
> the behaviour is still correct but slower.
>
> As far as I can tell from this discussion, the atomic lookup+create is
> a non-optional requirement.

Exactly.

Trond, you wrote this in an earlier discussion:

> > so the filesystem can delay returning the error from the open
> > operation until the other errors have been sorted out by the lookup
> > code.
>
> Intents are meant as optimisations, not replacements for existing
> operations. I'm therefore not really comfortable about having them
> return errors at all.

The case I described is not an optimization, so in that case you seem
to agree, that lookup intents are not the solution.

Miklos

2005-10-06 18:38:33

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

to den 06.10.2005 Klokka 19:51 (+0200) skreiv Miklos Szeredi:
> > > Say you have NFS mount on /mnt and a bind mount over the regular file
> > > /mnt/foo. You do open("/mnt/foo", O_RDWR | O_CREAT, 0644). How do
> > > you solve the atomic open case.
> >
> > > If you open in ->lookup("foo") you will be opening the wrong file,
> > > unless you want to follow mounts inside ->lookup.
> >
> > Firstly, if that is the case, then you will have dentries for both the
> > covered and the covering copies of /mnt/foo. A simple test of
> > have_submounts() on the dentry will suffice to tell the filesystem.
> > whether or not it should open the file.
>
> And if the bind is umounted after NFS determined not to open the file,
> and at the same time it changes to a symlink on the server (not very
> likely I agree, but possible nonetheless), then shit happens.

Please elaborate. What would you expect to see happen in this situation?

> > Secondly, Linux doesn't actually allow bind mounts on top of regular
> > files.
>
> It does. Try it.

Nice. I never noticed that.

Cheers,
Trond

2005-10-06 18:42:44

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

to den 06.10.2005 Klokka 18:59 (+0100) skreiv Jamie Lokier:

> > Intents are there in order to allow the filesystem to determine which
> > operation is calling lookup so that it can optimise for that particular
> > operation.
>
> I think Miklos' point is that it's not an "optimisation" because it's
> not optional. Optimisations are things where if you don't do them,
> the behaviour is still correct but slower.
>
> As far as I can tell from this discussion, the atomic lookup+create is
> a non-optional requirement.

By that definition, intents have _never_ been optional. NFSv3/v4 O_EXCL
creates have never worked correctly without them.

Cheers,
Trond

2005-10-06 18:48:30

by Daniel Jacobowitz

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

On Thu, Oct 06, 2005 at 07:00:06PM +0100, Jamie Lokier wrote:
> Miklos Szeredi wrote:
> > > Secondly, Linux doesn't actually allow bind mounts on top of regular
> > > files.
> >
> > It does. Try it.
>
> The feature is even useful from time to time.

But very limited... the last time I needed something like this, bind
mounts on files were so awkward that I ended up solving the problem in
FUSE instead. Explanation at:

http://return.false.org/~drow/fuse/

--
Daniel Jacobowitz
CodeSourcery, LLC

2005-10-06 18:50:42

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> > And if the bind is umounted after NFS determined not to open the file,
> > and at the same time it changes to a symlink on the server (not very
> > likely I agree, but possible nonetheless), then shit happens.
>
> Please elaborate. What would you expect to see happen in this situation?

For simplicity case let's omit the creation of simlink, just say, the
file is removed.

So NFS calls have_submounts(), which returns true.

Then the bind is umounted. Nothing prevents this happening
concurrently with the lookup.

Then the file is removed on the server.

When open_namei() gets around to following the mounts, it is not there
any more, so the dentry for /mnt/foo (the NFS one is returned) and
NFS's ->open is called on the file, which returns -ENOENT. But
open(..., O_CREAT, ...) should never return -ENOENT.

The symlink case is similar, just even more unlikely.

Miklos

2005-10-06 19:17:52

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

to den 06.10.2005 Klokka 20:49 (+0200) skreiv Miklos Szeredi:

> For simplicity case let's omit the creation of simlink, just say, the
> file is removed.
>
> So NFS calls have_submounts(), which returns true.
>
> Then the bind is umounted. Nothing prevents this happening
> concurrently with the lookup.
>
> Then the file is removed on the server.
>
> When open_namei() gets around to following the mounts, it is not there
> any more, so the dentry for /mnt/foo (the NFS one is returned) and
> NFS's ->open is called on the file, which returns -ENOENT. But
> open(..., O_CREAT, ...) should never return -ENOENT.

...and so the VFS can recognise the case, and be made to retry the
operation.
A more difficult race to deal with occurs if you allow a mount while
inside d_revalidate(). In that case NFS can end up opening the wrong
file.
Both these two races could, however, be fixed by moving the
__follow_mount() in open_namei() inside the section that is protected by
the parent directory i_sem.

In any case, all you are doing here is showing that the situation w.r.t.
mount races and lookup+create+open is difficult. I see nothing that
convinces me that a special atomic create+open will help to resolve
those races.
Nor do I see that adding a special atomic create+open will help me avoid
intents for the case of atomic lookup+open(). As far as I'm concerned,
the case of lookup+create+open is just a special case of lookup+open.

Cheers,
Trond

2005-10-06 20:18:52

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> >
> > When open_namei() gets around to following the mounts, it is not there
> > any more, so the dentry for /mnt/foo (the NFS one is returned) and
> > NFS's ->open is called on the file, which returns -ENOENT. But
> > open(..., O_CREAT, ...) should never return -ENOENT.
>
> ...and so the VFS can recognise the case, and be made to retry the
> operation.
> A more difficult race to deal with occurs if you allow a mount while
> inside d_revalidate(). In that case NFS can end up opening the wrong
> file.

Yes, in fact all my examples should have been for ->d_revalidate(),
since it's not possible that ->lookup() be called for a mounted
dentry.

> Both these two races could, however, be fixed by moving the
> __follow_mount() in open_namei() inside the section that is protected by
> the parent directory i_sem.

No. Only the namespace semaphore could protect against mount/umount,
but you don't want to take that in lookup logic.

> In any case, all you are doing here is showing that the situation w.r.t.
> mount races and lookup+create+open is difficult. I see nothing that
> convinces me that a special atomic create+open will help to resolve
> those races.

I just think that filesystem code should _never_ need to care about
mounts. If you want to do the lookup+open, you somehow will have to
deal with mounts, which is ugly.

> Nor do I see that adding a special atomic create+open will help me avoid
> intents for the case of atomic lookup+open(). As far as I'm concerned,
> the case of lookup+create+open is just a special case of lookup+open.

IMO the lookup+open optimization is not valid, because dealing with
mounts is the job of the VFS and not the filesystem.

Path lookup usually ends in a sequence of ->lookup (or
->d_revalidate), then following mounts, then doing the operation. You
shouldn't try to merge the ->lookup and the ->fsop into one operation.
That way leads to madness.

Miklos

2005-10-06 21:12:29

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

to den 06.10.2005 Klokka 22:17 (+0200) skreiv Miklos Szeredi:
> I just think that filesystem code should _never_ need to care about
> mounts. If you want to do the lookup+open, you somehow will have to
> deal with mounts, which is ugly.

You appear to think that atomic lookup+open is a question of choice. It
is not.

Cheers,
Trond

2005-10-07 06:02:58

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> > I just think that filesystem code should _never_ need to care about
> > mounts. If you want to do the lookup+open, you somehow will have to
> > deal with mounts, which is ugly.
>
> You appear to think that atomic lookup+open is a question of choice. It
> is not.

Atomic lookup+open is an optimization, and as such a question of
choice. Atomic create+open is not.

I know you are thinking of the non-exclusive create case when between
the lookup and the open the file is removed or transmuted on the
server.

Yes, it's tricky to sovle, but by no means impossible without atomic
lookup+open. E.g. consider this pseudo-code (only the atomic
open+create case) in open_namei():

down(&dir->d_inode->i_sem);
dentry = __lookup_hash(&nd->last, nd->dentry, nd);
do_last:

error = PTR_ERR(dentry);
if (IS_ERR(dentry)) {
up(&dir->d_inode->i_sem);
goto exit;
}

/* Negative dentry, just create the file */
if (!dentry->d_inode)
if (!IS_POSIXACL(dir->d_inode))
mode &= ~current->fs->umask;
vfs_create_open(...);
up(&dir->d_inode->i_sem);
dput(nd->dentry);
nd->dentry = dentry;
goto exit;
} else if (!(flag & O_EXCL) && may_create(dir)) {
if (__follow_mount(&path)) {
up(&dir->d_inode->i_sem);
goto mount_followed;
}
vfs_create_open(...);
up(&dir->d_inode->i_sem);
dput(nd->dentry);
nd->dentry = dentry;
goto exit;
}

/*
* It already exists.
*/
up(&dir->d_inode->i_sem);

error = -EEXIST;
if (flag & O_EXCL)
goto exit_dput;

if (__follow_mount(&path)) {
error = -ELOOP;
if (flag & O_NOFOLLOW)
goto exit_dput;
}
mount_followed:


Miklos

2005-10-07 13:38:26

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

fr den 07.10.2005 Klokka 08:01 (+0200) skreiv Miklos Szeredi:
> > > I just think that filesystem code should _never_ need to care about
> > > mounts. If you want to do the lookup+open, you somehow will have to
> > > deal with mounts, which is ugly.
> >
> > You appear to think that atomic lookup+open is a question of choice. It
> > is not.
>
> Atomic lookup+open is an optimization, and as such a question of
> choice. Atomic create+open is not.

Really? Under NFSv4, the one and only OPEN command does an atomic lookup
+open, It _has to_ in order to deal with all the races.

Once that is the case, then separating lookup and open into two
operations means that you need to worry about namespace changes on the
server too (since OPEN takes a name argument rather than a filehandle).
If you end up opening a different file to the one you looked up, things
can get very interesting.

> I know you are thinking of the non-exclusive create case when between
> the lookup and the open the file is removed or transmuted on the
> server..

> Yes, it's tricky to sovle, but by no means impossible without atomic
> lookup+open. E.g. consider this pseudo-code (only the atomic
> open+create case) in open_namei():

Firstly, that pseudo-code doesn't deal at all with the race you describe
above. It only deals with lookup + file creation.

Secondly, it also fails to deal with the issue of propagation of open
context.
If you open a file, then that creates open context/state on the server.
Most protocols will then have some way of tracking that state using an
identifier (the equivalent of the POSIX open file descriptor). I see
absolutely nothing in your proposal that will allow me to save the state
identifier that results from atomic open+create and then propagate it to
the struct file.

Without that stateid/descriptor, it becomes impossible to actually READ,
WRITE, lock/unlock the file or even to CLOSE it when done.

This is why I added the struct file to the intent code in the first
place.

Trond

2005-10-07 14:01:32

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> > > > I just think that filesystem code should _never_ need to care about
> > > > mounts. If you want to do the lookup+open, you somehow will have to
> > > > deal with mounts, which is ugly.
> > >
> > > You appear to think that atomic lookup+open is a question of choice. It
> > > is not.
> >
> > Atomic lookup+open is an optimization, and as such a question of
> > choice. Atomic create+open is not.
>
> Really? Under NFSv4, the one and only OPEN command does an atomic lookup
> +open, It _has to_ in order to deal with all the races.
>
> Once that is the case, then separating lookup and open into two
> operations means that you need to worry about namespace changes on the
> server too (since OPEN takes a name argument rather than a filehandle).

So, you are saying OPEN has to do the lookup too. That's OK, but that
_does not_ mean that you have to do the OPEN operation from the
->lookup() or ->d_revalidate() methods. In fact you cannot do the
later without getting into trouble over mounts.

> If you end up opening a different file to the one you looked up, things
> can get very interesting.

You can replace the inode in ->create_open() if you want to. Or let
the VFS redo the lookup (as if d_revalidate() returned 0).

> > I know you are thinking of the non-exclusive create case when between
> > the lookup and the open the file is removed or transmuted on the
> > server..
>
> > Yes, it's tricky to sovle, but by no means impossible without atomic
> > lookup+open. E.g. consider this pseudo-code (only the atomic
> > open+create case) in open_namei():
>
> Firstly, that pseudo-code doesn't deal at all with the race you describe
> above. It only deals with lookup + file creation.

It does deal with the race:

__lookup_hash() returns a positive dentry

file is removed on server

"else if (!(flag & O_EXCL) && may_create(dir))" condition is met

__follow_mount() return false

vfs_create_open() calls ->create_open()

NFS does OPEN (O_CREAT), file is opened, dentry replaced (this is not
ellaborated in the pseudocode).


> Secondly, it also fails to deal with the issue of propagation of open
> context.
> If you open a file, then that creates open context/state on the server.
> Most protocols will then have some way of tracking that state using an
> identifier (the equivalent of the POSIX open file descriptor). I see
> absolutely nothing in your proposal that will allow me to save the state
> identifier that results from atomic open+create and then propagate it to
> the struct file.

If you read the original patch, ->open_create() has a 'struct file *'
parameter, just like ->open().

Miklos

2005-10-07 14:48:13

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

fr den 07.10.2005 Klokka 15:59 (+0200) skreiv Miklos Szeredi:
> So, you are saying OPEN has to do the lookup too. That's OK, but that
> _does not_ mean that you have to do the OPEN operation from the
> ->lookup() or ->d_revalidate() methods. In fact you cannot do the
> later without getting into trouble over mounts.

You cannot do anything else without getting into trouble over dentry
races. Given that choice, I prefer to take my chances with the mounts as
those are rare.

We can add locking in order to exclude the mount races, or we can just
ignore them. AFAICS there are 2 cases:

1) umount races
refcounting of any in-use dentry is supposed to prevent trouble here.

2) mount races. Either
Just ignore if the filesystem has already opened the file.
or
Add locking to prevent races.

> > If you end up opening a different file to the one you looked up, things
> > can get very interesting.
>
> You can replace the inode in ->create_open() if you want to. Or let
> the VFS redo the lookup (as if d_revalidate() returned 0).

...but I cannot do that once I get to dentry_open(). You are ignoring
the case of generic file open without creation.

> > > I know you are thinking of the non-exclusive create case when between
> > > the lookup and the open the file is removed or transmuted on the
> > > server..
> >
> > > Yes, it's tricky to sovle, but by no means impossible without atomic
> > > lookup+open. E.g. consider this pseudo-code (only the atomic
> > > open+create case) in open_namei():
> >
> > Firstly, that pseudo-code doesn't deal at all with the race you describe
> > above. It only deals with lookup + file creation.
>
> It does deal with the race:
>
> __lookup_hash() returns a positive dentry
>
> file is removed on server
>
> "else if (!(flag & O_EXCL) && may_create(dir))" condition is met
>
> __follow_mount() return false
>
> vfs_create_open() calls ->create_open()
>
> NFS does OPEN (O_CREAT), file is opened, dentry replaced (this is not
> ellaborated in the pseudocode).

Which again only deals with the case of open(O_CREAT). My point is that
the race exists for the case of generic open().

If you first lookup the dentry in open_namei(), then end up opening a
completely different file in dentry_open(), then you are fscked.

Cheers,
Trond

2005-10-07 15:14:10

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

fr den 07.10.2005 Klokka 15:59 (+0200) skreiv Miklos Szeredi:

> You can replace the inode in ->create_open() if you want to.

Thinking a bit more clearly after a cup of coffee. This statement isn't
even true.

Your pseudo-code offers no guarantees that you are the sole user of the
dentry once you get to create_open().

> Or let the VFS redo the lookup (as if d_revalidate() returned 0).

Which may return yet another result for the dentry and another race.
There is no guarantee that you will ever make progress if someone is
doing something like.

while true
do
echo "1" > foo
echo "2" > foo
done

on the server.

Cheers,
Trond

2005-10-07 15:20:38

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> fr den 07.10.2005 Klokka 15:59 (+0200) skreiv Miklos Szeredi:
> > So, you are saying OPEN has to do the lookup too. That's OK, but that
> > _does not_ mean that you have to do the OPEN operation from the
> > ->lookup() or ->d_revalidate() methods. In fact you cannot do the
> > later without getting into trouble over mounts.
>
> You cannot do anything else without getting into trouble over dentry
> races. Given that choice, I prefer to take my chances with the mounts as
> those are rare.
>
> We can add locking in order to exclude the mount races, or we can just
> ignore them. AFAICS there are 2 cases:
>
> 1) umount races
> refcounting of any in-use dentry is supposed to prevent trouble here.

Refcounting solves nothing. The problem is that you try to do the
mount handling partly in the filesystem, and partly outside of it.

I hate the idea of this, but if there's no alternative...

> 2) mount races. Either
> Just ignore if the filesystem has already opened the file.

Yes, ignoring is perfectly fine. Unlike the umount case, mounting
cannot cause problems.

> > You can replace the inode in ->create_open() if you want to. Or let
> > the VFS redo the lookup (as if d_revalidate() returned 0).
>
> ...but I cannot do that once I get to dentry_open(). You are ignoring
> the case of generic file open without creation.

You can't do open by inode number (or file handle, whatever)? Only by
name? In that case yes, I see your problem.

> > NFS does OPEN (O_CREAT), file is opened, dentry replaced (this is not
> > ellaborated in the pseudocode).
>
> Which again only deals with the case of open(O_CREAT). My point is that
> the race exists for the case of generic open().

And so does for setattr() etc. You can safely return -ENOENT in these
cases. O_CREAT is problematic only because it cannot return -ENOENT
if the file was removed between ->lookup and ->open.

Miklos

2005-10-07 15:29:39

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> > You can replace the inode in ->create_open() if you want to.
>
> Thinking a bit more clearly after a cup of coffee. This statement isn't
> even true.
>
> Your pseudo-code offers no guarantees that you are the sole user of the
> dentry once you get to create_open().

You are right. I meant, replace the dentry.

> > Or let the VFS redo the lookup (as if d_revalidate() returned 0).
>
> Which may return yet another result for the dentry and another race.
> There is no guarantee that you will ever make progress if someone is
> doing something like.
>
> while true
> do
> echo "1" > foo
> echo "2" > foo
> done
>
> on the server.

Not good example. This won't change the file, only the contents.
Something with rename would be better.

We are still pitting two different races against each other. I can't
see such a big difference in ugliness...

Miklos



2005-10-07 16:01:43

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

fr den 07.10.2005 Klokka 17:18 (+0200) skreiv Miklos Szeredi:

> > > You can replace the inode in ->create_open() if you want to. Or let
> > > the VFS redo the lookup (as if d_revalidate() returned 0).
> >
> > ...but I cannot do that once I get to dentry_open(). You are ignoring
> > the case of generic file open without creation.
>
> You can't do open by inode number (or file handle, whatever)? Only by
> name? In that case yes, I see your problem.

As I believe I said earlier, open by inode number/filehandle/... don't
exist in the NFSv4 protocol due to the potential for races.

> > > NFS does OPEN (O_CREAT), file is opened, dentry replaced (this is not
> > > ellaborated in the pseudocode).
> >
> > Which again only deals with the case of open(O_CREAT). My point is that
> > the race exists for the case of generic open().
>
> And so does for setattr() etc. You can safely return -ENOENT in these
> cases. O_CREAT is problematic only because it cannot return -ENOENT
> if the file was removed between ->lookup and ->open.

No. There is no race for setattr() etc since they only do one lookup
(and they don't set up any state on the server).

open() is the only case where we currently have to look things up twice
(and I remind you that the second "lookup" is in fact the OPEN
operation).

Trond

2005-10-07 16:24:02

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

fr den 07.10.2005 Klokka 17:28 (+0200) skreiv Miklos Szeredi:
> >
> > Which may return yet another result for the dentry and another race.
> > There is no guarantee that you will ever make progress if someone is
> > doing something like.
> >
> > while true
> > do
> > echo "1" > foo
> > echo "2" > foo
> > done
> >
> > on the server.
>
> Not good example. This won't change the file, only the contents.
> Something with rename would be better.

Sorry, yes. This tweak should demonstrate what I meant

while true
do
echo "1" > foo
echo "2" > bar
mv bar foo
done

> We are still pitting two different races against each other. I can't
> see such a big difference in ugliness...

No we're not. I telling you that your open_create is not a solution for
the problems we have with open in NFSv4.

If it doesn't do atomic lookup+open, then I have an unfixable race. Any
"solution" that requires NFS to assume that the dcache will remain
consistent with the server namespace across more than one RPC operation
is prone to races.

OTOH mount/umount races are fixable since they involve only the local
namespace. Just add locking.

Trond

2005-10-07 17:22:27

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> As I believe I said earlier, open by inode number/filehandle/... don't
> exist in the NFSv4 protocol due to the potential for races.

I must have missed this.

Yes, open(O_CREAT) has race problems. Plain open() doesn't. So I
still don't see why you want to use the open-by-name for the
non-create case.

> No. There is no race for setattr() etc since they only do one lookup
> (and they don't set up any state on the server).
>
> open() is the only case where we currently have to look things up twice
> (and I remind you that the second "lookup" is in fact the OPEN
> operation).

If NFS cannot do open by filehandle, then ->open_create() interface is
not enough obviously.

Miklos


2005-10-07 17:28:47

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> > >
> > > Which may return yet another result for the dentry and another race.
> > > There is no guarantee that you will ever make progress if someone is
> > > doing something like.
> > >
> > > while true
> > > do
> > > echo "1" > foo
> > > echo "2" > foo
> > > done
> > >
> > > on the server.
> >
> > Not good example. This won't change the file, only the contents.
> > Something with rename would be better.
>
> Sorry, yes. This tweak should demonstrate what I meant
>
> while true
> do
> echo "1" > foo
> echo "2" > bar
> mv bar foo
> done
>
> > We are still pitting two different races against each other. I can't
> > see such a big difference in ugliness...
>
> No we're not. I telling you that your open_create is not a solution for
> the problems we have with open in NFSv4.

OK, I think I see the problem better now.

> If it doesn't do atomic lookup+open, then I have an unfixable race. Any
> "solution" that requires NFS to assume that the dcache will remain
> consistent with the server namespace across more than one RPC operation
> is prone to races.
>
> OTOH mount/umount races are fixable since they involve only the local
> namespace. Just add locking.

You _could_ take the namespace semaphore across lookup_hash() and
follow_mount(), but I'm not sure everybody would like that.

Are you planning to post a patch? Atomic open+create is something
that FUSE wants as well, so it would be nice to get a proper solution
into the kernel.

I'll think a bit more about solving the mount thing properly.

Miklos

2005-10-07 17:36:25

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

fr den 07.10.2005 Klokka 19:27 (+0200) skreiv Miklos Szeredi:

> Are you planning to post a patch? Atomic open+create is something
> that FUSE wants as well, so it would be nice to get a proper solution
> into the kernel.

I've got an updated patch in testing right now. I just want to try to
tighten up the intent interface a tad more in order to avoid struct
nameidata initialisation bugs.

Cheers,
Trond

2005-10-12 13:21:37

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

on den 12.10.2005 Klokka 13:01 (+0200) skreiv Miklos Szeredi:

> However I don't really like that the filesystem is reentered from
> lookup_instantiate_filp() via __dentry_open() and ->open(). Is this
> necessary?

If filesystems need to be able to change the value of f_mapping, then
yes, however if none of the potential users of lookup_instantiate_filp()
care about doing so, then we can get rid of it.
I don't care either way since we will not be supporting non-intent based
opens for NFSv4.

> I see you've fixed the O_TRUNC problem. The accmode==3 case is still
> slightly broken, since now the file is being opened in read-write mode
> instead of no-read-no-write mode. This probably won't break anything
> too badly though.

It is non-portable and it was never supported on NFSv4 anyway. If
someone cares, they can fix it, but I don't see much need.

> Equivalent and simpler:
>
> flags = nd->intent.open.flags - 1;
>
> Note, that the access bits of intent.open.flags will never both be
> zero, so this is safe.

Agreed.

Cheers,
Trond

2005-10-12 13:54:05

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> > However I don't really like that the filesystem is reentered from
> > lookup_instantiate_filp() via __dentry_open() and ->open(). Is this
> > necessary?
>
> If filesystems need to be able to change the value of f_mapping, then
> yes, however if none of the potential users of lookup_instantiate_filp()
> care about doing so, then we can get rid of it.

That would be nice. Filesysteem could set intent.open.file->f_mapping
before call to lookup_instantiate_filp(), which would check it before
resetting to inode->i_mapping.

> I don't care either way since we will not be supporting non-intent based
> opens for NFSv4.

I need this for FUSE, since non-create opens and non-exclusive open of
positive dentry will be done through ->open().

There is an ugly workaround: filesystem sets
nd->intent.open.file->private_data to some non-NULL value before
calling lookup_instantiate_filp() and in ->open() skip open based on
value of file->private_data.

But I'd prefer if lookup_instantiate_filp() would simply not call
->open().

> > I see you've fixed the O_TRUNC problem. The accmode==3 case is still
> > slightly broken, since now the file is being opened in read-write mode
> > instead of no-read-no-write mode. This probably won't break anything
> > too badly though.
>
> It is non-portable and it was never supported on NFSv4 anyway. If
> someone cares, they can fix it, but I don't see much need.

I suggest that nameidata_to_filp() to check this ((flags & O_ACCMODE)
== 3) if the file is already open and bail out with -EINVAL.

This would make sure, that things that relied on the old behavior at
least fail loudly instead of silently.

Miklos

2005-10-12 15:22:14

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> +/**
> + * release_open_intent - free up open intent resources
> + * @nd: pointer to nameidata
> + */
> +void release_open_intent(struct nameidata *nd)
> +{
> + if (nd->intent.open.file->f_dentry == NULL)
> + put_filp(nd->intent.open.file);
> + else
> + fput(nd->intent.open.file);
> +}
> +

This (or at least it's call site in open_namei()) should check for
IS_ERR(nd->intent.open.file).

If lookup_instantiate_filp() retuns an error and is called from
->create(), release_open_intent() will be called twice, and the second
one will Oops.

Miklos

2005-10-12 19:01:44

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

VFS: Allow the filesystem to return a full file pointer on open intent

This is needed by NFSv4 for atomicity reasons: our open command is in
fact a lookup+open, so we need to be able to propagate open context
information from lookup() into the resulting struct file's
private_data field.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/exec.c | 12 +++---
fs/namei.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++----
fs/open.c | 79 ++++++++++++++++++++++++++++++++++--------
include/linux/namei.h | 8 ++++
4 files changed, 165 insertions(+), 27 deletions(-)

Index: linux-2.6.14-rc4/fs/namei.c
===================================================================
--- linux-2.6.14-rc4.orig/fs/namei.c
+++ linux-2.6.14-rc4/fs/namei.c
@@ -28,6 +28,7 @@
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
+#include <linux/file.h>
#include <asm/namei.h>
#include <asm/uaccess.h>

@@ -317,6 +318,18 @@ void path_release_on_umount(struct namei
mntput_no_expire(nd->mnt);
}

+/**
+ * release_open_intent - free up open intent resources
+ * @nd: pointer to nameidata
+ */
+void release_open_intent(struct nameidata *nd)
+{
+ if (nd->intent.open.file->f_dentry == NULL)
+ put_filp(nd->intent.open.file);
+ else
+ fput(nd->intent.open.file);
+}
+
/*
* Internal lookup() using the new generic dcache.
* SMP-safe
@@ -1052,6 +1065,70 @@ out:
return retval;
}

+static int __path_lookup_intent_open(const char *name, unsigned int lookup_flags,
+ struct nameidata *nd, int open_flags, int create_mode)
+{
+ struct file *filp = get_empty_filp();
+ int err;
+
+ if (filp == NULL)
+ return -ENFILE;
+ nd->intent.open.file = filp;
+ nd->intent.open.flags = open_flags;
+ nd->intent.open.create_mode = create_mode;
+ err = path_lookup(name, lookup_flags|LOOKUP_OPEN, nd);
+ if (IS_ERR(nd->intent.open.file)) {
+ if (err == 0) {
+ err = PTR_ERR(nd->intent.open.file);
+ path_release(nd);
+ }
+ } else if (err != 0)
+ release_open_intent(nd);
+ return err;
+}
+
+/**
+ * path_lookup_open - lookup a file path with open intent
+ * @name: pointer to file name
+ * @lookup_flags: lookup intent flags
+ * @nd: pointer to nameidata
+ * @open_flags: open intent flags
+ */
+int path_lookup_open(const char *name, unsigned int lookup_flags,
+ struct nameidata *nd, int open_flags)
+{
+ return __path_lookup_intent_open(name, lookup_flags, nd,
+ open_flags, 0);
+}
+
+/**
+ * path_lookup_create - lookup a file path with open + create intent
+ * @name: pointer to file name
+ * @lookup_flags: lookup intent flags
+ * @nd: pointer to nameidata
+ * @open_flags: open intent flags
+ * @create_mode: create intent flags
+ */
+int path_lookup_create(const char *name, unsigned int lookup_flags,
+ struct nameidata *nd, int open_flags, int create_mode)
+{
+ return __path_lookup_intent_open(name, lookup_flags|LOOKUP_CREATE, nd,
+ open_flags, create_mode);
+}
+
+int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
+ struct nameidata *nd, int open_flags)
+{
+ char *tmp = getname(name);
+ int err = PTR_ERR(tmp);
+
+ if (!IS_ERR(tmp)) {
+ err = __path_lookup_intent_open(tmp, lookup_flags, nd, open_flags, 0);
+ putname(tmp);
+ }
+ return err;
+}
+
/*
* Restricted form of lookup. Doesn't follow links, single-component only,
* needs parent already locked. Doesn't follow mounts.
@@ -1416,27 +1493,27 @@ int may_open(struct nameidata *nd, int a
*/
int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
{
- int acc_mode, error = 0;
+ int acc_mode, error;
struct path path;
struct dentry *dir;
int count = 0;

acc_mode = ACC_MODE(flag);

+ /* O_TRUNC implies we need access checks for write permissions */
+ if (flag & O_TRUNC)
+ acc_mode |= MAY_WRITE;
+
/* Allow the LSM permission hook to distinguish append
access from general write access. */
if (flag & O_APPEND)
acc_mode |= MAY_APPEND;

- /* Fill in the open() intent data */
- nd->intent.open.flags = flag;
- nd->intent.open.create_mode = mode;
-
/*
* The simplest case - just a plain lookup.
*/
if (!(flag & O_CREAT)) {
- error = path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd);
+ error = path_lookup_open(pathname, lookup_flags(flag), nd, flag);
if (error)
return error;
goto ok;
@@ -1445,7 +1522,7 @@ int open_namei(const char * pathname, in
/*
* Create - we need to know the parent.
*/
- error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
+ error = path_lookup_create(pathname, LOOKUP_PARENT, nd, flag, mode);
if (error)
return error;

@@ -1520,6 +1597,8 @@ ok:
exit_dput:
dput_path(&path, nd);
exit:
+ if (!IS_ERR(nd->intent.open.file))
+ release_open_intent(nd);
path_release(nd);
return error;

Index: linux-2.6.14-rc4/fs/exec.c
===================================================================
--- linux-2.6.14-rc4.orig/fs/exec.c
+++ linux-2.6.14-rc4/fs/exec.c
@@ -126,8 +126,7 @@ asmlinkage long sys_uselib(const char __
struct nameidata nd;
int error;

- nd.intent.open.flags = FMODE_READ;
- error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
+ error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ);
if (error)
goto out;

@@ -139,7 +138,7 @@ asmlinkage long sys_uselib(const char __
if (error)
goto exit;

- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
+ file = nameidata_to_filp(&nd, O_RDONLY);
error = PTR_ERR(file);
if (IS_ERR(file))
goto out;
@@ -167,6 +166,7 @@ asmlinkage long sys_uselib(const char __
out:
return error;
exit:
+ release_open_intent(&nd);
path_release(&nd);
goto out;
}
@@ -490,8 +490,7 @@ struct file *open_exec(const char *name)
int err;
struct file *file;

- nd.intent.open.flags = FMODE_READ;
- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
+ err = path_lookup_open(name, LOOKUP_FOLLOW, &nd, FMODE_READ);
file = ERR_PTR(err);

if (!err) {
@@ -504,7 +503,7 @@ struct file *open_exec(const char *name)
err = -EACCES;
file = ERR_PTR(err);
if (!err) {
- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
+ file = nameidata_to_filp(&nd, O_RDONLY);
if (!IS_ERR(file)) {
err = deny_write_access(file);
if (err) {
@@ -516,6 +515,7 @@ out:
return file;
}
}
+ release_open_intent(&nd);
path_release(&nd);
}
goto out;
Index: linux-2.6.14-rc4/include/linux/namei.h
===================================================================
--- linux-2.6.14-rc4.orig/include/linux/namei.h
+++ linux-2.6.14-rc4/include/linux/namei.h
@@ -8,6 +8,7 @@ struct vfsmount;
struct open_intent {
int flags;
int create_mode;
+ struct file *file;
};

enum { MAX_NESTED_LINKS = 5 };
@@ -65,6 +66,13 @@ extern int FASTCALL(link_path_walk(const
extern void path_release(struct nameidata *);
extern void path_release_on_umount(struct nameidata *);

+extern int __user_path_lookup_open(const char __user *, unsigned lookup_flags, struct nameidata *nd, int open_flags);
+extern int path_lookup_open(const char *, unsigned lookup_flags, struct nameidata *, int open_flags);
+extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
+ int (*open)(struct inode *, struct file *));
+extern struct file *nameidata_to_filp(struct nameidata *nd, int flags);
+extern void release_open_intent(struct nameidata *);
+
extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
extern struct dentry * lookup_hash(struct qstr *, struct dentry *);

Index: linux-2.6.14-rc4/fs/open.c
===================================================================
--- linux-2.6.14-rc4.orig/fs/open.c
+++ linux-2.6.14-rc4/fs/open.c
@@ -739,7 +739,8 @@ asmlinkage long sys_fchown(unsigned int
}

static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
- int flags, struct file *f)
+ int flags, struct file *f,
+ int (*open)(struct inode *, struct file *))
{
struct inode *inode;
int error;
@@ -761,11 +762,14 @@ static struct file *__dentry_open(struct
f->f_op = fops_get(inode->i_fop);
file_move(f, &inode->i_sb->s_files);

- if (f->f_op && f->f_op->open) {
- error = f->f_op->open(inode,f);
+ if (!open && f->f_op)
+ open = f->f_op->open;
+ if (open) {
+ error = open(inode, f);
if (error)
goto cleanup_all;
}
+
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);

file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
@@ -814,28 +818,75 @@ struct file *filp_open(const char * file
{
int namei_flags, error;
struct nameidata nd;
- struct file *f;

namei_flags = flags;
if ((namei_flags+1) & O_ACCMODE)
namei_flags++;
- if (namei_flags & O_TRUNC)
- namei_flags |= 2;
-
- error = -ENFILE;
- f = get_empty_filp();
- if (f == NULL)
- return ERR_PTR(error);

error = open_namei(filename, namei_flags, mode, &nd);
if (!error)
- return __dentry_open(nd.dentry, nd.mnt, flags, f);
+ return nameidata_to_filp(&nd, flags);

- put_filp(f);
return ERR_PTR(error);
}
EXPORT_SYMBOL(filp_open);

+/**
+ * lookup_instantiate_filp - instantiates the open intent filp
+ * @nd: pointer to nameidata
+ * @dentry: pointer to dentry
+ * @open: open callback
+ *
+ * Helper for filesystems that want to use lookup open intents and pass back
+ * a fully instantiated struct file to the caller.
+ * This function is meant to be called from within a filesystem's
+ * lookup method.
+ * Note that in case of error, nd->intent.open.file is destroyed, but the
+ * path information remains valid.
+ * If the open callback is set to NULL, then the standard f_op->open()
+ * filesystem callback is substituted.
+ */
+struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
+ int (*open)(struct inode *, struct file *))
+{
+ if (IS_ERR(nd->intent.open.file))
+ goto out;
+ if (IS_ERR(dentry))
+ goto out_err;
+ nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->mnt),
+ nd->intent.open.flags - 1,
+ nd->intent.open.file,
+ open);
+out:
+ return nd->intent.open.file;
+out_err:
+ release_open_intent(nd);
+ nd->intent.open.file = (struct file *)dentry;
+ goto out;
+}
+EXPORT_SYMBOL_GPL(lookup_instantiate_filp);
+
+/**
+ * nameidata_to_filp - convert a nameidata to an open filp.
+ * @nd: pointer to nameidata
+ * @flags: open flags
+ *
+ * Note that this function destroys the original nameidata
+ */
+struct file *nameidata_to_filp(struct nameidata *nd, int flags)
+{
+ struct file *filp;
+
+ /* Pick up the filp from the open intent */
+ filp = nd->intent.open.file;
+ /* Has the filesystem initialised the file for us? */
+ if (filp->f_dentry == NULL)
+ filp = __dentry_open(nd->dentry, nd->mnt, flags, filp, NULL);
+ else
+ path_release(nd);
+ return filp;
+}
+
struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
{
int error;
@@ -846,7 +897,7 @@ struct file *dentry_open(struct dentry *
if (f == NULL)
return ERR_PTR(error);

- return __dentry_open(dentry, mnt, flags, f);
+ return __dentry_open(dentry, mnt, flags, f, NULL);
}
EXPORT_SYMBOL(dentry_open);


Attachments:
linux-2.6.14-11-open_file_intents.dif (10.90 kB)

2005-10-12 19:55:16

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RFC] atomic create+open

> > > I don't care either way since we will not be supporting non-intent based
> > > opens for NFSv4.
> >
> > I need this for FUSE, since non-create opens and non-exclusive open of
> > positive dentry will be done through ->open().
>
> How about something like the following instead? That gives you the
> option of choosing a non-standard initialisation for the intent case,
> with the default being ->open().

Fine by me.

Thanks,
Miklos