2003-06-18 14:07:22

by David Howells

[permalink] [raw]
Subject: [PATCH] VFS autmounter support v2


Hi Linus, Al,

I've revised my patch to make sure a process in one namespace doesn't change
the topology of another namespace (kern_automount() will return an error in
that case, as does (u)mount). As a bonus, check_mnt() has been simplified to
take account of the namespace pointer now in vfsmount.

I've also fixes it so that if an automounted subtree is touched with "mount
--bind" or "mount --move", then the base of that tree has its expiry timeout
cleared, thus making it a permanent fixture (until manually unmounted).

David


diff -uNr linux-2.5.72/fs/namei.c linux-2.5.72-auto/fs/namei.c
--- linux-2.5.72/fs/namei.c 2003-06-17 15:01:51.000000000 +0100
+++ linux-2.5.72-auto/fs/namei.c 2003-06-17 15:06:42.000000000 +0100
@@ -434,23 +434,35 @@
return 1;
}

-static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
+static int follow_mount(struct vfsmount **mnt, struct dentry **dentry, unsigned int flags)
{
int res = 0;
- while (d_mountpoint(*dentry)) {
- struct vfsmount *mounted;
- spin_lock(&dcache_lock);
- mounted = lookup_mnt(*mnt, *dentry);
- if (!mounted) {
+ for (;;) {
+
+ if (d_mountpoint(*dentry)) {
+ struct vfsmount *mounted;
+ spin_lock(&dcache_lock);
+ mounted = lookup_mnt(*mnt, *dentry);
+ if (!mounted) {
+ spin_unlock(&dcache_lock);
+ break;
+ }
+ *mnt = mntget(mounted);
spin_unlock(&dcache_lock);
+ dput(*dentry);
+ mntput(mounted->mnt_parent);
+ *dentry = dget(mounted->mnt_root);
+ res = 1;
+
+ } else if (d_automount_point(*dentry)) {
+ if (flags & LOOKUP_NOAUTOMOUNT)
+ break;
+ res = kern_automount(*mnt, *dentry);
+ if (res < 0)
+ break;
+ } else {
break;
}
- *mnt = mntget(mounted);
- spin_unlock(&dcache_lock);
- dput(*dentry);
- mntput(mounted->mnt_parent);
- *dentry = dget(mounted->mnt_root);
- res = 1;
}
return res;
}
@@ -510,7 +522,7 @@
mntput(*mnt);
*mnt = parent;
}
- follow_mount(mnt, dentry);
+ follow_mount(mnt, dentry, 0);
}

struct path {
@@ -643,7 +655,9 @@
if (err)
break;
/* Check mountpoints.. */
- follow_mount(&next.mnt, &next.dentry);
+ err = follow_mount(&next.mnt, &next.dentry, 0);
+ if (err < 0)
+ goto out_dput;

err = -ENOENT;
inode = next.dentry->d_inode;
@@ -703,7 +717,10 @@
err = do_lookup(nd, &this, &next, 0);
if (err)
break;
- follow_mount(&next.mnt, &next.dentry);
+ err = follow_mount(&next.mnt, &next.dentry, nd->flags & LOOKUP_NOAUTOMOUNT);
+ if (err < 0)
+ goto out_dput;
+
inode = next.dentry->d_inode;
if ((lookup_flags & LOOKUP_FOLLOW)
&& inode && inode->i_op && inode->i_op->follow_link) {
diff -uNr linux-2.5.72/fs/namespace.c linux-2.5.72-auto/fs/namespace.c
--- linux-2.5.72/fs/namespace.c 2003-06-17 15:01:51.000000000 +0100
+++ linux-2.5.72-auto/fs/namespace.c 2003-06-18 15:04:30.000000000 +0100
@@ -30,6 +30,8 @@
static int hash_mask, hash_bits;
static kmem_cache_t *mnt_cache;

+static void kern_automount_expiry_timeout(void *_data);
+
static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
unsigned long tmp = ((unsigned long) mnt / L1_CACHE_BYTES);
@@ -84,13 +86,9 @@
return p;
}

-static int check_mnt(struct vfsmount *mnt)
+static inline int check_mnt(struct vfsmount *mnt)
{
- spin_lock(&dcache_lock);
- while (mnt->mnt_parent != mnt)
- mnt = mnt->mnt_parent;
- spin_unlock(&dcache_lock);
- return mnt == current->namespace->root;
+ return mnt->mnt_namespace == current->namespace;
}

static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
@@ -142,6 +140,9 @@
mnt->mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
+ mnt->mnt_namespace = old->mnt_namespace;
+ mnt->mnt_expiry_timeout = old->mnt_expiry_timeout;
+ mnt->mnt_expires_at = old->mnt_expires_at;
}
return mnt;
}
@@ -530,6 +531,7 @@
}

if (mnt) {
+ mnt->mnt_expiry_timeout = 0;
err = graft_tree(mnt, nd);
if (err) {
spin_lock(&dcache_lock);
@@ -622,6 +624,7 @@

detach_mnt(old_nd.mnt, &parent_nd);
attach_mnt(old_nd.mnt, nd);
+ old_nd.mnt->mnt_expiry_timeout = 0;
out2:
spin_unlock(&dcache_lock);
out1:
@@ -674,6 +677,138 @@
return err;
}

+int kern_automount(struct vfsmount *on_mnt, struct dentry *on_dentry)
+{
+ struct nameidata nd;
+ struct vfsmount *mnt;
+ int err;
+
+ if (!on_dentry->d_inode || !S_ISDIR(on_dentry->d_inode->i_mode))
+ return -ENOTDIR;
+
+ if (!check_mnt(on_mnt))
+ return -EACCES;
+
+ mnt = on_dentry->d_op->d_automount(on_dentry);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ mnt->mnt_expires_at = get_seconds() + mnt->mnt_expiry_timeout;
+
+ memset(&nd,0,sizeof(nd));
+ nd.dentry = on_dentry;
+ nd.mnt = on_mnt;
+
+ down_write(&current->namespace->sem);
+ /* Something was mounted here while we slept */
+ while(d_mountpoint(nd.dentry) && follow_down(&nd.mnt, &nd.dentry))
+ ;
+ err = -EINVAL;
+ if (!check_mnt(nd.mnt))
+ goto unlock;
+
+ /* Refuse the same filesystem on the same mount point */
+ err = -EBUSY;
+ if (nd.mnt->mnt_sb == mnt->mnt_sb && nd.mnt->mnt_root == nd.dentry)
+ goto unlock;
+
+ err = graft_tree(mnt, &nd);
+unlock:
+ up_write(&current->namespace->sem);
+ mntput(mnt);
+ return err;
+}
+
+static inline void kern_automount_set_expiry_timer(struct namespace *namespace,
+ unsigned long timeout)
+{
+ spin_lock(&dcache_lock);
+
+ if (atomic_read(&namespace->count) > 0) {
+ get_namespace(namespace);
+
+ if (!schedule_delayed_work(&namespace->mnt_expiry_work,
+ (timeout + 1) * HZ))
+ put_namespace(namespace);
+ }
+
+ spin_unlock(&dcache_lock);
+}
+
+void kern_automount_begin_expiry(struct vfsmount *mnt)
+{
+ mnt->mnt_expires_at = get_seconds() + mnt->mnt_expiry_timeout;
+
+ kern_automount_set_expiry_timer(mnt->mnt_namespace,
+ mnt->mnt_expiry_timeout);
+}
+
+static void kern_automount_expiry_timeout(void *_data)
+{
+ struct namespace *namespace = _data;
+ struct list_head *_p, *_n, graveyard;
+ struct vfsmount *mnt;
+ time_t now;
+ int timeout = INT_MAX, tmp;
+
+ INIT_LIST_HEAD(&graveyard);
+
+ down_write(&namespace->sem);
+
+ now = get_seconds();
+
+ list_for_each_safe(_p, _n, &namespace->list) {
+ mnt = list_entry(_p, struct vfsmount, mnt_list);
+
+ if (mnt->mnt_expiry_timeout &&
+ atomic_read(&mnt->mnt_count) == 1) {
+ spin_lock(&dcache_lock);
+
+ if (atomic_read(&mnt->mnt_count) == 1) {
+ tmp = (int) mnt->mnt_expires_at - (int) now;
+ if (tmp <= 0) {
+ list_move_tail(&mnt->mnt_list,
+ &graveyard);
+ list_del_init(&mnt->mnt_child);
+ list_del_init(&mnt->mnt_hash);
+ mnt->mnt_mountpoint->d_mounted--;
+ } else if (tmp < timeout) {
+ timeout = tmp;
+ }
+ }
+
+ spin_unlock(&dcache_lock);
+ }
+ }
+
+ up_write(&namespace->sem);
+
+ while (!list_empty(&graveyard)) {
+ mnt = list_entry(graveyard.next, struct vfsmount, mnt_list);
+ list_del_init(&mnt->mnt_list);
+
+ dput(xchg(&mnt->mnt_mountpoint, mnt->mnt_root));
+ mntput(xchg(&mnt->mnt_parent, mnt));
+
+ if (atomic_read(&mnt->mnt_sb->s_active) == 1) {
+ /* last instance - try to be smart */
+ lock_kernel();
+ DQUOT_OFF(mnt->mnt_sb);
+ acct_auto_close(mnt->mnt_sb);
+ unlock_kernel();
+ }
+
+ mntput(mnt);
+ }
+
+ if (timeout != INT_MAX) {
+ kern_automount_set_expiry_timer(namespace, timeout);
+ }
+ else {
+ put_namespace(namespace);
+ }
+}
+
static int copy_mount_options (const void __user *data, unsigned long *where)
{
int i;
@@ -800,6 +935,9 @@
init_rwsem(&new_ns->sem);
new_ns->root = NULL;
INIT_LIST_HEAD(&new_ns->list);
+ INIT_WORK(&new_ns->mnt_expiry_work,
+ kern_automount_expiry_timeout,
+ new_ns);

down_write(&tsk->namespace->sem);
/* First pass: copy the tree topology */
@@ -816,6 +954,8 @@
p = namespace->root;
q = new_ns->root;
while (p) {
+ q->mnt_namespace = new_ns;
+
if (p == fs->rootmnt) {
rootmnt = p;
fs->rootmnt = mntget(q);
@@ -844,6 +984,8 @@
if (altrootmnt)
mntput(altrootmnt);

+ kern_automount_set_expiry_timer(namespace, 1);
+
put_namespace(namespace);
return 0;

@@ -1079,9 +1221,13 @@
panic("Can't allocate initial namespace");
atomic_set(&namespace->count, 1);
INIT_LIST_HEAD(&namespace->list);
+ INIT_WORK(&namespace->mnt_expiry_work,
+ kern_automount_expiry_timeout,
+ namespace);
init_rwsem(&namespace->sem);
list_add(&mnt->mnt_list, &namespace->list);
namespace->root = mnt;
+ mnt->mnt_namespace = namespace;

init_task.namespace = namespace;
read_lock(&tasklist_lock);
diff -uNr linux-2.5.72/fs/stat.c linux-2.5.72-auto/fs/stat.c
--- linux-2.5.72/fs/stat.c 2003-06-17 15:01:51.000000000 +0100
+++ linux-2.5.72-auto/fs/stat.c 2003-06-17 15:06:42.000000000 +0100
@@ -61,7 +61,7 @@
struct nameidata nd;
int error;

- error = user_path_walk(name, &nd);
+ error = user_path_walk_stat(name, &nd);
if (!error) {
error = vfs_getattr(nd.mnt, nd.dentry, stat);
path_release(&nd);
@@ -74,7 +74,7 @@
struct nameidata nd;
int error;

- error = user_path_walk_link(name, &nd);
+ error = user_path_walk_link_stat(name, &nd);
if (!error) {
error = vfs_getattr(nd.mnt, nd.dentry, stat);
path_release(&nd);
diff -uNr linux-2.5.72/fs/super.c linux-2.5.72-auto/fs/super.c
--- linux-2.5.72/fs/super.c 2003-06-17 15:01:51.000000000 +0100
+++ linux-2.5.72-auto/fs/super.c 2003-06-17 15:06:42.000000000 +0100
@@ -21,6 +21,7 @@
*/

#include <linux/config.h>
+#include <linux/module.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/acct.h>
@@ -683,6 +684,7 @@
mnt->mnt_root = dget(sb->s_root);
mnt->mnt_mountpoint = sb->s_root;
mnt->mnt_parent = mnt;
+ mnt->mnt_namespace = current->namespace;
up_write(&sb->s_umount);
put_filesystem(type);
return mnt;
@@ -697,6 +699,8 @@
return (struct vfsmount *)sb;
}

+EXPORT_SYMBOL_GPL(do_kern_mount);
+
struct vfsmount *kern_mount(struct file_system_type *type)
{
return do_kern_mount(type->name, 0, type->name, NULL);
diff -uNr linux-2.5.72/include/linux/dcache.h linux-2.5.72-auto/include/linux/dcache.h
--- linux-2.5.72/include/linux/dcache.h 2003-06-17 15:01:36.000000000 +0100
+++ linux-2.5.72-auto/include/linux/dcache.h 2003-06-17 15:06:42.000000000 +0100
@@ -112,6 +112,7 @@
int (*d_delete)(struct dentry *);
void (*d_release)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
+ struct vfsmount *(*d_automount)(struct dentry *);
};

/* the dentry parameter passed to d_hash and d_compare is the parent
@@ -307,6 +308,11 @@
return dentry->d_mounted;
}

+static inline int d_automount_point(struct dentry *dentry)
+{
+ return dentry->d_op && dentry->d_op->d_automount;
+}
+
extern struct vfsmount *lookup_mnt(struct vfsmount *, struct dentry *);
#endif /* __KERNEL__ */

diff -uNr linux-2.5.72/include/linux/mount.h linux-2.5.72-auto/include/linux/mount.h
--- linux-2.5.72/include/linux/mount.h 2003-06-17 15:01:35.000000000 +0100
+++ linux-2.5.72-auto/include/linux/mount.h 2003-06-17 15:06:42.000000000 +0100
@@ -31,6 +31,9 @@
int mnt_flags;
char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
struct list_head mnt_list;
+ struct namespace *mnt_namespace; /* containing namespace */
+ time_t mnt_expires_at; /* time at which automount expires */
+ unsigned mnt_expiry_timeout; /* expiry timeout (in seconds) or 0 */
};

static inline struct vfsmount *mntget(struct vfsmount *mnt)
@@ -40,11 +43,15 @@
return mnt;
}

+extern void kern_automount_begin_expiry(struct vfsmount *mnt);
extern void __mntput(struct vfsmount *mnt);

static inline void mntput(struct vfsmount *mnt)
{
if (mnt) {
+ if (atomic_read(&mnt->mnt_count) == 2 &&
+ mnt->mnt_expiry_timeout)
+ kern_automount_begin_expiry(mnt);
if (atomic_dec_and_test(&mnt->mnt_count))
__mntput(mnt);
}
diff -uNr linux-2.5.72/include/linux/namei.h linux-2.5.72-auto/include/linux/namei.h
--- linux-2.5.72/include/linux/namei.h 2003-06-17 15:01:36.000000000 +0100
+++ linux-2.5.72-auto/include/linux/namei.h 2003-06-17 15:06:42.000000000 +0100
@@ -31,6 +31,7 @@
#define LOOKUP_CONTINUE 4
#define LOOKUP_PARENT 16
#define LOOKUP_NOALT 32
+#define LOOKUP_NOAUTOMOUNT 64


extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *));
@@ -38,6 +39,10 @@
__user_walk(name, LOOKUP_FOLLOW, nd)
#define user_path_walk_link(name,nd) \
__user_walk(name, 0, nd)
+#define user_path_walk_stat(name,nd) \
+ __user_walk(name, LOOKUP_FOLLOW|LOOKUP_NOAUTOMOUNT, nd)
+#define user_path_walk_link_stat(name,nd) \
+ __user_walk(name, LOOKUP_NOAUTOMOUNT, nd)
extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
extern int FASTCALL(path_walk(const char *, struct nameidata *));
extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
@@ -52,4 +57,6 @@
extern struct dentry *lock_rename(struct dentry *, struct dentry *);
extern void unlock_rename(struct dentry *, struct dentry *);

+extern int kern_automount(struct vfsmount *mnt, struct dentry *dentry);
+
#endif /* _LINUX_NAMEI_H */
diff -uNr linux-2.5.72/include/linux/namespace.h linux-2.5.72-auto/include/linux/namespace.h
--- linux-2.5.72/include/linux/namespace.h 2003-06-17 15:01:36.000000000 +0100
+++ linux-2.5.72-auto/include/linux/namespace.h 2003-06-17 15:06:42.000000000 +0100
@@ -10,6 +10,7 @@
struct vfsmount * root;
struct list_head list;
struct rw_semaphore sem;
+ struct work_struct mnt_expiry_work;
};

extern void umount_tree(struct vfsmount *);


2003-06-18 20:45:49

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH] VFS autmounter support v2

On Wed, Jun 18, 2003 at 03:20:15PM +0100, David Howells wrote:
>
> Hi Linus, Al,
>
> I've revised my patch to make sure a process in one namespace doesn't change
> the topology of another namespace (kern_automount() will return an error in
> that case, as does (u)mount). As a bonus, check_mnt() has been simplified to
> take account of the namespace pointer now in vfsmount.

You _still_ don't get it. OK, the last time: kern_automount() will
always do the same thing, no matter which namespace we are it. It
might be OK for AFS, but it's definitely unfit for any other use.

No amount of "use of (u)mount to rearrange topology" will help here -
with your code you have dentry marked, and stepping on it (in any
namespace, in any instance of that fs in a namespace) will always do
the same thing. And that is Wrong(tm).

2003-06-19 09:33:00

by David Howells

[permalink] [raw]
Subject: Re: [PATCH] VFS autmounter support v2


> > I've revised my patch to make sure a process in one namespace doesn't
> > change the topology of another namespace (kern_automount() will return an
> > error in that case, as does (u)mount). As a bonus, check_mnt() has been
> > simplified to take account of the namespace pointer now in vfsmount.
>
> You _still_ don't get it. OK, the last time: kern_automount() will
> always do the same thing, no matter which namespace we are it. It
> might be OK for AFS, but it's definitely unfit for any other use.

Well, I disagree. I think it should do the same thing (but with results
limited to the namespace of whatever triggered the automount) until the
administrator discards that portion of the tree (umount -l will work
there). But I don't think I'm going to get anywhere arguing about it with you.

And, furthermore, I think autofs could be reimplemented in simpler form using
it (and still call out to userspace), but that's just my opinion.

> No amount of "use of (u)mount to rearrange topology" will help here -
> with your code you have dentry marked, and stepping on it (in any
> namespace, in any instance of that fs in a namespace) will always do
> the same thing. And that is Wrong(tm).

Again, I disagree. In my opinion an automount point should still work
correctly in a cloned namespace, until the filesystem that is "proposing" it
is unmounted.

If I mount an automount-capable fs on a directory, and then create a shell in
a derivative namespace, I would _expect_ the automounting to still _work_
until at such point I unmounted that automount-capable fs.

Besides, going back to your original argument, I think you make an invalid
assertion:

Namespaces are completely unrelated - I have them set for two
different users that happen to need some common files, but otherwise
have very different environments.

They are not quite completely unrelated. The only way (or so it seems) to get
a new namespace is to derive one by way of CLONE_NS. This clones the namespace
of the parent, and so the child's namespace then should have all the features
of the parent's namespace _UNTIL_ at such time the child or one of its
descendents rearranges the topology!

If the two users need very different environments, then they'll change their
namespace to suit themselves - thus making my point.

Creating a new namespace does _not_ automatically confer a complete new
topology of an independent predetermined design, nor does it generate a
totally empty namespace.

But, barring that, you are correct... changes to one namespace do not impinge
on any other (/proc not withstanding). My patch honours this. Tripping an
automount point in one namespace does not change another namespace.

| I have two namespaces. One of them has filesystem A mounted on
| /usr/include. Another - on /usr/local/include. The first one wants
| /usr/include/foo1 trigger mounting B and /usr/include/foo2 trigger mounting
| C. The second one wants /usr/local/include/foo1 trigger mounting D and
| /usr/local/include/foo2 not trigger anything.

Firstly, if one of these topologies has been derived from the other, then one
or both of them have been rearranged since the point of divergence. You have
implied that.

Secondly, if "filesystem A" provides those automount points, then it is a
different filesystem in each case, but obviously you're not thinking along
those lines.

What you appear to be thinking of is that some independent third party (autofs
daemon, administrator or whatever) has glued on arbitrary automounting
triggers (perhaps by mounting them).

I think this can make use of my suggested change... If you make a trigger fs
that has its root directory simply an automount point (though this leads me to
think that perhaps (u)mount should follow the example of stat and use
LOOKUP_NOAUTOMOUNT).

Perhaps:

[namespace X]
mount /dev/hda8 /usr/include
mount -t trap "mount-fs-B" /usr/include/foo1
mount -t trap "mount-fs-C" /usr/include/foo2

[namespace Y]
mount /dev/hda8 /usr/local/include
mount -t trap "mount-fs-D" /usr/local/include/foo1

Then have a daemon that can take a request to mount and then reply with the
mount parameters, allowing the trap fs to obtain a vfsmount via
do_kern_mount(). I would make the trap fs supply the daemon with an fd
attached to the trap rootdir to act as a token representing the request (and
controlling its lifetime).

David

2003-06-19 13:50:38

by Mike Waychison

[permalink] [raw]
Subject: Re: [PATCH] VFS autmounter support v2

David Howells wrote:
>
> I think this can make use of my suggested change... If you make a trigger fs
> that has its root directory simply an automount point (though this leads me to
> think that perhaps (u)mount should follow the example of stat and use
> LOOKUP_NOAUTOMOUNT).
>
> Perhaps:
>
> [namespace X]
> mount /dev/hda8 /usr/include
> mount -t trap "mount-fs-B" /usr/include/foo1
> mount -t trap "mount-fs-C" /usr/include/foo2
>
> [namespace Y]
> mount /dev/hda8 /usr/local/include
> mount -t trap "mount-fs-D" /usr/local/include/foo1
>
> Then have a daemon that can take a request to mount and then reply with the
> mount parameters, allowing the trap fs to obtain a vfsmount via
> do_kern_mount(). I would make the trap fs supply the daemon with an fd
> attached to the trap rootdir to act as a token representing the request (and
> controlling its lifetime).
>

You would have to go this route. I think Al's opinion in this is that
your original proposal allows arbitrary dentry's in the tree to act as
traps. As such, there is no way for a derived namespace to manipulate
that trap at all. By implying that the trap is installed via mount says
you are now proposing that every trap is represented by its own superblock.

You're new proposal is exactly what I have been working on, autofs
direct mountpoints using the less intrusive follow_link magic Anvin has
mentioned on a previous thread both here and on autofs@vger.

The one problem with this solution is the following breaks:

# installtrap /foo host:/export/foo
<userspace daemon listens for requests to mount>
# newnssh
newnssh # cd /foo

Oops, the daemon started from the initial namespace doesn't have access
to the namespace in my second shell.

The most reasonable way I can see to cope with this is to allow
CAP_SYS_ADMIN processes the ability to change namespaces. Eg, the
daemon can be told which pid triggered the trap on /foo,
open(/proc/<pid>/mounts) and perform a ioctl(IOC_USENAMESPACE) on it.

What do you guys think?

Mike Waychison

2003-06-19 14:18:04

by David Howells

[permalink] [raw]
Subject: Re: [PATCH] VFS autmounter support v2


> > Then have a daemon that can take a request to mount and then reply with
> > the mount parameters, allowing the trap fs to obtain a vfsmount via
> > do_kern_mount(). I would make the trap fs supply the daemon with an fd
> > attached to the trap rootdir to act as a token representing the request
> > (and controlling its lifetime).
> >
>
> You would have to go this route. I think Al's opinion in this is that
> your original proposal allows arbitrary dentry's in the tree to act as
> traps. As such, there is no way for a derived namespace to manipulate
> that trap at all.

> By implying that the trap is installed via mount says you are now proposing
> that every trap is represented by its own superblock.

This is more or less what Al suggested, except that he suggested "traps" are
special vfsmounts that don't have superblocks, dentries, and inodes.

> You're new proposal is exactly what I have been working on, autofs
> direct mountpoints using the less intrusive follow_link magic Anvin has
> mentioned on a previous thread both here and on autofs@vger.
>
> The one problem with this solution is the following breaks:
>
> # installtrap /foo host:/export/foo
> <userspace daemon listens for requests to mount>
> # newnssh
> newnssh # cd /foo
>
> Oops, the daemon started from the initial namespace doesn't have access
> to the namespace in my second shell.

Yes. You can see that happening now with autofs and amd. However it does work
with my suggestion because the "automounter" code just returns a namespace
independent vfsmount, which the VFS can then bind into the appropriate
namespace and the appropriate place.

> The most reasonable way I can see to cope with this is to allow
> CAP_SYS_ADMIN processes the ability to change namespaces. Eg, the
> daemon can be told which pid triggered the trap on /foo,
> open(/proc/<pid>/mounts) and perform a ioctl(IOC_USENAMESPACE) on it.
>
> What do you guys think?

I think a better way is for the kernel to pass the daemon a file descriptor
attached to the mount point. This would then act as a token representing the
request, and as such it automatically includes the mount point info (struct
file has vfsmount/dentry pointers) and can also be used to manage the lifetime
of the request.

I'd then make there be either a "mount" ioctl/fcntl on that fd that uses the
info stored on the struct file, or perhaps provide an "fmount" syscall (but
you have to be careful - otherwise someone can use an arbitrary
cross-namespace fd to mangle some other namespace).

David

2003-06-19 14:41:56

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH] VFS autmounter support v2

On Thu, Jun 19, 2003 at 10:46:50AM +0100, David Howells wrote:

> Besides, going back to your original argument, I think you make an invalid
> assertion:
>
> Namespaces are completely unrelated - I have them set for two
> different users that happen to need some common files, but otherwise
> have very different environments.
>
> They are not quite completely unrelated. The only way (or so it seems) to get
> a new namespace is to derive one by way of CLONE_NS. This clones the namespace
> of the parent, and so the child's namespace then should have all the features
> of the parent's namespace _UNTIL_ at such time the child or one of its
> descendents rearranges the topology!

And how the fuck do you "rearrange topology" with your patch?!?

2003-06-19 15:03:39

by Mike Waychison

[permalink] [raw]
Subject: Re: [PATCH] VFS autmounter support v2

David Howells wrote:

>>>Then have a daemon that can take a request to mount and then reply with
>>>the mount parameters, allowing the trap fs to obtain a vfsmount via
>>>do_kern_mount(). I would make the trap fs supply the daemon with an fd
>>>attached to the trap rootdir to act as a token representing the request
>>>(and controlling its lifetime).
>>>
>>>
>>>
>>You would have to go this route. I think Al's opinion in this is that
>>your original proposal allows arbitrary dentry's in the tree to act as
>>traps. As such, there is no way for a derived namespace to manipulate
>>that trap at all.
>>
>>
>
>
>
>>By implying that the trap is installed via mount says you are now proposing
>>that every trap is represented by its own superblock.
>>
>>
>
>This is more or less what Al suggested, except that he suggested "traps" are
>special vfsmounts that don't have superblocks, dentries, and inodes.
>

Introducing special trap vfsmounts w/o super_blocks means we can no
longer have arbitrary actions on those traps. AFS wants to define what
happens in kernelspace, autofs wants to define it in userspace. Last I
checked, vfsmount doesn't have an ops structure.

>
>
>
>>You're new proposal is exactly what I have been working on, autofs
>>direct mountpoints using the less intrusive follow_link magic Anvin has
>>mentioned on a previous thread both here and on autofs@vger.
>>
>>The one problem with this solution is the following breaks:
>>
>># installtrap /foo host:/export/foo
>><userspace daemon listens for requests to mount>
>># newnssh
>>newnssh # cd /foo
>>
>>Oops, the daemon started from the initial namespace doesn't have access
>>to the namespace in my second shell.
>>
>>
>
>Yes. You can see that happening now with autofs and amd. However it does work
>with my suggestion because the "automounter" code just returns a namespace
>independent vfsmount, which the VFS can then bind into the appropriate
>namespace and the appropriate place.
>

This only works for mounts performed in kernel space. It doesn't lend
itself to performing mounts in userspace and would force autofs to
re-implement mount(1) parsing/struct packing in kernelspace. Definitely
not a good solution.

>
>
>
>>The most reasonable way I can see to cope with this is to allow
>>CAP_SYS_ADMIN processes the ability to change namespaces. Eg, the
>>daemon can be told which pid triggered the trap on /foo,
>>open(/proc/<pid>/mounts) and perform a ioctl(IOC_USENAMESPACE) on it.
>>
>>What do you guys think?
>>
>>
>
>I think a better way is for the kernel to pass the daemon a file descriptor
>attached to the mount point. This would then act as a token representing the
>request, and as such it automatically includes the mount point info (struct
>file has vfsmount/dentry pointers) and can also be used to manage the lifetime
>of the request.
>
>I'd then make there be either a "mount" ioctl/fcntl on that fd that uses the
>info stored on the struct file, or perhaps provide an "fmount" syscall (but
>you have to be careful - otherwise someone can use an arbitrary
>cross-namespace fd to mangle some other namespace).
>
>
A mount ioctl on the fd is probably a good idea:
- it would require modifying mount(1) so that you can have it use
the fd ioctl in lieu of sys_(old)mount.
- he have to ask ourselves, does this logic really belong in vfs? or
is it better placed in a filesystem-independent implementation.

I'm still partial to the idea that a usenamespace ioctl on
/proc/<pid>/mounts is a cleaner solution in the long run, both for
automounting as well as for administration tools.

Mike Waychison

2003-06-19 15:20:55

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH] VFS autmounter support v2

On Thu, Jun 19, 2003 at 11:13:42AM -0400, Mike Waychison wrote:

> Introducing special trap vfsmounts w/o super_blocks means we can no
> longer have arbitrary actions on those traps. AFS wants to define what
> happens in kernelspace, autofs wants to define it in userspace. Last I
> checked, vfsmount doesn't have an ops structure.

It would have send an event over attached opened file. Attached at
creation time.

> This only works for mounts performed in kernel space. It doesn't lend
> itself to performing mounts in userspace and would force autofs to
> re-implement mount(1) parsing/struct packing in kernelspace. Definitely
> not a good solution.

Or if passed event contains opened mountpoint-to-be.

> I'm still partial to the idea that a usenamespace ioctl on
> /proc/<pid>/mounts is a cleaner solution in the long run, both for
> automounting as well as for administration tools.

Vetoed. ioctl() is _not_ an acceptable way to implement any generic
functionality. It basically says "my interface is a garbage".

And yes, we need to think about a new syscall for mount-related
work. With sane API - mount(2) one is _not_. sys_mount() would
still stay, obviously.

2003-06-19 16:43:28

by Mike Waychison

[permalink] [raw]
Subject: Re: [PATCH] VFS autmounter support v2

[email protected] wrote:

>On Thu, Jun 19, 2003 at 11:13:42AM -0400, Mike Waychison wrote:
>
>
>
>>Introducing special trap vfsmounts w/o super_blocks means we can no
>>longer have arbitrary actions on those traps. AFS wants to define what
>>happens in kernelspace, autofs wants to define it in userspace. Last I
>>checked, vfsmount doesn't have an ops structure.
>>
>>
>
>It would have send an event over attached opened file. Attached at
>creation time.
>
That's a pretty good idea then :)

>
>
>
>>This only works for mounts performed in kernel space. It doesn't lend
>>itself to performing mounts in userspace and would force autofs to
>>re-implement mount(1) parsing/struct packing in kernelspace. Definitely
>>not a good solution.
>>
>>
>
>Or if passed event contains opened mountpoint-to-be.
>
By this, I assume you are implying that infrastructure for mounting on a
given struct file (w/ S_ISDIR) would be made. Correct?

How would this kind of trap be installed in userspace? 'mount -t trap
-o fd=# none /trappoint' which gets caught by the vfs layer in a special
manner I suppose? The vfs system would of course be responsible for
pipe errors/closure. As well, the passed opened mountpoint-to-be would
have to be owned by the process owning the reading end of the pipe.

>
>
>
>>I'm still partial to the idea that a usenamespace ioctl on
>>/proc/<pid>/mounts is a cleaner solution in the long run, both for
>>automounting as well as for administration tools.
>>
>>
>
>Vetoed. ioctl() is _not_ an acceptable way to implement any generic
>functionality. It basically says "my interface is a garbage".
>

Alright. Automounting aside, does it still make sense to have *some*
way for a sys-admin to join an existing namespace? sys_pushns(pid_t
pid)/sys_popns() perhaps? Administrating an environment with multiple
running namespaces may become difficult to administer without such
capability.

>
>And yes, we need to think about a new syscall for mount-related
>work. With sane API - mount(2) one is _not_. sys_mount() would
>still stay, obviously.
>

What is not sane about mount(2)? Are you talking about the
move/bind/remount functionality?



Mike Waychison