2005-11-08 02:04:54

by Al Viro

[permalink] [raw]
Subject: [PATCH 12/18] shared mount handling: bind and rbind

From: Ram Pai <[email protected]>
Date: 1131401990 -0500

Handling of MS_BIND in presense of shared mounts (see
Documentation/sharedsubtree.txt in the end of patch series
for detailed description).

Signed-off-by: Ram Pai ([email protected])
Signed-off-by: Al Viro <[email protected]>

---

fs/namespace.c | 126 +++++++++++++++++++++++++++++++++++++++++++---------
fs/pnode.c | 81 +++++++++++++++++++++++++++++++++
fs/pnode.h | 14 ++++++
include/linux/fs.h | 5 ++
4 files changed, 204 insertions(+), 22 deletions(-)

61916467f11424161a197c5aab53a6bb79619911
diff --git a/fs/namespace.c b/fs/namespace.c
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -28,8 +28,6 @@

extern int __init init_rootfs(void);

-#define CL_EXPIRE 0x01
-
#ifdef CONFIG_SYSFS
extern int __init sysfs_init(void);
#else
@@ -145,13 +143,43 @@ static void detach_mnt(struct vfsmount *
old_nd->dentry->d_mounted--;
}

+void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
+ struct vfsmount *child_mnt)
+{
+ child_mnt->mnt_parent = mntget(mnt);
+ child_mnt->mnt_mountpoint = dget(dentry);
+ dentry->d_mounted++;
+}
+
static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd)
{
- mnt->mnt_parent = mntget(nd->mnt);
- mnt->mnt_mountpoint = dget(nd->dentry);
- list_add(&mnt->mnt_hash, mount_hashtable + hash(nd->mnt, nd->dentry));
+ mnt_set_mountpoint(nd->mnt, nd->dentry, mnt);
+ list_add_tail(&mnt->mnt_hash, mount_hashtable +
+ hash(nd->mnt, nd->dentry));
list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
- nd->dentry->d_mounted++;
+}
+
+/*
+ * the caller must hold vfsmount_lock
+ */
+static void commit_tree(struct vfsmount *mnt)
+{
+ struct vfsmount *parent = mnt->mnt_parent;
+ struct vfsmount *m;
+ LIST_HEAD(head);
+ struct namespace *n = parent->mnt_namespace;
+
+ BUG_ON(parent == mnt);
+
+ list_add_tail(&head, &mnt->mnt_list);
+ list_for_each_entry(m, &head, mnt_list)
+ m->mnt_namespace = n;
+ list_splice(&head, n->list.prev);
+
+ list_add_tail(&mnt->mnt_hash, mount_hashtable +
+ hash(parent, mnt->mnt_mountpoint));
+ list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+ touch_namespace(n);
}

static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
@@ -183,7 +211,11 @@ static struct vfsmount *clone_mnt(struct
mnt->mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
- mnt->mnt_namespace = current->namespace;
+
+ if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
+ list_add(&mnt->mnt_share, &old->mnt_share);
+ if (flag & CL_MAKE_SHARED)
+ set_mnt_shared(mnt);

/* stick the duplicate mount on the same expiry list
* as the original if that was on one */
@@ -379,7 +411,7 @@ int may_umount(struct vfsmount *mnt)

EXPORT_SYMBOL(may_umount);

-static void release_mounts(struct list_head *head)
+void release_mounts(struct list_head *head)
{
struct vfsmount *mnt;
while(!list_empty(head)) {
@@ -401,7 +433,7 @@ static void release_mounts(struct list_h
}
}

-static void umount_tree(struct vfsmount *mnt, struct list_head *kill)
+void umount_tree(struct vfsmount *mnt, struct list_head *kill)
{
struct vfsmount *p;

@@ -581,7 +613,7 @@ static int lives_below_in_same_fs(struct
}
}

-static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
+struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
int flag)
{
struct vfsmount *res, *p, *q, *r, *s;
@@ -626,6 +658,67 @@ Enomem:
return NULL;
}

+/*
+ * @source_mnt : mount tree to be attached
+ * @nd : place the mount tree @source_mnt is attached
+ *
+ * NOTE: in the table below explains the semantics when a source mount
+ * of a given type is attached to a destination mount of a given type.
+ * ---------------------------------------------
+ * | BIND MOUNT OPERATION |
+ * |********************************************
+ * | source-->| shared | private |
+ * | dest | | |
+ * | | | | |
+ * | v | | |
+ * |********************************************
+ * | shared | shared (++) | shared (+) |
+ * | | | |
+ * |non-shared| shared (+) | private |
+ * *********************************************
+ * A bind operation clones the source mount and mounts the clone on the
+ * destination mount.
+ *
+ * (++) the cloned mount is propagated to all the mounts in the propagation
+ * tree of the destination mount and the cloned mount is added to
+ * the peer group of the source mount.
+ * (+) the cloned mount is created under the destination mount and is marked
+ * as shared. The cloned mount is added to the peer group of the source
+ * mount.
+ *
+ * if the source mount is a tree, the operations explained above is
+ * applied to each mount in the tree.
+ * Must be called without spinlocks held, since this function can sleep
+ * in allocations.
+ */
+static int attach_recursive_mnt(struct vfsmount *source_mnt,
+ struct nameidata *nd)
+{
+ LIST_HEAD(tree_list);
+ struct vfsmount *dest_mnt = nd->mnt;
+ struct dentry *dest_dentry = nd->dentry;
+ struct vfsmount *child, *p;
+
+ if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
+ return -EINVAL;
+
+ if (IS_MNT_SHARED(dest_mnt)) {
+ for (p = source_mnt; p; p = next_mnt(p, source_mnt))
+ set_mnt_shared(p);
+ }
+
+ spin_lock(&vfsmount_lock);
+ mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
+ commit_tree(source_mnt);
+
+ list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
+ list_del_init(&child->mnt_hash);
+ commit_tree(child);
+ }
+ spin_unlock(&vfsmount_lock);
+ return 0;
+}
+
static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
{
int err;
@@ -646,17 +739,8 @@ static int graft_tree(struct vfsmount *m
goto out_unlock;

err = -ENOENT;
- spin_lock(&vfsmount_lock);
- if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
- struct list_head head;
-
- attach_mnt(mnt, nd);
- list_add_tail(&head, &mnt->mnt_list);
- list_splice(&head, current->namespace->list.prev);
- err = 0;
- touch_namespace(current->namespace);
- }
- spin_unlock(&vfsmount_lock);
+ if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry))
+ err = attach_recursive_mnt(mnt, nd);
out_unlock:
up(&nd->dentry->d_inode->i_sem);
if (!err)
diff --git a/fs/pnode.c b/fs/pnode.c
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -20,9 +20,88 @@ static inline struct vfsmount *next_peer
void change_mnt_propagation(struct vfsmount *mnt, int type)
{
if (type == MS_SHARED) {
- mnt->mnt_flags |= MNT_SHARED;
+ set_mnt_shared(mnt);
} else {
list_del_init(&mnt->mnt_share);
mnt->mnt_flags &= ~MNT_PNODE_MASK;
}
}
+
+/*
+ * get the next mount in the propagation tree.
+ * @m: the mount seen last
+ * @origin: the original mount from where the tree walk initiated
+ */
+static struct vfsmount *propagation_next(struct vfsmount *m,
+ struct vfsmount *origin)
+{
+ m = next_peer(m);
+ if (m == origin)
+ return NULL;
+ return m;
+}
+
+/*
+ * mount 'source_mnt' under the destination 'dest_mnt' at
+ * dentry 'dest_dentry'. And propagate that mount to
+ * all the peer and slave mounts of 'dest_mnt'.
+ * Link all the new mounts into a propagation tree headed at
+ * source_mnt. Also link all the new mounts using ->mnt_list
+ * headed at source_mnt's ->mnt_list
+ *
+ * @dest_mnt: destination mount.
+ * @dest_dentry: destination dentry.
+ * @source_mnt: source mount.
+ * @tree_list : list of heads of trees to be attached.
+ */
+int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
+ struct vfsmount *source_mnt, struct list_head *tree_list)
+{
+ struct vfsmount *m, *child;
+ int ret = 0;
+ struct vfsmount *prev_dest_mnt = dest_mnt;
+ struct vfsmount *prev_src_mnt = source_mnt;
+ LIST_HEAD(tmp_list);
+ LIST_HEAD(umount_list);
+
+ for (m = propagation_next(dest_mnt, dest_mnt); m;
+ m = propagation_next(m, dest_mnt)) {
+ int type = CL_PROPAGATION;
+
+ if (IS_MNT_NEW(m))
+ continue;
+
+ if (IS_MNT_SHARED(m))
+ type |= CL_MAKE_SHARED;
+
+ if (!(child = copy_tree(source_mnt, source_mnt->mnt_root,
+ type))) {
+ ret = -ENOMEM;
+ list_splice(tree_list, tmp_list.prev);
+ goto out;
+ }
+
+ if (is_subdir(dest_dentry, m->mnt_root)) {
+ mnt_set_mountpoint(m, dest_dentry, child);
+ list_add_tail(&child->mnt_hash, tree_list);
+ } else {
+ /*
+ * This can happen if the parent mount was bind mounted
+ * on some subdirectory of a shared/slave mount.
+ */
+ list_add_tail(&child->mnt_hash, &tmp_list);
+ }
+ prev_dest_mnt = m;
+ prev_src_mnt = child;
+ }
+out:
+ spin_lock(&vfsmount_lock);
+ while (!list_empty(&tmp_list)) {
+ child = list_entry(tmp_list.next, struct vfsmount, mnt_hash);
+ list_del_init(&child->mnt_hash);
+ umount_tree(child, &umount_list);
+ }
+ spin_unlock(&vfsmount_lock);
+ release_mounts(&umount_list);
+ return ret;
+}
diff --git a/fs/pnode.h b/fs/pnode.h
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -12,7 +12,21 @@
#include <linux/mount.h>

#define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED)
+#define IS_MNT_NEW(mnt) (!mnt->mnt_namespace)
#define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~MNT_SHARED)

+#define CL_EXPIRE 0x01
+#define CL_COPY_ALL 0x04
+#define CL_MAKE_SHARED 0x08
+#define CL_PROPAGATION 0x10
+
+static inline void set_mnt_shared(struct vfsmount *mnt)
+{
+ mnt->mnt_flags &= ~MNT_PNODE_MASK;
+ mnt->mnt_flags |= MNT_SHARED;
+}
+
void change_mnt_propagation(struct vfsmount *, int);
+int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *,
+ struct list_head *);
#endif /* _LINUX_PNODE_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1251,7 +1251,12 @@ extern int unregister_filesystem(struct
extern struct vfsmount *kern_mount(struct file_system_type *);
extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *);
+extern void umount_tree(struct vfsmount *, struct list_head *);
+extern void release_mounts(struct list_head *);
extern long do_mount(char *, char *, char *, unsigned long, void *);
+extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
+extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
+ struct vfsmount *);

extern int vfs_statfs(struct super_block *, struct kstatfs *);


2005-11-08 14:12:05

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

> static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd)
> {
> - mnt->mnt_parent = mntget(nd->mnt);
> - mnt->mnt_mountpoint = dget(nd->dentry);
> - list_add(&mnt->mnt_hash, mount_hashtable + hash(nd->mnt, nd->dentry));
> + mnt_set_mountpoint(nd->mnt, nd->dentry, mnt);
> + list_add_tail(&mnt->mnt_hash, mount_hashtable +
> + hash(nd->mnt, nd->dentry));

Ram,

IIRC the list_add -> list_add_tail change has been voted down. Or do
you have new reasons why it's needed?

Miklos

2005-11-08 15:49:09

by Ram Pai

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

On Tue, 2005-11-08 at 06:11, Miklos Szeredi wrote:
> > static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd)
> > {
> > - mnt->mnt_parent = mntget(nd->mnt);
> > - mnt->mnt_mountpoint = dget(nd->dentry);
> > - list_add(&mnt->mnt_hash, mount_hashtable + hash(nd->mnt, nd->dentry));
> > + mnt_set_mountpoint(nd->mnt, nd->dentry, mnt);
> > + list_add_tail(&mnt->mnt_hash, mount_hashtable +
> > + hash(nd->mnt, nd->dentry));
>
> Ram,
>
> IIRC the list_add -> list_add_tail change has been voted down. Or do
> you have new reasons why it's needed?

No. As explained in the same earlier threads; without this change the
behavior of shared-subtrees leads to inconsistency and confusion in some
scenarios.

Under the premise that no application should depend on this behavior
(most-recent-mount-visible v/s top-most-mount-visible), Al Viro
permitted this change. And this is certainly the right behavior.

RP
>
> Miklos
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2005-11-08 15:55:43

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

> No. As explained in the same earlier threads; without this change the
> behavior of shared-subtrees leads to inconsistency and confusion in some
> scenarios.
>
> Under the premise that no application should depend on this behavior
> (most-recent-mount-visible v/s top-most-mount-visible),

The strongest argument against was that

mount foo .; umount .

would no longer be a no-op.

> Al Viro permitted this change. And this is certainly the right
> behavior.

Which is a contradiction in term, since you are saying that
applications _do_ depend on it.

Miklos

2005-11-09 10:55:13

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

> +/*
> + * mount 'source_mnt' under the destination 'dest_mnt' at
> + * dentry 'dest_dentry'. And propagate that mount to
> + * all the peer and slave mounts of 'dest_mnt'.
> + * Link all the new mounts into a propagation tree headed at
> + * source_mnt. Also link all the new mounts using ->mnt_list
> + * headed at source_mnt's ->mnt_list
> + *
> + * @dest_mnt: destination mount.
> + * @dest_dentry: destination dentry.
> + * @source_mnt: source mount.
> + * @tree_list : list of heads of trees to be attached.
> + */
> +int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
> + struct vfsmount *source_mnt, struct list_head *tree_list)
> +{
> + struct vfsmount *m, *child;
> + int ret = 0;
> + struct vfsmount *prev_dest_mnt = dest_mnt;
> + struct vfsmount *prev_src_mnt = source_mnt;
> + LIST_HEAD(tmp_list);
> + LIST_HEAD(umount_list);
> +
> + for (m = propagation_next(dest_mnt, dest_mnt); m;
> + m = propagation_next(m, dest_mnt)) {
> + int type = CL_PROPAGATION;
> +
> + if (IS_MNT_NEW(m))
> + continue;
> +
> + if (IS_MNT_SHARED(m))
> + type |= CL_MAKE_SHARED;
> +
> + if (!(child = copy_tree(source_mnt, source_mnt->mnt_root,
> + type))) {
> + ret = -ENOMEM;
> + list_splice(tree_list, tmp_list.prev);
> + goto out;
> + }
> +
> + if (is_subdir(dest_dentry, m->mnt_root)) {

Shouldn't this check go before copy_tree()? Not much point in copying
the tree if we are sure it won't be used.

> + mnt_set_mountpoint(m, dest_dentry, child);
> + list_add_tail(&child->mnt_hash, tree_list);
> + } else {
> + /*
> + * This can happen if the parent mount was bind mounted
> + * on some subdirectory of a shared/slave mount.
> + */

I can't grok this comment. Shouldn't it read something like

... if 'm' is a result of a bind from a subdirectory of 'dest_dentry'


Miklos

2005-11-09 14:31:12

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

On Wed, Nov 09, 2005 at 11:54:36AM +0100, Miklos Szeredi wrote:
> Shouldn't this check go before copy_tree()? Not much point in copying
> the tree if we are sure it won't be used.

Incorrect. Propagation nodes further down the tree can very well be
mountable.

2005-11-09 15:23:40

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

> On Wed, Nov 09, 2005 at 11:54:36AM +0100, Miklos Szeredi wrote:
> > Shouldn't this check go before copy_tree()? Not much point in copying
> > the tree if we are sure it won't be used.
>
> Incorrect. Propagation nodes further down the tree can very well be
> mountable.

Can you please give an example. I'm feeling thick.

What I see in the code is that the the mount tree is copied, then put
on the tmp_list, and at the end the newly copied tree is freed with
umount_tree().

+ if (!(child = copy_tree(source_mnt, source_mnt->mnt_root,
+ type))) {
+ ret = -ENOMEM;
+ list_splice(tree_list, tmp_list.prev);
+ goto out;
+ }
+
+ if (is_subdir(dest_dentry, m->mnt_root)) {
+ mnt_set_mountpoint(m, dest_dentry, child);
+ list_add_tail(&child->mnt_hash, tree_list);
+ } else {
+ /*
+ * This can happen if the parent mount was bind mounted
+ * on some subdirectory of a shared/slave mount.
+ */
+ list_add_tail(&child->mnt_hash, &tmp_list);
+ }
+ prev_dest_mnt = m;
+ prev_src_mnt = child;
+ }
+out:
+ spin_lock(&vfsmount_lock);
+ while (!list_empty(&tmp_list)) {
+ child = list_entry(tmp_list.next, struct vfsmount, mnt_hash);
+ list_del_init(&child->mnt_hash);
+ umount_tree(child, &umount_list);
+ }
+ spin_unlock(&vfsmount_lock);
+ release_mounts(&umount_list);
+ return ret;

2005-11-09 15:56:39

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

On Wed, Nov 09, 2005 at 04:22:23PM +0100, Miklos Szeredi wrote:
> > On Wed, Nov 09, 2005 at 11:54:36AM +0100, Miklos Szeredi wrote:
> > > Shouldn't this check go before copy_tree()? Not much point in copying
> > > the tree if we are sure it won't be used.
> >
> > Incorrect. Propagation nodes further down the tree can very well be
> > mountable.
>
> Can you please give an example. I'm feeling thick.
>
> What I see in the code is that the the mount tree is copied, then put
> on the tmp_list, and at the end the newly copied tree is freed with
> umount_tree().

Before it gets freed it may end up being copied. Example: vfsmounts
A and B are peers, C is a slave of that peer group. It happens to be
on slave list of B. B has root deeper than A, which, in turn is deeper
than that of C (e.g. A and B had been created by binding subtrees of
C, which had been made slave afterwards). We bind on something in A,
outside of the subtree mapped by B.

Alternatively, have A -> (B, D) -> C, with C on slave list of B. Mountpoint
is within subtrees for A, C and D, but not B. And no, we can't say "skip B,
just make a slave of tree on A and slap it on C" - correct result is to
have T_A -> T_D -> T_C (i.e. tree on C gets propagation from tree on D).
Which kills the variants with not creating that copy and making subsequent
ones directly from the original tree.

2005-11-09 16:33:47

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

> Before it gets freed it may end up being copied. Example: vfsmounts
> A and B are peers, C is a slave of that peer group. It happens to be
> on slave list of B. B has root deeper than A, which, in turn is deeper
> than that of C (e.g. A and B had been created by binding subtrees of
> C, which had been made slave afterwards). We bind on something in A,
> outside of the subtree mapped by B.
>
> Alternatively, have A -> (B, D) -> C, with C on slave list of B. Mountpoint
> is within subtrees for A, C and D, but not B. And no, we can't say "skip B,
> just make a slave of tree on A and slap it on C" - correct result is to
> have T_A -> T_D -> T_C (i.e. tree on C gets propagation from tree on D).
> Which kills the variants with not creating that copy and making subsequent
> ones directly from the original tree.

OK, I see it now. What confused me is that from patch 12 it's not yet
obvious, that the copied mount will be used for futher propagation.

Miklos

2005-11-09 18:44:39

by Ram Pai

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

On Tue, 2005-11-08 at 07:55, Miklos Szeredi wrote:
> > No. As explained in the same earlier threads; without this change the
> > behavior of shared-subtrees leads to inconsistency and confusion in some
> > scenarios.
> >
> > Under the premise that no application should depend on this behavior
> > (most-recent-mount-visible v/s top-most-mount-visible),
>
> The strongest argument against was that
>
> mount foo .; umount .
>
> would no longer be a no-op.

It is a no-op even now. Try it.

What you meant was something like this is no-op now?

P1: cd /tmp1
P2: mount --bind /var /tmp1
P1: mount --bind /bin .
P1: umount .

Yes this will not be a noop with my changes. However this behavior is
not documented to be a no-op anywhere. right? And 'umount .' really
doen't make sense. What does it mean? umount the current mount? or
umount of the mount that is mounted on this dentry?

My changes just changes one behavior and that is:
If multiple mounts are mounted on the same <mount,dentry> combination
the later mounts are obscured by the preceeding mounts. Earlier it was
opposite behavior.

My biggest complaint about the earlier behavior was that the later
mounts obscured not only the earlier mounts on the <mount,dentry> tuple,
but also obscured all the mounts that got stacked on top of the
earlier <mount,dentry>. It seemed totally unnatural, and confusing
with shared-subtree.


> > Al Viro permitted this change. And this is certainly the right
> > behavior.
>
> Which is a contradiction in term, since you are saying that
> applications _do_ depend on it.

no. I said application _should_not_ depend on it, because it is a
undefined semantics.

Sorry I was out yesterday and could reply earlier.
RP

>
> Miklos

2005-11-09 19:00:06

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind



On Wed, 9 Nov 2005, Ram Pai wrote:
>
> And 'umount .' really doen't make sense. What does it mean? umount the
> current mount? or umount of the mount that is mounted on this dentry?

"umount <directory>" _absolutely_ makes sense, whether "directory" is "."
or something else. People do it all the time.

Now, if it doesn't unmount the last thing mounted on top of ".", then
that's a misfeature. It might be a misfeature in the mount program (it
might scan /etc/mounts top-to-bottom rather than the other way), but the
kernel should also support it.

> no. I said application _should_not_ depend on it, because it is a
> undefined semantics.

It's definitely neither unusual nor undefined. I do all my umounts by
directory (in fact, doing it by anything else really _is_ badly defined,
since a block device can be mounted in many places), and the only sane
semantics would be to peel off the last mount on that directory.

Now, that doesn't necessarily mean that "list_add_tail()" is wrong. But
if we add new mounts to the end, then umount remove them from the end too,
no?

Linus

2005-11-09 19:26:15

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

On Wed, Nov 09, 2005 at 10:59:47AM -0800, Linus Torvalds wrote:
>
>
> On Wed, 9 Nov 2005, Ram Pai wrote:
> >
> > And 'umount .' really doen't make sense. What does it mean? umount the
> > current mount? or umount of the mount that is mounted on this dentry?
>
> "umount <directory>" _absolutely_ makes sense, whether "directory" is "."
> or something else. People do it all the time.

With current (and all previous, actually) tree umount . is usually -EBUSY.

The case Mikulas is talking about is much uglier - it's "mount on top
of current directory, then umount .". _That_ (i.e. when . is overmounted)
happens to work. And semantics is really, really not well-defined.

Situation with overmounts is nasty - it's *not* just a chain, unfortunately.
I certainly intended it to be such. However, we can get a *tree* of
overmounts due to side-effects I've missed back then.

We really need it sanitized (and that's what I'm doing right now), but
yes, it *will* cause user-visible changes. Incidentally, one of those
will be that umount . will work...

The trouble begins since we allow to attach vfsmounts to the *middle* of
overmount chain. I.e.

mount foo /tmp
cd /tmp
mount bar .
mount baz .

will end up with *two* vfsmounts having root of foo as mountpoint and having
the same mnt_parent. Which one is seen depends on phase of moon - the only
answer is "whichever is first in mnt_hash chain". Which is certainly not a
sane answer. We need explicit rules dealing with effect of overmounts;
anything that seriously relies on details of current behaviour in that sort
of corner cases is very definitely broken.

And "we allow" above should be read as "Al had not thought about that mess
back in 2001" ;-/ Current behaviour in that sort setups is an accident -
as soon as it gets to such forked chains of overmounts sanity exits stage
left. To be fixed...

2005-11-09 19:28:56

by Ram Pai

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

On Wed, 2005-11-09 at 10:59, Linus Torvalds wrote:
> On Wed, 9 Nov 2005, Ram Pai wrote:
> >
> > And 'umount .' really doen't make sense. What does it mean? umount the
> > current mount? or umount of the mount that is mounted on this dentry?
>
> "umount <directory>" _absolutely_ makes sense, whether "directory" is "."
> or something else. People do it all the time.

ok. so you say 'umount <dir>' means umount something mounted on top of
dir. In that case, when I say 'umount /tmp' it has to unmount something
that is mounted on top of /tmp, but there cannot be anything mounted on
top of /tmp. In the first place I cannot reach this mount since it is
obscured by the mount on top. right?

eg: m1 is the root mount
m2 is the mount on top of tmp on the root mount
m3 is the mount on top of tmp(root dentry) of m2

'umount /tmp'
will mean umount something that is mounted on top of root dentry of m3.



>
> Now, if it doesn't unmount the last thing mounted on top of ".", then
> that's a misfeature. It might be a misfeature in the mount program (it
> might scan /etc/mounts top-to-bottom rather than the other way), but the
> kernel should also support it.
>
> > no. I said application _should_not_ depend on it, because it is a
> > undefined semantics.
>
> It's definitely neither unusual nor undefined. I do all my umounts by
> directory (in fact, doing it by anything else really _is_ badly defined,
> since a block device can be mounted in many places), and the only sane
> semantics would be to peel off the last mount on that directory.
>
> Now, that doesn't necessarily mean that "list_add_tail()" is wrong. But
> if we add new mounts to the end, then umount remove them from the end too,
> no?

Yes it does. it removes the mounts that receive propagation from the
tail. Anything that has been asked to be unmounted explicitly will be
removed irrespective of where it is on the list, and always it is at the
head because lookup_mnt() always returns the one at the head.

RP
>
> Linus

2005-11-16 03:30:13

by Rob Landley

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

On Wednesday 09 November 2005 12:59, Linus Torvalds wrote:
> > no. I said application _should_not_ depend on it, because it is a
> > undefined semantics.
>
> It's definitely neither unusual nor undefined. I do all my umounts by
> directory (in fact, doing it by anything else really _is_ badly defined,
> since a block device can be mounted in many places), and the only sane
> semantics would be to peel off the last mount on that directory.

I noticed this upgrading busybox mount a few months back. I was trying to
figure out if the correct semantics for umount /dev/block were to umount
_all_ instances of this block device, or umount just the most recent one. I
wound up just passing it through to the kernel and letting it decide, but I
wasn't sure why it did what it did.

The 2.6 multiple mount semantics are still new enough that the tools are just
now catching up. Last I checked, the standard mount was kind of unhappy with
--bind and --move mounts (they were corrupting /etc/mtab):

http://www.busybox.net/lists/busybox/2005-August/015285.html

The side effects of mount can be really non-obvious at times. For example,
while implementing busybox's switch_root I found out that this snippet of
klibc's run-init.c is slightly wrong:
if ( chdir(realroot) )
die("chdir to new root");
/* snip */
/* Overmount the root */
if ( mount(".", "/", NULL, MS_MOVE, NULL) )
die("overmounting root");

/* chroot, chdir */
if ( chroot(".") || chdir("/") )
die("chroot");

The || fallback in the third part won't work. chroot(".") will get you to the
new filesystem, but chdir("/") still gets you to the old one, even though
we've overmounted it. (I have no idea why. I assume it's because / is
special.)

Rob

2005-11-16 03:53:53

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind



On Tue, 15 Nov 2005, Rob Landley wrote:
>
> The || fallback in the third part won't work. chroot(".") will get you to the
> new filesystem, but chdir("/") still gets you to the old one, even though
> we've overmounted it. (I have no idea why. I assume it's because / is
> special.)

'/' is special exactly the same way '.' is: one is shorthand for "current
process' root", and the other is shorthand for "current process' cwd".

So if you mount over '/', it won't actually do what you think it does:
because when you open "/", it will continue to open the _old_ "/". Exactly
the same way that mounting over somebody's cwd won't do what you think it
does - because the root and the cwd have been looked-up earlier and are
cached with the process.

This is why we have "pivot_root()" and "chroot()", which can both be used
to do what you want to do. You mount the new root somewhere else, and then
you chroot (or pivot-root) to it. And THEN you do 'chdir("/")' to move the
cwd into the new root too (and only at that point have you "lost" the old
root - although you can actually get it back if you have some file
descriptor open to it).

Linus

2005-11-16 05:44:28

by Al Boldi

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

Linus Torvalds wrote:
> This is why we have "pivot_root()" and "chroot()", which can both be used
> to do what you want to do. You mount the new root somewhere else, and then
> you chroot (or pivot-root) to it. And THEN you do 'chdir("/")' to move the
> cwd into the new root too (and only at that point have you "lost" the old
> root - although you can actually get it back if you have some file
> descriptor open to it).

Wouldn't this constitute a security flaw?

Shouldn't chroot jail you?

--
Al

2005-11-16 08:20:46

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

> > This is why we have "pivot_root()" and "chroot()", which can both be used
> > to do what you want to do. You mount the new root somewhere else, and then
> > you chroot (or pivot-root) to it. And THEN you do 'chdir("/")' to move the
> > cwd into the new root too (and only at that point have you "lost" the old
> > root - although you can actually get it back if you have some file
> > descriptor open to it).
>
> Wouldn't this constitute a security flaw?
>
> Shouldn't chroot jail you?

No, chroot should just change the root.

If you don't want to be able to get back the old root, just close all
file descriptors _in addition_ to chroot() and chdir().

Miklos

2005-11-16 08:42:08

by Rob Landley

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

On Tuesday 15 November 2005 21:53, Linus Torvalds wrote:
> So if you mount over '/', it won't actually do what you think it does:
> because when you open "/", it will continue to open the _old_ "/". Exactly
> the same way that mounting over somebody's cwd won't do what you think it
> does - because the root and the cwd have been looked-up earlier and are
> cached with the process.

So does mounting over / actually accomplish anything? Or is it sort of an
undermount instead of an overmount, resulting in a mounted but inaccessible
filesystem? (In theory, fork() would copy the current cached value of "/",
and all absolute path lookups are really sort of relative paths from the
cached "/"...)

I ask because I'm trying to figure out what switch_root's "mount --move . /"
accomplishes, other than making /proc/mounts look right. If we just did the
chroot(".") it'd be functionally the same, and slightly smaller (which
busybox cares about).

I'm also remembering that while playing around with stuff in a PID 1 shell
under UML (trying to figure out how to implement pivot_root), I mounted
something directly on / (which was a NOP) and then umount was also a NOP
(presumably because it was trying to umount rootfs), meaning I had a mount
that had simply _leaked_. It still showed up in /proc/mounts, but was
totally inaccessable and couldn't be removed either.

I guess that's a "don't do that then".

> This is why we have "pivot_root()" and "chroot()", which can both be used
> to do what you want to do. You mount the new root somewhere else, and then
> you chroot (or pivot-root) to it. And THEN you do 'chdir("/")' to move the
> cwd into the new root too (and only at that point have you "lost" the old
> root - although you can actually get it back if you have some file
> descriptor open to it).

So all chroot(2) really does is reset the "/" reference?

In the specific case of "mount --move . /" || chroot ("."), I don't see why we
need a chdir afterwards, because cwd points to the correct filesystem. (In
fact, for a moment there between the mount move and the chroot it's the
_only_ reference we have to this filesystem.)

Perhaps ".." isn't correct unless we chdir again...?

> Linus

Rob

2005-11-16 08:48:04

by Rob Landley

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

On Tuesday 15 November 2005 23:35, Al Boldi wrote:
> Linus Torvalds wrote:
> > This is why we have "pivot_root()" and "chroot()", which can both be used
> > to do what you want to do. You mount the new root somewhere else, and
> > then you chroot (or pivot-root) to it. And THEN you do 'chdir("/")' to
> > move the cwd into the new root too (and only at that point have you
> > "lost" the old root - although you can actually get it back if you have
> > some file descriptor open to it).
>
> Wouldn't this constitute a security flaw?
>
> Shouldn't chroot jail you?

A few years ago I had a build script that compiled a new Linux From Scratch
system I could chroot into, and one of the things in the new chroot
environment was a different boot loader. And for testing purposes (and with
a boot CD standing by) I would chroot into this new environment and run the
lilo in it to add the new test kernel into the boot option list.

One day, I upgraded to a new kernel version and it stopped working, because
chroot had acquired some unwanted security feature that prevented lilo from
properly talking to /dev/hda from within a chroot environment.

I remember being rather put out by this.

Chroot is sometimes used for other purposes than "security".

Rob

2005-11-16 09:11:18

by Rob Landley

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

On Wednesday 16 November 2005 02:19, Miklos Szeredi wrote:
> > > This is why we have "pivot_root()" and "chroot()", which can both be
> > > used to do what you want to do. You mount the new root somewhere else,
> > > and then you chroot (or pivot-root) to it. And THEN you do 'chdir("/")'
> > > to move the cwd into the new root too (and only at that point have you
> > > "lost" the old root - although you can actually get it back if you have
> > > some file descriptor open to it).
> >
> > Wouldn't this constitute a security flaw?
> >
> > Shouldn't chroot jail you?
>
> No, chroot should just change the root.
>
> If you don't want to be able to get back the old root, just close all
> file descriptors _in addition_ to chroot() and chdir().

If you try the chdir by filedescriptor trick on the stdin/stdout/stderr fed
into PID 1 when it's started up by the kernel, which filesystem do you wind
up in? (rootfs?)

I ask because switch_root redoes those to point to /dev/console from the real
root (presumably for security reasons), and this happens _before_ the init on
the real root gets called, and thus before the real root gets to populate
its' own dynamic /dev.

I suppose initramfs could make a temporary /dev, do the mknods for console and
the real root, and then mount --move this tmpdir to the real root's /dev once
that's available (and then let the real root's udev populate it the rest of
the way). Or the real root could have a hard /dev/console living in the
directory that's going to get overmounted by tmpfs later. Or just leave
initramfs accessible until init can switch consoles...

Sigh. I need to document the requirements here...

Rob

2005-11-16 10:15:43

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

> > If you don't want to be able to get back the old root, just close all
> > file descriptors _in addition_ to chroot() and chdir().
>
> If you try the chdir by filedescriptor trick on the stdin/stdout/stderr fed
> into PID 1 when it's started up by the kernel, which filesystem do you wind
> up in? (rootfs?)

You can't fchdir() to a non-directory, so this shouldn't be an issue.

Miklos

2005-11-16 14:00:52

by Shaya Potter

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

On Wed, 2005-11-16 at 09:19 +0100, Miklos Szeredi wrote:
> > > This is why we have "pivot_root()" and "chroot()", which can both be used
> > > to do what you want to do. You mount the new root somewhere else, and then
> > > you chroot (or pivot-root) to it. And THEN you do 'chdir("/")' to move the
> > > cwd into the new root too (and only at that point have you "lost" the old
> > > root - although you can actually get it back if you have some file
> > > descriptor open to it).
> >
> > Wouldn't this constitute a security flaw?
> >
> > Shouldn't chroot jail you?
>
> No, chroot should just change the root.
>
> If you don't want to be able to get back the old root, just close all
> file descriptors _in addition_ to chroot() and chdir().

hah. As long as you're running as root, chroot() again to a directory
below you, and you effectively broken the chroot and can make a relative
path to the old root. :)

I created a patch years ago that creates a chain of "chroot" points, and
any past chroot point would be considered a place that follow_dotdot
would consider a root. There didn't seem much interest in the patch
though.

2005-11-16 16:18:58

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind



On Wed, 16 Nov 2005, Rob Landley wrote:
>
> So does mounting over / actually accomplish anything? Or is it sort of an
> undermount instead of an overmount, resulting in a mounted but inaccessible
> filesystem?

I'd say that _usually_ you're better off using chroot() than mounting over
"/".

> So all chroot(2) really does is reset the "/" reference?

Yes. Literally. Everything else stays the same, including any open files
(and cwd).

It's a "flaw" in chroot if you consider it a jail, but it's used for so
much more than that. In fact, you shouldn't consider it jail: it's really
just a small _part_ of the notion of limiting somebody to a specific area.

(The smallest part, in fact. And you should be aware that root can always
get out of a chdir() if he just has enough tools - and the tools aren't
even very big. "mknod" + "mount" will do it even in the absense of a way
to add binaries, as will /proc access).

Note that the most common use of chroot isn't actually the "jail" kind of
usage, but building and installation environments (ie a lot of package
building stuff end up using chroot as a way to create the "target
environment").

> In the specific case of "mount --move . /" || chroot ("."), I don't see why we
> need a chdir afterwards, because cwd points to the correct filesystem. (In
> fact, for a moment there between the mount move and the chroot it's the
> _only_ reference we have to this filesystem.)
>
> Perhaps ".." isn't correct unless we chdir again...?

Indeed. The issue ends up being ".." and "getcwd()", which both want to
know what your root is in order to know where to stop.

Linus

2005-11-16 16:37:17

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

> > > Shouldn't chroot jail you?
> >
> > No, chroot should just change the root.
> >
> > If you don't want to be able to get back the old root, just close all
> > file descriptors _in addition_ to chroot() and chdir().
>
> hah. As long as you're running as root, chroot() again to a directory
> below you, and you effectively broken the chroot and can make a relative
> path to the old root. :)

... AND do 'setuid(nonroot)'. Root can get out of a chroot() jail in
much more interesting ways.

Miklos

2005-11-16 20:08:43

by Al Boldi

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

Shaya Potter wrote:
> I created a patch years ago that creates a chain of "chroot" points, and
> any past chroot point would be considered a place that follow_dotdot
> would consider a root. There didn't seem much interest in the patch
> though.

Could you resubmit this patch for possible inclusion in 2.6.16, and make it a
runtime option such that the default behaviour remains unchanged?

Thanks!!!

--
Al

2005-11-16 20:22:28

by Shaya Potter

[permalink] [raw]
Subject: Re: [PATCH 12/18] shared mount handling: bind and rbind

On Wed, 2005-11-16 at 23:05 +0300, Al Boldi wrote:
> Shaya Potter wrote:
> > I created a patch years ago that creates a chain of "chroot" points, and
> > any past chroot point would be considered a place that follow_dotdot
> > would consider a root. There didn't seem much interest in the patch
> > though.
>
> Could you resubmit this patch for possible inclusion in 2.6.16, and make it a
> runtime option such that the default behaviour remains unchanged?

resubmit? It was a long long time ago (middle of 2.4 era) I'd have to
find it, who knows if its still around.

Conceptually it was pretty simple, basically instead of just overwriting
the fs struct root/rootmnt you create a linked list of them (appending
on every chroot). I think I might have had a seperate struct to
maintain that linked list (it wasn't the best code in the world).

follow_dotdot is then modified so that the code

if (nd->dentry == current->fs->root &&
nd->mnt == current->fs->rootmnt) {
read_unlock(&current->fs->lock);
break;
}

instead of being just a single test, loops over every element in the
linked list. Hence, if you are ever at a "chroot" point, you get a
root, so solves the simple problem of breaking out of a chroot by
calling chroot() again. In the normal case (no chroot), there should be
no real performance hit, as it would hit the loop once.

However, as others have said, if you can make a device node, then you
can break out of it in other ways.

Now, there are other things one has to take care of.

1) Obvious: cleaning up the list on process termination.
2) Obvious: propagating list to child processes.

and probably some less obvious things that I don't remember or didn't
even consider at the time.