2008-03-13 21:29:46

by Miklos Szeredi

[permalink] [raw]
Subject: [patch 3/6] vfs: mountinfo stable peer group id

From: Miklos Szeredi <[email protected]>

Add a stable identifier for shared mounts.

Signed-off-by: Miklos Szeredi <[email protected]>
---
Documentation/filesystems/proc.txt | 12 +------
fs/namespace.c | 7 ++--
fs/pnode.c | 63 ++++++++++++++++++++++++-------------
fs/pnode.h | 1
include/linux/mount.h | 1
5 files changed, 52 insertions(+), 32 deletions(-)

Index: linux/fs/pnode.c
===================================================================
--- linux.orig/fs/pnode.c 2008-03-13 20:45:49.000000000 +0100
+++ linux/fs/pnode.c 2008-03-13 20:45:50.000000000 +0100
@@ -9,8 +9,12 @@
#include <linux/mnt_namespace.h>
#include <linux/mount.h>
#include <linux/fs.h>
+#include <linux/idr.h>
#include "pnode.h"

+static DEFINE_SPINLOCK(mnt_pgid_lock);
+static DEFINE_IDA(mnt_pgid_ida);
+
/* return the next shared peer mount of @p */
static inline struct vfsmount *next_peer(struct vfsmount *p)
{
@@ -27,47 +31,58 @@ static inline struct vfsmount *next_slav
return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave);
}

-void set_mnt_shared(struct vfsmount *mnt)
+static void __set_mnt_shared(struct vfsmount *mnt)
{
mnt->mnt_flags &= ~MNT_PNODE_MASK;
mnt->mnt_flags |= MNT_SHARED;
}

-void clear_mnt_shared(struct vfsmount *mnt)
+void set_mnt_shared(struct vfsmount *mnt)
{
- mnt->mnt_flags &= ~MNT_SHARED;
+ int res;
+
+ retry:
+ spin_lock(&mnt_pgid_lock);
+ if (IS_MNT_SHARED(mnt)) {
+ spin_unlock(&mnt_pgid_lock);
+ return;
+ }
+
+ res = ida_get_new(&mnt_pgid_ida, &mnt->mnt_pgid);
+ spin_unlock(&mnt_pgid_lock);
+ if (res == -EAGAIN) {
+ if (ida_pre_get(&mnt_pgid_ida, GFP_KERNEL))
+ goto retry;
+ }
+ __set_mnt_shared(mnt);
}

-static int __peer_group_id(struct vfsmount *mnt)
+void clear_mnt_shared(struct vfsmount *mnt)
{
- struct vfsmount *m;
- int id = mnt->mnt_id;
-
- for (m = next_peer(mnt); m != mnt; m = next_peer(m))
- id = min(id, m->mnt_id);
+ if (IS_MNT_SHARED(mnt)) {
+ mnt->mnt_flags &= ~MNT_SHARED;
+ mnt->mnt_pgid = -1;
+ }
+}

- return id;
+void make_mnt_peer(struct vfsmount *old, struct vfsmount *mnt)
+{
+ mnt->mnt_pgid = old->mnt_pgid;
+ list_add(&mnt->mnt_share, &old->mnt_share);
+ __set_mnt_shared(mnt);
}

-/* return the smallest ID within the peer group */
int get_peer_group_id(struct vfsmount *mnt)
{
- int id;
-
- spin_lock(&vfsmount_lock);
- id = __peer_group_id(mnt);
- spin_unlock(&vfsmount_lock);
-
- return id;
+ return mnt->mnt_pgid;
}

-/* return the smallest ID within the master's peer group */
int get_master_id(struct vfsmount *mnt)
{
int id;

spin_lock(&vfsmount_lock);
- id = __peer_group_id(mnt->mnt_master);
+ id = get_peer_group_id(mnt->mnt_master);
spin_unlock(&vfsmount_lock);

return id;
@@ -91,7 +106,13 @@ static int do_make_slave(struct vfsmount
if (peer_mnt == mnt)
peer_mnt = NULL;
}
- list_del_init(&mnt->mnt_share);
+ if (!list_empty(&mnt->mnt_share))
+ list_del_init(&mnt->mnt_share);
+ else if (IS_MNT_SHARED(mnt)) {
+ spin_lock(&mnt_pgid_lock);
+ ida_remove(&mnt_pgid_ida, mnt->mnt_pgid);
+ spin_unlock(&mnt_pgid_lock);
+ }

if (peer_mnt)
master = peer_mnt;
Index: linux/include/linux/mount.h
===================================================================
--- linux.orig/include/linux/mount.h 2008-03-13 20:45:15.000000000 +0100
+++ linux/include/linux/mount.h 2008-03-13 20:45:50.000000000 +0100
@@ -57,6 +57,7 @@ struct vfsmount {
struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */
struct mnt_namespace *mnt_ns; /* containing namespace */
int mnt_id; /* mount identifier */
+ int mnt_pgid; /* peer group identifier */
/*
* We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
* to let these frequently modified fields in a separate cache line
Index: linux/fs/namespace.c
===================================================================
--- linux.orig/fs/namespace.c 2008-03-13 20:45:49.000000000 +0100
+++ linux/fs/namespace.c 2008-03-13 20:45:50.000000000 +0100
@@ -95,6 +95,7 @@ struct vfsmount *alloc_vfsmnt(const char
return NULL;
}

+ mnt->mnt_pgid = -1;
atomic_set(&mnt->mnt_count, 1);
INIT_LIST_HEAD(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
@@ -539,8 +540,10 @@ static struct vfsmount *clone_mnt(struct
mnt->mnt_master = old;
clear_mnt_shared(mnt);
} else if (!(flag & CL_PRIVATE)) {
- if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
- list_add(&mnt->mnt_share, &old->mnt_share);
+ if (flag & CL_PROPAGATION)
+ set_mnt_shared(old);
+ if (IS_MNT_SHARED(old))
+ make_mnt_peer(old, mnt);
if (IS_MNT_SLAVE(old))
list_add(&mnt->mnt_slave, &old->mnt_slave);
mnt->mnt_master = old->mnt_master;
Index: linux/Documentation/filesystems/proc.txt
===================================================================
--- linux.orig/Documentation/filesystems/proc.txt 2008-03-13 20:45:15.000000000 +0100
+++ linux/Documentation/filesystems/proc.txt 2008-03-13 20:45:50.000000000 +0100
@@ -2367,18 +2367,12 @@ MNTOPTS: per mount options
SBOPTS: per super block options
PROPAGATION: propagation type

-propagation type: <propagation_flag>[:<mntid>][,...]
- note: 'shared' flag is followed by the mntid of its peer mount
- 'slave' flag is followed by the mntid of its master mount
+propagation type: <propagation_flag>[:<peergrpid>][,...]
+ note: 'shared' flag is followed by the id of this mount's peer group
+ 'slave' flag is followed by the peer group id of its master mount
'private' flag stands by itself
'unbindable' flag stands by itself

-The 'mntid' used in the propagation type is a canonical ID of the peer
-group (currently the smallest ID within the group is used for this
-purpose, but this should not be relied on). Since mounts can be added
-or removed from the peer group, this ID only guaranteed to stay the
-same on a static propagation tree.
-
For more information see:

Documentation/filesystems/sharedsubtree.txt
Index: linux/fs/pnode.h
===================================================================
--- linux.orig/fs/pnode.h 2008-03-13 20:45:49.000000000 +0100
+++ linux/fs/pnode.h 2008-03-13 20:45:50.000000000 +0100
@@ -25,6 +25,7 @@

void set_mnt_shared(struct vfsmount *);
void clear_mnt_shared(struct vfsmount *);
+void make_mnt_peer(struct vfsmount *, struct vfsmount *);
void change_mnt_propagation(struct vfsmount *, int);
int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *,
struct list_head *);

--


2008-03-19 21:19:10

by Al Viro

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

On Thu, Mar 13, 2008 at 10:26:44PM +0100, Miklos Szeredi wrote:
> From: Miklos Szeredi <[email protected]>
>
> Add a stable identifier for shared mounts.

> +static DEFINE_SPINLOCK(mnt_pgid_lock);

Um? Do you ever need to take it outside of vfsmount_lock?

2008-03-19 21:24:19

by Al Viro

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

On Wed, Mar 19, 2008 at 05:41:15PM +0100, Miklos Szeredi wrote:
> > > From: Miklos Szeredi <[email protected]>
> > Um? Do you ever need to take it outside of vfsmount_lock?
> >
>
> Tried to think this through:
>
> It's always called with namespace_sem, which is enough, no need for a
> new lock. The bigger problem, is that it _is_ called with
> vfsmount_lock in one case, which is bad, since the allocation may
> sleep.

It is called with vfsmount_lock in *all* cases. You've missed one
in umount_tree(), BTW; you won't block in that case, though.

> That is in do_change_type(). But do we really need to hold
> vfsmount_lock in that case?

Not the issue.

> I think not, the propagation tree has no
> relevance outside namespace_sem, so that one should be sufficient.

Callers manipulate more than propagation tree. Note that e.g.
umount_tree() changes all sorts of data structures, including ones
that are traversed without namespace_sem.

I _really_ don't like the idea of different locking rules for caller
of a function depending on the value of argument of that function.
They are complicated enough as it is.

Argh... OK, I'll try to put something together tonight, after I get some
sleep - 31 hours of uptime _suck_ ;-/ BTW, on top of everything else,
the current variant plays interesting games with CL_PROPAGATION behaviour
and I really don't like the look of what it's doing there. Later...

2008-03-20 00:16:44

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

> > > > From: Miklos Szeredi <[email protected]>
> > > Um? Do you ever need to take it outside of vfsmount_lock?
> > >
> >
> > Tried to think this through:
> >
> > It's always called with namespace_sem, which is enough, no need for a
> > new lock. The bigger problem, is that it _is_ called with
> > vfsmount_lock in one case, which is bad, since the allocation may
> > sleep.
>
> It is called with vfsmount_lock in *all* cases. You've missed one
> in umount_tree(), BTW; you won't block in that case, though.

set_mnt_shared() is called from namespace.c as well, without
vfsmount_lock. But agreed, that's not the real issue.

>
> > That is in do_change_type(). But do we really need to hold
> > vfsmount_lock in that case?
>
> Not the issue.
>
> > I think not, the propagation tree has no
> > relevance outside namespace_sem, so that one should be sufficient.
>
> Callers manipulate more than propagation tree. Note that e.g.
> umount_tree() changes all sorts of data structures, including ones
> that are traversed without namespace_sem.
>
> I _really_ don't like the idea of different locking rules for caller
> of a function depending on the value of argument of that function.
> They are complicated enough as it is.
>
> Argh... OK, I'll try to put something together tonight, after I get some
> sleep - 31 hours of uptime _suck_ ;-/

Gosh, yes.

Thanks,
Miklos

2008-03-20 00:17:49

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

> > From: Miklos Szeredi <[email protected]>
> >
> > Add a stable identifier for shared mounts.
>
> > +static DEFINE_SPINLOCK(mnt_pgid_lock);
>
> Um? Do you ever need to take it outside of vfsmount_lock?
>

Tried to think this through:

It's always called with namespace_sem, which is enough, no need for a
new lock. The bigger problem, is that it _is_ called with
vfsmount_lock in one case, which is bad, since the allocation may
sleep.

That is in do_change_type(). But do we really need to hold
vfsmount_lock in that case? I think not, the propagation tree has no
relevance outside namespace_sem, so that one should be sufficient.

This patch should fix it (untested, just for review).

I have a half mind to throw out the IDR allocation altogether, and
just go with a 64bit counter, some poeple would much prefer that...

Thanks,
Miklos

---
fs/namespace.c | 2 --
fs/pnode.c | 20 ++++++++------------
2 files changed, 8 insertions(+), 14 deletions(-)

Index: linux/fs/pnode.c
===================================================================
--- linux.orig/fs/pnode.c 2008-03-19 16:39:28.000000000 +0100
+++ linux/fs/pnode.c 2008-03-19 17:26:44.000000000 +0100
@@ -12,7 +12,6 @@
#include <linux/idr.h>
#include "pnode.h"

-static DEFINE_SPINLOCK(mnt_pgid_lock);
static DEFINE_IDA(mnt_pgid_ida);

/* return the next shared peer mount of @p */
@@ -37,19 +36,19 @@ static void __set_mnt_shared(struct vfsm
mnt->mnt_flags |= MNT_SHARED;
}

+/*
+ * - must hold namespace_sem to protect the mnt_pgid_ida
+ * - must not hold vfsmount_lock, because this function might sleep
+ */
void set_mnt_shared(struct vfsmount *mnt)
{
int res;

- retry:
- spin_lock(&mnt_pgid_lock);
- if (IS_MNT_SHARED(mnt)) {
- spin_unlock(&mnt_pgid_lock);
+ might_sleep();
+ if (IS_MNT_SHARED(mnt))
return;
- }
-
+ retry:
res = ida_get_new(&mnt_pgid_ida, &mnt->mnt_pgid);
- spin_unlock(&mnt_pgid_lock);
if (res == -EAGAIN) {
if (ida_pre_get(&mnt_pgid_ida, GFP_KERNEL))
goto retry;
@@ -159,11 +158,8 @@ static int do_make_slave(struct vfsmount
peer_mnt = NULL;
}

- if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) {
- spin_lock(&mnt_pgid_lock);
+ if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share))
ida_remove(&mnt_pgid_ida, mnt->mnt_pgid);
- spin_unlock(&mnt_pgid_lock);
- }
list_del_init(&mnt->mnt_share);

if (peer_mnt)
Index: linux/fs/namespace.c
===================================================================
--- linux.orig/fs/namespace.c 2008-03-19 16:39:28.000000000 +0100
+++ linux/fs/namespace.c 2008-03-19 17:23:06.000000000 +0100
@@ -1351,10 +1351,8 @@ static noinline int do_change_type(struc
return -EINVAL;

down_write(&namespace_sem);
- spin_lock(&vfsmount_lock);
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
change_mnt_propagation(m, type);
- spin_unlock(&vfsmount_lock);
up_write(&namespace_sem);
return 0;
}

2008-03-20 21:43:42

by Al Viro

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

On Wed, Mar 19, 2008 at 07:37:51PM +0100, Miklos Szeredi wrote:

> > Argh... OK, I'll try to put something together tonight, after I get some
> > sleep - 31 hours of uptime _suck_ ;-/
>
> Gosh, yes.

Folks, we have some problems. I've sat down to write documentation on
locking rules, etc. for struct vfsmount and there are some serious
turds in the area. Among the other things we have interesting race
between shrink_submounts() and copy_tree() - the former calls
propagate_mount_busy(), which walks ->mnt_share and friends. The
latter adds elements to those. Wouldn't be a problem, except that
shrink_submounts() only holds vfsmount_lock while everything else
(including copy_tree()) uses namespace_sem to protect those.

There's another interesting issue with shrink_submounts() - it can
put vfsmounts back on the wrong list. If e.g. cifs is mounted under
nfs and does an automount, umount of nfs might push cifs expirables
*to* *nfs* *list*.

I'm not sure that I understand Trond's intentions with that stuff;
what exactly are we trying to achieve and why the hell is it fs-specific
to start with?

Another interesting thing is locking in mark_mounts_for_expiry(); we
go to all sorts of convolutions that are, AFAICS, not needed anymore.
We used to need rather interesting logics back when we had per-namespace
semaphores; thus grabbing the namespace, dropping vfsmount_lock, dealing
with races in case if something manages to walk into the potential
victim, etc.

I think that having the damn thing global *and* having umount_tree()
callable under vfsmount_lock (we unhash everything in the subtree and
put it on the kill list, so that nothing can walk into those suckers
via hash lookup; release_mounts() called after we are done (and after
releasing vfsmount_lock *and* namespace_sem) will take care of actual
talking to filesystems) removes the need of all that crap.

IOW, mark_mounts_for_expiry() should do the following:
grabbing namespace_sem exclusive
grab vfsmount_lock
walk the list as it does now, except that it should do the right
check from the very beginning (propagate_mount_busy())
without dropping the vfsmount_lock, go through the collected list,
calling umount_tree()
drop the locks
do release_mounts()
The second pass is needed since umount_tree() might do interesting things
to expiry list, so we make life easier for ourselves by leaving that to
second pass when we just want to drain the resulting list until it's empty.

Does anybody see holes in the above?

shrink_submounts() is _probably_ similar (lock/collect/umount_tree on all/
unlock/release_mounts), but I'm not sure if I understand WTF is really
attempted in there.

Is there any reason why we do that in ->umount_begin() and not *after*
it, unconditionally, straight from do_umount()? AFAICS, the only reason
why it's done from fs-specific code is figuring out which mount-list
should the stuff go back to, and that's both broken *and* not needed
with sanitized locking as above. While we are at it, I'd rather return
->umount_begin() to its previous prototype, TYVM - the less filesystem
sees vfsmounts, the better off we all are...

Comments? If nobody objects, I'm going to do that in vfs-fixes branch
and then push to Linus...

2008-03-21 08:57:51

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

> IOW, mark_mounts_for_expiry() should do the following:
> grabbing namespace_sem exclusive
> grab vfsmount_lock
> walk the list as it does now, except that it should do the right
> check from the very beginning (propagate_mount_busy())
> without dropping the vfsmount_lock, go through the collected list,
> calling umount_tree()
> drop the locks
> do release_mounts()
> The second pass is needed since umount_tree() might do interesting things
> to expiry list, so we make life easier for ourselves by leaving that to
> second pass when we just want to drain the resulting list until it's empty.
>
> Does anybody see holes in the above?
>
> shrink_submounts() is _probably_ similar (lock/collect/umount_tree on all/
> unlock/release_mounts), but I'm not sure if I understand WTF is really
> attempted in there.
>
> Is there any reason why we do that in ->umount_begin() and not *after*
> it, unconditionally, straight from do_umount()? AFAICS, the only reason
> why it's done from fs-specific code is figuring out which mount-list
> should the stuff go back to, and that's both broken *and* not needed
> with sanitized locking as above. While we are at it, I'd rather return
> ->umount_begin() to its previous prototype, TYVM - the less filesystem
> sees vfsmounts, the better off we all are...

All of that seems sane to me.

Miklos

2008-03-22 03:50:19

by Al Viro

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

On Thu, Mar 20, 2008 at 09:43:19PM +0000, Al Viro wrote:
> shrink_submounts() is _probably_ similar (lock/collect/umount_tree on all/
> unlock/release_mounts), but I'm not sure if I understand WTF is really
> attempted in there.

Argh... Doing release_mounts() after collection phase won't work ;-/
It would leave references to parents until the very end, leaving us
with false-busy shrinkable vfsmounts if we had shrinkable automounted
on top of shrinkable...

It does work for mark_mounts_for_expiry(), but not here. We could do
the same kind of loops as now, releasing namespace_sem after each
portion of candidates, doing release_mounts() and regaining namespace_sem,
but that leaves us with indefinitely long stalls if somebody keeps
doing lookups triggering automounts. OTOH, we probably could get away
with separate counter covering only that kind of references... That
would be bumped in umount_tree() (at the same point where we decrement
d_mounted) and dropped in release_mounts() when we reset ->mnt_parent
and do mntput() on it.

Then we would simply make do_refcount_check() in pnode.c do
int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts;
return (mycount > count);
instead of what it does now, and everything would work fine...

So, let's define mnt->mnt_ghosts by requiring that outside of vfsmount_lock
it would be equal to number of vfsmounts with ->mnt_parent == mnt that are
_not_ on child list of mnt.

We'd need to decrement it in release_mounts(), increment in
mnt_set_mountpoint(), decrement again in attach_mnt() (which strongly
suggests that increment should happen in _callers_ of mnt_set_mountpoint(),
so that attach_mnt() wouldn't modify it at all), decrement in commit_tree(),
and increment in umount_tree() at the same point where we play with d_mounted.
AFAICS, that's all.

Shifting increment from mnt_set_mountpoint() and commit_tree()
to theirs callers and collapsing where possible, we get the following:
* decrement in release_mounts() when resetting ->mnt_parent
* increment in propagate_mnt() after call of mnt_set_mountpoint()
* decrement in attach_recursive_mnt() in the loop calling
commit_tree() for clones (on mountpoint of each clone).
* increment in umount_tree() at the point where we update d_mounted.

All these places are under vfsmount_lock, so we are fine with plain int; no
atomics needed.

So... Attack plan: introduce mnt_ghosts+use it in propagate_mnt_busy()
(that gets rid of false-busy stuff), then switch shrink_submounts() and
mark_mounts_for_expiry() to the scheme from the previous posting, then
call shrink_submounts() from do_umount() unconditionally, removing it from
->umount_begin() instances, then restore sane prototype for shrink_submounts().
Four patches...

Comments? Ram, Miklos, Trond?

2008-03-22 03:54:28

by Al Viro

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

On Sat, Mar 22, 2008 at 03:49:50AM +0000, Al Viro wrote:
> ->umount_begin() instances, then restore sane prototype for shrink_submounts().

s/shrink_submounts/->umount_begin/ in the line above

2008-03-22 04:11:48

by Al Viro

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

On Sat, Mar 22, 2008 at 03:49:50AM +0000, Al Viro wrote:

> Shifting increment from mnt_set_mountpoint() and commit_tree()
> to theirs callers and collapsing where possible, we get the following:
> * decrement in release_mounts() when resetting ->mnt_parent
> * increment in propagate_mnt() after call of mnt_set_mountpoint()
> * decrement in attach_recursive_mnt() in the loop calling
> commit_tree() for clones (on mountpoint of each clone).
> * increment in umount_tree() at the point where we update d_mounted.

... except that it'd give a leak in case of mount to shared mountpoint
failing halfway through - we'll get double increments since umount_tree()
would hit the mountpoints of cloned trees with extra increment, even though
reference from root of cloned to its mountpoint is _already_ a ghost.

OTOH, we probably don't want to bother with counting those anyway - i.e.
it's simply a bad definition and the right one would be along the lines of
"number of vfsmounts that are doomed to be eaten by release_mounts() and
that have ->mnt_parent pointing to us". IOW, dropping the 2nd and 3rd
in the above would do the right thing - anything chewed by umount_tree()
*will* go to release_mounts() and ones in flight are what we are interested
in...

2008-03-22 04:56:34

by Al Viro

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

OK, preliminary patches are in
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6.git/ expiry-fixes

BTW, cifs use of MNT_SHRINKABLE is broken; it updates mnt_flags on
*mountpoint* instead of just having them set on new vfsmount.
Their expiry policy is also very odd - "expire everything cifs
whenever some cifs filesystem is unmounted". Hell knows what had
been intended there (dfs_shrink_umount_helper() and friends)...

2008-03-22 16:27:46

by Al Viro

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

On Wed, Mar 19, 2008 at 07:37:51PM +0100, Miklos Szeredi wrote:
> set_mnt_shared() is called from namespace.c as well, without
> vfsmount_lock. But agreed, that's not the real issue.

How about the following: let's separate set_mnt_shared() and inventing
group ids. All we need is this:
invent_group_ids(mnt) /* call under namespace_sem */
for all vfsmounts p in subtree rooted at mnt
if p->mnt_share is non-empty
continue
get ID for p
if allocation fails
goto cleanup
return 0
cleanup:
for all vfsmounts q in subtree rooted at mnt
if q == p
break
if q->mnt_share is non-empty
continue
release ID of q
return -ENOMEM

Now here's what we do:
* in do_change_type(), outside of vfsmount_lock, do invent_group_ids()
If it fails - bugger off, if not - proceed as now.
* in attach_recursive_mnt() if IS_MNT_SHARED(dest_mnt) do
invent_group_ids() on the dest_mnt immediately and if it fails do
umount_tree(dest_mnt, 0, ) under vfsmount_lock, then release_mounts()
and bugger off (FWIW, we might want to lift the last part to caller
and do the same to release_mounts() in propagate_mnt()). If it hadn't
failed, we proceed as now.
* in clone_mnt() do
int new_group = group ID of old;
int free_group = 0;
if (flag & (CL_SLAVE | CL_PRIVATE))
new_group = 0; /* not a peer of original */
if ((flag & CL_MAKE_SHARED) && !new_group)
new_group = allocate new ID
if failed
return 0;
free_group = 1;
}
mnt = alloc_vfsmount();
if (mnt) {
set group ID of mnt to new_group;
free_group = 0;
/* as in mainline */
}
if (free_group)
release group ID found in new_group;
return mnt;

then (after allocating new vfsmount) set its group ID to new_group if
alloc_vfsmount() succeeds. Otherwise release group ID if needed and
bugger off as usual.

No need to mess with any additional exclusion for idr protection or with
any kind of retries; allocation failure is allocation failure.

Releasing group ID should be done from do_make_slave(), along with clearing
group ID in vfsmount.

Care to do that using mountinfo-base in vfs-2.6.git as base tree?

2008-03-24 08:19:39

by Ram Pai

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

On Sat, 2008-03-22 at 16:27 +0000, Al Viro wrote:
> On Wed, Mar 19, 2008 at 07:37:51PM +0100, Miklos Szeredi wrote:
> > set_mnt_shared() is called from namespace.c as well, without
> > vfsmount_lock. But agreed, that's not the real issue.
>
> How about the following: let's separate set_mnt_shared() and inventing
> group ids. All we need is this:
> invent_group_ids(mnt) /* call under namespace_sem */
> for all vfsmounts p in subtree rooted at mnt
> if p->mnt_share is non-empty
> continue
> get ID for p
> if allocation fails
> goto cleanup
> return 0
> cleanup:
> for all vfsmounts q in subtree rooted at mnt
> if q == p
> break
> if q->mnt_share is non-empty
> continue
> release ID of q
> return -ENOMEM
>
> Now here's what we do:
> * in do_change_type(), outside of vfsmount_lock, do invent_group_ids()
> If it fails - bugger off, if not - proceed as now.

Has it to be done outside vfsmount_lock? AFAICT, invent_group_ids()
does not sleep, nor does change_mnt_propagation().

> * in attach_recursive_mnt() if IS_MNT_SHARED(dest_mnt) do
> invent_group_ids() on the dest_mnt immediately and if it fails do

I think you meant, invent_group_ids() on the source_mnt. But again
applying invent_group_ids() on the source_mnt has to be done carefully,
because, source_mnt may have been shared to begin with.

right?
RP

> umount_tree(dest_mnt, 0, ) under vfsmount_lock, then release_mounts()
> and bugger off (FWIW, we might want to lift the last part to caller
> and do the same to release_mounts() in propagate_mnt()). If it hadn't
> failed, we proceed as now.
> * in clone_mnt() do
> int new_group = group ID of old;
> int free_group = 0;
> if (flag & (CL_SLAVE | CL_PRIVATE))
> new_group = 0; /* not a peer of original */
> if ((flag & CL_MAKE_SHARED) && !new_group)
> new_group = allocate new ID
> if failed
> return 0;
> free_group = 1;
> }
> mnt = alloc_vfsmount();
> if (mnt) {
> set group ID of mnt to new_group;
> free_group = 0;
> /* as in mainline */
> }
> if (free_group)
> release group ID found in new_group;
> return mnt;
>
> then (after allocating new vfsmount) set its group ID to new_group if
> alloc_vfsmount() succeeds. Otherwise release group ID if needed and
> bugger off as usual.
>
> No need to mess with any additional exclusion for idr protection or with
> any kind of retries; allocation failure is allocation failure.
>
> Releasing group ID should be done from do_make_slave(), along with clearing
> group ID in vfsmount.
>
> Care to do that using mountinfo-base in vfs-2.6.git as base tree?

2008-03-24 08:50:00

by Ram Pai

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

On Thu, 2008-03-20 at 21:43 +0000, Al Viro wrote:
> On Wed, Mar 19, 2008 at 07:37:51PM +0100, Miklos Szeredi wrote:
>
> > > Argh... OK, I'll try to put something together tonight, after I get some
> > > sleep - 31 hours of uptime _suck_ ;-/
> >
> > Gosh, yes.
>
.....snip...
> Is there any reason why we do that in ->umount_begin() and not *after*
> it, unconditionally, straight from do_umount()? AFAICS, the only reason
> why it's done from fs-specific code is figuring out which mount-list
> should the stuff go back to, and that's both broken *and* not needed
> with sanitized locking as above. While we are at it, I'd rather return
> ->umount_begin() to its previous prototype, TYVM - the less filesystem
> sees vfsmounts, the better off we all are...

I think that ->umount_begin also acts a hook for providing pre-umount
event notification to userspace from filesystems; something that is
required by DMAPI interface.

RP

>
> Comments? If nobody objects, I'm going to do that in vfs-fixes branch
> and then push to Linus...

2008-03-24 08:55:10

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

On Mon, Mar 24, 2008 at 01:50:14AM -0700, Ram Pai wrote:
> I think that ->umount_begin also acts a hook for providing pre-umount
> event notification to userspace from filesystems; something that is
> required by DMAPI interface.

No such thing like dmapi near mainline, and we ever want user-space
assiseted HDM in mainline we'd do it in VFS.

FYI: SGI out of tree dmapi doesn't make use of ->unmount_begin either.

2008-03-24 09:34:55

by Al Viro

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

On Mon, Mar 24, 2008 at 01:19:41AM -0700, Ram Pai wrote:
> > * in do_change_type(), outside of vfsmount_lock, do invent_group_ids()
> > If it fails - bugger off, if not - proceed as now.
>
> Has it to be done outside vfsmount_lock? AFAICT, invent_group_ids()
> does not sleep, nor does change_mnt_propagation().

It does allocation. And no, GFP_ATOMIC is not appropriate for that.
The same goes for mount IDs, BTW, and there we _also_ don't need
vfsmount_lock - see my current tree, there we have clone_mnt()
serialized by namespace_sem in all cases.

> > * in attach_recursive_mnt() if IS_MNT_SHARED(dest_mnt) do
> > invent_group_ids() on the dest_mnt immediately and if it fails do
>
> I think you meant, invent_group_ids() on the source_mnt.
Yes

> But again
> applying invent_group_ids() on the source_mnt has to be done carefully,
> because, source_mnt may have been shared to begin with.

And? See the original posting - invent_group_ids() skips the vfsmounts
with already set group ID.

2008-03-24 09:53:20

by Al Viro

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

On Mon, Mar 24, 2008 at 01:50:14AM -0700, Ram Pai wrote:
> > Is there any reason why we do that in ->umount_begin() and not *after*
> > it, unconditionally, straight from do_umount()? AFAICS, the only reason
> > why it's done from fs-specific code is figuring out which mount-list
> > should the stuff go back to, and that's both broken *and* not needed
> > with sanitized locking as above. While we are at it, I'd rather return
> > ->umount_begin() to its previous prototype, TYVM - the less filesystem
> > sees vfsmounts, the better off we all are...
>
> I think that ->umount_begin also acts a hook for providing pre-umount
> event notification to userspace from filesystems; something that is
> required by DMAPI interface.

"Why, so can I, and so can any man..."

IOW, DMAPI might require whatever its authors want it to require, but
what does that have to do with us?

BTW, I've dropped several more patches into the tree (sanitizing
namespace.c/pnode.c) and merged that into mountinfo-base. Documentation
is mostly done, so it will be the next thing to go there, then it's
time for 'dissolve unreachable subtrees on d_invalidate()' stuff.
Which probably will mean *another* cyclic list in vfsmount, not anchored
but pointed to by replacement of d_mounted; same protection as for
mnt_child/mnt_mounts, contains vfsmounts with given mnt_mountpoint.
I'm not too fond of that, but I don't see cleaner way to do it at the
moment. Any better ideas are welcome...

2008-03-30 19:33:42

by Ram Pai

[permalink] [raw]
Subject: Re: [patch 3/6] vfs: mountinfo stable peer group id

On Sat, 2008-03-22 at 04:11 +0000, Al Viro wrote:
> On Sat, Mar 22, 2008 at 03:49:50AM +0000, Al Viro wrote:
>
> > Shifting increment from mnt_set_mountpoint() and commit_tree()
> > to theirs callers and collapsing where possible, we get the
> following:
> > * decrement in release_mounts() when resetting ->mnt_parent
> > * increment in propagate_mnt() after call of
> mnt_set_mountpoint()
> > * decrement in attach_recursive_mnt() in the loop calling
> > commit_tree() for clones (on mountpoint of each clone).
> > * increment in umount_tree() at the point where we update
> d_mounted.
>
> ... except that it'd give a leak in case of mount to shared mountpoint
> failing halfway through - we'll get double increments since
> umount_tree()
> would hit the mountpoints of cloned trees with extra increment, even
> though
> reference from root of cloned to its mountpoint is _already_ a ghost.

> OTOH, we probably don't want to bother with counting those anyway -
> i.e.
> it's simply a bad definition and the right one would be along the
> lines of
> "number of vfsmounts that are doomed to be eaten by release_mounts()
> and
> that have ->mnt_parent pointing to us". IOW, dropping the 2nd and 3rd
> in the above would do the right thing - anything chewed by
> umount_tree()
> *will* go to release_mounts() and ones in flight are what we are
> interested
> in...

By not accounting for the ghost reference created in propagate_mnt(),
i.e case 2 and 3; the race is still on with shrink_mounts. But I think,
you are right. We don't want the shrink_mounts and friends to think that
the mounts are available to be purged, by accounting them into
mnt_ghosts.

RP