Hi Linus, Al,
The attached patch adds automounting support and mountpount expiry support to
the VFS.
This patch involves the adding the following features:
(1) A new dentry operation that (a) marks a dentry as being an automount
point, and (b) gets called by the VFS to come up with a vfsmount
structure which the VFS then stitches into the mount tree fabric at the
appropriate place.
(2) A new lookup flag that is used by sys_*stat() to prevent automounting of
the path endpoint. This means "ls -l" in an automounter directory doesn't
cause a mount storm, but will display all the mountpoints in that
directory as subdirectories (either the underlying mountpoint dir or the
root dir of the mounted fs if the mountpoint has been triggered already).
(3) do_kern_mount() is now exported.
(4) The vfsmount structure has acquired, amongst other things, a timeout
field. If mntput() notices a vfsmount reach a usage count of 1, then the
vfsmount expiry time is set and the namespace that contains the vfsmount
has its expiration work chitty queued.
(5) The namespace structure has acquired a work struct that is used to
actually perform vfsmount expiry under process context.
Here's a cut-down example, taken from the implementation I'm using my AFS
filesystem client:
static struct dentry_operations afs_fs_mntpt_dentry_operations = {
.d_revalidate = afs_d_revalidate,
.d_delete = afs_d_delete,
.d_automount = afs_mntpt_d_automount,
};
struct vfsmount *afs_mntpt_d_automount(struct dentry *mntpt)
{
struct vfsmount *mnt;
size_t size;
char *buf, *devname = NULL;
devname = (char *) get_zeroed_page(GFP_KERNEL);
/* read the contents of the AFS special symlink */
size = afs_read_string(mntpt->d_inode, devname, PAGE_SIZE - 1);
mnt = do_kern_mount("afs", 0, devname, NULL);
mnt->mnt_expiry_timeout = 30;
free_page((unsigned long)devname);
return mnt;
}
Note that the inode attached to the underlying mountpoint can be used to
determine _what_ should be mounted.
I've some ideas as to how to implement a leaner autofs with it as well as
using it for AFS.
Any comments would be appreciated.
David
diff -uNr linux-2.5.72/fs/namei.c linux-2.5.72-auto/fs/namei.c
--- linux-2.5.72/fs/namei.c 2003-06-17 15:01:51.000000000 +0100
+++ linux-2.5.72-auto/fs/namei.c 2003-06-17 15:06:42.000000000 +0100
@@ -434,23 +434,35 @@
return 1;
}
-static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
+static int follow_mount(struct vfsmount **mnt, struct dentry **dentry, unsigned int flags)
{
int res = 0;
- while (d_mountpoint(*dentry)) {
- struct vfsmount *mounted;
- spin_lock(&dcache_lock);
- mounted = lookup_mnt(*mnt, *dentry);
- if (!mounted) {
+ for (;;) {
+
+ if (d_mountpoint(*dentry)) {
+ struct vfsmount *mounted;
+ spin_lock(&dcache_lock);
+ mounted = lookup_mnt(*mnt, *dentry);
+ if (!mounted) {
+ spin_unlock(&dcache_lock);
+ break;
+ }
+ *mnt = mntget(mounted);
spin_unlock(&dcache_lock);
+ dput(*dentry);
+ mntput(mounted->mnt_parent);
+ *dentry = dget(mounted->mnt_root);
+ res = 1;
+
+ } else if (d_automount_point(*dentry)) {
+ if (flags & LOOKUP_NOAUTOMOUNT)
+ break;
+ res = kern_automount(*mnt, *dentry);
+ if (res < 0)
+ break;
+ } else {
break;
}
- *mnt = mntget(mounted);
- spin_unlock(&dcache_lock);
- dput(*dentry);
- mntput(mounted->mnt_parent);
- *dentry = dget(mounted->mnt_root);
- res = 1;
}
return res;
}
@@ -510,7 +522,7 @@
mntput(*mnt);
*mnt = parent;
}
- follow_mount(mnt, dentry);
+ follow_mount(mnt, dentry, 0);
}
struct path {
@@ -643,7 +655,9 @@
if (err)
break;
/* Check mountpoints.. */
- follow_mount(&next.mnt, &next.dentry);
+ err = follow_mount(&next.mnt, &next.dentry, 0);
+ if (err < 0)
+ goto out_dput;
err = -ENOENT;
inode = next.dentry->d_inode;
@@ -703,7 +717,10 @@
err = do_lookup(nd, &this, &next, 0);
if (err)
break;
- follow_mount(&next.mnt, &next.dentry);
+ err = follow_mount(&next.mnt, &next.dentry, nd->flags & LOOKUP_NOAUTOMOUNT);
+ if (err < 0)
+ goto out_dput;
+
inode = next.dentry->d_inode;
if ((lookup_flags & LOOKUP_FOLLOW)
&& inode && inode->i_op && inode->i_op->follow_link) {
diff -uNr linux-2.5.72/fs/namespace.c linux-2.5.72-auto/fs/namespace.c
--- linux-2.5.72/fs/namespace.c 2003-06-17 15:01:51.000000000 +0100
+++ linux-2.5.72-auto/fs/namespace.c 2003-06-17 15:06:42.000000000 +0100
@@ -30,6 +30,8 @@
static int hash_mask, hash_bits;
static kmem_cache_t *mnt_cache;
+static void kern_automount_expiry_timeout(void *_data);
+
static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
unsigned long tmp = ((unsigned long) mnt / L1_CACHE_BYTES);
@@ -142,6 +144,9 @@
mnt->mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
+ mnt->mnt_namespace = old->mnt_namespace;
+ mnt->mnt_expiry_timeout = old->mnt_expiry_timeout;
+ mnt->mnt_expires_at = old->mnt_expires_at;
}
return mnt;
}
@@ -674,6 +679,135 @@
return err;
}
+int kern_automount(struct vfsmount *on_mnt, struct dentry *on_dentry)
+{
+ struct nameidata nd;
+ struct vfsmount *mnt;
+ int err;
+
+ if (!on_dentry->d_inode || !S_ISDIR(on_dentry->d_inode->i_mode))
+ return -ENOTDIR;
+
+ mnt = on_dentry->d_op->d_automount(on_dentry);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ mnt->mnt_expires_at = get_seconds() + mnt->mnt_expiry_timeout;
+
+ memset(&nd,0,sizeof(nd));
+ nd.dentry = on_dentry;
+ nd.mnt = on_mnt;
+
+ down_write(¤t->namespace->sem);
+ /* Something was mounted here while we slept */
+ while(d_mountpoint(nd.dentry) && follow_down(&nd.mnt, &nd.dentry))
+ ;
+ err = -EINVAL;
+ if (!check_mnt(nd.mnt))
+ goto unlock;
+
+ /* Refuse the same filesystem on the same mount point */
+ err = -EBUSY;
+ if (nd.mnt->mnt_sb == mnt->mnt_sb && nd.mnt->mnt_root == nd.dentry)
+ goto unlock;
+
+ err = graft_tree(mnt, &nd);
+unlock:
+ up_write(¤t->namespace->sem);
+ mntput(mnt);
+ return err;
+}
+
+static inline void kern_automount_set_expiry_timer(struct namespace *namespace,
+ unsigned long timeout)
+{
+ spin_lock(&dcache_lock);
+
+ if (atomic_read(&namespace->count) > 0) {
+ get_namespace(namespace);
+
+ if (!schedule_delayed_work(&namespace->mnt_expiry_work,
+ (timeout + 1) * HZ))
+ put_namespace(namespace);
+ }
+
+ spin_unlock(&dcache_lock);
+}
+
+void kern_automount_begin_expiry(struct vfsmount *mnt)
+{
+ mnt->mnt_expires_at = get_seconds() + mnt->mnt_expiry_timeout;
+
+ kern_automount_set_expiry_timer(mnt->mnt_namespace,
+ mnt->mnt_expiry_timeout);
+}
+
+static void kern_automount_expiry_timeout(void *_data)
+{
+ struct namespace *namespace = _data;
+ struct list_head *_p, *_n, graveyard;
+ struct vfsmount *mnt;
+ time_t now;
+ int timeout = INT_MAX, tmp;
+
+ INIT_LIST_HEAD(&graveyard);
+
+ down_write(&namespace->sem);
+
+ now = get_seconds();
+
+ list_for_each_safe(_p, _n, &namespace->list) {
+ mnt = list_entry(_p, struct vfsmount, mnt_list);
+
+ if (mnt->mnt_expiry_timeout &&
+ atomic_read(&mnt->mnt_count) == 1) {
+ spin_lock(&dcache_lock);
+
+ if (atomic_read(&mnt->mnt_count) == 1) {
+ tmp = (int) mnt->mnt_expires_at - (int) now;
+ if (tmp <= 0) {
+ list_move_tail(&mnt->mnt_list,
+ &graveyard);
+ list_del_init(&mnt->mnt_child);
+ list_del_init(&mnt->mnt_hash);
+ mnt->mnt_mountpoint->d_mounted--;
+ } else if (tmp < timeout) {
+ timeout = tmp;
+ }
+ }
+
+ spin_unlock(&dcache_lock);
+ }
+ }
+
+ up_write(&namespace->sem);
+
+ while (!list_empty(&graveyard)) {
+ mnt = list_entry(graveyard.next, struct vfsmount, mnt_list);
+ list_del_init(&mnt->mnt_list);
+
+ dput(xchg(&mnt->mnt_mountpoint, mnt->mnt_root));
+ mntput(xchg(&mnt->mnt_parent, mnt));
+
+ if (atomic_read(&mnt->mnt_sb->s_active) == 1) {
+ /* last instance - try to be smart */
+ lock_kernel();
+ DQUOT_OFF(mnt->mnt_sb);
+ acct_auto_close(mnt->mnt_sb);
+ unlock_kernel();
+ }
+
+ mntput(mnt);
+ }
+
+ if (timeout != INT_MAX) {
+ kern_automount_set_expiry_timer(namespace, timeout);
+ }
+ else {
+ put_namespace(namespace);
+ }
+}
+
static int copy_mount_options (const void __user *data, unsigned long *where)
{
int i;
@@ -800,6 +934,9 @@
init_rwsem(&new_ns->sem);
new_ns->root = NULL;
INIT_LIST_HEAD(&new_ns->list);
+ INIT_WORK(&new_ns->mnt_expiry_work,
+ kern_automount_expiry_timeout,
+ new_ns);
down_write(&tsk->namespace->sem);
/* First pass: copy the tree topology */
@@ -816,6 +953,8 @@
p = namespace->root;
q = new_ns->root;
while (p) {
+ q->mnt_namespace = new_ns;
+
if (p == fs->rootmnt) {
rootmnt = p;
fs->rootmnt = mntget(q);
@@ -844,6 +983,8 @@
if (altrootmnt)
mntput(altrootmnt);
+ kern_automount_set_expiry_timer(namespace, 1);
+
put_namespace(namespace);
return 0;
@@ -1079,6 +1220,9 @@
panic("Can't allocate initial namespace");
atomic_set(&namespace->count, 1);
INIT_LIST_HEAD(&namespace->list);
+ INIT_WORK(&namespace->mnt_expiry_work,
+ kern_automount_expiry_timeout,
+ namespace);
init_rwsem(&namespace->sem);
list_add(&mnt->mnt_list, &namespace->list);
namespace->root = mnt;
diff -uNr linux-2.5.72/fs/stat.c linux-2.5.72-auto/fs/stat.c
--- linux-2.5.72/fs/stat.c 2003-06-17 15:01:51.000000000 +0100
+++ linux-2.5.72-auto/fs/stat.c 2003-06-17 15:06:42.000000000 +0100
@@ -61,7 +61,7 @@
struct nameidata nd;
int error;
- error = user_path_walk(name, &nd);
+ error = user_path_walk_stat(name, &nd);
if (!error) {
error = vfs_getattr(nd.mnt, nd.dentry, stat);
path_release(&nd);
@@ -74,7 +74,7 @@
struct nameidata nd;
int error;
- error = user_path_walk_link(name, &nd);
+ error = user_path_walk_link_stat(name, &nd);
if (!error) {
error = vfs_getattr(nd.mnt, nd.dentry, stat);
path_release(&nd);
diff -uNr linux-2.5.72/fs/super.c linux-2.5.72-auto/fs/super.c
--- linux-2.5.72/fs/super.c 2003-06-17 15:01:51.000000000 +0100
+++ linux-2.5.72-auto/fs/super.c 2003-06-17 15:06:42.000000000 +0100
@@ -21,6 +21,7 @@
*/
#include <linux/config.h>
+#include <linux/module.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/acct.h>
@@ -683,6 +684,7 @@
mnt->mnt_root = dget(sb->s_root);
mnt->mnt_mountpoint = sb->s_root;
mnt->mnt_parent = mnt;
+ mnt->mnt_namespace = current->namespace;
up_write(&sb->s_umount);
put_filesystem(type);
return mnt;
@@ -697,6 +699,8 @@
return (struct vfsmount *)sb;
}
+EXPORT_SYMBOL_GPL(do_kern_mount);
+
struct vfsmount *kern_mount(struct file_system_type *type)
{
return do_kern_mount(type->name, 0, type->name, NULL);
diff -uNr linux-2.5.72/include/linux/dcache.h linux-2.5.72-auto/include/linux/dcache.h
--- linux-2.5.72/include/linux/dcache.h 2003-06-17 15:01:36.000000000 +0100
+++ linux-2.5.72-auto/include/linux/dcache.h 2003-06-17 15:06:42.000000000 +0100
@@ -112,6 +112,7 @@
int (*d_delete)(struct dentry *);
void (*d_release)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
+ struct vfsmount *(*d_automount)(struct dentry *);
};
/* the dentry parameter passed to d_hash and d_compare is the parent
@@ -307,6 +308,11 @@
return dentry->d_mounted;
}
+static inline int d_automount_point(struct dentry *dentry)
+{
+ return dentry->d_op && dentry->d_op->d_automount;
+}
+
extern struct vfsmount *lookup_mnt(struct vfsmount *, struct dentry *);
#endif /* __KERNEL__ */
diff -uNr linux-2.5.72/include/linux/mount.h linux-2.5.72-auto/include/linux/mount.h
--- linux-2.5.72/include/linux/mount.h 2003-06-17 15:01:35.000000000 +0100
+++ linux-2.5.72-auto/include/linux/mount.h 2003-06-17 15:06:42.000000000 +0100
@@ -31,6 +31,9 @@
int mnt_flags;
char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
struct list_head mnt_list;
+ struct namespace *mnt_namespace; /* containing namespace */
+ time_t mnt_expires_at; /* time at which automount expires */
+ unsigned mnt_expiry_timeout; /* expiry timeout (in seconds) or 0 */
};
static inline struct vfsmount *mntget(struct vfsmount *mnt)
@@ -40,11 +43,15 @@
return mnt;
}
+extern void kern_automount_begin_expiry(struct vfsmount *mnt);
extern void __mntput(struct vfsmount *mnt);
static inline void mntput(struct vfsmount *mnt)
{
if (mnt) {
+ if (atomic_read(&mnt->mnt_count) == 2 &&
+ mnt->mnt_expiry_timeout)
+ kern_automount_begin_expiry(mnt);
if (atomic_dec_and_test(&mnt->mnt_count))
__mntput(mnt);
}
diff -uNr linux-2.5.72/include/linux/namei.h linux-2.5.72-auto/include/linux/namei.h
--- linux-2.5.72/include/linux/namei.h 2003-06-17 15:01:36.000000000 +0100
+++ linux-2.5.72-auto/include/linux/namei.h 2003-06-17 15:06:42.000000000 +0100
@@ -31,6 +31,7 @@
#define LOOKUP_CONTINUE 4
#define LOOKUP_PARENT 16
#define LOOKUP_NOALT 32
+#define LOOKUP_NOAUTOMOUNT 64
extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *));
@@ -38,6 +39,10 @@
__user_walk(name, LOOKUP_FOLLOW, nd)
#define user_path_walk_link(name,nd) \
__user_walk(name, 0, nd)
+#define user_path_walk_stat(name,nd) \
+ __user_walk(name, LOOKUP_FOLLOW|LOOKUP_NOAUTOMOUNT, nd)
+#define user_path_walk_link_stat(name,nd) \
+ __user_walk(name, LOOKUP_NOAUTOMOUNT, nd)
extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
extern int FASTCALL(path_walk(const char *, struct nameidata *));
extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
@@ -52,4 +57,6 @@
extern struct dentry *lock_rename(struct dentry *, struct dentry *);
extern void unlock_rename(struct dentry *, struct dentry *);
+extern int kern_automount(struct vfsmount *mnt, struct dentry *dentry);
+
#endif /* _LINUX_NAMEI_H */
diff -uNr linux-2.5.72/include/linux/namespace.h linux-2.5.72-auto/include/linux/namespace.h
--- linux-2.5.72/include/linux/namespace.h 2003-06-17 15:01:36.000000000 +0100
+++ linux-2.5.72-auto/include/linux/namespace.h 2003-06-17 15:06:42.000000000 +0100
@@ -10,6 +10,7 @@
struct vfsmount * root;
struct list_head list;
struct rw_semaphore sem;
+ struct work_struct mnt_expiry_work;
};
extern void umount_tree(struct vfsmount *);
Followup to: <[email protected]>
By author: David Howells <[email protected]>
In newsgroup: linux.dev.kernel
>
>
> Hi Linus, Al,
>
> The attached patch adds automounting support and mountpount expiry support to
> the VFS.
>
> This patch involves the adding the following features:
>
> (1) A new dentry operation that (a) marks a dentry as being an automount
> point, and (b) gets called by the VFS to come up with a vfsmount
> structure which the VFS then stitches into the mount tree fabric at the
> appropriate place.
>
> (2) A new lookup flag that is used by sys_*stat() to prevent automounting of
> the path endpoint. This means "ls -l" in an automounter directory doesn't
> cause a mount storm, but will display all the mountpoints in that
> directory as subdirectories (either the underlying mountpoint dir or the
> root dir of the mounted fs if the mountpoint has been triggered already).
>
> (3) do_kern_mount() is now exported.
>
> (4) The vfsmount structure has acquired, amongst other things, a timeout
> field. If mntput() notices a vfsmount reach a usage count of 1, then the
> vfsmount expiry time is set and the namespace that contains the vfsmount
> has its expiration work chitty queued.
>
> (5) The namespace structure has acquired a work struct that is used to
> actually perform vfsmount expiry under process context.
>
This seems a bit heavyweight; although some VFS support is needed for
a complex filesystem, effectively doing it all in the kernel (#3)
seems a bit... excessive.
At least #2 can be done with existing means using follow_link.
I think using a revalidation pointer like dentries might be a better
way to do #4/#5, although using the existing one in the dentries is
probably better.
#1 isn't really clear to me what you're going for, but it seems to be
to duplicate bookkeeping.
I also don't see how this solves the biggest problems with complex
automounts, which are:
a) how to guarantee that a large mount tree can be safely destroyed;
b) how to detect partial unmounts.
-hpa
--
<[email protected]> at work, <[email protected]> in private!
"Unix gives you enough rope to shoot yourself in the foot."
Architectures needed: ia64 m68k mips64 ppc ppc64 s390 s390x sh v850 x86-64
> This seems a bit heavyweight; although some VFS support is needed for
> a complex filesystem, effectively doing it all in the kernel (#3)
> seems a bit... excessive.
One of the problems I have to deal with is namespaces. This means I can't just
have an automounter running in userspace that's passed requests to mount
things as it might not be able to access the target namespace.
Doing it this way means that I don't need to care which namespace the
automount needs to take effect in. I can just return a vfsmount to the VFS (as
acquired from do_kern_mount()) and let that paste it into the right place.
Furthermore, for AFS at least, it's a lot less excessive than, say, calling
back into userspace.
> At least #2 can be done with existing means using follow_link.
How? I want to be able to mount on the location in question (so it has to be a
directory), but I don't want "ls -l" to cause it to mount (otherwise
accidentally doing that or tab expansion if /afs, say, will take ages).
Maybe you mean construct a symlink that points to somewhere I can actually
mount the filesystem? If so, that too can suffer from namespace problems.
Whatever happens, stat() must _not_ cause the automount point to mount.
> I think using a revalidation pointer like dentries might be a better
> way to do #4/#5, although using the existing one in the dentries is
> probably better.
Do you mean dispose of the expired mount point when it's next revalidated? If
so, surely you _don't_ want to do it then, as that's normally a prelude to
reusing it.
Or do you mean do it actually inside dentry->d_op->d_revalidate()? But you
can't do it there because you don't know what vfsmount you are dealing with.
> #1 isn't really clear to me what you're going for, but it seems to be
> to duplicate bookkeeping.
Duplicate of what bookkeeping?
The fact that the operation is provided indicates that a dentry is an
automount point, and as such should be handled specially by path-walk. All the
logic to link the new vfsmount into the filesystem topology can be handled
easily by the VFS at that point because all the details are to hand.
> I also don't see how this solves the biggest problems with complex
> automounts, which are:
>
> a) how to guarantee that a large mount tree can be safely destroyed;
What do you mean by safely? I check that the usage count on vfsmount
structures is 1 under lock just before unlinking it - thereby making sure that
no one has a file open on it, no process has it as its root or cwd, and that
nothing is mounted upon it.
Also, I do the actual unmounting from process context by walking the
namespace's extant mount list, rather than directly nominating a vfsmount for
removal.
One drawback is that - taking AFS as an example - doing a umount of /afs won't
work until all the subtrees have either been manually unmounted or have
expired (though I can make umount capable of handling this).
> b) how to detect partial unmounts.
What do you mean by a partial unmount?
David
David Howells wrote:
>
> One of the problems I have to deal with is namespaces. This means I can't just
> have an automounter running in userspace that's passed requests to mount
> things as it might not be able to access the target namespace.
> Doing it this way means that I don't need to care which namespace the
> automount needs to take effect in. I can just return a vfsmount to the VFS (as
> acquired from do_kern_mount()) and let that paste it into the right place.
>
> Furthermore, for AFS at least, it's a lot less excessive than, say, calling
> back into userspace.
>
Namespaces have all kinds of problems anyway when mixed with
automounting (it's not at all clear what the semantics should be, and
I'm pretty sure that the semantics the current namespaces give are
overall undersirable), but when I discussed this issue with Al Viro he
indicated that you can always access all namespaces via procfs; if not,
you *should* be able to...
>
>>At least #2 can be done with existing means using follow_link.
>
> How? I want to be able to mount on the location in question (so it has to be a
> directory), but I don't want "ls -l" to cause it to mount (otherwise
> accidentally doing that or tab expansion if /afs, say, will take ages).
>
> Maybe you mean construct a symlink that points to somewhere I can actually
> mount the filesystem? If so, that too can suffer from namespace problems.
>
> Whatever happens, stat() must _not_ cause the automount point to mount.
>
That's actually not true. It's lstat() that mustn't cause the automount
point to mount -- stat() only comes into play if lstat() resolves to a
symlink. However, lstat() never invokes follow_link, so creating a
dentry with a follow_link method resolving to itself, and an associated
dummy directory inode, does what's required.
>
>>I think using a revalidation pointer like dentries might be a better
>>way to do #4/#5, although using the existing one in the dentries is
>>probably better.
>
> Do you mean dispose of the expired mount point when it's next revalidated? If
> so, surely you _don't_ want to do it then, as that's normally a prelude to
> reusing it.
>
> Or do you mean do it actually inside dentry->d_op->d_revalidate()? But you
> can't do it there because you don't know what vfsmount you are dealing with.
>
I mean inside d_revalidate().
>
>>#1 isn't really clear to me what you're going for, but it seems to be
>>to duplicate bookkeeping.
>
> Duplicate of what bookkeeping?
>
> The fact that the operation is provided indicates that a dentry is an
> automount point, and as such should be handled specially by path-walk. All the
> logic to link the new vfsmount into the filesystem topology can be handled
> easily by the VFS at that point because all the details are to hand.
>
I don't see that it should be handed specially.
>
>>I also don't see how this solves the biggest problems with complex
>>automounts, which are:
>>
>>a) how to guarantee that a large mount tree can be safely destroyed;
>
> What do you mean by safely? I check that the usage count on vfsmount
> structures is 1 under lock just before unlinking it - thereby making sure that
> no one has a file open on it, no process has it as its root or cwd, and that
> nothing is mounted upon it.
Not good enough. You need to be able to tell that atomically for a full
*tree*, that can contain multiple mounts, some of which have other
mounts on top of them, not just for a single superblock.
> Also, I do the actual unmounting from process context by walking the
> namespace's extant mount list, rather than directly nominating a vfsmount for
> removal.
>
> One drawback is that - taking AFS as an example - doing a umount of /afs won't
> work until all the subtrees have either been manually unmounted or have
> expired (though I can make umount capable of handling this).
See above.
>>b) how to detect partial unmounts.
>
> What do you mean by a partial unmount?
/foo/bar is an automounted filesystem, which has /foo/bar/baz mounted on
top of it. Now the user manually umounts /foo/bar/baz (because of
staleness, or whatever.) Now the automount system needs to detect
accesses to /foo/bar/baz and remount... effectively, /foo/bar/baz needs
to atomically turn into an automounter point.
-hpa
On Tue, 17 Jun 2003, David Howells wrote:
>
> The attached patch adds automounting support and mountpount expiry support to
> the VFS.
I don't think this is evil, but it looks a bit non-appropriate for now.
But I'd like to see Al's (and others) comments on it..
Linus
On Tue, Jun 17, 2003 at 09:55:17PM -0700, Linus Torvalds wrote:
>
> On Tue, 17 Jun 2003, David Howells wrote:
> >
> > The attached patch adds automounting support and mountpount expiry support to
> > the VFS.
>
> I don't think this is evil, but it looks a bit non-appropriate for now.
> But I'd like to see Al's (and others) comments on it..
I'm not too happy with it. If nothing else, IMO it's the wrong way to
solve the problem - if you want a bunch of filesystems look like a
single object (i.e. go together wherever we mount it, etc.), make it
a filesystem. That would make a lot of sense, and not only for AFS.
We need a light-weight automount. No arguments here. But it should
be per-namespace - i.e. "I want to have <foo> mounted on /usr/barf on
demand and I have no intention to screw somebody else - somebody who
might have the same directory seen as /usr/local/debian/barf and want
<blah> mounted on demand there".
Namespace is controled by owner of that namespace. It is a security
boundary, among other things. And events in one namespace should not
cause mounts in another. Yes, AFS wants an illusion of single filesystem
composed in fixed way from many pieces. But if you want to do that,
do it right - make sure that it acts as a single chunk in mount tree.
IOW, make it look like a single filesystem.
Hi Al,
> > I don't think this is evil, but it looks a bit non-appropriate for now.
> > But I'd like to see Al's (and others) comments on it..
>
> I'm not too happy with it. If nothing else, IMO it's the wrong way to
> solve the problem - if you want a bunch of filesystems look like a
> single object (i.e. go together wherever we mount it, etc.), make it
> a filesystem. That would make a lot of sense, and not only for AFS.
But I don't want them to look like a single object...
If I did, I could do AFS entirely inside one superblock. What I want is for
each AFS volume to be represented by its own superblock - and for the
differences to be visible from userspace. This also means that I can use AFS
vnode numbers directly as VFS inode numbers.
Furthermore, doing it this way means that if multiple instances of a volume
get mounted for some reason (maybe in different namespaces, or maybe by
multiple mountpoints referring to the same volume), they can share inode and
dentry records.
Layering another filesystem over the top (and doing the actual automount in a
private namespace for example) incurs certain "problems":
(*) rename (and to a lesser extent, link) are not trivial
(*) it hides the joins from userspace - which may not be what you want
(*) it may exhibit distinct files of the same inode and device numbers
(*) it adds a layer of indirection to every operation
(*) every "file" you want to access requires an extra inode and dentry
> We need a light-weight automount. No arguments here. But it should
> be per-namespace - i.e. "I want to have <foo> mounted on /usr/barf on
> demand and I have no intention to screw somebody else - somebody who
> might have the same directory seen as /usr/local/debian/barf and want
> <blah> mounted on demand there".
I don't have that intention of mucking someone else up either... But consider:
what happens when a namespace is copied? All the automounter directories for
autofs/amd/AFS are copied too... With the way I've come up with, this is
irrelevant; the in-VFS automount will still work exactly the way it did until
it is removed from that namespace. This is the way _I_ would expect things to
work.
I don't see that your argument is a problem... anyone who wants something
different in their namespace is going to have to change things anyway.
I think the default behaviour should be that a clone of a namespace will
behave exactly like the original would at the point it was copied, though
changes in the clone will not reflect on the original and vice-versa (as
should be the case with my patch).
> Namespace is controled by owner of that namespace. It is a security
> boundary, among other things. And events in one namespace should not
> cause mounts in another.
Which they don't (or, at least, shouldn't) with my patch...
> Yes, AFS wants an illusion of single filesystem composed in fixed way from
> many pieces. But if you want to do that, do it right - make sure that it
> acts as a single chunk in mount tree. IOW, make it look like a single
> filesystem.
As I said above, if I wanted AFS to appear as a single object, there's a lot
easier way of doing it - without touching the VFS at all.
However, mounting each volume separately gives several advantages:
(1) vnode numbers map directly to inode numbers, without inducing confusion
(2) rename() and link() get free checks and controls from the VFS
(3) "find -xdev" can be used to limit a search to a single volume
(4) the contents of a volume can easily and cheaply be reused for other
appearances in the same or a different vfsmount topology
David
> Namespaces have all kinds of problems anyway when mixed with
> automounting (it's not at all clear what the semantics should be, and
> I'm pretty sure that the semantics the current namespaces give are
> overall undersirable), but when I discussed this issue with Al Viro he
> indicated that you can always access all namespaces via procfs; if not,
> you *should* be able to...
You can't - at the moment anyway - necessarily access an entire
namespace... chroot, for example, may exclude part of it. This, however, may
not be a problem, except for fchdir might be able to put the current working
directory outside of your current root tree:
fd = open("/afs/.cambridge.redhat.com",O_DIRECTORY);
chdir("/sillyroot/");
chroot("/sillyroot/");
fchdir(fd);
chdir("afsdoc"); <-- automount required
With my patch, this should just work because the VFS knows exactly where the
automount needs to be done, and merely requires a raw vfsmount struct to be
supplied by the filesystem.
> > Whatever happens, stat() must _not_ cause the automount point to mount.
> >
>
> That's actually not true. It's lstat() that mustn't cause the automount
> point to mount -- stat() only comes into play if lstat() resolves to a
> symlink. However, lstat() never invokes follow_link, so creating a
> dentry with a follow_link method resolving to itself, and an associated
> dummy directory inode, does what's required.
That _is_ actually true. Doing "ls -l" in that directory would otherwise cause
a mount storm.
follow_link resolving to itself? Surely that'll cause ELOOP very quickly? And
where does this "dummy directory inode" live?
> > Or do you mean do it actually inside dentry->d_op->d_revalidate()? But you
> > can't do it there because you don't know what vfsmount you are dealing
> > with.
>
> I mean inside d_revalidate().
You can't do the mount inside d_revalidate(). You don't have enough
information.
> I don't see that it should be handed specially.
But it is special. It's a fs future if you will. If it's done at VFS level it
can be done a lot cleaner.
> >>a) how to guarantee that a large mount tree can be safely destroyed;
> >
> > What do you mean by safely? I check that the usage count on vfsmount
> > structures is 1 under lock just before unlinking it - thereby making sure
> > that no one has a file open on it, no process has it as its root or cwd,
> > and that nothing is mounted upon it.
>
> Not good enough. You need to be able to tell that atomically for a full
> *tree*, that can contain multiple mounts, some of which have other
> mounts on top of them, not just for a single superblock.
_Why_ do I need to do that?
I currently degrade a tree from the leaves inwards, true, but the fact that
each layer then needs to wait for expiry can be improved.
I don't expire branches in the tree because the leaves and branches depending
from them pin them by way of their usage counts.
One thing I can't do is a tree unmount, but then we can't do that anyway:
[root@host135 root]# df /home
Filesystem 1k-blocks Used Available Use% Mounted on
automount(pid902) 0 0 0 - /home
[root@host135 root]# ls /home/
[root@host135 root]# ls /home/dhowells/
...
[root@host135 root]# ls /home/
dhowells
[root@host135 root]# umount /home
umount: /home: device is busy
umount can, perhaps, be made to try to do this by trying to rid the tree of
unused automount points, until either it finds something that is in use or is
not a mountpoint, or it manages to completely clear the subtree.
Unused leaves can be determined by checking mnt_count under the dcache lock -
if the count is 1, then the leaf can be removed.
> >>b) how to detect partial unmounts.
> >
> > What do you mean by a partial unmount?
>
> /foo/bar is an automounted filesystem, which has /foo/bar/baz mounted on
> top of it. Now the user manually umounts /foo/bar/baz (because of
> staleness, or whatever.) Now the automount system needs to detect
> accesses to /foo/bar/baz and remount... effectively, /foo/bar/baz needs
> to atomically turn into an automounter point.
Yes... it does that. The dentry from the underlying filesystem is still there,
and resurfaces after the umount; and _that_ dentry is what marks this an
automount point (by virtue of its d_automount operation). So this is not a
problem.
David
On Wed, Jun 18, 2003 at 08:55:33AM +0100, David Howells wrote:
> > We need a light-weight automount. No arguments here. But it should
> > be per-namespace - i.e. "I want to have <foo> mounted on /usr/barf on
> > demand and I have no intention to screw somebody else - somebody who
> > might have the same directory seen as /usr/local/debian/barf and want
> > <blah> mounted on demand there".
>
> I don't have that intention of mucking someone else up either... But consider:
> what happens when a namespace is copied? All the automounter directories for
> autofs/amd/AFS are copied too... With the way I've come up with, this is
> irrelevant; the in-VFS automount will still work exactly the way it did until
> it is removed from that namespace. This is the way _I_ would expect things to
> work.
With your patch automount will happen on every reference to dentry.
Regardless of the namespace we are in.
> I don't see that your argument is a problem... anyone who wants something
> different in their namespace is going to have to change things anyway.
And how would they do it? You get mount triggered by stepping on dentry.
Period. Your kern_automount() will then create a new vfsmount and slap
it on the tree. Again, regardless of the namespace we are in / vfsmount
we are looking at / etc.
I have two namespaces. One of them has filesystem A mounted on /usr/include.
Another - on /usr/local/include. The first one wants /usr/include/foo1 trigger
mounting B and /usr/include/foo2 trigger mounting C. The second one wants
/usr/local/include/foo1 trigger mounting D and /usr/local/include/foo2 not
trigger anything.
Namespaces are completely unrelated - I have them set for two different
users that happen to need some common files, but otherwise have very
different environments. Could you describe what that "having to change
things" would involve?
On Wed, Jun 18, 2003 at 09:37:06AM +0100, David Howells wrote:
> One thing I can't do is a tree unmount, but then we can't do that anyway:
>
> [root@host135 root]# df /home
> Filesystem 1k-blocks Used Available Use% Mounted on
> automount(pid902) 0 0 0 - /home
> [root@host135 root]# ls /home/
> [root@host135 root]# ls /home/dhowells/
> ...
> [root@host135 root]# ls /home/
> dhowells
> [root@host135 root]# umount /home
> umount: /home: device is busy
Yes, we can. umount -l /home and there you go. RTFM.
> > > We need a light-weight automount. No arguments here. But it should
> > > be per-namespace - i.e. "I want to have <foo> mounted on /usr/barf on
> > > demand and I have no intention to screw somebody else - somebody who
> > > might have the same directory seen as /usr/local/debian/barf and want
> > > <blah> mounted on demand there".
> >
> > I don't have that intention of mucking someone else up either... But
> > consider: what happens when a namespace is copied? All the automounter
> > directories for autofs/amd/AFS are copied too... With the way I've come up
> > with, this is irrelevant; the in-VFS automount will still work exactly the
> > way it did until it is removed from that namespace. This is the way _I_
> > would expect things to work.
>
> With your patch automount will happen on every reference to dentry.
> Regardless of the namespace we are in.
Yes, that is, I think, the correct thing to do... but see also below.
> > I don't see that your argument is a problem... anyone who wants something
> > different in their namespace is going to have to change things anyway.
>
> And how would they do it? You get mount triggered by stepping on dentry.
> Period.
I think that's the correct behaviour for an automounter, except for stat(),
which I'd prefer _not_ to cause this (so you can "browse").
> Your kern_automount() will then create a new vfsmount and slap it on the
> tree. Again, regardless of the namespace we are in / vfsmount we are
> looking at / etc.
Looking at my code, I should probably use the semaphore from the namespace
pointed to by the vfsmount I'm going to be mounting on, just in case we've
managed to cross into someone else's namespace:
int kern_automount(struct vfsmount *on_mnt, struct dentry *on_dentry)
{
+ struct namespace *namespace = on_mnt->mnt_namespace;
struct nameidata nd;
...
- down_write(¤t->namespace->sem);
+ down_write(&namespace->sem);
...
- up_write(¤t->namespace->sem);
+ up_write(&namespace->sem);
...
}
> I have two namespaces. One of them has filesystem A mounted on /usr/include.
> Another - on /usr/local/include. The first one wants /usr/include/foo1 trigger
> mounting B and /usr/include/foo2 trigger mounting C. The second one wants
> /usr/local/include/foo1 trigger mounting D and /usr/local/include/foo2 not
> trigger anything.
>
> Namespaces are completely unrelated - I have them set for two different
> users that happen to need some common files, but otherwise have very
> different environments.
Not exactly unrelated. If namespace A derives from namespace B during clone(),
then I would consider B should behave exactly as A until modified - including
automount facilities.
> Could you describe what that "having to change things" would involve?
Well, if you, say, cloned a shell with its own namespace with the intent of
rearranging its namespace for that user, then I think it would be fair to say
that you'd have to "change things" at some point (ie: use (u)mount to
rearrange the topology).
David
David Howells wrote:
>>
>>That's actually not true. It's lstat() that mustn't cause the automount
>>point to mount -- stat() only comes into play if lstat() resolves to a
>>symlink. However, lstat() never invokes follow_link, so creating a
>>dentry with a follow_link method resolving to itself, and an associated
>>dummy directory inode, does what's required.
>
> That _is_ actually true. Doing "ls -l" in that directory would otherwise cause
> a mount storm.
>
It's not. ls -l and all the GUI tools do lstat(), not stat().
> follow_link resolving to itself? Surely that'll cause ELOOP very quickly? And
> where does this "dummy directory inode" live?
Nope. You can follow_link() nonrecursively. You need a dummy directory
inode to mount upon anyway.
-hpa
> > That _is_ actually true. Doing "ls -l" in that directory would otherwise
> > cause a mount storm.
> >
>
> It's not. ls -l and all the GUI tools do lstat(), not stat().
Sorry... you're correct. That should have been "ls -F" or "ls --color", both
of which are, I believe, commonly used - _they_ definitely use stat() as well
as lstat().
> > follow_link resolving to itself? Surely that'll cause ELOOP very quickly?
> > And where does this "dummy directory inode" live?
>
> Nope. You can follow_link() nonrecursively. You need a dummy directory
> inode to mount upon anyway.
You're right about follow_link() not recursing... it would have to recurse
itself, and so can avoid that. However, if it only ever follows to itself, how
does that help? That never actually gets you anywhere... It needs to trigger a
mount at some point.
Or do you mean it should follow to an arbitrary (disconnected, otherwise it
changes the topology from what the AFS admin required) dentry with a dummy
directory inode attached to it?
David
David Howells wrote:
>>>That _is_ actually true. Doing "ls -l" in that directory would otherwise
>>>cause a mount storm.
>>>
>>
>>It's not. ls -l and all the GUI tools do lstat(), not stat().
>
>
> Sorry... you're correct. That should have been "ls -F" or "ls --color", both
> of which are, I believe, commonly used - _they_ definitely use stat() as well
> as lstat().
>
Only if S_ISLNK.
>
>>>follow_link resolving to itself? Surely that'll cause ELOOP very quickly?
>>>And where does this "dummy directory inode" live?
>>
>>Nope. You can follow_link() nonrecursively. You need a dummy directory
>>inode to mount upon anyway.
>
> You're right about follow_link() not recursing... it would have to recurse
> itself, and so can avoid that. However, if it only ever follows to itself, how
> does that help? That never actually gets you anywhere... It needs to trigger a
> mount at some point.
>
> Or do you mean it should follow to an arbitrary (disconnected, otherwise it
> changes the topology from what the AFS admin required) dentry with a dummy
> directory inode attached to it?
>
You could do that if you wanted to, but it doesn't need to.
-hpa
> > Sorry... you're correct. That should have been "ls -F" or "ls --color",
> > both of which are, I believe, commonly used - _they_ definitely use stat()
> > as well as lstat().
>
> Only if S_ISLNK.
I don't know how widespread the practice is, but in the AFS Admin Guide under
"Configuring Your AFS Filespace" / "The Second (Cellname) Level" it suggests
creating symlinks in /afs to provide aliases for cells. For instance, in my
test cell, I've got:
[root@host135 root]# ls -l /afs
total 0
lrwxr-xr-x 1 4077 root 20 Feb 7 2002 cambridge -> cambridge.redhat.com
drw-r--r-- 1 4077 root 11 Feb 7 2002 cambridge.redhat.com
drw-r--r-- 1 4077 root 29 Mar 27 2002 grand.central.org
Unfortunately, the above command causes cambridge.redhat.com to be mounted
with the follow_link() on directory method you suggested (which is
undesirable):-/
David