2005-05-03 14:39:34

by Miklos Szeredi

[permalink] [raw]
Subject: [RCF] [PATCH] unprivileged mount/umount

This (lightly tested) patch against 2.6.12-rc* adds some
infrastructure and basic functionality for unprivileged mount/umount
system calls.

Details:

- new mnt_owner field in struct vfsmount
- if mnt_owner is NULL, it's a privileged mount
- global limit on unprivileged mounts in /proc/sys/fs/mount-max
- per user limit of mounts in rlimit
- allow umount for the owner (except force flag)
- allow unprivileged bind mount to files/directories writable by owner
- add nosuid,nodev flags to unprivileged mounts

Next step would be to add some policy for new mounts. I'm thinking of
either something static: e.g. FS_SAFE flag for "safe" filesystems, or
a more configurable approach through sysfs or something.

Comments?

Thanks,
Miklos

Index: fs/namespace.c
===================================================================
--- a6d962c4f559f3644678574a66310084fd13d130/fs/namespace.c (mode:100644 sha1:3b93e5d750ebf8452ea1264251c5b55cc89f48f8)
+++ uncommitted/fs/namespace.c (mode:100644)
@@ -42,7 +42,7 @@
static struct list_head *mount_hashtable;
static int hash_mask, hash_bits;
static kmem_cache_t *mnt_cache;
-
+struct mounts_stat_struct mounts_stat;
static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
unsigned long tmp = ((unsigned long) mnt / L1_CACHE_BYTES);
@@ -51,10 +51,44 @@
return tmp & hash_mask;
}

-struct vfsmount *alloc_vfsmnt(const char *name)
+static inline int inc_nr_mounts(struct user_struct *owner)
{
- struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
- if (mnt) {
+ int err = 0;
+ spin_lock(&vfsmount_lock);
+ if (owner &&
+ (mounts_stat.nr_mounts >= mounts_stat.max_mounts ||
+ owner->mounts >= current->signal->rlim[RLIMIT_MOUNTS].rlim_cur))
+ err = -EPERM;
+ else {
+ mounts_stat.nr_mounts++;
+ if (owner)
+ owner->mounts++;
+ }
+ spin_unlock(&vfsmount_lock);
+ return err;
+}
+
+static inline void dec_nr_mounts(struct user_struct *owner)
+{
+ spin_lock(&vfsmount_lock);
+ mounts_stat.nr_mounts--;
+ if (owner)
+ owner->mounts--;
+ spin_unlock(&vfsmount_lock);
+}
+
+struct vfsmount *alloc_vfsmnt(const char *name, struct user_struct *owner)
+{
+ struct vfsmount *mnt;
+ int err = inc_nr_mounts(owner);
+ if (err)
+ return ERR_PTR(err);
+
+ mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
+ if (!mnt) {
+ dec_nr_mounts(owner);
+ return ERR_PTR(-ENOMEM);
+ } else {
memset(mnt, 0, sizeof(struct vfsmount));
atomic_set(&mnt->mnt_count,1);
INIT_LIST_HEAD(&mnt->mnt_hash);
@@ -70,14 +104,21 @@
mnt->mnt_devname = newname;
}
}
+ if (owner) {
+ atomic_inc(&owner->__count);
+ mnt->mnt_owner = owner;
+ }
}
return mnt;
}

void free_vfsmnt(struct vfsmount *mnt)
{
+ struct user_struct *owner = mnt->mnt_owner;
kfree(mnt->mnt_devname);
kmem_cache_free(mnt_cache, mnt);
+ dec_nr_mounts(owner);
+ free_uid(owner);
}

/*
@@ -148,13 +189,16 @@
}

static struct vfsmount *
-clone_mnt(struct vfsmount *old, struct dentry *root)
+clone_mnt(struct vfsmount *old, struct dentry *root, struct user_struct *owner)
{
struct super_block *sb = old->mnt_sb;
- struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
+ struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname, owner);

- if (mnt) {
+ if (!IS_ERR(mnt)) {
mnt->mnt_flags = old->mnt_flags;
+ /* Unprivileged mounts require NOSUID and NODEV */
+ if (owner)
+ mnt->mnt_flags |= MNT_NOSUID | MNT_NODEV;
atomic_inc(&sb->s_active);
mnt->mnt_sb = sb;
mnt->mnt_root = dget(root);
@@ -252,6 +296,8 @@
if (mnt->mnt_flags & fs_infop->flag)
seq_puts(m, fs_infop->str);
}
+ if (mnt->mnt_owner)
+ seq_printf(m, ",owner=%i", mnt->mnt_owner->uid);
if (mnt->mnt_sb->s_op->show_options)
err = mnt->mnt_sb->s_op->show_options(m, mnt);
seq_puts(m, " 0 0\n");
@@ -480,7 +526,8 @@
goto dput_and_out;

retval = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN) && (nd.mnt->mnt_owner != current->user ||
+ (flags & MNT_FORCE)))
goto dput_and_out;

retval = do_umount(nd.mnt, flags);
@@ -503,22 +550,21 @@

#endif

-static int mount_is_safe(struct nameidata *nd)
+static struct user_struct *mount_is_safe(struct nameidata *nd)
{
if (capable(CAP_SYS_ADMIN))
- return 0;
- return -EPERM;
-#ifdef notyet
- if (S_ISLNK(nd->dentry->d_inode->i_mode))
- return -EPERM;
+ return NULL;
+
+ if (!S_ISDIR(nd->dentry->d_inode->i_mode) &&
+ !S_ISREG(nd->dentry->d_inode->i_mode))
+ return ERR_PTR(-EPERM);
if (nd->dentry->d_inode->i_mode & S_ISVTX) {
- if (current->uid != nd->dentry->d_inode->i_uid)
- return -EPERM;
+ if (current->fsuid != nd->dentry->d_inode->i_uid)
+ return ERR_PTR(-EPERM);
}
if (permission(nd->dentry->d_inode, MAY_WRITE, nd))
- return -EPERM;
- return 0;
-#endif
+ return ERR_PTR(-EPERM);
+ return current->user;
}

static int
@@ -533,15 +579,16 @@
}
}

-static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry)
+static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
+ struct user_struct *owner)
{
struct vfsmount *res, *p, *q, *r, *s;
struct list_head *h;
struct nameidata nd;

- res = q = clone_mnt(mnt, dentry);
- if (!q)
- goto Enomem;
+ res = q = clone_mnt(mnt, dentry, owner);
+ if (IS_ERR(q))
+ goto out_error;
q->mnt_mountpoint = mnt->mnt_mountpoint;

p = mnt;
@@ -558,9 +605,9 @@
p = s;
nd.mnt = q;
nd.dentry = p->mnt_mountpoint;
- q = clone_mnt(p, p->mnt_root);
- if (!q)
- goto Enomem;
+ q = clone_mnt(p, p->mnt_root, owner);
+ if (IS_ERR(q))
+ goto out_error;
spin_lock(&vfsmount_lock);
list_add_tail(&q->mnt_list, &res->mnt_list);
attach_mnt(q, &nd);
@@ -568,13 +615,13 @@
}
}
return res;
- Enomem:
+ out_error:
if (res) {
spin_lock(&vfsmount_lock);
umount_tree(res);
spin_unlock(&vfsmount_lock);
}
- return NULL;
+ return q;
}

static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
@@ -620,11 +667,12 @@
*/
static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
{
+ int err;
struct nameidata old_nd;
struct vfsmount *mnt = NULL;
- int err = mount_is_safe(nd);
- if (err)
- return err;
+ struct user_struct *owner = mount_is_safe(nd);
+ if (IS_ERR(owner))
+ return PTR_ERR(owner);
if (!old_name || !*old_name)
return -EINVAL;
err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
@@ -634,11 +682,14 @@
down_write(&current->namespace->sem);
err = -EINVAL;
if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd.mnt))) {
- err = -ENOMEM;
if (recurse)
- mnt = copy_tree(old_nd.mnt, old_nd.dentry);
+ mnt = copy_tree(old_nd.mnt, old_nd.dentry, owner);
else
- mnt = clone_mnt(old_nd.mnt, old_nd.dentry);
+ mnt = clone_mnt(old_nd.mnt, old_nd.dentry, owner);
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
+ goto out;
+ }
}

if (mnt) {
@@ -656,6 +707,7 @@
mntput(mnt);
}

+ out:
up_write(&current->namespace->sem);
path_release(&old_nd);
return err;
@@ -1066,6 +1118,7 @@
struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
struct fs_struct *fs = tsk->fs;
struct vfsmount *p, *q;
+ int err;

if (!namespace)
return 0;
@@ -1075,11 +1128,11 @@
if (!(flags & CLONE_NEWNS))
return 0;

- if (!capable(CAP_SYS_ADMIN)) {
- put_namespace(namespace);
- return -EPERM;
- }
+ err = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;

+ err = -ENOMEM;
new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL);
if (!new_ns)
goto out;
@@ -1090,8 +1143,9 @@

down_write(&tsk->namespace->sem);
/* First pass: copy the tree topology */
- new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root);
- if (!new_ns->root) {
+ new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root, 0);
+ if (IS_ERR(new_ns->root)) {
+ err = PTR_ERR(new_ns->root);
up_write(&tsk->namespace->sem);
kfree(new_ns);
goto out;
@@ -1142,7 +1196,7 @@

out:
put_namespace(namespace);
- return -ENOMEM;
+ return err;
}

asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
Index: fs/super.c
===================================================================
--- a6d962c4f559f3644678574a66310084fd13d130/fs/super.c (mode:100644 sha1:3a1b8ca04ba601b37ffa5dadb8f9695272952e39)
+++ uncommitted/fs/super.c (mode:100644)
@@ -797,7 +797,7 @@
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
struct file_system_type *type = get_fs_type(fstype);
- struct super_block *sb = ERR_PTR(-ENOMEM);
+ struct super_block *sb;
struct vfsmount *mnt;
int error;
char *secdata = NULL;
@@ -805,9 +805,11 @@
if (!type)
return ERR_PTR(-ENODEV);

- mnt = alloc_vfsmnt(name);
- if (!mnt)
- goto out;
+ mnt = alloc_vfsmnt(name, NULL);
+ if (IS_ERR(mnt)) {
+ put_filesystem(type);
+ return mnt;
+ }

if (data) {
secdata = alloc_secdata();
@@ -845,7 +847,6 @@
free_secdata(secdata);
out_mnt:
free_vfsmnt(mnt);
-out:
put_filesystem(type);
return (struct vfsmount *)sb;
}
Index: include/asm-generic/resource.h
===================================================================
--- a6d962c4f559f3644678574a66310084fd13d130/include/asm-generic/resource.h (mode:100644 sha1:cfe3692b23e580a01ccf0ad7a9102e854c8be792)
+++ uncommitted/include/asm-generic/resource.h (mode:100644)
@@ -44,8 +44,9 @@
#define RLIMIT_NICE 13 /* max nice prio allowed to raise to
0-39 for nice level 19 .. -20 */
#define RLIMIT_RTPRIO 14 /* maximum realtime priority */
+#define RLIMIT_MOUNTS 15 /* maximum number of mounts for user */

-#define RLIM_NLIMITS 15
+#define RLIM_NLIMITS 16

/*
* SuS says limits have to be unsigned.
@@ -86,6 +87,7 @@
[RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \
[RLIMIT_NICE] = { 0, 0 }, \
[RLIMIT_RTPRIO] = { 0, 0 }, \
+ [RLIMIT_MOUNTS] = { RLIM_INFINITY, RLIM_INFINITY }, \
}

#endif /* __KERNEL__ */
Index: include/linux/fs.h
===================================================================
--- a6d962c4f559f3644678574a66310084fd13d130/include/linux/fs.h (mode:100644 sha1:4edba067a7178103f78544717a91ef98dae7994d)
+++ uncommitted/include/linux/fs.h (mode:100644)
@@ -36,6 +36,12 @@
};
extern struct files_stat_struct files_stat;

+struct mounts_stat_struct {
+ int nr_mounts;
+ int max_mounts;
+};
+extern struct mounts_stat_struct mounts_stat;
+
struct inodes_stat_t {
int nr_inodes;
int nr_unused;
Index: include/linux/mount.h
===================================================================
--- a6d962c4f559f3644678574a66310084fd13d130/include/linux/mount.h (mode:100644 sha1:8b8d3b9beefdc4f91cfa12d9275976408590cbcc)
+++ uncommitted/include/linux/mount.h (mode:100644)
@@ -33,6 +33,7 @@
int mnt_flags;
int mnt_expiry_mark; /* true if marked for expiry */
char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
+ struct user_struct *mnt_owner; /* Mount owner */
struct list_head mnt_list;
struct list_head mnt_fslink; /* link in fs-specific expiry list */
struct namespace *mnt_namespace; /* containing namespace */
@@ -64,7 +65,8 @@
}

extern void free_vfsmnt(struct vfsmount *mnt);
-extern struct vfsmount *alloc_vfsmnt(const char *name);
+extern struct vfsmount *alloc_vfsmnt(const char *name,
+ struct user_struct *owner);
extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
const char *name, void *data);

Index: include/linux/sched.h
===================================================================
--- a6d962c4f559f3644678574a66310084fd13d130/include/linux/sched.h (mode:100644 sha1:5f868a5885811571fdc701037bf7b09b40a746b8)
+++ uncommitted/include/linux/sched.h (mode:100644)
@@ -408,6 +408,9 @@
unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
unsigned long locked_shm; /* How many pages of mlocked shm ? */

+ /* protected by vfsmount_lock */
+ unsigned long mounts; /* Number of mounts this user owns */
+
#ifdef CONFIG_KEYS
struct key *uid_keyring; /* UID specific keyring */
struct key *session_keyring; /* UID's default session keyring */
Index: include/linux/sysctl.h
===================================================================
--- a6d962c4f559f3644678574a66310084fd13d130/include/linux/sysctl.h (mode:100644 sha1:772998147e3ef989988c184520f6dacba9fb601d)
+++ uncommitted/include/linux/sysctl.h (mode:100644)
@@ -667,8 +667,8 @@
FS_NRFILE=6, /* int:current number of allocated filedescriptors */
FS_MAXFILE=7, /* int:maximum number of filedescriptors that can be allocated */
FS_DENTRY=8,
- FS_NRSUPER=9, /* int:current number of allocated super_blocks */
- FS_MAXSUPER=10, /* int:maximum number of super_blocks that can be allocated */
+ FS_NRMOUNT=9, /* int:current number of mounts */
+ FS_MAXMOUNT=10, /* int:maximum number of mounts allowed */
FS_OVERFLOWUID=11, /* int: overflow UID */
FS_OVERFLOWGID=12, /* int: overflow GID */
FS_LEASES=13, /* int: leases enabled */
Index: kernel/sysctl.c
===================================================================
--- a6d962c4f559f3644678574a66310084fd13d130/kernel/sysctl.c (mode:100644 sha1:701d12c6306844136a8e525902d06082733127fb)
+++ uncommitted/kernel/sysctl.c (mode:100644)
@@ -885,6 +885,22 @@
.proc_handler = &proc_dointvec,
},
{
+ .ctl_name = FS_NRMOUNT,
+ .procname = "mount-nr",
+ .data = &mounts_stat.nr_mounts,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = FS_MAXMOUNT,
+ .procname = "mount-max",
+ .data = &mounts_stat.max_mounts,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
.ctl_name = FS_OVERFLOWUID,
.procname = "overflowuid",
.data = &fs_overflowuid,


2005-05-03 19:54:34

by Bill Davidsen

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Miklos Szeredi wrote:
> This (lightly tested) patch against 2.6.12-rc* adds some
> infrastructure and basic functionality for unprivileged mount/umount
> system calls.
>
> Details:
>
> - new mnt_owner field in struct vfsmount
> - if mnt_owner is NULL, it's a privileged mount
> - global limit on unprivileged mounts in /proc/sys/fs/mount-max
> - per user limit of mounts in rlimit
> - allow umount for the owner (except force flag)
> - allow unprivileged bind mount to files/directories writable by owner
> - add nosuid,nodev flags to unprivileged mounts
>
> Next step would be to add some policy for new mounts. I'm thinking of
> either something static: e.g. FS_SAFE flag for "safe" filesystems, or
> a more configurable approach through sysfs or something.
>
> Comments?

Are these public or private mounts? In other words, is the mount visible
only to the mounting process and children, or is it visible to (and can
effect) other processes. Clearly true private mounts open uses with
chroot jails and virtualization.

--
-bill davidsen ([email protected])
"The secret to procrastination is to put things off until the
last possible moment - but no longer" -me

2005-05-04 13:08:11

by Eric Van Hensbergen

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On 5/3/05, Miklos Szeredi <[email protected]> wrote:
> This (lightly tested) patch against 2.6.12-rc* adds some
> infrastructure and basic functionality for unprivileged mount/umount
> system calls.
>
> Details:
>
> - new mnt_owner field in struct vfsmount
> - if mnt_owner is NULL, it's a privileged mount
> - global limit on unprivileged mounts in /proc/sys/fs/mount-max
> - per user limit of mounts in rlimit

I was starting down this track in my tree, but I'm glad you beat me to it ;).
Your initial limit (10) seems low if you consider binds as mounts. I
can easily see a user using more than 10 binds in an environment. As
Ram mentioned earlier - we are going to run into problems with the
shared-subtree stuff if propagations into private namespaces count as
a new mount. We need to think through how we are going to deal with
this.

> - allow umount for the owner (except force flag)
> - allow unprivileged bind mount to files/directories writable by owner
> - add nosuid,nodev flags to unprivileged mounts
>
> Next step would be to add some policy for new mounts. I'm thinking of
> either something static: e.g. FS_SAFE flag for "safe" filesystems, or
> a more configurable approach through sysfs or something.
>

I think the FS_SAFE stuff needs to be part of this patch. Folks made
a pretty good argument that mounting corrupted images with certain
file systems could be a really bad thing. I'm not sure on the whole
sysfs configurable approach -- it seems like more advanced policies
would be best handled in user-space.

My major complaint is that I really think having user mounts without
requiring them to be in a user's private namespace creates quite a
mess. It potentially pollutes the system's namespace and opens up the
possibility of all sorts of synthetic file system "traps". I'd rather
see the private namespace stuff enforced before enabling user-mounts.

-eric

2005-05-04 13:49:43

by Martin Waitz

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

hoi :)

On Tue, May 03, 2005 at 04:31:35PM +0200, Miklos Szeredi wrote:
> This (lightly tested) patch against 2.6.12-rc* adds some
> infrastructure and basic functionality for unprivileged mount/umount
> system calls.

most of this unprivileged mount policy can be handled by a suid
userspace helper (e.g. pmount)

what are the pros/cons of handling that in the kernel?

--
Martin Waitz


Attachments:
(No filename) (388.00 B)
signature.asc (189.00 B)
Digital signature
Download all attachments

2005-05-04 14:21:57

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> I was starting down this track in my tree, but I'm glad you beat me
> to it ;). Your initial limit (10) seems low if you consider binds
> as mounts.

Where did you see 10 as the limit? The initial global limit is zero,
the initial per-user limit is infinity (I did actually test these ;)

> I can easily see a user using more than 10 binds in an environment.
> As Ram mentioned earlier - we are going to run into problems with
> the shared-subtree stuff if propagations into private namespaces
> count as a new mount. We need to think through how we are going to
> deal with this.

Hmm, interesting question. The safest is to account all mounts in
private namespace (including initial and propagated mounts) to the
user creating the namespace.

> My major complaint is that I really think having user mounts without
> requiring them to be in a user's private namespace creates quite a
> mess. It potentially pollutes the system's namespace and opens up the
> possibility of all sorts of synthetic file system "traps". I'd rather
> see the private namespace stuff enforced before enabling user-mounts.

Yes, I see your point. However the problem of malicious filesystem
"traps" applies to private namespaces as well (because of suid
programs).

So if a user creates a private namespace, it should have the choice of:

1) Giving up all suid rights (i.e. all mounts are cloned and
propagated with nosuid)

2) Not giving up suid for cloned and propagated mounts, but having
extra limitations (suid/sgid programs cannot access unprivileged
"synthetic" mounts)

1) could have the advantage of allowing the user _arbitrary_
modification to the namespace, without limitations on mountpoint
writablity, etc.

Bind mounts are less problematic than user-controlled "synthetic"
filesystems, but it could still cause problems (e.g. trick updatedb to
enter directories which are otherwise in it's prune list).

So what are the possible solutions?

a) Use "invisible" mounts (a-la my earlier patch) to hide
problematic mounts from other users. This would not be a
replacement for private namespaces, rather a supplement.

b) same as a) but disallow unprivileged mounts in the global
namespace completely

c/d) same as a/b) but without invisible mounts, simply by not
allowing processes owned by other users (incl. suid/sgid
processes started by mount owner) to access the files. FUSE
now does c).

e) ???

My order of preference is a), c), d), b). I don't see any advantage
of b) over a).

Opinions? Ideas?

Thanks,
Miklos

2005-05-04 14:34:46

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> > This (lightly tested) patch against 2.6.12-rc* adds some
> > infrastructure and basic functionality for unprivileged mount/umount
> > system calls.
>
> most of this unprivileged mount policy can be handled by a suid
> userspace helper (e.g. pmount)

Yes. This patch was created because certain people (including myself
to some extent) don't like that approach.

> what are the pros/cons of handling that in the kernel?

pro:

- mount would still work in "suidless" private namespace

con:

- kernel bloat

I'm sure there are others, they just elude me for the moment :)

Thanks,
Miklos

2005-05-04 14:51:34

by Eric Van Hensbergen

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On 5/4/05, Miklos Szeredi <[email protected]> wrote:
>
> Where did you see 10 as the limit? The initial global limit is zero,
> the initial per-user limit is infinity (I did actually test these ;)
>

The verdict is: I should stop trying to read code before I've had my
coffee in the morning. Sorry about that.

>
> > My major complaint is that I really think having user mounts without
> > requiring them to be in a user's private namespace creates quite a
> > mess. It potentially pollutes the system's namespace and opens up the
> > possibility of all sorts of synthetic file system "traps". I'd rather
> > see the private namespace stuff enforced before enabling user-mounts.
>
> Yes, I see your point. However the problem of malicious filesystem
> "traps" applies to private namespaces as well (because of suid
> programs).
>

I'm not sure I quite get this (perhaps I haven't had enough coffee
yet...hmmm). There are, of course, multiple potential vulnerabilities
here -- but if you confine a user's modification to a private
namespace, system daemons and other users would be safe unless they
explicitly suid to the user and we had the per-user namespace
semantics that are being discussed in the other thread.

So, two comments here - if the per-user namespace semantics exist and
someone/something suid's to that user - they make themselves
vulnerable. I imagine there are numerous traps you can set for
someone that suids to your profile. Do any of the system daemons
actually do this?

The other comment is that this is why I don't like per-user namespaces
- per process namespaces avoid this vulnerability. However, that
discussion is already being covered to some length in the other
thread, so I don't really want to go into it again here.

> ...

I don't like any of the proposed solutions. Invisible mounts just
aren't the right solution. Restricting suid/sgid in a cloned
namespace seems like it would be impractical and cause lots of
problems.

Viro - you put in the private namespace stuff originally, and use it
from a user-context (binding your bin directories and what not).
What's your vision on this? What do we need to do to make user
mounts/binds/private-namespace a reality?

-eric

2005-05-04 15:22:27

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> >
> > > My major complaint is that I really think having user mounts without
> > > requiring them to be in a user's private namespace creates quite a
> > > mess. It potentially pollutes the system's namespace and opens up the
> > > possibility of all sorts of synthetic file system "traps". I'd rather
> > > see the private namespace stuff enforced before enabling user-mounts.
> >
> > Yes, I see your point. However the problem of malicious filesystem
> > "traps" applies to private namespaces as well (because of suid
> > programs).
> >
>
> I'm not sure I quite get this (perhaps I haven't had enough coffee
> yet...hmmm). There are, of course, multiple potential vulnerabilities
> here -- but if you confine a user's modification to a private
> namespace, system daemons and other users would be safe

Yes.

I'm talking about a different problem. Here's a little example (it's
a little made up, because I didn't properly do my research yet)

sysadmin makes an 'pkginstall' tool available to users with
'sudo'

'pkginstall' checks (with some signature) that the package is from a
trusted source, before installing it. (I don't know if real package
managers can do this, but I can imagine so)

'pkginstall' locks it's database before installation

black hat user implements a filesystem, in which file open never
returns. Then mounts it and then runs 'sudo pkginstall
/tmp/nastyfs/t.pkg'

the result will be that no other users will be able to install
packages until the sysadmin comes back and kills the hanging
process.

I'm not sure if this is a problem is real life, but I can imagine it
being so.

The basic reason is that with a synthetic filesystem controlled by the
user, the user gets to tamper with the operation of the suid/sgid
process. There _is_ a reason why suid/sgid processes are not
ptrace()-able, and this is something very similar.

> unless they explicitly suid to the user and we had the per-user
> namespace semantics that are being discussed in the other thread.
>
> So, two comments here - if the per-user namespace semantics exist and
> someone/something suid's to that user - they make themselves
> vulnerable. I imagine there are numerous traps you can set for
> someone that suids to your profile. Do any of the system daemons
> actually do this?

Not that I know of. Even if they would, the reasoning is that they
would be vulnerable to being ptraced (and thus tempered with in
various ways) anyway.

So if you only allow access for processes that the filesystem owner
can ptrace, you made sure that you cannot "trap" any process you
couldn't already trap.

> I don't like any of the proposed solutions. Invisible mounts just
> aren't the right solution. Restricting suid/sgid in a cloned
> namespace seems like it would be impractical and cause lots of
> problems.

Probably true in most cases. Optionally having a "suidless"
playground for users could be very nice and perfectly secure though.

Thanks,
Miklos

2005-05-11 08:48:32

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Tue, May 03, 2005 at 04:31:35PM +0200, Miklos Szeredi wrote:
> This (lightly tested) patch against 2.6.12-rc* adds some
> infrastructure and basic functionality for unprivileged mount/umount
> system calls.

Thanks for doing this.

> Details:
>
> - new mnt_owner field in struct vfsmount
> - if mnt_owner is NULL, it's a privileged mount
> - global limit on unprivileged mounts in /proc/sys/fs/mount-max

I think the name should be different. user-mount-max?

Acutally the accounting in your patch is a little odd, we account for
all mounts, and after mount-max is reached user mounts are denied.
Shouldn't we account only for user mounts?

> - per user limit of mounts in rlimit
> - allow umount for the owner (except force flag)
> - allow unprivileged bind mount to files/directories writable by owner
> - add nosuid,nodev flags to unprivileged mounts
>
> Next step would be to add some policy for new mounts. I'm thinking of
> either something static: e.g. FS_SAFE flag for "safe" filesystems, or
> a more configurable approach through sysfs or something.
>
> Comments?

> --- a6d962c4f559f3644678574a66310084fd13d130/fs/namespace.c (mode:100644 sha1:3b93e5d750ebf8452ea1264251c5b55cc89f48f8)
> +++ uncommitted/fs/namespace.c (mode:100644)
> @@ -42,7 +42,7 @@
> static struct list_head *mount_hashtable;
> static int hash_mask, hash_bits;
> static kmem_cache_t *mnt_cache;
> -
> +struct mounts_stat_struct mounts_stat;
> static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)

minor nipick - please keep a empty line before the function here.
Also I wonder whether we should have struct mounts_stat_struct at all,
just having two variables seems a lot saner to me.

> - if (!capable(CAP_SYS_ADMIN))
> + if (!capable(CAP_SYS_ADMIN) && (nd.mnt->mnt_owner != current->user ||
> + (flags & MNT_FORCE)))
> goto dput_and_out;

although it won't have different results I'd reorder this to make reading
more easy:

if ((nd.mnt->mnt_owner != current->user || (flags & MNT_FORCE)) &&
!capable(CAP_SYS_ADMIN))

> -static int mount_is_safe(struct nameidata *nd)
> +static struct user_struct *mount_is_safe(struct nameidata *nd)
> {
> if (capable(CAP_SYS_ADMIN))
> - return 0;
> - return -EPERM;
> -#ifdef notyet
> - if (S_ISLNK(nd->dentry->d_inode->i_mode))
> - return -EPERM;
> + return NULL;
> +
> + if (!S_ISDIR(nd->dentry->d_inode->i_mode) &&
> + !S_ISREG(nd->dentry->d_inode->i_mode))
> + return ERR_PTR(-EPERM);
> if (nd->dentry->d_inode->i_mode & S_ISVTX) {
> - if (current->uid != nd->dentry->d_inode->i_uid)
> - return -EPERM;
> + if (current->fsuid != nd->dentry->d_inode->i_uid)
> + return ERR_PTR(-EPERM);
> }
> if (permission(nd->dentry->d_inode, MAY_WRITE, nd))
> - return -EPERM;
> - return 0;
> -#endif
> + return ERR_PTR(-EPERM);
> + return current->user;

Currently we do allow bind mounts over every type of file for the super
user. I think we should keep allowing that. Also I think this function
wants a really big comment explaining all the rules for user mounts.

2005-05-11 08:53:33

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Wed, May 04, 2005 at 03:47:17PM +0200, Martin Waitz wrote:
> hoi :)
>
> On Tue, May 03, 2005 at 04:31:35PM +0200, Miklos Szeredi wrote:
> > This (lightly tested) patch against 2.6.12-rc* adds some
> > infrastructure and basic functionality for unprivileged mount/umount
> > system calls.
>
> most of this unprivileged mount policy can be handled by a suid
> userspace helper (e.g. pmount)

Not sanely. It's what started this whole threads. see the threads about
fuse and the nasty cifs ioctl.

2005-05-11 08:52:09

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Wed, May 04, 2005 at 04:21:23PM +0200, Miklos Szeredi wrote:
> Yes, I see your point. However the problem of malicious filesystem
> "traps" applies to private namespaces as well (because of suid
> programs).
>
> So if a user creates a private namespace, it should have the choice of:
>
> 1) Giving up all suid rights (i.e. all mounts are cloned and
> propagated with nosuid)
>
> 2) Not giving up suid for cloned and propagated mounts, but having
> extra limitations (suid/sgid programs cannot access unprivileged
> "synthetic" mounts)

Although I hate special cases I think that we might need 2) to avoid too
much trouble tripping over the global namespace. OTOH that should also
accelarate adoption of giving each user a separate namespace on login in
the various distribution, which is a good thing ;-)

2005-05-11 10:21:19

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> > Details:
> >
> > - new mnt_owner field in struct vfsmount
> > - if mnt_owner is NULL, it's a privileged mount
> > - global limit on unprivileged mounts in /proc/sys/fs/mount-max
>
> I think the name should be different. user-mount-max?
>
> Acutally the accounting in your patch is a little odd, we account for
> all mounts, and after mount-max is reached user mounts are denied.
> Shouldn't we account only for user mounts?

It's done similarly to files-max. I'm not particularly attached to
either view.

> > --- a6d962c4f559f3644678574a66310084fd13d130/fs/namespace.c (mode:100644 sha1:3b93e5d750ebf8452ea1264251c5b55cc89f48f8)
> > +++ uncommitted/fs/namespace.c (mode:100644)
> > @@ -42,7 +42,7 @@
> > static struct list_head *mount_hashtable;
> > static int hash_mask, hash_bits;
> > static kmem_cache_t *mnt_cache;
> > -
> > +struct mounts_stat_struct mounts_stat;
> > static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
>
> minor nipick - please keep a empty line before the function here.
> Also I wonder whether we should have struct mounts_stat_struct at all,
> just having two variables seems a lot saner to me.

OK. Again I was just copying files_stat_struct.

> > - if (!capable(CAP_SYS_ADMIN))
> > + if (!capable(CAP_SYS_ADMIN) && (nd.mnt->mnt_owner != current->user ||
> > + (flags & MNT_FORCE)))
> > goto dput_and_out;
>
> although it won't have different results I'd reorder this to make reading
> more easy:
>
> if ((nd.mnt->mnt_owner != current->user || (flags & MNT_FORCE)) &&
> !capable(CAP_SYS_ADMIN))

OK.

> > -static int mount_is_safe(struct nameidata *nd)
> > +static struct user_struct *mount_is_safe(struct nameidata *nd)
> > {
> > if (capable(CAP_SYS_ADMIN))
> > - return 0;
> > - return -EPERM;
> > -#ifdef notyet
> > - if (S_ISLNK(nd->dentry->d_inode->i_mode))
> > - return -EPERM;
> > + return NULL;
> > +
> > + if (!S_ISDIR(nd->dentry->d_inode->i_mode) &&
> > + !S_ISREG(nd->dentry->d_inode->i_mode))
> > + return ERR_PTR(-EPERM);
> > if (nd->dentry->d_inode->i_mode & S_ISVTX) {
> > - if (current->uid != nd->dentry->d_inode->i_uid)
> > - return -EPERM;
> > + if (current->fsuid != nd->dentry->d_inode->i_uid)
> > + return ERR_PTR(-EPERM);
> > }
> > if (permission(nd->dentry->d_inode, MAY_WRITE, nd))
> > - return -EPERM;
> > - return 0;
> > -#endif
> > + return ERR_PTR(-EPERM);
> > + return current->user;
>
> Currently we do allow bind mounts over every type of file for the super
> user. I think we should keep allowing that.

Yep. I didn't change that check (first two lines of function), so it
should work as it used to.

> Also I think this function wants a really big comment explaining
> all the rules for user mounts.

OK.

Thanks for the comments,
Miklos

2005-05-11 10:31:24

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> > Yes, I see your point. However the problem of malicious filesystem
> > "traps" applies to private namespaces as well (because of suid
> > programs).
> >
> > So if a user creates a private namespace, it should have the choice of:
> >
> > 1) Giving up all suid rights (i.e. all mounts are cloned and
> > propagated with nosuid)
> >
> > 2) Not giving up suid for cloned and propagated mounts, but having
> > extra limitations (suid/sgid programs cannot access unprivileged
> > "synthetic" mounts)
>
> Although I hate special cases I think that we might need 2) to avoid too
> much trouble tripping over the global namespace.

I think it should be both. How about a new clone option "CLONE_NOSUID"?

Miklos

2005-05-11 16:41:41

by Bodo Eggert

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Miklos Szeredi <[email protected]> wrote:

> How about a new clone option "CLONE_NOSUID"?

IMO, the clone call ist the wrong place to create namespaces. It should be
deprecated by a mkdir/chdir-like interface.
--
Incoming fire has the right of way.

2005-05-11 17:07:44

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Bodo Eggert <[email protected]> wrote:
> > How about a new clone option "CLONE_NOSUID"?
>
> IMO, the clone call ist the wrong place to create namespaces. It should be
> deprecated by a mkdir/chdir-like interface.

And the mkdir/chdir interface already exists, see "cd /proc/NNN/root".

There are some small quirks to fix, should we decide that's the way to
go. But it's basically there.

File descriptors keep track of the namespace (actually vfsmnt) where
they were opened. Today, if you pass a directory file descriptor from
one process to another, you're granting access to see the other's
namespace.

That's why /proc/NNN/root works (with small fixes) in much the way
you'd expect.

-- Jamie

2005-05-11 18:49:48

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> > > How about a new clone option "CLONE_NOSUID"?
> >
> > IMO, the clone call ist the wrong place to create namespaces. It should be
> > deprecated by a mkdir/chdir-like interface.
>
> And the mkdir/chdir interface already exists, see "cd /proc/NNN/root".

That's the chdir part.

The mkdir part is clone() or unshare().

How else do you propose to create new namespaces?

Miklos

2005-05-11 19:07:44

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Quoting Miklos Szeredi ([email protected]):
> > > > How about a new clone option "CLONE_NOSUID"?
> > >
> > > IMO, the clone call ist the wrong place to create namespaces. It should be
> > > deprecated by a mkdir/chdir-like interface.
> >
> > And the mkdir/chdir interface already exists, see "cd /proc/NNN/root".
>
> That's the chdir part.
>
> The mkdir part is clone() or unshare().

Right, sys_unshare(), as per Janak's patch. Does it lack anything which
you would need?

> How else do you propose to create new namespaces?

thanks,
-serge

2005-05-11 19:32:33

by Bodo Eggert

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Wed, 11 May 2005, Jamie Lokier wrote:
> Bodo Eggert <[email protected]> wrote:

> > > How about a new clone option "CLONE_NOSUID"?
> >
> > IMO, the clone call ist the wrong place to create namespaces. It should be
> > deprecated by a mkdir/chdir-like interface.
>
> And the mkdir/chdir interface already exists, see "cd /proc/NNN/root".

If you want persistent namespaces, this will be a PITA (I don't want a
keep-the-namespace-open-daemon), and if you don't, it will be racy
(user a logs in, while his second/nth login expires).

Keeping a list of named namespaces in kernel can be made cheap and secure.
--
Friendly fire isn't.

2005-05-11 19:36:38

by Ram Pai

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Wed, 2005-05-11 at 11:49, Miklos Szeredi wrote:
> > > > How about a new clone option "CLONE_NOSUID"?
> > >
> > > IMO, the clone call ist the wrong place to create namespaces. It should be
> > > deprecated by a mkdir/chdir-like interface.
> >
> > And the mkdir/chdir interface already exists, see "cd /proc/NNN/root".
>
> That's the chdir part.

What if proc filesystem is removed from the kernel?

Ability to access some other namespace through the proc filesystem does
not look clean. I think it should be cleanly supported through VFS.

Also cd'ing into a new namespace just allows you to browse through
the other namespace. But it does not effectively change the process's
namespace. Things like mount in the other namespace will be failed
by check_mount() anyway.

I think, we need sys calls like sys_cdnamespace() which switches to a
new namespace.

Effectively the process's current->namespace has to be modified,
for the process to be effectively work in the new namespace.


>
> The mkdir part is clone() or unshare().

clone/unshare will give you the ability to share/unshare a know
namespace. But to share some arbitrary namespace to which a process
has access rights to.

> How else do you propose to create new namespaces?
>

RP


> Miklos
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2005-05-11 19:46:13

by Bodo Eggert

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Wed, 11 May 2005 [email protected] wrote:
> Quoting Miklos Szeredi ([email protected]):

> > > > > How about a new clone option "CLONE_NOSUID"?
> > > >
> > > > IMO, the clone call ist the wrong place to create namespaces. It should be
> > > > deprecated by a mkdir/chdir-like interface.
> > >
> > > And the mkdir/chdir interface already exists, see "cd /proc/NNN/root".
> >
> > That's the chdir part.
> >
> > The mkdir part is clone() or unshare().
>
> Right, sys_unshare(), as per Janak's patch. Does it lack anything which
> you would need?

e.g.

create_persistent_namespace(char* name)
destroy_persistent_namespace(char* name)
list_persistent_namespaces(char * prefix, uint flags)
# flags & 1: 1=list all namespaces minus prefix
# 0=trim names at the first slash after stripping prefix

--
Funny quotes:
27. If people from Poland are called Poles, why aren't people from Holland
called Holes?

2005-05-11 20:31:59

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> What if proc filesystem is removed from the kernel?
>
> Ability to access some other namespace through the proc filesystem does
> not look clean. I think it should be cleanly supported through VFS.
>
> Also cd'ing into a new namespace just allows you to browse through
> the other namespace. But it does not effectively change the process's
> namespace. Things like mount in the other namespace will be failed
> by check_mount() anyway.
>
> I think, we need sys calls like sys_cdnamespace() which switches to a
> new namespace.

Jamie's proposal was to make chroot() swich namespace.

> Effectively the process's current->namespace has to be modified,
> for the process to be effectively work in the new namespace.

current->namespace could be dropped altogether, and
current->current->fs->rootmnt->mnt_namespace could be used instead.

It's a nice logical extension of chroot(), without needing new
syscalls.

Miklos

2005-05-11 20:40:46

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> e.g.
>
> create_persistent_namespace(char* name)
> destroy_persistent_namespace(char* name)
> list_persistent_namespaces(char * prefix, uint flags)
> # flags & 1: 1=list all namespaces minus prefix
> # 0=trim names at the first slash after stripping prefix

Just create a namespacefs :)

Mkdir creates a new namespace, rmdir removes a namespace, chroot
switches to the namespace. No new interfaces needed.

Miklos

2005-05-11 21:10:12

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Miklos Szeredi wrote:
> > > > How about a new clone option "CLONE_NOSUID"?
> > >
> > > IMO, the clone call ist the wrong place to create namespaces. It should be
> > > deprecated by a mkdir/chdir-like interface.
> >
> > And the mkdir/chdir interface already exists, see "cd /proc/NNN/root".
>
> That's the chdir part.
>
> The mkdir part is clone() or unshare().
>
> How else do you propose to create new namespaces?

This is not a proposal - I'm not saying it's pretty - but a suggestion
that you can use today.

Use clone(), and then have the child task open "/" and pass that file
descriptor back to the parent process using a unix socket. The child
can exit and the parent can use the new namespace how it likes. Short
and sweet, and you can create as many namespaces as you like :)

That's mkdir done.

You can't do a lot with the new namespace, because of the security
restrictions on mount() on a foreign namespace. That's what I meant
about the "small fixes" - get rid of the current->namespace checks and
it'll be usable.

I don't see the purpose of current->namespace and the associated mount
restrictions at all. I asked Al Viro what it's for, but haven't seen
a reply :( IMHO current->namespace should simply be removed, because the
"current namespace" is represented just fine by
current->fs->rootmnt->mnt_namespace.

-- Jamie

2005-05-11 21:12:52

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

[email protected] wrote:
> Right, sys_unshare(), as per Janak's patch. Does it lack anything which
> you would need?

For creating new namespaces to be held by a daemon for handing out to
user processes on demand, it's no more convenient than clone().
Neither of them provide a facility to create multiple namespaces as
first-class objects, without switching to them.

-- Jamie

2005-05-11 21:24:15

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Bodo Eggert wrote:
> > > > How about a new clone option "CLONE_NOSUID"?
> > >
> > > IMO, the clone call ist the wrong place to create namespaces. It
> > > should be deprecated by a mkdir/chdir-like interface.
> >
> > And the mkdir/chdir interface already exists, see "cd /proc/NNN/root".
>
> If you want persistent namespaces, this will be a PITA (I don't want a
> keep-the-namespace-open-daemon), and if you don't, it will be racy
> (user a logs in, while his second/nth login expires).
>
> Keeping a list of named namespaces in kernel can be made cheap and secure.

Still easy.

To keep persistent named namespaces in /var/namespaces, thus:

# Just once please!
mount -t tmpfs none /var/namespaces

# Make a named namespace.
NSNAME='fred'
mkdir /var/namespaces/$NSNAME
run_in_new_namespace mount -t bind / /var/namespaces/$NSNAME

# Make a named namespace for the _original_ namespace.
mkdir /var/namespaces/initial
mount -t bind / /var/namespaces/initial

# Access the namespace.
ls /var/namespaces/fred

# Enter the namespace.
chroot /var/namespaces/fred

# Delete a named namespace.
NSNAME='fred'
umount /var/namespaces/$NSNAME
rmdir /var/namespaces/$NSNAME

Some of the above will fail due to security checks in fs/namespace.c,
where it tests against current->namespace. Without those checks,
which seem to have no purpose _other_ than preventing the above usage,
I think the above would all work.

-- Jamie

2005-05-11 21:26:19

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> This is not a proposal - I'm not saying it's pretty - but a suggestion
> that you can use today.
>
> Use clone(), and then have the child task open "/" and pass that file
> descriptor back to the parent process using a unix socket. The child
> can exit and the parent can use the new namespace how it likes. Short
> and sweet, and you can create as many namespaces as you like :)
>
> That's mkdir done.

Right.

> You can't do a lot with the new namespace, because of the security
> restrictions on mount() on a foreign namespace. That's what I meant
> about the "small fixes" - get rid of the current->namespace checks and
> it'll be usable.
>
> I don't see the purpose of current->namespace and the associated mount
> restrictions at all. I asked Al Viro what it's for, but haven't seen
> a reply :( IMHO current->namespace should simply be removed, because the
> "current namespace" is represented just fine by
> current->fs->rootmnt->mnt_namespace.

I'll look at what it would take to remove current->namespace.

Miklos

2005-05-11 21:28:54

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Ram wrote:
> What if proc filesystem is removed from the kernel?
>
> Ability to access some other namespace through the proc filesystem does
> not look clean. I think it should be cleanly supported through VFS.

You don't have to use /proc/NNN/root - that's just one convenient way
to do it.

Other ways are

run_in_namespace mount -t bind / /var/namespaces/$NAME

and

clone + open("/") + pass to parent using unix socket

which I think both work already.

> Also cd'ing into a new namespace just allows you to browse through
> the other namespace. But it does not effectively change the process's
> namespace. Things like mount in the other namespace will be failed
> by check_mount() anyway.

That's correct.

> I think, we need sys calls like sys_cdnamespace() which switches to a
> new namespace.

Can you give a reason why sys_chdir() shouldn't have that behaviour?

> Effectively the process's current->namespace has to be modified,
> for the process to be effectively work in the new namespace.

Or just remove current->namespace. It's entire purpose seems to be to
prevent namespaces from being first class objects. The idea of
"current namespace" is adequately represented by
current->fs->mnt_root->mnt_namespace IMHO.

-- Jamie

2005-05-11 21:32:52

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Miklos Szeredi wrote:
> > I don't see the purpose of current->namespace and the associated mount
> > restrictions at all. I asked Al Viro what it's for, but haven't seen
> > a reply :( IMHO current->namespace should simply be removed, because the
> > "current namespace" is represented just fine by
> > current->fs->rootmnt->mnt_namespace.
>
> I'll look at what it would take to remove current->namespace.

The security implications don't seem to be important. If you don't
want someone to access a namespace, then don't make it accessible.

But knowing Al it might exist for some subtle locking reason. Be sure
to study the locking carefully :)

-- Jamie

2005-05-11 21:34:55

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> Still easy.
>
> To keep persistent named namespaces in /var/namespaces, thus:
>
> # Just once please!
> mount -t tmpfs none /var/namespaces
>
> # Make a named namespace.
> NSNAME='fred'
> mkdir /var/namespaces/$NSNAME
> run_in_new_namespace mount -t bind / /var/namespaces/$NSNAME

That's not going to work, since the mount will only affect the new
namespace. You'd need clone(), and then in the original namespace

mount --bind /proc/CHILDPID/root /var/namespace/$NSNAME

and then child process can safely exit.

Miklos

2005-05-11 21:36:43

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Miklos Szeredi wrote:
> > # Make a named namespace.
> > NSNAME='fred'
> > mkdir /var/namespaces/$NSNAME
> > run_in_new_namespace mount -t bind / /var/namespaces/$NSNAME
>
> That's not going to work, since the mount will only affect the new
> namespace.

Ah, good point. I'm still thinking in terms of shared subtrees, where
it might work.

> You'd need clone(), and then in the original namespace
>
> mount --bind /proc/CHILDPID/root /var/namespace/$NSNAME
>
> and then child process can safely exit.

Or pass the file descriptor over a unix domain socket, so that it
doesn't need /proc.

-- Jamie

2005-05-11 22:42:54

by Ram Pai

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Wed, 2005-05-11 at 14:28, Jamie Lokier wrote:
> Ram wrote:
> > What if proc filesystem is removed from the kernel?
> >
> > Ability to access some other namespace through the proc filesystem does
> > not look clean. I think it should be cleanly supported through VFS.
>
> You don't have to use /proc/NNN/root - that's just one convenient way
> to do it.
>
> Other ways are
>
> run_in_namespace mount -t bind / /var/namespaces/$NAME
>
> and
>
> clone + open("/") + pass to parent using unix socket
>
> which I think both work already.
>
> > Also cd'ing into a new namespace just allows you to browse through
> > the other namespace. But it does not effectively change the process's
> > namespace. Things like mount in the other namespace will be failed
> > by check_mount() anyway.
>
> That's correct.
>
> > I think, we need sys calls like sys_cdnamespace() which switches to a
> > new namespace.
>
> Can you give a reason why sys_chdir() shouldn't have that behaviour?

Do you mean to say you want to change the namespace when a process
changes to a directory which belongs to that namespace?

Well it makes it totally confusing. A user would start seeing different
set of mounts suddenly as he changes directories beloning to different
namespaces. I am not sure, if changing namespace implicitly is a good
idea. Not saying its a bad idea, but seems to change my notion of
namespaces completely.

I think a process should have access to one
namespace at any given point in time, and should have the ability
to explicitly switch to a different namespace of its choice, provided
it has enough access permission to that namespace.

having the ability to access two namespaces simultaneously can allow
cross contamination. Which essentially makes namespaces as a concept
irrelevent.

RP

>
> > Effectively the process's current->namespace has to be modified,
> > for the process to be effectively work in the new namespace.
>
> Or just remove current->namespace. It's entire purpose seems to be to
> prevent namespaces from being first class objects. The idea of
> "current namespace" is adequately represented by
> current->fs->mnt_root->mnt_namespace IMHO.
>
> -- Jamie

2005-05-11 23:01:05

by Eric Van Hensbergen

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On 5/11/05, Ram <[email protected]> wrote:
> On Wed, 2005-05-11 at 14:28, Jamie Lokier wrote:
> > Ram wrote:
>
> Well it makes it totally confusing. A user would start seeing different
> set of mounts suddenly as he changes directories beloning to different
> namespaces. I am not sure, if changing namespace implicitly is a good
> idea. Not saying its a bad idea, but seems to change my notion of
> namespaces completely.
>
> I think a process should have access to one
> namespace at any given point in time, and should have the ability
> to explicitly switch to a different namespace of its choice, provided
> it has enough access permission to that namespace.
>

I agree with Ram. This whole recent flurry of activity seems to be
going down a path which will end in tears. I think Miklos' patch for
allowing user mounts and Janak's patch were both more or less the
direction I'd like to see us moving. Let's hold off on all these
freaky shared-namespace and passed-namespace semantics until we get
the basics in place and get some experience using them.

-eric

2005-05-12 01:00:02

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Ram wrote:
> > Can you give a reason why sys_chdir() shouldn't have that behaviour?
>
> Do you mean to say you want to change the namespace when a process
> changes to a directory which belongs to that namespace?
>
> Well it makes it totally confusing. A user would start seeing different
> set of mounts suddenly as he changes directories beloning to different
> namespaces. I am not sure, if changing namespace implicitly is a good
> idea. Not saying its a bad idea, but seems to change my notion of
> namespaces completely.

That happens _already_. I'm not suggesting it, it's been there in the
kernel for ages.

Let me explain how namespaces _really_ work in Linux.

For path lookup, mounts are just mappings from (dentry,vfsmnt) to
(dentry,vfsmnt). There's a unique vfsmnt for each
(filesystem,namespace) pair, by the way.

During path lookup, each path component moves from one (dentry,vfsmnt)
pair to the next. vfsmnt doesn't change from one dentry to the next
while following a path component. But then, if there's a mount whose
key is the current (dentry,vfsmnt) pair, the current pair is replaced
by the value of the mount.

Notice how namespaces aren't involved in path lookup at all.

That's nothing new: it's what Linux does already.

If that seems confusing, it's because _bind mounts_ are confusing.

Namespaces don't really exist. When you create a new namespace with
CLONE_NEWNS, all that really happens is a lot of bind mounts, to
create a copy of the current directory tree, and then chrooting into
that tree (in effect).

> having the ability to access two namespaces simultaneously can allow
> cross contamination. Which essentially makes namespaces as a concept
> irrelevent.

Cross contamination is already possible, using file descriptor passing
or ptrace().

Also, your suggested new syscall for accessing another namespace would
have exactly the same effect, wouldn't it? :)

-- Jamie

2005-05-12 01:03:40

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Eric Van Hensbergen wrote:
> > Well it makes it totally confusing. A user would start seeing different
> > set of mounts suddenly as he changes directories beloning to different
> > namespaces. I am not sure, if changing namespace implicitly is a good
> > idea. Not saying its a bad idea, but seems to change my notion of
> > namespaces completely.
> >
> > I think a process should have access to one
> > namespace at any given point in time, and should have the ability
> > to explicitly switch to a different namespace of its choice, provided
> > it has enough access permission to that namespace.
> >
>
> I agree with Ram. This whole recent flurry of activity seems to be
> going down a path which will end in tears.

Please read carefully: I've described what _current_ kernels do.

It's a poorly understood area of the kernel, and I'm attempting to
clarify it. This talk of new system calls for entering a namespace
makes no sense when you can _already_ do some things that people
haven't realised the kernel does.

-- Jamie

2005-05-12 02:19:10

by Eric Van Hensbergen

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On 5/11/05, Jamie Lokier <[email protected]> wrote:
>
> Please read carefully: I've described what _current_ kernels do.
>

I guess I misread when you wrote:

>>
>>You can't do a lot with the new namespace, because of the security
>>restrictions on mount() on a foreign namespace. That's what I meant
>>about the "small fixes" - get rid of the current->namespace checks and
>>it'll be usable.
>>
>>I don't see the purpose of current->namespace and the associated mount
>>restrictions at all. I asked Al Viro what it's for, but haven't seen
>>a reply :( IMHO current->namespace should simply be removed, because the
>>"current namespace" is represented just fine by
>>current->fs->rootmnt->mnt_namespace.
>>

That sounds an awful lot like you want to make changes to the current
support in the kernel.

> It's a poorly understood area of the kernel, and I'm attempting to
> clarify it. This talk of new system calls for entering a namespace
> makes no sense when you can _already_ do some things that people
> haven't realised the kernel does.
>

IMHO part of the reason its so poorly understood is that people aren't
using it. That's why I suggest we use some of the proposed patches
which open up name space operations to common users. There are some
security checks (like the one brought up justifying the CAP_SYS_ADMIN
permissions on CLONE_NS) that need to be added, before we start
removing others -- and I'm quite concerned that Viro hasn't weighed in
on any of these new patches, I wonder if its because this thread seems
to have gone off the deep end.

-eric

2005-05-12 03:07:24

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Quoting Jamie Lokier ([email protected]):
> [email protected] wrote:
> > Right, sys_unshare(), as per Janak's patch. Does it lack anything which
> > you would need?
>
> For creating new namespaces to be held by a daemon for handing out to
> user processes on demand, it's no more convenient than clone().

I see.

My use for namespaces is to simply customize the fs view on login, so
sys_unshare is more convenient than clone because it now allows a pam
library to switch namespaces, which was either impossible or hard to do
using clone. Making namespaces first-class objects should also work,
since the namespaces can just be set up in advance and then entered from
inside a pam library.

Still I'd agree with Eric. It'd be good to see just how much we can do
with the ability to clone a namespace outside of clone(). Going back to
the daemon handing out namespaces... why can't you just take your
earlier example of /var/namespaces/NS1, etc, where you just create a
bunch of fs trees under the /var/namespaces directory using bind mounts,
and then login or pam does

sys_unshare(CLONE_NEWNS);
chdir(/var/namespaces/NS3)
pivot_root(/var/namespaces/NS3, tmp)
umount(tmp, MNT_DETACH)
chroot .
?

No daemon needed...

thanks,
-serge

2005-05-12 03:09:45

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Quoting Jamie Lokier ([email protected]):
> Miklos Szeredi wrote:
> > > # Make a named namespace.
> > > NSNAME='fred'
> > > mkdir /var/namespaces/$NSNAME
> > > run_in_new_namespace mount -t bind / /var/namespaces/$NSNAME
> >
> > That's not going to work, since the mount will only affect the new
> > namespace.
>
> Ah, good point. I'm still thinking in terms of shared subtrees, where
> it might work.

Oh! Well, I hope it's safe to assume we'll have those "soon"!

-serge

2005-05-12 06:45:58

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Eric Van Hensbergen wrote:
> On 5/11/05, Jamie Lokier <[email protected]> wrote:
> > Please read carefully: I've described what _current_ kernels do.
> >
>
> I guess I misread when you wrote:

There are some things which you can't do with current kernels due to
the checks against current->namespace. I'm not sure how important
those checks are. And there are some things which you _can_ do, such
as passing directory file descriptors among processes which are in
different namespaces.

-- Jamie

2005-05-12 13:23:52

by Eric Van Hensbergen

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On 5/12/05, Jamie Lokier <[email protected]> wrote:
> Eric Van Hensbergen wrote:
> > On 5/11/05, Jamie Lokier <[email protected]> wrote:
> > > Please read carefully: I've described what _current_ kernels do.
> > >
> >
> > I guess I misread when you wrote:
>
> There are some things which you can't do with current kernels due to
> the checks against current->namespace. I'm not sure how important
> those checks are. And there are some things which you _can_ do, such
> as passing directory file descriptors among processes which are in
> different namespaces.
>

I'm not sure passing directory file descriptors is the right semantic
we want - but at least it provides a point of explicit control (in
much the same way as a bind). Are you sure the clone + open("/") +
pass-to-parent scenario you allows the parent to traverse the child's
private name space through that fd? That seems as bad as accessing
pid name space via the /proc file system.

In Plan 9, file descriptors are passed between name spaces, but the
only use of such a facility (described in fork(2) [plan9man]) is to
pass channels to file servers which can then be mounted in a blank
name space. exportfs(4)[plan9man] seems to provide a much nicer
semantic for this sort of name space sharing.

Let's focus on baby steps first, and to me that's:
a) get rid of holes that allow users to traverse out of a chroot jail
by using the creation of private name spaces (is anyone working on
this, did I miss a patch?)
b) make CLONE_NEWNS (and any other name space creation mechanisms such
as the proposed unshare system call) available to normal users
c) Get the unshare system call adopted as it seems to be generally useful
d) Get Miklos' unprivileged mount/umount patch adopted in mainline

These 4 things open up lots of opportunities and alternatives, if they
prove to be insufficient then we can press forward on name spaces as
first class objects, etc.

-eric


[plan9man] Plan 9 manual pages are available at
http://plan9.bell-labs.com/sys/man/index.html

2005-05-12 13:49:19

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Quoting Eric Van Hensbergen ([email protected]):
> Let's focus on baby steps first, and to me that's:
> a) get rid of holes that allow users to traverse out of a chroot jail
> by using the creation of private name spaces (is anyone working on
> this, did I miss a patch?)

I have tested doing
clone(CLONE_NEWNS);
chdir(/some_jail_dir);
pivot_root(., tmp)
umount2(tmp, MNT_DETACH)
chroot(.)

which appears to prevent escapes from chroot jails. So unless my tests
were insufficient, we don't need additional kernel support. We can just
use something like chroot_ns.c from http://www.sf.net/projects/linux-jail/.

> b) make CLONE_NEWNS (and any other name space creation mechanisms such
> as the proposed unshare system call) available to normal users
> c) Get the unshare system call adopted as it seems to be generally useful
> d) Get Miklos' unprivileged mount/umount patch adopted in mainline

and I'd say

e) Work towards the shared namespaces, which are really one of the
main reasons not to use namespaces right now.

I know this work is being done, so this isn't so much a request for the
shared namespaces, as just a reminder that this will be one of the major
pieces of functionality to consider along with the ones you listed.

thanks,
-serge

2005-05-12 15:19:00

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Eric Van Hensbergen wrote:
> I'm not sure passing directory file descriptors is the right semantic
> we want - but at least it provides a point of explicit control (in
> much the same way as a bind). Are you sure the clone + open("/") +
> pass-to-parent scenario you allows the parent to traverse the child's
> private name space through that fd?

Pretty sure.

> That seems as bad as accessing pid name space via the /proc file
> system.

Or good, depending on your point of view :)

Don't think of "namespaces". Think "set of mounts".
Then what Linux does makes more sense.

Don't forget that if you couldn't do this, you could still use ptrace
to traverse the child's namespace by tracing it.

> In Plan 9, file descriptors are passed between name spaces, but the
> only use of such a facility (described in fork(2) [plan9man]) is to
> pass channels to file servers which can then be mounted in a blank
> name space. exportfs(4)[plan9man] seems to provide a much nicer
> semantic for this sort of name space sharing.

> c) Get the unshare system call adopted as it seems to be generally useful

I'm not convinced the functionality is all that useful. It doesn't
address the need which arose in this thread, which is roughly
equivalent to per-user namespaces (the precise meaning determined by
userspace policy). So what applicatins is it useful for? Do we have
examples, or is it just a nice idea?

-- Jamie

2005-05-12 17:52:04

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Quoting Jamie Lokier ([email protected]):
> Eric Van Hensbergen wrote:
> > c) Get the unshare system call adopted as it seems to be generally useful
>
> I'm not convinced the functionality is all that useful. It doesn't
> address the need which arose in this thread, which is roughly
> equivalent to per-user namespaces (the precise meaning determined by
> userspace policy). So what applicatins is it useful for? Do we have
> examples, or is it just a nice idea?

[my last reply appears to have disappeared, apologies if two show up]

It is useful for polyinstantiated filesystems. For instance, user u1
opens two sessions, one at clearance L1:C1, one at clearance L3:C1,C2.
He starts some software which expects to open tempfiles under /tmp by a
particular name. He later (or simultaneously) opens it also under the
lower clearance. The software now fails because it can't access the
higher clearance file.

This is typically solved by creating /tmp/subdir-CLEARANCE for each
clearance which needs it. Now on login, the user gets a new namespace,
and /tmp/subidr-CLEARANCE is bind mounted over /tmp. (Traditionally,
in other operating systems, instead of bind mounting, lookups under /tmp
are just redirected to the appropriate subdir)

This is also used for other dirs, this is just one example. Note that
MAC is used to actually enforce the clearances, so this is more to
provide a nicer user experience. Note also that I haven't worked on
such a system, so I'm just telling you what I'm told :)

Unshare is useful here because we can use it from pam. I haven't yet
found how we could use clone() from inside a pam library in a meaningful
way to create a new namespace for the resulting login process, so near
as I can tell the alternative is to hack each login program (ssh, login,
etc) separately.

Unshare should also useful for apps which want to change clearance
without needing to clone. There is clearly a desire for this since the
ability to transition without exec was already added to SELinux.

thanks,
-serge

2005-05-12 18:53:23

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount


> > I'm not sure passing directory file descriptors is the right semantic
> > we want - but at least it provides a point of explicit control (in
> > much the same way as a bind). Are you sure the clone + open("/") +
> > pass-to-parent scenario you allows the parent to traverse the child's
> > private name space through that fd?
>
> Pretty sure.

Yup. Attached a little program that can be used to try this out. It
creates a new namespace in the child, does a bind mount (so the
namespaces can be differentiated), then sends the file descriptor of
"/" to the parent. The parent does fchdir(fd), then starts a shell.

So the result is that CWD is under the child namespace, while root is
under the initial namespace.

I also tried bind mounting from the child's namespace to the parent's,
and that works too. But the new mount's mnt_namespace is copied from
the old, which makes the mount un-removable. This is most likely not
intentional, IOW a bug.

Miklos

=== newns.c =========================================================
#define _GNU_SOURCE

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <signal.h>
#include <sched.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/un.h>
#include <sys/socket.h>

static int socks[2];

static int send_fd(int sock_fd, int fd)
{
int retval;
struct msghdr msg;
struct cmsghdr *p_cmsg;
struct iovec vec;
char cmsgbuf[CMSG_SPACE(sizeof(fd))];
int *p_fds;
char sendchar = 0;

msg.msg_control = cmsgbuf;
msg.msg_controllen = sizeof(cmsgbuf);
p_cmsg = CMSG_FIRSTHDR(&msg);
p_cmsg->cmsg_level = SOL_SOCKET;
p_cmsg->cmsg_type = SCM_RIGHTS;
p_cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
p_fds = (int *) CMSG_DATA(p_cmsg);
*p_fds = fd;
msg.msg_controllen = p_cmsg->cmsg_len;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_iov = &vec;
msg.msg_iovlen = 1;
msg.msg_flags = 0;
/* "To pass file descriptors or credentials you need to send/read at
* least one byte" (man 7 unix) */
vec.iov_base = &sendchar;
vec.iov_len = sizeof(sendchar);
while ((retval = sendmsg(sock_fd, &msg, 0)) == -1 && errno == EINTR);
if (retval != 1) {
perror("sending file descriptor");
return -1;
}
return 0;
}

static int receive_fd(int fd)
{
struct msghdr msg;
struct iovec iov;
char buf[1];
int rv;
int connfd = -1;
char ccmsg[CMSG_SPACE(sizeof(connfd))];
struct cmsghdr *cmsg;

iov.iov_base = buf;
iov.iov_len = 1;

msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
/* old BSD implementations should use msg_accrights instead of
* msg_control; the interface is different. */
msg.msg_control = ccmsg;
msg.msg_controllen = sizeof(ccmsg);

while(((rv = recvmsg(fd, &msg, 0)) == -1) && errno == EINTR);
if (rv == -1) {
perror("recvmsg");
return -1;
}
if(!rv) {
/* EOF */
return -1;
}

cmsg = CMSG_FIRSTHDR(&msg);
if (!cmsg->cmsg_type == SCM_RIGHTS) {
fprintf(stderr, "got control message of unknown type %d\n",
cmsg->cmsg_type);
return -1;
}
return *(int*)CMSG_DATA(cmsg);
}

int childfn(void *p)
{
int fd;

(void) p;
mkdir("/tmp/clonetest", 755);
mkdir("/tmp/clonetest/dir1", 755);
mkdir("/tmp/clonetest/dir1/subdir1", 755);
mkdir("/tmp/clonetest/mnt", 755);
system("mount --bind /tmp/clonetest/dir1 /tmp/clonetest/mnt");
fd = open("/", O_RDONLY | O_DIRECTORY);
send_fd(socks[0], fd);
sleep(1000);
return 1;
}

int main()
{
char buf[10000];
pid_t pid;
int res;
int childfd;

res = socketpair(AF_UNIX, SOCK_STREAM, 0, socks);
if (res == -1) {
perror("socketpair");
return 1;
}

pid = clone(childfn, buf+5000, CLONE_NEWNS | SIGCHLD, NULL);
if ((int) pid == -1) {
perror("clone");
exit(1);
}

childfd = receive_fd(socks[1]);
res = fchdir(childfd);
if (res == -1) {
perror("fchdir");
return 1;
}
execl("/bin/bash", "/bin/bash", NULL);

return 0;
}

2005-05-12 19:56:18

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Miklos Szeredi wrote:
> I also tried bind mounting from the child's namespace to the parent's,
> and that works too. But the new mount's mnt_namespace is copied from
> the old, which makes the mount un-removable. This is most likely not
> intentional, IOW a bug.

I agree about the bug (and it's why I think the current->namespace
checks in fs/namespace.c should be killed - the _only_ effect is to
make un-removable mounts like the above, and the checks are completely
redundant for "normal" namespace operations).

I think the best thing to do for "jails" and such is to think of a
private namespace as a collection of bind mounts in a private tree
that (normally) cannot be reached from elsewhere, not can it reach
elsewhere.

And leave it to adminstration to ensure that a tree isn't visible from
another tree if you don't want it to be for security purposes. That
basically amounts to making sure processes that shouldn't communicate
or ptrace each other can't.

With that view, the kernel's notion of "namespace" is redundant. It
doesn't add anything to the security model, and it doesn't add any
useful functionality.

It's an abstract idea which does not need to be supported by kernel
code, except for the CLONE_NEWNS operation which actually means "copy
all mounts found recursively starting from the current root) to a new
tree of mounts, and chroot to the new tree".

In other words, is ->mnt_namespace required at all, except to contain
namespace->sem (which could be changed)? I don't think it adds
anything realistic to security. The way to make secure jails is to
isolated their trees so they are unreachable. ->mnt_namespace doesn't
make any difference to that.

-- Jamie

2005-05-12 21:09:51

by Bryan Henderson

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> So if a user creates a private namespace, it should have the choice of:
>
> 1) Giving up all suid rights (i.e. all mounts are cloned and
> propagated with nosuid)
>
> 2) Not giving up suid for cloned and propagated mounts, but having
> extra limitations (suid/sgid programs cannot access unprivileged
> "synthetic" mounts)

(2) isn't realistic. There's no such thing as a suid program. Suid is a
characteristic of a _file_. There's no way to know whether a given
executing program is running with privileges that came from a suid file
getting exec'ed. Bear in mind that that exec could be pretty remote --
done by a now-dead ancestor with three more execs in between.

Many user space programs contain hacks to try to discern this information,
and they often cause me headaches and I have to fix them. The usual hacks
are euid==uid, euid==suid, and/or euid==0. It would be an order of
magnitude worse for the kernel to contain such a hack.

--
Bryan Henderson IBM Almaden Research Center
San Jose CA Filesystems

2005-05-13 01:19:54

by Ram Pai

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Thu, 2005-05-12 at 11:51, Miklos Szeredi wrote:
> > > I'm not sure passing directory file descriptors is the right semantic
> > > we want - but at least it provides a point of explicit control (in
> > > much the same way as a bind). Are you sure the clone + open("/") +
> > > pass-to-parent scenario you allows the parent to traverse the child's
> > > private name space through that fd?
> >
> > Pretty sure.
>
> Yup. Attached a little program that can be used to try this out. It
> creates a new namespace in the child, does a bind mount (so the
> namespaces can be differentiated), then sends the file descriptor of
> "/" to the parent. The parent does fchdir(fd), then starts a shell.


> So the result is that CWD is under the child namespace, while root is
> under the initial namespace.
>

r u sure, this program works? Sorry if I am saying something dumb here.
Correct me. When a file descriptor is sent from one process to other,
arn't they referring to different files in each of the processes.
fd=5 may be pointing to file 'xyz' in parent process,
where as fd=5 will be pointing to 'abc' in the child process.

This program did not work for me, and I was wondering if adding
CLONE_FILES in clone() would help. Because that would make sure
that both
the processes share the same file descriptor. It did not work too.

What am I understanding wrong?

In any case my opinion is if this program works than the hole should
be closed instead of exploting it to access different namespace. I
know Jamie is going to pounce at me. ;)

RP



> I also tried bind mounting from the child's namespace to the parent's,
> and that works too. But the new mount's mnt_namespace is copied from
> the old, which makes the mount un-removable. This is most likely not
> intentional, IOW a bug.
>
> Miklos
>
> === newns.c =========================================================
> #define _GNU_SOURCE
>
> #include <stdio.h>
> #include <unistd.h>
> #include <stdlib.h>
> #include <signal.h>
> #include <sched.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <unistd.h>
> #include <sys/un.h>
> #include <sys/socket.h>
>
> static int socks[2];
>
> static int send_fd(int sock_fd, int fd)
> {
> int retval;
> struct msghdr msg;
> struct cmsghdr *p_cmsg;
> struct iovec vec;
> char cmsgbuf[CMSG_SPACE(sizeof(fd))];
> int *p_fds;
> char sendchar = 0;
>
> msg.msg_control = cmsgbuf;
> msg.msg_controllen = sizeof(cmsgbuf);
> p_cmsg = CMSG_FIRSTHDR(&msg);
> p_cmsg->cmsg_level = SOL_SOCKET;
> p_cmsg->cmsg_type = SCM_RIGHTS;
> p_cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
> p_fds = (int *) CMSG_DATA(p_cmsg);
> *p_fds = fd;
> msg.msg_controllen = p_cmsg->cmsg_len;
> msg.msg_name = NULL;
> msg.msg_namelen = 0;
> msg.msg_iov = &vec;
> msg.msg_iovlen = 1;
> msg.msg_flags = 0;
> /* "To pass file descriptors or credentials you need to send/read at
> * least one byte" (man 7 unix) */
> vec.iov_base = &sendchar;
> vec.iov_len = sizeof(sendchar);
> while ((retval = sendmsg(sock_fd, &msg, 0)) == -1 && errno == EINTR);
> if (retval != 1) {
> perror("sending file descriptor");
> return -1;
> }
> return 0;
> }
>
> static int receive_fd(int fd)
> {
> struct msghdr msg;
> struct iovec iov;
> char buf[1];
> int rv;
> int connfd = -1;
> char ccmsg[CMSG_SPACE(sizeof(connfd))];
> struct cmsghdr *cmsg;
>
> iov.iov_base = buf;
> iov.iov_len = 1;
>
> msg.msg_name = 0;
> msg.msg_namelen = 0;
> msg.msg_iov = &iov;
> msg.msg_iovlen = 1;
> /* old BSD implementations should use msg_accrights instead of
> * msg_control; the interface is different. */
> msg.msg_control = ccmsg;
> msg.msg_controllen = sizeof(ccmsg);
>
> while(((rv = recvmsg(fd, &msg, 0)) == -1) && errno == EINTR);
> if (rv == -1) {
> perror("recvmsg");
> return -1;
> }
> if(!rv) {
> /* EOF */
> return -1;
> }
>
> cmsg = CMSG_FIRSTHDR(&msg);
> if (!cmsg->cmsg_type == SCM_RIGHTS) {
> fprintf(stderr, "got control message of unknown type %d\n",
> cmsg->cmsg_type);
> return -1;
> }
> return *(int*)CMSG_DATA(cmsg);
> }
>
> int childfn(void *p)
> {
> int fd;
>
> (void) p;
> mkdir("/tmp/clonetest", 755);
> mkdir("/tmp/clonetest/dir1", 755);
> mkdir("/tmp/clonetest/dir1/subdir1", 755);
> mkdir("/tmp/clonetest/mnt", 755);
> system("mount --bind /tmp/clonetest/dir1 /tmp/clonetest/mnt");
> fd = open("/", O_RDONLY | O_DIRECTORY);
> send_fd(socks[0], fd);
> sleep(1000);
> return 1;
> }
>
> int main()
> {
> char buf[10000];
> pid_t pid;
> int res;
> int childfd;
>
> res = socketpair(AF_UNIX, SOCK_STREAM, 0, socks);
> if (res == -1) {
> perror("socketpair");
> return 1;
> }
>
> pid = clone(childfn, buf+5000, CLONE_NEWNS | SIGCHLD, NULL);
> if ((int) pid == -1) {
> perror("clone");
> exit(1);
> }
>
> childfd = receive_fd(socks[1]);
> res = fchdir(childfd);
> if (res == -1) {
> perror("fchdir");
> return 1;
> }
> execl("/bin/bash", "/bin/bash", NULL);
>
> return 0;
> }

2005-05-13 05:47:44

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> > So if a user creates a private namespace, it should have the choice of:
> >
> > 1) Giving up all suid rights (i.e. all mounts are cloned and
> > propagated with nosuid)
> >
> > 2) Not giving up suid for cloned and propagated mounts, but having
> > extra limitations (suid/sgid programs cannot access unprivileged
> > "synthetic" mounts)
>
> (2) isn't realistic. There's no such thing as a suid program. Suid is a
> characteristic of a _file_. There's no way to know whether a given
> executing program is running with privileges that came from a suid file
> getting exec'ed. Bear in mind that that exec could be pretty remote --
> done by a now-dead ancestor with three more execs in between.
>
> Many user space programs contain hacks to try to discern this information,
> and they often cause me headaches and I have to fix them. The usual hacks
> are euid==uid, euid==suid, and/or euid==0. It would be an order of
> magnitude worse for the kernel to contain such a hack.

Guess what? It's already there. Look in ptrace_attach() in
kernel/ptrace.c

You could argue about the usefulness of ptrace. The point is, that
suid/sgid programs _can_ be discerned, and ptrace _needs_ to discern
them.

And for the same reason, user controlled filesystems also need to do
this check. See Documentation/filesystems/fuse.txt in -mm and later
discussion in this thread for more information.

The strange thing is that people argue so vehemently about this on
theoretical grounds. But practice shows, that there's in fact no
problem with it whatsoever. This check (or something similar), has
been in FUSE for more than 3 years, used by thousands of people, and
it didn't cause _any_ problems, except various sub-mount related
things, which we are trying to solve properly, by making mount
unprivileged.

Miklos

2005-05-13 06:06:44

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> r u sure, this program works?

Yes :)

> Sorry if I am saying something dumb here.
> Correct me. When a file descriptor is sent from one process to other,
> arn't they referring to different files in each of the processes.
> fd=5 may be pointing to file 'xyz' in parent process,
> where as fd=5 will be pointing to 'abc' in the child process.

See 'man 7 unix' SCM_RIGHTS.

How do you know the program did not work?

Note: the program starts a new shell, it doesn't exit! Do this in the
started shell (do not 'cd' before it):

ls tmp/clonetest/mnt
ls /tmp/clonetest/mnt

If you see different things, then the program worked.

It's not such a magic thing. You can also do something similar for
example by:

shell1> mount --bind / /tmp/mnt
shell1> cd /tmp/mnt
sheel2> umount -l /tmp/mnt

Now shell1 sees two separate "namespaces" in in CWD and /

Miklos

2005-05-13 06:43:25

by Ram Pai

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Wed, 2005-05-11 at 17:59, Jamie Lokier wrote:
> Ram wrote:
> > > Can you give a reason why sys_chdir() shouldn't have that behaviour?
> >
> > Do you mean to say you want to change the namespace when a process
> > changes to a directory which belongs to that namespace?
> >
> > Well it makes it totally confusing. A user would start seeing different
> > set of mounts suddenly as he changes directories beloning to different
> > namespaces. I am not sure, if changing namespace implicitly is a good
> > idea. Not saying its a bad idea, but seems to change my notion of
> > namespaces completely.
>
> That happens _already_. I'm not suggesting it, it's been there in the
> kernel for ages.
>
> Let me explain how namespaces _really_ work in Linux.
>
> For path lookup, mounts are just mappings from (dentry,vfsmnt) to
> (dentry,vfsmnt). There's a unique vfsmnt for each
> (filesystem,namespace) pair, by the way.
>
> During path lookup, each path component moves from one (dentry,vfsmnt)
> pair to the next. vfsmnt doesn't change from one dentry to the next
> while following a path component. But then, if there's a mount whose
> key is the current (dentry,vfsmnt) pair, the current pair is replaced
> by the value of the mount.
>
> Notice how namespaces aren't involved in path lookup at all.
>
> That's nothing new: it's what Linux does already.
>
> If that seems confusing, it's because _bind mounts_ are confusing.
>
> Namespaces don't really exist. When you create a new namespace with
> CLONE_NEWNS, all that really happens is a lot of bind mounts, to
> create a copy of the current directory tree, and then chrooting into
> that tree (in effect).
>
> > having the ability to access two namespaces simultaneously can allow
> > cross contamination. Which essentially makes namespaces as a concept
> > irrelevent.
>
> Cross contamination is already possible, using file descriptor passing
> or ptrace().

I am yet to convince myself that this is possible. Maybe possible and if
its possible we should close those holes.

>
> Also, your suggested new syscall for accessing another namespace would
> have exactly the same effect, wouldn't it? :)

No it wont. a process can contaminate namespaces if the process has
access to both the namespace simultaneously. With the proposed syscall,
the process would have access to one namespace at any given point in
time. There cannot be a instance where a process will have access
to vfsmounts belonging to two different namespaces.

So I dont' think contamination is possible. By contamination I mean
'ability for a process to bind mount from one namespace to other'


RP



>
> -- Jamie

2005-05-13 07:21:05

by Jan Hudec

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Fri, May 13, 2005 at 07:47:07 +0200, Miklos Szeredi wrote:
> > > 2) Not giving up suid for cloned and propagated mounts, but having
> > > extra limitations (suid/sgid programs cannot access unprivileged
> > > "synthetic" mounts)
> >
> > (2) isn't realistic. There's no such thing as a suid program. Suid is a
> > characteristic of a _file_. There's no way to know whether a given
> > executing program is running with privileges that came from a suid file
> > getting exec'ed. Bear in mind that that exec could be pretty remote --
> > done by a now-dead ancestor with three more execs in between.
> >
> > Many user space programs contain hacks to try to discern this information,
> > and they often cause me headaches and I have to fix them. The usual hacks
> > are euid==uid, euid==suid, and/or euid==0. It would be an order of
> > magnitude worse for the kernel to contain such a hack.
>
> Guess what? It's already there. Look in ptrace_attach() in
> kernel/ptrace.c
>
> You could argue about the usefulness of ptrace. The point is, that
> suid/sgid programs _can_ be discerned, and ptrace _needs_ to discern
> them.

I actually neither needs to, nor does. For ptrace the definition is:
If the tracee has different privilegies, than the tracer, than it
can't be traced.

For this definition, the check is not a hack. It's the only way to go.

Now this definition is really what is needed for the filesystem case
too, so I think it's not a hack either.

> And for the same reason, user controlled filesystems also need to do
> this check. See Documentation/filesystems/fuse.txt in -mm and later
> discussion in this thread for more information.

-------------------------------------------------------------------------------
Jan 'Bulb' Hudec <[email protected]>


Attachments:
(No filename) (1.77 kB)
signature.asc (189.00 B)
Digital signature
Download all attachments

2005-05-13 07:27:52

by Ram Pai

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Thu, 2005-05-12 at 18:10, Ram wrote:
> On Thu, 2005-05-12 at 11:51, Miklos Szeredi wrote:
> > > > I'm not sure passing directory file descriptors is the right semantic
> > > > we want - but at least it provides a point of explicit control (in
> > > > much the same way as a bind). Are you sure the clone + open("/") +
> > > > pass-to-parent scenario you allows the parent to traverse the child's
> > > > private name space through that fd?
> > >
> > > Pretty sure.
> >
> > Yup. Attached a little program that can be used to try this out. It
> > creates a new namespace in the child, does a bind mount (so the
> > namespaces can be differentiated), then sends the file descriptor of
> > "/" to the parent. The parent does fchdir(fd), then starts a shell.
>
>
> > So the result is that CWD is under the child namespace, while root is
> > under the initial namespace.
> >
>
> r u sure, this program works? Sorry if I am saying something dumb here.
> Correct me. When a file descriptor is sent from one process to other,
> arn't they referring to different files in each of the processes.
> fd=5 may be pointing to file 'xyz' in parent process,
> where as fd=5 will be pointing to 'abc' in the child process.
>
> This program did not work for me, and I was wondering if adding
> CLONE_FILES in clone() would help. Because that would make sure
> that both
> the processes share the same file descriptor. It did not work too.
>
> What am I understanding wrong?

Sorry it works. I was misinterpreting the results.

>
> In any case my opinion is if this program works than the hole should
> be closed instead of exploting it to access different namespace. I
> know Jamie is going to pounce at me. ;)

a patch is due to fix the problem :)

RP


2005-05-13 08:34:41

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> > You could argue about the usefulness of ptrace. The point is, that
> > suid/sgid programs _can_ be discerned, and ptrace _needs_ to discern
> > them.
>
> I actually neither needs to, nor does. For ptrace the definition is:
> If the tracee has different privilegies, than the tracer, than it
> can't be traced.

Right. I was talking about suid/sgid because with private namespaces
(unless there's a way to enter them externally) only suid/sgid
programs will have different privileges.

> For this definition, the check is not a hack. It's the only way to go.
>
> Now this definition is really what is needed for the filesystem case
> too, so I think it's not a hack either.

Fully agreed.

Miklos

2005-05-13 08:55:46

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> I think the best thing to do for "jails" and such is to think of a
> private namespace as a collection of bind mounts in a private tree
> that (normally) cannot be reached from elsewhere, not can it reach
> elsewhere.
>
> And leave it to adminstration to ensure that a tree isn't visible from
> another tree if you don't want it to be for security purposes. That
> basically amounts to making sure processes that shouldn't communicate
> or ptrace each other can't.
>
> With that view, the kernel's notion of "namespace" is redundant.

I'm not sure.

> It doesn't add anything to the security model, and it doesn't add
> any useful functionality.

Struct namespace is pretty small, so let's see:

1) count: reference count for the "tree"

if you remove this, how will you know when to "unmount" the tree?

2) root: the root of the tree

needed for the unmount, when the namespace has no more references

3) list: flat list of mounts in the tree

used by /proc/PID/mounts, but probably not strictly needed

4) sem: protects the mount list and the tree from modification

could be removed in favor of a global lock, but that would
increase contention

> In other words, is ->mnt_namespace required at all, except to contain
> namespace->sem (which could be changed)? I don't think it adds
> anything realistic to security. The way to make secure jails is to
> isolated their trees so they are unreachable. ->mnt_namespace doesn't
> make any difference to that.

Probably true. check_mnt() is there to make sure only trees protected
by namespace->sem are modified, and is not a security check. That is
why the recursive bind mount from foreign namespace is not allowed,
while the simple bind mount is.

Miklos

2005-05-13 09:00:33

by Ram Pai

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Fri, 2005-05-13 at 00:25, Ram wrote:
> On Thu, 2005-05-12 at 18:10, Ram wrote:
> > On Thu, 2005-05-12 at 11:51, Miklos Szeredi wrote:
> > > > > I'm not sure passing directory file descriptors is the right semantic
> > > > > we want - but at least it provides a point of explicit control (in
> > > > > much the same way as a bind). Are you sure the clone + open("/") +
> > > > > pass-to-parent scenario you allows the parent to traverse the child's
> > > > > private name space through that fd?
> > > >
> > > > Pretty sure.
> > >
> > > Yup. Attached a little program that can be used to try this out. It
> > > creates a new namespace in the child, does a bind mount (so the
> > > namespaces can be differentiated), then sends the file descriptor of
> > > "/" to the parent. The parent does fchdir(fd), then starts a shell.
> >
> >
> > > So the result is that CWD is under the child namespace, while root is
> > > under the initial namespace.
> > >
> >
> > r u sure, this program works? Sorry if I am saying something dumb here.
> > Correct me. When a file descriptor is sent from one process to other,
> > arn't they referring to different files in each of the processes.
> > fd=5 may be pointing to file 'xyz' in parent process,
> > where as fd=5 will be pointing to 'abc' in the child process.
> >
> > This program did not work for me, and I was wondering if adding
> > CLONE_FILES in clone() would help. Because that would make sure
> > that both
> > the processes share the same file descriptor. It did not work too.
> >
> > What am I understanding wrong?
>
> Sorry it works. I was misinterpreting the results.
>
> >
> > In any case my opinion is if this program works than the hole should
> > be closed instead of exploting it to access different namespace. I
> > know Jamie is going to pounce at me. ;)
>
> a patch is due to fix the problem :)


attached a patch that can fix the problem, if everybody agrees that its
a problem.

> RP
>


Attachments:
fix.patch (368.00 B)

2005-05-13 09:11:05

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount


> dentry = file->f_dentry;
> mnt = file->f_vfsmnt;
> inode = dentry->d_inode;
> + if(mnt->mnt_namespace != current->namespace)
> + goto out_putf;
>
> error = -ENOTDIR;
> if (!S_ISDIR(inode->i_mode))
>

Does this actually fix the problem? The open is done in the right
namespace, and mount() doesn't call open().

I think the right fix is something like this:

Index: linux/fs/namespace.c
===================================================================
--- linux.orig/fs/namespace.c 2005-05-13 11:03:50.000000000 +0200
+++ linux/fs/namespace.c 2005-05-13 11:05:06.000000000 +0200
@@ -160,7 +160,7 @@ clone_mnt(struct vfsmount *old, struct d
mnt->mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
- mnt->mnt_namespace = old->mnt_namespace;
+ mnt->mnt_namespace = current->namespace;

/* stick the duplicate mount on the same expiry list
* as the original if that was on one */

2005-05-13 16:55:05

by Ram Pai

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Fri, 2005-05-13 at 02:10, Miklos Szeredi wrote:
> > dentry = file->f_dentry;
> > mnt = file->f_vfsmnt;
> > inode = dentry->d_inode;
> > + if(mnt->mnt_namespace != current->namespace)
> > + goto out_putf;
> >
> > error = -ENOTDIR;
> > if (!S_ISDIR(inode->i_mode))
> >
>
> Does this actually fix the problem? The open is done in the right
> namespace, and mount() doesn't call open().

Right but this fix disallows fchdir into a directory belonging to
a different namespace. And hence would disallow the ability to
cross mount across namespaces.

RP

>
> I think the right fix is something like this:
>
> Index: linux/fs/namespace.c
> ===================================================================
> --- linux.orig/fs/namespace.c 2005-05-13 11:03:50.000000000 +0200
> +++ linux/fs/namespace.c 2005-05-13 11:05:06.000000000 +0200
> @@ -160,7 +160,7 @@ clone_mnt(struct vfsmount *old, struct d
> mnt->mnt_root = dget(root);
> mnt->mnt_mountpoint = mnt->mnt_root;
> mnt->mnt_parent = mnt;
> - mnt->mnt_namespace = old->mnt_namespace;
> + mnt->mnt_namespace = current->namespace;


>
> /* stick the duplicate mount on the same expiry list
> * as the original if that was on one */
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2005-05-13 17:15:26

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> > > dentry = file->f_dentry;
> > > mnt = file->f_vfsmnt;
> > > inode = dentry->d_inode;
> > > + if(mnt->mnt_namespace != current->namespace)
> > > + goto out_putf;
> > >
> > > error = -ENOTDIR;
> > > if (!S_ISDIR(inode->i_mode))
> > >
> >
> > Does this actually fix the problem? The open is done in the right
> > namespace, and mount() doesn't call open().
>
> Right but this fix disallows fchdir into a directory belonging to
> a different namespace. And hence would disallow the ability to
> cross mount across namespaces.

Ahh, sorry. I thought that check was in open(), but I see now it's in
fchdir(). Next time please use '-p' option of diff, to avoid
confusing thoughtless readers like me :)

Though your patch does fix the bug, I still think it's wrong, since
mounting from a different namespace has legitimate uses, and is not a
security problem.

Miklos

2005-05-13 18:53:57

by Alan

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> Right but this fix disallows fchdir into a directory belonging to
> a different namespace. And hence would disallow the ability to
> cross mount across namespaces.

A helper can still pass file handles for you, you've changed the
semantics a little (and I'm not clear if there is a need to) but I don't
see it serves any fundamental purpose. Historical experience is that
chroot exiting via fchdir is actually useful


2005-05-13 21:00:42

by Bryan Henderson

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

>Sorry if I am saying something dumb here.
>Correct me. When a file descriptor is sent from one process to other,

This confusion, incidentally, is brought to you by the ambiguous use of
the term "file descriptor." In classic Unix, the "file descriptor" is the
integer that open() returns. It's an improper use of the term
"descriptor," as this value does not describe but identifies. But in the
SCM_RIGHTS context, the term "file descriptor" is used to refer to a
handle for a file image. While it's true that a file descriptor goes in
one end of the pipe and a (generally different) file descriptor comes out
the other end, what's being passed is a handle for a file image (aka open
instance).

--
Bryan Henderson IBM Almaden Research Center
San Jose CA Filesystems

2005-05-13 23:13:31

by Bryan Henderson

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

>For ptrace the definition is:
> If the tracee has different privileges, than the tracer, than it
> can't be traced.
> For this definition, the check is not a hack. It's the only way to go.

I agree this is the proper goal for ptrace and this code is not a hack.
It's a bug. In Linux, uid, euid, suid, gid, egid, and sgid do not by
themselves determine privileges. And that's what ptrace checks. The main
determiner of basic privileges is the process capabilities. The euid,
etc. do also. I have frequently have on my system a process that runs
with the same uid, euid, etc. as some other process, but should not be
allowed to ptrace it because the tracee has CAP_DAC_OVERRIDE (the
privilege to access files in spite of file permissions) and the tracer
does not. (Furthermore, the tracee got that privilege courtesy of a
set-uid file, but as we seem to agree, that is not relevant here).

So as with the user space programs I mentioned (where the euid check is
indeed a hack), I have to fix ptrace too. Fortunately, it looks as simple
as comparing capability sets.

> Now this definition is really what is needed for the filesystem case
> too, so I think it's not a hack either.

Maybe I got lost in the problem we were trying to solve, then. What does
comparing the privileges of one process with those of another have to do
with this thread about making safe unprivileged mounts via namespaces? The
post to which I replied said we have to deal with set-uid programs. Aren't
we talking about the problem where someone sets a file's setuid flag on
with the assumption that when the program within runs, it will see certain
files at certain places? And the fact that if one could mount whatever he
wants, that would violate the assumption? The two ways suggested of
handling that are: 1) after the private mount, ignore all setuid flags. 2)
after the private mount, don't let a program that has gained privileges
via set-uid see the user-made names.

My point is still that (2) can't be done because you can't know that a
program has gained privileged via set-uid.

If it's really not about set-uid, but about ptrace-like privilege
borrowing, please enlighten me.

--
Bryan Henderson IBM Almaden Research Center
San Jose CA Filesystems

2005-05-14 07:05:46

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

> >For ptrace the definition is:
> > If the tracee has different privileges, than the tracer, than it
> > can't be traced.
> > For this definition, the check is not a hack. It's the only way to go.
>
> I agree this is the proper goal for ptrace and this code is not a hack.
> It's a bug. In Linux, uid, euid, suid, gid, egid, and sgid do not by
> themselves determine privileges. And that's what ptrace checks. The main
> determiner of basic privileges is the process capabilities. The euid,
> etc. do also. I have frequently have on my system a process that runs
> with the same uid, euid, etc. as some other process, but should not be
> allowed to ptrace it because the tracee has CAP_DAC_OVERRIDE (the
> privilege to access files in spite of file permissions) and the tracer
> does not.

I'm constantly getting lost in the maze of rules, on what exactly
happens on setuid() etc, but I know that setuid() resets the
capabilities as well. What's the way of changing euid and suid back
to ruid, and yet keeping some capabilities?

> So as with the user space programs I mentioned (where the euid check is
> indeed a hack), I have to fix ptrace too. Fortunately, it looks as simple
> as comparing capability sets.
>
> > Now this definition is really what is needed for the filesystem case
> > too, so I think it's not a hack either.
>
> Maybe I got lost in the problem we were trying to solve, then. What does
> comparing the privileges of one process with those of another have to do
> with this thread about making safe unprivileged mounts via namespaces?

If process A accesses a filesystem controlled by process B, then B
gets ptrace-like ability over A.

Controlling a filesystem means, that a userspace process provides the
data/metadata for the filesystem as requested by the kernel. It could
be a v9fs, FUSE, or NFS served by a userspace entity.

So what exactly can B do with A?

- Know the exact sequence and timing of file operations being
performed. This is an information leak, though without serious
consequences in most cases

- Provide arbitrary data/metadata. Extremely large files, extremly
deep directories, etc. This can lead to DoS, by forcing a more
privileged program to process/store more data, than the user could
possibly provide otherwise (because of quota, or simply filesystem
size limits).

- Determine the time it takes for each filesystem operation. This
is the most serious problem, as delaying an operation will result
in hanging of the requester, possibly causing other hanging
resources (e.g. file locks).

For more detailed analysis of the problem see
Documentation/filesystems/fuse.txt in -mm.

> The post to which I replied said we have to deal with set-uid
> programs. Aren't we talking about the problem where someone sets a
> file's setuid flag on with the assumption that when the program
> within runs, it will see certain files at certain places? And the
> fact that if one could mount whatever he wants, that would violate
> the assumption?

No. That assumption can easily be satisfied, by limiting where the
user can mount. If mounting is only allowed on files/directories that
the user has unlimited write capability, than it cannot modify files
directories that the privileged process has assumptions about.

For example, in a private namespace, it's perfectly safe to allow bind
mounts (without any restrictions about which processes can access it)
on user writable files/directories.

> The two ways suggested of handling that are: 1) after the private
> mount, ignore all setuid flags. 2) after the private mount, don't
> let a program that has gained privileges via set-uid see the
> user-made names.
>
> My point is still that (2) can't be done because you can't know that a
> program has gained privileged via set-uid.
>
> If it's really not about set-uid, but about ptrace-like privilege
> borrowing, please enlighten me.

Yes, it's about ptrace-like privilege borrowing and not about set-uid.

And as such, it probably needs the capability fix that you propose for
ptrace as well.

Thanks,
Miklos

2005-05-14 11:49:39

by Jamie Lokier

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

Bryan Henderson wrote:
> 2) after the private mount, don't let a program that has gained privileges
> via set-uid see the user-made names.
>
> My point is still that (2) can't be done because you can't know that a
> program has gained privileged via set-uid.
>
> If it's really not about set-uid, but about ptrace-like privilege
> borrowing, please enlighten me.

Note that not all setuid programs gain *capabilities*.

You appear to be talking about setuid-root, but there is also
setuid-some-other-user, where the capabilities don't change but the
priveleges switch to those of another uid.

The right thing to do in that case is tricky. For example, suppose
you have a program that's setuid to the "printer" user, which can copy
the caller's file to the printer queue directories in
/var/spool/printer. Ideally, that program should be able to read the
calling user's file, looking up the path in the calling user's
namespace (that's important, because the path is provided by the
calling user), and then write to /var/spool/printer. (*Really*
ideally /var/spool/printer wouldn't be visible in the calling user's
namespace, but that sort of design is straying far indeed from a unix
model).

-- Jamie

2005-05-16 09:34:59

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

On Wed, May 11, 2005 at 12:20:10PM +0200, Miklos Szeredi wrote:
> > Currently we do allow bind mounts over every type of file for the super
> > user. I think we should keep allowing that.
>
> Yep. I didn't change that check (first two lines of function), so it
> should work as it used to.

You're right, I misread the patch.

2005-05-16 18:39:46

by Bryan Henderson

[permalink] [raw]
Subject: Re: [RCF] [PATCH] unprivileged mount/umount

>I'm constantly getting lost in the maze of rules, on what exactly
>happens on setuid() etc, but I know that setuid() resets the
>capabilities as well. What's the way of changing euid and suid back
>to ruid, and yet keeping some capabilities?

Plus, they keep changing as we try to strike the perfect balance between
logical, flexible architecture and compatibility with other kernels.

That setuid() to nonzero removes all capabilities in addition to its
essential function is a special case to ensure that old programs that mean
to drop privileges by setting uid nonzero still do so. Because it's an
exception and not architecture, no other part of the kernel should rely on
that for correctness.

As a practical matter, a process can use a prctl(SET_KEEPCAPS) system call
to indicate that it's aware that uids and capabilities have nothing to do
with each other, and thus a setuid() by that process doesn't do the
special case.

Note that another way a process can end up with capabilities but euid
nonzero is that another process did a capset() system call on it.

--
Bryan Henderson IBM Almaden Research Center
San Jose CA Filesystems