2007-01-04 18:06:43

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH -mm 0/8] user ns: Introduction

This patchset adds a user namespace, which allows a process to
unshare its user_struct table, allowing for separate accounting
per user namespace. It appends a user namespace to vfsmounts and
fown_structs, so that uid1==uid2 checks can be extended to be
false if uid1 and uid2 are in different namespaces.

A vfsmount generally cannot be accessed by another user namespace
than that in which it was mounted. A vfsmount can be mounted
"shared-ns", in which case it can be accessed by any user namespace.
This is needed at least to bootstrap a container so it can get far
enough to create it's own private file system tree, and can be
used in conjunction with read-only bind mounts to provide shared
/usr trees, for instance. However, for more useful, more fine-grained
sharing accross user namespaces, it has been suggested that a new
filesystem specifying global userid's be used.

Patches are as follows:

1. make exit_task_namespaces extern to prevent future
compile failure.
2. add userns framework.
3. add userns pointer to vfsmount
4. hook permission to check current against vfsmount userns
5. prepare for copy_mnt and copy_tree to return -EPERM
6. implement shared mounts which can cross user namespaces
7. implement user ns checks for file sigio
8. implement user namespace unshare

Following is a test program to verify that the cross-user-namespace
mount permissions are working correctly. The attached patches pass
both this test and LTP.

thanks,
-serge

#include <unistd.h>
#include <linux/unistd.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/mount.h>
#include <sys/syscall.h>
#include <errno.h>
#include <signal.h>

#ifndef MNT_DETACH
#define MNT_DETACH 2
#endif

#ifndef MS_SHARE_NS
#define MS_SHARE_NS 1<<22
#endif

#ifndef CLONE_NEWUSER
#define CLONE_NEWUSER 0x10000000
#endif

#define PIPEREAD 0
#define PIPEWRITE 1

static inline _syscall2(int, clone, int, flags, int, foo)
int to1[2], from1[1], to2[2], from2[2];

void do_test1(void)
{
int ret;
int fd;
char buf[100];

rmdir("/mnt1");
ret = mkdir("/mnt1", 0666);
if (ret == -1) {
perror("mkdir mnt1");
_exit(1);
}
ret = mount("/", "/mnt1", "none", MS_BIND, "");
if (ret == -1) {
perror("mount mnt1");
_exit(1);
}
fd = open("/mnt1/testme", O_RDWR|O_CREAT);
if (fd == -1) {
printf("thread 1: unable to write /testme: ERROR\n");
} else {
write(fd, "a", 1);
printf("thread 1: able to write /testme: GOOD\n");
close(fd);
}
write(from1[PIPEWRITE], "mount", 6);

read(to1[PIPEREAD], buf, 2);
umount("/mnt1");
printf("thread 1: exiting.\n");
write(from1[PIPEWRITE], "done", 5);
rmdir("/mnt1");
}

void do_test2(void)
{
int ret;
char buf[100];
int fd;

rmdir("/mnt2");
ret = mkdir("/mnt2", 0666);
if (ret == -1) {
perror("mkdir mnt2");
_exit(1);
}
ret = read(to2[PIPEREAD], buf, 10);
fd = open("/testme", O_RDWR|O_CREAT);
if (fd == -1) {
printf("thread 2: unable to write /testme: ERROR\n");
} else {
write(fd, "b", 1);
printf("thread 2: able to write /testme: GOOD\n");
close(fd);
}

fd = open("/mnt1/testme", O_RDWR|O_CREAT);
if (fd == -1) {
printf("thread 2: unable to open /mnt1/testme: GOOD\n");
} else {
write(fd, "c", 1);
printf("thread 2: able to open /mnt1/testme: ERROR\n");
close(fd);
}

/* now try remounting /mnt1 to /mnt2 */
ret = mount("/mnt1", "/mnt2", "none", MS_BIND, "");
if (ret == -1) {
printf("thread2: unable to remount /mnt1 to /mnt2: GOOD\n");
} else {
perror("thread2: able to remount /mnt1 to /mnt2: BAD\n");
fd = open("/mnt2/testme", O_RDWR|O_CREAT);
if (fd == -1) {
printf("thread 2: unable to open /mnt2/testme: GOOD\n");
} else {
write(fd, "d", 1);
printf("thread 2: able to open /mnt2/testme: ERROR\n");
close(fd);
}
umount ("/mnt2");
}

write(from2[PIPEWRITE], "done", 5);
printf("thread 2: exiting\n");
rmdir("/mnt2");
}

int main(int argc, char *argv[])
{
int childpid1, childpid2;
int ret;
char buf[100];

ret = mount("/", "/mnt", "none", MS_SHARE_NS | MS_BIND, "");
if (ret == -1) {
perror("failed to mount /mnt shared_ns.\n");
_exit(1);
}

chdir("/mnt");
chroot("/mnt");

ret = pipe(to1);
if (ret == -1) {
perror("failed to create child pipe 1\n");
_exit(1);
}
ret = pipe(to2);
if (ret == -1) {
perror("failed to create child pipe 2\n");
_exit(1);
}
ret = pipe(from1);
if (ret == -1) {
perror("failed to create child pipe 1\n");
_exit(1);
}
ret = pipe(from2);
if (ret == -1) {
perror("failed to create child pipe 2\n");
_exit(1);
}

childpid1 = clone(CLONE_NEWUSER | SIGCHLD, 0);
if (childpid1 != 0) {
do_test1();
_exit(0);
}
childpid2 = clone(CLONE_NEWUSER | SIGCHLD, 0);
if (childpid2 != 0) {
do_test2();
_exit(0);
}

read(from1[PIPEREAD], buf, 5);
write(to2[PIPEWRITE], "go", 3);
read(from2[PIPEREAD], buf, 4);
write(to1[PIPEWRITE], "go", 3);
read(from1[PIPEREAD], buf, 4);
return 0;
}


2007-01-04 18:11:06

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH -mm 1/8] nsproxy: externalizes exit_task_namespaces

From: Cedric Le Goater <[email protected]>
Subject: [PATCH -mm 1/8] nsproxy: externalizes exit_task_namespaces

this is required to remove a header dependency in sched.h which breaks
next patches.

Signed-off-by: Cedric Le Goater <[email protected]>
Signed-off-by: Serge E. Hallyn <[email protected]>
---
include/linux/nsproxy.h | 15 ++++-----------
kernel/nsproxy.c | 11 +++++++++++
2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 0b9f0dc..602f3d2 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -2,7 +2,8 @@ #ifndef _LINUX_NSPROXY_H
#define _LINUX_NSPROXY_H

#include <linux/spinlock.h>
-#include <linux/sched.h>
+
+struct task_struct;

struct mnt_namespace;
struct uts_namespace;
@@ -43,14 +44,6 @@ static inline void put_nsproxy(struct ns
}
}

-static inline void exit_task_namespaces(struct task_struct *p)
-{
- struct nsproxy *ns = p->nsproxy;
- if (ns) {
- task_lock(p);
- p->nsproxy = NULL;
- task_unlock(p);
- put_nsproxy(ns);
- }
-}
+extern void exit_task_namespaces(struct task_struct *p);
+
#endif
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f5b9ee6..f11bbbf 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -36,6 +36,17 @@ void get_task_namespaces(struct task_str
}
}

+void exit_task_namespaces(struct task_struct *p)
+{
+ struct nsproxy *ns = p->nsproxy;
+ if (ns) {
+ task_lock(p);
+ p->nsproxy = NULL;
+ task_unlock(p);
+ put_nsproxy(ns);
+ }
+}
+
/*
* creates a copy of "orig" with refcount 1.
* This does not grab references to the contained namespaces,
--
1.4.1

2007-01-04 18:11:25

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH -mm 2/8] user namespace: add the framework

From: Cedric Le Goater <[email protected]>
Subject: [PATCH -mm 2/8] user namespace: add the framework

This patch adds the user namespace struct and framework

Basically, it will allow a process to unshare its user_struct table,
resetting at the same time its own user_struct and all the associated
accounting.

A new root user (uid == 0) is added to the user namespace upon
creation. Such root users have full privileges and it seems that
theses privileges should be controlled through some means (process
capabilities ?)

The unshare is not included in this patch.

Changes since [try #4]:
- Updated get_user_ns and put_user_ns to accept NULL, and
get_user_ns to return the namespace.

Changes since [try #3]:
- moved struct user_namespace to files user_namespace.{c,h}

Changes since [try #2]:
- removed struct user_namespace* argument from find_user()

Changes since [try #1]:
- removed struct user_namespace* argument from find_user()
- added a root_user per user namespace

Signed-off-by: Cedric Le Goater <[email protected]>

Signed-off-by: Serge E. Hallyn <[email protected]>
---
include/linux/init_task.h | 2 ++
include/linux/nsproxy.h | 1 +
include/linux/sched.h | 3 ++
include/linux/user_namespace.h | 53 ++++++++++++++++++++++++++++++++++++++++
init/Kconfig | 8 ++++++
kernel/Makefile | 2 +-
kernel/fork.c | 2 +-
kernel/nsproxy.c | 12 +++++++++
kernel/sys.c | 5 ++--
kernel/user.c | 18 +++++++-------
kernel/user_namespace.c | 44 +++++++++++++++++++++++++++++++++
11 files changed, 136 insertions(+), 14 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index a2d95ff..26dc24b 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -8,6 +8,7 @@ #include <linux/utsname.h>
#include <linux/lockdep.h>
#include <linux/ipc.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>

#define INIT_FDTABLE \
{ \
@@ -78,6 +79,7 @@ #define INIT_NSPROXY(nsproxy) { \
.uts_ns = &init_uts_ns, \
.mnt_ns = NULL, \
INIT_IPC_NS(ipc_ns) \
+ .user_ns = &init_user_ns, \
}

#define INIT_SIGHAND(sighand) { \
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 602f3d2..e57920e 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -29,6 +29,7 @@ struct nsproxy {
struct ipc_namespace *ipc_ns;
struct mnt_namespace *mnt_ns;
struct pid_namespace *pid_ns;
+ struct user_namespace *user_ns;
};
extern struct nsproxy init_nsproxy;

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d7e4de1..5a3f630 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -252,6 +252,7 @@ extern signed long schedule_timeout_unin
asmlinkage void schedule(void);

struct nsproxy;
+struct user_namespace;

/* Maximum number of active map areas.. This is a random (large) number */
#define DEFAULT_MAX_MAP_COUNT 65536
@@ -1285,7 +1286,7 @@ extern struct task_struct *find_task_by_
extern void __set_special_pids(pid_t session, pid_t pgrp);

/* per-UID process charging. */
-extern struct user_struct * alloc_uid(uid_t);
+extern struct user_struct * alloc_uid(struct user_namespace *, uid_t);
static inline struct user_struct *get_uid(struct user_struct *u)
{
atomic_inc(&u->__count);
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
new file mode 100644
index 0000000..4ad4c0d
--- /dev/null
+++ b/include/linux/user_namespace.h
@@ -0,0 +1,53 @@
+#ifndef _LINUX_USER_NAMESPACE_H
+#define _LINUX_USER_NAMESPACE_H
+
+#include <linux/kref.h>
+#include <linux/nsproxy.h>
+
+#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8)
+#define UIDHASH_SZ (1 << UIDHASH_BITS)
+
+struct user_namespace {
+ struct kref kref;
+ struct list_head uidhash_table[UIDHASH_SZ];
+ struct user_struct *root_user;
+};
+
+extern struct user_namespace init_user_ns;
+
+#ifdef CONFIG_USER_NS
+
+static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
+{
+ if (ns)
+ kref_get(&ns->kref);
+ return ns;
+}
+
+extern int copy_user_ns(int flags, struct task_struct *tsk);
+extern void free_user_ns(struct kref *kref);
+
+static inline void put_user_ns(struct user_namespace *ns)
+{
+ if (ns)
+ kref_put(&ns->kref, free_user_ns);
+}
+
+#else
+
+static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
+{
+ return NULL;
+}
+
+static inline int copy_user_ns(int flags, struct task_struct *tsk)
+{
+ return 0;
+}
+
+static inline void put_user_ns(struct user_namespace *ns)
+{
+}
+#endif
+
+#endif /* _LINUX_USER_H */
diff --git a/init/Kconfig b/init/Kconfig
index 5f0c51c..4172e95 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -222,6 +222,14 @@ config UTS_NS
vservers, to use uts namespaces to provide different
uts info for different servers. If unsure, say N.

+config USER_NS
+ bool "User Namespaces"
+ default n
+ help
+ Support user namespaces. This allows containers, i.e.
+ vservers, to use user namespaces to provide different
+ user info for different servers. If unsure, say N.
+
config AUDIT
bool "Auditing support"
depends on NET
diff --git a/kernel/Makefile b/kernel/Makefile
index caaad27..37de4ec 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,7 +4,7 @@ #

obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
exit.o itimer.o time.o softirq.o resource.o \
- sysctl.o capability.o ptrace.o timer.o user.o \
+ sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
diff --git a/kernel/fork.c b/kernel/fork.c
index a6ad544..deafa6e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -996,7 +996,7 @@ #endif
if (atomic_read(&p->user->processes) >=
p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
- p->user != &root_user)
+ p->user != current->nsproxy->user_ns->root_user)
goto bad_fork_free;
}

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f11bbbf..cf43e06 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -80,6 +80,8 @@ struct nsproxy *dup_namespaces(struct ns
get_ipc_ns(ns->ipc_ns);
if (ns->pid_ns)
get_pid_ns(ns->pid_ns);
+ if (ns->user_ns)
+ get_user_ns(ns->user_ns);
}

return ns;
@@ -127,10 +129,18 @@ int copy_namespaces(int flags, struct ta
if (err)
goto out_pid;

+ err = copy_user_ns(flags, tsk);
+ if (err)
+ goto out_user;
+
out:
put_nsproxy(old_ns);
return err;

+out_user:
+ if (new_ns->pid_ns)
+ put_pid_ns(new_ns->pid_ns);
+
out_pid:
if (new_ns->ipc_ns)
put_ipc_ns(new_ns->ipc_ns);
@@ -156,5 +166,7 @@ void free_nsproxy(struct nsproxy *ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns)
put_pid_ns(ns->pid_ns);
+ if (ns->user_ns)
+ put_user_ns(ns->user_ns);
kfree(ns);
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 5590255..26bdd84 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,6 +33,7 @@ #include <linux/getcpu.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
#include <linux/kprobes.h>
+#include <linux/user_namespace.h>

#include <asm/uaccess.h>
#include <asm/io.h>
@@ -1070,13 +1071,13 @@ static int set_user(uid_t new_ruid, int
{
struct user_struct *new_user;

- new_user = alloc_uid(new_ruid);
+ new_user = alloc_uid(current->nsproxy->user_ns, new_ruid);
if (!new_user)
return -EAGAIN;

if (atomic_read(&new_user->processes) >=
current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
- new_user != &root_user) {
+ new_user != current->nsproxy->user_ns->root_user) {
free_uid(new_user);
return -EAGAIN;
}
diff --git a/kernel/user.c b/kernel/user.c
index 4869563..98b8250 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -14,20 +14,19 @@ #include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/key.h>
#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/user_namespace.h>

/*
* UID task count cache, to get fast user lookup in "alloc_uid"
* when changing user ID's (ie setuid() and friends).
*/

-#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8)
-#define UIDHASH_SZ (1 << UIDHASH_BITS)
#define UIDHASH_MASK (UIDHASH_SZ - 1)
#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
-#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
+#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid)))

static struct kmem_cache *uid_cachep;
-static struct list_head uidhash_table[UIDHASH_SZ];

/*
* The uidhash_lock is mostly taken from process context, but it is
@@ -94,9 +93,10 @@ struct user_struct *find_user(uid_t uid)
{
struct user_struct *ret;
unsigned long flags;
+ struct user_namespace *ns = current->nsproxy->user_ns;

spin_lock_irqsave(&uidhash_lock, flags);
- ret = uid_hash_find(uid, uidhashentry(uid));
+ ret = uid_hash_find(uid, uidhashentry(ns, uid));
spin_unlock_irqrestore(&uidhash_lock, flags);
return ret;
}
@@ -120,9 +120,9 @@ void free_uid(struct user_struct *up)
}
}

-struct user_struct * alloc_uid(uid_t uid)
+struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
{
- struct list_head *hashent = uidhashentry(uid);
+ struct list_head *hashent = uidhashentry(ns, uid);
struct user_struct *up;

spin_lock_irq(&uidhash_lock);
@@ -211,11 +211,11 @@ static int __init uid_cache_init(void)
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);

for(n = 0; n < UIDHASH_SZ; ++n)
- INIT_LIST_HEAD(uidhash_table + n);
+ INIT_LIST_HEAD(init_user_ns.uidhash_table + n);

/* Insert the root user immediately (init already runs as root) */
spin_lock_irq(&uidhash_lock);
- uid_hash_insert(&root_user, uidhashentry(0));
+ uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0));
spin_unlock_irq(&uidhash_lock);

return 0;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
new file mode 100644
index 0000000..368f8da
--- /dev/null
+++ b/kernel/user_namespace.c
@@ -0,0 +1,44 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/nsproxy.h>
+#include <linux/user_namespace.h>
+
+struct user_namespace init_user_ns = {
+ .kref = {
+ .refcount = ATOMIC_INIT(2),
+ },
+ .root_user = &root_user,
+};
+
+EXPORT_SYMBOL_GPL(init_user_ns);
+
+#ifdef CONFIG_USER_NS
+
+int copy_user_ns(int flags, struct task_struct *tsk)
+{
+ struct user_namespace *old_ns = tsk->nsproxy->user_ns;
+ int err = 0;
+
+ if (!old_ns)
+ return 0;
+
+ get_user_ns(old_ns);
+ return err;
+}
+
+void free_user_ns(struct kref *kref)
+{
+ struct user_namespace *ns;
+
+ ns = container_of(kref, struct user_namespace, kref);
+ kfree(ns);
+}
+
+#endif /* CONFIG_USER_NS */
--
1.4.1

2007-01-04 18:11:39

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH -mm 3/8] user ns: add user_namespace ptr to vfsmount

From: Serge E. Hallyn <[email protected]>
Subject: [PATCH -mm 3/8] user ns: add user_namespace ptr to vfsmount

Add user_namespace ptr to vfsmount, and define a helper to compare it
to the task's user_ns.

Signed-off-by: Serge E. Hallyn <[email protected]>
---
fs/namespace.c | 3 +++
include/linux/mount.h | 2 ++
include/linux/sched.h | 20 ++++++++++++++++++++
3 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index fd999ca..5da87e2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -25,6 +25,7 @@ #include <linux/namei.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include "pnode.h"
@@ -55,6 +56,7 @@ struct vfsmount *alloc_vfsmnt(const char
{
struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
if (mnt) {
+ mnt->mnt_user_ns = get_user_ns(current->nsproxy->user_ns);
atomic_set(&mnt->mnt_count, 1);
INIT_LIST_HEAD(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
@@ -87,6 +89,7 @@ EXPORT_SYMBOL(simple_set_mnt);

void free_vfsmnt(struct vfsmount *mnt)
{
+ put_user_ns(mnt->mnt_user_ns);
kfree(mnt->mnt_devname);
kmem_cache_free(mnt_cache, mnt);
}
diff --git a/include/linux/mount.h b/include/linux/mount.h
index dab69af..e438195 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -21,6 +21,7 @@ struct super_block;
struct vfsmount;
struct dentry;
struct mnt_namespace;
+struct user_namespace;

#define MNT_NOSUID 0x01
#define MNT_NODEV 0x02
@@ -53,6 +54,7 @@ struct vfsmount {
struct list_head mnt_slave; /* slave list entry */
struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */
struct mnt_namespace *mnt_ns; /* containing namespace */
+ struct user_namespace *mnt_user_ns; /* namespace for uid interpretation */
/*
* We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
* to let these frequently modified fields in a separate cache line
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a3f630..450fc39 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -83,6 +83,8 @@ #include <linux/resource.h>
#include <linux/timer.h>
#include <linux/hrtimer.h>
#include <linux/task_io_accounting.h>
+#include <linux/nsproxy.h>
+#include <linux/mount.h>

#include <asm/processor.h>

@@ -1586,6 +1588,24 @@ extern int cond_resched_lock(spinlock_t
extern int cond_resched_softirq(void);

/*
+ * Check whether a task and a vfsmnt belong to the same uidns.
+ * Since the initial namespace is exempt from these checks,
+ * return 1 if so. Also return 1 if the vfsmnt is exempt from
+ * such checking. Otherwise, if the uid namespaces are different,
+ * return 0.
+ */
+static inline int task_mnt_same_uidns(struct task_struct *tsk,
+ struct vfsmount *mnt)
+{
+ if (tsk->nsproxy == init_task.nsproxy)
+ return 1;
+ if (mnt->mnt_user_ns == tsk->nsproxy->user_ns)
+ return 1;
+ return 0;
+}
+
+
+/*
* Does a critical section need to be broken due to another
* task waiting?:
*/
--
1.4.1

2007-01-04 18:12:05

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH -mm 4/8] user ns: hook permission

From: Serge E. Hallyn <[email protected]>
Subject: [PATCH -mm 4/8] user ns: hook permission

Hook permission to check vfsmnt->user_ns against current.

Signed-off-by: Serge E. Hallyn <[email protected]>
---
fs/namei.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index e4f108f..d6687af 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -246,6 +246,8 @@ int permission(struct inode *inode, int
return -EACCES;
}

+ if (nd && !task_mnt_same_uidns(current, nd->mnt))
+ return -EACCES;

/*
* MAY_EXEC on regular files requires special handling: We override
@@ -433,6 +435,8 @@ static int exec_permission_lite(struct i
{
umode_t mode = inode->i_mode;

+ if (!task_mnt_same_uidns(current, nd->mnt))
+ return -EACCES;
if (inode->i_op && inode->i_op->permission)
return -EAGAIN;

--
1.4.1

2007-01-04 18:12:25

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH -mm 5/8] user ns: prepare copy_tree, copy_mnt, and their callers to handle errs

From: Serge E. Hallyn <[email protected]>
Subject: [PATCH -mm 5/8] user ns: prepare copy_tree, copy_mnt, and their callers to handle errs

With shareduserns and non-shareduserns mounts, it will be possible
for clone_mnt to return -EPERM if a namespace tries to bind
mount a non-shareduserns vfsmnt from another user namespace.
But currently they only return NULL, which is interpreted as
-ENOMEM. Update the callers to handle other errors.

Signed-off-by: Serge E. Hallyn <[email protected]>
---
fs/namespace.c | 20 ++++++++++++--------
fs/pnode.c | 5 +++--
2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 5da87e2..a4039a3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -708,8 +708,9 @@ struct vfsmount *copy_tree(struct vfsmou
return NULL;

res = q = clone_mnt(mnt, dentry, flag);
- if (!q)
- goto Enomem;
+ if (!q || IS_ERR(q)) {
+ return q;
+ }
q->mnt_mountpoint = mnt->mnt_mountpoint;

p = mnt;
@@ -730,8 +731,9 @@ struct vfsmount *copy_tree(struct vfsmou
nd.mnt = q;
nd.dentry = p->mnt_mountpoint;
q = clone_mnt(p, p->mnt_root, flag);
- if (!q)
- goto Enomem;
+ if (!q || IS_ERR(q)) {
+ goto Error;
+ }
spin_lock(&vfsmount_lock);
list_add_tail(&q->mnt_list, &res->mnt_list);
attach_mnt(q, &nd);
@@ -739,7 +741,7 @@ struct vfsmount *copy_tree(struct vfsmou
}
}
return res;
-Enomem:
+Error:
if (res) {
LIST_HEAD(umount_list);
spin_lock(&vfsmount_lock);
@@ -747,7 +749,7 @@ Enomem:
spin_unlock(&vfsmount_lock);
release_mounts(&umount_list);
}
- return NULL;
+ return q;
}

/*
@@ -927,8 +929,10 @@ static int do_loopback(struct nameidata
else
mnt = clone_mnt(old_nd.mnt, old_nd.dentry, 0);

- if (!mnt)
+ if (!mnt || IS_ERR(mnt)) {
+ err = mnt ? PTR_ERR(mnt) : -ENOMEM;
goto out;
+ }

err = graft_tree(mnt, nd);
if (err) {
@@ -1465,7 +1469,7 @@ struct mnt_namespace *dup_mnt_ns(struct
/* First pass: copy the tree topology */
new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root,
CL_COPY_ALL | CL_EXPIRE);
- if (!new_ns->root) {
+ if (!new_ns->root || IS_ERR(new_ns->root)) {
up_write(&namespace_sem);
kfree(new_ns);
return NULL;
diff --git a/fs/pnode.c b/fs/pnode.c
index 56aacea..1821c95 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -187,8 +187,9 @@ int propagate_mnt(struct vfsmount *dest_

source = get_source(m, prev_dest_mnt, prev_src_mnt, &type);

- if (!(child = copy_tree(source, source->mnt_root, type))) {
- ret = -ENOMEM;
+ child = copy_tree(source, source->mnt_root, type);
+ if (!child || IS_ERR(child)) {
+ ret = child ? PTR_ERR(child) : -ENOMEM;
list_splice(tree_list, tmp_list.prev);
goto out;
}
--
1.4.1

2007-01-04 18:12:47

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH -mm 6/8] user ns: implement shared mounts

From: Serge E. Hallyn <[email protected]>
Subject: [PATCH -mm 6/8] user ns: implement shared mounts

Implement shared-ns mounts, which allow containers in different user
namespaces to share mounts. Without this, containers can obviously
never even be started.

Here is a sample smount.c (based on Miklos' version) which only
does a bind mount of arg1 onto arg2, but making the destination
a shared-ns mount.

int main(int argc, char *argv[])
{
int type;
if(argc != 3) {
fprintf(stderr, "usage: %s src dest", argv[0]);
return 1;
}

fprintf(stdout, "%s %s %s\n", argv[0], argv[1], argv[2]);

type = MS_SHARE_NS | MS_BIND;
setfsuid(getuid());

if(mount(argv[1], argv[2], "none", type, "") == -1) {
perror("mount");
return 1;
}
return 0;
}

Signed-off-by: Serge E. Hallyn <[email protected]>
---
fs/namespace.c | 30 ++++++++++++++++++++++++------
fs/pnode.h | 1 +
include/linux/fs.h | 1 +
include/linux/mount.h | 1 +
include/linux/sched.h | 2 ++
5 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index a4039a3..60ca9b5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -234,7 +234,14 @@ static struct vfsmount *clone_mnt(struct
int flag)
{
struct super_block *sb = old->mnt_sb;
- struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
+ struct vfsmount *mnt;
+
+ if (!(old->mnt_flags & MNT_SHARE_NS)) {
+ if (old->mnt_user_ns != current->nsproxy->user_ns)
+ return ERR_PTR(-EPERM);
+ }
+
+ mnt = alloc_vfsmnt(old->mnt_devname);

if (mnt) {
mnt->mnt_flags = old->mnt_flags;
@@ -257,6 +264,10 @@ static struct vfsmount *clone_mnt(struct
}
if (flag & CL_MAKE_SHARED)
set_mnt_shared(mnt);
+ if (flag & CL_SHARE_NS)
+ mnt->mnt_flags |= MNT_SHARE_NS;
+ else
+ mnt->mnt_flags &= ~MNT_SHARE_NS;

/* stick the duplicate mount on the same expiry list
* as the original if that was on one */
@@ -368,6 +379,7 @@ static int show_vfsmnt(struct seq_file *
{ MNT_NOSUID, ",nosuid" },
{ MNT_NODEV, ",nodev" },
{ MNT_NOEXEC, ",noexec" },
+ { MNT_SHARE_NS, ",share_userns" },
{ MNT_NOATIME, ",noatime" },
{ MNT_NODIRATIME, ",nodiratime" },
{ MNT_RELATIME, ",relatime" },
@@ -902,11 +914,14 @@ static int do_change_type(struct nameida
/*
* do loopback mount.
*/
-static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
+static int do_loopback(struct nameidata *nd, char *old_name, int recurse,
+ int uidns_share)
{
struct nameidata old_nd;
struct vfsmount *mnt = NULL;
int err = mount_is_safe(nd);
+ int flag = (uidns_share ? CL_SHARE_NS : 0);
+
if (err)
return err;
if (!old_name || !*old_name)
@@ -925,9 +940,9 @@ static int do_loopback(struct nameidata

err = -ENOMEM;
if (recurse)
- mnt = copy_tree(old_nd.mnt, old_nd.dentry, 0);
+ mnt = copy_tree(old_nd.mnt, old_nd.dentry, flag);
else
- mnt = clone_mnt(old_nd.mnt, old_nd.dentry, 0);
+ mnt = clone_mnt(old_nd.mnt, old_nd.dentry, flag);

if (!mnt || IS_ERR(mnt)) {
err = mnt ? PTR_ERR(mnt) : -ENOMEM;
@@ -1414,9 +1429,11 @@ long do_mount(char *dev_name, char *dir_
mnt_flags |= MNT_NODIRATIME;
if (flags & MS_RELATIME)
mnt_flags |= MNT_RELATIME;
+ if (flags & MS_SHARE_NS)
+ mnt_flags |= MNT_SHARE_NS;

flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
- MS_NOATIME | MS_NODIRATIME | MS_RELATIME);
+ MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_SHARE_NS);

/* ... and get the mountpoint */
retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
@@ -1431,7 +1448,8 @@ long do_mount(char *dev_name, char *dir_
retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
- retval = do_loopback(&nd, dev_name, flags & MS_REC);
+ retval = do_loopback(&nd, dev_name, flags & MS_REC,
+ mnt_flags & MNT_SHARE_NS);
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&nd, flags);
else if (flags & MS_MOVE)
diff --git a/fs/pnode.h b/fs/pnode.h
index d45bd8e..eb62f4c 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -22,6 +22,7 @@ #define CL_SLAVE 0x02
#define CL_COPY_ALL 0x04
#define CL_MAKE_SHARED 0x08
#define CL_PROPAGATION 0x10
+#define CL_SHARE_NS 0x20

static inline void set_mnt_shared(struct vfsmount *mnt)
{
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bed19b7..569a637 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -121,6 +121,7 @@ #define MS_PRIVATE (1<<18) /* change to
#define MS_SLAVE (1<<19) /* change to slave */
#define MS_SHARED (1<<20) /* change to shared */
#define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */
+#define MS_SHARE_NS (1<<22) /* ignore user namespaces for permission */
#define MS_ACTIVE (1<<30)
#define MS_NOUSER (1<<31)

diff --git a/include/linux/mount.h b/include/linux/mount.h
index e438195..00e5066 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -35,6 +35,7 @@ #define MNT_SHRINKABLE 0x100
#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
#define MNT_PNODE_MASK 0x3000 /* propogation flag mask */
+#define MNT_SHARE_NS 0x4000 /* ignore user namespaces for permission */

struct vfsmount {
struct list_head mnt_hash;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 450fc39..73df38c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1599,6 +1599,8 @@ static inline int task_mnt_same_uidns(st
{
if (tsk->nsproxy == init_task.nsproxy)
return 1;
+ if (mnt->mnt_flags & MNT_SHARE_NS)
+ return 1;
if (mnt->mnt_user_ns == tsk->nsproxy->user_ns)
return 1;
return 0;
--
1.4.1

2007-01-04 18:13:09

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH -mm 7/8] user_ns: handle file sigio

From: Serge E. Hallyn <[email protected]>
Subject: [PATCH -mm 7/8] user_ns: handle file sigio

A process in one user namespace could set a fowner and sigio on a file in a
shared vfsmount, ending up killing a task in another user namespace.

Prevent this by adding a user namespace pointer to the fown_struct, and
enforcing that a process causing a signal to be sent be in the same
user namespace as the file owner.

Signed-off-by: Serge E. Hallyn <[email protected]>
---
fs/fcntl.c | 14 +++++++++++---
fs/file_table.c | 2 ++
include/linux/fs.h | 1 +
3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 8e382a5..6a774c1 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -18,6 +18,7 @@ #include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/rcupdate.h>
+#include <linux/user_namespace.h>

#include <asm/poll.h>
#include <asm/siginfo.h>
@@ -250,15 +251,18 @@ static int setfl(int fd, struct file * f
}

static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
- uid_t uid, uid_t euid, int force)
+ uid_t uid, uid_t euid, struct user_namespace *user_ns,
+ int force)
{
write_lock_irq(&filp->f_owner.lock);
if (force || !filp->f_owner.pid) {
put_pid(filp->f_owner.pid);
+ put_user_ns(filp->f_owner.user_ns);
filp->f_owner.pid = get_pid(pid);
filp->f_owner.pid_type = type;
filp->f_owner.uid = uid;
filp->f_owner.euid = euid;
+ filp->f_owner.user_ns = get_user_ns(user_ns);
}
write_unlock_irq(&filp->f_owner.lock);
}
@@ -272,7 +276,8 @@ int __f_setown(struct file *filp, struct
if (err)
return err;

- f_modown(filp, pid, type, current->uid, current->euid, force);
+ f_modown(filp, pid, type, current->uid, current->euid,
+ current->nsproxy->user_ns, force);
return 0;
}
EXPORT_SYMBOL(__f_setown);
@@ -298,7 +303,7 @@ EXPORT_SYMBOL(f_setown);

void f_delown(struct file *filp)
{
- f_modown(filp, NULL, PIDTYPE_PID, 0, 0, 1);
+ f_modown(filp, NULL, PIDTYPE_PID, 0, 0, NULL, 1);
}

pid_t f_getown(struct file *filp)
@@ -455,6 +460,9 @@ static const long band_table[NSIGPOLL] =
static inline int sigio_perm(struct task_struct *p,
struct fown_struct *fown, int sig)
{
+ if (fown->user_ns != init_task.nsproxy->user_ns &&
+ fown->user_ns != p->nsproxy->user_ns)
+ return 0;
return (((fown->euid == 0) ||
(fown->euid == p->suid) || (fown->euid == p->uid) ||
(fown->uid == p->suid) || (fown->uid == p->uid)) &&
diff --git a/fs/file_table.c b/fs/file_table.c
index 4c17a18..6e6632c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -21,6 +21,7 @@ #include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
+#include <linux/user_namespace.h>

#include <asm/atomic.h>

@@ -175,6 +176,7 @@ void fastcall __fput(struct file *file)
if (file->f_mode & FMODE_WRITE)
put_write_access(inode);
put_pid(file->f_owner.pid);
+ put_user_ns(file->f_owner.user_ns);
file_kill(file);
file->f_path.dentry = NULL;
file->f_path.mnt = NULL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 569a637..4a79909 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -685,6 +685,7 @@ struct fown_struct {
struct pid *pid; /* pid or -pgrp where SIGIO should be sent */
enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */
uid_t uid, euid; /* uid/euid of process setting the owner */
+ struct user_namespace *user_ns; /* namespace to which uid belongs */
int signum; /* posix.1b rt signal to be delivered on IO */
};

--
1.4.1

2007-01-04 18:13:18

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH -mm 8/8] user ns: implement user ns unshare

From: Serge E. Hallyn <[email protected]>
Subject: [PATCH -mm 8/8] user ns: implement user ns unshare

Implement CLONE_NEWUSER flag useable at clone/unshare.

Signed-off-by: Serge E. Hallyn <[email protected]>
---
include/linux/sched.h | 1 +
include/linux/user_namespace.h | 10 +++++
kernel/fork.c | 22 ++++++++++--
kernel/nsproxy.c | 2 +
kernel/user_namespace.c | 74 +++++++++++++++++++++++++++++++++++++++-
5 files changed, 102 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 73df38c..55ecf81 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -26,6 +26,7 @@ #define CLONE_CHILD_SETTID 0x01000000 /*
#define CLONE_STOPPED 0x02000000 /* Start in stopped state */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
+#define CLONE_NEWUSER 0x10000000 /* New user namespace */

/*
* Scheduling policies
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 4ad4c0d..d577ede 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -25,6 +25,7 @@ static inline struct user_namespace *get
}

extern int copy_user_ns(int flags, struct task_struct *tsk);
+extern int unshare_user_ns(unsigned long flags, struct user_namespace **new_user);
extern void free_user_ns(struct kref *kref);

static inline void put_user_ns(struct user_namespace *ns)
@@ -40,6 +41,15 @@ static inline struct user_namespace *get
return NULL;
}

+static inline int unshare_user_ns(unsigned long flags,
+ struct user_namespace **new_user)
+{
+ if (flags & CLONE_NEWUSER)
+ return -EINVAL;
+
+ return 0;
+}
+
static inline int copy_user_ns(int flags, struct task_struct *tsk)
{
return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index deafa6e..eead517 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@ #include <linux/cn_proc.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
+#include <linux/user_namespace.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -1620,6 +1621,7 @@ asmlinkage long sys_unshare(unsigned lon
struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL;
struct uts_namespace *uts, *new_uts = NULL;
struct ipc_namespace *ipc, *new_ipc = NULL;
+ struct user_namespace *user, *new_user = NULL;

check_unshare_flags(&unshare_flags);

@@ -1627,7 +1629,7 @@ asmlinkage long sys_unshare(unsigned lon
err = -EINVAL;
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC))
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER))
goto bad_unshare_out;

if ((err = unshare_thread(unshare_flags)))
@@ -1648,18 +1650,20 @@ asmlinkage long sys_unshare(unsigned lon
goto bad_unshare_cleanup_semundo;
if ((err = unshare_ipcs(unshare_flags, &new_ipc)))
goto bad_unshare_cleanup_uts;
+ if ((err = unshare_user_ns(unshare_flags, &new_user)))
+ goto bad_unshare_cleanup_ipc;

- if (new_ns || new_uts || new_ipc) {
+ if (new_ns || new_uts || new_ipc || new_user) {
old_nsproxy = current->nsproxy;
new_nsproxy = dup_namespaces(old_nsproxy);
if (!new_nsproxy) {
err = -ENOMEM;
- goto bad_unshare_cleanup_ipc;
+ goto bad_unshare_cleanup_user;
}
}

if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
- new_uts || new_ipc) {
+ new_uts || new_ipc || new_user) {

task_lock(current);

@@ -1707,12 +1711,22 @@ asmlinkage long sys_unshare(unsigned lon
new_ipc = ipc;
}

+ if (new_user) {
+ user = current->nsproxy->user_ns;
+ current->nsproxy->user_ns = new_user;
+ new_user = user;
+ }
+
task_unlock(current);
}

if (new_nsproxy)
put_nsproxy(new_nsproxy);

+bad_unshare_cleanup_user:
+ if (new_user)
+ put_user_ns(new_user);
+
bad_unshare_cleanup_ipc:
if (new_ipc)
put_ipc_ns(new_ipc);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index cf43e06..2d9bafb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -102,7 +102,7 @@ int copy_namespaces(int flags, struct ta

get_nsproxy(old_ns);

- if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
+ if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER)))
return 0;

new_ns = clone_namespaces(old_ns);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 368f8da..d1ce4c1 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -20,17 +20,87 @@ struct user_namespace init_user_ns = {
EXPORT_SYMBOL_GPL(init_user_ns);

#ifdef CONFIG_USER_NS
+/*
+ * Clone a new ns copying an original user ns, setting refcount to 1
+ * @old_ns: namespace to clone
+ * Return NULL on error (failure to kmalloc), new ns otherwise
+ */
+static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
+{
+ struct user_namespace *ns;
+ struct user_struct *new_user;
+ int n;
+
+ ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
+ if (!ns)
+ return NULL;
+
+ kref_init(&ns->kref);
+
+ for(n = 0; n < UIDHASH_SZ; ++n)
+ INIT_LIST_HEAD(ns->uidhash_table + n);
+
+ /* Insert new root user. */
+ ns->root_user = alloc_uid(ns, 0);
+ if (!ns->root_user) {
+ kfree(ns);
+ return NULL;
+ }
+
+ /* Reset current->user with a new one */
+ new_user = alloc_uid(ns, current->uid);
+ if (!new_user) {
+ free_uid(ns->root_user);
+ kfree(ns);
+ return NULL;
+ }
+
+ switch_uid(new_user);
+ return ns;
+}
+
+/*
+ * unshare the current process' user namespace.
+ */
+int unshare_user_ns(unsigned long flags,
+ struct user_namespace **new_user)
+{
+ if (flags & CLONE_NEWUSER) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ *new_user = clone_user_ns(current->nsproxy->user_ns);
+ if (!*new_user)
+ return -ENOMEM;
+ }
+
+ return 0;
+}

int copy_user_ns(int flags, struct task_struct *tsk)
{
- struct user_namespace *old_ns = tsk->nsproxy->user_ns;
+ struct user_namespace *new_ns, *old_ns = tsk->nsproxy->user_ns;
int err = 0;

if (!old_ns)
return 0;

get_user_ns(old_ns);
- return err;
+ if (!(flags & CLONE_NEWUSER))
+ return 0;
+ err = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+ err = -ENOMEM;
+ new_ns = clone_user_ns(old_ns);
+ if (!new_ns)
+ goto out;
+
+ tsk->nsproxy->user_ns = new_ns;
+ err = 0;
+out:
+ put_user_ns(old_ns);
+ return 0;
}

void free_user_ns(struct kref *kref)
--
1.4.1

2007-01-04 19:02:15

by Frederik Deweerdt

[permalink] [raw]
Subject: Re: [PATCH -mm 5/8] user ns: prepare copy_tree, copy_mnt, and their callers to handle errs

On Thu, Jan 04, 2007 at 12:12:15PM -0600, Serge E. Hallyn wrote:
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 5da87e2..a4039a3 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -708,8 +708,9 @@ struct vfsmount *copy_tree(struct vfsmou
> return NULL;
>
> res = q = clone_mnt(mnt, dentry, flag);
> - if (!q)
> - goto Enomem;
> + if (!q || IS_ERR(q)) {
> + return q;
> + }
> q->mnt_mountpoint = mnt->mnt_mountpoint;
>
> p = mnt;
> @@ -730,8 +731,9 @@ struct vfsmount *copy_tree(struct vfsmou
> nd.mnt = q;
> nd.dentry = p->mnt_mountpoint;
> q = clone_mnt(p, p->mnt_root, flag);
> - if (!q)
> - goto Enomem;
> + if (!q || IS_ERR(q)) {
> + goto Error;
> + }
> spin_lock(&vfsmount_lock);
> list_add_tail(&q->mnt_list, &res->mnt_list);
> attach_mnt(q, &nd);
> @@ -739,7 +741,7 @@ struct vfsmount *copy_tree(struct vfsmou
> }
> }
> return res;
> -Enomem:
> +Error:
> if (res) {
^^^^^^^^^^
I think that this check can be safely skiped, as res is always non-NULL
when Error: is now reached, isn't it?

Regards,
Frederik

2007-01-04 19:08:59

by Frederik Deweerdt

[permalink] [raw]
Subject: Re: [PATCH -mm 8/8] user ns: implement user ns unshare

On Thu, Jan 04, 2007 at 12:13:10PM -0600, Serge E. Hallyn wrote:
> From: Serge E. Hallyn <[email protected]>
> Subject: [PATCH -mm 8/8] user ns: implement user ns unshare
>
> Implement CLONE_NEWUSER flag useable at clone/unshare.
>
> Signed-off-by: Serge E. Hallyn <[email protected]>
> ---

> int copy_user_ns(int flags, struct task_struct *tsk)
> {
> - struct user_namespace *old_ns = tsk->nsproxy->user_ns;
> + struct user_namespace *new_ns, *old_ns = tsk->nsproxy->user_ns;
> int err = 0;
^^^^^^^^^^^^
The "= 0" is superfluous here.
>
> if (!old_ns)
> return 0;
>
> get_user_ns(old_ns);
> - return err;
> + if (!(flags & CLONE_NEWUSER))
> + return 0;
> + err = -EPERM;
> + if (!capable(CAP_SYS_ADMIN))
> + goto out;
> + err = -ENOMEM;
> + new_ns = clone_user_ns(old_ns);
> + if (!new_ns)
> + goto out;
> +
> + tsk->nsproxy->user_ns = new_ns;
> + err = 0;
> +out:
> + put_user_ns(old_ns);
> + return 0;
^^^^^^^^^
Should be "return err;"

Regards,
Frederik
> }
>
> void free_user_ns(struct kref *kref)
> --
> 1.4.1
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2007-01-04 19:36:01

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH -mm 5/8] user ns: prepare copy_tree, copy_mnt, and their callers to handle errs

Quoting Frederik Deweerdt ([email protected]):
> On Thu, Jan 04, 2007 at 12:12:15PM -0600, Serge E. Hallyn wrote:
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index 5da87e2..a4039a3 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -708,8 +708,9 @@ struct vfsmount *copy_tree(struct vfsmou
> > return NULL;
> >
> > res = q = clone_mnt(mnt, dentry, flag);
> > - if (!q)
> > - goto Enomem;
> > + if (!q || IS_ERR(q)) {
> > + return q;
> > + }
> > q->mnt_mountpoint = mnt->mnt_mountpoint;
> >
> > p = mnt;
> > @@ -730,8 +731,9 @@ struct vfsmount *copy_tree(struct vfsmou
> > nd.mnt = q;
> > nd.dentry = p->mnt_mountpoint;
> > q = clone_mnt(p, p->mnt_root, flag);
> > - if (!q)
> > - goto Enomem;
> > + if (!q || IS_ERR(q)) {
> > + goto Error;
> > + }
> > spin_lock(&vfsmount_lock);
> > list_add_tail(&q->mnt_list, &res->mnt_list);
> > attach_mnt(q, &nd);
> > @@ -739,7 +741,7 @@ struct vfsmount *copy_tree(struct vfsmou
> > }
> > }
> > return res;
> > -Enomem:
> > +Error:
> > if (res) {
> ^^^^^^^^^^
> I think that this check can be safely skiped, as res is always non-NULL
> when Error: is now reached, isn't it?

Good point. In addition, we can clean the code up a little by not
letting clone_mnt return NULL. Compile-tested patch follows. Though
really it starts to look like clone_mnt should be split into two
parts, with the original clone_mnt static inline, and the new clone_mnt
doing the permission check and return value conversion...

From: Serge E. Hallyn <[email protected]>
Subject: [PATCH 1/1] userns: cleanups with clone_mnt

As Frederik Deweerdt points out, the Error: case in copy_tree()
can no longer have res==NULL, so remove that check.

Also make clone_mnt always return -ENOMEM in place of NULL.

Signed-off-by: Serge E. Hallyn <[email protected]>
---
fs/namespace.c | 90 ++++++++++++++++++++++++++++----------------------------
1 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 60ca9b5..2ed89d4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -243,41 +243,43 @@ static struct vfsmount *clone_mnt(struct

mnt = alloc_vfsmnt(old->mnt_devname);

- if (mnt) {
- mnt->mnt_flags = old->mnt_flags;
- atomic_inc(&sb->s_active);
- mnt->mnt_sb = sb;
- mnt->mnt_root = dget(root);
- mnt->mnt_mountpoint = mnt->mnt_root;
- mnt->mnt_parent = mnt;
-
- if (flag & CL_SLAVE) {
- list_add(&mnt->mnt_slave, &old->mnt_slave_list);
- mnt->mnt_master = old;
- CLEAR_MNT_SHARED(mnt);
- } else {
- if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
- list_add(&mnt->mnt_share, &old->mnt_share);
- if (IS_MNT_SLAVE(old))
- list_add(&mnt->mnt_slave, &old->mnt_slave);
- mnt->mnt_master = old->mnt_master;
- }
- if (flag & CL_MAKE_SHARED)
- set_mnt_shared(mnt);
- if (flag & CL_SHARE_NS)
- mnt->mnt_flags |= MNT_SHARE_NS;
- else
- mnt->mnt_flags &= ~MNT_SHARE_NS;
-
- /* stick the duplicate mount on the same expiry list
- * as the original if that was on one */
- if (flag & CL_EXPIRE) {
- spin_lock(&vfsmount_lock);
- if (!list_empty(&old->mnt_expire))
- list_add(&mnt->mnt_expire, &old->mnt_expire);
- spin_unlock(&vfsmount_lock);
- }
+ if (!mnt)
+ return ERR_PTR(-ENOMEM);
+
+ mnt->mnt_flags = old->mnt_flags;
+ atomic_inc(&sb->s_active);
+ mnt->mnt_sb = sb;
+ mnt->mnt_root = dget(root);
+ mnt->mnt_mountpoint = mnt->mnt_root;
+ mnt->mnt_parent = mnt;
+
+ if (flag & CL_SLAVE) {
+ list_add(&mnt->mnt_slave, &old->mnt_slave_list);
+ mnt->mnt_master = old;
+ CLEAR_MNT_SHARED(mnt);
+ } else {
+ if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
+ list_add(&mnt->mnt_share, &old->mnt_share);
+ if (IS_MNT_SLAVE(old))
+ list_add(&mnt->mnt_slave, &old->mnt_slave);
+ mnt->mnt_master = old->mnt_master;
}
+ if (flag & CL_MAKE_SHARED)
+ set_mnt_shared(mnt);
+ if (flag & CL_SHARE_NS)
+ mnt->mnt_flags |= MNT_SHARE_NS;
+ else
+ mnt->mnt_flags &= ~MNT_SHARE_NS;
+
+ /* stick the duplicate mount on the same expiry list
+ * as the original if that was on one */
+ if (flag & CL_EXPIRE) {
+ spin_lock(&vfsmount_lock);
+ if (!list_empty(&old->mnt_expire))
+ list_add(&mnt->mnt_expire, &old->mnt_expire);
+ spin_unlock(&vfsmount_lock);
+ }
+
return mnt;
}

@@ -715,14 +717,15 @@ struct vfsmount *copy_tree(struct vfsmou
{
struct vfsmount *res, *p, *q, *r, *s;
struct nameidata nd;
+ LIST_HEAD(umount_list);

if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
return NULL;

res = q = clone_mnt(mnt, dentry, flag);
- if (!q || IS_ERR(q)) {
+ if (IS_ERR(q))
return q;
- }
+
q->mnt_mountpoint = mnt->mnt_mountpoint;

p = mnt;
@@ -743,9 +746,8 @@ struct vfsmount *copy_tree(struct vfsmou
nd.mnt = q;
nd.dentry = p->mnt_mountpoint;
q = clone_mnt(p, p->mnt_root, flag);
- if (!q || IS_ERR(q)) {
+ if (IS_ERR(q))
goto Error;
- }
spin_lock(&vfsmount_lock);
list_add_tail(&q->mnt_list, &res->mnt_list);
attach_mnt(q, &nd);
@@ -754,13 +756,11 @@ struct vfsmount *copy_tree(struct vfsmou
}
return res;
Error:
- if (res) {
- LIST_HEAD(umount_list);
- spin_lock(&vfsmount_lock);
- umount_tree(res, 0, &umount_list);
- spin_unlock(&vfsmount_lock);
- release_mounts(&umount_list);
- }
+ spin_lock(&vfsmount_lock);
+ umount_tree(res, 0, &umount_list);
+ spin_unlock(&vfsmount_lock);
+ release_mounts(&umount_list);
+
return q;
}

--
1.4.1

2007-01-04 19:43:57

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH -mm 8/8] user ns: implement user ns unshare

Quoting Frederik Deweerdt ([email protected]):
> On Thu, Jan 04, 2007 at 12:13:10PM -0600, Serge E. Hallyn wrote:
> > From: Serge E. Hallyn <[email protected]>
> > Subject: [PATCH -mm 8/8] user ns: implement user ns unshare
> >
> > Implement CLONE_NEWUSER flag useable at clone/unshare.
> >
> > Signed-off-by: Serge E. Hallyn <[email protected]>
> > ---
>
> > int copy_user_ns(int flags, struct task_struct *tsk)
> > {
> > - struct user_namespace *old_ns = tsk->nsproxy->user_ns;
> > + struct user_namespace *new_ns, *old_ns = tsk->nsproxy->user_ns;
> > int err = 0;
> ^^^^^^^^^^^^
> The "= 0" is superfluous here.

Ah, since I set it anyway, good point.

> >
> > if (!old_ns)
> > return 0;
> >
> > get_user_ns(old_ns);
> > - return err;
> > + if (!(flags & CLONE_NEWUSER))
> > + return 0;
> > + err = -EPERM;
> > + if (!capable(CAP_SYS_ADMIN))
> > + goto out;
> > + err = -ENOMEM;
> > + new_ns = clone_user_ns(old_ns);
> > + if (!new_ns)
> > + goto out;
> > +
> > + tsk->nsproxy->user_ns = new_ns;
> > + err = 0;
> > +out:
> > + put_user_ns(old_ns);
> > + return 0;
> ^^^^^^^^^
> Should be "return err;"

Yes it should.

New patch attached.

(I suppose the testcase should check for the CAP_SYS_ADMIN
error case...)

Thanks for the close review!

-serge

From: Serge E. Hallyn <[email protected]>
Subject: [PATCH 8/8] user ns: implement user ns unshare

Implement CLONE_NEWUSER flag useable at clone/unshare.

Changes:
Jan 4: return the actual error value in copy_user_ns().

Signed-off-by: Serge E. Hallyn <[email protected]>
---
include/linux/sched.h | 1 +
include/linux/user_namespace.h | 10 +++++
kernel/fork.c | 22 ++++++++++--
kernel/nsproxy.c | 2 +
kernel/user_namespace.c | 74 +++++++++++++++++++++++++++++++++++++++-
5 files changed, 102 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 73df38c..55ecf81 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -26,6 +26,7 @@ #define CLONE_CHILD_SETTID 0x01000000 /*
#define CLONE_STOPPED 0x02000000 /* Start in stopped state */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
+#define CLONE_NEWUSER 0x10000000 /* New user namespace */

/*
* Scheduling policies
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 4ad4c0d..d577ede 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -25,6 +25,7 @@ static inline struct user_namespace *get
}

extern int copy_user_ns(int flags, struct task_struct *tsk);
+extern int unshare_user_ns(unsigned long flags, struct user_namespace **new_user);
extern void free_user_ns(struct kref *kref);

static inline void put_user_ns(struct user_namespace *ns)
@@ -40,6 +41,15 @@ static inline struct user_namespace *get
return NULL;
}

+static inline int unshare_user_ns(unsigned long flags,
+ struct user_namespace **new_user)
+{
+ if (flags & CLONE_NEWUSER)
+ return -EINVAL;
+
+ return 0;
+}
+
static inline int copy_user_ns(int flags, struct task_struct *tsk)
{
return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index deafa6e..eead517 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@ #include <linux/cn_proc.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
+#include <linux/user_namespace.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -1620,6 +1621,7 @@ asmlinkage long sys_unshare(unsigned lon
struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL;
struct uts_namespace *uts, *new_uts = NULL;
struct ipc_namespace *ipc, *new_ipc = NULL;
+ struct user_namespace *user, *new_user = NULL;

check_unshare_flags(&unshare_flags);

@@ -1627,7 +1629,7 @@ asmlinkage long sys_unshare(unsigned lon
err = -EINVAL;
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC))
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER))
goto bad_unshare_out;

if ((err = unshare_thread(unshare_flags)))
@@ -1648,18 +1650,20 @@ asmlinkage long sys_unshare(unsigned lon
goto bad_unshare_cleanup_semundo;
if ((err = unshare_ipcs(unshare_flags, &new_ipc)))
goto bad_unshare_cleanup_uts;
+ if ((err = unshare_user_ns(unshare_flags, &new_user)))
+ goto bad_unshare_cleanup_ipc;

- if (new_ns || new_uts || new_ipc) {
+ if (new_ns || new_uts || new_ipc || new_user) {
old_nsproxy = current->nsproxy;
new_nsproxy = dup_namespaces(old_nsproxy);
if (!new_nsproxy) {
err = -ENOMEM;
- goto bad_unshare_cleanup_ipc;
+ goto bad_unshare_cleanup_user;
}
}

if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
- new_uts || new_ipc) {
+ new_uts || new_ipc || new_user) {

task_lock(current);

@@ -1707,12 +1711,22 @@ asmlinkage long sys_unshare(unsigned lon
new_ipc = ipc;
}

+ if (new_user) {
+ user = current->nsproxy->user_ns;
+ current->nsproxy->user_ns = new_user;
+ new_user = user;
+ }
+
task_unlock(current);
}

if (new_nsproxy)
put_nsproxy(new_nsproxy);

+bad_unshare_cleanup_user:
+ if (new_user)
+ put_user_ns(new_user);
+
bad_unshare_cleanup_ipc:
if (new_ipc)
put_ipc_ns(new_ipc);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index cf43e06..2d9bafb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -102,7 +102,7 @@ int copy_namespaces(int flags, struct ta

get_nsproxy(old_ns);

- if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
+ if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER)))
return 0;

new_ns = clone_namespaces(old_ns);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 368f8da..33c95ca 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -20,16 +20,86 @@ struct user_namespace init_user_ns = {
EXPORT_SYMBOL_GPL(init_user_ns);

#ifdef CONFIG_USER_NS
+/*
+ * Clone a new ns copying an original user ns, setting refcount to 1
+ * @old_ns: namespace to clone
+ * Return NULL on error (failure to kmalloc), new ns otherwise
+ */
+static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
+{
+ struct user_namespace *ns;
+ struct user_struct *new_user;
+ int n;
+
+ ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
+ if (!ns)
+ return NULL;
+
+ kref_init(&ns->kref);
+
+ for(n = 0; n < UIDHASH_SZ; ++n)
+ INIT_LIST_HEAD(ns->uidhash_table + n);
+
+ /* Insert new root user. */
+ ns->root_user = alloc_uid(ns, 0);
+ if (!ns->root_user) {
+ kfree(ns);
+ return NULL;
+ }
+
+ /* Reset current->user with a new one */
+ new_user = alloc_uid(ns, current->uid);
+ if (!new_user) {
+ free_uid(ns->root_user);
+ kfree(ns);
+ return NULL;
+ }
+
+ switch_uid(new_user);
+ return ns;
+}
+
+/*
+ * unshare the current process' user namespace.
+ */
+int unshare_user_ns(unsigned long flags,
+ struct user_namespace **new_user)
+{
+ if (flags & CLONE_NEWUSER) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ *new_user = clone_user_ns(current->nsproxy->user_ns);
+ if (!*new_user)
+ return -ENOMEM;
+ }
+
+ return 0;
+}

int copy_user_ns(int flags, struct task_struct *tsk)
{
- struct user_namespace *old_ns = tsk->nsproxy->user_ns;
- int err = 0;
+ struct user_namespace *new_ns, *old_ns = tsk->nsproxy->user_ns;
+ int err;

if (!old_ns)
return 0;

get_user_ns(old_ns);
+ if (!(flags & CLONE_NEWUSER))
+ return 0;
+ err = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+ err = -ENOMEM;
+ new_ns = clone_user_ns(old_ns);
+ if (!new_ns)
+ goto out;
+
+ tsk->nsproxy->user_ns = new_ns;
+ err = 0;
+out:
+ put_user_ns(old_ns);
return err;
}

--
1.4.1

2007-01-04 21:16:15

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH -mm 2/8] user namespace: add the framework

From: Serge E. Hallyn <[email protected]>
Subject: [PATCH 1/1] user ns: fix compilation failures for some arches

for i386, an include of utsname.h by asm/elf.h hides the fact
that user_namespace needs sched.h. To fix compilation for
x86_64, just include sched.h explicitly.

Signed-off-by: Serge E. Hallyn <[email protected]>
---
include/linux/user_namespace.h | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index d577ede..be3e484 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -3,6 +3,7 @@ #define _LINUX_USER_NAMESPACE_H

#include <linux/kref.h>
#include <linux/nsproxy.h>
+#include <linux/sched.h>

#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8)
#define UIDHASH_SZ (1 << UIDHASH_BITS)
--
1.4.1

2007-01-04 22:07:19

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH -mm 8/8] user ns: implement user ns unshare

On Thu, 4 Jan 2007 14:03:16 -0800
Andrew Morton <[email protected]> wrote:

> I was just about to commit this lot then I discovered vast amounts of
> eight-spaces-where-there-should-be-tabs. Returned to sender for complete
> repair, please.

actually, it was just the one function, so I fixed it up.

2007-01-04 22:13:44

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH -mm 8/8] user ns: implement user ns unshare

On Thu, 4 Jan 2007 13:43:51 -0600
"Serge E. Hallyn" <[email protected]> wrote:

I was just about to commit this lot then I discovered vast amounts of
eight-spaces-where-there-should-be-tabs. Returned to sender for complete
repair, please.

>
> +bad_unshare_cleanup_user:
> + if (new_user)
> + put_user_ns(new_user);

put_user_ns(NULL) is legal.

> + for(n = 0; n < UIDHASH_SZ; ++n)
^ space here


2007-01-04 22:24:05

by Valdis Klētnieks

[permalink] [raw]
Subject: Re: [PATCH -mm 8/8] user ns: implement user ns unshare

On Thu, 04 Jan 2007 19:07:00 GMT, Frederik Deweerdt said:
> On Thu, Jan 04, 2007 at 12:13:10PM -0600, Serge E. Hallyn wrote:
> > From: Serge E. Hallyn <[email protected]>
> > Subject: [PATCH -mm 8/8] user ns: implement user ns unshare
> >
> > Implement CLONE_NEWUSER flag useable at clone/unshare.
> >
> > Signed-off-by: Serge E. Hallyn <[email protected]>
> > ---
>
> > int copy_user_ns(int flags, struct task_struct *tsk)
> > {
> > - struct user_namespace *old_ns = tsk->nsproxy->user_ns;
> > + struct user_namespace *new_ns, *old_ns = tsk->nsproxy->user_ns;
> > int err = 0;
> ^^^^^^^^^^^^
> The "= 0" is superfluous here.

Umm? bss gets cleared automagically, but when did we start auto-zeroing
the stack?


Attachments:
(No filename) (226.00 B)

2007-01-04 22:53:06

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH -mm 8/8] user ns: implement user ns unshare

Quoting [email protected] ([email protected]):
> On Thu, 04 Jan 2007 19:07:00 GMT, Frederik Deweerdt said:
> > On Thu, Jan 04, 2007 at 12:13:10PM -0600, Serge E. Hallyn wrote:
> > > From: Serge E. Hallyn <[email protected]>
> > > Subject: [PATCH -mm 8/8] user ns: implement user ns unshare
> > >
> > > Implement CLONE_NEWUSER flag useable at clone/unshare.
> > >
> > > Signed-off-by: Serge E. Hallyn <[email protected]>
> > > ---
> >
> > > int copy_user_ns(int flags, struct task_struct *tsk)
> > > {
> > > - struct user_namespace *old_ns = tsk->nsproxy->user_ns;
> > > + struct user_namespace *new_ns, *old_ns = tsk->nsproxy->user_ns;
> > > int err = 0;
> > ^^^^^^^^^^^^
> > The "= 0" is superfluous here.
>
> Umm? bss gets cleared automagically, but when did we start auto-zeroing
> the stack?

No, no, that's what i thought he meant at first too, but I actually
manually set err on all paths anyway :)

-serge

2007-01-05 02:03:05

by Valdis Klētnieks

[permalink] [raw]
Subject: Re: [PATCH -mm 8/8] user ns: implement user ns unshare

On Thu, 04 Jan 2007 16:52:53 CST, "Serge E. Hallyn" said:
> Quoting [email protected] ([email protected]):
> > On Thu, 04 Jan 2007 19:07:00 GMT, Frederik Deweerdt said:
> > > > int err = 0;
> > > ^^^^^^^^^^^^
> > > The "= 0" is superfluous here.
> >
> > Umm? bss gets cleared automagically, but when did we start auto-zeroing
> > the stack?
>
> No, no, that's what i thought he meant at first too, but I actually
> manually set err on all paths anyway :)

Oh. So it's *really* just "superfluous until somebody changes the code"...



Attachments:
(No filename) (226.00 B)

2007-01-05 04:04:04

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH -mm 0/8] user ns: Introduction

On Thu, 4 Jan 2007 12:06:35 -0600
"Serge E. Hallyn" <[email protected]> wrote:

> This patchset adds a user namespace, which allows a process to
> unshare its user_struct table, allowing for separate accounting
> per user namespace.

With these patches applied and with CONFIG_USER_NS=n, my selinux-enabled
standard FC5 machine throws a complete fit:

[ 12.323958] EDAC MC: Ver: 2.0.1 Jan 4 2007
[ 12.357476] TCP cubic registered
[ 12.360784] NET: Registered protocol family 1
[ 12.364125] NET: Registered protocol family 17
[ 12.367761] speedstep-centrino with X86_SPEEDSTEP_CENTRINO_ACPI config is deprecated.
[ 12.367763] Use X86_ACPI_CPUFREQ (acpi-cpufreq) instead.
[ 12.374666] Using IPI Shortcut mode
[ 12.378222] Time: tsc clocksource has been installed.
[ 12.381987] Time: acpi_pm clocksource has been installed.
[ 12.386522] ACPI: (supports S0 S3 S4 S5)
[ 6.344000] Freeing unused kernel memory: 184k freed
[ 6.560000] input: PS/2 Mouse as /class/input/input1
[ 6.580000] input: AlpsPS/2 ALPS GlidePoint as /class/input/input2
[ 6.760000] Uniform Multi-Platform E-IDE driver Revision: 7.00alpha2
[ 6.764000] ide: Assuming 33MHz system bus speed for PIO modes; override with idebus=xx
[ 6.824000] EXT3-fs: INFO: recovery required on readonly filesystem.
[ 6.824000] EXT3-fs: write access will be enabled during recovery.
[ 10.832000] kjournald starting. Commit interval 5 seconds
[ 10.836000] EXT3-fs: recovery complete.
[ 10.840000] EXT3-fs: mounted filesystem with ordered data mode.
[ 11.852000] audit(1167940353.844:2): enforcing=1 old_enforcing=0 auid=4294967295
[ 11.948000] security: 3 users, 6 roles, 1417 types, 151 bools, 1 sens, 256 cats
[ 11.952000] security: 57 classes, 41080 rules
[ 11.956000] security: class key not defined in policy
[ 11.956000] security: class context not defined in policy
[ 11.960000] security: class dccp_socket not defined in policy
[ 11.964000] security: permission dccp_recv in class node not defined in policy
[ 11.964000] security: permission dccp_send in class node not defined in policy
[ 11.968000] security: permission dccp_recv in class netif not defined in policy
[ 11.972000] security: permission dccp_send in class netif not defined in policy
[ 11.972000] security: permission setkeycreate in class process not defined in policy
[ 11.976000] security: permission setsockcreate in class process not defined in policy
[ 11.980000] security: permission polmatch in class association not defined in policy
[ 11.980000] SELinux: Completing initialization.
[ 11.984000] SELinux: Setting up existing superblocks.
[ 12.004000] SELinux: initialized (dev sda6, type ext3), uses xattr
[ 12.204000] SELinux: initialized (dev tmpfs, type tmpfs), uses transition SIDs
[ 12.208000] SELinux: initialized (dev debugfs, type debugfs), uses genfs_contexts
[ 12.208000] SELinux: initialized (dev selinuxfs, type selinuxfs), uses genfs_contexts
[ 12.212000] SELinux: initialized (dev mqueue, type mqueue), uses transition SIDs
[ 12.216000] SELinux: initialized (dev hugetlbfs, type hugetlbfs), uses genfs_contexts
[ 12.216000] SELinux: initialized (dev devpts, type devpts), uses transition SIDs
[ 12.220000] SELinux: initialized (dev eventpollfs, type eventpollfs), uses genfs_contexts
[ 12.224000] SELinux: initialized (dev inotifyfs, type inotifyfs), uses genfs_contexts
[ 12.224000] SELinux: initialized (dev tmpfs, type tmpfs), uses transition SIDs
[ 12.228000] SELinux: initialized (dev futexfs, type futexfs), uses genfs_contexts
[ 12.232000] SELinux: initialized (dev pipefs, type pipefs), uses task SIDs
[ 12.232000] SELinux: initialized (dev sockfs, type sockfs), uses task SIDs
[ 12.236000] SELinux: initialized (dev proc, type proc), uses genfs_contexts
[ 12.240000] SELinux: initialized (dev bdev, type bdev), uses genfs_contexts
[ 12.240000] SELinux: initialized (dev rootfs, type rootfs), uses genfs_contexts
[ 12.244000] SELinux: initialized (dev sysfs, type sysfs), uses genfs_contexts
[ 12.260000] audit(1167940354.256:3): policy loaded auid=4294967295
[ 12.944000] SELinux: initialized (dev usbfs, type usbfs), uses genfs_contexts
[ 15.376000] audit(1167969158.994:4): avc: denied { audit_write } for pid=386 comm="hwclock" capability=29 scontext=system_u:system_r:hwclock_t:s0 tcontext=system_u:system_r:hwclock_t:s0 tclass=capability
[ 33.936000] audit(1167969177.567:2292): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
[ 33.940000] audit(1167969177.579:2293): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
[ 33.952000] audit(1167969177.591:2294): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
[ 33.956000] audit(1167969177.607:2295): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
[ 33.960000] audit(1167969177.615:2296): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
[ 33.964000] audit(1167969177.627:2297): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
[ 33.968000] audit(1167969177.639:2298): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
[ 33.972000] audit(1167969177.651:2299): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
[ 33.976000] audit(1167969177.667:2300): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
[ 33.980000] audit(1167969177.679:2301): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
[ 33.984000] audit(1167969177.695:2302): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
<ad infinitum>


Setting CONFIG_USER_NS=y fixes this.

2007-01-05 04:35:24

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH -mm 8/8] user ns: implement user ns unshare

Quoting [email protected] ([email protected]):
> On Thu, 04 Jan 2007 16:52:53 CST, "Serge E. Hallyn" said:
> > Quoting [email protected] ([email protected]):
> > > On Thu, 04 Jan 2007 19:07:00 GMT, Frederik Deweerdt said:
> > > > > int err = 0;
> > > > ^^^^^^^^^^^^
> > > > The "= 0" is superfluous here.
> > >
> > > Umm? bss gets cleared automagically, but when did we start auto-zeroing
> > > the stack?
> >
> > No, no, that's what i thought he meant at first too, but I actually
> > manually set err on all paths anyway :)
>
> Oh. So it's *really* just "superfluous until somebody changes the code"...

True.

-serge

2007-01-05 06:08:34

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH -mm 0/8] user ns: Introduction

Quoting Andrew Morton ([email protected]):
> On Thu, 4 Jan 2007 12:06:35 -0600
> "Serge E. Hallyn" <[email protected]> wrote:
>
> > This patchset adds a user namespace, which allows a process to
> > unshare its user_struct table, allowing for separate accounting
> > per user namespace.
>
> With these patches applied and with CONFIG_USER_NS=n, my selinux-enabled
> standard FC5 machine throws a complete fit:
>
> [ 12.323958] EDAC MC: Ver: 2.0.1 Jan 4 2007
> [ 12.357476] TCP cubic registered
> [ 12.360784] NET: Registered protocol family 1
> [ 12.364125] NET: Registered protocol family 17
> [ 12.367761] speedstep-centrino with X86_SPEEDSTEP_CENTRINO_ACPI config is deprecated.
> [ 12.367763] Use X86_ACPI_CPUFREQ (acpi-cpufreq) instead.
> [ 12.374666] Using IPI Shortcut mode
> [ 12.378222] Time: tsc clocksource has been installed.
> [ 12.381987] Time: acpi_pm clocksource has been installed.
> [ 12.386522] ACPI: (supports S0 S3 S4 S5)
> [ 6.344000] Freeing unused kernel memory: 184k freed
> [ 6.560000] input: PS/2 Mouse as /class/input/input1
> [ 6.580000] input: AlpsPS/2 ALPS GlidePoint as /class/input/input2
> [ 6.760000] Uniform Multi-Platform E-IDE driver Revision: 7.00alpha2
> [ 6.764000] ide: Assuming 33MHz system bus speed for PIO modes; override with idebus=xx
> [ 6.824000] EXT3-fs: INFO: recovery required on readonly filesystem.
> [ 6.824000] EXT3-fs: write access will be enabled during recovery.
> [ 10.832000] kjournald starting. Commit interval 5 seconds
> [ 10.836000] EXT3-fs: recovery complete.
> [ 10.840000] EXT3-fs: mounted filesystem with ordered data mode.
> [ 11.852000] audit(1167940353.844:2): enforcing=1 old_enforcing=0 auid=4294967295
> [ 11.948000] security: 3 users, 6 roles, 1417 types, 151 bools, 1 sens, 256 cats
> [ 11.952000] security: 57 classes, 41080 rules
> [ 11.956000] security: class key not defined in policy
> [ 11.956000] security: class context not defined in policy
> [ 11.960000] security: class dccp_socket not defined in policy
> [ 11.964000] security: permission dccp_recv in class node not defined in policy
> [ 11.964000] security: permission dccp_send in class node not defined in policy
> [ 11.968000] security: permission dccp_recv in class netif not defined in policy
> [ 11.972000] security: permission dccp_send in class netif not defined in policy
> [ 11.972000] security: permission setkeycreate in class process not defined in policy
> [ 11.976000] security: permission setsockcreate in class process not defined in policy
> [ 11.980000] security: permission polmatch in class association not defined in policy
> [ 11.980000] SELinux: Completing initialization.
> [ 11.984000] SELinux: Setting up existing superblocks.
> [ 12.004000] SELinux: initialized (dev sda6, type ext3), uses xattr
> [ 12.204000] SELinux: initialized (dev tmpfs, type tmpfs), uses transition SIDs
> [ 12.208000] SELinux: initialized (dev debugfs, type debugfs), uses genfs_contexts
> [ 12.208000] SELinux: initialized (dev selinuxfs, type selinuxfs), uses genfs_contexts
> [ 12.212000] SELinux: initialized (dev mqueue, type mqueue), uses transition SIDs
> [ 12.216000] SELinux: initialized (dev hugetlbfs, type hugetlbfs), uses genfs_contexts
> [ 12.216000] SELinux: initialized (dev devpts, type devpts), uses transition SIDs
> [ 12.220000] SELinux: initialized (dev eventpollfs, type eventpollfs), uses genfs_contexts
> [ 12.224000] SELinux: initialized (dev inotifyfs, type inotifyfs), uses genfs_contexts
> [ 12.224000] SELinux: initialized (dev tmpfs, type tmpfs), uses transition SIDs
> [ 12.228000] SELinux: initialized (dev futexfs, type futexfs), uses genfs_contexts
> [ 12.232000] SELinux: initialized (dev pipefs, type pipefs), uses task SIDs
> [ 12.232000] SELinux: initialized (dev sockfs, type sockfs), uses task SIDs
> [ 12.236000] SELinux: initialized (dev proc, type proc), uses genfs_contexts
> [ 12.240000] SELinux: initialized (dev bdev, type bdev), uses genfs_contexts
> [ 12.240000] SELinux: initialized (dev rootfs, type rootfs), uses genfs_contexts
> [ 12.244000] SELinux: initialized (dev sysfs, type sysfs), uses genfs_contexts
> [ 12.260000] audit(1167940354.256:3): policy loaded auid=4294967295
> [ 12.944000] SELinux: initialized (dev usbfs, type usbfs), uses genfs_contexts
> [ 15.376000] audit(1167969158.994:4): avc: denied { audit_write } for pid=386 comm="hwclock" capability=29 scontext=system_u:system_r:hwclock_t:s0 tcontext=system_u:system_r:hwclock_t:s0 tclass=capability
> [ 33.936000] audit(1167969177.567:2292): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
> [ 33.940000] audit(1167969177.579:2293): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
> [ 33.952000] audit(1167969177.591:2294): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
> [ 33.956000] audit(1167969177.607:2295): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
> [ 33.960000] audit(1167969177.615:2296): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
> [ 33.964000] audit(1167969177.627:2297): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
> [ 33.968000] audit(1167969177.639:2298): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
> [ 33.972000] audit(1167969177.651:2299): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
> [ 33.976000] audit(1167969177.667:2300): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
> [ 33.980000] audit(1167969177.679:2301): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
> [ 33.984000] audit(1167969177.695:2302): avc: denied { search } for pid=2141 comm="klogd" name="/" dev=tmpfs ino=1225 scontext=system_u:system_r:klogd_t:s0 tcontext=system_u:object_r:tmpfs_t:s0 tclass=dir
> <ad infinitum>
>
>
> Setting CONFIG_USER_NS=y fixes this.

Ok, I see. The CONFIG_USER_NS split is absolutely horrible. Should
really get rid of the user_ns pointers altogether when !CONFIG_USER_NS.
I'll try to fix it up without putting ifdefs all over - planning to send
a patch tomorrow.

thanks,
-serge

2007-01-05 07:01:48

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH -mm 0/8] user ns: Introduction

Quoting Serge E. Hallyn ([email protected]):
> Quoting Andrew Morton ([email protected]):
> > On Thu, 4 Jan 2007 12:06:35 -0600
> > "Serge E. Hallyn" <[email protected]> wrote:
> >
> > > This patchset adds a user namespace, which allows a process to
> > > unshare its user_struct table, allowing for separate accounting
> > > per user namespace.
> >
> > With these patches applied and with CONFIG_USER_NS=n, my selinux-enabled
> > standard FC5 machine throws a complete fit:
...
> >
> > Setting CONFIG_USER_NS=y fixes this.
>
> Ok, I see. The CONFIG_USER_NS split is absolutely horrible. Should
> really get rid of the user_ns pointers altogether when !CONFIG_USER_NS.
> I'll try to fix it up without putting ifdefs all over - planning to send
> a patch tomorrow.

Actually it's not that bad. Following patch fixes what I believe is
the underlying problem, and in any case was a definite bug.

thanks,
-serge

From: Serge E. Hallyn <[email protected]>
Subject: [PATCH 1/1] user ns: fix !CONFIG_USER_NS mount denial

Don't do user_ns permission checks when !CONFIG_USER_NS.

Signed-off-by: Serge E. Hallyn <[email protected]>
---
fs/namespace.c | 6 ++----
include/linux/sched.h | 8 ++++++++
include/linux/user_namespace.h | 15 +++++++++++++++
3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 2ed89d4..664c6f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -236,10 +236,8 @@ static struct vfsmount *clone_mnt(struct
struct super_block *sb = old->mnt_sb;
struct vfsmount *mnt;

- if (!(old->mnt_flags & MNT_SHARE_NS)) {
- if (old->mnt_user_ns != current->nsproxy->user_ns)
- return ERR_PTR(-EPERM);
- }
+ if (!clone_mnt_userns_permission(old))
+ return ERR_PTR(-EPERM);

mnt = alloc_vfsmnt(old->mnt_devname);

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 55ecf81..0542f34 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1588,6 +1588,7 @@ extern int cond_resched(void);
extern int cond_resched_lock(spinlock_t * lock);
extern int cond_resched_softirq(void);

+#ifdef CONFIG_USER_NS
/*
* Check whether a task and a vfsmnt belong to the same uidns.
* Since the initial namespace is exempt from these checks,
@@ -1606,6 +1607,13 @@ static inline int task_mnt_same_uidns(st
return 1;
return 0;
}
+#else
+static inline int task_mnt_same_uidns(struct task_struct *tsk,
+ struct vfsmount *mnt)
+{
+ return 1;
+}
+#endif


/*
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index be3e484..e4ce9cb 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -35,6 +35,16 @@ static inline void put_user_ns(struct us
kref_put(&ns->kref, free_user_ns);
}

+static inline int clone_mnt_userns_permission(struct vfsmount *old)
+{
+ if (!(old->mnt_flags & MNT_SHARE_NS)) {
+ if (old->mnt_user_ns != current->nsproxy->user_ns)
+ return 0;
+ }
+
+ return 1;
+}
+
#else

static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -59,6 +69,11 @@ static inline int copy_user_ns(int flags
static inline void put_user_ns(struct user_namespace *ns)
{
}
+
+static inline int clone_mnt_userns_permission(struct vfsmount *old)
+{
+ return 1;
+}
#endif

#endif /* _LINUX_USER_H */
--
1.4.1

2007-01-12 05:21:01

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH -mm 7/8] user_ns: handle file sigio

On Thu, 4 Jan 2007 12:12:57 -0600
"Serge E. Hallyn" <[email protected]> wrote:

> A process in one user namespace could set a fowner and sigio on a file in a
> shared vfsmount, ending up killing a task in another user namespace.
>
> Prevent this by adding a user namespace pointer to the fown_struct, and
> enforcing that a process causing a signal to be sent be in the same
> user namespace as the file owner.

This patch breaks the X server (stock FC5 install) with CONFIG_USER_NS=n.
Neither the USB mouse nor the trackpad work. They work OK under GPM.

Setting CONFIG_USER_NS=y "fixes" this. This bug was not observed in
2.6.20-rc3-mm1 because that kernel had user-ns-always-on.patch for other
reasons. (I'll restore that patch).

There's nothing very interesting here:


sony:/home/akpm> diff -u Xorg.0.log.good Xorg.0.log.bad
--- Xorg.0.log.good 2007-01-11 21:11:11.000000000 -0800
+++ Xorg.0.log.bad 2007-01-11 21:17:31.000000000 -0800
@@ -6,7 +6,7 @@
Release Date: 21 December 2005
X Protocol Version 11, Revision 0, Release 7.0
Build Operating System:Linux 2.6.9-22.18.bz155725.ELsmp i686Red Hat, Inc.
-Current Operating System: Linux sony 2.6.20-rc4-mm1 #15 Thu Jan 11 21:07:58 PST 2007 i686
+Current Operating System: Linux sony 2.6.20-rc4-mm1 #16 Thu Jan 11 21:14:03 PST 2007 i686
Build Date: 22 March 2006
Before reporting problems, check http://wiki.x.org
to make sure that you have the latest version.
@@ -14,7 +14,7 @@
Markers: (--) probed, (**) from config file, (==) default setting,
(++) from command line, (!!) notice, (II) informational,
(WW) warning, (EE) error, (NI) not implemented, (??) unknown.
-(==) Log file: "/var/log/Xorg.0.log", Time: Thu Jan 11 21:10:16 2007
+(==) Log file: "/var/log/Xorg.0.log", Time: Thu Jan 11 21:16:39 2007
(==) Using config file: "/etc/X11/xorg.conf"
(==) ServerLayout "single head configuration"
(**) |-->Screen "Screen0" (0)
@@ -2117,9 +2117,9 @@
(II) I810(0): Allocated 128 kB for the ring buffer at 0x0
(II) I810(0): Allocating at least 256 scanlines for pixmap cache
(II) I810(0): Initial framebuffer allocation size: 12288 kByte
-(II) I810(0): Allocated 4 kB for HW cursor at 0xffff000 (0x35dd3000)
-(II) I810(0): Allocated 16 kB for HW (ARGB) cursor at 0xfffb000 (0x35e78000)
-(II) I810(0): Allocated 4 kB for Overlay registers at 0xfffa000 (0x35e39000).
+(II) I810(0): Allocated 4 kB for HW cursor at 0xffff000 (0x358d5000)
+(II) I810(0): Allocated 16 kB for HW (ARGB) cursor at 0xfffb000 (0x35888000)
+(II) I810(0): Allocated 4 kB for Overlay registers at 0xfffa000 (0x358d7000).
(II) I810(0): Allocated 64 kB for the scratch buffer at 0xffea000
drmOpenDevice: node name is /dev/dri/card0
drmOpenDevice: open result is -1, (No such device or address)
@@ -2137,8 +2137,8 @@
(II) I810(0): [drm] loaded kernel module for "i915" driver
(II) I810(0): [drm] DRM interface version 1.3
(II) I810(0): [drm] created "i915" driver at busid "pci:0000:00:02.0"
-(II) I810(0): [drm] added 8192 byte SAREA at 0xf8e46000
-(II) I810(0): [drm] mapped SAREA 0xf8e46000 to 0xb7eec000
+(II) I810(0): [drm] added 8192 byte SAREA at 0xf8d4a000
+(II) I810(0): [drm] mapped SAREA 0xf8d4a000 to 0xb7f23000
(II) I810(0): [drm] framebuffer handle = 0xc0020000
(II) I810(0): [drm] added 1 reserved context for kernel
(II) I810(0): Allocated 32 kB for the logical context at 0xffe2000.

2007-01-15 07:27:04

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH -mm 7/8] user_ns: handle file sigio

Quoting Andrew Morton ([email protected]):
> On Thu, 4 Jan 2007 12:12:57 -0600
> "Serge E. Hallyn" <[email protected]> wrote:
>
> > A process in one user namespace could set a fowner and sigio on a file in a
> > shared vfsmount, ending up killing a task in another user namespace.
> >
> > Prevent this by adding a user namespace pointer to the fown_struct, and
> > enforcing that a process causing a signal to be sent be in the same
> > user namespace as the file owner.
>
> This patch breaks the X server (stock FC5 install) with CONFIG_USER_NS=n.
> Neither the USB mouse nor the trackpad work. They work OK under GPM.
>
> Setting CONFIG_USER_NS=y "fixes" this. This bug was not observed in
> 2.6.20-rc3-mm1 because that kernel had user-ns-always-on.patch for other
> reasons. (I'll restore that patch).
>
> There's nothing very interesting here:
>
>
> sony:/home/akpm> diff -u Xorg.0.log.good Xorg.0.log.bad
> --- Xorg.0.log.good 2007-01-11 21:11:11.000000000 -0800
> +++ Xorg.0.log.bad 2007-01-11 21:17:31.000000000 -0800
> @@ -6,7 +6,7 @@
> Release Date: 21 December 2005
> X Protocol Version 11, Revision 0, Release 7.0
> Build Operating System:Linux 2.6.9-22.18.bz155725.ELsmp i686Red Hat, Inc.
> -Current Operating System: Linux sony 2.6.20-rc4-mm1 #15 Thu Jan 11 21:07:58 PST 2007 i686
> +Current Operating System: Linux sony 2.6.20-rc4-mm1 #16 Thu Jan 11 21:14:03 PST 2007 i686
> Build Date: 22 March 2006
> Before reporting problems, check http://wiki.x.org
> to make sure that you have the latest version.
> @@ -14,7 +14,7 @@
> Markers: (--) probed, (**) from config file, (==) default setting,
> (++) from command line, (!!) notice, (II) informational,
> (WW) warning, (EE) error, (NI) not implemented, (??) unknown.
> -(==) Log file: "/var/log/Xorg.0.log", Time: Thu Jan 11 21:10:16 2007
> +(==) Log file: "/var/log/Xorg.0.log", Time: Thu Jan 11 21:16:39 2007
> (==) Using config file: "/etc/X11/xorg.conf"
> (==) ServerLayout "single head configuration"
> (**) |-->Screen "Screen0" (0)
> @@ -2117,9 +2117,9 @@
> (II) I810(0): Allocated 128 kB for the ring buffer at 0x0
> (II) I810(0): Allocating at least 256 scanlines for pixmap cache
> (II) I810(0): Initial framebuffer allocation size: 12288 kByte
> -(II) I810(0): Allocated 4 kB for HW cursor at 0xffff000 (0x35dd3000)
> -(II) I810(0): Allocated 16 kB for HW (ARGB) cursor at 0xfffb000 (0x35e78000)
> -(II) I810(0): Allocated 4 kB for Overlay registers at 0xfffa000 (0x35e39000).
> +(II) I810(0): Allocated 4 kB for HW cursor at 0xffff000 (0x358d5000)
> +(II) I810(0): Allocated 16 kB for HW (ARGB) cursor at 0xfffb000 (0x35888000)
> +(II) I810(0): Allocated 4 kB for Overlay registers at 0xfffa000 (0x358d7000).
> (II) I810(0): Allocated 64 kB for the scratch buffer at 0xffea000
> drmOpenDevice: node name is /dev/dri/card0
> drmOpenDevice: open result is -1, (No such device or address)
> @@ -2137,8 +2137,8 @@
> (II) I810(0): [drm] loaded kernel module for "i915" driver
> (II) I810(0): [drm] DRM interface version 1.3
> (II) I810(0): [drm] created "i915" driver at busid "pci:0000:00:02.0"
> -(II) I810(0): [drm] added 8192 byte SAREA at 0xf8e46000
> -(II) I810(0): [drm] mapped SAREA 0xf8e46000 to 0xb7eec000
> +(II) I810(0): [drm] added 8192 byte SAREA at 0xf8d4a000
> +(II) I810(0): [drm] mapped SAREA 0xf8d4a000 to 0xb7f23000
> (II) I810(0): [drm] framebuffer handle = 0xc0020000
> (II) I810(0): [drm] added 1 reserved context for kernel
> (II) I810(0): Allocated 32 kB for the logical context at 0xffe2000.

I can't see any reason for this in the code or comparative ltp runs.
Cedric is testing on a fc6 laptop, hopefully he can reproduce it.

thanks,
-serge

2007-01-15 15:03:53

by Cédric Le Goater

[permalink] [raw]
Subject: Re: [PATCH -mm 7/8] user_ns: handle file sigio

Serge E. Hallyn wrote:
> Quoting Andrew Morton ([email protected]):
>> On Thu, 4 Jan 2007 12:12:57 -0600
>> "Serge E. Hallyn" <[email protected]> wrote:
>>
>>> A process in one user namespace could set a fowner and sigio on a file in a
>>> shared vfsmount, ending up killing a task in another user namespace.
>>>
>>> Prevent this by adding a user namespace pointer to the fown_struct, and
>>> enforcing that a process causing a signal to be sent be in the same
>>> user namespace as the file owner.
>> This patch breaks the X server (stock FC5 install) with CONFIG_USER_NS=n.
>> Neither the USB mouse nor the trackpad work. They work OK under GPM.
>>
>> Setting CONFIG_USER_NS=y "fixes" this. This bug was not observed in
>> 2.6.20-rc3-mm1 because that kernel had user-ns-always-on.patch for other
>> reasons. (I'll restore that patch).
>>
>> There's nothing very interesting here:
>
[ ... ]
>
> I can't see any reason for this in the code or comparative ltp runs.
> Cedric is testing on a fc6 laptop, hopefully he can reproduce it.

I did reproduce it on a FC5 desktop finally.

get_user_ns() returns NULL when CONFIG_USER_NS=n and this breaks
sigio_perm() which does not expect NULL values for ->user_ns.

I would fix this with the following patch.

C.


Signed-off-by: Cedric Le Goater <[email protected]>

---
include/linux/user_namespace.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

Index: 2.6.20-rc4-mm1/include/linux/user_namespace.h
===================================================================
--- 2.6.20-rc4-mm1.orig/include/linux/user_namespace.h
+++ 2.6.20-rc4-mm1/include/linux/user_namespace.h
@@ -49,7 +49,7 @@

static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
- return NULL;
+ return &init_user_ns;
}

static inline int unshare_user_ns(unsigned long flags,

2007-01-15 15:28:29

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH -mm 7/8] user_ns: handle file sigio

Quoting Cedric Le Goater ([email protected]):
> Serge E. Hallyn wrote:
> > Quoting Andrew Morton ([email protected]):
> >> On Thu, 4 Jan 2007 12:12:57 -0600
> >> "Serge E. Hallyn" <[email protected]> wrote:
> >>
> >>> A process in one user namespace could set a fowner and sigio on a file in a
> >>> shared vfsmount, ending up killing a task in another user namespace.
> >>>
> >>> Prevent this by adding a user namespace pointer to the fown_struct, and
> >>> enforcing that a process causing a signal to be sent be in the same
> >>> user namespace as the file owner.
> >> This patch breaks the X server (stock FC5 install) with CONFIG_USER_NS=n.
> >> Neither the USB mouse nor the trackpad work. They work OK under GPM.
> >>
> >> Setting CONFIG_USER_NS=y "fixes" this. This bug was not observed in
> >> 2.6.20-rc3-mm1 because that kernel had user-ns-always-on.patch for other
> >> reasons. (I'll restore that patch).
> >>
> >> There's nothing very interesting here:
> >
> [ ... ]
> >
> > I can't see any reason for this in the code or comparative ltp runs.
> > Cedric is testing on a fc6 laptop, hopefully he can reproduce it.
>
> I did reproduce it on a FC5 desktop finally.
>
> get_user_ns() returns NULL when CONFIG_USER_NS=n and this breaks
> sigio_perm() which does not expect NULL values for ->user_ns.

Argh.

Thanks, Cedric.

Rewriting the userns testcases right now. Clearly, in addition to
separately testing clone and unshare, I need to add a sigioperm check,
and have a separate set of testcases for CONFIG_USER_NS=n.

thanks,
-serge

> I would fix this with the following patch.
>
> C.
>
>
> Signed-off-by: Cedric Le Goater <[email protected]>

Acked-off-by: Serge E Hallyn <[email protected]>

> ---
> include/linux/user_namespace.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> Index: 2.6.20-rc4-mm1/include/linux/user_namespace.h
> ===================================================================
> --- 2.6.20-rc4-mm1.orig/include/linux/user_namespace.h
> +++ 2.6.20-rc4-mm1/include/linux/user_namespace.h
> @@ -49,7 +49,7 @@
>
> static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
> {
> - return NULL;
> + return &init_user_ns;
> }
>
> static inline int unshare_user_ns(unsigned long flags,

2007-01-15 17:35:37

by Cédric Le Goater

[permalink] [raw]
Subject: Re: [PATCH -mm 7/8] user_ns: handle file sigio


[ ... ]

> Rewriting the userns testcases right now. Clearly, in addition to
> separately testing clone and unshare, I need to add a sigioperm check,
> and have a separate set of testcases for CONFIG_USER_NS=n.

Could we get rid of CONFIG_USER_NS ?

It doesn't look that useful anyway, it just deactivates the unshare
capability for the user namespace.

Same question for the other mainline namespaces, IPC and utsname.

C.

2007-01-16 11:04:35

by Cédric Le Goater

[permalink] [raw]
Subject: [PATCH -mm] user_ns: remove CONFIG_USER_NS

It doesn't look that useful anyway, it just deactivates the unshare
capability for the user namespace.

Signed-off-by: Cedric Le Goater <[email protected]>
---
include/linux/sched.h | 9 ---------
include/linux/user_namespace.h | 33 ---------------------------------
init/Kconfig | 4 ----
kernel/user_namespace.c | 3 ---
4 files changed, 49 deletions(-)

Index: 2.6.20-rc4-mm1/include/linux/sched.h
===================================================================
--- 2.6.20-rc4-mm1.orig/include/linux/sched.h
+++ 2.6.20-rc4-mm1/include/linux/sched.h
@@ -1603,7 +1603,6 @@ extern int cond_resched(void);
extern int cond_resched_lock(spinlock_t * lock);
extern int cond_resched_softirq(void);

-#ifdef CONFIG_USER_NS
/*
* Check whether a task and a vfsmnt belong to the same uidns.
* Since the initial namespace is exempt from these checks,
@@ -1622,14 +1621,6 @@ static inline int task_mnt_same_uidns(st
return 1;
return 0;
}
-#else
-static inline int task_mnt_same_uidns(struct task_struct *tsk,
- struct vfsmount *mnt)
-{
- return 1;
-}
-#endif
-

/*
* Does a critical section need to be broken due to another
Index: 2.6.20-rc4-mm1/include/linux/user_namespace.h
===================================================================
--- 2.6.20-rc4-mm1.orig/include/linux/user_namespace.h
+++ 2.6.20-rc4-mm1/include/linux/user_namespace.h
@@ -16,8 +16,6 @@ struct user_namespace {

extern struct user_namespace init_user_ns;

-#ifdef CONFIG_USER_NS
-
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
if (ns)
@@ -45,35 +43,4 @@ static inline int clone_mnt_userns_permi
return 1;
}

-#else
-
-static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
-{
- return NULL;
-}
-
-static inline int unshare_user_ns(unsigned long flags,
- struct user_namespace **new_user)
-{
- if (flags & CLONE_NEWUSER)
- return -EINVAL;
-
- return 0;
-}
-
-static inline int copy_user_ns(int flags, struct task_struct *tsk)
-{
- return 0;
-}
-
-static inline void put_user_ns(struct user_namespace *ns)
-{
-}
-
-static inline int clone_mnt_userns_permission(struct vfsmount *old)
-{
- return 1;
-}
-#endif
-
#endif /* _LINUX_USER_H */
Index: 2.6.20-rc4-mm1/init/Kconfig
===================================================================
--- 2.6.20-rc4-mm1.orig/init/Kconfig
+++ 2.6.20-rc4-mm1/init/Kconfig
@@ -222,10 +222,6 @@ config UTS_NS
vservers, to use uts namespaces to provide different
uts info for different servers. If unsure, say N.

-config USER_NS
- bool
- default y
-
config AUDIT
bool "Auditing support"
depends on NET
Index: 2.6.20-rc4-mm1/kernel/user_namespace.c
===================================================================
--- 2.6.20-rc4-mm1.orig/kernel/user_namespace.c
+++ 2.6.20-rc4-mm1/kernel/user_namespace.c
@@ -19,7 +19,6 @@ struct user_namespace init_user_ns = {

EXPORT_SYMBOL_GPL(init_user_ns);

-#ifdef CONFIG_USER_NS
/*
* Clone a new ns copying an original user ns, setting refcount to 1
* @old_ns: namespace to clone
@@ -110,5 +109,3 @@ void free_user_ns(struct kref *kref)
ns = container_of(kref, struct user_namespace, kref);
kfree(ns);
}
-
-#endif /* CONFIG_USER_NS */

2007-01-16 14:53:20

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH -mm] user_ns: remove CONFIG_USER_NS

Quoting Cedric Le Goater ([email protected]):
> It doesn't look that useful anyway, it just deactivates the unshare
> capability for the user namespace.
>
> Signed-off-by: Cedric Le Goater <[email protected]>

Agreed on both this and CONFIG_UTS_NS. (I haven't looked closely enough
at the CONFIG_IPC_NS case yet)

Acked-by: Serge Hallyn <[email protected]>

thanks,
-serge

> ---
> include/linux/sched.h | 9 ---------
> include/linux/user_namespace.h | 33 ---------------------------------
> init/Kconfig | 4 ----
> kernel/user_namespace.c | 3 ---
> 4 files changed, 49 deletions(-)
>
> Index: 2.6.20-rc4-mm1/include/linux/sched.h
> ===================================================================
> --- 2.6.20-rc4-mm1.orig/include/linux/sched.h
> +++ 2.6.20-rc4-mm1/include/linux/sched.h
> @@ -1603,7 +1603,6 @@ extern int cond_resched(void);
> extern int cond_resched_lock(spinlock_t * lock);
> extern int cond_resched_softirq(void);
>
> -#ifdef CONFIG_USER_NS
> /*
> * Check whether a task and a vfsmnt belong to the same uidns.
> * Since the initial namespace is exempt from these checks,
> @@ -1622,14 +1621,6 @@ static inline int task_mnt_same_uidns(st
> return 1;
> return 0;
> }
> -#else
> -static inline int task_mnt_same_uidns(struct task_struct *tsk,
> - struct vfsmount *mnt)
> -{
> - return 1;
> -}
> -#endif
> -
>
> /*
> * Does a critical section need to be broken due to another
> Index: 2.6.20-rc4-mm1/include/linux/user_namespace.h
> ===================================================================
> --- 2.6.20-rc4-mm1.orig/include/linux/user_namespace.h
> +++ 2.6.20-rc4-mm1/include/linux/user_namespace.h
> @@ -16,8 +16,6 @@ struct user_namespace {
>
> extern struct user_namespace init_user_ns;
>
> -#ifdef CONFIG_USER_NS
> -
> static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
> {
> if (ns)
> @@ -45,35 +43,4 @@ static inline int clone_mnt_userns_permi
> return 1;
> }
>
> -#else
> -
> -static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
> -{
> - return NULL;
> -}
> -
> -static inline int unshare_user_ns(unsigned long flags,
> - struct user_namespace **new_user)
> -{
> - if (flags & CLONE_NEWUSER)
> - return -EINVAL;
> -
> - return 0;
> -}
> -
> -static inline int copy_user_ns(int flags, struct task_struct *tsk)
> -{
> - return 0;
> -}
> -
> -static inline void put_user_ns(struct user_namespace *ns)
> -{
> -}
> -
> -static inline int clone_mnt_userns_permission(struct vfsmount *old)
> -{
> - return 1;
> -}
> -#endif
> -
> #endif /* _LINUX_USER_H */
> Index: 2.6.20-rc4-mm1/init/Kconfig
> ===================================================================
> --- 2.6.20-rc4-mm1.orig/init/Kconfig
> +++ 2.6.20-rc4-mm1/init/Kconfig
> @@ -222,10 +222,6 @@ config UTS_NS
> vservers, to use uts namespaces to provide different
> uts info for different servers. If unsure, say N.
>
> -config USER_NS
> - bool
> - default y
> -
> config AUDIT
> bool "Auditing support"
> depends on NET
> Index: 2.6.20-rc4-mm1/kernel/user_namespace.c
> ===================================================================
> --- 2.6.20-rc4-mm1.orig/kernel/user_namespace.c
> +++ 2.6.20-rc4-mm1/kernel/user_namespace.c
> @@ -19,7 +19,6 @@ struct user_namespace init_user_ns = {
>
> EXPORT_SYMBOL_GPL(init_user_ns);
>
> -#ifdef CONFIG_USER_NS
> /*
> * Clone a new ns copying an original user ns, setting refcount to 1
> * @old_ns: namespace to clone
> @@ -110,5 +109,3 @@ void free_user_ns(struct kref *kref)
> ns = container_of(kref, struct user_namespace, kref);
> kfree(ns);
> }
> -
> -#endif /* CONFIG_USER_NS */