relinquish_fs() is a replacement for the chroot("/var/empty") technique
currently used for privilege-separated daemons such as recent OpenSSH.
Rather than using a directory on the normal filesystem it places the
process in an alternate namespace. This namespace has one filesystem
of type "nullfs" -- it's empty and immutable.
This has several benefits:
* Considerably safer against root users in cage
Normal chroots are trivial for privileged users to break out of -
these tasks don't work in the namespace. You can't create a directory
to do the "chroot foo; cd ../../..; chroot ." trick. You can't
create device nodes or mount /proc anywhere (I added an extra check
to do_mount() that even prevents a mount on top of '/')
* Can be used by non-root users!
Normally chroot is restricted to root (to prevent the escape mentioned
above and also to avoid confusing setuid programs about their environment)
Obviously setuid binaries are not an issue here since execve() will be
impossible after relinquish_fs() is called.
This is a big deal for privilege separation; currently it's hard to
implement except in a daemon that starts its life as root. Now the
same techniques can be used by any process.
Imagine, for example, a jpeg decoder that after opening its input and
output files called relinquish_fs(). Now if the decoder has a flaw and
is fed an exploit it is not able to do much of anything. And although
adding privsep is hard work this technique can, in theory, be extended
to anything that processes untrusted data from the network (html parsers,
ssh client, etc)
Of course, even after chroot() or relinquish_fs() an exploited process
can still do bad things (chew up CPU, fork bomb, implement a rogue TCP
proxy service, etc) but at least with this technique it makes it hard
for an exploit to do lasting damage to the OS.
I reused the i386 syscall number from the (apparently now gone) set_altroot
syscall for now.
Also, this is the first time I've dug into the VFS internals; let me know
if I'm doing something terribly wrong.
Patch is versus 2.6.10-rc2-bk9
-Mitch
Signed-off-by: Mitchell Blank Jr <[email protected]>
diff -ur linux-2.6.10-rc2-bk9-VIRGIN/arch/i386/kernel/entry.S linux-2.6.10-rc2-bk9/arch/i386/kernel/entry.S
--- linux-2.6.10-rc2-bk9-VIRGIN/arch/i386/kernel/entry.S 2004-11-25 21:46:20.000000000 -0800
+++ linux-2.6.10-rc2-bk9/arch/i386/kernel/entry.S 2004-11-29 03:26:08.960348873 -0800
@@ -860,7 +860,7 @@
.long sys_mq_getsetattr
.long sys_ni_syscall /* reserved for kexec */
.long sys_waitid
- .long sys_ni_syscall /* 285 */ /* available */
+ .long sys_relinquish_fs /* 285 */
.long sys_add_key
.long sys_request_key
.long sys_keyctl
diff -ur linux-2.6.10-rc2-bk9-VIRGIN/fs/namespace.c linux-2.6.10-rc2-bk9/fs/namespace.c
--- linux-2.6.10-rc2-bk9-VIRGIN/fs/namespace.c 2004-11-25 21:46:59.000000000 -0800
+++ linux-2.6.10-rc2-bk9/fs/namespace.c 2004-11-29 02:48:08.833945322 -0800
@@ -22,6 +22,7 @@
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/mount.h>
+#include <linux/pagemap.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -991,6 +992,9 @@
return 0;
}
+/* This is the namespace tasks go to after calling relinquish_fs() */
+static struct namespace *nullfs_namespace;
+
/*
* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
@@ -1023,6 +1027,14 @@
if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
return -EINVAL;
+ /*
+ * If the task has called relinquish_fs() then don't allow any
+ * mounts. They can't get a mount point but they could mount
+ * something on top of "/"
+ */
+ if (current->namespace == nullfs_namespace)
+ return -EROFS;
+
if (data_page)
((char *)data_page)[PAGE_SIZE - 1] = 0;
@@ -1358,15 +1370,10 @@
goto out2;
}
-static void __init init_mount_tree(void)
+static struct namespace * __init init_new_namespace(struct vfsmount *mnt)
{
- struct vfsmount *mnt;
struct namespace *namespace;
- struct task_struct *g, *p;
- mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
- if (IS_ERR(mnt))
- panic("Can't create rootfs");
namespace = kmalloc(sizeof(*namespace), GFP_KERNEL);
if (!namespace)
panic("Can't allocate initial namespace");
@@ -1376,6 +1383,19 @@
list_add(&mnt->mnt_list, &namespace->list);
namespace->root = mnt;
mnt->mnt_namespace = namespace;
+ return namespace;
+}
+
+static void __init init_mount_tree(void)
+{
+ struct vfsmount *mnt;
+ struct namespace *namespace;
+ struct task_struct *g, *p;
+
+ mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
+ if (IS_ERR(mnt))
+ panic("Can't create rootfs");
+ namespace = init_new_namespace(mnt);
init_task.namespace = namespace;
read_lock(&tasklist_lock);
@@ -1389,6 +1409,78 @@
set_fs_root(current->fs, namespace->root, namespace->root->mnt_root);
}
+/*
+ * Make sure that no one tries to remount a "nullfs": every remount
+ * request is refused with -EROFS.  The sb, flagsp and data arguments
+ * are ignored.
+ */
+static int nullfs_remount(struct super_block *sb, int *flagsp, char *data)
+{
+ return -EROFS;
+}
+
+/*
+ * Simple filesystem that is just an immutable empty directory.
+ *
+ * Fills in the superblock for a "nullfs" mount: sets the block size,
+ * magic number and super_operations, then creates the single root
+ * directory inode (mode 0555, uid/gid 0) and attaches it as sb->s_root.
+ * The data and silent arguments are not used.
+ *
+ * Returns 0 on success, or -ENOMEM if either the inode or the root
+ * dentry cannot be allocated.
+ */
+static int nullfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ static struct super_operations nullfs_ops= {
+ .statfs = simple_statfs,
+ .drop_inode = generic_delete_inode,
+ .remount_fs = nullfs_remount, /* always refuses with -EROFS */
+ };
+ static struct address_space_operations nullfs_aops = {
+ .readpage = simple_readpage,
+ };
+ static struct inode_operations nullfs_dir_inode_operations = {
+ .lookup = simple_lookup, /* the only inode op set: no create/mkdir/mknod etc. */
+ };
+ struct inode *inode;
+
+ sb->s_blocksize = PAGE_CACHE_SIZE;
+ sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_magic = 0x6c6c754e; /* "Null" */
+ sb->s_op = &nullfs_ops;
+ inode = new_inode(sb);
+ if (inode == NULL)
+ return -ENOMEM;
+ inode->i_mode = S_IFDIR | 0555; /* directory with read/search bits only, no write bits */
+ inode->i_uid = 0;
+ inode->i_gid = 0;
+ inode->i_blksize = PAGE_CACHE_SIZE;
+ inode->i_mapping->a_ops = &nullfs_aops;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_op = &nullfs_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ inode->i_nlink++; /* NOTE(review): presumably accounts for the directory's "." link - confirm */
+ sb->s_root = d_alloc_root(inode);
+ if (sb->s_root == NULL) {
+ iput(inode); /* release the inode allocated above before failing */
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static struct super_block *nullfs_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data) /* dev_name is not used */
+{
+ return get_sb_nodev(fs_type, flags | MS_NOUSER, /* MS_NOUSER is always forced on; presumably blocks user-initiated mounts - confirm */
+ data, nullfs_fill_super);
+}
+
+/*
+ * Register the "nullfs" filesystem type, mount one instance of it and
+ * wrap that mount in the namespace that sys_relinquish_fs() moves tasks
+ * into (nullfs_namespace).  Called once from mnt_init(); any failure
+ * here is fatal and panics.
+ */
+static void __init init_nullfs(void)
+{
+	static struct file_system_type nullfs_fs_type = {
+		.name		= "nullfs",
+		.get_sb		= nullfs_get_sb,
+		.kill_sb	= kill_anon_super,
+	};
+	struct vfsmount *mnt;
+
+	if (register_filesystem(&nullfs_fs_type) != 0)
+		panic("Can't register nullfs");
+	/* Most of these flags are not needed, but won't hurt */
+	mnt = do_kern_mount("nullfs",
+			    (MS_RDONLY | MS_NOSUID | MS_NODEV |
+			     MS_NOEXEC | MS_NOATIME | MS_NODIRATIME),
+			    "nullfs", NULL);
+	/*
+	 * Check for failure *before* handing mnt to init_new_namespace():
+	 * that helper dereferences it (list_add() on mnt->mnt_list and a
+	 * store to mnt->mnt_namespace), so an ERR_PTR must never reach it.
+	 */
+	if (IS_ERR(mnt))
+		panic("Can't mount nullfs");
+	nullfs_namespace = init_new_namespace(mnt);
+}
+
void __init mnt_init(unsigned long mempages)
{
struct list_head *d;
@@ -1439,6 +1531,54 @@
sysfs_init();
init_rootfs();
init_mount_tree();
+ init_nullfs();
+}
+
+/*
+ * Callable by any process to relinquish all access to the filesystem by
+ * putting their root in an immutably empty namespace.  Note that like
+ * chroot() we do NOT change pwd, so a chdir("/") should be done afterwards.
+ *
+ * Returns 0 on success (including when the caller has already
+ * relinquished), or -ENOMEM if a private fs_struct could not be allocated.
+ */
+asmlinkage long sys_relinquish_fs(void)
+{
+	struct fs_struct *fs = current->fs;
+	struct namespace *old_namespace;
+	struct dentry *old_altroot;
+	struct vfsmount *old_altrootmnt;
+
+	if (current->namespace == nullfs_namespace)
+		return 0;	/* already relinquished */
+	/*
+	 * The namespace is per-task yet the ->fs struct can be shared, so
+	 * make sure we own a private fs_struct before changing its root;
+	 * otherwise every task sharing it would be re-rooted too.
+	 * Unfortunately that means this syscall can fail with -ENOMEM;
+	 * luckily it's only in the shared case.
+	 */
+	if (atomic_read(&fs->count) != 1) {
+		struct fs_struct *n = copy_fs_struct(fs);
+		if (n == NULL)
+			return -ENOMEM;
+		task_lock(current);
+		current->fs = n;
+		task_unlock(current);
+		put_fs_struct(fs);
+		fs = n;
+	}
+	/*
+	 * Switch namespaces first and only then drop our reference on the
+	 * old one, so current->namespace never points at a namespace we no
+	 * longer hold a reference to.
+	 */
+	get_namespace(nullfs_namespace);
+	task_lock(current);
+	old_namespace = current->namespace;
+	current->namespace = nullfs_namespace;
+	task_unlock(current);
+	put_namespace(old_namespace);
+
+	set_fs_root(fs, nullfs_namespace->root,
+		    nullfs_namespace->root->mnt_root);
+	/*
+	 * We also rid ourselves of any altroot; an attacker could get
+	 * it back via set_personality() but that won't do any good since
+	 * emul_prefix can't be looked up anymore.
+	 *
+	 * Detach the pointers under fs->lock and release the references
+	 * only after unlocking, so nobody can observe altroot/altrootmnt
+	 * pointing at objects whose references were already dropped.
+	 */
+	write_lock(&fs->lock);
+	old_altroot = fs->altroot;
+	old_altrootmnt = fs->altrootmnt;
+	fs->altroot = NULL;
+	fs->altrootmnt = NULL;
+	write_unlock(&fs->lock);
+	if (old_altroot != NULL) {
+		dput(old_altroot);
+		mntput(old_altrootmnt);
+	}
+	return 0;
}
void __put_namespace(struct namespace *namespace)
diff -ur linux-2.6.10-rc2-bk9-VIRGIN/include/asm-i386/unistd.h linux-2.6.10-rc2-bk9/include/asm-i386/unistd.h
--- linux-2.6.10-rc2-bk9-VIRGIN/include/asm-i386/unistd.h 2004-11-25 21:46:39.000000000 -0800
+++ linux-2.6.10-rc2-bk9/include/asm-i386/unistd.h 2004-11-28 22:04:48.000000000 -0800
@@ -290,7 +290,7 @@
#define __NR_mq_getsetattr (__NR_mq_open+5)
#define __NR_sys_kexec_load 283
#define __NR_waitid 284
-/* #define __NR_sys_setaltroot 285 */
+#define __NR_sys_relinquish_fs 285
#define __NR_add_key 286
#define __NR_request_key 287
#define __NR_keyctl 288
diff -ur linux-2.6.10-rc2-bk9-VIRGIN/include/linux/syscalls.h linux-2.6.10-rc2-bk9/include/linux/syscalls.h
--- linux-2.6.10-rc2-bk9-VIRGIN/include/linux/syscalls.h 2004-11-25 21:46:40.000000000 -0800
+++ linux-2.6.10-rc2-bk9/include/linux/syscalls.h 2004-11-28 09:59:28.449907226 -0800
@@ -506,4 +506,6 @@
asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5);
+asmlinkage long sys_relinquish_fs(void);
+
#endif
On Llu, 2004-11-29 at 11:43, Mitchell Blank Jr wrote:
> This has several benefits:
>
> * Considerably safer against root users in cage
Pardon. It's equally ineffectual. It might take someone a week longer to
write the exploit but an hour after that it's no different.
> Normal chroot's are trivial for privileged users to break out of -
> these tasks don't work in the namespace. You can't create a directory
> to do the "chroot foo; cd ../../..; chroot ." trick. You can't
> create device nodes or mount /proc anywhere (I added an extra check
> to do_mount() that even prevents a mount on top of '/')
A privileged user can ioperm/iopl their way out.
> This is a big deal for privilege separation; currently it's hard to
> implement except in a daemon that starts its life as root. Now the
> same techniques can be used by any process.
That doesn't do name lookup, character set translation, or time (and a
few other things).
> Imagine, for example, a jpeg decoder that after opening its input and
> output files called relinquish_fs(). Now if the decoder has a flaw and
Imagine a jpeg decoder using an SELinux policy.
Alan Cox wrote:
> A priviledged user can ioperm/iopl their way out.
OK, good point, at least on i386/x86_64. So before jailing itself a task
will have to take CAP_SYS_RAWIO out of its permitted set. That shouldn't
be too bad of a restriction for most programs to live with.
> On Llu, 2004-11-29 at 11:43, Mitchell Blank Jr wrote:
> > This has several benefits:
> >
> > * Considerably safer against root users in cage
>
> Pardon. Its equally ineffectual. It might take someone a week longer to
> write the exploit but an hour after that its no different.
OK, could you please describe other attacks against it then?
> > This is a big deal for privilege separation; currently it's hard to
> > implement except in a daemon that starts its life as root. Now the
> > same techniques can be used by any process.
>
> That doesn't do name lookup, character set translation, or time (and a
> few other things).
Alan - have you looked at the privsep implementation in, say, opensshd? The
way privsep works is that you have two processes communicating over a
UNIX domain socket. One then jails itself and handles all the hairy
untrusted data. The unjailed process handles requests from inside as
needed. So if the program needs to do DNS lookups then your privsep
protocol would include a primitive for doing that.
Some information on openssh privsep, including the USENIX paper:
http://www.citi.umich.edu/u/provos/ssh/privsep.html
Frankly, if a program is touching lots of untrusted data AND it also needs
lots of external stuff then it's a perfect candidate for privsep.
> > Imagine, for example, a jpeg decoder that after opening its input and
> > output files called relinquish_fs(). Now if the decoder has a flaw and
>
> Imagine a jpeg decoder using an SELinux policy.
SELinux is great but it's a pretty orthogonal issue. There's no reason the
two can't coexist.
-Mitch
On Llu, 2004-11-29 at 13:55, Mitchell Blank Jr wrote:
> OK, good point, at least on i386/x86_64. So before jailing itself a task
> will have to take CAP_SYS_RAWIO out of its permitted set. That shouldn't
> be too bad of a restriction for most programs to live with.
> > Pardon. Its equally ineffectual. It might take someone a week longer to
> > write the exploit but an hour after that its no different.
>
> OK, could you please describe other attacks against it then?
With CAP_SYS_RAWIO I can ask the IDE controller to DMA into the kernel
as one example. Without it should be sane. However if you take away all
the capabilities then you don't need the other changes because mount
works just fine.
> > That doesn't do name lookup, character set translation, or time (and a
> > few other things).
>
> Alan - have you looked at privsep implementation in, say, opensshd. The
> way privsep works is that you have two processes communicating over a
> UNIX domain socket. One then jails itself and handles all the hairy
> untrusted data. The unjailed process handles requests from inside as
> needed. So if the program needs to do DNS lookups then your privsep
> protocol would include a primitive for doing that.
Yes, I realise that, but you also need to realise that glibc has a lot of
supporting baggage it likes to find if it's going to function sanely. Yes,
you can deal with it, but SELinux can deal with it better.
> > > Imagine, for example, a jpeg decoder that after opening its input and
> > > output files called relinquish_fs(). Now if the decoder has a flaw and
> >
> > Imagine a jpeg decoder using an SELinux policy.
>
> SELinux is great but it's a pretty orthoginal issue. There's no reason the
> two can't coexist.
I don't see it as orthogonal except for the "done by user" aspect. "foo
isn't allowed to open files or chdir" is SELinux policy at work (or for
that matter the rather nice bitmap filtered syscall stuff Andrea was
talking about some time back).
BTW: you also have to deal with fchdir() and potentially AF_UNIX
sockets.
Mitchell Blank Jr wrote:
>Alan Cox wrote:
>
>
>>A priviledged user can ioperm/iopl their way out.
>>
>>
>
>OK, good point, at least on i386/x86_64. So before jailing itself a task
>will have to take CAP_SYS_RAWIO out of its permitted set. That shouldn't
>be too bad of a restriction for most programs to live with.
>
>
>
>>On Llu, 2004-11-29 at 11:43, Mitchell Blank Jr wrote:
>>
>>
>>>This has several benefits:
>>>
>>> * Considerably safer against root users in cage
>>>
>>>
>>Pardon. Its equally ineffectual. It might take someone a week longer to
>>write the exploit but an hour after that its no different.
>>
>>
>
>OK, could you please describe other attacks against it then?
>
>
>
>>> This is a big deal for privilege separation; currently it's hard to
>>> implement except in a daemon that starts its life as root. Now the
>>> same techniques can be used by any process.
>>>
>>>
>>That doesn't do name lookup, character set translation, or time (and a
>>few other things).
>>
>>
>
>Alan - have you looked at privsep implementation in, say, opensshd. The
>way privsep works is that you have two processes communicating over a
>UNIX domain socket. One then jails itself and handles all the hairy
>untrusted data. The unjailed process handles requests from inside as
>needed. So if the program needs to do DNS lookups then your privsep
>protocol would include a primitive for doing that.
>
>
So someone finds a way to break into the jailed process.
This is used to feed some hairy exploit to the unjailed
process that expects "clean" data from the jailed process.
Same as before, only now they need a two-stage exploit.
You can jail a process doing a "dangerous" job, but you can't
really jail a "hairy" data stream. Not if data is expected to
emerge from the jail someday.
Helge Hafting
Alan Cox wrote:
> With CAP_SYS_RAWIO I can ask the IDE controller to DMA into the kernel
> as one example.
Can you really do that on normal file descriptors? Weird. I'd have thought
you'd need to open /dev/hd* to do that.
Oh, and you also need to revoke CAP_SYS_PTRACE to prevent a compromised
process from taking over a process inside the jail
> Without it should be sane. However if you take away all
> the capabilities then you don't need the other changes because mount
> works just fine.
True; for the root-user case you can build an equivalent jail as long as you
* remove CAP_SYS_RAWIO
* remove CAP_SYS_ADMIN to prevent mount/umount
* make the root somehow immutable (make mountpoint r/o; ext2 attribute;
whatever)
I probably shouldn't have even mentioned the jail-root case in my description;
my only point is that relinquish_fs() is at least as good as the currently
used chroot("/var/empty") in every way. It's also stronger in some
ways. Obviously a rogue process running with a full CAP_SYS_* set can
do all kinds of bad things even inside a jail (up to and including a
reboot)
The real point of the patch is to add a method giving this same power to
unprivileged processes, functionality which is currently lacking but
useful. The existing namespace support makes it almost trivial to provide.
> Yes I realise that but you also need to realise that glibc has a lot of
> supporting baggage it likes to find if its going to function sanely.
And how is this different than people using chroot() currently? The people
doing privsep designs understand the limits of libc and can work within them.
> > > Imagine a jpeg decoder using an SELinux policy.
> >
> > SELinux is great but it's a pretty orthoginal issue. There's no reason the
> > two can't coexist.
>
> I don't see it as orthogonal except for the "done by user" aspect.
Yes and that's the point. relinquish_fs() is a tool for defensive
programming. SELinux is a tool for sysadmins and distribution vendors
to enforce policy. Ideally they would both be used -- defense in depth
is a good thing.
> BTW: you also have to deal with fchdir() and potentially AF_UNIX
> sockets.
Is AF_UNIX in a separate namespace? My understanding (from reading
unix_find_other()) is that unless you can create a UNIX socket in your
filesystem you're going to have trouble creating new UNIX sockets.
I think some *NIXes worked differently but linux seems sane in this
regard. So you'd have to trick your privsep-peer outside the jail to
send you a directory fd over a unix-domain socket you already had open
before you jailed yourself.
-Mitch
On Tue, 2004-11-30 at 05:27 -0800, Mitchell Blank Jr wrote:
> Alan Cox wrote:
> > With CAP_SYS_RAWIO I can ask the IDE controller to DMA into the kernel
> > as one example.
>
> Can you really do that on normal file descriptors? Weird. I'd have thought
> you'd need to open /dev/hd* to do that.
inb/outb after iopl.
> Is AF_UNIX in a separate namespace? My understanding (from reading
> unix_find_other()) is that unless you can create a UNIX socket in your
> filesystem you're going to have trouble creating new UNIX sockets.
iirc there are anonymous unix sockets...
Helge Hafting wrote:
> So someone finds a way to break into the jailed process.
> This is used to feed some hairy exploit to the unjailed
> process that expects "clean" data from the jailed process.
OK, so by your logic firewalls are useless (since you can just exploit the
firewall and then the host). Oh, and you might as well run all your daemons
as root (since an attacker can just use some other exploit to gain root
later).
The entire point of a privsep design is that the unjailed process treats
all data from the jailed process as untrusted. If the attacker gains
full control of the jailed process it should still be a challenge to
trick the unjailed process into doing something harmful.
> Same as before, only now they need a two-stage exploit.
Exactly! Now in order for the attacker to win they need to find a
programming error in the jailed process AND the unjailed process.
This is how defense in depth works.
> You can jail a process doing a "dangerous" job, but you can't
> really jail a "hairy" data stream. Not if data is expected to
> emerge from the jail someday.
That's a bogus argument - the data exiting the jail can come out in a
non-hairy format that is less error-prone to process. You do all the
complicated crypto/compression/parsing/etc inside the jail where
programming errors cause less damage.
Think about a client communicating with random network hosts over an
SSL connection - you can move the SSL engine into the jailed process.
Now if your SSL's ASN.1 parser has a buffer overflow and you connect to
a malicious server it can't immediately compromise your account.
-Mitch
Arjan van de Ven wrote:
> > Can you really do that on normal file descriptors? Weird. I'd have thought
> > you'd need to open /dev/hd* to do that.
>
> inb/outb after iopl.
That was already discussed earlier in the thread.
> > Is AF_UNIX in a separate namespace? My understanding (from reading
> > unix_find_other()) is that unless you can create a UNIX socket in your
> > filesystem you're going to have trouble creating new UNIX sockets.
>
> iirc there are anonymous unix sockets...
Ah, I see now -- the sun_path[0]=='\0' code. I'll have to take a look
at that; probably just need to add a check to prevent jailed processes
from using those sockets (since they're supposed to be in a "null"
namespace) Will investigate later this week.
It looks like this is also a weakness in code that currently uses
chroot("/var/empty") It's not the end of the world since it still
requires a cooperating unjailed process on the same host as the jailed
one to pass in a fd which is quite an obstacle in most scenarios. Still,
it's something that should be protected against.
-Mitch
On Maw, 2004-11-30 at 14:12, Mitchell Blank Jr wrote:
> > iirc there are anonymous unix sockets...
>
> Ah, I see now -- the sun_path[0]=='\0' code. I'll have to take a look
> at that; probably just need to add a check to prevent jailed processes
> from using those sockets (since they're supposed to be in a "null"
> namespace) Will investigate later this week.
You would probably want a "private" AF_UNIX namespace too. The fact it's
a single namespace for "anonymous" AF_UNIX and the \0 trick is used is
really legacy unix compatibility. Having multiple such namespaces is
certainly doable. It's the same problem as the shared memory, semaphore and
message queue objects have because they fall out of the filesystem namespace.
Posix has fixed these but very few apps use the new forms.
>
> It looks like this is also a weakness in code that currently uses
> chroot("/var/empty") It's not the end of the world since it still
> requires a cooperating unjailed process on the same host as the jailed
> one to pass in a fd which is quite an obstacle in most scenarios. Still,
> it's something that should be protected against.
Also you need to look at fchdir(). If I accidentally pass you a file
handle to a directory (or maybe to a file in reiser4 world ?) you can
fchdir() out of the chroot.
Alan