LinuxLists.cc - [patch 07/10] unprivileged mounts: add sysctl tunable for "safe" property

2008-02-05 21:41:42

Subject: [patch 07/10] unprivileged mounts: add sysctl tunable for "safe" property

From: Miklos Szeredi <[email protected]>

Add the following:

/proc/sys/fs/types/${FS_TYPE}/usermount_safe

Signed-off-by: Miklos Szeredi <[email protected]>
---

Index: linux/fs/filesystems.c
===================================================================
--- linux.orig/fs/filesystems.c 2008-02-04 23:47:46.000000000 +0100
+++ linux/fs/filesystems.c 2008-02-04 23:48:04.000000000 +0100
@@ -12,6 +12,7 @@
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/sysctl.h>
#include <asm/uaccess.h>

/*
@@ -51,6 +52,57 @@ static struct file_system_type **find_fi
return p;
}

+#define MAX_FILESYSTEM_VARS 1
+
+struct filesystem_sysctl_table {
+ struct ctl_table_header *header;
+ struct ctl_table table[MAX_FILESYSTEM_VARS + 1];
+};
+
+/*
+ * Create /sys/fs/types/${FSNAME} directory with per fs-type tunables.
+ */
+static int filesystem_sysctl_register(struct file_system_type *fs)
+{
+ struct filesystem_sysctl_table *t;
+ struct ctl_path path[] = {
+ { .procname = "fs", .ctl_name = CTL_FS },
+ { .procname = "types", .ctl_name = CTL_UNNUMBERED },
+ { .procname = fs->name, .ctl_name = CTL_UNNUMBERED },
+ { }
+ };
+
+ t = kzalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return -ENOMEM;
+
+
+ t->table[0].ctl_name = CTL_UNNUMBERED;
+ t->table[0].procname = "usermount_safe";
+ t->table[0].maxlen = sizeof(int);
+ t->table[0].data = &fs->fs_safe;
+ t->table[0].mode = 0644;
+ t->table[0].proc_handler = &proc_dointvec;
+
+ t->header = register_sysctl_paths(path, t->table);
+ if (!t->header) {
+ kfree(t);
+ return -ENOMEM;
+ }
+
+ fs->sysctl_table = t;
+
+ return 0;
+}
+
+static void filesystem_sysctl_unregister(struct file_system_type *fs)
+{
+ struct filesystem_sysctl_table *t = fs->sysctl_table;
+
+ unregister_sysctl_table(t->header);
+ kfree(t);
+}
+
/**
* register_filesystem - register a new filesystem
* @fs: the file system structure
@@ -80,6 +132,13 @@ int register_filesystem(struct file_syst
else
*p = fs;
write_unlock(&file_systems_lock);
+
+ if (res == 0) {
+ res = filesystem_sysctl_register(fs);
+ if (res != 0)
+ unregister_filesystem(fs);
+ }
+
return res;
}

@@ -108,6 +167,7 @@ int unregister_filesystem(struct file_sy
*tmp = fs->next;
fs->next = NULL;
write_unlock(&file_systems_lock);
+ filesystem_sysctl_unregister(fs);
return 0;
}
tmp = &(*tmp)->next;
Index: linux/include/linux/fs.h
===================================================================
--- linux.orig/include/linux/fs.h 2008-02-04 23:48:02.000000000 +0100
+++ linux/include/linux/fs.h 2008-02-04 23:48:04.000000000 +0100
@@ -1444,6 +1444,7 @@ struct file_system_type {
struct module *owner;
struct file_system_type * next;
struct list_head fs_supers;
+ struct filesystem_sysctl_table *sysctl_table;

struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
Index: linux/Documentation/filesystems/proc.txt
===================================================================
--- linux.orig/Documentation/filesystems/proc.txt 2008-02-04 23:47:58.000000000 +0100
+++ linux/Documentation/filesystems/proc.txt 2008-02-04 23:48:04.000000000 +0100
@@ -44,6 +44,7 @@ Table of Contents
2.14 /proc/<pid>/io - Display the IO accounting fields
2.15 /proc/<pid>/coredump_filter - Core dump filtering settings
2.16 /proc/<pid>/mountinfo - Information about mounts
+ 2.17 /proc/sys/fs/types - File system type specific parameters

------------------------------------------------------------------------------
Preface
@@ -2392,4 +2393,34 @@ For more information see:
Documentation/filesystems/sharedsubtree.txt

+2.17 /proc/sys/fs/types/ - File system type specific parameters
+----------------------------------------------------------------
+
+There's a separate directory /proc/sys/fs/types/<type>/ for each
+filesystem type, containing the following files:
+
+usermount_safe
+--------------
+
+Setting this to non-zero will allow filesystems of this type to be
+mounted by unprivileged users (note, that there are other
+prerequisites as well).
+
+Fuse has been designed to be as safe as possible, and some
+distributions already ship with unprivileged fuse mounts enabled by
+default. There are still some situations (multi-user systems with
+untrusted users in particular), where enabling this for fuse might not
+be appropriate. For more details, see Documentation/filesystems/fuse.txt
+
+Procfs is also safe, but unprivileged mounting of it is not usually
+necessary (bind mounting is equivalent).
+
+Most other filesystems are unsafe. Here are just some of the issues,
+that must be resolved before a filesystem can be declared safe:
+
+ - no strict input checking (buffer overruns, directory loops, etc)
+ - network filesystem deadlocks when mounting from localhost
+ - no permission checking when opening the device
+ - changing mount options when mounting a new instance of a filesystem
+
------------------------------------------------------------------------------

--

2008-02-06 20:21:23

by Serge E. Hallyn

[permalink] [raw]

Subject: Re: [patch 07/10] unprivileged mounts: add sysctl tunable for "safe" property

Quoting Miklos Szeredi ([email protected]):
> From: Miklos Szeredi <[email protected]>
>
> Add the following:
>
> /proc/sys/fs/types/${FS_TYPE}/usermount_safe
>
> Signed-off-by: Miklos Szeredi <[email protected]>

Thanks, Miklos, good explanations in the docs.

Acked-by: Serge Hallyn <[email protected]>

One comment inline, but not imo your problem :)

> ---
>
> Index: linux/fs/filesystems.c
> ===================================================================
> --- linux.orig/fs/filesystems.c 2008-02-04 23:47:46.000000000 +0100
> +++ linux/fs/filesystems.c 2008-02-04 23:48:04.000000000 +0100
> @@ -12,6 +12,7 @@
> #include <linux/kmod.h>
> #include <linux/init.h>
> #include <linux/module.h>
> +#include <linux/sysctl.h>
> #include <asm/uaccess.h>
>
> /*
> @@ -51,6 +52,57 @@ static struct file_system_type **find_fi
> return p;
> }
>
> +#define MAX_FILESYSTEM_VARS 1
> +
> +struct filesystem_sysctl_table {
> + struct ctl_table_header *header;
> + struct ctl_table table[MAX_FILESYSTEM_VARS + 1];
> +};
> +
> +/*
> + * Create /sys/fs/types/${FSNAME} directory with per fs-type tunables.
> + */
> +static int filesystem_sysctl_register(struct file_system_type *fs)
> +{
> + struct filesystem_sysctl_table *t;
> + struct ctl_path path[] = {
> + { .procname = "fs", .ctl_name = CTL_FS },
> + { .procname = "types", .ctl_name = CTL_UNNUMBERED },
> + { .procname = fs->name, .ctl_name = CTL_UNNUMBERED },
> + { }
> + };
> +
> + t = kzalloc(sizeof(*t), GFP_KERNEL);
> + if (!t)
> + return -ENOMEM;
> +
> +
> + t->table[0].ctl_name = CTL_UNNUMBERED;
> + t->table[0].procname = "usermount_safe";
> + t->table[0].maxlen = sizeof(int);
> + t->table[0].data = &fs->fs_safe;
> + t->table[0].mode = 0644;

Yikes, this could be a problem for containers, as it's simply tied to
uid 0, whereas tying it to a capability would let us solve it with
capability bounds.

This might mean more urgency to get user namespaces working at least
with sysfs, else this is a quick way around having CAP_SYS_ADMIN taken
out of a container's capability bounding set.

> + t->table[0].proc_handler = &proc_dointvec;
> +
> + t->header = register_sysctl_paths(path, t->table);
> + if (!t->header) {
> + kfree(t);
> + return -ENOMEM;
> + }
> +
> + fs->sysctl_table = t;
> +
> + return 0;
> +}
> +
> +static void filesystem_sysctl_unregister(struct file_system_type *fs)
> +{
> + struct filesystem_sysctl_table *t = fs->sysctl_table;
> +
> + unregister_sysctl_table(t->header);
> + kfree(t);
> +}
> +
> /**
> * register_filesystem - register a new filesystem
> * @fs: the file system structure
> @@ -80,6 +132,13 @@ int register_filesystem(struct file_syst
> else
> *p = fs;
> write_unlock(&file_systems_lock);
> +
> + if (res == 0) {
> + res = filesystem_sysctl_register(fs);
> + if (res != 0)
> + unregister_filesystem(fs);
> + }
> +
> return res;
> }
>
> @@ -108,6 +167,7 @@ int unregister_filesystem(struct file_sy
> *tmp = fs->next;
> fs->next = NULL;
> write_unlock(&file_systems_lock);
> + filesystem_sysctl_unregister(fs);
> return 0;
> }
> tmp = &(*tmp)->next;
> Index: linux/include/linux/fs.h
> ===================================================================
> --- linux.orig/include/linux/fs.h 2008-02-04 23:48:02.000000000 +0100
> +++ linux/include/linux/fs.h 2008-02-04 23:48:04.000000000 +0100
> @@ -1444,6 +1444,7 @@ struct file_system_type {
> struct module *owner;
> struct file_system_type * next;
> struct list_head fs_supers;
> + struct filesystem_sysctl_table *sysctl_table;
>
> struct lock_class_key s_lock_key;
> struct lock_class_key s_umount_key;
> Index: linux/Documentation/filesystems/proc.txt
> ===================================================================
> --- linux.orig/Documentation/filesystems/proc.txt 2008-02-04 23:47:58.000000000 +0100
> +++ linux/Documentation/filesystems/proc.txt 2008-02-04 23:48:04.000000000 +0100
> @@ -44,6 +44,7 @@ Table of Contents
> 2.14 /proc/<pid>/io - Display the IO accounting fields
> 2.15 /proc/<pid>/coredump_filter - Core dump filtering settings
> 2.16 /proc/<pid>/mountinfo - Information about mounts
> + 2.17 /proc/sys/fs/types - File system type specific parameters
>
> ------------------------------------------------------------------------------
> Preface
> @@ -2392,4 +2393,34 @@ For more information see:
> Documentation/filesystems/sharedsubtree.txt
>
>
> +2.17 /proc/sys/fs/types/ - File system type specific parameters
> +----------------------------------------------------------------
> +
> +There's a separate directory /proc/sys/fs/types/<type>/ for each
> +filesystem type, containing the following files:
> +
> +usermount_safe
> +--------------
> +
> +Setting this to non-zero will allow filesystems of this type to be
> +mounted by unprivileged users (note, that there are other
> +prerequisites as well).
> +
> +Fuse has been designed to be as safe as possible, and some
> +distributions already ship with unprivileged fuse mounts enabled by
> +default. There are still some situations (multi-user systems with
> +untrusted users in particular), where enabling this for fuse might not
> +be appropriate. For more details, see Documentation/filesystems/fuse.txt
> +
> +Procfs is also safe, but unprivileged mounting of it is not usually
> +necessary (bind mounting is equivalent).
> +
> +Most other filesystems are unsafe. Here are just some of the issues,
> +that must be resolved before a filesystem can be declared safe:
> +
> + - no strict input checking (buffer overruns, directory loops, etc)
> + - network filesystem deadlocks when mounting from localhost
> + - no permission checking when opening the device
> + - changing mount options when mounting a new instance of a filesystem
> +
> ------------------------------------------------------------------------------
>
> --

2008-02-06 21:11:35

by Miklos Szeredi

[permalink] [raw]

Subject: Re: [patch 07/10] unprivileged mounts: add sysctl tunable for "safe" property

> > + t->table[0].mode = 0644;
>
> Yikes, this could be a problem for containers, as it's simply tied to
> uid 0, whereas tying it to a capability would let us solve it with
> capability bounds.
>
> This might mean more urgency to get user namespaces working at least
> with sysfs, else this is a quick way around having CAP_SYS_ADMIN taken
> out of a container's capability bounding set.

I think I understand the problem, but not the solution. How do user
namespaces going to help?

Maybe sysctls just need to check capabilities, instead of uids. I
think that would make a lot of sense anyway.

Thanks,
Miklos

2008-02-06 22:45:42

by Serge E. Hallyn

[permalink] [raw]

Subject: Re: [patch 07/10] unprivileged mounts: add sysctl tunable for "safe" property

Quoting Miklos Szeredi ([email protected]):
> > > + t->table[0].mode = 0644;
> >
> > Yikes, this could be a problem for containers, as it's simply tied to
> > uid 0, whereas tying it to a capability would let us solve it with
> > capability bounds.
> >
> > This might mean more urgency to get user namespaces working at least
> > with sysfs, else this is a quick way around having CAP_SYS_ADMIN taken
> > out of a container's capability bounding set.
>
> I think I understand the problem, but not the solution. How do user
> namespaces going to help?

Well it somewhat depends on how we implement userns for filesystems
in the first place, and whether we end up splitting sysfs into
sub-filesystems as I think Eric Biederman has been advocating. My
thoughts had been running along the lines of just tagging vfsmounts
with userns of the mounting process. A task from outside the mounting
process' namespace would get user other permissions whether or not
its uid was the owning uid or uid 0 (unless the task had CAP_NS_OVERRIDE).

But really it gets more complicated for sysfs than something like ext2
since we really want to be able to filter files and directories for
different namespaces... Handling sysfs user namespaces before we sort
out the rest of the sysfs stuff (being hashed out with network
namespaces) seems like jumping the gun a bit.

> Maybe sysctls just need to check capabilities, instead of uids. I
> think that would make a lot of sense anyway.

Would it be as simple as tagging the inodes with capability sets? One
set for writing, or one each for reading and writing?

thanks,
-serge

2008-02-07 08:10:26

by Miklos Szeredi

[permalink] [raw]

Subject: Re: [patch 07/10] unprivileged mounts: add sysctl tunable for "safe" property

> > Maybe sysctls just need to check capabilities, instead of uids. I
> > think that would make a lot of sense anyway.
>
> Would it be as simple as tagging the inodes with capability sets? One
> set for writing, or one each for reading and writing?

Yes, or something even simpler, like mapping the owner permission bits
to CAP_SYS_ADMIN. There seem to be very few different permissions
under /proc/sys:

--w-------
-r--r--r--
-rw-------
-rw-r--r--

As long as the group and other bits are always the same, and we accept
that the owner bits really mean CAP_SYS_ADMIN and not something else,
then the permission check would not need to look at uids or gids at
all.

Miklos

2008-02-07 14:05:47

by Serge E. Hallyn

[permalink] [raw]

Subject: Re: [patch 07/10] unprivileged mounts: add sysctl tunable for "safe" property

Quoting Miklos Szeredi ([email protected]):
> > > Maybe sysctls just need to check capabilities, instead of uids. I
> > > think that would make a lot of sense anyway.
> >
> > Would it be as simple as tagging the inodes with capability sets? One
> > set for writing, or one each for reading and writing?
>
> Yes, or something even simpler, like mapping the owner permission bits
> to CAP_SYS_ADMIN. There seem to be very few different permissions
> under /proc/sys:
>
> --w-------
> -r--r--r--
> -rw-------
> -rw-r--r--
>
> As long as the group and other bits are always the same, and we accept
> that the owner bits really mean CAP_SYS_ADMIN and not something else,

But I would assume some things under /proc/sys/net/ipv4 or
/proc/sys/net/ath0 require CAP_NET_ADMIN rather than CAP_SYS_ADMIN?

> then the permission check would not need to look at uids or gids at
> all.
>
> Miklos
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2008-02-07 14:36:29

by Miklos Szeredi

[permalink] [raw]

Subject: Re: [patch 07/10] unprivileged mounts: add sysctl tunable for "safe" property

> > > > Maybe sysctls just need to check capabilities, instead of uids. I
> > > > think that would make a lot of sense anyway.
> > >
> > > Would it be as simple as tagging the inodes with capability sets? One
> > > set for writing, or one each for reading and writing?
> >
> > Yes, or something even simpler, like mapping the owner permission bits
> > to CAP_SYS_ADMIN. There seem to be very few different permissions
> > under /proc/sys:
> >
> > --w-------
> > -r--r--r--
> > -rw-------
> > -rw-r--r--
> >
> > As long as the group and other bits are always the same, and we accept
> > that the owner bits really mean CAP_SYS_ADMIN and not something else,
>
> But I would assume some things under /proc/sys/net/ipv4 or
> /proc/sys/net/ath0 require CAP_NET_ADMIN rather than CAP_SYS_ADMIN?

I guess so. I'm not very familiar with the different capabilities :)

How about this patch then: a hybrid solution between just relying on
permission bits, and specifying separate capability sets for read and
write in addition to the permission bits.

Untested, the 'cap' field obviously still needs to be filled in where
appropriate.

Miklos
----

Index: linux/include/linux/sysctl.h
===================================================================
--- linux.orig/include/linux/sysctl.h 2008-02-04 12:29:01.000000000 +0100
+++ linux/include/linux/sysctl.h 2008-02-07 15:19:06.000000000 +0100
@@ -1041,6 +1041,7 @@ struct ctl_table
void *data;
int maxlen;
mode_t mode;
+ int cap; /* Capability needed to read/write */
struct ctl_table *child;
struct ctl_table *parent; /* Automatically set */
proc_handler *proc_handler; /* Callback for text formatting */
Index: linux/kernel/sysctl.c
===================================================================
--- linux.orig/kernel/sysctl.c 2008-02-05 22:17:05.000000000 +0100
+++ linux/kernel/sysctl.c 2008-02-07 15:30:45.000000000 +0100
@@ -1527,14 +1527,26 @@ out:
* some sysctl variables are readonly even to root.
*/

-static int test_perm(int mode, int op)
+static int test_perm(struct ctl_table *table, int op)
{
- if (!current->euid)
- mode >>= 6;
- else if (in_egroup_p(0))
- mode >>= 3;
+ int cap = table->cap;
+ mode_t mode = table->mode;
+
+ if (!cap)
+ cap = CAP_SYS_ADMIN;
+
+ if ((op & MAY_READ) && !(mode & S_IRUGO))
+ return -EACCES;
+
+ if ((op & MAY_WRITE) && !(mode & S_IWUGO))
+ return -EACCES;
+
+ if (capable(cap))
+ return 0;
+
if ((mode & op & 0007) == op)
return 0;
+
return -EACCES;
}

@@ -1544,7 +1556,7 @@ int sysctl_perm(struct ctl_table *table,
error = security_sysctl(table, op);
if (error)
return error;
- return test_perm(table->mode, op);
+ return test_perm(table, op);
}

#ifdef CONFIG_SYSCTL_SYSCALL

2008-02-07 15:33:33

by Aneesh Kumar K.V

[permalink] [raw]

Subject: Re: [patch 07/10] unprivileged mounts: add sysctl tunable for "safe" property

On Tue, Feb 05, 2008 at 10:36:23PM +0100, Miklos Szeredi wrote:
> From: Miklos Szeredi <[email protected]>
>
> Add the following:
>
> /proc/sys/fs/types/${FS_TYPE}/usermount_safe
>

There is /proc/fs/<type>/ already. Since it is file system specific
shouldn't it go there ?

-aneesh

2008-02-07 16:24:59

by Miklos Szeredi

[permalink] [raw]

Subject: Re: [patch 07/10] unprivileged mounts: add sysctl tunable for "safe" property

> On Tue, Feb 05, 2008 at 10:36:23PM +0100, Miklos Szeredi wrote:
> > From: Miklos Szeredi <[email protected]>
> >
> > Add the following:
> >
> > /proc/sys/fs/types/${FS_TYPE}/usermount_safe
> >
>
>
> There is /proc/fs/<type>/ already. Since it is file system specific
> shouldn't it go there ?

The problem is exactly that it's filesystem specific, each filesystem
creates it's own stuff there.

Also we have a rule tp not create new things under /proc. Things
should either go into /sys or into /proc/sys. And I think a sysctl is
more appropriate for this than something under /sys.

Thanks,
Miklos

2008-02-07 16:58:20

by Serge E. Hallyn

[permalink] [raw]

Subject: Re: [patch 07/10] unprivileged mounts: add sysctl tunable for "safe" property

Quoting Miklos Szeredi ([email protected]):
> > > > > Maybe sysctls just need to check capabilities, instead of uids. I
> > > > > think that would make a lot of sense anyway.
> > > >
> > > > Would it be as simple as tagging the inodes with capability sets? One
> > > > set for writing, or one each for reading and writing?
> > >
> > > Yes, or something even simpler, like mapping the owner permission bits
> > > to CAP_SYS_ADMIN. There seem to be very few different permissions
> > > under /proc/sys:
> > >
> > > --w-------
> > > -r--r--r--
> > > -rw-------
> > > -rw-r--r--
> > >
> > > As long as the group and other bits are always the same, and we accept
> > > that the owner bits really mean CAP_SYS_ADMIN and not something else,
> >
> > But I would assume some things under /proc/sys/net/ipv4 or
> > /proc/sys/net/ath0 require CAP_NET_ADMIN rather than CAP_SYS_ADMIN?
>
> I guess so. I'm not very familiar with the different capabilities :)
>
> How about this patch then: a hybrid solution between just relying on
> permission bits, and specifying separate capability sets for read and
> write in addition to the permission bits.
>
> Untested, the 'cap' field obviously still needs to be filled in where
> appropriate.
>
> Miklos
> ----
>
> Index: linux/include/linux/sysctl.h
> ===================================================================
> --- linux.orig/include/linux/sysctl.h 2008-02-04 12:29:01.000000000 +0100
> +++ linux/include/linux/sysctl.h 2008-02-07 15:19:06.000000000 +0100
> @@ -1041,6 +1041,7 @@ struct ctl_table
> void *data;
> int maxlen;
> mode_t mode;
> + int cap; /* Capability needed to read/write */
> struct ctl_table *child;
> struct ctl_table *parent; /* Automatically set */
> proc_handler *proc_handler; /* Callback for text formatting */
> Index: linux/kernel/sysctl.c
> ===================================================================
> --- linux.orig/kernel/sysctl.c 2008-02-05 22:17:05.000000000 +0100
> +++ linux/kernel/sysctl.c 2008-02-07 15:30:45.000000000 +0100
> @@ -1527,14 +1527,26 @@ out:
> * some sysctl variables are readonly even to root.
> */
>
> -static int test_perm(int mode, int op)
> +static int test_perm(struct ctl_table *table, int op)
> {
> - if (!current->euid)
> - mode >>= 6;
> - else if (in_egroup_p(0))
> - mode >>= 3;
> + int cap = table->cap;
> + mode_t mode = table->mode;
> +
> + if (!cap)
> + cap = CAP_SYS_ADMIN;
> +
> + if ((op & MAY_READ) && !(mode & S_IRUGO))
> + return -EACCES;
> +
> + if ((op & MAY_WRITE) && !(mode & S_IWUGO))
> + return -EACCES;
> +
> + if (capable(cap))
> + return 0;
> +
> if ((mode & op & 0007) == op)
> return 0;
> +
> return -EACCES;

I like how simple it appears to be :)

At first I missed the fact that owning uid is always 0 so I thought the
uid processing wasn't quite enough. But since it's always 0, the only
question is whether there are any /proc/sys files whose users currently
depend on being setgid 0 and setgid non-0 with no capabilities.

On my laptop, 'find /proc/sys -type f -perm -020' gives me no results,
so that is promising.

So this certainly seems like a good first step. In fact, combined with
/proc/sys/ being partially remounted per container like /proc/sys/net is
doing, we may not even need to do anything with CAP_NS_OVERRIDE.

thanks,
-serge

> }
>
> @@ -1544,7 +1556,7 @@ int sysctl_perm(struct ctl_table *table,
> error = security_sysctl(table, op);
> if (error)
> return error;
> - return test_perm(table->mode, op);
> + return test_perm(table, op);
> }
>
> #ifdef CONFIG_SYSCTL_SYSCALL