2013-06-17 08:31:15

by Andrei Vagin

[permalink] [raw]
Subject: [PATCH] [RFC] mnt: restrict a number of "struct mnt"

I found that a few processes can eat all host memory and nobody can kill them.
$ mount -t tmpfs xxx /mnt
$ mount --make-shared /mnt
$ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XXXXXX` & done

All these processes are unkillable, because they take i_mutex and wait on
namespace_lock.

...
21715 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.ht6jzO
21716 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.97K4mI
21717 pts/0    R      0:01          \_ mount --bind /mnt /mnt/test.gO2CD9
...

Each of these processes doubles the number of mounts, so at the end we will
have about 2^32 mounts; since struct mnt is 256 bytes, we need
about 1 TB of RAM.

Another problem is that “umount” of a big tree is a very expensive operation
and requires a lot of time.
E.g.:
16411
umount("/tmp/xxx", MNT_DETACH)          = 0 <7.852066> (7.8 sec)
32795
umount("/tmp/xxx", MNT_DETACH)          = 0 <34.485501> ( 34 sec)

For all this time sys_umount holds namespace_sem and vfsmount_lock...

For all these reasons I suggest restricting the number of mounts.
We can probably optimize this code in the future, but for now this restriction
can help.

Cc: Alexander Viro <[email protected]>
Cc: "Eric W. Biederman" <[email protected]>
Cc: "Serge E. Hallyn" <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Rik van Riel <[email protected]>
Signed-off-by: Andrey Vagin <[email protected]>
---
fs/namespace.c | 66 +++++++++++++++++++++++++------------------
include/linux/mnt_namespace.h | 2 ++
kernel/sysctl.c | 8 ++++++
3 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 7b1ca9b..d22e54c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -41,6 +41,9 @@ static struct list_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static struct rw_semaphore namespace_sem;

+unsigned int sysctl_mount_nr __read_mostly = 16384;
+static atomic_t mount_nr = ATOMIC_INIT(0);
+
/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);
@@ -164,43 +167,49 @@ unsigned int mnt_get_count(struct mount *mnt)

static struct mount *alloc_vfsmnt(const char *name)
{
- struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
- if (mnt) {
- int err;
+ struct mount *mnt;
+ int err;

- err = mnt_alloc_id(mnt);
- if (err)
- goto out_free_cache;
+ if (atomic_inc_return(&mount_nr) > sysctl_mount_nr)
+ goto out_dec_mount_nr;

- if (name) {
- mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
- if (!mnt->mnt_devname)
- goto out_free_id;
- }
+ mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
+ if (!mnt)
+ goto out_dec_mount_nr;
+
+ err = mnt_alloc_id(mnt);
+ if (err)
+ goto out_free_cache;
+
+ if (name) {
+ mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
+ if (!mnt->mnt_devname)
+ goto out_free_id;
+ }

#ifdef CONFIG_SMP
- mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
- if (!mnt->mnt_pcp)
- goto out_free_devname;
+ mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
+ if (!mnt->mnt_pcp)
+ goto out_free_devname;

- this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
+ this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
- mnt->mnt_count = 1;
- mnt->mnt_writers = 0;
+ mnt->mnt_count = 1;
+ mnt->mnt_writers = 0;
#endif

- INIT_LIST_HEAD(&mnt->mnt_hash);
- INIT_LIST_HEAD(&mnt->mnt_child);
- INIT_LIST_HEAD(&mnt->mnt_mounts);
- INIT_LIST_HEAD(&mnt->mnt_list);
- INIT_LIST_HEAD(&mnt->mnt_expire);
- INIT_LIST_HEAD(&mnt->mnt_share);
- INIT_LIST_HEAD(&mnt->mnt_slave_list);
- INIT_LIST_HEAD(&mnt->mnt_slave);
+ INIT_LIST_HEAD(&mnt->mnt_hash);
+ INIT_LIST_HEAD(&mnt->mnt_child);
+ INIT_LIST_HEAD(&mnt->mnt_mounts);
+ INIT_LIST_HEAD(&mnt->mnt_list);
+ INIT_LIST_HEAD(&mnt->mnt_expire);
+ INIT_LIST_HEAD(&mnt->mnt_share);
+ INIT_LIST_HEAD(&mnt->mnt_slave_list);
+ INIT_LIST_HEAD(&mnt->mnt_slave);
#ifdef CONFIG_FSNOTIFY
- INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
+ INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
- }
+
return mnt;

#ifdef CONFIG_SMP
@@ -211,6 +220,8 @@ out_free_id:
mnt_free_id(mnt);
out_free_cache:
kmem_cache_free(mnt_cache, mnt);
+out_dec_mount_nr:
+ atomic_dec(&mount_nr);
return NULL;
}

@@ -546,6 +557,7 @@ static void free_vfsmnt(struct mount *mnt)
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
#endif
+ atomic_dec(&mount_nr);
kmem_cache_free(mnt_cache, mnt);
}

diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 12b2ab5..d8e5ec9 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -2,6 +2,8 @@
#define _NAMESPACE_H_
#ifdef __KERNEL__

+extern unsigned int sysctl_mount_nr;
+
struct mnt_namespace;
struct fs_struct;
struct user_namespace;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9edcf45..bebfdd7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,6 +61,7 @@
#include <linux/kmod.h>
#include <linux/capability.h>
#include <linux/binfmts.h>
+#include <linux/mnt_namespace.h>
#include <linux/sched/sysctl.h>

#include <asm/uaccess.h>
@@ -1616,6 +1617,13 @@ static struct ctl_table fs_table[] = {
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
},
+ {
+ .procname = "mount-nr",
+ .data = &sysctl_mount_nr,
+ .maxlen = sizeof(sysctl_mount_nr),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};

--
1.8.1.4
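
For illustration, with this patch applied the limit would presumably show up as
/proc/sys/fs/mount-nr (per the fs_table entry above; default 16384, writable by
root). A rough sketch of inspecting and raising it, not taken from the original
posting:

$ cat /proc/sys/fs/mount-nr
16384
$ sysctl -w fs.mount-nr=65536       # or: echo 65536 > /proc/sys/fs/mount-nr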


2013-06-17 19:58:41

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"

Andrey Vagin <[email protected]> writes:

> I found that a few processes can eat all host memory and nobody can kill them.
> $ mount -t tmpfs xxx /mnt
> $ mount --make-shared /mnt
> $ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XXXXXX` & done
>
> All these processes are unkillable, because they take i_mutex and wait on
> namespace_lock.
>
> ...
> 21715 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.ht6jzO
> 21716 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.97K4mI
> 21717 pts/0    R      0:01          \_ mount --bind /mnt /mnt/test.gO2CD9
> ...
>
> Each of these processes doubles the number of mounts, so at the end we will
> have about 2^32 mounts; since struct mnt is 256 bytes, we need
> about 1 TB of RAM.
>
> Another problem is that “umount” of a big tree is a very expensive operation
> and requires a lot of time.
> E.g.:
> 16411
> umount("/tmp/xxx", MNT_DETACH)          = 0 <7.852066> (7.8 sec)
> 32795
> umount("/tmp/xxx", MNT_DETACH)          = 0 <34.485501> ( 34 sec)
>
> For all this time sys_umount holds namespace_sem and vfsmount_lock...
>
> For all these reasons I suggest restricting the number of mounts.
> We can probably optimize this code in the future, but for now this restriction
> can help.

So for anyone seriously worried about this kind of thing in general we
already have the memory control group, which is quite capable of
limiting this kind of thing, and it limits all memory allocations not
just mount.

Is there some reason we want to go down the path of adding and tuning
static limits all over the kernel? As opposed to streamlining the memory
control group so it is low overhead and everyone that cares can use it?

Should we reach the point where we automatically enable the memory
control group to prevent abuse of the kernel?

Eric


2013-06-17 22:56:18

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"

On Mon, 17 Jun 2013 12:58:00 -0700 [email protected] (Eric W. Biederman) wrote:

> > I found that a few processes can eat all host memory and nobody can kill them.
> > $ mount -t tmpfs xxx /mnt
> > $ mount --make-shared /mnt
> > $ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XXXXXX` & done
> >
> > All these processes are unkillable, because they take i_mutex and wait on
> > namespace_lock.
> >
> > ...
> > 21715 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.ht6jzO
> > 21716 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.97K4mI
> > 21717 pts/0    R      0:01          \_ mount --bind /mnt /mnt/test.gO2CD9
> > ...
> >
> > Each of these processes doubles the number of mounts, so at the end we will
> > have about 2^32 mounts; since struct mnt is 256 bytes, we need
> > about 1 TB of RAM.
> >
> > Another problem is that “umount” of a big tree is a very expensive operation
> > and requires a lot of time.
> > E.g.:
> > 16411
> > umount("/tmp/xxx", MNT_DETACH) __________________= 0 <7.852066> (7.8 sec)
> > 32795
> > umount("/tmp/xxx", MNT_DETACH) __________________= 0 <34.485501> ( 34 sec)
> >
> > For all this time sys_umount holds namespace_sem and vfsmount_lock...
> >
> > For all these reasons I suggest restricting the number of mounts.
> > We can probably optimize this code in the future, but for now this restriction
> > can help.
>
> So for anyone seriously worried about this kind of thing in general we
> already have the memory control group, which is quite capable of
> limiting this kind of thing, and it limits all memory allocations not
> just mount.

What is the exposure here? By what means can a non-CAP_SYS_ADMIN user
run sys_mount() under the namespace system?

IOW, what does the sysadmin have to do to permit this? Is that a
typical thing to do, or did the sysadmin make a mistake?

2013-06-17 22:56:54

by Andrei Vagin

[permalink] [raw]
Subject: Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"

2013/6/17 Eric W. Biederman <[email protected]>:
> Andrey Vagin <[email protected]> writes:
>
>> I found that a few processes can eat all host memory and nobody can kill them.
>> $ mount -t tmpfs xxx /mnt
>> $ mount --make-shared /mnt
>> $ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XXXXXX` & done
>>
>> All these processes are unkillable, because they take i_mutex and wait on
>> namespace_lock.
>>
>> ...
>> 21715 pts/0 D 0:00 \_ mount --bind /mnt /mnt/test.ht6jzO
>> 21716 pts/0 D 0:00 \_ mount --bind /mnt /mnt/test.97K4mI
>> 21717 pts/0 R 0:01 \_ mount --bind /mnt /mnt/test.gO2CD9
>> ...
>>
>> Each of these processes doubles the number of mounts, so at the end we will
>> have about 2^32 mounts; since struct mnt is 256 bytes, we need
>> about 1 TB of RAM.
>>
>> Another problem is that “umount” of a big tree is a very expensive operation
>> and requires a lot of time.
>> E.g.:
>> 16411
>> umount("/tmp/xxx", MNT_DETACH) = 0 <7.852066> (7.8 sec)
>> 32795
>> umount("/tmp/xxx", MNT_DETACH) = 0 <34.485501> ( 34 sec)
>>
>> For all this time sys_umount holds namespace_sem and vfsmount_lock...
>>
>> For all these reasons I suggest restricting the number of mounts.
>> We can probably optimize this code in the future, but for now this restriction
>> can help.
>
> So for anyone seriously worried about this kind of thing in general we
> already have the memory control group, which is quite capable of
> limiting this kind of thing,

> and it limits all memory allocations not just mount.

And that is the problem: we can't limit a particular slab. Let's
imagine a real container with 4 GB of RAM. What kernel memory
limit is reasonable for it? I set up 64 MB (it may not be enough for a real
CT, but it's enough to make the host inaccessible for some minutes).

$ mkdir /sys/fs/cgroup/memory/test
$ echo $((64 << 20)) > /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes
$ unshare -m
$ echo $$ > /sys/fs/cgroup/memory/test/tasks
$ mount --make-rprivate /
$ mount -t tmpfs xxx /mnt
$ mount --make-shared /mnt
$ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt
`mktemp -d /mnt/test.XXXXXX` & done; for i in `seq 30`; do wait;
done'
real 0m23.141s
user 0m0.016s
sys 0m22.881s

While the last script is working, nobody can read /proc/mounts or
mount anything. I don't think that users from other containers will
be glad. This problem is not so significant compared with umounting
of this tree.

$ strace -T umount -l /mnt
umount("/mnt", MNT_DETACH) = 0 <548.898244>
The host is inaccessible; it writes messages about soft lockups to the
kernel log and eats 100% CPU.


>
> Is there some reason we want to go down the path of adding and tuning
> static limits all over the kernel? As opposed to streamlining the memory
> control group so it is low overhead and everyone that cares can use it?

The memory control group doesn't help in this case... I need to look
at this code in more detail; maybe we can limit the depth of nested
mount points.

2013-06-18 06:11:50

by Andrei Vagin

[permalink] [raw]
Subject: Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"

On Mon, Jun 17, 2013 at 03:56:14PM -0700, Andrew Morton wrote:
> On Mon, 17 Jun 2013 12:58:00 -0700 [email protected] (Eric W. Biederman) wrote:
>
> > > I found that a few processes can eat all host memory and nobody can kill them.
> > > $ mount -t tmpfs xxx /mnt
> > > $ mount --make-shared /mnt
> > > $ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XXXXXX` & done
> > >
> > > All these processes are unkillable, because they take i_mutex and wait on
> > > namespace_lock.
> > >
> > > ...
> > > 21715 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.ht6jzO
> > > 21716 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.97K4mI
> > > 21717 pts/0    R      0:01          \_ mount --bind /mnt /mnt/test.gO2CD9
> > > ...
> > >
> > > Each of these processes doubles the number of mounts, so at the end we will
> > > have about 2^32 mounts; since struct mnt is 256 bytes, we need
> > > about 1 TB of RAM.
> > >
> > > Another problem is that “umount” of a big tree is a very expensive operation
> > > and requires a lot of time.
> > > E.g.:
> > > 16411
> > > umount("/tmp/xxx", MNT_DETACH) __________________= 0 <7.852066> (7.8 sec)
> > > 32795
> > > umount("/tmp/xxx", MNT_DETACH) __________________= 0 <34.485501> ( 34 sec)
> > >
> > > For all this time sys_umount holds namespace_sem and vfsmount_lock...
> > >
> > > For all these reasons I suggest restricting the number of mounts.
> > > We can probably optimize this code in the future, but for now this restriction
> > > can help.
> >
> > So for anyone seriously worried about this kind of thing in general we
> > already have the memory control group, which is quite capable of
> > limiting this kind of thing, and it limits all memory allocations not
> > just mount.
>
> What is the exposure here? By what means can a non-CAP_SYS_ADMIN user
> run sys_mount() under the namespace system?
>
> IOW, what does the sysadmin have to do to permit this? Is that a
> typical thing to do, or did the sysadmin make a mistake?

It's a problem for Linux containers, because usually the root user in a
container should have enough rights to mount something (tmpfs,
bind mounts, etc.). Our goal is to make containers completely isolated.

A container is isolated with the help of namespaces. The user namespace
creates a new set of capabilities and users.
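
To make the exposure concrete (my own illustration, not from the original
mail): on a kernel with user namespaces enabled, an unprivileged user needs no
help from the sysadmin. With a recent util-linux, something like the following
gives CAP_SYS_ADMIN inside a new user namespace and lets the shared-mount setup
from the report be built there, assuming the filesystem type is one the kernel
permits in user namespaces (e.g. tmpfs):

$ mkdir /tmp/m                               # as an ordinary user on the host
$ unshare --user --map-root-user --mount     # new user + mount namespace, uid 0 inside
# mount -t tmpfs xxx /tmp/m
# mount --make-shared /tmp/m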



2013-06-19 21:37:51

by Andrei Vagin

[permalink] [raw]
Subject: Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"

On Tue, Jun 18, 2013 at 02:56:51AM +0400, Andrey Wagin wrote:
> 2013/6/17 Eric W. Biederman <[email protected]>:
> > So for anyone seriously worried about this kind of thing in general we
> > already have the memory control group, which is quite capable of
> > limiting this kind of thing,
>
> > and it limits all memory allocations not just mount.
>
> And that is the problem: we can't limit a particular slab. Let's
> imagine a real container with 4 GB of RAM. What kernel memory
> limit is reasonable for it? I set up 64 MB (it may not be enough for a real
> CT, but it's enough to make the host inaccessible for some minutes).
>
> $ mkdir /sys/fs/cgroup/memory/test
> $ echo $((64 << 20)) > /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes
> $ unshare -m
> $ echo $$ > /sys/fs/cgroup/memory/test/tasks
> $ mount --make-rprivate /
> $ mount -t tmpfs xxx /mnt
> $ mount --make-shared /mnt
> $ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt
> `mktemp -d /mnt/test.XXXXXX` & done; for i in `seq 30`; do wait;
> done'
> real 0m23.141s
> user 0m0.016s
> sys 0m22.881s
>
> While the last script is working, nobody can read /proc/mounts or
> mount anything. I don't think that users from other containers will
> be glad. This problem is not so significant compared with umounting
> of this tree.
>
> $ strace -T umount -l /mnt
> umount("/mnt", MNT_DETACH) = 0 <548.898244>
> The host is inaccessible; it writes messages about soft lockups to the
> kernel log and eats 100% CPU.

Eric, do you agree that
* It is a problem
* Currently we don't have a mechanism to prevent this problem
* We need to find a way to prevent this problem

>
>
> >
> > Is there some reason we want to go down the path of adding and tuning
> > static limits all over the kernel? As opposed to streamlining the memory
> > control group so it is low overhead and everyone that cares can use it?
>
> The memory control group doesn't help in this case... I need to look
> at this code in more detail; maybe we can limit the depth of nested
> mount points.

The complexity of the umount algorithm does not depend on the depth of nested
mounts; it depends on the number of mounts, and sometimes the complexity is O(n^2).

For example:

mount -t tmpfs xxx /mnt
mount --make-shared /mnt

mkdir /mnt/tmp
mount -t tmpfs xxx /mnt/tmp
mkdir /mnt/d

for ((i = 0; i < $1; i++)); do
    d=`mktemp -d /mnt/d/xxx.XXXXXX`
    mount --bind /mnt/tmp $d || break
done

mkdir /mnt/tmp/d
for ((i = 0; i < $1; i++)); do
    d=`mktemp -d /mnt/tmp/xxx.XXXXXX`
    mount --bind /mnt/tmp/d $d || break
done

perf data for umount -l /mnt
29.60% dbus-daemon [kernel.kallsyms] [k] __ticket_spin_lock
|
--- __ticket_spin_lock
lg_local_lock
path_init
path_openat
do_filp_open
do_sys_open
SyS_openat
system_call_fastpath
__openat64_nocancel
0x747379732f312d73

20.20% umount [kernel.kallsyms] [k] propagation_next
|
--- propagation_next
|
|--65.35%-- umount_tree
| SyS_umount
| system_call_fastpath
| __umount2
| __libc_start_main
|
--34.65%-- propagate_umount
umount_tree
SyS_umount
system_call_fastpath
__umount2
__libc_start_main

17.81% umount [kernel.kallsyms] [k] __lookup_mnt
|
--- __lookup_mnt
|
|--82.78%-- propagate_umount
| umount_tree
| SyS_umount
| system_call_fastpath
| __umount2
| __libc_start_main
|
--17.22%-- umount_tree
SyS_umount
system_call_fastpath
__umount2
__libc_start_main
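
A profile of this shape can be collected while the umount runs with something
along these lines (my own invocation, not part of the original report;
system-wide mode is what also catches victims such as dbus-daemon spinning in
path_init):

$ perf record -a -g -- umount -l /mnt
$ perf report --stdio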

2013-06-21 01:05:08

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"

Andrey Wagin <[email protected]> writes:

> On Tue, Jun 18, 2013 at 02:56:51AM +0400, Andrey Wagin wrote:
>> 2013/6/17 Eric W. Biederman <[email protected]>:
>> > So for anyone seriously worried about this kind of thing in general we
>> > already have the memory control group, which is quite capable of
>> > limiting this kind of thing,
>>
>> > and it limits all memory allocations not just mount.
>>
>> And that is the problem: we can't limit a particular slab. Let's
>> imagine a real container with 4 GB of RAM. What kernel memory
>> limit is reasonable for it? I set up 64 MB (it may not be enough for a real
>> CT, but it's enough to make the host inaccessible for some minutes).
>>
>> $ mkdir /sys/fs/cgroup/memory/test
>> $ echo $((64 << 20)) > /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes
>> $ unshare -m
>> $ echo $$ > /sys/fs/cgroup/memory/test/tasks
>> $ mount --make-rprivate /
>> $ mount -t tmpfs xxx /mnt
>> $ mount --make-shared /mnt
>> $ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt
>> `mktemp -d /mnt/test.XXXXXX` & done; for i in `seq 30`; do wait;
>> done'
>> real 0m23.141s
>> user 0m0.016s
>> sys 0m22.881s
>>
>> While the last script is working, nobody can read /proc/mounts or
>> mount anything. I don't think that users from other containers will
>> be glad. This problem is not so significant compared with umounting
>> of this tree.
>>
>> $ strace -T umount -l /mnt
>> umount("/mnt", MNT_DETACH) = 0 <548.898244>
>> The host is inaccessible; it writes messages about soft lockups to the
>> kernel log and eats 100% CPU.
>
> Eric, do you agree that
> * It is a problem
> * Currently we don't have a mechanism to prevent this problem
> * We need to find a way to prevent this problem

Ugh. I knew mount propagation was annoying semantically but I had not
realized the implementation was quite so bad.

This doesn't happen in normal operation to normal folks. So I don't
think this is something we need to rush in a fix at the last moment to
prevent the entire world from melting down. Even people using mount
namespaces in containers.

I do think it is worth looking at. Which kernel were you testing? I
haven't gotten as far as looking too closely, but I just noticed that Al
Viro has been busy rewriting the locking of this. So if you aren't testing
at least 3.10-rcX you probably need to retest.

My thoughts would be: improve the locking as much as possible,
and if that is not enough, keep a measure of how many mounts will be
affected, at least for the umount, possibly for the umount -l case.
Then just don't allow the complexity to exceed some limit so we know
things will happen in a timely manner.

Eric