One practical problem I've found with cgroup namespaces is that there
is no way to disambiguate between a cgroupfs mount which was done in
a cgroup namespace, and a bind mount of a cgroupfs directory. So
whether I do
unshare --cgroup -- bash -c "mount -t cgroup -o freezer f /mnt; cat /proc/self/mountinfo"
or whether I just
mount --bind /sys/fs/cgroup/freezer/$(awk -F: '/freezer/ { print $3 }' /proc/self/cgroup) /mnt
'mount root' field (field 3) in /proc/self/mountinfo will show the
same thing, the result of awk -F: '/freezer/ { print $3 }' /proc/self/cgroup.
This patch adds a 'nsroot=' field to cgroup mountinfo entries, so that
userspace can distinguish a mount made in a cgroup namespace from a bind
mount from a cgroup subdirectory.
Signed-off-by: Serge Hallyn <[email protected]>
---
fs/kernfs/mount.c | 2 +-
include/linux/kernfs.h | 3 ++-
kernel/cgroup.c | 29 ++++++++++++++++++++++++++++-
3 files changed, 31 insertions(+), 3 deletions(-)
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index b67dbcc..58f59fd 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -36,7 +36,7 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
struct kernfs_syscall_ops *scops = root->syscall_ops;
if (scops && scops->show_options)
- return scops->show_options(sf, root);
+ return scops->show_options(sf, dentry, root);
return 0;
}
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index c06c442..3124b91 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -145,7 +145,8 @@ struct kernfs_node {
*/
struct kernfs_syscall_ops {
int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
- int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
+ int (*show_options)(struct seq_file *sf, struct dentry *dentry,
+ struct kernfs_root *root);
int (*mkdir)(struct kernfs_node *parent, const char *name,
umode_t mode);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 671dc05..806d1e7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1593,7 +1593,32 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
return 0;
}
-static int cgroup_show_options(struct seq_file *seq,
+static void cgroup_show_nsroot(struct seq_file *seq, struct dentry *dentry,
+ struct kernfs_root *kf_root)
+{
+ struct kernfs_node *d_kn = dentry->d_fsdata;
+ char *nsroot;
+ int len, ret;
+
+ if (!kf_root)
+ return;
+ len = kernfs_path_from_node(d_kn, kf_root->kn, NULL, 0);
+ if (len <= 0)
+ return;
+ nsroot = kzalloc(len + 1, GFP_ATOMIC);
+ if (!nsroot)
+ return;
+ ret = kernfs_path_from_node(d_kn, kf_root->kn, nsroot, len + 1);
+ if (ret <= 0 || ret > len)
+ goto out;
+
+ seq_show_option(seq, "nsroot", nsroot);
+
+out:
+ kfree(nsroot);
+}
+
+static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry,
struct kernfs_root *kf_root)
{
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
@@ -1619,6 +1644,8 @@ static int cgroup_show_options(struct seq_file *seq,
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_show_option(seq, "name", root->name);
+ cgroup_show_nsroot(seq, dentry, kf_root);
+
return 0;
}
--
2.7.3
Quoting Serge E. Hallyn ([email protected]):
> One practical problem I've found with cgroup namespaces is that there
> is no way to disambiguate between a cgroupfs mount which was done in
> a cgroup namespace, and a bind mount of a cgroupfs directory. So
> whether I do
>
> unshare --cgroup -- bash -c "mount -t cgroup -o freezer f /mnt; cat /proc/self/mountinfo"
>
> or whether I just
>
> mount --bind /sys/fs/cgroup/freezer/$(awk -F: '/freezer/ { print $3 }' /proc/self/cgroup) /mnt
>
> 'mount root' field (field 3) in /proc/self/mountinfo will show the
> same thing, the result of awk -F: '/freezer/ { print $3 }' /proc/self/cgroup.
>
> This patch adds a 'nsroot=' field to cgroup mountinfo entries, so that
> userspace can distinguish a mount made in a cgroup namespace from a bind
> mount from a cgroup subdirectory.
Hi Tejun,
no rush on the patch itself, I don't mind if i have to rewrite it from
scratch, but I'd like to get the patch into docker/libcontainer using it,
so can we decide on whether the syntax as shown here is ok?
thanks,
-serge
> Signed-off-by: Serge Hallyn <[email protected]>
> ---
> fs/kernfs/mount.c | 2 +-
> include/linux/kernfs.h | 3 ++-
> kernel/cgroup.c | 29 ++++++++++++++++++++++++++++-
> 3 files changed, 31 insertions(+), 3 deletions(-)
>
> diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
> index b67dbcc..58f59fd 100644
> --- a/fs/kernfs/mount.c
> +++ b/fs/kernfs/mount.c
> @@ -36,7 +36,7 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
> struct kernfs_syscall_ops *scops = root->syscall_ops;
>
> if (scops && scops->show_options)
> - return scops->show_options(sf, root);
> + return scops->show_options(sf, dentry, root);
> return 0;
> }
>
> diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> index c06c442..3124b91 100644
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -145,7 +145,8 @@ struct kernfs_node {
> */
> struct kernfs_syscall_ops {
> int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
> - int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
> + int (*show_options)(struct seq_file *sf, struct dentry *dentry,
> + struct kernfs_root *root);
>
> int (*mkdir)(struct kernfs_node *parent, const char *name,
> umode_t mode);
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 671dc05..806d1e7 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1593,7 +1593,32 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
> return 0;
> }
>
> -static int cgroup_show_options(struct seq_file *seq,
> +static void cgroup_show_nsroot(struct seq_file *seq, struct dentry *dentry,
> + struct kernfs_root *kf_root)
> +{
> + struct kernfs_node *d_kn = dentry->d_fsdata;
> + char *nsroot;
> + int len, ret;
> +
> + if (!kf_root)
> + return;
> + len = kernfs_path_from_node(d_kn, kf_root->kn, NULL, 0);
> + if (len <= 0)
> + return;
> + nsroot = kzalloc(len + 1, GFP_ATOMIC);
> + if (!nsroot)
> + return;
> + ret = kernfs_path_from_node(d_kn, kf_root->kn, nsroot, len + 1);
> + if (ret <= 0 || ret > len)
> + goto out;
> +
> + seq_show_option(seq, "nsroot", nsroot);
> +
> +out:
> + kfree(nsroot);
> +}
> +
> +static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry,
> struct kernfs_root *kf_root)
> {
> struct cgroup_root *root = cgroup_root_from_kf(kf_root);
> @@ -1619,6 +1644,8 @@ static int cgroup_show_options(struct seq_file *seq,
> seq_puts(seq, ",clone_children");
> if (strlen(root->name))
> seq_show_option(seq, "name", root->name);
> + cgroup_show_nsroot(seq, dentry, kf_root);
> +
> return 0;
> }
>
> --
> 2.7.3
>
Hi Serge,
On Mon, Mar 21, 2016 at 06:41:33PM -0500, Serge E. Hallyn wrote:
> One practical problem I've found with cgroup namespaces is that there
> is no way to disambiguate between a cgroupfs mount which was done in
> a cgroup namespace, and a bind mount of a cgroupfs directory. So
> whether I do
>
> unshare --cgroup -- bash -c "mount -t cgroup -o freezer f /mnt; cat /proc/self/mountinfo"
>
> or whether I just
>
> mount --bind /sys/fs/cgroup/freezer/$(awk -F: '/freezer/ { print $3 }' /proc/self/cgroup) /mnt
>
> 'mount root' field (field 3) in /proc/self/mountinfo will show the
> same thing, the result of awk -F: '/freezer/ { print $3 }' /proc/self/cgroup.
>
> This patch adds a 'nsroot=' field to cgroup mountinfo entries, so that
> userspace can distinguish a mount made in a cgroup namespace from a bind
> mount from a cgroup subdirectory.
With this patch, mountinfo shows nsroot= in the mount options, but the
actual mount() call for cgroups doesn't allow nsroot. Would it be
possible to allow passing nsroot= to mount, as long is it does in fact
match the current nsroot?
The motivation for this is that CRIU just copies the mount options and
uses them on restore, so with this patch we have to add a special case
to trim off nsroot= before we restore.
Tycho
> Signed-off-by: Serge Hallyn <[email protected]>
> ---
> fs/kernfs/mount.c | 2 +-
> include/linux/kernfs.h | 3 ++-
> kernel/cgroup.c | 29 ++++++++++++++++++++++++++++-
> 3 files changed, 31 insertions(+), 3 deletions(-)
>
> diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
> index b67dbcc..58f59fd 100644
> --- a/fs/kernfs/mount.c
> +++ b/fs/kernfs/mount.c
> @@ -36,7 +36,7 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
> struct kernfs_syscall_ops *scops = root->syscall_ops;
>
> if (scops && scops->show_options)
> - return scops->show_options(sf, root);
> + return scops->show_options(sf, dentry, root);
> return 0;
> }
>
> diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> index c06c442..3124b91 100644
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -145,7 +145,8 @@ struct kernfs_node {
> */
> struct kernfs_syscall_ops {
> int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
> - int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
> + int (*show_options)(struct seq_file *sf, struct dentry *dentry,
> + struct kernfs_root *root);
>
> int (*mkdir)(struct kernfs_node *parent, const char *name,
> umode_t mode);
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 671dc05..806d1e7 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1593,7 +1593,32 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
> return 0;
> }
>
> -static int cgroup_show_options(struct seq_file *seq,
> +static void cgroup_show_nsroot(struct seq_file *seq, struct dentry *dentry,
> + struct kernfs_root *kf_root)
> +{
> + struct kernfs_node *d_kn = dentry->d_fsdata;
> + char *nsroot;
> + int len, ret;
> +
> + if (!kf_root)
> + return;
> + len = kernfs_path_from_node(d_kn, kf_root->kn, NULL, 0);
> + if (len <= 0)
> + return;
> + nsroot = kzalloc(len + 1, GFP_ATOMIC);
> + if (!nsroot)
> + return;
> + ret = kernfs_path_from_node(d_kn, kf_root->kn, nsroot, len + 1);
> + if (ret <= 0 || ret > len)
> + goto out;
> +
> + seq_show_option(seq, "nsroot", nsroot);
> +
> +out:
> + kfree(nsroot);
> +}
> +
> +static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry,
> struct kernfs_root *kf_root)
> {
> struct cgroup_root *root = cgroup_root_from_kf(kf_root);
> @@ -1619,6 +1644,8 @@ static int cgroup_show_options(struct seq_file *seq,
> seq_puts(seq, ",clone_children");
> if (strlen(root->name))
> seq_show_option(seq, "name", root->name);
> + cgroup_show_nsroot(seq, dentry, kf_root);
> +
> return 0;
> }
>
> --
> 2.7.3
>
Quoting Tycho Andersen ([email protected]):
> Hi Serge,
>
> On Mon, Mar 21, 2016 at 06:41:33PM -0500, Serge E. Hallyn wrote:
> > One practical problem I've found with cgroup namespaces is that there
> > is no way to disambiguate between a cgroupfs mount which was done in
> > a cgroup namespace, and a bind mount of a cgroupfs directory. So
> > whether I do
> >
> > unshare --cgroup -- bash -c "mount -t cgroup -o freezer f /mnt; cat /proc/self/mountinfo"
> >
> > or whether I just
> >
> > mount --bind /sys/fs/cgroup/freezer/$(awk -F: '/freezer/ { print $3 }' /proc/self/cgroup) /mnt
> >
> > 'mount root' field (field 3) in /proc/self/mountinfo will show the
> > same thing, the result of awk -F: '/freezer/ { print $3 }' /proc/self/cgroup.
> >
> > This patch adds a 'nsroot=' field to cgroup mountinfo entries, so that
> > userspace can distinguish a mount made in a cgroup namespace from a bind
> > mount from a cgroup subdirectory.
>
> With this patch, mountinfo shows nsroot= in the mount options, but the
> actual mount() call for cgroups doesn't allow nsroot. Would it be
> possible to allow passing nsroot= to mount, as long is it does in fact
> match the current nsroot?
Yeah, that should be possible. I'll try to send a patch for that later
this week. That's not to say Tejun will be ok with the behavior, but
it seems to make sense to me.
> The motivation for this is that CRIU just copies the mount options and
> uses them on restore, so with this patch we have to add a special case
> to trim off nsroot= before we restore.
>
> Tycho
>
> > Signed-off-by: Serge Hallyn <[email protected]>
> > ---
> > fs/kernfs/mount.c | 2 +-
> > include/linux/kernfs.h | 3 ++-
> > kernel/cgroup.c | 29 ++++++++++++++++++++++++++++-
> > 3 files changed, 31 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
> > index b67dbcc..58f59fd 100644
> > --- a/fs/kernfs/mount.c
> > +++ b/fs/kernfs/mount.c
> > @@ -36,7 +36,7 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
> > struct kernfs_syscall_ops *scops = root->syscall_ops;
> >
> > if (scops && scops->show_options)
> > - return scops->show_options(sf, root);
> > + return scops->show_options(sf, dentry, root);
> > return 0;
> > }
> >
> > diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> > index c06c442..3124b91 100644
> > --- a/include/linux/kernfs.h
> > +++ b/include/linux/kernfs.h
> > @@ -145,7 +145,8 @@ struct kernfs_node {
> > */
> > struct kernfs_syscall_ops {
> > int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
> > - int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
> > + int (*show_options)(struct seq_file *sf, struct dentry *dentry,
> > + struct kernfs_root *root);
> >
> > int (*mkdir)(struct kernfs_node *parent, const char *name,
> > umode_t mode);
> > diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> > index 671dc05..806d1e7 100644
> > --- a/kernel/cgroup.c
> > +++ b/kernel/cgroup.c
> > @@ -1593,7 +1593,32 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
> > return 0;
> > }
> >
> > -static int cgroup_show_options(struct seq_file *seq,
> > +static void cgroup_show_nsroot(struct seq_file *seq, struct dentry *dentry,
> > + struct kernfs_root *kf_root)
> > +{
> > + struct kernfs_node *d_kn = dentry->d_fsdata;
> > + char *nsroot;
> > + int len, ret;
> > +
> > + if (!kf_root)
> > + return;
> > + len = kernfs_path_from_node(d_kn, kf_root->kn, NULL, 0);
> > + if (len <= 0)
> > + return;
> > + nsroot = kzalloc(len + 1, GFP_ATOMIC);
> > + if (!nsroot)
> > + return;
> > + ret = kernfs_path_from_node(d_kn, kf_root->kn, nsroot, len + 1);
> > + if (ret <= 0 || ret > len)
> > + goto out;
> > +
> > + seq_show_option(seq, "nsroot", nsroot);
> > +
> > +out:
> > + kfree(nsroot);
> > +}
> > +
> > +static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry,
> > struct kernfs_root *kf_root)
> > {
> > struct cgroup_root *root = cgroup_root_from_kf(kf_root);
> > @@ -1619,6 +1644,8 @@ static int cgroup_show_options(struct seq_file *seq,
> > seq_puts(seq, ",clone_children");
> > if (strlen(root->name))
> > seq_show_option(seq, "name", root->name);
> > + cgroup_show_nsroot(seq, dentry, kf_root);
> > +
> > return 0;
> > }
> >
> > --
> > 2.7.3
> >
As of the patch "cgroup namespaces: add a 'nsroot=' mountinfo field",
cgroupfs mountinfo output shows 'nsroot='. If userspace like criu
copy/pastes mount options from there into a new mount command, we should
ignore it.
Signed-off-by: Serge Hallyn <[email protected]>
---
kernel/cgroup.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ef0c25d..69fb112 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1680,6 +1680,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
opts->none = true;
continue;
}
+ if (!strncmp(token, "nsroot=", 7)) {
+ /* ignore nsroot= copied from mountinfo */
+ continue;
+ }
if (!strcmp(token, "all")) {
/* Mutually exclusive option 'all' + subsystem name */
if (one_ss)
--
2.7.0
On Wed, Mar 30, 2016 at 12:21:00PM -0500, Serge E. Hallyn wrote:
> As of the patch "cgroup namespaces: add a 'nsroot=' mountinfo field",
> cgroupfs mountinfo output shows 'nsroot='. If userspace like criu
> copy/pastes mount options from there into a new mount command, we should
> ignore it.
>
> Signed-off-by: Serge Hallyn <[email protected]>
Tested-by: Tycho Andersen <[email protected]>
Thanks, Serge.
> ---
> kernel/cgroup.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index ef0c25d..69fb112 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1680,6 +1680,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
> opts->none = true;
> continue;
> }
> + if (!strncmp(token, "nsroot=", 7)) {
> + /* ignore nsroot= copied from mountinfo */
> + continue;
> + }
> if (!strcmp(token, "all")) {
> /* Mutually exclusive option 'all' + subsystem name */
> if (one_ss)
> --
> 2.7.0
>
Hello, Serge.
On Mon, Mar 28, 2016 at 08:12:03PM -0500, Serge E. Hallyn wrote:
> Quoting Serge E. Hallyn ([email protected]):
> > One practical problem I've found with cgroup namespaces is that there
> > is no way to disambiguate between a cgroupfs mount which was done in
> > a cgroup namespace, and a bind mount of a cgroupfs directory. So
> > whether I do
> >
> > unshare --cgroup -- bash -c "mount -t cgroup -o freezer f /mnt; cat /proc/self/mountinfo"
> >
> > or whether I just
> >
> > mount --bind /sys/fs/cgroup/freezer/$(awk -F: '/freezer/ { print $3 }' /proc/self/cgroup) /mnt
> >
> > 'mount root' field (field 3) in /proc/self/mountinfo will show the
> > same thing, the result of awk -F: '/freezer/ { print $3 }' /proc/self/cgroup.
> >
> > This patch adds a 'nsroot=' field to cgroup mountinfo entries, so that
> > userspace can distinguish a mount made in a cgroup namespace from a bind
> > mount from a cgroup subdirectory.
>
> no rush on the patch itself, I don't mind if i have to rewrite it from
> scratch, but I'd like to get the patch into docker/libcontainer using it,
> so can we decide on whether the syntax as shown here is ok?
Yeah, I think the syntax is fine.
Thanks.
--
tejun
Hello, Serge.
Sorry about the delay.
On Mon, Mar 21, 2016 at 06:41:33PM -0500, Serge E. Hallyn wrote:
> struct kernfs_syscall_ops {
> int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
> - int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
> + int (*show_options)(struct seq_file *sf, struct dentry *dentry,
> + struct kernfs_root *root);
Wouldn't it make more sense to pass in kernfs_node pointer instead of
dentry pointer?
> +static void cgroup_show_nsroot(struct seq_file *seq, struct dentry *dentry,
> + struct kernfs_root *kf_root)
> +{
> + struct kernfs_node *d_kn = dentry->d_fsdata;
> + char *nsroot;
> + int len, ret;
> +
> + if (!kf_root)
> + return;
> + len = kernfs_path_from_node(d_kn, kf_root->kn, NULL, 0);
> + if (len <= 0)
> + return;
> + nsroot = kzalloc(len + 1, GFP_ATOMIC);
> + if (!nsroot)
> + return;
> + ret = kernfs_path_from_node(d_kn, kf_root->kn, nsroot, len + 1);
> + if (ret <= 0 || ret > len)
> + goto out;
Hmmm.... does this mean that someone inside cgroup ns would be able to
find out the absolute cgroup path of the ns root from inside? If so,
wouldn't that be an unnecessary information leak?
Thanks.
--
tejun
Quoting Tejun Heo ([email protected]):
> Hello, Serge.
>
> Sorry about the delay.
>
> On Mon, Mar 21, 2016 at 06:41:33PM -0500, Serge E. Hallyn wrote:
> > struct kernfs_syscall_ops {
> > int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
> > - int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
> > + int (*show_options)(struct seq_file *sf, struct dentry *dentry,
> > + struct kernfs_root *root);
>
> Wouldn't it make more sense to pass in kernfs_node pointer instead of
> dentry pointer?
Yeah that definately seems better.
> > +static void cgroup_show_nsroot(struct seq_file *seq, struct dentry *dentry,
> > + struct kernfs_root *kf_root)
> > +{
> > + struct kernfs_node *d_kn = dentry->d_fsdata;
> > + char *nsroot;
> > + int len, ret;
> > +
> > + if (!kf_root)
> > + return;
> > + len = kernfs_path_from_node(d_kn, kf_root->kn, NULL, 0);
> > + if (len <= 0)
> > + return;
> > + nsroot = kzalloc(len + 1, GFP_ATOMIC);
> > + if (!nsroot)
> > + return;
> > + ret = kernfs_path_from_node(d_kn, kf_root->kn, nsroot, len + 1);
> > + if (ret <= 0 || ret > len)
> > + goto out;
>
> Hmmm.... does this mean that someone inside cgroup ns would be able to
> find out the absolute cgroup path of the ns root from inside? If so,
> wouldn't that be an unnecessary information leak?
It's not a leak of any information we're trying to hide. I realize
something like 8 years have passed, but I still basically go by the
ksummit guidance that containers are ok but the kernel's first priority
is to facilitate containers but not trick containers into thinking
they're not containerized. So long as the container is properly set
up, I don't think there's anything the workload could do with the
nsroot= info other than *know* that it is in a ns cgroup.
If we did change that guidance, there's a slew of proc info that we
could better virtualize :)
thanks,
-serge
Hello, Serge.
On Wed, Apr 13, 2016 at 01:46:39PM -0500, Serge E. Hallyn wrote:
> It's not a leak of any information we're trying to hide. I realize
> something like 8 years have passed, but I still basically go by the
> ksummit guidance that containers are ok but the kernel's first priority
> is to facilitate containers but not trick containers into thinking
> they're not containerized. So long as the container is properly set
> up, I don't think there's anything the workload could do with the
> nsroot= info other than *know* that it is in a ns cgroup.
>
> If we did change that guidance, there's a slew of proc info that we
> could better virtualize :)
I see. I'm just wondering because the information here seems a bit
gratuituous. Isn't the only thing necessary telling whether the root
is bind mounted or namescoped? Wouldn't simple "nsroot" work for that
purpose?
Thanks.
--
tejun
Quoting Tejun Heo ([email protected]):
> Hello, Serge.
>
> On Wed, Apr 13, 2016 at 01:46:39PM -0500, Serge E. Hallyn wrote:
> > It's not a leak of any information we're trying to hide. I realize
> > something like 8 years have passed, but I still basically go by the
> > ksummit guidance that containers are ok but the kernel's first priority
> > is to facilitate containers but not trick containers into thinking
> > they're not containerized. So long as the container is properly set
> > up, I don't think there's anything the workload could do with the
> > nsroot= info other than *know* that it is in a ns cgroup.
> >
> > If we did change that guidance, there's a slew of proc info that we
> > could better virtualize :)
>
> I see. I'm just wondering because the information here seems a bit
> gratuituous. Isn't the only thing necessary telling whether the root
> is bind mounted or namescoped? Wouldn't simple "nsroot" work for that
> purpose?
I don't think so - we could be in a cgroup namespace but still have
access only to bind-mounted cgroups. So we need to compare the
superblock dentry root field to the nsroot= value.
Hello,
On Wed, Apr 13, 2016 at 02:01:52PM -0500, Serge E. Hallyn wrote:
> I don't think so - we could be in a cgroup namespace but still have
> access only to bind-mounted cgroups. So we need to compare the
> superblock dentry root field to the nsroot= value.
I see. No objection then.
Thanks.
--
tejun
On Wed, Apr 13, 2016 at 12:01 PM, Serge E. Hallyn <[email protected]> wrote:
> Quoting Tejun Heo ([email protected]):
>> Hello, Serge.
>>
>> On Wed, Apr 13, 2016 at 01:46:39PM -0500, Serge E. Hallyn wrote:
>> > It's not a leak of any information we're trying to hide. I realize
>> > something like 8 years have passed, but I still basically go by the
>> > ksummit guidance that containers are ok but the kernel's first priority
>> > is to facilitate containers but not trick containers into thinking
>> > they're not containerized. So long as the container is properly set
>> > up, I don't think there's anything the workload could do with the
>> > nsroot= info other than *know* that it is in a ns cgroup.
>> >
>> > If we did change that guidance, there's a slew of proc info that we
>> > could better virtualize :)
>>
>> I see. I'm just wondering because the information here seems a bit
>> gratuituous. Isn't the only thing necessary telling whether the root
>> is bind mounted or namescoped? Wouldn't simple "nsroot" work for that
>> purpose?
>
> I don't think so - we could be in a cgroup namespace but still have
> access only to bind-mounted cgroups. So we need to compare the
> superblock dentry root field to the nsroot= value.
Umm, I don't think this is such a good idea. The main purpose of
cgroup namespace was to prevent this exposure of system cgroup
hierarchy that used to happen because of /proc/self/cgroup. Wouldn't
showing that information in /proc/self/mountinfo defeat the purpose?
> One practical problem I've found with cgroup namespaces is that there
> is no way to disambiguate between a cgroupfs mount which was done in
> a cgroup namespace, and a bind mount of a cgroupfs directory.
Thats actually by design, no? Namespaced apps should not know/care if
they are running inside namespace. If they can find it out today, its
just because of certain side-effects. I fear adding explicit "nsroot"
or something in /proc/self/mountinfo now becomes an API making it hard
to virtualize user-apps again.
--
Aditya
Quoting Aditya Kali ([email protected]):
> On Wed, Apr 13, 2016 at 12:01 PM, Serge E. Hallyn <[email protected]> wrote:
> > Quoting Tejun Heo ([email protected]):
> >> Hello, Serge.
> >>
> >> On Wed, Apr 13, 2016 at 01:46:39PM -0500, Serge E. Hallyn wrote:
> >> > It's not a leak of any information we're trying to hide. I realize
> >> > something like 8 years have passed, but I still basically go by the
> >> > ksummit guidance that containers are ok but the kernel's first priority
> >> > is to facilitate containers but not trick containers into thinking
> >> > they're not containerized. So long as the container is properly set
> >> > up, I don't think there's anything the workload could do with the
> >> > nsroot= info other than *know* that it is in a ns cgroup.
> >> >
> >> > If we did change that guidance, there's a slew of proc info that we
> >> > could better virtualize :)
> >>
> >> I see. I'm just wondering because the information here seems a bit
> >> gratuituous. Isn't the only thing necessary telling whether the root
> >> is bind mounted or namescoped? Wouldn't simple "nsroot" work for that
> >> purpose?
> >
> > I don't think so - we could be in a cgroup namespace but still have
> > access only to bind-mounted cgroups. So we need to compare the
> > superblock dentry root field to the nsroot= value.
>
> Umm, I don't think this is such a good idea. The main purpose of
> cgroup namespace was to prevent this exposure of system cgroup
> hierarchy that used to happen because of /proc/self/cgroup. Wouldn't
> showing that information in /proc/self/mountinfo defeat the purpose?
I disagree. The primary purpose was to simplify init's job and to keep
cgroup mounts in sync with /proc/self/cgroup. So that userspace doesn't
have to look at /proc/self/cgroup and then try and figure out how that
relates to its actual cgroup mountpoints. It was not to *hide* the
information.
Field 3 already gives us the path, nsroot just tells us what part of
it we are namespaced under.
> > One practical problem I've found with cgroup namespaces is that there
> > is no way to disambiguate between a cgroupfs mount which was done in
> > a cgroup namespace, and a bind mount of a cgroupfs directory.
>
> Thats actually by design, no? Namespaced apps should not know/care if
> they are running inside namespace. If they can find it out today, its
No. If a workload isn't allowed to mount its own cgroups, and can only
see that freezer /lxc/x1 was mounted at /dev/cgroup (poorly done, but we
don't get to pass judgement or choose mountpoints for userspace), and
it sees /lxc/x1 in its freezer entry for /proc/self/cgroup, then it
cannot tell whether it should be using /dev/cgroup/tasks or
/dev/cgroup/lxc/tasks or /dev/cgroup/lxc/x1/tasks. That's a problem.
> just because of certain side-effects. I fear adding explicit "nsroot"
> or something in /proc/self/mountinfo now becomes an API making it hard
> to virtualize user-apps again.
It doesn't make it hard to virtualize. The only complication would be
if you wanted to checkpoint/restart and reproduce the exact
/proc/self/mountinfo output. That's a bogus goal anyway, since the
restart could be in a different cgroup and field 3 would be different.
In contrast, not providing this makes it impossible for software to
deal with both cgroup namespace and any bind-mounted cgroups. Which
means any new docker (say) which can run in cgroup namespaces will
not be able to run under old (that is, anything currently released
except lxc 2.0) container managers. We're breaking all container
managers.
Now the other thing we could do would be to tweak field 3 in the
mountinfo output. That had been my first inclination, but the way
the mountinfo code is currently done makes that ... challenging.
-serge
This is so that userspace can distinguish a mount made in a cgroup
namespace from a bind mount from a cgroup subdirectory.
Signed-off-by: Serge Hallyn <[email protected]>
---
Changelog: 2016-04-13: pass kernfs_node rather than dentry to show_options
---
fs/kernfs/mount.c | 2 +-
include/linux/kernfs.h | 3 ++-
kernel/cgroup.c | 28 +++++++++++++++++++++++++++-
3 files changed, 30 insertions(+), 3 deletions(-)
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f73541f..58e8a86 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -36,7 +36,7 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
struct kernfs_syscall_ops *scops = root->syscall_ops;
if (scops && scops->show_options)
- return scops->show_options(sf, root);
+ return scops->show_options(sf, dentry->d_fsdata, root);
return 0;
}
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index c06c442..72b4081 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -145,7 +145,8 @@ struct kernfs_node {
*/
struct kernfs_syscall_ops {
int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
- int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
+ int (*show_options)(struct seq_file *sf, struct kernfs_node *kn,
+ struct kernfs_root *root);
int (*mkdir)(struct kernfs_node *parent, const char *name,
umode_t mode);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 671dc05..4d26d07 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1593,7 +1593,31 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
return 0;
}
-static int cgroup_show_options(struct seq_file *seq,
+static void cgroup_show_nsroot(struct seq_file *seq, struct kernfs_node *knode,
+ struct kernfs_root *kroot)
+{
+ char *nsroot;
+ int len, ret;
+
+ if (!kroot)
+ return;
+ len = kernfs_path_from_node(knode, kroot->kn, NULL, 0);
+ if (len <= 0)
+ return;
+ nsroot = kzalloc(len + 1, GFP_ATOMIC);
+ if (!nsroot)
+ return;
+ ret = kernfs_path_from_node(knode, kroot->kn, nsroot, len + 1);
+ if (ret <= 0 || ret > len)
+ goto out;
+
+ seq_show_option(seq, "nsroot", nsroot);
+
+out:
+ kfree(nsroot);
+}
+
+static int cgroup_show_options(struct seq_file *seq, struct kernfs_node *kn,
struct kernfs_root *kf_root)
{
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
@@ -1619,6 +1643,8 @@ static int cgroup_show_options(struct seq_file *seq,
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_show_option(seq, "name", root->name);
+ cgroup_show_nsroot(seq, kn, kf_root);
+
return 0;
}
--
2.7.4
"Serge E. Hallyn" <[email protected]> writes:
> This is so that userspace can distinguish a mount made in a cgroup
> namespace from a bind mount from a cgroup subdirectory.
To do that do you need to print the path, or is an extra option that
reveals nothing except that it was a cgroup mount sufficient?
Is there any practical difference between a mount in a namespace and a
bind mount?
Given the way the conversation has been going I think it would be good
to see the answers to these questions. Perhaps I missed it but I
haven't seen the answers to those questions.
Eric
>
> Signed-off-by: Serge Hallyn <[email protected]>
> ---
> Changelog: 2016-04-13: pass kernfs_node rather than dentry to show_options
> ---
> fs/kernfs/mount.c | 2 +-
> include/linux/kernfs.h | 3 ++-
> kernel/cgroup.c | 28 +++++++++++++++++++++++++++-
> 3 files changed, 30 insertions(+), 3 deletions(-)
>
> diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
> index f73541f..58e8a86 100644
> --- a/fs/kernfs/mount.c
> +++ b/fs/kernfs/mount.c
> @@ -36,7 +36,7 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
> struct kernfs_syscall_ops *scops = root->syscall_ops;
>
> if (scops && scops->show_options)
> - return scops->show_options(sf, root);
> + return scops->show_options(sf, dentry->d_fsdata, root);
> return 0;
> }
>
> diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> index c06c442..72b4081 100644
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -145,7 +145,8 @@ struct kernfs_node {
> */
> struct kernfs_syscall_ops {
> int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
> - int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
> + int (*show_options)(struct seq_file *sf, struct kernfs_node *kn,
> + struct kernfs_root *root);
>
> int (*mkdir)(struct kernfs_node *parent, const char *name,
> umode_t mode);
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 671dc05..4d26d07 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1593,7 +1593,31 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
> return 0;
> }
>
> -static int cgroup_show_options(struct seq_file *seq,
> +static void cgroup_show_nsroot(struct seq_file *seq, struct kernfs_node *knode,
> + struct kernfs_root *kroot)
> +{
> + char *nsroot;
> + int len, ret;
> +
> + if (!kroot)
> + return;
> + len = kernfs_path_from_node(knode, kroot->kn, NULL, 0);
> + if (len <= 0)
> + return;
> + nsroot = kzalloc(len + 1, GFP_ATOMIC);
> + if (!nsroot)
> + return;
> + ret = kernfs_path_from_node(knode, kroot->kn, nsroot, len + 1);
> + if (ret <= 0 || ret > len)
> + goto out;
> +
> + seq_show_option(seq, "nsroot", nsroot);
> +
> +out:
> + kfree(nsroot);
> +}
> +
> +static int cgroup_show_options(struct seq_file *seq, struct kernfs_node *kn,
> struct kernfs_root *kf_root)
> {
> struct cgroup_root *root = cgroup_root_from_kf(kf_root);
> @@ -1619,6 +1643,8 @@ static int cgroup_show_options(struct seq_file *seq,
> seq_puts(seq, ",clone_children");
> if (strlen(root->name))
> seq_show_option(seq, "name", root->name);
> + cgroup_show_nsroot(seq, kn, kf_root);
> +
> return 0;
> }
Quoting Eric W. Biederman ([email protected]):
> "Serge E. Hallyn" <[email protected]> writes:
>
> > This is so that userspace can distinguish a mount made in a cgroup
> > namespace from a bind mount from a cgroup subdirectory.
>
> To do that do you need to print the path, or is an extra option that
> reveals nothing except that it was a cgroup mount sufficient?
>
> Is there any practical difference between a mount in a namespace and a
> bind mount?
>
> Given the way the conversation has been going I think it would be good
> to see the answers to these questions. Perhaps I missed it but I
> haven't seen the answers to those questions.
Yup, I tried to answer those in my last email, let me try again.
Let's say I start a container using cgroup namespaces, /lxc/x1. It mounts
freezer at /sys/fs/cgroup so it has field three of mountinfo as /lxc/x1,
and /sys/fs/cgroup/ is the path to the container's cgroup (/lxc/x1). In
that container, I start another container x1, not using cgroup namespaces.
It also wants a cgroup mount, and a common way to handle that (to prevent
container rewriting its limits) is to mount a tmpfs at /sys/fs/cgroup,
create /sysfs/cgroup/lxc/x1, and bind mount /sys/fs/cgroup/lxc/x1 from
the parent container onto /sys/fs/cgroup/lxc/x1 in the child container.
Now for that bind mount, the mountinfo field 3 will show /lxc/x1/lxc/x1,
with mount target /sys/fs/cgroup/lxc/x1, while /proc/self/cgroup for a task
in that container will show '/lxc/x1'. Unless it has been moved into
/lxc/x1/lxc/x1 in the container (/lxc/x1/lxc/x1/lxc/x1 on the host)...
Every time I've thought "maybe we can just..." I've found a case where it
wouldn't work.
At first in lxc we simply said if /proc/self/ns/cgroup exists assume that
the cgroupfs mounts are not bind mounts. However, old userspace (and
container drivers) on new kernels is certainly possible, especially an
older distro in a container on a newer distro on the host. That completely
breaks with this approach.
I also personally think there *is* value in letting a task know its
place on the system, so hiding the full cgroup path is imo not only not
a valid goal, it's counter-productive. Part of making for better
virtualization is to give userspace all the info it needs about its
current limits. Consider that with the unified hierarchy, you cannot
have tasks in a cgroup that also has child cgroups - except for the
root. Cgroup namespaces do not make an exception for this, so knowing
that you are not in the absolute cgroup root actually can prevent you
from trying something that cannot work. Or, I suppose, at least
understanding why you're unable to do what you're trying to do (namely
your container manager messed up). I point this out because finding
a way to only show the namespaced root in field 3 of mountinfo would
fix the base problem, but at the cost of hiding useful information
from a container.
> Eric
>
>
> >
> > Signed-off-by: Serge Hallyn <[email protected]>
> > ---
> > Changelog: 2016-04-13: pass kernfs_node rather than dentry to show_options
> > ---
> > fs/kernfs/mount.c | 2 +-
> > include/linux/kernfs.h | 3 ++-
> > kernel/cgroup.c | 28 +++++++++++++++++++++++++++-
> > 3 files changed, 30 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
> > index f73541f..58e8a86 100644
> > --- a/fs/kernfs/mount.c
> > +++ b/fs/kernfs/mount.c
> > @@ -36,7 +36,7 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
> > struct kernfs_syscall_ops *scops = root->syscall_ops;
> >
> > if (scops && scops->show_options)
> > - return scops->show_options(sf, root);
> > + return scops->show_options(sf, dentry->d_fsdata, root);
> > return 0;
> > }
> >
> > diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> > index c06c442..72b4081 100644
> > --- a/include/linux/kernfs.h
> > +++ b/include/linux/kernfs.h
> > @@ -145,7 +145,8 @@ struct kernfs_node {
> > */
> > struct kernfs_syscall_ops {
> > int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
> > - int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
> > + int (*show_options)(struct seq_file *sf, struct kernfs_node *kn,
> > + struct kernfs_root *root);
> >
> > int (*mkdir)(struct kernfs_node *parent, const char *name,
> > umode_t mode);
> > diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> > index 671dc05..4d26d07 100644
> > --- a/kernel/cgroup.c
> > +++ b/kernel/cgroup.c
> > @@ -1593,7 +1593,31 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
> > return 0;
> > }
> >
> > -static int cgroup_show_options(struct seq_file *seq,
> > +static void cgroup_show_nsroot(struct seq_file *seq, struct kernfs_node *knode,
> > + struct kernfs_root *kroot)
> > +{
> > + char *nsroot;
> > + int len, ret;
> > +
> > + if (!kroot)
> > + return;
> > + len = kernfs_path_from_node(knode, kroot->kn, NULL, 0);
> > + if (len <= 0)
> > + return;
> > + nsroot = kzalloc(len + 1, GFP_ATOMIC);
> > + if (!nsroot)
> > + return;
> > + ret = kernfs_path_from_node(knode, kroot->kn, nsroot, len + 1);
> > + if (ret <= 0 || ret > len)
> > + goto out;
> > +
> > + seq_show_option(seq, "nsroot", nsroot);
> > +
> > +out:
> > + kfree(nsroot);
> > +}
> > +
> > +static int cgroup_show_options(struct seq_file *seq, struct kernfs_node *kn,
> > struct kernfs_root *kf_root)
> > {
> > struct cgroup_root *root = cgroup_root_from_kf(kf_root);
> > @@ -1619,6 +1643,8 @@ static int cgroup_show_options(struct seq_file *seq,
> > seq_puts(seq, ",clone_children");
> > if (strlen(root->name))
> > seq_show_option(seq, "name", root->name);
> > + cgroup_show_nsroot(seq, kn, kf_root);
> > +
> > return 0;
> > }
"Serge E. Hallyn" <[email protected]> writes:
> Quoting Eric W. Biederman ([email protected]):
>> "Serge E. Hallyn" <[email protected]> writes:
>>
>> > This is so that userspace can distinguish a mount made in a cgroup
>> > namespace from a bind mount from a cgroup subdirectory.
>>
>> To do that do you need to print the path, or is an extra option that
>> reveals nothing except that it was a cgroup mount sufficient?
>>
>> Is there any practical difference between a mount in a namespace and a
>> bind mount?
>>
>> Given the way the conversation has been going I think it would be good
>> to see the answers to these questions. Perhaps I missed it but I
>> haven't seen the answers to those questions.
>
> Yup, I tried to answer those in my last email, let me try again.
>
> Let's say I start a container using cgroup namespaces, /lxc/x1. It mounts
> freezer at /sys/fs/cgroup so it has field three of mountinfo as /lxc/x1,
> and /sys/fs/cgroup/ is the path to the container's cgroup (/lxc/x1). In
> that container, I start another container x1, not using cgroup namespaces.
> It also wants a cgroup mount, and a common way to handle that (to prevent
> container rewriting its limits) is to mount a tmpfs at /sys/fs/cgroup,
> create /sysfs/cgroup/lxc/x1, and bind mount /sys/fs/cgroup/lxc/x1 from
> the parent container onto /sys/fs/cgroup/lxc/x1 in the child container.
> Now for that bind mount, the mountinfo field 3 will show /lxc/x1/lxc/x1,
> with mount target /sys/fs/cgroup/lxc/x1, while /proc/self/cgroup for a task
> in that container will show '/lxc/x1'. Unless it has been moved into
> /lxc/x1/lxc/x1 in the container (/lxc/x1/lxc/x1/lxc/x1 on the host)...
> Every time I've thought "maybe we can just..." I've found a case where it
> wouldn't work.
>
> At first in lxc we simply said if /proc/self/ns/cgroup exists assume that
> the cgroupfs mounts are not bind mounts. However, old userspace (and
> container drivers) on new kernels is certainly possible, especially an
> older distro in a container on a newer distro on the host. That completely
> breaks with this approach.
>
> I also personally think there *is* value in letting a task know its
> place on the system, so hiding the full cgroup path is imo not only not
> a valid goal, it's counter-productive. Part of making for better
> virtualization is to give userspace all the info it needs about its
> current limits. Consider that with the unified hierarchy, you cannot
> have tasks in a cgroup that also has child cgroups - except for the
> root. Cgroup namespaces do not make an exception for this, so knowing
> that you are not in the absolute cgroup root actually can prevent you
> from trying something that cannot work. Or, I suppose, at least
> understanding why you're unable to do what you're trying to do (namely
> your container manager messed up). I point this out because finding
> a way to only show the namespaced root in field 3 of mountinfo would
> fix the base problem, but at the cost of hiding useful information
> from a container.
It is just the superblock show_path method. And regardless of the rest
of the usefullness of your mount option implementing show_path appears
to be fundamentally the right thing in this context. As that field
appears to have the same issue as /proc/self/cgroup.
Eric
Quoting Eric W. Biederman ([email protected]):
> "Serge E. Hallyn" <[email protected]> writes:
>
> > Quoting Eric W. Biederman ([email protected]):
> >> "Serge E. Hallyn" <[email protected]> writes:
> >>
> >> > This is so that userspace can distinguish a mount made in a cgroup
> >> > namespace from a bind mount from a cgroup subdirectory.
> >>
> >> To do that do you need to print the path, or is an extra option that
> >> reveals nothing except that it was a cgroup mount sufficient?
> >>
> >> Is there any practical difference between a mount in a namespace and a
> >> bind mount?
> >>
> >> Given the way the conversation has been going I think it would be good
> >> to see the answers to these questions. Perhaps I missed it but I
> >> haven't seen the answers to those questions.
> >
> > Yup, I tried to answer those in my last email, let me try again.
> >
> > Let's say I start a container using cgroup namespaces, /lxc/x1. It mounts
> > freezer at /sys/fs/cgroup so it has field three of mountinfo as /lxc/x1,
> > and /sys/fs/cgroup/ is the path to the container's cgroup (/lxc/x1). In
> > that container, I start another container x1, not using cgroup namespaces.
> > It also wants a cgroup mount, and a common way to handle that (to prevent
> > container rewriting its limits) is to mount a tmpfs at /sys/fs/cgroup,
> > create /sysfs/cgroup/lxc/x1, and bind mount /sys/fs/cgroup/lxc/x1 from
> > the parent container onto /sys/fs/cgroup/lxc/x1 in the child container.
> > Now for that bind mount, the mountinfo field 3 will show /lxc/x1/lxc/x1,
> > with mount target /sys/fs/cgroup/lxc/x1, while /proc/self/cgroup for a task
> > in that container will show '/lxc/x1'. Unless it has been moved into
> > /lxc/x1/lxc/x1 in the container (/lxc/x1/lxc/x1/lxc/x1 on the host)...
> > Every time I've thought "maybe we can just..." I've found a case where it
> > wouldn't work.
> >
> > At first in lxc we simply said if /proc/self/ns/cgroup exists assume that
> > the cgroupfs mounts are not bind mounts. However, old userspace (and
> > container drivers) on new kernels is certainly possible, especially an
> > older distro in a container on a newer distro on the host. That completely
> > breaks with this approach.
> >
> > I also personally think there *is* value in letting a task know its
> > place on the system, so hiding the full cgroup path is imo not only not
> > a valid goal, it's counter-productive. Part of making for better
> > virtualization is to give userspace all the info it needs about its
> > current limits. Consider that with the unified hierarchy, you cannot
> > have tasks in a cgroup that also has child cgroups - except for the
> > root. Cgroup namespaces do not make an exception for this, so knowing
> > that you are not in the absolute cgroup root actually can prevent you
> > from trying something that cannot work. Or, I suppose, at least
> > understanding why you're unable to do what you're trying to do (namely
> > your container manager messed up). I point this out because finding
> > a way to only show the namespaced root in field 3 of mountinfo would
> > fix the base problem, but at the cost of hiding useful information
> > from a container.
>
> It is just the superblock show_path method. And regardless of the rest
> of the usefullness of your mount option implementing show_path appears
Ugh. Yeah as I've said implementing that would be the other way to go.
I'm somewhat loath to give up the extra information, but I can work
on that patch later this week.
> to be fundamentally the right thing in this context. As that field
> appears to have the same issue as /proc/self/cgroup.
Well, /proc/self/cgroup could also have been fixed by adding a
':<nsroot>" field to each line, but it's used differently...
thanks,
-serge
"Serge E. Hallyn" <[email protected]> writes:
> Quoting Eric W. Biederman ([email protected]):
>> "Serge E. Hallyn" <[email protected]> writes:
>>
>> > Quoting Eric W. Biederman ([email protected]):
>> >> "Serge E. Hallyn" <[email protected]> writes:
>> >>
>> >> > This is so that userspace can distinguish a mount made in a cgroup
>> >> > namespace from a bind mount from a cgroup subdirectory.
>> >>
>> >> To do that do you need to print the path, or is an extra option that
>> >> reveals nothing except that it was a cgroup mount sufficient?
>> >>
>> >> Is there any practical difference between a mount in a namespace and a
>> >> bind mount?
>> >>
>> >> Given the way the conversation has been going I think it would be good
>> >> to see the answers to these questions. Perhaps I missed it but I
>> >> haven't seen the answers to those questions.
>> >
>> > Yup, I tried to answer those in my last email, let me try again.
>> >
>> > Let's say I start a container using cgroup namespaces, /lxc/x1. It mounts
>> > freezer at /sys/fs/cgroup so it has field three of mountinfo as /lxc/x1,
>> > and /sys/fs/cgroup/ is the path to the container's cgroup (/lxc/x1). In
>> > that container, I start another container x1, not using cgroup namespaces.
>> > It also wants a cgroup mount, and a common way to handle that (to prevent
>> > container rewriting its limits) is to mount a tmpfs at /sys/fs/cgroup,
>> > create /sysfs/cgroup/lxc/x1, and bind mount /sys/fs/cgroup/lxc/x1 from
>> > the parent container onto /sys/fs/cgroup/lxc/x1 in the child container.
>> > Now for that bind mount, the mountinfo field 3 will show /lxc/x1/lxc/x1,
>> > with mount target /sys/fs/cgroup/lxc/x1, while /proc/self/cgroup for a task
>> > in that container will show '/lxc/x1'. Unless it has been moved into
>> > /lxc/x1/lxc/x1 in the container (/lxc/x1/lxc/x1/lxc/x1 on the host)...
>> > Every time I've thought "maybe we can just..." I've found a case where it
>> > wouldn't work.
>> >
>> > At first in lxc we simply said if /proc/self/ns/cgroup exists assume that
>> > the cgroupfs mounts are not bind mounts. However, old userspace (and
>> > container drivers) on new kernels is certainly possible, especially an
>> > older distro in a container on a newer distro on the host. That completely
>> > breaks with this approach.
>> >
>> > I also personally think there *is* value in letting a task know its
>> > place on the system, so hiding the full cgroup path is imo not only not
>> > a valid goal, it's counter-productive. Part of making for better
>> > virtualization is to give userspace all the info it needs about its
>> > current limits. Consider that with the unified hierarchy, you cannot
>> > have tasks in a cgroup that also has child cgroups - except for the
>> > root. Cgroup namespaces do not make an exception for this, so knowing
>> > that you are not in the absolute cgroup root actually can prevent you
>> > from trying something that cannot work. Or, I suppose, at least
>> > understanding why you're unable to do what you're trying to do (namely
>> > your container manager messed up). I point this out because finding
>> > a way to only show the namespaced root in field 3 of mountinfo would
>> > fix the base problem, but at the cost of hiding useful information
>> > from a container.
>>
>> It is just the superblock show_path method. And regardless of the rest
>> of the usefullness of your mount option implementing show_path appears
>
> Ugh. Yeah as I've said implementing that would be the other way to go.
> I'm somewhat loath to give up the extra information, but I can work
> on that patch later this week.
It sounded like you couldn't see how to implement that which is slightly
different.
>> to be fundamentally the right thing in this context. As that field
>> appears to have the same issue as /proc/self/cgroup.
>
> Well, /proc/self/cgroup could also have been fixed by adding a
> ':<nsroot>" field to each line, but it's used differently...
Usage may be a reasonable justification. Mostly I am trying to pry
apart the chaos. Not shoot down one approach or another.
My current perspective is that irrespective of what we do with
a informational mount option (and there seems to be value in that)
having mountinfo show the path relative to root, will allow more
software to just work without modification.
And old software just working when it can seem very valuable.
Eric
On Thu, Apr 14, 2016 at 8:27 AM, Serge E. Hallyn <[email protected]> wrote:
> Quoting Eric W. Biederman ([email protected]):
>> "Serge E. Hallyn" <[email protected]> writes:
>>
>> > This is so that userspace can distinguish a mount made in a cgroup
>> > namespace from a bind mount from a cgroup subdirectory.
>>
>> To do that do you need to print the path, or is an extra option that
>> reveals nothing except that it was a cgroup mount sufficient?
>>
>> Is there any practical difference between a mount in a namespace and a
>> bind mount?
>>
>> Given the way the conversation has been going I think it would be good
>> to see the answers to these questions. Perhaps I missed it but I
>> haven't seen the answers to those questions.
>
> Yup, I tried to answer those in my last email, let me try again.
>
> Let's say I start a container using cgroup namespaces, /lxc/x1. It mounts
> freezer at /sys/fs/cgroup so it has field three of mountinfo as /lxc/x1,
> and /sys/fs/cgroup/ is the path to the container's cgroup (/lxc/x1). In
> that container, I start another container x1, not using cgroup namespaces.
> It also wants a cgroup mount, and a common way to handle that (to prevent
> container rewriting its limits) is to mount a tmpfs at /sys/fs/cgroup,
> create /sysfs/cgroup/lxc/x1, and bind mount /sys/fs/cgroup/lxc/x1 from
> the parent container onto /sys/fs/cgroup/lxc/x1 in the child container.
> Now for that bind mount, the mountinfo field 3 will show /lxc/x1/lxc/x1,
> with mount target /sys/fs/cgroup/lxc/x1, while /proc/self/cgroup for a task
> in that container will show '/lxc/x1'. Unless it has been moved into
> /lxc/x1/lxc/x1 in the container (/lxc/x1/lxc/x1/lxc/x1 on the host)...
> Every time I've thought "maybe we can just..." I've found a case where it
> wouldn't work.
>
> At first in lxc we simply said if /proc/self/ns/cgroup exists assume that
> the cgroupfs mounts are not bind mounts. However, old userspace (and
> container drivers) on new kernels is certainly possible, especially an
> older distro in a container on a newer distro on the host. That completely
> breaks with this approach.
>
My main concern regarding making this a new kernel API is that its too
generic and exposes information about all system cgroups to every
process on the system, not just the container or the process inside it
that needs it. Not all containers need this information and not all
processes running inside the container needs this. I haven't spent too
much thought into it, but it seems you will still need to update the
container userspace to read this extra mount option. So seems like a
simpler approach where the host "cgroup manager" provides this
information to specific container cgroup manager via other user-space
channels (a config file, command-line args, environment vars, proper
container mounts, etc.) may also work, right?
> I also personally think there *is* value in letting a task know its
> place on the system, so hiding the full cgroup path is imo not only not
> a valid goal, it's counter-productive. Part of making for better
> virtualization is to give userspace all the info it needs about its
> current limits. Consider that with the unified hierarchy, you cannot
> have tasks in a cgroup that also has child cgroups - except for the
> root. Cgroup namespaces do not make an exception for this, so knowing
> that you are not in the absolute cgroup root actually can prevent you
> from trying something that cannot work. Or, I suppose, at least
> understanding why you're unable to do what you're trying to do (namely
> your container manager messed up). I point this out because finding
> a way to only show the namespaced root in field 3 of mountinfo would
> fix the base problem, but at the cost of hiding useful information
> from a container.
>
>> Eric
>>
>>
>> >
>> > Signed-off-by: Serge Hallyn <[email protected]>
>> > ---
>> > Changelog: 2016-04-13: pass kernfs_node rather than dentry to show_options
>> > ---
>> > fs/kernfs/mount.c | 2 +-
>> > include/linux/kernfs.h | 3 ++-
>> > kernel/cgroup.c | 28 +++++++++++++++++++++++++++-
>> > 3 files changed, 30 insertions(+), 3 deletions(-)
>> >
>> > diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
>> > index f73541f..58e8a86 100644
>> > --- a/fs/kernfs/mount.c
>> > +++ b/fs/kernfs/mount.c
>> > @@ -36,7 +36,7 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
>> > struct kernfs_syscall_ops *scops = root->syscall_ops;
>> >
>> > if (scops && scops->show_options)
>> > - return scops->show_options(sf, root);
>> > + return scops->show_options(sf, dentry->d_fsdata, root);
>> > return 0;
>> > }
>> >
>> > diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
>> > index c06c442..72b4081 100644
>> > --- a/include/linux/kernfs.h
>> > +++ b/include/linux/kernfs.h
>> > @@ -145,7 +145,8 @@ struct kernfs_node {
>> > */
>> > struct kernfs_syscall_ops {
>> > int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
>> > - int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
>> > + int (*show_options)(struct seq_file *sf, struct kernfs_node *kn,
>> > + struct kernfs_root *root);
>> >
>> > int (*mkdir)(struct kernfs_node *parent, const char *name,
>> > umode_t mode);
>> > diff --git a/kernel/cgroup.c b/kernel/cgroup.c
>> > index 671dc05..4d26d07 100644
>> > --- a/kernel/cgroup.c
>> > +++ b/kernel/cgroup.c
>> > @@ -1593,7 +1593,31 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
>> > return 0;
>> > }
>> >
>> > -static int cgroup_show_options(struct seq_file *seq,
>> > +static void cgroup_show_nsroot(struct seq_file *seq, struct kernfs_node *knode,
>> > + struct kernfs_root *kroot)
>> > +{
>> > + char *nsroot;
>> > + int len, ret;
>> > +
>> > + if (!kroot)
>> > + return;
>> > + len = kernfs_path_from_node(knode, kroot->kn, NULL, 0);
>> > + if (len <= 0)
>> > + return;
>> > + nsroot = kzalloc(len + 1, GFP_ATOMIC);
>> > + if (!nsroot)
>> > + return;
>> > + ret = kernfs_path_from_node(knode, kroot->kn, nsroot, len + 1);
>> > + if (ret <= 0 || ret > len)
>> > + goto out;
>> > +
>> > + seq_show_option(seq, "nsroot", nsroot);
>> > +
>> > +out:
>> > + kfree(nsroot);
>> > +}
>> > +
>> > +static int cgroup_show_options(struct seq_file *seq, struct kernfs_node *kn,
>> > struct kernfs_root *kf_root)
>> > {
>> > struct cgroup_root *root = cgroup_root_from_kf(kf_root);
>> > @@ -1619,6 +1643,8 @@ static int cgroup_show_options(struct seq_file *seq,
>> > seq_puts(seq, ",clone_children");
>> > if (strlen(root->name))
>> > seq_show_option(seq, "name", root->name);
>> > + cgroup_show_nsroot(seq, kn, kf_root);
>> > +
>> > return 0;
>> > }
--
Aditya
Quoting Aditya Kali ([email protected]):
> On Thu, Apr 14, 2016 at 8:27 AM, Serge E. Hallyn <[email protected]> wrote:
> > Quoting Eric W. Biederman ([email protected]):
> >> "Serge E. Hallyn" <[email protected]> writes:
> >>
> >> > This is so that userspace can distinguish a mount made in a cgroup
> >> > namespace from a bind mount from a cgroup subdirectory.
> >>
> >> To do that do you need to print the path, or is an extra option that
> >> reveals nothing except that it was a cgroup mount sufficient?
> >>
> >> Is there any practical difference between a mount in a namespace and a
> >> bind mount?
> >>
> >> Given the way the conversation has been going I think it would be good
> >> to see the answers to these questions. Perhaps I missed it but I
> >> haven't seen the answers to those questions.
> >
> > Yup, I tried to answer those in my last email, let me try again.
> >
> > Let's say I start a container using cgroup namespaces, /lxc/x1. It mounts
> > freezer at /sys/fs/cgroup so it has field three of mountinfo as /lxc/x1,
> > and /sys/fs/cgroup/ is the path to the container's cgroup (/lxc/x1). In
> > that container, I start another container x1, not using cgroup namespaces.
> > It also wants a cgroup mount, and a common way to handle that (to prevent
> > container rewriting its limits) is to mount a tmpfs at /sys/fs/cgroup,
> > create /sysfs/cgroup/lxc/x1, and bind mount /sys/fs/cgroup/lxc/x1 from
> > the parent container onto /sys/fs/cgroup/lxc/x1 in the child container.
> > Now for that bind mount, the mountinfo field 3 will show /lxc/x1/lxc/x1,
> > with mount target /sys/fs/cgroup/lxc/x1, while /proc/self/cgroup for a task
> > in that container will show '/lxc/x1'. Unless it has been moved into
> > /lxc/x1/lxc/x1 in the container (/lxc/x1/lxc/x1/lxc/x1 on the host)...
> > Every time I've thought "maybe we can just..." I've found a case where it
> > wouldn't work.
> >
> > At first in lxc we simply said if /proc/self/ns/cgroup exists assume that
> > the cgroupfs mounts are not bind mounts. However, old userspace (and
> > container drivers) on new kernels is certainly possible, especially an
> > older distro in a container on a newer distro on the host. That completely
> > breaks with this approach.
> >
>
> My main concern regarding making this a new kernel API is that its too
> generic and exposes information about all system cgroups to every
> process on the system, not just the container or the process inside it
> that needs it. Not all containers need this information and not all
> processes running inside the container needs this. I haven't spent too
> much thought into it, but it seems you will still need to update the
> container userspace to read this extra mount option. So seems like a
> simpler approach where the host "cgroup manager" provides this
> information to specific container cgroup manager via other user-space
> channels (a config file, command-line args, environment vars, proper
> container mounts, etc.) may also work, right?
No, because existing legacy userspace would need to be taught about
these new channels.
I'm testing a new patch which simply fixes the root dentry field in
mountinfo, which should also serve to fix this problem without adding
the nsroot= option field.