The main need for this is to support container runtimes on stateless Linux
systems (pivot_root system call from initramfs).
Normally, the task of initramfs is to mount and switch to a "real" root
filesystem. However, on stateless systems (booting over the network) it is just
convenient to have your "real" filesystem as initramfs from the start.
This, however, breaks different container runtimes, because they usually use
pivot_root system call after creating their mount namespace. But pivot_root does
not work from initramfs, because initramfs runs from rootfs, which is the root
of the mount tree and can't be unmounted.
One workaround is to do:
mount --bind / /
However, that defeats one of the purposes of using pivot_root in the cloned
containers: to get rid of the host root filesystem, should the code somehow
escape the chroot.
There is a way to solve this problem from userspace, but it is much more
cumbersome:
* either we have to create a multilayered archive for initramfs, where the
outer layer creates a tmpfs filesystem and unpacks the inner layer, switches
root and does not forget to properly clean up the old rootfs
* or we need to use keepinitrd kernel cmdline option, unpack initramfs to
rootfs, run a script to create our target tmpfs root, unpack the same
initramfs there, switch root to it and again properly clean up the old root,
thus unpacking the same archive twice and also wasting memory, because
the kernel stores compressed initramfs image indefinitely.
With this change we can ask the kernel (by specifying nonroot_initramfs kernel
cmdline option) to create a "leaf" tmpfs mount for us and switch root to it
before the initramfs handling code, so initramfs gets unpacked directly into
the "leaf" tmpfs with rootfs being empty and no need to clean up anything.
This also brings the behaviour in line with the older style initrd, where the
initrd is located on some leaf filesystem in the mount tree and rootfs remains
empty.
Signed-off-by: Ignat Korchagin <[email protected]>
---
fs/namespace.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 47 insertions(+)
diff --git a/fs/namespace.c b/fs/namespace.c
index 85b5f7bea82e..a1ec862e8146 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3701,6 +3701,49 @@ static void __init init_mount_tree(void)
set_fs_root(current->fs, &root);
}
+#if IS_ENABLED(CONFIG_TMPFS)
+static int __initdata nonroot_initramfs;
+
+static int __init nonroot_initramfs_param(char *str)
+{
+ if (*str)
+ return 0;
+ nonroot_initramfs = 1;
+ return 1;
+}
+__setup("nonroot_initramfs", nonroot_initramfs_param);
+
+static void __init init_nonroot_initramfs(void)
+{
+ int err;
+
+ if (!nonroot_initramfs)
+ return;
+
+ err = ksys_mkdir("/root", 0700);
+ if (err < 0)
+ goto out;
+
+ err = do_mount("tmpfs", "/root", "tmpfs", 0, NULL);
+ if (err)
+ goto out;
+
+ err = ksys_chdir("/root");
+ if (err)
+ goto out;
+
+ err = do_mount(".", "/", NULL, MS_MOVE, NULL);
+ if (err)
+ goto out;
+
+ err = ksys_chroot(".");
+ if (!err)
+ return;
+out:
+ printk(KERN_WARNING "Failed to create a non-root filesystem for initramfs\n");
+}
+#endif /* IS_ENABLED(CONFIG_TMPFS) */
+
void __init mnt_init(void)
{
int err;
@@ -3734,6 +3777,10 @@ void __init mnt_init(void)
shmem_init();
init_rootfs();
init_mount_tree();
+
+#if IS_ENABLED(CONFIG_TMPFS)
+ init_nonroot_initramfs();
+#endif
}
void put_mnt_ns(struct mnt_namespace *ns)
--
2.20.1
On 3/30/20 6:14 AM, Ignat Korchagin wrote:
> The main need for this is to support container runtimes on stateless Linux
> system (pivot_root system call from initramfs).
>
> Normally, the task of initramfs is to mount and switch to a "real" root
> filesystem. However, on stateless systems (booting over the network) it is just
> convenient to have your "real" filesystem as initramfs from the start.
>
> This, however, breaks different container runtimes, because they usually use
> pivot_root system call after creating their mount namespace. But pivot_root does
> not work from initramfs, because initramfs runs form rootfs, which is the root
> of the mount tree and can't be unmounted.
>
> One workaround is to do:
>
> mount --bind / /
>
> However, that defeats one of the purposes of using pivot_root in the cloned
> containers: get rid of host root filesystem, should the code somehow escapes the
> chroot.
>
> There is a way to solve this problem from userspace, but it is much more
> cumbersome:
> * either have to create a multilayered archive for initramfs, where the outer
> layer creates a tmpfs filesystem and unpacks the inner layer, switches root
> and does not forget to properly cleanup the old rootfs
> * or we need to use keepinitrd kernel cmdline option, unpack initramfs to
> rootfs, run a script to create our target tmpfs root, unpack the same
> initramfs there, switch root to it and again properly cleanup the old root,
> thus unpacking the same archive twice and also wasting memory, because
> the kernel stores compressed initramfs image indefinitely.
>
> With this change we can ask the kernel (by specifying nonroot_initramfs kernel
> cmdline option) to create a "leaf" tmpfs mount for us and switch root to it
> before the initramfs handling code, so initramfs gets unpacked directly into
> the "leaf" tmpfs with rootfs being empty and no need to clean up anything.
>
> This also bring the behaviour in line with the older style initrd, where the
> initrd is located on some leaf filesystem in the mount tree and rootfs remaining
> empty.
>
> Signed-off-by: Ignat Korchagin <[email protected]>
> ---
> fs/namespace.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 47 insertions(+)
Hi,
Please document "nonroot_initramfs" in
Documentation/admin-guide/kernel-parameters.txt.
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 85b5f7bea82e..a1ec862e8146 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -3701,6 +3701,49 @@ static void __init init_mount_tree(void)
> set_fs_root(current->fs, &root);
> }
>
> +#if IS_ENABLED(CONFIG_TMPFS)
> +static int __initdata nonroot_initramfs;
> +
> +static int __init nonroot_initramfs_param(char *str)
> +{
> + if (*str)
> + return 0;
> + nonroot_initramfs = 1;
> + return 1;
> +}
> +__setup("nonroot_initramfs", nonroot_initramfs_param);
> +
> +static void __init init_nonroot_initramfs(void)
> +{
> + int err;
> +
> + if (!nonroot_initramfs)
> + return;
> +
> + err = ksys_mkdir("/root", 0700);
> + if (err < 0)
> + goto out;
> +
> + err = do_mount("tmpfs", "/root", "tmpfs", 0, NULL);
> + if (err)
> + goto out;
> +
> + err = ksys_chdir("/root");
> + if (err)
> + goto out;
> +
> + err = do_mount(".", "/", NULL, MS_MOVE, NULL);
> + if (err)
> + goto out;
> +
> + err = ksys_chroot(".");
> + if (!err)
> + return;
> +out:
> + printk(KERN_WARNING "Failed to create a non-root filesystem for initramfs\n");
> +}
> +#endif /* IS_ENABLED(CONFIG_TMPFS) */
> +
> void __init mnt_init(void)
> {
> int err;
> @@ -3734,6 +3777,10 @@ void __init mnt_init(void)
> shmem_init();
> init_rootfs();
> init_mount_tree();
> +
> +#if IS_ENABLED(CONFIG_TMPFS)
> + init_nonroot_initramfs();
> +#endif
> }
>
> void put_mnt_ns(struct mnt_namespace *ns)
>
thanks.
--
~Randy
Sorry, forgot to follow up in this thread. I've reposted v2 patches
with documentation.
Regards,
Ignat
On Mon, Mar 30, 2020 at 8:03 PM Randy Dunlap <[email protected]> wrote:
>
> On 3/30/20 6:14 AM, Ignat Korchagin wrote:
> > The main need for this is to support container runtimes on stateless Linux
> > system (pivot_root system call from initramfs).
> >
> > Normally, the task of initramfs is to mount and switch to a "real" root
> > filesystem. However, on stateless systems (booting over the network) it is just
> > convenient to have your "real" filesystem as initramfs from the start.
> >
> > This, however, breaks different container runtimes, because they usually use
> > pivot_root system call after creating their mount namespace. But pivot_root does
> > not work from initramfs, because initramfs runs form rootfs, which is the root
> > of the mount tree and can't be unmounted.
> >
> > One workaround is to do:
> >
> > mount --bind / /
> >
> > However, that defeats one of the purposes of using pivot_root in the cloned
> > containers: get rid of host root filesystem, should the code somehow escapes the
> > chroot.
> >
> > There is a way to solve this problem from userspace, but it is much more
> > cumbersome:
> > * either have to create a multilayered archive for initramfs, where the outer
> > layer creates a tmpfs filesystem and unpacks the inner layer, switches root
> > and does not forget to properly cleanup the old rootfs
> > * or we need to use keepinitrd kernel cmdline option, unpack initramfs to
> > rootfs, run a script to create our target tmpfs root, unpack the same
> > initramfs there, switch root to it and again properly cleanup the old root,
> > thus unpacking the same archive twice and also wasting memory, because
> > the kernel stores compressed initramfs image indefinitely.
> >
> > With this change we can ask the kernel (by specifying nonroot_initramfs kernel
> > cmdline option) to create a "leaf" tmpfs mount for us and switch root to it
> > before the initramfs handling code, so initramfs gets unpacked directly into
> > the "leaf" tmpfs with rootfs being empty and no need to clean up anything.
> >
> > This also bring the behaviour in line with the older style initrd, where the
> > initrd is located on some leaf filesystem in the mount tree and rootfs remaining
> > empty.
> >
> > Signed-off-by: Ignat Korchagin <[email protected]>
> > ---
> > fs/namespace.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 47 insertions(+)
>
> Hi,
> Please document "nonroot_initramfs" in
> Documentation/admin-guide/kernel-parameters.txt.
>
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index 85b5f7bea82e..a1ec862e8146 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -3701,6 +3701,49 @@ static void __init init_mount_tree(void)
> > set_fs_root(current->fs, &root);
> > }
> >
> > +#if IS_ENABLED(CONFIG_TMPFS)
> > +static int __initdata nonroot_initramfs;
> > +
> > +static int __init nonroot_initramfs_param(char *str)
> > +{
> > + if (*str)
> > + return 0;
> > + nonroot_initramfs = 1;
> > + return 1;
> > +}
> > +__setup("nonroot_initramfs", nonroot_initramfs_param);
> > +
> > +static void __init init_nonroot_initramfs(void)
> > +{
> > + int err;
> > +
> > + if (!nonroot_initramfs)
> > + return;
> > +
> > + err = ksys_mkdir("/root", 0700);
> > + if (err < 0)
> > + goto out;
> > +
> > + err = do_mount("tmpfs", "/root", "tmpfs", 0, NULL);
> > + if (err)
> > + goto out;
> > +
> > + err = ksys_chdir("/root");
> > + if (err)
> > + goto out;
> > +
> > + err = do_mount(".", "/", NULL, MS_MOVE, NULL);
> > + if (err)
> > + goto out;
> > +
> > + err = ksys_chroot(".");
> > + if (!err)
> > + return;
> > +out:
> > + printk(KERN_WARNING "Failed to create a non-root filesystem for initramfs\n");
> > +}
> > +#endif /* IS_ENABLED(CONFIG_TMPFS) */
> > +
> > void __init mnt_init(void)
> > {
> > int err;
> > @@ -3734,6 +3777,10 @@ void __init mnt_init(void)
> > shmem_init();
> > init_rootfs();
> > init_mount_tree();
> > +
> > +#if IS_ENABLED(CONFIG_TMPFS)
> > + init_nonroot_initramfs();
> > +#endif
> > }
> >
> > void put_mnt_ns(struct mnt_namespace *ns)
> >
>
> thanks.
> --
> ~Randy
>