2021-04-02 15:53:35

by Pavel Tikhomirov

[permalink] [raw]
Subject: [PATCH] clone3: add option to change owner of newly created namespaces

Let's add a CLONE_OWNER_NS flag and a clone_args.userns_fd field to
specify a user namespace for clone3() which will become the owner of the
newly created namespaces. This owner is restricted to be a descendant of
the current user namespace. It means that we can call clone3() while in a
more privileged user namespace than the one which will become the owner.

We need this for CRIU.

1) When CRIU is dumping a container it can face nested user namespaces
and pid namespaces, to properly dump/restore them CRIU needs to also
restore all dependencies between namespaces: parent-child and
userns-owner.

Previously, when CRIU was recreating the process tree of a container
during restore, if we needed to restore a pid namespace init we first had
to enter the proper user namespace, which should be the owner of the
restored pid namespace, and only then could we clone. That means that the
cloned process is initially created in a probably unprivileged user
namespace, which brings a lot of restrictions on what it can do (e.g. we
should probably have entered its ipc/uts/net namespaces already, because
if they are owned by a more privileged user namespace we would not be
able to enter them later).

With the new userns_fd option we would be able to recreate the process
tree with all container pid namespaces having the proper user namespace
owners, while all processes are left in the most privileged user
namespace, and it would be easier to restore all resources of those
processes. We can restore the user namespace of each process later when
needed.

2) Another problem which this option is trying to solve is that clone3()
with set_tid does not work as desired when we try to recreate a process
in a container which has nested user and pid namespaces.

Imagine that in the container we have a chain of processes, each of which
was created with clone() with (CLONE_NEWPID | CLONE_NEWUSER) from the
previous process, and in each pid namespace of this chain we've also
created some number of processes to hold pid numbers. Next we create one
more "target" process with new pid and user namespaces at the end of the
chain; it can have random pids in each pid namespace of the chain.

When CRIU restores this container it would not be able to restore the
pids of the "target" process on each pid namespace level with clone3()+
set_tid, because it would need to call clone from the owner user namespace
(or its parent if we use CLONE_NEWUSER) of the "target"'s pid
namespace, which has no rights to set_tid on each needed level.

With the new userns_fd option we can easily do it: we just need the
current user namespace to be the root user namespace of the container and
to pass the owner user namespace of the "target"'s pid namespace in
userns_fd.

Here are two examples on the use of new userns_fd option:

- clone3_owner_ns.c - is a simple demonstration of how a process can
create a pid namespace owned by a descendant of its user namespace;
- clone3_set_tid_vs_owner_ns.c - is a bit more complex demonstration
of how clone3+set_tid can work for restoring pids on each level of
nested user and pid namespaces when used together with userns_fd:

https://github.com/Snorch/clone3_owner_ns

Signed-off-by: Pavel Tikhomirov <[email protected]>
---
include/linux/nsproxy.h | 3 ++-
include/linux/sched/task.h | 1 +
include/linux/user_namespace.h | 6 ++++++
include/uapi/linux/sched.h | 3 +++
kernel/fork.c | 18 +++++++++++++++---
kernel/nsproxy.c | 19 ++++++++++++++++++-
kernel/user_namespace.c | 22 ++++++++++++++++++++++
7 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index cdb171efc7cb..201bbb75637d 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -91,7 +91,8 @@ static inline struct cred *nsset_cred(struct nsset *set)
*
*/

-int copy_namespaces(unsigned long flags, struct task_struct *tsk);
+int copy_namespaces(unsigned long flags, struct task_struct *tsk,
+ int userns_fd);
void exit_task_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
void free_nsproxy(struct nsproxy *ns);
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index c0f71f2e7160..176b087443a0 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -33,6 +33,7 @@ struct kernel_clone_args {
int cgroup;
struct cgroup *cgrp;
struct css_set *cset;
+ int userns_fd;
};

/*
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 64cf8ebdc4ec..cecc4c55b7cb 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -136,6 +136,7 @@ extern bool in_userns(const struct user_namespace *ancestor,
const struct user_namespace *child);
extern bool current_in_userns(const struct user_namespace *target_ns);
struct ns_common *ns_get_owner(struct ns_common *ns);
+extern struct user_namespace *get_user_ns_by_fd(int fd);
#else

static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -180,6 +181,11 @@ static inline struct ns_common *ns_get_owner(struct ns_common *ns)
{
return ERR_PTR(-EPERM);
}
+
+static inline struct user_namespace *get_user_ns_by_fd(int fd)
+{
+ return ERR_PTR(-EINVAL);
+}
#endif

#endif /* _LINUX_USER_H */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab2..77dfa0bb4d73 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -36,6 +36,7 @@
/* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
+#define CLONE_OWNER_NS 0x400000000ULL /* Clone with changed owner userns */

/*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
@@ -101,12 +102,14 @@ struct clone_args {
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
__aligned_u64 cgroup;
+ __aligned_u64 userns_fd;
};
#endif

#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define CLONE_ARGS_SIZE_VER3 96 /* sizeof fourth published struct */

/*
* Scheduling policies
diff --git a/kernel/fork.c b/kernel/fork.c
index d66cd1014211..a22acfa4b618 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2097,7 +2097,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_mm(clone_flags, p);
if (retval)
goto bad_fork_cleanup_signal;
- retval = copy_namespaces(clone_flags, p);
+ retval = copy_namespaces(clone_flags, p, args->userns_fd);
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p);
@@ -2596,7 +2596,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
CLONE_ARGS_SIZE_VER1);
BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
CLONE_ARGS_SIZE_VER2);
- BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
+ BUILD_BUG_ON(offsetofend(struct clone_args, userns_fd) !=
+ CLONE_ARGS_SIZE_VER3);
+ BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER3);

if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
@@ -2628,6 +2630,10 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
(args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
return -EINVAL;

+ if ((args.flags & CLONE_OWNER_NS) &&
+ (usize < CLONE_ARGS_SIZE_VER3))
+ return -EINVAL;
+
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
@@ -2639,6 +2645,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
.tls = args.tls,
.set_tid_size = args.set_tid_size,
.cgroup = args.cgroup,
+ .userns_fd = args.userns_fd,
};

if (args.set_tid &&
@@ -2683,7 +2690,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
/* Verify that no unknown flags are passed along. */
if (kargs->flags &
- ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
+ ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND |
+ CLONE_INTO_CGROUP | CLONE_OWNER_NS))
return false;

/*
@@ -2704,6 +2712,10 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
if (!clone3_stack_valid(kargs))
return false;

+ if ((kargs->flags & (CLONE_OWNER_NS | CLONE_NEWUSER)) ==
+ (CLONE_OWNER_NS | CLONE_NEWUSER))
+ return false;
+
return true;
}

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index abc01fcad8c7..43326dd0df3f 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -19,6 +19,7 @@
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
+#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
@@ -148,10 +149,12 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
* called from clone. This now handles copy for nsproxy and all
* namespaces therein.
*/
-int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+int copy_namespaces(unsigned long flags, struct task_struct *tsk,
+ int userns_fd)
{
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
+ struct user_namespace *owner = NULL;
struct nsproxy *new_ns;

if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
@@ -175,7 +178,21 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
(CLONE_NEWIPC | CLONE_SYSVSEM))
return -EINVAL;

+ if (flags & CLONE_OWNER_NS) {
+ owner = get_user_ns_by_fd(userns_fd);
+ if (IS_ERR(owner))
+ return -EINVAL;
+
+ if (!in_userns(user_ns, owner)) {
+ put_user_ns(owner);
+ return -EPERM;
+ }
+
+ user_ns = owner;
+ }
+
new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
+ put_user_ns(owner);
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index af612945a4d0..c578f478eedc 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -20,6 +20,7 @@
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
+#include <linux/file.h>

static struct kmem_cache *user_ns_cachep __read_mostly;
static DEFINE_MUTEX(userns_state_mutex);
@@ -1253,6 +1254,27 @@ static void userns_put(struct ns_common *ns)
put_user_ns(to_user_ns(ns));
}

+struct user_namespace *get_user_ns_by_fd(int fd)
+{
+ struct file *file;
+ struct ns_common *ns;
+ struct user_namespace *user_ns;
+
+ file = proc_ns_fget(fd);
+ if (IS_ERR(file))
+ return ERR_CAST(file);
+
+ ns = get_proc_ns(file_inode(file));
+ if (ns->ops == &userns_operations)
+ user_ns = get_user_ns(to_user_ns(ns));
+ else
+ user_ns = ERR_PTR(-EINVAL);
+
+ fput(file);
+ return user_ns;
+}
+EXPORT_SYMBOL(get_user_ns_by_fd);
+
static int userns_install(struct nsset *nsset, struct ns_common *ns)
{
struct user_namespace *user_ns = to_user_ns(ns);
--
2.30.2