Add a simple capability helper which makes it possible to determine
whether a set of creds is ns capable wrt to the passed in credentials.
This is not something exciting it's just a more pleasant wrapper around
security_capable() by allowing ns_capable_common() to ake a const struct
cred argument. In ptrace_has_cap() for example, we're using
security_capable() directly. ns_capable_cred() will be used in the next
patch to check against the target credentials the caller is going to
switch to.
Cc: Eric W. Biederman <[email protected]>
Cc: Serge Hallyn <[email protected]>
Signed-off-by: Christian Brauner <[email protected]>
---
/* v2 */
patch introduced
---
include/linux/capability.h | 3 +++
kernel/capability.c | 17 +++++++++++------
2 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/include/linux/capability.h b/include/linux/capability.h
index ecce0f43c73a..743a08d936fb 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -40,6 +40,7 @@ struct cpu_vfs_cap_data {
struct file;
struct inode;
struct dentry;
+struct cred;
struct task_struct;
struct user_namespace;
@@ -209,6 +210,8 @@ extern bool has_ns_capability_noaudit(struct task_struct *t,
struct user_namespace *ns, int cap);
extern bool capable(int cap);
extern bool ns_capable(struct user_namespace *ns, int cap);
+extern bool ns_capable_cred(const struct cred *cred,
+ struct user_namespace *ns, int cap);
extern bool ns_capable_noaudit(struct user_namespace *ns, int cap);
extern bool ns_capable_setid(struct user_namespace *ns, int cap);
#else
diff --git a/kernel/capability.c b/kernel/capability.c
index 1444f3954d75..84425781917e 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -361,8 +361,8 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
-static bool ns_capable_common(struct user_namespace *ns,
- int cap,
+static bool ns_capable_common(const struct cred *cred,
+ struct user_namespace *ns, int cap,
unsigned int opts)
{
int capable;
@@ -372,7 +372,7 @@ static bool ns_capable_common(struct user_namespace *ns,
BUG();
}
- capable = security_capable(current_cred(), ns, cap, opts);
+ capable = security_capable(cred, ns, cap, opts);
if (capable == 0) {
current->flags |= PF_SUPERPRIV;
return true;
@@ -393,10 +393,15 @@ static bool ns_capable_common(struct user_namespace *ns,
*/
bool ns_capable(struct user_namespace *ns, int cap)
{
- return ns_capable_common(ns, cap, CAP_OPT_NONE);
+ return ns_capable_common(current_cred(), ns, cap, CAP_OPT_NONE);
}
EXPORT_SYMBOL(ns_capable);
+bool ns_capable_cred(const struct cred *cred, struct user_namespace *ns, int cap)
+{
+ return ns_capable_common(cred, ns, cap, CAP_OPT_NONE);
+}
+
/**
* ns_capable_noaudit - Determine if the current task has a superior capability
* (unaudited) in effect
@@ -411,7 +416,7 @@ EXPORT_SYMBOL(ns_capable);
*/
bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
- return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT);
+ return ns_capable_common(current_cred(), ns, cap, CAP_OPT_NOAUDIT);
}
EXPORT_SYMBOL(ns_capable_noaudit);
@@ -430,7 +435,7 @@ EXPORT_SYMBOL(ns_capable_noaudit);
*/
bool ns_capable_setid(struct user_namespace *ns, int cap)
{
- return ns_capable_common(ns, cap, CAP_OPT_INSETID);
+ return ns_capable_common(current_cred(), ns, cap, CAP_OPT_INSETID);
}
EXPORT_SYMBOL(ns_capable_setid);
base-commit: ae83d0b416db002fe95601e7f97f64b59514d936
--
2.26.2
For quite a while we have been thinking about using pidfds to attach to
namespaces. This patchset has existed for about a year already but we've
wanted to wait to see how the general api would be received and adopted.
Now that more and more programs in userspace have started using pidfds
for process management it's time to send this one out.
This patch makes it possible to use pidfds to attach to the namespaces
of another process, i.e. they can be passed as the first argument to the
setns() syscall. When only a single namespace type is specified the
semantics are equivalent to passing an nsfd. That means
setns(nsfd, CLONE_NEWNET) equals setns(pidfd, CLONE_NEWNET). However,
when a pidfd is passed, multiple namespace flags can be specified in the
second setns() argument and setns() will attach the caller to all the
specified namespaces all at once or to none of them. Specifying 0 is not
valid together with a pidfd.
The obvious example where this is useful is a standard container
manager interacting with a running container: pushing and pulling files
or directories, injecting mounts, attaching/execing any kind of process,
managing network devices all these operations require attaching to all
or at least multiple namespaces at the same time. Given that nowadays
most containers are spawned with all namespaces enabled we're currently
looking at at least 14 syscalls, 7 to open the /proc/<pid>/ns/<ns>
nsfds, another 7 to actually perform the namespace switch. With time
namespaces we're looking at about 16 syscalls.
(We could amortize the first 7 or 8 syscalls for opening the nsfds by
stashing them in each container's monitor process but that would mean
we need to send around those file descriptors through unix sockets
everytime we want to interact with the container or keep on-disk
state. Even in scenarios where a caller wants to join a particular
namespace in a particular order callers still profit from batching
other namespaces. That mostly applies to the user namespace but
all container runtimes I found join the user namespace first no matter
if it privileges or deprivileges the container similar to how unshare
behaves.)
With pidfds this becomes a single syscall no matter how many namespaces
are supposed to be attached to.
A decently designed, large-scale container manager usually isn't the
parent of any of the containers it spawns so the containers don't die
when it crashes or needs to update or reinitialize. This means that
for the manager to interact with containers through pids is inherently
racy especially on systems where the maximum pid number is not
significicantly bumped. This is even more problematic since we often spawn
and manage thousands or ten-thousands of containers. Interacting with a
container through a pid thus can become risky quite quickly. Especially
since we allow for an administrator to enable advanced features such as
syscall interception where we're performing syscalls in lieu of the
container. In all of those cases we use pidfds if they are available and
we pass them around as stable references. Using them to setns() to the
target process' namespaces is as reliable as using nsfds. Either the
target process is already dead and we get ESRCH or we manage to attach
to its namespaces but we can't accidently attach to another process'
namespaces. So pidfds lend themselves to be used with this api.
The other main advantage is that with this change the pidfd becomes the
only relevant token for most container interactions and it's the only
token we need to create and send around.
Apart from significiantly reducing the number of syscalls from double
digit to single digit which is a decent reason post-spectre/meltdown
this also allows to switch to a set of namespaces atomically, i.e.
either attaching to all the specified namespaces succeeds or we fail. If
we fail we haven't changed a single namespace. There are currently three
namespaces that can fail (other than for ENOMEM which really is not
very interesting since we then have other problems anyway) for
non-trivial reasons, user, mount, and pid namespaces. We can fail to
attach to a pid namespace if it is not our current active pid namespace
or a descendant of it. We can fail to attach to a user namespace because
we are multi-threaded or because our current mount namespace shares
filesystem state with other tasks, or because we're trying to setns()
to the same user namespace, i.e. the target task has the same user
namespace as we do. We can fail to attach to a mount namespace because
it shares filesystem state with other tasks or because we fail to lookup
the new root for the new mount namespace. In most non-pathological
scenarios these issues can be somewhat mitigated. But there are cases where
we're half-attached to some namespace and failing to attach to another one.
I've talked about some of these problem during the hallway track (something
only the pre-COVID-19 generation will remember) of Plumbers in Los Angeles
in 2018(?). Even if all these issues could be avoided with super careful
userspace coding it would be nicer to have this done in-kernel. Pidfds seem
to lend themselves nicely for this.
The other neat thing about this is that setns() becomes an actual
counterpart to the namespace bits of unshare().
Cc: Eric W. Biederman <[email protected]>
Cc: Serge Hallyn <[email protected]>
Cc: Jann Horn <[email protected]>
Cc: Michael Kerrisk <[email protected]>
Cc: Aleksa Sarai <[email protected]>
Signed-off-by: Christian Brauner <[email protected]>
---
Seems that some people do in fact find this useful and I still have
hopes that I can potentially send this for v5.8. I've added some
expanded testing to this whole series to catch any mistakes.
The lifecycle management for various objects was certainly the
most interesting aspect so I'd appreciate a close look.
/* v2 */
- Michael Kerrisk <[email protected]>:
- Michael pointed out that the semantics for setns(nsfd, 0) and
setns(pidfd, 0) are not comparable. setns(pidfd, 0) is now
disallowed completely. Users wanting to attach to all namespaces
should simply specify them explicitly just as with unshare() and
clone3().
- Jann Horn <[email protected]>:
- Jann pointed out that the setns() in its first form wasn't atomic in
that userspace could end up in an intermediate state where e.g. the
process had moved into the target user namespace but failed to move
into the target mount namespace.
In this new version I've removed all intermediate states. There's an
installation/preparation state and a commit state similar to
prepare_creds() and commit_creds().
---
fs/nsfs.c | 7 +-
include/linux/proc_fs.h | 6 ++
kernel/nsproxy.c | 228 ++++++++++++++++++++++++++++++++++++----
3 files changed, 222 insertions(+), 19 deletions(-)
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 4f1205725cfe..b023c1a367c8 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -229,6 +229,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task,
return res;
}
+bool proc_ns_file(const struct file *file)
+{
+ return file->f_op == &ns_file_operations;
+}
+
struct file *proc_ns_fget(int fd)
{
struct file *file;
@@ -237,7 +242,7 @@ struct file *proc_ns_fget(int fd)
if (!file)
return ERR_PTR(-EBADF);
- if (file->f_op != &ns_file_operations)
+ if (!proc_ns_file(file))
goto out_invalid;
return file;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 45c05fd9c99d..acfd5012db4e 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -104,6 +104,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
proc_write_t write,
void *data);
extern struct pid *tgid_pidfd_to_pid(const struct file *file);
+extern bool proc_ns_file(const struct file *file);
#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
@@ -159,6 +160,11 @@ static inline struct pid *tgid_pidfd_to_pid(const struct file *file)
return ERR_PTR(-EBADF);
}
+static inline bool proc_ns_file(const struct file *file)
+{
+ return false;
+}
+
#endif /* CONFIG_PROC_FS */
struct net;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 2da463bab58a..8cae508acb9c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -20,6 +20,7 @@
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
#include <linux/fs_struct.h>
+#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
@@ -258,12 +259,53 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
+static int check_setns_flags(unsigned long flags)
+{
+ if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID |
+ CLONE_NEWCGROUP)))
+ return -EINVAL;
+
+#ifndef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_UTS_NS
+ if (flags & CLONE_NEWUTS)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_CGROUPS
+ if (flags & CLONE_NEWCGROUP)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_NET_NS
+ if (flags & CLONE_NEWNET)
+ return -EINVAL;
+#endif
+
+ return 0;
+}
+
static void put_nsset(struct nsset *nsset)
{
unsigned flags = nsset->flags;
if (flags & CLONE_NEWUSER)
put_cred(nsset_cred(nsset));
+ /*
+ * We only created a temporary copy if we attached to more than just
+ * the mount namespace.
+ */
+ if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
+ free_fs_struct(nsset->fs);
if (nsset->nsproxy)
free_nsproxy(nsset->nsproxy);
}
@@ -283,8 +325,14 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
if (!nsset->cred)
goto out;
- if (flags & CLONE_NEWNS)
+ /* Only create a temporary copy of fs_struct if we really need to. */
+ if (flags == CLONE_NEWNS) {
nsset->fs = me->fs;
+ } else if (flags & CLONE_NEWNS) {
+ nsset->fs = copy_fs_struct(me->fs);
+ if (!nsset->fs)
+ goto out;
+ }
nsset->flags = flags;
return 0;
@@ -294,6 +342,138 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
return -ENOMEM;
}
+static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
+{
+ return ns->ops->install(nsset, ns);
+}
+
+/*
+ * This is the inverse operation to unshare().
+ * Ordering is equivalent to the standard ordering used everywhere else
+ * during unshare and process creation. The switch to the new set of
+ * namespaces occurs at the point of no return after installation of
+ * all requested namespaces was successful in commit_nsset().
+ */
+static int validate_nsset(struct nsset *nsset, struct pid *pid)
+{
+ int ret = 0;
+ unsigned flags = nsset->flags;
+ struct user_namespace *user_ns = NULL;
+ struct pid_namespace *pid_ns = NULL;
+ struct nsproxy *nsp;
+ struct task_struct *tsk;
+
+ /* Take a "snapshot" of the target task's namespaces. */
+ rcu_read_lock();
+ tsk = pid_task(pid, PIDTYPE_PID);
+ if (!tsk) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+ if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
+ rcu_read_unlock();
+ return -EPERM;
+ }
+
+ task_lock(tsk);
+ nsp = tsk->nsproxy;
+ if (nsp)
+ get_nsproxy(nsp);
+ task_unlock(tsk);
+ if (!nsp) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+#ifdef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID) {
+ pid_ns = task_active_pid_ns(tsk);
+ if (unlikely(!pid_ns)) {
+ rcu_read_unlock();
+ ret = -ESRCH;
+ goto out;
+ }
+ get_pid_ns(pid_ns);
+ }
+#endif
+
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER)
+ user_ns = get_user_ns(__task_cred(tsk)->user_ns);
+#endif
+ rcu_read_unlock();
+
+ /*
+ * Install requested namespaces. The caller will have
+ * verified earlier that the requested namespaces are
+ * supported on this kernel. We don't report errors here
+ * if a namespace is requested that isn't supported.
+ */
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER) {
+ ret = validate_ns(nsset, &user_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+ if (flags & CLONE_NEWNS) {
+ ret = validate_ns(nsset, mnt_ns_to_common(nsp->mnt_ns));
+ if (ret)
+ goto out;
+ }
+
+#ifdef CONFIG_UTS_NS
+ if (flags & CLONE_NEWUTS) {
+ ret = validate_ns(nsset, &nsp->uts_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC) {
+ ret = validate_ns(nsset, &nsp->ipc_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID) {
+ ret = validate_ns(nsset, &pid_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_CGROUPS
+ if (flags & CLONE_NEWCGROUP) {
+ ret = validate_ns(nsset, &nsp->cgroup_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_NET_NS
+ if (flags & CLONE_NEWNET) {
+ ret = validate_ns(nsset, &nsp->net_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+out:
+ if (pid_ns)
+ put_pid_ns(pid_ns);
+ if (nsp)
+ put_nsproxy(nsp);
+ put_user_ns(user_ns);
+
+ return ret;
+}
+
/*
* This is the point of no return. There are just a few namespaces
* that do some actual work here and it's sufficiently minimal that
@@ -316,6 +496,12 @@ static void commit_nsset(struct nsset *nsset)
}
#endif
+ /* We only need to commit if we have used a temporary fs_struct. */
+ if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) {
+ set_fs_root(me->fs, &nsset->fs->root);
+ set_fs_pwd(me->fs, &nsset->fs->pwd);
+ }
+
#ifdef CONFIG_IPC_NS
if (flags & CLONE_NEWIPC)
exit_sem(me);
@@ -326,33 +512,39 @@ static void commit_nsset(struct nsset *nsset)
nsset->nsproxy = NULL;
}
-static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
-{
- return ns->ops->install(nsset, ns);
-}
-
-SYSCALL_DEFINE2(setns, int, fd, int, nstype)
+SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
struct task_struct *tsk = current;
struct file *file;
- struct ns_common *ns;
+ struct ns_common *ns = NULL;
struct nsset nsset = {};
- int err;
-
- file = proc_ns_fget(fd);
- if (IS_ERR(file))
- return PTR_ERR(file);
+ int err = 0;
- err = -EINVAL;
- ns = get_proc_ns(file_inode(file));
- if (nstype && (ns->ops->type != nstype))
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ if (proc_ns_file(file)) {
+ ns = get_proc_ns(file_inode(file));
+ if (flags && (ns->ops->type != flags))
+ err = -EINVAL;
+ flags = ns->ops->type;
+ } else if (pidfd_pid(file)) {
+ err = check_setns_flags(flags);
+ } else {
+ err = -EBADF;
+ }
+ if (err)
goto out;
- err = prepare_nsset(nstype, &nsset);
+ err = prepare_nsset(flags, &nsset);
if (err)
goto out;
- err = validate_ns(&nsset, ns);
+ if (proc_ns_file(file))
+ err = validate_ns(&nsset, ns);
+ else
+ err = validate_nsset(&nsset, file->private_data);
if (!err) {
commit_nsset(&nsset);
perf_event_namespaces(tsk);
--
2.26.2
Add a simple struct nsset. It holds all necessary pieces to switch to a new
set of namespaces without leaving a task in a half-switched state which we
will make use of in the next patch. This patch simply switches the existing
setns logic over without causing a change in setns() behavior. This brings
setns() closer to how unshare() works().
Cc: Eric W. Biederman <[email protected]>
Cc: Serge Hallyn <[email protected]>
Signed-off-by: Christian Brauner <[email protected]>
---
/* v2 */
patch introduced
---
fs/namespace.c | 17 +++++--
include/linux/mnt_namespace.h | 2 +
include/linux/nsproxy.h | 24 +++++++++
include/linux/proc_ns.h | 4 +-
ipc/namespace.c | 9 ++--
kernel/cgroup/namespace.c | 7 +--
kernel/nsproxy.c | 94 ++++++++++++++++++++++++++++++-----
kernel/pid_namespace.c | 7 +--
kernel/time/namespace.c | 7 +--
kernel/user_namespace.c | 8 +--
kernel/utsname.c | 7 +--
net/core/net_namespace.c | 7 +--
12 files changed, 150 insertions(+), 43 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index a28e4db075ed..91d2b86aadfb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1733,6 +1733,11 @@ static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
return container_of(ns, struct mnt_namespace, ns);
}
+struct ns_common *mnt_ns_to_common(struct mnt_namespace *mnt)
+{
+ return &mnt->ns;
+}
+
static bool mnt_ns_loop(struct dentry *dentry)
{
/* Could bind mounting the mount namespace inode cause a
@@ -3954,16 +3959,18 @@ static void mntns_put(struct ns_common *ns)
put_mnt_ns(to_mnt_ns(ns));
}
-static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int mntns_install(struct nsset *nsset, struct ns_common *ns)
{
- struct fs_struct *fs = current->fs;
+ struct nsproxy *nsproxy = nsset->nsproxy;
+ struct fs_struct *fs = nsset->fs;
struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
+ struct user_namespace *user_ns = nsset->cred->user_ns;
struct path root;
int err;
- if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ if (!ns_capable_cred(nsset->cred, mnt_ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable_cred(nsset->cred, user_ns, CAP_SYS_CHROOT) ||
+ !ns_capable_cred(nsset->cred, user_ns, CAP_SYS_ADMIN))
return -EPERM;
if (is_anon_ns(mnt_ns))
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 35942084cd40..664dd3b06f34 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -6,10 +6,12 @@
struct mnt_namespace;
struct fs_struct;
struct user_namespace;
+struct ns_common;
extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
struct user_namespace *, struct fs_struct *);
extern void put_mnt_ns(struct mnt_namespace *ns);
+extern struct ns_common *mnt_ns_to_common(struct mnt_namespace *);
extern const struct file_operations proc_mounts_operations;
extern const struct file_operations proc_mountinfo_operations;
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 074f395b9ad2..cdb171efc7cb 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -41,6 +41,30 @@ struct nsproxy {
};
extern struct nsproxy init_nsproxy;
+/*
+ * A structure to encompass all bits needed to install
+ * a partial or complete new set of namespaces.
+ *
+ * If a new user namespace is requested cred will
+ * point to a modifiable set of credentials. If a pointer
+ * to a modifiable set is needed nsset_cred() must be
+ * used and tested.
+ */
+struct nsset {
+ unsigned flags;
+ struct nsproxy *nsproxy;
+ struct fs_struct *fs;
+ const struct cred *cred;
+};
+
+static inline struct cred *nsset_cred(struct nsset *set)
+{
+ if (set->flags & CLONE_NEWUSER)
+ return (struct cred *)set->cred;
+
+ return NULL;
+}
+
/*
* the namespaces access rules are:
*
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 6abe85c34681..75807ecef880 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -8,7 +8,7 @@
#include <linux/ns_common.h>
struct pid_namespace;
-struct nsproxy;
+struct nsset;
struct path;
struct task_struct;
struct inode;
@@ -19,7 +19,7 @@ struct proc_ns_operations {
int type;
struct ns_common *(*get)(struct task_struct *task);
void (*put)(struct ns_common *ns);
- int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
+ int (*install)(struct nsset *nsset, struct ns_common *ns);
struct user_namespace *(*owner)(struct ns_common *ns);
struct ns_common *(*get_parent)(struct ns_common *ns);
} __randomize_layout;
diff --git a/ipc/namespace.c b/ipc/namespace.c
index b3ca1476ca51..df6aa2deab18 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -177,15 +177,14 @@ static void ipcns_put(struct ns_common *ns)
return put_ipc_ns(to_ipc_ns(ns));
}
-static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new)
+static int ipcns_install(struct nsset *nsset, struct ns_common *new)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct ipc_namespace *ns = to_ipc_ns(new);
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ if (!ns_capable_cred(nsset->cred, ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable_cred(nsset->cred, nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
- /* Ditch state from the old ipc namespace */
- exit_sem(current);
put_ipc_ns(nsproxy->ipc_ns);
nsproxy->ipc_ns = get_ipc_ns(ns);
return 0;
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index b05f1dd58a62..01279c9e6dee 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -95,12 +95,13 @@ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
return container_of(ns, struct cgroup_namespace, ns);
}
-static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
- if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
- !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+ if (!ns_capable_cred(nsset->cred, nsset->cred->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable_cred(nsset->cred, cgroup_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/* Don't need to do anything if we are attaching to our own cgroupns. */
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ed9882108cd2..2da463bab58a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -19,6 +19,7 @@
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
+#include <linux/fs_struct.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
@@ -257,12 +258,85 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
+static void put_nsset(struct nsset *nsset)
+{
+ unsigned flags = nsset->flags;
+
+ if (flags & CLONE_NEWUSER)
+ put_cred(nsset_cred(nsset));
+ if (nsset->nsproxy)
+ free_nsproxy(nsset->nsproxy);
+}
+
+static int prepare_nsset(unsigned flags, struct nsset *nsset)
+{
+ struct task_struct *me = current;
+
+ nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
+ if (IS_ERR(nsset->nsproxy))
+ return PTR_ERR(nsset->nsproxy);
+
+ if (flags & CLONE_NEWUSER)
+ nsset->cred = prepare_creds();
+ else
+ nsset->cred = current_cred();
+ if (!nsset->cred)
+ goto out;
+
+ if (flags & CLONE_NEWNS)
+ nsset->fs = me->fs;
+
+ nsset->flags = flags;
+ return 0;
+
+out:
+ put_nsset(nsset);
+ return -ENOMEM;
+}
+
+/*
+ * This is the point of no return. There are just a few namespaces
+ * that do some actual work here and it's sufficiently minimal that
+ * a separate ns_common operation seems unnecessary for now.
+ * Unshare is doing the same thing. If we'll end up needing to do
+ * more in a given namespace or a helper here is ultimately not
+ * exported anymore a simple commit handler for each namespace
+ * should be added to ns_common.
+ */
+static void commit_nsset(struct nsset *nsset)
+{
+ unsigned flags = nsset->flags;
+ struct task_struct *me = current;
+
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER) {
+ /* transfer ownership */
+ commit_creds(nsset_cred(nsset));
+ nsset->cred = NULL;
+ }
+#endif
+
+#ifdef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC)
+ exit_sem(me);
+#endif
+
+ /* transfer ownership */
+ switch_task_namespaces(me, nsset->nsproxy);
+ nsset->nsproxy = NULL;
+}
+
+static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
+{
+ return ns->ops->install(nsset, ns);
+}
+
SYSCALL_DEFINE2(setns, int, fd, int, nstype)
{
struct task_struct *tsk = current;
- struct nsproxy *new_nsproxy;
struct file *file;
struct ns_common *ns;
+ struct nsset nsset = {};
int err;
file = proc_ns_fget(fd);
@@ -274,20 +348,16 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
if (nstype && (ns->ops->type != nstype))
goto out;
- new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
- if (IS_ERR(new_nsproxy)) {
- err = PTR_ERR(new_nsproxy);
+ err = prepare_nsset(nstype, &nsset);
+ if (err)
goto out;
- }
- err = ns->ops->install(new_nsproxy, ns);
- if (err) {
- free_nsproxy(new_nsproxy);
- goto out;
+ err = validate_ns(&nsset, ns);
+ if (!err) {
+ commit_nsset(&nsset);
+ perf_event_namespaces(tsk);
}
- switch_task_namespaces(tsk, new_nsproxy);
-
- perf_event_namespaces(tsk);
+ put_nsset(&nsset);
out:
fput(file);
return err;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 01f8ba32cc0c..b0b0e1055812 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -378,13 +378,14 @@ static void pidns_put(struct ns_common *ns)
put_pid_ns(to_pid_ns(ns));
}
-static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int pidns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct pid_namespace *active = task_active_pid_ns(current);
struct pid_namespace *ancestor, *new = to_pid_ns(ns);
- if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ if (!ns_capable_cred(nsset->cred, new->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable_cred(nsset->cred, nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 53bce347cd50..8ff16ef947ef 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -280,16 +280,17 @@ static void timens_put(struct ns_common *ns)
put_time_ns(to_time_ns(ns));
}
-static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
+static int timens_install(struct nsset *nsset, struct ns_common *new)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct time_namespace *ns = to_time_ns(new);
int err;
if (!current_is_single_threaded())
return -EUSERS;
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ if (!ns_capable_cred(nsset->cred, ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable_cred(nsset->cred, nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
timens_set_vvar_page(current, ns);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 8eadadc478f9..87804e0371fe 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1253,7 +1253,7 @@ static void userns_put(struct ns_common *ns)
put_user_ns(to_user_ns(ns));
}
-static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int userns_install(struct nsset *nsset, struct ns_common *ns)
{
struct user_namespace *user_ns = to_user_ns(ns);
struct cred *cred;
@@ -1274,14 +1274,14 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
- cred = prepare_creds();
+ cred = nsset_cred(nsset);
if (!cred)
- return -ENOMEM;
+ return -EINVAL;
put_user_ns(cred->user_ns);
set_cred_user_ns(cred, get_user_ns(user_ns));
- return commit_creds(cred);
+ return 0;
}
struct ns_common *ns_get_owner(struct ns_common *ns)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index f0e491193009..f8fa76756022 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -140,12 +140,13 @@ static void utsns_put(struct ns_common *ns)
put_uts_ns(to_uts_ns(ns));
}
-static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
+static int utsns_install(struct nsset *nsset, struct ns_common *new)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct uts_namespace *ns = to_uts_ns(new);
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ if (!ns_capable_cred(nsset->cred, ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable_cred(nsset->cred, nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
get_uts_ns(ns);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 190ca66a383b..e312a5f411ae 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -1353,12 +1353,13 @@ static void netns_put(struct ns_common *ns)
put_net(to_net_ns(ns));
}
-static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int netns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct net *net = to_net_ns(ns);
- if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ if (!ns_capable_cred(nsset->cred, net->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable_cred(nsset->cred, nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
put_net(nsproxy->net_ns);
--
2.26.2
This is basically a test-suite for setns() and as of now contains:
- test that we can't pass garbage flags
- test that we can't attach to the namespaces of task that has already exited
- test that we can incrementally setns into all namespaces of a target task
using a pidfd
- test that we can setns atomically into all namespaces of a target task
- test that we can't cross setns into a user namespace outside of our user
namespace hierarchy
- test that we can't setns into namespaces owned by user namespaces over which
we are not privileged
Signed-off-by: Christian Brauner <[email protected]>
---
/* v2 */
patch introduced
---
tools/testing/selftests/pidfd/.gitignore | 1 +
tools/testing/selftests/pidfd/Makefile | 3 +-
tools/testing/selftests/pidfd/config | 6 +
.../selftests/pidfd/pidfd_setns_test.c | 422 ++++++++++++++++++
4 files changed, 431 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/pidfd/config
create mode 100644 tools/testing/selftests/pidfd/pidfd_setns_test.c
diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore
index 2d4db5afb142..973198a3ec3d 100644
--- a/tools/testing/selftests/pidfd/.gitignore
+++ b/tools/testing/selftests/pidfd/.gitignore
@@ -5,3 +5,4 @@ pidfd_test
pidfd_wait
pidfd_fdinfo_test
pidfd_getfd_test
+pidfd_setns_test
diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile
index 75a545861375..f4a2f28f926b 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -1,7 +1,8 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -g -I../../../../usr/include/ -pthread
-TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait pidfd_getfd_test
+TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \
+ pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test
include ../lib.mk
diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config
new file mode 100644
index 000000000000..bb11de90c0c9
--- /dev/null
+++ b/tools/testing/selftests/pidfd/config
@@ -0,0 +1,6 @@
+CONFIG_UTS_NS=y
+CONFIG_IPC_NS=y
+CONFIG_USER_NS=y
+CONFIG_PID_NS=y
+CONFIG_NET_NS=y
+CONFIG_CGROUPS=y
diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c
new file mode 100644
index 000000000000..c327cc3b0ec9
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <linux/kcmp.h>
+
+#include "pidfd.h"
+#include "../clone3/clone3_selftests.h"
+#include "../kselftest.h"
+#include "../kselftest_harness.h"
+
+enum {
+ PIDFD_NS_USER,
+ PIDFD_NS_MNT,
+ PIDFD_NS_PID,
+ PIDFD_NS_UTS,
+ PIDFD_NS_IPC,
+ PIDFD_NS_NET,
+ PIDFD_NS_CGROUP,
+ PIDFD_NS_PIDCLD,
+ PIDFD_NS_MAX
+};
+
+const struct ns_info {
+ const char *proc_name;
+ int clone_flag;
+ const char *flag_name;
+} ns_info[] = {
+ [PIDFD_NS_USER] = { "user", CLONE_NEWUSER, "CLONE_NEWUSER", },
+ [PIDFD_NS_MNT] = { "mnt", CLONE_NEWNS, "CLONE_NEWNS", },
+ [PIDFD_NS_PID] = { "pid", CLONE_NEWPID, "CLONE_NEWPID", },
+ [PIDFD_NS_UTS] = { "uts", CLONE_NEWUTS, "CLONE_NEWUTS", },
+ [PIDFD_NS_IPC] = { "ipc", CLONE_NEWIPC, "CLONE_NEWIPC", },
+ [PIDFD_NS_NET] = { "net", CLONE_NEWNET, "CLONE_NEWNET", },
+ [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, "CLONE_NEWCGROUP", },
+ [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, "INVALID_FLAG_PID_FOR_CHILDREN", },
+};
+
+FIXTURE(current_nsset)
+{
+ pid_t pid;
+ int pidfd;
+ int ns_fds[PIDFD_NS_MAX];
+
+ pid_t child_pid_exited;
+ int child_pidfd_exited;
+
+ pid_t child_pid_all_ns_stopped1;
+ int child_pidfd_all_ns_stopped1;
+ int child_ns_fds_all_ns_stopped1[PIDFD_NS_MAX];
+
+ pid_t child_pid_all_ns_stopped2;
+ int child_pidfd_all_ns_stopped2;
+ int child_ns_fds_all_ns_stopped2[PIDFD_NS_MAX];
+};
+
+static int sys_waitid(int which, pid_t pid, int options)
+{
+ return syscall(__NR_waitid, which, pid, NULL, options, NULL);
+}
+
+pid_t create_child(int *pidfd, unsigned flags)
+{
+ struct clone_args args = {
+ .flags = CLONE_PIDFD | flags,
+ .exit_signal = SIGCHLD,
+ .pidfd = ptr_to_u64(pidfd),
+ };
+
+ return sys_clone3(&args, sizeof(struct clone_args));
+}
+
+FIXTURE_SETUP(current_nsset)
+{
+ int i;
+ int proc_fd;
+
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ self->ns_fds[i] = -EBADF;
+ self->child_ns_fds_all_ns_stopped1[i] = -EBADF;
+ self->child_ns_fds_all_ns_stopped2[i] = -EBADF;
+ }
+
+ proc_fd = open("/proc/self/ns", O_DIRECTORY | O_CLOEXEC);
+ ASSERT_GE(proc_fd, 0) {
+ TH_LOG("%m - Failed to open /proc/self/ns");
+ }
+
+ self->pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ self->ns_fds[i] = openat(proc_fd, info->proc_name, O_RDONLY | O_CLOEXEC);
+ if (self->ns_fds[i] < 0) {
+ EXPECT_EQ(errno, ENOENT) {
+ TH_LOG("%m - Failed to open %s namespace for process %d",
+ info->proc_name, self->pid);
+ }
+ }
+ }
+
+ self->pidfd = sys_pidfd_open(self->pid, 0);
+ ASSERT_GE(self->pidfd, 0) {
+ TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
+ }
+
+ /* Create task that exits right away. */
+ self->child_pid_exited = create_child(&self->child_pidfd_exited,
+ CLONE_NEWUSER | CLONE_NEWNET);
+ ASSERT_GE(self->child_pid_exited, 0);
+
+ if (self->child_pid_exited == 0)
+ _exit(EXIT_SUCCESS);
+
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0);
+
+ self->pidfd = sys_pidfd_open(self->pid, 0);
+ ASSERT_GE(self->pidfd, 0) {
+ TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
+ }
+
+ /* Create tasks that will be stopped. */
+ self->child_pid_all_ns_stopped1 = create_child(&self->child_pidfd_all_ns_stopped1,
+ CLONE_NEWUSER |
+ CLONE_NEWNS |
+ CLONE_NEWCGROUP |
+ CLONE_NEWIPC |
+ CLONE_NEWUTS |
+ CLONE_NEWPID |
+ CLONE_NEWNET);
+ ASSERT_GE(self->child_pid_all_ns_stopped1, 0);
+
+ if (self->child_pid_all_ns_stopped1 == 0) {
+ pause();
+ _exit(EXIT_SUCCESS);
+ }
+
+ self->child_pid_all_ns_stopped2 = create_child(&self->child_pidfd_all_ns_stopped2,
+ CLONE_NEWUSER |
+ CLONE_NEWNS |
+ CLONE_NEWCGROUP |
+ CLONE_NEWIPC |
+ CLONE_NEWUTS |
+ CLONE_NEWPID |
+ CLONE_NEWNET);
+ ASSERT_GE(self->child_pid_all_ns_stopped2, 0);
+
+ if (self->child_pid_all_ns_stopped2 == 0) {
+ pause();
+ _exit(EXIT_SUCCESS);
+ }
+
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ char path[100];
+
+ const struct ns_info *info = &ns_info[i];
+
+ self->ns_fds[i] = openat(proc_fd, info->proc_name, O_RDONLY | O_CLOEXEC);
+ if (self->ns_fds[i] < 0) {
+ EXPECT_EQ(errno, ENOENT) {
+ TH_LOG("%m - Failed to open %s namespace for process %d",
+ info->proc_name, self->pid);
+ }
+ }
+
+ (void)snprintf(path, sizeof(path), "/proc/%d/ns/%s", self->child_pid_all_ns_stopped1, info->proc_name);
+ self->child_ns_fds_all_ns_stopped1[i] = open(path, O_RDONLY | O_CLOEXEC);
+ if (self->child_ns_fds_all_ns_stopped1[i] < 0) {
+ EXPECT_EQ(errno, ENOENT) {
+ TH_LOG("%m - Failed to open %s namespace for process %d",
+ info->proc_name, self->child_pid_all_ns_stopped1);
+ }
+ }
+
+ (void)snprintf(path, sizeof(path), "/proc/%d/ns/%s", self->child_pid_all_ns_stopped1, info->proc_name);
+ self->child_ns_fds_all_ns_stopped2[i] = open(path, O_RDONLY | O_CLOEXEC);
+ if (self->child_ns_fds_all_ns_stopped2[i] < 0) {
+ EXPECT_EQ(errno, ENOENT) {
+ TH_LOG("%m - Failed to open %s namespace for process %d",
+ info->proc_name, self->child_pid_all_ns_stopped1);
+ }
+ }
+ }
+}
+
+FIXTURE_TEARDOWN(current_nsset)
+{
+ int i;
+
+ ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd_all_ns_stopped1,
+ SIGKILL, NULL, 0), 0);
+ ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd_all_ns_stopped2,
+ SIGKILL, NULL, 0), 0);
+
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ if (self->ns_fds[i] >= 0)
+ close(self->ns_fds[i]);
+ if (self->child_ns_fds_all_ns_stopped1[i] >= 0)
+ close(self->child_ns_fds_all_ns_stopped1[i]);
+ if (self->child_ns_fds_all_ns_stopped2[i] >= 0)
+ close(self->child_ns_fds_all_ns_stopped2[i]);
+ }
+
+ EXPECT_EQ(0, close(self->child_pidfd_all_ns_stopped1));
+ EXPECT_EQ(0, close(self->child_pidfd_all_ns_stopped2));
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid_all_ns_stopped1, WEXITED), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid_all_ns_stopped2, WEXITED), 0);
+}
+
+int preserve_ns(const int pid, const char *ns)
+{
+ int ret;
+/* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
+#define __NS_PATH_LEN 50
+ char path[__NS_PATH_LEN];
+
+ /* This way we can use this function to also check whether namespaces
+ * are supported by the kernel by passing in the NULL or the empty
+ * string.
+ */
+ ret = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
+ !ns || strcmp(ns, "") == 0 ? "" : "/",
+ !ns || strcmp(ns, "") == 0 ? "" : ns);
+ if (ret < 0 || (size_t)ret >= __NS_PATH_LEN) {
+ errno = EFBIG;
+ return -1;
+ }
+
+ return open(path, O_RDONLY | O_CLOEXEC);
+}
+
+static int in_same_namespace(int ns_fd1, pid_t pid2, const char *ns)
+{
+ int ns_fd2 = -EBADF;
+ int ret = -1;
+ struct stat ns_st1, ns_st2;
+
+ ret = fstat(ns_fd1, &ns_st1);
+ if (ret < 0)
+ return -1;
+
+ ns_fd2 = preserve_ns(pid2, ns);
+ if (ns_fd2 < 0)
+ return -1;
+
+ ret = fstat(ns_fd2, &ns_st2);
+ close(ns_fd2);
+ if (ret < 0)
+ return -1;
+
+ /* processes are in the same namespace */
+ if ((ns_st1.st_dev == ns_st2.st_dev) &&
+ (ns_st1.st_ino == ns_st2.st_ino))
+ return 1;
+
+ /* processes are in different namespaces */
+ return 0;
+}
+
+/* Test that we can't pass garbage to the kernel. */
+TEST_F(current_nsset, invalid_flags)
+{
+ ASSERT_NE(setns(self->pidfd, 0), 0);
+ EXPECT_EQ(errno, EINVAL);
+
+ ASSERT_NE(setns(self->pidfd, -1), 0);
+ EXPECT_EQ(errno, EINVAL);
+
+ ASSERT_NE(setns(self->pidfd, CLONE_VM), 0);
+ EXPECT_EQ(errno, EINVAL);
+
+ ASSERT_NE(setns(self->pidfd, CLONE_NEWUSER | CLONE_VM), 0);
+ EXPECT_EQ(errno, EINVAL);
+}
+
+/* Test that we can't attach to a task that has already exited. */
+TEST_F(current_nsset, exited_child)
+{
+ int i;
+ pid_t pid;
+
+ ASSERT_NE(setns(self->child_pidfd_exited, CLONE_NEWUSER | CLONE_NEWNET), 0);
+ EXPECT_EQ(errno, ESRCH);
+
+ pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ /* Verify that we haven't changed any namespaces. */
+ if (self->ns_fds[i] >= 0)
+ ASSERT_EQ(in_same_namespace(self->ns_fds[i], pid, info->proc_name), 1);
+ }
+}
+
+TEST_F(current_nsset, incremental_setns)
+{
+ int i;
+ pid_t pid;
+
+ pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ int nsfd;
+
+ if (self->child_ns_fds_all_ns_stopped1[i] < 0)
+ continue;
+
+ if (info->clone_flag) {
+ ASSERT_EQ(setns(self->child_pidfd_all_ns_stopped1, info->clone_flag), 0) {
+ TH_LOG("%m - Failed to setns to %s namespace of %d", info->proc_name, self->child_pid_all_ns_stopped1);
+ }
+ }
+
+ /* Verify that we have changed to the correct namespaces. */
+ if (info->clone_flag == CLONE_NEWPID)
+ nsfd = self->ns_fds[i];
+ else
+ nsfd = self->child_ns_fds_all_ns_stopped1[i];
+ ASSERT_EQ(in_same_namespace(nsfd, pid, info->proc_name), 1) {
+ TH_LOG("setns failed to place us correctly into %s namespace of %d", info->proc_name, self->child_pid_all_ns_stopped1);
+ }
+ TH_LOG("Managed to correctly setns to %s namespace of %d", info->proc_name, self->child_pid_all_ns_stopped1);
+ }
+}
+
+TEST_F(current_nsset, one_shot_setns)
+{
+ unsigned flags = 0;
+ int i;
+ pid_t pid;
+
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+
+ if (self->child_ns_fds_all_ns_stopped1[i] < 0)
+ continue;
+
+ flags |= info->clone_flag;
+ TH_LOG("Adding %s namespace of %d to list of namespaces to attach to", info->proc_name, self->child_pid_all_ns_stopped1);
+ }
+
+ ASSERT_EQ(setns(self->child_pidfd_all_ns_stopped1, flags), 0) {
+ TH_LOG("%m - Failed to setns to namespaces of %d", self->child_pid_all_ns_stopped1);
+ }
+
+ pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ int nsfd;
+
+ if (self->child_ns_fds_all_ns_stopped1[i] < 0)
+ continue;
+
+ /* Verify that we have changed to the correct namespaces. */
+ if (info->clone_flag == CLONE_NEWPID)
+ nsfd = self->ns_fds[i];
+ else
+ nsfd = self->child_ns_fds_all_ns_stopped1[i];
+ ASSERT_EQ(in_same_namespace(nsfd, pid, info->proc_name), 1) {
+ TH_LOG("setns failed to place us correctly into %s namespace of %d", info->proc_name, self->child_pid_all_ns_stopped1);
+ }
+ TH_LOG("Managed to correctly setns to %s namespace of %d", info->proc_name, self->child_pid_all_ns_stopped1);
+ }
+}
+
+TEST_F(current_nsset, no_foul_play)
+{
+ unsigned flags = 0;
+ int i;
+
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+
+ if (self->child_ns_fds_all_ns_stopped1[i] < 0)
+ continue;
+
+ flags |= info->clone_flag;
+ if (info->clone_flag) /* No use logging pid_for_children. */
+ TH_LOG("Adding %s namespace of %d to list of namespaces to attach to", info->proc_name, self->child_pid_all_ns_stopped1);
+ }
+
+ ASSERT_EQ(setns(self->child_pidfd_all_ns_stopped1, flags), 0) {
+ TH_LOG("%m - Failed to setns to namespaces of %d", self->child_pid_all_ns_stopped1);
+ }
+
+
+ /*
+ * Can't setns to a user namespace outside of our hierarchy since we
+ * don't have caps in there and didn't create it. That means that under
+ * no circumstances should we be able to setns to any of the other
+ * ones since they aren't owned by our user namespace.
+ */
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+
+ if (self->child_ns_fds_all_ns_stopped1[i] < 0)
+ continue;
+
+ if (!info->clone_flag)
+ continue;
+
+ ASSERT_NE(setns(self->child_pidfd_all_ns_stopped2, info->clone_flag), 0) {
+ TH_LOG("Managed to setns to %s namespace of %d", info->proc_name, self->child_pid_all_ns_stopped2);
+ }
+ TH_LOG("%m - Correctly failed to setns to %s namespace of %d", info->proc_name, self->child_pid_all_ns_stopped2);
+ }
+}
+
+TEST_HARNESS_MAIN
--
2.26.2
Christian Brauner <[email protected]> writes:
> Add a simple capability helper which makes it possible to determine
> whether a set of creds is ns capable wrt to the passed in credentials.
> This is not something exciting it's just a more pleasant wrapper around
> security_capable() by allowing ns_capable_common() to ake a const struct
> cred argument. In ptrace_has_cap() for example, we're using
> security_capable() directly. ns_capable_cred() will be used in the next
> patch to check against the target credentials the caller is going to
> switch to.
Given that this is to suppot setns. I don't understand the
justification for this.
Is it your intention to use the reduced permissions that you get
when you install a user namespace?
Why do you want to use the reduced permissions when installing multiple
namespaces at once?
Eric
> Cc: Eric W. Biederman <[email protected]>
> Cc: Serge Hallyn <[email protected]>
> Signed-off-by: Christian Brauner <[email protected]>
> ---
> /* v2 */
> patch introduced
> ---
> include/linux/capability.h | 3 +++
> kernel/capability.c | 17 +++++++++++------
> 2 files changed, 14 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index ecce0f43c73a..743a08d936fb 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -40,6 +40,7 @@ struct cpu_vfs_cap_data {
> struct file;
> struct inode;
> struct dentry;
> +struct cred;
> struct task_struct;
> struct user_namespace;
>
> @@ -209,6 +210,8 @@ extern bool has_ns_capability_noaudit(struct task_struct *t,
> struct user_namespace *ns, int cap);
> extern bool capable(int cap);
> extern bool ns_capable(struct user_namespace *ns, int cap);
> +extern bool ns_capable_cred(const struct cred *cred,
> + struct user_namespace *ns, int cap);
> extern bool ns_capable_noaudit(struct user_namespace *ns, int cap);
> extern bool ns_capable_setid(struct user_namespace *ns, int cap);
> #else
> diff --git a/kernel/capability.c b/kernel/capability.c
> index 1444f3954d75..84425781917e 100644
> --- a/kernel/capability.c
> +++ b/kernel/capability.c
> @@ -361,8 +361,8 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
> return has_ns_capability_noaudit(t, &init_user_ns, cap);
> }
>
> -static bool ns_capable_common(struct user_namespace *ns,
> - int cap,
> +static bool ns_capable_common(const struct cred *cred,
> + struct user_namespace *ns, int cap,
> unsigned int opts)
> {
> int capable;
> @@ -372,7 +372,7 @@ static bool ns_capable_common(struct user_namespace *ns,
> BUG();
> }
>
> - capable = security_capable(current_cred(), ns, cap, opts);
> + capable = security_capable(cred, ns, cap, opts);
> if (capable == 0) {
> current->flags |= PF_SUPERPRIV;
> return true;
> @@ -393,10 +393,15 @@ static bool ns_capable_common(struct user_namespace *ns,
> */
> bool ns_capable(struct user_namespace *ns, int cap)
> {
> - return ns_capable_common(ns, cap, CAP_OPT_NONE);
> + return ns_capable_common(current_cred(), ns, cap, CAP_OPT_NONE);
> }
> EXPORT_SYMBOL(ns_capable);
>
> +bool ns_capable_cred(const struct cred *cred, struct user_namespace *ns, int cap)
> +{
> + return ns_capable_common(cred, ns, cap, CAP_OPT_NONE);
> +}
> +
> /**
> * ns_capable_noaudit - Determine if the current task has a superior capability
> * (unaudited) in effect
> @@ -411,7 +416,7 @@ EXPORT_SYMBOL(ns_capable);
> */
> bool ns_capable_noaudit(struct user_namespace *ns, int cap)
> {
> - return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT);
> + return ns_capable_common(current_cred(), ns, cap, CAP_OPT_NOAUDIT);
> }
> EXPORT_SYMBOL(ns_capable_noaudit);
>
> @@ -430,7 +435,7 @@ EXPORT_SYMBOL(ns_capable_noaudit);
> */
> bool ns_capable_setid(struct user_namespace *ns, int cap)
> {
> - return ns_capable_common(ns, cap, CAP_OPT_INSETID);
> + return ns_capable_common(current_cred(), ns, cap, CAP_OPT_INSETID);
> }
> EXPORT_SYMBOL(ns_capable_setid);
>
>
> base-commit: ae83d0b416db002fe95601e7f97f64b59514d936
On Thu, Apr 30, 2020 at 01:09:30PM -0500, Eric W. Biederman wrote:
> Christian Brauner <[email protected]> writes:
>
> > Add a simple capability helper which makes it possible to determine
> > whether a set of creds is ns capable wrt to the passed in credentials.
> > This is not something exciting it's just a more pleasant wrapper around
> > security_capable() by allowing ns_capable_common() to ake a const struct
> > cred argument. In ptrace_has_cap() for example, we're using
> > security_capable() directly. ns_capable_cred() will be used in the next
> > patch to check against the target credentials the caller is going to
> > switch to.
>
> Given that this is to suppot setns. I don't understand the
> justification for this.
>
> Is it your intention to use the reduced permissions that you get
> when you install a user namespace?
Indeed.
>
> Why do you want to use the reduced permissions when installing multiple
> namespaces at once?
The intention is to use the target credentials we are going to install
when setns() hits the point of no return. The target permissions are
either the permissions of the caller or the reduced permissions if
CLONE_NEWUSER has been requested. This has multiple reasons.
The most obvious reason imho is that all other namespaces have an owning
user namespace. Attaching to any other namespace requires the attacher
to be privileged over the owning user namespace of that namespace.
Consequently, all our current install handlers for every namespace we
have check whether we are privileged in the owning user namespace of
that user namespace. So in order to attach to any of those namespaces -
especially when attaching as an unprivileged user - requires that we are
attached to the user namespace first. (That's especially useful given
that most users especially container runtimes will unshare all
namespaces. Doing it this way we can not just have attach privileged
users attach to their containers but also unprivileged users to their
containers in one shot.)
A few other points about this. If one looks at clone(CLONE_NEW*) or
unshare(CLONE_NEW*) then the ordering and permissions checking is the
same way. All permissions checks are performed against the reduced
permissions, i.e. if CLONE_NEWUSER is specified you check privilege
against the reduced permissions too otherwise you wouldn't be able to
spawn into a complete set of new namespaces as an unprivileged user.
This logic is also expressed in how setns() is already used in
userspace. Any container runtime will attach to the user namespace first,
so all subsequent calls to attach to other namespaces perform the checks
against the reduced permissions. It also has to be that way because of
fully unprivileged containers.
To put it another way, if we were to always perform the permission checks
against the current permissions (i.e. no matter if CLONE_NEWUSER is
specified or not) then we'd never be able to attach to a set of
namespaces at once as an unprivileged user.
We also really want to be able to express both semantics:
1. setns(flags & ~CLONE_NEWUSER) --> attach to all namespaces with my
current permission level
2. setns(flags | CLONE_NEWUSER) attach to all namespaces with the target
permission level
It feels weird if both 1 and 2 would mean the exact same thing given
that the user namespace has an owernship relation with all the other
namespaces.
Christian
Christian Brauner <[email protected]> writes:
> On Thu, Apr 30, 2020 at 01:09:30PM -0500, Eric W. Biederman wrote:
>> Christian Brauner <[email protected]> writes:
>>
>> > Add a simple capability helper which makes it possible to determine
>> > whether a set of creds is ns capable wrt to the passed in credentials.
>> > This is not something exciting it's just a more pleasant wrapper around
>> > security_capable() by allowing ns_capable_common() to ake a const struct
>> > cred argument. In ptrace_has_cap() for example, we're using
>> > security_capable() directly. ns_capable_cred() will be used in the next
>> > patch to check against the target credentials the caller is going to
>> > switch to.
>>
>> Given that this is to suppot setns. I don't understand the
>> justification for this.
>>
>> Is it your intention to use the reduced permissions that you get
>> when you install a user namespace?
>
> Indeed.
>
>>
>> Why do you want to use the reduced permissions when installing multiple
>> namespaces at once?
>
> The intention is to use the target credentials we are going to install
> when setns() hits the point of no return. The target permissions are
> either the permissions of the caller or the reduced permissions if
> CLONE_NEWUSER has been requested. This has multiple reasons.
>
> The most obvious reason imho is that all other namespaces have an owning
> user namespace. Attaching to any other namespace requires the attacher
> to be privileged over the owning user namespace of that namespace.
> Consequently, all our current install handlers for every namespace we
> have check whether we are privileged in the owning user namespace of
> that user namespace. So in order to attach to any of those namespaces -
> especially when attaching as an unprivileged user - requires that we are
> attached to the user namespace first.
No actually it doesn't. As if you have privileges to attach to the user
namespace you have the privileges to attach to anything it owns. Or you
should I think I am missing some subtle detail at the moment.
> (That's especially useful given
> that most users especially container runtimes will unshare all
> namespaces. Doing it this way we can not just have attach privileged
> users attach to their containers but also unprivileged users to their
> containers in one shot.)
That is a wonderful reason for doing things, and it is the reason
why I am asking about it because I think you have it backwards.
Especially in the context of some container runtimes like Kubernetes
that I have been told will do things like share a network namespace
across all containers in a POD.
> A few other points about this. If one looks at clone(CLONE_NEW*) or
> unshare(CLONE_NEW*) then the ordering and permissions checking is the
> same way. All permissions checks are performed against the reduced
> permissions, i.e. if CLONE_NEWUSER is specified you check privilege
> against the reduced permissions too otherwise you wouldn't be able to
> spawn into a complete set of new namespaces as an unprivileged user.
That is a good catch and definitely a reason for looking at doing
things in this order.
For unshare and clone putting things in a user namespace means you can
create namespaces you could not create otherwise.
> This logic is also expressed in how setns() is already used in
> userspace. Any container runtime will attach to the user namespace first,
> so all subsequent calls to attach to other namespaces perform the checks
> against the reduced permissions. It also has to be that way because of
> fully unprivileged containers.
So I sat and looked. For nsetner it winds up trying to enter
the namespaces in either order.
/*
* Now that we know which namespaces we want to enter, enter
* them. Do this in two passes, not entering the user
* namespace on the first pass. So if we're deprivileging the
* container we'll enter the user namespace last and if we're
* privileging it then we enter the user namespace first
* (because the initial setns will fail).
*/
for (pass = 0; pass < 2; pass ++) {
for (nsfile = namespace_files + 1 - pass; nsfile->nstype; nsfile++) {
if (nsfile->fd < 0)
continue;
if (nsfile->nstype == CLONE_NEWPID && do_fork == -1)
do_fork = 1;
if (setns(nsfile->fd, nsfile->nstype)) {
if (pass != 0)
err(EXIT_FAILURE,
_("reassociate to namespace '%s' failed"),
nsfile->name);
else
continue;
}
close(nsfile->fd);
nsfile->fd = -1;
}
}
Looking a little close we have at least for entering the mntns the
following checks:
if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
Which require CAP_SYS_CHROOT and CAP_SYS_ADMIN in the current user namespace.
So there is defintiely an issue there.
I suspect the clean approach is to simply require CAP_SYS_CHROOT in
the new user namespace, when we are setting several of them at once.
> To put it another way, if we were to always perform the permission checks
> against the current permissions (i.e. no matter if CLONE_NEWUSER is
> specified or not) then we'd never be able to attach to a set of
> namespaces at once as an unprivileged user.
> We also really want to be able to express both semantics:
> 1. setns(flags & ~CLONE_NEWUSER) --> attach to all namespaces with my
> current permission level
> 2. setns(flags | CLONE_NEWUSER) attach to all namespaces with the target
> permission level
> It feels weird if both 1 and 2 would mean the exact same thing given
> that the user namespace has an owernship relation with all the other
> namespaces.
It feels weird to me to disallow anything that we have permissions for.
Can you dig up the actual install permissions checks and see if there is
anything other than the mount namespace that needs permissions in the
current user namespace?
Please let's walk this through. I think there should be a way to
carefully phrase the permission checks that we don't need
ns_capable_cred that will allow goofy cases like setns into Kuberneties
PODs that share network namespaces.
I believe that will be a way to phrase the permission checks so that
with or without CLONE_NEWUSER they make sense, and give very similar
results.
Certainly attaching to a mount namespace is going to need either being
root or attaching to a user namespace at the same time. Because
attaching to a mount namespace needs functionality that the user
namespace provides.
Eric
On Sat, May 02, 2020 at 07:35:53AM -0500, Eric W. Biederman wrote:
> Christian Brauner <[email protected]> writes:
>
> > On Thu, Apr 30, 2020 at 01:09:30PM -0500, Eric W. Biederman wrote:
> >> Christian Brauner <[email protected]> writes:
> >>
> >> > Add a simple capability helper which makes it possible to determine
> >> > whether a set of creds is ns capable wrt to the passed in credentials.
> >> > This is not something exciting it's just a more pleasant wrapper around
> >> > security_capable() by allowing ns_capable_common() to ake a const struct
> >> > cred argument. In ptrace_has_cap() for example, we're using
> >> > security_capable() directly. ns_capable_cred() will be used in the next
> >> > patch to check against the target credentials the caller is going to
> >> > switch to.
> >>
> >> Given that this is to suppot setns. I don't understand the
> >> justification for this.
> >>
> >> Is it your intention to use the reduced permissions that you get
> >> when you install a user namespace?
> >
> > Indeed.
> >
> >>
> >> Why do you want to use the reduced permissions when installing multiple
> >> namespaces at once?
> >
> > The intention is to use the target credentials we are going to install
> > when setns() hits the point of no return. The target permissions are
> > either the permissions of the caller or the reduced permissions if
> > CLONE_NEWUSER has been requested. This has multiple reasons.
> >
> > The most obvious reason imho is that all other namespaces have an owning
> > user namespace. Attaching to any other namespace requires the attacher
> > to be privileged over the owning user namespace of that namespace.
> > Consequently, all our current install handlers for every namespace we
> > have check whether we are privileged in the owning user namespace of
> > that user namespace. So in order to attach to any of those namespaces -
> > especially when attaching as an unprivileged user - requires that we are
> > attached to the user namespace first.
>
> No actually it doesn't. As if you have privileges to attach to the user
> namespace you have the privileges to attach to anything it owns. Or you
> should I think I am missing some subtle detail at the moment.
Sorry, I phrased that very misleadingly. What I'm talking about should
be obvious in the second patch:
-static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
+static int utsns_install(struct nsset *nsset, struct ns_common *new)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct uts_namespace *ns = to_uts_ns(new);
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ if (!ns_capable_cred(nsset->cred, ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable_cred(nsset->cred, nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
All our current install handlers check that we're privileged wrt to our
_current_ user namespace, i.e. they all have a
ns_capable(current_user_ns(), CAP_SYS_ADMIN)
line. So if we specify
setns(CLONE_NEWUSER | CLONE_NEW*)
as an unprivileged user we aren't able to attach to any other namespace
unless we check against the target credentials.
>
>
> > (That's especially useful given
> > that most users especially container runtimes will unshare all
> > namespaces. Doing it this way we can not just have attach privileged
> > users attach to their containers but also unprivileged users to their
> > containers in one shot.)
>
> That is a wonderful reason for doing things, and it is the reason
> why I am asking about it because I think you have it backwards.
>
> Especially in the context of some container runtimes like Kubernetes
> that I have been told will do things like share a network namespace
> across all containers in a POD.
Kubernetes doesn't use user namespace at all so they need to run as root
anyway so that's not a problem. And if we're talking about scenarios
where some namespaces are shared and other's aren't then there'll be
batch-attaching any, i.e. not all at once but some at once. So I don't
think this is a good argument.
>
> > A few other points about this. If one looks at clone(CLONE_NEW*) or
> > unshare(CLONE_NEW*) then the ordering and permissions checking is the
> > same way. All permissions checks are performed against the reduced
> > permissions, i.e. if CLONE_NEWUSER is specified you check privilege
> > against the reduced permissions too otherwise you wouldn't be able to
> > spawn into a complete set of new namespaces as an unprivileged user.
>
> That is a good catch and definitely a reason for looking at doing
> things in this order.
>
> For unshare and clone putting things in a user namespace means you can
> create namespaces you could not create otherwise.
>
>
> > This logic is also expressed in how setns() is already used in
> > userspace. Any container runtime will attach to the user namespace first,
> > so all subsequent calls to attach to other namespaces perform the checks
> > against the reduced permissions. It also has to be that way because of
> > fully unprivileged containers.
>
> So I sat and looked. For nsetner it winds up trying to enter
> the namespaces in either order.
>
> /*
> * Now that we know which namespaces we want to enter, enter
> * them. Do this in two passes, not entering the user
> * namespace on the first pass. So if we're deprivileging the
> * container we'll enter the user namespace last and if we're
> * privileging it then we enter the user namespace first
> * (because the initial setns will fail).
> */
> for (pass = 0; pass < 2; pass ++) {
> for (nsfile = namespace_files + 1 - pass; nsfile->nstype; nsfile++) {
> if (nsfile->fd < 0)
> continue;
> if (nsfile->nstype == CLONE_NEWPID && do_fork == -1)
> do_fork = 1;
> if (setns(nsfile->fd, nsfile->nstype)) {
> if (pass != 0)
> err(EXIT_FAILURE,
> _("reassociate to namespace '%s' failed"),
> nsfile->name);
> else
> continue;
> }
>
> close(nsfile->fd);
> nsfile->fd = -1;
> }
> }
>
>
> Looking a little close we have at least for entering the mntns the
> following checks:
>
> if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
> !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
> !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> return -EPERM;
>
> Which require CAP_SYS_CHROOT and CAP_SYS_ADMIN in the current user namespace.
>
> So there is defintiely an issue there.
>
> I suspect the clean approach is to simply require CAP_SYS_CHROOT in
> the new user namespace, when we are setting several of them at once.
See my example above. All install handlers check for
ns_capable(current_user_ns(), CAP_SYS_ADMIN).
static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new)
{
struct ipc_namespace *ns = to_ipc_ns(new);
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
struct net *net = to_net_ns(ns);
if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
struct fs_struct *fs = current->fs;
struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
struct path root;
int err;
if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
!ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
{
struct uts_namespace *ns = to_uts_ns(new);
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
struct pid_namespace *active = task_active_pid_ns(current);
struct pid_namespace *ancestor, *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
>
> > To put it another way, if we were to always perform the permission checks
> > against the current permissions (i.e. no matter if CLONE_NEWUSER is
> > specified or not) then we'd never be able to attach to a set of
> > namespaces at once as an unprivileged user.
> > We also really want to be able to express both semantics:
> > 1. setns(flags & ~CLONE_NEWUSER) --> attach to all namespaces with my
> > current permission level
> > 2. setns(flags | CLONE_NEWUSER) attach to all namespaces with the target
> > permission level
> > It feels weird if both 1 and 2 would mean the exact same thing given
> > that the user namespace has an owernship relation with all the other
> > namespaces.
>
> It feels weird to me to disallow anything that we have permissions for.
>
> Can you dig up the actual install permissions checks and see if there is
> anything other than the mount namespace that needs permissions in the
> current user namespace?
Yep did, all of them check ns_capable(current_user_ns(), CAP_SYS_ADMIN),
see above please.
>
> Please let's walk this through. I think there should be a way to
> carefully phrase the permission checks that we don't need
> ns_capable_cred that will allow goofy cases like setns into Kuberneties
> PODs that share network namespaces.
Hm, Kubernetes doesn't use user namespace. I think I misunderstand your
concern maybe. But see below for a suggestion.
>
> I believe that will be a way to phrase the permission checks so that
> with or without CLONE_NEWUSER they make sense, and give very similar
> results.
>
> Certainly attaching to a mount namespace is going to need either being
> root or attaching to a user namespace at the same time. Because
> attaching to a mount namespace needs functionality that the user
> namespace provides.
So how about we add a new flag to setns()
SETNS_NEWUSER_CRED that is only valid in
conjunction with CLONE_NEWUSER and which allows callers to tell the
kernel "assume the target credentials first". This way we can support
both cases and users can specify what they want and we can clearly
document it on the manpages.
Christian
Christian Brauner <[email protected]> writes:
> On Sat, May 02, 2020 at 07:35:53AM -0500, Eric W. Biederman wrote:
>> Christian Brauner <[email protected]> writes:
>>
>> > On Thu, Apr 30, 2020 at 01:09:30PM -0500, Eric W. Biederman wrote:
>> >> Christian Brauner <[email protected]> writes:
>> >>
>> >> > Add a simple capability helper which makes it possible to determine
>> >> > whether a set of creds is ns capable wrt to the passed in credentials.
>> >> > This is not something exciting it's just a more pleasant wrapper around
>> >> > security_capable() by allowing ns_capable_common() to ake a const struct
>> >> > cred argument. In ptrace_has_cap() for example, we're using
>> >> > security_capable() directly. ns_capable_cred() will be used in the next
>> >> > patch to check against the target credentials the caller is going to
>> >> > switch to.
>> >>
>> >> Given that this is to suppot setns. I don't understand the
>> >> justification for this.
>> >>
>> >> Is it your intention to use the reduced permissions that you get
>> >> when you install a user namespace?
>> >
>> > Indeed.
>> >
>> >>
>> >> Why do you want to use the reduced permissions when installing multiple
>> >> namespaces at once?
>> >
>> > The intention is to use the target credentials we are going to install
>> > when setns() hits the point of no return. The target permissions are
>> > either the permissions of the caller or the reduced permissions if
>> > CLONE_NEWUSER has been requested. This has multiple reasons.
>> >
>> > The most obvious reason imho is that all other namespaces have an owning
>> > user namespace. Attaching to any other namespace requires the attacher
>> > to be privileged over the owning user namespace of that namespace.
>> > Consequently, all our current install handlers for every namespace we
>> > have check whether we are privileged in the owning user namespace of
>> > that user namespace. So in order to attach to any of those namespaces -
>> > especially when attaching as an unprivileged user - requires that we are
>> > attached to the user namespace first.
>>
>> No actually it doesn't. As if you have privileges to attach to the user
>> namespace you have the privileges to attach to anything it owns. Or you
>> should I think I am missing some subtle detail at the moment.
>
> Sorry, I phrased that very misleadingly. What I'm talking about should
> be obvious in the second patch:
>
> -static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
> +static int utsns_install(struct nsset *nsset, struct ns_common *new)
> {
> + struct nsproxy *nsproxy = nsset->nsproxy;
> struct uts_namespace *ns = to_uts_ns(new);
>
> - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
> - !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> + if (!ns_capable_cred(nsset->cred, ns->user_ns, CAP_SYS_ADMIN) ||
> + !ns_capable_cred(nsset->cred, nsset->cred->user_ns, CAP_SYS_ADMIN))
> return -EPERM;
>
> All our current install handlers check that we're privileged wrt to our
> _current_ user namespace, i.e. they all have a
> ns_capable(current_user_ns(), CAP_SYS_ADMIN)
> line. So if we specify
> setns(CLONE_NEWUSER | CLONE_NEW*)
> as an unprivileged user we aren't able to attach to any other namespace
> unless we check against the target credentials.
>
>>
>>
>> > (That's especially useful given
>> > that most users especially container runtimes will unshare all
>> > namespaces. Doing it this way we can not just have attach privileged
>> > users attach to their containers but also unprivileged users to their
>> > containers in one shot.)
>>
>> That is a wonderful reason for doing things, and it is the reason
>> why I am asking about it because I think you have it backwards.
>>
>> Especially in the context of some container runtimes like Kubernetes
>> that I have been told will do things like share a network namespace
>> across all containers in a POD.
>
> Kubernetes doesn't use user namespace at all so they need to run as root
> anyway so that's not a problem. And if we're talking about scenarios
> where some namespaces are shared and other's aren't then there'll be
> batch-attaching any, i.e. not all at once but some at once. So I don't
> think this is a good argument.
>
>>
>> > A few other points about this. If one looks at clone(CLONE_NEW*) or
>> > unshare(CLONE_NEW*) then the ordering and permissions checking is the
>> > same way. All permissions checks are performed against the reduced
>> > permissions, i.e. if CLONE_NEWUSER is specified you check privilege
>> > against the reduced permissions too otherwise you wouldn't be able to
>> > spawn into a complete set of new namespaces as an unprivileged user.
>>
>> That is a good catch and definitely a reason for looking at doing
>> things in this order.
>>
>> For unshare and clone putting things in a user namespace means you can
>> create namespaces you could not create otherwise.
>>
>>
>> > This logic is also expressed in how setns() is already used in
>> > userspace. Any container runtime will attach to the user namespace first,
>> > so all subsequent calls to attach to other namespaces perform the checks
>> > against the reduced permissions. It also has to be that way because of
>> > fully unprivileged containers.
>>
>> So I sat and looked. For nsetner it winds up trying to enter
>> the namespaces in either order.
>>
>> /*
>> * Now that we know which namespaces we want to enter, enter
>> * them. Do this in two passes, not entering the user
>> * namespace on the first pass. So if we're deprivileging the
>> * container we'll enter the user namespace last and if we're
>> * privileging it then we enter the user namespace first
>> * (because the initial setns will fail).
>> */
>> for (pass = 0; pass < 2; pass ++) {
>> for (nsfile = namespace_files + 1 - pass; nsfile->nstype; nsfile++) {
>> if (nsfile->fd < 0)
>> continue;
>> if (nsfile->nstype == CLONE_NEWPID && do_fork == -1)
>> do_fork = 1;
>> if (setns(nsfile->fd, nsfile->nstype)) {
>> if (pass != 0)
>> err(EXIT_FAILURE,
>> _("reassociate to namespace '%s' failed"),
>> nsfile->name);
>> else
>> continue;
>> }
>>
>> close(nsfile->fd);
>> nsfile->fd = -1;
>> }
>> }
>>
>>
>> Looking a little close we have at least for entering the mntns the
>> following checks:
>>
>> if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
>> !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
>> !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
>> return -EPERM;
>>
>> Which require CAP_SYS_CHROOT and CAP_SYS_ADMIN in the current user namespace.
>>
>> So there is defintiely an issue there.
>>
>> I suspect the clean approach is to simply require CAP_SYS_CHROOT in
>> the new user namespace, when we are setting several of them at once.
>
> See my example above. All install handlers check for
> ns_capable(current_user_ns(), CAP_SYS_ADMIN).
>
>
> static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new)
> {
> struct ipc_namespace *ns = to_ipc_ns(new);
> if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
> !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> return -EPERM;
>
>
> static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
> {
> struct net *net = to_net_ns(ns);
>
> if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
> !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> return -EPERM;
>
> static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
> {
> struct fs_struct *fs = current->fs;
> struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
> struct path root;
> int err;
>
> if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
> !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
> !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> return -EPERM;
>
> static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
> {
> struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
>
> if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
> !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
> return -EPERM;
>
> static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
> {
> struct uts_namespace *ns = to_uts_ns(new);
>
> if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
> !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> return -EPERM;
>
> static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
> {
> struct pid_namespace *active = task_active_pid_ns(current);
> struct pid_namespace *ancestor, *new = to_pid_ns(ns);
>
> if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
> !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> return -EPERM;
>
>>
>> > To put it another way, if we were to always perform the permission checks
>> > against the current permissions (i.e. no matter if CLONE_NEWUSER is
>> > specified or not) then we'd never be able to attach to a set of
>> > namespaces at once as an unprivileged user.
>> > We also really want to be able to express both semantics:
>> > 1. setns(flags & ~CLONE_NEWUSER) --> attach to all namespaces with my
>> > current permission level
>> > 2. setns(flags | CLONE_NEWUSER) attach to all namespaces with the target
>> > permission level
>> > It feels weird if both 1 and 2 would mean the exact same thing given
>> > that the user namespace has an owernship relation with all the other
>> > namespaces.
>>
>> It feels weird to me to disallow anything that we have permissions for.
>>
>> Can you dig up the actual install permissions checks and see if there is
>> anything other than the mount namespace that needs permissions in the
>> current user namespace?
>
> Yep did, all of them check ns_capable(current_user_ns(), CAP_SYS_ADMIN),
> see above please.
>
>>
>> Please let's walk this through. I think there should be a way to
>> carefully phrase the permission checks that we don't need
>> ns_capable_cred that will allow goofy cases like setns into Kuberneties
>> PODs that share network namespaces.
>
> Hm, Kubernetes doesn't use user namespace. I think I misunderstand your
> concern maybe. But see below for a suggestion.
>
>>
>> I believe that will be a way to phrase the permission checks so that
>> with or without CLONE_NEWUSER they make sense, and give very similar
>> results.
>>
>> Certainly attaching to a mount namespace is going to need either being
>> root or attaching to a user namespace at the same time. Because
>> attaching to a mount namespace needs functionality that the user
>> namespace provides.
>
> So how about we add a new flag to setns()
> SETNS_NEWUSER_CRED that is only valid in
> conjunction with CLONE_NEWUSER and which allows callers to tell the
> kernel "assume the target credentials first". This way we can support
> both cases and users can specify what they want and we can clearly
> document it on the manpages.
Let's not get complicated. Let's make this very simple.
Change "ns_capable(current_user_user_ns(), CAP_SYS_ADMIN)"
to "ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)".
We still use the current credentials, but we require the caller of setns
to have CAP_SYS_ADMIN in the user namespace you are going to wind up in.
That is exactly what userns_install does today.
That won't be any kind of security hole because we still require
"ns_capable(ns->user_ns, CAP_SYS_ADMIN)"
on all of the namespaces as well.
That way if you are sufficiently privileged you can still handle joining
namespaces that are not owned by the owner of the processes user
namespace. So we can join weird processes.
Does that make sense?
Eric
On Sat, May 02, 2020 at 09:52:03AM -0500, Eric W. Biederman wrote:
> Christian Brauner <[email protected]> writes:
>
> > On Sat, May 02, 2020 at 07:35:53AM -0500, Eric W. Biederman wrote:
> >> Christian Brauner <[email protected]> writes:
> >>
> >> > On Thu, Apr 30, 2020 at 01:09:30PM -0500, Eric W. Biederman wrote:
> >> >> Christian Brauner <[email protected]> writes:
> >> >>
> >> >> > Add a simple capability helper which makes it possible to determine
> >> >> > whether a set of creds is ns capable wrt to the passed in credentials.
> >> >> > This is not something exciting it's just a more pleasant wrapper around
> >> >> > security_capable() by allowing ns_capable_common() to ake a const struct
> >> >> > cred argument. In ptrace_has_cap() for example, we're using
> >> >> > security_capable() directly. ns_capable_cred() will be used in the next
> >> >> > patch to check against the target credentials the caller is going to
> >> >> > switch to.
> >> >>
> >> >> Given that this is to suppot setns. I don't understand the
> >> >> justification for this.
> >> >>
> >> >> Is it your intention to use the reduced permissions that you get
> >> >> when you install a user namespace?
> >> >
> >> > Indeed.
> >> >
> >> >>
> >> >> Why do you want to use the reduced permissions when installing multiple
> >> >> namespaces at once?
> >> >
> >> > The intention is to use the target credentials we are going to install
> >> > when setns() hits the point of no return. The target permissions are
> >> > either the permissions of the caller or the reduced permissions if
> >> > CLONE_NEWUSER has been requested. This has multiple reasons.
> >> >
> >> > The most obvious reason imho is that all other namespaces have an owning
> >> > user namespace. Attaching to any other namespace requires the attacher
> >> > to be privileged over the owning user namespace of that namespace.
> >> > Consequently, all our current install handlers for every namespace we
> >> > have check whether we are privileged in the owning user namespace of
> >> > that user namespace. So in order to attach to any of those namespaces -
> >> > especially when attaching as an unprivileged user - requires that we are
> >> > attached to the user namespace first.
> >>
> >> No actually it doesn't. As if you have privileges to attach to the user
> >> namespace you have the privileges to attach to anything it owns. Or you
> >> should I think I am missing some subtle detail at the moment.
> >
> > Sorry, I phrased that very misleadingly. What I'm talking about should
> > be obvious in the second patch:
> >
> > -static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
> > +static int utsns_install(struct nsset *nsset, struct ns_common *new)
> > {
> > + struct nsproxy *nsproxy = nsset->nsproxy;
> > struct uts_namespace *ns = to_uts_ns(new);
> >
> > - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
> > - !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> > + if (!ns_capable_cred(nsset->cred, ns->user_ns, CAP_SYS_ADMIN) ||
> > + !ns_capable_cred(nsset->cred, nsset->cred->user_ns, CAP_SYS_ADMIN))
> > return -EPERM;
> >
> > All our current install handlers check that we're privileged wrt to our
> > _current_ user namespace, i.e. they all have a
> > ns_capable(current_user_ns(), CAP_SYS_ADMIN)
> > line. So if we specify
> > setns(CLONE_NEWUSER | CLONE_NEW*)
> > as an unprivileged user we aren't able to attach to any other namespace
> > unless we check against the target credentials.
> >
> >>
> >>
> >> > (That's especially useful given
> >> > that most users especially container runtimes will unshare all
> >> > namespaces. Doing it this way we can not just have attach privileged
> >> > users attach to their containers but also unprivileged users to their
> >> > containers in one shot.)
> >>
> >> That is a wonderful reason for doing things, and it is the reason
> >> why I am asking about it because I think you have it backwards.
> >>
> >> Especially in the context of some container runtimes like Kubernetes
> >> that I have been told will do things like share a network namespace
> >> across all containers in a POD.
> >
> > Kubernetes doesn't use user namespace at all so they need to run as root
> > anyway so that's not a problem. And if we're talking about scenarios
> > where some namespaces are shared and other's aren't then there'll be
> > batch-attaching any, i.e. not all at once but some at once. So I don't
> > think this is a good argument.
> >
> >>
> >> > A few other points about this. If one looks at clone(CLONE_NEW*) or
> >> > unshare(CLONE_NEW*) then the ordering and permissions checking is the
> >> > same way. All permissions checks are performed against the reduced
> >> > permissions, i.e. if CLONE_NEWUSER is specified you check privilege
> >> > against the reduced permissions too otherwise you wouldn't be able to
> >> > spawn into a complete set of new namespaces as an unprivileged user.
> >>
> >> That is a good catch and definitely a reason for looking at doing
> >> things in this order.
> >>
> >> For unshare and clone putting things in a user namespace means you can
> >> create namespaces you could not create otherwise.
> >>
> >>
> >> > This logic is also expressed in how setns() is already used in
> >> > userspace. Any container runtime will attach to the user namespace first,
> >> > so all subsequent calls to attach to other namespaces perform the checks
> >> > against the reduced permissions. It also has to be that way because of
> >> > fully unprivileged containers.
> >>
> >> So I sat and looked. For nsetner it winds up trying to enter
> >> the namespaces in either order.
> >>
> >> /*
> >> * Now that we know which namespaces we want to enter, enter
> >> * them. Do this in two passes, not entering the user
> >> * namespace on the first pass. So if we're deprivileging the
> >> * container we'll enter the user namespace last and if we're
> >> * privileging it then we enter the user namespace first
> >> * (because the initial setns will fail).
> >> */
> >> for (pass = 0; pass < 2; pass ++) {
> >> for (nsfile = namespace_files + 1 - pass; nsfile->nstype; nsfile++) {
> >> if (nsfile->fd < 0)
> >> continue;
> >> if (nsfile->nstype == CLONE_NEWPID && do_fork == -1)
> >> do_fork = 1;
> >> if (setns(nsfile->fd, nsfile->nstype)) {
> >> if (pass != 0)
> >> err(EXIT_FAILURE,
> >> _("reassociate to namespace '%s' failed"),
> >> nsfile->name);
> >> else
> >> continue;
> >> }
> >>
> >> close(nsfile->fd);
> >> nsfile->fd = -1;
> >> }
> >> }
> >>
> >>
> >> Looking a little close we have at least for entering the mntns the
> >> following checks:
> >>
> >> if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
> >> !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
> >> !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> >> return -EPERM;
> >>
> >> Which require CAP_SYS_CHROOT and CAP_SYS_ADMIN in the current user namespace.
> >>
> >> So there is defintiely an issue there.
> >>
> >> I suspect the clean approach is to simply require CAP_SYS_CHROOT in
> >> the new user namespace, when we are setting several of them at once.
> >
> > See my example above. All install handlers check for
> > ns_capable(current_user_ns(), CAP_SYS_ADMIN).
> >
> >
> > static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new)
> > {
> > struct ipc_namespace *ns = to_ipc_ns(new);
> > if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
> > !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> > return -EPERM;
> >
> >
> > static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
> > {
> > struct net *net = to_net_ns(ns);
> >
> > if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
> > !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> > return -EPERM;
> >
> > static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
> > {
> > struct fs_struct *fs = current->fs;
> > struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
> > struct path root;
> > int err;
> >
> > if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
> > !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
> > !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> > return -EPERM;
> >
> > static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
> > {
> > struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
> >
> > if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
> > !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
> > return -EPERM;
> >
> > static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
> > {
> > struct uts_namespace *ns = to_uts_ns(new);
> >
> > if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
> > !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> > return -EPERM;
> >
> > static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
> > {
> > struct pid_namespace *active = task_active_pid_ns(current);
> > struct pid_namespace *ancestor, *new = to_pid_ns(ns);
> >
> > if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
> > !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> > return -EPERM;
> >
> >>
> >> > To put it another way, if we were to always perform the permission checks
> >> > against the current permissions (i.e. no matter if CLONE_NEWUSER is
> >> > specified or not) then we'd never be able to attach to a set of
> >> > namespaces at once as an unprivileged user.
> >> > We also really want to be able to express both semantics:
> >> > 1. setns(flags & ~CLONE_NEWUSER) --> attach to all namespaces with my
> >> > current permission level
> >> > 2. setns(flags | CLONE_NEWUSER) attach to all namespaces with the target
> >> > permission level
> >> > It feels weird if both 1 and 2 would mean the exact same thing given
> >> > that the user namespace has an owernship relation with all the other
> >> > namespaces.
> >>
> >> It feels weird to me to disallow anything that we have permissions for.
> >>
> >> Can you dig up the actual install permissions checks and see if there is
> >> anything other than the mount namespace that needs permissions in the
> >> current user namespace?
> >
> > Yep did, all of them check ns_capable(current_user_ns(), CAP_SYS_ADMIN),
> > see above please.
> >
> >>
> >> Please let's walk this through. I think there should be a way to
> >> carefully phrase the permission checks that we don't need
> >> ns_capable_cred that will allow goofy cases like setns into Kuberneties
> >> PODs that share network namespaces.
> >
> > Hm, Kubernetes doesn't use user namespace. I think I misunderstand your
> > concern maybe. But see below for a suggestion.
> >
> >>
> >> I believe that will be a way to phrase the permission checks so that
> >> with or without CLONE_NEWUSER they make sense, and give very similar
> >> results.
> >>
> >> Certainly attaching to a mount namespace is going to need either being
> >> root or attaching to a user namespace at the same time. Because
> >> attaching to a mount namespace needs functionality that the user
> >> namespace provides.
> >
> > So how about we add a new flag to setns()
> > SETNS_NEWUSER_CRED that is only valid in
> > conjunction with CLONE_NEWUSER and which allows callers to tell the
> > kernel "assume the target credentials first". This way we can support
> > both cases and users can specify what they want and we can clearly
> > document it on the manpages.
>
> Let's not get complicated. Let's make this very simple.
I'd call it "flexible". :)
>
> Change "ns_capable(current_user_user_ns(), CAP_SYS_ADMIN)"
> to "ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)".
>
> We still use the current credentials, but we require the caller of setns
> to have CAP_SYS_ADMIN in the user namespace you are going to wind up in.
> That is exactly what userns_install does today.
>
> That won't be any kind of security hole because we still require
> "ns_capable(ns->user_ns, CAP_SYS_ADMIN)"
> on all of the namespaces as well.
>
> That way if you are sufficiently privileged you can still handle joining
> namespaces that are not owned by the owner of the processes user
> namespace. So we can join weird processes.
>
> Does that make sense?
Well, it's funny that you say that since I had a version of that
patchset and I still have it at setns_pidfd_v2_wip_v5. I justed pushed
it to
https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/log/?h=setns_pidfd_v2_wip_v5
The reason I refrained from doing this was fear of introducing a
security hole but since we agree that this would be fine let's give it a
go.
In any case, I think I never ran the test-suite I added on that version.
Let me plug this in and test this.
Christian
Hello Christian,
Looking at this patch, and commit 303cc571d107b that landed in
5.8-rc1, time namespaces were omitted. I assume this was an accident,
since the commit message makes no statement about excluding time
namespaces, and even mentions their existence as part of the
justification for the patch ("With time namespaces we're looking at
about 16 syscalls.").
Is a fix needed here?
Thanks,
Michael
On Thu, 30 Apr 2020 at 18:57, Christian Brauner
<[email protected]> wrote:
>
> For quite a while we have been thinking about using pidfds to attach to
> namespaces. This patchset has existed for about a year already but we've
> wanted to wait to see how the general api would be received and adopted.
> Now that more and more programs in userspace have started using pidfds
> for process management it's time to send this one out.
>
> This patch makes it possible to use pidfds to attach to the namespaces
> of another process, i.e. they can be passed as the first argument to the
> setns() syscall. When only a single namespace type is specified the
> semantics are equivalent to passing an nsfd. That means
> setns(nsfd, CLONE_NEWNET) equals setns(pidfd, CLONE_NEWNET). However,
> when a pidfd is passed, multiple namespace flags can be specified in the
> second setns() argument and setns() will attach the caller to all the
> specified namespaces all at once or to none of them. Specifying 0 is not
> valid together with a pidfd.
>
> The obvious example where this is useful is a standard container
> manager interacting with a running container: pushing and pulling files
> or directories, injecting mounts, attaching/execing any kind of process,
> managing network devices all these operations require attaching to all
> or at least multiple namespaces at the same time. Given that nowadays
> most containers are spawned with all namespaces enabled we're currently
> looking at at least 14 syscalls, 7 to open the /proc/<pid>/ns/<ns>
> nsfds, another 7 to actually perform the namespace switch. With time
> namespaces we're looking at about 16 syscalls.
> (We could amortize the first 7 or 8 syscalls for opening the nsfds by
> stashing them in each container's monitor process but that would mean
> we need to send around those file descriptors through unix sockets
> everytime we want to interact with the container or keep on-disk
> state. Even in scenarios where a caller wants to join a particular
> namespace in a particular order callers still profit from batching
> other namespaces. That mostly applies to the user namespace but
> all container runtimes I found join the user namespace first no matter
> if it privileges or deprivileges the container similar to how unshare
> behaves.)
> With pidfds this becomes a single syscall no matter how many namespaces
> are supposed to be attached to.
>
> A decently designed, large-scale container manager usually isn't the
> parent of any of the containers it spawns so the containers don't die
> when it crashes or needs to update or reinitialize. This means that
> for the manager to interact with containers through pids is inherently
> racy especially on systems where the maximum pid number is not
> significicantly bumped. This is even more problematic since we often spawn
> and manage thousands or ten-thousands of containers. Interacting with a
> container through a pid thus can become risky quite quickly. Especially
> since we allow for an administrator to enable advanced features such as
> syscall interception where we're performing syscalls in lieu of the
> container. In all of those cases we use pidfds if they are available and
> we pass them around as stable references. Using them to setns() to the
> target process' namespaces is as reliable as using nsfds. Either the
> target process is already dead and we get ESRCH or we manage to attach
> to its namespaces but we can't accidently attach to another process'
> namespaces. So pidfds lend themselves to be used with this api.
> The other main advantage is that with this change the pidfd becomes the
> only relevant token for most container interactions and it's the only
> token we need to create and send around.
>
> Apart from significiantly reducing the number of syscalls from double
> digit to single digit which is a decent reason post-spectre/meltdown
> this also allows to switch to a set of namespaces atomically, i.e.
> either attaching to all the specified namespaces succeeds or we fail. If
> we fail we haven't changed a single namespace. There are currently three
> namespaces that can fail (other than for ENOMEM which really is not
> very interesting since we then have other problems anyway) for
> non-trivial reasons, user, mount, and pid namespaces. We can fail to
> attach to a pid namespace if it is not our current active pid namespace
> or a descendant of it. We can fail to attach to a user namespace because
> we are multi-threaded or because our current mount namespace shares
> filesystem state with other tasks, or because we're trying to setns()
> to the same user namespace, i.e. the target task has the same user
> namespace as we do. We can fail to attach to a mount namespace because
> it shares filesystem state with other tasks or because we fail to lookup
> the new root for the new mount namespace. In most non-pathological
> scenarios these issues can be somewhat mitigated. But there are cases where
> we're half-attached to some namespace and failing to attach to another one.
> I've talked about some of these problem during the hallway track (something
> only the pre-COVID-19 generation will remember) of Plumbers in Los Angeles
> in 2018(?). Even if all these issues could be avoided with super careful
> userspace coding it would be nicer to have this done in-kernel. Pidfds seem
> to lend themselves nicely for this.
>
> The other neat thing about this is that setns() becomes an actual
> counterpart to the namespace bits of unshare().
>
> Cc: Eric W. Biederman <[email protected]>
> Cc: Serge Hallyn <[email protected]>
> Cc: Jann Horn <[email protected]>
> Cc: Michael Kerrisk <[email protected]>
> Cc: Aleksa Sarai <[email protected]>
> Signed-off-by: Christian Brauner <[email protected]>
> ---
> Seems that some people do in fact find this useful and I still have
> hopes that I can potentially send this for v5.8. I've added some
> expanded testing to this whole series to catch any mistakes.
> The lifecycle management for various objects was certainly the
> most interesting aspect so I'd appreciate a close look.
> /* v2 */
> - Michael Kerrisk <[email protected]>:
> - Michael pointed out that the semantics for setns(nsfd, 0) and
> setns(pidfd, 0) are not comparable. setns(pidfd, 0) is now
> disallowed completely. Users wanting to attach to all namespaces
> should simply specify them explicitly just as with unshare() and
> clone3().
> - Jann Horn <[email protected]>:
> - Jann pointed out that the setns() in its first form wasn't atomic in
> that userspace could end up in an intermediate state where e.g. the
> process had moved into the target user namespace but failed to move
> into the target mount namespace.
> In this new version I've removed all intermediate states. There's an
> installation/preparation state and a commit state similar to
> prepare_creds() and commit_creds().
> ---
> fs/nsfs.c | 7 +-
> include/linux/proc_fs.h | 6 ++
> kernel/nsproxy.c | 228 ++++++++++++++++++++++++++++++++++++----
> 3 files changed, 222 insertions(+), 19 deletions(-)
>
> diff --git a/fs/nsfs.c b/fs/nsfs.c
> index 4f1205725cfe..b023c1a367c8 100644
> --- a/fs/nsfs.c
> +++ b/fs/nsfs.c
> @@ -229,6 +229,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task,
> return res;
> }
>
> +bool proc_ns_file(const struct file *file)
> +{
> + return file->f_op == &ns_file_operations;
> +}
> +
> struct file *proc_ns_fget(int fd)
> {
> struct file *file;
> @@ -237,7 +242,7 @@ struct file *proc_ns_fget(int fd)
> if (!file)
> return ERR_PTR(-EBADF);
>
> - if (file->f_op != &ns_file_operations)
> + if (!proc_ns_file(file))
> goto out_invalid;
>
> return file;
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index 45c05fd9c99d..acfd5012db4e 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -104,6 +104,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
> proc_write_t write,
> void *data);
> extern struct pid *tgid_pidfd_to_pid(const struct file *file);
> +extern bool proc_ns_file(const struct file *file);
>
> #ifdef CONFIG_PROC_PID_ARCH_STATUS
> /*
> @@ -159,6 +160,11 @@ static inline struct pid *tgid_pidfd_to_pid(const struct file *file)
> return ERR_PTR(-EBADF);
> }
>
> +static inline bool proc_ns_file(const struct file *file)
> +{
> + return false;
> +}
> +
> #endif /* CONFIG_PROC_FS */
>
> struct net;
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 2da463bab58a..8cae508acb9c 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -20,6 +20,7 @@
> #include <linux/ipc_namespace.h>
> #include <linux/time_namespace.h>
> #include <linux/fs_struct.h>
> +#include <linux/proc_fs.h>
> #include <linux/proc_ns.h>
> #include <linux/file.h>
> #include <linux/syscalls.h>
> @@ -258,12 +259,53 @@ void exit_task_namespaces(struct task_struct *p)
> switch_task_namespaces(p, NULL);
> }
>
> +static int check_setns_flags(unsigned long flags)
> +{
> + if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
> + CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID |
> + CLONE_NEWCGROUP)))
> + return -EINVAL;
> +
> +#ifndef CONFIG_USER_NS
> + if (flags & CLONE_NEWUSER)
> + return -EINVAL;
> +#endif
> +#ifndef CONFIG_PID_NS
> + if (flags & CLONE_NEWPID)
> + return -EINVAL;
> +#endif
> +#ifndef CONFIG_UTS_NS
> + if (flags & CLONE_NEWUTS)
> + return -EINVAL;
> +#endif
> +#ifndef CONFIG_IPC_NS
> + if (flags & CLONE_NEWIPC)
> + return -EINVAL;
> +#endif
> +#ifndef CONFIG_CGROUPS
> + if (flags & CLONE_NEWCGROUP)
> + return -EINVAL;
> +#endif
> +#ifndef CONFIG_NET_NS
> + if (flags & CLONE_NEWNET)
> + return -EINVAL;
> +#endif
> +
> + return 0;
> +}
> +
> static void put_nsset(struct nsset *nsset)
> {
> unsigned flags = nsset->flags;
>
> if (flags & CLONE_NEWUSER)
> put_cred(nsset_cred(nsset));
> + /*
> + * We only created a temporary copy if we attached to more than just
> + * the mount namespace.
> + */
> + if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
> + free_fs_struct(nsset->fs);
> if (nsset->nsproxy)
> free_nsproxy(nsset->nsproxy);
> }
> @@ -283,8 +325,14 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
> if (!nsset->cred)
> goto out;
>
> - if (flags & CLONE_NEWNS)
> + /* Only create a temporary copy of fs_struct if we really need to. */
> + if (flags == CLONE_NEWNS) {
> nsset->fs = me->fs;
> + } else if (flags & CLONE_NEWNS) {
> + nsset->fs = copy_fs_struct(me->fs);
> + if (!nsset->fs)
> + goto out;
> + }
>
> nsset->flags = flags;
> return 0;
> @@ -294,6 +342,138 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
> return -ENOMEM;
> }
>
> +static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
> +{
> + return ns->ops->install(nsset, ns);
> +}
> +
> +/*
> + * This is the inverse operation to unshare().
> + * Ordering is equivalent to the standard ordering used everywhere else
> + * during unshare and process creation. The switch to the new set of
> + * namespaces occurs at the point of no return after installation of
> + * all requested namespaces was successful in commit_nsset().
> + */
> +static int validate_nsset(struct nsset *nsset, struct pid *pid)
> +{
> + int ret = 0;
> + unsigned flags = nsset->flags;
> + struct user_namespace *user_ns = NULL;
> + struct pid_namespace *pid_ns = NULL;
> + struct nsproxy *nsp;
> + struct task_struct *tsk;
> +
> + /* Take a "snapshot" of the target task's namespaces. */
> + rcu_read_lock();
> + tsk = pid_task(pid, PIDTYPE_PID);
> + if (!tsk) {
> + rcu_read_unlock();
> + return -ESRCH;
> + }
> +
> + if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
> + rcu_read_unlock();
> + return -EPERM;
> + }
> +
> + task_lock(tsk);
> + nsp = tsk->nsproxy;
> + if (nsp)
> + get_nsproxy(nsp);
> + task_unlock(tsk);
> + if (!nsp) {
> + rcu_read_unlock();
> + return -ESRCH;
> + }
> +
> +#ifdef CONFIG_PID_NS
> + if (flags & CLONE_NEWPID) {
> + pid_ns = task_active_pid_ns(tsk);
> + if (unlikely(!pid_ns)) {
> + rcu_read_unlock();
> + ret = -ESRCH;
> + goto out;
> + }
> + get_pid_ns(pid_ns);
> + }
> +#endif
> +
> +#ifdef CONFIG_USER_NS
> + if (flags & CLONE_NEWUSER)
> + user_ns = get_user_ns(__task_cred(tsk)->user_ns);
> +#endif
> + rcu_read_unlock();
> +
> + /*
> + * Install requested namespaces. The caller will have
> + * verified earlier that the requested namespaces are
> + * supported on this kernel. We don't report errors here
> + * if a namespace is requested that isn't supported.
> + */
> +#ifdef CONFIG_USER_NS
> + if (flags & CLONE_NEWUSER) {
> + ret = validate_ns(nsset, &user_ns->ns);
> + if (ret)
> + goto out;
> + }
> +#endif
> +
> + if (flags & CLONE_NEWNS) {
> + ret = validate_ns(nsset, mnt_ns_to_common(nsp->mnt_ns));
> + if (ret)
> + goto out;
> + }
> +
> +#ifdef CONFIG_UTS_NS
> + if (flags & CLONE_NEWUTS) {
> + ret = validate_ns(nsset, &nsp->uts_ns->ns);
> + if (ret)
> + goto out;
> + }
> +#endif
> +
> +#ifdef CONFIG_IPC_NS
> + if (flags & CLONE_NEWIPC) {
> + ret = validate_ns(nsset, &nsp->ipc_ns->ns);
> + if (ret)
> + goto out;
> + }
> +#endif
> +
> +#ifdef CONFIG_PID_NS
> + if (flags & CLONE_NEWPID) {
> + ret = validate_ns(nsset, &pid_ns->ns);
> + if (ret)
> + goto out;
> + }
> +#endif
> +
> +#ifdef CONFIG_CGROUPS
> + if (flags & CLONE_NEWCGROUP) {
> + ret = validate_ns(nsset, &nsp->cgroup_ns->ns);
> + if (ret)
> + goto out;
> + }
> +#endif
> +
> +#ifdef CONFIG_NET_NS
> + if (flags & CLONE_NEWNET) {
> + ret = validate_ns(nsset, &nsp->net_ns->ns);
> + if (ret)
> + goto out;
> + }
> +#endif
> +
> +out:
> + if (pid_ns)
> + put_pid_ns(pid_ns);
> + if (nsp)
> + put_nsproxy(nsp);
> + put_user_ns(user_ns);
> +
> + return ret;
> +}
> +
> /*
> * This is the point of no return. There are just a few namespaces
> * that do some actual work here and it's sufficiently minimal that
> @@ -316,6 +496,12 @@ static void commit_nsset(struct nsset *nsset)
> }
> #endif
>
> + /* We only need to commit if we have used a temporary fs_struct. */
> + if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) {
> + set_fs_root(me->fs, &nsset->fs->root);
> + set_fs_pwd(me->fs, &nsset->fs->pwd);
> + }
> +
> #ifdef CONFIG_IPC_NS
> if (flags & CLONE_NEWIPC)
> exit_sem(me);
> @@ -326,33 +512,39 @@ static void commit_nsset(struct nsset *nsset)
> nsset->nsproxy = NULL;
> }
>
> -static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
> -{
> - return ns->ops->install(nsset, ns);
> -}
> -
> -SYSCALL_DEFINE2(setns, int, fd, int, nstype)
> +SYSCALL_DEFINE2(setns, int, fd, int, flags)
> {
> struct task_struct *tsk = current;
> struct file *file;
> - struct ns_common *ns;
> + struct ns_common *ns = NULL;
> struct nsset nsset = {};
> - int err;
> -
> - file = proc_ns_fget(fd);
> - if (IS_ERR(file))
> - return PTR_ERR(file);
> + int err = 0;
>
> - err = -EINVAL;
> - ns = get_proc_ns(file_inode(file));
> - if (nstype && (ns->ops->type != nstype))
> + file = fget(fd);
> + if (!file)
> + return -EBADF;
> +
> + if (proc_ns_file(file)) {
> + ns = get_proc_ns(file_inode(file));
> + if (flags && (ns->ops->type != flags))
> + err = -EINVAL;
> + flags = ns->ops->type;
> + } else if (pidfd_pid(file)) {
> + err = check_setns_flags(flags);
> + } else {
> + err = -EBADF;
> + }
> + if (err)
> goto out;
>
> - err = prepare_nsset(nstype, &nsset);
> + err = prepare_nsset(flags, &nsset);
> if (err)
> goto out;
>
> - err = validate_ns(&nsset, ns);
> + if (proc_ns_file(file))
> + err = validate_ns(&nsset, ns);
> + else
> + err = validate_nsset(&nsset, file->private_data);
> if (!err) {
> commit_nsset(&nsset);
> perf_event_namespaces(tsk);
> --
> 2.26.2
>
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/
On June 15, 2020 1:13:37 PM GMT+02:00, "Michael Kerrisk (man-pages)" <[email protected]> wrote:
>Hello Christian,
>
>Looking at this patch, and commit 303cc571d107b that landed in
>5.8-rc1, time namespaces were omitted. I assume this was an accident,
>since the commit message makes no statement about excluding time
>namespaces, and even mentions their existence as part of the
>justification for the patch ("With time namespaces we're looking at
>about 16 syscalls.").
>
>Is a fix needed here?
>
>Thanks,
>
>Michael
>
>On Thu, 30 Apr 2020 at 18:57, Christian Brauner
><[email protected]> wrote:
>>
>> For quite a while we have been thinking about using pidfds to attach
>to
>> namespaces. This patchset has existed for about a year already but
>we've
>> wanted to wait to see how the general api would be received and
>adopted.
>> Now that more and more programs in userspace have started using
>pidfds
>> for process management it's time to send this one out.
>>
>> This patch makes it possible to use pidfds to attach to the
>namespaces
>> of another process, i.e. they can be passed as the first argument to
>the
>> setns() syscall. When only a single namespace type is specified the
>> semantics are equivalent to passing an nsfd. That means
>> setns(nsfd, CLONE_NEWNET) equals setns(pidfd, CLONE_NEWNET). However,
>> when a pidfd is passed, multiple namespace flags can be specified in
>the
>> second setns() argument and setns() will attach the caller to all the
>> specified namespaces all at once or to none of them. Specifying 0 is
>not
>> valid together with a pidfd.
>>
>> The obvious example where this is useful is a standard container
>> manager interacting with a running container: pushing and pulling
>files
>> or directories, injecting mounts, attaching/execing any kind of
>process,
>> managing network devices all these operations require attaching to
>all
>> or at least multiple namespaces at the same time. Given that nowadays
>> most containers are spawned with all namespaces enabled we're
>currently
>> looking at at least 14 syscalls, 7 to open the /proc/<pid>/ns/<ns>
>> nsfds, another 7 to actually perform the namespace switch. With time
>> namespaces we're looking at about 16 syscalls.
>> (We could amortize the first 7 or 8 syscalls for opening the nsfds by
>> stashing them in each container's monitor process but that would
>mean
>> we need to send around those file descriptors through unix sockets
>> everytime we want to interact with the container or keep on-disk
>> state. Even in scenarios where a caller wants to join a particular
>> namespace in a particular order callers still profit from batching
>> other namespaces. That mostly applies to the user namespace but
>> all container runtimes I found join the user namespace first no
>matter
>> if it privileges or deprivileges the container similar to how
>unshare
>> behaves.)
>> With pidfds this becomes a single syscall no matter how many
>namespaces
>> are supposed to be attached to.
>>
>> A decently designed, large-scale container manager usually isn't the
>> parent of any of the containers it spawns so the containers don't die
>> when it crashes or needs to update or reinitialize. This means that
>> for the manager to interact with containers through pids is
>inherently
>> racy especially on systems where the maximum pid number is not
>> significicantly bumped. This is even more problematic since we often
>spawn
>> and manage thousands or ten-thousands of containers. Interacting with
>a
>> container through a pid thus can become risky quite quickly.
>Especially
>> since we allow for an administrator to enable advanced features such
>as
>> syscall interception where we're performing syscalls in lieu of the
>> container. In all of those cases we use pidfds if they are available
>and
>> we pass them around as stable references. Using them to setns() to
>the
>> target process' namespaces is as reliable as using nsfds. Either the
>> target process is already dead and we get ESRCH or we manage to
>attach
>> to its namespaces but we can't accidently attach to another process'
>> namespaces. So pidfds lend themselves to be used with this api.
>> The other main advantage is that with this change the pidfd becomes
>the
>> only relevant token for most container interactions and it's the only
>> token we need to create and send around.
>>
>> Apart from significiantly reducing the number of syscalls from double
>> digit to single digit which is a decent reason post-spectre/meltdown
>> this also allows to switch to a set of namespaces atomically, i.e.
>> either attaching to all the specified namespaces succeeds or we fail.
>If
>> we fail we haven't changed a single namespace. There are currently
>three
>> namespaces that can fail (other than for ENOMEM which really is not
>> very interesting since we then have other problems anyway) for
>> non-trivial reasons, user, mount, and pid namespaces. We can fail to
>> attach to a pid namespace if it is not our current active pid
>namespace
>> or a descendant of it. We can fail to attach to a user namespace
>because
>> we are multi-threaded or because our current mount namespace shares
>> filesystem state with other tasks, or because we're trying to setns()
>> to the same user namespace, i.e. the target task has the same user
>> namespace as we do. We can fail to attach to a mount namespace
>because
>> it shares filesystem state with other tasks or because we fail to
>lookup
>> the new root for the new mount namespace. In most non-pathological
>> scenarios these issues can be somewhat mitigated. But there are cases
>where
>> we're half-attached to some namespace and failing to attach to
>another one.
>> I've talked about some of these problem during the hallway track
>(something
>> only the pre-COVID-19 generation will remember) of Plumbers in Los
>Angeles
>> in 2018(?). Even if all these issues could be avoided with super
>careful
>> userspace coding it would be nicer to have this done in-kernel.
>Pidfds seem
>> to lend themselves nicely for this.
>>
>> The other neat thing about this is that setns() becomes an actual
>> counterpart to the namespace bits of unshare().
>>
>> Cc: Eric W. Biederman <[email protected]>
>> Cc: Serge Hallyn <[email protected]>
>> Cc: Jann Horn <[email protected]>
>> Cc: Michael Kerrisk <[email protected]>
>> Cc: Aleksa Sarai <[email protected]>
>> Signed-off-by: Christian Brauner <[email protected]>
>> ---
>> Seems that some people do in fact find this useful and I still have
>> hopes that I can potentially send this for v5.8. I've added some
>> expanded testing to this whole series to catch any mistakes.
>> The lifecycle management for various objects was certainly the
>> most interesting aspect so I'd appreciate a close look.
>> /* v2 */
>> - Michael Kerrisk <[email protected]>:
>> - Michael pointed out that the semantics for setns(nsfd, 0) and
>> setns(pidfd, 0) are not comparable. setns(pidfd, 0) is now
>> disallowed completely. Users wanting to attach to all namespaces
>> should simply specify them explicitly just as with unshare() and
>> clone3().
>> - Jann Horn <[email protected]>:
>> - Jann pointed out that the setns() in its first form wasn't atomic
>in
>> that userspace could end up in an intermediate state where e.g.
>the
>> process had moved into the target user namespace but failed to
>move
>> into the target mount namespace.
>> In this new version I've removed all intermediate states. There's
>an
>> installation/preparation state and a commit state similar to
>> prepare_creds() and commit_creds().
>> ---
>> fs/nsfs.c | 7 +-
>> include/linux/proc_fs.h | 6 ++
>> kernel/nsproxy.c | 228
>++++++++++++++++++++++++++++++++++++----
>> 3 files changed, 222 insertions(+), 19 deletions(-)
>>
>> diff --git a/fs/nsfs.c b/fs/nsfs.c
>> index 4f1205725cfe..b023c1a367c8 100644
>> --- a/fs/nsfs.c
>> +++ b/fs/nsfs.c
>> @@ -229,6 +229,11 @@ int ns_get_name(char *buf, size_t size, struct
>task_struct *task,
>> return res;
>> }
>>
>> +bool proc_ns_file(const struct file *file)
>> +{
>> + return file->f_op == &ns_file_operations;
>> +}
>> +
>> struct file *proc_ns_fget(int fd)
>> {
>> struct file *file;
>> @@ -237,7 +242,7 @@ struct file *proc_ns_fget(int fd)
>> if (!file)
>> return ERR_PTR(-EBADF);
>>
>> - if (file->f_op != &ns_file_operations)
>> + if (!proc_ns_file(file))
>> goto out_invalid;
>>
>> return file;
>> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
>> index 45c05fd9c99d..acfd5012db4e 100644
>> --- a/include/linux/proc_fs.h
>> +++ b/include/linux/proc_fs.h
>> @@ -104,6 +104,7 @@ struct proc_dir_entry
>*proc_create_net_single_write(const char *name, umode_t mo
>> proc_write_t
>write,
>> void *data);
>> extern struct pid *tgid_pidfd_to_pid(const struct file *file);
>> +extern bool proc_ns_file(const struct file *file);
>>
>> #ifdef CONFIG_PROC_PID_ARCH_STATUS
>> /*
>> @@ -159,6 +160,11 @@ static inline struct pid
>*tgid_pidfd_to_pid(const struct file *file)
>> return ERR_PTR(-EBADF);
>> }
>>
>> +static inline bool proc_ns_file(const struct file *file)
>> +{
>> + return false;
>> +}
>> +
>> #endif /* CONFIG_PROC_FS */
>>
>> struct net;
>> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
>> index 2da463bab58a..8cae508acb9c 100644
>> --- a/kernel/nsproxy.c
>> +++ b/kernel/nsproxy.c
>> @@ -20,6 +20,7 @@
>> #include <linux/ipc_namespace.h>
>> #include <linux/time_namespace.h>
>> #include <linux/fs_struct.h>
>> +#include <linux/proc_fs.h>
>> #include <linux/proc_ns.h>
>> #include <linux/file.h>
>> #include <linux/syscalls.h>
>> @@ -258,12 +259,53 @@ void exit_task_namespaces(struct task_struct
>*p)
>> switch_task_namespaces(p, NULL);
>> }
>>
>> +static int check_setns_flags(unsigned long flags)
>> +{
>> + if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS |
>CLONE_NEWIPC |
>> + CLONE_NEWNET | CLONE_NEWUSER |
>CLONE_NEWPID |
>> + CLONE_NEWCGROUP)))
>> + return -EINVAL;
>> +
>> +#ifndef CONFIG_USER_NS
>> + if (flags & CLONE_NEWUSER)
>> + return -EINVAL;
>> +#endif
>> +#ifndef CONFIG_PID_NS
>> + if (flags & CLONE_NEWPID)
>> + return -EINVAL;
>> +#endif
>> +#ifndef CONFIG_UTS_NS
>> + if (flags & CLONE_NEWUTS)
>> + return -EINVAL;
>> +#endif
>> +#ifndef CONFIG_IPC_NS
>> + if (flags & CLONE_NEWIPC)
>> + return -EINVAL;
>> +#endif
>> +#ifndef CONFIG_CGROUPS
>> + if (flags & CLONE_NEWCGROUP)
>> + return -EINVAL;
>> +#endif
>> +#ifndef CONFIG_NET_NS
>> + if (flags & CLONE_NEWNET)
>> + return -EINVAL;
>> +#endif
>> +
>> + return 0;
>> +}
>> +
>> static void put_nsset(struct nsset *nsset)
>> {
>> unsigned flags = nsset->flags;
>>
>> if (flags & CLONE_NEWUSER)
>> put_cred(nsset_cred(nsset));
>> + /*
>> + * We only created a temporary copy if we attached to more
>than just
>> + * the mount namespace.
>> + */
>> + if (nsset->fs && (flags & CLONE_NEWNS) && (flags &
>~CLONE_NEWNS))
>> + free_fs_struct(nsset->fs);
>> if (nsset->nsproxy)
>> free_nsproxy(nsset->nsproxy);
>> }
>> @@ -283,8 +325,14 @@ static int prepare_nsset(unsigned flags, struct
>nsset *nsset)
>> if (!nsset->cred)
>> goto out;
>>
>> - if (flags & CLONE_NEWNS)
>> + /* Only create a temporary copy of fs_struct if we really
>need to. */
>> + if (flags == CLONE_NEWNS) {
>> nsset->fs = me->fs;
>> + } else if (flags & CLONE_NEWNS) {
>> + nsset->fs = copy_fs_struct(me->fs);
>> + if (!nsset->fs)
>> + goto out;
>> + }
>>
>> nsset->flags = flags;
>> return 0;
>> @@ -294,6 +342,138 @@ static int prepare_nsset(unsigned flags, struct
>nsset *nsset)
>> return -ENOMEM;
>> }
>>
>> +static inline int validate_ns(struct nsset *nsset, struct ns_common
>*ns)
>> +{
>> + return ns->ops->install(nsset, ns);
>> +}
>> +
>> +/*
>> + * This is the inverse operation to unshare().
>> + * Ordering is equivalent to the standard ordering used everywhere
>else
>> + * during unshare and process creation. The switch to the new set of
>> + * namespaces occurs at the point of no return after installation of
>> + * all requested namespaces was successful in commit_nsset().
>> + */
>> +static int validate_nsset(struct nsset *nsset, struct pid *pid)
>> +{
>> + int ret = 0;
>> + unsigned flags = nsset->flags;
>> + struct user_namespace *user_ns = NULL;
>> + struct pid_namespace *pid_ns = NULL;
>> + struct nsproxy *nsp;
>> + struct task_struct *tsk;
>> +
>> + /* Take a "snapshot" of the target task's namespaces. */
>> + rcu_read_lock();
>> + tsk = pid_task(pid, PIDTYPE_PID);
>> + if (!tsk) {
>> + rcu_read_unlock();
>> + return -ESRCH;
>> + }
>> +
>> + if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
>> + rcu_read_unlock();
>> + return -EPERM;
>> + }
>> +
>> + task_lock(tsk);
>> + nsp = tsk->nsproxy;
>> + if (nsp)
>> + get_nsproxy(nsp);
>> + task_unlock(tsk);
>> + if (!nsp) {
>> + rcu_read_unlock();
>> + return -ESRCH;
>> + }
>> +
>> +#ifdef CONFIG_PID_NS
>> + if (flags & CLONE_NEWPID) {
>> + pid_ns = task_active_pid_ns(tsk);
>> + if (unlikely(!pid_ns)) {
>> + rcu_read_unlock();
>> + ret = -ESRCH;
>> + goto out;
>> + }
>> + get_pid_ns(pid_ns);
>> + }
>> +#endif
>> +
>> +#ifdef CONFIG_USER_NS
>> + if (flags & CLONE_NEWUSER)
>> + user_ns = get_user_ns(__task_cred(tsk)->user_ns);
>> +#endif
>> + rcu_read_unlock();
>> +
>> + /*
>> + * Install requested namespaces. The caller will have
>> + * verified earlier that the requested namespaces are
>> + * supported on this kernel. We don't report errors here
>> + * if a namespace is requested that isn't supported.
>> + */
>> +#ifdef CONFIG_USER_NS
>> + if (flags & CLONE_NEWUSER) {
>> + ret = validate_ns(nsset, &user_ns->ns);
>> + if (ret)
>> + goto out;
>> + }
>> +#endif
>> +
>> + if (flags & CLONE_NEWNS) {
>> + ret = validate_ns(nsset,
>mnt_ns_to_common(nsp->mnt_ns));
>> + if (ret)
>> + goto out;
>> + }
>> +
>> +#ifdef CONFIG_UTS_NS
>> + if (flags & CLONE_NEWUTS) {
>> + ret = validate_ns(nsset, &nsp->uts_ns->ns);
>> + if (ret)
>> + goto out;
>> + }
>> +#endif
>> +
>> +#ifdef CONFIG_IPC_NS
>> + if (flags & CLONE_NEWIPC) {
>> + ret = validate_ns(nsset, &nsp->ipc_ns->ns);
>> + if (ret)
>> + goto out;
>> + }
>> +#endif
>> +
>> +#ifdef CONFIG_PID_NS
>> + if (flags & CLONE_NEWPID) {
>> + ret = validate_ns(nsset, &pid_ns->ns);
>> + if (ret)
>> + goto out;
>> + }
>> +#endif
>> +
>> +#ifdef CONFIG_CGROUPS
>> + if (flags & CLONE_NEWCGROUP) {
>> + ret = validate_ns(nsset, &nsp->cgroup_ns->ns);
>> + if (ret)
>> + goto out;
>> + }
>> +#endif
>> +
>> +#ifdef CONFIG_NET_NS
>> + if (flags & CLONE_NEWNET) {
>> + ret = validate_ns(nsset, &nsp->net_ns->ns);
>> + if (ret)
>> + goto out;
>> + }
>> +#endif
>> +
>> +out:
>> + if (pid_ns)
>> + put_pid_ns(pid_ns);
>> + if (nsp)
>> + put_nsproxy(nsp);
>> + put_user_ns(user_ns);
>> +
>> + return ret;
>> +}
>> +
>> /*
>> * This is the point of no return. There are just a few namespaces
>> * that do some actual work here and it's sufficiently minimal that
>> @@ -316,6 +496,12 @@ static void commit_nsset(struct nsset *nsset)
>> }
>> #endif
>>
>> + /* We only need to commit if we have used a temporary
>fs_struct. */
>> + if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) {
>> + set_fs_root(me->fs, &nsset->fs->root);
>> + set_fs_pwd(me->fs, &nsset->fs->pwd);
>> + }
>> +
>> #ifdef CONFIG_IPC_NS
>> if (flags & CLONE_NEWIPC)
>> exit_sem(me);
>> @@ -326,33 +512,39 @@ static void commit_nsset(struct nsset *nsset)
>> nsset->nsproxy = NULL;
>> }
>>
>> -static inline int validate_ns(struct nsset *nsset, struct ns_common
>*ns)
>> -{
>> - return ns->ops->install(nsset, ns);
>> -}
>> -
>> -SYSCALL_DEFINE2(setns, int, fd, int, nstype)
>> +SYSCALL_DEFINE2(setns, int, fd, int, flags)
>> {
>> struct task_struct *tsk = current;
>> struct file *file;
>> - struct ns_common *ns;
>> + struct ns_common *ns = NULL;
>> struct nsset nsset = {};
>> - int err;
>> -
>> - file = proc_ns_fget(fd);
>> - if (IS_ERR(file))
>> - return PTR_ERR(file);
>> + int err = 0;
>>
>> - err = -EINVAL;
>> - ns = get_proc_ns(file_inode(file));
>> - if (nstype && (ns->ops->type != nstype))
>> + file = fget(fd);
>> + if (!file)
>> + return -EBADF;
>> +
>> + if (proc_ns_file(file)) {
>> + ns = get_proc_ns(file_inode(file));
>> + if (flags && (ns->ops->type != flags))
>> + err = -EINVAL;
>> + flags = ns->ops->type;
>> + } else if (pidfd_pid(file)) {
>> + err = check_setns_flags(flags);
>> + } else {
>> + err = -EBADF;
>> + }
>> + if (err)
>> goto out;
>>
>> - err = prepare_nsset(nstype, &nsset);
>> + err = prepare_nsset(flags, &nsset);
>> if (err)
>> goto out;
>>
>> - err = validate_ns(&nsset, ns);
>> + if (proc_ns_file(file))
>> + err = validate_ns(&nsset, ns);
>> + else
>> + err = validate_nsset(&nsset, file->private_data);
>> if (!err) {
>> commit_nsset(&nsset);
>> perf_event_namespaces(tsk);
>> --
>> 2.26.2
>>
I've omitted them on purpose since we need a small fix first.
I've already discussed this at
https://lore.kernel.org/lkml/20200612144744.orw4yiaosghqlv4w@wittgenstein
I'll send the series next week. I'm on vacation this week.
Thanks!
Christian