2017-04-27 12:33:09

by Kirill Tkhai

[permalink] [raw]
Subject: [PATCH v3] pid_ns: Introduce ioctl to set vector of ns_last_pid's on ns hierarhy

On implementing of nested pid namespaces support in CRIU
(checkpoint-restore in userspace tool) we run into
the situation, that it's impossible to create a task with
specific NSpid effectively. After commit 49f4d8b93ccf
"pidns: Capture the user namespace and filter ns_last_pid"
it is impossible to set ns_last_pid on any pid namespace,
except task's active pid_ns (before the commit it was possible
to write to pid_ns_for_children). Thus, if a restored task
in a container has more than one pid_ns levels, the restorer
code must have a task helper for every pid namespace
of the task's pid_ns hierarhy.

This is a big problem, because of communication with
a helper for every pid_ns in the hierarchy is not cheap.
It's not performance-good as it implies many helpers wakeups
to create a single task (independently, how you communicate
with the helpers). This patch tries to decide the problem.

It introduces a new pid_ns ioctl(NS_SET_LAST_PID_VEC),
which allows to write a vector of last pids on pid_ns hierarchy.
The vector is passed as array of pids in struct ns_ioc_pid_vec,
written in reverse order. The first number corresponds to
the opened namespace ns_last_pid, the second is to its parent, etc.
So, if you have the pid namespaces hierarchy like:

pid_ns1 (grand father)
|
v
pid_ns2 (father)
|
v
pid_ns3 (child)

and the pid_ns3 is open, then the corresponding vector will be
{last_ns_pid3, last_ns_pid2, last_ns_pid1}. This vector may be
short and it may contain less levels. For example,
{last_ns_pid3, last_ns_pid2} or even {last_ns_pid3}, in dependence
of which levels you want to populate.

v3: Use __u32 in uapi instead of unsigned int.

v2: Kill pid_ns->child_reaper check as it's impossible to have
such a pid namespace file open.
Use generic namespaces ioctl() number.
Pass pids as array, not as a string.

Signed-off-by: Kirill Tkhai <[email protected]>
---
fs/nsfs.c | 5 +++++
include/linux/pid_namespace.h | 12 ++++++++++++
include/uapi/linux/nsfs.h | 7 +++++++
kernel/pid_namespace.c | 35 +++++++++++++++++++++++++++++++++++
4 files changed, 59 insertions(+)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 323f492e0822..f669a1552003 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -6,6 +6,7 @@
#include <linux/ktime.h>
#include <linux/seq_file.h>
#include <linux/user_namespace.h>
+#include <linux/pid_namespace.h>
#include <linux/nsfs.h>
#include <linux/uaccess.h>

@@ -186,6 +187,10 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
argp = (uid_t __user *) arg;
uid = from_kuid_munged(current_user_ns(), user_ns->owner);
return put_user(uid, argp);
+ case NS_SET_LAST_PID_VEC:
+ if (ns->ops->type != CLONE_NEWPID)
+ return -EINVAL;
+ return pidns_set_last_pid_vec(ns, (void *)arg);
default:
return -ENOTTY;
}
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index c2a989dee876..c8dc4173a4e8 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -9,6 +9,7 @@
#include <linux/nsproxy.h>
#include <linux/kref.h>
#include <linux/ns_common.h>
+#include <uapi/linux/nsfs.h>

struct pidmap {
atomic_t nr_free;
@@ -103,6 +104,17 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
}
#endif /* CONFIG_PID_NS */

+#if defined(CONFIG_PID_NS) && defined(CONFIG_CHECKPOINT_RESTORE)
+extern long pidns_set_last_pid_vec(struct ns_common *ns,
+ struct ns_ioc_pid_vec __user *vec);
+#else /* CONFIG_PID_NS && CONFIG_CHECKPOINT_RESTORE */
+static inline long pidns_set_last_pid_vec(struct ns_common *ns,
+ struct ns_ioc_pid_vec __user *vec)
+{
+ return -ENOTTY;
+}
+#endif /* CONFIG_PID_NS && CONFIG_CHECKPOINT_RESTORE */
+
extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
void pidmap_init(void);
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index 1a3ca79f466b..1254b02a47fa 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -14,5 +14,12 @@
#define NS_GET_NSTYPE _IO(NSIO, 0x3)
/* Get owner UID (in the caller's user namespace) for a user namespace */
#define NS_GET_OWNER_UID _IO(NSIO, 0x4)
+/* Set a vector of ns_last_pid for a pid namespace stack */
+#define NS_SET_LAST_PID_VEC _IO(NSIO, 0x5)
+
+struct ns_ioc_pid_vec {
+ __u32 nr;
+ pid_t pid[0];
+};

#endif /* __LINUX_NSFS_H */
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index de461aa0bf9a..08b5fef23534 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,7 @@
#include <linux/export.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
+#include <uapi/linux/nsfs.h>

struct pid_cache {
int nr_ids;
@@ -428,6 +429,40 @@ static struct ns_common *pidns_get_parent(struct ns_common *ns)
return &get_pid_ns(pid_ns)->ns;
}

+#ifdef CONFIG_CHECKPOINT_RESTORE
+long pidns_set_last_pid_vec(struct ns_common *ns,
+ struct ns_ioc_pid_vec __user *vec)
+{
+ struct pid_namespace *pid_ns = to_pid_ns(ns);
+ pid_t pid, __user *pid_ptr;
+ u32 nr;
+
+ if (get_user(nr, &vec->nr))
+ return -EFAULT;
+ if (nr > 32 || nr < 1)
+ return -EINVAL;
+
+ pid_ptr = &vec->pid[0];
+ do {
+ if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(pid, pid_ptr))
+ return -EFAULT;
+ if (pid < 0 || pid > pid_max)
+ return -EINVAL;
+
+ /* Write directly: see the comment in pid_ns_ctl_handler() */
+ pid_ns->last_pid = pid;
+
+ pid_ns = pid_ns->parent;
+ pid_ptr++;
+ } while (--nr > 0 && pid_ns);
+
+ return nr ? -EINVAL : 0;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
static struct user_namespace *pidns_owner(struct ns_common *ns)
{
return to_pid_ns(ns)->user_ns;


2017-04-27 15:22:22

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v3] pid_ns: Introduce ioctl to set vector of ns_last_pid's on ns hierarhy

Kirill Tkhai <[email protected]> writes:

> On implementing of nested pid namespaces support in CRIU
> (checkpoint-restore in userspace tool) we run into
> the situation, that it's impossible to create a task with
> specific NSpid effectively. After commit 49f4d8b93ccf
> "pidns: Capture the user namespace and filter ns_last_pid"
> it is impossible to set ns_last_pid on any pid namespace,
> except task's active pid_ns (before the commit it was possible
> to write to pid_ns_for_children). Thus, if a restored task
> in a container has more than one pid_ns levels, the restorer
> code must have a task helper for every pid namespace
> of the task's pid_ns hierarhy.
>
> This is a big problem, because of communication with
> a helper for every pid_ns in the hierarchy is not cheap.
> It's not performance-good as it implies many helpers wakeups
> to create a single task (independently, how you communicate
> with the helpers). This patch tries to decide the problem.

I see the problem and we definitely need to do something.
Your patch does appear better than what we have been doing.
So a tenative conceptual ack.

At the same time it is legitimate to claim that the use of
task_active_pid_ns(current) rather than
current->nsproxy->pid_ns_for_children is a regression caused by the
above commit. So we can fix the original issue as well.

I do have to ask when was this problem discovered, and why did it take
so long to discover? The regeression happened nearly 5 years ago.

Was CRIU already using this?

It looks like the fix is a one line low danger change to
/proc/sys/kernel/ns_last_pid. With a low danger as pid_ns_for_children
rarely differs from task_active_pid_ns().


> It introduces a new pid_ns ioctl(NS_SET_LAST_PID_VEC),
> which allows to write a vector of last pids on pid_ns hierarchy.
> The vector is passed as array of pids in struct ns_ioc_pid_vec,
> written in reverse order. The first number corresponds to
> the opened namespace ns_last_pid, the second is to its parent, etc.
> So, if you have the pid namespaces hierarchy like:
>
> pid_ns1 (grand father)
> |
> v
> pid_ns2 (father)
> |
> v
> pid_ns3 (child)
>
> and the pid_ns3 is open, then the corresponding vector will be
> {last_ns_pid3, last_ns_pid2, last_ns_pid1}. This vector may be
> short and it may contain less levels. For example,
> {last_ns_pid3, last_ns_pid2} or even {last_ns_pid3}, in dependence
> of which levels you want to populate.
>
> v3: Use __u32 in uapi instead of unsigned int.
>
> v2: Kill pid_ns->child_reaper check as it's impossible to have
> such a pid namespace file open.
> Use generic namespaces ioctl() number.
> Pass pids as array, not as a string.
>
> Signed-off-by: Kirill Tkhai <[email protected]>
> ---
> fs/nsfs.c | 5 +++++
> include/linux/pid_namespace.h | 12 ++++++++++++
> include/uapi/linux/nsfs.h | 7 +++++++
> kernel/pid_namespace.c | 35 +++++++++++++++++++++++++++++++++++
> 4 files changed, 59 insertions(+)
>
> diff --git a/fs/nsfs.c b/fs/nsfs.c
> index 323f492e0822..f669a1552003 100644
> --- a/fs/nsfs.c
> +++ b/fs/nsfs.c
> @@ -6,6 +6,7 @@
> #include <linux/ktime.h>
> #include <linux/seq_file.h>
> #include <linux/user_namespace.h>
> +#include <linux/pid_namespace.h>
> #include <linux/nsfs.h>
> #include <linux/uaccess.h>
>
> @@ -186,6 +187,10 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
> argp = (uid_t __user *) arg;
> uid = from_kuid_munged(current_user_ns(), user_ns->owner);
> return put_user(uid, argp);
> + case NS_SET_LAST_PID_VEC:
> + if (ns->ops->type != CLONE_NEWPID)
> + return -EINVAL;
> + return pidns_set_last_pid_vec(ns, (void *)arg);
> default:
> return -ENOTTY;
> }
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index c2a989dee876..c8dc4173a4e8 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -9,6 +9,7 @@
> #include <linux/nsproxy.h>
> #include <linux/kref.h>
> #include <linux/ns_common.h>
> +#include <uapi/linux/nsfs.h>

No need for the extra include and slowing down the build. Just
declare the relevant structures.
>
> struct pidmap {
> atomic_t nr_free;
> @@ -103,6 +104,17 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
> }
> #endif /* CONFIG_PID_NS */
>
> +#if defined(CONFIG_PID_NS) && defined(CONFIG_CHECKPOINT_RESTORE)
> +extern long pidns_set_last_pid_vec(struct ns_common *ns,
> + struct ns_ioc_pid_vec __user *vec);
> +#else /* CONFIG_PID_NS && CONFIG_CHECKPOINT_RESTORE */
> +static inline long pidns_set_last_pid_vec(struct ns_common *ns,
> + struct ns_ioc_pid_vec __user *vec)
> +{
> + return -ENOTTY;
> +}
> +#endif /* CONFIG_PID_NS && CONFIG_CHECKPOINT_RESTORE */

Just CONFIG_PID_NS please. Either this is good enough for everyone who
has pid namespace support enabled or it isn't.


> extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
> void pidhash_init(void);
> void pidmap_init(void);
> diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
> index 1a3ca79f466b..1254b02a47fa 100644
> --- a/include/uapi/linux/nsfs.h
> +++ b/include/uapi/linux/nsfs.h
> @@ -14,5 +14,12 @@
> #define NS_GET_NSTYPE _IO(NSIO, 0x3)
> /* Get owner UID (in the caller's user namespace) for a user namespace */
> #define NS_GET_OWNER_UID _IO(NSIO, 0x4)
> +/* Set a vector of ns_last_pid for a pid namespace stack */
> +#define NS_SET_LAST_PID_VEC _IO(NSIO, 0x5)
> +
> +struct ns_ioc_pid_vec {
> + __u32 nr;
> + pid_t pid[0];
> +};
>
> #endif /* __LINUX_NSFS_H */
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index de461aa0bf9a..08b5fef23534 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -21,6 +21,7 @@
> #include <linux/export.h>
> #include <linux/sched/task.h>
> #include <linux/sched/signal.h>
> +#include <uapi/linux/nsfs.h>
>
> struct pid_cache {
> int nr_ids;
> @@ -428,6 +429,40 @@ static struct ns_common *pidns_get_parent(struct ns_common *ns)
> return &get_pid_ns(pid_ns)->ns;
> }
>
> +#ifdef CONFIG_CHECKPOINT_RESTORE
> +long pidns_set_last_pid_vec(struct ns_common *ns,
> + struct ns_ioc_pid_vec __user *vec)
> +{
> + struct pid_namespace *pid_ns = to_pid_ns(ns);
> + pid_t pid, __user *pid_ptr;
> + u32 nr;
> +
> + if (get_user(nr, &vec->nr))
> + return -EFAULT;
> + if (nr > 32 || nr < 1)

The maximum needs not to be hard coded.

> + return -EINVAL;
> +
> + pid_ptr = &vec->pid[0];

All of the rest of the vector needs to be read in, in one go.

> + do {
> + if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
> + return -EPERM;

Permission to change all of the namespaces should be checked before
writing to pid_ns->last_pid happens.
> +
> + if (get_user(pid, pid_ptr))
> + return -EFAULT;
> + if (pid < 0 || pid > pid_max)
> + return -EINVAL;
> +
> + /* Write directly: see the comment in pid_ns_ctl_handler() */
> + pid_ns->last_pid = pid;
> +
> + pid_ns = pid_ns->parent;
> + pid_ptr++;
> + } while (--nr > 0 && pid_ns);
> +
> + return nr ? -EINVAL : 0;
> +}
> +#endif /* CONFIG_CHECKPOINT_RESTORE */
> +
> static struct user_namespace *pidns_owner(struct ns_common *ns)
> {
> return to_pid_ns(ns)->user_ns;

2017-04-27 15:53:16

by Kirill Tkhai

[permalink] [raw]
Subject: Re: [PATCH v3] pid_ns: Introduce ioctl to set vector of ns_last_pid's on ns hierarhy

On 27.04.2017 18:15, Eric W. Biederman wrote:
> Kirill Tkhai <[email protected]> writes:
>
>> On implementing of nested pid namespaces support in CRIU
>> (checkpoint-restore in userspace tool) we run into
>> the situation, that it's impossible to create a task with
>> specific NSpid effectively. After commit 49f4d8b93ccf
>> "pidns: Capture the user namespace and filter ns_last_pid"
>> it is impossible to set ns_last_pid on any pid namespace,
>> except task's active pid_ns (before the commit it was possible
>> to write to pid_ns_for_children). Thus, if a restored task
>> in a container has more than one pid_ns levels, the restorer
>> code must have a task helper for every pid namespace
>> of the task's pid_ns hierarhy.
>>
>> This is a big problem, because of communication with
>> a helper for every pid_ns in the hierarchy is not cheap.
>> It's not performance-good as it implies many helpers wakeups
>> to create a single task (independently, how you communicate
>> with the helpers). This patch tries to decide the problem.
>
> I see the problem and we definitely need to do something.
> Your patch does appear better than what we have been doing.
> So a tenative conceptual ack.
>
> At the same time it is legitimate to claim that the use of
> task_active_pid_ns(current) rather than
> current->nsproxy->pid_ns_for_children is a regression caused by the
> above commit. So we can fix the original issue as well.
>
> I do have to ask when was this problem discovered, and why did it take
> so long to discover? The regeression happened nearly 5 years ago.
>
> Was CRIU already using this?

CRIU uses ns_last_pid, but we never had nested pid namespace hierarchy.
When there is only one level of pid namespaces, then active pid namespace
is the save as pid_ns_for_children, so we never faced with this problem.

Now we're working on Docker support, and its recent versions create nested
pid namespaces (I have no information, when they begun to do that). So,
we met this problem.

> It looks like the fix is a one line low danger change to
> /proc/sys/kernel/ns_last_pid. With a low danger as pid_ns_for_children
> rarely differs from task_active_pid_ns().
>
>
>> It introduces a new pid_ns ioctl(NS_SET_LAST_PID_VEC),
>> which allows to write a vector of last pids on pid_ns hierarchy.
>> The vector is passed as array of pids in struct ns_ioc_pid_vec,
>> written in reverse order. The first number corresponds to
>> the opened namespace ns_last_pid, the second is to its parent, etc.
>> So, if you have the pid namespaces hierarchy like:
>>
>> pid_ns1 (grand father)
>> |
>> v
>> pid_ns2 (father)
>> |
>> v
>> pid_ns3 (child)
>>
>> and the pid_ns3 is open, then the corresponding vector will be
>> {last_ns_pid3, last_ns_pid2, last_ns_pid1}. This vector may be
>> short and it may contain less levels. For example,
>> {last_ns_pid3, last_ns_pid2} or even {last_ns_pid3}, in dependence
>> of which levels you want to populate.
>>
>> v3: Use __u32 in uapi instead of unsigned int.
>>
>> v2: Kill pid_ns->child_reaper check as it's impossible to have
>> such a pid namespace file open.
>> Use generic namespaces ioctl() number.
>> Pass pids as array, not as a string.
>>
>> Signed-off-by: Kirill Tkhai <[email protected]>
>> ---
>> fs/nsfs.c | 5 +++++
>> include/linux/pid_namespace.h | 12 ++++++++++++
>> include/uapi/linux/nsfs.h | 7 +++++++
>> kernel/pid_namespace.c | 35 +++++++++++++++++++++++++++++++++++
>> 4 files changed, 59 insertions(+)
>>
>> diff --git a/fs/nsfs.c b/fs/nsfs.c
>> index 323f492e0822..f669a1552003 100644
>> --- a/fs/nsfs.c
>> +++ b/fs/nsfs.c
>> @@ -6,6 +6,7 @@
>> #include <linux/ktime.h>
>> #include <linux/seq_file.h>
>> #include <linux/user_namespace.h>
>> +#include <linux/pid_namespace.h>
>> #include <linux/nsfs.h>
>> #include <linux/uaccess.h>
>>
>> @@ -186,6 +187,10 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
>> argp = (uid_t __user *) arg;
>> uid = from_kuid_munged(current_user_ns(), user_ns->owner);
>> return put_user(uid, argp);
>> + case NS_SET_LAST_PID_VEC:
>> + if (ns->ops->type != CLONE_NEWPID)
>> + return -EINVAL;
>> + return pidns_set_last_pid_vec(ns, (void *)arg);
>> default:
>> return -ENOTTY;
>> }
>> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
>> index c2a989dee876..c8dc4173a4e8 100644
>> --- a/include/linux/pid_namespace.h
>> +++ b/include/linux/pid_namespace.h
>> @@ -9,6 +9,7 @@
>> #include <linux/nsproxy.h>
>> #include <linux/kref.h>
>> #include <linux/ns_common.h>
>> +#include <uapi/linux/nsfs.h>
>
> No need for the extra include and slowing down the build. Just
> declare the relevant structures.

So, I'll write just:

struct ns_ioc_pid_vec;

instead of that.

>>
>> struct pidmap {
>> atomic_t nr_free;
>> @@ -103,6 +104,17 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
>> }
>> #endif /* CONFIG_PID_NS */
>>
>> +#if defined(CONFIG_PID_NS) && defined(CONFIG_CHECKPOINT_RESTORE)
>> +extern long pidns_set_last_pid_vec(struct ns_common *ns,
>> + struct ns_ioc_pid_vec __user *vec);
>> +#else /* CONFIG_PID_NS && CONFIG_CHECKPOINT_RESTORE */
>> +static inline long pidns_set_last_pid_vec(struct ns_common *ns,
>> + struct ns_ioc_pid_vec __user *vec)
>> +{
>> + return -ENOTTY;
>> +}
>> +#endif /* CONFIG_PID_NS && CONFIG_CHECKPOINT_RESTORE */
>
> Just CONFIG_PID_NS please. Either this is good enough for everyone who
> has pid namespace support enabled or it isn't.

Great, if it's so. For me it looks better too.

>> extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
>> void pidhash_init(void);
>> void pidmap_init(void);
>> diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
>> index 1a3ca79f466b..1254b02a47fa 100644
>> --- a/include/uapi/linux/nsfs.h
>> +++ b/include/uapi/linux/nsfs.h
>> @@ -14,5 +14,12 @@
>> #define NS_GET_NSTYPE _IO(NSIO, 0x3)
>> /* Get owner UID (in the caller's user namespace) for a user namespace */
>> #define NS_GET_OWNER_UID _IO(NSIO, 0x4)
>> +/* Set a vector of ns_last_pid for a pid namespace stack */
>> +#define NS_SET_LAST_PID_VEC _IO(NSIO, 0x5)
>> +
>> +struct ns_ioc_pid_vec {
>> + __u32 nr;
>> + pid_t pid[0];
>> +};
>>
>> #endif /* __LINUX_NSFS_H */
>> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
>> index de461aa0bf9a..08b5fef23534 100644
>> --- a/kernel/pid_namespace.c
>> +++ b/kernel/pid_namespace.c
>> @@ -21,6 +21,7 @@
>> #include <linux/export.h>
>> #include <linux/sched/task.h>
>> #include <linux/sched/signal.h>
>> +#include <uapi/linux/nsfs.h>
>>
>> struct pid_cache {
>> int nr_ids;
>> @@ -428,6 +429,40 @@ static struct ns_common *pidns_get_parent(struct ns_common *ns)
>> return &get_pid_ns(pid_ns)->ns;
>> }
>>
>> +#ifdef CONFIG_CHECKPOINT_RESTORE
>> +long pidns_set_last_pid_vec(struct ns_common *ns,
>> + struct ns_ioc_pid_vec __user *vec)
>> +{
>> + struct pid_namespace *pid_ns = to_pid_ns(ns);
>> + pid_t pid, __user *pid_ptr;
>> + u32 nr;
>> +
>> + if (get_user(nr, &vec->nr))
>> + return -EFAULT;
>> + if (nr > 32 || nr < 1)
>
> The maximum needs not to be hard coded.

Ah, I've missed MAX_PID_NS_LEVEL, thanks.

>> + return -EINVAL;
>> +
>> + pid_ptr = &vec->pid[0];
>
> All of the rest of the vector needs to be read in, in one go.

Hm, Oleg said we shouldn't allocate a memory for that. Should
I create array of MAX_PID_NS_LEVEL pids on stack?

>> + do {
>> + if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
>> + return -EPERM;
>
> Permission to change all of the namespaces should be checked before
> writing to pid_ns->last_pid happens.

Ok

>> +
>> + if (get_user(pid, pid_ptr))
>> + return -EFAULT;
>> + if (pid < 0 || pid > pid_max)
>> + return -EINVAL;
>> +
>> + /* Write directly: see the comment in pid_ns_ctl_handler() */
>> + pid_ns->last_pid = pid;
>> +
>> + pid_ns = pid_ns->parent;
>> + pid_ptr++;
>> + } while (--nr > 0 && pid_ns);
>> +
>> + return nr ? -EINVAL : 0;
>> +}
>> +#endif /* CONFIG_CHECKPOINT_RESTORE */
>> +
>> static struct user_namespace *pidns_owner(struct ns_common *ns)
>> {
>> return to_pid_ns(ns)->user_ns;

2017-04-27 16:13:50

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v3] pid_ns: Introduce ioctl to set vector of ns_last_pid's on ns hierarhy

Kirill Tkhai <[email protected]> writes:

> On 27.04.2017 18:15, Eric W. Biederman wrote:
>> Kirill Tkhai <[email protected]> writes:
>>
>>> On implementing of nested pid namespaces support in CRIU
>>> (checkpoint-restore in userspace tool) we run into
>>> the situation, that it's impossible to create a task with
>>> specific NSpid effectively. After commit 49f4d8b93ccf
>>> "pidns: Capture the user namespace and filter ns_last_pid"
>>> it is impossible to set ns_last_pid on any pid namespace,
>>> except task's active pid_ns (before the commit it was possible
>>> to write to pid_ns_for_children). Thus, if a restored task
>>> in a container has more than one pid_ns levels, the restorer
>>> code must have a task helper for every pid namespace
>>> of the task's pid_ns hierarhy.
>>>
>>> This is a big problem, because of communication with
>>> a helper for every pid_ns in the hierarchy is not cheap.
>>> It's not performance-good as it implies many helpers wakeups
>>> to create a single task (independently, how you communicate
>>> with the helpers). This patch tries to decide the problem.
>>
>> I see the problem and we definitely need to do something.
>> Your patch does appear better than what we have been doing.
>> So a tenative conceptual ack.
>>
>> At the same time it is legitimate to claim that the use of
>> task_active_pid_ns(current) rather than
>> current->nsproxy->pid_ns_for_children is a regression caused by the
>> above commit. So we can fix the original issue as well.
>>
>> I do have to ask when was this problem discovered, and why did it take
>> so long to discover? The regeression happened nearly 5 years ago.
>>
>> Was CRIU already using this?
>
> CRIU uses ns_last_pid, but we never had nested pid namespace hierarchy.
> When there is only one level of pid namespaces, then active pid namespace
> is the save as pid_ns_for_children, so we never faced with this
> problem.

Ok. So not a regression then.

> Now we're working on Docker support, and its recent versions create nested
> pid namespaces (I have no information, when they begun to do that). So,
> we met this problem.
>
>> It looks like the fix is a one line low danger change to
>> /proc/sys/kernel/ns_last_pid. With a low danger as pid_ns_for_children
>> rarely differs from task_active_pid_ns().
>>
>>
>>> It introduces a new pid_ns ioctl(NS_SET_LAST_PID_VEC),
>>> which allows to write a vector of last pids on pid_ns hierarchy.
>>> The vector is passed as array of pids in struct ns_ioc_pid_vec,
>>> written in reverse order. The first number corresponds to
>>> the opened namespace ns_last_pid, the second is to its parent, etc.
>>> So, if you have the pid namespaces hierarchy like:
>>>
>>> pid_ns1 (grand father)
>>> |
>>> v
>>> pid_ns2 (father)
>>> |
>>> v
>>> pid_ns3 (child)
>>>
>>> and the pid_ns3 is open, then the corresponding vector will be
>>> {last_ns_pid3, last_ns_pid2, last_ns_pid1}. This vector may be
>>> short and it may contain less levels. For example,
>>> {last_ns_pid3, last_ns_pid2} or even {last_ns_pid3}, in dependence
>>> of which levels you want to populate.
>>>
>>> v3: Use __u32 in uapi instead of unsigned int.
>>>
>>> v2: Kill pid_ns->child_reaper check as it's impossible to have
>>> such a pid namespace file open.
>>> Use generic namespaces ioctl() number.
>>> Pass pids as array, not as a string.
>>>
>>> Signed-off-by: Kirill Tkhai <[email protected]>
>>> ---
>>> fs/nsfs.c | 5 +++++
>>> include/linux/pid_namespace.h | 12 ++++++++++++
>>> include/uapi/linux/nsfs.h | 7 +++++++
>>> kernel/pid_namespace.c | 35 +++++++++++++++++++++++++++++++++++
>>> 4 files changed, 59 insertions(+)
>>>
>>> diff --git a/fs/nsfs.c b/fs/nsfs.c
>>> index 323f492e0822..f669a1552003 100644
>>> --- a/fs/nsfs.c
>>> +++ b/fs/nsfs.c
>>> @@ -6,6 +6,7 @@
>>> #include <linux/ktime.h>
>>> #include <linux/seq_file.h>
>>> #include <linux/user_namespace.h>
>>> +#include <linux/pid_namespace.h>
>>> #include <linux/nsfs.h>
>>> #include <linux/uaccess.h>
>>>
>>> @@ -186,6 +187,10 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
>>> argp = (uid_t __user *) arg;
>>> uid = from_kuid_munged(current_user_ns(), user_ns->owner);
>>> return put_user(uid, argp);
>>> + case NS_SET_LAST_PID_VEC:
>>> + if (ns->ops->type != CLONE_NEWPID)
>>> + return -EINVAL;
>>> + return pidns_set_last_pid_vec(ns, (void *)arg);
>>> default:
>>> return -ENOTTY;
>>> }
>>> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
>>> index c2a989dee876..c8dc4173a4e8 100644
>>> --- a/include/linux/pid_namespace.h
>>> +++ b/include/linux/pid_namespace.h
>>> @@ -9,6 +9,7 @@
>>> #include <linux/nsproxy.h>
>>> #include <linux/kref.h>
>>> #include <linux/ns_common.h>
>>> +#include <uapi/linux/nsfs.h>
>>
>> No need for the extra include and slowing down the build. Just
>> declare the relevant structures.
>
> So, I'll write just:
>
> struct ns_ioc_pid_vec;
>
> instead of that.
>
>>>
>>> struct pidmap {
>>> atomic_t nr_free;
>>> @@ -103,6 +104,17 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
>>> }
>>> #endif /* CONFIG_PID_NS */
>>>
>>> +#if defined(CONFIG_PID_NS) && defined(CONFIG_CHECKPOINT_RESTORE)
>>> +extern long pidns_set_last_pid_vec(struct ns_common *ns,
>>> + struct ns_ioc_pid_vec __user *vec);
>>> +#else /* CONFIG_PID_NS && CONFIG_CHECKPOINT_RESTORE */
>>> +static inline long pidns_set_last_pid_vec(struct ns_common *ns,
>>> + struct ns_ioc_pid_vec __user *vec)
>>> +{
>>> + return -ENOTTY;
>>> +}
>>> +#endif /* CONFIG_PID_NS && CONFIG_CHECKPOINT_RESTORE */
>>
>> Just CONFIG_PID_NS please. Either this is good enough for everyone who
>> has pid namespace support enabled or it isn't.
>
> Great, if it's so. For me it looks better too.
>
>>> extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
>>> void pidhash_init(void);
>>> void pidmap_init(void);
>>> diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
>>> index 1a3ca79f466b..1254b02a47fa 100644
>>> --- a/include/uapi/linux/nsfs.h
>>> +++ b/include/uapi/linux/nsfs.h
>>> @@ -14,5 +14,12 @@
>>> #define NS_GET_NSTYPE _IO(NSIO, 0x3)
>>> /* Get owner UID (in the caller's user namespace) for a user namespace */
>>> #define NS_GET_OWNER_UID _IO(NSIO, 0x4)
>>> +/* Set a vector of ns_last_pid for a pid namespace stack */
>>> +#define NS_SET_LAST_PID_VEC _IO(NSIO, 0x5)
>>> +
>>> +struct ns_ioc_pid_vec {
>>> + __u32 nr;
>>> + pid_t pid[0];
>>> +};
>>>
>>> #endif /* __LINUX_NSFS_H */
>>> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
>>> index de461aa0bf9a..08b5fef23534 100644
>>> --- a/kernel/pid_namespace.c
>>> +++ b/kernel/pid_namespace.c
>>> @@ -21,6 +21,7 @@
>>> #include <linux/export.h>
>>> #include <linux/sched/task.h>
>>> #include <linux/sched/signal.h>
>>> +#include <uapi/linux/nsfs.h>
>>>
>>> struct pid_cache {
>>> int nr_ids;
>>> @@ -428,6 +429,40 @@ static struct ns_common *pidns_get_parent(struct ns_common *ns)
>>> return &get_pid_ns(pid_ns)->ns;
>>> }
>>>
>>> +#ifdef CONFIG_CHECKPOINT_RESTORE
>>> +long pidns_set_last_pid_vec(struct ns_common *ns,
>>> + struct ns_ioc_pid_vec __user *vec)
>>> +{
>>> + struct pid_namespace *pid_ns = to_pid_ns(ns);
>>> + pid_t pid, __user *pid_ptr;
>>> + u32 nr;
>>> +
>>> + if (get_user(nr, &vec->nr))
>>> + return -EFAULT;
>>> + if (nr > 32 || nr < 1)
>>
>> The maximum needs not to be hard coded.
>
> Ah, I've missed MAX_PID_NS_LEVEL, thanks.
>
>>> + return -EINVAL;
>>> +
>>> + pid_ptr = &vec->pid[0];
>>
>> All of the rest of the vector needs to be read in, in one go.
>
> Hm, Oleg said we shouldn't allocate a memory for that. Should
> I create array of MAX_PID_NS_LEVEL pids on stack?

*scratches head*

The really important part is that we perform all of the permission
checks before we perform the rest of the work.

I would like to be able to say that the permission checks and
the rest of it all happen atomically. Which requires copying the
data into kernel memory and sanitizing it (aka all checks) before
we apply the changes.

I seem to remember Oleg's primary concern was using vmalloc
and not kmalloc. Using the stack is fine but we need a
"BUILD_BUG_ON(sizeof(u32) * MAX_PID_NS_LEVEL < 64);" if we are
going to do that so we strictly bound the amount of local stack
we are going to use.

The keep it simple part of me likes the loop copying each value at a
time. The rest of my cringes because that introduces time of use to
time of check issues. Especially if we ever want range checks on
the value of last_pid (which I think we do) we need to be able to
read all of the values in before applying them.

Eric


>>> + do {
>>> + if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
>>> + return -EPERM;
>>
>> Permission to change all of the namespaces should be checked before
>> writing to pid_ns->last_pid happens.
>
> Ok
>
>>> +
>>> + if (get_user(pid, pid_ptr))
>>> + return -EFAULT;
>>> + if (pid < 0 || pid > pid_max)
>>> + return -EINVAL;
>>> +
>>> + /* Write directly: see the comment in pid_ns_ctl_handler() */
>>> + pid_ns->last_pid = pid;
>>> +
>>> + pid_ns = pid_ns->parent;
>>> + pid_ptr++;
>>> + } while (--nr > 0 && pid_ns);
>>> +
>>> + return nr ? -EINVAL : 0;
>>> +}
>>> +#endif /* CONFIG_CHECKPOINT_RESTORE */
>>> +
>>> static struct user_namespace *pidns_owner(struct ns_common *ns)
>>> {
>>> return to_pid_ns(ns)->user_ns;

2017-04-28 09:13:43

by Kirill Tkhai

[permalink] [raw]
Subject: Re: [PATCH v3] pid_ns: Introduce ioctl to set vector of ns_last_pid's on ns hierarhy

On 27.04.2017 19:07, Eric W. Biederman wrote:
> Kirill Tkhai <[email protected]> writes:
>
>> On 27.04.2017 18:15, Eric W. Biederman wrote:
>>> Kirill Tkhai <[email protected]> writes:
>>>
>>>> On implementing of nested pid namespaces support in CRIU
>>>> (checkpoint-restore in userspace tool) we run into
>>>> the situation, that it's impossible to create a task with
>>>> specific NSpid effectively. After commit 49f4d8b93ccf
>>>> "pidns: Capture the user namespace and filter ns_last_pid"
>>>> it is impossible to set ns_last_pid on any pid namespace,
>>>> except task's active pid_ns (before the commit it was possible
>>>> to write to pid_ns_for_children). Thus, if a restored task
>>>> in a container has more than one pid_ns levels, the restorer
>>>> code must have a task helper for every pid namespace
>>>> of the task's pid_ns hierarhy.
>>>>
>>>> This is a big problem, because of communication with
>>>> a helper for every pid_ns in the hierarchy is not cheap.
>>>> It's not performance-good as it implies many helpers wakeups
>>>> to create a single task (independently, how you communicate
>>>> with the helpers). This patch tries to decide the problem.
>>>
>>> I see the problem and we definitely need to do something.
>>> Your patch does appear better than what we have been doing.
>>> So a tenative conceptual ack.
>>>
>>> At the same time it is legitimate to claim that the use of
>>> task_active_pid_ns(current) rather than
>>> current->nsproxy->pid_ns_for_children is a regression caused by the
>>> above commit. So we can fix the original issue as well.
>>>
>>> I do have to ask when was this problem discovered, and why did it take
>>> so long to discover? The regeression happened nearly 5 years ago.
>>>
>>> Was CRIU already using this?
>>
>> CRIU uses ns_last_pid, but we never had nested pid namespace hierarchy.
>> When there is only one level of pid namespaces, then active pid namespace
>> is the save as pid_ns_for_children, so we never faced with this
>> problem.
>
> Ok. So not a regression then.
>
>> Now we're working on Docker support, and its recent versions create nested
>> pid namespaces (I have no information, when they begun to do that). So,
>> we met this problem.
>>
>>> It looks like the fix is a one line low danger change to
>>> /proc/sys/kernel/ns_last_pid. With a low danger as pid_ns_for_children
>>> rarely differs from task_active_pid_ns().
>>>
>>>
>>>> It introduces a new pid_ns ioctl(NS_SET_LAST_PID_VEC),
>>>> which allows to write a vector of last pids on pid_ns hierarchy.
>>>> The vector is passed as array of pids in struct ns_ioc_pid_vec,
>>>> written in reverse order. The first number corresponds to
>>>> the opened namespace ns_last_pid, the second is to its parent, etc.
>>>> So, if you have the pid namespaces hierarchy like:
>>>>
>>>> pid_ns1 (grand father)
>>>> |
>>>> v
>>>> pid_ns2 (father)
>>>> |
>>>> v
>>>> pid_ns3 (child)
>>>>
>>>> and the pid_ns3 is open, then the corresponding vector will be
>>>> {last_ns_pid3, last_ns_pid2, last_ns_pid1}. This vector may be
>>>> short and it may contain less levels. For example,
>>>> {last_ns_pid3, last_ns_pid2} or even {last_ns_pid3}, in dependence
>>>> of which levels you want to populate.
>>>>
>>>> v3: Use __u32 in uapi instead of unsigned int.
>>>>
>>>> v2: Kill pid_ns->child_reaper check as it's impossible to have
>>>> such a pid namespace file open.
>>>> Use generic namespaces ioctl() number.
>>>> Pass pids as array, not as a string.
>>>>
>>>> Signed-off-by: Kirill Tkhai <[email protected]>
>>>> ---
>>>> fs/nsfs.c | 5 +++++
>>>> include/linux/pid_namespace.h | 12 ++++++++++++
>>>> include/uapi/linux/nsfs.h | 7 +++++++
>>>> kernel/pid_namespace.c | 35 +++++++++++++++++++++++++++++++++++
>>>> 4 files changed, 59 insertions(+)
>>>>
>>>> diff --git a/fs/nsfs.c b/fs/nsfs.c
>>>> index 323f492e0822..f669a1552003 100644
>>>> --- a/fs/nsfs.c
>>>> +++ b/fs/nsfs.c
>>>> @@ -6,6 +6,7 @@
>>>> #include <linux/ktime.h>
>>>> #include <linux/seq_file.h>
>>>> #include <linux/user_namespace.h>
>>>> +#include <linux/pid_namespace.h>
>>>> #include <linux/nsfs.h>
>>>> #include <linux/uaccess.h>
>>>>
>>>> @@ -186,6 +187,10 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
>>>> argp = (uid_t __user *) arg;
>>>> uid = from_kuid_munged(current_user_ns(), user_ns->owner);
>>>> return put_user(uid, argp);
>>>> + case NS_SET_LAST_PID_VEC:
>>>> + if (ns->ops->type != CLONE_NEWPID)
>>>> + return -EINVAL;
>>>> + return pidns_set_last_pid_vec(ns, (void *)arg);
>>>> default:
>>>> return -ENOTTY;
>>>> }
>>>> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
>>>> index c2a989dee876..c8dc4173a4e8 100644
>>>> --- a/include/linux/pid_namespace.h
>>>> +++ b/include/linux/pid_namespace.h
>>>> @@ -9,6 +9,7 @@
>>>> #include <linux/nsproxy.h>
>>>> #include <linux/kref.h>
>>>> #include <linux/ns_common.h>
>>>> +#include <uapi/linux/nsfs.h>
>>>
>>> No need for the extra include and slowing down the build. Just
>>> declare the relevant structures.
>>
>> So, I'll write just:
>>
>> struct ns_ioc_pid_vec;
>>
>> instead of that.
>>
>>>>
>>>> struct pidmap {
>>>> atomic_t nr_free;
>>>> @@ -103,6 +104,17 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
>>>> }
>>>> #endif /* CONFIG_PID_NS */
>>>>
>>>> +#if defined(CONFIG_PID_NS) && defined(CONFIG_CHECKPOINT_RESTORE)
>>>> +extern long pidns_set_last_pid_vec(struct ns_common *ns,
>>>> + struct ns_ioc_pid_vec __user *vec);
>>>> +#else /* CONFIG_PID_NS && CONFIG_CHECKPOINT_RESTORE */
>>>> +static inline long pidns_set_last_pid_vec(struct ns_common *ns,
>>>> + struct ns_ioc_pid_vec __user *vec)
>>>> +{
>>>> + return -ENOTTY;
>>>> +}
>>>> +#endif /* CONFIG_PID_NS && CONFIG_CHECKPOINT_RESTORE */
>>>
>>> Just CONFIG_PID_NS please. Either this is good enough for everyone who
>>> has pid namespace support enabled or it isn't.
>>
>> Great, if it's so. For me it looks better too.
>>
>>>> extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
>>>> void pidhash_init(void);
>>>> void pidmap_init(void);
>>>> diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
>>>> index 1a3ca79f466b..1254b02a47fa 100644
>>>> --- a/include/uapi/linux/nsfs.h
>>>> +++ b/include/uapi/linux/nsfs.h
>>>> @@ -14,5 +14,12 @@
>>>> #define NS_GET_NSTYPE _IO(NSIO, 0x3)
>>>> /* Get owner UID (in the caller's user namespace) for a user namespace */
>>>> #define NS_GET_OWNER_UID _IO(NSIO, 0x4)
>>>> +/* Set a vector of ns_last_pid for a pid namespace stack */
>>>> +#define NS_SET_LAST_PID_VEC _IO(NSIO, 0x5)
>>>> +
>>>> +struct ns_ioc_pid_vec {
>>>> + __u32 nr;
>>>> + pid_t pid[0];
>>>> +};
>>>>
>>>> #endif /* __LINUX_NSFS_H */
>>>> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
>>>> index de461aa0bf9a..08b5fef23534 100644
>>>> --- a/kernel/pid_namespace.c
>>>> +++ b/kernel/pid_namespace.c
>>>> @@ -21,6 +21,7 @@
>>>> #include <linux/export.h>
>>>> #include <linux/sched/task.h>
>>>> #include <linux/sched/signal.h>
>>>> +#include <uapi/linux/nsfs.h>
>>>>
>>>> struct pid_cache {
>>>> int nr_ids;
>>>> @@ -428,6 +429,40 @@ static struct ns_common *pidns_get_parent(struct ns_common *ns)
>>>> return &get_pid_ns(pid_ns)->ns;
>>>> }
>>>>
>>>> +#ifdef CONFIG_CHECKPOINT_RESTORE
>>>> +long pidns_set_last_pid_vec(struct ns_common *ns,
>>>> + struct ns_ioc_pid_vec __user *vec)
>>>> +{
>>>> + struct pid_namespace *pid_ns = to_pid_ns(ns);
>>>> + pid_t pid, __user *pid_ptr;
>>>> + u32 nr;
>>>> +
>>>> + if (get_user(nr, &vec->nr))
>>>> + return -EFAULT;
>>>> + if (nr > 32 || nr < 1)
>>>
>>> The maximum needs not to be hard coded.
>>
>> Ah, I've missed MAX_PID_NS_LEVEL, thanks.
>>
>>>> + return -EINVAL;
>>>> +
>>>> + pid_ptr = &vec->pid[0];
>>>
>>> All of the rest of the vector needs to be read in, in one go.
>>
>> Hm, Oleg said we shouldn't allocate a memory for that. Should
>> I create array of MAX_PID_NS_LEVEL pids on stack?
>
> *scratches head*
>
> The really important part is that we perform all of the permission
> checks before we perform the rest of the work.
>
> I would like to be able to say that the permission checks and
> the rest of it all happen atomically. Which requires copying the
> data into kernel memory and sanitizing it (aka all checks) before
> we apply the changes.

This way, we better check the permissions on the top pid namespace
of the passed vector, because every children's pid_ns->user_ns is
the same as its parent's, or it's descendant.

> I seem to remember Oleg's primary concern was using vmalloc
> and not kmalloc. Using the stack is fine but we need a

It's a bit not what he said, but I do not insist.

> "BUILD_BUG_ON(sizeof(u32) * MAX_PID_NS_LEVEL < 64);" if we are

What does this check mean? Why do we have to limit minimal MAX_PID_NS_LEVEL?

> going to do that so we strictly bound the amount of local stack
> we are going to use.
>
> The keep it simple part of me likes the loop copying each value at a
> time. The rest of my cringes because that introduces time of use to
> time of check issues. Especially if we ever want range checks on
> the value of last_pid (which I think we do) we need to be able to
> read all of the values in before applying them.
>
> Eric
>
>
>>>> + do {
>>>> + if (!ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
>>>> + return -EPERM;
>>>
>>> Permission to change all of the namespaces should be checked before
>>> writing to pid_ns->last_pid happens.
>>
>> Ok
>>
>>>> +
>>>> + if (get_user(pid, pid_ptr))
>>>> + return -EFAULT;
>>>> + if (pid < 0 || pid > pid_max)
>>>> + return -EINVAL;
>>>> +
>>>> + /* Write directly: see the comment in pid_ns_ctl_handler() */
>>>> + pid_ns->last_pid = pid;
>>>> +
>>>> + pid_ns = pid_ns->parent;
>>>> + pid_ptr++;
>>>> + } while (--nr > 0 && pid_ns);
>>>> +
>>>> + return nr ? -EINVAL : 0;
>>>> +}
>>>> +#endif /* CONFIG_CHECKPOINT_RESTORE */
>>>> +
>>>> static struct user_namespace *pidns_owner(struct ns_common *ns)
>>>> {
>>>> return to_pid_ns(ns)->user_ns;

2017-04-29 19:18:41

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v3] pid_ns: Introduce ioctl to set vector of ns_last_pid's on ns hierarhy

Kirill Tkhai <[email protected]> writes:

> On 27.04.2017 19:07, Eric W. Biederman wrote:
>> Kirill Tkhai <[email protected]> writes:
>>
>>> On 27.04.2017 18:15, Eric W. Biederman wrote:
>>>> Kirill Tkhai <[email protected]> writes:
>>>>
>>>>> On implementing of nested pid namespaces support in CRIU
>>>>> (checkpoint-restore in userspace tool) we run into
>>>>> the situation, that it's impossible to create a task with
>>>>> specific NSpid effectively. After commit 49f4d8b93ccf
>>>>> "pidns: Capture the user namespace and filter ns_last_pid"
>>>>> it is impossible to set ns_last_pid on any pid namespace,
>>>>> except task's active pid_ns (before the commit it was possible
>>>>> to write to pid_ns_for_children). Thus, if a restored task
>>>>> in a container has more than one pid_ns levels, the restorer
>>>>> code must have a task helper for every pid namespace
>>>>> of the task's pid_ns hierarhy.
>>>>>
>>>>> This is a big problem, because of communication with
>>>>> a helper for every pid_ns in the hierarchy is not cheap.
>>>>> It's not performance-good as it implies many helpers wakeups
>>>>> to create a single task (independently, how you communicate
>>>>> with the helpers). This patch tries to decide the problem.
>>>>
>>>> I see the problem and we definitely need to do something.
>>>> Your patch does appear better than what we have been doing.
>>>> So a tenative conceptual ack.
>>>>
>>>> At the same time it is legitimate to claim that the use of
>>>> task_active_pid_ns(current) rather than
>>>> current->nsproxy->pid_ns_for_children is a regression caused by the
>>>> above commit. So we can fix the original issue as well.
>>>>
>>>> I do have to ask when was this problem discovered, and why did it take
>>>> so long to discover? The regeression happened nearly 5 years ago.
>>>>
>>>> Was CRIU already using this?
>>>
>>> CRIU uses ns_last_pid, but we never had nested pid namespace hierarchy.
>>> When there is only one level of pid namespaces, then active pid namespace
>>> is the save as pid_ns_for_children, so we never faced with this
>>> problem.
>>
>> Ok. So not a regression then.
>>
>>> Now we're working on Docker support, and its recent versions create nested
>>> pid namespaces (I have no information, when they begun to do that). So,
>>> we met this problem.
>>>
>>>> It looks like the fix is a one line low danger change to
>>>> /proc/sys/kernel/ns_last_pid. With a low danger as pid_ns_for_children
>>>> rarely differs from task_active_pid_ns().
>>>>
>>>>
>>>>> It introduces a new pid_ns ioctl(NS_SET_LAST_PID_VEC),
>>>>> which allows to write a vector of last pids on pid_ns hierarchy.
>>>>> The vector is passed as array of pids in struct ns_ioc_pid_vec,
>>>>> written in reverse order. The first number corresponds to
>>>>> the opened namespace ns_last_pid, the second is to its parent, etc.
>>>>> So, if you have the pid namespaces hierarchy like:
>>>>>
>>>>> pid_ns1 (grand father)
>>>>> |
>>>>> v
>>>>> pid_ns2 (father)
>>>>> |
>>>>> v
>>>>> pid_ns3 (child)
>>>>>
>>>>> and the pid_ns3 is open, then the corresponding vector will be
>>>>> {last_ns_pid3, last_ns_pid2, last_ns_pid1}. This vector may be
>>>>> short and it may contain less levels. For example,
>>>>> {last_ns_pid3, last_ns_pid2} or even {last_ns_pid3}, in dependence
>>>>> of which levels you want to populate.
>>>>>
>>>>> v3: Use __u32 in uapi instead of unsigned int.
>>>>>
>>>>> v2: Kill pid_ns->child_reaper check as it's impossible to have
>>>>> such a pid namespace file open.
>>>>> Use generic namespaces ioctl() number.
>>>>> Pass pids as array, not as a string.
>>>>>
>>>>> Signed-off-by: Kirill Tkhai <[email protected]>
>>>>> ---
>>>>> fs/nsfs.c | 5 +++++
>>>>> include/linux/pid_namespace.h | 12 ++++++++++++
>>>>> include/uapi/linux/nsfs.h | 7 +++++++
>>>>> kernel/pid_namespace.c | 35 +++++++++++++++++++++++++++++++++++
>>>>> 4 files changed, 59 insertions(+)
>>>>>
>>>>> diff --git a/fs/nsfs.c b/fs/nsfs.c
>>>>> index 323f492e0822..f669a1552003 100644
>>>>> --- a/fs/nsfs.c
>>>>> +++ b/fs/nsfs.c
>>>>> @@ -6,6 +6,7 @@
>>>>> #include <linux/ktime.h>
>>>>> #include <linux/seq_file.h>
>>>>> #include <linux/user_namespace.h>
>>>>> +#include <linux/pid_namespace.h>
>>>>> #include <linux/nsfs.h>
>>>>> #include <linux/uaccess.h>
>>>>>
>>>>> @@ -186,6 +187,10 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
>>>>> argp = (uid_t __user *) arg;
>>>>> uid = from_kuid_munged(current_user_ns(), user_ns->owner);
>>>>> return put_user(uid, argp);
>>>>> + case NS_SET_LAST_PID_VEC:
>>>>> + if (ns->ops->type != CLONE_NEWPID)
>>>>> + return -EINVAL;
>>>>> + return pidns_set_last_pid_vec(ns, (void *)arg);
>>>>> default:
>>>>> return -ENOTTY;
>>>>> }
>>>>> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
>>>>> index c2a989dee876..c8dc4173a4e8 100644
>>>>> --- a/include/linux/pid_namespace.h
>>>>> +++ b/include/linux/pid_namespace.h
>>>>> @@ -9,6 +9,7 @@
>>>>> #include <linux/nsproxy.h>
>>>>> #include <linux/kref.h>
>>>>> #include <linux/ns_common.h>
>>>>> +#include <uapi/linux/nsfs.h>
>>>>
>>>> No need for the extra include and slowing down the build. Just
>>>> declare the relevant structures.
>>>
>>> So, I'll write just:
>>>
>>> struct ns_ioc_pid_vec;
>>>
>>> instead of that.
>>>
>>>>>
>>>>> struct pidmap {
>>>>> atomic_t nr_free;
>>>>> @@ -103,6 +104,17 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
>>>>> }
>>>>> #endif /* CONFIG_PID_NS */
>>>>>
>>>>> +#if defined(CONFIG_PID_NS) && defined(CONFIG_CHECKPOINT_RESTORE)
>>>>> +extern long pidns_set_last_pid_vec(struct ns_common *ns,
>>>>> + struct ns_ioc_pid_vec __user *vec);
>>>>> +#else /* CONFIG_PID_NS && CONFIG_CHECKPOINT_RESTORE */
>>>>> +static inline long pidns_set_last_pid_vec(struct ns_common *ns,
>>>>> + struct ns_ioc_pid_vec __user *vec)
>>>>> +{
>>>>> + return -ENOTTY;
>>>>> +}
>>>>> +#endif /* CONFIG_PID_NS && CONFIG_CHECKPOINT_RESTORE */
>>>>
>>>> Just CONFIG_PID_NS please. Either this is good enough for everyone who
>>>> has pid namespace support enabled or it isn't.
>>>
>>> Great, if it's so. For me it looks better too.
>>>
>>>>> extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
>>>>> void pidhash_init(void);
>>>>> void pidmap_init(void);
>>>>> diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
>>>>> index 1a3ca79f466b..1254b02a47fa 100644
>>>>> --- a/include/uapi/linux/nsfs.h
>>>>> +++ b/include/uapi/linux/nsfs.h
>>>>> @@ -14,5 +14,12 @@
>>>>> #define NS_GET_NSTYPE _IO(NSIO, 0x3)
>>>>> /* Get owner UID (in the caller's user namespace) for a user namespace */
>>>>> #define NS_GET_OWNER_UID _IO(NSIO, 0x4)
>>>>> +/* Set a vector of ns_last_pid for a pid namespace stack */
>>>>> +#define NS_SET_LAST_PID_VEC _IO(NSIO, 0x5)
>>>>> +
>>>>> +struct ns_ioc_pid_vec {
>>>>> + __u32 nr;
>>>>> + pid_t pid[0];
>>>>> +};
>>>>>
>>>>> #endif /* __LINUX_NSFS_H */
>>>>> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
>>>>> index de461aa0bf9a..08b5fef23534 100644
>>>>> --- a/kernel/pid_namespace.c
>>>>> +++ b/kernel/pid_namespace.c
>>>>> @@ -21,6 +21,7 @@
>>>>> #include <linux/export.h>
>>>>> #include <linux/sched/task.h>
>>>>> #include <linux/sched/signal.h>
>>>>> +#include <uapi/linux/nsfs.h>
>>>>>
>>>>> struct pid_cache {
>>>>> int nr_ids;
>>>>> @@ -428,6 +429,40 @@ static struct ns_common *pidns_get_parent(struct ns_common *ns)
>>>>> return &get_pid_ns(pid_ns)->ns;
>>>>> }
>>>>>
>>>>> +#ifdef CONFIG_CHECKPOINT_RESTORE
>>>>> +long pidns_set_last_pid_vec(struct ns_common *ns,
>>>>> + struct ns_ioc_pid_vec __user *vec)
>>>>> +{
>>>>> + struct pid_namespace *pid_ns = to_pid_ns(ns);
>>>>> + pid_t pid, __user *pid_ptr;
>>>>> + u32 nr;
>>>>> +
>>>>> + if (get_user(nr, &vec->nr))
>>>>> + return -EFAULT;
>>>>> + if (nr > 32 || nr < 1)
>>>>
>>>> The maximum needs not to be hard coded.
>>>
>>> Ah, I've missed MAX_PID_NS_LEVEL, thanks.
>>>
>>>>> + return -EINVAL;
>>>>> +
>>>>> + pid_ptr = &vec->pid[0];
>>>>
>>>> All of the rest of the vector needs to be read in, in one go.
>>>
>>> Hm, Oleg said we shouldn't allocate a memory for that. Should
>>> I create array of MAX_PID_NS_LEVEL pids on stack?
>>
>> *scratches head*
>>
>> The really important part is that we perform all of the permission
>> checks before we perform the rest of the work.
>>
>> I would like to be able to say that the permission checks and
>> the rest of it all happen atomically. Which requires copying the
>> data into kernel memory and sanitizing it (aka all checks) before
>> we apply the changes.
>
> This way, we better check the permissions on the top pid namespace
> of the passed vector, because every children's pid_ns->user_ns is
> the same as its parent's, or it's descendant.

In practice this makes sense and is a useful simplification.

Looking at your suggesting I am noticing we don't actually enforce this
constraint, and that with careful usage of setns I can get around that.

This seems like a hazard for kernel developers and not at all useful
for userspace developers. So it looks like we need a patch to enforce
this constraint. Patch to fix this issue in a moment.


>> "BUILD_BUG_ON(sizeof(u32) * MAX_PID_NS_LEVEL < 64);" if we are
>
> What does this check mean? Why do we have to limit minimal MAX_PID_NS_LEVEL?

That should have been paranenthesized as:
BUILD_BUG_ON((sizeof(u32) * MAX_PID_NS_LEVEL) < 128);
or possibly writen as:
BUILD_BUG_ON(sizeof(on_stack_array) < 128);

The point being that if someone changes MAX_PID_NS_LEVEL and the stack
usage goes up noticably we have a warning, and then someone can
determine if the array is still small enough to fit on the stack
or if it needs to be kmalloced.

The goal is not to leave a trap for maintainers in the future.

Eric

2017-04-29 19:31:54

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH] userns,pidns: Verify the userns for new pid namespaces


It is pointless and confusing to allow a pid namespace hierarchy and
the user namespace hierarchy to get out of sync. The owner of a child
pid namespace should be the owner of the parent pid namespace or
a descendant of the owner of the parent pid namespace.

Otherwise it is possible to construct scenarios where it is legal to
do something in a parent pid namespace but in a child pid namespace.

It requires use of setns into a pid namespace (but not into a user
namespace) to create such a scenario.

Add the function in_userns to help in making this determination.

Signed-off-by: "Eric W. Biederman" <[email protected]>
---

While review a patch from Kiril Tkhai I realized we were missing this
sanity check....

include/linux/user_namespace.h | 8 +++++++-
kernel/pid_namespace.c | 4 ++++
kernel/user_namespace.c | 18 ++++++++++++------
3 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 32354b4b4b2b..497ed50004db 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -112,8 +112,9 @@ extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t,
extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
extern int proc_setgroups_show(struct seq_file *m, void *v);
extern bool userns_may_setgroups(const struct user_namespace *ns);
+extern bool in_userns(const struct user_namespace *ancestor,
+ const struct user_namespace *child);
extern bool current_in_userns(const struct user_namespace *target_ns);
-
struct ns_common *ns_get_owner(struct ns_common *ns);
#else

@@ -144,6 +145,11 @@ static inline bool userns_may_setgroups(const struct user_namespace *ns)
return true;
}

+static inline bool in_userns(const struct user_namespace *target_ns)
+{
+ return true;
+}
+
static inline bool current_in_userns(const struct user_namespace *target_ns)
{
return true;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index de461aa0bf9a..749147f5a613 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -101,6 +101,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
int i;
int err;

+ err = -EINVAL;
+ if (!in_userns(parent_pid_ns->user_ns, user_ns))
+ goto out;
+
err = -ENOSPC;
if (level > MAX_PID_NS_LEVEL)
goto out;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2f735cbe05e8..7d8658fbabc8 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -986,19 +986,25 @@ bool userns_may_setgroups(const struct user_namespace *ns)
}

/*
- * Returns true if @ns is the same namespace as or a descendant of
- * @target_ns.
+ * Returns true if @child is the same namespace or a descendant of
+ * @ancestor.
*/
-bool current_in_userns(const struct user_namespace *target_ns)
+bool in_userns(const struct user_namespace *ancestor,
+ const struct user_namespace *child)
{
- struct user_namespace *ns;
- for (ns = current_user_ns(); ns; ns = ns->parent) {
- if (ns == target_ns)
+ const struct user_namespace *ns;
+ for (ns = child; ns; ns = ns->parent) {
+ if (ns == ancestor)
return true;
}
return false;
}

+bool current_in_userns(const struct user_namespace *target_ns)
+{
+ return in_userns(target_ns, current_user_ns());
+}
+
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
return container_of(ns, struct user_namespace, ns);
--
2.10.1

2017-04-29 20:53:35

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH] userns,pidns: Verify the userns for new pid namespaces

Quoting Eric W. Biederman ([email protected]):
>
> It is pointless and confusing to allow a pid namespace hierarchy and
> the user namespace hierarchy to get out of sync. The owner of a child
> pid namespace should be the owner of the parent pid namespace or
> a descendant of the owner of the parent pid namespace.
>
> Otherwise it is possible to construct scenarios where it is legal to
> do something in a parent pid namespace but in a child pid namespace.

Hi,

did you mean 'but not in a child...' above?

> It requires use of setns into a pid namespace (but not into a user
> namespace) to create such a scenario.
>
> Add the function in_userns to help in making this determination.
>
> Signed-off-by: "Eric W. Biederman" <[email protected]>
> ---
>
> While review a patch from Kiril Tkhai I realized we were missing this
> sanity check....
>
> include/linux/user_namespace.h | 8 +++++++-
> kernel/pid_namespace.c | 4 ++++
> kernel/user_namespace.c | 18 ++++++++++++------
> 3 files changed, 23 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 32354b4b4b2b..497ed50004db 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -112,8 +112,9 @@ extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t,
> extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
> extern int proc_setgroups_show(struct seq_file *m, void *v);
> extern bool userns_may_setgroups(const struct user_namespace *ns);
> +extern bool in_userns(const struct user_namespace *ancestor,
> + const struct user_namespace *child);
> extern bool current_in_userns(const struct user_namespace *target_ns);
> -
> struct ns_common *ns_get_owner(struct ns_common *ns);
> #else
>
> @@ -144,6 +145,11 @@ static inline bool userns_may_setgroups(const struct user_namespace *ns)
> return true;
> }
>
> +static inline bool in_userns(const struct user_namespace *target_ns)
> +{
> + return true;
> +}
> +
> static inline bool current_in_userns(const struct user_namespace *target_ns)
> {
> return true;
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index de461aa0bf9a..749147f5a613 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -101,6 +101,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
> int i;
> int err;
>
> + err = -EINVAL;
> + if (!in_userns(parent_pid_ns->user_ns, user_ns))
> + goto out;
> +
> err = -ENOSPC;
> if (level > MAX_PID_NS_LEVEL)
> goto out;
> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> index 2f735cbe05e8..7d8658fbabc8 100644
> --- a/kernel/user_namespace.c
> +++ b/kernel/user_namespace.c
> @@ -986,19 +986,25 @@ bool userns_may_setgroups(const struct user_namespace *ns)
> }
>
> /*
> - * Returns true if @ns is the same namespace as or a descendant of
> - * @target_ns.
> + * Returns true if @child is the same namespace or a descendant of
> + * @ancestor.
> */
> -bool current_in_userns(const struct user_namespace *target_ns)
> +bool in_userns(const struct user_namespace *ancestor,
> + const struct user_namespace *child)
> {
> - struct user_namespace *ns;
> - for (ns = current_user_ns(); ns; ns = ns->parent) {
> - if (ns == target_ns)
> + const struct user_namespace *ns;
> + for (ns = child; ns; ns = ns->parent) {
> + if (ns == ancestor)
> return true;
> }
> return false;
> }
>
> +bool current_in_userns(const struct user_namespace *target_ns)
> +{
> + return in_userns(target_ns, current_user_ns());
> +}
> +
> static inline struct user_namespace *to_user_ns(struct ns_common *ns)
> {
> return container_of(ns, struct user_namespace, ns);
> --
> 2.10.1

2017-04-30 04:39:57

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] userns,pidns: Verify the userns for new pid namespaces

"Serge E. Hallyn" <[email protected]> writes:

> Quoting Eric W. Biederman ([email protected]):
>>
>> It is pointless and confusing to allow a pid namespace hierarchy and
>> the user namespace hierarchy to get out of sync. The owner of a child
>> pid namespace should be the owner of the parent pid namespace or
>> a descendant of the owner of the parent pid namespace.
>>
>> Otherwise it is possible to construct scenarios where it is legal to
>> do something in a parent pid namespace but in a child pid namespace.
>
> Hi,
>
> did you mean 'but not in a child...' above?

Actually I believe I meant:

>> Otherwise it is possible to construct scenarios where it is not legal
>> to do something in a parent pid namespace but it is legal a child pid
>> namespace.

I definitely need to fix that wording thank you.

Eric

2017-04-30 04:48:48

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] userns,pidns: Verify the userns for new pid namespaces

[email protected] (Eric W. Biederman) writes:

> "Serge E. Hallyn" <[email protected]> writes:
>
>> Quoting Eric W. Biederman ([email protected]):
>>>
>>> It is pointless and confusing to allow a pid namespace hierarchy and
>>> the user namespace hierarchy to get out of sync. The owner of a child
>>> pid namespace should be the owner of the parent pid namespace or
>>> a descendant of the owner of the parent pid namespace.
>>>
>>> Otherwise it is possible to construct scenarios where it is legal to
>>> do something in a parent pid namespace but in a child pid namespace.
>>
>> Hi,
>>
>> did you mean 'but not in a child...' above?
>
> Actually I believe I meant:
>
>>> Otherwise it is possible to construct scenarios where it is not legal
>>> to do something in a parent pid namespace but it is legal a child pid
>>> namespace.
>
> I definitely need to fix that wording thank you.

Looking at some more I mean:

Otherwise it is possible to construct scenarios where a process has a
capability in a over a parent pid namespace but does not have the
capability over a child pid namespace. Which confusingly makes
permission checks non-transitive.


Eric