The main motivation to add CLONE_SET_TID to clone3() is CRIU.
To restore a process with the same PID/TID CRIU currently uses
/proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
ns_last_pid and then (quickly) does a clone(). This works most of the
time, but it is racy. It is also slow as it requires multiple syscalls.
Extending clone3() to support CLONE_SET_TID makes it possible to restore
a process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
in a race-free way (as long as the desired PID/TID is available).
This clone3() extension places the same restrictions (CAP_SYS_ADMIN)
on clone3() with set_tid as are currently in place for ns_last_pid.
v2:
- Removed (size < sizeof(struct clone_args)) as discussed with
Christian and Dmitry
- Added comment to ((set_tid != 1) && idr_get_cursor() <= 1) (Oleg)
- Use idr_alloc() instead of idr_alloc_cyclic() (Oleg)
Signed-off-by: Adrian Reber <[email protected]>
---
include/linux/pid.h | 2 +-
include/linux/sched/task.h | 1 +
include/uapi/linux/sched.h | 2 ++
kernel/fork.c | 25 ++++++++++++++++---------
kernel/pid.c | 30 +++++++++++++++++++++++-------
5 files changed, 43 insertions(+), 17 deletions(-)
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 2a83e434db9d..052000db0ced 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -116,7 +116,7 @@ extern struct pid *find_vpid(int nr);
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
-extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t set_tid);
extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns);
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 0497091e40c1..4f2a80564332 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -26,6 +26,7 @@ struct kernel_clone_args {
unsigned long stack;
unsigned long stack_size;
unsigned long tls;
+ pid_t set_tid;
};
/*
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index b3105ac1381a..8c4e803e8147 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -32,6 +32,7 @@
#define CLONE_NEWPID 0x20000000 /* New pid namespace */
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */
+#define CLONE_SET_TID 0x100000000ULL /* set if the desired TID is set in set_tid */
/*
* Arguments for the clone3 syscall
@@ -45,6 +46,7 @@ struct clone_args {
__aligned_u64 stack;
__aligned_u64 stack_size;
__aligned_u64 tls;
+ __aligned_u64 set_tid;
};
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 2852d0e76ea3..405f98ce4c83 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2031,7 +2031,7 @@ static __latent_entropy struct task_struct *copy_process(
stackleak_task_init(p);
if (pid != &init_struct_pid) {
- pid = alloc_pid(p->nsproxy->pid_ns_for_children);
+ pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
@@ -2530,14 +2530,12 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t size)
{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct clone_args args;
if (unlikely(size > PAGE_SIZE))
return -E2BIG;
- if (unlikely(size < sizeof(struct clone_args)))
- return -EINVAL;
-
if (unlikely(!access_ok(uargs, size)))
return -EFAULT;
@@ -2562,6 +2560,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
if (copy_from_user(&args, uargs, size))
return -EFAULT;
+ if ((args.flags & CLONE_SET_TID) && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
@@ -2571,6 +2572,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
.stack = args.stack,
.stack_size = args.stack_size,
.tls = args.tls,
+ .set_tid = args.set_tid,
};
return 0;
@@ -2578,11 +2580,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
static bool clone3_args_valid(const struct kernel_clone_args *kargs)
{
- /*
- * All lower bits of the flag word are taken.
- * Verify that no other unknown flags are passed along.
- */
- if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+ /* Verify that no other unknown flags are passed along. */
+ if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_SET_TID))
+ return false;
+
+ /* Fail if set_tid is set without CLONE_SET_TID */
+ if (kargs->set_tid && !(kargs->flags & CLONE_SET_TID))
+ return false;
+
+ /* Also fail if set_tid is invalid */
+ if ((kargs->set_tid <= 0) && (kargs->flags & CLONE_SET_TID))
return false;
/*
diff --git a/kernel/pid.c b/kernel/pid.c
index 0a9f2e437217..977f3ac39d7f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -157,7 +157,7 @@ void free_pid(struct pid *pid)
call_rcu(&pid->rcu, delayed_put_pid);
}
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, int set_tid)
{
struct pid *pid;
enum pid_type type;
@@ -186,12 +186,28 @@ struct pid *alloc_pid(struct pid_namespace *ns)
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
pid_min = RESERVED_PIDS;
- /*
- * Store a null pointer so find_pid_ns does not find
- * a partially initialized PID (see below).
- */
- nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
- pid_max, GFP_ATOMIC);
+ if (set_tid) {
+ /*
+ * Also fail if a PID != 1 is requested
+ * and no PID 1 exists.
+ */
+ if ((set_tid >= pid_max) || ((set_tid != 1) &&
+ (idr_get_cursor(&tmp->idr) <= 1))) {
+ spin_unlock_irq(&pidmap_lock);
+ retval = -EINVAL;
+ goto out_free;
+ }
+ nr = idr_alloc(&tmp->idr, NULL, set_tid,
+ set_tid + 1, GFP_ATOMIC);
+ set_tid = 0;
+ } else {
+ /*
+ * Store a null pointer so find_pid_ns does not find
+ * a partially initialized PID (see below).
+ */
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+ pid_max, GFP_ATOMIC);
+ }
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
--
2.21.0
This tests clone3() with set_tid to see if all desired PIDs are working
as expected. The test tries to clone3() with a set_tid of -1, 1,
pid_max, a PID which is already in use, and an unused PID. The same
tests are also run in a PID namespace.
Signed-off-by: Adrian Reber <[email protected]>
---
tools/testing/selftests/clone3/.gitignore | 1 +
tools/testing/selftests/clone3/Makefile | 11 ++
.../testing/selftests/clone3/clone3_set_tid.c | 148 ++++++++++++++++++
3 files changed, 160 insertions(+)
create mode 100644 tools/testing/selftests/clone3/.gitignore
create mode 100644 tools/testing/selftests/clone3/Makefile
create mode 100644 tools/testing/selftests/clone3/clone3_set_tid.c
diff --git a/tools/testing/selftests/clone3/.gitignore b/tools/testing/selftests/clone3/.gitignore
new file mode 100644
index 000000000000..09ccea33016c
--- /dev/null
+++ b/tools/testing/selftests/clone3/.gitignore
@@ -0,0 +1 @@
+clone3_set_tid
diff --git a/tools/testing/selftests/clone3/Makefile b/tools/testing/selftests/clone3/Makefile
new file mode 100644
index 000000000000..45c77b50f367
--- /dev/null
+++ b/tools/testing/selftests/clone3/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
+
+CFLAGS += -I../../../../usr/include/
+
+ifeq ($(ARCH),x86_64)
+ TEST_GEN_PROGS := clone3_set_tid
+endif
+
+include ../lib.mk
diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c
new file mode 100644
index 000000000000..1ed0845aa4c5
--- /dev/null
+++ b/tools/testing/selftests/clone3/clone3_set_tid.c
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* Based on Christian Brauner's clone3() example */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sched.h>
+
+#include "../kselftest.h"
+
+static pid_t raw_clone(struct clone_args *args)
+{
+ return syscall(__NR_clone3, args, sizeof(struct clone_args));
+}
+
+static int call_clone3_set_tid(int set_tid, int flags)
+{
+ struct clone_args args = {0};
+ pid_t ppid = -1;
+ pid_t pid = -1;
+ int status;
+
+ args.flags = flags | CLONE_SET_TID;
+ args.exit_signal = SIGCHLD;
+ args.set_tid = set_tid;
+
+ pid = raw_clone(&args);
+ if (pid < 0) {
+ ksft_print_msg("%s - Failed to create new process\n",
+ strerror(errno));
+ return -errno;
+ }
+
+ if (pid == 0) {
+ ksft_print_msg("I am the child, my PID is %d\n", getpid());
+ if (set_tid != getpid())
+ _exit(EXIT_FAILURE);
+ _exit(EXIT_SUCCESS);
+ }
+
+ ppid = getpid();
+ ksft_print_msg("I am the parent (%d). My child's pid is %d\n",
+ ppid, pid);
+
+ (void)wait(&status);
+ if (WEXITSTATUS(status))
+ return WEXITSTATUS(status);
+
+ return 0;
+}
+
+static int test_clone3_set_tid(int set_tid, int flags, int expected)
+{
+ int ret;
+ ksft_print_msg("[%d] Trying clone3() with CLONE_SET_TID to %d "
+ "and 0x%x\n", getpid(), set_tid, flags);
+ ret = call_clone3_set_tid(set_tid, flags);
+ ksft_print_msg("[%d] clone3() with CLONE_SET_TID %d says :%d "
+ "- expected %d\n", getpid(), set_tid, ret, expected);
+ if (ret != expected)
+ ksft_exit_fail_msg("[%d] Result (%d) is different than "
+ "expected (%d)\n", getpid(), ret, expected);
+ ksft_test_result_pass("[%d] Result (%d) matches expectation (%d)\n",
+ getpid(), ret, expected);
+ return 0;
+}
+int main(int argc, char *argv[])
+{
+ FILE *f;
+ int pid_max = 0;
+ pid_t pid;
+ pid_t ns_pid;
+ int ret = -1;
+
+ ksft_print_header();
+ ksft_set_plan(10);
+
+ f = fopen("/proc/sys/kernel/pid_max", "r");
+ if (f == NULL)
+ ksft_exit_fail_msg("%s - Could not open /proc/sys/kernel/pid_max\n",
+ strerror(errno));
+ fscanf(f, "%d", &pid_max);
+ fclose(f);
+ ksft_print_msg("/proc/sys/kernel/pid_max %d\n", pid_max);
+
+ /* First try with an invalid PID */
+ if (test_clone3_set_tid(-1, 0, -EINVAL))
+ goto on_error;
+ if (test_clone3_set_tid(-1, CLONE_NEWPID, -EINVAL))
+ goto on_error;
+ /* Then with PID 1 */
+ if (test_clone3_set_tid(1, 0, -EAGAIN))
+ goto on_error;
+ /* PID 1 should not fail in a PID namespace */
+ if (test_clone3_set_tid(1, CLONE_NEWPID, 0))
+ goto on_error;
+ /* pid_max should fail everywhere */
+ if (test_clone3_set_tid(pid_max, 0, -EINVAL))
+ goto on_error;
+ if (test_clone3_set_tid(pid_max, CLONE_NEWPID, -EINVAL))
+ goto on_error;
+ /* Find the current active PID */
+ pid = fork();
+ if (pid == 0) {
+ ksft_print_msg("Child has PID %d\n", getpid());
+ sleep(1);
+ _exit(EXIT_SUCCESS);
+ }
+ /* Try to create a process with that PID should fail */
+ if (test_clone3_set_tid(pid, 0, -EAGAIN))
+ goto on_error;
+ (void)wait(NULL);
+ /* After the child has finished, try again with the same PID */
+ if (test_clone3_set_tid(pid, 0, 0))
+ goto on_error;
+ /* This should fail as there is no PID 1 in that namespace */
+ if (test_clone3_set_tid(pid, CLONE_NEWPID, -EINVAL))
+ goto on_error;
+ unshare(CLONE_NEWPID);
+ /* Let's create a PID 1 */
+ ns_pid = fork();
+ if (ns_pid == 0) {
+ ksft_print_msg("Child in PID namespace has PID %d\n", getpid());
+ sleep(1);
+ _exit(EXIT_SUCCESS);
+ }
+ /*
+ * Now, after the unshare() it should be possible to create a process
+ * with another ID than 1 in the PID namespace.
+ */
+ if (test_clone3_set_tid(2, 0, 0))
+ goto on_error;
+ (void)wait(NULL);
+
+ ret = 0;
+on_error:
+
+ return !ret ? ksft_exit_pass() : ksft_exit_fail();
+}
--
2.21.0
Hi Adrian,
On 7/31/19 5:12 PM, Adrian Reber wrote:
[..]
> @@ -2530,14 +2530,12 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
> struct clone_args __user *uargs,
> size_t size)
> {
> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
> struct clone_args args;
>
> if (unlikely(size > PAGE_SIZE))
> return -E2BIG;
>
> - if (unlikely(size < sizeof(struct clone_args)))
> - return -EINVAL;
> -
It might be better to validate it still somehow, but I don't insist.
[..]
> @@ -2578,11 +2580,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
>
> static bool clone3_args_valid(const struct kernel_clone_args *kargs)
> {
> - /*
> - * All lower bits of the flag word are taken.
> - * Verify that no other unknown flags are passed along.
> - */
> - if (kargs->flags & ~CLONE_LEGACY_FLAGS)
> + /* Verify that no other unknown flags are passed along. */
> + if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_SET_TID))
> + return false;
> +
> + /* Fail if set_tid is set without CLONE_SET_TID */
> + if (kargs->set_tid && !(kargs->flags & CLONE_SET_TID))
> + return false;
> +
> + /* Also fail if set_tid is invalid */
> + if ((kargs->set_tid <= 0) && (kargs->flags & CLONE_SET_TID))
> return false;
Sorry for not mentioning it on v1, but I've noticed it only now:
you check kargs->set_tid even with the legacy-sized kernel_clone_args,
which is probably some random value on a task's stack?
[..]
Thanks much,
Dmitry
On 7/31/19 5:49 PM, Dmitry Safonov wrote:
> Hi Adrian,
>
> On 7/31/19 5:12 PM, Adrian Reber wrote:
> [..]
>> @@ -2530,14 +2530,12 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
>> struct clone_args __user *uargs,
>> size_t size)
>> {
>> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
>> struct clone_args args;
>>
>> if (unlikely(size > PAGE_SIZE))
>> return -E2BIG;
>>
>> - if (unlikely(size < sizeof(struct clone_args)))
>> - return -EINVAL;
>> -
>
> It might be better to validate it still somehow, but I don't insist.
>
> [..]
>> @@ -2578,11 +2580,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
>>
>> static bool clone3_args_valid(const struct kernel_clone_args *kargs)
>> {
>> - /*
>> - * All lower bits of the flag word are taken.
>> - * Verify that no other unknown flags are passed along.
>> - */
>> - if (kargs->flags & ~CLONE_LEGACY_FLAGS)
>> + /* Verify that no other unknown flags are passed along. */
>> + if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_SET_TID))
>> + return false;
>> +
>> + /* Fail if set_tid is set without CLONE_SET_TID */
>> + if (kargs->set_tid && !(kargs->flags & CLONE_SET_TID))
>> + return false;
>> +
>> + /* Also fail if set_tid is invalid */
>> + if ((kargs->set_tid <= 0) && (kargs->flags & CLONE_SET_TID))
>> return false;
>
> Sorry for not mentioning it on v1, but I've noticed it only now:
> you check kargs->set_tid even with the legacy-sized kernel_clone_args,
> which is probably some random value on a task's stack?
Self-correction: it is on the kernel stack, in copy_clone_args_from_user(),
which could probably be considered a security leak.
Sorry again for not spotting it in v1.
Thanks,
Dmitry
On 07/31, Adrian Reber wrote:
>
> Extending clone3() to support CLONE_SET_TID makes it possible restore a
> process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
> race free (as long as the desired PID/TID is available).
I personally like this... but please see the question below.
> +struct pid *alloc_pid(struct pid_namespace *ns, int set_tid)
> {
> struct pid *pid;
> enum pid_type type;
> @@ -186,12 +186,28 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
> pid_min = RESERVED_PIDS;
>
> - /*
> - * Store a null pointer so find_pid_ns does not find
> - * a partially initialized PID (see below).
> - */
> - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> - pid_max, GFP_ATOMIC);
> + if (set_tid) {
> + /*
> + * Also fail if a PID != 1 is requested
> + * and no PID 1 exists.
> + */
> + if ((set_tid >= pid_max) || ((set_tid != 1) &&
> + (idr_get_cursor(&tmp->idr) <= 1)))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Ah, I forgot to mention... this should work but only because
RESERVED_PIDS > 0. How about idr_is_empty() ?
But the main question is how it can really help if ns->level > 0; it is
unlikely that CRIU will ever need to clone the process with the same
pid_nr == set_tid in the ns->parent chain.
So maybe kernel_clone_args->set_tid should be pid_t __user *set_tid_array?
Or did I miss something?
Oleg.
On Wed, Jul 31, 2019 at 07:41:36PM +0200, Oleg Nesterov wrote:
> On 07/31, Adrian Reber wrote:
> >
> > Extending clone3() to support CLONE_SET_TID makes it possible restore a
> > process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
> > race free (as long as the desired PID/TID is available).
>
> I personally like this... but please see the question below.
>
> > +struct pid *alloc_pid(struct pid_namespace *ns, int set_tid)
> > {
> > struct pid *pid;
> > enum pid_type type;
> > @@ -186,12 +186,28 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> > if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
> > pid_min = RESERVED_PIDS;
> >
> > - /*
> > - * Store a null pointer so find_pid_ns does not find
> > - * a partially initialized PID (see below).
> > - */
> > - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> > - pid_max, GFP_ATOMIC);
> > + if (set_tid) {
> > + /*
> > + * Also fail if a PID != 1 is requested
> > + * and no PID 1 exists.
> > + */
> > + if ((set_tid >= pid_max) || ((set_tid != 1) &&
> > + (idr_get_cursor(&tmp->idr) <= 1)))
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> Ah, I forgot to mention... this should work but only because
> RESERVED_PIDS > 0. How about idr_is_empty() ?
>
>
> But the main question is how it can really help if ns->level > 0, unlikely
> CRIU will ever need to clone the process with the same pid_nr == set_tid
> in the ns->parent chain.
Not sure I understand what you mean. For CRIU only the PID in the PID
namespace is relevant.
> So may be kernel_clone_args->set_tid should be pid_t __user *set_tid_array?
> Or I missed something ?
Not sure why and how an array would be needed. Could you give me some
more details on why you think this is needed?
Adrian
On 08/02, Adrian Reber wrote:
>
> On Wed, Jul 31, 2019 at 07:41:36PM +0200, Oleg Nesterov wrote:
> > But the main question is how it can really help if ns->level > 0, unlikely
> > CRIU will ever need to clone the process with the same pid_nr == set_tid
> > in the ns->parent chain.
>
> Not sure I understand what you mean. For CRIU only the PID in the PID
> namespace is relevant.
If it runs "inside" this namespace. But in this case alloc_pid() should
use nr == set_tid only once in the main loop, when i == ns->level.
It doesn't need to have the same pid_nr in the parent pid namespace.
And in fact we should not allow criu (or anything else) to control the child's
pid_nr in the parent(s) namespace.
Right?
> > So may be kernel_clone_args->set_tid should be pid_t __user *set_tid_array?
> > Or I missed something ?
>
> Not sure why and how an array would be needed. Could you give me some
> more details why you think this is needed.
IIURC, criu can restore the process tree along with nested pid namespaces.
how can this patch help in this case?
But again, perhaps I missed something. I forgot everything about criu.
Oleg.
On Wed, Jul 31, 2019 at 06:12:22PM +0200, Adrian Reber wrote:
> The main motivation to add CLONE_SET_TID to clone3() is CRIU.
>
> To restore a process with the same PID/TID CRIU currently uses
> /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
> ns_last_pid and then (quickly) does a clone(). This works most of the
> time, but it is racy. It is also slow as it requires multiple syscalls.
Can you elaborate on how this is racy, please? Afaict, CRIU will
usually restore in a new pid namespace that it controls, right? What is
the exact race?
>
> Extending clone3() to support CLONE_SET_TID makes it possible restore a
> process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
> race free (as long as the desired PID/TID is available).
>
> This clone3() extension places the same restrictions (CAP_SYS_ADMIN)
> on clone3() with set_tid as they are currently in place for ns_last_pid.
>
> v2:
> - Removed (size < sizeof(struct clone_args)) as discussed with
> Christian and Dmitry
> - Added comment to ((set_tid != 1) && idr_get_cursor() <= 1) (Oleg)
> - Use idr_alloc() instead of idr_alloc_cyclic() (Oleg)
Fwiw, the changelog should be placed after the "---" that follows your
SOB, so rather like this :):
Signed-off-by: Adrian Reber <[email protected]>
---
v2:
- Removed (size < sizeof(struct clone_args)) as discussed with
Christian and Dmitry
- Added comment to ((set_tid != 1) && idr_get_cursor() <= 1) (Oleg)
- Use idr_alloc() instead of idr_alloc_cyclic() (Oleg)
---
>
> Signed-off-by: Adrian Reber <[email protected]>
> ---
> include/linux/pid.h | 2 +-
> include/linux/sched/task.h | 1 +
> include/uapi/linux/sched.h | 2 ++
> kernel/fork.c | 25 ++++++++++++++++---------
> kernel/pid.c | 30 +++++++++++++++++++++++-------
> 5 files changed, 43 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 2a83e434db9d..052000db0ced 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -116,7 +116,7 @@ extern struct pid *find_vpid(int nr);
> extern struct pid *find_get_pid(int nr);
> extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
>
> -extern struct pid *alloc_pid(struct pid_namespace *ns);
> +extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t set_tid);
> extern void free_pid(struct pid *pid);
> extern void disable_pid_allocation(struct pid_namespace *ns);
>
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index 0497091e40c1..4f2a80564332 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -26,6 +26,7 @@ struct kernel_clone_args {
> unsigned long stack;
> unsigned long stack_size;
> unsigned long tls;
> + pid_t set_tid;
> };
>
> /*
> diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
> index b3105ac1381a..8c4e803e8147 100644
> --- a/include/uapi/linux/sched.h
> +++ b/include/uapi/linux/sched.h
> @@ -32,6 +32,7 @@
> #define CLONE_NEWPID 0x20000000 /* New pid namespace */
> #define CLONE_NEWNET 0x40000000 /* New network namespace */
> #define CLONE_IO 0x80000000 /* Clone io context */
> +#define CLONE_SET_TID 0x100000000ULL /* set if the desired TID is set in set_tid */
>
> /*
> * Arguments for the clone3 syscall
> @@ -45,6 +46,7 @@ struct clone_args {
> __aligned_u64 stack;
> __aligned_u64 stack_size;
> __aligned_u64 tls;
> + __aligned_u64 set_tid;
> };
>
> /*
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 2852d0e76ea3..405f98ce4c83 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2031,7 +2031,7 @@ static __latent_entropy struct task_struct *copy_process(
> stackleak_task_init(p);
>
> if (pid != &init_struct_pid) {
> - pid = alloc_pid(p->nsproxy->pid_ns_for_children);
> + pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid);
> if (IS_ERR(pid)) {
> retval = PTR_ERR(pid);
> goto bad_fork_cleanup_thread;
> @@ -2530,14 +2530,12 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
> struct clone_args __user *uargs,
> size_t size)
> {
> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
> struct clone_args args;
>
> if (unlikely(size > PAGE_SIZE))
> return -E2BIG;
>
> - if (unlikely(size < sizeof(struct clone_args)))
> - return -EINVAL;
> -
> if (unlikely(!access_ok(uargs, size)))
> return -EFAULT;
>
> @@ -2562,6 +2560,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
> if (copy_from_user(&args, uargs, size))
> return -EFAULT;
>
> + if ((args.flags & CLONE_SET_TID) && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
> + return -EPERM;
Have you made sure that this makes sense with all flags, e.g. does this
make sense when specified with CLONE_THREAD?
> +
> *kargs = (struct kernel_clone_args){
> .flags = args.flags,
> .pidfd = u64_to_user_ptr(args.pidfd),
> @@ -2571,6 +2572,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
> .stack = args.stack,
> .stack_size = args.stack_size,
> .tls = args.tls,
> + .set_tid = args.set_tid,
> };
>
> return 0;
> @@ -2578,11 +2580,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
>
> static bool clone3_args_valid(const struct kernel_clone_args *kargs)
> {
> - /*
> - * All lower bits of the flag word are taken.
> - * Verify that no other unknown flags are passed along.
> - */
> - if (kargs->flags & ~CLONE_LEGACY_FLAGS)
> + /* Verify that no other unknown flags are passed along. */
> + if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_SET_TID))
> + return false;
> +
> + /* Fail if set_tid is set without CLONE_SET_TID */
> + if (kargs->set_tid && !(kargs->flags & CLONE_SET_TID))
> + return false;
> +
> + /* Also fail if set_tid is invalid */
> + if ((kargs->set_tid <= 0) && (kargs->flags & CLONE_SET_TID))
> return false;
>
> /*
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 0a9f2e437217..977f3ac39d7f 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -157,7 +157,7 @@ void free_pid(struct pid *pid)
> call_rcu(&pid->rcu, delayed_put_pid);
> }
>
> -struct pid *alloc_pid(struct pid_namespace *ns)
> +struct pid *alloc_pid(struct pid_namespace *ns, int set_tid)
> {
> struct pid *pid;
> enum pid_type type;
> @@ -186,12 +186,28 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
> pid_min = RESERVED_PIDS;
>
> - /*
> - * Store a null pointer so find_pid_ns does not find
> - * a partially initialized PID (see below).
> - */
> - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> - pid_max, GFP_ATOMIC);
> + if (set_tid) {
> + /*
> + * Also fail if a PID != 1 is requested
> + * and no PID 1 exists.
> + */
> + if ((set_tid >= pid_max) || ((set_tid != 1) &&
> + (idr_get_cursor(&tmp->idr) <= 1))) {
> + spin_unlock_irq(&pidmap_lock);
> + retval = -EINVAL;
> + goto out_free;
> + }
> + nr = idr_alloc(&tmp->idr, NULL, set_tid,
> + set_tid + 1, GFP_ATOMIC);
Hm, feels to me that we should report a different error code than EAGAIN
when the allocation fails for set_tid. Right now you get EAGAIN for both
the non set_tid and the set_tid codepath.
But for set_tid the error that you likely should surface is EEXIST, i.e.
that pid is already taken.
> + set_tid = 0;
> + } else {
> + /*
> + * Store a null pointer so find_pid_ns does not find
> + * a partially initialized PID (see below).
> + */
> + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> + pid_max, GFP_ATOMIC);
> + }
> spin_unlock_irq(&pidmap_lock);
> idr_preload_end();
>
> --
> 2.21.0
>
On 08/02, Christian Brauner wrote:
>
> On Wed, Jul 31, 2019 at 06:12:22PM +0200, Adrian Reber wrote:
> > The main motivation to add CLONE_SET_TID to clone3() is CRIU.
> >
> > To restore a process with the same PID/TID CRIU currently uses
> > /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
> > ns_last_pid and then (quickly) does a clone(). This works most of the
> > time, but it is racy. It is also slow as it requires multiple syscalls.
>
> Can you elaborate how this is racy, please. Afaict, CRIU will always
> usually restore in a new pid namespace that it controls, right?
Why? No. For example, you can checkpoint (not sure this is the correct
word) a single process in your namespace, then (try to) restore it.
> What is
> the exact race?
something else in the same namespace can fork() right after criu writes
the pid-for-restore into ns_last_pid.
Oleg.
On 08/02, Oleg Nesterov wrote:
>
> On 08/02, Adrian Reber wrote:
> >
> > On Wed, Jul 31, 2019 at 07:41:36PM +0200, Oleg Nesterov wrote:
> > > But the main question is how it can really help if ns->level > 0, unlikely
> > > CRIU will ever need to clone the process with the same pid_nr == set_tid
> > > in the ns->parent chain.
> >
> > Not sure I understand what you mean. For CRIU only the PID in the PID
> > namespace is relevant.
>
> If it runs "inside" this namespace. But in this case alloc_pid() should
> use nr == set_tid only once in the main loop, when i == ns->level.
and I just noticed that your patch does exactly this, it clears set_tid
after the 1st usage when i == ns->level.
> > > So may be kernel_clone_args->set_tid should be pid_t __user *set_tid_array?
> > > Or I missed something ?
> >
> > Not sure why and how an array would be needed. Could you give me some
> > more details why you think this is needed.
>
> IIURC, criu can restore the process tree along with nested pid namespaces.
>
> how can this patch help in this case?
>
> But again, perhaps I missed something. I forgot everything about criu.
Yes... I guess in this case it can "safely" modify ns_last_pid in the child
namespaces it needs to create.
So Adrian, sorry for confusion, I think your patch is fine.
Oleg.
On Fri, Aug 02, 2019 at 02:47:38PM +0200, Oleg Nesterov wrote:
> On 08/02, Adrian Reber wrote:
> >
> > On Wed, Jul 31, 2019 at 07:41:36PM +0200, Oleg Nesterov wrote:
> > > But the main question is how it can really help if ns->level > 0, unlikely
> > > CRIU will ever need to clone the process with the same pid_nr == set_tid
> > > in the ns->parent chain.
> >
> > Not sure I understand what you mean. For CRIU only the PID in the PID
> > namespace is relevant.
>
> If it runs "inside" this namespace. But in this case alloc_pid() should
> use nr == set_tid only once in the main loop, when i == ns->level.
>
> It doesn't need to have the same pid_nr in the parent pid namespace.
>
> And in fact we should not allow criu (or anything else) to control the child's
> pid_nr in the parent(s) namespace.
This should definitely not be possible!
>
> Right?
>
> > > So may be kernel_clone_args->set_tid should be pid_t __user *set_tid_array?
> > > Or I missed something ?
> >
> > Not sure why and how an array would be needed. Could you give me some
> > more details why you think this is needed.
>
> IIURC, criu can restore the process tree along with nested pid namespaces.
Hm, I'm not a fan of this array approach...
>
> how can this patch help in this case?
>
> But again, perhaps I missed something. I forgot everything about criu.
>
> Oleg.
>
On 08/02, Oleg Nesterov wrote:
>
> So Adrian, sorry for confusion, I think your patch is fine.
Yes... but do we really need the new CLONE_SET_TID ?
set_tid == 0 has no effect, can't we simply check kargs->set_tid != 0
before ns_capable() ?
And as a reminder, I still think alloc_pid() should use idr_is_empty(),
but I won't insist.
Oleg.
On Fri, Aug 02, 2019 at 03:30:01PM +0200, Oleg Nesterov wrote:
> On 08/02, Christian Brauner wrote:
> >
> > On Wed, Jul 31, 2019 at 06:12:22PM +0200, Adrian Reber wrote:
> > > The main motivation to add CLONE_SET_TID to clone3() is CRIU.
> > >
> > > To restore a process with the same PID/TID CRIU currently uses
> > > /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
> > > ns_last_pid and then (quickly) does a clone(). This works most of the
> > > time, but it is racy. It is also slow as it requires multiple syscalls.
> >
> > Can you elaborate how this is racy, please. Afaict, CRIU will always
> > usually restore in a new pid namespace that it controls, right?
>
> Why? No. For example you can checkpoint (not sure this is correct word)
> a single process in your namespace, then (try to restore) it.
>
> > What is
> > the exact race?
>
> something else in the same namespace can fork() right after criu writes
> the pid-for-restore into ns_last_pid.
Ok, that makes sense. :)
My CRIU userspace knowledge is sporadic, so I'm not sure how exactly it
restores process trees in pid namespaces and what workloads this would
especially help with.
On Fri, Aug 02, 2019 at 03:46:11PM +0200, Oleg Nesterov wrote:
> On 08/02, Oleg Nesterov wrote:
> >
> > So Adrian, sorry for confusion, I think your patch is fine.
>
> Yes... but do we really need the new CLONE_SET_TID ?
>
> set_tid == 0 has no effect, can't we simply check kargs->set_tid != 0
> before ns_capable() ?
Yeah, I agree that sounds much better and aligns with exit_signal.
Christian
On Fri, Aug 02, 2019 at 03:52:49PM +0200, Christian Brauner wrote:
> On Fri, Aug 02, 2019 at 03:46:11PM +0200, Oleg Nesterov wrote:
> > On 08/02, Oleg Nesterov wrote:
> > >
> > > So Adrian, sorry for confusion, I think your patch is fine.
Good to know.
> > Yes... but do we really need the new CLONE_SET_TID ?
> >
> > set_tid == 0 has no effect, can't we simply check kargs->set_tid != 0
> > before ns_capable() ?
>
> Yeah, I agree that sounds much better and aligns with exit_signal.
Let me remove CLONE_SET_TID from the patch and I will try out
idr_is_empty().
I will also address Dmitry's comment about accessing smaller parameter
structs.
Adrian
On Fri, Aug 02, 2019 at 03:50:54PM +0200, Christian Brauner wrote:
> On Fri, Aug 02, 2019 at 03:30:01PM +0200, Oleg Nesterov wrote:
> > On 08/02, Christian Brauner wrote:
> > >
> > > On Wed, Jul 31, 2019 at 06:12:22PM +0200, Adrian Reber wrote:
> > > > The main motivation to add CLONE_SET_TID to clone3() is CRIU.
> > > >
> > > > To restore a process with the same PID/TID CRIU currently uses
> > > > /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
> > > > ns_last_pid and then (quickly) does a clone(). This works most of the
> > > > time, but it is racy. It is also slow as it requires multiple syscalls.
> > >
> > > Can you elaborate how this is racy, please. Afaict, CRIU will always
> > > usually restore in a new pid namespace that it controls, right?
> >
> > Why? No. For example you can checkpoint (not sure this is correct word)
> > a single process in your namespace, then (try to restore) it.
> >
> > > What is
> > > the exact race?
> >
> > something else in the same namespace can fork() right after criu writes
> > the pid-for-restore into ns_last_pid.
>
> Ok, that makes sense. :)
> My CRIU userspace knowledge is sporadic, so I'm not sure how exactly it
> restores process trees in pid namespaces and what workloads this would
> especially help with.
Just what Oleg said. CRIU can restore processes in a new PID namespace
or in an existing one. Restoring a process into an existing PID
namespace carries the possibility of a PID collision, but if the PID is
not yet in use there is no limitation from CRIU's side.
Restoring into an existing PID namespace which is used by other
processes always carries the possibility that, between the write to
/proc/sys/kernel/ns_last_pid and the clone(), something else has
fork()'d; therefore it is racy.
Adrian