2019-08-06 19:18:38

by Adrian Reber

[permalink] [raw]
Subject: [PATCH v3 1/2] fork: extend clone3() to support CLONE_SET_TID

The main motivation to add set_tid to clone3() is CRIU.

To restore a process with the same PID/TID CRIU currently uses
/proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
ns_last_pid and then (quickly) does a clone(). This works most of the
time, but it is racy. It is also slow as it requires multiple syscalls.

Extending clone3() to support set_tid makes it possible restore a
process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
race free (as long as the desired PID/TID is available).

This clone3() extension places the same restrictions (CAP_SYS_ADMIN)
on clone3() with set_tid as they are currently in place for ns_last_pid.

Signed-off-by: Adrian Reber <[email protected]>
---
v2:
- Removed (size < sizeof(struct clone_args)) as discussed with
Christian and Dmitry
- Added comment to ((set_tid != 1) && idr_get_cursor() <= 1) (Oleg)
- Use idr_alloc() instead of idr_alloc_cyclic() (Oleg)

v3:
- Return EEXIST if PID is already in use (Christian)
- Drop CLONE_SET_TID (Christian and Oleg)
- Use idr_is_empty() instead of idr_get_cursor() (Oleg)
- Handle different `struct clone_args` sizes (Dmitry)
---
include/linux/pid.h | 2 +-
include/linux/sched/task.h | 1 +
include/uapi/linux/sched.h | 1 +
kernel/fork.c | 18 ++++++++++++++++--
kernel/pid.c | 37 ++++++++++++++++++++++++++++++-------
5 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 2a83e434db9d..052000db0ced 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -116,7 +116,7 @@ extern struct pid *find_vpid(int nr);
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);

-extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t set_tid);
extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns);

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 0497091e40c1..4f2a80564332 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -26,6 +26,7 @@ struct kernel_clone_args {
unsigned long stack;
unsigned long stack_size;
unsigned long tls;
+ pid_t set_tid;
};

/*
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index b3105ac1381a..e1ce103a2c47 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -45,6 +45,7 @@ struct clone_args {
__aligned_u64 stack;
__aligned_u64 stack_size;
__aligned_u64 tls;
+ __aligned_u64 set_tid;
};

/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 2852d0e76ea3..4bb3972ebb99 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2031,7 +2031,7 @@ static __latent_entropy struct task_struct *copy_process(
stackleak_task_init(p);

if (pid != &init_struct_pid) {
- pid = alloc_pid(p->nsproxy->pid_ns_for_children);
+ pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
@@ -2530,12 +2530,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t size)
{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct clone_args args;

if (unlikely(size > PAGE_SIZE))
return -E2BIG;

- if (unlikely(size < sizeof(struct clone_args)))
+ /* The struct needs to be at least the size of the original struct. */
+ if (size < (sizeof(struct clone_args) - sizeof(__aligned_u64)))
return -EINVAL;

if (unlikely(!access_ok(uargs, size)))
@@ -2573,6 +2575,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
.tls = args.tls,
};

+ if (size == sizeof(struct clone_args)) {
+ /* Only check permissions if set_tid is actually set. */
+ if (args.set_tid &&
+ !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+ kargs->set_tid = args.set_tid;
+ }
+
return 0;
}

@@ -2585,6 +2595,10 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs)
if (kargs->flags & ~CLONE_LEGACY_FLAGS)
return false;

+ /* Fail if set_tid is invalid */
+ if (kargs->set_tid < 0)
+ return false;
+
/*
* - make the CLONE_DETACHED bit reuseable for clone3
* - make the CSIGNAL bits reuseable for clone3
diff --git a/kernel/pid.c b/kernel/pid.c
index 0a9f2e437217..520b75c17790 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -157,7 +157,7 @@ void free_pid(struct pid *pid)
call_rcu(&pid->rcu, delayed_put_pid);
}

-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, int set_tid)
{
struct pid *pid;
enum pid_type type;
@@ -186,12 +186,35 @@ struct pid *alloc_pid(struct pid_namespace *ns)
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
pid_min = RESERVED_PIDS;

- /*
- * Store a null pointer so find_pid_ns does not find
- * a partially initialized PID (see below).
- */
- nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
- pid_max, GFP_ATOMIC);
+ if (set_tid) {
+ /*
+ * Also fail if a PID != 1 is requested
+ * and no PID 1 exists.
+ */
+ if ((set_tid >= pid_max) || ((set_tid != 1) &&
+ (idr_is_empty(&tmp->idr)))) {
+ spin_unlock_irq(&pidmap_lock);
+ retval = -EINVAL;
+ goto out_free;
+ }
+ nr = idr_alloc(&tmp->idr, NULL, set_tid,
+ set_tid + 1, GFP_ATOMIC);
+ /*
+ * If ENOSPC is returned it means that the PID is
+ * alreay in use. Return EEXIST in that case.
+ */
+ if (nr == -ENOSPC)
+ nr = -EEXIST;
+ /* Only use set_tid for one PID namespace. */
+ set_tid = 0;
+ } else {
+ /*
+ * Store a null pointer so find_pid_ns does not find
+ * a partially initialized PID (see below).
+ */
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+ pid_max, GFP_ATOMIC);
+ }
spin_unlock_irq(&pidmap_lock);
idr_preload_end();

--
2.21.0


2019-08-06 19:18:54

by Adrian Reber

[permalink] [raw]
Subject: [PATCH v3 2/2] selftests: add tests for clone3()

This tests clone3() with and without set_tid to see if all desired PIDs
are working as expected. The test tries to clone3() with a set_tid of
-1, 1, pid_max, a PID which is already in use and an unused PID. The
same tests are also running in PID namespace.

Signed-off-by: Adrian Reber <[email protected]>
---
tools/testing/selftests/clone3/.gitignore | 2 +
tools/testing/selftests/clone3/Makefile | 11 ++
tools/testing/selftests/clone3/clone3.c | 141 +++++++++++++++
.../testing/selftests/clone3/clone3_set_tid.c | 161 ++++++++++++++++++
4 files changed, 315 insertions(+)
create mode 100644 tools/testing/selftests/clone3/.gitignore
create mode 100644 tools/testing/selftests/clone3/Makefile
create mode 100644 tools/testing/selftests/clone3/clone3.c
create mode 100644 tools/testing/selftests/clone3/clone3_set_tid.c

diff --git a/tools/testing/selftests/clone3/.gitignore b/tools/testing/selftests/clone3/.gitignore
new file mode 100644
index 000000000000..c63c64a78ddf
--- /dev/null
+++ b/tools/testing/selftests/clone3/.gitignore
@@ -0,0 +1,2 @@
+clone3_set_tid
+clone3
diff --git a/tools/testing/selftests/clone3/Makefile b/tools/testing/selftests/clone3/Makefile
new file mode 100644
index 000000000000..4efcf45b995b
--- /dev/null
+++ b/tools/testing/selftests/clone3/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
+
+CFLAGS += -I../../../../usr/include/
+
+ifeq ($(ARCH),x86_64)
+ TEST_GEN_PROGS := clone3 clone3_set_tid
+endif
+
+include ../lib.mk
diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c
new file mode 100644
index 000000000000..55a6915566b8
--- /dev/null
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Based on Christian Brauner's clone3() example */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sched.h>
+
+#include "../kselftest.h"
+
+static pid_t raw_clone(struct clone_args *args)
+{
+ return syscall(__NR_clone3, args, sizeof(struct clone_args));
+}
+
+static int call_clone3(int flags)
+{
+ struct clone_args args = {0};
+ pid_t ppid = -1;
+ pid_t pid = -1;
+ int status;
+
+ args.flags = flags;
+ args.exit_signal = SIGCHLD;
+
+ pid = raw_clone(&args);
+ if (pid < 0) {
+ ksft_print_msg("%s - Failed to create new process\n",
+ strerror(errno));
+ return -errno;
+ }
+
+ if (pid == 0) {
+ ksft_print_msg("I am the child, my PID is %d\n", getpid());
+ _exit(EXIT_SUCCESS);
+ }
+
+ ppid = getpid();
+ ksft_print_msg("I am the parent (%d). My child's pid is %d\n",
+ ppid, pid);
+
+ (void)wait(&status);
+ if (WEXITSTATUS(status))
+ return WEXITSTATUS(status);
+
+ return 0;
+}
+
+static int test_clone3(int flags, int expected)
+{
+ int ret;
+
+ ksft_print_msg("[%d] Trying clone3() with flags 0x%x\n",
+ getpid(), flags);
+ ret = call_clone3(flags);
+ ksft_print_msg("[%d] clone3() with flags says :%d expected %d\n",
+ getpid(), ret, expected);
+ if (ret != expected)
+ ksft_exit_fail_msg(
+ "[%d] Result (%d) is different than expected (%d)\n",
+ getpid(), ret, expected);
+ ksft_test_result_pass("[%d] Result (%d) matches expectation (%d)\n",
+ getpid(), ret, expected);
+ return 0;
+}
+int main(int argc, char *argv[])
+{
+ int ret = -1;
+ pid_t pid;
+
+ ksft_print_header();
+ ksft_set_plan(3);
+
+ /* Just a simple clone3() should return 0.*/
+ if (test_clone3(0, 0))
+ goto on_error;
+ /* Do a clone3() in a new PID NS.*/
+ if (test_clone3(CLONE_NEWPID, 0))
+ goto on_error;
+ ksft_print_msg("First unshare\n");
+ if (unshare(CLONE_NEWPID))
+ goto on_error;
+ /*
+ * Before clone3()ing in a new PID NS with
+ * CLONE_NEWPID a fork() is necessary.
+ */
+ if (test_clone3(CLONE_NEWPID, -EINVAL))
+ goto on_error;
+ pid = fork();
+ if (pid < 0) {
+ ksft_print_msg("First fork() failed\n");
+ goto on_error;
+ }
+ if (pid > 0) {
+ (void)wait(NULL);
+ goto parent_out;
+ }
+ ksft_set_plan(6);
+ if (test_clone3(CLONE_NEWPID, 0))
+ goto on_error;
+ if (test_clone3(0, 0))
+ goto on_error;
+ ksft_print_msg("Second unshare\n");
+ if (unshare(CLONE_NEWPID))
+ goto on_error;
+ /*
+ * Before clone3()ing in a new PID NS with
+ * CLONE_NEWPID a fork() is necessary.
+ */
+ if (test_clone3(CLONE_NEWPID, -EINVAL))
+ goto on_error;
+ pid = fork();
+ if (pid < 0) {
+ ksft_print_msg("Second fork() failed\n");
+ goto on_error;
+ }
+ if (pid > 0) {
+ (void)wait(NULL);
+ goto parent_out;
+ }
+ ksft_set_plan(8);
+ if (test_clone3(CLONE_NEWPID, 0))
+ goto on_error;
+ if (test_clone3(0, 0))
+ goto on_error;
+
+parent_out:
+ ret = 0;
+on_error:
+
+ return !ret ? ksft_exit_pass() : ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c
new file mode 100644
index 000000000000..f5012e84dcb3
--- /dev/null
+++ b/tools/testing/selftests/clone3/clone3_set_tid.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Based on Christian Brauner's clone3() example */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sched.h>
+
+#include "../kselftest.h"
+
+static pid_t raw_clone(struct clone_args *args)
+{
+ return syscall(__NR_clone3, args, sizeof(struct clone_args));
+}
+
+static int call_clone3_set_tid(int set_tid, int flags)
+{
+ struct clone_args args = {0};
+ pid_t ppid = -1;
+ pid_t pid = -1;
+ int status;
+
+ args.flags = flags;
+ args.exit_signal = SIGCHLD;
+ args.set_tid = set_tid;
+
+ pid = raw_clone(&args);
+ if (pid < 0) {
+ ksft_print_msg("%s - Failed to create new process\n",
+ strerror(errno));
+ return -errno;
+ }
+
+ if (pid == 0) {
+ ksft_print_msg("I am the child, my PID is %d (expected %d)\n",
+ getpid(), set_tid);
+ if (set_tid != getpid())
+ _exit(EXIT_FAILURE);
+ _exit(EXIT_SUCCESS);
+ }
+
+ ppid = getpid();
+ ksft_print_msg("I am the parent (%d). My child's pid is %d\n",
+ ppid, pid);
+
+ (void)wait(&status);
+ if (WEXITSTATUS(status))
+ return WEXITSTATUS(status);
+
+ return 0;
+}
+
+static int test_clone3_set_tid(int set_tid, int flags, int expected)
+{
+ int ret;
+
+ ksft_print_msg(
+ "[%d] Trying clone3() with CLONE_SET_TID to %d and 0x%x\n",
+ getpid(), set_tid, flags);
+ ret = call_clone3_set_tid(set_tid, flags);
+ ksft_print_msg(
+ "[%d] clone3() with CLONE_SET_TID %d says :%d - expected %d\n",
+ getpid(), set_tid, ret, expected);
+ if (ret != expected)
+ ksft_exit_fail_msg(
+ "[%d] Result (%d) is different than expected (%d)\n",
+ getpid(), ret, expected);
+ ksft_test_result_pass("[%d] Result (%d) matches expectation (%d)\n",
+ getpid(), ret, expected);
+ return 0;
+}
+int main(int argc, char *argv[])
+{
+ FILE *f;
+ int pid_max = 0;
+ pid_t pid;
+ pid_t ns_pid;
+ int ret = -1;
+
+ ksft_print_header();
+ ksft_set_plan(13);
+
+ f = fopen("/proc/sys/kernel/pid_max", "r");
+ if (f == NULL)
+ ksft_exit_fail_msg(
+ "%s - Could not open /proc/sys/kernel/pid_max\n",
+ strerror(errno));
+ fscanf(f, "%d", &pid_max);
+ fclose(f);
+ ksft_print_msg("/proc/sys/kernel/pid_max %d\n", pid_max);
+
+ /* First try with an invalid PID */
+ if (test_clone3_set_tid(-1, 0, -EINVAL))
+ goto on_error;
+ if (test_clone3_set_tid(-1, CLONE_NEWPID, -EINVAL))
+ goto on_error;
+ /* Then with PID 1 */
+ if (test_clone3_set_tid(1, 0, -EEXIST))
+ goto on_error;
+ /* PID 1 should not fail in a PID namespace */
+ if (test_clone3_set_tid(1, CLONE_NEWPID, 0))
+ goto on_error;
+ /* pid_max should fail everywhere */
+ if (test_clone3_set_tid(pid_max, 0, -EINVAL))
+ goto on_error;
+ if (test_clone3_set_tid(pid_max, CLONE_NEWPID, -EINVAL))
+ goto on_error;
+ /* Find the current active PID */
+ pid = fork();
+ if (pid == 0) {
+ ksft_print_msg("Child has PID %d\n", getpid());
+ sleep(1);
+ _exit(EXIT_SUCCESS);
+ }
+ /* Try to create a process with that PID should fail */
+ if (test_clone3_set_tid(pid, 0, -EEXIST))
+ goto on_error;
+ (void)wait(NULL);
+ /* After the child has finished, try again with the same PID */
+ if (test_clone3_set_tid(pid, 0, 0))
+ goto on_error;
+ /* This should fail as there is no PID 1 in that namespace */
+ if (test_clone3_set_tid(pid, CLONE_NEWPID, -EINVAL))
+ goto on_error;
+ unshare(CLONE_NEWPID);
+ if (test_clone3_set_tid(10, 0, -EINVAL))
+ goto on_error;
+ /* Let's create a PID 1 */
+ ns_pid = fork();
+ if (ns_pid == 0) {
+ ksft_print_msg("Child in PID namespace has PID %d\n", getpid());
+ sleep(1);
+ _exit(EXIT_SUCCESS);
+ }
+ /*
+ * Now, after the unshare() it should be possible to create a process
+ * with another ID than 1 in the PID namespace.
+ */
+ if (test_clone3_set_tid(2, 0, 0))
+ goto on_error;
+ /* Use a different PID in this namespace. */
+ if (test_clone3_set_tid(2222, 0, 0))
+ goto on_error;
+ if (test_clone3_set_tid(1, 0, -EEXIST))
+ goto on_error;
+ (void)wait(NULL);
+
+ ret = 0;
+on_error:
+
+ return !ret ? ksft_exit_pass() : ksft_exit_fail();
+}
--
2.21.0

2019-08-07 14:28:41

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] fork: extend clone3() to support CLONE_SET_TID

On 08/06, Adrian Reber wrote:
>
> +struct pid *alloc_pid(struct pid_namespace *ns, int set_tid)
> {
> struct pid *pid;
> enum pid_type type;
> @@ -186,12 +186,35 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
> pid_min = RESERVED_PIDS;
>
> - /*
> - * Store a null pointer so find_pid_ns does not find
> - * a partially initialized PID (see below).
> - */
> - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> - pid_max, GFP_ATOMIC);
> + if (set_tid) {
> + /*
> + * Also fail if a PID != 1 is requested
> + * and no PID 1 exists.
> + */
> + if ((set_tid >= pid_max) || ((set_tid != 1) &&
> + (idr_is_empty(&tmp->idr)))) {

too many parentheses ;) this is purely cosmetic, up to you, but to me

if (set_tid >= pid_max ||
(set_tid != 1 && idr_is_empty(&tmp->idr))) {

looks a bit more readable.


> + spin_unlock_irq(&pidmap_lock);
> + retval = -EINVAL;
> + goto out_free;

This doesn't look right, you need idr_preload_end() before goto out_free.

But I'd suggest to simply do

nr = -EINVAL;
if (set_tid < pid_max &&
(set_tid != 1 || idr_is_empty(&tmp->idr)))
nr = idr_alloc(&tmp->idr, NULL, set_tid,
set_tid + 1, GFP_ATOMIC);

...

this is more robust.

Oleg.


2019-08-07 15:50:55

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] fork: extend clone3() to support CLONE_SET_TID

On 08/06, Adrian Reber wrote:
>
> @@ -2530,12 +2530,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
> struct clone_args __user *uargs,
> size_t size)
> {
> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
> struct clone_args args;
>
> if (unlikely(size > PAGE_SIZE))
> return -E2BIG;
>
> - if (unlikely(size < sizeof(struct clone_args)))
> + /* The struct needs to be at least the size of the original struct. */
> + if (size < (sizeof(struct clone_args) - sizeof(__aligned_u64)))
> return -EINVAL;

slightly off-topic, but with or without this patch I do not understand
-EINVAL. Can't we replace this check with

if (size < sizeof(struct clone_args))
memset((void*)&args + size, sizeof(struct clone_args) - size, 0);

?

this way we can new members at the end of clone_args and this matches
the "if (size > sizeof(struct clone_args))" block below which promises
that whatever we add into clone_args a zero value should work.


And if we do this

> + if (size == sizeof(struct clone_args)) {
> + /* Only check permissions if set_tid is actually set. */
> + if (args.set_tid &&
> + !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
> + return -EPERM;
> + kargs->set_tid = args.set_tid;
> + }

we can move this check into clone3_args_valid() or even copy_process()

if (kargs->set_tid) {
if (!ns_capable(...))
return -EPERM;
}


Either way,

> @@ -2585,6 +2595,10 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs)
> if (kargs->flags & ~CLONE_LEGACY_FLAGS)
> return false;
>
> + /* Fail if set_tid is invalid */
> + if (kargs->set_tid < 0)
> + return false;

I think it would be more clean to do this along with ns_capable() check,
or along with the "set_tid >= pid_max" check in alloc_pid().

I won't insist, but I do not really like the fact we check set_tid 3 times
in copy_clone_args_from_user(), clone3_args_valid(), and alloc_pid().

Oleg.

2019-08-07 16:21:30

by Dmitry Safonov

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] fork: extend clone3() to support CLONE_SET_TID


On 8/7/19 4:48 PM, Oleg Nesterov wrote:
> On 08/06, Adrian Reber wrote:
>>
>> @@ -2530,12 +2530,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
>> struct clone_args __user *uargs,
>> size_t size)
>> {
>> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
>> struct clone_args args;
>>
>> if (unlikely(size > PAGE_SIZE))
>> return -E2BIG;
>>
>> - if (unlikely(size < sizeof(struct clone_args)))
>> + /* The struct needs to be at least the size of the original struct. */
>> + if (size < (sizeof(struct clone_args) - sizeof(__aligned_u64)))
>> return -EINVAL;
>
> slightly off-topic, but with or without this patch I do not understand
> -EINVAL. Can't we replace this check with
>
> if (size < sizeof(struct clone_args))
> memset((void*)&args + size, sizeof(struct clone_args) - size, 0);
>
> ?
>
> this way we can new members at the end of clone_args and this matches
> the "if (size > sizeof(struct clone_args))" block below which promises
> that whatever we add into clone_args a zero value should work.

What if the size is lesser than offsetof(struct clone_args, stack_size)?
Probably, there should be still a check that it's not lesser than what's
the required minimum..

Also note, that (kargs) and (args) are a bit different beasts in this
context..
kargs lies on the stack and might want to be with zero-initializer
: struct kernel_clone_args kargs = {};

--
Dmitry

2019-08-07 16:28:46

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] fork: extend clone3() to support CLONE_SET_TID

On 08/07, Dmitry Safonov wrote:
>
> On 8/7/19 4:48 PM, Oleg Nesterov wrote:
> > On 08/06, Adrian Reber wrote:
> >>
> >> @@ -2530,12 +2530,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
> >> struct clone_args __user *uargs,
> >> size_t size)
> >> {
> >> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
> >> struct clone_args args;
> >>
> >> if (unlikely(size > PAGE_SIZE))
> >> return -E2BIG;
> >>
> >> - if (unlikely(size < sizeof(struct clone_args)))
> >> + /* The struct needs to be at least the size of the original struct. */
> >> + if (size < (sizeof(struct clone_args) - sizeof(__aligned_u64)))
> >> return -EINVAL;
> >
> > slightly off-topic, but with or without this patch I do not understand
> > -EINVAL. Can't we replace this check with
> >
> > if (size < sizeof(struct clone_args))
> > memset((void*)&args + size, sizeof(struct clone_args) - size, 0);
> >
> > ?
> >
> > this way we can new members at the end of clone_args and this matches
> > the "if (size > sizeof(struct clone_args))" block below which promises
> > that whatever we add into clone_args a zero value should work.
>
> What if the size is lesser than offsetof(struct clone_args, stack_size)?
> Probably, there should be still a check that it's not lesser than what's
> the required minimum..

Not sure I understand... I mean, this doesn't differ from the case when
size == sizeof(clone_args) but uargs->stack == NULL ?

> Also note, that (kargs) and (args) are a bit different beasts in this
> context..
> kargs lies on the stack and might want to be with zero-initializer
> : struct kernel_clone_args kargs = {};

I don't think so. Lets consider this patch which adds the new set_tid
into clone_args and kernel_clone_args. copy_clone_args_from_user() does

*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
.child_tid = u64_to_user_ptr(args.child_tid),
.parent_tid = u64_to_user_ptr(args.parent_tid),
.exit_signal = args.exit_signal,
.stack = args.stack,
.stack_size = args.stack_size,
.tls = args.tls,
};

so this patch should simply add

.set_tid = args.set_tid;

at the end. No?

Oleg.

2019-08-07 16:34:42

by Dmitry Safonov

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] fork: extend clone3() to support CLONE_SET_TID

On 8/7/19 5:21 PM, Oleg Nesterov wrote:
> On 08/07, Dmitry Safonov wrote:
[..]
>> What if the size is lesser than offsetof(struct clone_args, stack_size)?
>> Probably, there should be still a check that it's not lesser than what's
>> the required minimum..
>
> Not sure I understand... I mean, this doesn't differ from the case when
> size == sizeof(clone_args) but uargs->stack == NULL ?

I might be mistaken and I confess that I don't fully understand the
code, but wouldn't it mystically fail in copy_thread_tls() with -ENOMEM
instead of -EINVAL?
Maybe not a huge difference, but..

>> Also note, that (kargs) and (args) are a bit different beasts in this
>> context..
>> kargs lies on the stack and might want to be with zero-initializer
>> : struct kernel_clone_args kargs = {};
>
> I don't think so. Lets consider this patch which adds the new set_tid
> into clone_args and kernel_clone_args. copy_clone_args_from_user() does
>
> *kargs = (struct kernel_clone_args){
> .flags = args.flags,
> .pidfd = u64_to_user_ptr(args.pidfd),
> .child_tid = u64_to_user_ptr(args.child_tid),
> .parent_tid = u64_to_user_ptr(args.parent_tid),
> .exit_signal = args.exit_signal,
> .stack = args.stack,
> .stack_size = args.stack_size,
> .tls = args.tls,
> };
>
> so this patch should simply add
>
> .set_tid = args.set_tid;
>
> at the end. No?
Agree, this may be better.

--
Dmitry

2019-08-07 16:45:24

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] fork: extend clone3() to support CLONE_SET_TID

On 08/06, Adrian Reber wrote:
>
> @@ -2573,6 +2575,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
> .tls = args.tls,
> };
>
> + if (size == sizeof(struct clone_args)) {
> + /* Only check permissions if set_tid is actually set. */
> + if (args.set_tid &&
> + !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))

and I just noticed this uses pid_ns = task_active_pid_ns() ...

is it correct?

I feel I am totally confused, but should we use the same
p->nsproxy->pid_ns_for_children passed to alloc_pid?

Oleg.

2019-08-07 17:38:51

by Dmitry Safonov

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] fork: extend clone3() to support CLONE_SET_TID

On 8/7/19 5:33 PM, Dmitry Safonov wrote:
> On 8/7/19 5:21 PM, Oleg Nesterov wrote:
>> On 08/07, Dmitry Safonov wrote:
> [..]
>>> What if the size is lesser than offsetof(struct clone_args, stack_size)?
>>> Probably, there should be still a check that it's not lesser than what's
>>> the required minimum..
>>
>> Not sure I understand... I mean, this doesn't differ from the case when
>> size == sizeof(clone_args) but uargs->stack == NULL ?
>
> I might be mistaken and I confess that I don't fully understand the
> code, but wouldn't it mystically fail in copy_thread_tls() with -ENOMEM
> instead of -EINVAL?
> Maybe not a huge difference, but..

Actually, not there. I've just tried clone3() with stack_size == 0, it
sets it a proper size somewhere on the way..
So, apologies for the misinformation - it seems that we definitely could
just memset() the missing fields.

Thanks,
Dmitry

2019-08-07 17:58:31

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] fork: extend clone3() to support CLONE_SET_TID

On Tue, Aug 06, 2019 at 09:15:50PM +0200, Adrian Reber wrote:
> The main motivation to add set_tid to clone3() is CRIU.
>
> To restore a process with the same PID/TID CRIU currently uses
> /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
> ns_last_pid and then (quickly) does a clone(). This works most of the
> time, but it is racy. It is also slow as it requires multiple syscalls.
>
> Extending clone3() to support set_tid makes it possible restore a
> process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
> race free (as long as the desired PID/TID is available).
>
> This clone3() extension places the same restrictions (CAP_SYS_ADMIN)
> on clone3() with set_tid as they are currently in place for ns_last_pid.
>
> Signed-off-by: Adrian Reber <[email protected]>

I'm currently on vacation until 12 August so I'm going to be a little
slow in responding. Sorry about that!

Christian

2019-08-07 18:03:08

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] fork: extend clone3() to support CLONE_SET_TID

On Wed, Aug 07, 2019 at 04:26:10PM +0200, Oleg Nesterov wrote:
> On 08/06, Adrian Reber wrote:
> >
> > +struct pid *alloc_pid(struct pid_namespace *ns, int set_tid)
> > {
> > struct pid *pid;
> > enum pid_type type;
> > @@ -186,12 +186,35 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> > if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
> > pid_min = RESERVED_PIDS;
> >
> > - /*
> > - * Store a null pointer so find_pid_ns does not find
> > - * a partially initialized PID (see below).
> > - */
> > - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> > - pid_max, GFP_ATOMIC);
> > + if (set_tid) {
> > + /*
> > + * Also fail if a PID != 1 is requested
> > + * and no PID 1 exists.
> > + */
> > + if ((set_tid >= pid_max) || ((set_tid != 1) &&
> > + (idr_is_empty(&tmp->idr)))) {
>
> too many parentheses ;) this is purely cosmetic, up to you, but to me
>
> if (set_tid >= pid_max ||
> (set_tid != 1 && idr_is_empty(&tmp->idr))) {
>
> looks a bit more readable.
>
>
> > + spin_unlock_irq(&pidmap_lock);
> > + retval = -EINVAL;
> > + goto out_free;
>
> This doesn't look right, you need idr_preload_end() before goto out_free.
>
> But I'd suggest to simply do
>
> nr = -EINVAL;
> if (set_tid < pid_max &&
> (set_tid != 1 || idr_is_empty(&tmp->idr)))
> nr = idr_alloc(&tmp->idr, NULL, set_tid,
> set_tid + 1, GFP_ATOMIC);
>
> ...
>
> this is more robust.

That all sounds reasonable to me.

2019-08-07 18:07:17

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] fork: extend clone3() to support CLONE_SET_TID

On Wed, Aug 07, 2019 at 06:08:56PM +0200, Oleg Nesterov wrote:
> On 08/06, Adrian Reber wrote:
> >
> > @@ -2573,6 +2575,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
> > .tls = args.tls,
> > };
> >
> > + if (size == sizeof(struct clone_args)) {
> > + /* Only check permissions if set_tid is actually set. */
> > + if (args.set_tid &&
> > + !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
>
> and I just noticed this uses pid_ns = task_active_pid_ns() ...
>
> is it correct?
>
> I feel I am totally confused, but should we use the same
> p->nsproxy->pid_ns_for_children passed to alloc_pid?

We need to have CAP_SYS_ADMIN in the owning user namespace of the target
pidns for the pidns in which we spawn the new process. The value for
pid_ns_for_children could've been altered by either passing CLONE_NEWPID
or by having called unshare(CLONE_NEWPID) before. So yes,
pid_ns_for_children is what we want.

Sorry again for the delay in my responses. On vacation atm.

Christian

2019-08-07 18:21:38

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] fork: extend clone3() to support CLONE_SET_TID

On Wed, Aug 07, 2019 at 05:48:29PM +0200, Oleg Nesterov wrote:
> On 08/06, Adrian Reber wrote:
> >
> > @@ -2530,12 +2530,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
> > struct clone_args __user *uargs,
> > size_t size)
> > {
> > + struct pid_namespace *pid_ns = task_active_pid_ns(current);
> > struct clone_args args;
> >
> > if (unlikely(size > PAGE_SIZE))
> > return -E2BIG;
> >
> > - if (unlikely(size < sizeof(struct clone_args)))
> > + /* The struct needs to be at least the size of the original struct. */
> > + if (size < (sizeof(struct clone_args) - sizeof(__aligned_u64)))
> > return -EINVAL;
>
> slightly off-topic, but with or without this patch I do not understand
> -EINVAL. Can't we replace this check with
>
> if (size < sizeof(struct clone_args))
> memset((void*)&args + size, sizeof(struct clone_args) - size, 0);
>
> ?
>
> this way we can new members at the end of clone_args and this matches
> the "if (size > sizeof(struct clone_args))" block below which promises
> that whatever we add into clone_args a zero value should work.

Hm, I actually think we should define:

#define CLONE3_ARGS_SIZE_V0 64
#define CLONE3_ARGS_SIZE_V1 ...
and then later on for future extensions
#define CLONE3_ARGS_SIZE_V2 ...

then do
if (size < CLONE3_ARGS_SIZE_V0)
return -EINVAL;

then do what you suggested:

if (size < sizeof(struct clone_args))
memset((void*)&args + size, sizeof(struct clone_args) - size, 0);

>
>
> And if we do this
>
> > + if (size == sizeof(struct clone_args)) {
> > + /* Only check permissions if set_tid is actually set. */
> > + if (args.set_tid &&
> > + !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
> > + return -EPERM;
> > + kargs->set_tid = args.set_tid;
> > + }
>
> we can move this check into clone3_args_valid() or even copy_process()
>
> if (kargs->set_tid) {
> if (!ns_capable(...))
> return -EPERM;
> }
>
>
> Either way,
>
> > @@ -2585,6 +2595,10 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs)
> > if (kargs->flags & ~CLONE_LEGACY_FLAGS)
> > return false;
> >
> > + /* Fail if set_tid is invalid */
> > + if (kargs->set_tid < 0)
> > + return false;
>
> I think it would be more clean to do this along with ns_capable() check,
> or along with the "set_tid >= pid_max" check in alloc_pid().
>
> I won't insist, but I do not really like the fact we check set_tid 3 times
> in copy_clone_args_from_user(), clone3_args_valid(), and alloc_pid().

Agreed on that part.