2019-07-29 19:06:42

by Adrian Reber

[permalink] [raw]
Subject: [PATCH 1/2] fork: extend clone3() to support CLONE_SET_TID

The main motivation to add CLONE_SET_TID to clone3() is CRIU.

To restore a process with the same PID/TID CRIU currently uses
/proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
ns_last_pid and then (quickly) does a clone(). This works most of the
time, but it is racy. It is also slow as it requires multiple syscalls.

Extending clone3() to support CLONE_SET_TID makes it possible restore a
process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
race free (as long as the desired PID/TID is available).

This clone3() extension places the same restrictions (CAP_SYS_ADMIN)
on clone3() with set_tid as they are currently in place for ns_last_pid.

Signed-off-by: Adrian Reber <[email protected]>
---
include/linux/pid.h | 2 +-
include/linux/sched/task.h | 1 +
include/uapi/linux/sched.h | 2 ++
kernel/fork.c | 22 ++++++++++++++++------
kernel/pid.c | 21 ++++++++++++++++++---
5 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 2a83e434db9d..052000db0ced 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -116,7 +116,7 @@ extern struct pid *find_vpid(int nr);
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);

-extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t set_tid);
extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns);

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 0497091e40c1..4f2a80564332 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -26,6 +26,7 @@ struct kernel_clone_args {
unsigned long stack;
unsigned long stack_size;
unsigned long tls;
+ pid_t set_tid;
};

/*
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index b3105ac1381a..8c4e803e8147 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -32,6 +32,7 @@
#define CLONE_NEWPID 0x20000000 /* New pid namespace */
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */
+#define CLONE_SET_TID 0x100000000ULL /* set if the desired TID is set in set_tid */

/*
* Arguments for the clone3 syscall
@@ -45,6 +46,7 @@ struct clone_args {
__aligned_u64 stack;
__aligned_u64 stack_size;
__aligned_u64 tls;
+ __aligned_u64 set_tid;
};

/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 2852d0e76ea3..8c59441ac153 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2031,7 +2031,7 @@ static __latent_entropy struct task_struct *copy_process(
stackleak_task_init(p);

if (pid != &init_struct_pid) {
- pid = alloc_pid(p->nsproxy->pid_ns_for_children);
+ pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
@@ -2530,6 +2530,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t size)
{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct clone_args args;

if (unlikely(size > PAGE_SIZE))
@@ -2562,6 +2563,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
if (copy_from_user(&args, uargs, size))
return -EFAULT;

+ if ((args.flags & CLONE_SET_TID) && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
@@ -2571,6 +2575,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
.stack = args.stack,
.stack_size = args.stack_size,
.tls = args.tls,
+ .set_tid = args.set_tid,
};

return 0;
@@ -2578,11 +2583,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,

static bool clone3_args_valid(const struct kernel_clone_args *kargs)
{
- /*
- * All lower bits of the flag word are taken.
- * Verify that no other unknown flags are passed along.
- */
- if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+ /* Verify that no other unknown flags are passed along. */
+ if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_SET_TID))
+ return false;
+
+ /* Fail if set_tid is set without CLONE_SET_TID */
+ if (kargs->set_tid && !(kargs->flags & CLONE_SET_TID))
+ return false;
+
+ /* Also fail if set_tid is invalid */
+ if ((kargs->set_tid <= 0) && (kargs->flags & CLONE_SET_TID))
return false;

/*
diff --git a/kernel/pid.c b/kernel/pid.c
index 0a9f2e437217..d3db8090bb10 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -157,11 +157,12 @@ void free_pid(struct pid *pid)
call_rcu(&pid->rcu, delayed_put_pid);
}

-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, int set_tid)
{
struct pid *pid;
enum pid_type type;
int i, nr;
+ int max_p, min_p;
struct pid_namespace *tmp;
struct upid *upid;
int retval = -ENOMEM;
@@ -186,12 +187,26 @@ struct pid *alloc_pid(struct pid_namespace *ns)
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
pid_min = RESERVED_PIDS;

+ if (set_tid) {
+ if ((set_tid >= pid_max) || ((set_tid != 1) &&
+ (idr_get_cursor(&tmp->idr) <= 1))) {
+ spin_unlock_irq(&pidmap_lock);
+ retval = -EINVAL;
+ goto out_free;
+ }
+ min_p = set_tid;
+ max_p = set_tid + 1;
+ set_tid = 0;
+ } else {
+ min_p = pid_min;
+ max_p = pid_max;
+ }
/*
* Store a null pointer so find_pid_ns does not find
* a partially initialized PID (see below).
*/
- nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
- pid_max, GFP_ATOMIC);
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, min_p,
+ max_p, GFP_ATOMIC);
spin_unlock_irq(&pidmap_lock);
idr_preload_end();

--
2.21.0


2019-07-29 19:31:27

by Dmitry Safonov

[permalink] [raw]
Subject: Re: [PATCH 1/2] fork: extend clone3() to support CLONE_SET_TID

On Mon, 29 Jul 2019 at 20:12, Dmitry Safonov <[email protected]> wrote:
>
> On Mon, 29 Jul 2019 at 17:52, Adrian Reber <[email protected]> wrote:
> [..]
> > --- a/include/uapi/linux/sched.h
> > +++ b/include/uapi/linux/sched.h
> > @@ -32,6 +32,7 @@
> > #define CLONE_NEWPID 0x20000000 /* New pid namespace */
> > #define CLONE_NEWNET 0x40000000 /* New network namespace */
> > #define CLONE_IO 0x80000000 /* Clone io context */
> > +#define CLONE_SET_TID 0x100000000ULL /* set if the desired TID is set in set_tid */
> >
> > /*
> > * Arguments for the clone3 syscall
> > @@ -45,6 +46,7 @@ struct clone_args {
> > __aligned_u64 stack;
> > __aligned_u64 stack_size;
> > __aligned_u64 tls;
> > + __aligned_u64 set_tid;
> > };
> >
>
> I don't see a change to
> : if (unlikely(size < sizeof(struct clone_args)))
> : return -EINVAL;
>
> That seems backwards-incompatible, but I may miss some part..

On the other hand, clone3() was merged in v5.3 window, so probably,
it doesn't matter.

--
Dmitry

2019-07-29 19:32:35

by Dmitry Safonov

[permalink] [raw]
Subject: Re: [PATCH 1/2] fork: extend clone3() to support CLONE_SET_TID

On Mon, 29 Jul 2019 at 17:52, Adrian Reber <[email protected]> wrote:
[..]
> --- a/include/uapi/linux/sched.h
> +++ b/include/uapi/linux/sched.h
> @@ -32,6 +32,7 @@
> #define CLONE_NEWPID 0x20000000 /* New pid namespace */
> #define CLONE_NEWNET 0x40000000 /* New network namespace */
> #define CLONE_IO 0x80000000 /* Clone io context */
> +#define CLONE_SET_TID 0x100000000ULL /* set if the desired TID is set in set_tid */
>
> /*
> * Arguments for the clone3 syscall
> @@ -45,6 +46,7 @@ struct clone_args {
> __aligned_u64 stack;
> __aligned_u64 stack_size;
> __aligned_u64 tls;
> + __aligned_u64 set_tid;
> };
>

I don't see a change to
: if (unlikely(size < sizeof(struct clone_args)))
: return -EINVAL;

That seems backwards-incompatible, but I may miss some part..

--
Dmitry

2019-07-29 20:12:26

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH 1/2] fork: extend clone3() to support CLONE_SET_TID

On Mon, Jul 29, 2019 at 08:17:58PM +0100, Dmitry Safonov wrote:
> On Mon, 29 Jul 2019 at 20:12, Dmitry Safonov <[email protected]> wrote:
> >
> > On Mon, 29 Jul 2019 at 17:52, Adrian Reber <[email protected]> wrote:
> > [..]
> > > --- a/include/uapi/linux/sched.h
> > > +++ b/include/uapi/linux/sched.h
> > > @@ -32,6 +32,7 @@
> > > #define CLONE_NEWPID 0x20000000 /* New pid namespace */
> > > #define CLONE_NEWNET 0x40000000 /* New network namespace */
> > > #define CLONE_IO 0x80000000 /* Clone io context */
> > > +#define CLONE_SET_TID 0x100000000ULL /* set if the desired TID is set in set_tid */
> > >
> > > /*
> > > * Arguments for the clone3 syscall
> > > @@ -45,6 +46,7 @@ struct clone_args {
> > > __aligned_u64 stack;
> > > __aligned_u64 stack_size;
> > > __aligned_u64 tls;
> > > + __aligned_u64 set_tid;
> > > };
> > >
> >
> > I don't see a change to
> > : if (unlikely(size < sizeof(struct clone_args)))
> > : return -EINVAL;
> >
> > That seems backwards-incompatible, but I may miss some part..
>
> On the other hand, clone3() was merged in v5.3 window, so probably,
> it doesn't matter.

It would matter. Even if this were to be accepted and preferred over the
existing /proc/sys/kernel/ns_last_pid interface the earliest point in
time I'd consider this for would be 5.4. So I think this would need to
be changed. :)

Christian

2019-07-30 16:59:50

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 1/2] fork: extend clone3() to support CLONE_SET_TID

On 07/29, Adrian Reber wrote:
>
> @@ -186,12 +187,26 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
> pid_min = RESERVED_PIDS;
>
> + if (set_tid) {
> + if ((set_tid >= pid_max) || ((set_tid != 1) &&
> + (idr_get_cursor(&tmp->idr) <= 1))) {

I think the

(set_tid != 1) && idr_get_cursor() <= 1

check needs a comment...

> + spin_unlock_irq(&pidmap_lock);
> + retval = -EINVAL;
> + goto out_free;
> + }
> + min_p = set_tid;
> + max_p = set_tid + 1;
> + set_tid = 0;
> + } else {
> + min_p = pid_min;
> + max_p = pid_max;
> + }
> /*
> * Store a null pointer so find_pid_ns does not find
> * a partially initialized PID (see below).
> */
> - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> - pid_max, GFP_ATOMIC);
> + nr = idr_alloc_cyclic(&tmp->idr, NULL, min_p,
> + max_p, GFP_ATOMIC);

do we really want _cyclic() which updates idr->idr_next if set_tid?

perhaps idr_alloc() makes more sense?

Oleg.