2014-11-28 23:05:14

by Andy Lutomirski

[permalink] [raw]
Subject: [RFC PATCH] proc, pidns: Add highpid

Pid reuse is common, which means that it's difficult or impossible
to read information about a pid from /proc without races.

This introduces a second number associated with each (task, pidns)
pair called highpid. Highpid is a 64-bit number, and, barring
extremely unlikely circumstances or outright error, a (highpid, pid)
will never be reused.

With just this change, a program can open /proc/PID/status, read the
"Highpid" field, and confirm that it has the expected value. If the
pid has been reused, then highpid will be different.

The initial implementation is straightforward: highpid is simply a
64-bit counter. If a high-end system can fork every 3 ns (which
would be amazing, given that just allocating a pid requires at
atomic operation), it would take well over 1000 years for highpid to
wrap.

For CRIU's benefit, the next highpid can be set by a privileged
user.

NB: The sysctl stuff only works on 64-bit systems. If the approach
looks good, I'll fix that somehow.

Signed-off-by: Andy Lutomirski <[email protected]>
---

If this goes in, there's plenty of room to add new interfaces to
make this more useful. For example, we could add a fancier tgkill
that adds and validates hightgid and highpid, and we might want to
add a syscall to read one's own hightgid and highpid. These would
be quite useful for pidfiles.

David, would this be useful for kdbus?

CRIU people: will this be unduly difficult to support in CRIU?

If you all think this is a good idea, I'll fix the sysctl stuff and
document it. It might also be worth adding "Hightgid" to status.

fs/proc/array.c | 5 ++++-
include/linux/pid.h | 2 ++
include/linux/pid_namespace.h | 1 +
kernel/pid.c | 19 +++++++++++++++----
kernel/pid_namespace.c | 22 ++++++++++++++++++++++
5 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index cd3653e4f35c..f1e0e69d19f9 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -159,6 +159,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
int g;
struct fdtable *fdt = NULL;
const struct cred *cred;
+ const struct upid *upid;
pid_t ppid, tpid;

rcu_read_lock();
@@ -170,12 +171,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
if (tracer)
tpid = task_pid_nr_ns(tracer, ns);
}
+ upid = pid_upid_ns(pid, ns);
cred = get_task_cred(p);
seq_printf(m,
"State:\t%s\n"
"Tgid:\t%d\n"
"Ngid:\t%d\n"
"Pid:\t%d\n"
+ "Highpid:\t%llu\n"
"PPid:\t%d\n"
"TracerPid:\t%d\n"
"Uid:\t%d\t%d\t%d\t%d\n"
@@ -183,7 +186,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
get_task_state(p),
task_tgid_nr_ns(p, ns),
task_numa_group_id(p),
- pid_nr_ns(pid, ns),
+ upid ? upid->nr : 0, upid ? upid->highnr : 0,
ppid, tpid,
from_kuid_munged(user_ns, cred->uid),
from_kuid_munged(user_ns, cred->euid),
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 23705a53abba..ece70b64d04c 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -51,6 +51,7 @@ struct upid {
/* Try to keep pid_chain in the same cacheline as nr for find_vpid */
int nr;
struct pid_namespace *ns;
+ u64 highnr;
struct hlist_node pid_chain;
};

@@ -170,6 +171,7 @@ static inline pid_t pid_nr(struct pid *pid)
}

pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
+const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns);
pid_t pid_vnr(struct pid *pid);

#define do_each_pid_task(pid, type, task) \
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 1997ffc295a7..1f9f0455d7ef 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -26,6 +26,7 @@ struct pid_namespace {
struct rcu_head rcu;
int last_pid;
unsigned int nr_hashed;
+ atomic64_t next_highpid;
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
unsigned int level;
diff --git a/kernel/pid.c b/kernel/pid.c
index 9b9a26698144..863d11a9bfbf 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -312,6 +312,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)

pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
+ pid->numbers[i].highnr =
+ atomic64_inc_return(&tmp->next_highpid) - 1;
tmp = tmp->parent;
}

@@ -492,17 +494,26 @@ struct pid *find_get_pid(pid_t nr)
}
EXPORT_SYMBOL_GPL(find_get_pid);

-pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
+const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns)
{
struct upid *upid;
- pid_t nr = 0;

if (pid && ns->level <= pid->level) {
upid = &pid->numbers[ns->level];
if (upid->ns == ns)
- nr = upid->nr;
+ return upid;
}
- return nr;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(pid_upid_ns);
+
+pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
+{
+ const struct upid *upid = pid_upid_ns(pid, ns);
+
+ if (!upid)
+ return 0;
+ return upid->nr;
}
EXPORT_SYMBOL_GPL(pid_nr_ns);

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index db95d8eb761b..e246802b4361 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -268,6 +268,22 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
}

+static int pid_ns_next_highpid_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
+ struct ctl_table tmp = *table;
+
+ if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* This needs to be fixed. */
+ BUILD_BUG_ON(sizeof(u64) != sizeof(unsigned long));
+
+ tmp.data = &pid_ns->next_highpid;
+ return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+}
+
extern int pid_max;
static int zero = 0;
static struct ctl_table pid_ns_ctl_table[] = {
@@ -279,6 +295,12 @@ static struct ctl_table pid_ns_ctl_table[] = {
.extra1 = &zero,
.extra2 = &pid_max,
},
+ {
+ .procname = "ns_next_highpid",
+ .maxlen = sizeof(u64),
+ .mode = 0666, /* permissions are checked in the handler */
+ .proc_handler = pid_ns_next_highpid_handler,
+ },
{ }
};
static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
--
1.9.3


2014-11-28 23:06:33

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

[Adding CRIU people. Whoops.]

On Fri, Nov 28, 2014 at 3:05 PM, Andy Lutomirski <[email protected]> wrote:
> Pid reuse is common, which means that it's difficult or impossible
> to read information about a pid from /proc without races.
>
> This introduces a second number associated with each (task, pidns)
> pair called highpid. Highpid is a 64-bit number, and, barring
> extremely unlikely circumstances or outright error, a (highpid, pid)
> will never be reused.
>
> With just this change, a program can open /proc/PID/status, read the
> "Highpid" field, and confirm that it has the expected value. If the
> pid has been reused, then highpid will be different.
>
> The initial implementation is straightforward: highpid is simply a
> 64-bit counter. If a high-end system can fork every 3 ns (which
> would be amazing, given that just allocating a pid requires at
> atomic operation), it would take well over 1000 years for highpid to
> wrap.
>
> For CRIU's benefit, the next highpid can be set by a privileged
> user.
>
> NB: The sysctl stuff only works on 64-bit systems. If the approach
> looks good, I'll fix that somehow.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> If this goes in, there's plenty of room to add new interfaces to
> make this more useful. For example, we could add a fancier tgkill
> that adds and validates hightgid and highpid, and we might want to
> add a syscall to read one's own hightgid and highpid. These would
> be quite useful for pidfiles.
>
> David, would this be useful for kdbus?
>
> CRIU people: will this be unduly difficult to support in CRIU?
>
> If you all think this is a good idea, I'll fix the sysctl stuff and
> document it. It might also be worth adding "Hightgid" to status.
>
> fs/proc/array.c | 5 ++++-
> include/linux/pid.h | 2 ++
> include/linux/pid_namespace.h | 1 +
> kernel/pid.c | 19 +++++++++++++++----
> kernel/pid_namespace.c | 22 ++++++++++++++++++++++
> 5 files changed, 44 insertions(+), 5 deletions(-)
>
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index cd3653e4f35c..f1e0e69d19f9 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -159,6 +159,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
> int g;
> struct fdtable *fdt = NULL;
> const struct cred *cred;
> + const struct upid *upid;
> pid_t ppid, tpid;
>
> rcu_read_lock();
> @@ -170,12 +171,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
> if (tracer)
> tpid = task_pid_nr_ns(tracer, ns);
> }
> + upid = pid_upid_ns(pid, ns);
> cred = get_task_cred(p);
> seq_printf(m,
> "State:\t%s\n"
> "Tgid:\t%d\n"
> "Ngid:\t%d\n"
> "Pid:\t%d\n"
> + "Highpid:\t%llu\n"
> "PPid:\t%d\n"
> "TracerPid:\t%d\n"
> "Uid:\t%d\t%d\t%d\t%d\n"
> @@ -183,7 +186,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
> get_task_state(p),
> task_tgid_nr_ns(p, ns),
> task_numa_group_id(p),
> - pid_nr_ns(pid, ns),
> + upid ? upid->nr : 0, upid ? upid->highnr : 0,
> ppid, tpid,
> from_kuid_munged(user_ns, cred->uid),
> from_kuid_munged(user_ns, cred->euid),
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 23705a53abba..ece70b64d04c 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -51,6 +51,7 @@ struct upid {
> /* Try to keep pid_chain in the same cacheline as nr for find_vpid */
> int nr;
> struct pid_namespace *ns;
> + u64 highnr;
> struct hlist_node pid_chain;
> };
>
> @@ -170,6 +171,7 @@ static inline pid_t pid_nr(struct pid *pid)
> }
>
> pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
> +const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns);
> pid_t pid_vnr(struct pid *pid);
>
> #define do_each_pid_task(pid, type, task) \
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index 1997ffc295a7..1f9f0455d7ef 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -26,6 +26,7 @@ struct pid_namespace {
> struct rcu_head rcu;
> int last_pid;
> unsigned int nr_hashed;
> + atomic64_t next_highpid;
> struct task_struct *child_reaper;
> struct kmem_cache *pid_cachep;
> unsigned int level;
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 9b9a26698144..863d11a9bfbf 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -312,6 +312,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>
> pid->numbers[i].nr = nr;
> pid->numbers[i].ns = tmp;
> + pid->numbers[i].highnr =
> + atomic64_inc_return(&tmp->next_highpid) - 1;
> tmp = tmp->parent;
> }
>
> @@ -492,17 +494,26 @@ struct pid *find_get_pid(pid_t nr)
> }
> EXPORT_SYMBOL_GPL(find_get_pid);
>
> -pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
> +const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns)
> {
> struct upid *upid;
> - pid_t nr = 0;
>
> if (pid && ns->level <= pid->level) {
> upid = &pid->numbers[ns->level];
> if (upid->ns == ns)
> - nr = upid->nr;
> + return upid;
> }
> - return nr;
> + return NULL;
> +}
> +EXPORT_SYMBOL_GPL(pid_upid_ns);
> +
> +pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
> +{
> + const struct upid *upid = pid_upid_ns(pid, ns);
> +
> + if (!upid)
> + return 0;
> + return upid->nr;
> }
> EXPORT_SYMBOL_GPL(pid_nr_ns);
>
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index db95d8eb761b..e246802b4361 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -268,6 +268,22 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
> return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
> }
>
> +static int pid_ns_next_highpid_handler(struct ctl_table *table, int write,
> + void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
> + struct ctl_table tmp = *table;
> +
> + if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
> + return -EPERM;
> +
> + /* This needs to be fixed. */
> + BUILD_BUG_ON(sizeof(u64) != sizeof(unsigned long));
> +
> + tmp.data = &pid_ns->next_highpid;
> + return proc_dointvec(&tmp, write, buffer, lenp, ppos);
> +}
> +
> extern int pid_max;
> static int zero = 0;
> static struct ctl_table pid_ns_ctl_table[] = {
> @@ -279,6 +295,12 @@ static struct ctl_table pid_ns_ctl_table[] = {
> .extra1 = &zero,
> .extra2 = &pid_max,
> },
> + {
> + .procname = "ns_next_highpid",
> + .maxlen = sizeof(u64),
> + .mode = 0666, /* permissions are checked in the handler */
> + .proc_handler = pid_ns_next_highpid_handler,
> + },
> { }
> };
> static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
> --
> 1.9.3
>



--
Andy Lutomirski
AMA Capital Management, LLC

2014-11-29 03:35:34

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

Andy Lutomirski <[email protected]> writes:

> Pid reuse is common, which means that it's difficult or impossible
> to read information about a pid from /proc without races.

Sigh.

What we need are not race free pids, but a file descriptor based process
management api. Possibly one that starts by handing you a proc
directory.

Which probably means that we need a proc file we can write to and send
signals to a process, and another proc file we can select on and wait
for the process to exit.

Making pids bigger just looks like bandaid.

Remember evovle things in the direction of an object capability system
things wind up being more maintainable.

Eric

2014-11-29 04:24:32

by Greg KH

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

On Fri, Nov 28, 2014 at 03:05:01PM -0800, Andy Lutomirski wrote:
> Pid reuse is common, which means that it's difficult or impossible
> to read information about a pid from /proc without races.
>
> This introduces a second number associated with each (task, pidns)
> pair called highpid. Highpid is a 64-bit number, and, barring
> extremely unlikely circumstances or outright error, a (highpid, pid)
> will never be reused.
>
> With just this change, a program can open /proc/PID/status, read the
> "Highpid" field, and confirm that it has the expected value. If the
> pid has been reused, then highpid will be different.
>
> The initial implementation is straightforward: highpid is simply a
> 64-bit counter. If a high-end system can fork every 3 ns (which
> would be amazing, given that just allocating a pid requires at
> atomic operation), it would take well over 1000 years for highpid to
> wrap.
>
> For CRIU's benefit, the next highpid can be set by a privileged
> user.
>
> NB: The sysctl stuff only works on 64-bit systems. If the approach
> looks good, I'll fix that somehow.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> If this goes in, there's plenty of room to add new interfaces to
> make this more useful. For example, we could add a fancier tgkill
> that adds and validates hightgid and highpid, and we might want to
> add a syscall to read one's own hightgid and highpid. These would
> be quite useful for pidfiles.
>
> David, would this be useful for kdbus?
>
> CRIU people: will this be unduly difficult to support in CRIU?
>
> If you all think this is a good idea, I'll fix the sysctl stuff and
> document it. It might also be worth adding "Hightgid" to status.
>
> fs/proc/array.c | 5 ++++-
> include/linux/pid.h | 2 ++
> include/linux/pid_namespace.h | 1 +
> kernel/pid.c | 19 +++++++++++++++----
> kernel/pid_namespace.c | 22 ++++++++++++++++++++++
> 5 files changed, 44 insertions(+), 5 deletions(-)
>
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index cd3653e4f35c..f1e0e69d19f9 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -159,6 +159,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
> int g;
> struct fdtable *fdt = NULL;
> const struct cred *cred;
> + const struct upid *upid;
> pid_t ppid, tpid;
>
> rcu_read_lock();
> @@ -170,12 +171,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
> if (tracer)
> tpid = task_pid_nr_ns(tracer, ns);
> }
> + upid = pid_upid_ns(pid, ns);
> cred = get_task_cred(p);
> seq_printf(m,
> "State:\t%s\n"
> "Tgid:\t%d\n"
> "Ngid:\t%d\n"
> "Pid:\t%d\n"
> + "Highpid:\t%llu\n"
> "PPid:\t%d\n"
> "TracerPid:\t%d\n"
> "Uid:\t%d\t%d\t%d\t%d\n"

Changing existing proc files like this is dangerous and can cause lots
of breakage in userspace programs if you are not careful. Usually
adding fields to the end of the file is best, but sometimes even then
things can break :(

2014-11-29 15:19:34

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

On Nov 28, 2014 9:24 PM, "Greg KH" <[email protected]> wrote:
>
> On Fri, Nov 28, 2014 at 03:05:01PM -0800, Andy Lutomirski wrote:
> > Pid reuse is common, which means that it's difficult or impossible
> > to read information about a pid from /proc without races.
> >
> > This introduces a second number associated with each (task, pidns)
> > pair called highpid. Highpid is a 64-bit number, and, barring
> > extremely unlikely circumstances or outright error, a (highpid, pid)
> > will never be reused.
> >
> > With just this change, a program can open /proc/PID/status, read the
> > "Highpid" field, and confirm that it has the expected value. If the
> > pid has been reused, then highpid will be different.
> >
> > The initial implementation is straightforward: highpid is simply a
> > 64-bit counter. If a high-end system can fork every 3 ns (which
> > would be amazing, given that just allocating a pid requires at
> > atomic operation), it would take well over 1000 years for highpid to
> > wrap.
> >
> > For CRIU's benefit, the next highpid can be set by a privileged
> > user.
> >
> > NB: The sysctl stuff only works on 64-bit systems. If the approach
> > looks good, I'll fix that somehow.
> >
> > Signed-off-by: Andy Lutomirski <[email protected]>
> > ---
> >
> > If this goes in, there's plenty of room to add new interfaces to
> > make this more useful. For example, we could add a fancier tgkill
> > that adds and validates hightgid and highpid, and we might want to
> > add a syscall to read one's own hightgid and highpid. These would
> > be quite useful for pidfiles.
> >
> > David, would this be useful for kdbus?
> >
> > CRIU people: will this be unduly difficult to support in CRIU?
> >
> > If you all think this is a good idea, I'll fix the sysctl stuff and
> > document it. It might also be worth adding "Hightgid" to status.
> >
> > fs/proc/array.c | 5 ++++-
> > include/linux/pid.h | 2 ++
> > include/linux/pid_namespace.h | 1 +
> > kernel/pid.c | 19 +++++++++++++++----
> > kernel/pid_namespace.c | 22 ++++++++++++++++++++++
> > 5 files changed, 44 insertions(+), 5 deletions(-)
> >
> > diff --git a/fs/proc/array.c b/fs/proc/array.c
> > index cd3653e4f35c..f1e0e69d19f9 100644
> > --- a/fs/proc/array.c
> > +++ b/fs/proc/array.c
> > @@ -159,6 +159,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
> > int g;
> > struct fdtable *fdt = NULL;
> > const struct cred *cred;
> > + const struct upid *upid;
> > pid_t ppid, tpid;
> >
> > rcu_read_lock();
> > @@ -170,12 +171,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
> > if (tracer)
> > tpid = task_pid_nr_ns(tracer, ns);
> > }
> > + upid = pid_upid_ns(pid, ns);
> > cred = get_task_cred(p);
> > seq_printf(m,
> > "State:\t%s\n"
> > "Tgid:\t%d\n"
> > "Ngid:\t%d\n"
> > "Pid:\t%d\n"
> > + "Highpid:\t%llu\n"
> > "PPid:\t%d\n"
> > "TracerPid:\t%d\n"
> > "Uid:\t%d\t%d\t%d\t%d\n"
>
> Changing existing proc files like this is dangerous and can cause lots
> of breakage in userspace programs if you are not careful. Usually
> adding fields to the end of the file is best, but sometimes even then
> things can break :(

Point taken. I had hoped that everything would parse /proc/PID/status
by looking for the lamed headers. I'll see what the history of new
fields in there is, but I can change this easily enough.

--Andy

> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2014-11-29 15:32:44

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

On Fri, Nov 28, 2014 at 7:34 PM, Eric W. Biederman
<[email protected]> wrote:
> Andy Lutomirski <[email protected]> writes:
>
>> Pid reuse is common, which means that it's difficult or impossible
>> to read information about a pid from /proc without races.
>
> Sigh.
>
> What we need are not race free pids, but a file descriptor based process
> management api. Possibly one that starts by handing you a proc
> directory.

I agree completely, and the Capsicum stuff from FreeBSD would probably
be a very good start here.

>
> Which probably means that we need a proc file we can write to and send
> signals to a process, and another proc file we can select on and wait
> for the process to exit.

That too. In fact, I have an old patch that went nowhere that makes
the proc directory fd itself pollable.

>
> Making pids bigger just looks like bandaid.
>
> Remember evovle things in the direction of an object capability system
> things wind up being more maintainable.

Yes, but this really is intended to be a much better bandaid than what
people currently use.

I'm aiming this at two main use cases that aren't going to switch to
using fds any time soon. One is shell stuff and PID files. If we can
put something like "12345@81726162" in /run/lock/foo.pid and safely
kill `cat /run/lock/foo.pid`, that would be nice (even though sensible
users don't use pid files any more, there are *tons* of things that
still use them). This could also be used for diagnostics. Suppose
the kernel log indicates a misbehaving pid. There's currently no
race-free way to poke at those pids.

The much more pressing issue is that there are apparently programs
that think that checking the process's starttime is a good idea for
avoiding PID reuse races. Both kdbus v1 and v2 do this, hopefully
only for diagnostics. This is, IMO, completely unacceptable, but I
really don't expect kdbus to start passing even more fds around when
they'll be ignored most of the time. So, if kdbus is going to be
merged at some point, I'd like to give it the opportunity to name the
sender of a message in a way that is only mildly dangerous (in non
race-related ways) as opposed to being totally broken.

Now if someone wants to implement real light-weight capability-ish fds
in Linux, that would be neat. IIRC someone tried several years ago
using fds with some high bits set, but it never went anywhere.

FWIW, given that I seem to be the most vocal reviewer of the kdbus
patches, I feel like it's nice to offer some assistance in a piece of
the kdbus stuff that I think really would benefit from a new kernel
feature.

--Andy

2014-11-29 16:07:05

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

On Fri, Nov 28, 2014 at 03:05:01PM -0800, Andy Lutomirski wrote:
> Pid reuse is common, which means that it's difficult or impossible
> to read information about a pid from /proc without races.
>
> This introduces a second number associated with each (task, pidns)
> pair called highpid. Highpid is a 64-bit number, and, barring
> extremely unlikely circumstances or outright error, a (highpid, pid)
> will never be reused.
>
> With just this change, a program can open /proc/PID/status, read the
> "Highpid" field, and confirm that it has the expected value. If the
> pid has been reused, then highpid will be different.
>
> The initial implementation is straightforward: highpid is simply a
> 64-bit counter. If a high-end system can fork every 3 ns (which
> would be amazing, given that just allocating a pid requires at
> atomic operation), it would take well over 1000 years for highpid to
> wrap.
>
> For CRIU's benefit, the next highpid can be set by a privileged
> user.
>
> NB: The sysctl stuff only works on 64-bit systems. If the approach
> looks good, I'll fix that somehow.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> If this goes in, there's plenty of room to add new interfaces to
> make this more useful. For example, we could add a fancier tgkill
> that adds and validates hightgid and highpid, and we might want to
> add a syscall to read one's own hightgid and highpid. These would
> be quite useful for pidfiles.
>
> David, would this be useful for kdbus?
>
> CRIU people: will this be unduly difficult to support in CRIU?

Hi Andy. I think it won't be hard to support.

2014-11-30 09:08:34

by Florian Weimer

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

* Andy Lutomirski:

> The initial implementation is straightforward: highpid is simply a
> 64-bit counter. If a high-end system can fork every 3 ns (which
> would be amazing, given that just allocating a pid requires at
> atomic operation), it would take well over 1000 years for highpid to
> wrap.

I'm not sure if I'm reading the patch correctly, but is the counter
namespaced? If yes, why?

2014-11-30 16:45:34

by David Herrmann

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

Hi Andy

On Sat, Nov 29, 2014 at 12:05 AM, Andy Lutomirski <[email protected]> wrote:
> Pid reuse is common, which means that it's difficult or impossible
> to read information about a pid from /proc without races.
>
> This introduces a second number associated with each (task, pidns)
> pair called highpid. Highpid is a 64-bit number, and, barring
> extremely unlikely circumstances or outright error, a (highpid, pid)
> will never be reused.
>
> With just this change, a program can open /proc/PID/status, read the
> "Highpid" field, and confirm that it has the expected value. If the
> pid has been reused, then highpid will be different.
>
> The initial implementation is straightforward: highpid is simply a
> 64-bit counter. If a high-end system can fork every 3 ns (which
> would be amazing, given that just allocating a pid requires at
> atomic operation), it would take well over 1000 years for highpid to
> wrap.
>
> For CRIU's benefit, the next highpid can be set by a privileged
> user.
>
> NB: The sysctl stuff only works on 64-bit systems. If the approach
> looks good, I'll fix that somehow.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> If this goes in, there's plenty of room to add new interfaces to
> make this more useful. For example, we could add a fancier tgkill
> that adds and validates hightgid and highpid, and we might want to
> add a syscall to read one's own hightgid and highpid. These would
> be quite useful for pidfiles.
>
> David, would this be useful for kdbus?

Much appreciated! This would serve well as replacement for
'starttime'. I'd prefer if pid_t was 64bit, but I guess that ship
sailed long ago. Though, your patch might in the end just introduce a
new pid64, which replaces the old pid and lives in parallel.

Anyway, considering that we actually want the same pid-reuse
protection for tid, tgid, ppid and so on, we'd have to add a
'starttime' for all of them. Sounds ugly.. so we might just end up
dropping 'starttime' and introduce KDBUS_ITEM_PIDS2 whenever a 'fix'
is merged upstream.

Thanks a lot!
David

2014-11-30 22:06:13

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

On Nov 30, 2014 9:45 AM, "David Herrmann" <[email protected]> wrote:
>
> Hi Andy
>
> On Sat, Nov 29, 2014 at 12:05 AM, Andy Lutomirski <[email protected]> wrote:
> > Pid reuse is common, which means that it's difficult or impossible
> > to read information about a pid from /proc without races.
> >
> > This introduces a second number associated with each (task, pidns)
> > pair called highpid. Highpid is a 64-bit number, and, barring
> > extremely unlikely circumstances or outright error, a (highpid, pid)
> > will never be reused.
> >
> > With just this change, a program can open /proc/PID/status, read the
> > "Highpid" field, and confirm that it has the expected value. If the
> > pid has been reused, then highpid will be different.
> >
> > The initial implementation is straightforward: highpid is simply a
> > 64-bit counter. If a high-end system can fork every 3 ns (which
> > would be amazing, given that just allocating a pid requires at
> > atomic operation), it would take well over 1000 years for highpid to
> > wrap.
> >
> > For CRIU's benefit, the next highpid can be set by a privileged
> > user.
> >
> > NB: The sysctl stuff only works on 64-bit systems. If the approach
> > looks good, I'll fix that somehow.
> >
> > Signed-off-by: Andy Lutomirski <[email protected]>
> > ---
> >
> > If this goes in, there's plenty of room to add new interfaces to
> > make this more useful. For example, we could add a fancier tgkill
> > that adds and validates hightgid and highpid, and we might want to
> > add a syscall to read one's own hightgid and highpid. These would
> > be quite useful for pidfiles.
> >
> > David, would this be useful for kdbus?
>
> Much appreciated! This would serve well as replacement for
> 'starttime'. I'd prefer if pid_t was 64bit, but I guess that ship
> sailed long ago. Though, your patch might in the end just introduce a
> new pid64, which replaces the old pid and lives in parallel.
>
> Anyway, considering that we actually want the same pid-reuse
> protection for tid, tgid, ppid and so on, we'd have to add a
> 'starttime' for all of them. Sounds ugly.. so we might just end up
> dropping 'starttime' and introduce KDBUS_ITEM_PIDS2 whenever a 'fix'
> is merged upstream.

I don't understand "we want". Doesn't kdbus currently only think
about tid and tgid?

In any event, I just looked at the kdbus code
(https://github.com/gregkh/kdbus/blob/master/metadata.c), and I see:

meta->pid = get_pid(task_pid(current));
meta->tgid = get_pid(task_tgid(current));
meta->starttime = current->start_time,

I don't understand how that concept of starttime is particularly
useful. Isn't that the start time associated with tid? How can that
be useful when looking up tgid?

Regardless, my patch here associates a highpid with each struct pid,
and both tid and tgid are just struct pids, so you get both. I agree
that adding Highppid to /proc might be useful, but that seems like a
future extension that has nothing to do with kdbus.

--Andy

2014-11-30 22:08:37

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

On Nov 30, 2014 1:47 AM, "Florian Weimer" <[email protected]> wrote:
>
> * Andy Lutomirski:
>
> > The initial implementation is straightforward: highpid is simply a
> > 64-bit counter. If a high-end system can fork every 3 ns (which
> > would be amazing, given that just allocating a pid requires at
> > atomic operation), it would take well over 1000 years for highpid to
> > wrap.
>
> I'm not sure if I'm reading the patch correctly, but is the counter
> namespaced? If yes, why?

It's namespaced so that CRIU can migrate/restore a whole pid namespace.

--Andy

2014-12-01 06:47:41

by Florian Weimer

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

* Andy Lutomirski:

> On Nov 30, 2014 1:47 AM, "Florian Weimer" <[email protected]> wrote:
>>
>> * Andy Lutomirski:
>>
>> > The initial implementation is straightforward: highpid is simply a
>> > 64-bit counter. If a high-end system can fork every 3 ns (which
>> > would be amazing, given that just allocating a pid requires at
>> > atomic operation), it would take well over 1000 years for highpid to
>> > wrap.
>>
>> I'm not sure if I'm reading the patch correctly, but is the counter
>> namespaced? If yes, why?
>
> It's namespaced so that CRIU can migrate/restore a whole pid namespace.

Oh well, this requirement is at odds with system-wide uniqueness. Is
CRIU really that important? :-)

2014-12-01 07:03:52

by Konstantin Khlebnikov

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

Hmm. What about per-task/thread UUID? exported via separate file: /proc/PID/uuid
It could be created at the first access, thus this wouldn't shlowdown clone().
Also it could be droped at execve(), so it'll describe execution
context more precisely than pid.

On Sat, Nov 29, 2014 at 2:05 AM, Andy Lutomirski <[email protected]> wrote:
> Pid reuse is common, which means that it's difficult or impossible
> to read information about a pid from /proc without races.
>
> This introduces a second number associated with each (task, pidns)
> pair called highpid. Highpid is a 64-bit number, and, barring
> extremely unlikely circumstances or outright error, a (highpid, pid)
> will never be reused.
>
> With just this change, a program can open /proc/PID/status, read the
> "Highpid" field, and confirm that it has the expected value. If the
> pid has been reused, then highpid will be different.
>
> The initial implementation is straightforward: highpid is simply a
> 64-bit counter. If a high-end system can fork every 3 ns (which
> would be amazing, given that just allocating a pid requires at
> atomic operation), it would take well over 1000 years for highpid to
> wrap.
>
> For CRIU's benefit, the next highpid can be set by a privileged
> user.
>
> NB: The sysctl stuff only works on 64-bit systems. If the approach
> looks good, I'll fix that somehow.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> If this goes in, there's plenty of room to add new interfaces to
> make this more useful. For example, we could add a fancier tgkill
> that adds and validates hightgid and highpid, and we might want to
> add a syscall to read one's own hightgid and highpid. These would
> be quite useful for pidfiles.
>
> David, would this be useful for kdbus?
>
> CRIU people: will this be unduly difficult to support in CRIU?
>
> If you all think this is a good idea, I'll fix the sysctl stuff and
> document it. It might also be worth adding "Hightgid" to status.
>
> fs/proc/array.c | 5 ++++-
> include/linux/pid.h | 2 ++
> include/linux/pid_namespace.h | 1 +
> kernel/pid.c | 19 +++++++++++++++----
> kernel/pid_namespace.c | 22 ++++++++++++++++++++++
> 5 files changed, 44 insertions(+), 5 deletions(-)
>
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index cd3653e4f35c..f1e0e69d19f9 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -159,6 +159,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
> int g;
> struct fdtable *fdt = NULL;
> const struct cred *cred;
> + const struct upid *upid;
> pid_t ppid, tpid;
>
> rcu_read_lock();
> @@ -170,12 +171,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
> if (tracer)
> tpid = task_pid_nr_ns(tracer, ns);
> }
> + upid = pid_upid_ns(pid, ns);
> cred = get_task_cred(p);
> seq_printf(m,
> "State:\t%s\n"
> "Tgid:\t%d\n"
> "Ngid:\t%d\n"
> "Pid:\t%d\n"
> + "Highpid:\t%llu\n"
> "PPid:\t%d\n"
> "TracerPid:\t%d\n"
> "Uid:\t%d\t%d\t%d\t%d\n"
> @@ -183,7 +186,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
> get_task_state(p),
> task_tgid_nr_ns(p, ns),
> task_numa_group_id(p),
> - pid_nr_ns(pid, ns),
> + upid ? upid->nr : 0, upid ? upid->highnr : 0,
> ppid, tpid,
> from_kuid_munged(user_ns, cred->uid),
> from_kuid_munged(user_ns, cred->euid),
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 23705a53abba..ece70b64d04c 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -51,6 +51,7 @@ struct upid {
> /* Try to keep pid_chain in the same cacheline as nr for find_vpid */
> int nr;
> struct pid_namespace *ns;
> + u64 highnr;
> struct hlist_node pid_chain;
> };
>
> @@ -170,6 +171,7 @@ static inline pid_t pid_nr(struct pid *pid)
> }
>
> pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
> +const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns);
> pid_t pid_vnr(struct pid *pid);
>
> #define do_each_pid_task(pid, type, task) \
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index 1997ffc295a7..1f9f0455d7ef 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -26,6 +26,7 @@ struct pid_namespace {
> struct rcu_head rcu;
> int last_pid;
> unsigned int nr_hashed;
> + atomic64_t next_highpid;
> struct task_struct *child_reaper;
> struct kmem_cache *pid_cachep;
> unsigned int level;
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 9b9a26698144..863d11a9bfbf 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -312,6 +312,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>
> pid->numbers[i].nr = nr;
> pid->numbers[i].ns = tmp;
> + pid->numbers[i].highnr =
> + atomic64_inc_return(&tmp->next_highpid) - 1;
> tmp = tmp->parent;
> }
>
> @@ -492,17 +494,26 @@ struct pid *find_get_pid(pid_t nr)
> }
> EXPORT_SYMBOL_GPL(find_get_pid);
>
> -pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
> +const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns)
> {
> struct upid *upid;
> - pid_t nr = 0;
>
> if (pid && ns->level <= pid->level) {
> upid = &pid->numbers[ns->level];
> if (upid->ns == ns)
> - nr = upid->nr;
> + return upid;
> }
> - return nr;
> + return NULL;
> +}
> +EXPORT_SYMBOL_GPL(pid_upid_ns);
> +
> +pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
> +{
> + const struct upid *upid = pid_upid_ns(pid, ns);
> +
> + if (!upid)
> + return 0;
> + return upid->nr;
> }
> EXPORT_SYMBOL_GPL(pid_nr_ns);
>
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index db95d8eb761b..e246802b4361 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -268,6 +268,22 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
> return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
> }
>
> +static int pid_ns_next_highpid_handler(struct ctl_table *table, int write,
> + void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
> + struct ctl_table tmp = *table;
> +
> + if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
> + return -EPERM;
> +
> + /* This needs to be fixed. */
> + BUILD_BUG_ON(sizeof(u64) != sizeof(unsigned long));
> +
> + tmp.data = &pid_ns->next_highpid;
> + return proc_dointvec(&tmp, write, buffer, lenp, ppos);
> +}
> +
> extern int pid_max;
> static int zero = 0;
> static struct ctl_table pid_ns_ctl_table[] = {
> @@ -279,6 +295,12 @@ static struct ctl_table pid_ns_ctl_table[] = {
> .extra1 = &zero,
> .extra2 = &pid_max,
> },
> + {
> + .procname = "ns_next_highpid",
> + .maxlen = sizeof(u64),
> + .mode = 0666, /* permissions are checked in the handler */
> + .proc_handler = pid_ns_next_highpid_handler,
> + },
> { }
> };
> static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
> --
> 1.9.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2014-12-01 12:34:00

by Pavel Emelyanov

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

On 12/01/2014 09:47 AM, Florian Weimer wrote:
> * Andy Lutomirski:
>
>> On Nov 30, 2014 1:47 AM, "Florian Weimer" <[email protected]> wrote:
>>>
>>> * Andy Lutomirski:
>>>
>>>> The initial implementation is straightforward: highpid is simply a
>>>> 64-bit counter. If a high-end system can fork every 3 ns (which
>>>> would be amazing, given that just allocating a pid requires at
>>>> atomic operation), it would take well over 1000 years for highpid to
>>>> wrap.
>>>
>>> I'm not sure if I'm reading the patch correctly, but is the counter
>>> namespaced? If yes, why?
>>
>> It's namespaced so that CRIU can migrate/restore a whole pid namespace.
>
> Oh well, this requirement is at odds with system-wide uniqueness. Is
> CRIU really that important? :-)

Well, in this context it is. Since the main (if not the only) use-case for
highpid is to read one, remember, then compare to new value, restoring it
to wrong/arbitrary value will break the using applications in 100% cases.

Thus we really need the ability to restore this value.

Thanks,
Pavel

2014-12-01 16:21:49

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

On Sun, Nov 30, 2014 at 11:03 PM, Konstantin Khlebnikov
<[email protected]> wrote:
> Hmm. What about per-task/thread UUID? exported via separate file: /proc/PID/uuid
> It could be created at the first access, thus this wouldn't shlowdown clone().
> Also it could be droped at execve(), so it'll describe execution
> context more precisely than pid.
>

I'd be all for this if it weren't for two issues:

1. This thing needs to work as soon as init is started, and we can't
reliably generate random numbers that early.

2. Migration / CRIU is rather fundamentally at odds with making
anything universally unique, as opposed to just per-user-ns.

So, given that I don't think we can actually provide a universally
unique pid-like thing, I'd rather not even try.

That being said, I want to rework this a little bit. I think that
highpid should be restricted to the range 2^32 through 2^64 - 4096.
The former prevents anyone from confusing highpid with regular pid,
and the latter means that we don't need to worry about confusion
between errors and valid highpids (e.g. -1 will never be a highpid).

Implementing that will be only mildly annoying.

--Andy

> On Sat, Nov 29, 2014 at 2:05 AM, Andy Lutomirski <[email protected]> wrote:
>> Pid reuse is common, which means that it's difficult or impossible
>> to read information about a pid from /proc without races.
>>
>> This introduces a second number associated with each (task, pidns)
>> pair called highpid. Highpid is a 64-bit number, and, barring
>> extremely unlikely circumstances or outright error, a (highpid, pid)
>> will never be reused.
>>
>> With just this change, a program can open /proc/PID/status, read the
>> "Highpid" field, and confirm that it has the expected value. If the
>> pid has been reused, then highpid will be different.
>>
>> The initial implementation is straightforward: highpid is simply a
>> 64-bit counter. If a high-end system can fork every 3 ns (which
>> would be amazing, given that just allocating a pid requires at
>> atomic operation), it would take well over 1000 years for highpid to
>> wrap.
>>
>> For CRIU's benefit, the next highpid can be set by a privileged
>> user.
>>
>> NB: The sysctl stuff only works on 64-bit systems. If the approach
>> looks good, I'll fix that somehow.
>>
>> Signed-off-by: Andy Lutomirski <[email protected]>
>> ---
>>
>> If this goes in, there's plenty of room to add new interfaces to
>> make this more useful. For example, we could add a fancier tgkill
>> that adds and validates hightgid and highpid, and we might want to
>> add a syscall to read one's own hightgid and highpid. These would
>> be quite useful for pidfiles.
>>
>> David, would this be useful for kdbus?
>>
>> CRIU people: will this be unduly difficult to support in CRIU?
>>
>> If you all think this is a good idea, I'll fix the sysctl stuff and
>> document it. It might also be worth adding "Hightgid" to status.
>>
>> fs/proc/array.c | 5 ++++-
>> include/linux/pid.h | 2 ++
>> include/linux/pid_namespace.h | 1 +
>> kernel/pid.c | 19 +++++++++++++++----
>> kernel/pid_namespace.c | 22 ++++++++++++++++++++++
>> 5 files changed, 44 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/proc/array.c b/fs/proc/array.c
>> index cd3653e4f35c..f1e0e69d19f9 100644
>> --- a/fs/proc/array.c
>> +++ b/fs/proc/array.c
>> @@ -159,6 +159,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
>> int g;
>> struct fdtable *fdt = NULL;
>> const struct cred *cred;
>> + const struct upid *upid;
>> pid_t ppid, tpid;
>>
>> rcu_read_lock();
>> @@ -170,12 +171,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
>> if (tracer)
>> tpid = task_pid_nr_ns(tracer, ns);
>> }
>> + upid = pid_upid_ns(pid, ns);
>> cred = get_task_cred(p);
>> seq_printf(m,
>> "State:\t%s\n"
>> "Tgid:\t%d\n"
>> "Ngid:\t%d\n"
>> "Pid:\t%d\n"
>> + "Highpid:\t%llu\n"
>> "PPid:\t%d\n"
>> "TracerPid:\t%d\n"
>> "Uid:\t%d\t%d\t%d\t%d\n"
>> @@ -183,7 +186,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
>> get_task_state(p),
>> task_tgid_nr_ns(p, ns),
>> task_numa_group_id(p),
>> - pid_nr_ns(pid, ns),
>> + upid ? upid->nr : 0, upid ? upid->highnr : 0,
>> ppid, tpid,
>> from_kuid_munged(user_ns, cred->uid),
>> from_kuid_munged(user_ns, cred->euid),
>> diff --git a/include/linux/pid.h b/include/linux/pid.h
>> index 23705a53abba..ece70b64d04c 100644
>> --- a/include/linux/pid.h
>> +++ b/include/linux/pid.h
>> @@ -51,6 +51,7 @@ struct upid {
>> /* Try to keep pid_chain in the same cacheline as nr for find_vpid */
>> int nr;
>> struct pid_namespace *ns;
>> + u64 highnr;
>> struct hlist_node pid_chain;
>> };
>>
>> @@ -170,6 +171,7 @@ static inline pid_t pid_nr(struct pid *pid)
>> }
>>
>> pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
>> +const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns);
>> pid_t pid_vnr(struct pid *pid);
>>
>> #define do_each_pid_task(pid, type, task) \
>> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
>> index 1997ffc295a7..1f9f0455d7ef 100644
>> --- a/include/linux/pid_namespace.h
>> +++ b/include/linux/pid_namespace.h
>> @@ -26,6 +26,7 @@ struct pid_namespace {
>> struct rcu_head rcu;
>> int last_pid;
>> unsigned int nr_hashed;
>> + atomic64_t next_highpid;
>> struct task_struct *child_reaper;
>> struct kmem_cache *pid_cachep;
>> unsigned int level;
>> diff --git a/kernel/pid.c b/kernel/pid.c
>> index 9b9a26698144..863d11a9bfbf 100644
>> --- a/kernel/pid.c
>> +++ b/kernel/pid.c
>> @@ -312,6 +312,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>>
>> pid->numbers[i].nr = nr;
>> pid->numbers[i].ns = tmp;
>> + pid->numbers[i].highnr =
>> + atomic64_inc_return(&tmp->next_highpid) - 1;
>> tmp = tmp->parent;
>> }
>>
>> @@ -492,17 +494,26 @@ struct pid *find_get_pid(pid_t nr)
>> }
>> EXPORT_SYMBOL_GPL(find_get_pid);
>>
>> -pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
>> +const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns)
>> {
>> struct upid *upid;
>> - pid_t nr = 0;
>>
>> if (pid && ns->level <= pid->level) {
>> upid = &pid->numbers[ns->level];
>> if (upid->ns == ns)
>> - nr = upid->nr;
>> + return upid;
>> }
>> - return nr;
>> + return NULL;
>> +}
>> +EXPORT_SYMBOL_GPL(pid_upid_ns);
>> +
>> +pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
>> +{
>> + const struct upid *upid = pid_upid_ns(pid, ns);
>> +
>> + if (!upid)
>> + return 0;
>> + return upid->nr;
>> }
>> EXPORT_SYMBOL_GPL(pid_nr_ns);
>>
>> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
>> index db95d8eb761b..e246802b4361 100644
>> --- a/kernel/pid_namespace.c
>> +++ b/kernel/pid_namespace.c
>> @@ -268,6 +268,22 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
>> return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
>> }
>>
>> +static int pid_ns_next_highpid_handler(struct ctl_table *table, int write,
>> + void __user *buffer, size_t *lenp, loff_t *ppos)
>> +{
>> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
>> + struct ctl_table tmp = *table;
>> +
>> + if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
>> + return -EPERM;
>> +
>> + /* This needs to be fixed. */
>> + BUILD_BUG_ON(sizeof(u64) != sizeof(unsigned long));
>> +
>> + tmp.data = &pid_ns->next_highpid;
>> + return proc_dointvec(&tmp, write, buffer, lenp, ppos);
>> +}
>> +
>> extern int pid_max;
>> static int zero = 0;
>> static struct ctl_table pid_ns_ctl_table[] = {
>> @@ -279,6 +295,12 @@ static struct ctl_table pid_ns_ctl_table[] = {
>> .extra1 = &zero,
>> .extra2 = &pid_max,
>> },
>> + {
>> + .procname = "ns_next_highpid",
>> + .maxlen = sizeof(u64),
>> + .mode = 0666, /* permissions are checked in the handler */
>> + .proc_handler = pid_ns_next_highpid_handler,
>> + },
>> { }
>> };
>> static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
>> --
>> 1.9.3
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/



--
Andy Lutomirski
AMA Capital Management, LLC

2014-12-01 16:39:15

by Konstantin Khlebnikov

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

On Mon, Dec 1, 2014 at 7:21 PM, Andy Lutomirski <[email protected]> wrote:
> On Sun, Nov 30, 2014 at 11:03 PM, Konstantin Khlebnikov
> <[email protected]> wrote:
>> Hmm. What about per-task/thread UUID? exported via separate file: /proc/PID/uuid
>> It could be created at the first access, thus this wouldn't shlowdown clone().
>> Also it could be droped at execve(), so it'll describe execution
>> context more precisely than pid.
>>
>
> I'd be all for this if it weren't for two issues:
>
> 1. This thing needs to work as soon as init is started, and we can't
> reliably generate random numbers that early.
>
> 2. Migration / CRIU is rather fundamentally at odds with making
> anything universally unique, as opposed to just per-user-ns.
>
> So, given that I don't think we can actually provide a universally
> unique pid-like thing, I'd rather not even try.

Well... another idea: what about pid generation counter? Of course it
should be per-pid-ns.
As I see struct upid has nice 32-bit hole next to 'nr' field. (on
64-bit systems)

>
> That being said, I want to rework this a little bit. I think that
> highpid should be restricted to the range 2^32 through 2^64 - 4096.
> The former prevents anyone from confusing highpid with regular pid,
> and the latter means that we don't need to worry about confusion
> between errors and valid highpids (e.g. -1 will never be a highpid).
>
> Implementing that will be only mildly annoying.
>
> --Andy
>
>> On Sat, Nov 29, 2014 at 2:05 AM, Andy Lutomirski <[email protected]> wrote:
>>> Pid reuse is common, which means that it's difficult or impossible
>>> to read information about a pid from /proc without races.
>>>
>>> This introduces a second number associated with each (task, pidns)
>>> pair called highpid. Highpid is a 64-bit number, and, barring
>>> extremely unlikely circumstances or outright error, a (highpid, pid)
>>> will never be reused.
>>>
>>> With just this change, a program can open /proc/PID/status, read the
>>> "Highpid" field, and confirm that it has the expected value. If the
>>> pid has been reused, then highpid will be different.
>>>
>>> The initial implementation is straightforward: highpid is simply a
>>> 64-bit counter. If a high-end system can fork every 3 ns (which
>>> would be amazing, given that just allocating a pid requires at
>>> atomic operation), it would take well over 1000 years for highpid to
>>> wrap.
>>>
>>> For CRIU's benefit, the next highpid can be set by a privileged
>>> user.
>>>
>>> NB: The sysctl stuff only works on 64-bit systems. If the approach
>>> looks good, I'll fix that somehow.
>>>
>>> Signed-off-by: Andy Lutomirski <[email protected]>
>>> ---
>>>
>>> If this goes in, there's plenty of room to add new interfaces to
>>> make this more useful. For example, we could add a fancier tgkill
>>> that adds and validates hightgid and highpid, and we might want to
>>> add a syscall to read one's own hightgid and highpid. These would
>>> be quite useful for pidfiles.
>>>
>>> David, would this be useful for kdbus?
>>>
>>> CRIU people: will this be unduly difficult to support in CRIU?
>>>
>>> If you all think this is a good idea, I'll fix the sysctl stuff and
>>> document it. It might also be worth adding "Hightgid" to status.
>>>
>>> fs/proc/array.c | 5 ++++-
>>> include/linux/pid.h | 2 ++
>>> include/linux/pid_namespace.h | 1 +
>>> kernel/pid.c | 19 +++++++++++++++----
>>> kernel/pid_namespace.c | 22 ++++++++++++++++++++++
>>> 5 files changed, 44 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/fs/proc/array.c b/fs/proc/array.c
>>> index cd3653e4f35c..f1e0e69d19f9 100644
>>> --- a/fs/proc/array.c
>>> +++ b/fs/proc/array.c
>>> @@ -159,6 +159,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
>>> int g;
>>> struct fdtable *fdt = NULL;
>>> const struct cred *cred;
>>> + const struct upid *upid;
>>> pid_t ppid, tpid;
>>>
>>> rcu_read_lock();
>>> @@ -170,12 +171,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
>>> if (tracer)
>>> tpid = task_pid_nr_ns(tracer, ns);
>>> }
>>> + upid = pid_upid_ns(pid, ns);
>>> cred = get_task_cred(p);
>>> seq_printf(m,
>>> "State:\t%s\n"
>>> "Tgid:\t%d\n"
>>> "Ngid:\t%d\n"
>>> "Pid:\t%d\n"
>>> + "Highpid:\t%llu\n"
>>> "PPid:\t%d\n"
>>> "TracerPid:\t%d\n"
>>> "Uid:\t%d\t%d\t%d\t%d\n"
>>> @@ -183,7 +186,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
>>> get_task_state(p),
>>> task_tgid_nr_ns(p, ns),
>>> task_numa_group_id(p),
>>> - pid_nr_ns(pid, ns),
>>> + upid ? upid->nr : 0, upid ? upid->highnr : 0,
>>> ppid, tpid,
>>> from_kuid_munged(user_ns, cred->uid),
>>> from_kuid_munged(user_ns, cred->euid),
>>> diff --git a/include/linux/pid.h b/include/linux/pid.h
>>> index 23705a53abba..ece70b64d04c 100644
>>> --- a/include/linux/pid.h
>>> +++ b/include/linux/pid.h
>>> @@ -51,6 +51,7 @@ struct upid {
>>> /* Try to keep pid_chain in the same cacheline as nr for find_vpid */
>>> int nr;
>>> struct pid_namespace *ns;
>>> + u64 highnr;
>>> struct hlist_node pid_chain;
>>> };
>>>
>>> @@ -170,6 +171,7 @@ static inline pid_t pid_nr(struct pid *pid)
>>> }
>>>
>>> pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
>>> +const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns);
>>> pid_t pid_vnr(struct pid *pid);
>>>
>>> #define do_each_pid_task(pid, type, task) \
>>> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
>>> index 1997ffc295a7..1f9f0455d7ef 100644
>>> --- a/include/linux/pid_namespace.h
>>> +++ b/include/linux/pid_namespace.h
>>> @@ -26,6 +26,7 @@ struct pid_namespace {
>>> struct rcu_head rcu;
>>> int last_pid;
>>> unsigned int nr_hashed;
>>> + atomic64_t next_highpid;
>>> struct task_struct *child_reaper;
>>> struct kmem_cache *pid_cachep;
>>> unsigned int level;
>>> diff --git a/kernel/pid.c b/kernel/pid.c
>>> index 9b9a26698144..863d11a9bfbf 100644
>>> --- a/kernel/pid.c
>>> +++ b/kernel/pid.c
>>> @@ -312,6 +312,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>>>
>>> pid->numbers[i].nr = nr;
>>> pid->numbers[i].ns = tmp;
>>> + pid->numbers[i].highnr =
>>> + atomic64_inc_return(&tmp->next_highpid) - 1;
>>> tmp = tmp->parent;
>>> }
>>>
>>> @@ -492,17 +494,26 @@ struct pid *find_get_pid(pid_t nr)
>>> }
>>> EXPORT_SYMBOL_GPL(find_get_pid);
>>>
>>> -pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
>>> +const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns)
>>> {
>>> struct upid *upid;
>>> - pid_t nr = 0;
>>>
>>> if (pid && ns->level <= pid->level) {
>>> upid = &pid->numbers[ns->level];
>>> if (upid->ns == ns)
>>> - nr = upid->nr;
>>> + return upid;
>>> }
>>> - return nr;
>>> + return NULL;
>>> +}
>>> +EXPORT_SYMBOL_GPL(pid_upid_ns);
>>> +
>>> +pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
>>> +{
>>> + const struct upid *upid = pid_upid_ns(pid, ns);
>>> +
>>> + if (!upid)
>>> + return 0;
>>> + return upid->nr;
>>> }
>>> EXPORT_SYMBOL_GPL(pid_nr_ns);
>>>
>>> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
>>> index db95d8eb761b..e246802b4361 100644
>>> --- a/kernel/pid_namespace.c
>>> +++ b/kernel/pid_namespace.c
>>> @@ -268,6 +268,22 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
>>> return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
>>> }
>>>
>>> +static int pid_ns_next_highpid_handler(struct ctl_table *table, int write,
>>> + void __user *buffer, size_t *lenp, loff_t *ppos)
>>> +{
>>> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
>>> + struct ctl_table tmp = *table;
>>> +
>>> + if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
>>> + return -EPERM;
>>> +
>>> + /* This needs to be fixed. */
>>> + BUILD_BUG_ON(sizeof(u64) != sizeof(unsigned long));
>>> +
>>> + tmp.data = &pid_ns->next_highpid;
>>> + return proc_dointvec(&tmp, write, buffer, lenp, ppos);
>>> +}
>>> +
>>> extern int pid_max;
>>> static int zero = 0;
>>> static struct ctl_table pid_ns_ctl_table[] = {
>>> @@ -279,6 +295,12 @@ static struct ctl_table pid_ns_ctl_table[] = {
>>> .extra1 = &zero,
>>> .extra2 = &pid_max,
>>> },
>>> + {
>>> + .procname = "ns_next_highpid",
>>> + .maxlen = sizeof(u64),
>>> + .mode = 0666, /* permissions are checked in the handler */
>>> + .proc_handler = pid_ns_next_highpid_handler,
>>> + },
>>> { }
>>> };
>>> static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
>>> --
>>> 1.9.3
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>> the body of a message to [email protected]
>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at http://www.tux.org/lkml/
>
>
>
> --
> Andy Lutomirski
> AMA Capital Management, LLC

2014-12-01 16:48:54

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC PATCH] proc, pidns: Add highpid

On Mon, Dec 1, 2014 at 8:39 AM, Konstantin Khlebnikov <[email protected]> wrote:
> On Mon, Dec 1, 2014 at 7:21 PM, Andy Lutomirski <[email protected]> wrote:
>> On Sun, Nov 30, 2014 at 11:03 PM, Konstantin Khlebnikov
>> <[email protected]> wrote:
>>> Hmm. What about per-task/thread UUID? exported via separate file: /proc/PID/uuid
>>> It could be created at the first access, thus this wouldn't shlowdown clone().
>>> Also it could be droped at execve(), so it'll describe execution
>>> context more precisely than pid.
>>>
>>
>> I'd be all for this if it weren't for two issues:
>>
>> 1. This thing needs to work as soon as init is started, and we can't
>> reliably generate random numbers that early.
>>
>> 2. Migration / CRIU is rather fundamentally at odds with making
>> anything universally unique, as opposed to just per-user-ns.
>>
>> So, given that I don't think we can actually provide a universally
>> unique pid-like thing, I'd rather not even try.
>
> Well... another idea: what about pid generation counter? Of course it
> should be per-pid-ns.
> As I see struct upid has nice 32-bit hole next to 'nr' field. (on
> 64-bit systems)
>

I thought about that, but it has two issues:

1. Implementing it will be pain. The pid allocation algorithm is
already complicated, and it wasn't obvious to me how to accurately
detect wraparound without racing against other wrapping users.

2. I don't think it will be reliable. Suppose that there are pid_max
- 1 processes. One of them can repeatedly clone and exit,
incrementing the generation counter each time. After 2^32 iterations,
which won't take all that long, the pid generation will wrap.

--Andy

>>
>> That being said, I want to rework this a little bit. I think that
>> highpid should be restricted to the range 2^32 through 2^64 - 4096.
>> The former prevents anyone from confusing highpid with regular pid,
>> and the latter means that we don't need to worry about confusion
>> between errors and valid highpids (e.g. -1 will never be a highpid).
>>
>> Implementing that will be only mildly annoying.
>>
>> --Andy
>>
>>> On Sat, Nov 29, 2014 at 2:05 AM, Andy Lutomirski <[email protected]> wrote:
>>>> Pid reuse is common, which means that it's difficult or impossible
>>>> to read information about a pid from /proc without races.
>>>>
>>>> This introduces a second number associated with each (task, pidns)
>>>> pair called highpid. Highpid is a 64-bit number, and, barring
>>>> extremely unlikely circumstances or outright error, a (highpid, pid)
>>>> will never be reused.
>>>>
>>>> With just this change, a program can open /proc/PID/status, read the
>>>> "Highpid" field, and confirm that it has the expected value. If the
>>>> pid has been reused, then highpid will be different.
>>>>
>>>> The initial implementation is straightforward: highpid is simply a
>>>> 64-bit counter. If a high-end system can fork every 3 ns (which
>>>> would be amazing, given that just allocating a pid requires at
>>>> atomic operation), it would take well over 1000 years for highpid to
>>>> wrap.
>>>>
>>>> For CRIU's benefit, the next highpid can be set by a privileged
>>>> user.
>>>>
>>>> NB: The sysctl stuff only works on 64-bit systems. If the approach
>>>> looks good, I'll fix that somehow.
>>>>
>>>> Signed-off-by: Andy Lutomirski <[email protected]>
>>>> ---
>>>>
>>>> If this goes in, there's plenty of room to add new interfaces to
>>>> make this more useful. For example, we could add a fancier tgkill
>>>> that adds and validates hightgid and highpid, and we might want to
>>>> add a syscall to read one's own hightgid and highpid. These would
>>>> be quite useful for pidfiles.
>>>>
>>>> David, would this be useful for kdbus?
>>>>
>>>> CRIU people: will this be unduly difficult to support in CRIU?
>>>>
>>>> If you all think this is a good idea, I'll fix the sysctl stuff and
>>>> document it. It might also be worth adding "Hightgid" to status.
>>>>
>>>> fs/proc/array.c | 5 ++++-
>>>> include/linux/pid.h | 2 ++
>>>> include/linux/pid_namespace.h | 1 +
>>>> kernel/pid.c | 19 +++++++++++++++----
>>>> kernel/pid_namespace.c | 22 ++++++++++++++++++++++
>>>> 5 files changed, 44 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/fs/proc/array.c b/fs/proc/array.c
>>>> index cd3653e4f35c..f1e0e69d19f9 100644
>>>> --- a/fs/proc/array.c
>>>> +++ b/fs/proc/array.c
>>>> @@ -159,6 +159,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
>>>> int g;
>>>> struct fdtable *fdt = NULL;
>>>> const struct cred *cred;
>>>> + const struct upid *upid;
>>>> pid_t ppid, tpid;
>>>>
>>>> rcu_read_lock();
>>>> @@ -170,12 +171,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
>>>> if (tracer)
>>>> tpid = task_pid_nr_ns(tracer, ns);
>>>> }
>>>> + upid = pid_upid_ns(pid, ns);
>>>> cred = get_task_cred(p);
>>>> seq_printf(m,
>>>> "State:\t%s\n"
>>>> "Tgid:\t%d\n"
>>>> "Ngid:\t%d\n"
>>>> "Pid:\t%d\n"
>>>> + "Highpid:\t%llu\n"
>>>> "PPid:\t%d\n"
>>>> "TracerPid:\t%d\n"
>>>> "Uid:\t%d\t%d\t%d\t%d\n"
>>>> @@ -183,7 +186,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
>>>> get_task_state(p),
>>>> task_tgid_nr_ns(p, ns),
>>>> task_numa_group_id(p),
>>>> - pid_nr_ns(pid, ns),
>>>> + upid ? upid->nr : 0, upid ? upid->highnr : 0,
>>>> ppid, tpid,
>>>> from_kuid_munged(user_ns, cred->uid),
>>>> from_kuid_munged(user_ns, cred->euid),
>>>> diff --git a/include/linux/pid.h b/include/linux/pid.h
>>>> index 23705a53abba..ece70b64d04c 100644
>>>> --- a/include/linux/pid.h
>>>> +++ b/include/linux/pid.h
>>>> @@ -51,6 +51,7 @@ struct upid {
>>>> /* Try to keep pid_chain in the same cacheline as nr for find_vpid */
>>>> int nr;
>>>> struct pid_namespace *ns;
>>>> + u64 highnr;
>>>> struct hlist_node pid_chain;
>>>> };
>>>>
>>>> @@ -170,6 +171,7 @@ static inline pid_t pid_nr(struct pid *pid)
>>>> }
>>>>
>>>> pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
>>>> +const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns);
>>>> pid_t pid_vnr(struct pid *pid);
>>>>
>>>> #define do_each_pid_task(pid, type, task) \
>>>> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
>>>> index 1997ffc295a7..1f9f0455d7ef 100644
>>>> --- a/include/linux/pid_namespace.h
>>>> +++ b/include/linux/pid_namespace.h
>>>> @@ -26,6 +26,7 @@ struct pid_namespace {
>>>> struct rcu_head rcu;
>>>> int last_pid;
>>>> unsigned int nr_hashed;
>>>> + atomic64_t next_highpid;
>>>> struct task_struct *child_reaper;
>>>> struct kmem_cache *pid_cachep;
>>>> unsigned int level;
>>>> diff --git a/kernel/pid.c b/kernel/pid.c
>>>> index 9b9a26698144..863d11a9bfbf 100644
>>>> --- a/kernel/pid.c
>>>> +++ b/kernel/pid.c
>>>> @@ -312,6 +312,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>>>>
>>>> pid->numbers[i].nr = nr;
>>>> pid->numbers[i].ns = tmp;
>>>> + pid->numbers[i].highnr =
>>>> + atomic64_inc_return(&tmp->next_highpid) - 1;
>>>> tmp = tmp->parent;
>>>> }
>>>>
>>>> @@ -492,17 +494,26 @@ struct pid *find_get_pid(pid_t nr)
>>>> }
>>>> EXPORT_SYMBOL_GPL(find_get_pid);
>>>>
>>>> -pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
>>>> +const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns)
>>>> {
>>>> struct upid *upid;
>>>> - pid_t nr = 0;
>>>>
>>>> if (pid && ns->level <= pid->level) {
>>>> upid = &pid->numbers[ns->level];
>>>> if (upid->ns == ns)
>>>> - nr = upid->nr;
>>>> + return upid;
>>>> }
>>>> - return nr;
>>>> + return NULL;
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(pid_upid_ns);
>>>> +
>>>> +pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
>>>> +{
>>>> + const struct upid *upid = pid_upid_ns(pid, ns);
>>>> +
>>>> + if (!upid)
>>>> + return 0;
>>>> + return upid->nr;
>>>> }
>>>> EXPORT_SYMBOL_GPL(pid_nr_ns);
>>>>
>>>> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
>>>> index db95d8eb761b..e246802b4361 100644
>>>> --- a/kernel/pid_namespace.c
>>>> +++ b/kernel/pid_namespace.c
>>>> @@ -268,6 +268,22 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
>>>> return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
>>>> }
>>>>
>>>> +static int pid_ns_next_highpid_handler(struct ctl_table *table, int write,
>>>> + void __user *buffer, size_t *lenp, loff_t *ppos)
>>>> +{
>>>> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
>>>> + struct ctl_table tmp = *table;
>>>> +
>>>> + if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
>>>> + return -EPERM;
>>>> +
>>>> + /* This needs to be fixed. */
>>>> + BUILD_BUG_ON(sizeof(u64) != sizeof(unsigned long));
>>>> +
>>>> + tmp.data = &pid_ns->next_highpid;
>>>> + return proc_dointvec(&tmp, write, buffer, lenp, ppos);
>>>> +}
>>>> +
>>>> extern int pid_max;
>>>> static int zero = 0;
>>>> static struct ctl_table pid_ns_ctl_table[] = {
>>>> @@ -279,6 +295,12 @@ static struct ctl_table pid_ns_ctl_table[] = {
>>>> .extra1 = &zero,
>>>> .extra2 = &pid_max,
>>>> },
>>>> + {
>>>> + .procname = "ns_next_highpid",
>>>> + .maxlen = sizeof(u64),
>>>> + .mode = 0666, /* permissions are checked in the handler */
>>>> + .proc_handler = pid_ns_next_highpid_handler,
>>>> + },
>>>> { }
>>>> };
>>>> static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
>>>> --
>>>> 1.9.3
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>>> the body of a message to [email protected]
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>> Please read the FAQ at http://www.tux.org/lkml/
>>
>>
>>
>> --
>> Andy Lutomirski
>> AMA Capital Management, LLC



--
Andy Lutomirski
AMA Capital Management, LLC