Rationale:
On x86_64 with big ram people running containers set pid_max on host to
large values to be able to launch more containers. At the same time
containers running 32-bit software experience problems with large pids - ps
calls readdir/stat on proc entries and inode's i_ino happen to be too big
for the 32-bit API.
Thus, the ability to limit the pid value inside container is required.
Signed-off-by: Pavel Emelyanov <[email protected]>
---
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 38d1032..248220d 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -20,6 +20,7 @@ struct pid_namespace {
struct kref kref;
struct pidmap pidmap[PIDMAP_ENTRIES];
int last_pid;
+ int pid_max;
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
unsigned int level;
diff --git a/kernel/pid.c b/kernel/pid.c
index 39b65b6..aafc285 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -43,12 +43,10 @@ static struct hlist_head *pid_hash;
static unsigned int pidhash_shift = 4;
struct pid init_struct_pid = INIT_STRUCT_PID;
-int pid_max = PID_MAX_DEFAULT;
-
#define RESERVED_PIDS 300
-int pid_max_min = RESERVED_PIDS + 1;
-int pid_max_max = PID_MAX_LIMIT;
+static int pid_max_min = RESERVED_PIDS + 1;
+static int pid_max_max = PID_MAX_LIMIT;
#define BITS_PER_PAGE (PAGE_SIZE*8)
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
@@ -161,7 +159,7 @@ static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
static int alloc_pidmap(struct pid_namespace *pid_ns)
{
- int i, offset, max_scan, pid, last = pid_ns->last_pid;
+ int i, offset, max_scan, pid, last = pid_ns->last_pid, pid_max = pid_ns->pid_max;
struct pidmap *map;
pid = last + 1;
@@ -546,14 +544,40 @@ void __init pidhash_init(void)
INIT_HLIST_HEAD(&pid_hash[i]);
}
+static int proc_dointvec_pidmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp;
+
+ tmp = *table;
+ tmp.data = &current->nsproxy->pid_ns->pid_max;
+
+ return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table pid_ctl_table[] = {
+ {
+ .procname = "pid_max",
+ .data = &init_pid_ns.pid_max,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_pidmax,
+ .extra1 = &pid_max_min,
+ .extra2 = &pid_max_max,
+ },
+ { }
+};
+
+static struct ctl_path pid_kern_path[] = { { .procname = "kernel" }, { } };
+
void __init pidmap_init(void)
{
/* bump default and minimum pid_max based on number of cpus */
- pid_max = min(pid_max_max, max_t(int, pid_max,
+ init_pid_ns.pid_max = min(pid_max_max, max_t(int, PID_MAX_DEFAULT,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
- pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+ pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
/* Reserve PID 0. We never call free_pidmap(0) */
@@ -562,4 +586,5 @@ void __init pidmap_init(void)
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ register_sysctl_paths(pid_kern_path, pid_ctl_table);
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94..93d594e 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -89,6 +89,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
kref_init(&ns->kref);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
+ ns->pid_max = parent_pid_ns->pid_max;
set_bit(0, ns->pidmap[0].page);
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0f1bd83..0f94054 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -93,9 +93,7 @@ extern int core_uses_pid;
extern int suid_dumpable;
extern char core_pattern[];
extern unsigned int core_pipe_limit;
-extern int pid_max;
extern int min_free_kbytes;
-extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
@@ -653,15 +651,6 @@ static struct ctl_table kern_table[] = {
},
#endif
{
- .procname = "pid_max",
- .data = &pid_max,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &pid_max_min,
- .extra2 = &pid_max_max,
- },
- {
.procname = "panic_on_oops",
.data = &panic_on_oops,
.maxlen = sizeof(int),
On Thu, 03 Mar 2011 11:39:17 +0300
Pavel Emelyanov <[email protected]> wrote:
> Rationale:
>
> On x86_64 with big ram people running containers set pid_max on host to
> large values to be able to launch more containers. At the same time
> containers running 32-bit software experience problems with large pids - ps
> calls readdir/stat on proc entries and inode's i_ino happen to be too big
> for the 32-bit API.
>
> Thus, the ability to limit the pid value inside container is required.
>
This is a behavioural change, isn't it? In current kernels a write to
/proc/sys/kernel/pid_max will change the max pid on all processes.
After this change, that write will only affect processes in the current
namespace. Anyone who was depending on the old behaviour might run
into problems?
Also: documentation. Documentation/sysctl/kernel.txt would like an
update. And perhaps also the pidns documentation which we forgot to
create :(
On 03/08/2011 02:58 AM, Andrew Morton wrote:
> On Thu, 03 Mar 2011 11:39:17 +0300
> Pavel Emelyanov <[email protected]> wrote:
>
>> Rationale:
>>
>> On x86_64 with big ram people running containers set pid_max on host to
>> large values to be able to launch more containers. At the same time
>> containers running 32-bit software experience problems with large pids - ps
>> calls readdir/stat on proc entries and inode's i_ino happen to be too big
>> for the 32-bit API.
>>
>> Thus, the ability to limit the pid value inside container is required.
>>
>
> This is a behavioural change, isn't it? In current kernels a write to
> /proc/sys/kernel/pid_max will change the max pid on all processes.
> After this change, that write will only affect processes in the current
> namespace. Anyone who was depending on the old behaviour might run
> into problems?
Hardly. If the behavior of some two apps depends on its synchronous change,
these two might want to run in the same pid namespace.
> Also: documentation. Documentation/sysctl/kernel.txt would like an
> update. And perhaps also the pidns documentation which we forgot to
> create :(
OK, I'll fix the existing docs.
Thanks,
Pavel
On Thu, 10 Mar 2011 12:35:32 +0300 Pavel Emelyanov <[email protected]> wrote:
> On 03/08/2011 02:58 AM, Andrew Morton wrote:
> > On Thu, 03 Mar 2011 11:39:17 +0300
> > Pavel Emelyanov <[email protected]> wrote:
> >
> >> Rationale:
> >>
> >> On x86_64 with big ram people running containers set pid_max on host to
> >> large values to be able to launch more containers. At the same time
> >> containers running 32-bit software experience problems with large pids - ps
> >> calls readdir/stat on proc entries and inode's i_ino happen to be too big
> >> for the 32-bit API.
> >>
> >> Thus, the ability to limit the pid value inside container is required.
> >>
> >
> > This is a behavioural change, isn't it? In current kernels a write to
> > /proc/sys/kernel/pid_max will change the max pid on all processes.
> > After this change, that write will only affect processes in the current
> > namespace. Anyone who was depending on the old behaviour might run
> > into problems?
>
> Hardly. If the behavior of some two apps depends on its synchronous change,
> these two might want to run in the same pid namespace.
I don't understand your answer. What is this "synchronous change" of which
you speak? Does your "might want to run" suggestion mean that userspace
changes would be required for this operation to again work correctly?
"In current kernels a write to /proc/sys/kernel/pid_max will change the
max pid on all processes." Is this incorrect?
"After this change (ie: this patch), that write will only affect
processes in the current namespace.". Is this incorrect?
On 03/10/2011 12:50 PM, Andrew Morton wrote:
> On Thu, 10 Mar 2011 12:35:32 +0300 Pavel Emelyanov <[email protected]> wrote:
>
>> On 03/08/2011 02:58 AM, Andrew Morton wrote:
>>> On Thu, 03 Mar 2011 11:39:17 +0300
>>> Pavel Emelyanov <[email protected]> wrote:
>>>
>>>> Rationale:
>>>>
>>>> On x86_64 with big ram people running containers set pid_max on host to
>>>> large values to be able to launch more containers. At the same time
>>>> containers running 32-bit software experience problems with large pids - ps
>>>> calls readdir/stat on proc entries and inode's i_ino happen to be too big
>>>> for the 32-bit API.
>>>>
>>>> Thus, the ability to limit the pid value inside container is required.
>>>>
>>>
>>> This is a behavioural change, isn't it? In current kernels a write to
>>> /proc/sys/kernel/pid_max will change the max pid on all processes.
>>> After this change, that write will only affect processes in the current
>>> namespace. Anyone who was depending on the old behaviour might run
>>> into problems?
>>
>> Hardly. If the behavior of some two apps depends on its synchronous change,
>> these two might want to run in the same pid namespace.
>
> I don't understand your answer. What is this "synchronous change" of which
> you speak? Does your "might want to run" suggestion mean that userspace
> changes would be required for this operation to again work correctly?
Your concern was about "anyone who was depending on the old behaviour", where
the old behavior meant "a write to sys.pid_max will change the max pid on all
processes".
I wanted to say, that if someone changes pid_max and expects someone else to
act differently after this, then these two should live in the same pid namespace.
IOW, if X raises the pid_max, then all the processes X sees in its pid namespace
*may* have pids up to this value. All the other process, that are not visible
in X's pid space will have other values, but X doesn't see them, so why should
we care?
> "In current kernels a write to /proc/sys/kernel/pid_max will change the
> max pid on all processes." Is this incorrect?
Not 100%. If I have some process with pid N and then I change the pid_max to N/2,
that process will still have its pid N which is obviously greater, than N/2.
> "After this change (ie: this patch), that write will only affect
> processes in the current namespace.". Is this incorrect?
With the exception stated above - yes, but I don't understand your concern after
these two questions :(
Thanks,
Pavel
On Thu, 10 Mar 2011 13:06:48 +0300 Pavel Emelyanov <[email protected]> wrote:
> On 03/10/2011 12:50 PM, Andrew Morton wrote:
> > On Thu, 10 Mar 2011 12:35:32 +0300 Pavel Emelyanov <[email protected]> wrote:
> >
> >> On 03/08/2011 02:58 AM, Andrew Morton wrote:
> >>> On Thu, 03 Mar 2011 11:39:17 +0300
> >>> Pavel Emelyanov <[email protected]> wrote:
> >>>
> >>>> Rationale:
> >>>>
> >>>> On x86_64 with big ram people running containers set pid_max on host to
> >>>> large values to be able to launch more containers. At the same time
> >>>> containers running 32-bit software experience problems with large pids - ps
> >>>> calls readdir/stat on proc entries and inode's i_ino happen to be too big
> >>>> for the 32-bit API.
> >>>>
> >>>> Thus, the ability to limit the pid value inside container is required.
> >>>>
> >>>
> >>> This is a behavioural change, isn't it? In current kernels a write to
> >>> /proc/sys/kernel/pid_max will change the max pid on all processes.
> >>> After this change, that write will only affect processes in the current
> >>> namespace. Anyone who was depending on the old behaviour might run
> >>> into problems?
> >>
> >> Hardly. If the behavior of some two apps depends on its synchronous change,
> >> these two might want to run in the same pid namespace.
> >
> > I don't understand your answer. What is this "synchronous change" of which
> > you speak? Does your "might want to run" suggestion mean that userspace
> > changes would be required for this operation to again work correctly?
>
> Your concern was about "anyone who was depending on the old behaviour", where
> the old behavior meant "a write to sys.pid_max will change the max pid on all
> processes".
>
> I wanted to say, that if someone changes pid_max and expects someone else to
> act differently after this, then these two should live in the same pid namespace.
So it's a non-back-compatible change to the userspace interface. uh-oh.
> IOW, if X raises the pid_max, then all the processes X sees in its pid namespace
> *may* have pids up to this value. All the other process, that are not visible
> in X's pid space will have other values, but X doesn't see them, so why should
> we care?
Current userspace has no *need* to be running in the same pidns to
alter the pid_max of some processes. So the chances are good that
any current userspace takes advantage of this.
Silly example:
if (fork() == 0) {
/* child */
create_new_pidns();
start_doing_stuff();
} else {
/* parent */
increase_pid_max();
}
Another example would be logging into a system as root in the init_ns
and modifying /proc/sys/kernel/pid_max by hand.
I don't have a clue how much code is out there using pid namespaces,
nor how much of that code alters the default pid_max. Hard.
The proposed interface is a bit weird and hacky anyway, isn't it? We
have a single pseudo-file in a well-known location -
/proc/sys/kernel/pid_max. One would expect alteration of that
system-wide file to have system-wide effects, only that isn't the case.
Instead a modification to the system-wide file has local-pidns-only
effects. It would be much more logical to have a per-pidns pid_max
pseudo file.
And if we do that, we then need to work out what to do with writes to
/proc/sys/kernel/pid_max. Remember the user expects those writes to
alter all processes on the machine! I guess it would be acceptable to
permit that to continue to happen - a write to /proc/sys/kernel/pid_max
will overwrite all the per-pidns pid_max settings.