The bug was introduced by commit cc31edceee04a7b87f2be48f9489ebb72d264844
("cgroups: convert tasks file to use a seq_file with shared pid array").
We cache a pid array for all threads that are opening the same "tasks"
file, but the pids in the array are always from the namespace of the
last process that opened the file, so all other threads will read pids
from that namespace instead of their own namespaces.
To fix it, we maintain a list of pid arrays, which is keyed by pid_ns.
The list will be of length 1 most of the time.
Reported-by: Paul Menage <[email protected]>
Idea-by: Paul Menage <[email protected]>
Signed-off-by: Li Zefan <[email protected]>
---
include/linux/cgroup.h | 11 ++----
kernel/cgroup.c | 94 +++++++++++++++++++++++++++++++++++------------
2 files changed, 74 insertions(+), 31 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 665fa70..20411d2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -179,14 +179,11 @@ struct cgroup {
*/
struct list_head release_list;
- /* pids_mutex protects the fields below */
+ /* pids_mutex protects pids_list and cached pid arrays. */
struct rw_semaphore pids_mutex;
- /* Array of process ids in the cgroup */
- pid_t *tasks_pids;
- /* How many files are using the current tasks_pids array */
- int pids_use_count;
- /* Length of the current tasks_pids array */
- int pids_length;
+
+ /* Linked list of struct cgroup_pids */
+ struct list_head pids_list;
/* For RCU-protected deletion */
struct rcu_head rcu_head;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3737a68..13dddb4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
#include <linux/hash.h>
#include <linux/namei.h>
#include <linux/smp_lock.h>
+#include <linux/pid_namespace.h>
#include <asm/atomic.h>
@@ -960,6 +961,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
+ INIT_LIST_HEAD(&cgrp->pids_list);
init_rwsem(&cgrp->pids_mutex);
}
static void init_cgroup_root(struct cgroupfs_root *root)
@@ -2201,12 +2203,30 @@ err:
return ret;
}
+/*
+ * Cache pids for all threads in the same pid namespace that are
+ * opening the same "tasks" file.
+ */
+struct cgroup_pids {
+ /* The node in cgrp->pids_list */
+ struct list_head list;
+ /* The cgroup those pids belong to */
+ struct cgroup *cgrp;
+ /* The namespace those pids belong to */
+ struct pid_namespace *pid_ns;
+ /* Array of process ids in the cgroup */
+ pid_t *tasks_pids;
+ /* How many files are using this tasks_pids array */
+ int pids_use_count;
+ /* Length of the current tasks_pids array */
+ int pids_length;
+};
+
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
-
/*
* seq_file methods for the "tasks" file. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
@@ -2221,45 +2241,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
+ struct cgroup *cgrp = cp->cgrp;
int index = 0, pid = *pos;
int *iter;
down_read(&cgrp->pids_mutex);
if (pid) {
- int end = cgrp->pids_length;
+ int end = cp->pids_length;
while (index < end) {
int mid = (index + end) / 2;
- if (cgrp->tasks_pids[mid] == pid) {
+ if (cp->tasks_pids[mid] == pid) {
index = mid;
break;
- } else if (cgrp->tasks_pids[mid] <= pid)
+ } else if (cp->tasks_pids[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
- if (index >= cgrp->pids_length)
+ if (index >= cp->pids_length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
- iter = cgrp->tasks_pids + index;
+ iter = cp->tasks_pids + index;
*pos = *iter;
return iter;
}
static void cgroup_tasks_stop(struct seq_file *s, void *v)
{
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
+ struct cgroup *cgrp = cp->cgrp;
up_read(&cgrp->pids_mutex);
}
static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
int *p = v;
- int *end = cgrp->tasks_pids + cgrp->pids_length;
+ int *end = cp->tasks_pids + cp->pids_length;
/*
* Advance to the next pid in the array. If this goes off the
@@ -2286,26 +2308,32 @@ static struct seq_operations cgroup_tasks_seq_operations = {
.show = cgroup_tasks_show,
};
-static void release_cgroup_pid_array(struct cgroup *cgrp)
+static void release_cgroup_pid_array(struct cgroup_pids *cp)
{
+ struct cgroup *cgrp = cp->cgrp;
+
down_write(&cgrp->pids_mutex);
- BUG_ON(!cgrp->pids_use_count);
- if (!--cgrp->pids_use_count) {
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = NULL;
- cgrp->pids_length = 0;
+ BUG_ON(!cp->pids_use_count);
+ if (!--cp->pids_use_count) {
+ list_del(&cp->list);
+ kfree(cp->tasks_pids);
+ kfree(cp);
}
up_write(&cgrp->pids_mutex);
}
static int cgroup_tasks_release(struct inode *inode, struct file *file)
{
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct seq_file *seq;
+ struct cgroup_pids *cp;
if (!(file->f_mode & FMODE_READ))
return 0;
- release_cgroup_pid_array(cgrp);
+ seq = file->private_data;
+ cp = seq->private;
+
+ release_cgroup_pid_array(cp);
return seq_release(inode, file);
}
@@ -2324,6 +2352,8 @@ static struct file_operations cgroup_tasks_operations = {
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
+ struct cgroup_pids *cp;
pid_t *pidarray;
int npids;
int retval;
@@ -2350,20 +2380,36 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
* array if necessary
*/
down_write(&cgrp->pids_mutex);
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = pidarray;
- cgrp->pids_length = npids;
- cgrp->pids_use_count++;
+
+ list_for_each_entry(cp, &cgrp->pids_list, list) {
+ if (pid_ns == cp->pid_ns)
+ goto found;
+ }
+
+ cp = kzalloc(sizeof(*cp), GFP_KERNEL);
+ if (!cp) {
+ up_write(&cgrp->pids_mutex);
+ kfree(pidarray);
+ return -ENOMEM;
+ }
+ cp->cgrp = cgrp;
+ cp->pid_ns = pid_ns;
+ list_add(&cp->list, &cgrp->pids_list);
+found:
+ kfree(cp->tasks_pids);
+ cp->tasks_pids = pidarray;
+ cp->pids_length = npids;
+ cp->pids_use_count++;
up_write(&cgrp->pids_mutex);
file->f_op = &cgroup_tasks_operations;
retval = seq_open(file, &cgroup_tasks_seq_operations);
if (retval) {
- release_cgroup_pid_array(cgrp);
+ release_cgroup_pid_array(cp);
return retval;
}
- ((struct seq_file *)file->private_data)->private = cgrp;
+ ((struct seq_file *)file->private_data)->private = cp;
return 0;
}
--
1.5.4.rc3
Thanks Li - but as I said to Serge in the email when I brought this up
originally, I already had a patch in mind for this; I've had an intern
(Ben) at Google working on it. His patch (pretty much ready, and being
sent out tomorrow I hope) is pretty similar to yours, but his is on
top of another patch that provides a (currently read-only)
"cgroup.procs" file in each cgroup dir that lists the unique tgids in
the cgroup. So the key in the list of pid arrays is actually a pid_ns
and a file type (indicating procs or tasks) rather than just a pid_ns.
So I think it makes more sense to not use this patch, but to use the
pair of patches that Ben's written, since they provide more overall
functionality.
Paul
On Wed, Jul 1, 2009 at 6:24 PM, Li Zefan<[email protected]> wrote:
> The bug was introduced by commit cc31edceee04a7b87f2be48f9489ebb72d264844
> ("cgroups: convert tasks file to use a seq_file with shared pid array").
>
> We cache a pid array for all threads that are opening the same "tasks"
> file, but the pids in the array are always from the namespace of the
> last process that opened the file, so all other threads will read pids
> from that namespace instead of their own namespaces.
>
> To fix it, we maintain a list of pid arrays, which is keyed by pid_ns.
> The list will be of length 1 at most time.
>
> Reported-by: Paul Menage <[email protected]>
> Idea-by: Paul Menage <[email protected]>
> Signed-off-by: Li Zefan <[email protected]>
> ---
> ?include/linux/cgroup.h | ? 11 ++----
> ?kernel/cgroup.c ? ? ? ?| ? 94 +++++++++++++++++++++++++++++++++++------------
> ?2 files changed, 74 insertions(+), 31 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 665fa70..20411d2 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -179,14 +179,11 @@ struct cgroup {
> ? ? ? ? */
> ? ? ? ?struct list_head release_list;
>
> - ? ? ? /* pids_mutex protects the fields below */
> + ? ? ? /* pids_mutex protects pids_list and cached pid arrays. */
> ? ? ? ?struct rw_semaphore pids_mutex;
> - ? ? ? /* Array of process ids in the cgroup */
> - ? ? ? pid_t *tasks_pids;
> - ? ? ? /* How many files are using the current tasks_pids array */
> - ? ? ? int pids_use_count;
> - ? ? ? /* Length of the current tasks_pids array */
> - ? ? ? int pids_length;
> +
> + ? ? ? /* Linked list of struct cgroup_pids */
> + ? ? ? struct list_head pids_list;
>
> ? ? ? ?/* For RCU-protected deletion */
> ? ? ? ?struct rcu_head rcu_head;
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 3737a68..13dddb4 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -47,6 +47,7 @@
> ?#include <linux/hash.h>
> ?#include <linux/namei.h>
> ?#include <linux/smp_lock.h>
> +#include <linux/pid_namespace.h>
>
> ?#include <asm/atomic.h>
>
> @@ -960,6 +961,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
> ? ? ? ?INIT_LIST_HEAD(&cgrp->children);
> ? ? ? ?INIT_LIST_HEAD(&cgrp->css_sets);
> ? ? ? ?INIT_LIST_HEAD(&cgrp->release_list);
> + ? ? ? INIT_LIST_HEAD(&cgrp->pids_list);
> ? ? ? ?init_rwsem(&cgrp->pids_mutex);
> ?}
> ?static void init_cgroup_root(struct cgroupfs_root *root)
> @@ -2201,12 +2203,30 @@ err:
> ? ? ? ?return ret;
> ?}
>
> +/*
> + * Cache pids for all threads in the same pid namespace that are
> + * opening the same "tasks" file.
> + */
> +struct cgroup_pids {
> + ? ? ? /* The node in cgrp->pids_list */
> + ? ? ? struct list_head list;
> + ? ? ? /* The cgroup those pids belong to */
> + ? ? ? struct cgroup *cgrp;
> + ? ? ? /* The namepsace those pids belong to */
> + ? ? ? struct pid_namespace *pid_ns;
> + ? ? ? /* Array of process ids in the cgroup */
> + ? ? ? pid_t *tasks_pids;
> + ? ? ? /* How many files are using the this tasks_pids array */
> + ? ? ? int pids_use_count;
> + ? ? ? /* Length of the current tasks_pids array */
> + ? ? ? int pids_length;
> +};
> +
> ?static int cmppid(const void *a, const void *b)
> ?{
> ? ? ? ?return *(pid_t *)a - *(pid_t *)b;
> ?}
>
> -
> ?/*
> ?* seq_file methods for the "tasks" file. The seq_file position is the
> ?* next pid to display; the seq_file iterator is a pointer to the pid
> @@ -2221,45 +2241,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
> ? ? ? ? * after a seek to the start). Use a binary-search to find the
> ? ? ? ? * next pid to display, if any
> ? ? ? ? */
> - ? ? ? struct cgroup *cgrp = s->private;
> + ? ? ? struct cgroup_pids *cp = s->private;
> + ? ? ? struct cgroup *cgrp = cp->cgrp;
> ? ? ? ?int index = 0, pid = *pos;
> ? ? ? ?int *iter;
>
> ? ? ? ?down_read(&cgrp->pids_mutex);
> ? ? ? ?if (pid) {
> - ? ? ? ? ? ? ? int end = cgrp->pids_length;
> + ? ? ? ? ? ? ? int end = cp->pids_length;
>
> ? ? ? ? ? ? ? ?while (index < end) {
> ? ? ? ? ? ? ? ? ? ? ? ?int mid = (index + end) / 2;
> - ? ? ? ? ? ? ? ? ? ? ? if (cgrp->tasks_pids[mid] == pid) {
> + ? ? ? ? ? ? ? ? ? ? ? if (cp->tasks_pids[mid] == pid) {
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?index = mid;
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?break;
> - ? ? ? ? ? ? ? ? ? ? ? } else if (cgrp->tasks_pids[mid] <= pid)
> + ? ? ? ? ? ? ? ? ? ? ? } else if (cp->tasks_pids[mid] <= pid)
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?index = mid + 1;
> ? ? ? ? ? ? ? ? ? ? ? ?else
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?end = mid;
> ? ? ? ? ? ? ? ?}
> ? ? ? ?}
> ? ? ? ?/* If we're off the end of the array, we're done */
> - ? ? ? if (index >= cgrp->pids_length)
> + ? ? ? if (index >= cp->pids_length)
> ? ? ? ? ? ? ? ?return NULL;
> ? ? ? ?/* Update the abstract position to be the actual pid that we found */
> - ? ? ? iter = cgrp->tasks_pids + index;
> + ? ? ? iter = cp->tasks_pids + index;
> ? ? ? ?*pos = *iter;
> ? ? ? ?return iter;
> ?}
>
> ?static void cgroup_tasks_stop(struct seq_file *s, void *v)
> ?{
> - ? ? ? struct cgroup *cgrp = s->private;
> + ? ? ? struct cgroup_pids *cp = s->private;
> + ? ? ? struct cgroup *cgrp = cp->cgrp;
> ? ? ? ?up_read(&cgrp->pids_mutex);
> ?}
>
> ?static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
> ?{
> - ? ? ? struct cgroup *cgrp = s->private;
> + ? ? ? struct cgroup_pids *cp = s->private;
> ? ? ? ?int *p = v;
> - ? ? ? int *end = cgrp->tasks_pids + cgrp->pids_length;
> + ? ? ? int *end = cp->tasks_pids + cp->pids_length;
>
> ? ? ? ?/*
> ? ? ? ? * Advance to the next pid in the array. If this goes off the
> @@ -2286,26 +2308,32 @@ static struct seq_operations cgroup_tasks_seq_operations = {
> ? ? ? ?.show = cgroup_tasks_show,
> ?};
>
> -static void release_cgroup_pid_array(struct cgroup *cgrp)
> +static void release_cgroup_pid_array(struct cgroup_pids *cp)
> ?{
> + ? ? ? struct cgroup *cgrp = cp->cgrp;
> +
> ? ? ? ?down_write(&cgrp->pids_mutex);
> - ? ? ? BUG_ON(!cgrp->pids_use_count);
> - ? ? ? if (!--cgrp->pids_use_count) {
> - ? ? ? ? ? ? ? kfree(cgrp->tasks_pids);
> - ? ? ? ? ? ? ? cgrp->tasks_pids = NULL;
> - ? ? ? ? ? ? ? cgrp->pids_length = 0;
> + ? ? ? BUG_ON(!cp->pids_use_count);
> + ? ? ? if (!--cp->pids_use_count) {
> + ? ? ? ? ? ? ? list_del(&cp->list);
> + ? ? ? ? ? ? ? kfree(cp->tasks_pids);
> + ? ? ? ? ? ? ? kfree(cp);
> ? ? ? ?}
> ? ? ? ?up_write(&cgrp->pids_mutex);
> ?}
>
> ?static int cgroup_tasks_release(struct inode *inode, struct file *file)
> ?{
> - ? ? ? struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
> + ? ? ? struct seq_file *seq;
> + ? ? ? struct cgroup_pids *cp;
>
> ? ? ? ?if (!(file->f_mode & FMODE_READ))
> ? ? ? ? ? ? ? ?return 0;
>
> - ? ? ? release_cgroup_pid_array(cgrp);
> + ? ? ? seq = file->private_data;
> + ? ? ? cp = seq->private;
> +
> + ? ? ? release_cgroup_pid_array(cp);
> ? ? ? ?return seq_release(inode, file);
> ?}
>
> @@ -2324,6 +2352,8 @@ static struct file_operations cgroup_tasks_operations = {
> ?static int cgroup_tasks_open(struct inode *unused, struct file *file)
> ?{
> ? ? ? ?struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
> + ? ? ? struct pid_namespace *pid_ns = task_active_pid_ns(current);
> + ? ? ? struct cgroup_pids *cp;
> ? ? ? ?pid_t *pidarray;
> ? ? ? ?int npids;
> ? ? ? ?int retval;
> @@ -2350,20 +2380,36 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
> ? ? ? ? * array if necessary
> ? ? ? ? */
> ? ? ? ?down_write(&cgrp->pids_mutex);
> - ? ? ? kfree(cgrp->tasks_pids);
> - ? ? ? cgrp->tasks_pids = pidarray;
> - ? ? ? cgrp->pids_length = npids;
> - ? ? ? cgrp->pids_use_count++;
> +
> + ? ? ? list_for_each_entry(cp, &cgrp->pids_list, list) {
> + ? ? ? ? ? ? ? if (pid_ns == cp->pid_ns)
> + ? ? ? ? ? ? ? ? ? ? ? goto found;
> + ? ? ? }
> +
> + ? ? ? cp = kzalloc(sizeof(*cp), GFP_KERNEL);
> + ? ? ? if (!cp) {
> + ? ? ? ? ? ? ? up_write(&cgrp->pids_mutex);
> + ? ? ? ? ? ? ? kfree(pidarray);
> + ? ? ? ? ? ? ? return -ENOMEM;
> + ? ? ? }
> + ? ? ? cp->cgrp = cgrp;
> + ? ? ? cp->pid_ns = pid_ns;
> + ? ? ? list_add(&cp->list, &cgrp->pids_list);
> +found:
> + ? ? ? kfree(cp->tasks_pids);
> + ? ? ? cp->tasks_pids = pidarray;
> + ? ? ? cp->pids_length = npids;
> + ? ? ? cp->pids_use_count++;
> ? ? ? ?up_write(&cgrp->pids_mutex);
>
> ? ? ? ?file->f_op = &cgroup_tasks_operations;
>
> ? ? ? ?retval = seq_open(file, &cgroup_tasks_seq_operations);
> ? ? ? ?if (retval) {
> - ? ? ? ? ? ? ? release_cgroup_pid_array(cgrp);
> + ? ? ? ? ? ? ? release_cgroup_pid_array(cp);
> ? ? ? ? ? ? ? ?return retval;
> ? ? ? ?}
> - ? ? ? ((struct seq_file *)file->private_data)->private = cgrp;
> + ? ? ? ((struct seq_file *)file->private_data)->private = cp;
> ? ? ? ?return 0;
> ?}
>
> --
> 1.5.4.rc3
>
Paul Menage wrote:
> Thanks Li - but as I said to Serge in the email when I brought this up
> originally, I already had a patch in mind for this; I've had an intern
> (Ben) at Google working on it. His patch (pretty much ready, and being
> sent out tomorrow I hope) is pretty similar to yours, but his is on
> top of another patch that provides a (currently read-only)
> "cgroup.procs" file in each cgroup dir that lists the unique tgids in
> the cgroup. So the key in the list of pid arrays is actually a pid_ns
> and a file type (indicating procs or tasks) rather than just a pid_ns.
>
> So I think it makes more sense to not use this patch, but to use the
> pair of patches that Ben's written, since they provide more overal
> functionality.
>
It's fine for me. I waited for your fix for a week or so but didn't see
it, so I thought you were stuck in other business and I did this fix.
On Wed, 1 Jul 2009 18:36:36 -0700
Paul Menage <[email protected]> wrote:
> Thanks Li - but as I said to Serge in the email when I brought this up
> originally, I already had a patch in mind for this; I've had an intern
> (Ben) at Google working on it. His patch (pretty much ready, and being
> sent out tomorrow I hope) is pretty similar to yours, but his is on
> top of another patch that provides a (currently read-only)
> "cgroup.procs" file in each cgroup dir that lists the unique tgids in
> the cgroup. So the key in the list of pid arrays is actually a pid_ns
> and a file type (indicating procs or tasks) rather than just a pid_ns.
>
Wow, I welcome "procs" file :)
Off-topic:
Is there a relationship between mm->owner and tgid ?
If no, is it difficult to synchronize tgid and mm->owner ?
Thanks,
-Kame
> So I think it makes more sense to not use this patch, but to use the
> pair of patches that Ben's written, since they provide more overal
> functionality.
>
> Paul
>
> On Wed, Jul 1, 2009 at 6:24 PM, Li Zefan<[email protected]> wrote:
> > The bug was introduced by commit cc31edceee04a7b87f2be48f9489ebb72d264844
> > ("cgroups: convert tasks file to use a seq_file with shared pid array").
> >
> > We cache a pid array for all threads that are opening the same "tasks"
> > file, but the pids in the array are always from the namespace of the
> > last process that opened the file, so all other threads will read pids
> > from that namespace instead of their own namespaces.
> >
> > To fix it, we maintain a list of pid arrays, which is keyed by pid_ns.
> > The list will be of length 1 at most time.
> >
> > Reported-by: Paul Menage <[email protected]>
> > Idea-by: Paul Menage <[email protected]>
> > Signed-off-by: Li Zefan <[email protected]>
> > ---
> > include/linux/cgroup.h | 11 ++----
> > kernel/cgroup.c | 94 +++++++++++++++++++++++++++++++++++------------
> > 2 files changed, 74 insertions(+), 31 deletions(-)
> >
> > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> > index 665fa70..20411d2 100644
> > --- a/include/linux/cgroup.h
> > +++ b/include/linux/cgroup.h
> > @@ -179,14 +179,11 @@ struct cgroup {
> > */
> > struct list_head release_list;
> >
> > - /* pids_mutex protects the fields below */
> > + /* pids_mutex protects pids_list and cached pid arrays. */
> > struct rw_semaphore pids_mutex;
> > - /* Array of process ids in the cgroup */
> > - pid_t *tasks_pids;
> > - /* How many files are using the current tasks_pids array */
> > - int pids_use_count;
> > - /* Length of the current tasks_pids array */
> > - int pids_length;
> > +
> > + /* Linked list of struct cgroup_pids */
> > + struct list_head pids_list;
> >
> > /* For RCU-protected deletion */
> > struct rcu_head rcu_head;
> > diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> > index 3737a68..13dddb4 100644
> > --- a/kernel/cgroup.c
> > +++ b/kernel/cgroup.c
> > @@ -47,6 +47,7 @@
> > #include <linux/hash.h>
> > #include <linux/namei.h>
> > #include <linux/smp_lock.h>
> > +#include <linux/pid_namespace.h>
> >
> > #include <asm/atomic.h>
> >
> > @@ -960,6 +961,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
> > INIT_LIST_HEAD(&cgrp->children);
> > INIT_LIST_HEAD(&cgrp->css_sets);
> > INIT_LIST_HEAD(&cgrp->release_list);
> > + INIT_LIST_HEAD(&cgrp->pids_list);
> > init_rwsem(&cgrp->pids_mutex);
> > }
> > static void init_cgroup_root(struct cgroupfs_root *root)
> > @@ -2201,12 +2203,30 @@ err:
> > return ret;
> > }
> >
> > +/*
> > + * Cache pids for all threads in the same pid namespace that are
> > + * opening the same "tasks" file.
> > + */
> > +struct cgroup_pids {
> > + /* The node in cgrp->pids_list */
> > + struct list_head list;
> > + /* The cgroup those pids belong to */
> > + struct cgroup *cgrp;
> > + /* The namepsace those pids belong to */
> > + struct pid_namespace *pid_ns;
> > + /* Array of process ids in the cgroup */
> > + pid_t *tasks_pids;
> > + /* How many files are using the this tasks_pids array */
> > + int pids_use_count;
> > + /* Length of the current tasks_pids array */
> > + int pids_length;
> > +};
> > +
> > static int cmppid(const void *a, const void *b)
> > {
> > return *(pid_t *)a - *(pid_t *)b;
> > }
> >
> > -
> > /*
> > * seq_file methods for the "tasks" file. The seq_file position is the
> > * next pid to display; the seq_file iterator is a pointer to the pid
> > @@ -2221,45 +2241,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
> > * after a seek to the start). Use a binary-search to find the
> > * next pid to display, if any
> > */
> > - struct cgroup *cgrp = s->private;
> > + struct cgroup_pids *cp = s->private;
> > + struct cgroup *cgrp = cp->cgrp;
> > int index = 0, pid = *pos;
> > int *iter;
> >
> > down_read(&cgrp->pids_mutex);
> > if (pid) {
> > - int end = cgrp->pids_length;
> > + int end = cp->pids_length;
> >
> > while (index < end) {
> > int mid = (index + end) / 2;
> > - if (cgrp->tasks_pids[mid] == pid) {
> > + if (cp->tasks_pids[mid] == pid) {
> > index = mid;
> > break;
> > - } else if (cgrp->tasks_pids[mid] <= pid)
> > + } else if (cp->tasks_pids[mid] <= pid)
> > index = mid + 1;
> > else
> > end = mid;
> > }
> > }
> > /* If we're off the end of the array, we're done */
> > - if (index >= cgrp->pids_length)
> > + if (index >= cp->pids_length)
> > return NULL;
> > /* Update the abstract position to be the actual pid that we found */
> > - iter = cgrp->tasks_pids + index;
> > + iter = cp->tasks_pids + index;
> > *pos = *iter;
> > return iter;
> > }
> >
> > static void cgroup_tasks_stop(struct seq_file *s, void *v)
> > {
> > - struct cgroup *cgrp = s->private;
> > + struct cgroup_pids *cp = s->private;
> > + struct cgroup *cgrp = cp->cgrp;
> > up_read(&cgrp->pids_mutex);
> > }
> >
> > static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
> > {
> > - struct cgroup *cgrp = s->private;
> > + struct cgroup_pids *cp = s->private;
> > int *p = v;
> > - int *end = cgrp->tasks_pids + cgrp->pids_length;
> > + int *end = cp->tasks_pids + cp->pids_length;
> >
> > /*
> > * Advance to the next pid in the array. If this goes off the
> > @@ -2286,26 +2308,32 @@ static struct seq_operations cgroup_tasks_seq_operations = {
> > .show = cgroup_tasks_show,
> > };
> >
> > -static void release_cgroup_pid_array(struct cgroup *cgrp)
> > +static void release_cgroup_pid_array(struct cgroup_pids *cp)
> > {
> > + struct cgroup *cgrp = cp->cgrp;
> > +
> > down_write(&cgrp->pids_mutex);
> > - BUG_ON(!cgrp->pids_use_count);
> > - if (!--cgrp->pids_use_count) {
> > - kfree(cgrp->tasks_pids);
> > - cgrp->tasks_pids = NULL;
> > - cgrp->pids_length = 0;
> > + BUG_ON(!cp->pids_use_count);
> > + if (!--cp->pids_use_count) {
> > + list_del(&cp->list);
> > + kfree(cp->tasks_pids);
> > + kfree(cp);
> > }
> > up_write(&cgrp->pids_mutex);
> > }
> >
> > static int cgroup_tasks_release(struct inode *inode, struct file *file)
> > {
> > - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
> > + struct seq_file *seq;
> > + struct cgroup_pids *cp;
> >
> > if (!(file->f_mode & FMODE_READ))
> > return 0;
> >
> > - release_cgroup_pid_array(cgrp);
> > + seq = file->private_data;
> > + cp = seq->private;
> > +
> > + release_cgroup_pid_array(cp);
> > return seq_release(inode, file);
> > }
> >
> > @@ -2324,6 +2352,8 @@ static struct file_operations cgroup_tasks_operations = {
> > static int cgroup_tasks_open(struct inode *unused, struct file *file)
> > {
> > struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
> > + struct pid_namespace *pid_ns = task_active_pid_ns(current);
> > + struct cgroup_pids *cp;
> > pid_t *pidarray;
> > int npids;
> > int retval;
> > @@ -2350,20 +2380,36 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
> > * array if necessary
> > */
> > down_write(&cgrp->pids_mutex);
> > - kfree(cgrp->tasks_pids);
> > - cgrp->tasks_pids = pidarray;
> > - cgrp->pids_length = npids;
> > - cgrp->pids_use_count++;
> > +
> > + list_for_each_entry(cp, &cgrp->pids_list, list) {
> > + if (pid_ns == cp->pid_ns)
> > + goto found;
> > + }
> > +
> > + cp = kzalloc(sizeof(*cp), GFP_KERNEL);
> > + if (!cp) {
> > + up_write(&cgrp->pids_mutex);
> > + kfree(pidarray);
> > + return -ENOMEM;
> > + }
> > + cp->cgrp = cgrp;
> > + cp->pid_ns = pid_ns;
> > + list_add(&cp->list, &cgrp->pids_list);
> > +found:
> > + kfree(cp->tasks_pids);
> > + cp->tasks_pids = pidarray;
> > + cp->pids_length = npids;
> > + cp->pids_use_count++;
> > up_write(&cgrp->pids_mutex);
> >
> > file->f_op = &cgroup_tasks_operations;
> >
> > retval = seq_open(file, &cgroup_tasks_seq_operations);
> > if (retval) {
> > - release_cgroup_pid_array(cgrp);
> > + release_cgroup_pid_array(cp);
> > return retval;
> > }
> > - ((struct seq_file *)file->private_data)->private = cgrp;
> > + ((struct seq_file *)file->private_data)->private = cp;
> > return 0;
> > }
> >
> > --
> > 1.5.4.rc3
> >
> _______________________________________________
> Containers mailing list
> [email protected]
> https://lists.linux-foundation.org/mailman/listinfo/containers
>
Paul Menage wrote:
> Thanks Li - but as I said to Serge in the email when I brought this up
> originally, I already had a patch in mind for this; I've had an intern
> (Ben) at Google working on it. His patch (pretty much ready, and being
> sent out tomorrow I hope) is pretty similar to yours, but his is on
> top of another patch that provides a (currently read-only)
> "cgroup.procs" file in each cgroup dir that lists the unique tgids in
> the cgroup. So the key in the list of pid arrays is actually a pid_ns
> and a file type (indicating procs or tasks) rather than just a pid_ns.
>
> So I think it makes more sense to not use this patch, but to use the
> pair of patches that Ben's written, since they provide more overal
> functionality.
>
But I guess we are going to fix the bug for 2.6.31? So is it ok to
merge a new feature 'cgroup.procs' together into 2.6.31?
On Wed, Jul 1, 2009 at 7:17 PM, Li Zefan<[email protected]> wrote:
>
> But I guess we are going to fix the bug for 2.6.31? So is it ok to
> merge a new feature 'cgroup.procs' together into 2.6.31?
>
Does this bug really need to be fixed for 2.6.31? I didn't think that
the namespace support in mainline was robust enough yet for people to
use them for virtual servers in production environments.
Paul
Paul Menage wrote:
> On Wed, Jul 1, 2009 at 7:17 PM, Li Zefan<[email protected]> wrote:
>> But I guess we are going to fix the bug for 2.6.31? So is it ok to
>> merge a new feature 'cgroup.procs' together into 2.6.31?
>>
>
> Does this bug really need to be fixed for 2.6.31? I didn't think that
> the namespace support in mainline was robust enough yet for people to
> use them for virtual servers in production environments.
>
If so, it's ok for me. Unless someone else has objections. Serge?
* KAMEZAWA Hiroyuki <[email protected]> [2009-07-02 10:57:07]:
> On Wed, 1 Jul 2009 18:36:36 -0700
> Paul Menage <[email protected]> wrote:
>
> > Thanks Li - but as I said to Serge in the email when I brought this up
> > originally, I already had a patch in mind for this; I've had an intern
> > (Ben) at Google working on it. His patch (pretty much ready, and being
> > sent out tomorrow I hope) is pretty similar to yours, but his is on
> > top of another patch that provides a (currently read-only)
> > "cgroup.procs" file in each cgroup dir that lists the unique tgids in
> > the cgroup. So the key in the list of pid arrays is actually a pid_ns
> > and a file type (indicating procs or tasks) rather than just a pid_ns.
> >
> Wow, I welcome "procs" file :)
>
Yes, we discussed this in the Resource Management topic of the
containers mini-summit. I hope we are talking of the same thing.
> Off-topic:
> Is there a relationship between mm->owner and tgid ?
> If no, is it difficult to synchronize tgid and mm->owner ?
>
At the beginning tgid is mm->owner.
>
> > So I think it makes more sense to not use this patch, but to use the
> > pair of patches that Ben's written, since they provide more overall
> > functionality.
> >
> > Paul
> >
> > On Wed, Jul 1, 2009 at 6:24 PM, Li Zefan<[email protected]> wrote:
> > > The bug was introduced by commit cc31edceee04a7b87f2be48f9489ebb72d264844
> > > ("cgroups: convert tasks file to use a seq_file with shared pid array").
> > >
> > > We cache a pid array for all threads that are opening the same "tasks"
> > > file, but the pids in the array are always from the namespace of the
> > > last process that opened the file, so all other threads will read pids
> > > from that namespace instead of their own namespaces.
> > >
> > > To fix it, we maintain a list of pid arrays, which is keyed by pid_ns.
> > > The list will be of length 1 at most time.
> > >
> > > Reported-by: Paul Menage <[email protected]>
> > > Idea-by: Paul Menage <[email protected]>
> > > Signed-off-by: Li Zefan <[email protected]>
> > > ---
> > > ?include/linux/cgroup.h | ? 11 ++----
> > > ?kernel/cgroup.c ? ? ? ?| ? 94 +++++++++++++++++++++++++++++++++++------------
> > > ?2 files changed, 74 insertions(+), 31 deletions(-)
> > >
> > > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> > > index 665fa70..20411d2 100644
> > > --- a/include/linux/cgroup.h
> > > +++ b/include/linux/cgroup.h
> > > @@ -179,14 +179,11 @@ struct cgroup {
> > > ? ? ? ? */
> > > ? ? ? ?struct list_head release_list;
> > >
> > > - ? ? ? /* pids_mutex protects the fields below */
> > > + ? ? ? /* pids_mutex protects pids_list and cached pid arrays. */
> > > ? ? ? ?struct rw_semaphore pids_mutex;
> > > - ? ? ? /* Array of process ids in the cgroup */
> > > - ? ? ? pid_t *tasks_pids;
> > > - ? ? ? /* How many files are using the current tasks_pids array */
> > > - ? ? ? int pids_use_count;
> > > - ? ? ? /* Length of the current tasks_pids array */
> > > - ? ? ? int pids_length;
> > > +
> > > + ? ? ? /* Linked list of struct cgroup_pids */
> > > + ? ? ? struct list_head pids_list;
> > >
> > > ? ? ? ?/* For RCU-protected deletion */
> > > ? ? ? ?struct rcu_head rcu_head;
> > > diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> > > index 3737a68..13dddb4 100644
> > > --- a/kernel/cgroup.c
> > > +++ b/kernel/cgroup.c
> > > @@ -47,6 +47,7 @@
> > > ?#include <linux/hash.h>
> > > ?#include <linux/namei.h>
> > > ?#include <linux/smp_lock.h>
> > > +#include <linux/pid_namespace.h>
> > >
> > > ?#include <asm/atomic.h>
> > >
> > > @@ -960,6 +961,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
> > > ? ? ? ?INIT_LIST_HEAD(&cgrp->children);
> > > ? ? ? ?INIT_LIST_HEAD(&cgrp->css_sets);
> > > ? ? ? ?INIT_LIST_HEAD(&cgrp->release_list);
> > > + ? ? ? INIT_LIST_HEAD(&cgrp->pids_list);
> > > ? ? ? ?init_rwsem(&cgrp->pids_mutex);
> > > ?}
> > > ?static void init_cgroup_root(struct cgroupfs_root *root)
> > > @@ -2201,12 +2203,30 @@ err:
> > > ? ? ? ?return ret;
> > > ?}
> > >
> > > +/*
> > > + * Cache pids for all threads in the same pid namespace that are
> > > + * opening the same "tasks" file.
> > > + */
> > > +struct cgroup_pids {
> > > + ? ? ? /* The node in cgrp->pids_list */
> > > + ? ? ? struct list_head list;
> > > + ? ? ? /* The cgroup those pids belong to */
> > > + ? ? ? struct cgroup *cgrp;
> > > + ? ? ? /* The namepsace those pids belong to */
> > > + ? ? ? struct pid_namespace *pid_ns;
> > > + ? ? ? /* Array of process ids in the cgroup */
> > > + ? ? ? pid_t *tasks_pids;
> > > + ? ? ? /* How many files are using the this tasks_pids array */
> > > + ? ? ? int pids_use_count;
> > > + ? ? ? /* Length of the current tasks_pids array */
> > > + ? ? ? int pids_length;
> > > +};
> > > +
> > > ?static int cmppid(const void *a, const void *b)
> > > ?{
> > > ? ? ? ?return *(pid_t *)a - *(pid_t *)b;
> > > ?}
> > >
> > > -
> > > ?/*
> > > ?* seq_file methods for the "tasks" file. The seq_file position is the
> > > ?* next pid to display; the seq_file iterator is a pointer to the pid
> > > @@ -2221,45 +2241,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
> > > ? ? ? ? * after a seek to the start). Use a binary-search to find the
> > > ? ? ? ? * next pid to display, if any
> > > ? ? ? ? */
> > > - ? ? ? struct cgroup *cgrp = s->private;
> > > + ? ? ? struct cgroup_pids *cp = s->private;
> > > + ? ? ? struct cgroup *cgrp = cp->cgrp;
> > > ? ? ? ?int index = 0, pid = *pos;
> > > ? ? ? ?int *iter;
> > >
> > > ? ? ? ?down_read(&cgrp->pids_mutex);
> > > ? ? ? ?if (pid) {
> > > - ? ? ? ? ? ? ? int end = cgrp->pids_length;
> > > + ? ? ? ? ? ? ? int end = cp->pids_length;
> > >
> > > ? ? ? ? ? ? ? ?while (index < end) {
> > > ? ? ? ? ? ? ? ? ? ? ? ?int mid = (index + end) / 2;
> > > - ? ? ? ? ? ? ? ? ? ? ? if (cgrp->tasks_pids[mid] == pid) {
> > > + ? ? ? ? ? ? ? ? ? ? ? if (cp->tasks_pids[mid] == pid) {
> > > ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?index = mid;
> > > ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?break;
> > > - ? ? ? ? ? ? ? ? ? ? ? } else if (cgrp->tasks_pids[mid] <= pid)
> > > + ? ? ? ? ? ? ? ? ? ? ? } else if (cp->tasks_pids[mid] <= pid)
> > > ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?index = mid + 1;
> > > ? ? ? ? ? ? ? ? ? ? ? ?else
> > > ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?end = mid;
> > > ? ? ? ? ? ? ? ?}
> > > ? ? ? ?}
> > > ? ? ? ?/* If we're off the end of the array, we're done */
> > > - ? ? ? if (index >= cgrp->pids_length)
> > > + ? ? ? if (index >= cp->pids_length)
> > > ? ? ? ? ? ? ? ?return NULL;
> > > ? ? ? ?/* Update the abstract position to be the actual pid that we found */
> > > - ? ? ? iter = cgrp->tasks_pids + index;
> > > + ? ? ? iter = cp->tasks_pids + index;
> > > ? ? ? ?*pos = *iter;
> > > ? ? ? ?return iter;
> > > ?}
> > >
> > > ?static void cgroup_tasks_stop(struct seq_file *s, void *v)
> > > ?{
> > > - ? ? ? struct cgroup *cgrp = s->private;
> > > + ? ? ? struct cgroup_pids *cp = s->private;
> > > + ? ? ? struct cgroup *cgrp = cp->cgrp;
> > > ? ? ? ?up_read(&cgrp->pids_mutex);
> > > ?}
> > >
> > > ?static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
> > > ?{
> > > - ? ? ? struct cgroup *cgrp = s->private;
> > > + ? ? ? struct cgroup_pids *cp = s->private;
> > > ? ? ? ?int *p = v;
> > > - ? ? ? int *end = cgrp->tasks_pids + cgrp->pids_length;
> > > + ? ? ? int *end = cp->tasks_pids + cp->pids_length;
> > >
> > > ? ? ? ?/*
> > > ? ? ? ? * Advance to the next pid in the array. If this goes off the
> > > @@ -2286,26 +2308,32 @@ static struct seq_operations cgroup_tasks_seq_operations = {
> > > ? ? ? ?.show = cgroup_tasks_show,
> > > ?};
> > >
> > > -static void release_cgroup_pid_array(struct cgroup *cgrp)
> > > +static void release_cgroup_pid_array(struct cgroup_pids *cp)
> > > ?{
> > > + ? ? ? struct cgroup *cgrp = cp->cgrp;
> > > +
> > > ? ? ? ?down_write(&cgrp->pids_mutex);
> > > - ? ? ? BUG_ON(!cgrp->pids_use_count);
> > > - ? ? ? if (!--cgrp->pids_use_count) {
> > > - ? ? ? ? ? ? ? kfree(cgrp->tasks_pids);
> > > - ? ? ? ? ? ? ? cgrp->tasks_pids = NULL;
> > > - ? ? ? ? ? ? ? cgrp->pids_length = 0;
> > > + ? ? ? BUG_ON(!cp->pids_use_count);
> > > + ? ? ? if (!--cp->pids_use_count) {
> > > + ? ? ? ? ? ? ? list_del(&cp->list);
> > > + ? ? ? ? ? ? ? kfree(cp->tasks_pids);
> > > + ? ? ? ? ? ? ? kfree(cp);
> > > ? ? ? ?}
> > > ? ? ? ?up_write(&cgrp->pids_mutex);
> > > ?}
> > >
> > > ?static int cgroup_tasks_release(struct inode *inode, struct file *file)
> > > ?{
> > > - ? ? ? struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
> > > + ? ? ? struct seq_file *seq;
> > > + ? ? ? struct cgroup_pids *cp;
> > >
> > > ? ? ? ?if (!(file->f_mode & FMODE_READ))
> > > ? ? ? ? ? ? ? ?return 0;
> > >
> > > - ? ? ? release_cgroup_pid_array(cgrp);
> > > + ? ? ? seq = file->private_data;
> > > + ? ? ? cp = seq->private;
> > > +
> > > + ? ? ? release_cgroup_pid_array(cp);
> > > ? ? ? ?return seq_release(inode, file);
> > > ?}
> > >
> > > @@ -2324,6 +2352,8 @@ static struct file_operations cgroup_tasks_operations = {
> > > ?static int cgroup_tasks_open(struct inode *unused, struct file *file)
> > > ?{
> > > ? ? ? ?struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
> > > + ? ? ? struct pid_namespace *pid_ns = task_active_pid_ns(current);
> > > + ? ? ? struct cgroup_pids *cp;
> > > ? ? ? ?pid_t *pidarray;
> > > ? ? ? ?int npids;
> > > ? ? ? ?int retval;
> > > @@ -2350,20 +2380,36 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
> > > ? ? ? ? * array if necessary
> > > ? ? ? ? */
> > > ? ? ? ?down_write(&cgrp->pids_mutex);
> > > - ? ? ? kfree(cgrp->tasks_pids);
> > > - ? ? ? cgrp->tasks_pids = pidarray;
> > > - ? ? ? cgrp->pids_length = npids;
> > > - ? ? ? cgrp->pids_use_count++;
> > > +
> > > + ? ? ? list_for_each_entry(cp, &cgrp->pids_list, list) {
> > > + ? ? ? ? ? ? ? if (pid_ns == cp->pid_ns)
> > > + ? ? ? ? ? ? ? ? ? ? ? goto found;
> > > + ? ? ? }
> > > +
> > > + ? ? ? cp = kzalloc(sizeof(*cp), GFP_KERNEL);
> > > + ? ? ? if (!cp) {
> > > + ? ? ? ? ? ? ? up_write(&cgrp->pids_mutex);
> > > + ? ? ? ? ? ? ? kfree(pidarray);
> > > + ? ? ? ? ? ? ? return -ENOMEM;
> > > + ? ? ? }
> > > + ? ? ? cp->cgrp = cgrp;
> > > + ? ? ? cp->pid_ns = pid_ns;
> > > + ? ? ? list_add(&cp->list, &cgrp->pids_list);
> > > +found:
> > > + ? ? ? kfree(cp->tasks_pids);
> > > + ? ? ? cp->tasks_pids = pidarray;
> > > + ? ? ? cp->pids_length = npids;
> > > + ? ? ? cp->pids_use_count++;
> > > ? ? ? ?up_write(&cgrp->pids_mutex);
> > >
> > > ? ? ? ?file->f_op = &cgroup_tasks_operations;
> > >
> > > ? ? ? ?retval = seq_open(file, &cgroup_tasks_seq_operations);
> > > ? ? ? ?if (retval) {
> > > - ? ? ? ? ? ? ? release_cgroup_pid_array(cgrp);
> > > + ? ? ? ? ? ? ? release_cgroup_pid_array(cp);
> > > ? ? ? ? ? ? ? ?return retval;
> > > ? ? ? ?}
> > > - ? ? ? ((struct seq_file *)file->private_data)->private = cgrp;
> > > + ? ? ? ((struct seq_file *)file->private_data)->private = cp;
> > > ? ? ? ?return 0;
> > > ?}
> > >
> > > --
> > > 1.5.4.rc3
> > >
> > _______________________________________________
> > Containers mailing list
> > [email protected]
> > https://lists.linux-foundation.org/mailman/listinfo/containers
> >
>
--
Balbir
Quoting Li Zefan ([email protected]):
> Paul Menage wrote:
> > On Wed, Jul 1, 2009 at 7:17 PM, Li Zefan<[email protected]> wrote:
> >> But I guess we are going to fix the bug for 2.6.31? So is it ok to
> >> merge a new feature 'cgroup.procs' together into 2.6.31?
> >>
> >
> > Does this bug really need to be fixed for 2.6.31? I didn't think that
> > the namespace support in mainline was robust enough yet for people to
> > use them for virtual servers in production environments.
I don't know where the bar is for 'production environments', but I'd
have to claim that pid namespaces are there...
> If so, it's ok for me. Unless someone else has objections. Serge?
Well, on the one hand it's not a horrible bug in that at least it
won't crash the kernel. But what bugs me is that there is no
workaround for userspace, no way for an admin to know that if he
does for t in `cat /cgroup/victim/tasks`; do kill $t; done he
won't kill his mysql server.
I think that's a bad enough risk to make it worth trying to push
Li's patch. Surely changing Ben's procs file should be pretty
trivial to rebase?
thanks,
-serge
Quoting Li Zefan ([email protected]):
> The bug was introduced by commit cc31edceee04a7b87f2be48f9489ebb72d264844
> ("cgroups: convert tasks file to use a seq_file with shared pid array").
>
> We cache a pid array for all threads that are opening the same "tasks"
> file, but the pids in the array are always from the namespace of the
> last process that opened the file, so all other threads will read pids
> from that namespace instead of their own namespaces.
>
> To fix it, we maintain a list of pid arrays, which is keyed by pid_ns.
> The list will be of length 1 at most time.
>
> Reported-by: Paul Menage <[email protected]>
> Idea-by: Paul Menage <[email protected]>
> Signed-off-by: Li Zefan <[email protected]>
Reviewed-by: Serge Hallyn <[email protected]>
> ---
> include/linux/cgroup.h | 11 ++----
> kernel/cgroup.c | 94 +++++++++++++++++++++++++++++++++++------------
> 2 files changed, 74 insertions(+), 31 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 665fa70..20411d2 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -179,14 +179,11 @@ struct cgroup {
> */
> struct list_head release_list;
>
> - /* pids_mutex protects the fields below */
> + /* pids_mutex protects pids_list and cached pid arrays. */
> struct rw_semaphore pids_mutex;
> - /* Array of process ids in the cgroup */
> - pid_t *tasks_pids;
> - /* How many files are using the current tasks_pids array */
> - int pids_use_count;
> - /* Length of the current tasks_pids array */
> - int pids_length;
> +
> + /* Linked list of struct cgroup_pids */
> + struct list_head pids_list;
>
> /* For RCU-protected deletion */
> struct rcu_head rcu_head;
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 3737a68..13dddb4 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -47,6 +47,7 @@
> #include <linux/hash.h>
> #include <linux/namei.h>
> #include <linux/smp_lock.h>
> +#include <linux/pid_namespace.h>
>
> #include <asm/atomic.h>
>
> @@ -960,6 +961,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
> INIT_LIST_HEAD(&cgrp->children);
> INIT_LIST_HEAD(&cgrp->css_sets);
> INIT_LIST_HEAD(&cgrp->release_list);
> + INIT_LIST_HEAD(&cgrp->pids_list);
> init_rwsem(&cgrp->pids_mutex);
> }
> static void init_cgroup_root(struct cgroupfs_root *root)
> @@ -2201,12 +2203,30 @@ err:
> return ret;
> }
>
> +/*
> + * Cache pids for all threads in the same pid namespace that are
> + * opening the same "tasks" file.
> + */
> +struct cgroup_pids {
> + /* The node in cgrp->pids_list */
> + struct list_head list;
> + /* The cgroup those pids belong to */
> + struct cgroup *cgrp;
> + /* The namepsace those pids belong to */
> + struct pid_namespace *pid_ns;
> + /* Array of process ids in the cgroup */
> + pid_t *tasks_pids;
> + /* How many files are using the this tasks_pids array */
> + int pids_use_count;
> + /* Length of the current tasks_pids array */
> + int pids_length;
> +};
> +
> static int cmppid(const void *a, const void *b)
> {
> return *(pid_t *)a - *(pid_t *)b;
> }
>
> -
> /*
> * seq_file methods for the "tasks" file. The seq_file position is the
> * next pid to display; the seq_file iterator is a pointer to the pid
> @@ -2221,45 +2241,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
> * after a seek to the start). Use a binary-search to find the
> * next pid to display, if any
> */
> - struct cgroup *cgrp = s->private;
> + struct cgroup_pids *cp = s->private;
> + struct cgroup *cgrp = cp->cgrp;
> int index = 0, pid = *pos;
> int *iter;
>
> down_read(&cgrp->pids_mutex);
> if (pid) {
> - int end = cgrp->pids_length;
> + int end = cp->pids_length;
>
> while (index < end) {
> int mid = (index + end) / 2;
> - if (cgrp->tasks_pids[mid] == pid) {
> + if (cp->tasks_pids[mid] == pid) {
> index = mid;
> break;
> - } else if (cgrp->tasks_pids[mid] <= pid)
> + } else if (cp->tasks_pids[mid] <= pid)
> index = mid + 1;
> else
> end = mid;
> }
> }
> /* If we're off the end of the array, we're done */
> - if (index >= cgrp->pids_length)
> + if (index >= cp->pids_length)
> return NULL;
> /* Update the abstract position to be the actual pid that we found */
> - iter = cgrp->tasks_pids + index;
> + iter = cp->tasks_pids + index;
> *pos = *iter;
> return iter;
> }
>
> static void cgroup_tasks_stop(struct seq_file *s, void *v)
> {
> - struct cgroup *cgrp = s->private;
> + struct cgroup_pids *cp = s->private;
> + struct cgroup *cgrp = cp->cgrp;
> up_read(&cgrp->pids_mutex);
> }
>
> static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
> {
> - struct cgroup *cgrp = s->private;
> + struct cgroup_pids *cp = s->private;
> int *p = v;
> - int *end = cgrp->tasks_pids + cgrp->pids_length;
> + int *end = cp->tasks_pids + cp->pids_length;
>
> /*
> * Advance to the next pid in the array. If this goes off the
> @@ -2286,26 +2308,32 @@ static struct seq_operations cgroup_tasks_seq_operations = {
> .show = cgroup_tasks_show,
> };
>
> -static void release_cgroup_pid_array(struct cgroup *cgrp)
> +static void release_cgroup_pid_array(struct cgroup_pids *cp)
> {
> + struct cgroup *cgrp = cp->cgrp;
> +
> down_write(&cgrp->pids_mutex);
> - BUG_ON(!cgrp->pids_use_count);
> - if (!--cgrp->pids_use_count) {
> - kfree(cgrp->tasks_pids);
> - cgrp->tasks_pids = NULL;
> - cgrp->pids_length = 0;
> + BUG_ON(!cp->pids_use_count);
> + if (!--cp->pids_use_count) {
> + list_del(&cp->list);
> + kfree(cp->tasks_pids);
> + kfree(cp);
> }
> up_write(&cgrp->pids_mutex);
> }
>
> static int cgroup_tasks_release(struct inode *inode, struct file *file)
> {
> - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
> + struct seq_file *seq;
> + struct cgroup_pids *cp;
>
> if (!(file->f_mode & FMODE_READ))
> return 0;
>
> - release_cgroup_pid_array(cgrp);
> + seq = file->private_data;
> + cp = seq->private;
> +
> + release_cgroup_pid_array(cp);
> return seq_release(inode, file);
> }
>
> @@ -2324,6 +2352,8 @@ static struct file_operations cgroup_tasks_operations = {
> static int cgroup_tasks_open(struct inode *unused, struct file *file)
> {
> struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
> + struct cgroup_pids *cp;
> pid_t *pidarray;
> int npids;
> int retval;
> @@ -2350,20 +2380,36 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
> * array if necessary
> */
> down_write(&cgrp->pids_mutex);
> - kfree(cgrp->tasks_pids);
> - cgrp->tasks_pids = pidarray;
> - cgrp->pids_length = npids;
> - cgrp->pids_use_count++;
> +
> + list_for_each_entry(cp, &cgrp->pids_list, list) {
> + if (pid_ns == cp->pid_ns)
> + goto found;
> + }
> +
> + cp = kzalloc(sizeof(*cp), GFP_KERNEL);
> + if (!cp) {
> + up_write(&cgrp->pids_mutex);
> + kfree(pidarray);
> + return -ENOMEM;
> + }
> + cp->cgrp = cgrp;
> + cp->pid_ns = pid_ns;
> + list_add(&cp->list, &cgrp->pids_list);
> +found:
> + kfree(cp->tasks_pids);
> + cp->tasks_pids = pidarray;
> + cp->pids_length = npids;
> + cp->pids_use_count++;
> up_write(&cgrp->pids_mutex);
>
> file->f_op = &cgroup_tasks_operations;
>
> retval = seq_open(file, &cgroup_tasks_seq_operations);
> if (retval) {
> - release_cgroup_pid_array(cgrp);
> + release_cgroup_pid_array(cp);
> return retval;
> }
> - ((struct seq_file *)file->private_data)->private = cgrp;
> + ((struct seq_file *)file->private_data)->private = cp;
> return 0;
> }
>
> --
> 1.5.4.rc3
On Thu, Jul 2, 2009 at 6:26 AM, Serge E. Hallyn<[email protected]> wrote:
> Quoting Li Zefan ([email protected]):
>> Paul Menage wrote:
>> > On Wed, Jul 1, 2009 at 7:17 PM, Li Zefan<[email protected]> wrote:
>> >> But I guess we are going to fix the bug for 2.6.31? So is it ok to
>> >> merge a new feature 'cgroup.procs' together into 2.6.31?
>> >>
>> >
>> > Does this bug really need to be fixed for 2.6.31? I didn't think that
>> > the namespace support in mainline was robust enough yet for people to
>> > use them for virtual servers in production environments.
>
> I don't know where the bar is for 'production environments', but I'd
> have to claim that pid namespaces are there...
Well, pid namespaces are marked as experimental, as are user
namespaces (and were described as "very incomplete" a few months
back). Pid namespaces are useful for process migration (which is still
under development) or virtual servers (for which user namespaces are
pretty much essential). So I'm not sure quite what you'd use pid
namespaces for yet.
Paul
Quoting Paul Menage ([email protected]):
> On Thu, Jul 2, 2009 at 6:26 AM, Serge E. Hallyn<[email protected]> wrote:
> > Quoting Li Zefan ([email protected]):
> >> Paul Menage wrote:
> >> > On Wed, Jul 1, 2009 at 7:17 PM, Li Zefan<[email protected]> wrote:
> >> >> But I guess we are going to fix the bug for 2.6.31? So is it ok to
> >> >> merge a new feature 'cgroup.procs' together into 2.6.31?
> >> >>
> >> >
> >> > Does this bug really need to be fixed for 2.6.31? I didn't think that
> >> > the namespace support in mainline was robust enough yet for people to
> >> > use them for virtual servers in production environments.
> >
> > I don't know where the bar is for 'production environments', but I'd
> > have to claim that pid namespaces are there...
>
> Well, pid namespaces are marked as experimental, as are user
> namespaces (and were described as "very incomplete" a few months
incomplete (due to signaling issues which have mostly been resolved)
but stable and usable.
user namespace are a completely different story :)
> back). Pid namespaces are useful for process migration (which is still
> under development) or virtual servers (for which user namespaces are
> pretty much essential). So I'm not sure quite what you'd use pid
> namespaces for yet.
You don't need user namespaces to use pid namespaces for virtual
servers (depending on your use).
Now the fact remains this is a hard to trigger bug which doesn't
corrupt the kernel, and - to take back what I said earlier - userspace
can work around it by simply freezing the cgroup before reading its
tasks file.
So I guess I can go either way...  If Li's patch were more complicated
I'd definitely be for waiting.  But I do object to the general process
of making a fix of a pretty bad bug depend on an unrelated new feature!
-serge
On Wed, Jul 1, 2009 at 6:24 PM, Li Zefan<[email protected]> wrote:
> The bug was introduced by commit cc31edceee04a7b87f2be48f9489ebb72d264844
> ("cgroups: convert tasks file to use a seq_file with shared pid array").
>
> We cache a pid array for all threads that are opening the same "tasks"
> file, but the pids in the array are always from the namespace of the
> last process that opened the file, so all other threads will read pids
> from that namespace instead of their own namespaces.
>
> To fix it, we maintain a list of pid arrays, which is keyed by pid_ns.
> The list will be of length 1 at most time.
>
> Reported-by: Paul Menage <[email protected]>
> Idea-by: Paul Menage <[email protected]>
> Signed-off-by: Li Zefan <[email protected]>
> ---
> ?include/linux/cgroup.h | ? 11 ++----
> ?kernel/cgroup.c ? ? ? ?| ? 94 +++++++++++++++++++++++++++++++++++------------
> ?2 files changed, 74 insertions(+), 31 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 665fa70..20411d2 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -179,14 +179,11 @@ struct cgroup {
> ? ? ? ? */
> ? ? ? ?struct list_head release_list;
>
> - ? ? ? /* pids_mutex protects the fields below */
> + ? ? ? /* pids_mutex protects pids_list and cached pid arrays. */
> ? ? ? ?struct rw_semaphore pids_mutex;
> - ? ? ? /* Array of process ids in the cgroup */
> - ? ? ? pid_t *tasks_pids;
> - ? ? ? /* How many files are using the current tasks_pids array */
> - ? ? ? int pids_use_count;
> - ? ? ? /* Length of the current tasks_pids array */
> - ? ? ? int pids_length;
> +
> + ? ? ? /* Linked list of struct cgroup_pids */
> + ? ? ? struct list_head pids_list;
>
> ? ? ? ?/* For RCU-protected deletion */
> ? ? ? ?struct rcu_head rcu_head;
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 3737a68..13dddb4 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -47,6 +47,7 @@
> ?#include <linux/hash.h>
> ?#include <linux/namei.h>
> ?#include <linux/smp_lock.h>
> +#include <linux/pid_namespace.h>
>
> ?#include <asm/atomic.h>
>
> @@ -960,6 +961,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
> ? ? ? ?INIT_LIST_HEAD(&cgrp->children);
> ? ? ? ?INIT_LIST_HEAD(&cgrp->css_sets);
> ? ? ? ?INIT_LIST_HEAD(&cgrp->release_list);
> + ? ? ? INIT_LIST_HEAD(&cgrp->pids_list);
> ? ? ? ?init_rwsem(&cgrp->pids_mutex);
> ?}
> ?static void init_cgroup_root(struct cgroupfs_root *root)
> @@ -2201,12 +2203,30 @@ err:
> ? ? ? ?return ret;
> ?}
>
> +/*
> + * Cache pids for all threads in the same pid namespace that are
> + * opening the same "tasks" file.
> + */
> +struct cgroup_pids {
> + ? ? ? /* The node in cgrp->pids_list */
> + ? ? ? struct list_head list;
> + ? ? ? /* The cgroup those pids belong to */
> + ? ? ? struct cgroup *cgrp;
> + ? ? ? /* The namepsace those pids belong to */
> + ? ? ? struct pid_namespace *pid_ns;
> + ? ? ? /* Array of process ids in the cgroup */
> + ? ? ? pid_t *tasks_pids;
> + ? ? ? /* How many files are using the this tasks_pids array */
> + ? ? ? int pids_use_count;
> + ? ? ? /* Length of the current tasks_pids array */
> + ? ? ? int pids_length;
> +};
Maybe lose the unnecessary "pids" suffixes and prefixes in this structure?
>
> @@ -2324,6 +2352,8 @@ static struct file_operations cgroup_tasks_operations = {
> ?static int cgroup_tasks_open(struct inode *unused, struct file *file)
> ?{
> ? ? ? ?struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
> + ? ? ? struct pid_namespace *pid_ns = task_active_pid_ns(current);
> + ? ? ? struct cgroup_pids *cp;
> ? ? ? ?pid_t *pidarray;
> ? ? ? ?int npids;
> ? ? ? ?int retval;
> @@ -2350,20 +2380,36 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
> ? ? ? ? * array if necessary
> ? ? ? ? */
> ? ? ? ?down_write(&cgrp->pids_mutex);
> - ? ? ? kfree(cgrp->tasks_pids);
> - ? ? ? cgrp->tasks_pids = pidarray;
> - ? ? ? cgrp->pids_length = npids;
> - ? ? ? cgrp->pids_use_count++;
> +
> + ? ? ? list_for_each_entry(cp, &cgrp->pids_list, list) {
> + ? ? ? ? ? ? ? if (pid_ns == cp->pid_ns)
> + ? ? ? ? ? ? ? ? ? ? ? goto found;
> + ? ? ? }
> +
> + ? ? ? cp = kzalloc(sizeof(*cp), GFP_KERNEL);
> + ? ? ? if (!cp) {
> + ? ? ? ? ? ? ? up_write(&cgrp->pids_mutex);
> + ? ? ? ? ? ? ? kfree(pidarray);
> + ? ? ? ? ? ? ? return -ENOMEM;
> + ? ? ? }
> + ? ? ? cp->cgrp = cgrp;
> + ? ? ? cp->pid_ns = pid_ns;
You're storing an uncounted reference to the pid ns here - there's no
guarantee that the pid_ns will outlive the open file.
Paul
On Thu, Jul 2, 2009 at 9:15 AM, Serge E. Hallyn<[email protected]> wrote:
>>
>> Well, pid namespaces are marked as experimental, as are user
>> namespaces (and were described as "very incomplete" a few months
>
> incomplete (due to signaling issues which have mostly been resolved)
> but stable and usable.
>
> user namespace are a completely different story :)
Sorry, that wasn't clear - the "very incomplete" referred to user namespaces.
Paul
Quoting Paul Menage ([email protected]):
> On Wed, Jul 1, 2009 at 6:24 PM, Li Zefan<[email protected]> wrote:
> > + ? ? ? cp = kzalloc(sizeof(*cp), GFP_KERNEL);
> > + ? ? ? if (!cp) {
> > + ? ? ? ? ? ? ? up_write(&cgrp->pids_mutex);
> > + ? ? ? ? ? ? ? kfree(pidarray);
> > + ? ? ? ? ? ? ? return -ENOMEM;
> > + ? ? ? }
> > + ? ? ? cp->cgrp = cgrp;
> > + ? ? ? cp->pid_ns = pid_ns;
>
> You're storing an uncounted reference to the pid ns here - there's no
> guarantee that the pid_ns will outlive the open file.
Yeah I was thinking about that, but
1. the only way it won't outlive the open file is if the
task opens the file, hands the open fd over a
unix socket, then exits as the last task of its
pidns
2. We don't dereference the pid_ns, so there is no actual
safety issue. So it would become a problem only
if a new pidns gets created at that same address
*and* a task in the new pidns opens the same
tasks file.
Still, it wouldn't hurt to do get_pid_ns/put_pid_ns at the open
and release :)
-serge
On Thu, Jul 2, 2009 at 9:37 AM, Serge E. Hallyn<[email protected]> wrote:
>
> ? ? ? ?1. the only way it won't outlive the open file is if the
> ? ? ? ? ? ? ? ?task opens the file, hands the open fd over a
> ? ? ? ? ? ? ? ?unix socket, then exits as the last task of its
> ? ? ? ? ? ? ? ?pidns
Right.
> ? ? ? ?2. We don't dereference the pid_ns, so there is no actual
> ? ? ? ? ? ? ? ?safety issue. ?So it would become a problem only
> ? ? ? ? ? ? ? ?if a new pidns gets created at that same address
Which is fairly likely given that pid_namespace is allocated from a
specific cache.
Paul
Quoting Paul Menage ([email protected]):
> On Thu, Jul 2, 2009 at 9:37 AM, Serge E. Hallyn<[email protected]> wrote:
> >
> > ? ? ? ?1. the only way it won't outlive the open file is if the
> > ? ? ? ? ? ? ? ?task opens the file, hands the open fd over a
> > ? ? ? ? ? ? ? ?unix socket, then exits as the last task of its
> > ? ? ? ? ? ? ? ?pidns
>
> Right.
>
> > ? ? ? ?2. We don't dereference the pid_ns, so there is no actual
> > ? ? ? ? ? ? ? ?safety issue. ?So it would become a problem only
> > ? ? ? ? ? ? ? ?if a new pidns gets created at that same address
>
> Which is fairly likely given that pid_namespace is allocated from a
> specific cache.
>
> Paul
The scenario as a whole is still pretty unlikely, but there's just
no reason to risk it.
-serge
On Thu, 2 Jul 2009 09:27:20 -0700
Paul Menage <[email protected]> wrote:
> On Thu, Jul 2, 2009 at 9:15 AM, Serge E. Hallyn<[email protected]> wrote:
> >>
> >> Well, pid namespaces are marked as experimental, as are user
> >> namespaces (and were described as "very incomplete" a few months
> >
> > incomplete (due to signaling issues which have mostly been resolved)
> > but stable and usable.
> >
> > user namespace are a completely different story :)
>
> Sorry, that wasn't clear - the "very incomplete" referred to user namespaces.
>
I do think we should lean toward fixing it. After all, it used to work
OK and now it doesn't.
It is a three-minute matter of patch-wrangling to take the fix out
again within a more comprehensive 2.6.32 patch series, so that's not an
issue.
So are there any strong (enough) objections to putting this into 2.6.31?
From: Li Zefan <[email protected]>
The bug was introduced by commit cc31edceee04a7b87f2be48f9489ebb72d264844
("cgroups: convert tasks file to use a seq_file with shared pid array").
We cache a pid array for all threads that are opening the same "tasks"
file, but the pids in the array are always from the namespace of the
last process that opened the file, so all other threads will read pids
from that namespace instead of their own namespaces.
To fix it, we maintain a list of pid arrays, which is keyed by pid_ns.
The list will be of length 1 most of the time.
Reported-by: Paul Menage <[email protected]>
Idea-by: Paul Menage <[email protected]>
Signed-off-by: Li Zefan <[email protected]>
Reviewed-by: Serge Hallyn <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---
include/linux/cgroup.h | 11 +---
kernel/cgroup.c | 94 +++++++++++++++++++++++++++++----------
2 files changed, 74 insertions(+), 31 deletions(-)
diff -puN include/linux/cgroup.h~cgroups-fix-pid-namespace-bug include/linux/cgroup.h
--- a/include/linux/cgroup.h~cgroups-fix-pid-namespace-bug
+++ a/include/linux/cgroup.h
@@ -179,14 +179,11 @@ struct cgroup {
*/
struct list_head release_list;
- /* pids_mutex protects the fields below */
+ /* pids_mutex protects pids_list and cached pid arrays. */
struct rw_semaphore pids_mutex;
- /* Array of process ids in the cgroup */
- pid_t *tasks_pids;
- /* How many files are using the current tasks_pids array */
- int pids_use_count;
- /* Length of the current tasks_pids array */
- int pids_length;
+
+ /* Linked list of struct cgroup_pids */
+ struct list_head pids_list;
/* For RCU-protected deletion */
struct rcu_head rcu_head;
diff -puN kernel/cgroup.c~cgroups-fix-pid-namespace-bug kernel/cgroup.c
--- a/kernel/cgroup.c~cgroups-fix-pid-namespace-bug
+++ a/kernel/cgroup.c
@@ -47,6 +47,7 @@
#include <linux/hash.h>
#include <linux/namei.h>
#include <linux/smp_lock.h>
+#include <linux/pid_namespace.h>
#include <asm/atomic.h>
@@ -961,6 +962,7 @@ static void init_cgroup_housekeeping(str
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
+ INIT_LIST_HEAD(&cgrp->pids_list);
init_rwsem(&cgrp->pids_mutex);
}
static void init_cgroup_root(struct cgroupfs_root *root)
@@ -2202,12 +2204,30 @@ err:
return ret;
}
+/*
+ * Cache pids for all threads in the same pid namespace that are
+ * opening the same "tasks" file.
+ */
+struct cgroup_pids {
+ /* The node in cgrp->pids_list */
+ struct list_head list;
+ /* The cgroup those pids belong to */
+ struct cgroup *cgrp;
+ /* The namespace those pids belong to */
+ struct pid_namespace *pid_ns;
+ /* Array of process ids in the cgroup */
+ pid_t *tasks_pids;
+ /* How many files are using this tasks_pids array */
+ int pids_use_count;
+ /* Length of the current tasks_pids array */
+ int pids_length;
+};
+
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
-
/*
* seq_file methods for the "tasks" file. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
@@ -2222,45 +2242,47 @@ static void *cgroup_tasks_start(struct s
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
+ struct cgroup *cgrp = cp->cgrp;
int index = 0, pid = *pos;
int *iter;
down_read(&cgrp->pids_mutex);
if (pid) {
- int end = cgrp->pids_length;
+ int end = cp->pids_length;
while (index < end) {
int mid = (index + end) / 2;
- if (cgrp->tasks_pids[mid] == pid) {
+ if (cp->tasks_pids[mid] == pid) {
index = mid;
break;
- } else if (cgrp->tasks_pids[mid] <= pid)
+ } else if (cp->tasks_pids[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
- if (index >= cgrp->pids_length)
+ if (index >= cp->pids_length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
- iter = cgrp->tasks_pids + index;
+ iter = cp->tasks_pids + index;
*pos = *iter;
return iter;
}
static void cgroup_tasks_stop(struct seq_file *s, void *v)
{
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
+ struct cgroup *cgrp = cp->cgrp;
up_read(&cgrp->pids_mutex);
}
static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
int *p = v;
- int *end = cgrp->tasks_pids + cgrp->pids_length;
+ int *end = cp->tasks_pids + cp->pids_length;
/*
* Advance to the next pid in the array. If this goes off the
@@ -2287,26 +2309,32 @@ static struct seq_operations cgroup_task
.show = cgroup_tasks_show,
};
-static void release_cgroup_pid_array(struct cgroup *cgrp)
+static void release_cgroup_pid_array(struct cgroup_pids *cp)
{
+ struct cgroup *cgrp = cp->cgrp;
+
down_write(&cgrp->pids_mutex);
- BUG_ON(!cgrp->pids_use_count);
- if (!--cgrp->pids_use_count) {
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = NULL;
- cgrp->pids_length = 0;
+ BUG_ON(!cp->pids_use_count);
+ if (!--cp->pids_use_count) {
+ list_del(&cp->list);
+ kfree(cp->tasks_pids);
+ kfree(cp);
}
up_write(&cgrp->pids_mutex);
}
static int cgroup_tasks_release(struct inode *inode, struct file *file)
{
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct seq_file *seq;
+ struct cgroup_pids *cp;
if (!(file->f_mode & FMODE_READ))
return 0;
- release_cgroup_pid_array(cgrp);
+ seq = file->private_data;
+ cp = seq->private;
+
+ release_cgroup_pid_array(cp);
return seq_release(inode, file);
}
@@ -2325,6 +2353,8 @@ static struct file_operations cgroup_tas
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
+ struct cgroup_pids *cp;
pid_t *pidarray;
int npids;
int retval;
@@ -2351,20 +2381,36 @@ static int cgroup_tasks_open(struct inod
* array if necessary
*/
down_write(&cgrp->pids_mutex);
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = pidarray;
- cgrp->pids_length = npids;
- cgrp->pids_use_count++;
+
+ list_for_each_entry(cp, &cgrp->pids_list, list) {
+ if (pid_ns == cp->pid_ns)
+ goto found;
+ }
+
+ cp = kzalloc(sizeof(*cp), GFP_KERNEL);
+ if (!cp) {
+ up_write(&cgrp->pids_mutex);
+ kfree(pidarray);
+ return -ENOMEM;
+ }
+ cp->cgrp = cgrp;
+ cp->pid_ns = pid_ns;
+ list_add(&cp->list, &cgrp->pids_list);
+found:
+ kfree(cp->tasks_pids);
+ cp->tasks_pids = pidarray;
+ cp->pids_length = npids;
+ cp->pids_use_count++;
up_write(&cgrp->pids_mutex);
file->f_op = &cgroup_tasks_operations;
retval = seq_open(file, &cgroup_tasks_seq_operations);
if (retval) {
- release_cgroup_pid_array(cgrp);
+ release_cgroup_pid_array(cp);
return retval;
}
- ((struct seq_file *)file->private_data)->private = cgrp;
+ ((struct seq_file *)file->private_data)->private = cp;
return 0;
}
_
On Thu, Jul 2, 2009 at 4:20 PM, Andrew Morton<[email protected]> wrote:
>
> I do think we should lean toward fixing it. After all, it used to work
> OK and now it doesn't.
>
> It is a three-minute matter of patch-wrangling to take the fix out
> again within a more comprehensive 2.6.32 patch series, so that's not an
> issue.
>
> So are there any strong (enough) objections to putting this into 2.6.31?
>
I guess no strong objection on the principle, but the patch as it
stands is missing a couple of pid_ns refcount operations in order to
be a complete fix (IMO).
Paul
Paul Menage <[email protected]> writes:
> On Thu, Jul 2, 2009 at 6:26 AM, Serge E. Hallyn<[email protected]> wrote:
>> Quoting Li Zefan ([email protected]):
>>> Paul Menage wrote:
>>> > On Wed, Jul 1, 2009 at 7:17 PM, Li Zefan<[email protected]> wrote:
>>> >> But I guess we are going to fix the bug for 2.6.31? So is it ok to
>>> >> merge a new feature 'cgroup.procs' together into 2.6.31?
>>> >>
>>> >
>>> > Does this bug really need to be fixed for 2.6.31? I didn't think that
>>> > the namespace support in mainline was robust enough yet for people to
>>> > use them for virtual servers in production environments.
>>
>> I don't know where the bar is for 'production environments', but I'd
>> have to claim that pid namespaces are there...
>
> Well, pid namespaces are marked as experimental, as are user
> namespaces (and were described as "very incomplete" a few months
> back). Pid namespaces are useful for process migration (which is still
> under development) or virtual servers (for which user namespaces are
> pretty much essential). So I'm not sure quite what you'd use pid
> namespaces for yet.
I have pid namespaces in pretty heavy use already.
Inescapable process groups are quite handy.
Eric