2009-07-31 01:51:46

by Ben Blum

Subject: [PATCH v2 0/6] CGroups: cgroup memberlist enhancement+fix

(This patch series is a revision of http://lkml.org/lkml/2009/7/23/330 .)

The following series adds a "cgroup.procs" file to each cgroup that reports
unique tgids rather than pids, and which can be written to in order to move
all threads in a threadgroup at once.
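
As an illustration of the intended interface, here is a minimal userspace
sketch (the /dev/cgroup/foo mount point and the tgid value are hypothetical):

#include <stdio.h>

/* hypothetical example: move every thread of the group whose tgid is
 * 1000 into the cgroup mounted at /dev/cgroup/foo, with a single write */
int main(void)
{
	FILE *f = fopen("/dev/cgroup/foo/cgroup.procs", "w");

	if (!f)
		return 1;
	fprintf(f, "%d\n", 1000);	/* must be a threadgroup leader's tgid */
	return fclose(f) ? 1 : 0;	/* write errors may surface at close */
}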

Patch #5 modifies the subsystem attach interface so that whole threadgroups
can be attached at once, which could introduce problems if any subsystem were
to need the old cgroup of every thread being moved.

Patch #6 introduces a new per-threadgroup rwsem that's taken for reading in
the fork() path, and reveals a potential race when a fork occurs before a
subsystem's attach() function is called. One possible solution, should any
subsystem have this problem, is to hang on to the group's fork mutex through
the attach() calls.
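
Expressed as code, that mitigation would look roughly like the sketch below
(using the names patch #6 introduces; this is not part of the series):

/* sketch only: widen the write-side critical section in
 * cgroup_attach_proc() so it also covers the subsystem attach calls */
down_write(&leader->cgroup_fork_mutex);
/* ... migrate the leader and every thread in the group ... */
for_each_subsys(root, ss) {
	if (ss->attach)
		ss->attach(ss, cgrp, oldcgrp, leader, true);
}
up_write(&leader->cgroup_fork_mutex);	/* currently dropped before attach */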

This patch series was written at the same time as Li Zefan's pid namespace
bugfix patch (from http://lkml.org/lkml/2009/7/1/559 ), and contains a similar
but finer-grained fix for the same bug. These patches can either be rewritten
to apply on top of Li's patch, or be applied as they are with Li's patch
reverted.

---

Ben Blum (6):
Makes procs file writable to move all threads by tgid at once
Lets ss->can_attach and ss->attach do whole threadgroups at a time
Changes css_set freeing mechanism to be under RCU
Quick vmalloc vs kmalloc fix to the case where array size is too large
Ensures correct concurrent opening/reading of pidlists across pid namespaces
Adds a read-only "procs" file similar to "tasks" that shows only unique tgids


 Documentation/cgroups/cgroups.txt |   12 +
 include/linux/cgroup.h            |   68 ++-
 include/linux/init_task.h         |    9
 include/linux/sched.h             |    2
 kernel/cgroup.c                   |  810 ++++++++++++++++++++++++++++++-------
 kernel/cgroup_freezer.c           |   15 +
 kernel/cpuset.c                   |   65 ++-
 kernel/fork.c                     |    6
 kernel/ns_cgroup.c                |   16 +
 kernel/sched.c                    |   37 ++
 mm/memcontrol.c                   |    3
 security/device_cgroup.c          |    3
 12 files changed, 855 insertions(+), 191 deletions(-)


2009-07-31 01:52:03

by Ben Blum

Subject: [PATCH 1/6] Adds a read-only "procs" file similar to "tasks" that shows only unique tgids

Adds a read-only "procs" file similar to "tasks" that shows only unique tgids

struct cgroup used to have a bunch of fields for keeping track of the pidlist
for the tasks file. Those are now separated into a new struct cgroup_pidlist,
of which the cgroup keeps two, one for procs and one for tasks. The seq_file
operations are also changed so that just the pidlist struct gets passed around
as the private data.
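
Condensed from cgroup_pidlist_open() in the diff below, the open path now
hands the pidlist itself to the seq_file machinery:

/* populate the array, then stash the pidlist in the seq_file's private
 * pointer, where the start/next/stop/show operations will find it */
retval = pidlist_array_load(cgrp, procs);
if (retval)
	return retval;
file->f_op = &cgroup_pidlist_operations;
retval = seq_open(file, &cgroup_pidlist_seq_operations);
if (retval) {
	cgroup_release_pid_array(l);
	return retval;
}
((struct seq_file *)file->private_data)->private = l;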

Interface example: Suppose a multithreaded process has pid 1000 and other
threads with ids 1001, 1002, 1003:
$ cat tasks
1000
1001
1002
1003
$ cat cgroup.procs
1000
$

Signed-off-by: Ben Blum <[email protected]>

---

 include/linux/cgroup.h |   22 ++--
 kernel/cgroup.c        |  278 ++++++++++++++++++++++++++++++------------------
 2 files changed, 186 insertions(+), 114 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 665fa70..8a3a3ac 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -141,6 +141,17 @@ enum {
CGRP_WAIT_ON_RMDIR,
};

+struct cgroup_pidlist {
+ /* protects the other fields */
+ struct rw_semaphore mutex;
+ /* array of xids */
+ pid_t *list;
+ /* how many elements the above list has */
+ int length;
+ /* how many files are using the current array */
+ int use_count;
+};
+
struct cgroup {
unsigned long flags; /* "unsigned long" so bitops work */

@@ -179,14 +190,9 @@ struct cgroup {
*/
struct list_head release_list;

- /* pids_mutex protects the fields below */
- struct rw_semaphore pids_mutex;
- /* Array of process ids in the cgroup */
- pid_t *tasks_pids;
- /* How many files are using the current tasks_pids array */
- int pids_use_count;
- /* Length of the current tasks_pids array */
- int pids_length;
+ /* we will have two separate pidlists, one for pids (the tasks file)
+ * and one for tgids (the procs file). */
+ struct cgroup_pidlist tasks, procs;

/* For RCU-protected deletion */
struct rcu_head rcu_head;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3737a68..4ceee3f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -960,7 +960,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
- init_rwsem(&cgrp->pids_mutex);
+ init_rwsem(&(cgrp->tasks.mutex));
+ init_rwsem(&(cgrp->procs.mutex));
}
static void init_cgroup_root(struct cgroupfs_root *root)
{
@@ -1408,15 +1409,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
return ret;
}

-/* The various types of files and directories in a cgroup file system */
-enum cgroup_filetype {
- FILE_ROOT,
- FILE_DIR,
- FILE_TASKLIST,
- FILE_NOTIFY_ON_RELEASE,
- FILE_RELEASE_AGENT,
-};
-
/**
* cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
* @cgrp: the cgroup to be checked for liveness
@@ -2114,7 +2106,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
}

/*
- * Stuff for reading the 'tasks' file.
+ * Stuff for reading the 'tasks'/'procs' files.
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
@@ -2124,27 +2116,106 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
*/

/*
- * Load into 'pidarray' up to 'npids' of the tasks using cgroup
- * 'cgrp'. Return actual number of pids loaded. No need to
- * task_lock(p) when reading out p->cgroup, since we're in an RCU
- * read section, so the css_set can't go away, and is
- * immutable after creation.
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * If the new stripped list is sufficiently smaller and there's enough memory
+ * to allocate a new buffer, will let go of the unneeded memory. Returns the
+ * number of unique elements.
*/
-static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
+/* is the size difference enough that we should re-allocate the array? */
+#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
+static int pidlist_uniq(pid_t **p, int length)
{
- int n = 0, pid;
+ int src, dest = 1;
+ pid_t *list = *p;
+ pid_t *newlist;
+
+ /*
+ * we presume the 0th element is unique, so i starts at 1. trivial
+ * edge cases first; no work needs to be done for either
+ */
+ if (length == 0 || length == 1)
+ return length;
+ /* src and dest walk down the list; dest counts unique elements */
+ for (src = 1; src < length; src++) {
+ /* find next unique element */
+ while (list[src] == list[src-1]) {
+ src++;
+ if (src == length)
+ goto after;
+ }
+ /* dest always points to where the next unique element goes */
+ list[dest] = list[src];
+ dest++;
+ }
+after:
+ /*
+ * if the length difference is large enough, we want to allocate a
+ * smaller buffer to save memory. if this fails due to out of memory,
+ * we'll just stay with what we've got.
+ */
+ if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
+ newlist = krealloc(list, dest * sizeof(pid_t), GFP_KERNEL);
+ if (newlist)
+ *p = newlist;
+ }
+ return dest;
+}
+
+static int cmppid(const void *a, const void *b)
+{
+ return *(pid_t *)a - *(pid_t *)b;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, bool procs)
+{
+ pid_t *array;
+ int length;
+ int pid, n = 0; /* used for populating the array */
struct cgroup_iter it;
struct task_struct *tsk;
+ struct cgroup_pidlist *l;
+
+ /*
+ * If cgroup gets more users after we read count, we won't have
+ * enough space - tough. This race is indistinguishable to the
+ * caller from the case that the additional cgroup users didn't
+ * show up until sometime later on.
+ */
+ length = cgroup_task_count(cgrp);
+ array = kmalloc(length * sizeof(pid_t), GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+ /* now, populate the array */
cgroup_iter_start(cgrp, &it);
while ((tsk = cgroup_iter_next(cgrp, &it))) {
- if (unlikely(n == npids))
+ if (unlikely(n == length))
break;
- pid = task_pid_vnr(tsk);
- if (pid > 0)
- pidarray[n++] = pid;
+ /* get tgid or pid for procs or tasks file respectively */
+ pid = (procs ? task_tgid_vnr(tsk) : task_pid_vnr(tsk));
+ if (pid > 0) /* make sure to only use valid results */
+ array[n++] = pid;
}
cgroup_iter_end(cgrp, &it);
- return n;
+ length = n;
+ /* now sort & (if procs) strip out duplicates */
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
+ if (procs) {
+ length = pidlist_uniq(&array, length);
+ l = &(cgrp->procs);
+ } else {
+ l = &(cgrp->tasks);
+ }
+ /* store array in cgroup, freeing old if necessary */
+ down_write(&l->mutex);
+ kfree(l->list);
+ l->list = array;
+ l->length = length;
+ l->use_count++;
+ up_write(&l->mutex);
+ return 0;
}

/**
@@ -2201,19 +2272,14 @@ err:
return ret;
}

-static int cmppid(const void *a, const void *b)
-{
- return *(pid_t *)a - *(pid_t *)b;
-}
-

/*
- * seq_file methods for the "tasks" file. The seq_file position is the
+ * seq_file methods for the tasks/procs files. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->tasks_pids array.
+ * in the cgroup->l->list array.
*/

-static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
/*
* Initially we receive a position value that corresponds to
@@ -2221,46 +2287,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup *cgrp = s->private;
+ struct cgroup_pidlist *l = s->private;
int index = 0, pid = *pos;
int *iter;

- down_read(&cgrp->pids_mutex);
+ down_read(&l->mutex);
if (pid) {
- int end = cgrp->pids_length;
+ int end = l->length;

while (index < end) {
int mid = (index + end) / 2;
- if (cgrp->tasks_pids[mid] == pid) {
+ if (l->list[mid] == pid) {
index = mid;
break;
- } else if (cgrp->tasks_pids[mid] <= pid)
+ } else if (l->list[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
- if (index >= cgrp->pids_length)
+ if (index >= l->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
- iter = cgrp->tasks_pids + index;
+ iter = l->list + index;
*pos = *iter;
return iter;
}

-static void cgroup_tasks_stop(struct seq_file *s, void *v)
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
- struct cgroup *cgrp = s->private;
- up_read(&cgrp->pids_mutex);
+ struct cgroup_pidlist *l = s->private;
+ up_read(&l->mutex);
}

-static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup *cgrp = s->private;
- int *p = v;
- int *end = cgrp->tasks_pids + cgrp->pids_length;
-
+ struct cgroup_pidlist *l = s->private;
+ pid_t *p = v;
+ pid_t *end = l->list + l->length;
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
@@ -2274,98 +2339,94 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
}
}

-static int cgroup_tasks_show(struct seq_file *s, void *v)
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
return seq_printf(s, "%d\n", *(int *)v);
}

-static struct seq_operations cgroup_tasks_seq_operations = {
- .start = cgroup_tasks_start,
- .stop = cgroup_tasks_stop,
- .next = cgroup_tasks_next,
- .show = cgroup_tasks_show,
+/*
+ * seq_operations functions for iterating on pidlists through seq_file -
+ * independent of whether it's tasks or procs
+ */
+static const struct seq_operations cgroup_pidlist_seq_operations = {
+ .start = cgroup_pidlist_start,
+ .stop = cgroup_pidlist_stop,
+ .next = cgroup_pidlist_next,
+ .show = cgroup_pidlist_show,
};

-static void release_cgroup_pid_array(struct cgroup *cgrp)
+static void cgroup_release_pid_array(struct cgroup_pidlist *l)
{
- down_write(&cgrp->pids_mutex);
- BUG_ON(!cgrp->pids_use_count);
- if (!--cgrp->pids_use_count) {
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = NULL;
- cgrp->pids_length = 0;
+ down_write(&l->mutex);
+ BUG_ON(!l->use_count);
+ if (!--l->use_count) {
+ kfree(l->list);
+ l->list = NULL;
+ l->length = 0;
}
- up_write(&cgrp->pids_mutex);
+ up_write(&l->mutex);
}

-static int cgroup_tasks_release(struct inode *inode, struct file *file)
+static int cgroup_pidlist_release(struct inode *inode, struct file *file)
{
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-
+ struct cgroup_pidlist *l;
if (!(file->f_mode & FMODE_READ))
return 0;
-
- release_cgroup_pid_array(cgrp);
+ /*
+ * the seq_file will only be initialized if the file was opened for
+ * reading; hence we check if it's not null only in that case.
+ */
+ l = ((struct seq_file *)file->private_data)->private;
+ cgroup_release_pid_array(l);
return seq_release(inode, file);
}

-static struct file_operations cgroup_tasks_operations = {
+static const struct file_operations cgroup_pidlist_operations = {
.read = seq_read,
.llseek = seq_lseek,
.write = cgroup_file_write,
- .release = cgroup_tasks_release,
+ .release = cgroup_pidlist_release,
};

/*
- * Handle an open on 'tasks' file. Prepare an array containing the
- * process id's of tasks currently attached to the cgroup being opened.
+ * The following functions handle opens on a file that displays a pidlist
+ * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
+ * in the cgroup.
*/
-
-static int cgroup_tasks_open(struct inode *unused, struct file *file)
+/* helper function for the two below it */
+static int cgroup_pidlist_open(struct file *file, bool procs)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- pid_t *pidarray;
- int npids;
+ struct cgroup_pidlist *l = (procs ? &cgrp->procs : &cgrp->tasks);
int retval;

/* Nothing to do for write-only files */
if (!(file->f_mode & FMODE_READ))
return 0;

- /*
- * If cgroup gets more users after we read count, we won't have
- * enough space - tough. This race is indistinguishable to the
- * caller from the case that the additional cgroup users didn't
- * show up until sometime later on.
- */
- npids = cgroup_task_count(cgrp);
- pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
- if (!pidarray)
- return -ENOMEM;
- npids = pid_array_load(pidarray, npids, cgrp);
- sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
- /*
- * Store the array in the cgroup, freeing the old
- * array if necessary
- */
- down_write(&cgrp->pids_mutex);
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = pidarray;
- cgrp->pids_length = npids;
- cgrp->pids_use_count++;
- up_write(&cgrp->pids_mutex);
-
- file->f_op = &cgroup_tasks_operations;
+ /* have the array populated */
+ retval = pidlist_array_load(cgrp, procs);
+ if (retval)
+ return retval;
+ /* configure file information */
+ file->f_op = &cgroup_pidlist_operations;

- retval = seq_open(file, &cgroup_tasks_seq_operations);
+ retval = seq_open(file, &cgroup_pidlist_seq_operations);
if (retval) {
- release_cgroup_pid_array(cgrp);
+ cgroup_release_pid_array(l);
return retval;
}
- ((struct seq_file *)file->private_data)->private = cgrp;
+ ((struct seq_file *)file->private_data)->private = l;
return 0;
}
+static int cgroup_tasks_open(struct inode *unused, struct file *file)
+{
+ return cgroup_pidlist_open(file, false);
+}
+static int cgroup_procs_open(struct inode *unused, struct file *file)
+{
+ return cgroup_pidlist_open(file, true);
+}

static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
struct cftype *cft)
@@ -2388,21 +2449,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
/*
* for the common functions, 'private' gives the type of file
*/
+/* for hysterical raisins, we can't put this on the older files */
+#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
static struct cftype files[] = {
{
.name = "tasks",
.open = cgroup_tasks_open,
.write_u64 = cgroup_tasks_write,
- .release = cgroup_tasks_release,
- .private = FILE_TASKLIST,
+ .release = cgroup_pidlist_release,
.mode = S_IRUGO | S_IWUSR,
},
-
+ {
+ .name = CGROUP_FILE_GENERIC_PREFIX "procs",
+ .open = cgroup_procs_open,
+ /* .write_u64 = cgroup_procs_write, TODO */
+ .release = cgroup_pidlist_release,
+ .mode = S_IRUGO,
+ },
{
.name = "notify_on_release",
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
- .private = FILE_NOTIFY_ON_RELEASE,
},
};

@@ -2411,7 +2478,6 @@ static struct cftype cft_release_agent = {
.read_seq_string = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
.max_write_len = PATH_MAX,
- .private = FILE_RELEASE_AGENT,
};

static int cgroup_populate_dir(struct cgroup *cgrp)

2009-07-31 01:51:59

by Ben Blum

Subject: [PATCH 2/6] Ensures correct concurrent opening/reading of pidlists across pid namespaces

Ensures correct concurrent opening/reading of pidlists across pid namespaces

Previously, two processes from different pid namespaces reading the tasks or
procs file could each see pids from the other's namespace. Rather than one
pidlist for each file in a cgroup, we now keep a list of pidlists keyed by
namespace and file type (tasks versus procs), in which entries are created on
demand. Each pidlist has its own lock, and because the pidlists themselves are
passed around in the seq_file's private pointer, we don't have to touch the
cgroup or its master list except when creating and destroying entries.
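
Condensed from cgroup_pidlist_find() in the diff below, the lookup keys on
the pair (file type, pid namespace):

/* each reader gets the pidlist matching its own pid namespace, so
 * concurrent readers in different namespaces never share an array */
list_for_each_entry(l, &cgrp->pidlists, links) {
	if (l->key.type == type && l->key.ns == ns) {
		down_write(&l->mutex);	/* pin it before dropping the list lock */
		l->use_count++;
		return l;
	}
}
/* not found: allocate a new pidlist keyed by (type, ns) */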

Signed-off-by: Ben Blum <[email protected]>

---

 include/linux/cgroup.h |   34 +++++++++++++--
 kernel/cgroup.c        |  108 ++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 120 insertions(+), 22 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8a3a3ac..b934b72 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -141,15 +141,36 @@ enum {
CGRP_WAIT_ON_RMDIR,
};

+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+ CGROUP_FILE_PROCS,
+ CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
struct cgroup_pidlist {
- /* protects the other fields */
- struct rw_semaphore mutex;
+ /*
+ * used to find which pidlist is wanted. doesn't change as long as
+ * this particular list stays in the list.
+ */
+ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
/* array of xids */
pid_t *list;
/* how many elements the above list has */
int length;
/* how many files are using the current array */
int use_count;
+ /* each of these stored in a list by its cgroup */
+ struct list_head links;
+ /* pointer to the cgroup we belong to, for list removal purposes */
+ struct cgroup *owner;
+ /* protects the other fields */
+ struct rw_semaphore mutex;
};

struct cgroup {
@@ -190,9 +211,12 @@ struct cgroup {
*/
struct list_head release_list;

- /* we will have two separate pidlists, one for pids (the tasks file)
- * and one for tgids (the procs file). */
- struct cgroup_pidlist tasks, procs;
+ /*
+ * list of pidlists, up to two for each namespace (one for procs, one
+ * for tasks); created on demand.
+ */
+ struct list_head pidlists;
+ struct mutex pidlist_mutex;

/* For RCU-protected deletion */
struct rcu_head rcu_head;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4ceee3f..cdd0112 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
#include <linux/hash.h>
#include <linux/namei.h>
#include <linux/smp_lock.h>
+#include <linux/pid_namespace.h>

#include <asm/atomic.h>

@@ -675,6 +676,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
*/
deactivate_super(cgrp->root->sb);

+ /*
+ * if we're getting rid of the cgroup, refcount should ensure
+ * that there are no pidlists left.
+ */
+ BUG_ON(!list_empty(&cgrp->pidlists));
+
call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
}
iput(inode);
@@ -960,8 +967,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
- init_rwsem(&(cgrp->tasks.mutex));
- init_rwsem(&(cgrp->procs.mutex));
+ INIT_LIST_HEAD(&cgrp->pidlists);
+ mutex_init(&cgrp->pidlist_mutex);
}
static void init_cgroup_root(struct cgroupfs_root *root)
{
@@ -2167,9 +2174,59 @@ static int cmppid(const void *a, const void *b)
}

/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+ /*
+ * We can't drop the pidlist_mutex before taking the l->mutex in case
+ * the last ref-holder is trying to remove l from the list at the same
+ * time. Holding the pidlist_mutex precludes somebody taking whichever
+ * list we find out from under us - compare release_pid_array().
+ */
+ mutex_lock(&cgrp->pidlist_mutex);
+ list_for_each_entry(l, &cgrp->pidlists, links) {
+ if (l->key.type == type && l->key.ns == ns) {
+ /* found a matching list - drop the extra refcount */
+ put_pid_ns(ns);
+ /* make sure l doesn't vanish out from under us */
+ down_write(&l->mutex);
+ mutex_unlock(&cgrp->pidlist_mutex);
+ l->use_count++;
+ return l;
+ }
+ }
+ /* entry not found; create a new one */
+ l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+ if (!l) {
+ mutex_unlock(&cgrp->pidlist_mutex);
+ put_pid_ns(ns);
+ return l;
+ }
+ init_rwsem(&l->mutex);
+ down_write(&l->mutex);
+ l->key.type = type;
+ l->key.ns = ns;
+ l->use_count = 0; /* don't increment here */
+ l->list = NULL;
+ l->owner = cgrp;
+ list_add(&l->links, &cgrp->pidlists);
+ mutex_unlock(&cgrp->pidlist_mutex);
+ return l;
+}
+
+/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
-static int pidlist_array_load(struct cgroup *cgrp, bool procs)
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+ struct cgroup_pidlist **lp)
{
pid_t *array;
int length;
@@ -2194,7 +2251,10 @@ static int pidlist_array_load(struct cgroup *cgrp, bool procs)
if (unlikely(n == length))
break;
/* get tgid or pid for procs or tasks file respectively */
- pid = (procs ? task_tgid_vnr(tsk) : task_pid_vnr(tsk));
+ if (type == CGROUP_FILE_PROCS)
+ pid = task_tgid_vnr(tsk);
+ else
+ pid = task_pid_vnr(tsk);
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
}
@@ -2202,19 +2262,20 @@ static int pidlist_array_load(struct cgroup *cgrp, bool procs)
length = n;
/* now sort & (if procs) strip out duplicates */
sort(array, length, sizeof(pid_t), cmppid, NULL);
- if (procs) {
+ if (type == CGROUP_FILE_PROCS)
length = pidlist_uniq(&array, length);
- l = &(cgrp->procs);
- } else {
- l = &(cgrp->tasks);
+ l = cgroup_pidlist_find(cgrp, type);
+ if (!l) {
+ kfree(array);
+ return -ENOMEM;
}
- /* store array in cgroup, freeing old if necessary */
- down_write(&l->mutex);
+ /* store array, freeing old if necessary - lock already held */
kfree(l->list);
l->list = array;
l->length = length;
l->use_count++;
up_write(&l->mutex);
+ *lp = l;
return 0;
}

@@ -2357,13 +2418,26 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {

static void cgroup_release_pid_array(struct cgroup_pidlist *l)
{
+ /*
+ * the case where we're the last user of this particular pidlist will
+ * have us remove it from the cgroup's list, which entails taking the
+ * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
+ * pidlist_mutex, we have to take pidlist_mutex first.
+ */
+ mutex_lock(&l->owner->pidlist_mutex);
down_write(&l->mutex);
BUG_ON(!l->use_count);
if (!--l->use_count) {
+ /* we're the last user if refcount is 0; remove and free */
+ list_del(&l->links);
+ mutex_unlock(&l->owner->pidlist_mutex);
kfree(l->list);
- l->list = NULL;
- l->length = 0;
+ put_pid_ns(l->key.ns);
+ up_write(&l->mutex);
+ kfree(l);
+ return;
}
+ mutex_unlock(&l->owner->pidlist_mutex);
up_write(&l->mutex);
}

@@ -2394,10 +2468,10 @@ static const struct file_operations cgroup_pidlist_operations = {
* in the cgroup.
*/
/* helper function for the two below it */
-static int cgroup_pidlist_open(struct file *file, bool procs)
+static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- struct cgroup_pidlist *l = (procs ? &cgrp->procs : &cgrp->tasks);
+ struct cgroup_pidlist *l;
int retval;

/* Nothing to do for write-only files */
@@ -2405,7 +2479,7 @@ static int cgroup_pidlist_open(struct file *file, bool procs)
return 0;

/* have the array populated */
- retval = pidlist_array_load(cgrp, procs);
+ retval = pidlist_array_load(cgrp, type, &l);
if (retval)
return retval;
/* configure file information */
@@ -2421,11 +2495,11 @@ static int cgroup_pidlist_open(struct file *file, bool procs)
}
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
- return cgroup_pidlist_open(file, false);
+ return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
}
static int cgroup_procs_open(struct inode *unused, struct file *file)
{
- return cgroup_pidlist_open(file, true);
+ return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
}

static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,

2009-07-31 01:52:13

by Ben Blum

[permalink] [raw]
Subject: [PATCH 4/6] Changes css_set freeing mechanism to be under RCU

Changes css_set freeing mechanism to be under RCU

This is a prepatch for making the procs file writable. In order to free each
task's old css_set as the task is moved, the freeing mechanism must be
RCU-protected; otherwise we would need a call to synchronize_rcu() for each
task before freeing its old css_set.
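
For contrast, a sketch of the per-task alternative versus what this patch
does (free_css_set_rcu as introduced below):

/* without RCU-protected freeing, the writable procs file would need a
 * grace-period wait before each old css_set could be freed: */
synchronize_rcu();
kfree(cg);

/* with this patch, the free is deferred rather than waited for: */
call_rcu(&cg->rcu_head, free_css_set_rcu);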

Signed-off-by: Ben Blum <[email protected]>

---

 include/linux/cgroup.h |    3 +++
 kernel/cgroup.c        |    8 +++++++-
 2 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b934b72..24e3f1a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -260,6 +260,9 @@ struct css_set {
* during subsystem registration (at boot time).
*/
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+ /* For RCU-protected deletion */
+ struct rcu_head rcu_head;
};

/*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f6d5969..a12bc8e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -288,6 +288,12 @@ static void unlink_css_set(struct css_set *cg)
}
}

+static void free_css_set_rcu(struct rcu_head *obj)
+{
+ struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+ kfree(cg);
+}
+
static void __put_css_set(struct css_set *cg, int taskexit)
{
int i;
@@ -317,7 +323,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
}
}
rcu_read_unlock();
- kfree(cg);
+ call_rcu(&cg->rcu_head, free_css_set_rcu);
}

/*

2009-07-31 01:52:24

by Ben Blum

Subject: [PATCH 6/6] Makes procs file writable to move all threads by tgid at once

Makes procs file writable to move all threads by tgid at once

This patch adds functionality that enables users to move all threads in a
threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
file. The current implementation makes use of a per-threadgroup rwsem that's
taken for reading in the fork() path to prevent newly forking threads within
the threadgroup from "escaping" while the move is in progress.

There is a gap between releasing the fork mutex and calling each subsystem's
attach function, which could lead to problems if a subsystem relies on
something that forking threads could change in the meantime. No particular
issue is apparent, but if some subsystem did have a problem here, the
per-threadgroup fork mutex could be held until after the attach calls are
done.
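
Condensed from cgroup_attach_proc() in the diff below, the attach runs in
two phases so that the commit step cannot fail on allocation:

/* phase 1: prefetch a css_set for every thread; this may sleep and may
 * fail, but nothing has been moved yet, so -ENOMEM is safe to return */
retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
if (retval)
	goto list_teardown;

/* phase 2: under the threadgroup's fork rwsem, migrate each thread;
 * with 'guarantee' set, cgroup_task_migrate() cannot hit -ENOMEM */
down_write(&leader->cgroup_fork_mutex);
rcu_read_lock();
list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group)
	cgroup_task_migrate(cgrp, oldcgrp, tsk, 1);
rcu_read_unlock();
up_write(&leader->cgroup_fork_mutex);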

Signed-off-by: Ben Blum <[email protected]>

---

 Documentation/cgroups/cgroups.txt |   12 +
 include/linux/cgroup.h            |   12 +
 include/linux/init_task.h         |    9 +
 include/linux/sched.h             |    2
 kernel/cgroup.c                   |  417 +++++++++++++++++++++++++++++++++----
 kernel/fork.c                     |    6 -
 6 files changed, 406 insertions(+), 52 deletions(-)

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 6eb1a97..d579346 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -228,6 +228,7 @@ Each cgroup is represented by a directory in the cgroup file system
containing the following files describing that cgroup:

- tasks: list of tasks (by pid) attached to that cgroup
+ - cgroup.procs: list of unique tgids in the cgroup
- notify_on_release flag: run the release agent on exit?
- release_agent: the path to use for release notifications (this file
exists in the top cgroup only)
@@ -374,7 +375,7 @@ Now you want to do something with this cgroup.

In this directory you can find several files:
# ls
-notify_on_release tasks
+cgroup.procs notify_on_release tasks
(plus whatever files added by the attached subsystems)

Now attach your shell to this cgroup:
@@ -408,6 +409,15 @@ You can attach the current shell task by echoing 0:

# echo 0 > tasks

+The cgroup.procs file is useful for managing all tasks in a threadgroup at
+once. It works the same way as the tasks file, but moves all tasks in the
+threadgroup with the specified tgid.
+
+Writing the pid of a task that's not the threadgroup leader (i.e., a pid
+that isn't a tgid) is treated as invalid. Writing a '0' to cgroup.procs will
+attach the writing task and all tasks in its threadgroup, but is invalid if
+the writing task is not the leader of the threadgroup.
+
3. Kernel API
=============

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8286758..105d681 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -30,10 +30,12 @@ extern int cgroup_init(void);
extern void cgroup_lock(void);
extern bool cgroup_lock_live_group(struct cgroup *cgrp);
extern void cgroup_unlock(void);
-extern void cgroup_fork(struct task_struct *p);
+extern void cgroup_fork(struct task_struct *p, int clone_flags);
extern void cgroup_fork_callbacks(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
+extern void cgroup_post_fork(struct task_struct *p, int clone_flags);
extern void cgroup_exit(struct task_struct *p, int run_callbacks);
+extern void cgroup_fork_failed(struct task_struct *p, int run_callbacks,
+ int clone_flags);
extern int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry);

@@ -551,10 +553,12 @@ unsigned short css_depth(struct cgroup_subsys_state *css);

static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
-static inline void cgroup_fork(struct task_struct *p) {}
+static inline void cgroup_fork(struct task_struct *p, int clone_flags) {}
static inline void cgroup_fork_callbacks(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline void cgroup_post_fork(struct task_struct *p, int clone_flags) {}
static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
+static inline void cgroup_fork_failed(struct task_struct *p, int callbacks,
+ int clone_flags) {}

static inline void cgroup_lock(void) {}
static inline void cgroup_unlock(void) {}
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index aecd24e..26d814f 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -105,6 +105,14 @@ extern struct cred init_cred;
# define INIT_PERF_COUNTERS(tsk)
#endif

+#ifdef CONFIG_CGROUPS
+# define INIT_CGROUP_FORK_MUTEX(tsk) \
+ .cgroup_fork_mutex = \
+ __RWSEM_INITIALIZER(tsk.cgroup_fork_mutex),
+#else
+# define INIT_CGROUP_FORK_MUTEX(tsk)
+#endif
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -174,6 +182,7 @@ extern struct cred init_cred;
INIT_LOCKDEP \
INIT_FTRACE_GRAPH \
INIT_TRACE_RECURSION \
+ INIT_CGROUP_FORK_MUTEX(tsk) \
}


diff --git a/include/linux/sched.h b/include/linux/sched.h
index 55e3e11..5d38980 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1400,6 +1400,8 @@ struct task_struct {
struct css_set *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
+ /* guarantees atomic threadgroup movement via the procs file */
+ struct rw_semaphore cgroup_fork_mutex;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ea05d6b..3ce7298 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1297,6 +1297,87 @@ static void get_first_subsys(const struct cgroup *cgrp,
*subsys_id = test_ss->subsys_id;
}

+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail
+ * with -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, int guarantee)
+{
+ struct css_set *oldcg;
+ struct css_set *newcg;
+
+ /*
+ * get old css_set. we need to take task_lock and refcount it, because
+ * an exiting task can change its css_set to init_css_set and drop its
+ * old one without taking cgroup_mutex.
+ */
+ task_lock(tsk);
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+ /*
+ * locate or allocate a new css_set for this task. 'guarantee' tells
+ * us whether or not we are sure that a new css_set already exists;
+ * in that case, we are not allowed to fail, as we won't need malloc.
+ */
+ if (guarantee) {
+ /*
+ * our caller promises us that the css_set we want already
+ * exists, so we use find_existing_css_set directly.
+ */
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(oldcg, cgrp, template);
+ BUG_ON(!newcg);
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ } else {
+ might_sleep();
+ /* find_css_set will give us newcg already referenced. */
+ newcg = find_css_set(oldcg, cgrp);
+ if (!newcg) {
+ put_css_set(oldcg);
+ return -ENOMEM;
+ }
+ }
+ put_css_set(oldcg);
+
+ /*
+ * we cannot move a task that's declared itself as exiting, as once
+ * PF_EXITING is set, the tsk->cgroups pointer is no longer safe.
+ */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list)) {
+ list_del(&tsk->cg_list);
+ list_add(&tsk->cg_list, &newcg->tasks);
+ }
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to
+ * drop it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ return 0;
+}
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1307,11 +1388,9 @@ static void get_first_subsys(const struct cgroup *cgrp,
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = 0;
+ int retval;
struct cgroup_subsys *ss;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
int subsys_id;

@@ -1330,75 +1409,293 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
}

- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, 0);
+ if (retval)
+ return retval;
+
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, tsk, false);
+ }
+
+ synchronize_rcu();
+
/*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
+ * wake up rmdir() waiter. the rmdir should fail since the cgroup
+ * is no longer empty.
*/
+ cgroup_wakeup_rmdir_waiters(cgrp);
+ return 0;
+}
+
+/*
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list, of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+ struct css_set *cg;
+ struct list_head links;
+};
+
+static int css_set_check_fetched(struct cgroup *cgrp, struct task_struct *tsk,
+ struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(cg, cgrp, template);
+ if (newcg)
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ /* doesn't exist at all? */
+ if (!newcg)
+ return 1;
+ /* see if it's already in the list */
+ list_for_each_entry(cg_entry, newcg_list, links) {
+ if (cg_entry->cg == newcg) {
+ put_css_set(newcg);
+ return 0;
+ }
+ }
+ /* not found */
+ put_css_set(newcg);
+ return 1;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving
+ * the given task to the given cgroup. Returns 0 on success, -ENOMEM if we
+ * run out of memory.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ /* ensure a new css_set will exist for this thread */
newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
if (!newcg)
return -ENOMEM;
+ /* add new element to list */
+ cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+ if (!cg_entry) {
+ put_css_set(newcg);
+ return -ENOMEM;
+ }
+ cg_entry->cg = newcg;
+ list_add(&cg_entry->links, newcg_list);
+ return 0;
+}

- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex. Will take task_lock of each thread in leader's
+ * threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval;
+ struct cgroup_subsys *ss;
+ struct cgroup *oldcgrp;
+ struct css_set *oldcg;
+ struct cgroupfs_root *root = cgrp->root;
+ int subsys_id;
+ /* threadgroup list cursor */
+ struct task_struct *tsk;
+ /*
+ * we need to make sure we have css_sets for all the tasks we're
+ * going to move -before- we actually start moving them, so that in
+ * case we get an ENOMEM we can bail out before making any changes.
+ */
+ struct list_head newcg_list;
+ struct cg_list_entry *cg_entry;
+
+ /* first, make sure this came from a valid tgid */
+ if (!thread_group_leader(leader))
+ return -EINVAL;
+ /*
+ * check that we can legitimately attach to the cgroup.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, leader, true);
+ if (retval)
+ return retval;
+ }
+ }
+
+ get_first_subsys(cgrp, NULL, &subsys_id);
+
+ /*
+ * step 1: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
+ */
+ INIT_LIST_HEAD(&newcg_list);
+ oldcgrp = task_cgroup(leader, subsys_id);
+ if (cgrp != oldcgrp) {
+ /* get old css_set */
+ task_lock(leader);
+ if (leader->flags & PF_EXITING) {
+ task_unlock(leader);
+ goto prefetch_loop;
+ }
+ oldcg = leader->cgroups;
+ get_css_set(oldcg);
+ task_unlock(leader);
+ /* acquire new one */
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ }
+prefetch_loop:
+ rcu_read_lock();
+ /*
+ * if we need to fetch a new css_set for this task, we must exit the
+ * rcu_read section because allocating it can sleep. afterwards, we'll
+ * need to restart iteration on the threadgroup list - the whole thing
+ * will be O(nm) in the number of threads and css_sets; as the typical
+ * case only has one css_set for all of them, usually O(n).
+ */
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ /* nothing to do if this task is already in the cgroup */
+ oldcgrp = task_cgroup(tsk, subsys_id);
+ if (cgrp == oldcgrp)
+ continue;
+ /* get old css_set pointer */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ /* ignore this task if it's going away */
+ task_unlock(tsk);
+ continue;
+ }
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
task_unlock(tsk);
- put_css_set(newcg);
- return -ESRCH;
+ /* see if the new one for us is already in the list? */
+ retval = css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list);
+ if (retval) {
+ /* we don't already have it. get new one. */
+ rcu_read_unlock();
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ /* begin iteration again. */
+ goto prefetch_loop;
+ } else {
+ /* was already there, nothing to do. */
+ put_css_set(oldcg);
+ }
}
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
+ rcu_read_unlock();

- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
+ /*
+ * step 2: now that we're guaranteed success wrt the css_sets, proceed
+ * to move all tasks to the new cgroup. Even if the threadgroup leader
+ * is PF_EXITING, we still proceed to move all of its sub-threads to
+ * the new cgroup; if everybody is PF_EXITING, we'll just end up doing
+ * nothing, which is ok.
+ */
+ oldcgrp = task_cgroup(leader, subsys_id);
+ /* if leader is already there, skip moving him */
+ if (cgrp != oldcgrp) {
+ retval = cgroup_task_migrate(cgrp, oldcgrp, leader, 1);
+ BUG_ON(retval != 0 && retval != -ESRCH);
}
- write_unlock(&css_set_lock);
+ /*
+ * now move all the rest of the threads - need to lock against
+ * possible races with fork().
+ */
+ down_write(&leader->cgroup_fork_mutex);
+ rcu_read_lock();
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ /* leave current thread as it is if it's already there */
+ oldcgrp = task_cgroup(tsk, subsys_id);
+ if (cgrp == oldcgrp)
+ continue;
+ /* we don't care whether these threads are exiting */
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, 1);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+ rcu_read_unlock();
+ up_write(&leader->cgroup_fork_mutex);

+ /*
+ * step 3: attach whole threadgroup to each subsystem
+ * TODO: if ever a subsystem needs to know the oldcgrp for each task
+ * being moved, this call will need to be reworked to communicate that
+ * information.
+ */
for_each_subsys(root, ss) {
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk, false);
+ ss->attach(ss, cgrp, oldcgrp, tsk, true);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
- synchronize_rcu();
- put_css_set(cg);

/*
- * wake up rmdir() waiter. the rmdir should fail since the cgroup
- * is no longer empty.
+ * step 4: success! ...and cleanup
*/
+ synchronize_rcu();
cgroup_wakeup_rmdir_waiters(cgrp);
- return 0;
+ retval = 0;
+list_teardown:
+ /* no longer need the list of css_sets, so get rid of it */
+ while (!list_empty(&newcg_list)) {
+ /* pop from the list */
+ cg_entry = list_first_entry(&newcg_list, struct cg_list_entry,
+ links);
+ list_del(&cg_entry->links);
+ /* drop the refcount */
+ put_css_set(cg_entry->cg);
+ kfree(cg_entry);
+ }
+ /* done! */
+ return retval;
}

/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid,
+ int attach(struct cgroup *,
+ struct task_struct *))
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;

+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
if (!tsk || tsk->flags & PF_EXITING) {
rcu_read_unlock();
+ cgroup_unlock();
return -ESRCH;
}
-
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on the group leader, because
+ * even if another task has different permissions, the group
+ * leader will have sufficient access to change it.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
+ cgroup_unlock();
return -EACCES;
}
get_task_struct(tsk);
@@ -1408,19 +1705,25 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
get_task_struct(tsk);
}

- ret = cgroup_attach_task(cgrp, tsk);
+ /*
+ * Note that the check for whether the task is its threadgroup leader
+ * is done in cgroup_attach_proc. This means that writing 0 to the
+ * procs file will only work if the writing task is the leader.
+ */
+ ret = attach(cgrp, tsk);
put_task_struct(tsk);
+ cgroup_unlock();
return ret;
}

static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
- int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
- return ret;
+ return attach_task_by_pid(cgrp, pid, cgroup_attach_task);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
+ return attach_task_by_pid(cgrp, tgid, cgroup_attach_proc);
}

/**
@@ -2579,9 +2882,9 @@ static struct cftype files[] = {
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
@@ -3182,8 +3485,12 @@ static struct file_operations proc_cgroupstats_operations = {
* At the point that cgroup_fork() is called, 'current' is the parent
* task, and the passed argument 'child' points to the child task.
*/
-void cgroup_fork(struct task_struct *child)
+void cgroup_fork(struct task_struct *child, int clone_flags)
{
+ if (clone_flags & CLONE_THREAD)
+ down_read(&current->group_leader->cgroup_fork_mutex);
+ else
+ init_rwsem(&child->cgroup_fork_mutex);
task_lock(current);
child->cgroups = current->cgroups;
get_css_set(child->cgroups);
@@ -3220,7 +3527,7 @@ void cgroup_fork_callbacks(struct task_struct *child)
* with the first call to cgroup_iter_start() - to guarantee that the
* new task ends up on its list.
*/
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child, int clone_flags)
{
if (use_task_css_set_links) {
write_lock(&css_set_lock);
@@ -3230,6 +3537,8 @@ void cgroup_post_fork(struct task_struct *child)
task_unlock(child);
write_unlock(&css_set_lock);
}
+ if (clone_flags & CLONE_THREAD)
+ up_read(&current->group_leader->cgroup_fork_mutex);
}
/**
* cgroup_exit - detach cgroup from exiting task
@@ -3301,6 +3610,26 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
}

/**
+ * cgroup_fork_failed - undo operations for fork failure
+ * @tsk: pointer to task_struct of exiting process
+ * @run_callbacks: run exit callbacks?
+ *
+ * Description: Undo cgroup operations after cgroup_fork in fork failure.
+ *
+ * We release the read lock that was taken in cgroup_fork(), since it is
+ * supposed to be dropped in cgroup_post_fork in the success case. The other
+ * thing that wants to be done is detaching the failed child task from the
+ * cgroup, so we wrap cgroup_exit.
+ */
+void cgroup_fork_failed(struct task_struct *tsk, int run_callbacks,
+ int clone_flags)
+{
+ if (clone_flags & CLONE_THREAD)
+ up_read(&current->group_leader->cgroup_fork_mutex);
+ cgroup_exit(tsk, run_callbacks);
+}
+
+/**
* cgroup_clone - clone the cgroup the given subsystem is attached to
* @tsk: the task to be moved
* @subsys: the given subsystem
diff --git a/kernel/fork.c b/kernel/fork.c
index 926c117..3f9decf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1053,7 +1053,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
- cgroup_fork(p);
+ cgroup_fork(p, clone_flags);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
@@ -1269,7 +1269,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p);
+ cgroup_post_fork(p, clone_flags);
return p;

bad_fork_free_pid:
@@ -1300,7 +1300,7 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
- cgroup_exit(p, cgroup_callbacks_done);
+ cgroup_fork_failed(p, cgroup_callbacks_done, clone_flags);
delayacct_tsk_free(p);
if (p->binfmt)
module_put(p->binfmt->module);

2009-07-31 01:52:44

by Ben Blum

Subject: [PATCH 5/6] Lets ss->can_attach and ss->attach do whole threadgroups at a time

Lets ss->can_attach and ss->attach do whole threadgroups at a time

This patch alters the ss->can_attach and ss->attach functions to be able to
deal with a whole threadgroup at a time, for use in cgroup_attach_proc. (This
is a pre-patch to cgroup-procs-writable.patch.)

Currently, the new mode of the attach function can only tell the subsystem
about the old cgroup of the threadgroup leader. No subsystem currently needs
that information for each thread that's being moved, but if one were to be
added (for example, one that counts tasks within a group) this interface would
need to be reworked to tell the subsystem the right information.
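
Each converted subsystem follows the same shape, condensed below
(do_per_task() stands in for the subsystem-specific check or action):

/* recurring pattern: handle the leader as before, then, when
 * threadgroup is true, walk its other threads under RCU */
if (threadgroup) {
	struct task_struct *c;

	rcu_read_lock();
	list_for_each_entry_rcu(c, &tsk->thread_group, thread_group)
		do_per_task(c);	/* hypothetical per-thread helper */
	rcu_read_unlock();
}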

Signed-off-by: Ben Blum <[email protected]>

---

 include/linux/cgroup.h   |    7 +++--
 kernel/cgroup.c          |    4 +--
 kernel/cgroup_freezer.c  |   15 +++++++++--
 kernel/cpuset.c          |   65 ++++++++++++++++++++++++++++++++++++----------
 kernel/ns_cgroup.c       |   16 ++++++++++-
 kernel/sched.c           |   37 ++++++++++++++++++++++++--
 mm/memcontrol.c          |    3 +-
 security/device_cgroup.c |    3 +-
 8 files changed, 122 insertions(+), 28 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 24e3f1a..8286758 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -408,10 +408,11 @@ struct cgroup_subsys {
struct cgroup *cgrp);
int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
- int (*can_attach)(struct cgroup_subsys *ss,
- struct cgroup *cgrp, struct task_struct *tsk);
+ int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct task_struct *tsk, bool threadgroup);
void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *tsk);
+ struct cgroup *old_cgrp, struct task_struct *tsk,
+ bool threadgroup);
void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
int (*populate)(struct cgroup_subsys *ss,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a12bc8e..ea05d6b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1324,7 +1324,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)

for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, tsk);
+ retval = ss->can_attach(ss, cgrp, tsk, false);
if (retval)
return retval;
}
@@ -1362,7 +1362,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)

for_each_subsys(root, ss) {
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk);
+ ss->attach(ss, cgrp, oldcgrp, tsk, false);
}
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
synchronize_rcu();
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fb249e2..4e352ab 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -159,10 +159,9 @@ static bool is_task_frozen_enough(struct task_struct *task)
*/
static int freezer_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup,
- struct task_struct *task)
+ struct task_struct *task, bool threadgroup)
{
struct freezer *freezer;
-
/*
* Anything frozen can't move or be moved to/from.
*
@@ -177,6 +176,18 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
if (freezer->state == CGROUP_FROZEN)
return -EBUSY;

+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+ if (is_task_frozen_enough(c)) {
+ rcu_read_unlock();
+ return -EBUSY;
+ }
+ }
+ rcu_read_unlock();
+ }
+
return 0;
}

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7e75a41..86397f4 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp)
static cpumask_var_t cpus_attach;

/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys *ss,
- struct cgroup *cont, struct task_struct *tsk)
+static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+ struct task_struct *tsk, bool threadgroup)
{
+ int ret;
struct cpuset *cs = cgroup_cs(cont);

if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1343,18 +1344,50 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;

- return security_task_setscheduler(tsk, 0, NULL);
+ ret = security_task_setscheduler(tsk, 0, NULL);
+ if (ret)
+ return ret;
+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+ ret = security_task_setscheduler(c, 0, NULL);
+ if (ret) {
+ rcu_read_unlock();
+ return ret;
+ }
+ }
+ rcu_read_unlock();
+ }
+ return 0;
+}
+
+static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
+ struct cpuset *cs)
+{
+ int err;
+ /*
+ * can_attach beforehand should guarantee that this doesn't fail.
+ * TODO: have a better way to handle failure here
+ */
+ err = set_cpus_allowed_ptr(tsk, cpus_attach);
+ WARN_ON_ONCE(err);
+
+ task_lock(tsk);
+ cpuset_change_task_nodemask(tsk, to);
+ task_unlock(tsk);
+ cpuset_update_task_spread_flag(cs, tsk);
+
}

-static void cpuset_attach(struct cgroup_subsys *ss,
- struct cgroup *cont, struct cgroup *oldcont,
- struct task_struct *tsk)
+static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+ struct cgroup *oldcont, struct task_struct *tsk,
+ bool threadgroup)
{
nodemask_t from, to;
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- int err;

if (cs == &top_cpuset) {
cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1363,15 +1396,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
guarantee_online_cpus(cs, cpus_attach);
guarantee_online_mems(cs, &to);
}
- err = set_cpus_allowed_ptr(tsk, cpus_attach);
- if (err)
- return;

- task_lock(tsk);
- cpuset_change_task_nodemask(tsk, &to);
- task_unlock(tsk);
- cpuset_update_task_spread_flag(cs, tsk);
+ /* do per-task migration stuff possibly for each in the threadgroup */
+ cpuset_attach_task(tsk, &to, cs);
+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+ cpuset_attach_task(c, &to, cs);
+ }
+ rcu_read_unlock();
+ }

+ /* change mm; only needs to be done once even if threadgroup */
from = oldcs->mems_allowed;
to = cs->mems_allowed;
mm = get_task_mm(tsk);
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 5aa854f..2a5dfec 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
* (hence either you are in the same cgroup as task, or in an
* ancestor cgroup thereof)
*/
-static int ns_can_attach(struct cgroup_subsys *ss,
- struct cgroup *new_cgroup, struct task_struct *task)
+static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
+ struct task_struct *task, bool threadgroup)
{
if (current != task) {
if (!capable(CAP_SYS_ADMIN))
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
if (!cgroup_is_descendant(new_cgroup, task))
return -EPERM;

+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+ if (!cgroup_is_descendant(new_cgroup, c)) {
+ rcu_read_unlock();
+ return -EPERM;
+ }
+ }
+ rcu_read_unlock();
+ }
+
return 0;
}

diff --git a/kernel/sched.c b/kernel/sched.c
index 3393c18..b5e371b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -10194,8 +10194,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
}

static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk)
+cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10209,11 +10208,43 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
return 0;
}

+static int
+cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct task_struct *tsk, bool threadgroup)
+{
+ int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
+ if (retval)
+ return retval;
+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+ retval = cpu_cgroup_can_attach_task(cgrp, c);
+ if (retval) {
+ rcu_read_unlock();
+ return retval;
+ }
+ }
+ rcu_read_unlock();
+ }
+ return 0;
+
+}
+
static void
cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cont, struct task_struct *tsk)
+ struct cgroup *old_cont, struct task_struct *tsk,
+ bool threadgroup)
{
sched_move_task(tsk);
+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+ sched_move_task(c);
+ }
+ rcu_read_unlock();
+ }
}

#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6ceb6f2..d9e9cf4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2584,7 +2584,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
- struct task_struct *p)
+ struct task_struct *p,
+ bool threadgroup)
{
mutex_lock(&memcg_tasklist);
/*
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index b8186ba..6cf8fd2 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -61,7 +61,8 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
struct cgroup_subsys devices_subsys;

static int devcgroup_can_attach(struct cgroup_subsys *ss,
- struct cgroup *new_cgroup, struct task_struct *task)
+ struct cgroup *new_cgroup, struct task_struct *task,
+ bool threadgroup)
{
if (current != task && !capable(CAP_SYS_ADMIN))
return -EPERM;

2009-07-31 01:52:57

by Ben Blum

Subject: [PATCH 3/6] Quick vmalloc vs kmalloc fix to the case where array size is too large

Quick vmalloc vs kmalloc fix to the case where array size is too large

Moves all pidlist allocation requests into a separate function that judges,
based on the requested size, whether the array needs to be vmalloced or can be
gotten via kmalloc, and similarly for kfree/vfree. This should be replaced
entirely with a kernel-wide solution to the general problem.
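
Condensed from the diff below: the allocator is chosen by size, and the
free path dispatches on the pointer, so callers stay allocator-agnostic:

/* allocate: large arrays go to vmalloc, small ones stay with kmalloc */
if (PIDLIST_TOO_LARGE(count))
	p = vmalloc(count * sizeof(pid_t));
else
	p = kmalloc(count * sizeof(pid_t), GFP_KERNEL);

/* free: is_vmalloc_addr() tells us which allocator produced p */
if (is_vmalloc_addr(p))
	vfree(p);
else
	kfree(p);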

Depends on cgroup-pidlist-namespace.patch, cgroup-procs.patch

Signed-off-by: Ben Blum <[email protected]>

---

 kernel/cgroup.c |   47 ++++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cdd0112..f6d5969 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -48,6 +48,7 @@
#include <linux/namei.h>
#include <linux/smp_lock.h>
#include <linux/pid_namespace.h>
+#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */

#include <asm/atomic.h>

@@ -2123,6 +2124,42 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
*/

/*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
+{
+ if (PIDLIST_TOO_LARGE(count))
+ return vmalloc(count * sizeof(pid_t));
+ else
+ return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+static void pidlist_free(void *p)
+{
+ if (is_vmalloc_addr(p))
+ vfree(p);
+ else
+ kfree(p);
+}
+static void *pidlist_resize(void *p, int newcount)
+{
+ void *newlist;
+ /* note: if new alloc fails, old p will still be valid either way */
+ if (is_vmalloc_addr(p)) {
+ newlist = vmalloc(newcount * sizeof(pid_t));
+ if (!newlist)
+ return NULL;
+ memcpy(newlist, p, newcount * sizeof(pid_t));
+ vfree(p);
+ } else {
+ newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
+ }
+ return newlist;
+}
+
+/*
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* If the new stripped list is sufficiently smaller and there's enough memory
* to allocate a new buffer, will let go of the unneeded memory. Returns the
@@ -2161,7 +2198,7 @@ after:
* we'll just stay with what we've got.
*/
if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
- newlist = krealloc(list, dest * sizeof(pid_t), GFP_KERNEL);
+ newlist = pidlist_resize(list, dest);
if (newlist)
*p = newlist;
}
@@ -2242,7 +2279,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
- array = kmalloc(length * sizeof(pid_t), GFP_KERNEL);
+ array = pidlist_allocate(length);
if (!array)
return -ENOMEM;
/* now, populate the array */
@@ -2266,11 +2303,11 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
length = pidlist_uniq(&array, length);
l = cgroup_pidlist_find(cgrp, type);
if (!l) {
- kfree(array);
+ pidlist_free(array);
return -ENOMEM;
}
/* store array, freeing old if necessary - lock already held */
- kfree(l->list);
+ pidlist_free(l->list);
l->list = array;
l->length = length;
l->use_count++;
@@ -2431,7 +2468,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
/* we're the last user if refcount is 0; remove and free */
list_del(&l->links);
mutex_unlock(&l->owner->pidlist_mutex);
- kfree(l->list);
+ pidlist_free(l->list);
put_pid_ns(l->key.ns);
up_write(&l->mutex);
kfree(l);