2009-07-10 23:02:17

by Ben Blum

Subject: [PATCH v2 0/3] CGroups: cgroup member list enhancement/fix

(This is a revision of the patch series in http://lkml.org/lkml/2009/7/2/464 )

The following series adds a "cgroup.procs" file to each cgroup that reports
unique tgids rather than pids, and fixes a pid namespace bug in the existing
"tasks" file that could cause readers in different namespaces to interfere
with one another.

This patch series implements a superset of the functionality of (and was
written at the same time as) Li Zefan's pid namespace bugfix patch (from
http://lkml.org/lkml/2009/7/1/559 ). These patches can either be rewritten to
be applied on top of Li's patch, or be applied as they are with Li's patch
reversed.

---

Ben Blum (3):
Quick vmalloc vs kmalloc fix to the case where array size is too large
Ensures correct concurrent opening/reading of pidlists across pid namespaces
Adds a read-only "procs" file similar to "tasks" that shows only unique tgids


include/linux/cgroup.h | 46 +++++-
kernel/cgroup.c | 378 ++++++++++++++++++++++++++++++++++--------------
2 files changed, 309 insertions(+), 115 deletions(-)


2009-07-10 23:02:30

by Ben Blum

Subject: [PATCH 2/3] Ensures correct concurrent opening/reading of pidlists across pid namespaces

Ensures correct concurrent opening/reading of pidlists across pid namespaces

Previously, two processes from different pid namespaces reading the tasks or
procs file could each see pids from the other's namespace. Rather than one
pidlist for each file in a cgroup, we now keep a list of pidlists keyed by
namespace and file type (tasks versus procs), in which entries are created on
demand. Each pidlist has its own lock, and because the pidlists themselves are
passed around in the seq_file's private pointer, we don't have to touch the
cgroup or its master list except when creating and destroying entries.

Signed-off-by: Ben Blum <[email protected]>

---

include/linux/cgroup.h | 34 +++++++++++++--
kernel/cgroup.c | 108 ++++++++++++++++++++++++++++++++++++++++--------
2 files changed, 119 insertions(+), 23 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8a3a3ac..b934b72 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -141,15 +141,36 @@ enum {
CGRP_WAIT_ON_RMDIR,
};

+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+ CGROUP_FILE_PROCS,
+ CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
struct cgroup_pidlist {
- /* protects the other fields */
- struct rw_semaphore mutex;
+ /*
+ * used to find which pidlist is wanted. doesn't change as long as
+ * this particular list stays in the list.
+ */
+ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
/* array of xids */
pid_t *list;
/* how many elements the above list has */
int length;
/* how many files are using the current array */
int use_count;
+ /* each of these stored in a list by its cgroup */
+ struct list_head links;
+ /* pointer to the cgroup we belong to, for list removal purposes */
+ struct cgroup *owner;
+ /* protects the other fields */
+ struct rw_semaphore mutex;
};

struct cgroup {
@@ -190,9 +211,12 @@ struct cgroup {
*/
struct list_head release_list;

- /* we will have two separate pidlists, one for pids (the tasks file)
- * and one for tgids (the procs file). */
- struct cgroup_pidlist tasks, procs;
+ /*
+ * list of pidlists, up to two for each namespace (one for procs, one
+ * for tasks); created on demand.
+ */
+ struct list_head pidlists;
+ struct mutex pidlist_mutex;

/* For RCU-protected deletion */
struct rcu_head rcu_head;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2efe5c4..33d89be 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
#include <linux/hash.h>
#include <linux/namei.h>
#include <linux/smp_lock.h>
+#include <linux/pid_namespace.h>

#include <asm/atomic.h>

@@ -640,7 +641,11 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
static void free_cgroup_rcu(struct rcu_head *obj)
{
struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
-
+ /*
+ * if we're getting rid of the cgroup, refcount must have ensured
+ * that anybody using a pidlist will have cleaned it up by now.
+ */
+ BUG_ON(!list_empty(&cgrp->pidlists));
kfree(cgrp);
}

@@ -960,8 +965,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
- init_rwsem(&(cgrp->tasks.mutex));
- init_rwsem(&(cgrp->procs.mutex));
+ INIT_LIST_HEAD(&cgrp->pidlists);
+ mutex_init(&cgrp->pidlist_mutex);
}
static void init_cgroup_root(struct cgroupfs_root *root)
{
@@ -2171,9 +2176,58 @@ static int cmppid(const void *a, const void *b)
}

/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+ /*
+ * We can't drop the pidlist_mutex before taking the l->mutex in case
+ * the last ref-holder is trying to remove l from the list at the same
+ * time. Holding the pidlist_mutex precludes somebody taking whichever
+ * list we find out from under us - compare release_pid_array().
+ */
+ mutex_lock(&cgrp->pidlist_mutex);
+ list_for_each_entry(l, &cgrp->pidlists, links) {
+ if (l->key.type == type && l->key.ns == ns) {
+ /* found a matching list - drop the extra refcount */
+ put_pid_ns(ns);
+ /* make sure l doesn't vanish out from under us */
+ down_write(&l->mutex);
+ mutex_unlock(&cgrp->pidlist_mutex);
+ l->use_count++;
+ return l;
+ }
+ }
+ /* entry not found; create a new one */
+ l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+ if (!l) {
+ mutex_unlock(&cgrp->pidlist_mutex);
+ return l;
+ }
+ init_rwsem(&l->mutex);
+ down_write(&l->mutex);
+ l->key.type = type;
+ l->key.ns = ns;
+ l->use_count = 0; /* don't increment here */
+ l->list = NULL;
+ l->owner = cgrp;
+ list_add(&l->links, &cgrp->pidlists);
+ mutex_unlock(&cgrp->pidlist_mutex);
+ return l;
+}
+
+/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
-static int pidlist_array_load(struct cgroup *cgrp, bool procs)
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+ struct cgroup_pidlist **lp)
{
pid_t *array;
int length;
@@ -2198,7 +2252,10 @@ static int pidlist_array_load(struct cgroup *cgrp, bool procs)
if (unlikely(n == length))
break;
/* get tgid or pid for procs or tasks file respectively */
- pid = (procs ? task_tgid_vnr(tsk) : task_pid_vnr(tsk));
+ if (type == CGROUP_FILE_PROCS)
+ pid = task_tgid_vnr(tsk);
+ else
+ pid = task_pid_vnr(tsk);
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
}
@@ -2206,19 +2263,21 @@ static int pidlist_array_load(struct cgroup *cgrp, bool procs)
length = n;
/* now sort & (if procs) strip out duplicates */
sort(array, length, sizeof(pid_t), cmppid, NULL);
- if (procs) {
+ if (type == CGROUP_FILE_PROCS) {
length = pidlist_uniq(&array, length);
- l = &(cgrp->procs);
- } else {
- l = &(cgrp->tasks);
}
- /* store array in cgroup, freeing old if necessary */
- down_write(&l->mutex);
+ l = cgroup_pidlist_find(cgrp, type);
+ if (!l) {
+ kfree(array);
+ return -ENOMEM;
+ }
+ /* store array, freeing old if necessary - lock already held */
kfree(l->list);
l->list = array;
l->length = length;
l->use_count++;
up_write(&l->mutex);
+ *lp = l;
return 0;
}

@@ -2361,13 +2420,26 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {

static void cgroup_release_pid_array(struct cgroup_pidlist *l)
{
+ /*
+ * the case where we're the last user of this particular pidlist will
+ * have us remove it from the cgroup's list, which entails taking the
+ * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
+ * pidlist_mutex, we have to take pidlist_mutex first.
+ */
+ mutex_lock(&l->owner->pidlist_mutex);
down_write(&l->mutex);
BUG_ON(!l->use_count);
if (!--l->use_count) {
+ /* we're the last user if refcount is 0; remove and free */
+ list_del(&l->links);
+ mutex_unlock(&l->owner->pidlist_mutex);
kfree(l->list);
- l->list = NULL;
- l->length = 0;
+ put_pid_ns(l->key.ns);
+ up_write(&l->mutex);
+ kfree(l);
+ return;
}
+ mutex_unlock(&l->owner->pidlist_mutex);
up_write(&l->mutex);
}

@@ -2398,10 +2470,10 @@ static const struct file_operations cgroup_pidlist_operations = {
* in the cgroup.
*/
/* helper function for the two below it */
-static int cgroup_pidlist_open(struct file *file, bool procs)
+static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- struct cgroup_pidlist *l = (procs ? &cgrp->procs : &cgrp->tasks);
+ struct cgroup_pidlist *l;
int retval;

/* Nothing to do for write-only files */
@@ -2409,7 +2481,7 @@ static int cgroup_pidlist_open(struct file *file, bool procs)
return 0;

/* have the array populated */
- retval = pidlist_array_load(cgrp, procs);
+ retval = pidlist_array_load(cgrp, type, &l);
if (retval)
return retval;
/* configure file information */
@@ -2425,11 +2497,11 @@ static int cgroup_pidlist_open(struct file *file, bool procs)
}
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
- return cgroup_pidlist_open(file, false);
+ return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
}
static int cgroup_procs_open(struct inode *unused, struct file *file)
{
- return cgroup_pidlist_open(file, true);
+ return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
}

static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,

2009-07-10 23:02:42

by Ben Blum

Subject: [PATCH 1/3] Adds a read-only "procs" file similar to "tasks" that shows only unique tgids

Adds a read-only "procs" file similar to "tasks" that shows only unique tgids

struct cgroup used to have a bunch of fields for keeping track of the pidlist
for the tasks file. Those fields are now separated out into a new struct
cgroup_pidlist, of which each cgroup has two: one for procs and one for tasks.
The way the seq_file operations are set up is changed so that just the pidlist
struct gets passed around as the private data.

Interface example: suppose a process with tgid 1000 has additional threads
with ids 1001 and 1002. The tasks file will show ids 1000, 1001, and 1002, and
the procs file will only show id 1000.

A possible future enhancement is making the procs file writable, for the
purpose of adding all threads with the same tgid at once.

Signed-off-by: Ben Blum <[email protected]>

---

include/linux/cgroup.h | 22 ++--
kernel/cgroup.c | 282 ++++++++++++++++++++++++++++++------------------
2 files changed, 190 insertions(+), 114 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 665fa70..8a3a3ac 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -141,6 +141,17 @@ enum {
CGRP_WAIT_ON_RMDIR,
};

+struct cgroup_pidlist {
+ /* protects the other fields */
+ struct rw_semaphore mutex;
+ /* array of xids */
+ pid_t *list;
+ /* how many elements the above list has */
+ int length;
+ /* how many files are using the current array */
+ int use_count;
+};
+
struct cgroup {
unsigned long flags; /* "unsigned long" so bitops work */

@@ -179,14 +190,9 @@ struct cgroup {
*/
struct list_head release_list;

- /* pids_mutex protects the fields below */
- struct rw_semaphore pids_mutex;
- /* Array of process ids in the cgroup */
- pid_t *tasks_pids;
- /* How many files are using the current tasks_pids array */
- int pids_use_count;
- /* Length of the current tasks_pids array */
- int pids_length;
+ /* we will have two separate pidlists, one for pids (the tasks file)
+ * and one for tgids (the procs file). */
+ struct cgroup_pidlist tasks, procs;

/* For RCU-protected deletion */
struct rcu_head rcu_head;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3737a68..2efe5c4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -960,7 +960,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
- init_rwsem(&cgrp->pids_mutex);
+ init_rwsem(&(cgrp->tasks.mutex));
+ init_rwsem(&(cgrp->procs.mutex));
}
static void init_cgroup_root(struct cgroupfs_root *root)
{
@@ -1408,15 +1409,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
return ret;
}

-/* The various types of files and directories in a cgroup file system */
-enum cgroup_filetype {
- FILE_ROOT,
- FILE_DIR,
- FILE_TASKLIST,
- FILE_NOTIFY_ON_RELEASE,
- FILE_RELEASE_AGENT,
-};
-
/**
* cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
* @cgrp: the cgroup to be checked for liveness
@@ -2114,7 +2106,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
}

/*
- * Stuff for reading the 'tasks' file.
+ * Stuff for reading the 'tasks'/'procs' files.
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
@@ -2124,27 +2116,110 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
*/

/*
- * Load into 'pidarray' up to 'npids' of the tasks using cgroup
- * 'cgrp'. Return actual number of pids loaded. No need to
- * task_lock(p) when reading out p->cgroup, since we're in an RCU
- * read section, so the css_set can't go away, and is
- * immutable after creation.
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * If the new stripped list is sufficiently smaller and there's enough memory
+ * to allocate a new buffer, will let go of the unneeded memory. Returns the
+ * number of unique elements.
+ */
+/* is the size difference enough that we should re-allocate the array? */
+#define PIDLIST_REALLOC_DIFFERENCE(old,new) ((old) - PAGE_SIZE >= (new))
+static int pidlist_uniq(pid_t **p, int length)
+{
+ int src, dest = 1;
+ pid_t *list = *p;
+ pid_t *newlist;
+
+ /*
+ * we presume the 0th element is unique, so i starts at 1. trivial
+ * edge cases first; no work needs to be done for either
+ */
+ if (length == 0 || length == 1)
+ return length;
+ /* src and dest walk down the list; dest counts unique elements */
+ for (src = 1; src < length; src++) {
+ /* find next unique element */
+ while (list[src] == list[src-1]) {
+ src++;
+ if (src == length)
+ break;
+ }
+ if (src == length)
+ break;
+ /* dest always points to where the next unique element goes */
+ list[dest] = list[src];
+ dest++;
+ }
+ /*
+ * if the length difference is large enough, we want to allocate a
+ * smaller buffer to save memory. if this fails due to out of memory,
+ * we'll just stay with what we've got.
+ */
+ if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
+ newlist = kmalloc(dest * sizeof(pid_t), GFP_KERNEL);
+ if (newlist) {
+ memcpy(newlist, list, dest * sizeof(pid_t));
+ kfree(list);
+ *p = newlist;
+ }
+ }
+ return dest;
+}
+
+static int cmppid(const void *a, const void *b)
+{
+ return *(pid_t *)a - *(pid_t *)b;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
-static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
+static int pidlist_array_load(struct cgroup *cgrp, bool procs)
{
- int n = 0, pid;
+ pid_t *array;
+ int length;
+ int pid, n = 0; /* used for populating the array */
struct cgroup_iter it;
struct task_struct *tsk;
+ struct cgroup_pidlist *l;
+
+ /*
+ * If cgroup gets more users after we read count, we won't have
+ * enough space - tough. This race is indistinguishable to the
+ * caller from the case that the additional cgroup users didn't
+ * show up until sometime later on.
+ */
+ length = cgroup_task_count(cgrp);
+ array = kmalloc(length * sizeof(pid_t), GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+ /* now, populate the array */
cgroup_iter_start(cgrp, &it);
while ((tsk = cgroup_iter_next(cgrp, &it))) {
- if (unlikely(n == npids))
+ if (unlikely(n == length))
break;
- pid = task_pid_vnr(tsk);
- if (pid > 0)
- pidarray[n++] = pid;
+ /* get tgid or pid for procs or tasks file respectively */
+ pid = (procs ? task_tgid_vnr(tsk) : task_pid_vnr(tsk));
+ if (pid > 0) /* make sure to only use valid results */
+ array[n++] = pid;
}
cgroup_iter_end(cgrp, &it);
- return n;
+ length = n;
+ /* now sort & (if procs) strip out duplicates */
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
+ if (procs) {
+ length = pidlist_uniq(&array, length);
+ l = &(cgrp->procs);
+ } else {
+ l = &(cgrp->tasks);
+ }
+ /* store array in cgroup, freeing old if necessary */
+ down_write(&l->mutex);
+ kfree(l->list);
+ l->list = array;
+ l->length = length;
+ l->use_count++;
+ up_write(&l->mutex);
+ return 0;
}

/**
@@ -2201,19 +2276,14 @@ err:
return ret;
}

-static int cmppid(const void *a, const void *b)
-{
- return *(pid_t *)a - *(pid_t *)b;
-}
-

/*
- * seq_file methods for the "tasks" file. The seq_file position is the
+ * seq_file methods for the tasks/procs files. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->tasks_pids array.
+ * in the cgroup->l->list array.
*/

-static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
/*
* Initially we receive a position value that corresponds to
@@ -2221,46 +2291,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup *cgrp = s->private;
+ struct cgroup_pidlist *l = s->private;
int index = 0, pid = *pos;
int *iter;

- down_read(&cgrp->pids_mutex);
+ down_read(&l->mutex);
if (pid) {
- int end = cgrp->pids_length;
+ int end = l->length;

while (index < end) {
int mid = (index + end) / 2;
- if (cgrp->tasks_pids[mid] == pid) {
+ if (l->list[mid] == pid) {
index = mid;
break;
- } else if (cgrp->tasks_pids[mid] <= pid)
+ } else if (l->list[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
- if (index >= cgrp->pids_length)
+ if (index >= l->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
- iter = cgrp->tasks_pids + index;
+ iter = l->list + index;
*pos = *iter;
return iter;
}

-static void cgroup_tasks_stop(struct seq_file *s, void *v)
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
- struct cgroup *cgrp = s->private;
- up_read(&cgrp->pids_mutex);
+ struct cgroup_pidlist *l = s->private;
+ up_read(&l->mutex);
}

-static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup *cgrp = s->private;
- int *p = v;
- int *end = cgrp->tasks_pids + cgrp->pids_length;
-
+ struct cgroup_pidlist *l = s->private;
+ pid_t *p = v;
+ pid_t *end = l->list + l->length;
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
@@ -2274,98 +2343,94 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
}
}

-static int cgroup_tasks_show(struct seq_file *s, void *v)
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
return seq_printf(s, "%d\n", *(int *)v);
}

-static struct seq_operations cgroup_tasks_seq_operations = {
- .start = cgroup_tasks_start,
- .stop = cgroup_tasks_stop,
- .next = cgroup_tasks_next,
- .show = cgroup_tasks_show,
+/*
+ * seq_operations functions for iterating on pidlists through seq_file -
+ * independent of whether it's tasks or procs
+ */
+static const struct seq_operations cgroup_pidlist_seq_operations = {
+ .start = cgroup_pidlist_start,
+ .stop = cgroup_pidlist_stop,
+ .next = cgroup_pidlist_next,
+ .show = cgroup_pidlist_show,
};

-static void release_cgroup_pid_array(struct cgroup *cgrp)
+static void cgroup_release_pid_array(struct cgroup_pidlist *l)
{
- down_write(&cgrp->pids_mutex);
- BUG_ON(!cgrp->pids_use_count);
- if (!--cgrp->pids_use_count) {
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = NULL;
- cgrp->pids_length = 0;
+ down_write(&l->mutex);
+ BUG_ON(!l->use_count);
+ if (!--l->use_count) {
+ kfree(l->list);
+ l->list = NULL;
+ l->length = 0;
}
- up_write(&cgrp->pids_mutex);
+ up_write(&l->mutex);
}

-static int cgroup_tasks_release(struct inode *inode, struct file *file)
+static int cgroup_pidlist_release(struct inode *inode, struct file *file)
{
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-
+ struct cgroup_pidlist *l;
if (!(file->f_mode & FMODE_READ))
return 0;
-
- release_cgroup_pid_array(cgrp);
+ /*
+ * the seq_file will only be initialized if the file was opened for
+ * reading; hence we check if it's not null only in that case.
+ */
+ l = ((struct seq_file *)file->private_data)->private;
+ cgroup_release_pid_array(l);
return seq_release(inode, file);
}

-static struct file_operations cgroup_tasks_operations = {
+static const struct file_operations cgroup_pidlist_operations = {
.read = seq_read,
.llseek = seq_lseek,
.write = cgroup_file_write,
- .release = cgroup_tasks_release,
+ .release = cgroup_pidlist_release,
};

/*
- * Handle an open on 'tasks' file. Prepare an array containing the
- * process id's of tasks currently attached to the cgroup being opened.
+ * The following functions handle opens on a file that displays a pidlist
+ * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
+ * in the cgroup.
*/
-
-static int cgroup_tasks_open(struct inode *unused, struct file *file)
+/* helper function for the two below it */
+static int cgroup_pidlist_open(struct file *file, bool procs)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- pid_t *pidarray;
- int npids;
+ struct cgroup_pidlist *l = (procs ? &cgrp->procs : &cgrp->tasks);
int retval;

/* Nothing to do for write-only files */
if (!(file->f_mode & FMODE_READ))
return 0;

- /*
- * If cgroup gets more users after we read count, we won't have
- * enough space - tough. This race is indistinguishable to the
- * caller from the case that the additional cgroup users didn't
- * show up until sometime later on.
- */
- npids = cgroup_task_count(cgrp);
- pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
- if (!pidarray)
- return -ENOMEM;
- npids = pid_array_load(pidarray, npids, cgrp);
- sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
- /*
- * Store the array in the cgroup, freeing the old
- * array if necessary
- */
- down_write(&cgrp->pids_mutex);
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = pidarray;
- cgrp->pids_length = npids;
- cgrp->pids_use_count++;
- up_write(&cgrp->pids_mutex);
-
- file->f_op = &cgroup_tasks_operations;
+ /* have the array populated */
+ retval = pidlist_array_load(cgrp, procs);
+ if (retval)
+ return retval;
+ /* configure file information */
+ file->f_op = &cgroup_pidlist_operations;

- retval = seq_open(file, &cgroup_tasks_seq_operations);
+ retval = seq_open(file, &cgroup_pidlist_seq_operations);
if (retval) {
- release_cgroup_pid_array(cgrp);
+ cgroup_release_pid_array(l);
return retval;
}
- ((struct seq_file *)file->private_data)->private = cgrp;
+ ((struct seq_file *)file->private_data)->private = l;
return 0;
}
+static int cgroup_tasks_open(struct inode *unused, struct file *file)
+{
+ return cgroup_pidlist_open(file, false);
+}
+static int cgroup_procs_open(struct inode *unused, struct file *file)
+{
+ return cgroup_pidlist_open(file, true);
+}

static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
struct cftype *cft)
@@ -2388,21 +2453,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
/*
* for the common functions, 'private' gives the type of file
*/
+/* for hysterical raisins, we can't put this on the older files */
+#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
static struct cftype files[] = {
{
.name = "tasks",
.open = cgroup_tasks_open,
.write_u64 = cgroup_tasks_write,
- .release = cgroup_tasks_release,
- .private = FILE_TASKLIST,
+ .release = cgroup_pidlist_release,
.mode = S_IRUGO | S_IWUSR,
},
-
+ {
+ .name = CGROUP_FILE_GENERIC_PREFIX "procs",
+ .open = cgroup_procs_open,
+ /* .write_u64 = cgroup_procs_write, TODO */
+ .release = cgroup_pidlist_release,
+ .mode = S_IRUGO,
+ },
{
.name = "notify_on_release",
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
- .private = FILE_NOTIFY_ON_RELEASE,
},
};

@@ -2411,7 +2482,6 @@ static struct cftype cft_release_agent = {
.read_seq_string = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
.max_write_len = PATH_MAX,
- .private = FILE_RELEASE_AGENT,
};

static int cgroup_populate_dir(struct cgroup *cgrp)

2009-07-10 23:02:53

by Ben Blum

Subject: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

Quick vmalloc vs kmalloc fix to the case where array size is too large

Separates all pidlist allocation requests to a separate function that judges
based on the requested size whether or not the array needs to be vmalloced or
can be gotten via kmalloc, and similar for kfree/vfree. Should be replaced
entirely with a kernel-wide solution to this general problem.

Depends on cgroup-pidlist-namespace.patch, cgroup-procs.patch

Signed-off-by: Ben Blum <[email protected]>

---

kernel/cgroup.c | 34 ++++++++++++++++++++++++++++------
1 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 33d89be..0ed85fa 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -48,6 +48,7 @@
#include <linux/namei.h>
#include <linux/smp_lock.h>
#include <linux/pid_namespace.h>
+#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */

#include <asm/atomic.h>

@@ -2121,6 +2122,27 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
*/

/*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static inline void *pidlist_allocate(int count)
+{
+ if (PIDLIST_TOO_LARGE(count))
+ return vmalloc(count * sizeof(pid_t));
+ else
+ return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+static inline void pidlist_free(void *p)
+{
+ if (is_vmalloc_addr(p))
+ vfree(p);
+ else
+ kfree(p);
+}
+
+/*
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* If the new stripped list is sufficiently smaller and there's enough memory
* to allocate a new buffer, will let go of the unneeded memory. Returns the
@@ -2160,10 +2182,10 @@ static int pidlist_uniq(pid_t **p, int length)
* we'll just stay with what we've got.
*/
if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
- newlist = kmalloc(dest * sizeof(pid_t), GFP_KERNEL);
+ newlist = pidlist_allocate(dest);
if (newlist) {
memcpy(newlist, list, dest * sizeof(pid_t));
- kfree(list);
+ pidlist_free(list);
*p = newlist;
}
}
@@ -2243,7 +2265,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
- array = kmalloc(length * sizeof(pid_t), GFP_KERNEL);
+ array = pidlist_allocate(length);
if (!array)
return -ENOMEM;
/* now, populate the array */
@@ -2268,11 +2290,11 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
}
l = cgroup_pidlist_find(cgrp, type);
if (!l) {
- kfree(array);
+ pidlist_free(array);
return -ENOMEM;
}
/* store array, freeing old if necessary - lock already held */
- kfree(l->list);
+ pidlist_free(l->list);
l->list = array;
l->length = length;
l->use_count++;
@@ -2433,7 +2455,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
/* we're the last user if refcount is 0; remove and free */
list_del(&l->links);
mutex_unlock(&l->owner->pidlist_mutex);
- kfree(l->list);
+ pidlist_free(l->list);
put_pid_ns(l->key.ns);
up_write(&l->mutex);
kfree(l);

2009-07-11 21:59:42

by Serge E. Hallyn

Subject: Re: [PATCH 2/3] Ensures correct concurrent opening/reading of pidlists across pid namespaces

Quoting Ben Blum ([email protected]):
...
> +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
> + enum cgroup_filetype type)
> +{
> + struct cgroup_pidlist *l;
> + /* don't need task_nsproxy() if we're looking at ourself */
> + struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
> + /*
> + * We can't drop the pidlist_mutex before taking the l->mutex in case
> + * the last ref-holder is trying to remove l from the list at the same
> + * time. Holding the pidlist_mutex precludes somebody taking whichever
> + * list we find out from under us - compare release_pid_array().
> + */
> + mutex_lock(&cgrp->pidlist_mutex);
> + list_for_each_entry(l, &cgrp->pidlists, links) {
> + if (l->key.type == type && l->key.ns == ns) {
> + /* found a matching list - drop the extra refcount */
> + put_pid_ns(ns);
> + /* make sure l doesn't vanish out from under us */
> + down_write(&l->mutex);
> + mutex_unlock(&cgrp->pidlist_mutex);
> + l->use_count++;
> + return l;
> + }
> + }
> + /* entry not found; create a new one */
> + l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
> + if (!l) {
> + mutex_unlock(&cgrp->pidlist_mutex);

Again, you need to put_pid_ns(ns) here.

> + return l;
> + }
> + init_rwsem(&l->mutex);
> + down_write(&l->mutex);
> + l->key.type = type;
> + l->key.ns = ns;
> + l->use_count = 0; /* don't increment here */
> + l->list = NULL;
> + l->owner = cgrp;
> + list_add(&l->links, &cgrp->pidlists);
> + mutex_unlock(&cgrp->pidlist_mutex);
> + return l;
> +}
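
For reference, the error path with the fix Serge suggests would look like
this (a minimal sketch; only the put_pid_ns() line is new):

	/* entry not found; create a new one */
	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
	if (!l) {
		mutex_unlock(&cgrp->pidlist_mutex);
		put_pid_ns(ns); /* drop the reference taken by get_pid_ns() */
		return l;
	}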

2009-07-13 03:03:13

by Li Zefan

Subject: Re: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

Ben Blum wrote:
> Quick vmalloc vs kmalloc fix to the case where array size is too large
>
> Separates all pidlist allocation requests to a separate function that judges
> based on the requested size whether or not the array needs to be vmalloced or
> can be gotten via kmalloc, and similar for kfree/vfree. Should be replaced
> entirely with a kernel-wide solution to this general problem.
>
> Depends on cgroup-pidlist-namespace.patch, cgroup-procs.patch
>

Since this is a patchset, you don't need to note this patch's dependencies,
at least not in the changelog; you can put them ...

> Signed-off-by: Ben Blum <[email protected]>
>
> ---
>

... here

--- <- followed by this mark

> kernel/cgroup.c | 34 ++++++++++++++++++++++++++++------
> 1 files changed, 28 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 33d89be..0ed85fa 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -48,6 +48,7 @@
> #include <linux/namei.h>
> #include <linux/smp_lock.h>
> #include <linux/pid_namespace.h>
> +#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
>

Is this TODO different from the one below?

> #include <asm/atomic.h>
>
> @@ -2121,6 +2122,27 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
> */
>
> /*
> + * The following two functions "fix" the issue where there are more pids
> + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
> + * TODO: replace with a kernel-wide solution to this problem
> + */
> +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))

I think order-0 is most robust and should be used as much as possible.

> +static inline void *pidlist_allocate(int count)

It's better to let gcc decide whether or not to inline it.

> +{
> + if (PIDLIST_TOO_LARGE(count))
> + return vmalloc(count * sizeof(pid_t));
> + else
> + return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
> +}
> +static inline void pidlist_free(void *p)

ditto

> +{
> + if (is_vmalloc_addr(p))
> + vfree(p);
> + else
> + kfree(p);
> +}
> +

2009-07-13 03:46:19

by Li Zefan

Subject: Re: [PATCH 1/3] Adds a read-only "procs" file similar to "tasks" that shows only unique tgids

Ben Blum wrote:
> Adds a read-only "procs" file similar to "tasks" that shows only unique tgids
>
> struct cgroup used to have a bunch of fields for keeping track of the pidlist
> for the tasks file. Those are now separated into a new struct cgroup_pidlist,
> of which two are had, one for procs and one for tasks. The way the seq_file
> operations are set up is changed so that just the pidlist struct gets passed
> around as the private data.
>
> Interface example: suppose a process with tgid 1000 has additional threads
> with ids 1001 and 1002. The tasks file will show ids 1000, 1001, and 1002, and
> the procs file will only show id 1000.
>

I think a better demonstration is:

$ cat tasks
1000
1001
1002

$ cat procs
1000

> Possible future functionality is making the procs file writable for purposes
> of adding all threads with the same tgid at once.
>
> Signed-off-by: Ben Blum <[email protected]>
>
> ---
>
> include/linux/cgroup.h | 22 ++--
> kernel/cgroup.c | 282 ++++++++++++++++++++++++++++++------------------
> 2 files changed, 190 insertions(+), 114 deletions(-)
...
> +/* is the size difference enough that we should re-allocate the array? */
> +#define PIDLIST_REALLOC_DIFFERENCE(old,new) ((old) - PAGE_SIZE >= (new))
> +static int pidlist_uniq(pid_t **p, int length)
> +{
> + int src, dest = 1;
> + pid_t *list = *p;
> + pid_t *newlist;
> +
> + /*
> + * we presume the 0th element is unique, so i starts at 1. trivial
> + * edge cases first; no work needs to be done for either
> + */
> + if (length == 0 || length == 1)
> + return length;
> + /* src and dest walk down the list; dest counts unique elements */
> + for (src = 1; src < length; src++) {
> + /* find next unique element */
> + while (list[src] == list[src-1]) {
> + src++;
> + if (src == length)
> + break;

'goto' is better

> + }
> + if (src == length)
> + break;
> + /* dest always points to where the next unique element goes */
> + list[dest] = list[src];
> + dest++;
> + }
> + /*
> + * if the length difference is large enough, we want to allocate a
> + * smaller buffer to save memory. if this fails due to out of memory,
> + * we'll just stay with what we've got.
> + */
> + if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
> + newlist = kmalloc(dest * sizeof(pid_t), GFP_KERNEL);

krealloc()

> + if (newlist) {
> + memcpy(newlist, list, dest * sizeof(pid_t));
> + kfree(list);
> + *p = newlist;
> + }
> + }
> + return dest;
> +}
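
Taken together, a sketch of the loop and reallocation with both suggestions
applied - 'goto' replacing the duplicated length check, and krealloc()
replacing the kmalloc()/memcpy()/kfree() sequence (the 'after' label name is
hypothetical, not from the patch):

	/* src and dest walk down the list; dest counts unique elements */
	for (src = 1; src < length; src++) {
		/* find next unique element */
		while (list[src] == list[src-1]) {
			src++;
			if (src == length)
				goto after;
		}
		/* dest always points to where the next unique element goes */
		list[dest] = list[src];
		dest++;
	}
after:
	if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
		/*
		 * krealloc() copies to the smaller buffer and frees the old
		 * one for us; on failure it returns NULL and leaves the
		 * original intact, so we just keep what we've got.
		 */
		newlist = krealloc(list, dest * sizeof(pid_t), GFP_KERNEL);
		if (newlist)
			*p = newlist;
	}
	return dest;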

2009-07-13 06:04:56

by Kamezawa Hiroyuki

Subject: Re: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

On Mon, 13 Jul 2009 11:03:03 +0800
Li Zefan <[email protected]> wrote:

> Ben Blum wrote:
> > Quick vmalloc vs kmalloc fix to the case where array size is too large
> >
> > Separates all pidlist allocation requests to a separate function that judges
> > based on the requested size whether or not the array needs to be vmalloced or
> > can be gotten via kmalloc, and similar for kfree/vfree. Should be replaced
> > entirely with a kernel-wide solution to this general problem.
> >
> > Depends on cgroup-pidlist-namespace.patch, cgroup-procs.patch
> >
>
> Since this is a patchset, you don't need to tell the dependencies of
> this patch, at least not in changelog, but can put it ...
>
> > Signed-off-by: Ben Blum <[email protected]>
> >
> > ---
> >
>
> ... here
>
> --- <- followed by this mark
>
> > kernel/cgroup.c | 34 ++++++++++++++++++++++++++++------
> > 1 files changed, 28 insertions(+), 6 deletions(-)
> >
> > diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> > index 33d89be..0ed85fa 100644
> > --- a/kernel/cgroup.c
> > +++ b/kernel/cgroup.c
> > @@ -48,6 +48,7 @@
> > #include <linux/namei.h>
> > #include <linux/smp_lock.h>
> > #include <linux/pid_namespace.h>
> > +#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
> >
>
> Is this TODO different with the below one?
>
> > #include <asm/atomic.h>
> >
> > @@ -2121,6 +2122,27 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
> > */
> >
> > /*
> > + * The following two functions "fix" the issue where there are more pids
> > + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
> > + * TODO: replace with a kernel-wide solution to this problem
> > + */
> > +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
>
> I think order-0 is most robust and should be used as much as possible.
>
> > +static inline void *pidlist_allocate(int count)
>
> It's better to let gcc decide to inline it or not.
>
> > +{
> > + if (PIDLIST_TOO_LARGE(count))
> > + return vmalloc(count * sizeof(pid_t));
> > + else
> > + return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
> > +}
> > +static inline void pidlist_free(void *p)
>
> ditto
>
Why not use vmalloc() always?

Thanks,
-Kame



> > +{
> > + if (is_vmalloc_addr(p))
> > + vfree(p);
> > + else
> > + kfree(p);
> > +}
> > +

2009-07-13 15:25:30

by Ben Blum

Subject: Re: [PATCH 1/3] Adds a read-only "procs" file similar to "tasks" that shows only unique tgids

On Sun, Jul 12, 2009 at 11:46 PM, Li Zefan<[email protected]> wrote:
> I think a better demonstration is:
>
> $ cat tasks
> 1000
> 1001
> 1002
>
> $ cat procs
> 1000

Indeed.

>> +        /*
>> +         * if the length difference is large enough, we want to allocate a
>> +         * smaller buffer to save memory. if this fails due to out of memory,
>> +         * we'll just stay with what we've got.
>> +         */
>> +        if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
>> +                newlist = kmalloc(dest * sizeof(pid_t), GFP_KERNEL);
>
> krealloc()
>

Ah yes. I had thought about krealloc and determined that it wasn't
clever enough to do in-place resizing, but forgot that it could be
used anyway here.

2009-07-13 15:27:48

by Ben Blum

Subject: Re: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

On Mon, Jul 13, 2009 at 2:03 AM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
> Why not use vmalloc() always?
>
> Thanks,
> -Kame
>

The idea behind this patch is that in most cases, we want to use
kmalloc for performance, but sometimes the array will be too big. See
the previous thread (linked in the root message of this thread) for
discussion of the issue.

2009-07-13 23:51:38

by Kamezawa Hiroyuki

Subject: Re: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

On Mon, 13 Jul 2009 11:27:43 -0400
Benjamin Blum <[email protected]> wrote:

> On Mon, Jul 13, 2009 at 2:03 AM, KAMEZAWA
> Hiroyuki<[email protected]> wrote:
> > Why not use vmalloc() always?
> >
> > Thanks,
> > -Kame
> >
>
> The idea behind this patch is that in most cases, we want to use
> kmalloc for performance, but sometimes the array will be too big. See
> the previous thread (linked in the root message of this thread) for
> discussion of the issue.
>
IIUC, this place, the .../procs interface, is not so important for performance
as to justify this ugly coding.

Thanks,
-Kame

2009-07-14 03:50:42

by Paul Menage

Subject: Re: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

On Mon, Jul 13, 2009 at 4:49 PM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
> IIUC, this place, .../procs interface is not so important for performance
> as to being allowed this ugly conding.
>

It's not just the calling thread that suffers from the overhead -
we've seen performance hits on other processes on a machine due to the
TLB-shootdown overhead associated with vmalloc()/vfree().

Paul

2009-07-14 03:55:29

by Kamezawa Hiroyuki

Subject: Re: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

On Mon, 13 Jul 2009 20:50:33 -0700
Paul Menage <[email protected]> wrote:

> On Mon, Jul 13, 2009 at 4:49 PM, KAMEZAWA
> Hiroyuki<[email protected]> wrote:
> > IIUC, this place, .../procs interface is not so important for performance
> > as to being allowed this ugly conding.
> >
>
> It's not just the calling thread that suffers from the overhead -
> we've seen performance hits on other processes on a machine due to the
> TLB-shootdown overhead associated with vmalloc()/vfree().
>
Hmm, ok... then, if there are too many pids, you hit the vmalloc()/vfree()
problem again. So it's not a good idea to use vmalloc/vfree here after all.

Thanks,
-Kame

2009-07-14 04:04:39

by Paul Menage

Subject: Re: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

On Mon, Jul 13, 2009 at 8:53 PM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
> Hmm, ok... then, if there are too many pids, you hit the vmalloc()/vfree()
> problem again. So it's not a good idea to use vmalloc/vfree here after all.
>

Agreed, using vmalloc()/vfree() isn't a perfect solution, but at least
the approach of kmalloc() for small allocations and vmalloc() for
larger allocations solves the current problem (reading tasks/procs
files can fail due to lack of contiguous kmalloc memory) without
introducing overhead in the typical case.

Paul

2009-07-14 04:27:32

by Kamezawa Hiroyuki

Subject: Re: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

On Mon, 13 Jul 2009 21:04:32 -0700
Paul Menage <[email protected]> wrote:

> On Mon, Jul 13, 2009 at 8:53 PM, KAMEZAWA
> Hiroyuki<[email protected]> wrote:
> > Hmm, ok... then, if there are too many pids, you hit the vmalloc()/vfree()
> > problem again. So it's not a good idea to use vmalloc/vfree here after all.
> >
>
> Agreed, using vmalloc()/vfree() isn't a perfect solution, but at least
> the approach of kmalloc() for small allocations and vmalloc() for
> larger allocations solves the current problem (reading tasks/procs
> files can fail due to lack of contiguous kmalloc memory) without
> introducing overhead in the typical case.
>
My point is
- More PIDs, more time necessary to read the procs file.
This patch boosts it ;) Seems like a "visit this later again", or FIXME, patch.

Thanks,
-Kame


> Paul
>

2009-07-14 17:26:37

by Ben Blum

Subject: Re: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

On Tue, Jul 14, 2009 at 12:25 AM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
> My point is
> - More PIDs, more time necessary to read the procs file.
> This patch boosts it ;) Seems like a "visit this later again", or FIXME, patch.
>
> Thanks,
> -Kame

Indeed. You'll notice the TODOs in the code here referring to the
discussion of a possible dynamic array system in the previous thread.
This is simply a correctness patch that aims to keep performance as
good as it can be for the current approach.

(Kame, forgive the double-post; forgot to reply-all the first time)

- Ben

2009-07-14 17:28:48

by Paul Menage

Subject: Re: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

On Mon, Jul 13, 2009 at 9:25 PM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
> My point is
> - More PIDs, more time necessary to read the procs file.

Right now, many pids => impossible to read the procs file due to kmalloc
failure. (This was always the case with cpusets too.) So using vmalloc
in those cases is a strict improvement.

> This patch boosts it ;) Seems like a "visit this later again", or FIXME, patch.

Agreed.

Paul

2009-07-14 17:47:44

by Dave Hansen

Subject: Re: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

On Tue, 2009-07-14 at 10:28 -0700, Paul Menage wrote:
> On Mon, Jul 13, 2009 at 9:25 PM, KAMEZAWA
> Hiroyuki<[email protected]> wrote:
> > My point is
> > - More PIDs, more time necessary to read the procs file.
>
> Right now, many pids => impossible to read procs file due to kmalloc
> failure. (This was always the case with cpusets too). So using kmalloc
> in those cases is a strict improvement.

How big were those allocations that were failing? The code made it
appear that order-2 (PAGE_SIZE*4) allocations were failing. That's a
bit lower than I'd expect the page allocator to start failing.

-- Dave

2009-07-14 17:50:15

by Paul Menage

Subject: Re: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

On Tue, Jul 14, 2009 at 10:47 AM, Dave Hansen<[email protected]> wrote:
>
> How big were those allocations that were failing?  The code made it
> appear that order-2 (PAGE_SIZE*4) allocations were failing.  That's a
> bit lower than I'd expect the page allocator to start failing.

I think it depends on how much fragmentation you've got.

We've seen it fail for cpusets with (I guess) hundreds or thousands of threads.

Paul

2009-07-14 19:01:30

by Dave Hansen

Subject: Re: [PATCH 1/3] Adds a read-only "procs" file similar to "tasks" that shows only unique tgids

On Fri, 2009-07-10 at 16:01 -0700, Ben Blum wrote:
> +struct cgroup_pidlist {
> + /* protects the other fields */
> + struct rw_semaphore mutex;
> + /* array of xids */
> + pid_t *list;
> + /* how many elements the above list has */
> + int length;
> + /* how many files are using the current array */
> + int use_count;
> +};

I think a slightly nicer way of doing this would be to use a structure
like this:

#define NR_PIDS ((PAGE_SIZE - sizeof(struct list_head)) / sizeof(pid_t))
struct pid_list {
        struct list_head list;
        pid_t pids[NR_PIDS];
};

That way, you can always kmalloc(sizeof(pid_list)), it fits nicely in
PAGE_SIZE, and you can chain them together.
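
To illustrate the chaining, a hypothetical helper (the name and signature
are purely illustrative, not part of this proposal):

/* append a pid, starting a new block whenever the current one fills up */
static struct pid_list *pid_list_append(struct list_head *head,
                                        struct pid_list *cur, int *n,
                                        pid_t pid)
{
        if (!cur || *n == NR_PIDS) {
                cur = kmalloc(sizeof(*cur), GFP_KERNEL);
                if (!cur)
                        return NULL;
                list_add_tail(&cur->list, head);
                *n = 0;
        }
        cur->pids[(*n)++] = pid;
        return cur;
}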

Or, you could always just use one of the other flexible structures in
the kernel like a radix_tree.

-- Dave

2009-07-14 21:26:23

by Ben Blum

Subject: Re: [PATCH 1/3] Adds a read-only "procs" file similar to "tasks" that shows only unique tgids

On Tue, Jul 14, 2009 at 11:34 AM, Dave Hansen<[email protected]> wrote:
> On Fri, 2009-07-10 at 16:01 -0700, Ben Blum wrote:
>> +struct cgroup_pidlist {
>> +        /* protects the other fields */
>> +        struct rw_semaphore mutex;
>> +        /* array of xids */
>> +        pid_t *list;
>> +        /* how many elements the above list has */
>> +        int length;
>> +        /* how many files are using the current array */
>> +        int use_count;
>> +};
>
> I think a slightly nicer way of doing this would be to use a structure
> like this:
>
> #define NR_PIDS ((PAGE_SIZE - sizeof(struct list_head)) / sizeof(pid_t))
> struct pid_list {
>         struct list_head list;
>         pid_t pids[NR_PIDS];
> };
>
> That way, you can always kmalloc(sizeof(pid_list)), it fits nicely in
> PAGE_SIZE, and you can chain them together.
>
> Or, you could always just use one of the other flexible structures in
> the kernel like a radix_tree.
>
> -- Dave

This method looks to be a compromise between Andrew's proposed
generalized solution ( http://lkml.org/lkml/2009/7/2/518 ) and the
current quick-fix. The problem with it is that it'll require a layer
between whoever's using the array and managing the list structs (for
the case where we need to chain multiple blocks together), and if
we're going to put forth enough effort for that, we may as well go
ahead and write up a generalized kernel-wide library to fix this size
problem globally.

2009-07-14 21:30:36

by Ben Blum

Subject: Re: [PATCH 3/3] Quick vmalloc vs kmalloc fix to the case where array size is too large

Indeed.

Alternatively, I could make it switch on KMALLOC_MAX_SIZE as follows:

if (size > KMALLOC_MAX_SIZE) {
        /* use vmalloc directly */
} else {
        /* try kmalloc, and, expecting fragmentation, if that fails, use vmalloc */
}

As the free wrapper uses is_vmalloc_addr, it'd work fine and be able
to decide for a certain range of sizes whether kmalloc or vmalloc is
appropriate.
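
Fleshed out, that alternative might look like this (a sketch reusing the
pidlist_allocate() name from patch 3/3, not code from this thread):

static void *pidlist_allocate(int count)
{
        size_t size = count * sizeof(pid_t);
        void *p;

        /* kmalloc can never satisfy an allocation above KMALLOC_MAX_SIZE */
        if (size > KMALLOC_MAX_SIZE)
                return vmalloc(size);
        /* otherwise try kmalloc first, falling back under fragmentation */
        p = kmalloc(size, GFP_KERNEL);
        if (!p)
                p = vmalloc(size);
        return p;
}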

On Tue, Jul 14, 2009 at 10:50 AM, Paul Menage<[email protected]> wrote:
> On Tue, Jul 14, 2009 at 10:47 AM, Dave Hansen<[email protected]> wrote:
>>
>> How big were those allocations that were failing?  The code made it
>> appear that order-2 (PAGE_SIZE*4) allocations were failing.  That's a
>> bit lower than I'd expect the page allocator to start failing.
>
> I think it depends on how much fragmentation you've got.
>
> We've seen it fail for cpusets with (I guess) hundreds or thousands of threads.
>
> Paul
>

2009-07-14 21:49:33

by Dave Hansen

Subject: Re: [PATCH 1/3] Adds a read-only "procs" file similar to "tasks" that shows only unique tgids

On Tue, 2009-07-14 at 14:26 -0700, Benjamin Blum wrote:
> This method looks to be a compromise between Andrew's proposed
> generalized solution ( http://lkml.org/lkml/2009/7/2/518 ) and the
> current quick-fix. The problem with it is that it'll require a layer
> between whoever's using the array and managing the list structs (for
> the case where we need to chain multiple blocks together), and if
> we're going to put forth enough effort for that, we may as well go
> ahead and write up a generalized kernel-wide library to fix this size
> problem globally.

This "Andrew" guy seems to know what he's talking about. :)

We've also got a set of pgarrs (struct page arrays) in the
checkpoint/restart code that would make use of something generic like
this.

-- Dave

2009-07-14 22:55:22

by Ben Blum

Subject: Re: [PATCH 1/3] Adds a read-only "procs" file similar to "tasks" that shows only unique tgids

On Tue, Jul 14, 2009 at 2:49 PM, Dave Hansen<[email protected]> wrote:
> On Tue, 2009-07-14 at 14:26 -0700, Benjamin Blum wrote:
>> This method looks to be a compromise between Andrew's proposed
>> generalized solution ( http://lkml.org/lkml/2009/7/2/518 ) and the
>> current quick-fix. The problem with it is that it'll require a layer
>> between whoever's using the array and managing the list structs (for
>> the case where we need to chain multiple blocks together), and if
>> we're going to put forth enough effort for that, we may as well go
>> ahead and write up a generalized kernel-wide library to fix this size
>> problem globally.
>
> This "Andrew" guy seems to know what he's talking about. :)

Imagine that.

> We've also got a set of pgarrs (struct page arrays) in the
> checkpoint/restart code that would make use of something generic like
> this.

Indeed. Once we get a generic dynamic array library, this and wherever
else in the kernel can be rearranged to use it.

2009-07-15 01:33:46

by Li Zefan

Subject: Re: [PATCH 1/3] Adds a read-only "procs" file similar to "tasks" that shows only unique tgids

Dave Hansen wrote:
> On Tue, 2009-07-14 at 14:26 -0700, Benjamin Blum wrote:
>> This method looks to be a compromise between Andrew's proposed
>> generalized solution ( http://lkml.org/lkml/2009/7/2/518 ) and the
>> current quick-fix. The problem with it is that it'll require a layer
>> between whoever's using the array and managing the list structs (for
>> the case where we need to chain multiple blocks together), and if
>> we're going to put forth enough effort for that, we may as well go
>> ahead and write up a generalized kernel-wide library to fix this size
>> problem globally.
>
> This "Andrew" guy seems to know what he's talking about. :)
>
> We've also got a set of pgarrs (struct page arrays) in the
> checkpoint/restart code that would make use of something generic like
> this.
>

The dynamic-page-array thing sounds like the right way to go. It's
open-coded in some other places; ring_buffer.c is one of them.