2006-11-23 12:36:59

by Paul Menage

[permalink] [raw]
Subject: [PATCH 2/7] Cpusets hooked into containers

This patch removes the process grouping code from the cpusets code,
instead hooking it into the generic container system. This temporarily
adds cpuset-specific code in kernel/container.c, which is removed by
the next patch in the series.

Signed-off-by: Paul Menage <[email protected]>

---
Documentation/cpusets.txt | 81 +-
fs/super.c | 5
include/linux/container.h | 7
include/linux/cpuset.h | 25
include/linux/fs.h | 2
include/linux/mempolicy.h | 2
include/linux/sched.h | 4
init/Kconfig | 23
kernel/container.c | 107 +++
kernel/cpuset.c | 1273 +++++-----------------------------------------
kernel/exit.c | 2
kernel/fork.c | 7
mm/oom_kill.c | 6
13 files changed, 330 insertions(+), 1214 deletions(-)

Index: container-2.6.19-rc5/include/linux/container.h
===================================================================
--- container-2.6.19-rc5.orig/include/linux/container.h
+++ container-2.6.19-rc5/include/linux/container.h
@@ -47,6 +47,10 @@ struct container {

struct container *parent; /* my parent */
struct dentry *dentry; /* container fs entry */
+
+#ifdef CONFIG_CPUSETS
+ struct cpuset *cpuset;
+#endif
};

/* struct cftype:
@@ -79,6 +83,9 @@ struct cftype {
int container_add_file(struct container *cont, const struct cftype *cft);

int container_is_removed(const struct container *cont);
+void container_set_release_agent_path(const char *path);
+
+int container_path(const struct container *cont, char *buf, int buflen);

#else /* !CONFIG_CONTAINERS */

Index: container-2.6.19-rc5/include/linux/cpuset.h
===================================================================
--- container-2.6.19-rc5.orig/include/linux/cpuset.h
+++ container-2.6.19-rc5/include/linux/cpuset.h
@@ -11,16 +11,15 @@
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
+#include <linux/container.h>

#ifdef CONFIG_CPUSETS

-extern int number_of_cpusets; /* How many cpusets are defined in system? */
+extern int number_of_cpusets; /* How many cpusets are defined in system? */

extern int cpuset_init_early(void);
extern int cpuset_init(void);
extern void cpuset_init_smp(void);
-extern void cpuset_fork(struct task_struct *p);
-extern void cpuset_exit(struct task_struct *p);
extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
void cpuset_init_current_mems_allowed(void);
@@ -47,10 +46,6 @@ extern void __cpuset_memory_pressure_bum

extern struct file_operations proc_cpuset_operations;
extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
-
-extern void cpuset_lock(void);
-extern void cpuset_unlock(void);
-
extern int cpuset_mem_spread_node(void);

static inline int cpuset_do_page_mem_spread(void)
@@ -65,13 +60,22 @@ static inline int cpuset_do_slab_mem_spr

extern void cpuset_track_online_nodes(void);

+extern int cpuset_can_attach_task(struct container *cont,
+ struct task_struct *tsk);
+extern void cpuset_attach_task(struct container *cont,
+ struct task_struct *tsk);
+extern void cpuset_post_attach_task(struct container *cont,
+ struct container *oldcont,
+ struct task_struct *tsk);
+extern int cpuset_populate_dir(struct container *cont);
+extern int cpuset_create(struct container *cont);
+extern void cpuset_destroy(struct container *cont);
+
#else /* !CONFIG_CPUSETS */

static inline int cpuset_init_early(void) { return 0; }
static inline int cpuset_init(void) { return 0; }
static inline void cpuset_init_smp(void) {}
-static inline void cpuset_fork(struct task_struct *p) {}
-static inline void cpuset_exit(struct task_struct *p) {}

static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
{
@@ -110,9 +114,6 @@ static inline char *cpuset_task_status_a
return buffer;
}

-static inline void cpuset_lock(void) {}
-static inline void cpuset_unlock(void) {}
-
static inline int cpuset_mem_spread_node(void)
{
return 0;
Index: container-2.6.19-rc5/kernel/exit.c
===================================================================
--- container-2.6.19-rc5.orig/kernel/exit.c
+++ container-2.6.19-rc5/kernel/exit.c
@@ -29,7 +29,6 @@
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
-#include <linux/cpuset.h>
#include <linux/container.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
@@ -923,7 +922,6 @@ fastcall NORET_TYPE void do_exit(long co
__exit_files(tsk);
__exit_fs(tsk);
exit_thread();
- cpuset_exit(tsk);
container_exit(tsk);
exit_keys(tsk);

Index: container-2.6.19-rc5/kernel/fork.c
===================================================================
--- container-2.6.19-rc5.orig/kernel/fork.c
+++ container-2.6.19-rc5/kernel/fork.c
@@ -30,7 +30,6 @@
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
-#include <linux/cpuset.h>
#include <linux/container.h>
#include <linux/security.h>
#include <linux/swap.h>
@@ -1056,13 +1055,12 @@ static struct task_struct *copy_process(
p->io_wait = NULL;
p->audit_context = NULL;
container_fork(p);
- cpuset_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_copy(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
- goto bad_fork_cleanup_cpuset;
+ goto bad_fork_cleanup_container;
}
mpol_fix_fork_child_flag(p);
#endif
@@ -1286,9 +1284,8 @@ bad_fork_cleanup_security:
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
mpol_free(p->mempolicy);
-bad_fork_cleanup_cpuset:
+bad_fork_cleanup_container:
#endif
- cpuset_exit(p);
container_exit(p);
bad_fork_cleanup_delays_binfmt:
delayacct_tsk_free(p);
Index: container-2.6.19-rc5/kernel/container.c
===================================================================
--- container-2.6.19-rc5.orig/kernel/container.c
+++ container-2.6.19-rc5/kernel/container.c
@@ -55,6 +55,7 @@
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
+#include <linux/cpuset.h>

#include <asm/uaccess.h>
#include <asm/atomic.h>
@@ -92,6 +93,18 @@ static struct container top_container =
.children = LIST_HEAD_INIT(top_container.children),
};

+/* The path to use for release notifications. No locking between
+ * setting and use - so if userspace updates this while subcontainers
+ * exist, you could miss a notification */
+static char release_agent_path[PATH_MAX] = "/sbin/container_release_agent";
+
+void container_set_release_agent_path(const char *path)
+{
+ container_manage_lock();
+ strcpy(release_agent_path, path);
+ container_manage_unlock();
+}
+
static struct vfsmount *container_mount;
static struct super_block *container_sb;

@@ -333,7 +346,7 @@ static inline struct cftype *__d_cft(str
* Returns 0 on success, -errno on error.
*/

-static int container_path(const struct container *cont, char *buf, int buflen)
+int container_path(const struct container *cont, char *buf, int buflen)
{
char *start;

@@ -397,7 +410,7 @@ static void container_release_agent(cons
return;

i = 0;
- argv[i++] = "/sbin/container_release_agent";
+ argv[i++] = release_agent_path;
argv[i++] = (char *)pathbuf;
argv[i] = NULL;

@@ -438,6 +451,7 @@ static void check_for_release(struct con
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!buf)
return;
+
if (container_path(cont, buf, PAGE_SIZE) < 0)
kfree(buf);
else
@@ -486,7 +500,7 @@ static int attach_task(struct container
pid_t pid;
struct task_struct *tsk;
struct container *oldcont;
- int retval;
+ int retval = 0;

if (sscanf(pidbuf, "%d", &pid) != 1)
return -EIO;
@@ -513,7 +527,9 @@ static int attach_task(struct container
get_task_struct(tsk);
}

- retval = security_task_setscheduler(tsk, 0, NULL);
+#ifdef CONFIG_CPUSETS
+ retval = cpuset_can_attach_task(cont, tsk);
+#endif
if (retval) {
put_task_struct(tsk);
return retval;
@@ -533,8 +549,16 @@ static int attach_task(struct container
rcu_assign_pointer(tsk->container, cont);
task_unlock(tsk);

+#ifdef CONFIG_CPUSETS
+ cpuset_attach_task(cont, tsk);
+#endif
+
mutex_unlock(&callback_mutex);

+#ifdef CONFIG_CPUSETS
+ cpuset_post_attach_task(cont, oldcont, tsk);
+#endif
+
put_task_struct(tsk);
synchronize_rcu();
if (atomic_dec_and_test(&oldcont->count))
@@ -549,6 +573,7 @@ typedef enum {
FILE_DIR,
FILE_NOTIFY_ON_RELEASE,
FILE_TASKLIST,
+ FILE_RELEASE_AGENT,
} container_filetype_t;

static ssize_t container_common_file_write(struct container *cont,
@@ -562,8 +587,7 @@ static ssize_t container_common_file_wri
char *pathbuf = NULL;
int retval = 0;

- /* Crude upper limit on largest legitimate cpulist user might write. */
- if (nbytes > 100 + 6 * NR_CPUS)
+ if (nbytes >= PATH_MAX)
return -E2BIG;

/* +1 for nul-terminator */
@@ -590,6 +614,20 @@ static ssize_t container_common_file_wri
case FILE_TASKLIST:
retval = attach_task(cont, buffer, &pathbuf);
break;
+ case FILE_RELEASE_AGENT:
+ {
+ if (nbytes < sizeof(release_agent_path)) {
+ /* We never write anything other than '\0'
+ * into the last char of release_agent_path,
+ * so it always remains a NUL-terminated
+ * string */
+ strncpy(release_agent_path, buffer, nbytes);
+ release_agent_path[nbytes] = 0;
+ } else {
+ retval = -ENOSPC;
+ }
+ break;
+ }
default:
retval = -EINVAL;
goto out2;
@@ -643,6 +681,17 @@ static ssize_t container_common_file_rea
case FILE_NOTIFY_ON_RELEASE:
*s++ = notify_on_release(cont) ? '1' : '0';
break;
+ case FILE_RELEASE_AGENT:
+ {
+ size_t n;
+ container_manage_lock();
+ n = strnlen(release_agent_path, sizeof(release_agent_path));
+ n = min(n, (size_t) PAGE_SIZE);
+ strncpy(s, release_agent_path, n);
+ container_manage_unlock();
+ s += n;
+ break;
+ }
default:
retval = -EINVAL;
goto out;
@@ -978,6 +1027,13 @@ static struct cftype cft_notify_on_relea
.private = FILE_NOTIFY_ON_RELEASE,
};

+static struct cftype cft_release_agent = {
+ .name = "release_agent",
+ .read = container_common_file_read,
+ .write = container_common_file_write,
+ .private = FILE_RELEASE_AGENT,
+};
+
static int container_populate_dir(struct container *cont)
{
int err;
@@ -986,6 +1042,13 @@ static int container_populate_dir(struct
return err;
if ((err = container_add_file(cont, &cft_tasks)) < 0)
return err;
+ if ((cont == &top_container) &&
+ (err = container_add_file(cont, &cft_release_agent)) < 0)
+ return err;
+#ifdef CONFIG_CPUSETS
+ if ((err = cpuset_populate_dir(cont)) < 0)
+ return err;
+#endif
return 0;
}

@@ -1017,6 +1080,12 @@ static long container_create(struct cont

cont->parent = parent;

+#ifdef CONFIG_CPUSETS
+ err = cpuset_create(cont);
+ if (err)
+ goto err_unlock_free;
+#endif
+
mutex_lock(&callback_mutex);
list_add(&cont->sibling, &cont->parent->children);
number_of_containers++;
@@ -1038,11 +1107,14 @@ static long container_create(struct cont
return 0;

err_remove:
+#ifdef CONFIG_CPUSETS
+ cpuset_destroy(cont);
+#endif
mutex_lock(&callback_mutex);
list_del(&cont->sibling);
number_of_containers--;
mutex_unlock(&callback_mutex);
-
+ err_unlock_free:
mutex_unlock(&manage_mutex);
kfree(cont);
return err;
@@ -1097,6 +1169,9 @@ static int container_rmdir(struct inode
dput(d);
number_of_containers--;
mutex_unlock(&callback_mutex);
+#ifdef CONFIG_CPUSETS
+ cpuset_destroy(cont);
+#endif
if (list_empty(&parent->children))
check_for_release(parent, &pathbuf);
mutex_unlock(&manage_mutex);
@@ -1283,6 +1358,24 @@ void container_unlock(void)
mutex_unlock(&callback_mutex);
}

+void container_manage_lock(void)
+{
+ mutex_lock(&manage_mutex);
+}
+
+/**
+ * container_manage_unlock - release lock on container changes
+ *
+ * Undo the lock taken in a previous container_manage_lock() call.
+ */
+
+void container_manage_unlock(void)
+{
+ mutex_unlock(&manage_mutex);
+}
+
+
+
/*
* proc_container_show()
* - Print tasks container path into seq_file.
Index: container-2.6.19-rc5/kernel/cpuset.c
===================================================================
--- container-2.6.19-rc5.orig/kernel/cpuset.c
+++ container-2.6.19-rc5/kernel/cpuset.c
@@ -54,8 +54,6 @@
#include <asm/atomic.h>
#include <linux/mutex.h>

-#define CPUSET_SUPER_MAGIC 0x27e0eb
-
/*
* Tracks how many cpusets are currently defined in system.
* When there is only one cpuset (the root cpuset) we can
@@ -77,20 +75,8 @@ struct cpuset {
cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */

- /*
- * Count is atomic so can incr (fork) or decr (exit) without a lock.
- */
- atomic_t count; /* count tasks using this cpuset */
-
- /*
- * We link our 'sibling' struct into our parents 'children'.
- * Our children link their 'sibling' into our 'children'.
- */
- struct list_head sibling; /* my parents children */
- struct list_head children; /* my children */
-
+ struct container *container; /* Task container */
struct cpuset *parent; /* my parent */
- struct dentry *dentry; /* cpuset fs entry */

/*
* Copy of global cpuset_mems_generation as of the most
@@ -106,8 +92,6 @@ typedef enum {
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEMORY_MIGRATE,
- CS_REMOVED,
- CS_NOTIFY_ON_RELEASE,
CS_SPREAD_PAGE,
CS_SPREAD_SLAB,
} cpuset_flagbits_t;
@@ -123,16 +107,6 @@ static inline int is_mem_exclusive(const
return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

-static inline int is_removed(const struct cpuset *cs)
-{
- return test_bit(CS_REMOVED, &cs->flags);
-}
-
-static inline int notify_on_release(const struct cpuset *cs)
-{
- return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
-}
-
static inline int is_memory_migrate(const struct cpuset *cs)
{
return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
@@ -173,388 +147,32 @@ static struct cpuset top_cpuset = {
.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
.cpus_allowed = CPU_MASK_ALL,
.mems_allowed = NODE_MASK_ALL,
- .count = ATOMIC_INIT(0),
- .sibling = LIST_HEAD_INIT(top_cpuset.sibling),
- .children = LIST_HEAD_INIT(top_cpuset.children),
-};
-
-static struct vfsmount *cpuset_mount;
-static struct super_block *cpuset_sb;
-
-/*
- * We have two global cpuset mutexes below. They can nest.
- * It is ok to first take manage_mutex, then nest callback_mutex. We also
- * require taking task_lock() when dereferencing a tasks cpuset pointer.
- * See "The task_lock() exception", at the end of this comment.
- *
- * A task must hold both mutexes to modify cpusets. If a task
- * holds manage_mutex, then it blocks others wanting that mutex,
- * ensuring that it is the only task able to also acquire callback_mutex
- * and be able to modify cpusets. It can perform various checks on
- * the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding manage_mutex. While it is
- * performing these checks, various callback routines can briefly
- * acquire callback_mutex to query cpusets. Once it is ready to make
- * the changes, it takes callback_mutex, blocking everyone else.
- *
- * Calls to the kernel memory allocator can not be made while holding
- * callback_mutex, as that would risk double tripping on callback_mutex
- * from one of the callbacks into the cpuset code from within
- * __alloc_pages().
- *
- * If a task is only holding callback_mutex, then it has read-only
- * access to cpusets.
- *
- * The task_struct fields mems_allowed and mems_generation may only
- * be accessed in the context of that task, so require no locks.
- *
- * Any task can increment and decrement the count field without lock.
- * So in general, code holding manage_mutex or callback_mutex can't rely
- * on the count field not changing. However, if the count goes to
- * zero, then only attach_task(), which holds both mutexes, can
- * increment it again. Because a count of zero means that no tasks
- * are currently attached, therefore there is no way a task attached
- * to that cpuset can fork (the other way to increment the count).
- * So code holding manage_mutex or callback_mutex can safely assume that
- * if the count is zero, it will stay zero. Similarly, if a task
- * holds manage_mutex or callback_mutex on a cpuset with zero count, it
- * knows that the cpuset won't be removed, as cpuset_rmdir() needs
- * both of those mutexes.
- *
- * The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds manage_mutex across the entire operation,
- * single threading all such cpuset modifications across the system.
- *
- * The cpuset_common_file_read() handlers only hold callback_mutex across
- * small pieces of code, such as when reading out possibly multi-word
- * cpumasks and nodemasks.
- *
- * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
- * (usually) take either mutex. These are the two most performance
- * critical pieces of code here. The exception occurs on cpuset_exit(),
- * when a task in a notify_on_release cpuset exits. Then manage_mutex
- * is taken, and if the cpuset count is zero, a usermode call made
- * to /sbin/cpuset_release_agent with the name of the cpuset (path
- * relative to the root of cpuset file system) as the argument.
- *
- * A cpuset can only be deleted if both its 'count' of using tasks
- * is zero, and its list of 'children' cpusets is empty. Since all
- * tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init), therefore, top_cpuset
- * always has either children cpusets and/or using tasks. So we don't
- * need a special hack to ensure that top_cpuset cannot be deleted.
- *
- * The above "Tale of Two Semaphores" would be complete, but for:
- *
- * The task_lock() exception
- *
- * The need for this exception arises from the action of attach_task(),
- * which overwrites one tasks cpuset pointer with another. It does
- * so using both mutexes, however there are several performance
- * critical places that need to reference task->cpuset without the
- * expense of grabbing a system global mutex. Therefore except as
- * noted below, when dereferencing or, as in attach_task(), modifying
- * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
- * (task->alloc_lock) already in the task_struct routinely used for
- * such matters.
- *
- * P.S. One more locking exception. RCU is used to guard the
- * update of a tasks cpuset pointer by attach_task() and the
- * access of task->cpuset->mems_generation via that pointer in
- * the routine cpuset_update_task_memory_state().
- */
-
-static DEFINE_MUTEX(manage_mutex);
-static DEFINE_MUTEX(callback_mutex);
-
-/*
- * A couple of forward declarations required, due to cyclic reference loop:
- * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file
- * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
- */
-
-static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode);
-static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry);
-
-static struct backing_dev_info cpuset_backing_dev_info = {
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
-};
-
-static struct inode *cpuset_new_inode(mode_t mode)
-{
- struct inode *inode = new_inode(cpuset_sb);
-
- if (inode) {
- inode->i_mode = mode;
- inode->i_uid = current->fsuid;
- inode->i_gid = current->fsgid;
- inode->i_blocks = 0;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
- }
- return inode;
-}
-
-static void cpuset_diput(struct dentry *dentry, struct inode *inode)
-{
- /* is dentry a directory ? if so, kfree() associated cpuset */
- if (S_ISDIR(inode->i_mode)) {
- struct cpuset *cs = dentry->d_fsdata;
- BUG_ON(!(is_removed(cs)));
- kfree(cs);
- }
- iput(inode);
-}
-
-static struct dentry_operations cpuset_dops = {
- .d_iput = cpuset_diput,
-};
-
-static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
-{
- struct dentry *d = lookup_one_len(name, parent, strlen(name));
- if (!IS_ERR(d))
- d->d_op = &cpuset_dops;
- return d;
-}
-
-static void remove_dir(struct dentry *d)
-{
- struct dentry *parent = dget(d->d_parent);
-
- d_delete(d);
- simple_rmdir(parent->d_inode, d);
- dput(parent);
-}
-
-/*
- * NOTE : the dentry must have been dget()'ed
- */
-static void cpuset_d_remove_dir(struct dentry *dentry)
-{
- struct list_head *node;
-
- spin_lock(&dcache_lock);
- node = dentry->d_subdirs.next;
- while (node != &dentry->d_subdirs) {
- struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
- list_del_init(node);
- if (d->d_inode) {
- d = dget_locked(d);
- spin_unlock(&dcache_lock);
- d_delete(d);
- simple_unlink(dentry->d_inode, d);
- dput(d);
- spin_lock(&dcache_lock);
- }
- node = dentry->d_subdirs.next;
- }
- list_del_init(&dentry->d_u.d_child);
- spin_unlock(&dcache_lock);
- remove_dir(dentry);
-}
-
-static struct super_operations cpuset_ops = {
- .statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
};

-static int cpuset_fill_super(struct super_block *sb, void *unused_data,
- int unused_silent)
-{
- struct inode *inode;
- struct dentry *root;
-
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
- sb->s_magic = CPUSET_SUPER_MAGIC;
- sb->s_op = &cpuset_ops;
- cpuset_sb = sb;
-
- inode = cpuset_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR);
- if (inode) {
- inode->i_op = &simple_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
- /* directories start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
- } else {
- return -ENOMEM;
- }
-
- root = d_alloc_root(inode);
- if (!root) {
- iput(inode);
- return -ENOMEM;
- }
- sb->s_root = root;
- return 0;
-}
-
+/* This is ugly, but preserves the userspace API for existing cpuset
+ * users. If someone tries to mount the "cpuset" filesystem, we
+ * silently switch it to mount "container" instead */
static int cpuset_get_sb(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data, struct vfsmount *mnt)
{
- return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt);
+ struct file_system_type *container_fs = get_fs_type("container");
+ int ret = -ENODEV;
+ container_set_release_agent_path("/sbin/cpuset_release_agent");
+ if (container_fs) {
+ ret = container_fs->get_sb(container_fs, flags,
+ unused_dev_name,
+ data, mnt);
+ put_filesystem(container_fs);
+ }
+ return ret;
}

static struct file_system_type cpuset_fs_type = {
.name = "cpuset",
.get_sb = cpuset_get_sb,
- .kill_sb = kill_litter_super,
};

-/* struct cftype:
- *
- * The files in the cpuset filesystem mostly have a very simple read/write
- * handling, some common function will take care of it. Nevertheless some cases
- * (read tasks) are special and therefore I define this structure for every
- * kind of file.
- *
- *
- * When reading/writing to a file:
- * - the cpuset to use in file->f_dentry->d_parent->d_fsdata
- * - the 'cftype' of the file is file->f_dentry->d_fsdata
- */
-
-struct cftype {
- char *name;
- int private;
- int (*open) (struct inode *inode, struct file *file);
- ssize_t (*read) (struct file *file, char __user *buf, size_t nbytes,
- loff_t *ppos);
- int (*write) (struct file *file, const char __user *buf, size_t nbytes,
- loff_t *ppos);
- int (*release) (struct inode *inode, struct file *file);
-};
-
-static inline struct cpuset *__d_cs(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
-/*
- * Call with manage_mutex held. Writes path of cpuset into buf.
- * Returns 0 on success, -errno on error.
- */
-
-static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
-{
- char *start;
-
- start = buf + buflen;
-
- *--start = '\0';
- for (;;) {
- int len = cs->dentry->d_name.len;
- if ((start -= len) < buf)
- return -ENAMETOOLONG;
- memcpy(start, cs->dentry->d_name.name, len);
- cs = cs->parent;
- if (!cs)
- break;
- if (!cs->parent)
- continue;
- if (--start < buf)
- return -ENAMETOOLONG;
- *start = '/';
- }
- memmove(buf, start, buf + buflen - start);
- return 0;
-}
-
-/*
- * Notify userspace when a cpuset is released, by running
- * /sbin/cpuset_release_agent with the name of the cpuset (path
- * relative to the root of cpuset file system) as the argument.
- *
- * Most likely, this user command will try to rmdir this cpuset.
- *
- * This races with the possibility that some other task will be
- * attached to this cpuset before it is removed, or that some other
- * user task will 'mkdir' a child cpuset of this cpuset. That's ok.
- * The presumed 'rmdir' will fail quietly if this cpuset is no longer
- * unused, and this cpuset will be reprieved from its death sentence,
- * to continue to serve a useful existence. Next time it's released,
- * we will get notified again, if it still has 'notify_on_release' set.
- *
- * The final arg to call_usermodehelper() is 0, which means don't
- * wait. The separate /sbin/cpuset_release_agent task is forked by
- * call_usermodehelper(), then control in this thread returns here,
- * without waiting for the release agent task. We don't bother to
- * wait because the caller of this routine has no use for the exit
- * status of the /sbin/cpuset_release_agent task, so no sense holding
- * our caller up for that.
- *
- * When we had only one cpuset mutex, we had to call this
- * without holding it, to avoid deadlock when call_usermodehelper()
- * allocated memory. With two locks, we could now call this while
- * holding manage_mutex, but we still don't, so as to minimize
- * the time manage_mutex is held.
- */
-
-static void cpuset_release_agent(const char *pathbuf)
-{
- char *argv[3], *envp[3];
- int i;
-
- if (!pathbuf)
- return;
-
- i = 0;
- argv[i++] = "/sbin/cpuset_release_agent";
- argv[i++] = (char *)pathbuf;
- argv[i] = NULL;
-
- i = 0;
- /* minimal command environment */
- envp[i++] = "HOME=/";
- envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[i] = NULL;
-
- call_usermodehelper(argv[0], argv, envp, 0);
- kfree(pathbuf);
-}
-
-/*
- * Either cs->count of using tasks transitioned to zero, or the
- * cs->children list of child cpusets just became empty. If this
- * cs is notify_on_release() and now both the user count is zero and
- * the list of children is empty, prepare cpuset path in a kmalloc'd
- * buffer, to be returned via ppathbuf, so that the caller can invoke
- * cpuset_release_agent() with it later on, once manage_mutex is dropped.
- * Call here with manage_mutex held.
- *
- * This check_for_release() routine is responsible for kmalloc'ing
- * pathbuf. The above cpuset_release_agent() is responsible for
- * kfree'ing pathbuf. The caller of these routines is responsible
- * for providing a pathbuf pointer, initialized to NULL, then
- * calling check_for_release() with manage_mutex held and the address
- * of the pathbuf pointer, then dropping manage_mutex, then calling
- * cpuset_release_agent() with pathbuf, as set by check_for_release().
- */
-
-static void check_for_release(struct cpuset *cs, char **ppathbuf)
-{
- if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
- list_empty(&cs->children)) {
- char *buf;
-
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!buf)
- return;
- if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
- kfree(buf);
- else
- *ppathbuf = buf;
- }
-}
-
/*
* Return in *pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
@@ -652,20 +270,20 @@ void cpuset_update_task_memory_state(voi
struct task_struct *tsk = current;
struct cpuset *cs;

- if (tsk->cpuset == &top_cpuset) {
+ if (tsk->container->cpuset == &top_cpuset) {
/* Don't need rcu for top_cpuset. It's never freed. */
my_cpusets_mem_gen = top_cpuset.mems_generation;
} else {
rcu_read_lock();
- cs = rcu_dereference(tsk->cpuset);
+ cs = rcu_dereference(tsk->container->cpuset);
my_cpusets_mem_gen = cs->mems_generation;
rcu_read_unlock();
}

if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
- mutex_lock(&callback_mutex);
+ container_lock();
task_lock(tsk);
- cs = tsk->cpuset; /* Maybe changed when task not locked */
+ cs = tsk->container->cpuset; /* Maybe changed when task not locked */
guarantee_online_mems(cs, &tsk->mems_allowed);
tsk->cpuset_mems_generation = cs->mems_generation;
if (is_spread_page(cs))
@@ -677,7 +295,7 @@ void cpuset_update_task_memory_state(voi
else
tsk->flags &= ~PF_SPREAD_SLAB;
task_unlock(tsk);
- mutex_unlock(&callback_mutex);
+ container_unlock();
mpol_rebind_task(tsk, &tsk->mems_allowed);
}
}
@@ -720,10 +338,12 @@ static int is_cpuset_subset(const struct

static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
+ struct container *cont;
struct cpuset *c, *par;

/* Each of our child cpusets must be a subset of us */
- list_for_each_entry(c, &cur->children, sibling) {
+ list_for_each_entry(cont, &cur->container->children, sibling) {
+ c = cont->cpuset;
if (!is_cpuset_subset(c, trial))
return -EBUSY;
}
@@ -737,7 +357,8 @@ static int validate_change(const struct
return -EACCES;

/* If either I or some sibling (!= me) is exclusive, we can't overlap */
- list_for_each_entry(c, &par->children, sibling) {
+ list_for_each_entry(cont, &par->container->children, sibling) {
+ c = cont->cpuset;
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -767,6 +388,7 @@ static int validate_change(const struct

static void update_cpu_domains(struct cpuset *cur)
{
+ struct container *cont;
struct cpuset *c, *par = cur->parent;
cpumask_t pspan, cspan;

@@ -778,7 +400,8 @@ static void update_cpu_domains(struct cp
* children
*/
pspan = par->cpus_allowed;
- list_for_each_entry(c, &par->children, sibling) {
+ list_for_each_entry(cont, &par->container->children, sibling) {
+ c = cont->cpuset;
if (is_cpu_exclusive(c))
cpus_andnot(pspan, pspan, c->cpus_allowed);
}
@@ -795,7 +418,8 @@ static void update_cpu_domains(struct cp
* Get all cpus from current cpuset's cpus_allowed not part
* of exclusive children
*/
- list_for_each_entry(c, &cur->children, sibling) {
+ list_for_each_entry(cont, &cur->container->children, sibling) {
+ c = cont->cpuset;
if (is_cpu_exclusive(c))
cpus_andnot(cspan, cspan, c->cpus_allowed);
}
@@ -830,9 +454,9 @@ static int update_cpumask(struct cpuset
if (retval < 0)
return retval;
cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
- mutex_lock(&callback_mutex);
+ container_lock();
cs->cpus_allowed = trialcs.cpus_allowed;
- mutex_unlock(&callback_mutex);
+ container_unlock();
if (is_cpu_exclusive(cs) && !cpus_unchanged)
update_cpu_domains(cs);
return 0;
@@ -876,15 +500,15 @@ static void cpuset_migrate_mm(struct mm_

cpuset_update_task_memory_state();

- mutex_lock(&callback_mutex);
+ container_lock();
tsk->mems_allowed = *to;
- mutex_unlock(&callback_mutex);
+ container_unlock();

do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

- mutex_lock(&callback_mutex);
- guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
- mutex_unlock(&callback_mutex);
+ container_lock();
+ guarantee_online_mems(tsk->container->cpuset, &tsk->mems_allowed);
+ container_unlock();
}

/*
@@ -911,12 +535,14 @@ static int update_nodemask(struct cpuset
int migrate;
int fudge;
int retval;
+ struct container *cont;

/* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
if (cs == &top_cpuset)
return -EACCES;

trialcs = *cs;
+ cont = cs->container;
retval = nodelist_parse(buf, trialcs.mems_allowed);
if (retval < 0)
goto done;
@@ -934,10 +560,10 @@ static int update_nodemask(struct cpuset
if (retval < 0)
goto done;

- mutex_lock(&callback_mutex);
+ container_lock();
cs->mems_allowed = trialcs.mems_allowed;
cs->mems_generation = cpuset_mems_generation++;
- mutex_unlock(&callback_mutex);
+ container_unlock();

set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */

@@ -953,13 +579,13 @@ static int update_nodemask(struct cpuset
* enough mmarray[] w/o using GFP_ATOMIC.
*/
while (1) {
- ntasks = atomic_read(&cs->count); /* guess */
+ ntasks = atomic_read(&cs->container->count); /* guess */
ntasks += fudge;
mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
if (!mmarray)
goto done;
write_lock_irq(&tasklist_lock); /* block fork */
- if (atomic_read(&cs->count) <= ntasks)
+ if (atomic_read(&cs->container->count) <= ntasks)
break; /* got enough */
write_unlock_irq(&tasklist_lock); /* try again */
kfree(mmarray);
@@ -976,7 +602,7 @@ static int update_nodemask(struct cpuset
"Cpuset mempolicy rebind incomplete.\n");
continue;
}
- if (p->cpuset != cs)
+ if (p->container != cont)
continue;
mm = get_task_mm(p);
if (!mm)
@@ -1059,15 +685,15 @@ static int update_flag(cpuset_flagbits_t
return err;
cpu_exclusive_changed =
(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
- mutex_lock(&callback_mutex);
+ container_lock();
if (turning_on)
set_bit(bit, &cs->flags);
else
clear_bit(bit, &cs->flags);
- mutex_unlock(&callback_mutex);
+ container_unlock();

if (cpu_exclusive_changed)
- update_cpu_domains(cs);
+ update_cpu_domains(cs);
return 0;
}

@@ -1169,85 +795,35 @@ static int fmeter_getrate(struct fmeter
return val;
}

-/*
- * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly
- * writing the path of the old cpuset in 'ppathbuf' if it needs to be
- * notified on release.
- *
- * Call holding manage_mutex. May take callback_mutex and task_lock of
- * the task 'pid' during call.
- */
-
-static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
+int cpuset_can_attach_task(struct container *cont, struct task_struct *tsk)
{
- pid_t pid;
- struct task_struct *tsk;
- struct cpuset *oldcs;
- cpumask_t cpus;
- nodemask_t from, to;
- struct mm_struct *mm;
- int retval;
+ struct cpuset *cs = cont->cpuset;

- if (sscanf(pidbuf, "%d", &pid) != 1)
- return -EIO;
if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
return -ENOSPC;

- if (pid) {
- read_lock(&tasklist_lock);
-
- tsk = find_task_by_pid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
- read_unlock(&tasklist_lock);
- return -ESRCH;
- }
-
- get_task_struct(tsk);
- read_unlock(&tasklist_lock);
-
- if ((current->euid) && (current->euid != tsk->uid)
- && (current->euid != tsk->suid)) {
- put_task_struct(tsk);
- return -EACCES;
- }
- } else {
- tsk = current;
- get_task_struct(tsk);
- }
-
- retval = security_task_setscheduler(tsk, 0, NULL);
- if (retval) {
- put_task_struct(tsk);
- return retval;
- }
-
- mutex_lock(&callback_mutex);
-
- task_lock(tsk);
- oldcs = tsk->cpuset;
- /*
- * After getting 'oldcs' cpuset ptr, be sure still not exiting.
- * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
- * then fail this attach_task(), to avoid breaking top_cpuset.count.
- */
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- mutex_unlock(&callback_mutex);
- put_task_struct(tsk);
- return -ESRCH;
- }
- atomic_inc(&cs->count);
- rcu_assign_pointer(tsk->cpuset, cs);
- task_unlock(tsk);
+ return security_task_setscheduler(tsk, 0, NULL);
+}

+void cpuset_attach_task(struct container *cont, struct task_struct *tsk)
+{
+ cpumask_t cpus;
+ struct cpuset *cs = cont->cpuset;
guarantee_online_cpus(cs, &cpus);
set_cpus_allowed(tsk, cpus);
+}
+
+void cpuset_post_attach_task(struct container *cont,
+ struct container *oldcont,
+ struct task_struct *tsk)
+{
+ nodemask_t from, to;
+ struct mm_struct *mm;
+ struct cpuset *cs = cont->cpuset;
+ struct cpuset *oldcs = oldcont->cpuset;

from = oldcs->mems_allowed;
to = cs->mems_allowed;
-
- mutex_unlock(&callback_mutex);
-
mm = get_task_mm(tsk);
if (mm) {
mpol_rebind_mm(mm, &to);
@@ -1256,43 +832,35 @@ static int attach_task(struct cpuset *cs
mmput(mm);
}

- put_task_struct(tsk);
- synchronize_rcu();
- if (atomic_dec_and_test(&oldcs->count))
- check_for_release(oldcs, ppathbuf);
- return 0;
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
- FILE_ROOT,
- FILE_DIR,
FILE_MEMORY_MIGRATE,
FILE_CPULIST,
FILE_MEMLIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
- FILE_NOTIFY_ON_RELEASE,
FILE_MEMORY_PRESSURE_ENABLED,
FILE_MEMORY_PRESSURE,
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
- FILE_TASKLIST,
} cpuset_filetype_t;

-static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf,
+static ssize_t cpuset_common_file_write(struct container *cont,
+ struct cftype *cft,
+ struct file *file,
+ const char __user *userbuf,
size_t nbytes, loff_t *unused_ppos)
{
- struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
- struct cftype *cft = __d_cft(file->f_dentry);
+ struct cpuset *cs = cont->cpuset;
cpuset_filetype_t type = cft->private;
char *buffer;
- char *pathbuf = NULL;
int retval = 0;

- /* Crude upper limit on largest legitimate cpulist user might write. */
- if (nbytes > 100 + 6 * NR_CPUS)
+ /* Crude upper limit on largest legitimate list user might write. */
+ if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
return -E2BIG;

/* +1 for nul-terminator */
@@ -1305,9 +873,9 @@ static ssize_t cpuset_common_file_write(
}
buffer[nbytes] = 0; /* nul-terminate */

- mutex_lock(&manage_mutex);
+ container_manage_lock();

- if (is_removed(cs)) {
+ if (container_is_removed(cont)) {
retval = -ENODEV;
goto out2;
}
@@ -1325,9 +893,6 @@ static ssize_t cpuset_common_file_write(
case FILE_MEM_EXCLUSIVE:
retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
break;
- case FILE_NOTIFY_ON_RELEASE:
- retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
- break;
case FILE_MEMORY_MIGRATE:
retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
break;
@@ -1345,9 +910,6 @@ static ssize_t cpuset_common_file_write(
retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
cs->mems_generation = cpuset_mems_generation++;
break;
- case FILE_TASKLIST:
- retval = attach_task(cs, buffer, &pathbuf);
- break;
default:
retval = -EINVAL;
goto out2;
@@ -1356,30 +918,12 @@ static ssize_t cpuset_common_file_write(
if (retval == 0)
retval = nbytes;
out2:
- mutex_unlock(&manage_mutex);
- cpuset_release_agent(pathbuf);
+ container_manage_unlock();
out1:
kfree(buffer);
return retval;
}

-static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
- size_t nbytes, loff_t *ppos)
-{
- ssize_t retval = 0;
- struct cftype *cft = __d_cft(file->f_dentry);
- if (!cft)
- return -ENODEV;
-
- /* special function ? */
- if (cft->write)
- retval = cft->write(file, buf, nbytes, ppos);
- else
- retval = cpuset_common_file_write(file, buf, nbytes, ppos);
-
- return retval;
-}
-
/*
* These ascii lists should be read in a single call, by using a user
* buffer large enough to hold the entire map. If read in smaller
@@ -1396,9 +940,9 @@ static int cpuset_sprintf_cpulist(char *
{
cpumask_t mask;

- mutex_lock(&callback_mutex);
+ container_lock();
mask = cs->cpus_allowed;
- mutex_unlock(&callback_mutex);
+ container_unlock();

return cpulist_scnprintf(page, PAGE_SIZE, mask);
}
@@ -1407,18 +951,20 @@ static int cpuset_sprintf_memlist(char *
{
nodemask_t mask;

- mutex_lock(&callback_mutex);
+ container_lock();
mask = cs->mems_allowed;
- mutex_unlock(&callback_mutex);
+ container_unlock();

return nodelist_scnprintf(page, PAGE_SIZE, mask);
}

-static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
- size_t nbytes, loff_t *ppos)
+static ssize_t cpuset_common_file_read(struct container *cont,
+ struct cftype *cft,
+ struct file *file,
+ char __user *buf,
+ size_t nbytes, loff_t *ppos)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+ struct cpuset *cs = cont->cpuset;
cpuset_filetype_t type = cft->private;
char *page;
ssize_t retval = 0;
@@ -1442,9 +988,6 @@ static ssize_t cpuset_common_file_read(s
case FILE_MEM_EXCLUSIVE:
*s++ = is_mem_exclusive(cs) ? '1' : '0';
break;
- case FILE_NOTIFY_ON_RELEASE:
- *s++ = notify_on_release(cs) ? '1' : '0';
- break;
case FILE_MEMORY_MIGRATE:
*s++ = is_memory_migrate(cs) ? '1' : '0';
break;
@@ -1472,391 +1015,97 @@ out:
return retval;
}

-static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbytes,
- loff_t *ppos)
-{
- ssize_t retval = 0;
- struct cftype *cft = __d_cft(file->f_dentry);
- if (!cft)
- return -ENODEV;
-
- /* special function ? */
- if (cft->read)
- retval = cft->read(file, buf, nbytes, ppos);
- else
- retval = cpuset_common_file_read(file, buf, nbytes, ppos);
-
- return retval;
-}
-
-static int cpuset_file_open(struct inode *inode, struct file *file)
-{
- int err;
- struct cftype *cft;
-
- err = generic_file_open(inode, file);
- if (err)
- return err;
-
- cft = __d_cft(file->f_dentry);
- if (!cft)
- return -ENODEV;
- if (cft->open)
- err = cft->open(inode, file);
- else
- err = 0;
-
- return err;
-}
-
-static int cpuset_file_release(struct inode *inode, struct file *file)
-{
- struct cftype *cft = __d_cft(file->f_dentry);
- if (cft->release)
- return cft->release(inode, file);
- return 0;
-}
-
-/*
- * cpuset_rename - Only allow simple rename of directories in place.
- */
-static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
-{
- if (!S_ISDIR(old_dentry->d_inode->i_mode))
- return -ENOTDIR;
- if (new_dentry->d_inode)
- return -EEXIST;
- if (old_dir != new_dir)
- return -EIO;
- return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
-}
-
-static struct file_operations cpuset_file_operations = {
- .read = cpuset_file_read,
- .write = cpuset_file_write,
- .llseek = generic_file_llseek,
- .open = cpuset_file_open,
- .release = cpuset_file_release,
-};
-
-static struct inode_operations cpuset_dir_inode_operations = {
- .lookup = simple_lookup,
- .mkdir = cpuset_mkdir,
- .rmdir = cpuset_rmdir,
- .rename = cpuset_rename,
-};
-
-static int cpuset_create_file(struct dentry *dentry, int mode)
-{
- struct inode *inode;
-
- if (!dentry)
- return -ENOENT;
- if (dentry->d_inode)
- return -EEXIST;
-
- inode = cpuset_new_inode(mode);
- if (!inode)
- return -ENOMEM;
-
- if (S_ISDIR(mode)) {
- inode->i_op = &cpuset_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
-
- /* start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
- } else if (S_ISREG(mode)) {
- inode->i_size = 0;
- inode->i_fop = &cpuset_file_operations;
- }
-
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
- return 0;
-}
-
-/*
- * cpuset_create_dir - create a directory for an object.
- * cs: the cpuset we create the directory for.
- * It must have a valid ->parent field
- * And we are going to fill its ->dentry field.
- * name: The name to give to the cpuset directory. Will be copied.
- * mode: mode to set on new directory.
- */
-
-static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
-{
- struct dentry *dentry = NULL;
- struct dentry *parent;
- int error = 0;
-
- parent = cs->parent->dentry;
- dentry = cpuset_get_dentry(parent, name);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- error = cpuset_create_file(dentry, S_IFDIR | mode);
- if (!error) {
- dentry->d_fsdata = cs;
- inc_nlink(parent->d_inode);
- cs->dentry = dentry;
- }
- dput(dentry);
-
- return error;
-}
-
-static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
-{
- struct dentry *dentry;
- int error;
-
- mutex_lock(&dir->d_inode->i_mutex);
- dentry = cpuset_get_dentry(dir, cft->name);
- if (!IS_ERR(dentry)) {
- error = cpuset_create_file(dentry, 0644 | S_IFREG);
- if (!error)
- dentry->d_fsdata = (void *)cft;
- dput(dentry);
- } else
- error = PTR_ERR(dentry);
- mutex_unlock(&dir->d_inode->i_mutex);
- return error;
-}
-
-/*
- * Stuff for reading the 'tasks' file.
- *
- * Reading this file can return large amounts of data if a cpuset has
- * *lots* of attached tasks. So it may need several calls to read(),
- * but we cannot guarantee that the information we produce is correct
- * unless we produce it entirely atomically.
- *
- * Upon tasks file open(), a struct ctr_struct is allocated, that
- * will have a pointer to an array (also allocated here). The struct
- * ctr_struct * is stored in file->private_data. Its resources will
- * be freed by release() when the file is closed. The array is used
- * to sprintf the PIDs and then used by read().
- */
-
-/* cpusets_tasks_read array */
-
-struct ctr_struct {
- char *buf;
- int bufsz;
-};
-
-/*
- * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
- * Return actual number of pids loaded. No need to task_lock(p)
- * when reading out p->cpuset, as we don't really care if it changes
- * on the next cycle, and we are not going to try to dereference it.
- */
-static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
-{
- int n = 0;
- struct task_struct *g, *p;
-
- read_lock(&tasklist_lock);
-
- do_each_thread(g, p) {
- if (p->cpuset == cs) {
- pidarray[n++] = p->pid;
- if (unlikely(n == npids))
- goto array_full;
- }
- } while_each_thread(g, p);
-
-array_full:
- read_unlock(&tasklist_lock);
- return n;
-}
-
-static int cmppid(const void *a, const void *b)
-{
- return *(pid_t *)a - *(pid_t *)b;
-}
-
-/*
- * Convert array 'a' of 'npids' pid_t's to a string of newline separated
- * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
- * count 'cnt' of how many chars would be written if buf were large enough.
- */
-static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
-{
- int cnt = 0;
- int i;
-
- for (i = 0; i < npids; i++)
- cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
- return cnt;
-}
-
-/*
- * Handle an open on 'tasks' file. Prepare a buffer listing the
- * process id's of tasks currently attached to the cpuset being opened.
- *
- * Does not require any specific cpuset mutexes, and does not take any.
- */
-static int cpuset_tasks_open(struct inode *unused, struct file *file)
-{
- struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
- struct ctr_struct *ctr;
- pid_t *pidarray;
- int npids;
- char c;
-
- if (!(file->f_mode & FMODE_READ))
- return 0;
-
- ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
- if (!ctr)
- goto err0;
-
- /*
- * If cpuset gets more users after we read count, we won't have
- * enough space - tough. This race is indistinguishable to the
- * caller from the case that the additional cpuset users didn't
- * show up until sometime later on.
- */
- npids = atomic_read(&cs->count);
- pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
- if (!pidarray)
- goto err1;
-
- npids = pid_array_load(pidarray, npids, cs);
- sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
- /* Call pid_array_to_buf() twice, first just to get bufsz */
- ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
- ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
- if (!ctr->buf)
- goto err2;
- ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
-
- kfree(pidarray);
- file->private_data = ctr;
- return 0;
-
-err2:
- kfree(pidarray);
-err1:
- kfree(ctr);
-err0:
- return -ENOMEM;
-}
-
-static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
- size_t nbytes, loff_t *ppos)
-{
- struct ctr_struct *ctr = file->private_data;
-
- if (*ppos + nbytes > ctr->bufsz)
- nbytes = ctr->bufsz - *ppos;
- if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
- return -EFAULT;
- *ppos += nbytes;
- return nbytes;
-}
-
-static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
-{
- struct ctr_struct *ctr;
-
- if (file->f_mode & FMODE_READ) {
- ctr = file->private_data;
- kfree(ctr->buf);
- kfree(ctr);
- }
- return 0;
-}

/*
* for the common functions, 'private' gives the type of file
*/

-static struct cftype cft_tasks = {
- .name = "tasks",
- .open = cpuset_tasks_open,
- .read = cpuset_tasks_read,
- .release = cpuset_tasks_release,
- .private = FILE_TASKLIST,
-};
-
static struct cftype cft_cpus = {
.name = "cpus",
+ .read = cpuset_common_file_read,
+ .write = cpuset_common_file_write,
.private = FILE_CPULIST,
};

static struct cftype cft_mems = {
.name = "mems",
+ .read = cpuset_common_file_read,
+ .write = cpuset_common_file_write,
.private = FILE_MEMLIST,
};

static struct cftype cft_cpu_exclusive = {
.name = "cpu_exclusive",
+ .read = cpuset_common_file_read,
+ .write = cpuset_common_file_write,
.private = FILE_CPU_EXCLUSIVE,
};

static struct cftype cft_mem_exclusive = {
.name = "mem_exclusive",
+ .read = cpuset_common_file_read,
+ .write = cpuset_common_file_write,
.private = FILE_MEM_EXCLUSIVE,
};

-static struct cftype cft_notify_on_release = {
- .name = "notify_on_release",
- .private = FILE_NOTIFY_ON_RELEASE,
-};
-
static struct cftype cft_memory_migrate = {
.name = "memory_migrate",
+ .read = cpuset_common_file_read,
+ .write = cpuset_common_file_write,
.private = FILE_MEMORY_MIGRATE,
};

static struct cftype cft_memory_pressure_enabled = {
.name = "memory_pressure_enabled",
+ .read = cpuset_common_file_read,
+ .write = cpuset_common_file_write,
.private = FILE_MEMORY_PRESSURE_ENABLED,
};

static struct cftype cft_memory_pressure = {
.name = "memory_pressure",
+ .read = cpuset_common_file_read,
+ .write = cpuset_common_file_write,
.private = FILE_MEMORY_PRESSURE,
};

static struct cftype cft_spread_page = {
.name = "memory_spread_page",
+ .read = cpuset_common_file_read,
+ .write = cpuset_common_file_write,
.private = FILE_SPREAD_PAGE,
};

static struct cftype cft_spread_slab = {
.name = "memory_spread_slab",
+ .read = cpuset_common_file_read,
+ .write = cpuset_common_file_write,
.private = FILE_SPREAD_SLAB,
};

-static int cpuset_populate_dir(struct dentry *cs_dentry)
+int cpuset_populate_dir(struct container *cont)
{
int err;

- if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0)
- return err;
- if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0)
+ if ((err = container_add_file(cont, &cft_cpus)) < 0)
return err;
- if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
+ if ((err = container_add_file(cont, &cft_mems)) < 0)
return err;
- if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
+ if ((err = container_add_file(cont, &cft_cpu_exclusive)) < 0)
return err;
- if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
+ if ((err = container_add_file(cont, &cft_mem_exclusive)) < 0)
return err;
- if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
+ if ((err = container_add_file(cont, &cft_memory_migrate)) < 0)
return err;
- if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
+ if ((err = container_add_file(cont, &cft_memory_pressure)) < 0)
return err;
- if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0)
+ if ((err = container_add_file(cont, &cft_spread_page)) < 0)
return err;
- if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
- return err;
- if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
+ if ((err = container_add_file(cont, &cft_spread_slab)) < 0)
return err;
+ /* memory_pressure_enabled is in root cpuset only */
+ if (err == 0 && !cont->parent)
+ err = container_add_file(cont, &cft_memory_pressure_enabled);
return 0;
}

@@ -1869,66 +1118,31 @@ static int cpuset_populate_dir(struct de
* Must be called with the mutex on the parent inode held
*/

-static long cpuset_create(struct cpuset *parent, const char *name, int mode)
+int cpuset_create(struct container *cont)
{
struct cpuset *cs;
- int err;
+ struct cpuset *parent = cont->parent->cpuset;

cs = kmalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return -ENOMEM;

- mutex_lock(&manage_mutex);
cpuset_update_task_memory_state();
cs->flags = 0;
- if (notify_on_release(parent))
- set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
if (is_spread_page(parent))
set_bit(CS_SPREAD_PAGE, &cs->flags);
if (is_spread_slab(parent))
set_bit(CS_SPREAD_SLAB, &cs->flags);
cs->cpus_allowed = CPU_MASK_NONE;
cs->mems_allowed = NODE_MASK_NONE;
- atomic_set(&cs->count, 0);
- INIT_LIST_HEAD(&cs->sibling);
- INIT_LIST_HEAD(&cs->children);
cs->mems_generation = cpuset_mems_generation++;
fmeter_init(&cs->fmeter);

cs->parent = parent;
-
- mutex_lock(&callback_mutex);
- list_add(&cs->sibling, &cs->parent->children);
+ cont->cpuset = cs;
+ cs->container = cont;
number_of_cpusets++;
- mutex_unlock(&callback_mutex);
-
- err = cpuset_create_dir(cs, name, mode);
- if (err < 0)
- goto err;
-
- /*
- * Release manage_mutex before cpuset_populate_dir() because it
- * will down() this new directory's i_mutex and if we race with
- * another mkdir, we might deadlock.
- */
- mutex_unlock(&manage_mutex);
-
- err = cpuset_populate_dir(cs->dentry);
- /* If err < 0, we have a half-filled directory - oh well ;) */
return 0;
-err:
- list_del(&cs->sibling);
- mutex_unlock(&manage_mutex);
- kfree(cs);
- return err;
-}
-
-static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
- struct cpuset *c_parent = dentry->d_parent->d_fsdata;
-
- /* the vfs holds inode->i_mutex already */
- return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
}

/*
@@ -1942,49 +1156,16 @@ static int cpuset_mkdir(struct inode *di
* nesting would risk an ABBA deadlock.
*/

-static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
+void cpuset_destroy(struct container *cont)
{
- struct cpuset *cs = dentry->d_fsdata;
- struct dentry *d;
- struct cpuset *parent;
- char *pathbuf = NULL;
-
- /* the vfs holds both inode->i_mutex already */
+ struct cpuset *cs = cont->cpuset;

- mutex_lock(&manage_mutex);
cpuset_update_task_memory_state();
- if (atomic_read(&cs->count) > 0) {
- mutex_unlock(&manage_mutex);
- return -EBUSY;
- }
- if (!list_empty(&cs->children)) {
- mutex_unlock(&manage_mutex);
- return -EBUSY;
- }
if (is_cpu_exclusive(cs)) {
int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
- if (retval < 0) {
- mutex_unlock(&manage_mutex);
- return retval;
- }
+ BUG_ON(retval);
}
- parent = cs->parent;
- mutex_lock(&callback_mutex);
- set_bit(CS_REMOVED, &cs->flags);
- list_del(&cs->sibling); /* delete my sibling from parent->children */
- spin_lock(&cs->dentry->d_lock);
- d = dget(cs->dentry);
- cs->dentry = NULL;
- spin_unlock(&d->d_lock);
- cpuset_d_remove_dir(d);
- dput(d);
number_of_cpusets--;
- mutex_unlock(&callback_mutex);
- if (list_empty(&parent->children))
- check_for_release(parent, &pathbuf);
- mutex_unlock(&manage_mutex);
- cpuset_release_agent(pathbuf);
- return 0;
}

/*
@@ -1995,10 +1176,10 @@ static int cpuset_rmdir(struct inode *un

int __init cpuset_init_early(void)
{
- struct task_struct *tsk = current;
-
- tsk->cpuset = &top_cpuset;
- tsk->cpuset->mems_generation = cpuset_mems_generation++;
+ struct container *cont = current->container;
+ cont->cpuset = &top_cpuset;
+ top_cpuset.container = cont;
+ cont->cpuset->mems_generation = cpuset_mems_generation++;
return 0;
}

@@ -2010,39 +1191,19 @@ int __init cpuset_init_early(void)

int __init cpuset_init(void)
{
- struct dentry *root;
- int err;
-
+ int err = 0;
top_cpuset.cpus_allowed = CPU_MASK_ALL;
top_cpuset.mems_allowed = NODE_MASK_ALL;

fmeter_init(&top_cpuset.fmeter);
top_cpuset.mems_generation = cpuset_mems_generation++;

- init_task.cpuset = &top_cpuset;
-
err = register_filesystem(&cpuset_fs_type);
if (err < 0)
- goto out;
- cpuset_mount = kern_mount(&cpuset_fs_type);
- if (IS_ERR(cpuset_mount)) {
- printk(KERN_ERR "cpuset: could not mount!\n");
- err = PTR_ERR(cpuset_mount);
- cpuset_mount = NULL;
- goto out;
- }
- root = cpuset_mount->mnt_sb->s_root;
- root->d_fsdata = &top_cpuset;
- inc_nlink(root->d_inode);
- top_cpuset.dentry = root;
- root->d_inode->i_op = &cpuset_dir_inode_operations;
+ return err;
+
number_of_cpusets = 1;
- err = cpuset_populate_dir(root);
- /* memory_pressure_enabled is in root cpuset only */
- if (err == 0)
- err = cpuset_add_file(root, &cft_memory_pressure_enabled);
-out:
- return err;
+ return 0;
}

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
@@ -2069,10 +1230,12 @@ out:

static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
{
+ struct container *cont;
struct cpuset *c;

/* Each of our child cpusets mems must be online */
- list_for_each_entry(c, &cur->children, sibling) {
+ list_for_each_entry(c, &cur->container->children, sibling) {
+ c = container_cs(cont);
guarantee_online_cpus_mems_in_subtree(c);
if (!cpus_empty(c->cpus_allowed))
guarantee_online_cpus(c, &c->cpus_allowed);
@@ -2099,15 +1262,15 @@ static void guarantee_online_cpus_mems_i

static void common_cpu_mem_hotplug_unplug(void)
{
- mutex_lock(&manage_mutex);
- mutex_lock(&callback_mutex);
+ container_manage_lock();
+ container_lock();

guarantee_online_cpus_mems_in_subtree(&top_cpuset);
top_cpuset.cpus_allowed = cpu_online_map;
top_cpuset.mems_allowed = node_online_map;

- mutex_unlock(&callback_mutex);
- mutex_unlock(&manage_mutex);
+ container_unlock();
+ container_manage_unlock();
}
#endif

@@ -2158,111 +1321,6 @@ void __init cpuset_init_smp(void)
}

/**
- * cpuset_fork - attach newly forked task to its parents cpuset.
- * @tsk: pointer to task_struct of forking parent process.
- *
- * Description: A task inherits its parent's cpuset at fork().
- *
- * A pointer to the shared cpuset was automatically copied in fork.c
- * by dup_task_struct(). However, we ignore that copy, since it was
- * not made under the protection of task_lock(), so might no longer be
- * a valid cpuset pointer. attach_task() might have already changed
- * current->cpuset, allowing the previously referenced cpuset to
- * be removed and freed. Instead, we task_lock(current) and copy
- * its present value of current->cpuset for our freshly forked child.
- *
- * At the point that cpuset_fork() is called, 'current' is the parent
- * task, and the passed argument 'child' points to the child task.
- **/
-
-void cpuset_fork(struct task_struct *child)
-{
- task_lock(current);
- child->cpuset = current->cpuset;
- atomic_inc(&child->cpuset->count);
- task_unlock(current);
-}
-
-/**
- * cpuset_exit - detach cpuset from exiting task
- * @tsk: pointer to task_struct of exiting process
- *
- * Description: Detach cpuset from @tsk and release it.
- *
- * Note that cpusets marked notify_on_release force every task in
- * them to take the global manage_mutex mutex when exiting.
- * This could impact scaling on very large systems. Be reluctant to
- * use notify_on_release cpusets where very high task exit scaling
- * is required on large systems.
- *
- * Don't even think about derefencing 'cs' after the cpuset use count
- * goes to zero, except inside a critical section guarded by manage_mutex
- * or callback_mutex. Otherwise a zero cpuset use count is a license to
- * any other task to nuke the cpuset immediately, via cpuset_rmdir().
- *
- * This routine has to take manage_mutex, not callback_mutex, because
- * it is holding that mutex while calling check_for_release(),
- * which calls kmalloc(), so can't be called holding callback_mutex().
- *
- * We don't need to task_lock() this reference to tsk->cpuset,
- * because tsk is already marked PF_EXITING, so attach_task() won't
- * mess with it, or task is a failed fork, never visible to attach_task.
- *
- * the_top_cpuset_hack:
- *
- * Set the exiting tasks cpuset to the root cpuset (top_cpuset).
- *
- * Don't leave a task unable to allocate memory, as that is an
- * accident waiting to happen should someone add a callout in
- * do_exit() after the cpuset_exit() call that might allocate.
- * If a task tries to allocate memory with an invalid cpuset,
- * it will oops in cpuset_update_task_memory_state().
- *
- * We call cpuset_exit() while the task is still competent to
- * handle notify_on_release(), then leave the task attached to
- * the root cpuset (top_cpuset) for the remainder of its exit.
- *
- * To do this properly, we would increment the reference count on
- * top_cpuset, and near the very end of the kernel/exit.c do_exit()
- * code we would add a second cpuset function call, to drop that
- * reference. This would just create an unnecessary hot spot on
- * the top_cpuset reference count, to no avail.
- *
- * Normally, holding a reference to a cpuset without bumping its
- * count is unsafe. The cpuset could go away, or someone could
- * attach us to a different cpuset, decrementing the count on
- * the first cpuset that we never incremented. But in this case,
- * top_cpuset isn't going away, and either task has PF_EXITING set,
- * which wards off any attach_task() attempts, or task is a failed
- * fork, never visible to attach_task.
- *
- * Another way to do this would be to set the cpuset pointer
- * to NULL here, and check in cpuset_update_task_memory_state()
- * for a NULL pointer. This hack avoids that NULL check, for no
- * cost (other than this way too long comment ;).
- **/
-
-void cpuset_exit(struct task_struct *tsk)
-{
- struct cpuset *cs;
-
- cs = tsk->cpuset;
- tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
-
- if (notify_on_release(cs)) {
- char *pathbuf = NULL;
-
- mutex_lock(&manage_mutex);
- if (atomic_dec_and_test(&cs->count))
- check_for_release(cs, &pathbuf);
- mutex_unlock(&manage_mutex);
- cpuset_release_agent(pathbuf);
- } else {
- atomic_dec(&cs->count);
- }
-}
-
-/**
* cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
*
@@ -2276,11 +1334,11 @@ cpumask_t cpuset_cpus_allowed(struct tas
{
cpumask_t mask;

- mutex_lock(&callback_mutex);
+ container_lock();
task_lock(tsk);
- guarantee_online_cpus(tsk->cpuset, &mask);
+ guarantee_online_cpus(tsk->container->cpuset, &mask);
task_unlock(tsk);
- mutex_unlock(&callback_mutex);
+ container_unlock();

return mask;
}
@@ -2304,11 +1362,11 @@ nodemask_t cpuset_mems_allowed(struct ta
{
nodemask_t mask;

- mutex_lock(&callback_mutex);
+ container_lock();
task_lock(tsk);
- guarantee_online_mems(tsk->cpuset, &mask);
+ guarantee_online_mems(tsk->container->cpuset, &mask);
task_unlock(tsk);
- mutex_unlock(&callback_mutex);
+ container_unlock();

return mask;
}
@@ -2408,45 +1466,18 @@ int __cpuset_zone_allowed(struct zone *z
return 1;

/* Not hardwall and node outside mems_allowed: scan up cpusets */
- mutex_lock(&callback_mutex);
+ container_lock();

task_lock(current);
- cs = nearest_exclusive_ancestor(current->cpuset);
+ cs = nearest_exclusive_ancestor(current->container->cpuset);
task_unlock(current);

allowed = node_isset(node, cs->mems_allowed);
- mutex_unlock(&callback_mutex);
+ container_unlock();
return allowed;
}

/**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset. Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list. The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
- mutex_lock(&callback_mutex);
-}
-
-/**
- * cpuset_unlock - release lock on cpuset changes
- *
- * Undo the lock taken in a previous cpuset_lock() call.
- */
-
-void cpuset_unlock(void)
-{
- mutex_unlock(&callback_mutex);
-}
-
-/**
* cpuset_mem_spread_node() - On which node to begin search for a page
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
@@ -2506,7 +1537,7 @@ int cpuset_excl_nodes_overlap(const stru
task_unlock(current);
goto done;
}
- cs1 = nearest_exclusive_ancestor(current->cpuset);
+ cs1 = nearest_exclusive_ancestor(current->container->cpuset);
task_unlock(current);

task_lock((struct task_struct *)p);
@@ -2514,7 +1545,7 @@ int cpuset_excl_nodes_overlap(const stru
task_unlock((struct task_struct *)p);
goto done;
}
- cs2 = nearest_exclusive_ancestor(p->cpuset);
+ cs2 = nearest_exclusive_ancestor(p->container->cpuset);
task_unlock((struct task_struct *)p);

overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
@@ -2553,11 +1584,12 @@ void __cpuset_memory_pressure_bump(void)
struct cpuset *cs;

task_lock(current);
- cs = current->cpuset;
+ cs = current->container->cpuset;
fmeter_markevent(&cs->fmeter);
task_unlock(current);
}

+#ifdef CONFIG_PROC_PID_CPUSET
/*
* proc_cpuset_show()
* - Print tasks cpuset path into seq_file.
@@ -2588,15 +1620,15 @@ static int proc_cpuset_show(struct seq_f
goto out_free;

retval = -EINVAL;
- mutex_lock(&manage_mutex);
+ container_manage_lock();

- retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
+ retval = container_path(tsk->container, buf, PAGE_SIZE);
if (retval < 0)
goto out_unlock;
seq_puts(m, buf);
seq_putc(m, '\n');
out_unlock:
- mutex_unlock(&manage_mutex);
+ container_manage_unlock();
put_task_struct(tsk);
out_free:
kfree(buf);
@@ -2616,6 +1648,7 @@ struct file_operations proc_cpuset_opera
.llseek = seq_lseek,
.release = single_release,
};
+#endif /* CONFIG_PROC_PID_CPUSET */

/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
char *cpuset_task_status_allowed(struct task_struct *task, char *buffer)
Index: container-2.6.19-rc5/init/Kconfig
===================================================================
--- container-2.6.19-rc5.orig/init/Kconfig
+++ container-2.6.19-rc5/init/Kconfig
@@ -239,17 +239,24 @@ config IKCONFIG_PROC
through /proc/config.gz.

config CONTAINERS
- bool "Container support"
- help
- This option will let you create and manage process containers,
- which can be used to aggregate multiple processes, e.g. for
- the purposes of resource tracking.
+ bool

- Say N if unsure
+config MAX_CONTAINER_SUBSYS
+ int "Number of container subsystems to support"
+ depends on CONTAINERS
+ range 1 255
+ default 8
+
+config MAX_CONTAINER_HIERARCHIES
+ int "Number of container hierarchies to support"
+ depends on CONTAINERS
+ range 2 255
+ default 4

config CPUSETS
bool "Cpuset support"
depends on SMP
+ select CONTAINERS
help
This option will let you create and manage CPUSETs which
allow dynamically partitioning a system into sets of CPUs and
@@ -258,6 +265,10 @@ config CPUSETS

Say N if unsure.

+config PROC_PID_CPUSET
+ bool "Include legacy /proc/<pid>/cpuset file"
+ depends on CPUSETS
+
config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help
Index: container-2.6.19-rc5/mm/oom_kill.c
===================================================================
--- container-2.6.19-rc5.orig/mm/oom_kill.c
+++ container-2.6.19-rc5/mm/oom_kill.c
@@ -395,7 +395,7 @@ void out_of_memory(struct zonelist *zone
show_mem();
}

- cpuset_lock();
+ container_lock();
read_lock(&tasklist_lock);

/*
@@ -429,7 +429,7 @@ retry:
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
read_unlock(&tasklist_lock);
- cpuset_unlock();
+ container_unlock();
panic("Out of memory and no killable processes...\n");
}

@@ -441,7 +441,7 @@ retry:

out:
read_unlock(&tasklist_lock);
- cpuset_unlock();
+ container_unlock();

/*
* Give "p" a good chance of killing itself before we
Index: container-2.6.19-rc5/include/linux/sched.h
===================================================================
--- container-2.6.19-rc5.orig/include/linux/sched.h
+++ container-2.6.19-rc5/include/linux/sched.h
@@ -720,7 +720,6 @@ extern unsigned int max_cache_size;

struct io_context; /* See blkdev.h */
struct container;
-struct cpuset;
#define NGROUPS_SMALL 32
#define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t)))
struct group_info {
@@ -1001,7 +1000,6 @@ struct task_struct {
short il_next;
#endif
#ifdef CONFIG_CPUSETS
- struct cpuset *cpuset;
nodemask_t mems_allowed;
int cpuset_mems_generation;
int cpuset_mem_spread_rotor;
@@ -1432,7 +1430,7 @@ static inline int thread_group_empty(str
/*
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
* subscriptions and synchronises with wait4(). Also used in procfs. Also
- * pins the final release of task.io_context. Also protects ->cpuset.
+ * pins the final release of task.io_context. Also protects ->container.
*
* Nests both inside and outside of read_lock(&tasklist_lock).
* It must not be nested with write_lock_irq(&tasklist_lock),
Index: container-2.6.19-rc5/Documentation/cpusets.txt
===================================================================
--- container-2.6.19-rc5.orig/Documentation/cpusets.txt
+++ container-2.6.19-rc5/Documentation/cpusets.txt
@@ -7,6 +7,7 @@ Written by [email protected]
Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
Modified by Paul Jackson <[email protected]>
Modified by Christoph Lameter <[email protected]>
+Modified by Paul Menage <[email protected]>

CONTENTS:
=========
@@ -16,10 +17,9 @@ CONTENTS:
1.2 Why are cpusets needed ?
1.3 How are cpusets implemented ?
1.4 What are exclusive cpusets ?
- 1.5 What does notify_on_release do ?
- 1.6 What is memory_pressure ?
- 1.7 What is memory spread ?
- 1.8 How do I use cpusets ?
+ 1.5 What is memory_pressure ?
+ 1.6 What is memory spread ?
+ 1.7 How do I use cpusets ?
2. Usage Examples and Syntax
2.1 Basic Usage
2.2 Adding/removing cpus
@@ -43,18 +43,19 @@ hierarchy visible in a virtual file syst
hooks, beyond what is already present, required to manage dynamic
job placement on large systems.

-Each task has a pointer to a cpuset. Multiple tasks may reference
-the same cpuset. Requests by a task, using the sched_setaffinity(2)
-system call to include CPUs in its CPU affinity mask, and using the
-mbind(2) and set_mempolicy(2) system calls to include Memory Nodes
-in its memory policy, are both filtered through that tasks cpuset,
-filtering out any CPUs or Memory Nodes not in that cpuset. The
-scheduler will not schedule a task on a CPU that is not allowed in
-its cpus_allowed vector, and the kernel page allocator will not
-allocate a page on a node that is not allowed in the requesting tasks
-mems_allowed vector.
+Cpusets use the generic container subsystem described in
+Documentation/container.txt.

-User level code may create and destroy cpusets by name in the cpuset
+Requests by a task, using the sched_setaffinity(2) system call to
+include CPUs in its CPU affinity mask, and using the mbind(2) and
+set_mempolicy(2) system calls to include Memory Nodes in its memory
+policy, are both filtered through that tasks cpuset, filtering out any
+CPUs or Memory Nodes not in that cpuset. The scheduler will not
+schedule a task on a CPU that is not allowed in its cpus_allowed
+vector, and the kernel page allocator will not allocate a page on a
+node that is not allowed in the requesting tasks mems_allowed vector.
+
+User level code may create and destroy cpusets by name in the container
virtual file system, manage the attributes and permissions of these
cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
specify and query to which cpuset a task is assigned, and list the
@@ -117,7 +118,7 @@ Cpusets extends these two mechanisms as
- Cpusets are sets of allowed CPUs and Memory Nodes, known to the
kernel.
- Each task in the system is attached to a cpuset, via a pointer
- in the task structure to a reference counted cpuset structure.
+ in the task structure to a reference counted container structure.
- Calls to sched_setaffinity are filtered to just those CPUs
allowed in that tasks cpuset.
- Calls to mbind and set_mempolicy are filtered to just
@@ -152,15 +153,10 @@ into the rest of the kernel, none in per
- in page_alloc.c, to restrict memory to allowed nodes.
- in vmscan.c, to restrict page recovery to the current cpuset.

-In addition a new file system, of type "cpuset" may be mounted,
-typically at /dev/cpuset, to enable browsing and modifying the cpusets
-presently known to the kernel. No new system calls are added for
-cpusets - all support for querying and modifying cpusets is via
-this cpuset file system.
-
-Each task under /proc has an added file named 'cpuset', displaying
-the cpuset name, as the path relative to the root of the cpuset file
-system.
+You should mount the "container" filesystem type in order to enable
+browsing and modifying the cpusets presently known to the kernel. No
+new system calls are added for cpusets - all support for querying and
+modifying cpusets is via this cpuset file system.

The /proc/<pid>/status file for each task has two added lines,
displaying the tasks cpus_allowed (on which CPUs it may be scheduled)
@@ -170,16 +166,15 @@ in the format seen in the following exam
Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff
Mems_allowed: ffffffff,ffffffff

-Each cpuset is represented by a directory in the cpuset file system
-containing the following files describing that cpuset:
+Each cpuset is represented by a directory in the container file system
+containing (on top of the standard container files) the following
+files describing that cpuset:

- cpus: list of CPUs in that cpuset
- mems: list of Memory Nodes in that cpuset
- memory_migrate flag: if set, move pages to cpusets nodes
- cpu_exclusive flag: is cpu placement exclusive?
- mem_exclusive flag: is memory placement exclusive?
- - tasks: list of tasks (by pid) attached to that cpuset
- - notify_on_release flag: run /sbin/cpuset_release_agent on exit?
- memory_pressure: measure of how much paging pressure in cpuset

In addition, the root cpuset only has the following file:
@@ -253,21 +248,7 @@ such as requests from interrupt handlers
outside even a mem_exclusive cpuset.


-1.5 What does notify_on_release do ?
-------------------------------------
-
-If the notify_on_release flag is enabled (1) in a cpuset, then whenever
-the last task in the cpuset leaves (exits or attaches to some other
-cpuset) and the last child cpuset of that cpuset is removed, then
-the kernel runs the command /sbin/cpuset_release_agent, supplying the
-pathname (relative to the mount point of the cpuset file system) of the
-abandoned cpuset. This enables automatic removal of abandoned cpusets.
-The default value of notify_on_release in the root cpuset at system
-boot is disabled (0). The default value of other cpusets at creation
-is the current value of their parents notify_on_release setting.
-
-
-1.6 What is memory_pressure ?
+1.5 What is memory_pressure ?
-----------------------------
The memory_pressure of a cpuset provides a simple per-cpuset metric
of the rate that the tasks in a cpuset are attempting to free up in
@@ -324,7 +305,7 @@ the tasks in the cpuset, in units of rec
times 1000.


-1.7 What is memory spread ?
+1.6 What is memory spread ?
---------------------------
There are two boolean flag files per cpuset that control where the
kernel allocates pages for the file system buffers and related in
@@ -395,7 +376,7 @@ data set, the memory allocation across t
can become very uneven.


-1.8 How do I use cpusets ?
+1.7 How do I use cpusets ?
--------------------------

In order to minimize the impact of cpusets on critical kernel
@@ -485,7 +466,7 @@ than stress the kernel.
To start a new job that is to be contained within a cpuset, the steps are:

1) mkdir /dev/cpuset
- 2) mount -t cpuset none /dev/cpuset
+ 2) mount -t container none /dev/cpuset
3) Create the new cpuset by doing mkdir's and write's (or echo's) in
the /dev/cpuset virtual file system.
4) Start a task that will be the "founding father" of the new job.
@@ -497,7 +478,7 @@ For example, the following sequence of c
named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
and then start a subshell 'sh' in that cpuset:

- mount -t cpuset none /dev/cpuset
+ mount -t container none /dev/cpuset
cd /dev/cpuset
mkdir Charlie
cd Charlie
@@ -507,7 +488,7 @@ and then start a subshell 'sh' in that c
sh
# The subshell 'sh' is now running in cpuset Charlie
# The next line should display '/Charlie'
- cat /proc/self/cpuset
+ cat /proc/self/container

In the future, a C library interface to cpusets will likely be
available. For now, the only way to query or modify cpusets is
@@ -529,7 +510,7 @@ Creating, modifying, using the cpusets c
virtual filesystem.

To mount it, type:
-# mount -t cpuset none /dev/cpuset
+# mount -t container none /dev/cpuset

Then under /dev/cpuset you can find a tree that corresponds to the
tree of the cpusets in the system. For instance, /dev/cpuset
Index: container-2.6.19-rc5/fs/super.c
===================================================================
--- container-2.6.19-rc5.orig/fs/super.c
+++ container-2.6.19-rc5/fs/super.c
@@ -39,11 +39,6 @@
#include <linux/mutex.h>
#include <asm/uaccess.h>

-
-void get_filesystem(struct file_system_type *fs);
-void put_filesystem(struct file_system_type *fs);
-struct file_system_type *get_fs_type(const char *name);
-
LIST_HEAD(super_blocks);
DEFINE_SPINLOCK(sb_lock);

Index: container-2.6.19-rc5/include/linux/fs.h
===================================================================
--- container-2.6.19-rc5.orig/include/linux/fs.h
+++ container-2.6.19-rc5/include/linux/fs.h
@@ -1875,6 +1875,8 @@ extern int vfs_fstat(unsigned int, struc

extern int vfs_ioctl(struct file *, unsigned int, unsigned int, unsigned long);

+extern void get_filesystem(struct file_system_type *fs);
+extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
extern struct super_block *get_super(struct block_device *);
extern struct super_block *user_get_super(dev_t);
Index: container-2.6.19-rc5/include/linux/mempolicy.h
===================================================================
--- container-2.6.19-rc5.orig/include/linux/mempolicy.h
+++ container-2.6.19-rc5/include/linux/mempolicy.h
@@ -152,7 +152,7 @@ extern void mpol_fix_fork_child_flag(str

#ifdef CONFIG_CPUSETS
#define current_cpuset_is_being_rebound() \
- (cpuset_being_rebound == current->cpuset)
+ (cpuset_being_rebound == current->container->cpuset)
#else
#define current_cpuset_is_being_rebound() 0
#endif

--


2006-12-01 16:49:32

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: [Patch 1/3] Miscellaneous container fixes

This patches fixes various bugs I hit in the recently posted container
patches.

1. If a subsystem registers with fork/exit hook during bootup (much
before rcu is initialized), then the resulting synchronize_rcu() in
container_register_subsys() hangs. Avoid this by not calling
synchronize_rcu() if we arent fully booted yet.

2. If cpuset_create fails() for some reason, then the resulting
call to cpuset_destroy can trip. Avoid this by initializing
container->...->cpuset pointer to NULL in cpuset_create().

3. container_rmdir->cpuset_destroy->update_flag can deadlock on
container_lock(). Avoid this by introducing __update_flag, which
doesnt take container_lock().

(I have also hit some lockdep warnings. Will post them after some
review, to make sure that they are not introduced by my patches).

Signed-off-by : Srivatsa Vaddagiri <[email protected]>


---

linux-2.6.19-rc6-vatsa/kernel/container.c | 4 ++-
linux-2.6.19-rc6-vatsa/kernel/cpuset.c | 35 ++++++++++++++++++++++++------
2 files changed, 31 insertions(+), 8 deletions(-)

diff -puN kernel/container.c~container_fixes kernel/container.c
--- linux-2.6.19-rc6/kernel/container.c~container_fixes 2006-12-01 16:19:41.000000000 +0530
+++ linux-2.6.19-rc6-vatsa/kernel/container.c 2006-12-01 16:20:20.000000000 +0530
@@ -1344,6 +1344,7 @@ static long container_create(struct cont
cont->parent = parent;
cont->root = parent->root;
cont->hierarchy = parent->hierarchy;
+ cont->top_container = parent->top_container;

for_each_subsys(cont->hierarchy, ss) {
err = ss->create(ss, cont);
@@ -1580,7 +1581,8 @@ int container_register_subsys(struct con
if (!need_forkexit_callback &&
(new_subsys->fork || new_subsys->exit)) {
need_forkexit_callback = 1;
- synchronize_rcu();
+ if (system_state == SYSTEM_RUNNING)
+ synchronize_rcu();
}

/* If this subsystem requested that it be notified with fork
diff -puN kernel/cpuset.c~container_fixes kernel/cpuset.c
--- linux-2.6.19-rc6/kernel/cpuset.c~container_fixes 2006-12-01 19:02:35.000000000 +0530
+++ linux-2.6.19-rc6-vatsa/kernel/cpuset.c 2006-12-01 20:43:52.000000000 +0530
@@ -97,14 +97,22 @@ struct cpuset {
/* Update the cpuset for a container */
static inline void set_container_cs(struct container *cont, struct cpuset *cs)
{
- cont->subsys[cpuset_subsys.subsys_id] = &cs->css;
+ if (cs)
+ cont->subsys[cpuset_subsys.subsys_id] = &cs->css;
+ else
+ cont->subsys[cpuset_subsys.subsys_id] = NULL;
}

/* Retrieve the cpuset for a container */
static inline struct cpuset *container_cs(struct container *cont)
{
- return container_of(container_subsys_state(cont, &cpuset_subsys),
- struct cpuset, css);
+ struct container_subsys_state *css;
+
+ css = container_subsys_state(cont, &cpuset_subsys);
+ if (css)
+ return container_of(css, struct cpuset, css);
+ else
+ return NULL;
}

/* Retrieve the cpuset for a task */
@@ -698,7 +706,7 @@ static int update_memory_pressure_enable
* Call with manage_mutex held.
*/

-static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
+static int __update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
{
int turning_on;
struct cpuset trialcs;
@@ -717,18 +725,27 @@ static int update_flag(cpuset_flagbits_t
return err;
cpu_exclusive_changed =
(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
- container_lock();
if (turning_on)
set_bit(bit, &cs->flags);
else
clear_bit(bit, &cs->flags);
- container_unlock();

if (cpu_exclusive_changed)
update_cpu_domains(cs);
return 0;
}

+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
+{
+ int err;
+
+ container_lock();
+ err = __update_flag(bit, cs, buf);
+ container_unlock();
+
+ return err;
+}
+
/*
* Frequency meter - How fast is some event occurring?
*
@@ -1165,6 +1182,7 @@ int cpuset_create(struct container_subsy
return 0;
}
parent = container_cs(cont->parent);
+ set_container_cs(cont, NULL);
cs = kmalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return -ENOMEM;
@@ -1202,9 +1220,12 @@ void cpuset_destroy(struct container_sub
{
struct cpuset *cs = container_cs(cont);

+ if (!cs)
+ return;
+
cpuset_update_task_memory_state();
if (is_cpu_exclusive(cs)) {
- int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
+ int retval = __update_flag(CS_CPU_EXCLUSIVE, cs, "0");
BUG_ON(retval);
}
number_of_cpusets--;
_


--
Regards,
vatsa

2006-12-01 16:51:36

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: [Patch 2/3] cpu controller (v3) - based on RTLIMIT_RT_CPU patch


Nick/Ingo,

Here's another approach for a minimal cpu controller. Would greatly
appreciate any feedback as before.

This version is inspired by Ingo's RTLIMIT_RT_CPU patches found here:

http://kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.11-rc2/2.6.11-rc2-mm2/broken-out/rlimit_rt_cpu.patch

This patch is also about 80% smaller than the patches I had posted earlier:

http://lkml.org/lkml/2006/9/28/236

Primary differences between Ingo's RT_LIMIT_CPU patch and this one:

- This patch handles starvation of lower priority tasks in a group.
- This patch uses tokens for accounting cpu consumption. I didnt
get good results with decaying avg approach used in the rtlimit patches.
- Task grouping based on cpuset/containers and not on rtlimit.

Other features:

- Retains one-runqueue-per-cpu, as is prevalent today
- scheduling no longer of O(1) complexity, similar to rtlimit patches.
This can be avoided if we use separate runqueues for different groups
(as was done in http://lkml.org/lkml/2006/9/28/236)
- Only limit supported (no guarantee)

Unsupported feature:

- SMP load balance aware of group limits. This can be handled using
smpnice later if required (http://lkml.org/lkml/2006/9/28/244)


Signed-off-by : Srivatsa Vaddagiri <[email protected]>

---

linux-2.6.19-rc6-vatsa/include/linux/sched.h | 3
linux-2.6.19-rc6-vatsa/kernel/sched.c | 195 ++++++++++++++++++++++++++-
2 files changed, 195 insertions(+), 3 deletions(-)

diff -puN include/linux/sched.h~cpu_ctlr include/linux/sched.h
--- linux-2.6.19-rc6/include/linux/sched.h~cpu_ctlr 2006-12-01 20:45:08.000000000 +0530
+++ linux-2.6.19-rc6-vatsa/include/linux/sched.h 2006-12-01 20:45:08.000000000 +0530
@@ -1095,6 +1095,9 @@ static inline void put_task_struct(struc
/* Not implemented yet, only for 486*/
#define PF_STARTING 0x00000002 /* being created */
#define PF_EXITING 0x00000004 /* getting shut down */
+#ifdef CONFIG_CPUMETER
+#define PF_STARVING 0x00000010 /* Task starving for CPU */
+#endif
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* dumped core */
diff -puN kernel/sched.c~cpu_ctlr kernel/sched.c
--- linux-2.6.19-rc6/kernel/sched.c~cpu_ctlr 2006-12-01 20:45:08.000000000 +0530
+++ linux-2.6.19-rc6-vatsa/kernel/sched.c 2006-12-01 20:45:08.000000000 +0530
@@ -264,10 +264,25 @@ struct rq {
unsigned long ttwu_local;
#endif
struct lock_class_key rq_lock_key;
+#ifdef CONFIG_CPUMETER
+ unsigned long last_update;
+ int need_recheck;
+#endif
};

static DEFINE_PER_CPU(struct rq, runqueues);

+struct cpu_usage {
+ long tokens;
+ unsigned long last_update;
+ int starve_count;
+};
+
+struct task_grp {
+ unsigned long limit;
+ struct cpu_usage *cpu_usage; /* per-cpu ptr */
+};
+
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
@@ -705,6 +720,137 @@ enqueue_task_head(struct task_struct *p,
p->array = array;
}

+#ifdef CONFIG_CPUMETER
+
+#define task_starving(p) (p->flags & PF_STARVING)
+
+static inline struct task_grp *task_grp(struct task_struct *p)
+{
+ return NULL;
+}
+
+/* Mark a task starving - either we shortcircuited its timeslice or we didnt
+ * pick it to run (because its group ran out of bandwidth limit).
+ */
+static inline void set_tsk_starving(struct task_struct *p, struct task_grp *grp)
+{
+ struct cpu_usage *grp_usage;
+
+ if (task_starving(p))
+ return;
+
+ BUG_ON(!grp);
+ grp_usage = per_cpu_ptr(grp->cpu_usage, task_cpu(p));
+ grp_usage->starve_count++;
+ p->flags |= PF_STARVING;
+}
+
+/* Clear a task's starving flag */
+static inline void clear_tsk_starving(struct task_struct *p,
+ struct task_grp *grp)
+{
+ struct cpu_usage *grp_usage;
+
+ if (!task_starving(p))
+ return;
+
+ BUG_ON(!grp);
+ grp_usage = per_cpu_ptr(grp->cpu_usage, task_cpu(p));
+ grp_usage->starve_count--;
+ p->flags &= ~PF_STARVING;
+}
+
+/* Does the task's group have starving tasks? */
+static inline int is_grp_starving(struct task_struct *p)
+{
+ struct task_grp *grp = task_grp(p);
+ struct cpu_usage *grp_usage;
+
+ if (!grp)
+ return 0;
+
+ grp_usage = per_cpu_ptr(grp->cpu_usage, task_cpu(p));
+ if (grp_usage->starve_count)
+ return 1;
+
+ return 0;
+}
+
+/* Are we past the 1-sec control window? If so, all groups get to renew their
+ * expired tokens.
+ */
+static inline void adjust_control_window(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+ unsigned long delta;
+
+ delta = jiffies - rq->last_update;
+ if (delta >= HZ) {
+ rq->last_update += (delta/HZ) * HZ;
+ rq->need_recheck = 1;
+ }
+}
+
+/* Account group's cpu usage */
+static inline void inc_cpu_usage(struct task_struct *p)
+{
+ struct task_grp *grp = task_grp(p);
+ struct cpu_usage *grp_usage;
+
+ adjust_control_window(p);
+
+ if (!grp || !grp->limit || rt_task(p))
+ return;
+
+ grp_usage = per_cpu_ptr(grp->cpu_usage, task_cpu(p));
+ grp_usage->tokens--;
+}
+
+static inline int task_over_cpu_limit(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+ struct task_grp *grp = task_grp(p);
+ struct cpu_usage *grp_usage;
+
+ adjust_control_window(p);
+
+ if (!grp || !grp->limit || !rq->need_recheck)
+ return 0;
+
+ grp_usage = per_cpu_ptr(grp->cpu_usage, task_cpu(p));
+ if (grp_usage->last_update != rq->last_update) {
+ /* Replenish tokens */
+ grp_usage->tokens = grp->limit * HZ / 100;
+ grp_usage->last_update = rq->last_update;
+ }
+
+ if (grp_usage->tokens <= 0)
+ return 1;
+
+ return 0;
+}
+
+static inline void rq_set_recheck(struct rq *rq, int check)
+{
+ rq->need_recheck = check;
+}
+
+#else
+
+#define task_starving(p) 0
+
+struct task_grp;
+
+static struct task_grp *task_grp(struct task_struct *p) { return NULL;}
+static void inc_cpu_usage(struct task_struct *p) { }
+static int task_over_cpu_limit(struct task_struct *p) { return 0; }
+static void set_tsk_starving(struct task_struct *p, struct task_grp *grp) { }
+static void clear_tsk_starving(struct task_struct *p, struct task_grp *grp) { }
+static int is_grp_starving(struct task_struct *p) { return 0;}
+static inline void rq_set_recheck(struct rq *rq, int check) { }
+
+#endif /* CONFIG_CPUMETER */
+
/*
* __normal_prio - return the priority that is based on the static
* priority but is modified by bonuses/penalties.
@@ -847,6 +993,7 @@ static void __activate_task(struct task_
target = rq->expired;
enqueue_task(p, target);
inc_nr_running(p, rq);
+ rq_set_recheck(rq, 1);
}

/*
@@ -1586,6 +1733,9 @@ void fastcall sched_fork(struct task_str
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
+#ifdef CONFIG_CPUMETER
+ p->flags &= ~PF_STARVING;
+#endif
/*
* Share the timeslice between parent and child, thus the
* total amount of pending timeslices in the system doesn't change,
@@ -2047,6 +2197,8 @@ static void pull_task(struct rq *src_rq,
{
dequeue_task(p, src_array);
dec_nr_running(p, src_rq);
+ clear_tsk_starving(p, task_grp(p));
+ rq_set_recheck(this_rq, 1);
set_task_cpu(p, this_cpu);
inc_nr_running(p, this_rq);
enqueue_task(p, this_array);
@@ -3068,6 +3220,9 @@ void scheduler_tick(void)
goto out;
}
spin_lock(&rq->lock);
+
+ inc_cpu_usage(p);
+
/*
* The task was running during this tick - update the
* time slice counter. Note: we do not update a thread's
@@ -3094,17 +3249,18 @@ void scheduler_tick(void)
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
p->prio = effective_prio(p);
- p->time_slice = task_timeslice(p);
p->first_time_slice = 0;

if (!rq->expired_timestamp)
rq->expired_timestamp = jiffies;
- if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
+ if (!TASK_INTERACTIVE(p) || expired_starving(rq)
+ || task_over_cpu_limit(p)) {
enqueue_task(p, rq->expired);
if (p->static_prio < rq->best_expired_prio)
rq->best_expired_prio = p->static_prio;
} else
enqueue_task(p, rq->active);
+ goto out_unlock;
} else {
/*
* Prevent a too long timeslice allowing a task to monopolize
@@ -3131,6 +3287,14 @@ void scheduler_tick(void)
set_tsk_need_resched(p);
}
}
+
+ if (task_over_cpu_limit(p)) {
+ dequeue_task(p, rq->active);
+ set_tsk_need_resched(p);
+ enqueue_task(p, rq->expired);
+ set_tsk_starving(p, task_grp(p));
+ }
+
out_unlock:
spin_unlock(&rq->lock);
out:
@@ -3320,7 +3484,7 @@ asmlinkage void __sched schedule(void)
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int cpu, idx, new_prio;
+ int cpu, idx, new_prio, array_switch;
long *switch_count;
struct rq *rq;

@@ -3379,6 +3543,7 @@ need_resched_nonpreemptible:
else {
if (prev->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;
+ clear_tsk_starving(prev, task_grp(prev));
deactivate_task(prev, rq);
}
}
@@ -3394,11 +3559,15 @@ need_resched_nonpreemptible:
}
}

+ array_switch = 0;
+
+pick_next_task:
array = rq->active;
if (unlikely(!array->nr_active)) {
/*
* Switch the active and expired arrays.
*/
+ array_switch++;
schedstat_inc(rq, sched_switch);
rq->active = rq->expired;
rq->expired = array;
@@ -3411,6 +3580,25 @@ need_resched_nonpreemptible:
queue = array->queue + idx;
next = list_entry(queue->next, struct task_struct, run_list);

+ /* If we have done an array switch twice, it means we cant find any
+ * task which isn't above its limit and hence we just run the
+ * first task on the active array.
+ */
+ if (array_switch < 2 && (task_over_cpu_limit(next) ||
+ (!task_starving(next) && is_grp_starving(next)))) {
+ dequeue_task(next, rq->active);
+ enqueue_task(next, rq->expired);
+ if (next->time_slice)
+ set_tsk_starving(next, task_grp(next));
+ goto pick_next_task;
+ }
+
+ if (task_over_cpu_limit(next))
+ rq_set_recheck(rq, 0);
+ if (!next->time_slice)
+ next->time_slice = task_timeslice(next);
+ clear_tsk_starving(next, task_grp(next));
+
if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
unsigned long long delta = now - next->timestamp;
if (unlikely((long long)(now - next->timestamp) < 0))
@@ -4965,6 +5153,7 @@ static int __migrate_task(struct task_st
if (!cpu_isset(dest_cpu, p->cpus_allowed))
goto out;

+ clear_tsk_starving(p, task_grp(p));
set_task_cpu(p, dest_cpu);
if (p->array) {
/*
_
--
Regards,
vatsa

2006-12-01 16:54:11

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: [Patch 3/3] cpuset interface


This patchs extends cpusets to support creation of metered cpusets. Changes
made to cpusets are:

- Every cpuset directory has two new files:
cpu_meter_enabled, cpu_meter_limit
- Writing '1' into 'cpu_meter_enabled' marks the cpuset as "metered" i.e
tasks in this cpuset and its child cpusets will have their CPU
usage controlled by the CPU controller as per the limit settings
mentioned in 'cpu_meter_limit'.
- Writing a number between 0-100 into 'cpu_meter_limit' file of a
metered cpuset assigns corresponding bandwidth for all tasks in the
cpuset (task-group).
- Only cpusets marked as "exclusive" can be metered.
- cpusets that have child cpusets cannot be metered.
- Metered cpusets cannot have grand-children!

Metered cpusets was discussed elaborately at:

http://lkml.org/lkml/2005/9/26/58

This patch was inspired from those discussions and is a subset of the features
provided in the earlier patch.

To use this feature, apply the following patches in order (on top of
2.6.19-rc6):

http://lkml.org/lkml/2006/11/23/97
http://lkml.org/lkml/2006/11/23/99
http://lkml.org/lkml/2006/11/23/101
Patch 1/3 (in this series)
Patch 2/3 (in this series)
Patch 3/3 (this patch)

Enable CONFIG_CPUSETS and CONFIG_CPUMETER.

As an example, follow these steps to create metered cpusets:


# alias echo="/bin/echo"

# cd /dev
# mkdir container
# mount -t container container container

# cd container

# echo 1 > cpu_exclusive

# mkdir test
# cd test

# echo 1 > cpus # Assign CPU3 to the cpuset.
# echo 0 > mems # assign node 0
# echo 1 > cpu_exclusive
# echo 1 > cpu_meter_enabled # "test" is now a "metered" cpuset

# # Because parent is marked 'cpu_meter_enabled',
# # mkdir below automatically copied 'cpus', 'mems' and
# # 'cpu_meter_enabled' from the parent to the new child.

# mkdir very_imp_grp
# mkdir less_imp_grp

# # Assign 70% bandwidth to very_imp_grp
# echo 70 > very_imp_grp/cpu_meter_limit

# # Assign 20% bandwidth to less_imp_grp
# echo 20 > less_imp_grp/cpu_meter_limit

# echo $very_imp_task1_pid > very_imp_grp/tasks
# echo $very_imp_task2_pid > very_imp_grp/tasks

# echo $less_imp_task1_pid > less_imp_grp/tasks
# echo $less_imp_task2_pid > less_imp_grp/tasks


Signed-off-by : Srivatsa Vaddagiri <[email protected]>

---

linux-2.6.19-rc6-vatsa/include/linux/cpuset.h | 13 +
linux-2.6.19-rc6-vatsa/include/linux/sched.h | 15 +
linux-2.6.19-rc6-vatsa/init/Kconfig | 8
linux-2.6.19-rc6-vatsa/init/main.c | 4
linux-2.6.19-rc6-vatsa/kernel/cpuset.c | 222 +++++++++++++++++++++++++-
linux-2.6.19-rc6-vatsa/kernel/sched.c | 86 +++++++++-
6 files changed, 343 insertions(+), 5 deletions(-)

diff -puN include/linux/cpuset.h~cpuset_interface include/linux/cpuset.h
--- linux-2.6.19-rc6/include/linux/cpuset.h~cpuset_interface 2006-12-01 20:45:14.000000000 +0530
+++ linux-2.6.19-rc6-vatsa/include/linux/cpuset.h 2006-12-01 20:45:14.000000000 +0530
@@ -62,6 +62,15 @@ extern void cpuset_track_online_nodes(vo

extern int current_cpuset_is_being_rebound(void);

+#ifdef CONFIG_CPUMETER
+void *cpu_ctlr_data(struct task_struct *p);
+#else
+static inline void *cpu_ctlr_data(struct task_struct *p)
+{
+ return NULL;
+}
+#endif
+
#else /* !CONFIG_CPUSETS */

static inline int cpuset_init_early(void) { return 0; }
@@ -127,6 +136,10 @@ static inline int current_cpuset_is_bein
return 0;
}

+static inline void *cpu_ctlr_data(struct task_struct *p)
+{
+ return NULL;
+}
#endif /* !CONFIG_CPUSETS */

#endif /* _LINUX_CPUSET_H */
diff -puN init/main.c~cpuset_interface init/main.c
--- linux-2.6.19-rc6/init/main.c~cpuset_interface 2006-12-01 20:45:14.000000000 +0530
+++ linux-2.6.19-rc6-vatsa/init/main.c 2006-12-01 20:45:14.000000000 +0530
@@ -507,6 +507,8 @@ asmlinkage void __init start_kernel(void
unwind_setup();
setup_per_cpu_areas();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
+ container_init_early();
+ cpuset_init_early();

/*
* Set up the scheduler prior starting any interrupts (such as the
@@ -569,8 +571,6 @@ asmlinkage void __init start_kernel(void
}
#endif
vfs_caches_init_early();
- container_init_early();
- cpuset_init_early();
mem_init();
kmem_cache_init();
setup_per_cpu_pageset();
diff -puN kernel/cpuset.c~cpuset_interface kernel/cpuset.c
--- linux-2.6.19-rc6/kernel/cpuset.c~cpuset_interface 2006-12-01 20:45:14.000000000 +0530
+++ linux-2.6.19-rc6-vatsa/kernel/cpuset.c 2006-12-01 20:46:08.000000000 +0530
@@ -92,6 +92,9 @@ struct cpuset {
int mems_generation;

struct fmeter fmeter; /* memory_pressure filter */
+#ifdef CONFIG_CPUMETER
+ void *cpu_ctlr_data;
+#endif
};

/* Update the cpuset for a container */
@@ -129,6 +132,9 @@ typedef enum {
CS_MEMORY_MIGRATE,
CS_SPREAD_PAGE,
CS_SPREAD_SLAB,
+#ifdef CONFIG_CPUMETER
+ CS_METER_FLAG,
+#endif
} cpuset_flagbits_t;

/* convenient tests for these bits */
@@ -157,6 +163,140 @@ static inline int is_spread_slab(const s
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

+#ifdef CONFIG_CPUMETER
+static inline int is_metered(const struct cpuset *cs)
+{
+ return test_bit(CS_METER_FLAG, &cs->flags);
+}
+
+/* does the new combination of meter and exclusive flags makes sense? */
+static int validate_meters(const struct cpuset *cur, const struct cpuset *trial)
+{
+ struct cpuset *parent = cur->parent;
+ int is_changed, is_exclusive_changed, rc = -EINVAL;
+ cpumask_t cur_mask, trial_mask;
+
+ /* checks for change in meter flag */
+ is_changed = (is_metered(trial) != is_metered(cur));
+
+ is_exclusive_changed = (is_cpu_exclusive(trial) !=
+ is_cpu_exclusive(cur));
+
+ /* Cant change meter setting if a cpuset already has children */
+ if (is_changed && !list_empty(&cur->css.container->children)) {
+ rc = -EBUSY;
+ goto out;
+ }
+
+ /* Can't change meter setting for children of metered cpusets */
+ if (parent && is_metered(parent) && !is_metered(trial))
+ goto out;
+
+ /* Turn on metering only if a cpuset is exclusive */
+ if (parent && !is_metered(parent) && is_metered(trial) &&
+ !is_cpu_exclusive(cur))
+ goto out;
+
+ /* Cant change exclusive setting if a cpuset is metered */
+ if (is_exclusive_changed && is_metered(cur))
+ goto out;
+
+ /* Cant change cpus_allowed of a metered cpuset */
+ cpus_and(trial_mask, trial->cpus_allowed, cpu_possible_map);
+ cpus_and(cur_mask, cur->cpus_allowed, cpu_possible_map);
+ if (is_metered(cur) && !cpus_equal(trial_mask, cur_mask)) {
+ rc = -EBUSY;
+ goto out;
+ }
+
+ rc = 0;
+
+out:
+ return rc;
+}
+
+static void update_meter(struct cpuset *cs)
+{
+ if (is_metered(cs))
+ cs->cpu_ctlr_data = cpu_ctlr->create_group();
+ else {
+ cpu_ctlr->destroy_group(cs->cpu_ctlr_data);
+ /* Do we have any races here ? */
+ cs->cpu_ctlr_data = NULL;
+ }
+}
+
+static int update_limit(struct cpuset *cs, char *buf)
+{
+ unsigned long limit;
+
+ if (!buf || !is_metered(cs))
+ return -EINVAL;
+
+ limit = simple_strtoul(buf, NULL, 10);
+ cpu_ctlr->set_limit(cs->cpu_ctlr_data, limit);
+
+ return 0;
+}
+
+static int cpuset_sprintf_limit(char *page, struct cpuset *cs)
+{
+ unsigned long limit = 0;
+ char *c = page;
+
+ if (is_metered(cs))
+ limit = cpu_ctlr->get_limit(cs->cpu_ctlr_data);
+ c += sprintf (c, "%lu", limit);
+
+ return c - page;
+}
+
+void *cpu_ctlr_data(struct task_struct *p)
+{
+ struct cpuset *cs = task_cs(p);
+
+ return cs->cpu_ctlr_data;
+}
+
+void cpu_ctlr_move_task(struct task_struct *p, struct cpuset *old,
+ struct cpuset *new)
+{
+ cpu_ctlr->move_task(p, old->cpu_ctlr_data, new->cpu_ctlr_data);
+}
+
+#define init_cpu_ctlr_data(cs) (cs->cpu_ctlr_data = NULL)
+
+#else
+
+#define init_cpu_ctlr_data(cs)
+
+static inline int is_metered(const struct cpuset *cs)
+{
+ return 0;
+}
+
+static int validate_meters(const struct cpuset *cur, const struct cpuset *trial)
+{
+ return 0;
+}
+
+static void update_meter(struct cpuset *cs)
+{
+ return 0;
+}
+
+static int update_limit(struct cpuset *cs, char *buf)
+{
+ return 0;
+}
+
+static void cpu_ctlr_move_task(struct task_struct *p, struct cpuset *old,
+ struct cpuset *new)
+{
+}
+
+#endif /* CONFIG_CPUMETER */
+
/*
* Increment this integer everytime any cpuset changes its
* mems_allowed value. Users of cpusets can track this generation
@@ -374,6 +514,10 @@ static int validate_change(const struct
{
struct container *cont;
struct cpuset *c, *par;
+ int rc = 0;
+
+ if ((rc = validate_meters(cur, trial)))
+ return rc;

/* Each of our child cpusets must be a subset of us */
list_for_each_entry(cont, &cur->css.container->children, sibling) {
@@ -710,7 +854,7 @@ static int __update_flag(cpuset_flagbits
{
int turning_on;
struct cpuset trialcs;
- int err, cpu_exclusive_changed;
+ int err, cpu_exclusive_changed, meter_changed;

turning_on = (simple_strtoul(buf, NULL, 10) != 0);

@@ -725,6 +869,7 @@ static int __update_flag(cpuset_flagbits
return err;
cpu_exclusive_changed =
(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
+ meter_changed = (is_metered(cs) != is_metered(&trialcs));
if (turning_on)
set_bit(bit, &cs->flags);
else
@@ -732,7 +877,9 @@ static int __update_flag(cpuset_flagbits

if (cpu_exclusive_changed)
update_cpu_domains(cs);
- return 0;
+ if (meter_changed)
+ update_meter(cs);
+ return err;
}

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -860,6 +1007,7 @@ void cpuset_attach(struct container_subs
{
cpumask_t cpus;
guarantee_online_cpus(container_cs(cont), &cpus);
+ cpu_ctlr_move_task(tsk, container_cs(old_cont), container_cs(cont));
set_cpus_allowed(tsk, cpus);
}

@@ -897,6 +1045,10 @@ typedef enum {
FILE_MEMORY_PRESSURE,
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
+#ifdef CONFIG_CPUMETER
+ FILE_CPU_METER_ENABLED,
+ FILE_CPU_METER_LIMIT,
+#endif
} cpuset_filetype_t;

static ssize_t cpuset_common_file_write(struct container *cont,
@@ -961,6 +1113,14 @@ static ssize_t cpuset_common_file_write(
retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
cs->mems_generation = cpuset_mems_generation++;
break;
+#ifdef CONFIG_CPUMETER
+ case FILE_CPU_METER_ENABLED:
+ retval = update_flag(CS_METER_FLAG, cs, buffer);
+ break;
+ case FILE_CPU_METER_LIMIT:
+ retval = update_limit(cs, buffer);
+ break;
+#endif
default:
retval = -EINVAL;
goto out2;
@@ -1054,6 +1214,14 @@ static ssize_t cpuset_common_file_read(s
case FILE_SPREAD_SLAB:
*s++ = is_spread_slab(cs) ? '1' : '0';
break;
+#ifdef CONFIG_CPUMETER
+ case FILE_CPU_METER_ENABLED:
+ *s++ = is_metered(cs) ? '1' : '0';
+ break;
+ case FILE_CPU_METER_LIMIT:
+ s += cpuset_sprintf_limit(s, cs);
+ break;
+#endif
default:
retval = -EINVAL;
goto out;
@@ -1134,6 +1302,23 @@ static struct cftype cft_spread_slab = {
.private = FILE_SPREAD_SLAB,
};

+#ifdef CONFIG_CPUMETER
+static struct cftype cft_meter_flag = {
+ .name = "cpu_meter_enabled",
+ .read = cpuset_common_file_read,
+ .write = cpuset_common_file_write,
+ .private = FILE_CPU_METER_ENABLED,
+};
+
+static struct cftype cft_meter_limit = {
+ .name = "cpu_meter_limit",
+ .read = cpuset_common_file_read,
+ .write = cpuset_common_file_write,
+ .private = FILE_CPU_METER_LIMIT,
+};
+
+#endif
+
int cpuset_populate(struct container_subsys *ss, struct container *cont)
{
int err;
@@ -1157,6 +1342,12 @@ int cpuset_populate(struct container_sub
/* memory_pressure_enabled is in root cpuset only */
if (err == 0 && !cont->parent)
err = container_add_file(cont, &cft_memory_pressure_enabled);
+#ifdef CONFIG_CPUMETER
+ if ((err = container_add_file(cont, &cft_meter_flag)) < 0)
+ return err;
+ if ((err = container_add_file(cont, &cft_meter_limit)) < 0)
+ return err;
+#endif
return 0;
}

@@ -1183,12 +1374,18 @@ int cpuset_create(struct container_subsy
}
parent = container_cs(cont->parent);
set_container_cs(cont, NULL);
+
+ /* metered cpusets cant have grand-children! */
+ if (parent->parent && is_metered(parent->parent))
+ return -EINVAL;
+
cs = kmalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return -ENOMEM;

cpuset_update_task_memory_state();
cs->flags = 0;
+ init_cpu_ctlr_data(cs);
if (is_spread_page(parent))
set_bit(CS_SPREAD_PAGE, &cs->flags);
if (is_spread_slab(parent))
@@ -1202,6 +1399,13 @@ int cpuset_create(struct container_subsy
set_container_cs(cont, cs);
cs->css.container = cont;
number_of_cpusets++;
+ if (is_metered(parent)) {
+ cs->cpus_allowed = parent->cpus_allowed;
+ cs->mems_allowed = parent->mems_allowed;
+ set_bit(CS_METER_FLAG, &cs->flags);
+ update_meter(cs);
+ }
+
return 0;
}

@@ -1224,6 +1428,10 @@ void cpuset_destroy(struct container_sub
return;

cpuset_update_task_memory_state();
+ if (is_metered(cs)) {
+ clear_bit(CS_METER_FLAG, &cs->flags);
+ update_meter(cs);
+ }
if (is_cpu_exclusive(cs)) {
int retval = __update_flag(CS_CPU_EXCLUSIVE, cs, "0");
BUG_ON(retval);
@@ -1232,6 +1440,13 @@ void cpuset_destroy(struct container_sub
kfree(cs);
}

+#ifdef CONFIG_CPUMETER
+static void cpuset_exit(struct container_subsys *ss, struct task_struct *tsk)
+{
+ cpu_ctlr_move_task(tsk, task_cs(tsk), &top_cpuset);
+}
+#endif
+
static struct container_subsys cpuset_subsys = {
.name = "cpuset",
.create = cpuset_create,
@@ -1240,6 +1455,9 @@ static struct container_subsys cpuset_su
.attach = cpuset_attach,
.post_attach = cpuset_post_attach,
.populate = cpuset_populate,
+#ifdef CONFIG_CPUMETER
+ .exit = cpuset_exit,
+#endif
.subsys_id = -1,
};

diff -puN kernel/sched.c~cpuset_interface kernel/sched.c
--- linux-2.6.19-rc6/kernel/sched.c~cpuset_interface 2006-12-01 20:45:14.000000000 +0530
+++ linux-2.6.19-rc6-vatsa/kernel/sched.c 2006-12-01 20:53:10.000000000 +0530
@@ -726,7 +726,7 @@ enqueue_task_head(struct task_struct *p,

static inline struct task_grp *task_grp(struct task_struct *p)
{
- return NULL;
+ return cpu_ctlr_data(p);
}

/* Mark a task starving - either we shortcircuited its timeslice or we didnt
@@ -835,6 +835,90 @@ static inline void rq_set_recheck(struct
rq->need_recheck = check;
}

+/* Allocate cpu_usage structure for the new task-group */
+static void *sched_create_group(void)
+{
+ struct task_grp *tg;
+ int i;
+
+ tg = kmalloc(sizeof(*tg), GFP_KERNEL);
+ if (!tg)
+ return NULL;
+
+ tg->cpu_usage = alloc_percpu(struct cpu_usage);
+ if (!tg->cpu_usage) {
+ kfree(tg);
+ return NULL;
+ }
+
+ tg->limit = 0; /* No limit */
+
+ for_each_possible_cpu(i) {
+ struct cpu_usage *grp;
+
+ grp = per_cpu_ptr(tg->cpu_usage, i);
+ grp->tokens = 0;
+ grp->last_update = cpu_rq(i)->last_update;
+ grp->starve_count = 0;
+ }
+
+ return tg;
+}
+
+/* Deallocate cpu_usage structure */
+static void sched_destroy_group(struct task_grp *tg)
+{
+ free_percpu(tg->cpu_usage);
+ kfree(tg);
+}
+
+/* Assign quota to this group */
+static int sched_set_limit(struct task_grp *tg, unsigned long limit)
+{
+ int i;
+
+ tg->limit = limit;
+ for_each_possible_cpu(i) {
+ struct cpu_usage *grp;
+
+ grp = per_cpu_ptr(tg->cpu_usage, i);
+ grp->tokens = limit * HZ / 100;
+ grp->last_update = jiffies;
+ }
+ return 0;
+}
+
+/* Return assigned quota for this group */
+static unsigned long sched_get_limit(struct task_grp *tg)
+{
+ if (!tg)
+ return 0;
+
+ return tg->limit;
+}
+
+static void sched_move_task(struct task_struct *p, struct task_grp *old,
+ struct task_grp *new)
+{
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &flags);
+ if (p->array && old)
+ clear_tsk_starving(p, old);
+ task_rq_unlock(rq, &flags);
+}
+
+static struct task_grp_ops sched_grp_ops = {
+ .create_group = sched_create_group,
+ .destroy_group = sched_destroy_group,
+ .set_limit = sched_set_limit,
+ .get_limit = sched_get_limit,
+ .move_task = sched_move_task,
+};
+
+struct task_grp_ops *cpu_ctlr = &sched_grp_ops;
+
#else

#define task_starving(p) 0
diff -puN include/linux/sched.h~cpuset_interface include/linux/sched.h
--- linux-2.6.19-rc6/include/linux/sched.h~cpuset_interface 2006-12-01 20:45:14.000000000 +0530
+++ linux-2.6.19-rc6-vatsa/include/linux/sched.h 2006-12-01 20:45:14.000000000 +0530
@@ -1695,6 +1695,21 @@ static inline void thaw_processes(void)
static inline int try_to_freeze(void) { return 0; }

#endif /* CONFIG_PM */
+
+#ifdef CONFIG_CPUMETER
+struct task_grp;
+struct task_grp_ops {
+ void *(*create_group)(void);
+ void (*destroy_group)(struct task_grp *grp);
+ int (*set_limit)(struct task_grp *grp, unsigned long limit);
+ unsigned long (*get_limit)(struct task_grp *grp);
+ void (*move_task)(struct task_struct *p, struct task_grp *old,
+ struct task_grp *new);
+};
+
+extern struct task_grp_ops *cpu_ctlr;
+#endif
+
#endif /* __KERNEL__ */

#endif
diff -puN init/Kconfig~cpuset_interface init/Kconfig
--- linux-2.6.19-rc6/init/Kconfig~cpuset_interface 2006-12-01 20:45:14.000000000 +0530
+++ linux-2.6.19-rc6-vatsa/init/Kconfig 2006-12-01 20:45:14.000000000 +0530
@@ -265,6 +265,14 @@ config CPUSETS

Say N if unsure.

+config CPUMETER
+ bool "CPU resource control"
+ depends on CPUSETS && EXPERIMENTAL
+ help
+ This options lets you create cpu resource partitions within
+ cpusets. Each resource partition can be given a different quota
+ of CPU usage.
+
config PROC_PID_CPUSET
bool "Include legacy /proc/<pid>/cpuset file"
depends on CPUSETS
_

--
Regards,
vatsa

2006-12-01 17:26:15

by Paul Menage

[permalink] [raw]
Subject: Re: [Patch 1/3] Miscellaneous container fixes

On 12/1/06, Srivatsa Vaddagiri <[email protected]> wrote:
> This patches fixes various bugs I hit in the recently posted container
> patches.
>
> 1. If a subsystem registers with fork/exit hook during bootup (much
> before rcu is initialized), then the resulting synchronize_rcu() in
> container_register_subsys() hangs. Avoid this by not calling
> synchronize_rcu() if we arent fully booted yet.
>
> 2. If cpuset_create fails() for some reason, then the resulting
> call to cpuset_destroy can trip. Avoid this by initializing
> container->...->cpuset pointer to NULL in cpuset_create().
>
> 3. container_rmdir->cpuset_destroy->update_flag can deadlock on
> container_lock(). Avoid this by introducing __update_flag, which
> doesnt take container_lock().

Ah - this may be the lockup that PaulJ hit.

Thanks for these fixes.

Paul

2006-12-01 20:33:30

by Paul Jackson

[permalink] [raw]
Subject: Re: [Patch 1/3] Miscellaneous container fixes

Paul M wrote:
> Ah - this may be the lockup that PaulJ hit.

Yes - looks like this fixes it. Thanks, Srivatsa.

And with that fix, it becomes obvious how to reproduce this problem:

mount -t cpuset cpuset /dev/cpuset # if not already mounted
cd /dev/cpuset
mkdir foo
echo 1 > foo/cpu_exclusive
rmdir foo # hangs ...

However ...

Read the comment in kernel/cpuset.c for the routine cpuset_destroy().
It explains that update_flag() is called where it is (turning off
the cpu_exclusive flag, if it was set), to avoid the calling sequence:

cpuset_destroy->update_flag->update_cpu_domains->lock_cpu_hotplug

while holding the callback_mutex, as that could ABBA deadlock with the
CPU hotplug code.

But with this container based rewrite of cpusets, it now seems that
cpuset_destroy -is- called holding the callback_mutex (though I don't
see any mention of that in the cpuset_destroy comment ;), so it would
seem that we once again are at risk for this ABBA deadlock.

I also notice that the comment for container_lock() in the file
kernel/container.c only mentions its use in the oom code. That is
no longer the only, or even primary, user of this lock routine.
The kernel/cpuset.c code uses it frequently (without comment ;),
and I wouldn't be surprised to see other future controllers calling
container_lock() as well.

Looks like its time to update those comments, and think about what
was written there before, as that might catch a bug or two, such as
the one Srivatsa just fixed for us.

Most of those long locking comments in kernel/cpuset.c are there
for a reason - recording the results of a lesson learned in the
school of hard knocks.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-12-05 12:05:22

by Paul Menage

[permalink] [raw]
Subject: Re: [Patch 1/3] Miscellaneous container fixes

On 12/1/06, Paul Jackson <[email protected]> wrote:
> Read the comment in kernel/cpuset.c for the routine cpuset_destroy().
> It explains that update_flag() is called where it is (turning off
> the cpu_exclusive flag, if it was set), to avoid the calling sequence:
>
> cpuset_destroy->update_flag->update_cpu_domains->lock_cpu_hotplug
>
> while holding the callback_mutex, as that could ABBA deadlock with the
> CPU hotplug code.

This particular race is gone in the -mm2 kernel since cpus_exclusive
no longer drives sched_domains - can we assume that this will be
reaching mainline some time soon?

>
> But with this container based rewrite of cpusets, it now seems that
> cpuset_destroy -is- called holding the callback_mutex (though I don't
> see any mention of that in the cpuset_destroy comment ;), so it would

And in fact I explicitly documented it as only holding manage_mutex,
not callback_mutex in Documentation/containers.txt. I think maybe this
slipped in during the multi-hierarchy rewrite. :-(

Looking at the various *_destroy() functions in the container
subsystems in my patch set, I think that it should be OK to call the
destructors prior to taking callback_mutex for the unlinking of the
container from its parents.

>
> I also notice that the comment for container_lock() in the file
> kernel/container.c only mentions its use in the oom code. That is
> no longer the only, or even primary, user of this lock routine.
> The kernel/cpuset.c code uses it frequently (without comment ;),
> and I wouldn't be surprised to see other future controllers calling
> container_lock() as well.

As was pointed out by Chandra Seetharaman, it would be nice if we
could avoid having all the container subsystems relying on
callback_mutex for their locking needs - particularly since that's
likely to be acquired at performance-sensitive times.

The cpu_acct and beancounters subsystems that I included in my patch
set both use their own per-container locks for synchronization, so
it's not completely necessary to use the central locks. There's
probably a happy medium between "one big lock" and "way too many small
locks".

Paul